Introduction¶

Since Jan. 1, 2015, The Washington Post has been compiling a database of every fatal shooting in the US by a police officer in the line of duty.

Import Statements¶

In [3]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

Notebook Presentation¶

In [4]:
pd.options.display.float_format = '{:,.2f}'.format

Load the Data¶

In [5]:
df_fatalities = pd.read_csv('Deaths_by_Police_US.csv', encoding="windows-1252")
In [6]:
df_hh_income = pd.read_csv('Median_Household_Income_2015.csv', encoding="windows-1252")
df_hh_income.head()
Out[6]:
Geographic Area City Median Income
0 AL Abanda CDP 11207
1 AL Abbeville city 25615
2 AL Adamsville city 42575
3 AL Addison town 37083
4 AL Akron town 21667
In [7]:
df_pct_poverty = pd.read_csv('Pct_People_Below_Poverty_Level.csv', encoding="windows-1252")
df_pct_poverty.head()
Out[7]:
Geographic Area City poverty_rate
0 AL Abanda CDP 78.8
1 AL Abbeville city 29.1
2 AL Adamsville city 25.5
3 AL Addison town 30.7
4 AL Akron town 42
In [8]:
df_pct_completed_hs = pd.read_csv('Pct_Over_25_Completed_High_School.csv', encoding="windows-1252")
df_pct_completed_hs.head()
Out[8]:
Geographic Area City percent_completed_hs
0 AL Abanda CDP 21.2
1 AL Abbeville city 69.1
2 AL Adamsville city 78.9
3 AL Addison town 81.4
4 AL Akron town 68.6
In [9]:
df_share_race_city = pd.read_csv('Share_of_Race_By_City.csv', encoding="windows-1252")
df_share_race_city.head()
Out[9]:
Geographic area City share_white share_black share_native_american share_asian share_hispanic
0 AL Abanda CDP 67.2 30.2 0 0 1.6
1 AL Abbeville city 54.4 41.4 0.1 1 3.1
2 AL Adamsville city 52.3 44.9 0.5 0.3 2.3
3 AL Addison town 99.1 0.1 0 0.1 0.4
4 AL Akron town 13.2 86.5 0 0 0.3
In [13]:
df_fatalities.head()
Out[13]:
id name date manner_of_death armed age gender race city state signs_of_mental_illness threat_level flee body_camera
0 3 Tim Elliot 02/01/15 shot gun 53.00 M A Shelton WA True attack Not fleeing False
1 4 Lewis Lee Lembke 02/01/15 shot gun 47.00 M W Aloha OR False attack Not fleeing False
2 5 John Paul Quintero 03/01/15 shot and Tasered unarmed 23.00 M H Wichita KS False other Not fleeing False
3 8 Matthew Hoffman 04/01/15 shot toy weapon 32.00 M W San Francisco CA True attack Not fleeing False
4 9 Michael Rodriguez 04/01/15 shot nail gun 39.00 M H Evans CO False attack Not fleeing False

Standardize the column names for joins

In [14]:
def standardize_columns(df):
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
    return df
In [15]:
df_hh_income = standardize_columns(df_hh_income)
df_hh_income.head()
Out[15]:
geographic_area city median_income
0 AL Abanda CDP 11207
1 AL Abbeville city 25615
2 AL Adamsville city 42575
3 AL Addison town 37083
4 AL Akron town 21667
In [16]:
df_pct_poverty = standardize_columns(df_pct_poverty)
df_pct_poverty.head()
Out[16]:
geographic_area city poverty_rate
0 AL Abanda CDP 78.8
1 AL Abbeville city 29.1
2 AL Adamsville city 25.5
3 AL Addison town 30.7
4 AL Akron town 42
In [17]:
df_pct_completed_hs = standardize_columns(df_pct_completed_hs)
df_pct_completed_hs.head()
Out[17]:
geographic_area city percent_completed_hs
0 AL Abanda CDP 21.2
1 AL Abbeville city 69.1
2 AL Adamsville city 78.9
3 AL Addison town 81.4
4 AL Akron town 68.6
In [18]:
df_share_race_city = standardize_columns(df_share_race_city)
df_share_race_city.head()
Out[18]:
geographic_area city share_white share_black share_native_american share_asian share_hispanic
0 AL Abanda CDP 67.2 30.2 0 0 1.6
1 AL Abbeville city 54.4 41.4 0.1 1 3.1
2 AL Adamsville city 52.3 44.9 0.5 0.3 2.3
3 AL Addison town 99.1 0.1 0 0.1 0.4
4 AL Akron town 13.2 86.5 0 0 0.3
In [19]:
df_fatalities = standardize_columns(df_fatalities)
df_fatalities.head()
Out[19]:
id name date manner_of_death armed age gender race city state signs_of_mental_illness threat_level flee body_camera
0 3 Tim Elliot 02/01/15 shot gun 53.00 M A Shelton WA True attack Not fleeing False
1 4 Lewis Lee Lembke 02/01/15 shot gun 47.00 M W Aloha OR False attack Not fleeing False
2 5 John Paul Quintero 03/01/15 shot and Tasered unarmed 23.00 M H Wichita KS False other Not fleeing False
3 8 Matthew Hoffman 04/01/15 shot toy weapon 32.00 M W San Francisco CA True attack Not fleeing False
4 9 Michael Rodriguez 04/01/15 shot nail gun 39.00 M H Evans CO False attack Not fleeing False

Now they should all have the same spelling and punctuation:

In [20]:
print(f"Column names of df_hh_income: {df_hh_income.columns.tolist()}")
print(f"Column names of pct_poverty: {df_pct_poverty.columns.tolist()}")
print(f"Column names of pct_completed: {df_pct_completed_hs.columns.tolist()}")
print(f"Column names of share_race_city: {df_share_race_city.columns.tolist()}")
print(f"Column names of fatalities: {df_fatalities.columns.tolist()}")
Column names of df_hh_income: ['geographic_area', 'city', 'median_income']
Column names of pct_poverty: ['geographic_area', 'city', 'poverty_rate']
Column names of pct_completed: ['geographic_area', 'city', 'percent_completed_hs']
Column names of share_race_city: ['geographic_area', 'city', 'share_white', 'share_black', 'share_native_american', 'share_asian', 'share_hispanic']
Column names of fatalities: ['id', 'name', 'date', 'manner_of_death', 'armed', 'age', 'gender', 'race', 'city', 'state', 'signs_of_mental_illness', 'threat_level', 'flee', 'body_camera']

Preliminary Data Exploration¶

In [22]:
#shapes of dataframes
print(f"shape of df_hh_income: {df_hh_income.shape},\nshape of pct_poverty: {df_pct_poverty.shape},\nshape of df_pct_completed: {df_pct_completed_hs.shape},\nshape of df_share_race_city: {df_share_race_city.shape},\nshape of fatalities: {df_fatalities.shape}")
shape of df_hh_income: (29322, 3),
shape of pct_poverty: (29329, 3),
shape of df_pct_completed: (29329, 3),
shape of df_share_race_city: (29268, 7),
shape of fatalities: (2535, 14)

Some of the dataframes have the same number of rows and some do not, which means there are either duplicate keys, unmatched records, or missing data. For example, the income table may not have a record for every city, or a city may appear twice in one table and not at all in another; this will be tested.
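Before merging, a quick duplicate check on the join keys helps narrow down which of these explanations applies. A minimal sketch (not part of the original run, using the dataframes loaded above):

# Hedged sketch: count repeated (geographic_area, city) keys in each city-level table
for name, frame in [('hh_income', df_hh_income),
                    ('pct_poverty', df_pct_poverty),
                    ('pct_completed_hs', df_pct_completed_hs),
                    ('share_race_city', df_share_race_city)]:
    dupes = frame.duplicated(subset=['geographic_area', 'city']).sum()
    print(f"{name}: {dupes} duplicate key pairs")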

First I will combine the tables that share the same (state, city) keys, building a master list of all unique keys across them.

In [23]:
# Step 1: Create a master list of unique (geographic_area, city) pairs. This will be used to join all the other tables that share these keys
master_ls = pd.concat([df_hh_income[['geographic_area', 'city']]
                          , df_pct_completed_hs[['geographic_area', 'city']]
                          , df_pct_poverty[['geographic_area', 'city']]
                          , df_share_race_city[['geographic_area', 'city']]]).drop_duplicates()
print(master_ls)
      geographic_area                city
0                  AL          Abanda CDP
1                  AL      Abbeville city
2                  AL     Adamsville city
3                  AL        Addison town
4                  AL          Akron town
...               ...                 ...
28243              WV   Summersville town
28256              WV     Upper Falls CDP
28469              WI         Delwood CDP
28667              WI  Lake Shangrila CDP
28828              WI       Pell Lake CDP

[29477 rows x 2 columns]

Perform left joins against the master list; the left join ensures every record in the master list is kept whether or not the joining table has a match. A merge indicator records whether each row matched, to be used later in the analysis.

In [24]:
df_hh_income_check = master_ls.merge(df_hh_income, on=['geographic_area', 'city'], how='left', indicator=True)
df_pct_completed_hs_check = master_ls.merge(df_pct_completed_hs, on=['geographic_area', 'city'], how='left', indicator=True)
df_pct_poverty_check = master_ls.merge(df_pct_poverty, on=['geographic_area', 'city'], how='left', indicator=True)
df_share_race_city_check = master_ls.merge(df_share_race_city, on=['geographic_area', 'city'], how='left', indicator=True)

Now let's look at the records that have no matching values in each table.

In [25]:
df_hh_income_missing = df_hh_income_check[df_hh_income_check['_merge'] == 'left_only'].drop(columns=['_merge'])
df_pct_completed_hs_missing = df_pct_completed_hs_check[df_pct_completed_hs_check['_merge'] == 'left_only'].drop(columns=['_merge'])
df_pct_poverty_missing = df_pct_poverty_check[df_pct_poverty_check['_merge'] == 'left_only'].drop(columns=['_merge'])
df_share_race_city_missing = df_share_race_city_check[df_share_race_city_check['_merge'] == 'left_only'].drop(columns=['_merge'])
In [26]:
print("Records missing from df_hh_income:")
print(df_hh_income_missing)
Records missing from df_hh_income:
      geographic_area                                             city  \
29322              CT                                     Milford city   
29323              GA          Athens-Clarke County unified government   
29324              GA  Augusta-Richmond County consolidated government   
29325              IN                                Indianapolis city   
29326              KS                Greeley County unified government   
...               ...                                              ...   
29472              WV                                Summersville town   
29473              WV                                  Upper Falls CDP   
29474              WI                                      Delwood CDP   
29475              WI                               Lake Shangrila CDP   
29476              WI                                    Pell Lake CDP   

      median_income  
29322           NaN  
29323           NaN  
29324           NaN  
29325           NaN  
29326           NaN  
...             ...  
29472           NaN  
29473           NaN  
29474           NaN  
29475           NaN  
29476           NaN  

[155 rows x 3 columns]
In [27]:
print("\nRecords missing from df_pct_completed_hs:")
print(df_pct_completed_hs_missing)
Records missing from df_pct_completed_hs:
      geographic_area                city percent_completed_hs
5676               HI              Hawaii                  NaN
24776              TX    C_sar Chövez CDP                  NaN
29331              AL       Kimberly town                  NaN
29332              AK        Edna Bay CDP                  NaN
29333              AK     Petersburg city                  NaN
...               ...                 ...                  ...
29472              WV   Summersville town                  NaN
29473              WV     Upper Falls CDP                  NaN
29474              WI         Delwood CDP                  NaN
29475              WI  Lake Shangrila CDP                  NaN
29476              WI       Pell Lake CDP                  NaN

[148 rows x 3 columns]
In [28]:
print("\nRecords missing from df_pct_poverty:")
print(df_pct_poverty_missing)
Records missing from df_pct_poverty:
      geographic_area                city poverty_rate
5676               HI              Hawaii          NaN
24776              TX    C_sar Chövez CDP          NaN
29331              AL       Kimberly town          NaN
29332              AK        Edna Bay CDP          NaN
29333              AK     Petersburg city          NaN
...               ...                 ...          ...
29472              WV   Summersville town          NaN
29473              WV     Upper Falls CDP          NaN
29474              WI         Delwood CDP          NaN
29475              WI  Lake Shangrila CDP          NaN
29476              WI       Pell Lake CDP          NaN

[148 rows x 3 columns]
In [29]:
print("\nRecords missing from df_share_race_city:")
print(df_share_race_city_missing)
Records missing from df_share_race_city:
      geographic_area                               city share_white  \
56                 AL                     Bon Secour CDP         NaN   
115                AL                     Cottondale CDP         NaN   
288                AL                      Kimberly city         NaN   
305                AL                        Lillian CDP         NaN   
413                AL                        Perdido CDP         NaN   
...               ...                                ...         ...   
29011              WI                     Summit village         NaN   
29103              WI                          Wiota CDP         NaN   
29265              WY                 Point of Rocks CDP         NaN   
29326              KS  Greeley County unified government         NaN   
29330              TX                   C_sar Chˆvez CDP         NaN   

      share_black share_native_american share_asian share_hispanic  
56            NaN                   NaN         NaN            NaN  
115           NaN                   NaN         NaN            NaN  
288           NaN                   NaN         NaN            NaN  
305           NaN                   NaN         NaN            NaN  
413           NaN                   NaN         NaN            NaN  
...           ...                   ...         ...            ...  
29011         NaN                   NaN         NaN            NaN  
29103         NaN                   NaN         NaN            NaN  
29265         NaN                   NaN         NaN            NaN  
29326         NaN                   NaN         NaN            NaN  
29330         NaN                   NaN         NaN            NaN  

[209 rows x 7 columns]

Let's review what percent of the data is missing to determine how critical it is to the rest of the project and decide how to handle it.

In [30]:
# Total number of unique records in the master list
total_records = len(master_ls)

# Calculate missing record counts
missing_df_hh_income_missing = len(df_hh_income_missing)
missing_df_pct_completed_hs_missing = len(df_pct_completed_hs_missing)
missing_df_pct_poverty_missing = len(df_pct_poverty_missing)
missing_df_share_race_city_missing = len(df_share_race_city_missing)


# Calculate missing percentage for each DataFrame
pct_missing_df_hh_income_missing = (missing_df_hh_income_missing / total_records) * 100
pct_missing_df_pct_completed_hs_missing = (missing_df_pct_completed_hs_missing / total_records) * 100
pct_missing_df_pct_poverty_missing = (missing_df_pct_poverty_missing / total_records) * 100
pct_missing_df_share_race_city_missing = (missing_df_share_race_city_missing / total_records) * 100

# Display results
In [32]:
# Display results
print(f"Missing % in hh_income: {pct_missing_df_hh_income_missing:.2f}% ({missing_df_hh_income_missing}/{total_records})")
print(f"Missing % in pct_completed_hs: {pct_missing_df_pct_completed_hs_missing:.2f}% ({missing_df_pct_completed_hs_missing}/{total_records})")
print(f"Missing % in pct_poverty: {pct_missing_df_pct_poverty_missing:.2f}% ({missing_df_pct_poverty_missing}/{total_records})")
print(f"Missing % in share_race: {pct_missing_df_share_race_city_missing:.2f}% ({missing_df_share_race_city_missing}/{total_records})")
Missing % in hh_income: 0.53% (155/29477)
Missing % in pct_completed_hs: 0.50% (148/29477)
Missing % in pct_poverty: 0.50% (148/29477)
Missing % in share_race: 0.71% (209/29477)

Less than 1% is missing from each table, which seems acceptable, so I will simply replace the null values with 0. Now I am going to create a single main dataframe to use in the analysis.
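Before filling, a further hedged check (not in the original run) could test whether any of the cities with missing income data actually appear in the fatalities table; census city names carry suffixes such as 'city', 'town', or 'CDP', so they are stripped for a rough match:

# Hedged sketch: rough overlap between cities missing income data and fatality locations
missing_pairs = df_hh_income_missing[['geographic_area', 'city']].copy()
missing_pairs['city_short'] = missing_pairs['city'].str.replace(
    r'\s+(city|town|village|CDP)$', '', regex=True)
fatal_pairs = df_fatalities[['state', 'city']].drop_duplicates()
overlap = missing_pairs.merge(fatal_pairs,
                              left_on=['geographic_area', 'city_short'],
                              right_on=['state', 'city'],
                              how='inner')
print(f"Fatality cities affected by missing income data: {len(overlap)}")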

In [33]:
df_main = master_ls.merge(df_hh_income, on=['geographic_area', 'city'], how='left').merge(df_pct_completed_hs, on=['geographic_area', 'city'], how='left').merge(df_pct_poverty, on=['geographic_area', 'city'], how='left').merge(df_share_race_city, on=['geographic_area', 'city'], how='left')
In [34]:
print(df_main)
# Export the DataFrame to a CSV file
df_main.to_csv('merged_output.csv', index=False)
      geographic_area                city median_income percent_completed_hs  \
0                  AL          Abanda CDP         11207                 21.2   
1                  AL      Abbeville city         25615                 69.1   
2                  AL     Adamsville city         42575                 78.9   
3                  AL        Addison town         37083                 81.4   
4                  AL          Akron town         21667                 68.6   
...               ...                 ...           ...                  ...   
29472              WV   Summersville town           NaN                  NaN   
29473              WV     Upper Falls CDP           NaN                  NaN   
29474              WI         Delwood CDP           NaN                  NaN   
29475              WI  Lake Shangrila CDP           NaN                  NaN   
29476              WI       Pell Lake CDP           NaN                  NaN   

      poverty_rate share_white share_black share_native_american share_asian  \
0             78.8        67.2        30.2                     0           0   
1             29.1        54.4        41.4                   0.1           1   
2             25.5        52.3        44.9                   0.5         0.3   
3             30.7        99.1         0.1                     0         0.1   
4               42        13.2        86.5                     0           0   
...            ...         ...         ...                   ...         ...   
29472          NaN        97.4         0.4                   0.3         0.9   
29473          NaN        96.8         1.1                   0.4         0.3   
29474          NaN        98.6         0.2                   0.4         0.2   
29475          NaN        95.1         2.2                   0.1         0.1   
29476          NaN        94.2         0.3                   0.5         0.4   

      share_hispanic  
0                1.6  
1                3.1  
2                2.3  
3                0.4  
4                0.3  
...              ...  
29472            1.4  
29473            0.4  
29474            0.2  
29475            4.8  
29476            9.1  

[29477 rows x 10 columns]
In [35]:
df_main.head()
Out[35]:
geographic_area city median_income percent_completed_hs poverty_rate share_white share_black share_native_american share_asian share_hispanic
0 AL Abanda CDP 11207 21.2 78.8 67.2 30.2 0 0 1.6
1 AL Abbeville city 25615 69.1 29.1 54.4 41.4 0.1 1 3.1
2 AL Adamsville city 42575 78.9 25.5 52.3 44.9 0.5 0.3 2.3
3 AL Addison town 37083 81.4 30.7 99.1 0.1 0 0.1 0.4
4 AL Akron town 21667 68.6 42 13.2 86.5 0 0 0.3
In [36]:
df_main.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29477 entries, 0 to 29476
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   geographic_area        29477 non-null  object
 1   city                   29477 non-null  object
 2   median_income          29271 non-null  object
 3   percent_completed_hs   29329 non-null  object
 4   poverty_rate           29329 non-null  object
 5   share_white            29268 non-null  object
 6   share_black            29268 non-null  object
 7   share_native_american  29268 non-null  object
 8   share_asian            29268 non-null  object
 9   share_hispanic         29268 non-null  object
dtypes: object(10)
memory usage: 2.2+ MB

Clean up the numeric columns with a cleaning function that strips commas and placeholder hyphens and converts the values to numbers.

In [37]:
def clean_columns(df, column_name):
    cleaned_column = pd.to_numeric(
        df[column_name]
        .astype(str)  # Ensure all values are strings
        .str.replace(',', '', regex=True)  # Remove commas
        .str.replace(r'-$', '', regex=True),  # Remove trailing hyphens
        errors='coerce'  # Convert to float, setting errors to NaN
    )
    
    return cleaned_column
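As a quick sanity check (a hedged example on made-up values, not part of the original notebook), the cleaner should strip thousands separators and coerce census placeholders such as '-' or '(X)' to NaN:

# Hedged demo of clean_columns on a toy dataframe
demo = pd.DataFrame({'median_income': ['38,500', '27000', '-', '(X)']})
print(clean_columns(demo, 'median_income'))
# Expected: 38500.0, 27000.0, NaN, NaN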
In [40]:
# List of columns to clean
columns_to_clean = [
    'share_asian', 
    'share_black', 
    'share_hispanic', 
    'share_native_american', 
    'share_white', 
    'percent_completed_hs', 
    'poverty_rate', 
    'median_income'
]
In [42]:
for column in columns_to_clean:
    df_main[column] = clean_columns(df_main, column)

df_main.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29477 entries, 0 to 29476
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   geographic_area        29477 non-null  object 
 1   city                   29477 non-null  object 
 2   median_income          27394 non-null  float64
 3   percent_completed_hs   29132 non-null  float64
 4   poverty_rate           29128 non-null  float64
 5   share_white            29248 non-null  float64
 6   share_black            29248 non-null  float64
 7   share_native_american  29248 non-null  float64
 8   share_asian            29248 non-null  float64
 9   share_hispanic         29248 non-null  float64
dtypes: float64(8), object(2)
memory usage: 2.2+ MB
In [44]:
# Dates in the raw file are stored as DD/MM/YY (e.g. 02/01/15 for 2 Jan 2015)
df_fatalities['date'] = pd.to_datetime(df_fatalities['date'], format='%d/%m/%y', errors='coerce')

df_fatalities.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2535 entries, 0 to 2534
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   id                       2535 non-null   int64         
 1   name                     2535 non-null   object        
 2   date                     2535 non-null   datetime64[ns]
 3   manner_of_death          2535 non-null   object        
 4   armed                    2526 non-null   object        
 5   age                      2458 non-null   float64       
 6   gender                   2535 non-null   object        
 7   race                     2340 non-null   object        
 8   city                     2535 non-null   object        
 9   state                    2535 non-null   object        
 10  signs_of_mental_illness  2535 non-null   bool          
 11  threat_level             2535 non-null   object        
 12  flee                     2470 non-null   object        
 13  body_camera              2535 non-null   bool          
dtypes: bool(2), datetime64[ns](1), float64(1), int64(1), object(9)
memory usage: 242.7+ KB

Chart the poverty rate in each US state: create a bar chart that ranks the states from highest to lowest poverty rate. Which state has the highest poverty rate? Which has the lowest?

In [45]:
df_main.fillna(0, inplace=True)

# Select only numeric columns for aggregation
numeric_cols = df_main.select_dtypes(include='number').columns

# Group by 'geographic_area' and calculate mean for numeric columns
grouped_df = df_main.groupby('geographic_area')[numeric_cols].mean().reset_index()

grouped_df.head()
Out[45]:
geographic_area median_income percent_completed_hs poverty_rate share_white share_black share_native_american share_asian share_hispanic
0 AK 41,738.05 79.65 18.57 45.01 0.56 45.22 1.37 2.12
1 AL 37,807.53 80.03 20.58 71.52 23.00 0.65 0.47 2.94
2 AR 33,699.45 79.36 22.79 77.87 16.18 0.75 0.47 4.24
3 AZ 34,979.84 79.04 25.21 59.80 0.95 28.53 0.72 20.10
4 CA 54,538.95 80.09 16.73 70.93 2.66 1.70 5.50 29.26
In [47]:
# Sort the DataFrame by poverty_rate in descending order (highest first)
sorted_poverty = grouped_df.sort_values(by='poverty_rate', ascending=False)

# Display the top 5 geographic areas with the highest poverty rates
print("Top 5 Geographic Areas with Highest Poverty Rates:")
print(sorted_poverty[['geographic_area', 'poverty_rate']].head())

# Display the bottom 5 geographic areas with the lowest poverty rates
print("\nBottom 5 Geographic Areas with Lowest Poverty Rates:")
print(sorted_poverty[['geographic_area', 'poverty_rate']].tail())
Top 5 Geographic Areas with Highest Poverty Rates:
   geographic_area  poverty_rate
25              MS         26.66
3               AZ         25.21
10              GA         23.48
2               AR         22.79
32              NM         22.41

Bottom 5 Geographic Areas with Lowest Poverty Rates:
   geographic_area  poverty_rate
20              MD         10.25
19              MA          9.55
6               CT          9.14
50              WY          9.06
31              NJ          8.13
In [39]:
# Loop through each numeric column and create a bar chart
for col in numeric_cols:
    plt.figure(figsize=(10, 6))
    
    # Sort the data for better visualization
    sorted_df = grouped_df.sort_values(by=col, ascending=True)
    
    plt.bar(sorted_df['geographic_area'], sorted_df[col])
    plt.title(f'{col.replace("_", " ").title()} by Geographic Area (Ascending)')
    plt.xlabel('Geographic Area')
    plt.ylabel(col.replace("_", " ").title())
    plt.xticks(rotation=45)
    plt.tight_layout()
    
    plt.show()
[Figures: one bar chart per numeric column (median income, HS completion, poverty rate, and racial shares) by state, sorted ascending]

Visualise the relationship between poverty rates and high school graduation rates: create a line chart with two y-axes to show whether poverty and high school graduation rates move together.

In [48]:
# Sort the dataframe by median income
sorted_df = grouped_df.sort_values(by='median_income', ascending=True)

# Create figure and first axis
fig, ax1 = plt.subplots(figsize=(12, 7))

# Plot Poverty Rate on the first y-axis (left)
ax1.plot(sorted_df['geographic_area'], sorted_df['poverty_rate'], color='red', marker='o', label='Poverty Rate')
ax1.set_xlabel('Geographic Area', fontsize=12)
ax1.set_ylabel('Poverty Rate (%)', color='red', fontsize=12)
ax1.tick_params(axis='y', labelcolor='red')
ax1.set_xticks(range(len(sorted_df['geographic_area'])))
ax1.set_xticklabels(sorted_df['geographic_area'], rotation=45, ha='right')

# Create a second y-axis for High School Graduation Rate (right)
ax2 = ax1.twinx()
ax2.plot(sorted_df['geographic_area'], sorted_df['percent_completed_hs'], color='blue', marker='s', label='HS Graduation Rate')
ax2.set_ylabel('High School Graduation Rate (%)', color='blue', fontsize=12)
ax2.tick_params(axis='y', labelcolor='blue')

# Invert the High School Graduation Rate axis (right y-axis)
ax2.invert_yaxis()  ## need this since there is an inverse relationship between HS grad rate and poverty rate

# Title and layout adjustments
plt.title('Poverty Rate vs High School Graduation Rate by Geographic Area', fontsize=16)
fig.tight_layout()

# Show the plot
plt.show()
[Figure: dual-axis line chart of poverty rate (left, red) and inverted HS graduation rate (right, blue) by state, ordered by median income]
In [49]:
# Set Seaborn style
sns.set(style="white", color_codes=True)

# Create jointplot with scatter and KDE
g = sns.jointplot(
    data=sorted_df,
    x='poverty_rate',
    y='percent_completed_hs',
    kind='scatter',          # Scatter plot
    color='purple',          # Color of points
    height=8,                # Size of the plot
    marginal_kws=dict(bins=15, fill=True)  # Histogram settings for marginals
)

# Add KDE contours to show density
g.plot_joint(sns.kdeplot, color='blue', levels=5)

# Customize titles and labels
g.set_axis_labels('Poverty Rate (%)', 'High School Graduation Rate (%)', fontsize=12)
plt.suptitle('Poverty Rate vs High School Graduation Rate', fontsize=16, y=1.02)

# Show the plot
plt.show()
[Figure: joint scatter plot with marginal histograms and KDE contours of poverty rate vs. HS graduation rate]
In [42]:
# Set Seaborn style
sns.set(style="whitegrid")

# Create a regression plot
plt.figure(figsize=(10, 6))
sns.regplot(
    data=sorted_df,
    x='poverty_rate',
    y='percent_completed_hs',
    scatter_kws={'color': 'red'},    # Color of scatter points
    line_kws={'color': 'blue'},      # Color of regression line
)

# Customize titles and labels
plt.title('Linear Regression: Poverty Rate vs High School Graduation Rate', fontsize=16)
plt.xlabel('Poverty Rate (%)', fontsize=12)
plt.ylabel('High School Graduation Rate (%)', fontsize=12)

# Show the plot
plt.show()
[Figure: linear regression plot of poverty rate vs. HS graduation rate]
In [50]:
# Select only the necessary columns for visualization
race_columns = ['share_white', 'share_black', 'share_hispanic', 'share_asian', 'share_native_american']
plot_df = df_main[['geographic_area'] + race_columns]

# Group by 'geographic_area' and sum or average the racial shares
# Assuming these are percentages, we'll use the mean
plot_df = plot_df.groupby('geographic_area')[race_columns].mean().reset_index()

# Sort by the largest racial group for better visual structure (optional)
plot_df = plot_df.sort_values(by='share_white', ascending=False)
In [51]:
# Set figure size
plt.figure(figsize=(14, 8))

# Create the stacked bar chart
bottom = None
colors = ['#d9d9d9', '#636363', '#fdae6b', '#9ecae1', '#bc80bd']  # Custom colors for each group

for i, race in enumerate(race_columns):
    if bottom is None:
        plt.bar(plot_df['geographic_area'], plot_df[race], label=race.replace('_', ' ').title(), color=colors[i])
        bottom = plot_df[race]
    else:
        plt.bar(plot_df['geographic_area'], plot_df[race], bottom=bottom, label=race.replace('_', ' ').title(), color=colors[i])
        bottom += plot_df[race]

# Customize labels and titles
plt.title('Racial Makeup by US State', fontsize=16)
plt.xlabel('US State', fontsize=12)
plt.ylabel('Population Share (%)', fontsize=12)

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')

# Add legend
plt.legend(title='Racial Group', bbox_to_anchor=(1.05, 1), loc='upper left')

# Adjust layout to avoid clipping
plt.tight_layout()

# Show the plot
plt.show()
[Figure: stacked bar chart of racial makeup by US state]
In [52]:
# Count the number of fatalities by race
deaths_by_race = df_fatalities['race'].value_counts(dropna=False)

# Replace NaN with 'Unknown' for visualization purposes
deaths_by_race.index = deaths_by_race.index.fillna('Unknown')

# Display to verify
print(deaths_by_race)
race
W          1201
B           618
H           423
Unknown     195
A            39
N            31
O            28
Name: count, dtype: int64
In [53]:
import matplotlib.pyplot as plt

# Prepare labels and values for the donut chart
labels = deaths_by_race.index
sizes = deaths_by_race.values
colors = ['#d9d9d9', '#636363', '#fdae6b', '#9ecae1', '#bc80bd', '#ffcc99', '#ff6666']  # Adjust as needed for categories

# Create the pie chart (which will be converted to a donut)
fig, ax = plt.subplots(figsize=(8, 8))

# Plot the pie chart
wedges, texts, autotexts = ax.pie(
    sizes,
    labels=labels,
    autopct='%1.1f%%',         # Display percentage
    startangle=140,            # Rotate to start at 140 degrees
    colors=colors[:len(labels)],
    wedgeprops={'linewidth': 1, 'edgecolor': 'white'},  # White edges between segments
    textprops={'fontsize': 12}
)

# Draw the center circle to create the donut shape
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig.gca().add_artist(centre_circle)

# Add title
plt.title('People Killed by Race', fontsize=16)

# Ensure the chart is a circle
ax.axis('equal')  

# Display the chart
plt.show()
[Figure: donut chart of people killed by race]
In [54]:
# Count the number of deaths by gender
deaths_by_gender = df_fatalities['gender'].value_counts()

# Display the result to verify
print(deaths_by_gender)
gender
M    2428
F     107
Name: count, dtype: int64
In [55]:
# Prepare labels and values
labels = deaths_by_gender.index.map({'M': 'Men', 'F': 'Women'})  # Convert 'M' and 'F' to readable labels
counts = deaths_by_gender.values
colors = ['#1f77b4', '#ff7f0e']  # Blue for men, orange for women

# Create the bar chart
plt.figure(figsize=(8, 6))
plt.bar(labels, counts, color=colors)

# Add labels and title
plt.title('Total Number of Deaths by Gender', fontsize=16)
plt.xlabel('Gender', fontsize=12)
plt.ylabel('Number of Deaths', fontsize=12)

# Display counts on top of the bars
for i, count in enumerate(counts):
    plt.text(i, count + 20, str(count), ha='center', fontsize=12)

# Display the plot
plt.tight_layout()
plt.show()
[Figure: bar chart of total deaths by gender]
In [56]:
# Create a copy of the filtered DataFrame to avoid the warning
plot_df = df_fatalities.dropna(subset=['age', 'manner_of_death', 'gender']).copy()

# Now safely map gender abbreviations without warnings
plot_df['gender'] = plot_df['gender'].map({'M': 'Men', 'F': 'Women'})
In [57]:
plt.figure(figsize=(14, 8))

# Box plot
sns.boxplot(data=plot_df, x='manner_of_death', y='age', hue='gender', palette='Set2')

# Swarm plot to show individual data points
sns.swarmplot(data=plot_df, x='manner_of_death', y='age', hue='gender', dodge=True, color='black', size=3, alpha=0.5)

plt.title('Age Distribution by Manner of Death and Gender with Individual Data Points', fontsize=16)
plt.xlabel('Manner of Death', fontsize=12)
plt.ylabel('Age', fontsize=12)
plt.xticks(rotation=45, ha='right')

# Adjust legend to avoid duplication
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles[:2], labels[:2], title='Gender', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()
C:\Users\ionne\AppData\Local\Temp\ipykernel_152656\3925807255.py:7: FutureWarning: 

Setting a gradient palette using color= is deprecated and will be removed in v0.14.0. Set `palette='dark:black'` for the same effect.

  sns.swarmplot(data=plot_df, x='manner_of_death', y='age', hue='gender', dodge=True, color='black', size=3, alpha=0.5)
[Figure: box and swarm plots of age distribution by manner of death and gender]
In [58]:
# Create a new column to classify the armed status
def classify_armed_status(weapon):
    if pd.isna(weapon) or weapon.lower() in ['unknown', 'undetermined']:
        return 'Unknown'
    elif weapon.lower() == 'unarmed':
        return 'Unarmed'
    elif 'gun' in weapon.lower() or 'rifle' in weapon.lower() or 'pistol' in weapon.lower() or 'firearm' in weapon.lower():
        return 'Armed with Gun'
    else:
        return 'Armed with Other Weapon'

# Apply the classification
df_fatalities['armed_status'] = df_fatalities['armed'].apply(classify_armed_status)

# Verify the new categories
print(df_fatalities['armed_status'].value_counts())
armed_status
Armed with Gun             1411
Armed with Other Weapon     827
Unarmed                     171
Unknown                     126
Name: count, dtype: int64
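A few spot checks (hedged, on hand-picked strings rather than the real data) show how borderline entries are bucketed; note that anything containing 'gun' counts as 'Armed with Gun':

# Hedged spot checks of the classifier on sample values
for weapon in ['nail gun', 'toy weapon', 'knife', 'undetermined', None]:
    print(f"{weapon} -> {classify_armed_status(weapon)}")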
In [59]:
# Calculate the total counts
armed_counts = df_fatalities['armed_status'].value_counts()

# Calculate percentages
armed_percentages = (armed_counts / armed_counts.sum()) * 100

# Display the percentages
print(armed_percentages)
armed_status
Armed with Gun            55.66
Armed with Other Weapon   32.62
Unarmed                    6.75
Unknown                    4.97
Name: count, dtype: float64
In [60]:
# Set up the pie chart
plt.figure(figsize=(8, 8))
colors = ['#ff9999','#66b3ff','#99ff99','#ffcc99']

# Plot pie chart
plt.pie(
    armed_percentages,
    labels=armed_percentages.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=colors,
    wedgeprops={'linewidth': 1, 'edgecolor': 'white'}
)

# Add title
plt.title('Armed Status in Police Killings', fontsize=16)

# Display the plot
plt.show()
[Figure: pie chart of armed status in police killings]
In [62]:
# Filter out entries with missing ages
df_age_clean = df_fatalities.dropna(subset=['age'])

# Total number of people with known ages
total_with_age = len(df_age_clean)

# Number of people under 25
under_25_count = df_age_clean[df_age_clean['age'] < 25].shape[0]

# Calculate the percentage of people under 25
under_25_percentage = (under_25_count / total_with_age) * 100

# Display the result
print(f"Total people with known ages: {total_with_age}")
print(f"Number of people killed under 25: {under_25_count}")
print(f"Percentage of people killed under 25: {under_25_percentage:.2f}%")
Total people with known ages: 2458
Number of people killed under 25: 450
Percentage of people killed under 25: 18.31%
In [63]:
# Remove entries with missing 'age' values
df_age_clean = df_fatalities.dropna(subset=['age'])

# Check the basic statistics for age
print(df_age_clean['age'].describe())
count   2,458.00
mean       36.61
std        13.03
min         6.00
25%        26.00
50%        34.00
75%        45.00
max        91.00
Name: age, dtype: float64
In [64]:
# Set Seaborn style for better aesthetics
sns.set(style="whitegrid")

# Create the plot
plt.figure(figsize=(12, 7))

# Plot histogram with KDE
sns.histplot(df_age_clean['age'], bins=30, kde=True, color='skyblue', edgecolor='black')

# Customize the plot with titles and labels
plt.title('Distribution of Ages of People Killed by Police', fontsize=16)
plt.xlabel('Age', fontsize=12)
plt.ylabel('Number of Deaths', fontsize=12)

# Display the plot
plt.tight_layout()
plt.show()
[Figure: histogram with KDE of ages of people killed by police]
In [65]:
# Filter out rows with missing 'age' or 'race' values
df_race_age = df_fatalities.dropna(subset=['age', 'race'])

# Check the unique races present in the dataset
print(df_race_age['race'].value_counts())

# Set Seaborn style
sns.set(style="whitegrid")

# Create KDE plots for each race using displot with col wrapping
g = sns.displot(
    data=df_race_age,
    x='age',
    col='race',
    kind='kde',
    col_wrap=3,            # Wraps plots into multiple rows after 3 plots
    fill=True,             # Fill under the KDE curve
    height=4,              # Size of each individual plot
    aspect=1.2,            # Aspect ratio of plots
    palette='muted'        # Color palette
)

# Customize titles and labels
g.set_axis_labels("Age", "Density")
g.set_titles("{col_name}")  # Sets the title of each subplot as the race name
plt.subplots_adjust(top=0.9)
g.fig.suptitle('Age Distribution of People Killed by Police by Race', fontsize=16)

# Display the plot
plt.show()
race
W    1192
B     609
H     413
A      38
N      31
O      28
Name: count, dtype: int64
C:\Users\ionne\AppData\Local\Temp\ipykernel_152656\148621240.py:11: UserWarning: Ignoring `palette` because no `hue` variable has been assigned.
  g = sns.displot(
[Figure: KDE plots of age distribution by race, one panel per race]
In [66]:
# Count the number of fatalities by race
deaths_by_race = df_fatalities['race'].value_counts(dropna=False)

# Replace NaN with 'Unknown' for clarity in the chart
deaths_by_race.index = deaths_by_race.index.fillna('Unknown')

# Display the counts to verify
print(deaths_by_race)
race
W          1201
B           618
H           423
Unknown     195
A            39
N            31
O            28
Name: count, dtype: int64
In [67]:
# Set Seaborn style for aesthetics
sns.set(style="whitegrid")

# Create the bar chart
plt.figure(figsize=(10, 6))
sns.barplot(x=deaths_by_race.index, y=deaths_by_race.values, palette='Set2')

# Add labels and title
plt.title('Total Number of People Killed by Race', fontsize=16)
plt.xlabel('Race', fontsize=12)
plt.ylabel('Number of Deaths', fontsize=12)

# Display the counts on top of each bar
for i, value in enumerate(deaths_by_race.values):
    plt.text(i, value + 5, str(value), ha='center', fontsize=10)

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')

# Display the plot
plt.tight_layout()
plt.show()
C:\Users\ionne\AppData\Local\Temp\ipykernel_152656\918359945.py:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=deaths_by_race.index, y=deaths_by_race.values, palette='Set2')
[Figure: bar chart of total number of people killed by race]
In [68]:
# Check unique values in the 'signs_of_mental_illness' column
print(df_fatalities['signs_of_mental_illness'].value_counts(dropna=False))
signs_of_mental_illness
False    1902
True      633
Name: count, dtype: int64
In [69]:
# Total number of people killed
total_killed = df_fatalities.shape[0]

# Number of people with signs of mental illness
mental_illness_count = df_fatalities['signs_of_mental_illness'].sum()

# Calculate the percentage
mental_illness_percentage = (mental_illness_count / total_killed) * 100

# Display the result
print(f"Total people killed: {total_killed}")
print(f"Number of people with signs of mental illness: {mental_illness_count}")
print(f"Percentage of people killed with mental illness: {mental_illness_percentage:.2f}%")
Total people killed: 2535
Number of people with signs of mental illness: 633
Percentage of people killed with mental illness: 24.97%
In [70]:
# Prepare data for visualization
labels = ['Signs of Mental Illness', 'No Signs of Mental Illness']
sizes = [mental_illness_count, total_killed - mental_illness_count]
colors = ['#ff9999', '#66b3ff']

# Create the pie chart
plt.figure(figsize=(8, 8))
plt.pie(
    sizes,
    labels=labels,
    autopct='%1.1f%%',
    startangle=140,
    colors=colors,
    wedgeprops={'linewidth': 1, 'edgecolor': 'white'}
)

# Add title
plt.title('Percentage of People Killed by Police with Signs of Mental Illness', fontsize=16)

# Display the plot
plt.show()
[Figure: pie chart of share of people killed showing signs of mental illness]
In [71]:
# Get the top 10 cities with the highest number of police killings
top_cities = df_fatalities['city'].value_counts().head(10).index.tolist()

# Filter the DataFrame to include only these top 10 cities
df_top_cities = df_fatalities[df_fatalities['city'].isin(top_cities)]

# Display to verify
print(df_top_cities[['city', 'race']].head())
           city race
12     Columbus    B
40  Los Angeles    H
48    St. Louis    B
52       Austin    W
64      Phoenix    N
In [72]:
# Group by 'city' and 'race' to count police killings by race in each city
killings_by_city_race = df_top_cities.groupby(['city', 'race']).size().unstack(fill_value=0)

# Display the number of killings by race for each city
print(killings_by_city_race)
race         A   B   H  N  O   W
city                            
Austin       0   3   2  0  0  13
Chicago      0  21   1  0  0   2
Columbus     0  11   0  0  1   6
Houston      1  15   6  1  0   3
Las Vegas    0   3   5  0  0   9
Los Angeles  1  10  19  0  0   6
Miami        0   8   6  0  0   3
Phoenix      0   2  11  3  0  12
San Antonio  0   3  12  0  0   3
St. Louis    0  11   0  0  0   3
In [73]:
# Calculate the percentage of killings by race within each city
killings_by_city_race_percentage = killings_by_city_race.div(killings_by_city_race.sum(axis=1), axis=0) * 100

# Display the percentage table
print(killings_by_city_race_percentage)
race           A     B     H     N    O     W
city                                         
Austin      0.00 16.67 11.11  0.00 0.00 72.22
Chicago     0.00 87.50  4.17  0.00 0.00  8.33
Columbus    0.00 61.11  0.00  0.00 5.56 33.33
Houston     3.85 57.69 23.08  3.85 0.00 11.54
Las Vegas   0.00 17.65 29.41  0.00 0.00 52.94
Los Angeles 2.78 27.78 52.78  0.00 0.00 16.67
Miami       0.00 47.06 35.29  0.00 0.00 17.65
Phoenix     0.00  7.14 39.29 10.71 0.00 42.86
San Antonio 0.00 16.67 66.67  0.00 0.00 16.67
St. Louis   0.00 78.57  0.00  0.00 0.00 21.43
In [74]:
# Set Seaborn style for better aesthetics
sns.set(style="whitegrid")

# Plot the stacked bar chart for racial distribution of police killings
killings_by_city_race_percentage.plot(
    kind='bar',
    stacked=True,
    figsize=(14, 8),
    colormap='Set2'
)

# Add titles and labels
plt.title('Racial Distribution of Police Killings in Top 10 Cities', fontsize=16)
plt.xlabel('City', fontsize=12)
plt.ylabel('Percentage of Police Killings (%)', fontsize=12)
plt.xticks(rotation=45, ha='right')

# Add legend
plt.legend(title='Race', bbox_to_anchor=(1.05, 1), loc='upper left')

# Display the plot
plt.tight_layout()
plt.show()
[Figure: stacked bar chart of racial distribution of police killings in the top 10 cities]
In [76]:
# Group the data by state to count police killings
killings_by_state = df_fatalities['state'].value_counts().reset_index()
killings_by_state.columns = ['state', 'num_killed']

# Display the aggregated data
print(killings_by_state.head())

# Split the string on ',' and keep only the second part (state abbreviation)
killings_by_state['state'] = killings_by_state['state'].apply(lambda x: x.split(',')[-1].strip())
# Remove 'DC' to avoid rendering issues
killings_by_state = killings_by_state[killings_by_state['state'] != 'DC']

# Verify the cleaned state abbreviations
print(killings_by_state['state'].unique())
  state  num_killed
0    CA         424
1    TX         225
2    FL         154
3    AZ         118
4    OH          79
['CA' 'TX' 'FL' 'AZ' 'OH' 'OK' 'CO' 'GA' 'NC' 'MO' 'WA' 'IL' 'TN' 'LA'
 'NM' 'PA' 'AL' 'VA' 'NY' 'SC' 'WI' 'KY' 'IN' 'NV' 'MD' 'OR' 'MI' 'NJ'
 'MN' 'WV' 'AR' 'KS' 'MS' 'UT' 'MA' 'ID' 'NE' 'AK' 'ME' 'IA' 'MT' 'HI'
 'SD' 'CT' 'WY' 'DE' 'NH' 'ND' 'VT' 'RI']
In [77]:
print(killings_by_state)
   state  num_killed
0     CA         424
1     TX         225
2     FL         154
3     AZ         118
4     OH          79
5     OK          78
6     CO          74
7     GA          70
8     NC          69
9     MO          64
10    WA          62
11    IL          62
12    TN          59
13    LA          57
14    NM          51
15    PA          51
16    AL          50
17    VA          47
18    NY          45
19    SC          44
20    WI          43
21    KY          43
22    IN          43
23    NV          42
24    MD          38
25    OR          38
26    MI          37
27    NJ          35
28    MN          32
29    WV          27
30    AR          26
31    KS          24
32    MS          23
33    UT          23
34    MA          22
35    ID          17
36    NE          15
37    AK          15
38    ME          13
39    IA          12
40    MT          11
41    HI          11
43    SD          10
44    CT           9
45    WY           8
46    DE           8
47    NH           7
48    ND           4
49    VT           3
50    RI           2
In [78]:
# Create the choropleth map with cleaned state codes
fig = px.choropleth(
    killings_by_state,
    locations='state',                # Use cleaned two-letter state abbreviations
    locationmode='USA-states',        # Recognize U.S. state abbreviations
    color='num_killed',               # Color based on the number of killings
    color_continuous_scale='Reds',    # Use a red gradient for intensity
    scope='usa',                      # Focus on U.S. map
    labels={'num_killed': 'Number of Killings'},
    title='Police Killings by U.S. State'
)
import plotly.io as pio

# Set the default renderer to 'iframe' so the map displays inside Jupyter
pio.renderers.default = 'iframe'

# Display the figure
fig.show()



fig.write_html("police_killings_map.html")
In [79]:
# Group by month and count the number of police killings
killings_over_time = df_fatalities.groupby(df_fatalities['date'].dt.to_period('M')).size().reset_index(name='num_killed')

# Convert 'date' back to datetime for plotting
killings_over_time['date'] = killings_over_time['date'].dt.to_timestamp()

# Display the aggregated data
print(killings_over_time.head())
        date  num_killed
0 2015-01-01          70
1 2015-02-01          78
2 2015-03-01          87
3 2015-04-01          88
4 2015-05-01          85
In [80]:
# Group by year and count the number of police killings
killings_by_year = df_fatalities.groupby(df_fatalities['date'].dt.year).size().reset_index(name='num_killed')

# Rename columns for clarity
killings_by_year.columns = ['year', 'num_killed']

# Display the yearly data
print(killings_by_year)
   year  num_killed
0  2015         991
1  2016         963
2  2017         581
In [81]:
# Line plot for monthly trends
fig = px.line(
    killings_over_time,
    x='date',
    y='num_killed',
    title='Number of Police Killings Over Time (Monthly)',
    labels={'num_killed': 'Number of Killings', 'date': 'Date'},
    markers=True
)

# Customize the layout
fig.update_layout(xaxis_title='Date', yaxis_title='Number of Police Killings', template='plotly_white')

# Display the plot
fig.show()