Introduction¶
Since Jan. 1, 2015, The Washington Post has been compiling a database of every fatal shooting in the US by a police officer in the line of duty.
Import Statements¶
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
Notebook Presentation¶
pd.options.display.float_format = '{:,.2f}'.format
Load the Data¶
df_fatalities = pd.read_csv('Deaths_by_Police_US.csv', encoding="windows-1252")
df_hh_income = pd.read_csv('Median_Household_Income_2015.csv', encoding="windows-1252")
df_hh_income.head()
Geographic Area | City | Median Income | |
---|---|---|---|
0 | AL | Abanda CDP | 11207 |
1 | AL | Abbeville city | 25615 |
2 | AL | Adamsville city | 42575 |
3 | AL | Addison town | 37083 |
4 | AL | Akron town | 21667 |
df_pct_poverty = pd.read_csv('Pct_People_Below_Poverty_Level.csv', encoding="windows-1252")
df_pct_poverty.head()
Geographic Area | City | poverty_rate | |
---|---|---|---|
0 | AL | Abanda CDP | 78.8 |
1 | AL | Abbeville city | 29.1 |
2 | AL | Adamsville city | 25.5 |
3 | AL | Addison town | 30.7 |
4 | AL | Akron town | 42 |
df_pct_completed_hs = pd.read_csv('Pct_Over_25_Completed_High_School.csv', encoding="windows-1252")
df_pct_completed_hs.head()
Geographic Area | City | percent_completed_hs | |
---|---|---|---|
0 | AL | Abanda CDP | 21.2 |
1 | AL | Abbeville city | 69.1 |
2 | AL | Adamsville city | 78.9 |
3 | AL | Addison town | 81.4 |
4 | AL | Akron town | 68.6 |
df_share_race_city = pd.read_csv('Share_of_Race_By_City.csv', encoding="windows-1252")
df_share_race_city.head()
Geographic area | City | share_white | share_black | share_native_american | share_asian | share_hispanic | |
---|---|---|---|---|---|---|---|
0 | AL | Abanda CDP | 67.2 | 30.2 | 0 | 0 | 1.6 |
1 | AL | Abbeville city | 54.4 | 41.4 | 0.1 | 1 | 3.1 |
2 | AL | Adamsville city | 52.3 | 44.9 | 0.5 | 0.3 | 2.3 |
3 | AL | Addison town | 99.1 | 0.1 | 0 | 0.1 | 0.4 |
4 | AL | Akron town | 13.2 | 86.5 | 0 | 0 | 0.3 |
df_fatalities.head()
id | name | date | manner_of_death | armed | age | gender | race | city | state | signs_of_mental_illness | threat_level | flee | body_camera | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | Tim Elliot | 02/01/15 | shot | gun | 53.00 | M | A | Shelton | WA | True | attack | Not fleeing | False |
1 | 4 | Lewis Lee Lembke | 02/01/15 | shot | gun | 47.00 | M | W | Aloha | OR | False | attack | Not fleeing | False |
2 | 5 | John Paul Quintero | 03/01/15 | shot and Tasered | unarmed | 23.00 | M | H | Wichita | KS | False | other | Not fleeing | False |
3 | 8 | Matthew Hoffman | 04/01/15 | shot | toy weapon | 32.00 | M | W | San Francisco | CA | True | attack | Not fleeing | False |
4 | 9 | Michael Rodriguez | 04/01/15 | shot | nail gun | 39.00 | M | H | Evans | CO | False | attack | Not fleeing | False |
Standardize the column names for joins
def standardize_columns(df):
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
return df
df_hh_income = standardize_columns(df_hh_income)
df_hh_income.head()
geographic_area | city | median_income | |
---|---|---|---|
0 | AL | Abanda CDP | 11207 |
1 | AL | Abbeville city | 25615 |
2 | AL | Adamsville city | 42575 |
3 | AL | Addison town | 37083 |
4 | AL | Akron town | 21667 |
df_pct_poverty = standardize_columns(df_pct_poverty)
df_pct_poverty.head()
geographic_area | city | poverty_rate | |
---|---|---|---|
0 | AL | Abanda CDP | 78.8 |
1 | AL | Abbeville city | 29.1 |
2 | AL | Adamsville city | 25.5 |
3 | AL | Addison town | 30.7 |
4 | AL | Akron town | 42 |
df_pct_completed_hs = standardize_columns(df_pct_completed_hs)
df_pct_completed_hs.head()
geographic_area | city | percent_completed_hs | |
---|---|---|---|
0 | AL | Abanda CDP | 21.2 |
1 | AL | Abbeville city | 69.1 |
2 | AL | Adamsville city | 78.9 |
3 | AL | Addison town | 81.4 |
4 | AL | Akron town | 68.6 |
df_share_race_city = standardize_columns(df_share_race_city)
df_share_race_city.head()
geographic_area | city | share_white | share_black | share_native_american | share_asian | share_hispanic | |
---|---|---|---|---|---|---|---|
0 | AL | Abanda CDP | 67.2 | 30.2 | 0 | 0 | 1.6 |
1 | AL | Abbeville city | 54.4 | 41.4 | 0.1 | 1 | 3.1 |
2 | AL | Adamsville city | 52.3 | 44.9 | 0.5 | 0.3 | 2.3 |
3 | AL | Addison town | 99.1 | 0.1 | 0 | 0.1 | 0.4 |
4 | AL | Akron town | 13.2 | 86.5 | 0 | 0 | 0.3 |
df_fatalities = standardize_columns(df_fatalities)
df_fatalities.head()
id | name | date | manner_of_death | armed | age | gender | race | city | state | signs_of_mental_illness | threat_level | flee | body_camera | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | Tim Elliot | 02/01/15 | shot | gun | 53.00 | M | A | Shelton | WA | True | attack | Not fleeing | False |
1 | 4 | Lewis Lee Lembke | 02/01/15 | shot | gun | 47.00 | M | W | Aloha | OR | False | attack | Not fleeing | False |
2 | 5 | John Paul Quintero | 03/01/15 | shot and Tasered | unarmed | 23.00 | M | H | Wichita | KS | False | other | Not fleeing | False |
3 | 8 | Matthew Hoffman | 04/01/15 | shot | toy weapon | 32.00 | M | W | San Francisco | CA | True | attack | Not fleeing | False |
4 | 9 | Michael Rodriguez | 04/01/15 | shot | nail gun | 39.00 | M | H | Evans | CO | False | attack | Not fleeing | False |
Now all of the tables should have the same column-name spelling and punctuation:
print(f"Column names of df_hh_income: {df_hh_income.columns.tolist()}")
print(f"Column names of pct_poverty: {df_pct_poverty.columns.tolist()}")
print(f"Column names of pct_completed: {df_pct_completed_hs.columns.tolist()}")
print(f"Column names of share_race_city: {df_share_race_city.columns.tolist()}")
print(f"Column names of fatalities: {df_fatalities.columns.tolist()}")
Column names of df_hh_income: ['geographic_area', 'city', 'median_income']
Column names of pct_poverty: ['geographic_area', 'city', 'poverty_rate']
Column names of pct_completed: ['geographic_area', 'city', 'percent_completed_hs']
Column names of share_race_city: ['geographic_area', 'city', 'share_white', 'share_black', 'share_native_american', 'share_asian', 'share_hispanic']
Column names of fatalities: ['id', 'name', 'date', 'manner_of_death', 'armed', 'age', 'gender', 'race', 'city', 'state', 'signs_of_mental_illness', 'threat_level', 'flee', 'body_camera']
Preliminary Data Exploration¶
# Shapes of the dataframes
print(f"shape of df_hh_income: {df_hh_income.shape}")
print(f"shape of pct_poverty: {df_pct_poverty.shape}")
print(f"shape of df_pct_completed: {df_pct_completed_hs.shape}")
print(f"shape of df_share_race_city: {df_share_race_city.shape}")
print(f"shape of fatalities: {df_fatalities.shape}")
shape of df_hh_income: (29322, 3)
shape of pct_poverty: (29329, 3)
shape of df_pct_completed: (29329, 3)
shape of df_share_race_city: (29268, 7)
shape of fatalities: (2535, 14)
The row counts differ across the tables, which means there are duplicates, unmatched records, or missing data. For example, the income table may not have a record for every city, or a city may appear in one table but not another; this will be tested below.
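As a quick first test for duplicates, I can count repeated (geographic_area, city) keys in each census table (a minimal sketch using the standardized dataframes above):
# Count duplicated (geographic_area, city) keys in each census table
for name, df in [('hh_income', df_hh_income), ('pct_poverty', df_pct_poverty),
                 ('pct_completed_hs', df_pct_completed_hs), ('share_race_city', df_share_race_city)]:
    n_dupes = df.duplicated(subset=['geographic_area', 'city']).sum()
    print(f"Duplicate city keys in {name}: {n_dupes}")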
First I will merge all the tables that share the same (geographic_area, city) key, working from the largest to the smallest.
# Step 1: Create a master list of unique (geographic_area, city) pairs. This will be used to join all the other tables that share this key
master_ls = pd.concat([df_hh_income[['geographic_area', 'city']]
, df_pct_completed_hs[['geographic_area', 'city']]
, df_pct_poverty[['geographic_area', 'city']]
, df_share_race_city[['geographic_area', 'city']]]).drop_duplicates()
print(master_ls)
      geographic_area                city
0                  AL          Abanda CDP
1                  AL      Abbeville city
2                  AL     Adamsville city
3                  AL        Addison town
4                  AL          Akron town
...               ...                 ...
28243              WV   Summersville town
28256              WV     Upper Falls CDP
28469              WI         Delwood CDP
28667              WI  Lake Shangrila CDP
28828              WI       Pell Lake CDP

[29477 rows x 2 columns]
Perform left joins against the master list; the left join ensures every record in the master list is kept even when the joining table has no match. The merge indicator records whether each row matched, to be used later in the analysis.
df_hh_income_check = master_ls.merge(df_hh_income, on=['geographic_area', 'city'], how='left', indicator=True)
df_pct_completed_hs_check = master_ls.merge(df_pct_completed_hs, on=['geographic_area', 'city'], how='left', indicator=True)
df_pct_poverty_check = master_ls.merge(df_pct_poverty, on=['geographic_area', 'city'], how='left', indicator=True)
df_share_race_city_check = master_ls.merge(df_share_race_city, on=['geographic_area', 'city'], how='left', indicator=True)
Now let's look at the records that have no matching values.
df_hh_income_missing = df_hh_income_check[df_hh_income_check['_merge'] == 'left_only'].drop(columns=['_merge'])
df_pct_completed_hs_missing = df_pct_completed_hs_check[df_pct_completed_hs_check['_merge'] == 'left_only'].drop(columns=['_merge'])
df_pct_poverty_missing = df_pct_poverty_check[df_pct_poverty_check['_merge'] == 'left_only'].drop(columns=['_merge'])
df_share_race_city_missing = df_share_race_city_check[df_share_race_city_check['_merge'] == 'left_only'].drop(columns=['_merge'])
print("Records missing from df_hh_income:")
print(df_hh_income_missing)
Records missing from df_hh_income:
      geographic_area                                              city  median_income
29322              CT                                      Milford city            NaN
29323              GA           Athens-Clarke County unified government            NaN
29324              GA  Augusta-Richmond County consolidated government             NaN
29325              IN                                 Indianapolis city            NaN
29326              KS                 Greeley County unified government            NaN
...               ...                                               ...            ...
29472              WV                                 Summersville town            NaN
29473              WV                                   Upper Falls CDP            NaN
29474              WI                                       Delwood CDP            NaN
29475              WI                                Lake Shangrila CDP            NaN
29476              WI                                     Pell Lake CDP            NaN

[155 rows x 3 columns]
print("\nRecords missing from df_pct_completed_hs:")
print(df_pct_completed_hs_missing)
Records missing from df_pct_completed_hs:
      geographic_area                city  percent_completed_hs
5676               HI              Hawaii                   NaN
24776              TX    C_sar Chövez CDP                   NaN
29331              AL       Kimberly town                   NaN
29332              AK        Edna Bay CDP                   NaN
29333              AK     Petersburg city                   NaN
...               ...                 ...                   ...
29472              WV   Summersville town                   NaN
29473              WV     Upper Falls CDP                   NaN
29474              WI         Delwood CDP                   NaN
29475              WI  Lake Shangrila CDP                   NaN
29476              WI       Pell Lake CDP                   NaN

[148 rows x 3 columns]
print("\nRecords missing from df_pct_poverty:")
print(df_pct_poverty_missing)
Records missing from df_pct_poverty:
      geographic_area                city  poverty_rate
5676               HI              Hawaii           NaN
24776              TX    C_sar Chövez CDP           NaN
29331              AL       Kimberly town           NaN
29332              AK        Edna Bay CDP           NaN
29333              AK     Petersburg city           NaN
...               ...                 ...           ...
29472              WV   Summersville town           NaN
29473              WV     Upper Falls CDP           NaN
29474              WI         Delwood CDP           NaN
29475              WI  Lake Shangrila CDP           NaN
29476              WI       Pell Lake CDP           NaN

[148 rows x 3 columns]
print("\nRecords missing from df_share_race_city:")
print(df_share_race_city_missing)
Records missing from df_share_race_city:
      geographic_area                               city  share_white  share_black  share_native_american  share_asian  share_hispanic
56                 AL                     Bon Secour CDP          NaN          NaN                    NaN          NaN             NaN
115                AL                     Cottondale CDP          NaN          NaN                    NaN          NaN             NaN
288                AL                      Kimberly city          NaN          NaN                    NaN          NaN             NaN
305                AL                        Lillian CDP          NaN          NaN                    NaN          NaN             NaN
413                AL                        Perdido CDP          NaN          NaN                    NaN          NaN             NaN
...               ...                                ...          ...          ...                    ...          ...             ...
29011              WI                     Summit village          NaN          NaN                    NaN          NaN             NaN
29103              WI                          Wiota CDP          NaN          NaN                    NaN          NaN             NaN
29265              WY                 Point of Rocks CDP          NaN          NaN                    NaN          NaN             NaN
29326              KS  Greeley County unified government          NaN          NaN                    NaN          NaN             NaN
29330              TX                   C_sar Chˆvez CDP          NaN          NaN                    NaN          NaN             NaN

[209 rows x 7 columns]
Let's review what percent of the data is null to determine how critical the gaps are to the rest of the project and decide how to handle them.
# Total number of unique records in the master list
total_records = len(master_ls)
# Calculate missing record counts
missing_df_hh_income_missing = len(df_hh_income_missing)
missing_df_pct_completed_hs_missing = len(df_pct_completed_hs_missing)
missing_df_pct_poverty_missing = len(df_pct_poverty_missing)
missing_df_share_race_city_missing = len(df_share_race_city_missing)
# Calculate missing percentage for each DataFrame
pct_missing_df_hh_income_missing = (missing_df_hh_income_missing / total_records) * 100
pct_missing_df_pct_completed_hs_missing = (missing_df_pct_completed_hs_missing / total_records) * 100
pct_missing_df_pct_poverty_missing = (missing_df_pct_poverty_missing / total_records) * 100
pct_missing_df_share_race_city_missing = (missing_df_share_race_city_missing / total_records) * 100
# Display results
print(f"Missing % in hh_income: {pct_missing_df_hh_income_missing:.2f}% ({missing_df_hh_income_missing}/{total_records})")
print(f"Missing % in pct_completed_hs: {pct_missing_df_pct_completed_hs_missing:.2f}% ({missing_df_pct_completed_hs_missing}/{total_records})")
print(f"Missing % in pct_poverty: {pct_missing_df_pct_poverty_missing:.2f}% ({missing_df_pct_poverty_missing}/{total_records})")
print(f"Missing % in share_race: {pct_missing_df_share_race_city_missing:.2f}% ({missing_df_share_race_city_missing}/{total_records})")
Missing % in hh_income: 0.53% (155/29477)
Missing % in pct_completed_hs: 0.50% (148/29477)
Missing % in pct_poverty: 0.50% (148/29477)
Missing % in share_race: 0.71% (209/29477)
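The same percentages can also be read straight off the merge indicator, for example (a sketch using the _check dataframes created above):
# Share of master-list records that did or did not find a match in df_hh_income
print(df_hh_income_check['_merge'].value_counts(normalize=True) * 100)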
Less than 1% is missing in each table, which seems acceptable, so I will simply replace the null values with 0s. Next I will build one main dataframe to use in the analysis.
df_main = (master_ls
           .merge(df_hh_income, on=['geographic_area', 'city'], how='left')
           .merge(df_pct_completed_hs, on=['geographic_area', 'city'], how='left')
           .merge(df_pct_poverty, on=['geographic_area', 'city'], how='left')
           .merge(df_share_race_city, on=['geographic_area', 'city'], how='left'))
print(df_main)
# Export the DataFrame to a CSV file
df_main.to_csv('merged_output.csv', index=False)
      geographic_area                city median_income percent_completed_hs poverty_rate share_white share_black share_native_american share_asian share_hispanic
0                  AL          Abanda CDP         11207                 21.2         78.8        67.2        30.2                     0           0            1.6
1                  AL      Abbeville city         25615                 69.1         29.1        54.4        41.4                   0.1           1            3.1
2                  AL     Adamsville city         42575                 78.9         25.5        52.3        44.9                   0.5         0.3            2.3
3                  AL        Addison town         37083                 81.4         30.7        99.1         0.1                     0         0.1            0.4
4                  AL          Akron town         21667                 68.6           42        13.2        86.5                     0           0            0.3
...               ...                 ...           ...                  ...          ...         ...         ...                   ...         ...            ...
29472              WV   Summersville town           NaN                  NaN          NaN        97.4         0.4                   0.3         0.9            1.4
29473              WV     Upper Falls CDP           NaN                  NaN          NaN        96.8         1.1                   0.4         0.3            0.4
29474              WI         Delwood CDP           NaN                  NaN          NaN        98.6         0.2                   0.4         0.2            0.2
29475              WI  Lake Shangrila CDP           NaN                  NaN          NaN        95.1         2.2                   0.1         0.1            4.8
29476              WI       Pell Lake CDP           NaN                  NaN          NaN        94.2         0.3                   0.5         0.4            9.1

[29477 rows x 10 columns]
df_main.head()
geographic_area | city | median_income | percent_completed_hs | poverty_rate | share_white | share_black | share_native_american | share_asian | share_hispanic | |
---|---|---|---|---|---|---|---|---|---|---|
0 | AL | Abanda CDP | 11207 | 21.2 | 78.8 | 67.2 | 30.2 | 0 | 0 | 1.6 |
1 | AL | Abbeville city | 25615 | 69.1 | 29.1 | 54.4 | 41.4 | 0.1 | 1 | 3.1 |
2 | AL | Adamsville city | 42575 | 78.9 | 25.5 | 52.3 | 44.9 | 0.5 | 0.3 | 2.3 |
3 | AL | Addison town | 37083 | 81.4 | 30.7 | 99.1 | 0.1 | 0 | 0.1 | 0.4 |
4 | AL | Akron town | 21667 | 68.6 | 42 | 13.2 | 86.5 | 0 | 0 | 0.3 |
df_main.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29477 entries, 0 to 29476
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   geographic_area        29477 non-null  object
 1   city                   29477 non-null  object
 2   median_income          29271 non-null  object
 3   percent_completed_hs   29329 non-null  object
 4   poverty_rate           29329 non-null  object
 5   share_white            29268 non-null  object
 6   share_black            29268 non-null  object
 7   share_native_american  29268 non-null  object
 8   share_asian            29268 non-null  object
 9   share_hispanic         29268 non-null  object
dtypes: object(10)
memory usage: 2.2+ MB
Clean up the numeric columns with a helper function that strips stray characters and converts the values to numbers.
def clean_columns(df, column_name):
cleaned_column = pd.to_numeric(
df[column_name]
.astype(str) # Ensure all values are strings
.str.replace(',', '', regex=True) # Remove commas
.str.replace(r'-$', '', regex=True), # Remove trailing hyphens
errors='coerce' # Convert to float, setting errors to NaN
)
return cleaned_column
# List of columns to clean
columns_to_clean = [
'share_asian',
'share_black',
'share_hispanic',
'share_native_american',
'share_white',
'percent_completed_hs',
'poverty_rate',
'median_income'
]
for column in columns_to_clean:
df_main[column] = clean_columns(df_main, column)
df_main.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29477 entries, 0 to 29476
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   geographic_area        29477 non-null  object
 1   city                   29477 non-null  object
 2   median_income          27394 non-null  float64
 3   percent_completed_hs   29132 non-null  float64
 4   poverty_rate           29128 non-null  float64
 5   share_white            29248 non-null  float64
 6   share_black            29248 non-null  float64
 7   share_native_american  29248 non-null  float64
 8   share_asian            29248 non-null  float64
 9   share_hispanic         29248 non-null  float64
dtypes: float64(8), object(2)
memory usage: 2.2+ MB
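Comparing the non-null counts before and after cleaning shows that some non-numeric placeholders (for example trailing-hyphen entries) were coerced to NaN. A quick count of the remaining NaNs per column (a minimal sketch reusing columns_to_clean):
# How many values per column are NaN after the numeric conversion
print(df_main[columns_to_clean].isna().sum())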
# Parse the date column, coercing any values that don't match the expected format to NaT
df_fatalities['date'] = pd.to_datetime(df_fatalities['date'], format='%Y-%m-%d', errors='coerce')
df_fatalities.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2535 entries, 0 to 2534
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   id                       2535 non-null   int64
 1   name                     2535 non-null   object
 2   date                     2535 non-null   datetime64[ns]
 3   manner_of_death          2535 non-null   object
 4   armed                    2526 non-null   object
 5   age                      2458 non-null   float64
 6   gender                   2535 non-null   object
 7   race                     2340 non-null   object
 8   city                     2535 non-null   object
 9   state                    2535 non-null   object
 10  signs_of_mental_illness  2535 non-null   bool
 11  threat_level             2535 non-null   object
 12  flee                     2470 non-null   object
 13  body_camera              2535 non-null   bool
dtypes: bool(2), datetime64[ns](1), float64(1), int64(1), object(9)
memory usage: 242.7+ KB
Chart the poverty rate in each US state: create a bar chart that ranks the poverty rate from highest to lowest by state. Which state has the highest poverty rate? Which state has the lowest?
df_main.fillna(0, inplace=True)
# Select only numeric columns for aggregation
numeric_cols = df_main.select_dtypes(include='number').columns
# Group by 'geographic_area' and calculate mean for numeric columns
grouped_df = df_main.groupby('geographic_area')[numeric_cols].mean().reset_index()
grouped_df.head()
geographic_area | median_income | percent_completed_hs | poverty_rate | share_white | share_black | share_native_american | share_asian | share_hispanic | |
---|---|---|---|---|---|---|---|---|---|
0 | AK | 41,738.05 | 79.65 | 18.57 | 45.01 | 0.56 | 45.22 | 1.37 | 2.12 |
1 | AL | 37,807.53 | 80.03 | 20.58 | 71.52 | 23.00 | 0.65 | 0.47 | 2.94 |
2 | AR | 33,699.45 | 79.36 | 22.79 | 77.87 | 16.18 | 0.75 | 0.47 | 4.24 |
3 | AZ | 34,979.84 | 79.04 | 25.21 | 59.80 | 0.95 | 28.53 | 0.72 | 20.10 |
4 | CA | 54,538.95 | 80.09 | 16.73 | 70.93 | 2.66 | 1.70 | 5.50 | 29.26 |
# Sort the DataFrame by poverty_rate in descending order (highest first)
sorted_poverty = grouped_df.sort_values(by='poverty_rate', ascending=False)
# Display the top 5 geographic areas with the highest poverty rates
print("Top 5 Geographic Areas with Highest Poverty Rates:")
print(sorted_poverty[['geographic_area', 'poverty_rate']].head())
# Display the bottom 5 geographic areas with the lowest poverty rates
print("\nBottom 5 Geographic Areas with Lowest Poverty Rates:")
print(sorted_poverty[['geographic_area', 'poverty_rate']].tail())
Top 5 Geographic Areas with Highest Poverty Rates:
   geographic_area  poverty_rate
25              MS         26.66
3               AZ         25.21
10              GA         23.48
2               AR         22.79
32              NM         22.41

Bottom 5 Geographic Areas with Lowest Poverty Rates:
   geographic_area  poverty_rate
20              MD         10.25
19              MA          9.55
6               CT          9.14
50              WY          9.06
31              NJ          8.13
# Loop through each numeric column and create a bar chart
for col in numeric_cols:
plt.figure(figsize=(10, 6))
# Sort the data for better visualization
sorted_df = grouped_df.sort_values(by=col, ascending=True)
plt.bar(sorted_df['geographic_area'], sorted_df[col])
plt.title(f'{col.replace("_", " ").title()} by Geographic Area (Ascending)')
plt.xlabel('Geographic Area')
plt.ylabel(col.replace("_", " ").title())
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
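For the specific question above, a single chart that ranks states by poverty rate from highest to lowest makes the answer easier to read (a sketch reusing grouped_df from above):
# Rank states by mean poverty rate, highest first
ranked = grouped_df.sort_values(by='poverty_rate', ascending=False)
plt.figure(figsize=(12, 6))
plt.bar(ranked['geographic_area'], ranked['poverty_rate'], color='firebrick')
plt.title('Poverty Rate by US State (Highest to Lowest)')
plt.xlabel('State')
plt.ylabel('Poverty Rate (%)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()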
Visualise the relationship between poverty rates and high school graduation rates: create a line chart with two y-axes to show whether poverty and high school graduation rates move together.
# Sort the dataframe by median income
sorted_df = grouped_df.sort_values(by='median_income', ascending=True)
# Create figure and first axis
fig, ax1 = plt.subplots(figsize=(12, 7))
# Plot Poverty Rate on the first y-axis (left)
ax1.plot(sorted_df['geographic_area'], sorted_df['poverty_rate'], color='red', marker='o', label='Poverty Rate')
ax1.set_xlabel('Geographic Area', fontsize=12)
ax1.set_ylabel('Poverty Rate (%)', color='red', fontsize=12)
ax1.tick_params(axis='y', labelcolor='red')
ax1.set_xticks(range(len(sorted_df['geographic_area'])))
ax1.set_xticklabels(sorted_df['geographic_area'], rotation=45, ha='right')
# Create a second y-axis for High School Graduation Rate (right)
ax2 = ax1.twinx()
ax2.plot(sorted_df['geographic_area'], sorted_df['percent_completed_hs'], color='blue', marker='s', label='HS Graduation Rate')
ax2.set_ylabel('High School Graduation Rate (%)', color='blue', fontsize=12)
ax2.tick_params(axis='y', labelcolor='blue')
# Invert the High School Graduation Rate axis (right y-axis)
ax2.invert_yaxis() ## need this since there is an inverse relationship between HS grad rate and poverty rate
# Title and layout adjustments
plt.title('Poverty Rate vs High School Graduation Rate by Geographic Area', fontsize=16)
fig.tight_layout()
# Show the plot
plt.show()
# Set Seaborn style
sns.set(style="white", color_codes=True)
# Create jointplot with scatter and KDE
g = sns.jointplot(
data=sorted_df,
x='poverty_rate',
y='percent_completed_hs',
kind='scatter', # Scatter plot
color='purple', # Color of points
height=8, # Size of the plot
marginal_kws=dict(bins=15, fill=True) # Histogram settings for marginals
)
# Add KDE contours to show density
g.plot_joint(sns.kdeplot, color='blue', levels=5)
# Customize titles and labels
g.set_axis_labels('Poverty Rate (%)', 'High School Graduation Rate (%)', fontsize=12)
plt.suptitle('Poverty Rate vs High School Graduation Rate', fontsize=16, y=1.02)
# Show the plot
plt.show()
# Set Seaborn style
sns.set(style="whitegrid")
# Create a regression plot
plt.figure(figsize=(10, 6))
sns.regplot(
data=sorted_df,
x='poverty_rate',
y='percent_completed_hs',
scatter_kws={'color': 'red'}, # Color of scatter points
line_kws={'color': 'blue'}, # Color of regression line
)
# Customize titles and labels
plt.title('Linear Regression: Poverty Rate vs High School Graduation Rate', fontsize=16)
plt.xlabel('Poverty Rate (%)', fontsize=12)
plt.ylabel('High School Graduation Rate (%)', fontsize=12)
# Show the plot
plt.show()
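The strength of the relationship suggested by the regression line can be quantified with a correlation coefficient (a minimal sketch on the state-level averages):
# Pearson correlation between state-level poverty rate and HS completion rate
corr = grouped_df['poverty_rate'].corr(grouped_df['percent_completed_hs'])
print(f"Correlation between poverty rate and HS graduation rate: {corr:.2f}")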
# Select only the necessary columns for visualization
race_columns = ['share_white', 'share_black', 'share_hispanic', 'share_asian', 'share_native_american']
plot_df = df_main[['geographic_area'] + race_columns]
# Group by 'geographic_area' and sum or average the racial shares
# Assuming these are percentages, we'll use the mean
plot_df = plot_df.groupby('geographic_area')[race_columns].mean().reset_index()
# Sort by the largest racial group for better visual structure (optional)
plot_df = plot_df.sort_values(by='share_white', ascending=False)
# Set figure size
plt.figure(figsize=(14, 8))
# Create the stacked bar chart
bottom = None
colors = ['#d9d9d9', '#636363', '#fdae6b', '#9ecae1', '#bc80bd']  # Custom colors for each group
for i, race in enumerate(race_columns):
    if bottom is None:
        plt.bar(plot_df['geographic_area'], plot_df[race], label=race.replace('_', ' ').title(), color=colors[i])
        bottom = plot_df[race].copy()  # copy so the running total doesn't modify plot_df in place
    else:
        plt.bar(plot_df['geographic_area'], plot_df[race], bottom=bottom, label=race.replace('_', ' ').title(), color=colors[i])
        bottom += plot_df[race]
# Customize labels and titles
plt.title('Racial Makeup by US State', fontsize=16)
plt.xlabel('US State', fontsize=12)
plt.ylabel('Population Share (%)', fontsize=12)
# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')
# Add legend
plt.legend(title='Racial Group', bbox_to_anchor=(1.05, 1), loc='upper left')
# Adjust layout to avoid clipping
plt.tight_layout()
# Show the plot
plt.show()
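The manual bottom tracking above can be avoided entirely: pandas can draw the same stacked bars directly from an indexed frame, which is also the approach used later for the city-level chart (a sketch, assuming plot_df and colors as built above):
# Equivalent stacked bar chart using pandas' built-in plotting
plot_df.set_index('geographic_area')[race_columns].plot(
    kind='bar', stacked=True, figsize=(14, 8), color=colors
)
plt.ylabel('Population Share (%)')
plt.tight_layout()
plt.show()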
# Count the number of fatalities by race
deaths_by_race = df_fatalities['race'].value_counts(dropna=False)
# Replace NaN with 'Unknown' for visualization purposes
deaths_by_race.index = deaths_by_race.index.fillna('Unknown')
# Display to verify
print(deaths_by_race)
race
W          1201
B           618
H           423
Unknown     195
A            39
N            31
O            28
Name: count, dtype: int64
import matplotlib.pyplot as plt
# Prepare labels and values for the donut chart
labels = deaths_by_race.index
sizes = deaths_by_race.values
colors = ['#d9d9d9', '#636363', '#fdae6b', '#9ecae1', '#bc80bd', '#ffcc99', '#ff6666'] # Adjust as needed for categories
# Create the pie chart (which will be converted to a donut)
fig, ax = plt.subplots(figsize=(8, 8))
# Plot the pie chart
wedges, texts, autotexts = ax.pie(
sizes,
labels=labels,
autopct='%1.1f%%', # Display percentage
startangle=140, # Rotate to start at 140 degrees
colors=colors[:len(labels)],
wedgeprops={'linewidth': 1, 'edgecolor': 'white'}, # White edges between segments
textprops={'fontsize': 12}
)
# Draw the center circle to create the donut shape
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig.gca().add_artist(centre_circle)
# Add title
plt.title('People Killed by Race', fontsize=16)
# Ensure the chart is a circle
ax.axis('equal')
# Display the chart
plt.show()
# Count the number of deaths by gender
deaths_by_gender = df_fatalities['gender'].value_counts()
# Display the result to verify
print(deaths_by_gender)
gender
M    2428
F     107
Name: count, dtype: int64
# Prepare labels and values
labels = deaths_by_gender.index.map({'M': 'Men', 'F': 'Women'}) # Convert 'M' and 'F' to readable labels
counts = deaths_by_gender.values
colors = ['#1f77b4', '#ff7f0e'] # Blue for men, orange for women
# Create the bar chart
plt.figure(figsize=(8, 6))
plt.bar(labels, counts, color=colors)
# Add labels and title
plt.title('Total Number of Deaths by Gender', fontsize=16)
plt.xlabel('Gender', fontsize=12)
plt.ylabel('Number of Deaths', fontsize=12)
# Display counts on top of the bars
for i, count in enumerate(counts):
plt.text(i, count + 20, str(count), ha='center', fontsize=12)
# Display the plot
plt.tight_layout()
plt.show()
# Create a copy of the filtered DataFrame to avoid the warning
plot_df = df_fatalities.dropna(subset=['age', 'manner_of_death', 'gender']).copy()
# Now safely map gender abbreviations without warnings
plot_df['gender'] = plot_df['gender'].map({'M': 'Men', 'F': 'Women'})
plt.figure(figsize=(14, 8))
# Box plot
sns.boxplot(data=plot_df, x='manner_of_death', y='age', hue='gender', palette='Set2')
# Swarm plot to show individual data points (palette='dark:black' avoids the deprecated color= gradient behaviour)
sns.swarmplot(data=plot_df, x='manner_of_death', y='age', hue='gender', dodge=True, palette='dark:black', size=3, alpha=0.5)
plt.title('Age Distribution by Manner of Death and Gender with Individual Data Points', fontsize=16)
plt.xlabel('Manner of Death', fontsize=12)
plt.ylabel('Age', fontsize=12)
plt.xticks(rotation=45, ha='right')
# Adjust legend to avoid duplication
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles[:2], labels[:2], title='Gender', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
# Create a new column to classify the armed status
def classify_armed_status(weapon):
if pd.isna(weapon) or weapon.lower() in ['unknown', 'undetermined']:
return 'Unknown'
elif weapon.lower() == 'unarmed':
return 'Unarmed'
elif 'gun' in weapon.lower() or 'rifle' in weapon.lower() or 'pistol' in weapon.lower() or 'firearm' in weapon.lower():
return 'Armed with Gun'
else:
return 'Armed with Other Weapon'
# Apply the classification
df_fatalities['armed_status'] = df_fatalities['armed'].apply(classify_armed_status)
# Verify the new categories
print(df_fatalities['armed_status'].value_counts())
armed_status
Armed with Gun             1411
Armed with Other Weapon     827
Unarmed                     171
Unknown                     126
Name: count, dtype: int64
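To sanity-check the classification, it helps to look at the raw 'armed' values that ended up in the catch-all bucket (a minimal sketch):
# Most common raw weapon descriptions classified as 'Armed with Other Weapon'
other_weapons = df_fatalities.loc[df_fatalities['armed_status'] == 'Armed with Other Weapon', 'armed']
print(other_weapons.value_counts().head(10))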
# Calculate the total counts
armed_counts = df_fatalities['armed_status'].value_counts()
# Calculate percentages
armed_percentages = (armed_counts / armed_counts.sum()) * 100
# Display the percentages
print(armed_percentages)
armed_status
Armed with Gun             55.66
Armed with Other Weapon    32.62
Unarmed                     6.75
Unknown                     4.97
Name: count, dtype: float64
# Set up the pie chart
plt.figure(figsize=(8, 8))
colors = ['#ff9999','#66b3ff','#99ff99','#ffcc99']
# Plot pie chart
plt.pie(
armed_percentages,
labels=armed_percentages.index,
autopct='%1.1f%%',
startangle=140,
colors=colors,
wedgeprops={'linewidth': 1, 'edgecolor': 'white'}
)
# Add title
plt.title('Armed Status in Police Killings', fontsize=16)
# Display the plot
plt.show()
# Filter out entries with missing ages
df_age_clean = df_fatalities.dropna(subset=['age'])
# Total number of people with known ages
total_with_age = len(df_age_clean)
# Number of people under 25
under_25_count = df_age_clean[df_age_clean['age'] < 25].shape[0]
# Calculate the percentage of people under 25
under_25_percentage = (under_25_count / total_with_age) * 100
# Display the result
print(f"Total people with known ages: {total_with_age}")
print(f"Number of people killed under 25: {under_25_count}")
print(f"Percentage of people killed under 25: {under_25_percentage:.2f}%")
Total people with known ages: 2458
Number of people killed under 25: 450
Percentage of people killed under 25: 18.31%
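The same percentage can be computed in one step as the mean of a boolean mask (an equivalent sketch):
# Fraction of known-age victims under 25, expressed as a percentage
print(f"{(df_age_clean['age'] < 25).mean() * 100:.2f}%")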
# Remove entries with missing 'age' values
df_age_clean = df_fatalities.dropna(subset=['age'])
# Check the basic statistics for age
print(df_age_clean['age'].describe())
count   2,458.00
mean       36.61
std        13.03
min         6.00
25%        26.00
50%        34.00
75%        45.00
max        91.00
Name: age, dtype: float64
# Set Seaborn style for better aesthetics
sns.set(style="whitegrid")
# Create the plot
plt.figure(figsize=(12, 7))
# Plot histogram with KDE
sns.histplot(df_age_clean['age'], bins=30, kde=True, color='skyblue', edgecolor='black')
# Customize the plot with titles and labels
plt.title('Distribution of Ages of People Killed by Police', fontsize=16)
plt.xlabel('Age', fontsize=12)
plt.ylabel('Number of Deaths', fontsize=12)
# Display the plot
plt.tight_layout()
plt.show()
# Filter out rows with missing 'age' or 'race' values
df_race_age = df_fatalities.dropna(subset=['age', 'race'])
# Check the unique races present in the dataset
print(df_race_age['race'].value_counts())
# Set Seaborn style
sns.set(style="whitegrid")
# Create KDE plots for each race using displot with col wrapping
g = sns.displot(
data=df_race_age,
x='age',
col='race',
kind='kde',
col_wrap=3, # Wraps plots into multiple rows after 3 plots
fill=True, # Fill under the KDE curve
height=4, # Size of each individual plot
    aspect=1.2  # Aspect ratio of plots; no palette here since it is ignored without a hue variable
)
# Customize titles and labels
g.set_axis_labels("Age", "Density")
g.set_titles("{col_name}") # Sets the title of each subplot as the race name
plt.subplots_adjust(top=0.9)
g.fig.suptitle('Age Distribution of People Killed by Police by Race', fontsize=16)
# Display the plot
plt.show()
race
W    1192
B     609
H     413
A      38
N      31
O      28
Name: count, dtype: int64
# Count the number of fatalities by race
deaths_by_race = df_fatalities['race'].value_counts(dropna=False)
# Replace NaN with 'Unknown' for clarity in the chart
deaths_by_race.index = deaths_by_race.index.fillna('Unknown')
# Display the counts to verify
print(deaths_by_race)
race
W          1201
B           618
H           423
Unknown     195
A            39
N            31
O            28
Name: count, dtype: int64
# Set Seaborn style for aesthetics
sns.set(style="whitegrid")
# Create the bar chart
plt.figure(figsize=(10, 6))
sns.barplot(x=deaths_by_race.index, y=deaths_by_race.values, hue=deaths_by_race.index, palette='Set2', legend=False)
# Add labels and title
plt.title('Total Number of People Killed by Race', fontsize=16)
plt.xlabel('Race', fontsize=12)
plt.ylabel('Number of Deaths', fontsize=12)
# Display the counts on top of each bar
for i, value in enumerate(deaths_by_race.values):
plt.text(i, value + 5, str(value), ha='center', fontsize=10)
# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')
# Display the plot
plt.tight_layout()
plt.show()
# Check unique values in the 'signs_of_mental_illness' column
print(df_fatalities['signs_of_mental_illness'].value_counts(dropna=False))
signs_of_mental_illness
False    1902
True      633
Name: count, dtype: int64
# Total number of people killed
total_killed = df_fatalities.shape[0]
# Number of people with signs of mental illness
mental_illness_count = df_fatalities['signs_of_mental_illness'].sum()
# Calculate the percentage
mental_illness_percentage = (mental_illness_count / total_killed) * 100
# Display the result
print(f"Total people killed: {total_killed}")
print(f"Number of people with signs of mental illness: {mental_illness_count}")
print(f"Percentage of people killed with mental illness: {mental_illness_percentage:.2f}%")
Total people killed: 2535
Number of people with signs of mental illness: 633
Percentage of people killed with mental illness: 24.97%
# Prepare data for visualization
labels = ['Signs of Mental Illness', 'No Signs of Mental Illness']
sizes = [mental_illness_count, total_killed - mental_illness_count]
colors = ['#ff9999', '#66b3ff']
# Create the pie chart
plt.figure(figsize=(8, 8))
plt.pie(
sizes,
labels=labels,
autopct='%1.1f%%',
startangle=140,
colors=colors,
wedgeprops={'linewidth': 1, 'edgecolor': 'white'}
)
# Add title
plt.title('Percentage of People Killed by Police with Signs of Mental Illness', fontsize=16)
# Display the plot
plt.show()
# Get the top 10 cities with the highest number of police killings
top_cities = df_fatalities['city'].value_counts().head(10).index.tolist()
# Filter the DataFrame to include only these top 10 cities
df_top_cities = df_fatalities[df_fatalities['city'].isin(top_cities)]
# Display to verify
print(df_top_cities[['city', 'race']].head())
           city race
12     Columbus    B
40  Los Angeles    H
48    St. Louis    B
52       Austin    W
64      Phoenix    N
# Group by 'city' and 'race' to count police killings by race in each city
killings_by_city_race = df_top_cities.groupby(['city', 'race']).size().unstack(fill_value=0)
# Display the number of killings by race for each city
print(killings_by_city_race)
race         A   B   H  N  O   W
city
Austin       0   3   2  0  0  13
Chicago      0  21   1  0  0   2
Columbus     0  11   0  0  1   6
Houston      1  15   6  1  0   3
Las Vegas    0   3   5  0  0   9
Los Angeles  1  10  19  0  0   6
Miami        0   8   6  0  0   3
Phoenix      0   2  11  3  0  12
San Antonio  0   3  12  0  0   3
St. Louis    0  11   0  0  0   3
# Calculate the percentage of killings by race within each city
killings_by_city_race_percentage = killings_by_city_race.div(killings_by_city_race.sum(axis=1), axis=0) * 100
# Display the percentage table
print(killings_by_city_race_percentage)
race            A      B      H      N     O      W
city
Austin       0.00  16.67  11.11   0.00  0.00  72.22
Chicago      0.00  87.50   4.17   0.00  0.00   8.33
Columbus     0.00  61.11   0.00   0.00  5.56  33.33
Houston      3.85  57.69  23.08   3.85  0.00  11.54
Las Vegas    0.00  17.65  29.41   0.00  0.00  52.94
Los Angeles  2.78  27.78  52.78   0.00  0.00  16.67
Miami        0.00  47.06  35.29   0.00  0.00  17.65
Phoenix      0.00   7.14  39.29  10.71  0.00  42.86
San Antonio  0.00  16.67  66.67   0.00  0.00  16.67
St. Louis    0.00  78.57   0.00   0.00  0.00  21.43
# Set Seaborn style for better aesthetics
sns.set(style="whitegrid")
# Plot the stacked bar chart for racial distribution of police killings
killings_by_city_race_percentage.plot(
kind='bar',
stacked=True,
figsize=(14, 8),
colormap='Set2'
)
# Add titles and labels
plt.title('Racial Distribution of Police Killings in Top 10 Cities', fontsize=16)
plt.xlabel('City', fontsize=12)
plt.ylabel('Percentage of Police Killings (%)', fontsize=12)
plt.xticks(rotation=45, ha='right')
# Add legend
plt.legend(title='Race', bbox_to_anchor=(1.05, 1), loc='upper left')
# Display the plot
plt.tight_layout()
plt.show()
# Group the data by state to count police killings
killings_by_state = df_fatalities['state'].value_counts().reset_index()
killings_by_state.columns = ['state', 'num_killed']
# Display the aggregated data
print(killings_by_state.head())
# Split the string on ',' and keep only the second part (state abbreviation)
killings_by_state['state'] = killings_by_state['state'].apply(lambda x: x.split(',')[-1].strip())
# Remove 'DC' to avoid rendering issues
killings_by_state = killings_by_state[killings_by_state['state'] != 'DC']
# Verify the cleaned state abbreviations
print(killings_by_state['state'].unique())
  state  num_killed
0    CA         424
1    TX         225
2    FL         154
3    AZ         118
4    OH          79
['CA' 'TX' 'FL' 'AZ' 'OH' 'OK' 'CO' 'GA' 'NC' 'MO' 'WA' 'IL' 'TN' 'LA' 'NM'
 'PA' 'AL' 'VA' 'NY' 'SC' 'WI' 'KY' 'IN' 'NV' 'MD' 'OR' 'MI' 'NJ' 'MN' 'WV'
 'AR' 'KS' 'MS' 'UT' 'MA' 'ID' 'NE' 'AK' 'ME' 'IA' 'MT' 'HI' 'SD' 'CT' 'WY'
 'DE' 'NH' 'ND' 'VT' 'RI']
print(killings_by_state)
   state  num_killed
0     CA         424
1     TX         225
2     FL         154
3     AZ         118
4     OH          79
5     OK          78
6     CO          74
7     GA          70
8     NC          69
9     MO          64
10    WA          62
11    IL          62
12    TN          59
13    LA          57
14    NM          51
15    PA          51
16    AL          50
17    VA          47
18    NY          45
19    SC          44
20    WI          43
21    KY          43
22    IN          43
23    NV          42
24    MD          38
25    OR          38
26    MI          37
27    NJ          35
28    MN          32
29    WV          27
30    AR          26
31    KS          24
32    MS          23
33    UT          23
34    MA          22
35    ID          17
36    NE          15
37    AK          15
38    ME          13
39    IA          12
40    MT          11
41    HI          11
43    SD          10
44    CT           9
45    WY           8
46    DE           8
47    NH           7
48    ND           4
49    VT           3
50    RI           2
# Create the choropleth map with cleaned state codes
fig = px.choropleth(
killings_by_state,
locations='state', # Use cleaned two-letter state abbreviations
locationmode='USA-states', # Recognize U.S. state abbreviations
color='num_killed', # Color based on the number of killings
color_continuous_scale='Reds', # Use a red gradient for intensity
scope='usa', # Focus on U.S. map
labels={'num_killed': 'Number of Killings'},
title='Police Killings by U.S. State'
)
import plotly.io as pio
# Set the default renderer to 'iframe' so the figure displays inside the notebook
pio.renderers.default = 'iframe'
# Now plot your figure again
fig.show()
fig.write_html("police_killings_map.html")
# Group by month and count the number of police killings
killings_over_time = df_fatalities.groupby(df_fatalities['date'].dt.to_period('M')).size().reset_index(name='num_killed')
# Convert 'date' back to datetime for plotting
killings_over_time['date'] = killings_over_time['date'].dt.to_timestamp()
# Display the aggregated data
print(killings_over_time.head())
        date  num_killed
0 2015-01-01          70
1 2015-02-01          78
2 2015-03-01          87
3 2015-04-01          88
4 2015-05-01          85
# Group by year and count the number of police killings
killings_by_year = df_fatalities.groupby(df_fatalities['date'].dt.year).size().reset_index(name='num_killed')
# Rename columns for clarity
killings_by_year.columns = ['year', 'num_killed']
# Display the yearly data
print(killings_by_year)
   year  num_killed
0  2015         991
1  2016         963
2  2017         581
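The 2017 total looks much lower, most likely because this snapshot of the dataset ends partway through 2017 rather than because killings dropped; checking the latest recorded date confirms the coverage window (a minimal sketch):
# Earliest and latest dates covered by the dataset
print(f"Data covers {df_fatalities['date'].min():%Y-%m-%d} to {df_fatalities['date'].max():%Y-%m-%d}")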
# Line plot for monthly trends
fig = px.line(
killings_over_time,
x='date',
y='num_killed',
title='Number of Police Killings Over Time (Monthly)',
labels={'num_killed': 'Number of Killings', 'date': 'Date'},
markers=True
)
# Customize the layout
fig.update_layout(xaxis_title='Date', yaxis_title='Number of Police Killings', template='plotly_white')
# Display the plot
fig.show()
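Month-to-month counts are noisy; a short rolling average makes the underlying trend easier to see (a sketch reusing killings_over_time from above):
# Add a 3-month rolling mean to smooth the monthly series
killings_over_time['rolling_3m'] = killings_over_time['num_killed'].rolling(window=3).mean()
fig = px.line(
    killings_over_time,
    x='date',
    y=['num_killed', 'rolling_3m'],
    title='Monthly Police Killings with 3-Month Rolling Average',
    labels={'value': 'Number of Killings', 'date': 'Date'}
)
fig.show()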