Introduction¶
Since Jan. 1, 2015, The Washington Post has been compiling a database of every fatal shooting in the US by a police officer in the line of duty.
Import Statements¶
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
Notebook Presentation¶
pd.options.display.float_format = '{:,.2f}'.format
Load the Data¶
df_fatalities = pd.read_csv('Deaths_by_Police_US.csv', encoding="windows-1252")
df_hh_income = pd.read_csv('Median_Household_Income_2015.csv', encoding="windows-1252")
df_hh_income.head()
Geographic Area | City | Median Income | |
---|---|---|---|
0 | AL | Abanda CDP | 11207 |
1 | AL | Abbeville city | 25615 |
2 | AL | Adamsville city | 42575 |
3 | AL | Addison town | 37083 |
4 | AL | Akron town | 21667 |
df_pct_poverty = pd.read_csv('Pct_People_Below_Poverty_Level.csv', encoding="windows-1252")
df_pct_poverty.head()
Geographic Area | City | poverty_rate | |
---|---|---|---|
0 | AL | Abanda CDP | 78.8 |
1 | AL | Abbeville city | 29.1 |
2 | AL | Adamsville city | 25.5 |
3 | AL | Addison town | 30.7 |
4 | AL | Akron town | 42 |
df_pct_completed_hs = pd.read_csv('Pct_Over_25_Completed_High_School.csv', encoding="windows-1252")
df_pct_completed_hs.head()
Geographic Area | City | percent_completed_hs | |
---|---|---|---|
0 | AL | Abanda CDP | 21.2 |
1 | AL | Abbeville city | 69.1 |
2 | AL | Adamsville city | 78.9 |
3 | AL | Addison town | 81.4 |
4 | AL | Akron town | 68.6 |
df_share_race_city = pd.read_csv('Share_of_Race_By_City.csv', encoding="windows-1252")
df_share_race_city.head()
Geographic area | City | share_white | share_black | share_native_american | share_asian | share_hispanic | |
---|---|---|---|---|---|---|---|
0 | AL | Abanda CDP | 67.2 | 30.2 | 0 | 0 | 1.6 |
1 | AL | Abbeville city | 54.4 | 41.4 | 0.1 | 1 | 3.1 |
2 | AL | Adamsville city | 52.3 | 44.9 | 0.5 | 0.3 | 2.3 |
3 | AL | Addison town | 99.1 | 0.1 | 0 | 0.1 | 0.4 |
4 | AL | Akron town | 13.2 | 86.5 | 0 | 0 | 0.3 |
df_fatalities.head()
id | name | date | manner_of_death | armed | age | gender | race | city | state | signs_of_mental_illness | threat_level | flee | body_camera | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | Tim Elliot | 02/01/15 | shot | gun | 53.00 | M | A | Shelton | WA | True | attack | Not fleeing | False |
1 | 4 | Lewis Lee Lembke | 02/01/15 | shot | gun | 47.00 | M | W | Aloha | OR | False | attack | Not fleeing | False |
2 | 5 | John Paul Quintero | 03/01/15 | shot and Tasered | unarmed | 23.00 | M | H | Wichita | KS | False | other | Not fleeing | False |
3 | 8 | Matthew Hoffman | 04/01/15 | shot | toy weapon | 32.00 | M | W | San Francisco | CA | True | attack | Not fleeing | False |
4 | 9 | Michael Rodriguez | 04/01/15 | shot | nail gun | 39.00 | M | H | Evans | CO | False | attack | Not fleeing | False |
Standardize the column names for joins
def standardize_columns(df):
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
return df
df_hh_income = standardize_columns(df_hh_income)
df_hh_income.head()
geographic_area | city | median_income | |
---|---|---|---|
0 | AL | Abanda CDP | 11207 |
1 | AL | Abbeville city | 25615 |
2 | AL | Adamsville city | 42575 |
3 | AL | Addison town | 37083 |
4 | AL | Akron town | 21667 |
df_pct_poverty = standardize_columns(df_pct_poverty)
df_pct_poverty.head()
geographic_area | city | poverty_rate | |
---|---|---|---|
0 | AL | Abanda CDP | 78.8 |
1 | AL | Abbeville city | 29.1 |
2 | AL | Adamsville city | 25.5 |
3 | AL | Addison town | 30.7 |
4 | AL | Akron town | 42 |
df_pct_completed_hs = standardize_columns(df_pct_completed_hs)
df_pct_completed_hs.head()
geographic_area | city | percent_completed_hs | |
---|---|---|---|
0 | AL | Abanda CDP | 21.2 |
1 | AL | Abbeville city | 69.1 |
2 | AL | Adamsville city | 78.9 |
3 | AL | Addison town | 81.4 |
4 | AL | Akron town | 68.6 |
df_share_race_city = standardize_columns(df_share_race_city)
df_share_race_city.head()
geographic_area | city | share_white | share_black | share_native_american | share_asian | share_hispanic | |
---|---|---|---|---|---|---|---|
0 | AL | Abanda CDP | 67.2 | 30.2 | 0 | 0 | 1.6 |
1 | AL | Abbeville city | 54.4 | 41.4 | 0.1 | 1 | 3.1 |
2 | AL | Adamsville city | 52.3 | 44.9 | 0.5 | 0.3 | 2.3 |
3 | AL | Addison town | 99.1 | 0.1 | 0 | 0.1 | 0.4 |
4 | AL | Akron town | 13.2 | 86.5 | 0 | 0 | 0.3 |
df_fatalities = standardize_columns(df_fatalities)
df_fatalities.head()
id | name | date | manner_of_death | armed | age | gender | race | city | state | signs_of_mental_illness | threat_level | flee | body_camera | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | Tim Elliot | 02/01/15 | shot | gun | 53.00 | M | A | Shelton | WA | True | attack | Not fleeing | False |
1 | 4 | Lewis Lee Lembke | 02/01/15 | shot | gun | 47.00 | M | W | Aloha | OR | False | attack | Not fleeing | False |
2 | 5 | John Paul Quintero | 03/01/15 | shot and Tasered | unarmed | 23.00 | M | H | Wichita | KS | False | other | Not fleeing | False |
3 | 8 | Matthew Hoffman | 04/01/15 | shot | toy weapon | 32.00 | M | W | San Francisco | CA | True | attack | Not fleeing | False |
4 | 9 | Michael Rodriguez | 04/01/15 | shot | nail gun | 39.00 | M | H | Evans | CO | False | attack | Not fleeing | False |
Now all of the tables should have the same column-name spelling and punctuation:
print(f"Column names of df_hh_income: {df_hh_income.columns.tolist()}")
print(f"Column names of pct_poverty: {df_pct_poverty.columns.tolist()}")
print(f"Column names of pct_completed: {df_pct_completed_hs.columns.tolist()}")
print(f"Column names of share_race_city: {df_share_race_city.columns.tolist()}")
print(f"Column names of fatalities: {df_fatalities.columns.tolist()}")
Column names of df_hh_income: ['geographic_area', 'city', 'median_income']
Column names of pct_poverty: ['geographic_area', 'city', 'poverty_rate']
Column names of pct_completed: ['geographic_area', 'city', 'percent_completed_hs']
Column names of share_race_city: ['geographic_area', 'city', 'share_white', 'share_black', 'share_native_american', 'share_asian', 'share_hispanic']
Column names of fatalities: ['id', 'name', 'date', 'manner_of_death', 'armed', 'age', 'gender', 'race', 'city', 'state', 'signs_of_mental_illness', 'threat_level', 'flee', 'body_camera']
Preliminary Data Exploration¶
# Shapes of the dataframes
print(f"shape of df_hh_income: {df_hh_income.shape}")
print(f"shape of pct_poverty: {df_pct_poverty.shape}")
print(f"shape of df_pct_completed: {df_pct_completed_hs.shape}")
print(f"shape of df_share_race_city: {df_share_race_city.shape}")
print(f"shape of fatalities: {df_fatalities.shape}")
shape of df_hh_income: (29322, 3)
shape of pct_poverty: (29329, 3)
shape of df_pct_completed: (29329, 3)
shape of df_share_race_city: (29268, 7)
shape of fatalities: (2535, 14)
The row counts differ across the tables, which means there are duplicates, unmatched records, or missing data. For example, the income table may not have a record for every city, or a city may appear in one table but not another; this will be tested below.
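As a quick first test for duplicates, I can count repeated (geographic_area, city) keys in each census table (a minimal sketch using the standardized dataframes above):
# Count duplicated (geographic_area, city) keys in each census table
for name, df in [('hh_income', df_hh_income), ('pct_poverty', df_pct_poverty),
                 ('pct_completed_hs', df_pct_completed_hs), ('share_race_city', df_share_race_city)]:
    n_dupes = df.duplicated(subset=['geographic_area', 'city']).sum()
    print(f"Duplicate city keys in {name}: {n_dupes}")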
First I will merge all the tables that share the same (geographic_area, city) key, working from the largest to the smallest.
# Step 1: Create a master list of unique (geographic_area, city) pairs. This will be used to join all the other tables that share this key
master_ls = pd.concat([df_hh_income[['geographic_area', 'city']]
, df_pct_completed_hs[['geographic_area', 'city']]
, df_pct_poverty[['geographic_area', 'city']]
, df_share_race_city[['geographic_area', 'city']]]).drop_duplicates()
print(master_ls)
      geographic_area                city
0                  AL          Abanda CDP
1                  AL      Abbeville city
2                  AL     Adamsville city
3                  AL        Addison town
4                  AL          Akron town
...               ...                 ...
28243              WV   Summersville town
28256              WV     Upper Falls CDP
28469              WI         Delwood CDP
28667              WI  Lake Shangrila CDP
28828              WI       Pell Lake CDP

[29477 rows x 2 columns]
Perform left joins against the master list; the left join ensures every record in the master list is kept even when the joining table has no match. The merge indicator records whether each row matched, to be used later in the analysis.
df_hh_income_check = master_ls.merge(df_hh_income, on=['geographic_area', 'city'], how='left', indicator=True)
df_pct_completed_hs_check = master_ls.merge(df_pct_completed_hs, on=['geographic_area', 'city'], how='left', indicator=True)
df_pct_poverty_check = master_ls.merge(df_pct_poverty, on=['geographic_area', 'city'], how='left', indicator=True)
df_share_race_city_check = master_ls.merge(df_share_race_city, on=['geographic_area', 'city'], how='left', indicator=True)
Now let's look at the records that have no matching values.
df_hh_income_missing = df_hh_income_check[df_hh_income_check['_merge'] == 'left_only'].drop(columns=['_merge'])
df_pct_completed_hs_missing = df_pct_completed_hs_check[df_pct_completed_hs_check['_merge'] == 'left_only'].drop(columns=['_merge'])
df_pct_poverty_missing = df_pct_poverty_check[df_pct_poverty_check['_merge'] == 'left_only'].drop(columns=['_merge'])
df_share_race_city_missing = df_share_race_city_check[df_share_race_city_check['_merge'] == 'left_only'].drop(columns=['_merge'])
print("Records missing from df_hh_income:")
print(df_hh_income_missing)
Records missing from df_hh_income:
      geographic_area                                              city  median_income
29322              CT                                      Milford city            NaN
29323              GA           Athens-Clarke County unified government            NaN
29324              GA  Augusta-Richmond County consolidated government             NaN
29325              IN                                 Indianapolis city            NaN
29326              KS                 Greeley County unified government            NaN
...               ...                                               ...            ...
29472              WV                                 Summersville town            NaN
29473              WV                                   Upper Falls CDP            NaN
29474              WI                                       Delwood CDP            NaN
29475              WI                                Lake Shangrila CDP            NaN
29476              WI                                     Pell Lake CDP            NaN

[155 rows x 3 columns]
print("\nRecords missing from df_pct_completed_hs:")
print(df_pct_completed_hs_missing)
Records missing from df_pct_completed_hs:
      geographic_area                city  percent_completed_hs
5676               HI              Hawaii                   NaN
24776              TX    C_sar Chövez CDP                   NaN
29331              AL       Kimberly town                   NaN
29332              AK        Edna Bay CDP                   NaN
29333              AK     Petersburg city                   NaN
...               ...                 ...                   ...
29472              WV   Summersville town                   NaN
29473              WV     Upper Falls CDP                   NaN
29474              WI         Delwood CDP                   NaN
29475              WI  Lake Shangrila CDP                   NaN
29476              WI       Pell Lake CDP                   NaN

[148 rows x 3 columns]
print("\nRecords missing from df_pct_poverty:")
print(df_pct_poverty_missing)
Records missing from df_pct_poverty:
      geographic_area                city  poverty_rate
5676               HI              Hawaii           NaN
24776              TX    C_sar Chövez CDP           NaN
29331              AL       Kimberly town           NaN
29332              AK        Edna Bay CDP           NaN
29333              AK     Petersburg city           NaN
...               ...                 ...           ...
29472              WV   Summersville town           NaN
29473              WV     Upper Falls CDP           NaN
29474              WI         Delwood CDP           NaN
29475              WI  Lake Shangrila CDP           NaN
29476              WI       Pell Lake CDP           NaN

[148 rows x 3 columns]
print("\nRecords missing from df_share_race_city:")
print(df_share_race_city_missing)
Records missing from df_share_race_city:
      geographic_area                               city  share_white  share_black  share_native_american  share_asian  share_hispanic
56                 AL                     Bon Secour CDP          NaN          NaN                    NaN          NaN             NaN
115                AL                     Cottondale CDP          NaN          NaN                    NaN          NaN             NaN
288                AL                      Kimberly city          NaN          NaN                    NaN          NaN             NaN
305                AL                        Lillian CDP          NaN          NaN                    NaN          NaN             NaN
413                AL                        Perdido CDP          NaN          NaN                    NaN          NaN             NaN
...               ...                                ...          ...          ...                    ...          ...             ...
29011              WI                     Summit village          NaN          NaN                    NaN          NaN             NaN
29103              WI                          Wiota CDP          NaN          NaN                    NaN          NaN             NaN
29265              WY                 Point of Rocks CDP          NaN          NaN                    NaN          NaN             NaN
29326              KS  Greeley County unified government          NaN          NaN                    NaN          NaN             NaN
29330              TX                   C_sar Chˆvez CDP          NaN          NaN                    NaN          NaN             NaN

[209 rows x 7 columns]
Let's review what percent of the data is null to determine how critical the gaps are to the rest of the project and decide how to handle them.
# Total number of unique records in the master list
total_records = len(master_ls)
# Calculate missing record counts
missing_df_hh_income_missing = len(df_hh_income_missing)
missing_df_pct_completed_hs_missing = len(df_pct_completed_hs_missing)
missing_df_pct_poverty_missing = len(df_pct_poverty_missing)
missing_df_share_race_city_missing = len(df_share_race_city_missing)
# Calculate missing percentage for each DataFrame
pct_missing_df_hh_income_missing = (missing_df_hh_income_missing / total_records) * 100
pct_missing_df_pct_completed_hs_missing = (missing_df_pct_completed_hs_missing / total_records) * 100
pct_missing_df_pct_poverty_missing = (missing_df_pct_poverty_missing / total_records) * 100
pct_missing_df_share_race_city_missing = (missing_df_share_race_city_missing / total_records) * 100
# Display results
print(f"Missing % in hh_income: {pct_missing_df_hh_income_missing:.2f}% ({missing_df_hh_income_missing}/{total_records})")
print(f"Missing % in pct_completed_hs: {pct_missing_df_pct_completed_hs_missing:.2f}% ({missing_df_pct_completed_hs_missing}/{total_records})")
print(f"Missing % in pct_poverty: {pct_missing_df_pct_poverty_missing:.2f}% ({missing_df_pct_poverty_missing}/{total_records})")
print(f"Missing % in share_race: {pct_missing_df_share_race_city_missing:.2f}% ({missing_df_share_race_city_missing}/{total_records})")
Missing % in hh_income: 0.53% (155/29477)
Missing % in pct_completed_hs: 0.50% (148/29477)
Missing % in pct_poverty: 0.50% (148/29477)
Missing % in share_race: 0.71% (209/29477)
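The same percentages can also be read straight off the merge indicator, for example (a sketch using the _check dataframes created above):
# Share of master-list records that did or did not find a match in df_hh_income
print(df_hh_income_check['_merge'].value_counts(normalize=True) * 100)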
Less than 1% is missing in each table, which seems acceptable, so I will simply replace the null values with 0s. Next I will build one main dataframe to use in the analysis.
df_main = (master_ls
           .merge(df_hh_income, on=['geographic_area', 'city'], how='left')
           .merge(df_pct_completed_hs, on=['geographic_area', 'city'], how='left')
           .merge(df_pct_poverty, on=['geographic_area', 'city'], how='left')
           .merge(df_share_race_city, on=['geographic_area', 'city'], how='left'))
print(df_main)
# Export the DataFrame to a CSV file
df_main.to_csv('merged_output.csv', index=False)
      geographic_area                city median_income percent_completed_hs poverty_rate share_white share_black share_native_american share_asian share_hispanic
0                  AL          Abanda CDP         11207                 21.2         78.8        67.2        30.2                     0           0            1.6
1                  AL      Abbeville city         25615                 69.1         29.1        54.4        41.4                   0.1           1            3.1
2                  AL     Adamsville city         42575                 78.9         25.5        52.3        44.9                   0.5         0.3            2.3
3                  AL        Addison town         37083                 81.4         30.7        99.1         0.1                     0         0.1            0.4
4                  AL          Akron town         21667                 68.6           42        13.2        86.5                     0           0            0.3
...               ...                 ...           ...                  ...          ...         ...         ...                   ...         ...            ...
29472              WV   Summersville town           NaN                  NaN          NaN        97.4         0.4                   0.3         0.9            1.4
29473              WV     Upper Falls CDP           NaN                  NaN          NaN        96.8         1.1                   0.4         0.3            0.4
29474              WI         Delwood CDP           NaN                  NaN          NaN        98.6         0.2                   0.4         0.2            0.2
29475              WI  Lake Shangrila CDP           NaN                  NaN          NaN        95.1         2.2                   0.1         0.1            4.8
29476              WI       Pell Lake CDP           NaN                  NaN          NaN        94.2         0.3                   0.5         0.4            9.1

[29477 rows x 10 columns]
df_main.head()
geographic_area | city | median_income | percent_completed_hs | poverty_rate | share_white | share_black | share_native_american | share_asian | share_hispanic | |
---|---|---|---|---|---|---|---|---|---|---|
0 | AL | Abanda CDP | 11207 | 21.2 | 78.8 | 67.2 | 30.2 | 0 | 0 | 1.6 |
1 | AL | Abbeville city | 25615 | 69.1 | 29.1 | 54.4 | 41.4 | 0.1 | 1 | 3.1 |
2 | AL | Adamsville city | 42575 | 78.9 | 25.5 | 52.3 | 44.9 | 0.5 | 0.3 | 2.3 |
3 | AL | Addison town | 37083 | 81.4 | 30.7 | 99.1 | 0.1 | 0 | 0.1 | 0.4 |
4 | AL | Akron town | 21667 | 68.6 | 42 | 13.2 | 86.5 | 0 | 0 | 0.3 |
df_main.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29477 entries, 0 to 29476
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   geographic_area        29477 non-null  object
 1   city                   29477 non-null  object
 2   median_income          29271 non-null  object
 3   percent_completed_hs   29329 non-null  object
 4   poverty_rate           29329 non-null  object
 5   share_white            29268 non-null  object
 6   share_black            29268 non-null  object
 7   share_native_american  29268 non-null  object
 8   share_asian            29268 non-null  object
 9   share_hispanic         29268 non-null  object
dtypes: object(10)
memory usage: 2.2+ MB
Clean up the numeric columns with a helper function that strips stray characters and converts the values to numbers.
def clean_columns(df, column_name):
cleaned_column = pd.to_numeric(
df[column_name]
.astype(str) # Ensure all values are strings
.str.replace(',', '', regex=True) # Remove commas
.str.replace(r'-$', '', regex=True), # Remove trailing hyphens
errors='coerce' # Convert to float, setting errors to NaN
)
return cleaned_column
# List of columns to clean
columns_to_clean = [
'share_asian',
'share_black',
'share_hispanic',
'share_native_american',
'share_white',
'percent_completed_hs',
'poverty_rate',
'median_income'
]
for column in columns_to_clean:
df_main[column] = clean_columns(df_main, column)
df_main.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29477 entries, 0 to 29476
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   geographic_area        29477 non-null  object
 1   city                   29477 non-null  object
 2   median_income          27394 non-null  float64
 3   percent_completed_hs   29132 non-null  float64
 4   poverty_rate           29128 non-null  float64
 5   share_white            29248 non-null  float64
 6   share_black            29248 non-null  float64
 7   share_native_american  29248 non-null  float64
 8   share_asian            29248 non-null  float64
 9   share_hispanic         29248 non-null  float64
dtypes: float64(8), object(2)
memory usage: 2.2+ MB
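Comparing the non-null counts before and after cleaning shows that some non-numeric placeholders (for example trailing-hyphen entries) were coerced to NaN. A quick count of the remaining NaNs per column (a minimal sketch reusing columns_to_clean):
# How many values per column are NaN after the numeric conversion
print(df_main[columns_to_clean].isna().sum())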
# Parse the date column, coercing any values that don't match the expected format to NaT
df_fatalities['date'] = pd.to_datetime(df_fatalities['date'], format='%Y-%m-%d', errors='coerce')
df_fatalities.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2535 entries, 0 to 2534
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   id                       2535 non-null   int64
 1   name                     2535 non-null   object
 2   date                     2535 non-null   datetime64[ns]
 3   manner_of_death          2535 non-null   object
 4   armed                    2526 non-null   object
 5   age                      2458 non-null   float64
 6   gender                   2535 non-null   object
 7   race                     2340 non-null   object
 8   city                     2535 non-null   object
 9   state                    2535 non-null   object
 10  signs_of_mental_illness  2535 non-null   bool
 11  threat_level             2535 non-null   object
 12  flee                     2470 non-null   object
 13  body_camera              2535 non-null   bool
dtypes: bool(2), datetime64[ns](1), float64(1), int64(1), object(9)
memory usage: 242.7+ KB
Chart the poverty rate in each US state: create a bar chart that ranks the poverty rate from highest to lowest by state. Which state has the highest poverty rate? Which state has the lowest?
df_main.fillna(0, inplace=True)
# Select only numeric columns for aggregation
numeric_cols = df_main.select_dtypes(include='number').columns
# Group by 'geographic_area' and calculate mean for numeric columns
grouped_df = df_main.groupby('geographic_area')[numeric_cols].mean().reset_index()
grouped_df.head()
geographic_area | median_income | percent_completed_hs | poverty_rate | share_white | share_black | share_native_american | share_asian | share_hispanic | |
---|---|---|---|---|---|---|---|---|---|
0 | AK | 41,738.05 | 79.65 | 18.57 | 45.01 | 0.56 | 45.22 | 1.37 | 2.12 |
1 | AL | 37,807.53 | 80.03 | 20.58 | 71.52 | 23.00 | 0.65 | 0.47 | 2.94 |
2 | AR | 33,699.45 | 79.36 | 22.79 | 77.87 | 16.18 | 0.75 | 0.47 | 4.24 |
3 | AZ | 34,979.84 | 79.04 | 25.21 | 59.80 | 0.95 | 28.53 | 0.72 | 20.10 |
4 | CA | 54,538.95 | 80.09 | 16.73 | 70.93 | 2.66 | 1.70 | 5.50 | 29.26 |
# Sort the DataFrame by poverty_rate in descending order (highest first)
sorted_poverty = grouped_df.sort_values(by='poverty_rate', ascending=False)
# Display the top 5 geographic areas with the highest poverty rates
print("Top 5 Geographic Areas with Highest Poverty Rates:")
print(sorted_poverty[['geographic_area', 'poverty_rate']].head())
# Display the bottom 5 geographic areas with the lowest poverty rates
print("\nBottom 5 Geographic Areas with Lowest Poverty Rates:")
print(sorted_poverty[['geographic_area', 'poverty_rate']].tail())
Top 5 Geographic Areas with Highest Poverty Rates:
   geographic_area  poverty_rate
25              MS         26.66
3               AZ         25.21
10              GA         23.48
2               AR         22.79
32              NM         22.41

Bottom 5 Geographic Areas with Lowest Poverty Rates:
   geographic_area  poverty_rate
20              MD         10.25
19              MA          9.55
6               CT          9.14
50              WY          9.06
31              NJ          8.13
# Loop through each numeric column and create a bar chart
for col in numeric_cols:
plt.figure(figsize=(10, 6))
# Sort the data for better visualization
sorted_df = grouped_df.sort_values(by=col, ascending=True)
plt.bar(sorted_df['geographic_area'], sorted_df[col])
plt.title(f'{col.replace("_", " ").title()} by Geographic Area (Ascending)')
plt.xlabel('Geographic Area')
plt.ylabel(col.replace("_", " ").title())
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
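For the specific question above, a single chart that ranks states by poverty rate from highest to lowest makes the answer easier to read (a sketch reusing grouped_df from above):
# Rank states by mean poverty rate, highest first
ranked = grouped_df.sort_values(by='poverty_rate', ascending=False)
plt.figure(figsize=(12, 6))
plt.bar(ranked['geographic_area'], ranked['poverty_rate'], color='firebrick')
plt.title('Poverty Rate by US State (Highest to Lowest)')
plt.xlabel('State')
plt.ylabel('Poverty Rate (%)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()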
Visualise the relationship between poverty rates and high school graduation rates: create a line chart with two y-axes to show whether poverty and high school graduation rates move together.
# Sort the dataframe by median income
sorted_df = grouped_df.sort_values(by='median_income', ascending=True)
# Create figure and first axis
fig, ax1 = plt.subplots(figsize=(12, 7))
# Plot Poverty Rate on the first y-axis (left)
ax1.plot(sorted_df['geographic_area'], sorted_df['poverty_rate'], color='red', marker='o', label='Poverty Rate')
ax1.set_xlabel('Geographic Area', fontsize=12)
ax1.set_ylabel('Poverty Rate (%)', color='red', fontsize=12)
ax1.tick_params(axis='y', labelcolor='red')
ax1.set_xticks(range(len(sorted_df['geographic_area'])))
ax1.set_xticklabels(sorted_df['geographic_area'], rotation=45, ha='right')
# Create a second y-axis for High School Graduation Rate (right)
ax2 = ax1.twinx()
ax2.plot(sorted_df['geographic_area'], sorted_df['percent_completed_hs'], color='blue', marker='s', label='HS Graduation Rate')
ax2.set_ylabel('High School Graduation Rate (%)', color='blue', fontsize=12)
ax2.tick_params(axis='y', labelcolor='blue')
# Invert the High School Graduation Rate axis (right y-axis)
ax2.invert_yaxis() ## need this since there is an inverse relationship between HS grad rate and poverty rate
# Title and layout adjustments
plt.title('Poverty Rate vs High School Graduation Rate by Geographic Area', fontsize=16)
fig.tight_layout()
# Show the plot
plt.show()
# Set Seaborn style
sns.set(style="white", color_codes=True)
# Create jointplot with scatter and KDE
g = sns.jointplot(
data=sorted_df,
x='poverty_rate',
y='percent_completed_hs',
kind='scatter', # Scatter plot
color='purple', # Color of points
height=8, # Size of the plot
marginal_kws=dict(bins=15, fill=True) # Histogram settings for marginals
)
# Add KDE contours to show density
g.plot_joint(sns.kdeplot, color='blue', levels=5)
# Customize titles and labels
g.set_axis_labels('Poverty Rate (%)', 'High School Graduation Rate (%)', fontsize=12)
plt.suptitle('Poverty Rate vs High School Graduation Rate', fontsize=16, y=1.02)
# Show the plot
plt.show()
# Set Seaborn style
sns.set(style="whitegrid")
# Create a regression plot
plt.figure(figsize=(10, 6))
sns.regplot(
data=sorted_df,
x='poverty_rate',
y='percent_completed_hs',
scatter_kws={'color': 'red'}, # Color of scatter points
line_kws={'color': 'blue'}, # Color of regression line
)
# Customize titles and labels
plt.title('Linear Regression: Poverty Rate vs High School Graduation Rate', fontsize=16)
plt.xlabel('Poverty Rate (%)', fontsize=12)
plt.ylabel('High School Graduation Rate (%)', fontsize=12)
# Show the plot
plt.show()
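The strength of the relationship suggested by the regression line can be quantified with a correlation coefficient (a minimal sketch on the state-level averages):
# Pearson correlation between state-level poverty rate and HS completion rate
corr = grouped_df['poverty_rate'].corr(grouped_df['percent_completed_hs'])
print(f"Correlation between poverty rate and HS graduation rate: {corr:.2f}")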
# Select only the necessary columns for visualization
race_columns = ['share_white', 'share_black', 'share_hispanic', 'share_asian', 'share_native_american']
plot_df = df_main[['geographic_area'] + race_columns]
# Group by 'geographic_area' and sum or average the racial shares
# Assuming these are percentages, we'll use the mean
plot_df = plot_df.groupby('geographic_area')[race_columns].mean().reset_index()
# Sort by the largest racial group for better visual structure (optional)
plot_df = plot_df.sort_values(by='share_white', ascending=False)
# Set figure size
plt.figure(figsize=(14, 8))
# Create the stacked bar chart
bottom = None
colors = ['#d9d9d9', '#636363', '#fdae6b', '#9ecae1', '#bc80bd']  # Custom colors for each group
for i, race in enumerate(race_columns):
    if bottom is None:
        plt.bar(plot_df['geographic_area'], plot_df[race], label=race.replace('_', ' ').title(), color=colors[i])
        bottom = plot_df[race].copy()  # copy so the running total doesn't modify plot_df in place
    else:
        plt.bar(plot_df['geographic_area'], plot_df[race], bottom=bottom, label=race.replace('_', ' ').title(), color=colors[i])
        bottom += plot_df[race]
# Customize labels and titles
plt.title('Racial Makeup by US State', fontsize=16)
plt.xlabel('US State', fontsize=12)
plt.ylabel('Population Share (%)', fontsize=12)
# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')
# Add legend
plt.legend(title='Racial Group', bbox_to_anchor=(1.05, 1), loc='upper left')
# Adjust layout to avoid clipping
plt.tight_layout()
# Show the plot
plt.show()
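The manual bottom tracking above can be avoided entirely: pandas can draw the same stacked bars directly from an indexed frame, which is also the approach used later for the city-level chart (a sketch, assuming plot_df and colors as built above):
# Equivalent stacked bar chart using pandas' built-in plotting
plot_df.set_index('geographic_area')[race_columns].plot(
    kind='bar', stacked=True, figsize=(14, 8), color=colors
)
plt.ylabel('Population Share (%)')
plt.tight_layout()
plt.show()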
# Count the number of fatalities by race
deaths_by_race = df_fatalities['race'].value_counts(dropna=False)
# Replace NaN with 'Unknown' for visualization purposes
deaths_by_race.index = deaths_by_race.index.fillna('Unknown')
# Display to verify
print(deaths_by_race)
race
W          1201
B           618
H           423
Unknown     195
A            39
N            31
O            28
Name: count, dtype: int64
import matplotlib.pyplot as plt
# Prepare labels and values for the donut chart
labels = deaths_by_race.index
sizes = deaths_by_race.values
colors = ['#d9d9d9', '#636363', '#fdae6b', '#9ecae1', '#bc80bd', '#ffcc99', '#ff6666'] # Adjust as needed for categories
# Create the pie chart (which will be converted to a donut)
fig, ax = plt.subplots(figsize=(8, 8))
# Plot the pie chart
wedges, texts, autotexts = ax.pie(
sizes,
labels=labels,
autopct='%1.1f%%', # Display percentage
startangle=140, # Rotate to start at 140 degrees
colors=colors[:len(labels)],
wedgeprops={'linewidth': 1, 'edgecolor': 'white'}, # White edges between segments
textprops={'fontsize': 12}
)
# Draw the center circle to create the donut shape
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig.gca().add_artist(centre_circle)
# Add title
plt.title('People Killed by Race', fontsize=16)
# Ensure the chart is a circle
ax.axis('equal')
# Display the chart
plt.show()
# Count the number of deaths by gender
deaths_by_gender = df_fatalities['gender'].value_counts()
# Display the result to verify
print(deaths_by_gender)
gender
M    2428
F     107
Name: count, dtype: int64
# Prepare labels and values
labels = deaths_by_gender.index.map({'M': 'Men', 'F': 'Women'}) # Convert 'M' and 'F' to readable labels
counts = deaths_by_gender.values
colors = ['#1f77b4', '#ff7f0e'] # Blue for men, orange for women
# Create the bar chart
plt.figure(figsize=(8, 6))
plt.bar(labels, counts, color=colors)
# Add labels and title
plt.title('Total Number of Deaths by Gender', fontsize=16)
plt.xlabel('Gender', fontsize=12)
plt.ylabel('Number of Deaths', fontsize=12)
# Display counts on top of the bars
for i, count in enumerate(counts):
plt.text(i, count + 20, str(count), ha='center', fontsize=12)
# Display the plot
plt.tight_layout()
plt.show()
# Create a copy of the filtered DataFrame to avoid the warning
plot_df = df_fatalities.dropna(subset=['age', 'manner_of_death', 'gender']).copy()
# Now safely map gender abbreviations without warnings
plot_df['gender'] = plot_df['gender'].map({'M': 'Men', 'F': 'Women'})
plt.figure(figsize=(14, 8))
# Box plot
sns.boxplot(data=plot_df, x='manner_of_death', y='age', hue='gender', palette='Set2')
# Swarm plot to show individual data points (palette='dark:black' avoids the deprecated color= gradient behaviour)
sns.swarmplot(data=plot_df, x='manner_of_death', y='age', hue='gender', dodge=True, palette='dark:black', size=3, alpha=0.5)
plt.title('Age Distribution by Manner of Death and Gender with Individual Data Points', fontsize=16)
plt.xlabel('Manner of Death', fontsize=12)
plt.ylabel('Age', fontsize=12)
plt.xticks(rotation=45, ha='right')
# Adjust legend to avoid duplication
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles[:2], labels[:2], title='Gender', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
# Create a new column to classify the armed status
def classify_armed_status(weapon):
if pd.isna(weapon) or weapon.lower() in ['unknown', 'undetermined']:
return 'Unknown'
elif weapon.lower() == 'unarmed':
return 'Unarmed'
elif 'gun' in weapon.lower() or 'rifle' in weapon.lower() or 'pistol' in weapon.lower() or 'firearm' in weapon.lower():
return 'Armed with Gun'
else:
return 'Armed with Other Weapon'
# Apply the classification
df_fatalities['armed_status'] = df_fatalities['armed'].apply(classify_armed_status)
# Verify the new categories
print(df_fatalities['armed_status'].value_counts())
armed_status
Armed with Gun             1411
Armed with Other Weapon     827
Unarmed                     171
Unknown                     126
Name: count, dtype: int64
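To sanity-check the classification, it helps to look at the raw 'armed' values that ended up in the catch-all bucket (a minimal sketch):
# Most common raw weapon descriptions classified as 'Armed with Other Weapon'
other_weapons = df_fatalities.loc[df_fatalities['armed_status'] == 'Armed with Other Weapon', 'armed']
print(other_weapons.value_counts().head(10))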
# Calculate the total counts
armed_counts = df_fatalities['armed_status'].value_counts()
# Calculate percentages
armed_percentages = (armed_counts / armed_counts.sum()) * 100
# Display the percentages
print(armed_percentages)
armed_status
Armed with Gun             55.66
Armed with Other Weapon    32.62
Unarmed                     6.75
Unknown                     4.97
Name: count, dtype: float64
# Set up the pie chart
plt.figure(figsize=(8, 8))
colors = ['#ff9999','#66b3ff','#99ff99','#ffcc99']
# Plot pie chart
plt.pie(
armed_percentages,
labels=armed_percentages.index,
autopct='%1.1f%%',
startangle=140,
colors=colors,
wedgeprops={'linewidth': 1, 'edgecolor': 'white'}
)
# Add title
plt.title('Armed Status in Police Killings', fontsize=16)
# Display the plot
plt.show()
# Filter out entries with missing ages
df_age_clean = df_fatalities.dropna(subset=['age'])
# Total number of people with known ages
total_with_age = len(df_age_clean)
# Number of people under 25
under_25_count = df_age_clean[df_age_clean['age'] < 25].shape[0]
# Calculate the percentage of people under 25
under_25_percentage = (under_25_count / total_with_age) * 100
# Display the result
print(f"Total people with known ages: {total_with_age}")
print(f"Number of people killed under 25: {under_25_count}")
print(f"Percentage of people killed under 25: {under_25_percentage:.2f}%")
Total people with known ages: 2458
Number of people killed under 25: 450
Percentage of people killed under 25: 18.31%
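The same percentage can be computed in one step as the mean of a boolean mask (an equivalent sketch):
# Fraction of known-age victims under 25, expressed as a percentage
print(f"{(df_age_clean['age'] < 25).mean() * 100:.2f}%")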
# Remove entries with missing 'age' values
df_age_clean = df_fatalities.dropna(subset=['age'])
# Check the basic statistics for age
print(df_age_clean['age'].describe())
count   2,458.00
mean       36.61
std        13.03
min         6.00
25%        26.00
50%        34.00
75%        45.00
max        91.00
Name: age, dtype: float64
# Set Seaborn style for better aesthetics
sns.set(style="whitegrid")
# Create the plot
plt.figure(figsize=(12, 7))
# Plot histogram with KDE
sns.histplot(df_age_clean['age'], bins=30, kde=True, color='skyblue', edgecolor='black')
# Customize the plot with titles and labels
plt.title('Distribution of Ages of People Killed by Police', fontsize=16)
plt.xlabel('Age', fontsize=12)
plt.ylabel('Number of Deaths', fontsize=12)
# Display the plot
plt.tight_layout()
plt.show()
# Filter out rows with missing 'age' or 'race' values
df_race_age = df_fatalities.dropna(subset=['age', 'race'])
# Check the unique races present in the dataset
print(df_race_age['race'].value_counts())
# Set Seaborn style
sns.set(style="whitegrid")
# Create KDE plots for each race using displot with col wrapping
g = sns.displot(
data=df_race_age,
x='age',
col='race',
kind='kde',
col_wrap=3, # Wraps plots into multiple rows after 3 plots
fill=True, # Fill under the KDE curve
height=4, # Size of each individual plot
    aspect=1.2  # Aspect ratio of plots; no palette here since it is ignored without a hue variable
)
# Customize titles and labels
g.set_axis_labels("Age", "Density")
g.set_titles("{col_name}") # Sets the title of each subplot as the race name
plt.subplots_adjust(top=0.9)
g.fig.suptitle('Age Distribution of People Killed by Police by Race', fontsize=16)
# Display the plot
plt.show()
race
W    1192
B     609
H     413
A      38
N      31
O      28
Name: count, dtype: int64
# Count the number of fatalities by race
deaths_by_race = df_fatalities['race'].value_counts(dropna=False)
# Replace NaN with 'Unknown' for clarity in the chart
deaths_by_race.index = deaths_by_race.index.fillna('Unknown')
# Display the counts to verify
print(deaths_by_race)
race
W          1201
B           618
H           423
Unknown     195
A            39
N            31
O            28
Name: count, dtype: int64
# Set Seaborn style for aesthetics
sns.set(style="whitegrid")
# Create the bar chart
plt.figure(figsize=(10, 6))
sns.barplot(x=deaths_by_race.index, y=deaths_by_race.values, hue=deaths_by_race.index, palette='Set2', legend=False)
# Add labels and title
plt.title('Total Number of People Killed by Race', fontsize=16)
plt.xlabel('Race', fontsize=12)
plt.ylabel('Number of Deaths', fontsize=12)
# Display the counts on top of each bar
for i, value in enumerate(deaths_by_race.values):
plt.text(i, value + 5, str(value), ha='center', fontsize=10)
# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')
# Display the plot
plt.tight_layout()
plt.show()
# Check unique values in the 'signs_of_mental_illness' column
print(df_fatalities['signs_of_mental_illness'].value_counts(dropna=False))
signs_of_mental_illness
False    1902
True      633
Name: count, dtype: int64
# Total number of people killed
total_killed = df_fatalities.shape[0]
# Number of people with signs of mental illness
mental_illness_count = df_fatalities['signs_of_mental_illness'].sum()
# Calculate the percentage
mental_illness_percentage = (mental_illness_count / total_killed) * 100
# Display the result
print(f"Total people killed: {total_killed}")
print(f"Number of people with signs of mental illness: {mental_illness_count}")
print(f"Percentage of people killed with mental illness: {mental_illness_percentage:.2f}%")
Total people killed: 2535
Number of people with signs of mental illness: 633
Percentage of people killed with mental illness: 24.97%
# Prepare data for visualization
labels = ['Signs of Mental Illness', 'No Signs of Mental Illness']
sizes = [mental_illness_count, total_killed - mental_illness_count]
colors = ['#ff9999', '#66b3ff']
# Create the pie chart
plt.figure(figsize=(8, 8))
plt.pie(
sizes,
labels=labels,
autopct='%1.1f%%',
startangle=140,
colors=colors,
wedgeprops={'linewidth': 1, 'edgecolor': 'white'}
)
# Add title
plt.title('Percentage of People Killed by Police with Signs of Mental Illness', fontsize=16)
# Display the plot
plt.show()
# Get the top 10 cities with the highest number of police killings
top_cities = df_fatalities['city'].value_counts().head(10).index.tolist()
# Filter the DataFrame to include only these top 10 cities
df_top_cities = df_fatalities[df_fatalities['city'].isin(top_cities)]
# Display to verify
print(df_top_cities[['city', 'race']].head())
           city race
12     Columbus    B
40  Los Angeles    H
48    St. Louis    B
52       Austin    W
64      Phoenix    N
# Group by 'city' and 'race' to count police killings by race in each city
killings_by_city_race = df_top_cities.groupby(['city', 'race']).size().unstack(fill_value=0)
# Display the number of killings by race for each city
print(killings_by_city_race)
race         A   B   H  N  O   W
city
Austin       0   3   2  0  0  13
Chicago      0  21   1  0  0   2
Columbus     0  11   0  0  1   6
Houston      1  15   6  1  0   3
Las Vegas    0   3   5  0  0   9
Los Angeles  1  10  19  0  0   6
Miami        0   8   6  0  0   3
Phoenix      0   2  11  3  0  12
San Antonio  0   3  12  0  0   3
St. Louis    0  11   0  0  0   3
# Calculate the percentage of killings by race within each city
killings_by_city_race_percentage = killings_by_city_race.div(killings_by_city_race.sum(axis=1), axis=0) * 100
# Display the percentage table
print(killings_by_city_race_percentage)
race            A      B      H      N     O      W
city
Austin       0.00  16.67  11.11   0.00  0.00  72.22
Chicago      0.00  87.50   4.17   0.00  0.00   8.33
Columbus     0.00  61.11   0.00   0.00  5.56  33.33
Houston      3.85  57.69  23.08   3.85  0.00  11.54
Las Vegas    0.00  17.65  29.41   0.00  0.00  52.94
Los Angeles  2.78  27.78  52.78   0.00  0.00  16.67
Miami        0.00  47.06  35.29   0.00  0.00  17.65
Phoenix      0.00   7.14  39.29  10.71  0.00  42.86
San Antonio  0.00  16.67  66.67   0.00  0.00  16.67
St. Louis    0.00  78.57   0.00   0.00  0.00  21.43
# Set Seaborn style for better aesthetics
sns.set(style="whitegrid")
# Plot the stacked bar chart for racial distribution of police killings
killings_by_city_race_percentage.plot(
kind='bar',
stacked=True,
figsize=(14, 8),
colormap='Set2'
)
# Add titles and labels
plt.title('Racial Distribution of Police Killings in Top 10 Cities', fontsize=16)
plt.xlabel('City', fontsize=12)
plt.ylabel('Percentage of Police Killings (%)', fontsize=12)
plt.xticks(rotation=45, ha='right')
# Add legend
plt.legend(title='Race', bbox_to_anchor=(1.05, 1), loc='upper left')
# Display the plot
plt.tight_layout()
plt.show()
# Group the data by state to count police killings
killings_by_state = df_fatalities['state'].value_counts().reset_index()
killings_by_state.columns = ['state', 'num_killed']
# Display the aggregated data
print(killings_by_state.head())
# Split the string on ',' and keep only the second part (state abbreviation)
killings_by_state['state'] = killings_by_state['state'].apply(lambda x: x.split(',')[-1].strip())
# Remove 'DC' to avoid rendering issues
killings_by_state = killings_by_state[killings_by_state['state'] != 'DC']
# Verify the cleaned state abbreviations
print(killings_by_state['state'].unique())
  state  num_killed
0    CA         424
1    TX         225
2    FL         154
3    AZ         118
4    OH          79
['CA' 'TX' 'FL' 'AZ' 'OH' 'OK' 'CO' 'GA' 'NC' 'MO' 'WA' 'IL' 'TN' 'LA' 'NM'
 'PA' 'AL' 'VA' 'NY' 'SC' 'WI' 'KY' 'IN' 'NV' 'MD' 'OR' 'MI' 'NJ' 'MN' 'WV'
 'AR' 'KS' 'MS' 'UT' 'MA' 'ID' 'NE' 'AK' 'ME' 'IA' 'MT' 'HI' 'SD' 'CT' 'WY'
 'DE' 'NH' 'ND' 'VT' 'RI']
print(killings_by_state)
   state  num_killed
0     CA         424
1     TX         225
2     FL         154
3     AZ         118
4     OH          79
5     OK          78
6     CO          74
7     GA          70
8     NC          69
9     MO          64
10    WA          62
11    IL          62
12    TN          59
13    LA          57
14    NM          51
15    PA          51
16    AL          50
17    VA          47
18    NY          45
19    SC          44
20    WI          43
21    KY          43
22    IN          43
23    NV          42
24    MD          38
25    OR          38
26    MI          37
27    NJ          35
28    MN          32
29    WV          27
30    AR          26
31    KS          24
32    MS          23
33    UT          23
34    MA          22
35    ID          17
36    NE          15
37    AK          15
38    ME          13
39    IA          12
40    MT          11
41    HI          11
43    SD          10
44    CT           9
45    WY           8
46    DE           8
47    NH           7
48    ND           4
49    VT           3
50    RI           2
# Create the choropleth map with cleaned state codes
fig = px.choropleth(
killings_by_state,
locations='state', # Use cleaned two-letter state abbreviations
locationmode='USA-states', # Recognize U.S. state abbreviations
color='num_killed', # Color based on the number of killings
color_continuous_scale='Reds', # Use a red gradient for intensity
scope='usa', # Focus on U.S. map
labels={'num_killed': 'Number of Killings'},
title='Police Killings by U.S. State'
)
import plotly.io as pio
# Set the default renderer to 'iframe' so the figure displays inside the notebook
pio.renderers.default = 'iframe'
# Now plot your figure again
fig.show()
fig.write_html("police_killings_map.html")
# Group by month and count the number of police killings
killings_over_time = df_fatalities.groupby(df_fatalities['date'].dt.to_period('M')).size().reset_index(name='num_killed')
# Convert 'date' back to datetime for plotting
killings_over_time['date'] = killings_over_time['date'].dt.to_timestamp()
# Display the aggregated data
print(killings_over_time.head())
        date  num_killed
0 2015-01-01          70
1 2015-02-01          78
2 2015-03-01          87
3 2015-04-01          88
4 2015-05-01          85
# Group by year and count the number of police killings
killings_by_year = df_fatalities.groupby(df_fatalities['date'].dt.year).size().reset_index(name='num_killed')
# Rename columns for clarity
killings_by_year.columns = ['year', 'num_killed']
# Display the yearly data
print(killings_by_year)
   year  num_killed
0  2015         991
1  2016         963
2  2017         581
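The 2017 total looks much lower, most likely because this snapshot of the dataset ends partway through 2017 rather than because killings dropped; checking the latest recorded date confirms the coverage window (a minimal sketch):
# Earliest and latest dates covered by the dataset
print(f"Data covers {df_fatalities['date'].min():%Y-%m-%d} to {df_fatalities['date'].max():%Y-%m-%d}")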
# Line plot for monthly trends
fig = px.line(
killings_over_time,
x='date',
y='num_killed',
title='Number of Police Killings Over Time (Monthly)',
labels={'num_killed': 'Number of Killings', 'date': 'Date'},
markers=True
)
# Customize the layout
fig.update_layout(xaxis_title='Date', yaxis_title='Number of Police Killings', template='plotly_white')
# Display the plot
fig.show()
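Month-to-month counts are noisy; a short rolling average makes the underlying trend easier to see (a sketch reusing killings_over_time from above):
# Add a 3-month rolling mean to smooth the monthly series
killings_over_time['rolling_3m'] = killings_over_time['num_killed'].rolling(window=3).mean()
fig = px.line(
    killings_over_time,
    x='date',
    y=['num_killed', 'rolling_3m'],
    title='Monthly Police Killings with 3-Month Rolling Average',
    labels={'value': 'Number of Killings', 'date': 'Date'}
)
fig.show()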