Project 2: Boston Crime DatasetĀ¶
Task 1: Dataset Selection and ConfirmationĀ¶
We chose the dataset: https://data.boston.gov/dataset/crime-incident-reports-august-2015-to-date-source-new-system
InĀ [1]:
# #Importing all Excel Years into one merged document
# import pandas as pd
# import glob
# path = r'C:\Users\Adam\OneDrive - Northeastern University\IE6600\Proj 2\*.csv'
# all_files = glob.glob(path)
# all_data = []
# for filename in all_files:
# all_data.append(pd.read_csv(filename, low_memory=False)) # Ignore the warning
# merged_df = pd.concat(all_data)
# merged_df.to_csv('merged_data.csv', index=False)
Task 2: Data Acquisition and InspectionĀ¶
InĀ [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
# Settings the warnings to be ignored
warnings.filterwarnings('ignore')
InĀ [3]:
#Step 1: read the dataset
missing_values = [' ','NaN']
df = pd.read_csv('merged_data.csv', na_values = missing_values)
df.head()
Out[3]:
INCIDENT_NUMBER | OFFENSE_CODE | OFFENSE_CODE_GROUP | OFFENSE_DESCRIPTION | DISTRICT | REPORTING_AREA | SHOOTING | OCCURRED_ON_DATE | YEAR | MONTH | DAY_OF_WEEK | HOUR | UCR_PART | STREET | Lat | Long | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | I172040657 | 2629 | Harassment | HARASSMENT | C11 | 397.0 | NaN | 2015-06-15 00:00:00 | 2015 | 6 | Monday | 0 | Part Two | MELBOURNE ST | 42.291093 | -71.065945 | (42.29109287, -71.06594539) |
1 | I182061268 | 3201 | Property Lost | PROPERTY - LOST | NaN | NaN | NaN | 2015-06-15 00:00:00 | 2015 | 6 | Monday | 0 | Part Three | BERNARD | NaN | NaN | NaN |
2 | I162013546 | 3201 | Property Lost | PROPERTY - LOST | B3 | 433.0 | NaN | 2015-06-15 00:00:00 | 2015 | 6 | Monday | 0 | Part Three | NORFOLK ST | 42.283634 | -71.082813 | (42.28363434, -71.08281320) |
3 | I152051083 | 3115 | Investigate Person | INVESTIGATE PERSON | A7 | 20.0 | NaN | 2015-06-15 00:00:00 | 2015 | 6 | Monday | 0 | Part Three | PARIS ST | 42.377023 | -71.032247 | (42.37702319, -71.03224730) |
4 | I152059178 | 2647 | Other | THREATS TO DO BODILY HARM | C11 | 359.0 | NaN | 2015-06-15 00:00:00 | 2015 | 6 | Monday | 0 | Part Two | WASHINGTON ST | 42.293606 | -71.071887 | (42.29360585, -71.07188650) |
InĀ [4]:
#Step 2a: Initial inspection
df.shape
Out[4]:
(765338, 17)
InĀ [5]:
#Step 2b: Initial inspection
df.describe(include=object)
Out[5]:
INCIDENT_NUMBER | OFFENSE_CODE_GROUP | OFFENSE_DESCRIPTION | DISTRICT | REPORTING_AREA | SHOOTING | OCCURRED_ON_DATE | DAY_OF_WEEK | UCR_PART | STREET | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 765338 | 353253 | 765338 | 760752 | 634917.0 | 413540 | 765338 | 765338 | 353156 | 753450 | 722817 |
unique | 723265 | 67 | 305 | 14 | 1759.0 | 5 | 591167 | 7 | 4 | 19794 | 71441 |
top | I152071596 | Motor Vehicle Accident Response | INVESTIGATE PERSON | B2 | 355.0 | 0 | 2016-08-01 00:00:00 | Friday | Part Three | WASHINGTON ST | (42.29755532959655, -71.05970910242573) |
freq | 20 | 41064 | 57185 | 114005 | 10058.0 | 400568 | 33 | 116718 | 176042 | 43879 | 5862 |
InĀ [6]:
#Step2c: Determine why there are 5 values of Shooting
df.SHOOTING.unique()
Out[6]:
array([nan, 'Y', '0', '1', 0, 1], dtype=object)
InĀ [7]:
#Step 2d: Initial inspection
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 765338 entries, 0 to 765337 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 INCIDENT_NUMBER 765338 non-null object 1 OFFENSE_CODE 765338 non-null int64 2 OFFENSE_CODE_GROUP 353253 non-null object 3 OFFENSE_DESCRIPTION 765338 non-null object 4 DISTRICT 760752 non-null object 5 REPORTING_AREA 634917 non-null object 6 SHOOTING 413540 non-null object 7 OCCURRED_ON_DATE 765338 non-null object 8 YEAR 765338 non-null int64 9 MONTH 765338 non-null int64 10 DAY_OF_WEEK 765338 non-null object 11 HOUR 765338 non-null int64 12 UCR_PART 353156 non-null object 13 STREET 753450 non-null object 14 Lat 722817 non-null float64 15 Long 722817 non-null float64 16 Location 722817 non-null object dtypes: float64(2), int64(4), object(11) memory usage: 99.3+ MB
Task 3: Data Cleaning and PrepartionĀ¶
InĀ [8]:
#Step1: Drop Unneccesary Columns
df = df.drop(columns=['OFFENSE_DESCRIPTION', 'Location'])
df.columns
Out[8]:
Index(['INCIDENT_NUMBER', 'OFFENSE_CODE', 'OFFENSE_CODE_GROUP', 'DISTRICT', 'REPORTING_AREA', 'SHOOTING', 'OCCURRED_ON_DATE', 'YEAR', 'MONTH', 'DAY_OF_WEEK', 'HOUR', 'UCR_PART', 'STREET', 'Lat', 'Long'], dtype='object')
InĀ [9]:
#Step2: Convert Date
df['OCCURRED_ON_DATE'] = pd.to_datetime(df['OCCURRED_ON_DATE'], errors='coerce')
df['OCCURRED_ON_DATE'].info()
<class 'pandas.core.series.Series'> RangeIndex: 765338 entries, 0 to 765337 Series name: OCCURRED_ON_DATE Non-Null Count Dtype -------------- ----- 656904 non-null datetime64[ns] dtypes: datetime64[ns](1) memory usage: 5.8 MB
InĀ [10]:
#Step3: Convert Columns to lowercase
df.columns = df.columns.str.lower()
df.offense_code_group = df.offense_code_group.str.lower()
df.head()
Out[10]:
incident_number | offense_code | offense_code_group | district | reporting_area | shooting | occurred_on_date | year | month | day_of_week | hour | ucr_part | street | lat | long | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | I172040657 | 2629 | harassment | C11 | 397.0 | NaN | 2015-06-15 | 2015 | 6 | Monday | 0 | Part Two | MELBOURNE ST | 42.291093 | -71.065945 |
1 | I182061268 | 3201 | property lost | NaN | NaN | NaN | 2015-06-15 | 2015 | 6 | Monday | 0 | Part Three | BERNARD | NaN | NaN |
2 | I162013546 | 3201 | property lost | B3 | 433.0 | NaN | 2015-06-15 | 2015 | 6 | Monday | 0 | Part Three | NORFOLK ST | 42.283634 | -71.082813 |
3 | I152051083 | 3115 | investigate person | A7 | 20.0 | NaN | 2015-06-15 | 2015 | 6 | Monday | 0 | Part Three | PARIS ST | 42.377023 | -71.032247 |
4 | I152059178 | 2647 | other | C11 | 359.0 | NaN | 2015-06-15 | 2015 | 6 | Monday | 0 | Part Two | WASHINGTON ST | 42.293606 | -71.071887 |
InĀ [11]:
#Step 4: Investigare Null Values - before fill
df.isnull().sum()
Out[11]:
incident_number 0 offense_code 0 offense_code_group 412085 district 4586 reporting_area 130421 shooting 351798 occurred_on_date 108434 year 0 month 0 day_of_week 0 hour 0 ucr_part 412182 street 11888 lat 42521 long 42521 dtype: int64
InĀ [12]:
#Fill N/A values based on the Offense Code
df.sort_values('offense_code', inplace=True)
df['offense_code_group'].fillna(method='ffill', inplace = True)
InĀ [13]:
#Fill NA values for shootings based on mode
# BUT first -> replace 'Y' with 1, '1' with 1, and '0' and'NAN' with 0
df['shooting'].replace({'Y': 1, '1': 1,'0':0, 'NAN': 0}, inplace=True)
df['shooting'].fillna(df['shooting'].mode()[0], inplace=True)
InĀ [14]:
#Fill N/A values based on the district and street using bfill
df.sort_values(['district','street'], inplace=True)
df['reporting_area'].fillna(method='bfill', inplace = True)
df['lat'].fillna(method='bfill', inplace = True)
df['long'].fillna(method='bfill', inplace = True)
df['street'].fillna(method='ffill', inplace = True)
InĀ [15]:
# Fill in the NA values for Occurred on Date based on year, month, random day
import datetime
import random
# Function to generate a random date within a given month and year
def generate_random_date(row):
if pd.isna(row['occurred_on_date']):
day = random.randint(1, 28) # Assuming a safe range to avoid invalid dates
return datetime.datetime(int(row['year']), int(row['month']), day).strftime('%Y-%m-%d')
else:
return row['occurred_on_date']
# Apply the function to the DataFrame
df['occurred_on_date'] = df.apply(generate_random_date, axis=1)
InĀ [16]:
#Fill the rest by Mode
df['ucr_part'].fillna(df['ucr_part'].mode()[0], inplace=True)
df['offense_code_group'].fillna(df['offense_code_group'].mode()[0], inplace=True)
df['district'].fillna(df['district'].mode()[0], inplace=True)
df['reporting_area'].fillna(df['reporting_area'].mode()[0], inplace=True)
InĀ [18]:
#Step 4: Investigare Null Values - after fill
df.isnull().sum()
Out[18]:
incident_number 0 offense_code 0 offense_code_group 0 district 0 reporting_area 0 shooting 0 occurred_on_date 0 year 0 month 0 day_of_week 0 hour 0 ucr_part 0 street 0 lat 0 long 0 dtype: int64
Task 4: AnalysisĀ¶
InĀ [19]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#Step 1: Convert date
df['year_incident'] = pd.to_datetime(df['occurred_on_date']).dt.year
#Step 2: Group by district and month
district_incidents = df.groupby(['district', 'year']).size().reset_index(name='incident_count')
#Step 5: Plot
plt.figure(figsize=(15, 15))
sns.boxplot(x='district', y='incident_count', data=district_incidents)
plt.title('Incident Count per District per Year')
plt.xlabel('District')
plt.ylabel('Incident Count')
plt.show()
InĀ [20]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import calendar
df['occurred_on_date'] = pd.to_datetime(df['occurred_on_date'])
df['day_incident'] = df['occurred_on_date'].dt.day
district_incidents = df.groupby(['month', 'day_incident']).size().reset_index(name='incident_count')
monthly_incidents = district_incidents.pivot(index='month', columns='day_incident', values='incident_count').fillna(0)
month_names = [calendar.month_name[i] for i in range(1, 13)]
plt.figure(figsize=(12, 8))
ax = sns.heatmap(monthly_incidents, cmap='coolwarm', fmt=".0f", linewidths=.5)
ax.set_yticklabels(month_names[:len(monthly_incidents.index)], rotation=0) # Adjust list slicing based on your data
plt.title('Heatmap of Incidents by Day and Month')
plt.xlabel('Day of Month')
plt.ylabel('Month')
plt.show()
InĀ [21]:
import seaborn as sns
import numpy as np
ax = sns.histplot(df.district)
ax.set_xticklabels(ax.get_xticklabels(), fontsize = 8, rotation = 45)
plt.title('Bar Chart of Crime Rate Count in Each District')
plt.xlabel('Boston District')
plt.ylabel('Count of Crimes')
# plt.axhline(y=np.mean(df.district), color='red', label='Mean')
plt.show()
InĀ [22]:
years = [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
months_name = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
colors = ['green', 'red', 'blue', 'cyan', 'pink', 'black', 'grey', 'yellow', 'turquoise', 'orange', 'darkgreen', 'darkred']
df_aggregated = df.groupby(['year', 'month']).size().reset_index(name='count')
df_aggregated = df_aggregated.loc[1:len(df_aggregated)-2, :]
plt.figure(figsize=(15, 10))
for year, color in zip(years, colors):
ax = sns.lineplot(
x='month',
y='count',
data=df_aggregated[df_aggregated['year'] == year],
label=str(year),
color=color)
avg_monthly_count = df_aggregated.groupby('month')['count'].mean()
plt.plot(range(1, 13), avg_monthly_count, label='Average', color='purple', linestyle='--')
plt.axvline(x=4, color='red', linestyle='dashdot', label='April 2020')
plt.title('Monthly Crime Counts by Year')
plt.xlabel('Months Discrete')
plt.ylabel('Count of Crimes')
plt.legend(loc='lower right', title='Years')
plt.show()
InĀ [24]:
import numpy as np
plt.figure(figsize=(16, 16))
ax = sns.barplot(y=df.offense_code_group.value_counts().index, x=df.offense_code_group.value_counts())
ax.set_xticklabels(ax.get_xticklabels(), fontsize = 8, rotation = 90)
plt.title('Bar Chart of Each Offense Code Group')
plt.xlabel('Type of Offense')
plt.ylabel('Count')
plt.axvline(x= np.max(df.offense_code_group.value_counts()), color ='red', label = 'Highest Type of Offense')
plt.axvline(x= np.mean(df.offense_code_group.value_counts()), color ='green', label = 'Mean ')
plt.legend(loc='lower right')
plt.show()
InĀ [25]:
df_district = df.groupby(['year', 'district']).size().reset_index(name='count')
districts = df['district'].unique()
colors = sns.color_palette('husl', n_colors=len(districts))
plt.figure(figsize=(15,10))
for district, color in zip(districts, colors):
ax = sns.lineplot(
x='year',
y='count',
data=df_district[df_district['district']==district],
label = district,
color=color)
plt.title('Crime Counts by District Over the Years')
plt.xlabel('Years')
plt.ylabel('Count of Crimes')
plt.legend(title='Districts')
Out[25]:
<matplotlib.legend.Legend at 0x17610733710>
InĀ [43]:
import seaborn as sns
import matplotlib.pyplot as plt
df['occurred_on_date'] = pd.to_datetime(df['occurred_on_date'])
df['hour'] = df['occurred_on_date'].dt.hour
df['day_of_week'] = df['occurred_on_date'].dt.dayofweek
crime_counts = df.groupby(['hour', 'day_of_week']).size().unstack(fill_value=0)
plt.figure(figsize=(8, 8))
sns.heatmap(crime_counts, linewidths=.5, linecolor='black', cmap='coolwarm')
plt.title('Heatmap of Crime Counts by Hour and Day of the Week Overall')
plt.xlabel('Day of the Week')
plt.ylabel('Hour of the Day')
plt.xticks(ticks=np.arange(7)+.5, labels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
InĀ [25]:
import seaborn as sns
import matplotlib.pyplot as plt
df['occurred_on_date'] = pd.to_datetime(df['occurred_on_date'])
df['hour'] = df['occurred_on_date'].dt.hour
df['day_of_week'] = df['occurred_on_date'].dt.dayofweek
df_filtered = df[df['hour'] != 0]
crime_counts = df_filtered.groupby(['hour', 'day_of_week']).size().unstack(fill_value=0)
plt.figure(figsize=(8, 8))
sns.heatmap(crime_counts, linewidths=.5, linecolor='black', cmap='coolwarm')
plt.title('Heatmap of Crime Counts by Hour and Day of the Week Filtered')
plt.xlabel('Day of the Week')
plt.ylabel('Hour of the Day')
plt.xticks(ticks=np.arange(7)+.5, labels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
InĀ [26]:
#2015- 2019 graph
import seaborn as sns
import matplotlib.pyplot as plt
df['occurred_on_date'] = pd.to_datetime(df['occurred_on_date'])
df_filtered = df[(df['occurred_on_date'].dt.year >= 2015) & (df['occurred_on_date'].dt.year <= 2019)&
(df['occurred_on_date'].dt.hour != 0)]
df_filtered['hour'] = df_filtered['occurred_on_date'].dt.hour
df_filtered['day_of_week'] = df_filtered['occurred_on_date'].dt.dayofweek
crime_counts = df_filtered.groupby(['hour', 'day_of_week']).size().unstack(fill_value=0)
plt.figure(figsize=(8, 8))
sns.heatmap(crime_counts, linewidths=.5, linecolor='black', cmap='coolwarm')
plt.title('Heatmap of Crime Counts by Hour and Day of the Week for 2015 to 2019 Filtered')
plt.xlabel('Day of the Week')
plt.ylabel('Hour of the Day')
plt.xticks(ticks=np.arange(7)+.5, labels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
InĀ [42]:
#2020 graph
import seaborn as sns
import matplotlib.pyplot as plt
df['occurred_on_date'] = pd.to_datetime(df['occurred_on_date'])
df_filtered = df[(df['occurred_on_date'].dt.year == 2020)&
(df['occurred_on_date'].dt.hour != 0)]
df_filtered['hour'] = df_filtered['occurred_on_date'].dt.hour
df_filtered['day_of_week'] = df_filtered['occurred_on_date'].dt.dayofweek
crime_counts = df_filtered.groupby(['hour', 'day_of_week']).size().unstack(fill_value=0)
plt.figure(figsize=(8, 8))
sns.heatmap(crime_counts, linewidths=.5, linecolor='black', cmap='coolwarm')
plt.title('Heatmap of Crime Counts by Hour and Day of the Week for 2020 Filtered')
plt.xlabel('Day of the Week')
plt.ylabel('Hour of the Day')
plt.xticks(ticks=np.arange(7)+.5, labels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
InĀ [40]:
#2021 to present graph
import seaborn as sns
import matplotlib.pyplot as plt
df['occurred_on_date'] = pd.to_datetime(df['occurred_on_date'])
df_filtered = df[df['hour'] != 0]
df_filtered = df[(df['occurred_on_date'].dt.year > 2020) & (df['occurred_on_date'].dt.year <= 2024)&
(df['occurred_on_date'].dt.hour != 0)]
df_filtered['hour'] = df_filtered['occurred_on_date'].dt.hour
df_filtered['day_of_week'] = df_filtered['occurred_on_date'].dt.dayofweek
crime_counts = df_filtered.groupby(['hour', 'day_of_week']).size().unstack(fill_value=0)
plt.figure(figsize=(8, 8))
sns.heatmap(crime_counts, linewidths=.5, linecolor='black', cmap='coolwarm')
plt.title('Heatmap of Crime Counts by Hour and Day of the Week for 2021 to Present Filtered')
plt.xlabel('Day of the Week')
plt.ylabel('Hour of the Day')
plt.xticks(ticks=np.arange(7)+.5, labels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
InĀ [51]:
df_top_offenses['day_of_week'] = df_top_offenses['occurred_on_date'].dt.dayofweek
crime_counts_by_day = df_top_offenses.groupby(['offense_code_group', 'day_of_week']).size().unstack(fill_value=0)
plt.figure(figsize=(8, 4))
sns.heatmap(crime_counts_by_day, linewidths=.5, cmap='coolwarm')
plt.title('Heatmap of Crime Counts by Day of the Week and Offense Type')
plt.ylabel('Offense Type')
plt.xlabel('Day of the Week')
#plt.yticks(rota)
plt.xticks(ticks=np.arange(7)+0.5, labels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], rotation =45)
plt.tight_layout()
plt.show()
InĀ [38]:
plt.figure(figsize=(15,20))
crime_count_location = df.groupby(['offense_code_group', 'district']).size().unstack(fill_value=0)
sns.heatmap(crime_count_location, linewidths=.5, cmap='Pastel1')
plt.xlabel('Districts')
plt.ylabel('Offense Code Groups')
plt.title('Offense Code Groups Counts over All Districts')
plt.savefig('OCG.png')
InĀ [53]:
plt.figure(figsize=(10,8))
top_offenses = df['offense_code_group'].value_counts().nlargest(7).index
df_top_offenses = df[df['offense_code_group'].isin(top_offenses)]
crime_count_location = df_top_offenses.groupby(['offense_code_group', 'district']).size().unstack(fill_value=0)
sns.heatmap(crime_count_location, linewidths=.5, cmap='Pastel1')
plt.xlabel('Districts')
plt.ylabel('Top 7 Offense Code Groups')
plt.title('Offense Code Groups Counts over All Districts')
Out[53]:
Text(0.5, 1.0, 'Offense Code Groups Counts over All Districts')
InĀ [127]:
import plotly.express as px
df_filtered = df[df['year'] != 2024]
# Aggregating data by district, year, and month
df_aggregated = df_filtered.groupby(['district', 'year', 'month']).size().reset_index(name='total_crimes')
# Create the animated line plot
fig = px.line(df_aggregated,
x="month",
y="total_crimes",
color="district",
animation_frame="year",
labels={'total_crimes': 'Total Crimes', 'month': 'Month', 'year': 'Year', 'district': 'District'},
title="Total Crimes Over Months by District (Excluding Year 2024)"
)
# Update layout
fig.update_layout(xaxis_title="Month",
yaxis_title="Total Crimes",
legend_title="District",
showlegend=True)