# #Importing all Excel Years into one merged document
# import pandas as pd
# import glob

# path = r'C:\Users\Adam\OneDrive - Northeastern University\IE6600\Proj 2\*.csv'
# all_files = glob.glob(path)

# all_data = []

# for filename in all_files:
#     all_data.append(pd.read_csv(filename, low_memory=False))  # Ignore the warning
# merged_df = pd.concat(all_data)
# merged_df.to_csv('merged_data.csv', index=False)

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings 
# Settings the warnings to be ignored 
warnings.filterwarnings('ignore')

#Step 1: read the dataset
missing_values = [' ','NaN']
df = pd.read_csv('merged_data.csv',  na_values = missing_values)
df.head()

#Step 2a: Initial inspection 
df.shape

(765338, 17)

#Step 2b: Initial inspection 
df.describe(include=object)

#Step2c: Determine why there are 5 values of Shooting
df.SHOOTING.unique()

array([nan, 'Y', '0', '1', 0, 1], dtype=object)

#Step 2d: Initial inspection 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 765338 entries, 0 to 765337
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   INCIDENT_NUMBER      765338 non-null  object 
 1   OFFENSE_CODE         765338 non-null  int64  
 2   OFFENSE_CODE_GROUP   353253 non-null  object 
 3   OFFENSE_DESCRIPTION  765338 non-null  object 
 4   DISTRICT             760752 non-null  object 
 5   REPORTING_AREA       634917 non-null  object 
 6   SHOOTING             413540 non-null  object 
 7   OCCURRED_ON_DATE     765338 non-null  object 
 8   YEAR                 765338 non-null  int64  
 9   MONTH                765338 non-null  int64  
 10  DAY_OF_WEEK          765338 non-null  object 
 11  HOUR                 765338 non-null  int64  
 12  UCR_PART             353156 non-null  object 
 13  STREET               753450 non-null  object 
 14  Lat                  722817 non-null  float64
 15  Long                 722817 non-null  float64
 16  Location             722817 non-null  object 
dtypes: float64(2), int64(4), object(11)
memory usage: 99.3+ MB

#Step1: Drop Unneccesary Columns
df = df.drop(columns=['OFFENSE_DESCRIPTION', 'Location'])
df.columns

Index(['INCIDENT_NUMBER', 'OFFENSE_CODE', 'OFFENSE_CODE_GROUP', 'DISTRICT',
       'REPORTING_AREA', 'SHOOTING', 'OCCURRED_ON_DATE', 'YEAR', 'MONTH',
       'DAY_OF_WEEK', 'HOUR', 'UCR_PART', 'STREET', 'Lat', 'Long'],
      dtype='object')

#Step2: Convert Date
df['OCCURRED_ON_DATE'] = pd.to_datetime(df['OCCURRED_ON_DATE'], errors='coerce')
df['OCCURRED_ON_DATE'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 765338 entries, 0 to 765337
Series name: OCCURRED_ON_DATE
Non-Null Count   Dtype         
--------------   -----         
656904 non-null  datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 5.8 MB

#Step3: Convert Columns to lowercase
df.columns = df.columns.str.lower()
df.offense_code_group = df.offense_code_group.str.lower()
df.head()

#Step 4: Investigare Null Values - before fill
df.isnull().sum()

incident_number            0
offense_code               0
offense_code_group    412085
district                4586
reporting_area        130421
shooting              351798
occurred_on_date      108434
year                       0
month                      0
day_of_week                0
hour                       0
ucr_part              412182
street                 11888
lat                    42521
long                   42521
dtype: int64

#Fill N/A values based on the Offense Code
df.sort_values('offense_code', inplace=True)
df['offense_code_group'].fillna(method='ffill', inplace = True)

#Fill NA values for shootings based on mode
# BUT first -> replace 'Y' with 1, '1' with 1, and '0' and'NAN' with 0
df['shooting'].replace({'Y': 1, '1': 1,'0':0, 'NAN': 0}, inplace=True)
df['shooting'].fillna(df['shooting'].mode()[0], inplace=True)

#Fill N/A values based on the district and street using bfill
df.sort_values(['district','street'], inplace=True)
df['reporting_area'].fillna(method='bfill', inplace = True)    
df['lat'].fillna(method='bfill', inplace = True)    
df['long'].fillna(method='bfill', inplace = True)    
df['street'].fillna(method='ffill', inplace = True)

# Fill in the NA values for Occurred on Date based on year, month, random day
import datetime
import random

# Function to generate a random date within a given month and year
def generate_random_date(row):
    if pd.isna(row['occurred_on_date']):
        day = random.randint(1, 28)  # Assuming a safe range to avoid invalid dates
        return datetime.datetime(int(row['year']), int(row['month']), day).strftime('%Y-%m-%d')
    else:
        return row['occurred_on_date']

# Apply the function to the DataFrame
df['occurred_on_date'] = df.apply(generate_random_date, axis=1)

#Fill the rest by Mode
df['ucr_part'].fillna(df['ucr_part'].mode()[0], inplace=True)
df['offense_code_group'].fillna(df['offense_code_group'].mode()[0], inplace=True)
df['district'].fillna(df['district'].mode()[0], inplace=True)
df['reporting_area'].fillna(df['reporting_area'].mode()[0], inplace=True)

#Step 4: Investigare Null Values - after fill
df.isnull().sum()

incident_number       0
offense_code          0
offense_code_group    0
district              0
reporting_area        0
shooting              0
occurred_on_date      0
year                  0
month                 0
day_of_week           0
hour                  0
ucr_part              0
street                0
lat                   0
long                  0
dtype: int64

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#Step 1: Convert date
df['year_incident'] = pd.to_datetime(df['occurred_on_date']).dt.year
#Step 2: Group by district and month 
district_incidents = df.groupby(['district', 'year']).size().reset_index(name='incident_count')

#Step 5: Plot
plt.figure(figsize=(15, 15))
sns.boxplot(x='district', y='incident_count', data=district_incidents)
plt.title('Incident Count per District per Year')
plt.xlabel('District')
plt.ylabel('Incident Count')
plt.show()

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import calendar

df['occurred_on_date'] = pd.to_datetime(df['occurred_on_date'])
df['day_incident'] = df['occurred_on_date'].dt.day

district_incidents = df.groupby(['month', 'day_incident']).size().reset_index(name='incident_count')

monthly_incidents = district_incidents.pivot(index='month', columns='day_incident', values='incident_count').fillna(0)

month_names = [calendar.month_name[i] for i in range(1, 13)]

plt.figure(figsize=(12, 8))
ax = sns.heatmap(monthly_incidents, cmap='coolwarm', fmt=".0f", linewidths=.5)
ax.set_yticklabels(month_names[:len(monthly_incidents.index)], rotation=0)  # Adjust list slicing based on your data
plt.title('Heatmap of Incidents by Day and Month')
plt.xlabel('Day of Month')
plt.ylabel('Month')
plt.show()

import seaborn as sns
import numpy as np

ax = sns.histplot(df.district)
ax.set_xticklabels(ax.get_xticklabels(), fontsize = 8, rotation = 45)
plt.title('Bar Chart of Crime Rate Count in Each District')
plt.xlabel('Boston District')
plt.ylabel('Count of Crimes')

# plt.axhline(y=np.mean(df.district), color='red', label='Mean')
plt.show()

years = [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
months_name = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
colors = ['green', 'red', 'blue', 'cyan', 'pink', 'black', 'grey', 'yellow', 'turquoise', 'orange', 'darkgreen', 'darkred']

df_aggregated = df.groupby(['year', 'month']).size().reset_index(name='count')
df_aggregated = df_aggregated.loc[1:len(df_aggregated)-2, :]

plt.figure(figsize=(15, 10))

for year, color in zip(years, colors):
    ax = sns.lineplot(
        x='month',
        y='count',
        data=df_aggregated[df_aggregated['year'] == year],
        label=str(year),
        color=color)

avg_monthly_count = df_aggregated.groupby('month')['count'].mean()
plt.plot(range(1, 13), avg_monthly_count, label='Average', color='purple', linestyle='--')
plt.axvline(x=4, color='red', linestyle='dashdot', label='April 2020')


    
plt.title('Monthly Crime Counts by Year')
plt.xlabel('Months Discrete')
plt.ylabel('Count of Crimes')
plt.legend(loc='lower right', title='Years')
plt.show()

import numpy as np
plt.figure(figsize=(16, 16))

ax = sns.barplot(y=df.offense_code_group.value_counts().index, x=df.offense_code_group.value_counts())
ax.set_xticklabels(ax.get_xticklabels(), fontsize = 8, rotation = 90)
plt.title('Bar Chart of Each Offense Code Group')
plt.xlabel('Type of Offense')
plt.ylabel('Count')

plt.axvline(x= np.max(df.offense_code_group.value_counts()), color ='red', label = 'Highest Type of Offense')
plt.axvline(x= np.mean(df.offense_code_group.value_counts()), color ='green', label = 'Mean ')
plt.legend(loc='lower right')
plt.show()

df_district = df.groupby(['year', 'district']).size().reset_index(name='count')
districts = df['district'].unique()
colors = sns.color_palette('husl', n_colors=len(districts))

plt.figure(figsize=(15,10))
for district, color in zip(districts, colors):
    ax = sns.lineplot(
        x='year',
        y='count',
        data=df_district[df_district['district']==district],
        label = district,
        color=color)
    
plt.title('Crime Counts by District Over the Years')
plt.xlabel('Years')
plt.ylabel('Count of Crimes')
plt.legend(title='Districts')

<matplotlib.legend.Legend at 0x17610733710>

import seaborn as sns
import matplotlib.pyplot as plt

df['occurred_on_date'] = pd.to_datetime(df['occurred_on_date'])
df['hour'] = df['occurred_on_date'].dt.hour
df['day_of_week'] = df['occurred_on_date'].dt.dayofweek  

crime_counts = df.groupby(['hour', 'day_of_week']).size().unstack(fill_value=0)

plt.figure(figsize=(8, 8))
sns.heatmap(crime_counts, linewidths=.5, linecolor='black', cmap='coolwarm')
plt.title('Heatmap of Crime Counts by Hour and Day of the Week Overall')
plt.xlabel('Day of the Week')
plt.ylabel('Hour of the Day')
plt.xticks(ticks=np.arange(7)+.5, labels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

import seaborn as sns
import matplotlib.pyplot as plt

df['occurred_on_date'] = pd.to_datetime(df['occurred_on_date'])
df['hour'] = df['occurred_on_date'].dt.hour
df['day_of_week'] = df['occurred_on_date'].dt.dayofweek  
df_filtered = df[df['hour'] != 0]

crime_counts = df_filtered.groupby(['hour', 'day_of_week']).size().unstack(fill_value=0)

plt.figure(figsize=(8, 8))
sns.heatmap(crime_counts, linewidths=.5, linecolor='black', cmap='coolwarm')
plt.title('Heatmap of Crime Counts by Hour and Day of the Week Filtered')
plt.xlabel('Day of the Week')
plt.ylabel('Hour of the Day')
plt.xticks(ticks=np.arange(7)+.5, labels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

#2015- 2019 graph
import seaborn as sns
import matplotlib.pyplot as plt

df['occurred_on_date'] = pd.to_datetime(df['occurred_on_date'])
df_filtered = df[(df['occurred_on_date'].dt.year >= 2015) & (df['occurred_on_date'].dt.year <= 2019)& 
                 (df['occurred_on_date'].dt.hour != 0)]
df_filtered['hour'] = df_filtered['occurred_on_date'].dt.hour
df_filtered['day_of_week'] = df_filtered['occurred_on_date'].dt.dayofweek  

crime_counts = df_filtered.groupby(['hour', 'day_of_week']).size().unstack(fill_value=0)

plt.figure(figsize=(8, 8))
sns.heatmap(crime_counts, linewidths=.5, linecolor='black', cmap='coolwarm')
plt.title('Heatmap of Crime Counts by Hour and Day of the Week for 2015 to 2019 Filtered')
plt.xlabel('Day of the Week')
plt.ylabel('Hour of the Day')
plt.xticks(ticks=np.arange(7)+.5, labels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

#2020 graph
import seaborn as sns
import matplotlib.pyplot as plt

df['occurred_on_date'] = pd.to_datetime(df['occurred_on_date'])
df_filtered = df[(df['occurred_on_date'].dt.year == 2020)& 
                 (df['occurred_on_date'].dt.hour != 0)]
df_filtered['hour'] = df_filtered['occurred_on_date'].dt.hour
df_filtered['day_of_week'] = df_filtered['occurred_on_date'].dt.dayofweek  

crime_counts = df_filtered.groupby(['hour', 'day_of_week']).size().unstack(fill_value=0)

plt.figure(figsize=(8, 8))
sns.heatmap(crime_counts, linewidths=.5, linecolor='black', cmap='coolwarm')
plt.title('Heatmap of Crime Counts by Hour and Day of the Week for 2020 Filtered')
plt.xlabel('Day of the Week')
plt.ylabel('Hour of the Day')
plt.xticks(ticks=np.arange(7)+.5, labels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

#2021 to present graph
import seaborn as sns
import matplotlib.pyplot as plt

df['occurred_on_date'] = pd.to_datetime(df['occurred_on_date'])
df_filtered = df[df['hour'] != 0]
df_filtered = df[(df['occurred_on_date'].dt.year > 2020) & (df['occurred_on_date'].dt.year <= 2024)& 
                 (df['occurred_on_date'].dt.hour != 0)]
df_filtered['hour'] = df_filtered['occurred_on_date'].dt.hour
df_filtered['day_of_week'] = df_filtered['occurred_on_date'].dt.dayofweek  

crime_counts = df_filtered.groupby(['hour', 'day_of_week']).size().unstack(fill_value=0)

plt.figure(figsize=(8, 8))
sns.heatmap(crime_counts, linewidths=.5, linecolor='black', cmap='coolwarm')
plt.title('Heatmap of Crime Counts by Hour and Day of the Week for 2021 to Present Filtered')
plt.xlabel('Day of the Week')
plt.ylabel('Hour of the Day')
plt.xticks(ticks=np.arange(7)+.5, labels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

df_top_offenses['day_of_week'] = df_top_offenses['occurred_on_date'].dt.dayofweek  

crime_counts_by_day = df_top_offenses.groupby(['offense_code_group', 'day_of_week']).size().unstack(fill_value=0)

plt.figure(figsize=(8, 4))
sns.heatmap(crime_counts_by_day, linewidths=.5, cmap='coolwarm')
plt.title('Heatmap of Crime Counts by Day of the Week and Offense Type')
plt.ylabel('Offense Type')
plt.xlabel('Day of the Week')
#plt.yticks(rota)
plt.xticks(ticks=np.arange(7)+0.5, labels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], rotation =45)
plt.tight_layout()
plt.show()

plt.figure(figsize=(15,20))
crime_count_location = df.groupby(['offense_code_group', 'district']).size().unstack(fill_value=0)
sns.heatmap(crime_count_location, linewidths=.5, cmap='Pastel1')
plt.xlabel('Districts')
plt.ylabel('Offense Code Groups')
plt.title('Offense Code Groups Counts over All Districts')
plt.savefig('OCG.png')

plt.figure(figsize=(10,8))
top_offenses = df['offense_code_group'].value_counts().nlargest(7).index
df_top_offenses = df[df['offense_code_group'].isin(top_offenses)]
crime_count_location = df_top_offenses.groupby(['offense_code_group', 'district']).size().unstack(fill_value=0)
sns.heatmap(crime_count_location, linewidths=.5, cmap='Pastel1')
plt.xlabel('Districts')
plt.ylabel('Top 7 Offense Code Groups')
plt.title('Offense Code Groups Counts over All Districts')

Text(0.5, 1.0, 'Offense Code Groups Counts over All Districts')

import plotly.express as px
df_filtered = df[df['year'] != 2024]

# Aggregating data by district, year, and month
df_aggregated = df_filtered.groupby(['district', 'year', 'month']).size().reset_index(name='total_crimes')

# Create the animated line plot
fig = px.line(df_aggregated, 
              x="month", 
              y="total_crimes", 
              color="district", 
              animation_frame="year",
              labels={'total_crimes': 'Total Crimes', 'month': 'Month', 'year': 'Year', 'district': 'District'},
              title="Total Crimes Over Months by District (Excluding Year 2024)"
             )

# Update layout
fig.update_layout(xaxis_title="Month",
                  yaxis_title="Total Crimes",
                  legend_title="District",
                  showlegend=True)

	INCIDENT_NUMBER	OFFENSE_CODE	OFFENSE_CODE_GROUP	OFFENSE_DESCRIPTION	DISTRICT	REPORTING_AREA	SHOOTING	OCCURRED_ON_DATE	YEAR	MONTH	DAY_OF_WEEK	UCR_PART	STREET	Lat	Long	Location
0	I172040657	2629	Harassment	HARASSMENT	C11	397.0	NaN	2015-06-15 00:00:00	2015	6	Monday	Part Two	MELBOURNE ST	42.291093	-71.065945	(42.29109287, -71.06594539)
1	I182061268	3201	Property Lost	PROPERTY - LOST	NaN	NaN	NaN	2015-06-15 00:00:00	2015	6	Monday	Part Three	BERNARD	NaN	NaN	NaN
2	I162013546	3201	Property Lost	PROPERTY - LOST	B3	433.0	NaN	2015-06-15 00:00:00	2015	6	Monday	Part Three	NORFOLK ST	42.283634	-71.082813	(42.28363434, -71.08281320)
3	I152051083	3115	Investigate Person	INVESTIGATE PERSON	A7	20.0	NaN	2015-06-15 00:00:00	2015	6	Monday	Part Three	PARIS ST	42.377023	-71.032247	(42.37702319, -71.03224730)
4	I152059178	2647	Other	THREATS TO DO BODILY HARM	C11	359.0	NaN	2015-06-15 00:00:00	2015	6	Monday	Part Two	WASHINGTON ST	42.293606	-71.071887	(42.29360585, -71.07188650)

	INCIDENT_NUMBER	OFFENSE_CODE_GROUP	OFFENSE_DESCRIPTION	DISTRICT	REPORTING_AREA	SHOOTING	OCCURRED_ON_DATE	DAY_OF_WEEK	UCR_PART	STREET	Location
count	765338	353253	765338	760752	634917.0	413540	765338	765338	353156	753450	722817
unique	723265	67	305	14	1759.0	5	591167	7	4	19794	71441
top	I152071596	Motor Vehicle Accident Response	INVESTIGATE PERSON	B2	355.0	0	2016-08-01 00:00:00	Friday	Part Three	WASHINGTON ST	(42.29755532959655, -71.05970910242573)
freq	20	41064	57185	114005	10058.0	400568	33	116718	176042	43879	5862

	incident_number	offense_code	offense_code_group	district	reporting_area	shooting	occurred_on_date	year	month	day_of_week	ucr_part	street	lat	long
0	I172040657	2629	harassment	C11	397.0	NaN	2015-06-15	2015	6	Monday	Part Two	MELBOURNE ST	42.291093	-71.065945
1	I182061268	3201	property lost	NaN	NaN	NaN	2015-06-15	2015	6	Monday	Part Three	BERNARD	NaN	NaN
2	I162013546	3201	property lost	B3	433.0	NaN	2015-06-15	2015	6	Monday	Part Three	NORFOLK ST	42.283634	-71.082813
3	I152051083	3115	investigate person	A7	20.0	NaN	2015-06-15	2015	6	Monday	Part Three	PARIS ST	42.377023	-71.032247
4	I152059178	2647	other	C11	359.0	NaN	2015-06-15	2015	6	Monday	Part Two	WASHINGTON ST	42.293606	-71.071887

Project 2: Boston Crime Dataset¶

Task 1: Dataset Selection and Confirmation¶

Task 2: Data Acquisition and Inspection¶

Task 3: Data Cleaning and Prepartion¶

Task 4: Analysis¶