Spaces:

DataVizSem5
/

Motor-Vehicle-Collisions-visualization

Sleeping

App Files Community

yousefmazhar committed on Nov 21, 2025

Commit

a1f7117

verified ·

1 Parent(s): ffb21eb

map 9d7aff

Browse files

Files changed (1) hide show

app.py +233 -167

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ import plotly.express as px
 import plotly.graph_objects as go
 import pandas as pd
 import warnings
-from datetime import datetime
 warnings.filterwarnings('ignore', category=DeprecationWarning)
@@ -13,7 +13,7 @@ df = pd.read_parquet('nyc_crashes_integrated_clean.parquet')
 df['CRASH DATE'] = pd.to_datetime(df['CRASH DATE'])
 print(f"Data loaded: {len(df):,} records")
-# Clean vehicle types - whitelist of valid categories
 VALID_VEHICLE_TYPES = [
     'SEDAN', 'STATION WAGON/SPORT UTILITY VEHICLE', 'TAXI', 'PICK-UP TRUCK',
     'BOX TRUCK', 'VAN', 'MOTORCYCLE', 'SCOOTER', 'MOPED', 'E-SCOOTER', 'E-BIKE',
@@ -24,7 +24,6 @@ VALID_VEHICLE_TYPES = [
     'SPORT UTILITY / STATION WAGON', 'LIMOUSINE', 'UNKNOWN'
 ]
-# Replace invalid vehicle types with 'OTHER'
 df['VEHICLE TYPE CODE 1'] = df['VEHICLE TYPE CODE 1'].apply(
     lambda x: x if x in VALID_VEHICLE_TYPES else 'OTHER'
 )
@@ -34,7 +33,7 @@ df['VEHICLE TYPE CODE 2'] = df['VEHICLE TYPE CODE 2'].apply(
 print(f"Cleaned vehicle types. Valid categories: {len(df['VEHICLE TYPE CODE 1'].unique())}")
-# Define smart column choices for each graph type
 TEMPORAL_COLS = ['CRASH_YEAR', 'CRASH_MONTH', 'CRASH_DAYOFWEEK', 'CRASH_HOUR']
 CATEGORICAL_COLS = ['BOROUGH', 'PERSON_TYPE', 'PERSON_INJURY',
                     'CONTRIBUTING FACTOR VEHICLE 1', 'VEHICLE TYPE CODE 1',
@@ -45,42 +44,41 @@ NUMERIC_COLS = ['NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
                 'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
                 'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED']
-# Get unique values for dropdowns - convert to native Python types
-boroughs = ['All'] + sorted([str(b) for b in df['BOROUGH'].dropna().unique() if str(b) != 'nan'])
-years = ['All'] + [int(y) for y in sorted(df['CRASH_YEAR'].unique())]
-months = ['All'] + [int(m) for m in sorted(df['CRASH_MONTH'].unique())]
 vehicles = ['All'] + sorted(VALID_VEHICLE_TYPES + ['OTHER'])
-person_types = ['All'] + sorted([str(s) for s in df['PERSON_TYPE'].dropna().unique() if str(s) != 'nan'])
-injury_types = ['All'] + sorted([str(t) for t in df['PERSON_INJURY'].dropna().unique() if str(t) != 'nan'])
-genders = ['All'] + sorted([str(g) for g in df['PERSON_SEX'].dropna().unique() if str(g) != 'nan'])
-safety_equip = ['All'] + sorted([str(s) for s in df['SAFETY_EQUIPMENT'].dropna().unique()
-                                  if str(s) not in ['nan', 'NOT APPLICABLE', 'NOT REPORTED', 'DOES NOT APPLY']][:15])
 def smart_search_parser(search_text):
-    """Parse natural language search query and return filter values"""
     if not search_text:
         return None
     search_lower = search_text.lower()
     filters = {}
     applied_filters = []
     # Borough detection
-    boroughs_list = ['BROOKLYN', 'MANHATTAN', 'QUEENS', 'BRONX', 'STATEN ISLAND']
-    for b in boroughs_list:
         if b.lower() in search_lower:
             filters['borough'] = b
             applied_filters.append(f"Borough: {b}")
             break
     # Year detection
-    import re
     years_found = re.findall(r'\b(20[1-2][0-9])\b', search_text)
     if years_found:
         filters['year'] = int(years_found[0])
         applied_filters.append(f"Year: {years_found[0]}")
     # Month detection
     months_map = {'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5, 'june': 6,
                   'july': 7, 'august': 8, 'september': 9, 'october': 10, 'november': 11, 'december': 12,
@@ -91,17 +89,18 @@ def smart_search_parser(search_text):
             filters['month'] = m_num
             applied_filters.append(f"Month: {m_name.capitalize()}")
             break
     # Day of week detection
     days_map = {'monday': [0], 'tuesday': [1], 'wednesday': [2], 'thursday': [3],
                 'friday': [4], 'saturday': [5], 'sunday': [6],
                 'weekday': [0, 1, 2, 3, 4], 'weekend': [5, 6]}
     for day_name, day_nums in days_map.items():
         if day_name in search_lower:
             filters['dow'] = day_nums
             applied_filters.append(f"Day: {day_name.capitalize()}")
             break
     # Time of day detection
     if 'morning' in search_lower:
         filters['hour_range'] = (6, 10)
@@ -115,19 +114,23 @@ def smart_search_parser(search_text):
     elif 'night' in search_lower:
         filters['hour_range'] = (20, 23)
         applied_filters.append("Time: Night (20-23)")
     # Vehicle type detection
     vehicle_keywords = {
         'sedan': 'SEDAN', 'suv': 'STATION WAGON/SPORT UTILITY VEHICLE',
         'taxi': 'TAXI', 'truck': 'PICK-UP TRUCK', 'bus': 'BUS',
-        'motorcycle': 'MOTORCYCLE', 'bike': 'BICYCLE', 'scooter': 'SCOOTER'
     }
     for keyword, vehicle_type in vehicle_keywords.items():
         if keyword in search_lower:
             filters['vehicle'] = vehicle_type
             applied_filters.append(f"Vehicle: {keyword.capitalize()}")
             break
     # Person type detection
     if 'pedestrian' in search_lower:
         filters['person_type'] = 'PEDESTRIAN'
@@ -138,7 +141,7 @@ def smart_search_parser(search_text):
     elif 'occupant' in search_lower or 'driver' in search_lower:
         filters['person_type'] = 'OCCUPANT'
         applied_filters.append("Person: Occupant")
     # Injury type detection
     if 'fatal' in search_lower or 'death' in search_lower or 'killed' in search_lower:
         filters['injury'] = 'KILLED'
@@ -146,29 +149,35 @@ def smart_search_parser(search_text):
     elif 'injured' in search_lower or 'injury' in search_lower:
         filters['injury'] = 'INJURED'
         applied_filters.append("Injury: Injured")
     return filters, applied_filters
-def generate_report(borough, year, month, dow_list, hour_min, hour_max, vehicle, person_type,
-                   person_injury, gender, safety, c1_x, c1_y, c3_x, c3_y, c3_top,
-                   c4_x, c4_y, compare_cat):
-    """Generate complete dashboard report"""
     # Filter data
     filtered_df = df.copy()
     if borough != 'All':
         filtered_df = filtered_df[filtered_df['BOROUGH'] == borough]
     if year != 'All':
-        filtered_df = filtered_df[filtered_df['CRASH_YEAR'] == int(year)]
     if month != 'All':
-        filtered_df = filtered_df[filtered_df['CRASH_MONTH'] == int(month)]
-    if dow_list and len(dow_list) > 0:
-        # Convert to list of ints if needed
-        dow_ints = [int(d) if isinstance(d, str) else d for d in dow_list]
-        filtered_df = filtered_df[filtered_df['CRASH_DAYOFWEEK'].isin(dow_ints)]
-    filtered_df = filtered_df[(filtered_df['CRASH_HOUR'] >= int(hour_min)) &
-                              (filtered_df['CRASH_HOUR'] <= int(hour_max))]
     if vehicle != 'All':
         filtered_df = filtered_df[filtered_df['VEHICLE TYPE CODE 1'] == vehicle]
     if person_type != 'All':
@@ -179,30 +188,35 @@ def generate_report(borough, year, month, dow_list, hour_min, hour_max, vehicle,
         filtered_df = filtered_df[filtered_df['PERSON_SEX'] == gender]
     if safety != 'All':
         filtered_df = filtered_df[filtered_df['SAFETY_EQUIPMENT'] == safety]
     if len(filtered_df) == 0:
         empty_fig = go.Figure()
         empty_fig.add_annotation(text="No data found. Adjust filters.", xref="paper", yref="paper",
-                                x=0.5, y=0.5, showarrow=False, font=dict(size=16, color="gray"))
         return "No data found", empty_fig, empty_fig, empty_fig, empty_fig, empty_fig, empty_fig, empty_fig, empty_fig, empty_fig
     # Summary Statistics
     total_records = len(filtered_df)
     total_injuries = int(filtered_df['NUMBER OF PERSONS INJURED'].sum())
     total_fatalities = int(filtered_df['NUMBER OF PERSONS KILLED'].sum())
     injury_rate = (total_injuries / total_records * 100) if total_records > 0 else 0
     summary_text = f"""
-📊 **Summary Statistics**
-- **Total Records:** {total_records:,}
-- **Total Injuries:** {total_injuries:,} ({injury_rate:.2f}%)
-- **Total Fatalities:** {total_fatalities:,}
-- **Pedestrian Injuries:** {int(filtered_df['NUMBER OF PEDESTRIANS INJURED'].sum()):,}
-- **Cyclist Injuries:** {int(filtered_df['NUMBER OF CYCLIST INJURED'].sum()):,}
-- **Motorist Injuries:** {int(filtered_df['NUMBER OF MOTORIST INJURED'].sum()):,}
-- **Unique Crashes:** {len(filtered_df['COLLISION_ID'].unique()):,}
     """
     # Chart 1: Trend Analysis
     if c1_y == 'count':
         chart1_data = filtered_df.groupby(c1_x).size().reset_index(name='count')
@@ -210,20 +224,20 @@ def generate_report(borough, year, month, dow_list, hour_min, hour_max, vehicle,
     else:
         chart1_data = filtered_df.groupby(c1_x)[c1_y].sum().reset_index()
         y_label = c1_y
     fig1 = px.line(chart1_data, x=c1_x, y=chart1_data.columns[1],
                    labels={chart1_data.columns[1]: y_label, c1_x: c1_x},
                    title='Trend Analysis')
     fig1.update_traces(line_color='#3498db', line_width=3)
     fig1.update_layout(template='plotly_white', height=400)
-    # Chart 2: Person Type Distribution (Pie)
     person_type_data = filtered_df['PERSON_TYPE'].value_counts()
     fig2 = px.pie(values=person_type_data.values, names=person_type_data.index,
                   title='Person Type Distribution',
                   color_discrete_sequence=['#2ecc71', '#f39c12', '#e74c3c', '#3498db'])
     fig2.update_layout(height=400)
     # Chart 3: Categorical Analysis
     if c3_y == 'count':
         chart3_data = filtered_df[c3_x].value_counts().head(int(c3_top))
@@ -231,13 +245,13 @@ def generate_report(borough, year, month, dow_list, hour_min, hour_max, vehicle,
     else:
         chart3_data = filtered_df.groupby(c3_x)[c3_y].sum().sort_values(ascending=False).head(int(c3_top))
         y_label = c3_y
     fig3 = px.bar(x=chart3_data.index, y=chart3_data.values,
                   labels={'x': c3_x, 'y': y_label},
-                  title='Categorical Analysis')
     fig3.update_traces(marker_color='#3498db')
     fig3.update_layout(template='plotly_white', height=400)
     # Chart 4: Time Distribution
     if c4_y == 'count':
         chart4_data = filtered_df[c4_x].value_counts().sort_index()
@@ -245,39 +259,39 @@ def generate_report(borough, year, month, dow_list, hour_min, hour_max, vehicle,
     else:
         chart4_data = filtered_df.groupby(c4_x)[c4_y].sum().sort_index()
         y_label = c4_y
     fig4 = px.bar(x=chart4_data.index, y=chart4_data.values,
                   labels={'x': c4_x, 'y': y_label},
                   title='Time Distribution')
     fig4.update_traces(marker_color='#e67e22')
     fig4.update_layout(template='plotly_white', height=400)
     # Chart 5: Contributing Factor 1
     factor1_data = filtered_df['CONTRIBUTING FACTOR VEHICLE 1'].value_counts().head(15)
     factor1_data = factor1_data[factor1_data.index != 'UNSPECIFIED']
     fig5 = px.bar(x=factor1_data.index, y=factor1_data.values,
                   labels={'x': 'Contributing Factor', 'y': 'Number of Crashes'},
                   title='Top Contributing Factors (Vehicle 1)')
     fig5.update_traces(marker_color='#e74c3c')
     fig5.update_layout(template='plotly_white', height=400, xaxis={'tickangle': -45})
     # Chart 6: Contributing Factor 2
     factor2_data = filtered_df['CONTRIBUTING FACTOR VEHICLE 2'].value_counts().head(15)
     factor2_data = factor2_data[~factor2_data.index.isin(['UNSPECIFIED', 'NO SECOND VEHICLE'])]
     if len(factor2_data) > 0:
         fig6 = px.bar(x=factor2_data.index, y=factor2_data.values,
-                     labels={'x': 'Secondary Contributing Factor', 'y': 'Number of Crashes'},
-                     title='Top Contributing Factors (Vehicle 2)')
         fig6.update_traces(marker_color='#f39c12')
         fig6.update_layout(template='plotly_white', height=400, xaxis={'tickangle': -45})
     else:
         fig6 = go.Figure()
         fig6.add_annotation(text="No secondary factors", xref="paper", yref="paper",
-                           x=0.5, y=0.5, showarrow=False)
-        fig6.update_layout(height=400)
     # Chart 7: Injury Rate Comparison
     compare_data = filtered_df.groupby(compare_cat).agg({
         'COLLISION_ID': 'count',
@@ -288,16 +302,16 @@ def generate_report(borough, year, month, dow_list, hour_min, hour_max, vehicle,
     compare_data['Injury_Rate'] = (compare_data['Total_Injuries'] / compare_data['Total_Records'] * 100)
     compare_data['Fatality_Rate'] = (compare_data['Total_Fatalities'] / compare_data['Total_Records'] * 100)
     compare_data = compare_data.sort_values('Injury_Rate', ascending=False).head(15)
     fig7 = go.Figure()
     fig7.add_trace(go.Bar(x=compare_data[compare_cat], y=compare_data['Injury_Rate'],
-                         name='Injury Rate (%)', marker_color='#f39c12'))
     fig7.add_trace(go.Bar(x=compare_data[compare_cat], y=compare_data['Fatality_Rate'],
-                         name='Fatality Rate (%)', marker_color='#e74c3c'))
     fig7.update_layout(barmode='group', template='plotly_white', height=400,
-                      title='Injury Rate Comparison',
-                      xaxis_title=compare_cat, yaxis_title='Rate (%)')
     # Chart 8: Heatmap
     heatmap_data = filtered_df.groupby(['CRASH_DAYOFWEEK', 'CRASH_HOUR']).size().reset_index(name='count')
     if len(heatmap_data) > 0:
@@ -309,148 +323,200 @@ def generate_report(borough, year, month, dow_list, hour_min, hour_max, vehicle,
             colorscale='YlOrRd'
         ))
         fig8.update_layout(xaxis_title='Hour of Day', yaxis_title='Day of Week',
-                          title='Day × Hour Heatmap', template='plotly_white', height=500)
     else:
         fig8 = go.Figure()
-        fig8.update_layout(height=500)
-    # Chart 9: Geographic Map
-    map_sample = filtered_df[(filtered_df['LATITUDE'].notna()) &
                              (filtered_df['LATITUDE'] != 0) &
-                             (filtered_df['LATITUDE'] > 40) &
                              (filtered_df['LATITUDE'] < 41)]
     if len(map_sample) > 0:
         map_sample = map_sample.sample(n=min(3000, len(map_sample)), random_state=42)
-        map_sample['COLOR_LABEL'] = map_sample['PERSON_INJURY'].fillna('Unknown')
-        fig9 = px.scatter_mapbox(
-            map_sample, lat='LATITUDE', lon='LONGITUDE', color='COLOR_LABEL',
-            title=f'Crash Locations (Sample of {len(map_sample):,})',
-            zoom=10, height=600,
-            mapbox_style="open-street-map"
         )
     else:
         fig9 = go.Figure()
-        fig9.add_annotation(text="No location data available", xref="paper", yref="paper",
-                           x=0.5, y=0.5, showarrow=False, font=dict(size=16, color="gray"))
-        fig9.update_layout(height=600)
     return summary_text, fig1, fig2, fig3, fig4, fig5, fig6, fig7, fig8, fig9
 # Create Gradio Interface
 with gr.Blocks(title="NYC Motor Vehicle Crashes Dashboard") as demo:
     gr.Markdown("# 🚗 NYC Motor Vehicle Crashes Dashboard - Enhanced Analytics")
-    gr.Markdown("Comprehensive analysis with 5.7M+ crash records")
-    with gr.Accordion("🔍 Smart Search", open=False):
-        gr.Markdown("Type natural language queries like: 'Brooklyn 2022 pedestrian crashes' or 'Manhattan weekend taxi injured'")
-        search_input = gr.Textbox(label="Search Query",
-                                 placeholder="e.g., Queens Friday night motorcycle fatalities...")
-        search_btn = gr.Button("🔍 Apply Smart Search", variant="primary")
     with gr.Row():
         with gr.Column(scale=1):
-            gr.Markdown("### Filters")
             borough = gr.Dropdown(choices=boroughs, value='All', label="Borough")
             year = gr.Dropdown(choices=years, value='All', label="Year")
             month = gr.Dropdown(choices=months, value='All', label="Month")
-            dow = gr.CheckboxGroup(choices=[('Mon', 0), ('Tue', 1), ('Wed', 2), ('Thu', 3),
-                                            ('Fri', 4), ('Sat', 5), ('Sun', 6)],
-                                  label="Day of Week", type="value")
-            hour_min = gr.Slider(minimum=0, maximum=23, value=0, step=1, label="Hour Min")
-            hour_max = gr.Slider(minimum=0, maximum=23, value=23, step=1, label="Hour Max")
             vehicle = gr.Dropdown(choices=vehicles, value='All', label="Vehicle Type 1")
             person_type = gr.Dropdown(choices=person_types, value='All', label="Person Type")
             person_injury = gr.Dropdown(choices=injury_types, value='All', label="Person Injury")
             gender = gr.Dropdown(choices=genders, value='All', label="Gender")
             safety = gr.Dropdown(choices=safety_equip, value='All', label="Safety Equipment")
         with gr.Column(scale=1):
-            gr.Markdown("### Chart Settings")
-            c1_x = gr.Dropdown(choices=TEMPORAL_COLS, value='CRASH_YEAR', label="Chart 1 X-axis")
             c1_y = gr.Dropdown(choices=['count'] + NUMERIC_COLS, value='count', label="Chart 1 Y-axis")
             c3_x = gr.Dropdown(choices=CATEGORICAL_COLS, value='BOROUGH', label="Chart 3 Category")
             c3_y = gr.Dropdown(choices=['count'] + NUMERIC_COLS, value='count', label="Chart 3 Y-axis")
             c3_top = gr.Slider(minimum=5, maximum=20, value=10, step=1, label="Chart 3 Top N")
-            c4_x = gr.Dropdown(choices=TEMPORAL_COLS, value='CRASH_HOUR', label="Chart 4 X-axis")
             c4_y = gr.Dropdown(choices=['count'] + NUMERIC_COLS, value='count', label="Chart 4 Y-axis")
-            compare_cat = gr.Dropdown(choices=['BOROUGH', 'VEHICLE TYPE CODE 1', 'PERSON_TYPE',
-                                              'SAFETY_EQUIPMENT', 'CRASH_HOUR', 'CRASH_DAYOFWEEK'],
-                                     value='BOROUGH', label="Comparison Category")
     with gr.Row():
-        generate_btn = gr.Button("🔍 Generate Report", variant="primary", size="lg")
-        reset_btn = gr.Button("🔄 Reset Filters", variant="secondary", size="lg")
     # Outputs
     summary_output = gr.Markdown(label="Summary Statistics")
     with gr.Row():
-        chart1_output = gr.Plot(label="Trend Analysis")
-        chart2_output = gr.Plot(label="Person Type Distribution")
     with gr.Row():
-        chart3_output = gr.Plot(label="Categorical Analysis")
-        chart4_output = gr.Plot(label="Time Distribution")
     with gr.Row():
-        chart5_output = gr.Plot(label="Contributing Factor 1")
-        chart6_output = gr.Plot(label="Contributing Factor 2")
-    chart7_output = gr.Plot(label="Injury Rate Comparison")
-    chart8_output = gr.Plot(label="Day × Hour Heatmap")
-    chart9_output = gr.Plot(label="Geographic Distribution")
     # Event handlers
     generate_btn.click(
         fn=generate_report,
         inputs=[borough, year, month, dow, hour_min, hour_max, vehicle, person_type,
-               person_injury, gender, safety, c1_x, c1_y, c3_x, c3_y, c3_top,
-               c4_x, c4_y, compare_cat],
         outputs=[summary_output, chart1_output, chart2_output, chart3_output, chart4_output,
-                chart5_output, chart6_output, chart7_output, chart8_output, chart9_output]
     )
     def reset_all():
-        return ('All', 'All', 'All', [], 0, 23, 'All', 'All', 'All', 'All', 'All')
     reset_btn.click(
         fn=reset_all,
         outputs=[borough, year, month, dow, hour_min, hour_max, vehicle, person_type,
-                person_injury, gender, safety]
     )
-    def apply_smart_search(search_text):
-        result = smart_search_parser(search_text)
-        if result is None:
-            return ['All'] * 11
-        filters, applied = result
-        return (
-            filters.get('borough', 'All'),
-            filters.get('year', 'All'),
-            filters.get('month', 'All'),
-            filters.get('dow', []),
-            filters.get('hour_range', (0, 23))[0],
-            filters.get('hour_range', (0, 23))[1],
-            filters.get('vehicle', 'All'),
-            filters.get('person_type', 'All'),
-            filters.get('injury', 'All'),
-            'All',  # gender
-            'All'   # safety
-        )
     search_btn.click(
         fn=apply_smart_search,
         inputs=[search_input],
         outputs=[borough, year, month, dow, hour_min, hour_max, vehicle, person_type,
-                person_injury, gender, safety]
     )
-if __name__ == "__main__":
-    demo.launch()

 import plotly.graph_objects as go
 import pandas as pd
 import warnings
+import re
 warnings.filterwarnings('ignore', category=DeprecationWarning)
 df['CRASH DATE'] = pd.to_datetime(df['CRASH DATE'])
 print(f"Data loaded: {len(df):,} records")
+# Clean vehicle types
 VALID_VEHICLE_TYPES = [
     'SEDAN', 'STATION WAGON/SPORT UTILITY VEHICLE', 'TAXI', 'PICK-UP TRUCK',
     'BOX TRUCK', 'VAN', 'MOTORCYCLE', 'SCOOTER', 'MOPED', 'E-SCOOTER', 'E-BIKE',
     'SPORT UTILITY / STATION WAGON', 'LIMOUSINE', 'UNKNOWN'
 ]
 df['VEHICLE TYPE CODE 1'] = df['VEHICLE TYPE CODE 1'].apply(
     lambda x: x if x in VALID_VEHICLE_TYPES else 'OTHER'
 )
 print(f"Cleaned vehicle types. Valid categories: {len(df['VEHICLE TYPE CODE 1'].unique())}")
+# Define column groups
 TEMPORAL_COLS = ['CRASH_YEAR', 'CRASH_MONTH', 'CRASH_DAYOFWEEK', 'CRASH_HOUR']
 CATEGORICAL_COLS = ['BOROUGH', 'PERSON_TYPE', 'PERSON_INJURY',
                     'CONTRIBUTING FACTOR VEHICLE 1', 'VEHICLE TYPE CODE 1',
                 'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
                 'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED']
+# Prepare dropdown options
+boroughs = ['All'] + sorted([b for b in df['BOROUGH'].dropna().unique() if str(b) != 'nan'])
+years = ['All'] + sorted([int(y) for y in df['CRASH_YEAR'].unique()])
+months = ['All'] + list(range(1, 13))
 vehicles = ['All'] + sorted(VALID_VEHICLE_TYPES + ['OTHER'])
+person_types = ['All'] + sorted([p for p in df['PERSON_TYPE'].dropna().unique() if str(p) != 'nan'])
+injury_types = ['All'] + sorted([i for i in df['PERSON_INJURY'].dropna().unique() if str(i) != 'nan'])
+genders = ['All', 'M', 'F', 'U']
+safety_equip = ['All'] + sorted([s for s in df['SAFETY_EQUIPMENT'].dropna().unique()
+                                 if str(s) not in ['nan', 'NOT APPLICABLE', 'NOT REPORTED', 'DOES NOT APPLY']][:15])
 def smart_search_parser(search_text):
+    """Parse natural language search query into filters"""
     if not search_text:
         return None
     search_lower = search_text.lower()
     filters = {}
     applied_filters = []
     # Borough detection
+    boroughs_map = ['BROOKLYN', 'MANHATTAN', 'QUEENS', 'BRONX', 'STATEN ISLAND']
+    for b in boroughs_map:
         if b.lower() in search_lower:
             filters['borough'] = b
             applied_filters.append(f"Borough: {b}")
             break
     # Year detection
     years_found = re.findall(r'\b(20[1-2][0-9])\b', search_text)
     if years_found:
         filters['year'] = int(years_found[0])
         applied_filters.append(f"Year: {years_found[0]}")
     # Month detection
     months_map = {'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5, 'june': 6,
                   'july': 7, 'august': 8, 'september': 9, 'october': 10, 'november': 11, 'december': 12,
             filters['month'] = m_num
             applied_filters.append(f"Month: {m_name.capitalize()}")
             break
     # Day of week detection
     days_map = {'monday': [0], 'tuesday': [1], 'wednesday': [2], 'thursday': [3],
                 'friday': [4], 'saturday': [5], 'sunday': [6],
+                'mon': [0], 'tue': [1], 'wed': [2], 'thu': [3], 'fri': [4], 'sat': [5], 'sun': [6],
                 'weekday': [0, 1, 2, 3, 4], 'weekend': [5, 6]}
     for day_name, day_nums in days_map.items():
         if day_name in search_lower:
             filters['dow'] = day_nums
             applied_filters.append(f"Day: {day_name.capitalize()}")
             break
     # Time of day detection
     if 'morning' in search_lower:
         filters['hour_range'] = (6, 10)
     elif 'night' in search_lower:
         filters['hour_range'] = (20, 23)
         applied_filters.append("Time: Night (20-23)")
+    elif 'late night' in search_lower or 'midnight' in search_lower:
+        filters['hour_range'] = (0, 5)
+        applied_filters.append("Time: Late Night (0-5)")
     # Vehicle type detection
     vehicle_keywords = {
         'sedan': 'SEDAN', 'suv': 'STATION WAGON/SPORT UTILITY VEHICLE',
         'taxi': 'TAXI', 'truck': 'PICK-UP TRUCK', 'bus': 'BUS',
+        'motorcycle': 'MOTORCYCLE', 'bike': 'BICYCLE', 'scooter': 'SCOOTER',
+        'van': 'VAN', 'ambulance': 'AMBULANCE', 'moped': 'MOPED'
     }
     for keyword, vehicle_type in vehicle_keywords.items():
         if keyword in search_lower:
             filters['vehicle'] = vehicle_type
             applied_filters.append(f"Vehicle: {keyword.capitalize()}")
             break
     # Person type detection
     if 'pedestrian' in search_lower:
         filters['person_type'] = 'PEDESTRIAN'
     elif 'occupant' in search_lower or 'driver' in search_lower:
         filters['person_type'] = 'OCCUPANT'
         applied_filters.append("Person: Occupant")
     # Injury type detection
     if 'fatal' in search_lower or 'death' in search_lower or 'killed' in search_lower:
         filters['injury'] = 'KILLED'
     elif 'injured' in search_lower or 'injury' in search_lower:
         filters['injury'] = 'INJURED'
         applied_filters.append("Injury: Injured")
+    # Gender detection
+    if 'male' in search_lower and 'female' not in search_lower:
+        filters['gender'] = 'M'
+        applied_filters.append("Gender: Male")
+    elif 'female' in search_lower:
+        filters['gender'] = 'F'
+        applied_filters.append("Gender: Female")
     return filters, applied_filters
+def generate_report(borough, year, month, dow, hour_min, hour_max, vehicle, person_type,
+                    person_injury, gender, safety, c1_x, c1_y, c3_x, c3_y, c3_top,
+                    c4_x, c4_y, compare_cat):
+    """Generate all visualizations based on filters"""
     # Filter data
     filtered_df = df.copy()
     if borough != 'All':
         filtered_df = filtered_df[filtered_df['BOROUGH'] == borough]
     if year != 'All':
+        filtered_df = filtered_df[filtered_df['CRASH_YEAR'] == year]
     if month != 'All':
+        filtered_df = filtered_df[filtered_df['CRASH_MONTH'] == month]
+    if dow:
+        filtered_df = filtered_df[filtered_df['CRASH_DAYOFWEEK'].isin(dow)]
+    filtered_df = filtered_df[(filtered_df['CRASH_HOUR'] >= hour_min) &
+                              (filtered_df['CRASH_HOUR'] <= hour_max)]
     if vehicle != 'All':
         filtered_df = filtered_df[filtered_df['VEHICLE TYPE CODE 1'] == vehicle]
     if person_type != 'All':
         filtered_df = filtered_df[filtered_df['PERSON_SEX'] == gender]
     if safety != 'All':
         filtered_df = filtered_df[filtered_df['SAFETY_EQUIPMENT'] == safety]
     if len(filtered_df) == 0:
         empty_fig = go.Figure()
         empty_fig.add_annotation(text="No data found. Adjust filters.", xref="paper", yref="paper",
+                                 x=0.5, y=0.5, showarrow=False, font=dict(size=16, color="gray"))
         return "No data found", empty_fig, empty_fig, empty_fig, empty_fig, empty_fig, empty_fig, empty_fig, empty_fig, empty_fig
     # Summary Statistics
     total_records = len(filtered_df)
     total_injuries = int(filtered_df['NUMBER OF PERSONS INJURED'].sum())
     total_fatalities = int(filtered_df['NUMBER OF PERSONS KILLED'].sum())
     injury_rate = (total_injuries / total_records * 100) if total_records > 0 else 0
+    fatality_rate = (total_fatalities / total_records * 100) if total_records > 0 else 0
     summary_text = f"""
+## 📊 Summary Statistics
+| Metric | Value |
+|--------|-------|
+| **Total Records** | {total_records:,} |
+| **Total Injuries** | {total_injuries:,} ({injury_rate:.2f}%) |
+| **Total Fatalities** | {total_fatalities:,} ({fatality_rate:.2f}%) |
+| **Pedestrian Injuries** | {int(filtered_df['NUMBER OF PEDESTRIANS INJURED'].sum()):,} |
+| **Cyclist Injuries** | {int(filtered_df['NUMBER OF CYCLIST INJURED'].sum()):,} |
+| **Motorist Injuries** | {int(filtered_df['NUMBER OF MOTORIST INJURED'].sum()):,} |
+| **Unique Crashes** | {len(filtered_df['COLLISION_ID'].unique()):,} |
+| **Avg Persons/Crash** | {(total_records / len(filtered_df['COLLISION_ID'].unique())):.1f} |
     """
     # Chart 1: Trend Analysis
     if c1_y == 'count':
         chart1_data = filtered_df.groupby(c1_x).size().reset_index(name='count')
     else:
         chart1_data = filtered_df.groupby(c1_x)[c1_y].sum().reset_index()
         y_label = c1_y
     fig1 = px.line(chart1_data, x=c1_x, y=chart1_data.columns[1],
                    labels={chart1_data.columns[1]: y_label, c1_x: c1_x},
                    title='Trend Analysis')
     fig1.update_traces(line_color='#3498db', line_width=3)
     fig1.update_layout(template='plotly_white', height=400)
+    # Chart 2: Person Type Distribution
     person_type_data = filtered_df['PERSON_TYPE'].value_counts()
     fig2 = px.pie(values=person_type_data.values, names=person_type_data.index,
                   title='Person Type Distribution',
                   color_discrete_sequence=['#2ecc71', '#f39c12', '#e74c3c', '#3498db'])
     fig2.update_layout(height=400)
     # Chart 3: Categorical Analysis
     if c3_y == 'count':
         chart3_data = filtered_df[c3_x].value_counts().head(int(c3_top))
     else:
         chart3_data = filtered_df.groupby(c3_x)[c3_y].sum().sort_values(ascending=False).head(int(c3_top))
         y_label = c3_y
     fig3 = px.bar(x=chart3_data.index, y=chart3_data.values,
                   labels={'x': c3_x, 'y': y_label},
+                  title=f'Categorical Analysis - Top {int(c3_top)}')
     fig3.update_traces(marker_color='#3498db')
     fig3.update_layout(template='plotly_white', height=400)
     # Chart 4: Time Distribution
     if c4_y == 'count':
         chart4_data = filtered_df[c4_x].value_counts().sort_index()
     else:
         chart4_data = filtered_df.groupby(c4_x)[c4_y].sum().sort_index()
         y_label = c4_y
     fig4 = px.bar(x=chart4_data.index, y=chart4_data.values,
                   labels={'x': c4_x, 'y': y_label},
                   title='Time Distribution')
     fig4.update_traces(marker_color='#e67e22')
     fig4.update_layout(template='plotly_white', height=400)
     # Chart 5: Contributing Factor 1
     factor1_data = filtered_df['CONTRIBUTING FACTOR VEHICLE 1'].value_counts().head(15)
     factor1_data = factor1_data[factor1_data.index != 'UNSPECIFIED']
     fig5 = px.bar(x=factor1_data.index, y=factor1_data.values,
                   labels={'x': 'Contributing Factor', 'y': 'Number of Crashes'},
                   title='Top Contributing Factors (Vehicle 1)')
     fig5.update_traces(marker_color='#e74c3c')
     fig5.update_layout(template='plotly_white', height=400, xaxis={'tickangle': -45})
     # Chart 6: Contributing Factor 2
     factor2_data = filtered_df['CONTRIBUTING FACTOR VEHICLE 2'].value_counts().head(15)
     factor2_data = factor2_data[~factor2_data.index.isin(['UNSPECIFIED', 'NO SECOND VEHICLE'])]
     if len(factor2_data) > 0:
         fig6 = px.bar(x=factor2_data.index, y=factor2_data.values,
+                      labels={'x': 'Secondary Contributing Factor', 'y': 'Number of Crashes'},
+                      title='Top Contributing Factors (Vehicle 2)')
         fig6.update_traces(marker_color='#f39c12')
         fig6.update_layout(template='plotly_white', height=400, xaxis={'tickangle': -45})
     else:
         fig6 = go.Figure()
         fig6.add_annotation(text="No secondary factors", xref="paper", yref="paper",
+                            x=0.5, y=0.5, showarrow=False)
+        fig6.update_layout(height=400, title='Top Contributing Factors (Vehicle 2)')
     # Chart 7: Injury Rate Comparison
     compare_data = filtered_df.groupby(compare_cat).agg({
         'COLLISION_ID': 'count',
     compare_data['Injury_Rate'] = (compare_data['Total_Injuries'] / compare_data['Total_Records'] * 100)
     compare_data['Fatality_Rate'] = (compare_data['Total_Fatalities'] / compare_data['Total_Records'] * 100)
     compare_data = compare_data.sort_values('Injury_Rate', ascending=False).head(15)
     fig7 = go.Figure()
     fig7.add_trace(go.Bar(x=compare_data[compare_cat], y=compare_data['Injury_Rate'],
+                          name='Injury Rate (%)', marker_color='#f39c12'))
     fig7.add_trace(go.Bar(x=compare_data[compare_cat], y=compare_data['Fatality_Rate'],
+                          name='Fatality Rate (%)', marker_color='#e74c3c'))
     fig7.update_layout(barmode='group', template='plotly_white', height=400,
+                       title='Injury Rate Comparison',
+                       xaxis_title=compare_cat, yaxis_title='Rate (%)')
     # Chart 8: Heatmap
     heatmap_data = filtered_df.groupby(['CRASH_DAYOFWEEK', 'CRASH_HOUR']).size().reset_index(name='count')
     if len(heatmap_data) > 0:
             colorscale='YlOrRd'
         ))
         fig8.update_layout(xaxis_title='Hour of Day', yaxis_title='Day of Week',
+                           title='Day × Hour Heatmap', template='plotly_white', height=500)
     else:
         fig8 = go.Figure()
+        fig8.update_layout(height=500, title='Day × Hour Heatmap')
+    # Chart 9: Geographic Map (NEW!)
+    map_sample = filtered_df[(filtered_df['LATITUDE'].notna()) &
                              (filtered_df['LATITUDE'] != 0) &
+                             (filtered_df['LATITUDE'] > 40) &
                              (filtered_df['LATITUDE'] < 41)]
     if len(map_sample) > 0:
+        # Sample for performance
         map_sample = map_sample.sample(n=min(3000, len(map_sample)), random_state=42)
+        # Create severity category combining injury and fatality
+        def categorize_severity(row):
+            if row['NUMBER OF PERSONS KILLED'] > 0:
+                return 'Fatal'
+            elif row['NUMBER OF PERSONS INJURED'] > 0:
+                return 'Injury'
+            else:
+                return 'Property Damage Only'
+        map_sample['SEVERITY_CATEGORY'] = map_sample.apply(categorize_severity, axis=1)
+        # Color mapping
+        color_map = {
+            'Fatal': '#e74c3c',
+            'Injury': '#f39c12',
+            'Property Damage Only': '#9d7aff'
+        }
+        fig9 = px.scatter_map(
+            map_sample,
+            lat='LATITUDE',
+            lon='LONGITUDE',
+            color='SEVERITY_CATEGORY',
+            color_discrete_map=color_map,
+            title=f'Geographic Distribution (Sample of {len(map_sample):,} locations)',
+            zoom=10,
+            height=600,
+            hover_data={
+                'LATITUDE': False,
+                'LONGITUDE': False,
+                'SEVERITY_CATEGORY': True,
+                'NUMBER OF PERSONS INJURED': True,
+                'NUMBER OF PERSONS KILLED': True,
+                'BOROUGH': True,
+                'VEHICLE TYPE CODE 1': True
+            }
         )
+        fig9.update_layout(map_style="open-street-map")
     else:
         fig9 = go.Figure()
+        fig9.add_annotation(
+            text="No location data available",
+            xref="paper", yref="paper",
+            x=0.5, y=0.5, showarrow=False,
+            font=dict(size=20, color="gray")
+        )
+        fig9.update_layout(height=600, title='Geographic Distribution')
     return summary_text, fig1, fig2, fig3, fig4, fig5, fig6, fig7, fig8, fig9
+def apply_smart_search(search_text):
+    """Apply smart search and return filter values"""
+    result = smart_search_parser(search_text)
+    if result is None:
+        return ['All'] * 11 + ["⚠️ No filters detected. Try: 'Brooklyn 2022 pedestrian crashes'"]
+    filters, applied = result
+    feedback = "✅ Filters Applied: " + ", ".join(applied) + "\n\nClick 'Generate Report' to see results."
+    return (
+        filters.get('borough', 'All'),
+        filters.get('year', 'All'),
+        filters.get('month', 'All'),
+        filters.get('dow', []),
+        filters.get('hour_range', (0, 23))[0],
+        filters.get('hour_range', (0, 23))[1],
+        filters.get('vehicle', 'All'),
+        filters.get('person_type', 'All'),
+        filters.get('injury', 'All'),
+        filters.get('gender', 'All'),
+        filters.get('safety', 'All'),
+        feedback
+    )
 # Build the Gradio UI: smart-search bar, filter and chart-setting columns,
 # action buttons, output plots, and the event wiring between them.
 with gr.Blocks(title="NYC Motor Vehicle Crashes Dashboard") as demo:
     gr.Markdown("# 🚗 NYC Motor Vehicle Crashes Dashboard - Enhanced Analytics")
+    gr.Markdown("### Comprehensive analysis with 5.7M+ crash records")
     # Smart search: a free-text query is parsed into filter values by
     # apply_smart_search (wired to search_btn below).
+    with gr.Accordion("🔎 Smart Search", open=True):
+        gr.Markdown("**Type natural language queries** like: `Brooklyn 2022 pedestrian crashes` or `Manhattan weekend taxi injured`")
+        with gr.Row():
+            search_input = gr.Textbox(label="Search Query",
+                                      placeholder="e.g., Queens Friday night motorcycle fatalities...",
+                                      scale=3)
+            search_btn = gr.Button("🔍 Apply Smart Search", variant="primary", scale=1)
+            clear_search_btn = gr.Button("❌ Clear", variant="stop", scale=1)
         # Shows the message produced by apply_smart_search; emptied by the
         # reset and clear handlers.
+        search_feedback = gr.Markdown(visible=True)
     with gr.Row():
         # Left column: data filters passed to generate_report.
         with gr.Column(scale=1):
+            gr.Markdown("### 🎛️ Filters")
             borough = gr.Dropdown(choices=boroughs, value='All', label="Borough")
             year = gr.Dropdown(choices=years, value='All', label="Year")
             month = gr.Dropdown(choices=months, value='All', label="Month")
             # Choices are (label, value) pairs; the handler receives the
             # integer values 0-6 (Mon=0), not the labels.
             # NOTE(review): presumably the same encoding as CRASH_DAYOFWEEK - confirm.
+            dow = gr.CheckboxGroup(
+                choices=[('Mon', 0), ('Tue', 1), ('Wed', 2), ('Thu', 3),
+                         ('Fri', 4), ('Sat', 5), ('Sun', 6)],
+                label="Day of Week", type="value"
+            )
             # Hour-of-day bounds, each selectable from 0 to 23.
+            with gr.Row():
+                hour_min = gr.Slider(minimum=0, maximum=23, value=0, step=1, label="Hour Min")
+                hour_max = gr.Slider(minimum=0, maximum=23, value=23, step=1, label="Hour Max")
             vehicle = gr.Dropdown(choices=vehicles, value='All', label="Vehicle Type 1")
             person_type = gr.Dropdown(choices=person_types, value='All', label="Person Type")
             person_injury = gr.Dropdown(choices=injury_types, value='All', label="Person Injury")
             gender = gr.Dropdown(choices=genders, value='All', label="Gender")
             safety = gr.Dropdown(choices=safety_equip, value='All', label="Safety Equipment")
         # Right column: axis/category choices for charts 1, 3, 4 and the
         # grouping column for the rate comparison chart.
         with gr.Column(scale=1):
+            gr.Markdown("### ⚙️ Chart Settings")
+            c1_x = gr.Dropdown(choices=TEMPORAL_COLS, value='CRASH_YEAR', label="Chart 1 X-axis (Trend)")
             c1_y = gr.Dropdown(choices=['count'] + NUMERIC_COLS, value='count', label="Chart 1 Y-axis")
             c3_x = gr.Dropdown(choices=CATEGORICAL_COLS, value='BOROUGH', label="Chart 3 Category")
             c3_y = gr.Dropdown(choices=['count'] + NUMERIC_COLS, value='count', label="Chart 3 Y-axis")
             c3_top = gr.Slider(minimum=5, maximum=20, value=10, step=1, label="Chart 3 Top N")
+            c4_x = gr.Dropdown(choices=TEMPORAL_COLS, value='CRASH_HOUR', label="Chart 4 X-axis (Time)")
             c4_y = gr.Dropdown(choices=['count'] + NUMERIC_COLS, value='count', label="Chart 4 Y-axis")
+            compare_cat = gr.Dropdown(
+                choices=['BOROUGH', 'VEHICLE TYPE CODE 1', 'PERSON_TYPE',
+                         'SAFETY_EQUIPMENT', 'CRASH_HOUR', 'CRASH_DAYOFWEEK'],
+                value='BOROUGH', label="Comparison Category"
+            )
     with gr.Row():
+        generate_btn = gr.Button("🔍 Generate Report", variant="primary", size="lg", scale=2)
+        reset_btn = gr.Button("🔄 Reset All Filters", variant="secondary", size="lg", scale=1)
     # Outputs: one summary block plus nine plots. Charts 1-6 sit in pairs,
     # charts 7-9 are placed outside any row.
     summary_output = gr.Markdown(label="Summary Statistics")
     with gr.Row():
+        chart1_output = gr.Plot(label="Chart 1: Trend Analysis")
+        chart2_output = gr.Plot(label="Chart 2: Person Type Distribution")
     with gr.Row():
+        chart3_output = gr.Plot(label="Chart 3: Categorical Analysis")
+        chart4_output = gr.Plot(label="Chart 4: Time Distribution")
     with gr.Row():
+        chart5_output = gr.Plot(label="Chart 5: Contributing Factor 1")
+        chart6_output = gr.Plot(label="Chart 6: Contributing Factor 2")
+    chart7_output = gr.Plot(label="Chart 7: Injury Rate Comparison")
+    chart8_output = gr.Plot(label="Chart 8: Day × Hour Heatmap")
+    chart9_output = gr.Plot(label="Chart 9: Geographic Distribution Map")
     # Event handlers. Component lists are matched to handler arguments and
     # return values by position, so their order matters.
     # The outputs list follows generate_report's return order
     # (summary_text, fig1 ... fig9).
     # NOTE(review): inputs presumably follow generate_report's parameter
     # order - confirm against its signature.
     generate_btn.click(
         fn=generate_report,
         inputs=[borough, year, month, dow, hour_min, hour_max, vehicle, person_type,
+                person_injury, gender, safety, c1_x, c1_y, c3_x, c3_y, c3_top,
+                c4_x, c4_y, compare_cat],
         outputs=[summary_output, chart1_output, chart2_output, chart3_output, chart4_output,
+                 chart5_output, chart6_output, chart7_output, chart8_output, chart9_output]
     )
     # Neutral values for the eleven filter widgets plus an empty feedback
     # message, in the order of the outputs list below. The chart settings
     # and the search textbox are not among the outputs, so they keep their
     # current values.
     def reset_all():
+        return ('All', 'All', 'All', [], 0, 23, 'All', 'All', 'All', 'All', 'All', '')
     reset_btn.click(
         fn=reset_all,
         outputs=[borough, year, month, dow, hour_min, hour_max, vehicle, person_type,
+                 person_injury, gender, safety, search_feedback]
     )
     # apply_smart_search returns its values in this same widget order, with
     # the feedback message last. It only fills in the filters; the report
     # is regenerated when generate_btn is clicked.
     search_btn.click(
         fn=apply_smart_search,
         inputs=[search_input],
         outputs=[borough, year, month, dow, hour_min, hour_max, vehicle, person_type,
+                 person_injury, gender, safety, search_feedback]
     )
     # Clears only the query text and the feedback message; the filter
     # widgets are not among the outputs and stay as they are.
+    clear_search_btn.click(
+        fn=lambda: ('', ''),
+        outputs=[search_input, search_feedback]
+    )
+if __name__ == "__main__":
     # Start the app only when this file is run directly, not on import.
     # share=False: no public share link is requested.
+    demo.launch(share=False)