baiganinn committed
Commit 86b6abc · 0 Parent(s)
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+ -----BEGIN CERTIFICATE-----
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+ -----END CERTIFICATE-----
__pycache__/predictor.cpython-311.pyc ADDED
Binary file (64.1 kB).
 
app.py ADDED
@@ -0,0 +1,402 @@
+ import gradio as gr
+ import pandas as pd
+ import json
+ import matplotlib.pyplot as plt
+ import plotly.express as px
+ import plotly.graph_objects as go
+ from plotly.subplots import make_subplots
+ import numpy as np
+ from datetime import datetime
+ import warnings
+ warnings.filterwarnings('ignore')
+
+ # Import our predictor functions
+ from predictor import predict_traffic_patterns_with_plots
+
+ def validate_csv_file(file):
+     """Validate the uploaded CSV file"""
+     try:
+         df = pd.read_csv(file.name)
+         required_columns = ['randomized_id', 'lat', 'lng']
+         optional_columns = ['azm', 'alt', 'spd']
+
+         missing_required = [col for col in required_columns if col not in df.columns]
+         available_optional = [col for col in optional_columns if col in df.columns]
+
+         if missing_required:
+             return False, f"❌ Missing required columns: {missing_required}", None, None
+
+         # Check data quality
+         if df.empty:
+             return False, "❌ The CSV file is empty", None, None
+
+         if df['lat'].isna().all() or df['lng'].isna().all():
+             return False, "❌ Latitude and longitude columns contain no valid data", None, None
+
+         # Basic statistics
+         stats = {
+             'total_records': len(df),
+             'unique_vehicles': df['randomized_id'].nunique(),
+             'total_points': f"{len(df):,} GPS points",
+             'required_columns': required_columns,
+             'optional_columns_found': available_optional,
+             'lat_range': (df['lat'].min(), df['lat'].max()),
+             'lng_range': (df['lng'].min(), df['lng'].max())
+         }
+
+         return True, "✅ CSV file validated successfully!", df, stats
+
+     except Exception as e:
+         return False, f"❌ Error reading CSV file: {str(e)}", None, None
+
+ def create_summary_text(predictions, stats):
+     """Create a beautiful summary text"""
+     if predictions['status'] != 'success':
+         return f"❌ **Analysis Failed**: {predictions.get('error_message', 'Unknown error')}"
+
+     summary = predictions['analysis_summary']
+     metadata = predictions['metadata']
+
+     text = f"""
+ # 🚗 Traffic Analysis Report
+ **Generated on:** {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
+
+ ## 📊 Dataset Overview
+ - **Total GPS Records:** {metadata['sample_size_used']:,}
+ - **Unique Vehicles:** {metadata['unique_vehicles']:,}
+ - **Geographic Coverage:** {stats['lat_range'][0]:.4f}° to {stats['lat_range'][1]:.4f}° (Lat), {stats['lng_range'][0]:.4f}° to {stats['lng_range'][1]:.4f}° (Lng)
+
+ ## 🛣️ Popular Routes Analysis
+ - **Route Clusters Identified:** {summary['popular_routes']['total_route_clusters']}
+
+ ### Top 5 Popular Routes:
+ """
+
+     if summary['popular_routes']['top_5_routes']:
+         for i, route in enumerate(summary['popular_routes']['top_5_routes'], 1):
+             text += f"""
+ **Route {i}:** `{route['route_id']}`
+ - 🚙 **Trips:** {route['trip_count']} ({route['popularity_percentage']:.1f}% of all routes)
+ - 📏 **Average Length:** {route['avg_length_km']:.2f} km
+ - 📍 **Start:** ({route['start_location']['lat']:.4f}, {route['start_location']['lng']:.4f})
+ - 🏁 **End:** ({route['end_location']['lat']:.4f}, {route['end_location']['lng']:.4f})
+ """
+     else:
+         text += "\n*No popular routes identified in the dataset.*"
+
+     text += f"""
+
+ ## 🚦 Congestion Analysis
+ - **Congestion Areas Found:** {summary['tight_places']['total_congestion_areas']}
+ - **Severity Breakdown:**
+   - 🔴 High: {summary['tight_places']['severity_breakdown'].get('High', 0)}
+   - 🟡 Medium: {summary['tight_places']['severity_breakdown'].get('Medium', 0)}
+   - 🟢 Low: {summary['tight_places']['severity_breakdown'].get('Low', 0)}
+
+ ### Top 5 Congestion Areas:
+ """
+
+     if summary['tight_places']['top_5_congestion_areas']:
+         for i, area in enumerate(summary['tight_places']['top_5_congestion_areas'], 1):
+             severity_emoji = {'High': '🔴', 'Medium': '🟡', 'Low': '🟢'}
+             text += f"""
+ **Area {i}:** `{area['area_id']}`
+ - {severity_emoji.get(area['severity'], '⚪')} **Severity:** {area['severity']}
+ - 🚗 **Vehicles Affected:** {area['unique_vehicles']}
+ - ⚡ **Average Speed:** {area['avg_speed_kmh']:.1f} km/h
+ - 📍 **Location:** ({area['location']['lat']:.4f}, {area['location']['lng']:.4f})
+ - 📈 **Congestion Score:** {area['congestion_score']:.2f}
+ """
+     else:
+         text += "\n*No significant congestion areas detected.*"
+
+     return text
+
+ def analyze_traffic_data(file, sample_size, progress=gr.Progress()):
+     """Main analysis function"""
+     if file is None:
+         return (
+             "❌ Please upload a CSV file first!",
+             "No analysis performed.",
+             None, None, None, None,
+             None, None
+         )
+
+     progress(0.1, desc="Validating CSV file...")
+
+     # Validate file
+     is_valid, message, df, stats = validate_csv_file(file)
+     if not is_valid:
+         return (
+             message,
+             "Please check your CSV file format and try again.",
+             None, None, None, None,
+             None, None
+         )
+
+     progress(0.2, desc="Starting traffic analysis...")
+
+     try:
+         # Run the analysis
+         progress(0.3, desc="Processing GPS data...")
+         predictions, figures = predict_traffic_patterns_with_plots(df, sample_size=sample_size)
+
+         if predictions['status'] != 'success':
+             return (
+                 f"❌ Analysis failed: {predictions['error_message']}",
+                 "Please check your data and try again.",
+                 None, None, None, None,
+                 None, None
+             )
+
+         progress(0.8, desc="Generating visualizations...")
+
+         # Create summary text
+         summary_text = create_summary_text(predictions, stats)
+
+         # Convert predictions to pretty JSON
+         json_output = json.dumps(predictions, indent=2, default=str)
+
+         progress(1.0, desc="Analysis complete!")
+
+         return (
+             "✅ Analysis completed successfully!",
+             summary_text,
+             figures.get('popular_routes'),
+             figures.get('tight_places'),
+             figures.get('combined_analysis'),
+             figures.get('statistics_dashboard'),
+             json_output,
+             gr.update(visible=True)
+         )
+
+     except Exception as e:
+         return (
+             f"❌ Error during analysis: {str(e)}",
+             "An unexpected error occurred. Please check your data format.",
+             None, None, None, None,
+             None, None
+         )
+
+ def create_sample_data():
+     """Create sample data for demonstration"""
+     np.random.seed(42)
+     n_points = 1000
+     n_vehicles = 50
+
+     # Create sample data around Astana coordinates
+     base_lat, base_lng = 51.1694, 71.4491
+
+     data = []
+     for vehicle_id in range(n_vehicles):
+         n_points_vehicle = np.random.randint(10, 30)
+
+         # Random walk for each vehicle
+         start_lat = base_lat + np.random.normal(0, 0.02)
+         start_lng = base_lng + np.random.normal(0, 0.02)
+
+         lat, lng = start_lat, start_lng
+
+         for i in range(n_points_vehicle):
+             # Random walk
+             lat += np.random.normal(0, 0.001)
+             lng += np.random.normal(0, 0.001)
+
+             data.append({
+                 'randomized_id': f'vehicle_{vehicle_id}',
+                 'lat': lat,
+                 'lng': lng,
+                 'azm': np.random.randint(0, 360),
+                 'alt': np.random.randint(200, 400),
+                 'spd': max(0, np.random.normal(30, 15))
+             })
+
+     df = pd.DataFrame(data)
+     sample_file = "sample_traffic_data.csv"
+     df.to_csv(sample_file, index=False)
+
+     return sample_file
+
+ # Custom CSS for beautiful styling
+ custom_css = """
+ .gradio-container {
+     max-width: 1200px !important;
+     margin: auto !important;
+ }
+
+ .header-text {
+     text-align: center;
+     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+     -webkit-background-clip: text;
+     -webkit-text-fill-color: transparent;
+     font-size: 2.5em;
+     font-weight: bold;
+     margin-bottom: 20px;
+ }
+
+ .description-text {
+     text-align: center;
+     font-size: 1.2em;
+     color: #666;
+     margin-bottom: 30px;
+ }
+
+ .status-success {
+     background-color: #d4edda;
+     border: 1px solid #c3e6cb;
+     color: #155724;
+     padding: 15px;
+     border-radius: 5px;
+     margin: 10px 0;
+ }
+
+ .status-error {
+     background-color: #f8d7da;
+     border: 1px solid #f5c6cb;
+     color: #721c24;
+     padding: 15px;
+     border-radius: 5px;
+     margin: 10px 0;
+ }
+
+ .plot-container {
+     border: 2px solid #e9ecef;
+     border-radius: 10px;
+     padding: 10px;
+     margin: 10px 0;
+ }
+ """
+
+ # Create the Gradio interface
+ with gr.Blocks(css=custom_css, title="🚗 Advanced Traffic Analytics", theme=gr.themes.Soft()) as app:
+     gr.HTML("""
+ <div class="header-text">
+     🚗 Advanced Traffic Analytics Dashboard
+ </div>
+ <div class="description-text">
+     Upload your GPS tracking data and get comprehensive traffic analysis with route optimization and congestion detection
+ </div>
+ """)
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             gr.Markdown("## 📁 Data Upload & Configuration")
+
+             file_input = gr.File(
+                 label="📄 Upload CSV File",
+                 file_types=[".csv"]
+             )
+             gr.Markdown("*Upload a CSV file with columns: randomized_id, lat, lng, azm (optional), alt (optional), spd (optional)*")
+
+             sample_size = gr.Slider(
+                 minimum=1000,
+                 maximum=1000000,
+                 value=500000,
+                 step=10000,
+                 label="📊 Sample Size for Analysis"
+             )
+             gr.Markdown("*Number of GPS points to analyze (larger = more accurate but slower)*")
+
+             with gr.Row():
+                 analyze_btn = gr.Button("🚀 Analyze Traffic Data", variant="primary", size="lg")
+                 sample_btn = gr.Button("📋 Generate Sample Data", variant="secondary")
+
+             gr.Markdown("### 📋 Required CSV Format:")
+             gr.Markdown("""
+ - **randomized_id**: Vehicle identifier
+ - **lat**: Latitude (required)
+ - **lng**: Longitude (required)
+ - **azm**: Azimuth/bearing (optional)
+ - **alt**: Altitude (optional)
+ - **spd**: Speed (optional)
+ """)
+
+         with gr.Column(scale=2):
+             gr.Markdown("## 📈 Analysis Status")
+             status_output = gr.Textbox(
+                 label="Status",
+                 value="Ready to analyze. Please upload a CSV file.",
+                 interactive=False
+             )
+
+     # Results section
+     with gr.Row(visible=False) as results_section:
+         gr.Markdown("## 📊 Analysis Results")
+
+     with gr.Row():
+         with gr.Column():
+             summary_output = gr.Markdown("## Analysis Summary")
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown("### 🛣️ Popular Routes Visualization")
+             plot1 = gr.Plot(label="Popular Routes Map")
+
+         with gr.Column():
+             gr.Markdown("### 🚦 Congestion Areas")
+             plot2 = gr.Plot(label="Traffic Congestion Heatmap")
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown("### 🗺️ Combined Analysis")
+             plot3 = gr.Plot(label="Routes & Congestion Combined")
+
+         with gr.Column():
+             gr.Markdown("### 📈 Statistical Dashboard")
+             plot4 = gr.Plot(label="Traffic Statistics")
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown("### 📄 Raw JSON Output")
+             json_output = gr.Code(
+                 label="Analysis Results (JSON)",
+                 language="json",
+                 lines=20
+             )
+
+     # Event handlers
+     analyze_btn.click(
+         fn=analyze_traffic_data,
+         inputs=[file_input, sample_size],
+         outputs=[
+             status_output,
+             summary_output,
+             plot1,
+             plot2,
+             plot3,
+             plot4,
+             json_output,
+             results_section
+         ]
+     )
+
+     sample_btn.click(
+         fn=create_sample_data,
+         outputs=file_input
+     )
+
+     # Footer
+     gr.HTML("""
+ <div style="text-align: center; margin-top: 50px; padding: 20px; background-color: #f8f9fa; border-radius: 10px; color: black;">
+     <h3 style="color: black;">🚗 Advanced Traffic Analytics</h3>
+     <p style="color: black;">Powered by Machine Learning • Built with Gradio • GPS Data Analysis</p>
+     <p style="color: black;"><em>Upload your traffic data and discover insights about popular routes and congestion patterns!</em></p>
+ </div>
+ """)
+
+ if __name__ == "__main__":
+     print("🚀 Starting Advanced Traffic Analytics Dashboard...")
+     print("📊 Features:")
+     print("   • Popular Routes Detection")
+     print("   • Congestion Area Analysis")
+     print("   • Statistical Dashboards")
+     print("   • Interactive Visualizations")
+     print("\n🌐 Opening in browser...")
+
+     app.launch(
+         share=True,
+         show_error=True,
+         debug=True,
+         server_name="0.0.0.0",
+         server_port=7860
+     )
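
For reference, a minimal input in the format validate_csv_file() checks for might look like the following (illustrative values; only randomized_id, lat, and lng are required, while azm, alt, and spd are optional):

randomized_id,lat,lng,azm,alt,spd
vehicle_0,51.1694,71.4491,135,350,42.5
vehicle_0,51.1698,71.4503,138,351,38.0
vehicle_1,51.1651,71.4412,20,346,0.0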
predictor.py ADDED
@@ -0,0 +1,1159 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ import seaborn as sns
5
+ from sklearn.cluster import DBSCAN, KMeans
6
+ from sklearn.preprocessing import StandardScaler
7
+ from sklearn.ensemble import IsolationForest
8
+ from sklearn.model_selection import train_test_split
9
+ from sklearn.metrics import silhouette_score
10
+ from scipy.spatial.distance import pdist, squareform
11
+ import json
12
+ import warnings
13
+ warnings.filterwarnings('ignore')
14
+
15
+ class AdvancedGeoTrackAnalyzer:
16
+ def __init__(self, data_path_or_df, sample_size=400000):
17
+ """
18
+ Initialize the analyzer with data path or DataFrame
19
+
20
+ Parameters:
21
+ data_path_or_df: str or pandas.DataFrame - Path to CSV file or DataFrame
22
+ sample_size: int - Maximum number of rows to use for training (default 400k)
23
+ """
24
+ if isinstance(data_path_or_df, str):
25
+ print(f"Loading data from {data_path_or_df}")
26
+ self.df = pd.read_csv(data_path_or_df)
27
+ else:
28
+ self.df = data_path_or_df.copy()
29
+
30
+ print(f"Original dataset size: {len(self.df):,} rows")
31
+ print(f"Available columns: {list(self.df.columns)}")
32
+
33
+ # Sample data if it's too large
34
+ if len(self.df) > sample_size:
35
+ print(f"Sampling {sample_size:,} rows from {len(self.df):,} total rows")
36
+ self.df = self.df.sample(n=sample_size, random_state=42).reset_index(drop=True)
37
+ print(f"Using sampled dataset of {len(self.df):,} rows")
38
+
39
+ self.processed_df = None
40
+ self.routes = None
41
+ self.tight_places = None
42
+
43
+ def preprocess_data(self):
44
+ """Preprocess the geo-tracking data"""
45
+ print("Preprocessing data...")
46
+
47
+ # Make a copy for processing
48
+ self.processed_df = self.df.copy()
49
+
50
+ # Reset index to avoid ambiguity issues
51
+ self.processed_df = self.processed_df.reset_index(drop=True)
52
+
53
+ # Check for required columns
54
+ required_cols = ['randomized_id', 'lat', 'lng']
55
+ missing_cols = [col for col in required_cols if col not in self.processed_df.columns]
56
+ if missing_cols:
57
+ raise ValueError(f"Missing required columns: {missing_cols}")
58
+
59
+ # Check for optional columns
60
+ has_speed = 'spd' in self.processed_df.columns
61
+ has_azimuth = 'azm' in self.processed_df.columns
62
+
63
+ print(f"Speed data available: {has_speed}")
64
+ print(f"Azimuth data available: {has_azimuth}")
65
+
66
+ # Sort by randomized_id for trajectory analysis
67
+ self.processed_df = self.processed_df.sort_values(['randomized_id']).reset_index(drop=True)
68
+
69
+ # Feature engineering
70
+ print("Creating derived features...")
71
+
72
+ # Group by randomized_id to calculate trajectory features
73
+ grouped = self.processed_df.groupby('randomized_id')
74
+
75
+ # Calculate distance between consecutive points in each trajectory
76
+ def haversine_distance(lat1, lon1, lat2, lon2):
77
+ """Calculate the great circle distance between two points on earth"""
78
+ # Convert decimal degrees to radians
79
+ lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
80
+
81
+ # Haversine formula
82
+ dlat = lat2 - lat1
83
+ dlon = lon2 - lon1
84
+ a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
85
+ c = 2 * np.arcsin(np.sqrt(a))
86
+ r = 6371 # Radius of earth in kilometers
87
+ return c * r * 1000 # Convert to meters
88
+
89
+ # Calculate distance between consecutive points
90
+ lat_prev = grouped['lat'].shift(1)
91
+ lng_prev = grouped['lng'].shift(1)
92
+
93
+ self.processed_df['distance_to_prev'] = haversine_distance(
94
+ lat_prev, lng_prev,
95
+ self.processed_df['lat'], self.processed_df['lng']
96
+ ).fillna(0)
97
+
98
+ # Speed-related features if speed data is available
99
+ if has_speed:
100
+ self.processed_df['speed_change'] = grouped['spd'].diff().fillna(0)
101
+ else:
102
+ # Estimate speed from distance (assuming 1 second intervals)
103
+ self.processed_df['estimated_speed'] = self.processed_df['distance_to_prev'] * 3.6 # m/s to km/h
104
+ self.processed_df['speed_change'] = grouped['estimated_speed'].diff().fillna(0)
105
+
106
+ # Direction features if azimuth data is available
107
+ if has_azimuth:
108
+ self.processed_df['direction_change'] = grouped['azm'].diff().fillna(0)
109
+ else:
110
+ # Calculate bearing between consecutive points
111
+ def calculate_bearing(lat1, lon1, lat2, lon2):
112
+ lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
113
+ dlon = lon2 - lon1
114
+ y = np.sin(dlon) * np.cos(lat2)
115
+ x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(dlon)
116
+ bearing = np.degrees(np.arctan2(y, x))
117
+ return (bearing + 360) % 360
118
+
119
+ bearing = calculate_bearing(
120
+ lat_prev, lng_prev,
121
+ self.processed_df['lat'], self.processed_df['lng']
122
+ )
123
+ self.processed_df['calculated_bearing'] = bearing
124
+ self.processed_df['direction_change'] = grouped['calculated_bearing'].diff().fillna(0)
125
+
126
+ # Remove rows with invalid coordinates
127
+ self.processed_df = self.processed_df[
128
+ (self.processed_df['lat'].between(-90, 90)) &
129
+ (self.processed_df['lng'].between(-180, 180))
130
+ ].reset_index(drop=True)
131
+
132
+ print(f"Preprocessing complete. Final dataset: {len(self.processed_df):,} rows")
133
+ def identify_popular_routes(self, eps_route=0.01, min_samples_route=5):
134
+ """Identify popular routes by clustering start-end point pairs - Compatible with generate_report"""
135
+ print("Identifying popular routes...")
136
+
137
+ if self.processed_df is None:
138
+ raise ValueError("Data must be preprocessed first")
139
+
140
+ # Extract start and end points for each trajectory
141
+ print("Extracting trajectory start and end points...")
142
+ trajectory_summary = self.processed_df.groupby('randomized_id').agg({
143
+ 'lat': ['first', 'last', 'count'],
144
+ 'lng': ['first', 'last']
145
+ }).reset_index()
146
+
147
+ # Flatten column names
148
+ trajectory_summary.columns = [
149
+ 'randomized_id', 'start_lat', 'end_lat', 'point_count', 'start_lng', 'end_lng'
150
+ ]
151
+
152
+ print(f"Total trajectories: {len(trajectory_summary)}")
153
+
154
+ # Filter trajectories with minimum points (at least 3 points to be considered a route)
155
+ valid_trajectories = trajectory_summary[trajectory_summary['point_count'] >= 3].copy()
156
+ print(f"Trajectories with ≥3 points: {len(valid_trajectories)}")
157
+
158
+ if len(valid_trajectories) == 0:
159
+ print("No valid trajectories found")
160
+ self.routes = {}
161
+ return {}
162
+
163
+ # Calculate route distances to filter out very short routes
164
+ valid_trajectories['route_distance_deg'] = np.sqrt(
165
+ (valid_trajectories['end_lat'] - valid_trajectories['start_lat'])**2 +
166
+ (valid_trajectories['end_lng'] - valid_trajectories['start_lng'])**2
167
+ )
168
+
169
+ # Use a more lenient distance threshold
170
+ distance_threshold = valid_trajectories['route_distance_deg'].quantile(0.1) # Bottom 10%
171
+ print(f"Distance threshold: {distance_threshold:.6f} degrees")
172
+
173
+ # Filter out very short routes
174
+ meaningful_routes = valid_trajectories[
175
+ valid_trajectories['route_distance_deg'] > distance_threshold
176
+ ].copy()
177
+
178
+ print(f"Routes after distance filtering: {len(meaningful_routes)}")
179
+
180
+ if len(meaningful_routes) < min_samples_route:
181
+ print(f"Not enough meaningful routes ({len(meaningful_routes)}) for clustering (need at least {min_samples_route})")
182
+ # Lower the minimum samples requirement
183
+ min_samples_route = max(2, len(meaningful_routes) // 5)
184
+ print(f"Adjusting min_samples_route to: {min_samples_route}")
185
+
186
+ if len(meaningful_routes) < 2:
187
+ print("Not enough routes for any clustering")
188
+ self.routes = {}
189
+ return {}
190
+
191
+ # Create route vectors for clustering
192
+ route_vectors = meaningful_routes[['start_lat', 'start_lng', 'end_lat', 'end_lng']].values
193
+
194
+ print(f"Route vectors shape: {route_vectors.shape}")
195
+
196
+ # Initialize routes dictionary
197
+ self.routes = {}
198
+
199
+ # Try multiple clustering approaches
200
+ # Method 1: DBSCAN with geographic coordinates
201
+ print("\nTrying DBSCAN clustering...")
202
+ try:
203
+ # Scale the coordinates
204
+ scaler = StandardScaler()
205
+ scaled_routes = scaler.fit_transform(route_vectors)
206
+
207
+ # Try different eps values
208
+ eps_values = [0.1, 0.2, 0.5, 1.0, 1.5, 2.0]
209
+ best_eps = None
210
+ best_clusters = None
211
+ max_clusters = 0
212
+
213
+ for eps in eps_values:
214
+ clustering = DBSCAN(eps=eps, min_samples=min_samples_route)
215
+ cluster_labels = clustering.fit_predict(scaled_routes)
216
+ n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
217
+ n_noise = list(cluster_labels).count(-1)
218
+
219
+ print(f" eps={eps}: {n_clusters} clusters, {n_noise} noise points")
220
+
221
+ if n_clusters > max_clusters and n_clusters <= len(meaningful_routes) // 2:
222
+ max_clusters = n_clusters
223
+ best_eps = eps
224
+ best_clusters = cluster_labels
225
+
226
+ if best_clusters is not None and max_clusters > 0:
227
+ print(f"Best DBSCAN result: eps={best_eps}, {max_clusters} clusters")
228
+
229
+ unique_clusters = np.unique(best_clusters[best_clusters != -1])
230
+
231
+ for cluster_id in unique_clusters:
232
+ cluster_mask = best_clusters == cluster_id
233
+ cluster_routes = route_vectors[cluster_mask]
234
+ cluster_trajectory_ids = meaningful_routes.loc[
235
+ meaningful_routes.index[cluster_mask], 'randomized_id'
236
+ ].values
237
+
238
+ # Calculate cluster statistics
239
+ avg_start_lat = np.mean(cluster_routes[:, 0])
240
+ avg_start_lng = np.mean(cluster_routes[:, 1])
241
+ avg_end_lat = np.mean(cluster_routes[:, 2])
242
+ avg_end_lng = np.mean(cluster_routes[:, 3])
243
+
244
+ # Calculate average route length in METERS (for compatibility with generate_report)
245
+ route_length_m = np.mean([
246
+ self.haversine_distance_m(route[0], route[1], route[2], route[3])
247
+ for route in cluster_routes
248
+ ])
249
+
250
+ self.routes[f"dbscan_{cluster_id}"] = {
251
+ 'route_count': len(cluster_routes),
252
+ 'trajectory_ids': cluster_trajectory_ids.tolist(),
253
+ 'avg_start_point': {'lat': avg_start_lat, 'lng': avg_start_lng},
254
+ 'avg_end_point': {'lat': avg_end_lat, 'lng': avg_end_lng},
255
+ 'avg_route_length_m': route_length_m, # In meters for compatibility
256
+ 'popularity_score': len(cluster_routes) / len(meaningful_routes) * 100,
257
+ 'method': 'DBSCAN'
258
+ }
259
+
260
+ except Exception as e:
261
+ print(f"DBSCAN failed: {e}")
262
+
263
+ # Method 2: KMeans clustering if DBSCAN didn't work well
264
+ if len(self.routes) == 0:
265
+ print("\nTrying KMeans clustering...")
266
+ try:
267
+ # Try different numbers of clusters
268
+ max_k = min(10, len(meaningful_routes) // 3)
269
+
270
+ if max_k >= 2:
271
+ scaler = StandardScaler()
272
+ scaled_routes = scaler.fit_transform(route_vectors)
273
+
274
+ best_k = 2
275
+ best_score = -1
276
+
277
+ for k in range(2, max_k + 1):
278
+ kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
279
+ cluster_labels = kmeans.fit_predict(scaled_routes)
280
+
281
+ # Calculate silhouette score
282
+ try:
283
+ score = silhouette_score(scaled_routes, cluster_labels)
284
+ print(f" k={k}: silhouette score = {score:.3f}")
285
+
286
+ if score > best_score:
287
+ best_score = score
288
+ best_k = k
289
+ except:
290
+ continue
291
+
292
+ # Use best k
293
+ print(f"Using k={best_k} (best silhouette score: {best_score:.3f})")
294
+ kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
295
+ cluster_labels = kmeans.fit_predict(scaled_routes)
296
+
297
+ for cluster_id in range(best_k):
298
+ cluster_mask = cluster_labels == cluster_id
299
+ cluster_routes = route_vectors[cluster_mask]
300
+ cluster_trajectory_ids = meaningful_routes.loc[
301
+ meaningful_routes.index[cluster_mask], 'randomized_id'
302
+ ].values
303
+
304
+ if len(cluster_routes) >= 2: # At least 2 routes in cluster
305
+ # Calculate cluster statistics
306
+ avg_start_lat = np.mean(cluster_routes[:, 0])
307
+ avg_start_lng = np.mean(cluster_routes[:, 1])
308
+ avg_end_lat = np.mean(cluster_routes[:, 2])
309
+ avg_end_lng = np.mean(cluster_routes[:, 3])
310
+
311
+ # Calculate average route length in METERS
312
+ route_length_m = np.mean([
313
+ self.haversine_distance_m(route[0], route[1], route[2], route[3])
314
+ for route in cluster_routes
315
+ ])
316
+
317
+ self.routes[f"kmeans_{cluster_id}"] = {
318
+ 'route_count': len(cluster_routes),
319
+ 'trajectory_ids': cluster_trajectory_ids.tolist(),
320
+ 'avg_start_point': {'lat': avg_start_lat, 'lng': avg_start_lng},
321
+ 'avg_end_point': {'lat': avg_end_lat, 'lng': avg_end_lng},
322
+ 'avg_route_length_m': route_length_m, # In meters for compatibility
323
+ 'popularity_score': len(cluster_routes) / len(meaningful_routes) * 100,
324
+ 'method': 'KMeans'
325
+ }
326
+
327
+ except Exception as e:
328
+ print(f"KMeans failed: {e}")
329
+
330
+ # Method 3: Simple grid-based clustering if both fail
331
+ if len(self.routes) == 0:
332
+ print("\nTrying grid-based clustering...")
333
+ try:
334
+ # Create a simple grid-based approach
335
+ lat_bins = 20
336
+ lng_bins = 20
337
+
338
+ # Create bins for start and end points
339
+ start_lat_bins = pd.cut(meaningful_routes['start_lat'], bins=lat_bins, labels=False)
340
+ start_lng_bins = pd.cut(meaningful_routes['start_lng'], bins=lng_bins, labels=False)
341
+ end_lat_bins = pd.cut(meaningful_routes['end_lat'], bins=lat_bins, labels=False)
342
+ end_lng_bins = pd.cut(meaningful_routes['end_lng'], bins=lng_bins, labels=False)
343
+
344
+ # Create route signatures
345
+ meaningful_routes['route_signature'] = (
346
+ start_lat_bins.astype(str) + '_' + start_lng_bins.astype(str) + '_' +
347
+ end_lat_bins.astype(str) + '_' + end_lng_bins.astype(str)
348
+ )
349
+
350
+ # Count routes by signature
351
+ signature_counts = meaningful_routes['route_signature'].value_counts()
352
+ popular_signatures = signature_counts[signature_counts >= 2] # At least 2 routes
353
+
354
+ print(f"Found {len(popular_signatures)} popular route patterns")
355
+
356
+ for i, (signature, count) in enumerate(popular_signatures.head(10).items()):
357
+ cluster_routes_df = meaningful_routes[meaningful_routes['route_signature'] == signature]
358
+
359
+ # Calculate average route length in METERS
360
+ route_length_m = np.mean([
361
+ self.haversine_distance_m(row['start_lat'], row['start_lng'],
362
+ row['end_lat'], row['end_lng'])
363
+ for _, row in cluster_routes_df.iterrows()
364
+ ])
365
+
366
+ self.routes[f"grid_{i}"] = {
367
+ 'route_count': count,
368
+ 'trajectory_ids': cluster_routes_df['randomized_id'].tolist(),
369
+ 'avg_start_point': {
370
+ 'lat': cluster_routes_df['start_lat'].mean(),
371
+ 'lng': cluster_routes_df['start_lng'].mean()
372
+ },
373
+ 'avg_end_point': {
374
+ 'lat': cluster_routes_df['end_lat'].mean(),
375
+ 'lng': cluster_routes_df['end_lng'].mean()
376
+ },
377
+ 'avg_route_length_m': route_length_m, # In meters for compatibility
378
+ 'popularity_score': count / len(meaningful_routes) * 100,
379
+ 'method': 'Grid-based'
380
+ }
381
+
382
+ except Exception as e:
383
+ print(f"Grid-based clustering failed: {e}")
384
+
385
+ # Sort routes by popularity
386
+ if self.routes:
387
+ self.routes = dict(sorted(
388
+ self.routes.items(),
389
+ key=lambda x: x[1]['route_count'],
390
+ reverse=True
391
+ ))
392
+
393
+ print(f"\nSuccessfully identified {len(self.routes)} popular route clusters!")
394
+ for route_id, route_info in list(self.routes.items())[:5]:
395
+ print(f" {route_id}: {route_info['route_count']} trips ({route_info['popularity_score']:.1f}%)")
396
+ else:
397
+ print("No popular routes could be identified")
398
+ self.routes = {}
399
+
400
+ return self.routes
401
+
402
+ def haversine_distance_m(self, lat1, lon1, lat2, lon2):
403
+ """Calculate haversine distance in METERS (for compatibility with generate_report)"""
404
+ # Convert decimal degrees to radians
405
+ lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
406
+
407
+ # Haversine formula
408
+ dlat = lat2 - lat1
409
+ dlon = lon2 - lon1
410
+ a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
411
+ c = 2 * np.arcsin(np.sqrt(a))
412
+ r = 6371 # Radius of earth in kilometers
413
+ return c * r * 1000 # Return in METERS
414
+ def identify_tight_places(self, eps_tight=0.0005, min_samples_tight=50, density_threshold=0.8):
415
+ """Identify tight places (congestion areas) based on point density and movement patterns"""
416
+ print("Identifying tight places (congestion areas)...")
417
+
418
+ if self.processed_df is None:
419
+ raise ValueError("Data must be preprocessed first")
420
+
421
+ # Use all GPS points for density analysis
422
+ coords = self.processed_df[['lat', 'lng']].values
423
+
424
+ # Apply DBSCAN clustering to find high-density areas
425
+ clustering = DBSCAN(eps=eps_tight, min_samples=min_samples_tight)
426
+ clusters = clustering.fit_predict(coords)
427
+
428
+ # Add cluster labels to dataframe
429
+ self.processed_df['density_cluster'] = clusters
430
+
431
+ # Analyze each cluster to identify tight places
432
+ unique_clusters = np.unique(clusters[clusters != -1])
433
+
434
+ self.tight_places = {}
435
+ for cluster_id in unique_clusters:
436
+ cluster_mask = clusters == cluster_id
437
+ cluster_points = coords[cluster_mask]
438
+ cluster_data = self.processed_df[self.processed_df['density_cluster'] == cluster_id]
439
+
440
+ # Calculate density metrics
441
+ cluster_area_km2 = self.calculate_cluster_area(cluster_points)
442
+ point_density = len(cluster_points) / max(cluster_area_km2, 0.001) # points per km²
443
+
444
+ # Calculate movement characteristics
445
+ if 'spd' in cluster_data.columns:
446
+ avg_speed = cluster_data['spd'].mean()
447
+ speed_variance = cluster_data['spd'].var()
448
+ else:
449
+ avg_speed = cluster_data['estimated_speed'].mean()
450
+ speed_variance = cluster_data['estimated_speed'].var()
451
+
452
+ # Calculate how many unique vehicles pass through this area
453
+ unique_vehicles = cluster_data['randomized_id'].nunique()
454
+
455
+ # Calculate congestion indicators
456
+ # Low speed + high density + many vehicles = congestion
457
+ congestion_score = (point_density * unique_vehicles) / max(avg_speed, 1)
458
+
459
+ # Identify as tight place if meets criteria
460
+ is_tight_place = (
461
+ point_density > density_threshold * np.mean([
462
+ len(coords[clusters == c]) / max(self.calculate_cluster_area(coords[clusters == c]), 0.001)
463
+ for c in unique_clusters
464
+ ]) and
465
+ avg_speed < np.percentile(self.processed_df.get('spd', self.processed_df.get('estimated_speed', [30])), 25)
466
+ )
467
+
468
+ self.tight_places[cluster_id] = {
469
+ 'center_lat': np.mean(cluster_points[:, 0]),
470
+ 'center_lng': np.mean(cluster_points[:, 1]),
471
+ 'point_count': len(cluster_points),
472
+ 'unique_vehicles': unique_vehicles,
473
+ 'area_km2': cluster_area_km2,
474
+ 'point_density_per_km2': point_density,
475
+ 'avg_speed_kmh': avg_speed,
476
+ 'speed_variance': speed_variance,
477
+ 'congestion_score': congestion_score,
478
+ 'is_tight_place': is_tight_place,
479
+ 'severity': 'High' if congestion_score > np.percentile([
480
+ (len(coords[clusters == c]) * self.processed_df[self.processed_df['density_cluster'] == c]['randomized_id'].nunique()) /
481
+ max(self.processed_df[self.processed_df['density_cluster'] == c].get('spd', self.processed_df[self.processed_df['density_cluster'] == c].get('estimated_speed', [30])).mean(), 1)
482
+ for c in unique_clusters
483
+ ], 75) else 'Medium' if congestion_score > np.percentile([
484
+ (len(coords[clusters == c]) * self.processed_df[self.processed_df['density_cluster'] == c]['randomized_id'].nunique()) /
485
+ max(self.processed_df[self.processed_df['density_cluster'] == c].get('spd', self.processed_df[self.processed_df['density_cluster'] == c].get('estimated_speed', [30])).mean(), 1)
486
+ for c in unique_clusters
487
+ ], 50) else 'Low'
488
+ }
489
+
490
+ # Filter to only tight places
491
+ self.tight_places = {
492
+ k: v for k, v in self.tight_places.items()
493
+ if v['is_tight_place']
494
+ }
495
+
496
+ # Sort by congestion score
497
+ self.tight_places = dict(sorted(
498
+ self.tight_places.items(),
499
+ key=lambda x: x[1]['congestion_score'],
500
+ reverse=True
501
+ ))
502
+
503
+ print(f"Identified {len(self.tight_places)} tight places (congestion areas)")
504
+ return self.tight_places
505
+
506
+ def calculate_cluster_area(self, points):
507
+ """Calculate the approximate area of a cluster in km²"""
508
+ if len(points) < 3:
509
+ return 0.001 # Minimum area for small clusters
510
+
511
+ # Use convex hull approach for area calculation
512
+ from scipy.spatial import ConvexHull
513
+
514
+ try:
515
+ hull = ConvexHull(points)
516
+ # Convert to meters using rough approximation
517
+ lat_to_m = 111000 # meters per degree latitude
518
+ lng_to_m = 111000 * np.cos(np.radians(np.mean(points[:, 0]))) # adjust for longitude
519
+
520
+ # Scale points to meters
521
+ points_m = points.copy()
522
+ points_m[:, 0] *= lat_to_m
523
+ points_m[:, 1] *= lng_to_m
524
+
525
+ hull_m = ConvexHull(points_m)
526
+ area_m2 = hull_m.volume # In 2D, volume gives area
527
+ area_km2 = area_m2 / 1_000_000 # Convert to km²
528
+
529
+ return max(area_km2, 0.001) # Minimum area
530
+ except:
531
+ # Fallback: bounding box area
532
+ lat_range = np.max(points[:, 0]) - np.min(points[:, 0])
533
+ lng_range = np.max(points[:, 1]) - np.min(points[:, 1])
534
+ area_deg2 = lat_range * lng_range
535
+ area_km2 = area_deg2 * 111 * 111 # rough conversion
536
+ return max(area_km2, 0.001)
537
+
538
+ def analyze_route_efficiency(self):
539
+ """Analyze route efficiency and suggest optimizations"""
540
+ print("Analyzing route efficiency...")
541
+
542
+ if not self.routes:
543
+ print("No routes identified. Run identify_popular_routes() first.")
544
+ return {}
545
+
546
+ efficiency_analysis = {}
547
+
548
+ for route_id, route_info in self.routes.items():
549
+ trajectory_ids = route_info['trajectory_ids']
550
+
551
+ # Get all trajectories for this route
552
+ route_trajectories = self.processed_df[
553
+ self.processed_df['randomized_id'].isin(trajectory_ids)
554
+ ]
555
+
556
+ # Calculate efficiency metrics
557
+ total_distances = []
558
+ total_times = []
559
+ avg_speeds = []
560
+
561
+ for traj_id in trajectory_ids:
562
+ traj_data = route_trajectories[route_trajectories['randomized_id'] == traj_id]
563
+
564
+ if len(traj_data) > 1:
565
+ total_distance = traj_data['distance_to_prev'].sum()
566
+ total_distances.append(total_distance)
567
+
568
+ if 'spd' in traj_data.columns:
569
+ avg_speed = traj_data['spd'].mean()
570
+ else:
571
+ avg_speed = traj_data['estimated_speed'].mean()
572
+ avg_speeds.append(avg_speed)
573
+
574
+ if total_distances and avg_speeds:
575
+ efficiency_analysis[route_id] = {
576
+ 'avg_distance_m': np.mean(total_distances),
577
+ 'distance_variance': np.var(total_distances),
578
+ 'avg_speed_kmh': np.mean(avg_speeds),
579
+ 'speed_consistency': 1 / (1 + np.var(avg_speeds)), # Higher is more consistent
580
+ 'efficiency_score': np.mean(avg_speeds) / max(np.mean(total_distances) / 1000, 0.1), # Speed per km
581
+ 'route_optimization_potential': 'High' if np.var(total_distances) > np.mean(total_distances) * 0.3 else 'Low'
582
+ }
583
+
584
+ return efficiency_analysis
585
+
586
+ def create_visualizations_for_gradio(self):
587
+ """Create visualizations and return figures for Gradio (plotly for routes, matplotlib for others)"""
588
+ import plotly.express as px
589
+ import plotly.graph_objects as go
590
+ from plotly.subplots import make_subplots
591
+
592
+ print("Creating visualizations for Gradio...")
593
+
594
+ # Set up the plotting style for matplotlib
595
+ plt.style.use('default')
596
+ sns.set_palette("husl")
597
+
598
+ figures = {}
599
+
600
+ # 1. Popular Routes Visualization using Plotly (Real Map)
601
+ if self.routes:
602
+ # Debug: Print coordinate ranges
603
+ print(f"Coordinate ranges: Lat {self.processed_df['lat'].min():.4f} to {self.processed_df['lat'].max():.4f}, "
604
+ f"Lng {self.processed_df['lng'].min():.4f} to {self.processed_df['lng'].max():.4f}")
605
+
606
+ # Try different approaches for mapping
607
+ try:
608
+ # Method 1: Try Scattermapbox first
609
+ fig1 = go.Figure()
610
+
611
+ # Add base GPS points (sample for performance)
612
+ sample_points = self.processed_df.sample(min(3000, len(self.processed_df)))
613
+ fig1.add_trace(go.Scattermapbox(
614
+ lat=sample_points['lat'],
615
+ lon=sample_points['lng'],
616
+ mode='markers',
617
+ marker=dict(size=3, color='lightgray', opacity=0.4),
618
+ name='GPS Points',
619
+ hoverinfo='skip'
620
+ ))
621
+
622
+ # Add popular routes with different colors
623
+ colors = ['red', 'blue', 'green', 'orange', 'purple', 'brown', 'pink', 'olive', 'cyan', 'magenta']
624
+
625
+ for i, (route_id, route_info) in enumerate(list(self.routes.items())[:10]):
626
+ color = colors[i % len(colors)]
627
+ start_point = route_info['avg_start_point']
628
+ end_point = route_info['avg_end_point']
629
+
630
+ # Add start point
631
+ fig1.add_trace(go.Scattermapbox(
632
+ lat=[start_point['lat']],
633
+ lon=[start_point['lng']],
634
+ mode='markers',
635
+ marker=dict(size=12, color=color, symbol='circle'),
636
+ name=f'Route {route_id} Start ({route_info["route_count"]} trips)',
637
+ hovertemplate=f'<b>Route {route_id} - Start</b><br>' +
638
+ f'Trips: {route_info["route_count"]}<br>' +
639
+ f'Lat: {start_point["lat"]:.4f}<br>' +
640
+ f'Lng: {start_point["lng"]:.4f}<extra></extra>'
641
+ ))
642
+
643
+ # Add end point
644
+ fig1.add_trace(go.Scattermapbox(
645
+ lat=[end_point['lat']],
646
+ lon=[end_point['lng']],
647
+ mode='markers',
648
+ marker=dict(size=12, color=color, symbol='square'),
649
+ name=f'Route {route_id} End',
650
+ hovertemplate=f'<b>Route {route_id} - End</b><br>' +
651
+ f'Avg Length: {route_info["avg_route_length_m"]/1000:.2f} km<br>' +
652
+ f'Lat: {end_point["lat"]:.4f}<br>' +
653
+ f'Lng: {end_point["lng"]:.4f}<extra></extra>'
654
+ ))
655
+
656
+ # Add route line
657
+ fig1.add_trace(go.Scattermapbox(
658
+ lat=[start_point['lat'], end_point['lat']],
659
+ lon=[start_point['lng'], end_point['lng']],
660
+ mode='lines',
661
+ line=dict(width=3, color=color),
662
+ name=f'Route {route_id} Path',
663
+ hoverinfo='skip'
664
+ ))
665
+
666
+ # Calculate center and zoom
667
+ center_lat = self.processed_df['lat'].mean()
668
+ center_lng = self.processed_df['lng'].mean()
669
+
670
+ lat_range = self.processed_df['lat'].max() - self.processed_df['lat'].min()
671
+ lng_range = self.processed_df['lng'].max() - self.processed_df['lng'].min()
672
+ max_range = max(lat_range, lng_range)
673
+
674
+ if max_range > 1:
675
+ zoom_level = 8
676
+ elif max_range > 0.1:
677
+ zoom_level = 10
678
+ elif max_range > 0.01:
679
+ zoom_level = 12
680
+ else:
681
+ zoom_level = 14
682
+
683
+ fig1.update_layout(
684
+ title='Popular Routes on Real Map<br><sub>Circle=Start, Square=End</sub>',
685
+ mapbox=dict(
686
+ style='carto-positron',
687
+ center=dict(lat=center_lat, lon=center_lng),
688
+ zoom=zoom_level
689
+ ),
690
+ showlegend=True,
691
+ height=600,
692
+ margin=dict(l=0, r=0, t=50, b=0)
693
+ )
694
+
695
+ figures['popular_routes'] = fig1
696
+ print("✅ Created Scattermapbox visualization")
697
+
698
+ except Exception as e:
699
+ print(f"⚠️ Scattermapbox failed: {e}, trying Scatter Geo...")
700
+
701
+ # Method 2: Fallback to scatter_geo
702
+ try:
703
+ fig1 = go.Figure()
704
+
705
+ # Add base GPS points
706
+ sample_points = self.processed_df.sample(min(3000, len(self.processed_df)))
707
+ fig1.add_trace(go.Scattergeo(
708
+ lat=sample_points['lat'],
709
+ lon=sample_points['lng'],
710
+ mode='markers',
711
+ marker=dict(size=3, color='lightgray', opacity=0.4),
712
+ name='GPS Points',
713
+ hoverinfo='skip'
714
+ ))
715
+
716
+ colors = ['red', 'blue', 'green', 'orange', 'purple', 'brown', 'pink', 'olive', 'cyan', 'magenta']
717
+
718
+ for i, (route_id, route_info) in enumerate(list(self.routes.items())[:10]):
719
+ color = colors[i % len(colors)]
720
+ start_point = route_info['avg_start_point']
721
+ end_point = route_info['avg_end_point']
722
+
723
+ # Add start point
724
+ fig1.add_trace(go.Scattergeo(
725
+ lat=[start_point['lat']],
726
+ lon=[start_point['lng']],
727
+ mode='markers',
728
+ marker=dict(size=12, color=color, symbol='circle'),
729
+ name=f'Route {route_id} Start ({route_info["route_count"]} trips)',
730
+ hovertemplate=f'<b>Route {route_id} - Start</b><br>' +
731
+ f'Trips: {route_info["route_count"]}<br>' +
732
+ f'Lat: {start_point["lat"]:.4f}<br>' +
733
+ f'Lng: {start_point["lng"]:.4f}<extra></extra>'
734
+ ))
735
+
736
+ # Add end point
737
+ fig1.add_trace(go.Scattergeo(
738
+ lat=[end_point['lat']],
739
+ lon=[end_point['lng']],
740
+ mode='markers',
741
+ marker=dict(size=12, color=color, symbol='square'),
742
+ name=f'Route {route_id} End',
743
+ hovertemplate=f'<b>Route {route_id} - End</b><br>' +
744
+ f'Avg Length: {route_info["avg_route_length_m"]/1000:.2f} km<br>' +
745
+ f'Lat: {end_point["lat"]:.4f}<br>' +
746
+ f'Lng: {end_point["lng"]:.4f}<extra></extra>'
747
+ ))
748
+
749
+ # Add route line
750
+ fig1.add_trace(go.Scattergeo(
751
+ lat=[start_point['lat'], end_point['lat']],
752
+ lon=[start_point['lng'], end_point['lng']],
753
+ mode='lines',
754
+ line=dict(width=3, color=color),
755
+ name=f'Route {route_id} Path',
756
+ hoverinfo='skip'
757
+ ))
758
+
759
+ center_lat = self.processed_df['lat'].mean()
760
+ center_lng = self.processed_df['lng'].mean()
761
+
762
+ fig1.update_layout(
763
+ title='Popular Routes on World Map<br><sub>Circle=Start, Square=End</sub>',
764
+ geo=dict(
765
+ projection_type='natural earth',
766
+ showland=True,
767
+ landcolor='rgb(243, 243, 243)',
768
+ coastlinecolor='rgb(204, 204, 204)',
769
+ center=dict(lat=center_lat, lon=center_lng),
770
+ projection_scale=1
771
+ ),
772
+ showlegend=True,
773
+ height=600,
774
+ margin=dict(l=0, r=0, t=50, b=0)
775
+ )
776
+
777
+ figures['popular_routes'] = fig1
778
+ print("✅ Created Scatter Geo visualization")
779
+
780
+ except Exception as e2:
781
+ print(f"⚠️ Scatter Geo also failed: {e2}, using matplotlib fallback...")
782
+
783
+ # Method 3: Matplotlib fallback
784
+ fig1 = plt.figure(figsize=(15, 10))
785
+
786
+ # Plot all points in light gray
787
+ plt.scatter(self.processed_df['lng'], self.processed_df['lat'],
788
+ c='lightgray', alpha=0.1, s=0.5, label='All GPS Points')
789
+
790
+ # Plot popular routes
791
+ colors_mpl = plt.cm.Set1(np.linspace(0, 1, len(self.routes)))
792
+
793
+ for i, (route_id, route_info) in enumerate(list(self.routes.items())[:10]):
794
+ start_point = route_info['avg_start_point']
795
+ end_point = route_info['avg_end_point']
796
+
797
+ # Plot start and end points
798
+ plt.scatter(start_point['lng'], start_point['lat'],
799
+ c=[colors_mpl[i]], s=100, marker='o',
800
+ label=f'Route {route_id} Start ({route_info["route_count"]} trips)')
801
+ plt.scatter(end_point['lng'], end_point['lat'],
802
+ c=[colors_mpl[i]], s=100, marker='s')
803
+
804
+ # Draw line between start and end
805
+ plt.plot([start_point['lng'], end_point['lng']],
806
+ [start_point['lat'], end_point['lat']],
807
+ c=colors_mpl[i], linewidth=2, alpha=0.7)
808
+
809
+ plt.xlabel('Longitude')
810
+ plt.ylabel('Latitude')
811
+ plt.title('Popular Routes Identification\n(Circle=Start, Square=End)')
812
+ plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
813
+ plt.grid(True, alpha=0.3)
814
+ plt.tight_layout()
815
+ figures['popular_routes'] = fig1
816
+ print("✅ Created matplotlib fallback visualization")
817
+
818
+ # 2. Tight Places (Congestion Areas) Visualization - Keep as matplotlib
819
+ if self.tight_places:
820
+ fig2 = plt.figure(figsize=(15, 10))
821
+
822
+ # Plot all points
823
+ plt.scatter(self.processed_df['lng'], self.processed_df['lat'],
824
+ c='lightblue', alpha=0.1, s=0.5, label='All GPS Points')
825
+
826
+ # Plot tight places with size based on congestion score
827
+ for place_id, place_info in self.tight_places.items():
828
+ size = min(place_info['congestion_score'] * 10, 500)
829
+ color = {'High': 'red', 'Medium': 'orange', 'Low': 'yellow'}[place_info['severity']]
830
+
831
+ plt.scatter(place_info['center_lng'], place_info['center_lat'],
832
+ s=size, c=color, alpha=0.7, edgecolors='black',
833
+ label=f'{place_info["severity"]} Congestion ({place_info["unique_vehicles"]} vehicles)')
834
+
835
+ plt.xlabel('Longitude')
836
+ plt.ylabel('Latitude')
837
+ plt.title('Tight Places (Congestion Areas) Identification\n(Size = Congestion Score)')
838
+ plt.legend()
839
+ plt.grid(True, alpha=0.3)
840
+ plt.tight_layout()
841
+ figures['tight_places'] = fig2
842
+
843
+ # 3. Combined Analysis Map
844
+ fig3 = plt.figure(figsize=(15, 10))
845
+
846
+ # Base map
847
+ plt.scatter(self.processed_df['lng'], self.processed_df['lat'],
848
+ c='lightgray', alpha=0.05, s=0.3)
849
+
850
+ # Popular routes
851
+ if self.routes:
852
+ route_colors = plt.cm.Blues(np.linspace(0.4, 1, len(self.routes)))
853
+ for i, (route_id, route_info) in enumerate(list(self.routes.items())[:5]):
854
+ start_point = route_info['avg_start_point']
855
+ end_point = route_info['avg_end_point']
856
+ plt.plot([start_point['lng'], end_point['lng']],
857
+ [start_point['lat'], end_point['lat']],
858
+ c=route_colors[i], linewidth=3, alpha=0.8,
859
+ label=f'Popular Route {route_id}')
860
+
861
+ # Tight places
862
+ if self.tight_places:
863
+ for place_id, place_info in self.tight_places.items():
864
+ size = min(place_info['congestion_score'] * 15, 300)
865
+ plt.scatter(place_info['center_lng'], place_info['center_lat'],
866
+ s=size, c='red', alpha=0.8, marker='X', edgecolors='darkred',
867
+ label='Congestion Area' if place_id == list(self.tight_places.keys())[0] else "")
868
+
869
+ plt.xlabel('Longitude')
870
+ plt.ylabel('Latitude')
871
+ plt.title('Combined Analysis: Popular Routes & Congestion Areas')
872
+ plt.legend()
873
+ plt.grid(True, alpha=0.3)
874
+ plt.tight_layout()
875
+ figures['combined_analysis'] = fig3
876
+
877
+ # 4. Statistics Dashboard
878
+ fig4, axes = plt.subplots(2, 2, figsize=(15, 10))
879
+
880
+ # Route popularity distribution
881
+ if self.routes:
882
+ route_counts = [info['route_count'] for info in self.routes.values()]
883
+ axes[0, 0].bar(range(len(route_counts)), route_counts, color='skyblue')
884
+ axes[0, 0].set_xlabel('Route Cluster ID')
885
+ axes[0, 0].set_ylabel('Number of Trips')
886
+ axes[0, 0].set_title('Route Popularity Distribution')
887
+ axes[0, 0].grid(True, alpha=0.3)
888
+
889
+ # Congestion severity distribution
890
+ if self.tight_places:
891
+ severity_counts = {}
892
+ for place_info in self.tight_places.values():
893
+ severity = place_info['severity']
894
+ severity_counts[severity] = severity_counts.get(severity, 0) + 1
895
+
896
+ axes[0, 1].pie(severity_counts.values(), labels=severity_counts.keys(),
897
+ autopct='%1.1f%%', colors=['red', 'orange', 'yellow'])
898
+ axes[0, 1].set_title('Congestion Severity Distribution')
899
+
900
+ # Speed distribution
901
+ speed_col = 'spd' if 'spd' in self.processed_df.columns else 'estimated_speed'
902
+ if speed_col in self.processed_df.columns:
903
+ axes[1, 0].hist(self.processed_df[speed_col], bins=50, alpha=0.7, color='green')
904
+ axes[1, 0].set_xlabel('Speed (km/h)')
905
+ axes[1, 0].set_ylabel('Frequency')
906
+ axes[1, 0].set_title('Speed Distribution')
907
+ axes[1, 0].grid(True, alpha=0.3)
908
+
909
+ # Vehicle count by area
910
+ unique_vehicles_per_cluster = self.processed_df.groupby('density_cluster')['randomized_id'].nunique()
911
+ axes[1, 1].bar(range(len(unique_vehicles_per_cluster)),
912
+ unique_vehicles_per_cluster.values, color='purple', alpha=0.7)
913
+ axes[1, 1].set_xlabel('Area Cluster')
914
+ axes[1, 1].set_ylabel('Unique Vehicles')
915
+ axes[1, 1].set_title('Vehicle Distribution by Area')
916
+ axes[1, 1].grid(True, alpha=0.3)
917
+
918
+ plt.tight_layout()
919
+ figures['statistics_dashboard'] = fig4
920
+
921
+ print("Visualizations created for Gradio!")
922
+ return figures
923
+
924
+ def create_visualizations(self, output_dir='./geo_analysis_output'):
925
+ """Create comprehensive visualizations and save to files (legacy method)"""
926
+ import os
927
+ os.makedirs(output_dir, exist_ok=True)
928
+
929
+ # Get figures from the new method
930
+ figures = self.create_visualizations_for_gradio()
931
+
932
+ # Save each figure
933
+ for name, fig in figures.items():
934
+ if hasattr(fig, 'write_image'): # Plotly figure
935
+ fig.write_image(f'{output_dir}/{name}.png', width=1500, height=600, scale=2)
936
+ else: # Matplotlib figure
937
+ fig.savefig(f'{output_dir}/{name}.png', dpi=300, bbox_inches='tight')
938
+ plt.close(fig)
939
+
940
+ print(f"Visualizations saved to {output_dir}/")
941
+
+     def generate_report(self):
+         """Generate a comprehensive analysis report"""
+         print("Generating analysis report...")
+
+         report = {
+             'data_summary': {
+                 'total_records': len(self.processed_df),
+                 'unique_vehicles': self.processed_df['randomized_id'].nunique(),
+                 'geographic_bounds': {
+                     'lat_min': self.processed_df['lat'].min(),
+                     'lat_max': self.processed_df['lat'].max(),
+                     'lng_min': self.processed_df['lng'].min(),
+                     'lng_max': self.processed_df['lng'].max()
+                 }
+             },
+             'popular_routes': {
+                 'total_route_clusters': len(self.routes) if self.routes else 0,
+                 'top_5_routes': []
+             },
+             'tight_places': {
+                 'total_congestion_areas': len(self.tight_places) if self.tight_places else 0,
+                 'severity_breakdown': {},
+                 'top_5_congestion_areas': []
+             }
+         }
+
+         # Add popular routes details
+         if self.routes:
+             for route_id, route_info in list(self.routes.items())[:5]:
+                 report['popular_routes']['top_5_routes'].append({
+                     'route_id': route_id,
+                     'trip_count': route_info['route_count'],
+                     'popularity_percentage': route_info['popularity_score'],
+                     'avg_length_km': route_info['avg_route_length_m'] / 1000,
+                     'start_location': route_info['avg_start_point'],
+                     'end_location': route_info['avg_end_point']
+                 })
+
+         # Add tight places details
+         if self.tight_places:
+             severity_counts = {'High': 0, 'Medium': 0, 'Low': 0}
+             for place_info in self.tight_places.values():
+                 severity = place_info['severity']
+                 severity_counts[severity] = severity_counts.get(severity, 0) + 1
+
+             report['tight_places']['severity_breakdown'] = severity_counts
+
+             for place_id, place_info in list(self.tight_places.items())[:5]:
+                 report['tight_places']['top_5_congestion_areas'].append({
+                     'area_id': place_id,
+                     'congestion_score': place_info['congestion_score'],
+                     'severity': place_info['severity'],
+                     'unique_vehicles': place_info['unique_vehicles'],
+                     'avg_speed_kmh': place_info['avg_speed_kmh'],
+                     'location': {
+                         'lat': place_info['center_lat'],
+                         'lng': place_info['center_lng']
+                     }
+                 })
+
+         return report
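+
+ # Note on JSON serialization: the report above still carries numpy scalar types
+ # (np.int64, np.float64), which json.dumps() rejects with a TypeError. A minimal
+ # standalone illustration of the problem the convert_numpy_types helpers below
+ # work around:
+ #
+ #     import json
+ #     import numpy as np
+ #     json.dumps({'n': np.int64(3)})       # TypeError: Object of type int64 is not JSON serializable
+ #     json.dumps({'n': int(np.int64(3))})  # OK -> '{"n": 3}'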
+
+
+ def run_complete_analysis(data_path_or_df, output_dir='./geo_analysis_output', sample_size=400000):
+     """Run complete geo-tracking analysis pipeline focused on routes and congestion"""
+
+     def convert_numpy_types(obj):
+         """Convert numpy types to native Python types for JSON serialization"""
+         if isinstance(obj, dict):
+             return {str(k): convert_numpy_types(v) for k, v in obj.items()}
+         elif isinstance(obj, list):
+             return [convert_numpy_types(item) for item in obj]
+         elif isinstance(obj, np.integer):
+             return int(obj)
+         elif isinstance(obj, np.floating):
+             return float(obj)
+         elif isinstance(obj, np.ndarray):
+             return obj.tolist()
+         else:
+             return obj
+
+     print("="*60)
+     print("ADVANCED GEO-TRACKING ANALYSIS")
+     print("FOCUS: Popular Routes & Congestion Areas")
+     print("="*60)
+
+     # Initialize analyzer with sampling
+     analyzer = AdvancedGeoTrackAnalyzer(data_path_or_df, sample_size=sample_size)
+
+     # 1. Preprocess data
+     analyzer.preprocess_data()
+
+     # 2. Identify popular routes
+     print("\n" + "="*40)
+     print("IDENTIFYING POPULAR ROUTES")
+     print("="*40)
+     routes = analyzer.identify_popular_routes()
+
+     # 3. Identify tight places (congestion areas)
+     print("\n" + "="*40)
+     print("IDENTIFYING CONGESTION AREAS")
+     print("="*40)
+     tight_places = analyzer.identify_tight_places()
+
+     # 4. Analyze route efficiency
+     print("\n" + "="*40)
+     print("ANALYZING ROUTE EFFICIENCY")
+     print("="*40)
+     efficiency = analyzer.analyze_route_efficiency()
+
+     # 5. Create visualizations
+     print("\n" + "="*40)
+     print("CREATING VISUALIZATIONS")
+     print("="*40)
+     analyzer.create_visualizations(output_dir)
+
+     # 6. Generate report
+     report = analyzer.generate_report()
+
+     print("\n" + "="*60)
+     print("ANALYSIS COMPLETE!")
+     print("="*60)
+     print(f"Results saved to: {output_dir}")
+     print(f"Total records processed: {len(analyzer.processed_df):,}")
+     print(f"Unique vehicles: {analyzer.processed_df['randomized_id'].nunique():,}")
+     print(f"Popular routes identified: {len(routes)}")
+     print(f"Congestion areas identified: {len(tight_places)}")
+
+     if routes:
+         print("\nTop 3 Popular Routes:")
+         for route_id, route_info in list(routes.items())[:3]:
+             print(f"  Route {route_id}: {route_info['route_count']} trips ({route_info['popularity_score']:.1f}% of all routes)")
+         with open(f'{output_dir}/popular_routes.json', 'w') as f:
+             json.dump(convert_numpy_types(routes), f, indent=2, default=str)
+         print(f"Popular routes saved to {output_dir}/popular_routes.json")
+
+     if tight_places:
+         print("\nTop 3 Congestion Areas:")
+         for place_id, place_info in list(tight_places.items())[:3]:
+             print(f"  Area {place_id}: {place_info['severity']} severity, {place_info['unique_vehicles']} vehicles, avg speed {place_info['avg_speed_kmh']:.1f} km/h")
+         with open(f'{output_dir}/tight_places.json', 'w') as f:
+             json.dump(convert_numpy_types(tight_places), f, indent=2, default=str)
+         print(f"Tight places saved to {output_dir}/tight_places.json")
+
+     return analyzer, report
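+
+ # Illustrative example of driving the full pipeline (the file name 'tracks.csv' is
+ # an assumption; the input must contain the columns the analyzer uses, i.e.
+ # 'randomized_id', 'lat', 'lng' and optionally 'spd'):
+ #
+ #     analyzer, report = run_complete_analysis('tracks.csv',
+ #                                              output_dir='./geo_analysis_output',
+ #                                              sample_size=100000)
+ #     print(report['data_summary']['total_records'])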
+
+
+ def predict_traffic_patterns_with_plots(df, sample_size=500000):
+     """
+     Analyze traffic patterns from a DataFrame and return predictions as JSON plus matplotlib figures for Gradio
+
+     Parameters:
+         df: pandas.DataFrame - Input DataFrame with geo-tracking data
+         sample_size: int - Maximum number of rows to use for analysis (default 500k)
+
+     Returns:
+         tuple: (predictions_dict, figures_dict) where:
+             - predictions_dict: JSON-serializable predictions
+             - figures_dict: Dictionary of matplotlib figures for Gradio display
+     """
+     def convert_numpy_types(obj):
+         """Convert numpy types to native Python types for JSON serialization"""
+         if isinstance(obj, dict):
+             return {str(k): convert_numpy_types(v) for k, v in obj.items()}
+         elif isinstance(obj, list):
+             return [convert_numpy_types(item) for item in obj]
+         elif isinstance(obj, np.integer):
+             return int(obj)
+         elif isinstance(obj, np.floating):
+             return float(obj)
+         elif isinstance(obj, np.ndarray):
+             return obj.tolist()
+         else:
+             return obj
+
+     try:
+         # Initialize analyzer with sampling
+         analyzer = AdvancedGeoTrackAnalyzer(df, sample_size=sample_size)
+
+         # Run analysis steps
+         analyzer.preprocess_data()
+         routes = analyzer.identify_popular_routes()
+         tight_places = analyzer.identify_tight_places()
+         efficiency = analyzer.analyze_route_efficiency()
+
+         # Generate visualizations for Gradio (returns matplotlib figures)
+         figures = analyzer.create_visualizations_for_gradio()
+
+         # Generate report
+         report = analyzer.generate_report()
+
+         # Convert the report to a JSON-serializable format
+         json_predictions = convert_numpy_types(report)
+
+         # Create predictions dictionary
+         predictions = {
+             'status': 'success',
+             'analysis_summary': json_predictions,
+             'popular_routes': {
+                 'total_clusters': len(analyzer.routes) if analyzer.routes else 0,
+                 'routes': convert_numpy_types(analyzer.routes) if analyzer.routes else {}
+             },
+             'congestion_areas': {
+                 'total_areas': len(analyzer.tight_places) if analyzer.tight_places else 0,
+                 'areas': convert_numpy_types(analyzer.tight_places) if analyzer.tight_places else {}
+             },
+             'metadata': {
+                 'sample_size_used': len(analyzer.processed_df),
+                 'unique_vehicles': analyzer.processed_df['randomized_id'].nunique(),
+                 'analysis_date': pd.Timestamp.now().isoformat()
+             }
+         }
+
+         return predictions, figures
+
+     except Exception as e:
+         error_predictions = {
+             'status': 'error',
+             'error_message': str(e),
+             'analysis_summary': {},
+             'popular_routes': {'total_clusters': 0, 'routes': {}},
+             'congestion_areas': {'total_areas': 0, 'areas': {}},
+             'metadata': {'error_date': pd.Timestamp.now().isoformat()}
+         }
+         return error_predictions, {}
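+
+
+ # Illustrative sketch of wiring this function into a minimal Gradio app; the
+ # component labels and CSV assumption are hypothetical, not part of this module:
+ #
+ #     import gradio as gr
+ #     import pandas as pd
+ #
+ #     def analyze(csv_path):
+ #         df = pd.read_csv(csv_path)
+ #         predictions, figures = predict_traffic_patterns_with_plots(df, sample_size=100000)
+ #         return predictions, figures.get('statistics_dashboard')
+ #
+ #     demo = gr.Interface(fn=analyze,
+ #                         inputs=gr.File(label="Geo-tracking CSV", type="filepath"),
+ #                         outputs=[gr.JSON(label="Predictions"), gr.Plot(label="Dashboard")])
+ #     demo.launch()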
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ pandas>=1.5.0
+ numpy>=1.21.0
+ matplotlib>=3.5.0
+ seaborn>=0.11.0
+ scikit-learn>=1.1.0
+ scipy>=1.9.0
+ gradio>=4.0.0
+ plotly>=5.0.0