Shinegupta committed on
Commit
a385675
·
verified ·
1 Parent(s): e1378be

Upload 7 files

Browse files
Files changed (7) hide show
  1. chatbot_engine.py +418 -0
  2. config.py +249 -0
  3. data_processor.py +228 -0
  4. fetii_data.csv +0 -0
  5. requirements.txt +5 -0
  6. utils.py +251 -0
  7. visualizations.py +588 -0
chatbot_engine.py ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Dict, List, Any, Tuple
3
+ from data_processor import DataProcessor
4
+ import utils
5
+
6
+ class FetiiChatbot:
7
+ """
8
+ GPT-style chatbot that can answer questions about Fetii rideshare data.
9
+ """
10
+
11
    def __init__(self, data_processor: DataProcessor):
        """Initialize the chatbot with a data processor."""
        self.data_processor = data_processor
        # List of {"role", "content"} dicts, appended to by process_query.
        self.conversation_history = []

        # Regex patterns grouped by intent; _parse_query tries the groups in a
        # fixed priority order and returns the first intent that matches.
        self.query_patterns = {
            'location_stats': [
                r'how many.*(?:groups?|trips?).*(?:went to|to|from)\s+([^?]+?)(?:\s+(?:last|this|yesterday|today|week|month|year).*?)?[?.]?$',
                r'(?:trips?|groups?).*(?:to|from)\s+([^?]+?)(?:\s+(?:last|this|yesterday|today|week|month|year).*?)?[?.]?$',
                r'tell me about\s+([^?]+?)(?:\s+(?:last|this|yesterday|today|week|month|year).*?)?[?.]?$',
                r'stats for\s+([^?]+?)(?:\s+(?:last|this|yesterday|today|week|month|year).*?)?[?.]?$',
                r'(?:show me|find|search)\s+([^?]+?)(?:\s+(?:trips?|data|stats))?(?:\s+(?:last|this|yesterday|today|week|month|year).*?)?[?.]?$'
            ],
            'time_patterns': [
                r'when do.*groups?.*ride',
                r'what time.*most popular',
                r'peak hours?',
                r'busiest time'
            ],
            'group_size': [
                r'large groups?\s*\((\d+)\+?\)',
                r'groups? of (\d+)\+? riders?',
                r'(\d+)\+? passengers?',
                r'group size'
            ],
            'top_locations': [
                r'top.*(?:pickup|drop-?off).*spots?',
                r'most popular.*locations?',
                r'busiest.*locations?',
                r'hottest spots?',
                r'show.*(?:pickup|drop-?off|locations?)',
                r'list.*locations?'
            ],
            'demographics': [
                r'(\d+)[-–](\d+) year[- ]olds?',
                r'age group',
                r'demographics?'
            ],
            'general_stats': [
                r'how many total',
                r'average group size',
                r'summary',
                r'overview',
                r'give me.*overview',
                r'show me.*stats',
                r'total trips'
            ]
        }

        # Time-reference fragments stripped from captured location text by
        # _clean_location_from_query (e.g. "downtown last week" -> "downtown").
        self.time_patterns = [
            r'\s+(?:last|this|yesterday|today)\s+(?:week|month|year|night)',
            r'\s+(?:last|this)\s+(?:monday|tuesday|wednesday|thursday|friday|saturday|sunday)',
            r'\s+(?:in\s+)?(?:january|february|march|april|may|june|july|august|september|october|november|december)',
            r'\s+(?:last|this|next)\s+\w+',
            r'\s+(?:yesterday|today|tonight)',
            r'\s+\d{1,2}\/\d{1,2}\/\d{2,4}',
            r'\s+\d{1,2}-\d{1,2}-\d{2,4}'
        ]
69
+
70
+ def process_query(self, user_query: str) -> str:
71
+ """Process a user query and return an appropriate response."""
72
+ user_query = user_query.lower().strip()
73
+
74
+ self.conversation_history.append({"role": "user", "content": user_query})
75
+
76
+ try:
77
+ query_type, params = self._parse_query(user_query)
78
+ response = self._generate_response(query_type, params, user_query)
79
+ self.conversation_history.append({"role": "assistant", "content": response})
80
+
81
+ return response
82
+
83
+ except Exception as e:
84
+ error_response = ("I'm having trouble understanding that question. "
85
+ "Try asking about specific locations, times, or group sizes. "
86
+ "For example: 'How many groups went to The Aquarium on 6th?' or "
87
+ "'What are the peak hours for large groups?'")
88
+ return error_response
89
+
90
+ def _clean_location_from_query(self, location_text: str) -> str:
91
+ """Clean time references from location text."""
92
+ cleaned = location_text.strip()
93
+
94
+ for pattern in self.time_patterns:
95
+ cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
96
+
97
+ cleaned = re.sub(r'\s+', ' ', cleaned).strip()
98
+
99
+ return cleaned
100
+
101
+ def _parse_query(self, query: str) -> Tuple[str, Dict[str, Any]]:
102
+ """Parse the user query to determine intent and extract parameters."""
103
+ params = {}
104
+
105
+ for pattern in self.query_patterns['location_stats']:
106
+ match = re.search(pattern, query, re.IGNORECASE)
107
+ if match:
108
+ location = match.group(1).strip()
109
+ location = self._clean_location_from_query(location)
110
+ if location:
111
+ params['location'] = location
112
+ return 'location_stats', params
113
+
114
+ for pattern in self.query_patterns['time_patterns']:
115
+ if re.search(pattern, query, re.IGNORECASE):
116
+ group_match = re.search(r'(\d+)\+?', query)
117
+ if group_match:
118
+ params['min_group_size'] = int(group_match.group(1))
119
+ return 'time_patterns', params
120
+
121
+ for pattern in self.query_patterns['group_size']:
122
+ match = re.search(pattern, query, re.IGNORECASE)
123
+ if match:
124
+ if match.groups():
125
+ params['group_size'] = int(match.group(1))
126
+ return 'group_size', params
127
+
128
+ for pattern in self.query_patterns['top_locations']:
129
+ if re.search(pattern, query, re.IGNORECASE):
130
+ if 'pickup' in query or 'pick up' in query:
131
+ params['location_type'] = 'pickup'
132
+ elif 'drop' in query:
133
+ params['location_type'] = 'dropoff'
134
+ else:
135
+ params['location_type'] = 'both'
136
+ return 'top_locations', params
137
+
138
+ for pattern in self.query_patterns['demographics']:
139
+ match = re.search(pattern, query, re.IGNORECASE)
140
+ if match and match.groups():
141
+ if len(match.groups()) == 2:
142
+ params['age_range'] = (int(match.group(1)), int(match.group(2)))
143
+ return 'demographics', params
144
+
145
+ for pattern in self.query_patterns['general_stats']:
146
+ if re.search(pattern, query, re.IGNORECASE):
147
+ return 'general_stats', params
148
+
149
+ return 'general_stats', params
150
+
151
+ def _fuzzy_search_location(self, query_location: str) -> List[Tuple[str, int]]:
152
+ """Search for locations using fuzzy matching."""
153
+ all_pickups = self.data_processor.df['pickup_main'].value_counts()
154
+ all_dropoffs = self.data_processor.df['dropoff_main'].value_counts()
155
+
156
+ all_locations = {}
157
+ for location, count in all_pickups.items():
158
+ all_locations[location] = all_locations.get(location, 0) + count
159
+ for location, count in all_dropoffs.items():
160
+ all_locations[location] = all_locations.get(location, 0) + count
161
+
162
+ matches = []
163
+ query_lower = query_location.lower()
164
+
165
+ # Exact match
166
+ for location, count in all_locations.items():
167
+ if query_lower == location.lower():
168
+ matches.append((location, count))
169
+
170
+ # Partial match
171
+ if not matches:
172
+ for location, count in all_locations.items():
173
+ if query_lower in location.lower() or location.lower() in query_lower:
174
+ matches.append((location, count))
175
+
176
+ # Word match
177
+ if not matches:
178
+ query_words = query_lower.split()
179
+ for location, count in all_locations.items():
180
+ location_lower = location.lower()
181
+ if any(word in location_lower for word in query_words if len(word) > 2):
182
+ matches.append((location, count))
183
+
184
+ matches.sort(key=lambda x: x[1], reverse=True)
185
+ return matches[:5]
186
+
187
+ def _generate_response(self, query_type: str, params: Dict[str, Any], original_query: str) -> str:
188
+ """Generate a response based on the query type and parameters."""
189
+
190
+ if query_type == 'location_stats':
191
+ return self._handle_location_stats(params, original_query)
192
+ elif query_type == 'time_patterns':
193
+ return self._handle_time_patterns(params)
194
+ elif query_type == 'group_size':
195
+ return self._handle_group_size(params)
196
+ elif query_type == 'top_locations':
197
+ return self._handle_top_locations(params)
198
+ elif query_type == 'demographics':
199
+ return self._handle_demographics(params)
200
+ elif query_type == 'general_stats':
201
+ return self._handle_general_stats()
202
+ else:
203
+ return self._handle_fallback(original_query)
204
+
205
    def _handle_location_stats(self, params: Dict[str, Any], original_query: str) -> str:
        """Handle location-specific statistics queries.

        Looks up exact stats for the requested location; when nothing is
        found, falls back to fuzzy matching and either reports the best
        match's stats or offers "did you mean" suggestions.
        """
        location = params.get('location', '')

        stats = self.data_processor.get_location_stats(location)

        if stats['pickup_count'] == 0 and stats['dropoff_count'] == 0:
            # No exact hit — try fuzzy matching against known location names.
            matches = self._fuzzy_search_location(location)

            if matches:
                best_match = matches[0][0]
                stats = self.data_processor.get_location_stats(best_match)

                if stats['pickup_count'] > 0 or stats['dropoff_count'] > 0:
                    # Best fuzzy match has data: continue below with its stats.
                    response = f"<strong>Found results for '{best_match}'</strong> (closest match to '{location}'):\n\n"
                else:
                    # Fuzzy candidates exist but none has stats: suggest and stop.
                    response = f"I couldn't find exact data for '{location}'. Did you mean one of these?\n\n"
                    for match_location, count in matches[:3]:
                        response += f"• <strong>{match_location}</strong> ({count} total trips)\n"
                    response += f"\nTry asking: 'Tell me about {matches[0][0]}'"
                    return response
            else:
                return f"I couldn't find any trips associated with '{location}'. Try checking the spelling or asking about a different location like 'West Campus' or 'The Aquarium on 6th'."
        else:
            best_match = location.title()
            response = f"<strong>Stats for {best_match}:</strong>\n\n"

        # Shared rendering for both the exact-hit and fuzzy-hit paths.
        if stats['pickup_count'] > 0:
            response += f"<strong>{stats['pickup_count']} pickup trips</strong> with an average group size of {stats['avg_group_size_pickup']:.1f}\n"
            if stats['peak_hours_pickup']:
                peak_hours = ', '.join([utils.format_time(h) for h in stats['peak_hours_pickup']])
                response += f"Most popular pickup times: {peak_hours}\n"

        if stats['dropoff_count'] > 0:
            response += f"<strong>{stats['dropoff_count']} drop-off trips</strong> with an average group size of {stats['avg_group_size_dropoff']:.1f}\n"
            if stats['peak_hours_dropoff']:
                peak_hours = ', '.join([utils.format_time(h) for h in stats['peak_hours_dropoff']])
                response += f"Most popular drop-off times: {peak_hours}\n"

        # Share of the full dataset this location represents.
        total_trips = stats['pickup_count'] + stats['dropoff_count']
        insights = self.data_processor.get_quick_insights()
        percentage = (total_trips / insights['total_trips']) * 100

        response += f"\n<strong>Insight:</strong> This location accounts for {percentage:.1f}% of all Austin trips!"

        # Time-scoped questions get a disclaimer: stats above cover all data.
        if any(word in original_query for word in ['last', 'this', 'month', 'week', 'yesterday', 'today']):
            response += f"\n\n<strong>Note:</strong> This data covers our full Austin dataset. For specific time periods, the patterns shown represent typical activity for this location."

        return response
254
+
255
+ def _handle_time_patterns(self, params: Dict[str, Any]) -> str:
256
+ """Handle time pattern queries."""
257
+ min_group_size = params.get('min_group_size', None)
258
+
259
+ time_data = self.data_processor.get_time_patterns(min_group_size)
260
+
261
+ response = "<strong>Peak Riding Times:</strong>\n\n"
262
+
263
+ if min_group_size:
264
+ response += f"<em>For groups of {min_group_size}+ riders:</em>\n\n"
265
+
266
+ hourly_counts = time_data['hourly_counts']
267
+ top_hours = sorted(hourly_counts.items(), key=lambda x: x[1], reverse=True)[:5]
268
+
269
+ response += "<strong>Busiest Hours:</strong>\n"
270
+ for i, (hour, count) in enumerate(top_hours, 1):
271
+ time_label = utils.format_time(hour)
272
+ response += f"{i}. <strong>{time_label}</strong> - {count} trips\n"
273
+
274
+ time_categories = time_data['time_category_counts']
275
+ response += "\n<strong>By Time Period:</strong>\n"
276
+ for period, count in sorted(time_categories.items(), key=lambda x: x[1], reverse=True):
277
+ response += f"• <strong>{period}:</strong> {count} trips\n"
278
+
279
+ peak_hour = top_hours[0][0]
280
+ peak_count = top_hours[0][1]
281
+ response += f"\n<strong>Insight:</strong> {utils.format_time(peak_hour)} is the absolute peak with {peak_count} trips!"
282
+
283
+ return response
284
+
285
+ def _handle_group_size(self, params: Dict[str, Any]) -> str:
286
+ """Handle group size queries."""
287
+ target_size = params.get('group_size', 6)
288
+
289
+ insights = self.data_processor.get_quick_insights()
290
+ group_distribution = insights['group_size_distribution']
291
+
292
+ response = f"<strong>Group Size Analysis ({target_size}+ passengers):</strong>\n\n"
293
+
294
+ large_group_trips = sum(count for size, count in group_distribution.items() if size >= target_size)
295
+ total_trips = insights['total_trips']
296
+ percentage = (large_group_trips / total_trips) * 100
297
+
298
+ response += f"• <strong>{large_group_trips} trips</strong> had {target_size}+ passengers ({percentage:.1f}% of all trips)\n"
299
+
300
+ response += f"\n<strong>Breakdown of {target_size}+ passenger groups:</strong>\n"
301
+ large_groups = {size: count for size, count in group_distribution.items() if size >= target_size}
302
+ for size, count in sorted(large_groups.items(), key=lambda x: x[1], reverse=True)[:8]:
303
+ group_pct = (count / large_group_trips) * 100 if large_group_trips > 0 else 0
304
+ response += f"• <strong>{size} passengers:</strong> {count} trips ({group_pct:.1f}%)\n"
305
+
306
+ avg_size = insights['avg_group_size']
307
+ response += f"\n<strong>Insight:</strong> Average group size is {avg_size:.1f} passengers - most rides are group experiences!"
308
+
309
+ return response
310
+
311
+ def _handle_top_locations(self, params: Dict[str, Any]) -> str:
312
+ """Handle top locations queries."""
313
+ location_type = params.get('location_type', 'both')
314
+ insights = self.data_processor.get_quick_insights()
315
+
316
+ response = "<strong>Most Popular Locations:</strong>\n\n"
317
+
318
+ if location_type in ['pickup', 'both']:
319
+ response += "<strong>Top Pickup Spots:</strong>\n"
320
+ for i, (location, count) in enumerate(list(insights['top_pickups'])[:8], 1):
321
+ response += f"{i}. <strong>{location}</strong> - {count} pickups\n"
322
+
323
+ if location_type in ['dropoff', 'both']:
324
+ if location_type == 'both':
325
+ response += "\n<strong>Top Drop-off Destinations:</strong>\n"
326
+ else:
327
+ response += "<strong>Top Drop-off Destinations:</strong>\n"
328
+ for i, (location, count) in enumerate(list(insights['top_dropoffs'])[:8], 1):
329
+ response += f"{i}. <strong>{location}</strong> - {count} drop-offs\n"
330
+
331
+ if location_type in ['pickup', 'both']:
332
+ top_pickup = list(insights['top_pickups'])[0]
333
+ response += f"\n<strong>Insight:</strong> {top_pickup[0]} dominates pickups with {top_pickup[1]} trips!"
334
+
335
+ return response
336
+
337
+ def _handle_demographics(self, params: Dict[str, Any]) -> str:
338
+ """Handle demographics queries."""
339
+ age_range = params.get('age_range', (18, 24))
340
+
341
+ response = f"<strong>Demographics Analysis ({age_range[0]}-{age_range[1]} year olds):</strong>\n\n"
342
+ response += "I'd love to help with demographic analysis, but I don't currently have access to rider age data in this dataset. "
343
+ response += "However, I can tell you about the locations and times that are popular with different group sizes!\n\n"
344
+
345
+ insights = self.data_processor.get_quick_insights()
346
+ response += "<strong>Popular spots that might appeal to younger riders:</strong>\n"
347
+
348
+ entertainment_spots = ['The Aquarium on 6th', 'Wiggle Room', "Shakespeare's", 'LUNA Rooftop', 'Green Light Social']
349
+
350
+ for spot in entertainment_spots[:5]:
351
+ for location, count in insights['top_dropoffs']:
352
+ if spot.lower() in location.lower():
353
+ response += f"• <strong>{location}</strong> - {count} drop-offs\n"
354
+ break
355
+
356
+ response += "\n<strong>Insight:</strong> Late night hours (10 PM - 1 AM) see the highest activity, which often correlates with younger demographics!"
357
+
358
+ return response
359
+
360
+ def _handle_general_stats(self) -> str:
361
+ """Handle general statistics queries."""
362
+ insights = self.data_processor.get_quick_insights()
363
+
364
+ response = "<strong>Fetii Austin Overview:</strong>\n\n"
365
+
366
+ response += f"<strong>Total Trips Analyzed:</strong> {insights['total_trips']:,}\n"
367
+ response += f"<strong>Average Group Size:</strong> {insights['avg_group_size']:.1f} passengers\n"
368
+ response += f"<strong>Peak Hour:</strong> {utils.format_time(insights['peak_hour'])}\n"
369
+ response += f"<strong>Large Groups (6+):</strong> {insights['large_groups_count']} trips ({insights['large_groups_pct']:.1f}%)\n\n"
370
+
371
+ response += "<strong>Top Hotspots:</strong>\n"
372
+ top_pickup = list(insights['top_pickups'])[0]
373
+ top_dropoff = list(insights['top_dropoffs'])[0]
374
+ response += f"• Most popular pickup: <strong>{top_pickup[0]}</strong> ({top_pickup[1]} trips)\n"
375
+ response += f"• Most popular destination: <strong>{top_dropoff[0]}</strong> ({top_dropoff[1]} trips)\n\n"
376
+
377
+ group_dist = insights['group_size_distribution']
378
+ most_common_size = max(group_dist.items(), key=lambda x: x[1])
379
+ response += f"<strong>Most Common Group Size:</strong> {most_common_size[0]} passengers ({most_common_size[1]} trips)\n\n"
380
+
381
+ response += "<strong>Key Insights:</strong>\n"
382
+ response += f"• {insights['large_groups_pct']:.0f}% of all rides are large groups (6+ people)\n"
383
+ response += "• Peak activity happens late evening (10-11 PM)\n"
384
+ response += "• West Campus dominates as the top pickup location\n"
385
+ response += "• Entertainment venues are the most popular destinations"
386
+
387
+ return response
388
+
389
+ def _handle_fallback(self, query: str) -> str:
390
+ """Handle queries that don't match any specific pattern."""
391
+ response = "I'm not sure I understood that question perfectly. Here's what I can help you with:\n\n"
392
+
393
+ response += "<strong>Location Questions:</strong>\n"
394
+ response += "• 'How many groups went to [location]?'\n"
395
+ response += "• 'Tell me about [location]'\n"
396
+ response += "• 'Top pickup/drop-off spots'\n\n"
397
+
398
+ response += "<strong>Time Questions:</strong>\n"
399
+ response += "• 'When do large groups typically ride?'\n"
400
+ response += "• 'Peak hours for groups of 6+'\n"
401
+ response += "• 'Busiest times'\n\n"
402
+
403
+ response += "<strong>Group Size Questions:</strong>\n"
404
+ response += "• 'How many trips had 10+ passengers?'\n"
405
+ response += "• 'Large group patterns'\n"
406
+ response += "• 'Average group size'\n\n"
407
+
408
+ response += "Would you like to try asking one of these types of questions?"
409
+
410
+ return response
411
+
412
    def get_conversation_history(self) -> List[Dict[str, str]]:
        """Get the conversation history.

        Returns the live list of ``{"role", "content"}`` message dicts
        (not a copy), in the order the messages were exchanged.
        """
        return self.conversation_history
415
+
416
    def clear_history(self):
        """Clear the conversation history.

        Rebinds ``self.conversation_history`` to a fresh empty list; any
        previously returned reference still holds the old messages.
        """
        self.conversation_history = []
config.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration settings for Fetii AI Chatbot
3
+ """
4
+
5
+ # File settings
6
+ CSV_FILE_PATH = "fetii_data.csv"
7
+ SAMPLE_DATA_SIZE = 2000
8
+
9
+ # App settings
10
+ APP_TITLE = "Fetii AI Assistant"
11
+ APP_ICON = "🚗"
12
+ PAGE_LAYOUT = "wide"
13
+
14
+ # Modern color palette
15
+ COLORS = {
16
+ 'primary': '#3b82f6', # Blue-500
17
+ 'primary_dark': '#1d4ed8', # Blue-700
18
+ 'secondary': '#10b981', # Emerald-500
19
+ 'success': '#059669', # Emerald-600
20
+ 'warning': '#f59e0b', # Amber-500
21
+ 'danger': '#ef4444', # Red-500
22
+ 'info': '#06b6d4', # Cyan-500
23
+ 'light': '#f8fafc', # Slate-50
24
+ 'dark': '#1e293b', # Slate-800
25
+ 'gray_100': '#f1f5f9', # Slate-100
26
+ 'gray_300': '#cbd5e1', # Slate-300
27
+ 'gray_500': '#64748b', # Slate-500
28
+ 'gray_700': '#334155', # Slate-700
29
+ 'gray_900': '#0f172a' # Slate-900
30
+ }
31
+
32
+ # Chart configuration
33
+ CHART_CONFIG = {
34
+ 'height': 320,
35
+ 'margin': dict(t=60, b=50, l=50, r=50),
36
+ 'plot_bgcolor': 'rgba(0,0,0,0)',
37
+ 'paper_bgcolor': 'rgba(0,0,0,0)',
38
+ 'font_color': '#374151',
39
+ 'font_family': 'Inter',
40
+ 'grid_color': 'rgba(156, 163, 175, 0.2)',
41
+ 'line_color': 'rgba(156, 163, 175, 0.3)'
42
+ }
43
+
44
+ # Chatbot configuration
45
+ CHATBOT_CONFIG = {
46
+ 'max_history': 50,
47
+ 'response_delay': 0.5,
48
+ 'example_questions': [
49
+ "How many groups went to The Aquarium on 6th last month?",
50
+ "What are the top drop-off spots for large groups on Saturday nights?",
51
+ "When do groups of 6+ riders typically ride downtown?",
52
+ "Show me the busiest pickup locations",
53
+ "What's the pattern for West Campus pickups?",
54
+ "How many trips had more than 10 passengers?"
55
+ ]
56
+ }
57
+
58
+ # Location categories for analysis
59
+ LOCATION_CATEGORIES = {
60
+ 'entertainment': [
61
+ 'bar', 'club', 'lounge', 'aquarium', 'rooftop', 'social',
62
+ 'pub', 'restaurant', 'venue', 'hall', 'theater'
63
+ ],
64
+ 'campus': [
65
+ 'campus', 'university', 'drag', 'west campus', 'student',
66
+ 'dorm', 'residence hall', 'fraternity', 'sorority'
67
+ ],
68
+ 'residential': [
69
+ 'house', 'apartment', 'residence', 'home', 'complex',
70
+ 'condo', 'townhouse', 'manor'
71
+ ],
72
+ 'business': [
73
+ 'office', 'building', 'center', 'district', 'plaza',
74
+ 'tower', 'corporate', 'business'
75
+ ],
76
+ 'transport': [
77
+ 'airport', 'station', 'terminal', 'stop', 'hub',
78
+ 'depot', 'port'
79
+ ],
80
+ 'retail': [
81
+ 'mall', 'store', 'shop', 'market', 'center',
82
+ 'plaza', 'outlet', 'galleria'
83
+ ]
84
+ }
85
+
86
+ # Time categories for analysis
87
+ TIME_CATEGORIES = {
88
+ 'early_morning': (0, 6), # 12 AM - 6 AM
89
+ 'morning': (6, 12), # 6 AM - 12 PM
90
+ 'afternoon': (12, 17), # 12 PM - 5 PM
91
+ 'evening': (17, 21), # 5 PM - 9 PM
92
+ 'night': (21, 24) # 9 PM - 12 AM
93
+ }
94
+
95
+ # Group size categories
96
+ GROUP_SIZE_CATEGORIES = {
97
+ 'small': (1, 4), # 1-4 passengers
98
+ 'medium': (5, 8), # 5-8 passengers
99
+ 'large': (9, 12), # 9-12 passengers
100
+ 'extra_large': (13, 20) # 13+ passengers
101
+ }
102
+
103
+ # Analysis thresholds
104
+ ANALYSIS_THRESHOLDS = {
105
+ 'min_trips_for_pattern': 5,
106
+ 'peak_hour_threshold': 0.8,
107
+ 'popular_location_threshold': 10,
108
+ 'large_group_threshold': 6,
109
+ 'min_group_size_for_analysis': 3
110
+ }
111
+
112
+ # Export configuration
113
+ EXPORT_CONFIG = {
114
+ 'formats': ['csv', 'json', 'pdf'],
115
+ 'max_export_rows': 10000,
116
+ 'include_visualizations': True,
117
+ 'compression': 'gzip'
118
+ }
119
+
120
+ # UI Icons (using simple unicode icons)
121
+ ICONS = {
122
+ 'trips': '📊',
123
+ 'users': '👥',
124
+ 'time': '⏰',
125
+ 'location': '📍',
126
+ 'chart': '📈',
127
+ 'chat': '💬',
128
+ 'insights': '💡',
129
+ 'pickup': '🚗',
130
+ 'dropoff': '🎯',
131
+ 'large_groups': '🎉',
132
+ 'analytics': '📊',
133
+ 'dashboard': '🏠'
134
+ }
135
+
136
+ # Font configuration
137
+ FONTS = {
138
+ 'primary': 'Inter',
139
+ 'monospace': 'JetBrains Mono',
140
+ 'sizes': {
141
+ 'xs': '0.75rem',
142
+ 'sm': '0.875rem',
143
+ 'base': '1rem',
144
+ 'lg': '1.125rem',
145
+ 'xl': '1.25rem',
146
+ '2xl': '1.5rem',
147
+ '3xl': '1.875rem',
148
+ '4xl': '2.25rem'
149
+ },
150
+ 'weights': {
151
+ 'light': 300,
152
+ 'normal': 400,
153
+ 'medium': 500,
154
+ 'semibold': 600,
155
+ 'bold': 700
156
+ }
157
+ }
158
+
159
+ # Spacing configuration
160
+ SPACING = {
161
+ 'xs': '0.25rem',
162
+ 'sm': '0.5rem',
163
+ 'md': '1rem',
164
+ 'lg': '1.5rem',
165
+ 'xl': '2rem',
166
+ '2xl': '2.5rem',
167
+ '3xl': '3rem'
168
+ }
169
+
170
+ # Border radius configuration
171
+ BORDER_RADIUS = {
172
+ 'sm': '4px',
173
+ 'md': '8px',
174
+ 'lg': '12px',
175
+ 'xl': '16px',
176
+ '2xl': '20px',
177
+ 'full': '9999px'
178
+ }
179
+
180
+ # Shadow configuration
181
+ SHADOWS = {
182
+ 'sm': '0 1px 3px rgba(0, 0, 0, 0.12), 0 1px 2px rgba(0, 0, 0, 0.24)',
183
+ 'md': '0 4px 6px rgba(0, 0, 0, 0.07), 0 2px 4px rgba(0, 0, 0, 0.06)',
184
+ 'lg': '0 10px 15px rgba(0, 0, 0, 0.1), 0 4px 6px rgba(0, 0, 0, 0.05)',
185
+ 'xl': '0 20px 25px rgba(0, 0, 0, 0.1), 0 10px 10px rgba(0, 0, 0, 0.04)',
186
+ '2xl': '0 25px 50px rgba(0, 0, 0, 0.25)'
187
+ }
188
+
189
+ # Animation configuration
190
+ ANIMATIONS = {
191
+ 'duration': {
192
+ 'fast': '0.15s',
193
+ 'normal': '0.3s',
194
+ 'slow': '0.5s'
195
+ },
196
+ 'easing': {
197
+ 'ease_in': 'cubic-bezier(0.4, 0, 1, 1)',
198
+ 'ease_out': 'cubic-bezier(0, 0, 0.2, 1)',
199
+ 'ease_in_out': 'cubic-bezier(0.4, 0, 0.2, 1)'
200
+ }
201
+ }
202
+
203
+ # Responsive breakpoints
204
+ BREAKPOINTS = {
205
+ 'sm': '640px',
206
+ 'md': '768px',
207
+ 'lg': '1024px',
208
+ 'xl': '1280px',
209
+ '2xl': '1536px'
210
+ }
211
+
212
+ # Data validation rules
213
+ VALIDATION_RULES = {
214
+ 'min_passengers': 1,
215
+ 'max_passengers': 20,
216
+ 'required_fields': ['Trip ID', 'Total Passengers', 'Trip Date and Time'],
217
+ 'date_formats': ['%m/%d/%y %H:%M', '%m/%d/%Y %H:%M', '%Y-%m-%d %H:%M:%S'],
218
+ 'coordinate_bounds': {
219
+ 'lat_min': 30.0,
220
+ 'lat_max': 30.5,
221
+ 'lng_min': -98.0,
222
+ 'lng_max': -97.5
223
+ }
224
+ }
225
+
226
+ # Performance settings
227
+ PERFORMANCE = {
228
+ 'max_rows_for_visualization': 10000,
229
+ 'cache_timeout': 3600, # 1 hour
230
+ 'pagination_size': 50,
231
+ 'max_memory_usage': '1GB'
232
+ }
233
+
234
+ # Error messages
235
+ ERROR_MESSAGES = {
236
+ 'file_not_found': 'Data file not found. Using sample data for demonstration.',
237
+ 'invalid_data': 'Invalid data format detected. Please check your data.',
238
+ 'no_results': 'No results found for your query. Try adjusting your filters.',
239
+ 'processing_error': 'An error occurred while processing your request.',
240
+ 'visualization_error': 'Unable to create visualization with current data.'
241
+ }
242
+
243
+ # Success messages
244
+ SUCCESS_MESSAGES = {
245
+ 'data_loaded': 'Data loaded successfully',
246
+ 'export_complete': 'Export completed successfully',
247
+ 'analysis_complete': 'Analysis completed',
248
+ 'cache_updated': 'Cache updated successfully'
249
+ }
data_processor.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from typing import Dict, Any
4
+
5
+ class DataProcessor:
6
+ """
7
+ Handles all data processing and analysis for Fetii rideshare data.
8
+ """
9
+
10
    def __init__(self, csv_file_path: str = "fetii_data.csv"):
        """Initialize the data processor with the CSV file.

        Loading and feature extraction run eagerly here, so the instance is
        ready to answer queries as soon as construction returns.
        """
        self.csv_file_path = csv_file_path
        # Raw trip records; populated by load_and_process_data().
        self.df = None
        # Precomputed summary statistics keyed by metric name.
        self.insights = {}
        self.load_and_process_data()
16
+
17
+ def load_and_process_data(self):
18
+ """Load and process the Fetii trip data."""
19
+ try:
20
+ self.df = pd.read_csv(self.csv_file_path)
21
+
22
+ self._clean_data()
23
+ self._extract_temporal_features()
24
+ self._extract_location_features()
25
+ self._calculate_insights()
26
+
27
+ print(f"✅ Successfully loaded {len(self.df)} trips from Austin")
28
+
29
+ except FileNotFoundError:
30
+ print("⚠️ CSV file not found. Creating sample data for demo...")
31
+ self._create_sample_data()
32
+
33
    def _create_sample_data(self):
        """Create sample data based on the analysis patterns.

        Generates 2000 synthetic Austin trips with a fixed RNG seed so demo
        runs are reproducible, then pushes them through the same processing
        pipeline as real data.
        """
        np.random.seed(42)

        locations = {
            'pickup': ['West Campus', 'The Drag', 'Market District', 'Sixth Street', 'East End',
                       'Downtown', 'Govalle', 'Hancock', 'South Lamar', 'Warehouse District'],
            'dropoff': ['The Aquarium on 6th', 'Wiggle Room', "Shakespeare's", 'Mayfair Austin',
                        'Latchkey', '6013 Loyola Ln', "Buford's", 'Darrell K Royal Texas Memorial Stadium',
                        'LUNA Rooftop', 'University of Texas KA house', 'Green Light Social', "The Cat's Pajamas"]
        }

        # Empirical distributions: group sizes skew large, hours skew late-night.
        passenger_choices = [14, 8, 7, 10, 9, 12, 11, 13, 6, 5, 4, 3, 2, 1]
        passenger_weights = [0.173, 0.128, 0.120, 0.115, 0.113, 0.087, 0.085, 0.077, 0.063, 0.028, 0.007, 0.004, 0.001, 0.001]

        hour_choices = [22, 23, 21, 19, 0, 20, 18, 1, 2, 17, 16, 3]
        hour_weights = [0.25, 0.23, 0.19, 0.11, 0.08, 0.06, 0.05, 0.03, 0.02, 0.01, 0.01, 0.01]

        sample_data = []
        for i in range(2000):
            passengers = np.random.choice(passenger_choices, p=passenger_weights)
            hour = np.random.choice(hour_choices, p=hour_weights)

            # Coordinates jittered around central Austin (30.2672, -97.7431).
            pickup_lat = np.random.normal(30.2672, 0.02)
            pickup_lng = np.random.normal(-97.7431, 0.02)
            dropoff_lat = np.random.normal(30.2672, 0.02)
            dropoff_lng = np.random.normal(-97.7431, 0.02)

            day = np.random.randint(1, 31)
            minute = np.random.randint(0, 60)

            sample_data.append({
                'Trip ID': 734889 - i,  # descending IDs mimic a newest-first export
                'Booking User ID': np.random.randint(10000, 999999),
                'Pick Up Latitude': pickup_lat,
                'Pick Up Longitude': pickup_lng,
                'Drop Off Latitude': dropoff_lat,
                'Drop Off Longitude': dropoff_lng,
                'Pick Up Address': f"{np.random.choice(locations['pickup'])}, Austin, TX",
                'Drop Off Address': f"{np.random.choice(locations['dropoff'])}, Austin, TX",
                'Trip Date and Time': f"9/{day}/25 {hour}:{minute:02d}",
                'Total Passengers': passengers
            })

        self.df = pd.DataFrame(sample_data)
        # Run the same pipeline real CSV data goes through.
        self._clean_data()
        self._extract_temporal_features()
        self._extract_location_features()
        self._calculate_insights()
82
+
83
    def _clean_data(self):
        """Clean and standardize the data."""
        # Rows missing a passenger count or timestamp cannot be analysed.
        self.df = self.df.dropna(subset=['Total Passengers', 'Trip Date and Time'])

        self.df['Total Passengers'] = self.df['Total Passengers'].astype(int)

        # Short venue names ("West Campus") derived from the full addresses;
        # used throughout the analysis instead of raw address strings.
        self.df['pickup_main'] = self.df['Pick Up Address'].apply(self._extract_main_location)
        self.df['dropoff_main'] = self.df['Drop Off Address'].apply(self._extract_main_location)
91
+
92
+ def _extract_main_location(self, address: str) -> str:
93
+ """Extract the main location name from an address."""
94
+ if pd.isna(address):
95
+ return "Unknown"
96
+ return address.split(',')[0].strip()
97
+
98
    def _extract_temporal_features(self):
        """Extract temporal features from trip data.

        NOTE(review): the timestamp format is hard-coded as '%m/%d/%y %H:%M';
        rows in any other format will raise here — confirm against the real
        export format (config.VALIDATION_RULES lists several candidates).
        """
        self.df['datetime'] = pd.to_datetime(self.df['Trip Date and Time'], format='%m/%d/%y %H:%M')
        self.df['hour'] = self.df['datetime'].dt.hour
        self.df['day_of_week'] = self.df['datetime'].dt.day_name()
        self.df['date'] = self.df['datetime'].dt.date

        # Named period label (Morning/Afternoon/Evening/Night/Late Night).
        self.df['time_category'] = self.df['hour'].apply(self._categorize_time)
106
+
107
+ def _categorize_time(self, hour: int) -> str:
108
+ """Categorize hour into time periods."""
109
+ if 6 <= hour < 12:
110
+ return "Morning"
111
+ elif 12 <= hour < 17:
112
+ return "Afternoon"
113
+ elif 17 <= hour < 21:
114
+ return "Evening"
115
+ elif 21 <= hour <= 23:
116
+ return "Night"
117
+ else:
118
+ return "Late Night"
119
+
120
    def _extract_location_features(self):
        """Derive location- and group-size-based categorical columns.

        Adds ``group_category`` (bucketed passenger count) plus boolean
        flags for entertainment drop-offs and campus pickups.
        """
        self.df['group_category'] = self.df['Total Passengers'].apply(self._categorize_group_size)

        # Both flags are keyword heuristics over the simplified names.
        self.df['is_entertainment'] = self.df['dropoff_main'].apply(self._is_entertainment_venue)
        self.df['is_campus'] = self.df['pickup_main'].apply(self._is_campus_location)
126
+
127
+ def _categorize_group_size(self, passengers: int) -> str:
128
+ """Categorize group size."""
129
+ if passengers <= 4:
130
+ return "Small (1-4)"
131
+ elif passengers <= 8:
132
+ return "Medium (5-8)"
133
+ elif passengers <= 12:
134
+ return "Large (9-12)"
135
+ else:
136
+ return "Extra Large (13+)"
137
+
138
+ def _is_entertainment_venue(self, location: str) -> bool:
139
+ """Check if location is an entertainment venue."""
140
+ entertainment_keywords = ['bar', 'club', 'lounge', 'aquarium', 'rooftop', 'social', 'pub']
141
+ return any(keyword in location.lower() for keyword in entertainment_keywords)
142
+
143
+ def _is_campus_location(self, location: str) -> bool:
144
+ """Check if location is campus-related."""
145
+ campus_keywords = ['campus', 'university', 'drag', 'west campus']
146
+ return any(keyword in location.lower() for keyword in campus_keywords)
147
+
148
    def _calculate_insights(self):
        """Precompute the summary statistics consumed by the dashboard.

        Stores the result in ``self.insights``. Assumes ``self.df`` is
        non-empty: an empty frame would raise on the mode/percentage lines.
        """
        self.insights = {
            'total_trips': len(self.df),
            'avg_group_size': self.df['Total Passengers'].mean(),
            'peak_hour': self.df['hour'].mode().iloc[0],
            # Groups of 6+ riders are treated as "large" here.
            'large_groups_count': len(self.df[self.df['Total Passengers'] >= 6]),
            'large_groups_pct': (len(self.df[self.df['Total Passengers'] >= 6]) / len(self.df)) * 100,
            'top_pickups': list(self.df['pickup_main'].value_counts().head(10).items()),
            'top_dropoffs': list(self.df['dropoff_main'].value_counts().head(10).items()),
            'hourly_distribution': self.df['hour'].value_counts().sort_index().to_dict(),
            'group_size_distribution': self.df['Total Passengers'].value_counts().sort_index().to_dict()
        }
161
+
162
    def get_quick_insights(self) -> Dict[str, Any]:
        """Return the precomputed insights dict (same object, not a copy)."""
        return self.insights
165
+
166
+ def query_data(self, query_params: Dict[str, Any]) -> pd.DataFrame:
167
+ """Query the data based on parameters."""
168
+ filtered_df = self.df.copy()
169
+
170
+ if 'pickup_location' in query_params:
171
+ filtered_df = filtered_df[filtered_df['pickup_main'].str.contains(
172
+ query_params['pickup_location'], case=False, na=False)]
173
+
174
+ if 'dropoff_location' in query_params:
175
+ filtered_df = filtered_df[filtered_df['dropoff_main'].str.contains(
176
+ query_params['dropoff_location'], case=False, na=False)]
177
+
178
+ if 'hour_range' in query_params:
179
+ start_hour, end_hour = query_params['hour_range']
180
+ filtered_df = filtered_df[
181
+ (filtered_df['hour'] >= start_hour) & (filtered_df['hour'] <= end_hour)]
182
+
183
+ if 'min_passengers' in query_params:
184
+ filtered_df = filtered_df[filtered_df['Total Passengers'] >= query_params['min_passengers']]
185
+
186
+ if 'max_passengers' in query_params:
187
+ filtered_df = filtered_df[filtered_df['Total Passengers'] <= query_params['max_passengers']]
188
+
189
+ if 'date_range' in query_params:
190
+ start_date, end_date = query_params['date_range']
191
+ filtered_df = filtered_df[
192
+ (filtered_df['date'] >= start_date) & (filtered_df['date'] <= end_date)]
193
+
194
+ return filtered_df
195
+
196
+ def get_location_stats(self, location: str, location_type: str = 'both') -> Dict[str, Any]:
197
+ """Get statistics for a specific location."""
198
+ if location_type in ['pickup', 'both']:
199
+ pickup_data = self.df[self.df['pickup_main'].str.contains(location, case=False, na=False)]
200
+ else:
201
+ pickup_data = pd.DataFrame()
202
+
203
+ if location_type in ['dropoff', 'both']:
204
+ dropoff_data = self.df[self.df['dropoff_main'].str.contains(location, case=False, na=False)]
205
+ else:
206
+ dropoff_data = pd.DataFrame()
207
+
208
+ return {
209
+ 'pickup_count': len(pickup_data),
210
+ 'dropoff_count': len(dropoff_data),
211
+ 'avg_group_size_pickup': pickup_data['Total Passengers'].mean() if len(pickup_data) > 0 else 0,
212
+ 'avg_group_size_dropoff': dropoff_data['Total Passengers'].mean() if len(dropoff_data) > 0 else 0,
213
+ 'peak_hours_pickup': pickup_data['hour'].mode().tolist() if len(pickup_data) > 0 else [],
214
+ 'peak_hours_dropoff': dropoff_data['hour'].mode().tolist() if len(dropoff_data) > 0 else []
215
+ }
216
+
217
+ def get_time_patterns(self, group_size_filter: int = None) -> Dict[str, Any]:
218
+ """Get time-based patterns."""
219
+ data = self.df.copy()
220
+
221
+ if group_size_filter:
222
+ data = data[data['Total Passengers'] >= group_size_filter]
223
+
224
+ return {
225
+ 'hourly_counts': data['hour'].value_counts().sort_index().to_dict(),
226
+ 'daily_counts': data['day_of_week'].value_counts().to_dict(),
227
+ 'time_category_counts': data['time_category'].value_counts().to_dict()
228
+ }
fetii_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ pandas
3
+ plotly
4
+ numpy
5
+ python-dateutil
utils.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility functions for Fetii AI Chatbot
3
+ """
4
+
5
+ import pandas as pd
6
+ import numpy as np
7
+ from datetime import datetime, timedelta
8
+ import re
9
+ from typing import List, Dict, Any, Tuple, Optional
10
+ import config
11
+
12
def clean_location_name(location: str) -> str:
    """Clean and standardize a raw location name.

    Strips whitespace, removes trailing city/country suffixes such as
    ", Austin, TX", then title-cases the result. Missing or empty
    values become "Unknown".
    """
    if pd.isna(location) or not location:
        return "Unknown"

    cleaned = location.strip()

    # BUG FIX: the suffixes must be stripped BEFORE title-casing, and
    # matched case-insensitively -- .title() turns "TX" into "Tx" and
    # "USA" into "Usa", so an endswith() check against ", Austin, TX"
    # afterwards could never match and suffix removal was dead code.
    suffixes_to_remove = [", Austin, TX", ", Austin, Texas", ", USA", ", United States"]
    for suffix in suffixes_to_remove:
        if cleaned.lower().endswith(suffix.lower()):
            cleaned = cleaned[:-len(suffix)]

    return cleaned.title()
25
+
26
def categorize_location(location: str) -> str:
    """Classify a location name using the keyword lists in config.

    Returns the first matching category (title-cased), or "Other" when
    no keyword from ``config.LOCATION_CATEGORIES`` appears in the name.
    """
    needle = location.lower()

    for category, keywords in config.LOCATION_CATEGORIES.items():
        for keyword in keywords:
            if keyword in needle:
                return category.title()

    return "Other"
35
+
36
def calculate_distance(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """Approximate distance between two lat/lon points, in kilometres.

    Uses a flat-earth approximation that treats one degree as ~111 km on
    both axes, so east-west distances are overestimated away from the
    equator. Result is rounded to 2 decimal places.
    """
    degrees_apart = np.hypot(lat2 - lat1, lon2 - lon1)
    return round(degrees_apart * 111, 2)
42
+
43
def format_time(hour: int) -> str:
    """Render an hour (0-23) as a 12-hour clock string like '9:00 PM'."""
    if hour < 12:
        suffix = "AM"
        display = 12 if hour == 0 else hour
    else:
        suffix = "PM"
        display = 12 if hour == 12 else hour - 12
    return f"{display}:00 {suffix}"
53
+
54
def get_time_category(hour: int) -> str:
    """Return the config-defined time-of-day label for an hour.

    Ranges in ``config.TIME_CATEGORIES`` are treated as half-open
    [start, end); hours outside every configured range yield "Unknown".
    """
    for category, bounds in config.TIME_CATEGORIES.items():
        start, end = bounds
        if start <= hour < end:
            return category.replace('_', ' ').title()
    return "Unknown"
60
+
61
def get_group_size_category(passengers: int) -> str:
    """Return the config-defined size label for a passenger count.

    Bounds in ``config.GROUP_SIZE_CATEGORIES`` are inclusive on both
    ends; counts outside every configured range yield "Unknown".
    """
    for category, bounds in config.GROUP_SIZE_CATEGORIES.items():
        low, high = bounds
        if low <= passengers <= high:
            return category.replace('_', ' ').title()
    return "Unknown"
67
+
68
def extract_numbers_from_text(text: str) -> List[int]:
    """Return every run of digits in *text* as an int, in order of appearance."""
    return [int(token) for token in re.findall(r'\d+', text)]
72
+
73
def parse_date_string(date_str: str) -> Optional[datetime]:
    """Try a fixed list of timestamp formats; return None if none match."""
    known_formats = (
        '%m/%d/%y %H:%M',
        '%m/%d/%Y %H:%M',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M',
        '%m/%d/%y %H:%M:%S',
    )

    for candidate in known_formats:
        try:
            parsed = datetime.strptime(date_str, candidate)
        except ValueError:
            continue
        return parsed

    return None
90
+
91
def generate_insights(data: pd.DataFrame) -> Dict[str, Any]:
    """Generate comprehensive insights from trip data.

    Returns a dict of scalar stats and distributions. Keys that depend on
    optional columns ('hour', 'pickup_main', 'dropoff_main', 'date') are
    only present when those columns exist in *data*.
    """
    insights = {}

    insights['total_trips'] = len(data)
    insights['total_passengers'] = data['Total Passengers'].sum()
    insights['avg_group_size'] = data['Total Passengers'].mean()
    insights['median_group_size'] = data['Total Passengers'].median()

    if 'hour' in data.columns:
        # mode() can be empty (all-NaN hours), hence the explicit None.
        insights['peak_hour'] = data['hour'].mode().iloc[0] if len(data['hour'].mode()) > 0 else None
        insights['hour_distribution'] = data['hour'].value_counts().to_dict()

    if 'pickup_main' in data.columns:
        insights['top_pickups'] = data['pickup_main'].value_counts().head(10).to_dict()
        insights['unique_pickup_locations'] = data['pickup_main'].nunique()

    if 'dropoff_main' in data.columns:
        insights['top_dropoffs'] = data['dropoff_main'].value_counts().head(10).to_dict()
        insights['unique_dropoff_locations'] = data['dropoff_main'].nunique()

    insights['group_size_distribution'] = data['Total Passengers'].value_counts().to_dict()
    insights['large_groups'] = len(data[data['Total Passengers'] >= config.ANALYSIS_THRESHOLDS['large_group_threshold']])
    # BUG FIX: guard the empty-frame case -- the original divided by
    # total_trips unconditionally and raised ZeroDivisionError.
    insights['large_groups_percentage'] = (
        (insights['large_groups'] / insights['total_trips']) * 100
        if insights['total_trips'] else 0.0
    )

    if 'date' in data.columns:
        insights['date_range'] = {
            'start': data['date'].min(),
            'end': data['date'].max(),
            'days_covered': (data['date'].max() - data['date'].min()).days + 1
        }
        insights['daily_average'] = insights['total_trips'] / insights['date_range']['days_covered']

    return insights
125
+
126
def format_number(number: float, decimals: int = 1) -> str:
    """Format a number for display with K/M suffixes.

    Values whose magnitude is at least one thousand/million are scaled
    and suffixed; smaller values are shown as-is. With ``decimals=0``,
    sub-thousand values are rendered as plain integers.
    """
    # Generalization: scale by magnitude so negative values also get the
    # K/M treatment (previously -2500 rendered as "-2500.0").
    sign = "-" if number < 0 else ""
    magnitude = abs(number)

    if magnitude >= 1000000:
        return f"{sign}{magnitude/1000000:.{decimals}f}M"
    elif magnitude >= 1000:
        return f"{sign}{magnitude/1000:.{decimals}f}K"
    else:
        # Small values keep their own sign via the original number.
        return f"{number:.{decimals}f}" if decimals > 0 else str(int(number))
134
+
135
def create_summary_stats(data: pd.DataFrame) -> Dict[str, str]:
    """Create formatted summary statistics for display.

    Builds on :func:`generate_insights` and renders each stat as a
    display-ready string.
    """
    insights = generate_insights(data)

    # BUG FIX: generate_insights can store an explicit None under
    # 'peak_hour' (no mode available). .get() with a default only covers
    # a MISSING key, so format_time(None) would crash on the None value;
    # normalize it here before formatting.
    peak_hour = insights.get('peak_hour')
    if peak_hour is None:
        peak_hour = 22

    return {
        'Total Trips': format_number(insights['total_trips'], 0),
        'Total Passengers': format_number(insights['total_passengers'], 0),
        'Average Group Size': f"{insights['avg_group_size']:.1f}",
        'Peak Hour': format_time(peak_hour),
        'Large Groups': f"{insights['large_groups_percentage']:.1f}%",
        'Unique Pickup Locations': format_number(insights.get('unique_pickup_locations', 0), 0),
        'Unique Destinations': format_number(insights.get('unique_dropoff_locations', 0), 0),
        'Daily Average': f"{insights.get('daily_average', 0):.1f} trips/day"
    }
149
+
150
def validate_data(data: pd.DataFrame) -> Tuple[bool, List[str]]:
    """Check data quality; return ``(is_valid, issues)``.

    ``issues`` is a list of human-readable problem descriptions; the
    boolean is True only when no issue was found.
    """
    issues: List[str] = []

    required_columns = ['Trip ID', 'Total Passengers', 'Trip Date and Time']
    missing_columns = [col for col in required_columns if col not in data.columns]
    if missing_columns:
        issues.append(f"Missing required columns: {', '.join(missing_columns)}")

    # Nothing else can be checked meaningfully on an empty frame.
    if len(data) == 0:
        issues.append("Dataset is empty")
        return False, issues

    if 'Total Passengers' in data.columns:
        counts = data['Total Passengers']
        # Valid range is 1..20 passengers; NaN is also invalid.
        bad_mask = (counts < 1) | (counts > 20) | counts.isna()
        bad_total = int(bad_mask.sum())
        if bad_total > 0:
            issues.append(f"Found {bad_total} trips with invalid passenger counts")

    if 'Trip Date and Time' in data.columns:
        unparseable = sum(
            1 for value in data['Trip Date and Time'].dropna()
            if parse_date_string(str(value)) is None
        )
        if unparseable > 0:
            issues.append(f"Found {unparseable} trips with invalid date formats")

    if 'Trip ID' in data.columns:
        duplicate_ids = int(data['Trip ID'].duplicated().sum())
        if duplicate_ids > 0:
            issues.append(f"Found {duplicate_ids} duplicate trip IDs")

    return len(issues) == 0, issues
186
+
187
def create_export_data(data: pd.DataFrame, insights: Dict[str, Any], format_type: str = 'csv') -> Any:
    """Package *data* for export.

    ``format_type`` selects the shape: 'csv' returns a CSV string,
    'json' a metadata+records dict, 'summary' the formatted stats dict.
    Raises ValueError for any other value.
    """
    if format_type == 'csv':
        return data.to_csv(index=False)

    if format_type == 'json':
        metadata = {
            'export_date': datetime.now().isoformat(),
            'total_records': len(data),
            'insights': insights
        }
        return {'metadata': metadata, 'data': data.to_dict('records')}

    if format_type == 'summary':
        return create_summary_stats(data)

    raise ValueError(f"Unsupported export format: {format_type}")
209
+
210
+ def search_locations(query: str, locations: List[str], max_results: int = 5) -> List[str]:
211
+ """Search for locations matching a query."""
212
+ query_lower = query.lower()
213
+ matches = []
214
+
215
+ for location in locations:
216
+ if query_lower == location.lower():
217
+ matches.append(location)
218
+
219
+ for location in locations:
220
+ if query_lower in location.lower() and location not in matches:
221
+ matches.append(location)
222
+
223
+ query_words = query_lower.split()
224
+ for location in locations:
225
+ location_lower = location.lower()
226
+ if (any(word in location_lower for word in query_words) and
227
+ location not in matches):
228
+ matches.append(location)
229
+
230
+ return matches[:max_results]
231
+
232
def get_color_palette(num_colors: int) -> List[str]:
    """Return ``num_colors`` hex colors for charts.

    The first 12 come from a fixed brand palette; any extras are
    generated around the hue wheel using the golden-ratio increment so
    neighboring colors stay visually distinct.
    """
    base_colors = [
        '#667eea', '#764ba2', '#f093fb', '#f5576c',
        '#4facfe', '#00f2fe', '#43e97b', '#38f9d7',
        '#ffecd2', '#fcb69f', '#a8edea', '#fed6e3'
    ]

    if num_colors <= len(base_colors):
        return base_colors[:num_colors]

    import colorsys
    extras: List[str] = []
    for index in range(num_colors - len(base_colors)):
        # Golden-ratio conjugate spreads hues evenly as the count grows.
        hue = (index * 0.618033988749895) % 1
        red, green, blue = colorsys.hsv_to_rgb(hue, 0.7, 0.9)
        extras.append('#%02x%02x%02x' % (int(red * 255), int(green * 255), int(blue * 255)))

    return base_colors + extras
visualizations.py ADDED
@@ -0,0 +1,588 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import plotly.express as px
3
+ import plotly.graph_objects as go
4
+ from plotly.subplots import make_subplots
5
+ import pandas as pd
6
+ from typing import Dict, Any
7
+ from data_processor import DataProcessor
8
+
9
def create_visualizations(data_processor: DataProcessor) -> Dict[str, Any]:
    """
    Create all visualizations for the Fetii dashboard.
    Compatible with both Streamlit and Gradio interfaces.

    Returns a dict mapping chart keys to Plotly Figure objects, built
    from the processor's precomputed insights and its DataFrame.
    """
    insights = data_processor.get_quick_insights()
    df = data_processor.df

    visualizations = {}

    # Core visualizations - optimized for Gradio display
    visualizations['hourly_distribution'] = create_hourly_chart(insights['hourly_distribution'])
    visualizations['group_size_distribution'] = create_group_size_chart(insights['group_size_distribution'])
    visualizations['popular_locations'] = create_locations_chart(insights['top_pickups'])

    # Advanced visualizations
    visualizations['time_heatmap'] = create_time_heatmap(df)
    visualizations['daily_volume'] = create_daily_volume_chart(df)
    visualizations['trip_distance_analysis'] = create_distance_analysis(df)
    visualizations['location_comparison'] = create_location_comparison(df)
    visualizations['peak_patterns'] = create_peak_patterns(df)

    return visualizations
32
+
33
def create_hourly_chart(hourly_data: Dict[int, int]) -> go.Figure:
    """Create modern hourly distribution chart.

    Args:
        hourly_data: mapping of hour-of-day (0-23) to trip count; hours
            may be missing and the mapping may be empty.

    Returns:
        A styled Plotly bar figure, bars colored by relative intensity.
    """
    hours = sorted(hourly_data.keys())
    counts = [hourly_data[hour] for hour in hours]

    # Create human-readable 12-hour labels
    hour_labels = []
    for hour in hours:
        if hour == 0:
            hour_labels.append("12 AM")
        elif hour < 12:
            hour_labels.append(f"{hour} AM")
        elif hour == 12:
            hour_labels.append("12 PM")
        else:
            hour_labels.append(f"{hour-12} PM")

    fig = go.Figure()

    # Create modern gradient colors based on intensity.
    # BUG FIX: guard the empty/all-zero case -- max() on an empty list
    # raises ValueError and a zero maximum divided by zero.
    max_count = max(counts) if counts else 0
    colors = []
    for count in counts:
        intensity = count / max_count if max_count else 0
        if intensity > 0.8:
            colors.append('#667eea')  # Primary gradient start
        elif intensity > 0.6:
            colors.append('#764ba2')  # Primary gradient end
        elif intensity > 0.4:
            colors.append('#f093fb')  # Secondary gradient start
        elif intensity > 0.2:
            colors.append('#4facfe')  # Success gradient
        else:
            colors.append('#9ca3af')  # Gray for low activity

    fig.add_trace(go.Bar(
        x=hour_labels,
        y=counts,
        marker=dict(
            color=colors,
            line=dict(color='rgba(255,255,255,0.8)', width=1)
        ),
        name='Trips',
        hovertemplate='<b>%{x}</b><br>Trips: %{y}<extra></extra>',
        text=counts,
        textposition='outside',
        textfont=dict(color='#374151', size=10, family='Inter')
    ))

    fig.update_layout(
        title={
            'text': 'Trip Distribution by Hour',
            'x': 0.5,
            'font': {'size': 16, 'color': '#1f2937', 'family': 'Inter'}
        },
        xaxis_title='Hour of Day',
        yaxis_title='Number of Trips',
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font={'color': '#374151', 'family': 'Inter'},
        height=280,
        margin=dict(t=50, b=40, l=40, r=40),
        xaxis=dict(
            showgrid=True,
            gridwidth=1,
            gridcolor='rgba(156, 163, 175, 0.2)',
            showline=True,
            linecolor='rgba(156, 163, 175, 0.3)'
        ),
        yaxis=dict(
            showgrid=True,
            gridwidth=1,
            gridcolor='rgba(156, 163, 175, 0.2)',
            showline=True,
            linecolor='rgba(156, 163, 175, 0.3)'
        )
    )

    return fig
+ return fig
112
+
113
def create_group_size_chart(group_data: Dict[int, int]) -> go.Figure:
    """Create modern group size distribution chart.

    Renders a donut (hole=0.4) pie of trips per passenger count.
    ``group_data`` maps group size -> trip count.
    """
    sizes = list(group_data.keys())
    counts = list(group_data.values())

    # Enhanced modern color palette with gradients.
    # NOTE(review): only 12 colors are defined; with more than 12 distinct
    # group sizes the slice below yields fewer colors than slices --
    # confirm how Plotly fills the remainder.
    colors = [
        '#667eea', '#764ba2', '#f093fb', '#f5576c',
        '#4facfe', '#00f2fe', '#43e97b', '#38f9d7',
        '#fa709a', '#fee140', '#a8edea', '#fed6e3'
    ]

    fig = go.Figure()

    fig.add_trace(go.Pie(
        labels=[f"{size} passengers" for size in sizes],
        values=counts,
        marker=dict(
            colors=colors[:len(sizes)],
            line=dict(color='white', width=2)
        ),
        hovertemplate='<b>%{label}</b><br>Trips: %{value}<br>Percentage: %{percent}<extra></extra>',
        textinfo='label+percent',
        textposition='auto',
        textfont=dict(color='white', size=11, family='Inter'),
        hole=0.4
    ))

    fig.update_layout(
        title={
            'text': 'Group Size Distribution',
            'x': 0.5,
            'font': {'size': 16, 'color': '#1f2937', 'family': 'Inter'}
        },
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font={'color': '#374151', 'family': 'Inter'},
        height=280,
        margin=dict(t=50, b=40, l=40, r=40),
        showlegend=False
    )

    return fig
+ return fig
156
+
157
def create_locations_chart(pickup_data: list) -> go.Figure:
    """Create modern popular locations chart.

    Args:
        pickup_data: list of (location, count) pairs, most frequent
            first; only the top 8 are plotted. May be empty.

    Returns:
        A horizontal Plotly bar figure (full names kept in hover text).
    """
    locations = [item[0] for item in pickup_data[:8]]
    counts = [item[1] for item in pickup_data[:8]]

    # Truncate long location names for the axis; hover shows the full name.
    truncated_locations = []
    for loc in locations:
        if len(loc) > 20:
            truncated_locations.append(loc[:17] + "...")
        else:
            truncated_locations.append(loc)

    fig = go.Figure()

    # Enhanced gradient colors with modern palette.
    # BUG FIX: guard the empty/all-zero case -- max() on an empty list
    # raises ValueError and a zero maximum divided by zero below.
    max_count = max(counts) if counts else 0
    base_colors = ['#667eea', '#764ba2', '#f093fb', '#f5576c', '#4facfe', '#00f2fe', '#43e97b', '#38f9d7']
    colors = []
    for i, count in enumerate(counts):
        base_color = base_colors[i % len(base_colors)]
        # Convert hex to rgba with opacity based on intensity
        # (renamed the generator variable: reusing `i` here shadowed the
        # enumerate index and was needlessly confusing).
        hex_color = base_color.lstrip('#')
        rgb = tuple(int(hex_color[offset:offset + 2], 16) for offset in (0, 2, 4))
        intensity = count / max_count if max_count else 0
        colors.append(f'rgba({rgb[0]}, {rgb[1]}, {rgb[2]}, {0.6 + intensity * 0.4})')

    fig.add_trace(go.Bar(
        x=counts,
        y=truncated_locations,
        orientation='h',
        marker=dict(
            color=colors,
            line=dict(color='rgba(255,255,255,0.8)', width=1),
            cornerradius=4
        ),
        hovertemplate='<b>%{customdata}</b><br>Pickups: %{x}<extra></extra>',
        customdata=locations,
        text=counts,
        textposition='outside',
        textfont=dict(color='#374151', size=10, family='Inter')
    ))

    fig.update_layout(
        title={
            'text': 'Top Pickup Locations',
            'x': 0.5,
            'font': {'size': 16, 'color': '#1f2937', 'family': 'Inter'}
        },
        xaxis_title='Number of Pickups',
        yaxis_title='',
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font={'color': '#374151', 'family': 'Inter'},
        height=280,
        margin=dict(t=50, b=40, l=120, r=40),
        yaxis=dict(
            autorange="reversed",
            showline=True,
            linecolor='rgba(156, 163, 175, 0.3)'
        ),
        xaxis=dict(
            showgrid=True,
            gridwidth=1,
            gridcolor='rgba(156, 163, 175, 0.2)',
            showline=True,
            linecolor='rgba(156, 163, 175, 0.3)'
        )
    )

    return fig
+ return fig
228
+
229
def create_time_heatmap(df: pd.DataFrame) -> go.Figure:
    """Create a day-of-week x hour-of-day trip-count heatmap.

    Expects ``df`` to carry the derived ``datetime`` and ``hour`` columns.
    """
    df_copy = df.copy()
    df_copy['day_num'] = df_copy['datetime'].dt.dayofweek

    heatmap_data = df_copy.groupby(['day_num', 'hour']).size().reset_index(name='trips')
    heatmap_pivot = heatmap_data.pivot(index='day_num', columns='hour', values='trips').fillna(0)
    # BUG FIX: pivot() only emits the days/hours that actually occur in
    # the data, while the axis labels below are fixed 7-day and 24-hour
    # lists -- any missing day or hour silently shifted every following
    # row/column under the wrong label. Reindex to the full 7x24 grid.
    heatmap_pivot = heatmap_pivot.reindex(index=range(7), columns=range(24), fill_value=0)

    day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    hour_labels = []
    for hour in range(24):
        if hour == 0:
            hour_labels.append("12 AM")
        elif hour < 12:
            hour_labels.append(f"{hour} AM")
        elif hour == 12:
            hour_labels.append("12 PM")
        else:
            hour_labels.append(f"{hour-12} PM")

    fig = go.Figure()

    fig.add_trace(go.Heatmap(
        z=heatmap_pivot.values,
        x=hour_labels,
        y=day_names,
        colorscale=[
            [0, '#f8fafc'],
            [0.2, '#e2e8f0'],
            [0.4, '#94a3b8'],
            [0.6, '#3b82f6'],
            [0.8, '#1d4ed8'],
            [1, '#1e40af']
        ],
        hovertemplate='<b>%{y}</b><br>%{x}<br>Trips: %{z}<extra></extra>',
        colorbar=dict(
            title=dict(text="Trips", font=dict(family='Inter', color='#374151')),
            tickfont=dict(family='Inter', color='#374151')
        )
    ))

    fig.update_layout(
        title={
            'text': 'Trip Patterns by Day & Hour',
            'x': 0.5,
            'font': {'size': 16, 'color': '#1f2937', 'family': 'Inter', 'weight': 700}
        },
        xaxis_title='Hour of Day',
        yaxis_title='Day of Week',
        plot_bgcolor='rgba(248, 250, 252, 0.5)',
        paper_bgcolor='rgba(0,0,0,0)',
        font={'color': '#374151', 'family': 'Inter'},
        height=350,
        margin=dict(t=50, b=40, l=100, r=40),
        xaxis=dict(
            showgrid=True,
            gridwidth=1,
            gridcolor='rgba(156, 163, 175, 0.3)',
            tickfont=dict(size=11)
        ),
        yaxis=dict(
            showgrid=True,
            gridwidth=1,
            gridcolor='rgba(156, 163, 175, 0.3)',
            tickfont=dict(size=11)
        )
    )

    return fig
+ return fig
300
+
301
def create_daily_volume_chart(df: pd.DataFrame) -> go.Figure:
    """Create modern daily trip volume chart.

    Plots trips-per-day as a smoothed line with markers, plus a linear
    trend line when more than one day of data exists.
    """
    daily_trips = df.groupby('date').size().reset_index(name='trips')
    daily_trips['date'] = pd.to_datetime(daily_trips['date'])
    daily_trips = daily_trips.sort_values('date')

    fig = go.Figure()

    # Main line
    fig.add_trace(go.Scatter(
        x=daily_trips['date'],
        y=daily_trips['trips'],
        mode='lines+markers',
        line=dict(color='#3b82f6', width=3, shape='spline'),
        marker=dict(size=6, color='#1d4ed8', line=dict(color='white', width=1)),
        # NOTE(review): 'tonexty' on the first trace of a figure -- confirm
        # this renders the intended fill-to-zero ('tozeroy') effect.
        fill='tonexty',
        fillcolor='rgba(59, 130, 246, 0.1)',
        hovertemplate='<b>%{x}</b><br>Trips: %{y}<extra></extra>',
        name='Daily Trips'
    ))

    # Add trend line (least-squares fit over day positions, not dates)
    if len(daily_trips) > 1:
        z = np.polyfit(range(len(daily_trips)), daily_trips['trips'], 1)
        p = np.poly1d(z)
        fig.add_trace(go.Scatter(
            x=daily_trips['date'],
            y=p(range(len(daily_trips))),
            mode='lines',
            line=dict(color='#ef4444', width=2, dash='dot'),
            name='Trend',
            hovertemplate='Trend: %{y:.0f}<extra></extra>'
        ))

    fig.update_layout(
        title={
            'text': 'Daily Trip Volume',
            'x': 0.5,
            'font': {'size': 18, 'color': '#1f2937', 'family': 'Inter'}
        },
        xaxis_title='Date',
        yaxis_title='Number of Trips',
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font={'color': '#374151', 'family': 'Inter'},
        height=320,
        margin=dict(t=60, b=50, l=50, r=50),
        showlegend=True,
        legend=dict(
            x=0.02,
            y=0.98,
            bgcolor='rgba(255,255,255,0.9)',
            bordercolor='rgba(156, 163, 175, 0.3)',
            borderwidth=1
        ),
        xaxis=dict(
            showgrid=True,
            gridwidth=1,
            gridcolor='rgba(156, 163, 175, 0.2)'
        ),
        yaxis=dict(
            showgrid=True,
            gridwidth=1,
            gridcolor='rgba(156, 163, 175, 0.2)'
        )
    )

    return fig
+ return fig
369
+
370
def create_distance_analysis(df: pd.DataFrame) -> go.Figure:
    """Create group size vs trip distance analysis.

    Falls back to a placeholder figure when coordinate columns are
    absent. Distance is a flat-earth approximation (~111 km/degree on
    both axes), matching utils.calculate_distance.
    """
    if not all(col in df.columns for col in ['Pick Up Latitude', 'Pick Up Longitude', 'Drop Off Latitude', 'Drop Off Longitude']):
        return create_placeholder_chart("Distance Analysis", "Location data not available")

    df_copy = df.copy()
    df_copy['distance'] = np.sqrt(
        (df_copy['Drop Off Latitude'] - df_copy['Pick Up Latitude'])**2 +
        (df_copy['Drop Off Longitude'] - df_copy['Pick Up Longitude'])**2
    ) * 111  # Approximate km conversion

    distance_by_group = df_copy.groupby('Total Passengers')['distance'].agg(['mean', 'std', 'count']).reset_index()
    distance_by_group = distance_by_group[distance_by_group['count'] >= 3]  # Filter groups with few trips

    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=distance_by_group['Total Passengers'],
        y=distance_by_group['mean'],
        mode='markers+lines',
        marker=dict(
            size=distance_by_group['count']/5,
            color=distance_by_group['mean'],
            colorscale='Viridis',
            showscale=True,
            colorbar=dict(title="Avg Distance (km)"),
            line=dict(color='white', width=1)
        ),
        line=dict(color='#3b82f6', width=2),
        error_y=dict(
            type='data',
            array=distance_by_group['std'],
            color='rgba(59, 130, 246, 0.3)'
        ),
        # BUG FIX: the hover previously labelled %{marker.size:.0f} as
        # "Trips", but marker.size is count/5 (a display scale), so the
        # tooltip showed one fifth of the real trip count. Carry the true
        # count in customdata instead.
        customdata=distance_by_group['count'],
        hovertemplate='<b>Group Size: %{x}</b><br>Avg Distance: %{y:.2f} km<br>Trips: %{customdata:.0f}<extra></extra>',
        name='Average Distance'
    ))

    fig.update_layout(
        title={
            'text': 'Average Trip Distance by Group Size',
            'x': 0.5,
            'font': {'size': 18, 'color': '#1f2937', 'family': 'Inter'}
        },
        xaxis_title='Group Size (Passengers)',
        yaxis_title='Average Distance (km)',
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font={'color': '#374151', 'family': 'Inter'},
        height=400,
        margin=dict(t=60, b=50, l=50, r=50)
    )

    return fig
424
+
425
def create_location_comparison(df: pd.DataFrame) -> go.Figure:
    """Create pickup vs dropoff location comparison.

    Shows grouped bars of pickup and drop-off counts for locations that
    appear in both top-10 lists (or a mix of each side's top 5 when
    there is no overlap).
    """
    pickup_counts = df['pickup_main'].value_counts().head(10)
    dropoff_counts = df['dropoff_main'].value_counts().head(10)

    # Get common locations.
    # NOTE(review): set() makes the resulting location ORDER arbitrary
    # between runs -- confirm whether a stable ordering is wanted here.
    common_locations = list(set(pickup_counts.index) & set(dropoff_counts.index))
    if not common_locations:
        # If no common locations, take top 5 from each
        all_locations = list(set(list(pickup_counts.index[:5]) + list(dropoff_counts.index[:5])))
    else:
        all_locations = common_locations[:8]

    pickup_values = [pickup_counts.get(loc, 0) for loc in all_locations]
    dropoff_values = [dropoff_counts.get(loc, 0) for loc in all_locations]

    # Truncate location names for the axis labels
    truncated_locations = []
    for loc in all_locations:
        if len(loc) > 15:
            truncated_locations.append(loc[:12] + "...")
        else:
            truncated_locations.append(loc)

    fig = go.Figure()

    fig.add_trace(go.Bar(
        name='Pickups',
        x=truncated_locations,
        y=pickup_values,
        marker_color='#3b82f6',
        hovertemplate='<b>%{x}</b><br>Pickups: %{y}<extra></extra>',
        customdata=all_locations
    ))

    fig.add_trace(go.Bar(
        name='Drop-offs',
        x=truncated_locations,
        y=dropoff_values,
        marker_color='#10b981',
        hovertemplate='<b>%{x}</b><br>Drop-offs: %{y}<extra></extra>',
        customdata=all_locations
    ))

    fig.update_layout(
        title={
            'text': 'Pickup vs Drop-off Comparison',
            'x': 0.5,
            'font': {'size': 18, 'color': '#1f2937', 'family': 'Inter'}
        },
        xaxis_title='Locations',
        yaxis_title='Number of Trips',
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font={'color': '#374151', 'family': 'Inter'},
        height=400,
        margin=dict(t=60, b=50, l=50, r=50),
        barmode='group',
        legend=dict(
            x=0.02,
            y=0.98,
            bgcolor='rgba(255,255,255,0.9)',
            bordercolor='rgba(156, 163, 175, 0.3)',
            borderwidth=1
        )
    )

    return fig
493
+
494
def create_peak_patterns(df: pd.DataFrame) -> go.Figure:
    """Plot hourly trip counts as one line per passenger-group-size bucket.

    Args:
        df: Trip data with 'Total Passengers' and 'hour' columns.

    Returns:
        A Plotly figure with up to four spline traces, one per size bucket.
    """
    def _bucket(passengers):
        # Bucket boundaries mirror the legend labels used below.
        if passengers <= 4:
            return 'Small (1-4)'
        if passengers <= 8:
            return 'Medium (5-8)'
        if passengers <= 12:
            return 'Large (9-12)'
        return 'Extra Large (13+)'

    frame = df.copy()
    frame['group_category'] = frame['Total Passengers'].apply(_bucket)

    hourly_counts = (
        frame.groupby(['group_category', 'hour'])
        .size()
        .reset_index(name='trips')
    )

    fig = go.Figure()

    palette = ['#3b82f6', '#10b981', '#f59e0b', '#ef4444']
    buckets = ['Small (1-4)', 'Medium (5-8)', 'Large (9-12)', 'Extra Large (13+)']

    for color, bucket in zip(palette, buckets):
        subset = hourly_counts[hourly_counts['group_category'] == bucket]
        if subset.empty:
            # No trips in this size bucket — skip the trace entirely.
            continue
        fig.add_trace(go.Scatter(
            x=subset['hour'],
            y=subset['trips'],
            mode='lines+markers',
            name=bucket,
            line=dict(color=color, width=3, shape='spline'),
            marker=dict(size=6, line=dict(color='white', width=1)),
            hovertemplate='<b>%{fullData.name}</b><br>Hour: %{x}<br>Trips: %{y}<extra></extra>'
        ))

    fig.update_layout(
        title={
            'text': 'Peak Hours by Group Size Category',
            'x': 0.5,
            'font': {'size': 18, 'color': '#1f2937', 'family': 'Inter'}
        },
        xaxis_title='Hour of Day',
        yaxis_title='Number of Trips',
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font={'color': '#374151', 'family': 'Inter'},
        height=400,
        margin=dict(t=60, b=50, l=50, r=50),
        legend=dict(
            x=0.02,
            y=0.98,
            bgcolor='rgba(255,255,255,0.9)',
            bordercolor='rgba(156, 163, 175, 0.3)',
            borderwidth=1
        ),
        # Even-hour ticks labelled as HH:00 for readability.
        xaxis=dict(
            showgrid=True,
            gridwidth=1,
            gridcolor='rgba(156, 163, 175, 0.2)',
            tickvals=list(range(0, 24, 2)),
            ticktext=[f"{h}:00" for h in range(0, 24, 2)]
        ),
        yaxis=dict(
            showgrid=True,
            gridwidth=1,
            gridcolor='rgba(156, 163, 175, 0.2)'
        )
    )

    return fig
559
+
560
def create_placeholder_chart(title: str, message: str) -> go.Figure:
    """Return an empty titled figure displaying *message*.

    Used in place of a real chart when the required data is unavailable.

    Args:
        title: Chart title shown at the top.
        message: Explanatory text rendered in the center of the plot area.

    Returns:
        A Plotly figure with no traces, hidden axes, and a centered note.
    """
    fig = go.Figure()

    # Center the message using paper coordinates so it is independent of
    # any axis range.
    fig.add_annotation(
        text=message,
        x=0.5, y=0.5,
        xref="paper", yref="paper",
        showarrow=False,
        font=dict(size=16, color='#6b7280', family='Inter'),
    )

    # Both axes are purely decorative here — hide grid and tick labels.
    hidden_axis = dict(showgrid=False, showticklabels=False)

    fig.update_layout(
        title={
            'text': title,
            'x': 0.5,
            'font': {'size': 18, 'color': '#1f2937', 'family': 'Inter'}
        },
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        height=300,
        margin=dict(t=60, b=50, l=50, r=50),
        xaxis=hidden_axis,
        yaxis=hidden_axis,
    )

    return fig