# Fetii_AI / chatbot_engine.py
# Author: parthsinha
# Commit: "added files" (ed0810b)
import html
import logging
import re
from typing import Dict, List, Any, Tuple

import utils
from data_processor import DataProcessor
class FetiiChatbot:
    """
    GPT-style chatbot that can answer questions about Fetii rideshare data.

    Intent detection is purely regex based: a query is lower-cased, matched
    against the pattern groups in ``self.query_patterns`` (in a fixed priority
    order, see ``_parse_query``) and dispatched to a ``_handle_*`` method that
    renders an HTML-tagged text reply from the data processor's statistics.
    """

    def __init__(self, data_processor: "DataProcessor"):
        """Initialize the chatbot with a data processor.

        Args:
            data_processor: Object this class queries through
                ``get_location_stats``, ``get_time_patterns``,
                ``get_quick_insights`` and a ``df`` indexable by
                ``'pickup_main'`` / ``'dropoff_main'``.
        """
        self.data_processor = data_processor
        # Alternating {"role": ..., "content": ...} entries, oldest first.
        self.conversation_history = []
        # Regexes per intent. Capture groups carry the parameters
        # (location name, group size, age range) where a pattern has any.
        self.query_patterns = {
            'location_stats': [
                r'how many.*(?:groups?|trips?).*(?:went to|to|from)\s+([^?]+?)(?:\s+(?:last|this|yesterday|today|week|month|year).*?)?[?.]?$',
                r'(?:trips?|groups?).*(?:to|from)\s+([^?]+?)(?:\s+(?:last|this|yesterday|today|week|month|year).*?)?[?.]?$',
                r'tell me about\s+([^?]+?)(?:\s+(?:last|this|yesterday|today|week|month|year).*?)?[?.]?$',
                r'stats for\s+([^?]+?)(?:\s+(?:last|this|yesterday|today|week|month|year).*?)?[?.]?$',
                r'(?:show me|find|search)\s+([^?]+?)(?:\s+(?:trips?|data|stats))?(?:\s+(?:last|this|yesterday|today|week|month|year).*?)?[?.]?$'
            ],
            'time_patterns': [
                r'when do.*groups?.*ride',
                r'what time.*most popular',
                r'peak hours?',
                r'busiest time'
            ],
            'group_size': [
                r'large groups?\s*\((\d+)\+?\)',
                r'groups? of (\d+)\+? riders?',
                r'(\d+)\+? passengers?',
                r'group size'
            ],
            'top_locations': [
                r'top.*(?:pickup|drop-?off).*spots?',
                r'most popular.*locations?',
                r'busiest.*locations?',
                r'hottest spots?',
                r'show.*(?:pickup|drop-?off|locations?)',
                r'list.*locations?'
            ],
            'demographics': [
                r'(\d+)[-–](\d+) year[- ]olds?',
                r'age group',
                r'demographics?'
            ],
            'general_stats': [
                r'how many total',
                r'average group size',
                r'summary',
                r'overview',
                r'give me.*overview',
                r'show me.*stats',
                r'total trips'
            ]
        }
        # Time references stripped from a captured location name. The month
        # and day-word patterns end in \b so that names which merely *start*
        # with such a word (e.g. "mayfield park") are left intact.
        self.time_patterns = [
            r'\s+(?:last|this|yesterday|today)\s+(?:week|month|year|night)',
            r'\s+(?:last|this)\s+(?:monday|tuesday|wednesday|thursday|friday|saturday|sunday)',
            r'\s+(?:in\s+)?(?:january|february|march|april|may|june|july|august|september|october|november|december)\b',
            r'\s+(?:last|this|next)\s+\w+',
            r'\s+(?:yesterday|today|tonight)\b',
            r'\s+\d{1,2}\/\d{1,2}\/\d{2,4}',
            r'\s+\d{1,2}-\d{1,2}-\d{2,4}'
        ]

    def process_query(self, user_query: str) -> str:
        """Process a user query and return an appropriate response.

        The query is lower-cased and stripped before parsing; that normalized
        text is what gets stored in the conversation history. Every call adds
        exactly one user entry and one assistant entry, including when
        answering fails.

        Args:
            user_query: Raw question text from the user.

        Returns:
            The reply text, or a generic help message if parsing or response
            generation raised.
        """
        user_query = user_query.lower().strip()
        self.conversation_history.append({"role": "user", "content": user_query})
        try:
            query_type, params = self._parse_query(user_query)
            response = self._generate_response(query_type, params, user_query)
        except Exception:
            # Top-level boundary: keep the chat alive, but leave a trace of
            # what actually went wrong instead of discarding it.
            logging.getLogger(__name__).exception(
                "Failed to answer query %r", user_query
            )
            response = ("I'm having trouble understanding that question. "
                        "Try asking about specific locations, times, or group sizes. "
                        "For example: 'How many groups went to The Aquarium on 6th?' or "
                        "'What are the peak hours for large groups?'")
        self.conversation_history.append({"role": "assistant", "content": response})
        return response

    def _clean_location_from_query(self, location_text: str) -> str:
        """Clean time references from location text.

        Removes every match of ``self.time_patterns`` and collapses runs of
        whitespace.

        Args:
            location_text: Location fragment captured from the query.

        Returns:
            The fragment without time references, stripped.
        """
        cleaned = location_text.strip()
        for pattern in self.time_patterns:
            cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
        return re.sub(r'\s+', ' ', cleaned).strip()

    def _parse_query(self, query: str) -> Tuple[str, Dict[str, Any]]:
        """Parse the user query to determine intent and extract parameters.

        Intents are tried in this order, first hit wins: location_stats,
        time_patterns, group_size, top_locations, demographics, general_stats.
        A query matching nothing is also reported as ``'general_stats'``.

        Args:
            query: Normalized (lower-cased) query text.

        Returns:
            ``(query_type, params)`` where ``params`` may contain
            ``location``, ``min_group_size``, ``group_size``,
            ``location_type`` or ``age_range`` depending on the intent.
        """
        params: Dict[str, Any] = {}

        for pattern in self.query_patterns['location_stats']:
            match = re.search(pattern, query, re.IGNORECASE)
            if match:
                location = self._clean_location_from_query(match.group(1))
                # An empty name after cleaning is not a usable location;
                # keep trying the remaining patterns.
                if location:
                    params['location'] = location
                    return 'location_stats', params

        for pattern in self.query_patterns['time_patterns']:
            if re.search(pattern, query, re.IGNORECASE):
                # Any number in the query is taken as a minimum group size.
                size_match = re.search(r'(\d+)\+?', query)
                if size_match:
                    params['min_group_size'] = int(size_match.group(1))
                return 'time_patterns', params

        for pattern in self.query_patterns['group_size']:
            match = re.search(pattern, query, re.IGNORECASE)
            if match:
                if match.groups():
                    params['group_size'] = int(match.group(1))
                return 'group_size', params

        for pattern in self.query_patterns['top_locations']:
            if re.search(pattern, query, re.IGNORECASE):
                if 'pickup' in query or 'pick up' in query:
                    params['location_type'] = 'pickup'
                elif 'drop' in query:
                    params['location_type'] = 'dropoff'
                else:
                    params['location_type'] = 'both'
                return 'top_locations', params

        for pattern in self.query_patterns['demographics']:
            match = re.search(pattern, query, re.IGNORECASE)
            if match:
                # Only the "<a>-<b> year olds" pattern has capture groups;
                # "age group" / "demographics" still select this intent and
                # let the handler fall back to its default age range.
                if len(match.groups()) == 2:
                    params['age_range'] = (int(match.group(1)), int(match.group(2)))
                return 'demographics', params

        for pattern in self.query_patterns['general_stats']:
            if re.search(pattern, query, re.IGNORECASE):
                return 'general_stats', params

        # Unrecognized queries get the overview as well.
        return 'general_stats', params

    def _fuzzy_search_location(self, query_location: str) -> List[Tuple[str, int]]:
        """Search for locations using fuzzy matching.

        Pickup and drop-off occurrences are summed per location name, then
        three strategies are tried in order, stopping at the first that finds
        anything: case-insensitive exact match, substring match in either
        direction, and any query word longer than two characters appearing in
        the name.

        Args:
            query_location: Location text typed by the user.

        Returns:
            Up to five ``(location, total_count)`` pairs, highest count first.
        """
        frame = self.data_processor.df
        totals: Dict[Any, int] = {}
        for column in ('pickup_main', 'dropoff_main'):
            for location, count in frame[column].value_counts().items():
                totals[location] = totals.get(location, 0) + count

        needle = query_location.lower()
        # Lower-case every name once instead of once per strategy.
        candidates = [(name, name.lower(), count) for name, count in totals.items()]

        # Exact match
        matches = [(name, count) for name, lowered, count in candidates
                   if needle == lowered]
        # Partial match
        if not matches:
            matches = [(name, count) for name, lowered, count in candidates
                       if needle in lowered or lowered in needle]
        # Word match
        if not matches:
            words = [word for word in needle.split() if len(word) > 2]
            matches = [(name, count) for name, lowered, count in candidates
                       if any(word in lowered for word in words)]

        matches.sort(key=lambda item: item[1], reverse=True)
        return matches[:5]

    def _generate_response(self, query_type: str, params: Dict[str, Any], original_query: str) -> str:
        """Generate a response based on the query type and parameters.

        Args:
            query_type: Intent name as returned by ``_parse_query``.
            params: Parameters extracted for that intent.
            original_query: The normalized query text.

        Returns:
            The handler's reply; unknown intent names get the fallback help.
        """
        if query_type == 'location_stats':
            return self._handle_location_stats(params, original_query)
        if query_type == 'time_patterns':
            return self._handle_time_patterns(params)
        if query_type == 'group_size':
            return self._handle_group_size(params)
        if query_type == 'top_locations':
            return self._handle_top_locations(params)
        if query_type == 'demographics':
            return self._handle_demographics(params)
        if query_type == 'general_stats':
            return self._handle_general_stats()
        return self._handle_fallback(original_query)

    def _handle_location_stats(self, params: Dict[str, Any], original_query: str) -> str:
        """Handle location-specific statistics queries.

        Looks the location up directly; if it has no trips, retries with the
        best fuzzy match, and otherwise returns suggestions or a not-found
        message. User-typed text is HTML-escaped before being echoed because
        replies carry HTML markup.

        Args:
            params: May contain ``'location'`` (defaults to ``''``).
            original_query: Normalized query, scanned for time words to
                decide whether to append the full-dataset note.

        Returns:
            The formatted statistics or a not-found / suggestion message.
        """
        location = params.get('location', '')
        # Element-content escaping only; apostrophes in names stay readable.
        shown_location = html.escape(location, quote=False)
        stats = self.data_processor.get_location_stats(location)

        if stats['pickup_count'] == 0 and stats['dropoff_count'] == 0:
            matches = self._fuzzy_search_location(location)
            if not matches:
                return f"I couldn't find any trips associated with '{shown_location}'. Try checking the spelling or asking about a different location like 'West Campus' or 'The Aquarium on 6th'."
            best_match = matches[0][0]
            stats = self.data_processor.get_location_stats(best_match)
            if stats['pickup_count'] == 0 and stats['dropoff_count'] == 0:
                response = f"I couldn't find exact data for '{shown_location}'. Did you mean one of these?\n\n"
                for match_location, count in matches[:3]:
                    response += f"• <strong>{match_location}</strong> ({count} total trips)\n"
                response += f"\nTry asking: 'Tell me about {matches[0][0]}'"
                return response
            response = f"<strong>Found results for '{best_match}'</strong> (closest match to '{shown_location}'):\n\n"
        else:
            best_match = html.escape(location.title(), quote=False)
            response = f"<strong>Stats for {best_match}:</strong>\n\n"

        if stats['pickup_count'] > 0:
            response += f"<strong>{stats['pickup_count']} pickup trips</strong> with an average group size of {stats['avg_group_size_pickup']:.1f}\n"
            if stats['peak_hours_pickup']:
                peak_hours = ', '.join(utils.format_time(h) for h in stats['peak_hours_pickup'])
                response += f"Most popular pickup times: {peak_hours}\n"
        if stats['dropoff_count'] > 0:
            response += f"<strong>{stats['dropoff_count']} drop-off trips</strong> with an average group size of {stats['avg_group_size_dropoff']:.1f}\n"
            if stats['peak_hours_dropoff']:
                peak_hours = ', '.join(utils.format_time(h) for h in stats['peak_hours_dropoff'])
                response += f"Most popular drop-off times: {peak_hours}\n"

        total_trips = stats['pickup_count'] + stats['dropoff_count']
        insights = self.data_processor.get_quick_insights()
        all_trips = insights['total_trips']
        # Guard the share computation against an empty overall total.
        percentage = (total_trips / all_trips) * 100 if all_trips else 0.0
        response += f"\n<strong>Insight:</strong> This location accounts for {percentage:.1f}% of all Austin trips!"

        if any(word in original_query for word in ['last', 'this', 'month', 'week', 'yesterday', 'today']):
            response += "\n\n<strong>Note:</strong> This data covers our full Austin dataset. For specific time periods, the patterns shown represent typical activity for this location."
        return response

    def _handle_time_patterns(self, params: Dict[str, Any]) -> str:
        """Handle time pattern queries.

        Args:
            params: May contain ``'min_group_size'``, passed through to
                ``data_processor.get_time_patterns``.

        Returns:
            The five busiest hours, counts per time period and a peak-hour
            insight, or a short notice when no trips match the filter.
        """
        min_group_size = params.get('min_group_size', None)
        time_data = self.data_processor.get_time_patterns(min_group_size)

        response = "<strong>Peak Riding Times:</strong>\n\n"
        if min_group_size:
            response += f"<em>For groups of {min_group_size}+ riders:</em>\n\n"

        hourly_counts = time_data['hourly_counts']
        top_hours = sorted(hourly_counts.items(), key=lambda x: x[1], reverse=True)[:5]
        if not top_hours:
            # A strict group-size filter can leave nothing to rank.
            response += "No trips matched this filter, so there are no peak hours to report."
            return response

        response += "<strong>Busiest Hours:</strong>\n"
        for rank, (hour, count) in enumerate(top_hours, 1):
            response += f"{rank}. <strong>{utils.format_time(hour)}</strong> - {count} trips\n"

        time_categories = time_data['time_category_counts']
        response += "\n<strong>By Time Period:</strong>\n"
        for period, count in sorted(time_categories.items(), key=lambda x: x[1], reverse=True):
            response += f"• <strong>{period}:</strong> {count} trips\n"

        peak_hour, peak_count = top_hours[0]
        response += f"\n<strong>Insight:</strong> {utils.format_time(peak_hour)} is the absolute peak with {peak_count} trips!"
        return response

    def _handle_group_size(self, params: Dict[str, Any]) -> str:
        """Handle group size queries.

        Args:
            params: May contain ``'group_size'``, the minimum passenger count
                to analyse (defaults to 6).

        Returns:
            Trip count and share for groups of at least that size, a
            breakdown of up to eight sizes, and the average group size.
        """
        target_size = params.get('group_size', 6)
        insights = self.data_processor.get_quick_insights()
        group_distribution = insights['group_size_distribution']

        large_groups = {size: count for size, count in group_distribution.items()
                        if size >= target_size}
        large_group_trips = sum(large_groups.values())
        total_trips = insights['total_trips']
        # Same zero guard as the per-size share below.
        percentage = (large_group_trips / total_trips) * 100 if total_trips else 0.0

        response = f"<strong>Group Size Analysis ({target_size}+ passengers):</strong>\n\n"
        response += f"• <strong>{large_group_trips} trips</strong> had {target_size}+ passengers ({percentage:.1f}% of all trips)\n"
        response += f"\n<strong>Breakdown of {target_size}+ passenger groups:</strong>\n"
        for size, count in sorted(large_groups.items(), key=lambda x: x[1], reverse=True)[:8]:
            group_pct = (count / large_group_trips) * 100 if large_group_trips > 0 else 0
            response += f"• <strong>{size} passengers:</strong> {count} trips ({group_pct:.1f}%)\n"

        avg_size = insights['avg_group_size']
        response += f"\n<strong>Insight:</strong> Average group size is {avg_size:.1f} passengers - most rides are group experiences!"
        return response

    def _handle_top_locations(self, params: Dict[str, Any]) -> str:
        """Handle top locations queries.

        Args:
            params: May contain ``'location_type'``: ``'pickup'``,
                ``'dropoff'`` or ``'both'`` (the default).

        Returns:
            Up to eight top pickup and/or drop-off locations, plus an insight
            about the leading pickup spot when pickups are shown and present.
        """
        location_type = params.get('location_type', 'both')
        insights = self.data_processor.get_quick_insights()
        show_pickups = location_type in ['pickup', 'both']
        show_dropoffs = location_type in ['dropoff', 'both']

        response = "<strong>Most Popular Locations:</strong>\n\n"
        top_pickups = list(insights['top_pickups']) if show_pickups else []

        if show_pickups:
            response += "<strong>Top Pickup Spots:</strong>\n"
            for rank, (location, count) in enumerate(top_pickups[:8], 1):
                response += f"{rank}. <strong>{location}</strong> - {count} pickups\n"

        if show_dropoffs:
            # A blank line separates the two lists when both are shown.
            if location_type == 'both':
                response += "\n"
            response += "<strong>Top Drop-off Destinations:</strong>\n"
            for rank, (location, count) in enumerate(list(insights['top_dropoffs'])[:8], 1):
                response += f"{rank}. <strong>{location}</strong> - {count} drop-offs\n"

        # Skip the insight rather than index into an empty ranking.
        if top_pickups:
            top_pickup = top_pickups[0]
            response += f"\n<strong>Insight:</strong> {top_pickup[0]} dominates pickups with {top_pickup[1]} trips!"
        return response

    def _handle_demographics(self, params: Dict[str, Any]) -> str:
        """Handle demographics queries.

        No age data is consulted: the reply says so and instead lists any of
        a fixed set of entertainment venues found among the top drop-offs.

        Args:
            params: May contain ``'age_range'`` as ``(low, high)``; defaults
                to ``(18, 24)`` and is only used in the heading.

        Returns:
            The explanatory reply text.
        """
        age_range = params.get('age_range', (18, 24))
        response = f"<strong>Demographics Analysis ({age_range[0]}-{age_range[1]} year olds):</strong>\n\n"
        response += "I'd love to help with demographic analysis, but I don't currently have access to rider age data in this dataset. "
        response += "However, I can tell you about the locations and times that are popular with different group sizes!\n\n"

        insights = self.data_processor.get_quick_insights()
        response += "<strong>Popular spots that might appeal to younger riders:</strong>\n"
        entertainment_spots = ['The Aquarium on 6th', 'Wiggle Room', "Shakespeare's", 'LUNA Rooftop', 'Green Light Social']
        for spot in entertainment_spots:
            spot_lower = spot.lower()
            # Report only the first top drop-off whose name contains the spot.
            for location, count in insights['top_dropoffs']:
                if spot_lower in location.lower():
                    response += f"• <strong>{location}</strong> - {count} drop-offs\n"
                    break

        response += "\n<strong>Insight:</strong> Late night hours (10 PM - 1 AM) see the highest activity, which often correlates with younger demographics!"
        return response

    def _handle_general_stats(self) -> str:
        """Handle general statistics queries.

        Returns:
            An overview built from ``data_processor.get_quick_insights()``:
            totals, peak hour, large-group share, top pickup/drop-off, most
            common group size and key insights. The insight lines are derived
            from the same data so they cannot contradict the figures above
            them.
        """
        insights = self.data_processor.get_quick_insights()
        peak_label = utils.format_time(insights['peak_hour'])
        # NOTE(review): indexing assumes non-empty rankings and a non-empty
        # size distribution; an empty dataset would raise here.
        top_pickup = list(insights['top_pickups'])[0]
        top_dropoff = list(insights['top_dropoffs'])[0]
        most_common_size = max(insights['group_size_distribution'].items(), key=lambda x: x[1])

        parts = [
            "<strong>Fetii Austin Overview:</strong>\n\n",
            f"<strong>Total Trips Analyzed:</strong> {insights['total_trips']:,}\n",
            f"<strong>Average Group Size:</strong> {insights['avg_group_size']:.1f} passengers\n",
            f"<strong>Peak Hour:</strong> {peak_label}\n",
            f"<strong>Large Groups (6+):</strong> {insights['large_groups_count']} trips ({insights['large_groups_pct']:.1f}%)\n\n",
            "<strong>Top Hotspots:</strong>\n",
            f"• Most popular pickup: <strong>{top_pickup[0]}</strong> ({top_pickup[1]} trips)\n",
            f"• Most popular destination: <strong>{top_dropoff[0]}</strong> ({top_dropoff[1]} trips)\n\n",
            f"<strong>Most Common Group Size:</strong> {most_common_size[0]} passengers ({most_common_size[1]} trips)\n\n",
            "<strong>Key Insights:</strong>\n",
            f"• {insights['large_groups_pct']:.0f}% of all rides are large groups (6+ people)\n",
            f"• Peak activity happens around {peak_label}\n",
            f"• {top_pickup[0]} dominates as the top pickup location\n",
            "• Entertainment venues are the most popular destinations",
        ]
        return ''.join(parts)

    def _handle_fallback(self, query: str) -> str:
        """Handle queries that don't match any specific pattern.

        Args:
            query: The query text (not used in the reply).

        Returns:
            A static help message listing example questions.
        """
        return (
            "I'm not sure I understood that question perfectly. Here's what I can help you with:\n\n"
            "<strong>Location Questions:</strong>\n"
            "• 'How many groups went to [location]?'\n"
            "• 'Tell me about [location]'\n"
            "• 'Top pickup/drop-off spots'\n\n"
            "<strong>Time Questions:</strong>\n"
            "• 'When do large groups typically ride?'\n"
            "• 'Peak hours for groups of 6+'\n"
            "• 'Busiest times'\n\n"
            "<strong>Group Size Questions:</strong>\n"
            "• 'How many trips had 10+ passengers?'\n"
            "• 'Large group patterns'\n"
            "• 'Average group size'\n\n"
            "Would you like to try asking one of these types of questions?"
        )

    def get_conversation_history(self) -> List[Dict[str, str]]:
        """Get the conversation history.

        Returns:
            The internal history list itself (not a copy).
        """
        return self.conversation_history

    def clear_history(self):
        """Clear the conversation history by starting a fresh list."""
        self.conversation_history = []