Spaces:

parthsinha
/

Fetii_AI

Runtime error

App Files Files Community

Fetii_AI / chatbot_engine.py

parthsinha

added files

ed0810b 5 months ago

raw

history blame contribute delete

20.5 kB

	import html
	import logging
	import re
	from typing import Dict, List, Any, Tuple

	from data_processor import DataProcessor
	import utils

	class FetiiChatbot:
	    """
	    Rule-based chatbot that can answer questions about Fetii rideshare data.

	    A query is lower-cased, classified into an intent by trying the regex
	    groups in ``query_patterns`` in a fixed priority order, and answered by
	    the matching ``_handle_*`` method using figures obtained from the data
	    processor. Replies are strings containing HTML tags such as ``<strong>``.
	    """

	    # Shared logger so that errors swallowed in process_query leave a trace.
	    _logger = logging.getLogger(__name__)

	    def __init__(self, data_processor: "DataProcessor"):
	        """Initialize the chatbot with a data processor.

	        Args:
	            data_processor: Object the handlers query for statistics. They
	                call its ``get_location_stats``, ``get_time_patterns`` and
	                ``get_quick_insights`` methods and read its ``df`` attribute.
	        """
	        self.data_processor = data_processor
	        self.conversation_history = []

	        # Intent -> regexes, all matched case-insensitively with re.search.
	        # Alternations use a bare `|`; an escaped `\|` would only match a
	        # literal pipe character and silently disable the pattern.
	        self.query_patterns = {
	            'location_stats': [
	                r'how many.*(?:groups?|trips?).*(?:went to|to|from)\s+([^?]+?)(?:\s+(?:last|this|yesterday|today|week|month|year).*?)?[?.]?$',
	                r'(?:trips?|groups?).*(?:to|from)\s+([^?]+?)(?:\s+(?:last|this|yesterday|today|week|month|year).*?)?[?.]?$',
	                r'tell me about\s+([^?]+?)(?:\s+(?:last|this|yesterday|today|week|month|year).*?)?[?.]?$',
	                r'stats for\s+([^?]+?)(?:\s+(?:last|this|yesterday|today|week|month|year).*?)?[?.]?$',
	                r'(?:show me|find|search)\s+([^?]+?)(?:\s+(?:trips?|data|stats))?(?:\s+(?:last|this|yesterday|today|week|month|year).*?)?[?.]?$',
	            ],
	            'time_patterns': [
	                r'when do.*groups?.*ride',
	                r'what time.*most popular',
	                r'peak hours?',
	                r'busiest time',
	            ],
	            'group_size': [
	                # e.g. "large groups (6+)" - the parentheses are literal.
	                r'large groups?\s*\((\d+)\+?\)',
	                r'groups? of (\d+)\+? riders?',
	                r'(\d+)\+? passengers?',
	                r'group size',
	            ],
	            'top_locations': [
	                r'top.*(?:pickup|drop-?off).*spots?',
	                r'most popular.*locations?',
	                r'busiest.*locations?',
	                r'hottest spots?',
	                r'show.*(?:pickup|drop-?off|locations?)',
	                r'list.*locations?',
	            ],
	            'demographics': [
	                r'(\d+)[-–](\d+) year[- ]olds?',
	                r'age group',
	                r'demographics?',
	            ],
	            'general_stats': [
	                r'how many total',
	                r'average group size',
	                r'summary',
	                r'overview',
	                r'give me.*overview',
	                r'show me.*stats',
	                r'total trips',
	            ],
	        }

	        # Time references stripped from a captured location. The trailing \b
	        # keeps words such as "may" from being cut out of "mayfair".
	        self.time_patterns = [
	            r'\s+(?:last|this|yesterday|today)\s+(?:week|month|year|night)\b',
	            r'\s+(?:last|this)\s+(?:monday|tuesday|wednesday|thursday|friday|saturday|sunday)\b',
	            r'\s+(?:in\s+)?(?:january|february|march|april|may|june|july|august|september|october|november|december)\b',
	            r'\s+(?:last|this|next)\s+\w+',
	            r'\s+(?:yesterday|today|tonight)\b',
	            r'\s+\d{1,2}/\d{1,2}/\d{2,4}',
	            r'\s+\d{1,2}-\d{1,2}-\d{2,4}',
	        ]

	    def process_query(self, user_query: str) -> str:
	        """Process a user query and return an appropriate response.

	        The lower-cased, stripped query and the reply are both appended to
	        ``conversation_history``. Any exception raised while parsing or
	        answering is logged and replaced by a generic help message, so this
	        method does not propagate handler errors.
	        """
	        user_query = (user_query or "").lower().strip()
	        self.conversation_history.append({"role": "user", "content": user_query})

	        try:
	            query_type, params = self._parse_query(user_query)
	            response = self._generate_response(query_type, params, user_query)
	        except Exception:
	            # Best-effort boundary: keep the chat alive but record the cause.
	            self._logger.exception("Failed to answer query %r", user_query)
	            response = ("I'm having trouble understanding that question. "
	                        "Try asking about specific locations, times, or group sizes. "
	                        "For example: 'How many groups went to The Aquarium on 6th?' or "
	                        "'What are the peak hours for large groups?'")

	        # Record the reply on both paths so user/assistant turns stay paired.
	        self.conversation_history.append({"role": "assistant", "content": response})
	        return response

	    def _clean_location_from_query(self, location_text: str) -> str:
	        """Remove time references from location text and collapse whitespace."""
	        cleaned = location_text.strip()
	        for pattern in self.time_patterns:
	            cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
	        return re.sub(r'\s+', ' ', cleaned).strip()

	    def _parse_query(self, query: str) -> Tuple[str, Dict[str, Any]]:
	        """Parse the user query to determine intent and extract parameters.

	        Intents are tried in this order: location_stats, time_patterns,
	        group_size, top_locations, demographics; the first hit wins.

	        Returns:
	            ``(intent, params)`` where params may contain ``location``,
	            ``min_group_size``, ``group_size``, ``location_type`` or
	            ``age_range`` depending on the intent.
	        """
	        params: Dict[str, Any] = {}

	        for pattern in self.query_patterns['location_stats']:
	            match = re.search(pattern, query, re.IGNORECASE)
	            if not match:
	                continue
	            location = self._clean_location_from_query(match.group(1))
	            # An empty location after cleaning means the capture was only a
	            # time phrase; keep trying the remaining patterns.
	            if location:
	                params['location'] = location
	                return 'location_stats', params

	        for pattern in self.query_patterns['time_patterns']:
	            if re.search(pattern, query, re.IGNORECASE):
	                # The first number anywhere in the query is taken as the
	                # minimum group size, e.g. "peak hours for groups of 6+".
	                group_match = re.search(r'(\d+)\+?', query)
	                if group_match:
	                    params['min_group_size'] = int(group_match.group(1))
	                return 'time_patterns', params

	        for pattern in self.query_patterns['group_size']:
	            match = re.search(pattern, query, re.IGNORECASE)
	            if match:
	                if match.groups():
	                    params['group_size'] = int(match.group(1))
	                return 'group_size', params

	        for pattern in self.query_patterns['top_locations']:
	            if re.search(pattern, query, re.IGNORECASE):
	                if 'pickup' in query or 'pick up' in query:
	                    params['location_type'] = 'pickup'
	                elif 'drop' in query:
	                    params['location_type'] = 'dropoff'
	                else:
	                    params['location_type'] = 'both'
	                return 'top_locations', params

	        for pattern in self.query_patterns['demographics']:
	            match = re.search(pattern, query, re.IGNORECASE)
	            if match:
	                # Only the "<a>-<b> year olds" pattern captures an age range;
	                # "age group" / "demographics" still select this intent.
	                if len(match.groups()) == 2:
	                    params['age_range'] = (int(match.group(1)), int(match.group(2)))
	                return 'demographics', params

	        # NOTE(review): every remaining query, matched by the 'general_stats'
	        # patterns or not, gets the overview, so _handle_fallback is never
	        # selected from here. Kept as-is; confirm this default is intended.
	        return 'general_stats', params

	    def _fuzzy_search_location(self, query_location: str) -> List[Tuple[str, int]]:
	        """Search for locations using progressively looser matching.

	        Pickup and drop-off counts from the ``pickup_main`` and
	        ``dropoff_main`` columns are summed per location name. Matching tries
	        exact (case-insensitive) equality, then substring containment in
	        either direction, then any query word longer than two characters.

	        Returns:
	            Up to five ``(location, total_count)`` pairs, highest count first.
	        """
	        query_lower = query_location.lower().strip()
	        if not query_lower:
	            # An empty string is a substring of everything; match nothing.
	            return []

	        totals: Dict[Any, int] = {}
	        for column in ('pickup_main', 'dropoff_main'):
	            counts = self.data_processor.df[column].value_counts()
	            for name, count in counts.items():
	                totals[name] = totals.get(name, 0) + count

	        # Lower-case each name once; str() tolerates non-string labels.
	        candidates = [(name, str(name).lower(), count) for name, count in totals.items()]

	        # Exact match
	        matches = [(name, count) for name, low, count in candidates if low == query_lower]

	        # Partial match (skip empty names, which would match any query)
	        if not matches:
	            matches = [
	                (name, count)
	                for name, low, count in candidates
	                if low and (query_lower in low or low in query_lower)
	            ]

	        # Word match
	        if not matches:
	            words = [word for word in query_lower.split() if len(word) > 2]
	            matches = [
	                (name, count)
	                for name, low, count in candidates
	                if any(word in low for word in words)
	            ]

	        matches.sort(key=lambda item: item[1], reverse=True)
	        return matches[:5]

	    def _generate_response(self, query_type: str, params: Dict[str, Any], original_query: str) -> str:
	        """Dispatch to the handler for ``query_type`` and return its reply."""
	        if query_type == 'location_stats':
	            return self._handle_location_stats(params, original_query)
	        if query_type == 'time_patterns':
	            return self._handle_time_patterns(params)
	        if query_type == 'group_size':
	            return self._handle_group_size(params)
	        if query_type == 'top_locations':
	            return self._handle_top_locations(params)
	        if query_type == 'demographics':
	            return self._handle_demographics(params)
	        if query_type == 'general_stats':
	            return self._handle_general_stats()
	        return self._handle_fallback(original_query)

	    @staticmethod
	    def _has_trips(stats: Dict[str, Any]) -> bool:
	        """Return True when the stats report at least one pickup or drop-off."""
	        return stats['pickup_count'] > 0 or stats['dropoff_count'] > 0

	    def _handle_location_stats(self, params: Dict[str, Any], original_query: str) -> str:
	        """Handle location-specific statistics queries.

	        Looks the location up as typed; if that yields no trips, retries with
	        the best fuzzy match, and otherwise returns suggestions or a
	        not-found message.
	        """
	        location = params.get('location', '')
	        # The reply contains HTML tags, so user-typed text is echoed escaped.
	        # quote=False leaves apostrophes in names readable.
	        shown_location = html.escape(location, quote=False)

	        stats = self.data_processor.get_location_stats(location)

	        if self._has_trips(stats):
	            title = html.escape(location.title(), quote=False)
	            response = f"<strong>Stats for {title}:</strong>\n\n"
	        else:
	            matches = self._fuzzy_search_location(location)
	            if not matches:
	                return (f"I couldn't find any trips associated with '{shown_location}'. "
	                        "Try checking the spelling or asking about a different location "
	                        "like 'West Campus' or 'The Aquarium on 6th'.")

	            best_match = matches[0][0]
	            stats = self.data_processor.get_location_stats(best_match)

	            if not self._has_trips(stats):
	                response = f"I couldn't find exact data for '{shown_location}'. Did you mean one of these?\n\n"
	                for match_location, count in matches[:3]:
	                    response += f"• <strong>{match_location}</strong> ({count} total trips)\n"
	                response += f"\nTry asking: 'Tell me about {best_match}'"
	                return response

	            response = (f"<strong>Found results for '{best_match}'</strong> "
	                        f"(closest match to '{shown_location}'):\n\n")

	        pickup_count = stats['pickup_count']
	        dropoff_count = stats['dropoff_count']

	        if pickup_count > 0:
	            avg_pickup = stats['avg_group_size_pickup']
	            response += (f"<strong>{pickup_count} pickup trips</strong> "
	                         f"with an average group size of {avg_pickup:.1f}\n")
	            if stats['peak_hours_pickup']:
	                peak_hours = ', '.join(utils.format_time(h) for h in stats['peak_hours_pickup'])
	                response += f"Most popular pickup times: {peak_hours}\n"

	        if dropoff_count > 0:
	            avg_dropoff = stats['avg_group_size_dropoff']
	            response += (f"<strong>{dropoff_count} drop-off trips</strong> "
	                         f"with an average group size of {avg_dropoff:.1f}\n")
	            if stats['peak_hours_dropoff']:
	                peak_hours = ', '.join(utils.format_time(h) for h in stats['peak_hours_dropoff'])
	                response += f"Most popular drop-off times: {peak_hours}\n"

	        total_trips = pickup_count + dropoff_count
	        overall_trips = self.data_processor.get_quick_insights()['total_trips']
	        # Guard the share calculation against an empty dataset.
	        percentage = (total_trips / overall_trips) * 100 if overall_trips else 0.0

	        response += f"\n<strong>Insight:</strong> This location accounts for {percentage:.1f}% of all Austin trips!"

	        # The figures are not filtered by date; say so when the user asked
	        # about a specific period.
	        if any(word in original_query for word in ['last', 'this', 'month', 'week', 'yesterday', 'today']):
	            response += ("\n\n<strong>Note:</strong> This data covers our full Austin dataset. "
	                         "For specific time periods, the patterns shown represent typical "
	                         "activity for this location.")

	        return response

	    def _handle_time_patterns(self, params: Dict[str, Any]) -> str:
	        """Handle time pattern queries, optionally limited to a minimum group size."""
	        min_group_size = params.get('min_group_size', None)

	        time_data = self.data_processor.get_time_patterns(min_group_size)

	        response = "<strong>Peak Riding Times:</strong>\n\n"

	        if min_group_size:
	            response += f"<em>For groups of {min_group_size}+ riders:</em>\n\n"

	        hourly_counts = time_data['hourly_counts']
	        top_hours = sorted(hourly_counts.items(), key=lambda item: item[1], reverse=True)[:5]

	        if not top_hours:
	            # Nothing to rank (e.g. the size filter excluded every trip).
	            response += "No trips match that filter, so there are no peak times to report."
	            return response

	        response += "<strong>Busiest Hours:</strong>\n"
	        for rank, (hour, count) in enumerate(top_hours, 1):
	            time_label = utils.format_time(hour)
	            response += f"{rank}. <strong>{time_label}</strong> - {count} trips\n"

	        time_categories = time_data['time_category_counts']
	        response += "\n<strong>By Time Period:</strong>\n"
	        for period, count in sorted(time_categories.items(), key=lambda item: item[1], reverse=True):
	            response += f"• <strong>{period}:</strong> {count} trips\n"

	        peak_hour, peak_count = top_hours[0]
	        response += (f"\n<strong>Insight:</strong> {utils.format_time(peak_hour)} "
	                     f"is the absolute peak with {peak_count} trips!")

	        return response

	    def _handle_group_size(self, params: Dict[str, Any]) -> str:
	        """Handle group size queries for groups of ``group_size`` or more (default 6)."""
	        target_size = params.get('group_size', 6)

	        insights = self.data_processor.get_quick_insights()
	        group_distribution = insights['group_size_distribution']

	        response = f"<strong>Group Size Analysis ({target_size}+ passengers):</strong>\n\n"

	        large_groups = {size: count for size, count in group_distribution.items() if size >= target_size}
	        large_group_trips = sum(large_groups.values())
	        total_trips = insights['total_trips']
	        # Guard the share calculation against an empty dataset.
	        percentage = (large_group_trips / total_trips) * 100 if total_trips else 0.0

	        response += (f"• <strong>{large_group_trips} trips</strong> had {target_size}+ passengers "
	                     f"({percentage:.1f}% of all trips)\n")

	        response += f"\n<strong>Breakdown of {target_size}+ passenger groups:</strong>\n"
	        ranked = sorted(large_groups.items(), key=lambda item: item[1], reverse=True)[:8]
	        for size, count in ranked:
	            group_pct = (count / large_group_trips) * 100 if large_group_trips > 0 else 0
	            response += f"• <strong>{size} passengers:</strong> {count} trips ({group_pct:.1f}%)\n"

	        avg_size = insights['avg_group_size']
	        response += (f"\n<strong>Insight:</strong> Average group size is {avg_size:.1f} passengers "
	                     "- most rides are group experiences!")

	        return response

	    def _handle_top_locations(self, params: Dict[str, Any]) -> str:
	        """Handle top locations queries for pickups, drop-offs, or both."""
	        location_type = params.get('location_type', 'both')
	        insights = self.data_processor.get_quick_insights()

	        response = "<strong>Most Popular Locations:</strong>\n\n"

	        top_pickups = list(insights['top_pickups']) if location_type in ('pickup', 'both') else []

	        if location_type in ('pickup', 'both'):
	            response += "<strong>Top Pickup Spots:</strong>\n"
	            for rank, (location, count) in enumerate(top_pickups[:8], 1):
	                response += f"{rank}. <strong>{location}</strong> - {count} pickups\n"

	        if location_type in ('dropoff', 'both'):
	            # A blank line separates the two lists when both are shown.
	            if location_type == 'both':
	                response += "\n"
	            response += "<strong>Top Drop-off Destinations:</strong>\n"
	            for rank, (location, count) in enumerate(list(insights['top_dropoffs'])[:8], 1):
	                response += f"{rank}. <strong>{location}</strong> - {count} drop-offs\n"

	        # Only pickup-inclusive answers get the closing insight; skip it when
	        # there is no pickup data rather than failing on an empty list.
	        if top_pickups:
	            top_pickup = top_pickups[0]
	            response += (f"\n<strong>Insight:</strong> {top_pickup[0]} dominates pickups "
	                         f"with {top_pickup[1]} trips!")

	        return response

	    def _handle_demographics(self, params: Dict[str, Any]) -> str:
	        """Handle demographics queries.

	        Explains that rider age data is not available and lists whichever of
	        a fixed set of venues appear among the top drop-offs.
	        """
	        age_range = params.get('age_range', (18, 24))

	        response = f"<strong>Demographics Analysis ({age_range[0]}-{age_range[1]} year olds):</strong>\n\n"
	        response += "I'd love to help with demographic analysis, but I don't currently have access to rider age data in this dataset. "
	        response += "However, I can tell you about the locations and times that are popular with different group sizes!\n\n"

	        insights = self.data_processor.get_quick_insights()
	        response += "<strong>Popular spots that might appeal to younger riders:</strong>\n"

	        entertainment_spots = ['The Aquarium on 6th', 'Wiggle Room', "Shakespeare's", 'LUNA Rooftop', 'Green Light Social']
	        # Materialize once: the list is scanned again for every venue.
	        top_dropoffs = list(insights['top_dropoffs'])

	        for spot in entertainment_spots:
	            spot_lower = spot.lower()
	            for location, count in top_dropoffs:
	                if spot_lower in location.lower():
	                    response += f"• <strong>{location}</strong> - {count} drop-offs\n"
	                    break  # report only the first drop-off entry per venue

	        response += "\n<strong>Insight:</strong> Late night hours (10 PM - 1 AM) see the highest activity, which often correlates with younger demographics!"

	        return response

	    def _handle_general_stats(self) -> str:
	        """Handle general statistics queries with an overview of the dataset."""
	        insights = self.data_processor.get_quick_insights()
	        peak_label = utils.format_time(insights['peak_hour'])
	        top_pickups = list(insights['top_pickups'])
	        top_dropoffs = list(insights['top_dropoffs'])
	        large_pct = insights['large_groups_pct']

	        response = "<strong>Fetii Austin Overview:</strong>\n\n"

	        response += f"<strong>Total Trips Analyzed:</strong> {insights['total_trips']:,}\n"
	        response += f"<strong>Average Group Size:</strong> {insights['avg_group_size']:.1f} passengers\n"
	        response += f"<strong>Peak Hour:</strong> {peak_label}\n"
	        response += f"<strong>Large Groups (6+):</strong> {insights['large_groups_count']} trips ({large_pct:.1f}%)\n\n"

	        response += "<strong>Top Hotspots:</strong>\n"
	        if top_pickups:
	            top_pickup = top_pickups[0]
	            response += f"• Most popular pickup: <strong>{top_pickup[0]}</strong> ({top_pickup[1]} trips)\n"
	        if top_dropoffs:
	            top_dropoff = top_dropoffs[0]
	            response += f"• Most popular destination: <strong>{top_dropoff[0]}</strong> ({top_dropoff[1]} trips)\n"
	        response += "\n"

	        size_counts = list(insights['group_size_distribution'].items())
	        if size_counts:
	            most_common_size = max(size_counts, key=lambda item: item[1])
	            response += (f"<strong>Most Common Group Size:</strong> {most_common_size[0]} passengers "
	                         f"({most_common_size[1]} trips)\n\n")

	        # Derive the highlights from the same figures shown above so the two
	        # sections cannot contradict each other.
	        response += "<strong>Key Insights:</strong>\n"
	        response += f"• {large_pct:.0f}% of all rides are large groups (6+ people)\n"
	        response += f"• Peak activity happens around {peak_label}\n"
	        if top_pickups:
	            response += f"• {top_pickups[0][0]} dominates as the top pickup location\n"
	        response += "• Entertainment venues are the most popular destinations"

	        return response

	    def _handle_fallback(self, query: str) -> str:
	        """Return a help message listing the kinds of question that are supported.

	        ``query`` is accepted for interface symmetry and is not used.
	        """
	        return (
	            "I'm not sure I understood that question perfectly. Here's what I can help you with:\n\n"
	            "<strong>Location Questions:</strong>\n"
	            "• 'How many groups went to [location]?'\n"
	            "• 'Tell me about [location]'\n"
	            "• 'Top pickup/drop-off spots'\n\n"
	            "<strong>Time Questions:</strong>\n"
	            "• 'When do large groups typically ride?'\n"
	            "• 'Peak hours for groups of 6+'\n"
	            "• 'Busiest times'\n\n"
	            "<strong>Group Size Questions:</strong>\n"
	            "• 'How many trips had 10+ passengers?'\n"
	            "• 'Large group patterns'\n"
	            "• 'Average group size'\n\n"
	            "Would you like to try asking one of these types of questions?"
	        )

	    def get_conversation_history(self) -> List[Dict[str, str]]:
	        """Get the conversation history (the live list, not a copy)."""
	        return self.conversation_history

	    def clear_history(self):
	        """Clear the conversation history."""
	        self.conversation_history = []