Spaces:

Shinegupta
/

Fetti_AI

Runtime error

App Files Files Community

Fetti_AI / data_processor.py

Shinegupta

Upload 7 files

a385675 verified 5 months ago

raw

history blame contribute delete

10.7 kB

	import pandas as pd
	import numpy as np
	from typing import Dict, Any

	class DataProcessor:
	    """
	    Handles all data processing and analysis for Fetii rideshare data.

	    On construction the trip CSV is read into ``self.df`` (falling back to
	    generated sample trips when the file does not exist), derived columns
	    are added, and a summary dictionary is stored in ``self.insights``.

	    Input columns read by this class: 'Total Passengers',
	    'Trip Date and Time', 'Pick Up Address' and 'Drop Off Address'.
	    Derived columns added to ``self.df``: 'pickup_main', 'dropoff_main',
	    'datetime', 'hour', 'day_of_week', 'date', 'time_category',
	    'group_category', 'is_entertainment' and 'is_campus'.
	    """

	    # Trips with at least this many passengers are counted as "large groups"
	    # in the insights summary.
	    LARGE_GROUP_MIN_PASSENGERS = 6

	    # Number of trips generated when the CSV file is missing.
	    SAMPLE_TRIP_COUNT = 2000

	    # Lower-case substrings used to flag drop-off / pickup location names.
	    ENTERTAINMENT_KEYWORDS = ('bar', 'club', 'lounge', 'aquarium', 'rooftop', 'social', 'pub')
	    CAMPUS_KEYWORDS = ('campus', 'university', 'drag', 'west campus')

	    def __init__(self, csv_file_path: str = "fetii_data.csv"):
	        """Initialize the data processor with the CSV file and load it immediately."""
	        self.csv_file_path = csv_file_path
	        self.df = None
	        self.insights = {}
	        self.load_and_process_data()

	    def load_and_process_data(self):
	        """
	        Load the trip CSV and run the processing pipeline.

	        If the file does not exist, a warning is printed and generated
	        sample data is used instead. Any other read or processing error
	        propagates to the caller.
	        """
	        try:
	            self.df = pd.read_csv(self.csv_file_path)
	        except FileNotFoundError:
	            print("⚠️ CSV file not found. Creating sample data for demo...")
	            self._create_sample_data()
	            return

	        self._run_pipeline()
	        print(f"✅ Successfully loaded {len(self.df)} trips from Austin")

	    def _run_pipeline(self):
	        """Run cleaning, feature extraction and insight calculation on ``self.df``."""
	        self._clean_data()
	        self._extract_temporal_features()
	        self._extract_location_features()
	        self._calculate_insights()

	    @staticmethod
	    def _normalize_weights(weights):
	        """Return ``weights`` as a float array rescaled so that it sums to 1."""
	        scaled = np.asarray(weights, dtype=float)
	        return scaled / scaled.sum()

	    def _create_sample_data(self):
	        """
	        Create sample trips and process them like real data.

	        A local, fixed-seed random generator is used so the sample is
	        reproducible without reseeding NumPy's global random state.
	        """
	        rng = np.random.default_rng(42)
	        trip_count = self.SAMPLE_TRIP_COUNT

	        pickup_names = [
	            'West Campus', 'The Drag', 'Market District', 'Sixth Street', 'East End',
	            'Downtown', 'Govalle', 'Hancock', 'South Lamar', 'Warehouse District',
	        ]
	        dropoff_names = [
	            'The Aquarium on 6th', 'Wiggle Room', "Shakespeare's", 'Mayfair Austin',
	            'Latchkey', '6013 Loyola Ln', "Buford's", 'Darrell K Royal Texas Memorial Stadium',
	            'LUNA Rooftop', 'University of Texas KA house', 'Green Light Social', "The Cat's Pajamas",
	        ]

	        # The weight tables are rounded and do not sum to exactly 1 (1.002 and
	        # 1.05). NumPy rejects such probability vectors, so they are rescaled
	        # before sampling; the relative proportions are unchanged.
	        passenger_choices = [14, 8, 7, 10, 9, 12, 11, 13, 6, 5, 4, 3, 2, 1]
	        passenger_weights = self._normalize_weights(
	            [0.173, 0.128, 0.120, 0.115, 0.113, 0.087, 0.085, 0.077, 0.063, 0.028, 0.007, 0.004, 0.001, 0.001])

	        hour_choices = [22, 23, 21, 19, 0, 20, 18, 1, 2, 17, 16, 3]
	        hour_weights = self._normalize_weights(
	            [0.25, 0.23, 0.19, 0.11, 0.08, 0.06, 0.05, 0.03, 0.02, 0.01, 0.01, 0.01])

	        passengers = rng.choice(passenger_choices, size=trip_count, p=passenger_weights)
	        hours = rng.choice(hour_choices, size=trip_count, p=hour_weights).tolist()
	        days = rng.integers(1, 31, size=trip_count).tolist()  # 1..30 inclusive
	        minutes = rng.integers(0, 60, size=trip_count).tolist()
	        pickups = rng.choice(pickup_names, size=trip_count).tolist()
	        dropoffs = rng.choice(dropoff_names, size=trip_count).tolist()

	        self.df = pd.DataFrame({
	            'Trip ID': 734889 - np.arange(trip_count),
	            'Booking User ID': rng.integers(10000, 999999, size=trip_count),
	            'Pick Up Latitude': rng.normal(30.2672, 0.02, size=trip_count),
	            'Pick Up Longitude': rng.normal(-97.7431, 0.02, size=trip_count),
	            'Drop Off Latitude': rng.normal(30.2672, 0.02, size=trip_count),
	            'Drop Off Longitude': rng.normal(-97.7431, 0.02, size=trip_count),
	            'Pick Up Address': [f"{name}, Austin, TX" for name in pickups],
	            'Drop Off Address': [f"{name}, Austin, TX" for name in dropoffs],
	            'Trip Date and Time': [
	                f"9/{day}/25 {hour}:{minute:02d}"
	                for day, hour, minute in zip(days, hours, minutes)
	            ],
	            'Total Passengers': passengers,
	        })
	        self._run_pipeline()

	    def _clean_data(self):
	        """
	        Clean and standardize the data.

	        Rows without a passenger count or trip timestamp are dropped, the
	        passenger count is cast to int, and the leading component of each
	        address is stored in 'pickup_main' / 'dropoff_main'.
	        """
	        # Work on an explicit copy so the column assignments below never
	        # target a view of the frame returned by read_csv.
	        cleaned = self.df.dropna(subset=['Total Passengers', 'Trip Date and Time']).copy()

	        cleaned['Total Passengers'] = cleaned['Total Passengers'].astype(int)
	        cleaned['pickup_main'] = cleaned['Pick Up Address'].apply(self._extract_main_location)
	        cleaned['dropoff_main'] = cleaned['Drop Off Address'].apply(self._extract_main_location)

	        self.df = cleaned

	    def _extract_main_location(self, address: str) -> str:
	        """
	        Extract the main location name from an address.

	        Returns the text before the first comma with surrounding whitespace
	        removed, or "Unknown" when the address is missing.
	        """
	        if pd.isna(address):
	            return "Unknown"
	        # str() keeps non-string cells (e.g. a purely numeric address) from
	        # raising AttributeError.
	        return str(address).split(',')[0].strip()

	    def _extract_temporal_features(self):
	        """
	        Extract temporal features from trip data.

	        Parses 'Trip Date and Time' with the format month/day/two-digit-year
	        hour:minute and derives 'hour', 'day_of_week', 'date' and
	        'time_category'.
	        """
	        self.df['datetime'] = pd.to_datetime(self.df['Trip Date and Time'], format='%m/%d/%y %H:%M')
	        stamps = self.df['datetime'].dt
	        self.df['hour'] = stamps.hour
	        self.df['day_of_week'] = stamps.day_name()
	        self.df['date'] = stamps.date

	        self.df['time_category'] = self.df['hour'].apply(self._categorize_time)

	    def _categorize_time(self, hour: int) -> str:
	        """
	        Categorize hour into time periods.

	        6-11 Morning, 12-16 Afternoon, 17-20 Evening, 21-23 Night and
	        everything else (0-5) Late Night.
	        """
	        if 6 <= hour < 12:
	            return "Morning"
	        if 12 <= hour < 17:
	            return "Afternoon"
	        if 17 <= hour < 21:
	            return "Evening"
	        if 21 <= hour <= 23:
	            return "Night"
	        return "Late Night"

	    def _extract_location_features(self):
	        """Add 'group_category', 'is_entertainment' and 'is_campus' columns."""
	        self.df['group_category'] = self.df['Total Passengers'].apply(self._categorize_group_size)

	        self.df['is_entertainment'] = self.df['dropoff_main'].apply(self._is_entertainment_venue)
	        self.df['is_campus'] = self.df['pickup_main'].apply(self._is_campus_location)

	    def _categorize_group_size(self, passengers: int) -> str:
	        """Categorize group size into Small, Medium, Large or Extra Large."""
	        if passengers <= 4:
	            return "Small (1-4)"
	        if passengers <= 8:
	            return "Medium (5-8)"
	        if passengers <= 12:
	            return "Large (9-12)"
	        return "Extra Large (13+)"

	    def _is_entertainment_venue(self, location: str) -> bool:
	        """Check whether the location name contains an entertainment keyword (case-insensitive substring match)."""
	        lowered = location.lower()
	        return any(keyword in lowered for keyword in self.ENTERTAINMENT_KEYWORDS)

	    def _is_campus_location(self, location: str) -> bool:
	        """Check whether the location name contains a campus keyword (case-insensitive substring match)."""
	        lowered = location.lower()
	        return any(keyword in lowered for keyword in self.CAMPUS_KEYWORDS)

	    def _calculate_insights(self):
	        """
	        Calculate key insights from the data and store them in ``self.insights``.

	        With no trips, 'avg_group_size' and 'large_groups_pct' are 0.0 and
	        'peak_hour' is None instead of raising.
	        """
	        trips = self.df
	        total_trips = len(trips)
	        passengers = trips['Total Passengers']
	        large_count = int((passengers >= self.LARGE_GROUP_MIN_PASSENGERS).sum())
	        hour_modes = trips['hour'].mode()

	        self.insights = {
	            'total_trips': total_trips,
	            'avg_group_size': float(passengers.mean()) if total_trips else 0.0,
	            # mode() is sorted ascending, so ties resolve to the earliest hour.
	            'peak_hour': int(hour_modes.iloc[0]) if not hour_modes.empty else None,
	            'large_groups_count': large_count,
	            'large_groups_pct': (large_count / total_trips) * 100 if total_trips else 0.0,
	            'top_pickups': list(trips['pickup_main'].value_counts().head(10).items()),
	            'top_dropoffs': list(trips['dropoff_main'].value_counts().head(10).items()),
	            'hourly_distribution': trips['hour'].value_counts().sort_index().to_dict(),
	            'group_size_distribution': passengers.value_counts().sort_index().to_dict()
	        }

	    def get_quick_insights(self) -> Dict[str, Any]:
	        """Get quick insights for dashboard (the dictionary built by ``_calculate_insights``)."""
	        return self.insights

	    @staticmethod
	    def _location_mask(names, term):
	        """
	        Return a boolean mask of ``names`` entries containing ``term`` (case-insensitive).

	        ``term`` is treated as a regular expression when it compiles as one.
	        Terms that are not valid patterns, such as a name with an unbalanced
	        parenthesis, are matched literally instead of raising.
	        """
	        import re  # local import: this module's header does not import re

	        try:
	            re.compile(term)
	            as_regex = True
	        except re.error:
	            as_regex = False
	        return names.str.contains(term, case=False, na=False, regex=as_regex)

	    @staticmethod
	    def _as_date(value):
	        """Convert a date, datetime, Timestamp or date string to ``datetime.date``."""
	        return pd.Timestamp(value).date()

	    def query_data(self, query_params: Dict[str, Any]) -> pd.DataFrame:
	        """
	        Query the data based on parameters.

	        Supported keys (all optional, combined with AND):
	          - 'pickup_location' / 'dropoff_location': case-insensitive match
	            against 'pickup_main' / 'dropoff_main'.
	          - 'hour_range': (start_hour, end_hour), inclusive. When start is
	            greater than end the range wraps past midnight, e.g. (22, 2)
	            selects hours 22, 23, 0, 1 and 2.
	          - 'min_passengers' / 'max_passengers': inclusive bounds.
	          - 'date_range': (start_date, end_date), inclusive; each bound may
	            be a date, datetime, Timestamp or date string.

	        Returns a new DataFrame; ``self.df`` is not modified.
	        """
	        filtered_df = self.df.copy()

	        if 'pickup_location' in query_params:
	            mask = self._location_mask(filtered_df['pickup_main'], query_params['pickup_location'])
	            filtered_df = filtered_df[mask]

	        if 'dropoff_location' in query_params:
	            mask = self._location_mask(filtered_df['dropoff_main'], query_params['dropoff_location'])
	            filtered_df = filtered_df[mask]

	        if 'hour_range' in query_params:
	            start_hour, end_hour = query_params['hour_range']
	            hours = filtered_df['hour']
	            if start_hour <= end_hour:
	                mask = (hours >= start_hour) & (hours <= end_hour)
	            else:
	                # Range crosses midnight: keep the late-evening and the
	                # early-morning side.
	                mask = (hours >= start_hour) | (hours <= end_hour)
	            filtered_df = filtered_df[mask]

	        if 'min_passengers' in query_params:
	            filtered_df = filtered_df[filtered_df['Total Passengers'] >= query_params['min_passengers']]

	        if 'max_passengers' in query_params:
	            filtered_df = filtered_df[filtered_df['Total Passengers'] <= query_params['max_passengers']]

	        if 'date_range' in query_params:
	            start_date, end_date = query_params['date_range']
	            # The 'date' column holds datetime.date objects, so normalise the
	            # bounds to the same type before comparing.
	            first_day = self._as_date(start_date)
	            last_day = self._as_date(end_date)
	            filtered_df = filtered_df[
	                (filtered_df['date'] >= first_day) & (filtered_df['date'] <= last_day)]

	        return filtered_df

	    def _location_side_stats(self, column: str, location: str):
	        """Return (trip count, mean group size, peak hours) for trips whose ``column`` matches ``location``."""
	        matches = self.df[self._location_mask(self.df[column], location)]
	        if matches.empty:
	            return 0, 0, []
	        return len(matches), matches['Total Passengers'].mean(), matches['hour'].mode().tolist()

	    def get_location_stats(self, location: str, location_type: str = 'both') -> Dict[str, Any]:
	        """
	        Get statistics for a specific location.

	        ``location_type`` selects which side is searched: 'pickup',
	        'dropoff' or 'both'. A side that is not searched, or that has no
	        matching trips, reports a count of 0, an average of 0 and an empty
	        peak-hour list.
	        """
	        no_data = (0, 0, [])

	        if location_type in ('pickup', 'both'):
	            pickup_count, pickup_avg, pickup_peaks = self._location_side_stats('pickup_main', location)
	        else:
	            pickup_count, pickup_avg, pickup_peaks = no_data

	        if location_type in ('dropoff', 'both'):
	            dropoff_count, dropoff_avg, dropoff_peaks = self._location_side_stats('dropoff_main', location)
	        else:
	            dropoff_count, dropoff_avg, dropoff_peaks = no_data

	        return {
	            'pickup_count': pickup_count,
	            'dropoff_count': dropoff_count,
	            'avg_group_size_pickup': pickup_avg,
	            'avg_group_size_dropoff': dropoff_avg,
	            'peak_hours_pickup': pickup_peaks,
	            'peak_hours_dropoff': dropoff_peaks
	        }

	    def get_time_patterns(self, group_size_filter: int = None) -> Dict[str, Any]:
	        """
	        Get time-based patterns.

	        When ``group_size_filter`` is given, only trips with at least that
	        many passengers are counted. Returns trip counts per hour, per day
	        of week and per time category.
	        """
	        data = self.df

	        if group_size_filter is not None:
	            data = data[data['Total Passengers'] >= group_size_filter]

	        return {
	            'hourly_counts': data['hour'].value_counts().sort_index().to_dict(),
	            'daily_counts': data['day_of_week'].value_counts().to_dict(),
	            'time_category_counts': data['time_category'].value_counts().to_dict()
	        }