Spaces:

Shinegupta
/

Fetii_AI_Assistant

Configuration error

File size: 10,673 Bytes

514f898

import pandas as pd
import numpy as np
from typing import Dict, Any

class DataProcessor:
    """

    Handles all data processing and analysis for Fetii rideshare data.

    """
    
    def __init__(self, csv_file_path: str = "fetii_data.csv"):
        """Initialize the data processor with the CSV file."""
        self.csv_file_path = csv_file_path
        self.df = None
        self.insights = {}
        self.load_and_process_data()
    
    def load_and_process_data(self):
        """Load and process the Fetii trip data."""
        try:
            self.df = pd.read_csv(self.csv_file_path)
            
            self._clean_data()
            self._extract_temporal_features()
            self._extract_location_features()
            self._calculate_insights()
            
            print(f"✅ Successfully loaded {len(self.df)} trips from Austin")
            
        except FileNotFoundError:
            print("⚠️ CSV file not found. Creating sample data for demo...")
            self._create_sample_data()
    
    def _create_sample_data(self):
        """Create sample data based on the analysis patterns."""
        np.random.seed(42)
        
        locations = {
            'pickup': ['West Campus', 'The Drag', 'Market District', 'Sixth Street', 'East End', 
                      'Downtown', 'Govalle', 'Hancock', 'South Lamar', 'Warehouse District'],
            'dropoff': ['The Aquarium on 6th', 'Wiggle Room', "Shakespeare's", 'Mayfair Austin', 
                       'Latchkey', '6013 Loyola Ln', "Buford's", 'Darrell K Royal Texas Memorial Stadium',
                       'LUNA Rooftop', 'University of Texas KA house', 'Green Light Social', "The Cat's Pajamas"]
        }
        
        passenger_choices = [14, 8, 7, 10, 9, 12, 11, 13, 6, 5, 4, 3, 2, 1]
        passenger_weights = [0.173, 0.128, 0.120, 0.115, 0.113, 0.087, 0.085, 0.077, 0.063, 0.028, 0.007, 0.004, 0.001, 0.001]
        
        hour_choices = [22, 23, 21, 19, 0, 20, 18, 1, 2, 17, 16, 3]
        hour_weights = [0.25, 0.23, 0.19, 0.11, 0.08, 0.06, 0.05, 0.03, 0.02, 0.01, 0.01, 0.01]
        
        sample_data = []
        for i in range(2000):
            passengers = np.random.choice(passenger_choices, p=passenger_weights)
            hour = np.random.choice(hour_choices, p=hour_weights)
            
            pickup_lat = np.random.normal(30.2672, 0.02)
            pickup_lng = np.random.normal(-97.7431, 0.02)
            dropoff_lat = np.random.normal(30.2672, 0.02)
            dropoff_lng = np.random.normal(-97.7431, 0.02)
            
            day = np.random.randint(1, 31)
            minute = np.random.randint(0, 60)
            
            sample_data.append({
                'Trip ID': 734889 - i,
                'Booking User ID': np.random.randint(10000, 999999),
                'Pick Up Latitude': pickup_lat,
                'Pick Up Longitude': pickup_lng,
                'Drop Off Latitude': dropoff_lat,
                'Drop Off Longitude': dropoff_lng,
                'Pick Up Address': f"{np.random.choice(locations['pickup'])}, Austin, TX",
                'Drop Off Address': f"{np.random.choice(locations['dropoff'])}, Austin, TX",
                'Trip Date and Time': f"9/{day}/25 {hour}:{minute:02d}",
                'Total Passengers': passengers
            })
        
        self.df = pd.DataFrame(sample_data)
        self._clean_data()
        self._extract_temporal_features()
        self._extract_location_features()
        self._calculate_insights()
    
    def _clean_data(self):
        """Clean and standardize the data."""
        self.df = self.df.dropna(subset=['Total Passengers', 'Trip Date and Time'])
        
        self.df['Total Passengers'] = self.df['Total Passengers'].astype(int)
        
        self.df['pickup_main'] = self.df['Pick Up Address'].apply(self._extract_main_location)
        self.df['dropoff_main'] = self.df['Drop Off Address'].apply(self._extract_main_location)
    
    def _extract_main_location(self, address: str) -> str:
        """Extract the main location name from an address."""
        if pd.isna(address):
            return "Unknown"
        return address.split(',')[0].strip()
    
    def _extract_temporal_features(self):
        """Extract temporal features from trip data."""
        self.df['datetime'] = pd.to_datetime(self.df['Trip Date and Time'], format='%m/%d/%y %H:%M')
        self.df['hour'] = self.df['datetime'].dt.hour
        self.df['day_of_week'] = self.df['datetime'].dt.day_name()
        self.df['date'] = self.df['datetime'].dt.date
        
        self.df['time_category'] = self.df['hour'].apply(self._categorize_time)
    
    def _categorize_time(self, hour: int) -> str:
        """Categorize hour into time periods."""
        if 6 <= hour < 12:
            return "Morning"
        elif 12 <= hour < 17:
            return "Afternoon"
        elif 17 <= hour < 21:
            return "Evening"
        elif 21 <= hour <= 23:
            return "Night"
        else:
            return "Late Night"
    
    def _extract_location_features(self):
        """Extract location-based features."""
        self.df['group_category'] = self.df['Total Passengers'].apply(self._categorize_group_size)
        
        self.df['is_entertainment'] = self.df['dropoff_main'].apply(self._is_entertainment_venue)
        self.df['is_campus'] = self.df['pickup_main'].apply(self._is_campus_location)
    
    def _categorize_group_size(self, passengers: int) -> str:
        """Categorize group size."""
        if passengers <= 4:
            return "Small (1-4)"
        elif passengers <= 8:
            return "Medium (5-8)"
        elif passengers <= 12:
            return "Large (9-12)"
        else:
            return "Extra Large (13+)"
    
    def _is_entertainment_venue(self, location: str) -> bool:
        """Check if location is an entertainment venue."""
        entertainment_keywords = ['bar', 'club', 'lounge', 'aquarium', 'rooftop', 'social', 'pub']
        return any(keyword in location.lower() for keyword in entertainment_keywords)
    
    def _is_campus_location(self, location: str) -> bool:
        """Check if location is campus-related."""
        campus_keywords = ['campus', 'university', 'drag', 'west campus']
        return any(keyword in location.lower() for keyword in campus_keywords)
    
    def _calculate_insights(self):
        """Calculate key insights from the data."""
        self.insights = {
            'total_trips': len(self.df),
            'avg_group_size': self.df['Total Passengers'].mean(),
            'peak_hour': self.df['hour'].mode().iloc[0],
            'large_groups_count': len(self.df[self.df['Total Passengers'] >= 6]),
            'large_groups_pct': (len(self.df[self.df['Total Passengers'] >= 6]) / len(self.df)) * 100,
            'top_pickups': list(self.df['pickup_main'].value_counts().head(10).items()),
            'top_dropoffs': list(self.df['dropoff_main'].value_counts().head(10).items()),
            'hourly_distribution': self.df['hour'].value_counts().sort_index().to_dict(),
            'group_size_distribution': self.df['Total Passengers'].value_counts().sort_index().to_dict()
        }
    
    def get_quick_insights(self) -> Dict[str, Any]:
        """Get quick insights for dashboard."""
        return self.insights
    
    def query_data(self, query_params: Dict[str, Any]) -> pd.DataFrame:
        """Query the data based on parameters."""
        filtered_df = self.df.copy()
        
        if 'pickup_location' in query_params:
            filtered_df = filtered_df[filtered_df['pickup_main'].str.contains(
                query_params['pickup_location'], case=False, na=False)]
        
        if 'dropoff_location' in query_params:
            filtered_df = filtered_df[filtered_df['dropoff_main'].str.contains(
                query_params['dropoff_location'], case=False, na=False)]
        
        if 'hour_range' in query_params:
            start_hour, end_hour = query_params['hour_range']
            filtered_df = filtered_df[
                (filtered_df['hour'] >= start_hour) & (filtered_df['hour'] <= end_hour)]
        
        if 'min_passengers' in query_params:
            filtered_df = filtered_df[filtered_df['Total Passengers'] >= query_params['min_passengers']]
        
        if 'max_passengers' in query_params:
            filtered_df = filtered_df[filtered_df['Total Passengers'] <= query_params['max_passengers']]
        
        if 'date_range' in query_params:
            start_date, end_date = query_params['date_range']
            filtered_df = filtered_df[
                (filtered_df['date'] >= start_date) & (filtered_df['date'] <= end_date)]
        
        return filtered_df
    
    def get_location_stats(self, location: str, location_type: str = 'both') -> Dict[str, Any]:
        """Get statistics for a specific location."""
        if location_type in ['pickup', 'both']:
            pickup_data = self.df[self.df['pickup_main'].str.contains(location, case=False, na=False)]
        else:
            pickup_data = pd.DataFrame()
        
        if location_type in ['dropoff', 'both']:
            dropoff_data = self.df[self.df['dropoff_main'].str.contains(location, case=False, na=False)]
        else:
            dropoff_data = pd.DataFrame()
        
        return {
            'pickup_count': len(pickup_data),
            'dropoff_count': len(dropoff_data),
            'avg_group_size_pickup': pickup_data['Total Passengers'].mean() if len(pickup_data) > 0 else 0,
            'avg_group_size_dropoff': dropoff_data['Total Passengers'].mean() if len(dropoff_data) > 0 else 0,
            'peak_hours_pickup': pickup_data['hour'].mode().tolist() if len(pickup_data) > 0 else [],
            'peak_hours_dropoff': dropoff_data['hour'].mode().tolist() if len(dropoff_data) > 0 else []
        }
    
    def get_time_patterns(self, group_size_filter: int = None) -> Dict[str, Any]:
        """Get time-based patterns."""
        data = self.df.copy()
        
        if group_size_filter:
            data = data[data['Total Passengers'] >= group_size_filter]
        
        return {
            'hourly_counts': data['hour'].value_counts().sort_index().to_dict(),
            'daily_counts': data['day_of_week'].value_counts().to_dict(),
            'time_category_counts': data['time_category'].value_counts().to_dict()
        }