File size: 10,673 Bytes
514f898
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
import pandas as pd
import numpy as np
from typing import Dict, Any

class DataProcessor:
    """

    Handles all data processing and analysis for Fetii rideshare data.

    """
    
    def __init__(self, csv_file_path: str = "fetii_data.csv"):
        """Initialize the data processor with the CSV file."""
        self.csv_file_path = csv_file_path
        self.df = None
        self.insights = {}
        self.load_and_process_data()
    
    def load_and_process_data(self):
        """Load and process the Fetii trip data."""
        try:
            self.df = pd.read_csv(self.csv_file_path)
            
            self._clean_data()
            self._extract_temporal_features()
            self._extract_location_features()
            self._calculate_insights()
            
            print(f"✅ Successfully loaded {len(self.df)} trips from Austin")
            
        except FileNotFoundError:
            print("⚠️ CSV file not found. Creating sample data for demo...")
            self._create_sample_data()
    
    def _create_sample_data(self):
        """Create sample data based on the analysis patterns."""
        np.random.seed(42)
        
        locations = {
            'pickup': ['West Campus', 'The Drag', 'Market District', 'Sixth Street', 'East End', 
                      'Downtown', 'Govalle', 'Hancock', 'South Lamar', 'Warehouse District'],
            'dropoff': ['The Aquarium on 6th', 'Wiggle Room', "Shakespeare's", 'Mayfair Austin', 
                       'Latchkey', '6013 Loyola Ln', "Buford's", 'Darrell K Royal Texas Memorial Stadium',
                       'LUNA Rooftop', 'University of Texas KA house', 'Green Light Social', "The Cat's Pajamas"]
        }
        
        passenger_choices = [14, 8, 7, 10, 9, 12, 11, 13, 6, 5, 4, 3, 2, 1]
        passenger_weights = [0.173, 0.128, 0.120, 0.115, 0.113, 0.087, 0.085, 0.077, 0.063, 0.028, 0.007, 0.004, 0.001, 0.001]
        
        hour_choices = [22, 23, 21, 19, 0, 20, 18, 1, 2, 17, 16, 3]
        hour_weights = [0.25, 0.23, 0.19, 0.11, 0.08, 0.06, 0.05, 0.03, 0.02, 0.01, 0.01, 0.01]
        
        sample_data = []
        for i in range(2000):
            passengers = np.random.choice(passenger_choices, p=passenger_weights)
            hour = np.random.choice(hour_choices, p=hour_weights)
            
            pickup_lat = np.random.normal(30.2672, 0.02)
            pickup_lng = np.random.normal(-97.7431, 0.02)
            dropoff_lat = np.random.normal(30.2672, 0.02)
            dropoff_lng = np.random.normal(-97.7431, 0.02)
            
            day = np.random.randint(1, 31)
            minute = np.random.randint(0, 60)
            
            sample_data.append({
                'Trip ID': 734889 - i,
                'Booking User ID': np.random.randint(10000, 999999),
                'Pick Up Latitude': pickup_lat,
                'Pick Up Longitude': pickup_lng,
                'Drop Off Latitude': dropoff_lat,
                'Drop Off Longitude': dropoff_lng,
                'Pick Up Address': f"{np.random.choice(locations['pickup'])}, Austin, TX",
                'Drop Off Address': f"{np.random.choice(locations['dropoff'])}, Austin, TX",
                'Trip Date and Time': f"9/{day}/25 {hour}:{minute:02d}",
                'Total Passengers': passengers
            })
        
        self.df = pd.DataFrame(sample_data)
        self._clean_data()
        self._extract_temporal_features()
        self._extract_location_features()
        self._calculate_insights()
    
    def _clean_data(self):
        """Clean and standardize the data."""
        self.df = self.df.dropna(subset=['Total Passengers', 'Trip Date and Time'])
        
        self.df['Total Passengers'] = self.df['Total Passengers'].astype(int)
        
        self.df['pickup_main'] = self.df['Pick Up Address'].apply(self._extract_main_location)
        self.df['dropoff_main'] = self.df['Drop Off Address'].apply(self._extract_main_location)
    
    def _extract_main_location(self, address: str) -> str:
        """Extract the main location name from an address."""
        if pd.isna(address):
            return "Unknown"
        return address.split(',')[0].strip()
    
    def _extract_temporal_features(self):
        """Extract temporal features from trip data."""
        self.df['datetime'] = pd.to_datetime(self.df['Trip Date and Time'], format='%m/%d/%y %H:%M')
        self.df['hour'] = self.df['datetime'].dt.hour
        self.df['day_of_week'] = self.df['datetime'].dt.day_name()
        self.df['date'] = self.df['datetime'].dt.date
        
        self.df['time_category'] = self.df['hour'].apply(self._categorize_time)
    
    def _categorize_time(self, hour: int) -> str:
        """Categorize hour into time periods."""
        if 6 <= hour < 12:
            return "Morning"
        elif 12 <= hour < 17:
            return "Afternoon"
        elif 17 <= hour < 21:
            return "Evening"
        elif 21 <= hour <= 23:
            return "Night"
        else:
            return "Late Night"
    
    def _extract_location_features(self):
        """Extract location-based features."""
        self.df['group_category'] = self.df['Total Passengers'].apply(self._categorize_group_size)
        
        self.df['is_entertainment'] = self.df['dropoff_main'].apply(self._is_entertainment_venue)
        self.df['is_campus'] = self.df['pickup_main'].apply(self._is_campus_location)
    
    def _categorize_group_size(self, passengers: int) -> str:
        """Categorize group size."""
        if passengers <= 4:
            return "Small (1-4)"
        elif passengers <= 8:
            return "Medium (5-8)"
        elif passengers <= 12:
            return "Large (9-12)"
        else:
            return "Extra Large (13+)"
    
    def _is_entertainment_venue(self, location: str) -> bool:
        """Check if location is an entertainment venue."""
        entertainment_keywords = ['bar', 'club', 'lounge', 'aquarium', 'rooftop', 'social', 'pub']
        return any(keyword in location.lower() for keyword in entertainment_keywords)
    
    def _is_campus_location(self, location: str) -> bool:
        """Check if location is campus-related."""
        campus_keywords = ['campus', 'university', 'drag', 'west campus']
        return any(keyword in location.lower() for keyword in campus_keywords)
    
    def _calculate_insights(self):
        """Calculate key insights from the data."""
        self.insights = {
            'total_trips': len(self.df),
            'avg_group_size': self.df['Total Passengers'].mean(),
            'peak_hour': self.df['hour'].mode().iloc[0],
            'large_groups_count': len(self.df[self.df['Total Passengers'] >= 6]),
            'large_groups_pct': (len(self.df[self.df['Total Passengers'] >= 6]) / len(self.df)) * 100,
            'top_pickups': list(self.df['pickup_main'].value_counts().head(10).items()),
            'top_dropoffs': list(self.df['dropoff_main'].value_counts().head(10).items()),
            'hourly_distribution': self.df['hour'].value_counts().sort_index().to_dict(),
            'group_size_distribution': self.df['Total Passengers'].value_counts().sort_index().to_dict()
        }
    
    def get_quick_insights(self) -> Dict[str, Any]:
        """Get quick insights for dashboard."""
        return self.insights
    
    def query_data(self, query_params: Dict[str, Any]) -> pd.DataFrame:
        """Query the data based on parameters."""
        filtered_df = self.df.copy()
        
        if 'pickup_location' in query_params:
            filtered_df = filtered_df[filtered_df['pickup_main'].str.contains(
                query_params['pickup_location'], case=False, na=False)]
        
        if 'dropoff_location' in query_params:
            filtered_df = filtered_df[filtered_df['dropoff_main'].str.contains(
                query_params['dropoff_location'], case=False, na=False)]
        
        if 'hour_range' in query_params:
            start_hour, end_hour = query_params['hour_range']
            filtered_df = filtered_df[
                (filtered_df['hour'] >= start_hour) & (filtered_df['hour'] <= end_hour)]
        
        if 'min_passengers' in query_params:
            filtered_df = filtered_df[filtered_df['Total Passengers'] >= query_params['min_passengers']]
        
        if 'max_passengers' in query_params:
            filtered_df = filtered_df[filtered_df['Total Passengers'] <= query_params['max_passengers']]
        
        if 'date_range' in query_params:
            start_date, end_date = query_params['date_range']
            filtered_df = filtered_df[
                (filtered_df['date'] >= start_date) & (filtered_df['date'] <= end_date)]
        
        return filtered_df
    
    def get_location_stats(self, location: str, location_type: str = 'both') -> Dict[str, Any]:
        """Get statistics for a specific location."""
        if location_type in ['pickup', 'both']:
            pickup_data = self.df[self.df['pickup_main'].str.contains(location, case=False, na=False)]
        else:
            pickup_data = pd.DataFrame()
        
        if location_type in ['dropoff', 'both']:
            dropoff_data = self.df[self.df['dropoff_main'].str.contains(location, case=False, na=False)]
        else:
            dropoff_data = pd.DataFrame()
        
        return {
            'pickup_count': len(pickup_data),
            'dropoff_count': len(dropoff_data),
            'avg_group_size_pickup': pickup_data['Total Passengers'].mean() if len(pickup_data) > 0 else 0,
            'avg_group_size_dropoff': dropoff_data['Total Passengers'].mean() if len(dropoff_data) > 0 else 0,
            'peak_hours_pickup': pickup_data['hour'].mode().tolist() if len(pickup_data) > 0 else [],
            'peak_hours_dropoff': dropoff_data['hour'].mode().tolist() if len(dropoff_data) > 0 else []
        }
    
    def get_time_patterns(self, group_size_filter: int = None) -> Dict[str, Any]:
        """Get time-based patterns."""
        data = self.df.copy()
        
        if group_size_filter:
            data = data[data['Total Passengers'] >= group_size_filter]
        
        return {
            'hourly_counts': data['hour'].value_counts().sort_index().to_dict(),
            'daily_counts': data['day_of_week'].value_counts().to_dict(),
            'time_category_counts': data['time_category'].value_counts().to_dict()
        }