#!/usr/bin/env python3
"""
Script to fetch and analyze data from Google Sheets

Usage: python fetch_and_analyze.py
"""

import json
import os
import sys
from collections import Counter
from datetime import datetime

import requests  # noqa: F401 — kept for side effects/back-compat; not used directly here

# Add parent directory to path so the local `data` package resolves
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from data.sheets_service import GoogleSheetsService, DataPreprocessor


def _fetch_sheet(sheets_service, spreadsheet_id, step, sheet_name, range_str):
    """Fetch one sheet with progress output.

    Returns the fetched records, or [] if the fetch fails (best-effort:
    a failure on one sheet must not abort the whole run).
    """
    print(f"\n{step}. Fetching {sheet_name} sheet...")
    try:
        data = sheets_service.fetch_sheet_data(
            spreadsheet_id=spreadsheet_id,
            sheet_name=sheet_name,
            range_str=range_str,
        )
        print(f" ✓ Fetched {len(data)} records")
        return data
    except Exception as e:
        print(f" ✗ Error: {str(e)}")
        return []


def _print_sample_features(preprocessed_data):
    """Print the first few preprocessed records' feature values."""
    print("\n4. Sample Features:")
    print("-" * 60)
    for i, features in enumerate(preprocessed_data[:3]):
        print(f"\nRecord {i + 1}:")
        print(f" Airline: {features['airline']}")
        print(f" Hub: {features['hub']}")
        print(f" Category: {features['category']}")
        print(f" Report Length: {features['report_length']} chars")
        print(f" Word Count: {features['report_word_count']}")
        print(f" Severity Score: {features['severity_keyword_count']}")
        print(f" Has Photos: {features['has_photos']}")
        print(f" Is Closed: {features['is_closed']}")


def _save_output(spreadsheet_id, non_cargo_data, cargo_data, all_data, preprocessed_data):
    """Dump metadata + a capped sample of raw/preprocessed data to a timestamped JSON file."""
    output_file = f"analysis_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    print(f"\n5. Saving data to {output_file}...")
    # Explicit encoding so output is stable across platforms; default=str as a
    # deliberate catch-all serializer for non-JSON-native values (e.g. datetimes).
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(
            {
                "metadata": {
                    "spreadsheet_id": spreadsheet_id,
                    "fetched_at": datetime.now().isoformat(),
                    "non_cargo_count": len(non_cargo_data),
                    "cargo_count": len(cargo_data),
                    "total_count": len(all_data),
                },
                "raw_data": all_data[:50],  # Save first 50 records
                "preprocessed_features": preprocessed_data[:10],
            },
            f,
            indent=2,
            default=str,
        )
    print(f" ✓ Data saved to {output_file}")


def _print_aggregate_stats(all_data):
    """Print record counts grouped by airline (top 5), hub (all), and category (top 5)."""
    print("\n6. Aggregate Statistics:")
    print("-" * 60)

    # Counter replaces the manual dict.get(k, 0) + 1 pattern; most_common()
    # yields counts in descending order (ties keep insertion order, same as
    # a stable sort over an insertion-ordered dict).
    airline_counts = Counter(r.get("Airlines", "Unknown") for r in all_data)
    hub_counts = Counter(r.get("HUB", "Unknown") for r in all_data)
    category_counts = Counter(
        r.get("Irregularity_Complain_Category", "Unknown") for r in all_data
    )

    print("\nBy Airline:")
    for airline, count in airline_counts.most_common(5):
        print(f" {airline}: {count}")

    print("\nBy Hub:")
    for hub, count in hub_counts.most_common():
        print(f" {hub}: {count}")

    print("\nBy Category:")
    for category, count in category_counts.most_common(5):
        print(f" {category}: {count}")


def fetch_and_analyze():
    """Fetch data from Google Sheets and analyze it.

    Reads the spreadsheet id from the GOOGLE_SHEET_ID environment variable
    (exits with status 1 if unset), fetches the NON CARGO and CGO sheets
    (each best-effort), preprocesses a small sample, prints sample features
    and aggregate statistics, and saves a JSON snapshot to the working
    directory. Returns None.
    """
    print("=" * 60)
    print("Gapura AI Analysis - Data Fetch & Analyze")
    print("=" * 60)

    # Initialize services
    sheets_service = GoogleSheetsService()
    preprocessor = DataPreprocessor()

    # Configuration
    spreadsheet_id = os.getenv("GOOGLE_SHEET_ID")
    if not spreadsheet_id:
        print("ERROR: GOOGLE_SHEET_ID not set in environment")
        sys.exit(1)

    print(f"\nFetching data from spreadsheet: {spreadsheet_id}")
    print("-" * 60)

    # Fetch both sheets; ranges cover the first ~100 rows including header.
    non_cargo_data = _fetch_sheet(
        sheets_service, spreadsheet_id, 1, "NON CARGO", "A1:AA100"
    )
    cargo_data = _fetch_sheet(sheets_service, spreadsheet_id, 2, "CGO", "A1:Z100")

    # Combine data
    all_data = non_cargo_data + cargo_data
    print(f"\n Total records: {len(all_data)}")

    if not all_data:
        print("\nNo data fetched. Exiting.")
        return

    # Preprocess data (first 10 only — demo-sized sample)
    print("\n3. Preprocessing data...")
    preprocessed_data = preprocessor.preprocess_batch(all_data[:10])
    print(f" ✓ Preprocessed {len(preprocessed_data)} records")

    _print_sample_features(preprocessed_data)
    _save_output(spreadsheet_id, non_cargo_data, cargo_data, all_data, preprocessed_data)
    _print_aggregate_stats(all_data)

    print("\n" + "=" * 60)
    print("Analysis complete!")
    print("=" * 60)


if __name__ == "__main__":
    fetch_and_analyze()