Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Script to fetch and analyze data from Google Sheets | |
| Usage: python fetch_and_analyze.py | |
| """ | |
import json
import os
import sys
from collections import Counter
from datetime import datetime

import requests

# Add parent directory to path so project-local packages resolve when run as a script.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from data.sheets_service import GoogleSheetsService, DataPreprocessor
| def fetch_and_analyze(): | |
| """Fetch data from Google Sheets and analyze it""" | |
| print("=" * 60) | |
| print("Gapura AI Analysis - Data Fetch & Analyze") | |
| print("=" * 60) | |
| # Initialize services | |
| sheets_service = GoogleSheetsService() | |
| preprocessor = DataPreprocessor() | |
| # Configuration | |
| spreadsheet_id = os.getenv("GOOGLE_SHEET_ID") | |
| if not spreadsheet_id: | |
| print("ERROR: GOOGLE_SHEET_ID not set in environment") | |
| sys.exit(1) | |
| print(f"\nFetching data from spreadsheet: {spreadsheet_id}") | |
| print("-" * 60) | |
| # Fetch NON CARGO data | |
| print("\n1. Fetching NON CARGO sheet...") | |
| try: | |
| non_cargo_data = sheets_service.fetch_sheet_data( | |
| spreadsheet_id=spreadsheet_id, | |
| sheet_name="NON CARGO", | |
| range_str="A1:AA100", # First 100 rows including header | |
| ) | |
| print(f" ✓ Fetched {len(non_cargo_data)} records") | |
| except Exception as e: | |
| print(f" ✗ Error: {str(e)}") | |
| non_cargo_data = [] | |
| # Fetch CGO data | |
| print("\n2. Fetching CGO sheet...") | |
| try: | |
| cargo_data = sheets_service.fetch_sheet_data( | |
| spreadsheet_id=spreadsheet_id, sheet_name="CGO", range_str="A1:Z100" | |
| ) | |
| print(f" ✓ Fetched {len(cargo_data)} records") | |
| except Exception as e: | |
| print(f" ✗ Error: {str(e)}") | |
| cargo_data = [] | |
| # Combine data | |
| all_data = non_cargo_data + cargo_data | |
| print(f"\n Total records: {len(all_data)}") | |
| if not all_data: | |
| print("\nNo data fetched. Exiting.") | |
| return | |
| # Preprocess data | |
| print("\n3. Preprocessing data...") | |
| preprocessed_data = preprocessor.preprocess_batch( | |
| all_data[:10] | |
| ) # First 10 for demo | |
| print(f" ✓ Preprocessed {len(preprocessed_data)} records") | |
| # Display sample features | |
| print("\n4. Sample Features:") | |
| print("-" * 60) | |
| for i, features in enumerate(preprocessed_data[:3]): | |
| print(f"\nRecord {i + 1}:") | |
| print(f" Airline: {features['airline']}") | |
| print(f" Hub: {features['hub']}") | |
| print(f" Category: {features['category']}") | |
| print(f" Report Length: {features['report_length']} chars") | |
| print(f" Word Count: {features['report_word_count']}") | |
| print(f" Severity Score: {features['severity_keyword_count']}") | |
| print(f" Has Photos: {features['has_photos']}") | |
| print(f" Is Closed: {features['is_closed']}") | |
| # Save to file | |
| output_file = f"analysis_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" | |
| print(f"\n5. Saving data to {output_file}...") | |
| with open(output_file, "w") as f: | |
| json.dump( | |
| { | |
| "metadata": { | |
| "spreadsheet_id": spreadsheet_id, | |
| "fetched_at": datetime.now().isoformat(), | |
| "non_cargo_count": len(non_cargo_data), | |
| "cargo_count": len(cargo_data), | |
| "total_count": len(all_data), | |
| }, | |
| "raw_data": all_data[:50], # Save first 50 records | |
| "preprocessed_features": preprocessed_data[:10], | |
| }, | |
| f, | |
| indent=2, | |
| default=str, | |
| ) | |
| print(f" ✓ Data saved to {output_file}") | |
| # Aggregate statistics | |
| print("\n6. Aggregate Statistics:") | |
| print("-" * 60) | |
| # Count by airline | |
| airline_counts = {} | |
| hub_counts = {} | |
| category_counts = {} | |
| for report in all_data: | |
| airline = report.get("Airlines", "Unknown") | |
| hub = report.get("HUB", "Unknown") | |
| category = report.get("Irregularity_Complain_Category", "Unknown") | |
| airline_counts[airline] = airline_counts.get(airline, 0) + 1 | |
| hub_counts[hub] = hub_counts.get(hub, 0) + 1 | |
| category_counts[category] = category_counts.get(category, 0) + 1 | |
| print("\nBy Airline:") | |
| for airline, count in sorted( | |
| airline_counts.items(), key=lambda x: x[1], reverse=True | |
| )[:5]: | |
| print(f" {airline}: {count}") | |
| print("\nBy Hub:") | |
| for hub, count in sorted(hub_counts.items(), key=lambda x: x[1], reverse=True): | |
| print(f" {hub}: {count}") | |
| print("\nBy Category:") | |
| for category, count in sorted( | |
| category_counts.items(), key=lambda x: x[1], reverse=True | |
| )[:5]: | |
| print(f" {category}: {count}") | |
| print("\n" + "=" * 60) | |
| print("Analysis complete!") | |
| print("=" * 60) | |
| if __name__ == "__main__": | |
| fetch_and_analyze() | |