# gapura-ai / scripts / fetch_and_analyze.py
# Author: Muhammad Ridzki Nugraha
# Upload folder using huggingface_hub
# commit c32f8fb (verified)
#!/usr/bin/env python3
"""
Script to fetch and analyze data from Google Sheets
Usage: python fetch_and_analyze.py
"""
import json
import os
import sys
from collections import Counter
from datetime import datetime

import requests

# Add parent directory to path so the sibling `data` package is importable
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from data.sheets_service import GoogleSheetsService, DataPreprocessor
def _fetch_sheet(service, spreadsheet_id, sheet_name, range_str, step):
    """Fetch one sheet's rows; on any error print it and return [].

    `step` is only used for the numbered progress message so the console
    output matches the script's original "1. / 2." sequence.
    """
    print(f"\n{step}. Fetching {sheet_name} sheet...")
    try:
        data = service.fetch_sheet_data(
            spreadsheet_id=spreadsheet_id,
            sheet_name=sheet_name,
            range_str=range_str,
        )
    except Exception as e:
        # Best-effort: a single failed sheet must not abort the whole run.
        print(f" ✗ Error: {str(e)}")
        return []
    print(f" ✓ Fetched {len(data)} records")
    return data


def _print_ranked(title, counts, limit=None):
    """Print a name -> count mapping under `title`, highest count first.

    When `limit` is given, only the top `limit` entries are shown.
    """
    print(f"\n{title}:")
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    if limit is not None:
        ranked = ranked[:limit]
    for name, count in ranked:
        print(f" {name}: {count}")


def fetch_and_analyze():
    """Fetch data from Google Sheets, preprocess a sample, and summarize it.

    Reads GOOGLE_SHEET_ID from the environment, pulls the "NON CARGO" and
    "CGO" sheets, preprocesses the first 10 records, writes a timestamped
    JSON snapshot to the current directory, and prints aggregate counts by
    airline, hub, and category.

    Exits with status 1 when GOOGLE_SHEET_ID is unset; returns early when
    no rows could be fetched from either sheet.
    """
    print("=" * 60)
    print("Gapura AI Analysis - Data Fetch & Analyze")
    print("=" * 60)

    # Initialize services
    sheets_service = GoogleSheetsService()
    preprocessor = DataPreprocessor()

    # Configuration
    spreadsheet_id = os.getenv("GOOGLE_SHEET_ID")
    if not spreadsheet_id:
        print("ERROR: GOOGLE_SHEET_ID not set in environment")
        sys.exit(1)

    print(f"\nFetching data from spreadsheet: {spreadsheet_id}")
    print("-" * 60)

    # Fetch both sheets; each call degrades to [] on failure.
    non_cargo_data = _fetch_sheet(
        sheets_service, spreadsheet_id, "NON CARGO", "A1:AA100", step=1
    )
    cargo_data = _fetch_sheet(
        sheets_service, spreadsheet_id, "CGO", "A1:Z100", step=2
    )

    # Combine data
    all_data = non_cargo_data + cargo_data
    print(f"\n Total records: {len(all_data)}")
    if not all_data:
        print("\nNo data fetched. Exiting.")
        return

    # Preprocess a small sample for the demo output.
    print("\n3. Preprocessing data...")
    preprocessed_data = preprocessor.preprocess_batch(all_data[:10])  # First 10 for demo
    print(f" ✓ Preprocessed {len(preprocessed_data)} records")

    # Display sample features. `.get()` prints None instead of raising
    # KeyError if the preprocessor omits a field for some record.
    print("\n4. Sample Features:")
    print("-" * 60)
    for i, features in enumerate(preprocessed_data[:3]):
        print(f"\nRecord {i + 1}:")
        print(f" Airline: {features.get('airline')}")
        print(f" Hub: {features.get('hub')}")
        print(f" Category: {features.get('category')}")
        print(f" Report Length: {features.get('report_length')} chars")
        print(f" Word Count: {features.get('report_word_count')}")
        print(f" Severity Score: {features.get('severity_keyword_count')}")
        print(f" Has Photos: {features.get('has_photos')}")
        print(f" Is Closed: {features.get('is_closed')}")

    # Persist a snapshot for offline inspection.
    output_file = f"analysis_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    print(f"\n5. Saving data to {output_file}...")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(
            {
                "metadata": {
                    "spreadsheet_id": spreadsheet_id,
                    "fetched_at": datetime.now().isoformat(),
                    "non_cargo_count": len(non_cargo_data),
                    "cargo_count": len(cargo_data),
                    "total_count": len(all_data),
                },
                "raw_data": all_data[:50],  # Save first 50 records
                "preprocessed_features": preprocessed_data[:10],
            },
            f,
            indent=2,
            default=str,  # stringify anything json can't serialize (e.g. datetimes)
        )
    print(f" ✓ Data saved to {output_file}")

    # Aggregate statistics
    print("\n6. Aggregate Statistics:")
    print("-" * 60)
    airline_counts = Counter(r.get("Airlines", "Unknown") for r in all_data)
    hub_counts = Counter(r.get("HUB", "Unknown") for r in all_data)
    category_counts = Counter(
        r.get("Irregularity_Complain_Category", "Unknown") for r in all_data
    )

    _print_ranked("By Airline", airline_counts, limit=5)
    _print_ranked("By Hub", hub_counts)
    _print_ranked("By Category", category_counts, limit=5)

    print("\n" + "=" * 60)
    print("Analysis complete!")
    print("=" * 60)
# Script entry point: run the full fetch-and-analyze pipeline.
if __name__ == "__main__":
    fetch_and_analyze()