# gapura-ai / scripts / fetch_and_analyze.py
# Author: Muhammad Ridzki Nugraha
# Upload folder using huggingface_hub
# commit c32f8fb (verified)
#!/usr/bin/env python3
"""
Script to fetch and analyze data from Google Sheets
Usage: python fetch_and_analyze.py
"""
import json
import os
import sys
from collections import Counter
from datetime import datetime

import requests

# Add parent directory to path so the sibling `data` package is importable
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from data.sheets_service import GoogleSheetsService, DataPreprocessor
def _fetch_sheet(service, spreadsheet_id, sheet_name, range_str, step):
    """Fetch one sheet's rows; on any error print it and return [].

    `step` is only used for the numbered progress message so the console
    output matches the script's original "1. / 2." sequence.
    """
    print(f"\n{step}. Fetching {sheet_name} sheet...")
    try:
        data = service.fetch_sheet_data(
            spreadsheet_id=spreadsheet_id,
            sheet_name=sheet_name,
            range_str=range_str,
        )
    except Exception as e:
        # Best-effort: a single failed sheet must not abort the whole run.
        print(f" ✗ Error: {str(e)}")
        return []
    print(f" ✓ Fetched {len(data)} records")
    return data


def _print_ranked(title, counts, limit=None):
    """Print a name -> count mapping under `title`, highest count first.

    When `limit` is given, only the top `limit` entries are shown.
    """
    print(f"\n{title}:")
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    if limit is not None:
        ranked = ranked[:limit]
    for name, count in ranked:
        print(f" {name}: {count}")


def fetch_and_analyze():
    """Fetch data from Google Sheets, preprocess a sample, and summarize it.

    Reads GOOGLE_SHEET_ID from the environment, pulls the "NON CARGO" and
    "CGO" sheets, preprocesses the first 10 records, writes a timestamped
    JSON snapshot to the current directory, and prints aggregate counts by
    airline, hub, and category.

    Exits with status 1 when GOOGLE_SHEET_ID is unset; returns early when
    no rows could be fetched from either sheet.
    """
    print("=" * 60)
    print("Gapura AI Analysis - Data Fetch & Analyze")
    print("=" * 60)

    # Initialize services
    sheets_service = GoogleSheetsService()
    preprocessor = DataPreprocessor()

    # Configuration
    spreadsheet_id = os.getenv("GOOGLE_SHEET_ID")
    if not spreadsheet_id:
        print("ERROR: GOOGLE_SHEET_ID not set in environment")
        sys.exit(1)

    print(f"\nFetching data from spreadsheet: {spreadsheet_id}")
    print("-" * 60)

    # Fetch both sheets; each call degrades to [] on failure.
    non_cargo_data = _fetch_sheet(
        sheets_service, spreadsheet_id, "NON CARGO", "A1:AA100", step=1
    )
    cargo_data = _fetch_sheet(
        sheets_service, spreadsheet_id, "CGO", "A1:Z100", step=2
    )

    # Combine data
    all_data = non_cargo_data + cargo_data
    print(f"\n Total records: {len(all_data)}")
    if not all_data:
        print("\nNo data fetched. Exiting.")
        return

    # Preprocess a small sample for the demo output.
    print("\n3. Preprocessing data...")
    preprocessed_data = preprocessor.preprocess_batch(all_data[:10])  # First 10 for demo
    print(f" ✓ Preprocessed {len(preprocessed_data)} records")

    # Display sample features. `.get()` prints None instead of raising
    # KeyError if the preprocessor omits a field for some record.
    print("\n4. Sample Features:")
    print("-" * 60)
    for i, features in enumerate(preprocessed_data[:3]):
        print(f"\nRecord {i + 1}:")
        print(f" Airline: {features.get('airline')}")
        print(f" Hub: {features.get('hub')}")
        print(f" Category: {features.get('category')}")
        print(f" Report Length: {features.get('report_length')} chars")
        print(f" Word Count: {features.get('report_word_count')}")
        print(f" Severity Score: {features.get('severity_keyword_count')}")
        print(f" Has Photos: {features.get('has_photos')}")
        print(f" Is Closed: {features.get('is_closed')}")

    # Persist a snapshot for offline inspection.
    output_file = f"analysis_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    print(f"\n5. Saving data to {output_file}...")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(
            {
                "metadata": {
                    "spreadsheet_id": spreadsheet_id,
                    "fetched_at": datetime.now().isoformat(),
                    "non_cargo_count": len(non_cargo_data),
                    "cargo_count": len(cargo_data),
                    "total_count": len(all_data),
                },
                "raw_data": all_data[:50],  # Save first 50 records
                "preprocessed_features": preprocessed_data[:10],
            },
            f,
            indent=2,
            default=str,  # stringify anything json can't serialize (e.g. datetimes)
        )
    print(f" ✓ Data saved to {output_file}")

    # Aggregate statistics
    print("\n6. Aggregate Statistics:")
    print("-" * 60)
    airline_counts = Counter(r.get("Airlines", "Unknown") for r in all_data)
    hub_counts = Counter(r.get("HUB", "Unknown") for r in all_data)
    category_counts = Counter(
        r.get("Irregularity_Complain_Category", "Unknown") for r in all_data
    )

    _print_ranked("By Airline", airline_counts, limit=5)
    _print_ranked("By Hub", hub_counts)
    _print_ranked("By Category", category_counts, limit=5)

    print("\n" + "=" * 60)
    print("Analysis complete!")
    print("=" * 60)
# Script entry point: run the full fetch-and-analyze pipeline.
if __name__ == "__main__":
    fetch_and_analyze()