# Hosting-page status banner captured with the source (Hugging Face Spaces):
#   Spaces: Sleeping
| import streamlit as st | |
| import requests | |
| import pandas as pd | |
| import json | |
| import time | |
| from datetime import datetime | |
| import sqlite3 | |
| import hashlib | |
| import gzip | |
| from typing import Dict, Any, List | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
# Optional plotly import with fallback.
# NOTE: no Streamlit call may happen up here — st.set_page_config() below must
# be the FIRST Streamlit command of the script, so we only record availability
# and let the UI report missing-chart support later via PLOTLY_AVAILABLE.
PLOTLY_AVAILABLE = False
try:
    import plotly.express as px
    PLOTLY_AVAILABLE = True
except ImportError:
    PLOTLY_AVAILABLE = False

# Global ML availability flag — AI/ML imports for enhanced functionality.
# These are heavy optional dependencies; the app degrades gracefully without
# them (see the "Standard Mode" banner further down).
ML_AVAILABLE = False
try:
    from transformers import pipeline
    from sentence_transformers import SentenceTransformer
    ML_AVAILABLE = True
except ImportError:
    ML_AVAILABLE = False
# Enhanced Page Configuration.
# NOTE: st.set_page_config() must be the first Streamlit command executed in
# the script run — keep it ahead of any st.* output calls.
st.set_page_config(
    page_title="Simplified Data Harvester",
    page_icon="π",
    layout="wide",
    initial_sidebar_state="collapsed"
)
# Enhanced CSS with modern, professional styling.
# Injected on every rerun; unsafe_allow_html is required for raw <style> markup.
st.markdown("""
<style>
    .main > div {
        padding-top: 1rem;
    }
    .stApp {
        background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
        color: #2c3e50;
    }
    .title-container {
        text-align: center;
        padding: 2rem 0;
        background: rgba(255, 255, 255, 0.9);
        border-radius: 15px;
        margin-bottom: 2rem;
        border: 1px solid rgba(52, 73, 94, 0.1);
        box-shadow: 0 4px 12px rgba(0, 0, 0, 0.05);
    }
    .api-grid {
        display: grid;
        grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
        gap: 1rem;
        margin: 1rem 0;
    }
    .api-card {
        background: rgba(255, 255, 255, 0.9);
        border-radius: 10px;
        padding: 1rem;
        border: 1px solid rgba(52, 73, 94, 0.15);
        text-align: center;
    }
    .main-button {
        background: linear-gradient(135deg, #3498db, #2980b9);
        color: white;
        border: none;
        border-radius: 12px;
        padding: 1rem 2rem;
        font-size: 1.2rem;
        font-weight: bold;
        cursor: pointer;
        width: 100%;
        margin: 1rem 0;
    }
    .main-button:hover {
        background: linear-gradient(135deg, #2980b9, #1f4e79);
        transform: translateY(-2px);
        box-shadow: 0 8px 25px rgba(52, 152, 219, 0.3);
    }
</style>
""", unsafe_allow_html=True)
# Database Configuration — SQLite file created in the working directory.
DB_PATH = "simplified_harvester.db"
# API Configuration - Tested and verified working endpoints.
# Per-endpoint schema:
#   url        — full request URL (query params baked in)
#   headers    — request headers (Accept / Authorization)
#   method     — "GET" or "POST" (POST bodies come from an optional "data" key)
#   data_path  — where the records live in the JSON response: "" = whole body,
#                dot notation ("a.b.c"), or "[N]" for a top-level list index
#   key_field  — field that uniquely identifies a record (informational;
#                stored alongside the data in the database)
SIMPLIFIED_API_CONFIG = {
    "Skolverket": {
        "name": "πΈπͺ Skolverket",
        "description": "Swedish National Agency for Education - School Units",
        "endpoints": [
            {
                "url": "https://api.skolverket.se/planned-educations/v3/compact-school-units?coordinateSystemType=WGS84&page=0&size=20",
                "headers": {"Accept": "application/vnd.skolverket.plannededucations.api.v3.hal+json"},
                "method": "GET",
                "data_path": "body._embedded.schoolUnits",
                "key_field": "schoolUnitCode"
            }
        ]
    },
    "SCB": {
        "name": "πΈπͺ Statistics Sweden",
        "description": "Swedish National Statistics Office - Population Metadata",
        "endpoints": [
            {
                "url": "https://api.scb.se/OV0104/v1/doris/sv/ssd/START/BE/BE0101/BE0101A/BefolkningNy",
                "headers": {"Accept": "application/json"},
                "method": "GET",
                "data_path": "variables",
                "key_field": "code"
            }
        ]
    },
    "Kolada": {
        "name": "πΈπͺ Kolada",
        "description": "Municipal Key Performance Indicators - Municipalities",
        "endpoints": [
            {
                "url": "https://api.kolada.se/v2/municipality",
                "headers": {"Accept": "application/json"},
                "method": "GET",
                "data_path": "values",
                "key_field": "id"
            }
        ]
    },
    "Eurostat": {
        "name": "πͺπΊ Eurostat",
        "description": "European Union Statistics - Population Density",
        "endpoints": [
            {
                "url": "https://ec.europa.eu/eurostat/api/dissemination/statistics/1.0/data/demo_r_d3dens?format=JSON&lang=EN",
                "headers": {"Accept": "application/json"},
                "method": "GET",
                "data_path": "value",
                "key_field": "index"
            }
        ]
    },
    "WHO": {
        "name": "π WHO",
        "description": "World Health Organization - Country Dimensions",
        "endpoints": [
            {
                "url": "https://ghoapi.azureedge.net/api/DIMENSION/COUNTRY/DimensionValues",
                "headers": {"Accept": "application/json"},
                "method": "GET",
                "data_path": "value",
                "key_field": "Code"
            }
        ]
    },
    "OECD": {
        "name": "π OECD",
        "description": "Organisation for Economic Co-operation and Development - Economic Data",
        "endpoints": [
            {
                "url": "https://sdmx.oecd.org/public/rest/data/OECD.SDD.NAD,DSD_NAMAIN1@DF_QNA_EXPENDITURE_GROWTH_OECD/?format=jsondata",
                "headers": {"Accept": "application/json"},
                "method": "GET",
                "data_path": "data.dataSets",
                "key_field": "series"
            }
        ]
    },
    "WorldBank": {
        "name": "π World Bank",
        "description": "International Financial Institution - Population Data",
        "endpoints": [
            {
                # World Bank wraps results as [metadata, records] — "[1]"
                # selects the records list.
                "url": "https://api.worldbank.org/v2/country/all/indicator/SP.POP.TOTL?format=json&per_page=50&date=2023",
                "headers": {"Accept": "application/json"},
                "method": "GET",
                "data_path": "[1]",
                "key_field": "countryiso3code"
            }
        ]
    },
    "Riksbanken": {
        "name": "πΈπͺ Riksbanken",
        "description": "Swedish Central Bank - Exchange Rates",
        "endpoints": [
            {
                "url": "https://api.riksbank.se/swea/v1/Observations/Latest/ByGroup/130",
                "headers": {
                    "Accept": "application/json"
                },
                "method": "GET",
                "data_path": "",
                "key_field": "seriesId"
            }
        ]
    },
    "Swecris": {
        "name": "πΈπͺ Swecris",
        "description": "Swedish Research Council Database - Research Projects",
        "endpoints": [
            {
                "url": "https://swecris-api.vr.se/v1/projects?size=20",
                "headers": {
                    "Accept": "application/json",
                    # Public demo token shipped with the API docs; replace
                    # with a private token for production use.
                    "Authorization": "Bearer VRSwecrisAPI2025-1"
                },
                "method": "GET",
                "data_path": "content",
                "key_field": "projectId",
                "requires_auth": True
            }
        ]
    },
    "CSN": {
        "name": "πΈπͺ CSN",
        "description": "Swedish Board of Student Finance - Statistics Portal",
        "endpoints": [
            {
                "url": "https://statistik.csn.se/PXWeb/api/v1/sv/CSNstat",
                "headers": {"Accept": "application/json"},
                "method": "GET",
                "data_path": "",
                "key_field": "id",
                "note": "PX-Web format requires specific POST queries for data"
            }
        ]
    }
}
def init_database(db_path=None):
    """Initialize the SQLite database with indexes and compression support.

    Idempotent: all tables and indexes are created with IF NOT EXISTS, so it
    is safe to call on every script rerun.

    Args:
        db_path: Optional path to the database file; defaults to the
            module-level ``DB_PATH``. (New optional parameter — existing
            ``init_database()`` callers are unaffected.)
    """
    if db_path is None:
        db_path = DB_PATH
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        # Enable WAL mode for better concurrent access; the remaining pragmas
        # trade a little durability for speed, which is acceptable for a
        # re-fetchable data cache.
        cursor.execute('PRAGMA journal_mode=WAL')
        cursor.execute('PRAGMA synchronous=NORMAL')
        cursor.execute('PRAGMA cache_size=10000')
        cursor.execute('PRAGMA temp_store=memory')
        # Main table: one row per (deduplicated) endpoint fetch. Either
        # raw_data (plain text) or compressed_data (gzip BLOB) is populated,
        # never both — see SimplifiedDataHarvester._save_data_to_db.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS harvested_data (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                api_name TEXT NOT NULL,
                endpoint_url TEXT NOT NULL,
                data_hash TEXT UNIQUE NOT NULL,
                raw_data TEXT,
                compressed_data BLOB,
                processed_data TEXT,
                record_count INTEGER DEFAULT 0,
                data_size_bytes INTEGER DEFAULT 0,
                compression_ratio REAL,
                fetch_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                fetch_duration_ms INTEGER DEFAULT 0,
                status TEXT DEFAULT 'success',
                session_id TEXT,
                data_path TEXT,
                key_field TEXT,
                error_message TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        # Performance indexes covering the queries used by the stats and
        # viewer sections.
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_api_name ON harvested_data(api_name)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_timestamp ON harvested_data(fetch_timestamp)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_status ON harvested_data(status)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_session ON harvested_data(session_id)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_hash ON harvested_data(data_hash)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_api_timestamp ON harvested_data(api_name, fetch_timestamp)')
        # Sessions table for per-run tracking.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS fetch_sessions (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                session_id TEXT UNIQUE NOT NULL,
                session_name TEXT,
                started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                completed_at TIMESTAMP,
                total_apis INTEGER DEFAULT 0,
                successful_fetches INTEGER DEFAULT 0,
                failed_fetches INTEGER DEFAULT 0,
                total_records INTEGER DEFAULT 0,
                total_size_bytes INTEGER DEFAULT 0,
                status TEXT DEFAULT 'running'
            )
        ''')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_session_id ON fetch_sessions(session_id)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_session_started ON fetch_sessions(started_at)')
        conn.commit()
    finally:
        # Previously the connection leaked if any statement raised.
        conn.close()
class SimplifiedDataHarvester:
    """Simplified data harvester - one function to fetch from all APIs.

    Walks every endpoint declared in ``SIMPLIFIED_API_CONFIG`` through a
    shared ``requests.Session`` and persists each response (gzip-compressed
    when worthwhile) into the SQLite database at ``DB_PATH``.
    """

    def __init__(self):
        # A single Session gives connection pooling plus one place to set
        # the identifying User-Agent for every request.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Simplified-Data-Harvester/1.0 (Research & Analysis)'
        })
        self.results = {}   # api_name -> per-API result dict
        self.errors = {}    # api_name -> error message string

    def fetch_all_apis(self, progress_callback=None) -> Dict:
        """One function to fetch data from all APIs automatically.

        Args:
            progress_callback: Optional callable accepting a status-message
                string; invoked as each API starts and finishes.

        Returns:
            Dict with ``results`` (per-API data), ``errors``, the generated
            ``session_id`` and a ``summary`` of success/failure counts.
        """
        session_id = f"simplified_{int(time.time())}"
        total_apis = len(SIMPLIFIED_API_CONFIG)
        completed = 0
        if progress_callback:
            progress_callback(f"π Starting data collection from {total_apis} APIs...")
        for api_name, config in SIMPLIFIED_API_CONFIG.items():
            if progress_callback:
                progress_callback(f"π Fetching from {config['name']}...")
            try:
                api_results = self._fetch_api_data(api_name, config, session_id)
                self.results[api_name] = api_results
                completed += 1
                if progress_callback:
                    progress = (completed / total_apis) * 100
                    progress_callback(f"β {config['name']} completed ({progress:.1f}%)")
                time.sleep(0.5)  # Respectful delay between providers
            except Exception as e:
                self.errors[api_name] = str(e)
                if progress_callback:
                    progress_callback(f"β {config['name']} failed: {str(e)[:50]}...")
                completed += 1
        if progress_callback:
            successful = len(self.results)
            failed = len(self.errors)
            progress_callback(f"π Collection complete! β {successful} successful, β {failed} failed")
        return {
            "results": self.results,
            "errors": self.errors,
            "session_id": session_id,
            "summary": {
                "total_apis": total_apis,
                "successful": len(self.results),
                "failed": len(self.errors),
                "success_rate": (len(self.results) / total_apis) * 100
            }
        }

    def _fetch_api_data(self, api_name: str, config: Dict, session_id: str) -> Dict:
        """Fetch data from all endpoints for a specific API.

        Per-endpoint failures are recorded in the result rather than raised,
        so one bad endpoint does not abort the API's remaining endpoints.
        """
        api_results = {
            "api_name": api_name,
            "endpoints": [],
            "total_records": 0,
            "total_size": 0
        }
        for endpoint in config['endpoints']:
            try:
                start_time = time.time()
                # Issue the request; POST bodies come from the optional
                # "data" key of the endpoint config.
                if endpoint.get('method', 'GET').upper() == 'POST':
                    response = self.session.post(
                        endpoint['url'],
                        headers=endpoint.get('headers', {}),
                        json=endpoint.get('data', {}),
                        timeout=30
                    )
                else:
                    response = self.session.get(
                        endpoint['url'],
                        headers=endpoint.get('headers', {}),
                        timeout=30
                    )
                response.raise_for_status()
                data = self._process_response(response)
                fetch_duration = int((time.time() - start_time) * 1000)
                # Extract the meaningful payload using the endpoint's
                # data_path, then persist everything with metadata.
                processed_data = self._extract_api_data(data, api_name, endpoint)
                record_count = self._count_records(processed_data)
                data_size = len(response.content)
                self._save_data_to_db(
                    api_name, endpoint['url'], data, processed_data, session_id,
                    fetch_duration, record_count, data_size, "success", endpoint
                )
                preview = str(processed_data)
                endpoint_result = {
                    "endpoint_url": endpoint['url'],
                    "status": "success",
                    "records": record_count,
                    "size_bytes": data_size,
                    "duration_ms": fetch_duration,
                    "data_preview": (preview[:200] + "...") if len(preview) > 200 else preview
                }
                api_results["endpoints"].append(endpoint_result)
                api_results["total_records"] += record_count
                api_results["total_size"] += data_size
            except Exception as e:
                endpoint_result = {
                    "endpoint_url": endpoint['url'],
                    "status": "error",
                    "error": str(e),
                    "records": 0,
                    "size_bytes": 0,
                    "duration_ms": 0
                }
                api_results["endpoints"].append(endpoint_result)
        return api_results

    def _process_response(self, response):
        """Decode an API response, preferring JSON.

        Falls back to ``{"raw_content": text}`` when the body is not JSON.
        """
        content_type = response.headers.get('content-type', '').lower()
        if 'json' in content_type:
            return response.json()
        # Some providers serve JSON under a non-JSON content type; try JSON
        # first and catch only the decode failure (was a bare `except:`,
        # which also swallowed KeyboardInterrupt/SystemExit).
        try:
            return response.json()
        except ValueError:
            return {"raw_content": response.text}

    def _extract_api_data(self, data: Any, api_name: str, endpoint_config: Dict) -> Any:
        """Extract meaningful data from a response using `data_path`.

        Supports three path styles: empty (return everything), "[N]" list
        index, and dotted keys ("body._embedded.schoolUnits"). On any
        extraction problem the full payload is returned instead of raising.
        """
        try:
            data_path = endpoint_config.get('data_path', '')
            if not data_path:
                return data
            current_data = data
            # "[N]" — index into a top-level list (e.g. World Bank's
            # [metadata, records] envelope).
            if data_path.startswith('[') and data_path.endswith(']'):
                index = int(data_path[1:-1])
                if isinstance(current_data, list) and len(current_data) > index:
                    return current_data[index]
                else:
                    return current_data
            # Dotted path — descend key by key; a missing key deliberately
            # falls back to the current level rather than failing.
            if '.' in data_path:
                path_parts = data_path.split('.')
                for part in path_parts:
                    if isinstance(current_data, dict):
                        current_data = current_data.get(part, current_data)
                    else:
                        break
                return current_data
            # Single key — same missing-key fallback as above.
            if isinstance(current_data, dict):
                return current_data.get(data_path, current_data)
            return current_data
        except Exception as e:
            st.warning(f"Data extraction failed for {api_name}: {str(e)}")
            return data

    def _count_records(self, data: Any) -> int:
        """Count records: list length, first non-empty list in a dict, else 0/1."""
        if isinstance(data, list):
            return len(data)
        elif isinstance(data, dict):
            for value in data.values():
                if isinstance(value, list) and len(value) > 0:
                    return len(value)
            return 1
        else:
            return 1 if data else 0

    def _save_data_to_db(self, api_name: str, endpoint_url: str, raw_data: Any,
                         processed_data: Any, session_id: str, fetch_duration: int,
                         record_count: int, data_size: int, status: str, endpoint_config: Dict):
        """Save a fetch to the database with dedup, compression and metadata.

        Content-identical payloads (same SHA-256) are skipped. Bodies over
        512 bytes are gzip-compressed when that saves >20% space; either
        raw_data or compressed_data is stored, never both.
        """
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        try:
            # Canonical serialization so the dedup hash is stable across runs.
            raw_data_str = json.dumps(raw_data, sort_keys=True, default=str, separators=(',', ':'))
            processed_data_str = json.dumps(processed_data, sort_keys=True, default=str, separators=(',', ':'))
            data_hash = hashlib.sha256(raw_data_str.encode()).hexdigest()
            cursor.execute('SELECT id FROM harvested_data WHERE data_hash = ?', (data_hash,))
            if cursor.fetchone():
                return  # Skip duplicate payload
            # Smart compression: only keep the gzip form if it actually
            # saves significant space.
            raw_data_final = None
            compressed_data = None
            compression_ratio = 1.0
            if data_size > 512:
                try:
                    compressed_data = gzip.compress(raw_data_str.encode('utf-8'))
                    compression_ratio = len(compressed_data) / len(raw_data_str.encode('utf-8'))
                    if compression_ratio < 0.8:
                        raw_data_final = None
                    else:
                        raw_data_final = raw_data_str
                        compressed_data = None
                        compression_ratio = 1.0
                except Exception:
                    raw_data_final = raw_data_str
                    compressed_data = None
                    compression_ratio = 1.0
            else:
                raw_data_final = raw_data_str
            data_path = endpoint_config.get('data_path', '')
            key_field = endpoint_config.get('key_field', '')
            cursor.execute('''
                INSERT INTO harvested_data
                (api_name, endpoint_url, data_hash, raw_data, compressed_data, processed_data,
                 record_count, data_size_bytes, compression_ratio, fetch_duration_ms,
                 status, session_id, data_path, key_field)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                api_name, endpoint_url, data_hash, raw_data_final, compressed_data,
                processed_data_str, record_count, data_size, compression_ratio,
                fetch_duration, status, session_id, data_path, key_field
            ))
            conn.commit()
        except Exception as e:
            # Best-effort error logging: record the failure without failing
            # the whole harvest. The synthetic hash includes the endpoint URL
            # and millisecond time so two errors in the same second no longer
            # collide on the UNIQUE data_hash column.
            error_hash = f"error_{int(time.time() * 1000)}_{hashlib.sha256(endpoint_url.encode()).hexdigest()[:8]}"
            try:
                cursor.execute('''
                    INSERT INTO harvested_data
                    (api_name, endpoint_url, data_hash, status, error_message, session_id)
                    VALUES (?, ?, ?, ?, ?, ?)
                ''', (api_name, endpoint_url, error_hash, "error", str(e), session_id))
                conn.commit()
            except sqlite3.Error:
                pass  # Logging the error must not mask the original failure
        finally:
            conn.close()
def get_database_stats(db_path=None):
    """Return aggregate statistics about the harvested-data database.

    Args:
        db_path: Optional database file path; defaults to the module-level
            ``DB_PATH``. (New optional parameter — existing
            ``get_database_stats()`` callers are unaffected.)

    Returns:
        Dict of counts, sizes and timing aggregates, plus the most recent
        session id (``None`` when no session has run yet).
    """
    if db_path is None:
        db_path = DB_PATH
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    try:
        # Status values are bound as parameters; the original used
        # double-quoted "success", which SQLite only accepts via its
        # identifier-to-string-literal fallback misfeature.
        cursor.execute('SELECT COUNT(*) FROM harvested_data WHERE status = ?', ('success',))
        total_records = cursor.fetchone()[0]
        cursor.execute('SELECT COUNT(DISTINCT api_name) FROM harvested_data WHERE status = ?', ('success',))
        active_apis = cursor.fetchone()[0]
        cursor.execute('SELECT SUM(record_count) FROM harvested_data WHERE status = ?', ('success',))
        total_data_records = cursor.fetchone()[0] or 0
        # Enhanced metrics: storage footprint, compression and timing.
        cursor.execute('SELECT SUM(data_size_bytes) FROM harvested_data WHERE status = ?', ('success',))
        total_size_bytes = cursor.fetchone()[0] or 0
        cursor.execute('SELECT AVG(compression_ratio) FROM harvested_data WHERE compression_ratio IS NOT NULL')
        avg_compression = cursor.fetchone()[0] or 1.0
        cursor.execute('SELECT COUNT(*) FROM harvested_data WHERE status = ?', ('error',))
        error_count = cursor.fetchone()[0]
        cursor.execute('SELECT AVG(fetch_duration_ms) FROM harvested_data WHERE status = ?', ('success',))
        avg_fetch_time = cursor.fetchone()[0] or 0
        # Most recent session (by latest fetch timestamp within the session).
        cursor.execute('''
            SELECT session_id, COUNT(*) as fetches, MAX(fetch_timestamp) as latest
            FROM harvested_data
            WHERE session_id IS NOT NULL
            GROUP BY session_id
            ORDER BY latest DESC
            LIMIT 1
        ''')
        latest_session = cursor.fetchone()
        return {
            "total_records": total_records,
            "active_apis": active_apis,
            "total_data_records": total_data_records,
            "total_size_mb": round(total_size_bytes / (1024 * 1024), 2),
            "avg_compression": round(avg_compression, 3),
            "error_count": error_count,
            "avg_fetch_time_ms": round(avg_fetch_time, 1),
            "latest_session": latest_session[0] if latest_session else None,
            "latest_session_fetches": latest_session[1] if latest_session else 0
        }
    finally:
        conn.close()
# Initialize database (idempotent — tables use CREATE TABLE IF NOT EXISTS).
init_database()
# Initialize components once; session_state survives Streamlit reruns, so the
# harvester and its requests.Session are reused across button clicks.
if 'harvester' not in st.session_state:
    st.session_state.harvester = SimplifiedDataHarvester()
if 'last_results' not in st.session_state:
    st.session_state.last_results = None
# Header banner (styled by the .title-container CSS injected above).
st.markdown("""
<div class="title-container">
    <h1 style="font-size: 2.5rem; margin: 0; color: #2c3e50;">
        π Simplified Data Harvester
    </h1>
    <p style="font-size: 1.1rem; margin: 0.5rem 0 0 0; color: #34495e;">
        One-Click Data Collection from All APIs
    </p>
    <p style="font-size: 0.95rem; margin: 0.3rem 0 0 0; color: #7f8c8d;">
        Automatic data fetching from 10 international sources with smart database storage
    </p>
</div>
""", unsafe_allow_html=True)
# Display ML status (flag set by the optional transformers import at the top).
if ML_AVAILABLE:
    st.success("π€ **AI Enhanced** - Quality assessment and analysis active")
else:
    st.info("π **Standard Mode** - Core functionality available")
# Main Action Section: API catalogue on the left, live database stats on the right.
st.markdown("### π Data Collection")
# Show API status
col1, col2 = st.columns([2, 1])
with col1:
    st.markdown("**Available APIs:**")
    # Display APIs in a compact format (name + one-line description)
    for api_name, config in SIMPLIFIED_API_CONFIG.items():
        st.markdown(f"β **{config['name']}** - {config['description']}")
with col2:
    # Enhanced database stats; falls back to zeroed metrics if the query fails
    try:
        stats = get_database_stats()
        st.metric("π Fetch Records", f"{stats.get('total_records', 0):,}")
        st.metric("π Active APIs", stats.get('active_apis', 0))
        st.metric("π Data Records", f"{stats.get('total_data_records', 0):,}")
        st.metric("πΎ Storage (MB)", f"{stats.get('total_size_mb', 0)}")
        # avg_compression < 1.0 means gzip actually saved space on average
        if stats.get('avg_compression', 1.0) < 1.0:
            compression_pct = (1 - stats.get('avg_compression', 1.0)) * 100
            st.metric("ποΈ Compression", f"{compression_pct:.1f}% saved")
        if stats.get('error_count', 0) > 0:
            st.metric("β οΈ Errors", stats.get('error_count', 0))
    except Exception as e:
        st.metric("π Total Records", "0")
        st.metric("π Active APIs", "0")
        st.error(f"Stats error: {str(e)}")
st.markdown("---")
# Single button to fetch all data; everything below runs only on click.
col1, col2, col3 = st.columns([1, 2, 1])
with col2:
    if st.button("π FETCH ALL DATA FROM ALL APIS",
                 type="primary",
                 use_container_width=True,
                 help="Automatically collect data from all 10 APIs and save to database"):
        # Progress tracking widgets, updated via the callback below
        status_container = st.empty()
        progress_bar = st.progress(0)
        def update_progress(message):
            # Passed into fetch_all_apis; overwrites the status line in place
            status_container.text(message)
        # Execute the one-click data collection
        with st.spinner("π Collecting data from all APIs..."):
            try:
                results = st.session_state.harvester.fetch_all_apis(update_progress)
                st.session_state.last_results = results
                progress_bar.progress(1.0)
                status_container.success("β Collection completed!")
            except Exception as e:
                progress_bar.progress(0.0)
                status_container.error(f"β Collection failed: {str(e)}")
                st.error("Data collection encountered an error. Please try again.")
                st.stop()  # Stop execution here if collection failed
        # Show results only if collection was successful.
        # NOTE(review): at module level locals() is globals(), and after the
        # st.stop() above `results` is always bound here — this guard looks
        # redundant but is kept as a belt-and-braces check.
        if 'results' not in locals() or not results:
            st.error("No results to display")
            st.stop()
        summary = results.get('summary', {})
        # Success metrics
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("β Successful APIs", summary.get('successful', 0))
        with col2:
            st.metric("β Failed APIs", summary.get('failed', 0))
        with col3:
            st.metric("π Success Rate", f"{summary.get('success_rate', 0):.1f}%")
        with col4:
            try:
                total_records = sum(api_data.get('total_records', 0) for api_data in results.get('results', {}).values())
                st.metric("π Total Records", f"{total_records:,}")
            except Exception:
                st.metric("π Total Records", "Error")
        # Simplified results summary (first five APIs shown in detail)
        st.markdown("### π Results Summary")
        try:
            if len(results.get('results', {})) > 0:
                st.success(f"β Successfully processed {len(results['results'])} APIs")
                # Show just API names and record counts in a simple format
                for api_name, api_data in list(results['results'].items())[:5]:
                    records = api_data.get('total_records', 0)
                    size_mb = api_data.get('total_size', 0) / (1024 * 1024)
                    st.write(f"β’ **{SIMPLIFIED_API_CONFIG[api_name]['name']}**: {records:,} records ({size_mb:.2f} MB)")
                if len(results['results']) > 5:
                    st.info(f"... and {len(results['results']) - 5} more APIs processed successfully")
            else:
                st.warning("No results to display")
        except Exception as e:
            st.warning("Results summary unavailable - check database viewer below for stored data")
        # Error details, one line per failed API
        if results['errors']:
            st.markdown("### β Error Details")
            for api_name, error in results['errors'].items():
                st.error(f"**{SIMPLIFIED_API_CONFIG[api_name]['name']}:** {error}")
# Show last results if available — persists across reruns via session_state,
# so this renders even after the button block above has finished.
if st.session_state.last_results:
    st.markdown("---")
    st.markdown("### π Quick Analytics")
    results = st.session_state.last_results
    # Simple analytics with minimal processing
    try:
        if results.get('results') and len(results['results']) > 0:
            # Very simple metrics - no complex calculations
            st.success("π Analytics: Data collection completed successfully!")
            # Basic info only
            total_apis = len(results['results'])
            st.info(f"π Processed {total_apis} APIs with data storage in database")
            # Skip complex charts and calculations to prevent hanging
            if PLOTLY_AVAILABLE:
                st.info("π Charts available - check database viewer for detailed analysis")
            else:
                st.info("π Database contains all collected data for analysis")
    except Exception as e:
        st.info("β Data collection was successful - check database viewer below")
# Simplified Database viewer — recent rows only, with guaranteed release of
# the SQLite connection (previously it leaked whenever a query raised).
with st.expander("ποΈ Database Viewer"):
    try:
        conn = sqlite3.connect(DB_PATH)
        try:
            # Cheap count first so an empty database short-circuits
            cursor = conn.cursor()
            cursor.execute('SELECT COUNT(*) FROM harvested_data')
            total_count = cursor.fetchone()[0]
            if total_count > 0:
                st.success(f"π Database contains {total_count} records")
                # Show only recent 10 records to prevent overload
                df = pd.read_sql_query('''
                    SELECT
                        api_name as "API",
                        record_count as "Records",
                        status as "Status",
                        SUBSTR(fetch_timestamp, 1, 16) as "Time"
                    FROM harvested_data
                    ORDER BY fetch_timestamp DESC
                    LIMIT 10
                ''', conn)
                st.dataframe(df, use_container_width=True)
                if total_count > 10:
                    st.info(f"Showing latest 10 records. Total: {total_count} records in database.")
            else:
                st.info("No data in database yet. Run the data collection first!")
        finally:
            conn.close()
    except Exception:
        st.warning("Database viewer temporarily unavailable")
# Footer — static HTML card; requires unsafe_allow_html for the inline styles.
st.markdown("---")
st.markdown("""
<div style="text-align: center; padding: 1rem; background: rgba(255,255,255,0.8); border-radius: 10px; color: #2c3e50;">
    <p><strong>π Optimized Data Harvester</strong> - Research-verified APIs with smart storage</p>
    <p style="font-size: 0.9rem; color: #7f8c8d;">
        β 10 Tested APIs β’ ποΈ Data compression β’ π Performance metrics β’ π Enhanced analytics
    </p>
    <p style="font-size: 0.8rem; color: #95a5a6;">
        Featuring proper data extraction, optimized database with indexing, and compression for efficient storage
    </p>
</div>
""", unsafe_allow_html=True)