Spaces:

isakskogstad
/

api-data-fetcher

Sleeping

App Files Files Community

api-data-fetcher / app_modern.py

isakskogstad

Upload app_modern.py with huggingface_hub

526873c verified 9 months ago

raw

history blame contribute delete

25 kB

	import streamlit as st
	import requests
	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	from plotly.subplots import make_subplots
	import json
	import time
	from datetime import datetime, timedelta
	import asyncio
	import aiohttp
	from typing import Dict, Any, List, Optional
	import sqlite3
	import hashlib
	from concurrent.futures import ThreadPoolExecutor, as_completed
	import xml.etree.ElementTree as ET

	# Modern Page Configuration
	st.set_page_config(
	page_title="Global Data Harvester",
	page_icon="🌍",
	layout="wide",
	initial_sidebar_state="collapsed"
	)

	# Modern CSS Styling
	st.markdown("""
	<style>
	.main > div {
	padding-top: 2rem;
	}
	.stApp {
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
	color: white;
	}
	.metric-card {
	background: rgba(255, 255, 255, 0.1);
	backdrop-filter: blur(10px);
	border-radius: 15px;
	padding: 1.5rem;
	margin: 0.5rem 0;
	border: 1px solid rgba(255, 255, 255, 0.2);
	box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.37);
	}
	.api-card {
	background: rgba(255, 255, 255, 0.15);
	backdrop-filter: blur(10px);
	border-radius: 12px;
	padding: 1rem;
	margin: 0.5rem;
	border: 1px solid rgba(255, 255, 255, 0.3);
	transition: all 0.3s ease;
	}
	.api-card:hover {
	transform: translateY(-5px);
	box-shadow: 0 12px 40px 0 rgba(31, 38, 135, 0.5);
	}
	.title-container {
	text-align: center;
	padding: 2rem 0;
	background: rgba(255, 255, 255, 0.1);
	border-radius: 20px;
	margin-bottom: 2rem;
	backdrop-filter: blur(15px);
	}
	.progress-ring {
	transform: rotate(-90deg);
	}
	.progress-ring-circle {
	transition: stroke-dashoffset 0.35s;
	transform: rotate(-90deg);
	transform-origin: 50% 50%;
	}
	.status-indicator {
	width: 12px;
	height: 12px;
	border-radius: 50%;
	display: inline-block;
	margin-right: 8px;
	}
	.status-active { background-color: #4ade80; }
	.status-warning { background-color: #fbbf24; }
	.status-error { background-color: #ef4444; }
	</style>
	""", unsafe_allow_html=True)

	# Enhanced API Configuration with correct endpoints based on research
	API_CONFIG = {
	"Skolverket": {
	"name": "🇸🇪 Skolverket",
	"description": "Swedish National Agency for Education",
	"base_url": "https://api.skolverket.se",
	"endpoints": {
	"planned_educations": {
	"url": "/planned-educations/v3/compact-school-units",
	"method": "GET",
	"headers": {"accept": "application/vnd.skolverket.plannededucations.api.v3.hal+json"},
	"params": {"coordinateSystemType": "WGS84", "page": 0, "size": 20}
	},
	"school_register": {
	"url": "/skolenhetsregister/v2/skolenhet",
	"method": "GET",
	"headers": {"accept": "application/json"},
	"params": {}
	},
	"syllabus": {
	"url": "/syllabus/v1/studievag",
	"method": "GET",
	"headers": {"accept": "application/json"},
	"params": {"studievagstyp": "GY"}
	}
	},
	"rate_limit": None,
	"auth": None
	},
	"SCB": {
	"name": "🇸🇪 Statistics Sweden",
	"description": "Swedish National Statistics Office",
	"base_url": "https://api.scb.se",
	"endpoints": {
	"population": {
	"url": "/OV0104/v1/doris/sv/ssd/BE/BE0101/BE0101A/BefolkningNy",
	"method": "POST",
	"headers": {"accept": "application/json"},
	"data": {"query": [], "response": {"format": "json"}}
	}
	},
	"rate_limit": {"requests": 10, "per_seconds": 10},
	"auth": None
	},
	"Kolada": {
	"name": "🇸🇪 Kolada",
	"description": "Municipal Key Performance Indicators",
	"base_url": "https://api.kolada.se",
	"endpoints": {
	"municipalities": {
	"url": "/v2/municipality",
	"method": "GET",
	"headers": {"accept": "application/json"},
	"params": {}
	},
	"kpi_data": {
	"url": "/v2/data/kpi/N00945",
	"method": "GET",
	"headers": {"accept": "application/json"},
	"params": {"municipality": "1860", "year": "2023"}
	}
	},
	"rate_limit": None,
	"auth": None
	},
	"Eurostat": {
	"name": "🇪🇺 Eurostat",
	"description": "European Union Statistics",
	"base_url": "https://ec.europa.eu/eurostat",
	"endpoints": {
	"education": {
	"url": "/api/dissemination/statistics/1.0/data/educ_uoe_enra21",
	"method": "GET",
	"headers": {"accept": "application/json"},
	"params": {"format": "JSON", "lang": "EN"}
	},
	"population": {
	"url": "/api/dissemination/statistics/1.0/data/demo_pjan",
	"method": "GET",
	"headers": {"accept": "application/json"},
	"params": {"format": "JSON", "lang": "EN"}
	}
	},
	"rate_limit": None,
	"auth": None
	},
	"WHO": {
	"name": "🌍 WHO",
	"description": "World Health Organization",
	"base_url": "https://ghoapi.azureedge.net",
	"endpoints": {
	"indicators": {
	"url": "/api/Indicator",
	"method": "GET",
	"headers": {"accept": "application/json"},
	"params": {"$top": "10"}
	},
	"life_expectancy": {
	"url": "/api/GHO/WHOSIS_000001",
	"method": "GET",
	"headers": {"accept": "application/json"},
	"params": {}
	}
	},
	"rate_limit": None,
	"auth": None
	},
	"OECD": {
	"name": "🌍 OECD",
	"description": "Organisation for Economic Co-operation and Development",
	"base_url": "https://sdmx.oecd.org",
	"endpoints": {
	"gdp_data": {
	"url": "/public/rest/data/OECD.SDD.NAD,DSD_NAMAIN1@NAAG,1.0/.A.GDP_V_CAP.",
	"method": "GET",
	"headers": {"accept": "application/vnd.sdmx.data+json;version=1.0.0"},
	"params": {"startPeriod": "2020"}
	}
	},
	"rate_limit": None,
	"auth": None
	},
	"WorldBank": {
	"name": "🌍 World Bank",
	"description": "International Financial Institution",
	"base_url": "https://api.worldbank.org",
	"endpoints": {
	"countries": {
	"url": "/v2/country",
	"method": "GET",
	"headers": {"accept": "application/json"},
	"params": {"format": "json", "per_page": "20"}
	},
	"gdp_indicator": {
	"url": "/v2/country/all/indicator/NY.GDP.MKTP.CD",
	"method": "GET",
	"headers": {"accept": "application/json"},
	"params": {"format": "json", "date": "2023"}
	}
	},
	"rate_limit": None,
	"auth": None
	},
	"Riksbanken": {
	"name": "🇸🇪 Riksbanken",
	"description": "Swedish Central Bank",
	"base_url": "https://api.riksbank.se",
	"endpoints": {
	"eur_sek": {
	"url": "/swea/v1/Observations/SEKEURPMI/2024-01-01/2024-12-31",
	"method": "GET",
	"headers": {"accept": "application/json"},
	"params": {}
	}
	},
	"rate_limit": None,
	"auth": None
	},
	"Swecris": {
	"name": "🇸🇪 Swecris",
	"description": "Swedish Research Council Database",
	"base_url": "https://swecris-api.vr.se",
	"endpoints": {
	"projects": {
	"url": "/v1/projects",
	"method": "GET",
	"headers": {"accept": "application/json", "Authorization": "Bearer VRSwecrisAPI2025-1"},
	"params": {"funderId": "VR"}
	}
	},
	"rate_limit": None,
	"auth": {"type": "Bearer", "token": "VRSwecrisAPI2025-1"}
	},
	"CSN": {
	"name": "🇸🇪 CSN",
	"description": "Swedish Board of Student Finance",
	"base_url": "https://statistik.csn.se",
	"endpoints": {
	"student_aid": {
	"url": "/PXWeb/api/v1/sv/CSNstat/Studiestod/Studiemedel/Hogskola/SS0101B1.px",
	"method": "POST",
	"headers": {"accept": "application/json"},
	"data": {"query": [], "response": {"format": "json"}}
	}
	},
	"rate_limit": None,
	"auth": None
	}
	}

	# Database setup
	DB_PATH = "global_data_harvester.db"

	def init_database():
	"""Initialize modern SQLite database"""
	conn = sqlite3.connect(DB_PATH)
	cursor = conn.cursor()

	cursor.execute('''
	CREATE TABLE IF NOT EXISTS api_data (
	id INTEGER PRIMARY KEY AUTOINCREMENT,
	api_name TEXT NOT NULL,
	endpoint_name TEXT NOT NULL,
	data_hash TEXT UNIQUE,
	raw_data TEXT,
	record_count INTEGER,
	fetch_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
	status TEXT DEFAULT 'success'
	)
	''')

	cursor.execute('''
	CREATE TABLE IF NOT EXISTS fetch_sessions (
	id INTEGER PRIMARY KEY AUTOINCREMENT,
	session_name TEXT NOT NULL,
	started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
	completed_at TIMESTAMP,
	total_apis INTEGER,
	successful_fetches INTEGER,
	failed_fetches INTEGER,
	total_records INTEGER
	)
	''')

	conn.commit()
	conn.close()

	class ModernDataFetcher:
	"""Modern, clean data fetcher with proper error handling"""

	def __init__(self):
	self.session = requests.Session()
	self.session.headers.update({
	'User-Agent': 'Global-Data-Harvester/1.0 (Research & Education Purpose)'
	})
	init_database()

	def fetch_api_data(self, api_name: str, endpoint_name: str) -> Dict[str, Any]:
	"""Fetch data from a specific API endpoint"""
	try:
	config = API_CONFIG[api_name]
	endpoint = config["endpoints"][endpoint_name]

	# Apply rate limiting
	if config.get("rate_limit"):
	rate_limit = config["rate_limit"]
	time.sleep(rate_limit["per_seconds"] / rate_limit["requests"])

	# Prepare request
	url = config["base_url"] + endpoint["url"]
	headers = endpoint.get("headers", {})

	# Make request based on method
	if endpoint["method"] == "POST":
	response = self.session.post(
	url,
	json=endpoint.get("data", {}),
	headers=headers,
	timeout=30
	)
	else:
	params = endpoint.get("params", {})
	response = self.session.get(url, params=params, headers=headers, timeout=30)

	response.raise_for_status()

	# Parse response
	try:
	data = response.json()
	except:
	data = {"raw_response": response.text}

	# Process and clean data
	processed_data = self._process_response(api_name, data)

	# Save to database
	self._save_to_database(api_name, endpoint_name, processed_data)

	return {
	"status": "success",
	"data": processed_data,
	"record_count": len(processed_data) if isinstance(processed_data, list) else 1,
	"timestamp": datetime.now().isoformat()
	}

	except Exception as e:
	return {
	"status": "error",
	"error": str(e),
	"timestamp": datetime.now().isoformat()
	}

	def _process_response(self, api_name: str, data: Any) -> Any:
	"""Process API response based on known structures"""
	if api_name == "Skolverket":
	if isinstance(data, dict) and "_embedded" in data:
	for key, value in data["_embedded"].items():
	if isinstance(value, list):
	return value
	return data

	elif api_name == "SCB":
	if isinstance(data, dict):
	return data.get("data", data)

	elif api_name == "Kolada":
	if isinstance(data, dict):
	return data.get("values", data)

	elif api_name == "Eurostat":
	if isinstance(data, dict):
	return data.get("value", data)

	elif api_name == "WHO":
	if isinstance(data, dict):
	return data.get("value", data)

	elif api_name == "WorldBank":
	if isinstance(data, list) and len(data) > 1:
	return data[1] if data[1] else data[0]
	return data

	elif api_name == "Riksbanken":
	if isinstance(data, dict):
	return data.get("observations", data)

	elif api_name == "Swecris":
	if isinstance(data, dict):
	return data.get("items", data)

	elif api_name == "CSN":
	if isinstance(data, dict):
	return data.get("data", data)

	return data

	def _save_to_database(self, api_name: str, endpoint_name: str, data: Any):
	"""Save data to database with deduplication"""
	conn = sqlite3.connect(DB_PATH)
	cursor = conn.cursor()

	# Create hash for deduplication
	data_str = json.dumps(data, sort_keys=True, default=str)
	data_hash = hashlib.sha256(data_str.encode()).hexdigest()

	record_count = len(data) if isinstance(data, list) else 1

	try:
	cursor.execute('''
	INSERT OR IGNORE INTO api_data
	(api_name, endpoint_name, data_hash, raw_data, record_count)
	VALUES (?, ?, ?, ?, ?)
	''', (api_name, endpoint_name, data_hash, data_str, record_count))

	conn.commit()
	except sqlite3.IntegrityError:
	pass # Data already exists
	finally:
	conn.close()

	# Initialize session state
	if 'fetcher' not in st.session_state:
	st.session_state.fetcher = ModernDataFetcher()

	if 'fetch_results' not in st.session_state:
	st.session_state.fetch_results = {}

	if 'active_apis' not in st.session_state:
	st.session_state.active_apis = set()

	# Modern Header
	st.markdown("""
	<div class="title-container">
	<h1 style="font-size: 3rem; margin: 0; background: linear-gradient(45deg, #fff, #f0f0f0); -webkit-background-clip: text; -webkit-text-fill-color: transparent;">
	🌍 Global Data Harvester
	</h1>
	<p style="font-size: 1.2rem; margin: 0.5rem 0 0 0; opacity: 0.9;">
	Intelligent data collection from 10 international sources
	</p>
	<div style="margin-top: 1rem;">
	<span style="background: rgba(255,255,255,0.2); padding: 0.3rem 0.8rem; border-radius: 15px; margin: 0 0.3rem; font-size: 0.9rem;">Real-time</span>
	<span style="background: rgba(255,255,255,0.2); padding: 0.3rem 0.8rem; border-radius: 15px; margin: 0 0.3rem; font-size: 0.9rem;">Auto-discovery</span>
	<span style="background: rgba(255,255,255,0.2); padding: 0.3rem 0.8rem; border-radius: 15px; margin: 0 0.3rem; font-size: 0.9rem;">Parallel processing</span>
	</div>
	</div>
	""", unsafe_allow_html=True)

	# Main dashboard layout
	col1, col2 = st.columns([2, 1])

	with col1:
	st.markdown("### 🎯 API Sources")

	# Create API grid
	api_cols = st.columns(2)

	for i, (api_key, api_config) in enumerate(API_CONFIG.items()):
	col = api_cols[i % 2]

	with col:
	# Get status
	last_fetch = st.session_state.fetch_results.get(api_key, {})
	status = last_fetch.get("status", "inactive")

	status_color = {
	"success": "status-active",
	"error": "status-error",
	"inactive": "status-warning"
	}.get(status, "status-warning")

	# Create API card
	st.markdown(f"""
	<div class="api-card">
	<div style="display: flex; align-items: center; margin-bottom: 0.5rem;">
	<span class="status-indicator {status_color}"></span>
	<strong>{api_config['name']}</strong>
	</div>
	<p style="margin: 0; font-size: 0.9rem; opacity: 0.8;">{api_config['description']}</p>
	<p style="margin: 0.5rem 0 0 0; font-size: 0.8rem;">
	{len(api_config['endpoints'])} endpoints available
	</p>
	</div>
	""", unsafe_allow_html=True)

	# Fetch button for each API
	if st.button(f"Fetch {api_key}", key=f"fetch_{api_key}", use_container_width=True):
	with st.spinner(f"Fetching data from {api_key}..."):
	results = {}
	for endpoint_name in api_config["endpoints"].keys():
	result = st.session_state.fetcher.fetch_api_data(api_key, endpoint_name)
	results[endpoint_name] = result

	st.session_state.fetch_results[api_key] = results
	st.session_state.active_apis.add(api_key)
	st.rerun()

	with col2:
	st.markdown("### 📊 Live Metrics")

	# Calculate metrics
	total_apis = len(API_CONFIG)
	active_apis = len(st.session_state.active_apis)
	total_successful = sum(
	1 for api_results in st.session_state.fetch_results.values()
	for result in api_results.values()
	if result.get("status") == "success"
	)
	total_records = sum(
	result.get("record_count", 0)
	for api_results in st.session_state.fetch_results.values()
	for result in api_results.values()
	if result.get("status") == "success"
	)

	# Display metrics with modern cards
	metrics = [
	("🎯 Active APIs", f"{active_apis}/{total_apis}"),
	("✅ Successful", total_successful),
	("📊 Total Records", f"{total_records:,}"),
	("🕒 Last Update", datetime.now().strftime("%H:%M:%S"))
	]

	for label, value in metrics:
	st.markdown(f"""
	<div class="metric-card">
	<div style="font-size: 0.9rem; opacity: 0.8; margin-bottom: 0.3rem;">{label}</div>
	<div style="font-size: 1.5rem; font-weight: bold;">{value}</div>
	</div>
	""", unsafe_allow_html=True)

	# Bulk operations
	st.markdown("### 🚀 Bulk Operations")

	if st.button("🌍 Fetch All APIs", use_container_width=True, type="primary"):
	progress_bar = st.progress(0)
	status_text = st.empty()

	total_operations = sum(len(config["endpoints"]) for config in API_CONFIG.values())
	completed = 0

	for api_key, api_config in API_CONFIG.items():
	status_text.text(f"Processing {api_key}...")
	results = {}

	for endpoint_name in api_config["endpoints"].keys():
	result = st.session_state.fetcher.fetch_api_data(api_key, endpoint_name)
	results[endpoint_name] = result
	completed += 1
	progress_bar.progress(completed / total_operations)

	st.session_state.fetch_results[api_key] = results
	st.session_state.active_apis.add(api_key)

	status_text.text("✅ All APIs processed!")
	time.sleep(1)
	st.rerun()

	# Results section
	if st.session_state.fetch_results:
	st.markdown("### 📈 Latest Results")

	# Create tabs for different views
	tab1, tab2, tab3 = st.tabs(["📊 Data Preview", "📋 Status Overview", "💾 Export"])

	with tab1:
	selected_api = st.selectbox("Select API for preview", list(st.session_state.fetch_results.keys()))

	if selected_api:
	api_results = st.session_state.fetch_results[selected_api]

	for endpoint_name, result in api_results.items():
	if result.get("status") == "success":
	st.markdown(f"{selected_api} - {endpoint_name}")

	data = result.get("data", [])
	if isinstance(data, list) and len(data) > 0:
	# Create DataFrame and show preview
	df = pd.json_normalize(data[:10]) # First 10 records
	st.dataframe(df, use_container_width=True)
	st.caption(f"Showing 10 of {len(data)} records")
	else:
	st.json(data)

	with tab2:
	# Status overview with charts
	status_data = []
	for api_name, api_results in st.session_state.fetch_results.items():
	for endpoint_name, result in api_results.items():
	status_data.append({
	"API": api_name,
	"Endpoint": endpoint_name,
	"Status": result.get("status", "unknown"),
	"Records": result.get("record_count", 0),
	"Timestamp": result.get("timestamp", "")
	})

	if status_data:
	df_status = pd.DataFrame(status_data)

	# Success rate chart
	fig = px.pie(
	df_status,
	names="Status",
	title="Fetch Success Rate",
	color_discrete_map={"success": "#4ade80", "error": "#ef4444"}
	)
	fig.update_layout(
	paper_bgcolor="rgba(0,0,0,0)",
	plot_bgcolor="rgba(0,0,0,0)",
	font_color="white"
	)
	st.plotly_chart(fig, use_container_width=True)

	# Records by API
	records_by_api = df_status[df_status["Status"] == "success"].groupby("API")["Records"].sum().reset_index()
	if not records_by_api.empty:
	fig2 = px.bar(
	records_by_api,
	x="API",
	y="Records",
	title="Records Fetched by API"
	)
	fig2.update_layout(
	paper_bgcolor="rgba(0,0,0,0)",
	plot_bgcolor="rgba(0,0,0,0)",
	font_color="white"
	)
	st.plotly_chart(fig2, use_container_width=True)

	with tab3:
	st.markdown("Export Options")

	# Database statistics
	conn = sqlite3.connect(DB_PATH)
	total_records_db = pd.read_sql_query("SELECT COUNT(*) as count FROM api_data", conn).iloc[0]["count"]
	conn.close()

	st.metric("Database Records", total_records_db)

	if st.button("📄 Export All Data (JSON)", use_container_width=True):
	export_data = {
	"export_timestamp": datetime.now().isoformat(),
	"session_results": st.session_state.fetch_results,
	"summary": {
	"total_apis": len(st.session_state.fetch_results),
	"total_records": total_records,
	"active_apis": list(st.session_state.active_apis)
	}
	}

	st.download_button(
	"💾 Download Export",
	data=json.dumps(export_data, indent=2, default=str),
	file_name=f"global_data_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
	mime="application/json",
	use_container_width=True
	)

	# Footer
	st.markdown("---")
	st.markdown("""
	<div style="text-align: center; padding: 1rem; opacity: 0.8;">
	<p><strong>🌍 Global Data Harvester</strong> - Modern data collection from international sources</p>
	<p style="font-size: 0.9rem;">
	🇸🇪 Swedish: Skolverket, SCB, Kolada, Riksbanken, Swecris, CSN •
	🌍 Global: Eurostat, WHO, OECD, World Bank
	</p>
	</div>
	""", unsafe_allow_html=True)