api-data-fetcher / app_modern.py
isakskogstad's picture
Upload app_modern.py with huggingface_hub
526873c verified
import streamlit as st
import requests
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import time
from datetime import datetime, timedelta
import asyncio
import aiohttp
from typing import Dict, Any, List, Optional
import sqlite3
import hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed
import xml.etree.ElementTree as ET
# Modern Page Configuration
st.set_page_config(
page_title="Global Data Harvester",
page_icon="🌍",
layout="wide",
initial_sidebar_state="collapsed"
)
# Modern CSS Styling
st.markdown("""
<style>
.main > div {
padding-top: 2rem;
}
.stApp {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
}
.metric-card {
background: rgba(255, 255, 255, 0.1);
backdrop-filter: blur(10px);
border-radius: 15px;
padding: 1.5rem;
margin: 0.5rem 0;
border: 1px solid rgba(255, 255, 255, 0.2);
box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.37);
}
.api-card {
background: rgba(255, 255, 255, 0.15);
backdrop-filter: blur(10px);
border-radius: 12px;
padding: 1rem;
margin: 0.5rem;
border: 1px solid rgba(255, 255, 255, 0.3);
transition: all 0.3s ease;
}
.api-card:hover {
transform: translateY(-5px);
box-shadow: 0 12px 40px 0 rgba(31, 38, 135, 0.5);
}
.title-container {
text-align: center;
padding: 2rem 0;
background: rgba(255, 255, 255, 0.1);
border-radius: 20px;
margin-bottom: 2rem;
backdrop-filter: blur(15px);
}
.progress-ring {
transform: rotate(-90deg);
}
.progress-ring-circle {
transition: stroke-dashoffset 0.35s;
transform: rotate(-90deg);
transform-origin: 50% 50%;
}
.status-indicator {
width: 12px;
height: 12px;
border-radius: 50%;
display: inline-block;
margin-right: 8px;
}
.status-active { background-color: #4ade80; }
.status-warning { background-color: #fbbf24; }
.status-error { background-color: #ef4444; }
</style>
""", unsafe_allow_html=True)
# Enhanced API Configuration with correct endpoints based on research
API_CONFIG = {
"Skolverket": {
"name": "πŸ‡ΈπŸ‡ͺ Skolverket",
"description": "Swedish National Agency for Education",
"base_url": "https://api.skolverket.se",
"endpoints": {
"planned_educations": {
"url": "/planned-educations/v3/compact-school-units",
"method": "GET",
"headers": {"accept": "application/vnd.skolverket.plannededucations.api.v3.hal+json"},
"params": {"coordinateSystemType": "WGS84", "page": 0, "size": 20}
},
"school_register": {
"url": "/skolenhetsregister/v2/skolenhet",
"method": "GET",
"headers": {"accept": "application/json"},
"params": {}
},
"syllabus": {
"url": "/syllabus/v1/studievag",
"method": "GET",
"headers": {"accept": "application/json"},
"params": {"studievagstyp": "GY"}
}
},
"rate_limit": None,
"auth": None
},
"SCB": {
"name": "πŸ‡ΈπŸ‡ͺ Statistics Sweden",
"description": "Swedish National Statistics Office",
"base_url": "https://api.scb.se",
"endpoints": {
"population": {
"url": "/OV0104/v1/doris/sv/ssd/BE/BE0101/BE0101A/BefolkningNy",
"method": "POST",
"headers": {"accept": "application/json"},
"data": {"query": [], "response": {"format": "json"}}
}
},
"rate_limit": {"requests": 10, "per_seconds": 10},
"auth": None
},
"Kolada": {
"name": "πŸ‡ΈπŸ‡ͺ Kolada",
"description": "Municipal Key Performance Indicators",
"base_url": "https://api.kolada.se",
"endpoints": {
"municipalities": {
"url": "/v2/municipality",
"method": "GET",
"headers": {"accept": "application/json"},
"params": {}
},
"kpi_data": {
"url": "/v2/data/kpi/N00945",
"method": "GET",
"headers": {"accept": "application/json"},
"params": {"municipality": "1860", "year": "2023"}
}
},
"rate_limit": None,
"auth": None
},
"Eurostat": {
"name": "πŸ‡ͺπŸ‡Ί Eurostat",
"description": "European Union Statistics",
"base_url": "https://ec.europa.eu/eurostat",
"endpoints": {
"education": {
"url": "/api/dissemination/statistics/1.0/data/educ_uoe_enra21",
"method": "GET",
"headers": {"accept": "application/json"},
"params": {"format": "JSON", "lang": "EN"}
},
"population": {
"url": "/api/dissemination/statistics/1.0/data/demo_pjan",
"method": "GET",
"headers": {"accept": "application/json"},
"params": {"format": "JSON", "lang": "EN"}
}
},
"rate_limit": None,
"auth": None
},
"WHO": {
"name": "🌍 WHO",
"description": "World Health Organization",
"base_url": "https://ghoapi.azureedge.net",
"endpoints": {
"indicators": {
"url": "/api/Indicator",
"method": "GET",
"headers": {"accept": "application/json"},
"params": {"$top": "10"}
},
"life_expectancy": {
"url": "/api/GHO/WHOSIS_000001",
"method": "GET",
"headers": {"accept": "application/json"},
"params": {}
}
},
"rate_limit": None,
"auth": None
},
"OECD": {
"name": "🌍 OECD",
"description": "Organisation for Economic Co-operation and Development",
"base_url": "https://sdmx.oecd.org",
"endpoints": {
"gdp_data": {
"url": "/public/rest/data/OECD.SDD.NAD,DSD_NAMAIN1@NAAG,1.0/.A.GDP_V_CAP.",
"method": "GET",
"headers": {"accept": "application/vnd.sdmx.data+json;version=1.0.0"},
"params": {"startPeriod": "2020"}
}
},
"rate_limit": None,
"auth": None
},
"WorldBank": {
"name": "🌍 World Bank",
"description": "International Financial Institution",
"base_url": "https://api.worldbank.org",
"endpoints": {
"countries": {
"url": "/v2/country",
"method": "GET",
"headers": {"accept": "application/json"},
"params": {"format": "json", "per_page": "20"}
},
"gdp_indicator": {
"url": "/v2/country/all/indicator/NY.GDP.MKTP.CD",
"method": "GET",
"headers": {"accept": "application/json"},
"params": {"format": "json", "date": "2023"}
}
},
"rate_limit": None,
"auth": None
},
"Riksbanken": {
"name": "πŸ‡ΈπŸ‡ͺ Riksbanken",
"description": "Swedish Central Bank",
"base_url": "https://api.riksbank.se",
"endpoints": {
"eur_sek": {
"url": "/swea/v1/Observations/SEKEURPMI/2024-01-01/2024-12-31",
"method": "GET",
"headers": {"accept": "application/json"},
"params": {}
}
},
"rate_limit": None,
"auth": None
},
"Swecris": {
"name": "πŸ‡ΈπŸ‡ͺ Swecris",
"description": "Swedish Research Council Database",
"base_url": "https://swecris-api.vr.se",
"endpoints": {
"projects": {
"url": "/v1/projects",
"method": "GET",
"headers": {"accept": "application/json", "Authorization": "Bearer VRSwecrisAPI2025-1"},
"params": {"funderId": "VR"}
}
},
"rate_limit": None,
"auth": {"type": "Bearer", "token": "VRSwecrisAPI2025-1"}
},
"CSN": {
"name": "πŸ‡ΈπŸ‡ͺ CSN",
"description": "Swedish Board of Student Finance",
"base_url": "https://statistik.csn.se",
"endpoints": {
"student_aid": {
"url": "/PXWeb/api/v1/sv/CSNstat/Studiestod/Studiemedel/Hogskola/SS0101B1.px",
"method": "POST",
"headers": {"accept": "application/json"},
"data": {"query": [], "response": {"format": "json"}}
}
},
"rate_limit": None,
"auth": None
}
}
# Database setup
DB_PATH = "global_data_harvester.db"
def init_database():
"""Initialize modern SQLite database"""
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute('''
CREATE TABLE IF NOT EXISTS api_data (
id INTEGER PRIMARY KEY AUTOINCREMENT,
api_name TEXT NOT NULL,
endpoint_name TEXT NOT NULL,
data_hash TEXT UNIQUE,
raw_data TEXT,
record_count INTEGER,
fetch_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
status TEXT DEFAULT 'success'
)
''')
cursor.execute('''
CREATE TABLE IF NOT EXISTS fetch_sessions (
id INTEGER PRIMARY KEY AUTOINCREMENT,
session_name TEXT NOT NULL,
started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
completed_at TIMESTAMP,
total_apis INTEGER,
successful_fetches INTEGER,
failed_fetches INTEGER,
total_records INTEGER
)
''')
conn.commit()
conn.close()
class ModernDataFetcher:
"""Modern, clean data fetcher with proper error handling"""
def __init__(self):
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'Global-Data-Harvester/1.0 (Research & Education Purpose)'
})
init_database()
def fetch_api_data(self, api_name: str, endpoint_name: str) -> Dict[str, Any]:
"""Fetch data from a specific API endpoint"""
try:
config = API_CONFIG[api_name]
endpoint = config["endpoints"][endpoint_name]
# Apply rate limiting
if config.get("rate_limit"):
rate_limit = config["rate_limit"]
time.sleep(rate_limit["per_seconds"] / rate_limit["requests"])
# Prepare request
url = config["base_url"] + endpoint["url"]
headers = endpoint.get("headers", {})
# Make request based on method
if endpoint["method"] == "POST":
response = self.session.post(
url,
json=endpoint.get("data", {}),
headers=headers,
timeout=30
)
else:
params = endpoint.get("params", {})
response = self.session.get(url, params=params, headers=headers, timeout=30)
response.raise_for_status()
# Parse response
try:
data = response.json()
except:
data = {"raw_response": response.text}
# Process and clean data
processed_data = self._process_response(api_name, data)
# Save to database
self._save_to_database(api_name, endpoint_name, processed_data)
return {
"status": "success",
"data": processed_data,
"record_count": len(processed_data) if isinstance(processed_data, list) else 1,
"timestamp": datetime.now().isoformat()
}
except Exception as e:
return {
"status": "error",
"error": str(e),
"timestamp": datetime.now().isoformat()
}
def _process_response(self, api_name: str, data: Any) -> Any:
"""Process API response based on known structures"""
if api_name == "Skolverket":
if isinstance(data, dict) and "_embedded" in data:
for key, value in data["_embedded"].items():
if isinstance(value, list):
return value
return data
elif api_name == "SCB":
if isinstance(data, dict):
return data.get("data", data)
elif api_name == "Kolada":
if isinstance(data, dict):
return data.get("values", data)
elif api_name == "Eurostat":
if isinstance(data, dict):
return data.get("value", data)
elif api_name == "WHO":
if isinstance(data, dict):
return data.get("value", data)
elif api_name == "WorldBank":
if isinstance(data, list) and len(data) > 1:
return data[1] if data[1] else data[0]
return data
elif api_name == "Riksbanken":
if isinstance(data, dict):
return data.get("observations", data)
elif api_name == "Swecris":
if isinstance(data, dict):
return data.get("items", data)
elif api_name == "CSN":
if isinstance(data, dict):
return data.get("data", data)
return data
def _save_to_database(self, api_name: str, endpoint_name: str, data: Any):
"""Save data to database with deduplication"""
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# Create hash for deduplication
data_str = json.dumps(data, sort_keys=True, default=str)
data_hash = hashlib.sha256(data_str.encode()).hexdigest()
record_count = len(data) if isinstance(data, list) else 1
try:
cursor.execute('''
INSERT OR IGNORE INTO api_data
(api_name, endpoint_name, data_hash, raw_data, record_count)
VALUES (?, ?, ?, ?, ?)
''', (api_name, endpoint_name, data_hash, data_str, record_count))
conn.commit()
except sqlite3.IntegrityError:
pass # Data already exists
finally:
conn.close()
# Initialize session state
if 'fetcher' not in st.session_state:
st.session_state.fetcher = ModernDataFetcher()
if 'fetch_results' not in st.session_state:
st.session_state.fetch_results = {}
if 'active_apis' not in st.session_state:
st.session_state.active_apis = set()
# Modern Header
st.markdown("""
<div class="title-container">
<h1 style="font-size: 3rem; margin: 0; background: linear-gradient(45deg, #fff, #f0f0f0); -webkit-background-clip: text; -webkit-text-fill-color: transparent;">
🌍 Global Data Harvester
</h1>
<p style="font-size: 1.2rem; margin: 0.5rem 0 0 0; opacity: 0.9;">
Intelligent data collection from 10 international sources
</p>
<div style="margin-top: 1rem;">
<span style="background: rgba(255,255,255,0.2); padding: 0.3rem 0.8rem; border-radius: 15px; margin: 0 0.3rem; font-size: 0.9rem;">Real-time</span>
<span style="background: rgba(255,255,255,0.2); padding: 0.3rem 0.8rem; border-radius: 15px; margin: 0 0.3rem; font-size: 0.9rem;">Auto-discovery</span>
<span style="background: rgba(255,255,255,0.2); padding: 0.3rem 0.8rem; border-radius: 15px; margin: 0 0.3rem; font-size: 0.9rem;">Parallel processing</span>
</div>
</div>
""", unsafe_allow_html=True)
# Main dashboard layout
col1, col2 = st.columns([2, 1])
with col1:
st.markdown("### 🎯 API Sources")
# Create API grid
api_cols = st.columns(2)
for i, (api_key, api_config) in enumerate(API_CONFIG.items()):
col = api_cols[i % 2]
with col:
# Get status
last_fetch = st.session_state.fetch_results.get(api_key, {})
status = last_fetch.get("status", "inactive")
status_color = {
"success": "status-active",
"error": "status-error",
"inactive": "status-warning"
}.get(status, "status-warning")
# Create API card
st.markdown(f"""
<div class="api-card">
<div style="display: flex; align-items: center; margin-bottom: 0.5rem;">
<span class="status-indicator {status_color}"></span>
<strong>{api_config['name']}</strong>
</div>
<p style="margin: 0; font-size: 0.9rem; opacity: 0.8;">{api_config['description']}</p>
<p style="margin: 0.5rem 0 0 0; font-size: 0.8rem;">
{len(api_config['endpoints'])} endpoints available
</p>
</div>
""", unsafe_allow_html=True)
# Fetch button for each API
if st.button(f"Fetch {api_key}", key=f"fetch_{api_key}", use_container_width=True):
with st.spinner(f"Fetching data from {api_key}..."):
results = {}
for endpoint_name in api_config["endpoints"].keys():
result = st.session_state.fetcher.fetch_api_data(api_key, endpoint_name)
results[endpoint_name] = result
st.session_state.fetch_results[api_key] = results
st.session_state.active_apis.add(api_key)
st.rerun()
with col2:
st.markdown("### πŸ“Š Live Metrics")
# Calculate metrics
total_apis = len(API_CONFIG)
active_apis = len(st.session_state.active_apis)
total_successful = sum(
1 for api_results in st.session_state.fetch_results.values()
for result in api_results.values()
if result.get("status") == "success"
)
total_records = sum(
result.get("record_count", 0)
for api_results in st.session_state.fetch_results.values()
for result in api_results.values()
if result.get("status") == "success"
)
# Display metrics with modern cards
metrics = [
("🎯 Active APIs", f"{active_apis}/{total_apis}"),
("βœ… Successful", total_successful),
("πŸ“Š Total Records", f"{total_records:,}"),
("πŸ•’ Last Update", datetime.now().strftime("%H:%M:%S"))
]
for label, value in metrics:
st.markdown(f"""
<div class="metric-card">
<div style="font-size: 0.9rem; opacity: 0.8; margin-bottom: 0.3rem;">{label}</div>
<div style="font-size: 1.5rem; font-weight: bold;">{value}</div>
</div>
""", unsafe_allow_html=True)
# Bulk operations
st.markdown("### πŸš€ Bulk Operations")
if st.button("🌍 Fetch All APIs", use_container_width=True, type="primary"):
progress_bar = st.progress(0)
status_text = st.empty()
total_operations = sum(len(config["endpoints"]) for config in API_CONFIG.values())
completed = 0
for api_key, api_config in API_CONFIG.items():
status_text.text(f"Processing {api_key}...")
results = {}
for endpoint_name in api_config["endpoints"].keys():
result = st.session_state.fetcher.fetch_api_data(api_key, endpoint_name)
results[endpoint_name] = result
completed += 1
progress_bar.progress(completed / total_operations)
st.session_state.fetch_results[api_key] = results
st.session_state.active_apis.add(api_key)
status_text.text("βœ… All APIs processed!")
time.sleep(1)
st.rerun()
# Results section
if st.session_state.fetch_results:
st.markdown("### πŸ“ˆ Latest Results")
# Create tabs for different views
tab1, tab2, tab3 = st.tabs(["πŸ“Š Data Preview", "πŸ“‹ Status Overview", "πŸ’Ύ Export"])
with tab1:
selected_api = st.selectbox("Select API for preview", list(st.session_state.fetch_results.keys()))
if selected_api:
api_results = st.session_state.fetch_results[selected_api]
for endpoint_name, result in api_results.items():
if result.get("status") == "success":
st.markdown(f"**{selected_api} - {endpoint_name}**")
data = result.get("data", [])
if isinstance(data, list) and len(data) > 0:
# Create DataFrame and show preview
df = pd.json_normalize(data[:10]) # First 10 records
st.dataframe(df, use_container_width=True)
st.caption(f"Showing 10 of {len(data)} records")
else:
st.json(data)
with tab2:
# Status overview with charts
status_data = []
for api_name, api_results in st.session_state.fetch_results.items():
for endpoint_name, result in api_results.items():
status_data.append({
"API": api_name,
"Endpoint": endpoint_name,
"Status": result.get("status", "unknown"),
"Records": result.get("record_count", 0),
"Timestamp": result.get("timestamp", "")
})
if status_data:
df_status = pd.DataFrame(status_data)
# Success rate chart
fig = px.pie(
df_status,
names="Status",
title="Fetch Success Rate",
color_discrete_map={"success": "#4ade80", "error": "#ef4444"}
)
fig.update_layout(
paper_bgcolor="rgba(0,0,0,0)",
plot_bgcolor="rgba(0,0,0,0)",
font_color="white"
)
st.plotly_chart(fig, use_container_width=True)
# Records by API
records_by_api = df_status[df_status["Status"] == "success"].groupby("API")["Records"].sum().reset_index()
if not records_by_api.empty:
fig2 = px.bar(
records_by_api,
x="API",
y="Records",
title="Records Fetched by API"
)
fig2.update_layout(
paper_bgcolor="rgba(0,0,0,0)",
plot_bgcolor="rgba(0,0,0,0)",
font_color="white"
)
st.plotly_chart(fig2, use_container_width=True)
with tab3:
st.markdown("**Export Options**")
# Database statistics
conn = sqlite3.connect(DB_PATH)
total_records_db = pd.read_sql_query("SELECT COUNT(*) as count FROM api_data", conn).iloc[0]["count"]
conn.close()
st.metric("Database Records", total_records_db)
if st.button("πŸ“„ Export All Data (JSON)", use_container_width=True):
export_data = {
"export_timestamp": datetime.now().isoformat(),
"session_results": st.session_state.fetch_results,
"summary": {
"total_apis": len(st.session_state.fetch_results),
"total_records": total_records,
"active_apis": list(st.session_state.active_apis)
}
}
st.download_button(
"πŸ’Ύ Download Export",
data=json.dumps(export_data, indent=2, default=str),
file_name=f"global_data_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
mime="application/json",
use_container_width=True
)
# Footer
st.markdown("---")
st.markdown("""
<div style="text-align: center; padding: 1rem; opacity: 0.8;">
<p><strong>🌍 Global Data Harvester</strong> - Modern data collection from international sources</p>
<p style="font-size: 0.9rem;">
πŸ‡ΈπŸ‡ͺ Swedish: Skolverket, SCB, Kolada, Riksbanken, Swecris, CSN β€’
🌍 Global: Eurostat, WHO, OECD, World Bank
</p>
</div>
""", unsafe_allow_html=True)