Spaces:
Sleeping
Sleeping
File size: 32,422 Bytes
18b60c9 45efc9d 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 9b8d982 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 9b8d982 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 9b8d982 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 9b8d982 c55a4b7 18b60c9 c55a4b7 18b60c9 9b8d982 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 9b8d982 c55a4b7 9b8d982 c55a4b7 18b60c9 c55a4b7 18b60c9 9b8d982 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 1281e4d 18b60c9 1281e4d 18b60c9 1281e4d 18b60c9 1281e4d 18b60c9 1281e4d 18b60c9 1281e4d 18b60c9 1281e4d 18b60c9 45efc9d 1281e4d 45efc9d 1281e4d 45efc9d 18b60c9 45efc9d 1281e4d 45efc9d 1281e4d 45efc9d 1281e4d 45efc9d 1281e4d 45efc9d 1281e4d 18b60c9 45efc9d 18b60c9 c55a4b7 45efc9d 18b60c9 45efc9d c55a4b7 45efc9d c55a4b7 45efc9d c55a4b7 45efc9d c55a4b7 45efc9d 18b60c9 c55a4b7 18b60c9 45efc9d 18b60c9 c55a4b7 18b60c9 c55a4b7 18b60c9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 |
import streamlit as st
import requests
import pandas as pd
import json
import time
from datetime import datetime
import sqlite3
import hashlib
import gzip
from typing import Dict, Any, List
import warnings
warnings.filterwarnings('ignore')
# Optional plotly import with fallback
# Charting is a soft dependency: the app keeps running (with charts
# disabled) when plotly is missing, instead of crashing at import time.
PLOTLY_AVAILABLE = False
try:
    import plotly.express as px
    PLOTLY_AVAILABLE = True
except ImportError:
    st.warning("π Plotly not available - charts will be disabled")
    PLOTLY_AVAILABLE = False
# Global ML availability flag
ML_AVAILABLE = False
# AI/ML Imports for enhanced functionality
# Same soft-dependency pattern: ML_AVAILABLE only gates a status banner
# later in the UI; no code path below requires these libraries.
try:
    from transformers import pipeline
    from sentence_transformers import SentenceTransformer
    ML_AVAILABLE = True
except ImportError:
    ML_AVAILABLE = False
# Enhanced Page Configuration
# Must run before any other Streamlit call in the script.
st.set_page_config(
    page_title="Simplified Data Harvester",
    page_icon="π",
    layout="wide",
    initial_sidebar_state="collapsed"  # single-page UI; sidebar unused
)
# Enhanced CSS with modern, professional styling
# NOTE: injected as raw HTML; the CSS targets only Streamlit's own
# containers and interpolates no user data, so unsafe_allow_html is safe here.
st.markdown("""
<style>
.main > div {
padding-top: 1rem;
}
.stApp {
background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
color: #2c3e50;
}
.title-container {
text-align: center;
padding: 2rem 0;
background: rgba(255, 255, 255, 0.9);
border-radius: 15px;
margin-bottom: 2rem;
border: 1px solid rgba(52, 73, 94, 0.1);
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.05);
}
.api-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 1rem;
margin: 1rem 0;
}
.api-card {
background: rgba(255, 255, 255, 0.9);
border-radius: 10px;
padding: 1rem;
border: 1px solid rgba(52, 73, 94, 0.15);
text-align: center;
}
.main-button {
background: linear-gradient(135deg, #3498db, #2980b9);
color: white;
border: none;
border-radius: 12px;
padding: 1rem 2rem;
font-size: 1.2rem;
font-weight: bold;
cursor: pointer;
width: 100%;
margin: 1rem 0;
}
.main-button:hover {
background: linear-gradient(135deg, #2980b9, #1f4e79);
transform: translateY(-2px);
box-shadow: 0 8px 25px rgba(52, 152, 219, 0.3);
}
</style>
""", unsafe_allow_html=True)
# Database Configuration
DB_PATH = "simplified_harvester.db"
# API Configuration - Tested and verified working endpoints
# Structure of each entry:
#   name / description - display strings for the UI
#   endpoints          - list of request specs:
#       url, headers, method - HTTP request parameters
#       data_path  - path used by _extract_api_data to pull the payload out
#                    of the JSON response; supports dotted keys
#                    ("body._embedded.schoolUnits"), a list index ("[1]"),
#                    a single key, or "" for the whole response
#       key_field  - field that identifies a record; stored as metadata only
#       requires_auth / note - optional informational flags
SIMPLIFIED_API_CONFIG = {
    "Skolverket": {
        "name": "πΈπͺ Skolverket",
        "description": "Swedish National Agency for Education - School Units",
        "endpoints": [
            {
                "url": "https://api.skolverket.se/planned-educations/v3/compact-school-units?coordinateSystemType=WGS84&page=0&size=20",
                "headers": {"Accept": "application/vnd.skolverket.plannededucations.api.v3.hal+json"},
                "method": "GET",
                "data_path": "body._embedded.schoolUnits",
                "key_field": "schoolUnitCode"
            }
        ]
    },
    "SCB": {
        "name": "πΈπͺ Statistics Sweden",
        "description": "Swedish National Statistics Office - Population Metadata",
        "endpoints": [
            {
                "url": "https://api.scb.se/OV0104/v1/doris/sv/ssd/START/BE/BE0101/BE0101A/BefolkningNy",
                "headers": {"Accept": "application/json"},
                "method": "GET",
                "data_path": "variables",
                "key_field": "code"
            }
        ]
    },
    "Kolada": {
        "name": "πΈπͺ Kolada",
        "description": "Municipal Key Performance Indicators - Municipalities",
        "endpoints": [
            {
                "url": "https://api.kolada.se/v2/municipality",
                "headers": {"Accept": "application/json"},
                "method": "GET",
                "data_path": "values",
                "key_field": "id"
            }
        ]
    },
    "Eurostat": {
        "name": "πͺπΊ Eurostat",
        "description": "European Union Statistics - Population Density",
        "endpoints": [
            {
                "url": "https://ec.europa.eu/eurostat/api/dissemination/statistics/1.0/data/demo_r_d3dens?format=JSON&lang=EN",
                "headers": {"Accept": "application/json"},
                "method": "GET",
                "data_path": "value",
                "key_field": "index"
            }
        ]
    },
    "WHO": {
        "name": "π WHO",
        "description": "World Health Organization - Country Dimensions",
        "endpoints": [
            {
                "url": "https://ghoapi.azureedge.net/api/DIMENSION/COUNTRY/DimensionValues",
                "headers": {"Accept": "application/json"},
                "method": "GET",
                "data_path": "value",
                "key_field": "Code"
            }
        ]
    },
    "OECD": {
        "name": "π OECD",
        "description": "Organisation for Economic Co-operation and Development - Economic Data",
        "endpoints": [
            {
                "url": "https://sdmx.oecd.org/public/rest/data/OECD.SDD.NAD,DSD_NAMAIN1@DF_QNA_EXPENDITURE_GROWTH_OECD/?format=jsondata",
                "headers": {"Accept": "application/json"},
                "method": "GET",
                "data_path": "data.dataSets",
                "key_field": "series"
            }
        ]
    },
    "WorldBank": {
        "name": "π World Bank",
        "description": "International Financial Institution - Population Data",
        "endpoints": [
            {
                # World Bank responses are [metadata, data]; "[1]" selects the data list.
                "url": "https://api.worldbank.org/v2/country/all/indicator/SP.POP.TOTL?format=json&per_page=50&date=2023",
                "headers": {"Accept": "application/json"},
                "method": "GET",
                "data_path": "[1]",
                "key_field": "countryiso3code"
            }
        ]
    },
    "Riksbanken": {
        "name": "πΈπͺ Riksbanken",
        "description": "Swedish Central Bank - Exchange Rates",
        "endpoints": [
            {
                "url": "https://api.riksbank.se/swea/v1/Observations/Latest/ByGroup/130",
                "headers": {
                    "Accept": "application/json"
                },
                "method": "GET",
                "data_path": "",
                "key_field": "seriesId"
            }
        ]
    },
    "Swecris": {
        "name": "πΈπͺ Swecris",
        "description": "Swedish Research Council Database - Research Projects",
        "endpoints": [
            {
                "url": "https://swecris-api.vr.se/v1/projects?size=20",
                "headers": {
                    "Accept": "application/json",
                    # NOTE(review): hard-coded bearer token checked into source —
                    # presumably a public demo key, but confirm before shipping.
                    "Authorization": "Bearer VRSwecrisAPI2025-1"
                },
                "method": "GET",
                "data_path": "content",
                "key_field": "projectId",
                "requires_auth": True
            }
        ]
    },
    "CSN": {
        "name": "πΈπͺ CSN",
        "description": "Swedish Board of Student Finance - Statistics Portal",
        "endpoints": [
            {
                "url": "https://statistik.csn.se/PXWeb/api/v1/sv/CSNstat",
                "headers": {"Accept": "application/json"},
                "method": "GET",
                "data_path": "",
                "key_field": "id",
                "note": "PX-Web format requires specific POST queries for data"
            }
        ]
    }
}
def init_database(db_path=None):
    """Create the SQLite schema (tables and indexes) if it does not exist.

    Args:
        db_path: Optional path to the database file. Defaults to the
            module-level DB_PATH, so existing zero-argument callers are
            unaffected.

    The connection is now closed in a ``finally`` block — the original
    leaked it whenever any DDL statement raised.
    """
    conn = sqlite3.connect(db_path if db_path is not None else DB_PATH)
    try:
        cursor = conn.cursor()
        # Enable WAL mode for better concurrent access; relaxed sync and a
        # larger in-memory cache trade a little durability for speed.
        cursor.execute('PRAGMA journal_mode=WAL')
        cursor.execute('PRAGMA synchronous=NORMAL')
        cursor.execute('PRAGMA cache_size=10000')
        cursor.execute('PRAGMA temp_store=memory')
        # Main table: one row per fetched endpoint payload. data_hash gives
        # content-level deduplication; raw_data XOR compressed_data holds the
        # payload depending on whether gzip compression paid off.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS harvested_data (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                api_name TEXT NOT NULL,
                endpoint_url TEXT NOT NULL,
                data_hash TEXT UNIQUE NOT NULL,
                raw_data TEXT,
                compressed_data BLOB,
                processed_data TEXT,
                record_count INTEGER DEFAULT 0,
                data_size_bytes INTEGER DEFAULT 0,
                compression_ratio REAL,
                fetch_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                fetch_duration_ms INTEGER DEFAULT 0,
                status TEXT DEFAULT 'success',
                session_id TEXT,
                data_path TEXT,
                key_field TEXT,
                error_message TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        # Indexes cover the query patterns used by the stats/viewer code:
        # filters on api_name/status/session and ordering by timestamp.
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_api_name ON harvested_data(api_name)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_timestamp ON harvested_data(fetch_timestamp)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_status ON harvested_data(status)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_session ON harvested_data(session_id)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_hash ON harvested_data(data_hash)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_api_timestamp ON harvested_data(api_name, fetch_timestamp)')
        # Sessions table for tracking whole collection runs.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS fetch_sessions (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                session_id TEXT UNIQUE NOT NULL,
                session_name TEXT,
                started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                completed_at TIMESTAMP,
                total_apis INTEGER DEFAULT 0,
                successful_fetches INTEGER DEFAULT 0,
                failed_fetches INTEGER DEFAULT 0,
                total_records INTEGER DEFAULT 0,
                total_size_bytes INTEGER DEFAULT 0,
                status TEXT DEFAULT 'running'
            )
        ''')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_session_id ON fetch_sessions(session_id)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_session_started ON fetch_sessions(started_at)')
        conn.commit()
    finally:
        conn.close()
class SimplifiedDataHarvester:
    """Simplified data harvester - one function to fetch from all APIs.

    Holds a shared requests.Session (connection reuse across the ~10 calls)
    and accumulates per-API results/errors on the instance.
    """

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Simplified-Data-Harvester/1.0 (Research & Analysis)'
        })
        self.results = {}   # api_name -> summary dict from _fetch_api_data
        self.errors = {}    # api_name -> error message string

    def fetch_all_apis(self, progress_callback=None) -> Dict:
        """Fetch data from every configured API in one pass.

        Args:
            progress_callback: Optional callable taking a status string,
                used to surface progress in the UI.

        Returns:
            Dict with "results", "errors", "session_id" and a "summary"
            block (totals and success rate).
        """
        session_id = f"simplified_{int(time.time())}"
        total_apis = len(SIMPLIFIED_API_CONFIG)
        completed = 0
        if progress_callback:
            progress_callback(f"π Starting data collection from {total_apis} APIs...")
        for api_name, config in SIMPLIFIED_API_CONFIG.items():
            if progress_callback:
                progress_callback(f"π Fetching from {config['name']}...")
            try:
                api_results = self._fetch_api_data(api_name, config, session_id)
                self.results[api_name] = api_results
                completed += 1
                if progress_callback:
                    progress = (completed / total_apis) * 100
                    # FIX: this f-string was broken across two lines (syntax error).
                    progress_callback(f"✅ {config['name']} completed ({progress:.1f}%)")
                time.sleep(0.5)  # Respectful delay between providers
            except Exception as e:
                self.errors[api_name] = str(e)
                if progress_callback:
                    progress_callback(f"β {config['name']} failed: {str(e)[:50]}...")
                completed += 1
        if progress_callback:
            successful = len(self.results)
            failed = len(self.errors)
            # FIX: this f-string was broken across two lines (syntax error).
            progress_callback(f"π Collection complete! ✅ {successful} successful, ❌ {failed} failed")
        return {
            "results": self.results,
            "errors": self.errors,
            "session_id": session_id,
            "summary": {
                "total_apis": total_apis,
                "successful": len(self.results),
                "failed": len(self.errors),
                "success_rate": (len(self.results) / total_apis) * 100
            }
        }

    def _fetch_api_data(self, api_name: str, config: Dict, session_id: str) -> Dict:
        """Fetch every endpoint of one API; per-endpoint errors are recorded
        in the result rather than raised."""
        api_results = {
            "api_name": api_name,
            "endpoints": [],
            "total_records": 0,
            "total_size": 0
        }
        for endpoint in config['endpoints']:
            try:
                start_time = time.time()
                # Make request (GET unless the endpoint spec says POST).
                if endpoint.get('method', 'GET').upper() == 'POST':
                    response = self.session.post(
                        endpoint['url'],
                        headers=endpoint.get('headers', {}),
                        json=endpoint.get('data', {}),
                        timeout=30
                    )
                else:
                    response = self.session.get(
                        endpoint['url'],
                        headers=endpoint.get('headers', {}),
                        timeout=30
                    )
                response.raise_for_status()
                # Parse body, then extract the meaningful sub-structure.
                data = self._process_response(response)
                fetch_duration = int((time.time() - start_time) * 1000)
                processed_data = self._extract_api_data(data, api_name, endpoint)
                record_count = self._count_records(processed_data)
                data_size = len(response.content)
                # Persist both raw and processed forms with metadata.
                self._save_data_to_db(
                    api_name, endpoint['url'], data, processed_data, session_id,
                    fetch_duration, record_count, data_size, "success", endpoint
                )
                endpoint_result = {
                    "endpoint_url": endpoint['url'],
                    "status": "success",
                    "records": record_count,
                    "size_bytes": data_size,
                    "duration_ms": fetch_duration,
                    "data_preview": str(processed_data)[:200] + "..." if len(str(processed_data)) > 200 else str(processed_data)
                }
                api_results["endpoints"].append(endpoint_result)
                api_results["total_records"] += record_count
                api_results["total_size"] += data_size
            except Exception as e:
                api_results["endpoints"].append({
                    "endpoint_url": endpoint['url'],
                    "status": "error",
                    "error": str(e),
                    "records": 0,
                    "size_bytes": 0,
                    "duration_ms": 0
                })
        return api_results

    def _process_response(self, response):
        """Decode an HTTP response: JSON when possible, else wrap the text."""
        content_type = response.headers.get('content-type', '').lower()
        if 'json' in content_type:
            return response.json()
        # Some endpoints return JSON without declaring it; try anyway.
        try:
            return response.json()
        except ValueError:  # FIX: was a bare except:; .json() raises ValueError
            return {"raw_content": response.text}

    def _extract_api_data(self, data: Any, api_name: str, endpoint_config: Dict) -> Any:
        """Pull the payload out of *data* using the endpoint's data_path.

        Supported path forms: "" (whole response), "[i]" (list index),
        "a.b.c" (nested dict keys, best-effort), or a single dict key.
        On any failure the unmodified response is returned.
        """
        try:
            data_path = endpoint_config.get('data_path', '')
            if not data_path:
                return data
            current_data = data
            # Array index notation like "[1]" (e.g. World Bank [meta, data]).
            if data_path.startswith('[') and data_path.endswith(']'):
                index = int(data_path[1:-1])
                if isinstance(current_data, list) and len(current_data) > index:
                    return current_data[index]
                return current_data
            # Dot notation like "body._embedded.schoolUnits". Missing keys
            # intentionally fall back to the value seen so far (best-effort).
            if '.' in data_path:
                for part in data_path.split('.'):
                    if isinstance(current_data, dict):
                        current_data = current_data.get(part, current_data)
                    else:
                        break
                return current_data
            # Simple single-key extraction.
            if isinstance(current_data, dict):
                return current_data.get(data_path, current_data)
            return current_data
        except Exception as e:
            st.warning(f"Data extraction failed for {api_name}: {str(e)}")
            return data

    def _count_records(self, data: Any) -> int:
        """Best-effort record count: list length, first list value inside a
        dict, else 1 for any truthy scalar/mapping and 0 for falsy data."""
        if isinstance(data, list):
            return len(data)
        elif isinstance(data, dict):
            for key, value in data.items():
                if isinstance(value, list) and len(value) > 0:
                    return len(value)
            return 1
        else:
            return 1 if data else 0

    def _save_data_to_db(self, api_name: str, endpoint_url: str, raw_data: Any,
                         processed_data: Any, session_id: str, fetch_duration: int,
                         record_count: int, data_size: int, status: str, endpoint_config: Dict):
        """Persist one endpoint payload with dedup, optional gzip compression
        and endpoint metadata. Failures are logged as error rows; they never
        abort the overall harvest."""
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        try:
            # Canonical JSON so identical payloads always hash the same.
            raw_data_str = json.dumps(raw_data, sort_keys=True, default=str, separators=(',', ':'))
            processed_data_str = json.dumps(processed_data, sort_keys=True, default=str, separators=(',', ':'))
            data_hash = hashlib.sha256(raw_data_str.encode()).hexdigest()
            # Skip exact-duplicate payloads already stored.
            cursor.execute('SELECT id FROM harvested_data WHERE data_hash = ?', (data_hash,))
            if cursor.fetchone():
                return
            # Smart compression: store gzip only when it saves >20%.
            raw_data_final = None
            compressed_data = None
            compression_ratio = 1.0
            if data_size > 512:  # not worth compressing tiny payloads
                try:
                    compressed_data = gzip.compress(raw_data_str.encode('utf-8'))
                    compression_ratio = len(compressed_data) / len(raw_data_str.encode('utf-8'))
                    if compression_ratio < 0.8:
                        raw_data_final = None  # keep compressed form only
                    else:
                        raw_data_final = raw_data_str
                        compressed_data = None
                        compression_ratio = 1.0
                except Exception:
                    raw_data_final = raw_data_str
                    compressed_data = None
                    compression_ratio = 1.0
            else:
                raw_data_final = raw_data_str
            data_path = endpoint_config.get('data_path', '')
            key_field = endpoint_config.get('key_field', '')
            cursor.execute('''
                INSERT INTO harvested_data
                (api_name, endpoint_url, data_hash, raw_data, compressed_data, processed_data,
                 record_count, data_size_bytes, compression_ratio, fetch_duration_ms,
                 status, session_id, data_path, key_field)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                api_name, endpoint_url, data_hash, raw_data_final, compressed_data,
                processed_data_str, record_count, data_size, compression_ratio,
                fetch_duration, status, session_id, data_path, key_field
            ))
            conn.commit()
        except Exception as e:
            # Record the failure as an error row; don't fail the harvest.
            cursor.execute('''
                INSERT INTO harvested_data
                (api_name, endpoint_url, data_hash, status, error_message, session_id)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (api_name, endpoint_url, f"error_{int(time.time())}", "error", str(e), session_id))
            conn.commit()
        finally:
            conn.close()
def get_database_stats(db_path=None):
    """Aggregate statistics over the harvested_data table.

    Args:
        db_path: Optional database path; defaults to the module-level
            DB_PATH so existing zero-argument callers are unaffected.

    Returns:
        Dict with record/API counts, total stored size (MB), average
        compression ratio and fetch time, error count, and the latest
        session id with its fetch count.

    FIX: the original used SQL double-quoted literals (status = "success"),
    which SQLite only accepts via a deprecated identifier-fallback quirk;
    values are now bound as parameters.
    """
    conn = sqlite3.connect(db_path if db_path is not None else DB_PATH)
    cursor = conn.cursor()
    try:
        # Basic counts (successful fetches only).
        cursor.execute('SELECT COUNT(*) FROM harvested_data WHERE status = ?', ('success',))
        total_records = cursor.fetchone()[0]
        cursor.execute('SELECT COUNT(DISTINCT api_name) FROM harvested_data WHERE status = ?', ('success',))
        active_apis = cursor.fetchone()[0]
        cursor.execute('SELECT SUM(record_count) FROM harvested_data WHERE status = ?', ('success',))
        total_data_records = cursor.fetchone()[0] or 0
        # Enhanced metrics.
        cursor.execute('SELECT SUM(data_size_bytes) FROM harvested_data WHERE status = ?', ('success',))
        total_size_bytes = cursor.fetchone()[0] or 0
        cursor.execute('SELECT AVG(compression_ratio) FROM harvested_data WHERE compression_ratio IS NOT NULL')
        avg_compression = cursor.fetchone()[0] or 1.0
        cursor.execute('SELECT COUNT(*) FROM harvested_data WHERE status = ?', ('error',))
        error_count = cursor.fetchone()[0]
        cursor.execute('SELECT AVG(fetch_duration_ms) FROM harvested_data WHERE status = ?', ('success',))
        avg_fetch_time = cursor.fetchone()[0] or 0
        # Most recent session and how many fetches it made.
        cursor.execute('''
            SELECT session_id, COUNT(*) as fetches, MAX(fetch_timestamp) as latest
            FROM harvested_data
            WHERE session_id IS NOT NULL
            GROUP BY session_id
            ORDER BY latest DESC
            LIMIT 1
        ''')
        latest_session = cursor.fetchone()
        return {
            "total_records": total_records,
            "active_apis": active_apis,
            "total_data_records": total_data_records,
            "total_size_mb": round(total_size_bytes / (1024 * 1024), 2),
            "avg_compression": round(avg_compression, 3),
            "error_count": error_count,
            "avg_fetch_time_ms": round(avg_fetch_time, 1),
            "latest_session": latest_session[0] if latest_session else None,
            "latest_session_fetches": latest_session[1] if latest_session else 0
        }
    finally:
        conn.close()
# Initialize database
init_database()
# Initialize components
# st.session_state persists across Streamlit reruns, so the harvester and
# the last run's results survive widget interactions without re-fetching.
if 'harvester' not in st.session_state:
    st.session_state.harvester = SimplifiedDataHarvester()
if 'last_results' not in st.session_state:
    st.session_state.last_results = None
# Header
# Static banner; no user data is interpolated, so unsafe_allow_html is safe.
st.markdown("""
<div class="title-container">
<h1 style="font-size: 2.5rem; margin: 0; color: #2c3e50;">
π Simplified Data Harvester
</h1>
<p style="font-size: 1.1rem; margin: 0.5rem 0 0 0; color: #34495e;">
One-Click Data Collection from All APIs
</p>
<p style="font-size: 0.95rem; margin: 0.3rem 0 0 0; color: #7f8c8d;">
Automatic data fetching from 10 international sources with smart database storage
</p>
</div>
""", unsafe_allow_html=True)
# Display ML status
# ML_AVAILABLE was set at import time by the optional transformers /
# sentence-transformers imports near the top of the file.
if ML_AVAILABLE:
    st.success("π€ **AI Enhanced** - Quality assessment and analysis active")
else:
    st.info("π **Standard Mode** - Core functionality available")
# Main Action Section
st.markdown("### π Data Collection")
# Show API status: configured APIs on the left, live DB metrics on the right.
col1, col2 = st.columns([2, 1])
with col1:
    st.markdown("**Available APIs:**")
    # Display APIs in a compact format.
    for api_name, config in SIMPLIFIED_API_CONFIG.items():
        # FIX: this f-string was broken across two lines (syntax error).
        st.markdown(f"✅ **{config['name']}** - {config['description']}")
with col2:
    # Enhanced database stats; fall back to zeros if the query fails
    # (e.g. first run before any data exists).
    try:
        stats = get_database_stats()
        st.metric("π Fetch Records", f"{stats.get('total_records', 0):,}")
        st.metric("π Active APIs", stats.get('active_apis', 0))
        st.metric("π Data Records", f"{stats.get('total_data_records', 0):,}")
        st.metric("πΎ Storage (MB)", f"{stats.get('total_size_mb', 0)}")
        if stats.get('avg_compression', 1.0) < 1.0:
            compression_pct = (1 - stats.get('avg_compression', 1.0)) * 100
            st.metric("ποΈ Compression", f"{compression_pct:.1f}% saved")
        if stats.get('error_count', 0) > 0:
            st.metric("β οΈ Errors", stats.get('error_count', 0))
    except Exception as e:
        st.metric("π Total Records", "0")
        st.metric("π Active APIs", "0")
        st.error(f"Stats error: {str(e)}")
st.markdown("---")
# Single button to fetch all data; everything below runs only on click.
col1, col2, col3 = st.columns([1, 2, 1])
with col2:
    if st.button("π FETCH ALL DATA FROM ALL APIS",
                 type="primary",
                 use_container_width=True,
                 help="Automatically collect data from all 10 APIs and save to database"):
        # Progress widgets updated by the callback passed to the harvester.
        status_container = st.empty()
        progress_bar = st.progress(0)

        def update_progress(message):
            status_container.text(message)

        # Execute the one-click data collection.
        with st.spinner("π Collecting data from all APIs..."):
            try:
                results = st.session_state.harvester.fetch_all_apis(update_progress)
                st.session_state.last_results = results
                progress_bar.progress(1.0)
                # FIX: this string literal was broken across two lines (syntax error).
                status_container.success("✅ Collection completed!")
            except Exception as e:
                progress_bar.progress(0.0)
                status_container.error(f"β Collection failed: {str(e)}")
                st.error("Data collection encountered an error. Please try again.")
                st.stop()  # Abort this rerun if collection failed

        # Show results only if collection was successful (defensive check:
        # st.stop() above should already have ended the failed path).
        if 'results' not in locals() or not results:
            st.error("No results to display")
            st.stop()

        summary = results.get('summary', {})
        # Success metrics.
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            # FIX: metric label string was broken across two lines (syntax error).
            st.metric("✅ Successful APIs", summary.get('successful', 0))
        with col2:
            st.metric("β Failed APIs", summary.get('failed', 0))
        with col3:
            st.metric("π Success Rate", f"{summary.get('success_rate', 0):.1f}%")
        with col4:
            try:
                total_records = sum(api_data.get('total_records', 0) for api_data in results.get('results', {}).values())
                st.metric("π Total Records", f"{total_records:,}")
            except Exception:
                st.metric("π Total Records", "Error")

        # Simplified results summary.
        st.markdown("### π Results Summary")
        try:
            if len(results.get('results', {})) > 0:
                # FIX: this f-string was broken across two lines (syntax error).
                st.success(f"✅ Successfully processed {len(results['results'])} APIs")
                # Show just API names and record counts in a simple format.
                for api_name, api_data in list(results['results'].items())[:5]:
                    records = api_data.get('total_records', 0)
                    size_mb = api_data.get('total_size', 0) / (1024 * 1024)
                    st.write(f"β’ **{SIMPLIFIED_API_CONFIG[api_name]['name']}**: {records:,} records ({size_mb:.2f} MB)")
                if len(results['results']) > 5:
                    st.info(f"... and {len(results['results']) - 5} more APIs processed successfully")
            else:
                st.warning("No results to display")
        except Exception:
            # Unused exception binding removed; message is generic on purpose.
            st.warning("Results summary unavailable - check database viewer below for stored data")

        # Error details per failed API.
        if results['errors']:
            st.markdown("### β Error Details")
            for api_name, error in results['errors'].items():
                st.error(f"**{SIMPLIFIED_API_CONFIG[api_name]['name']}:** {error}")
# Show last results if available (persisted in session_state across reruns).
if st.session_state.last_results:
    st.markdown("---")
    st.markdown("### π Quick Analytics")
    results = st.session_state.last_results
    # Simple analytics with minimal processing — deliberately avoids heavy
    # chart/dataframe work that previously caused the UI to hang.
    try:
        if results.get('results') and len(results['results']) > 0:
            st.success("π Analytics: Data collection completed successfully!")
            total_apis = len(results['results'])
            st.info(f"π Processed {total_apis} APIs with data storage in database")
            if PLOTLY_AVAILABLE:
                st.info("π Charts available - check database viewer for detailed analysis")
            else:
                st.info("π Database contains all collected data for analysis")
    except Exception:
        # FIX: this string literal was broken across two lines (syntax error);
        # unused exception binding removed.
        st.info("✅ Data collection was successful - check database viewer below")
# Simplified Database viewer
with st.expander("ποΈ Database Viewer"):
    conn = None
    try:
        conn = sqlite3.connect(DB_PATH)
        # Cheap count query first so we can skip the dataframe when empty.
        cursor = conn.cursor()
        cursor.execute('SELECT COUNT(*) FROM harvested_data')
        total_count = cursor.fetchone()[0]
        if total_count > 0:
            st.success(f"π Database contains {total_count} records")
            # Show only the 10 most recent records to prevent UI overload.
            df = pd.read_sql_query('''
                SELECT
                    api_name as "API",
                    record_count as "Records",
                    status as "Status",
                    SUBSTR(fetch_timestamp, 1, 16) as "Time"
                FROM harvested_data
                ORDER BY fetch_timestamp DESC
                LIMIT 10
            ''', conn)
            st.dataframe(df, use_container_width=True)
            if total_count > 10:
                st.info(f"Showing latest 10 records. Total: {total_count} records in database.")
        else:
            st.info("No data in database yet. Run the data collection first!")
    except Exception:
        st.warning("Database viewer temporarily unavailable")
    finally:
        # FIX: the original closed the connection only on the success path,
        # leaking it whenever any query above raised.
        if conn is not None:
            conn.close()
# Footer
# Static HTML banner; no user data interpolated.
st.markdown("---")
st.markdown("""
<div style="text-align: center; padding: 1rem; background: rgba(255,255,255,0.8); border-radius: 10px; color: #2c3e50;">
<p><strong>π Optimized Data Harvester</strong> - Research-verified APIs with smart storage</p>
<p style="font-size: 0.9rem; color: #7f8c8d;">
β
10 Tested APIs β’ ποΈ Data compression β’ π Performance metrics β’ π Enhanced analytics
</p>
<p style="font-size: 0.8rem; color: #95a5a6;">
Featuring proper data extraction, optimized database with indexing, and compression for efficient storage
</p>
</div>
""", unsafe_allow_html=True)