Spaces:

isakskogstad
/

api-data-fetcher

Sleeping

App Files Files Community

isakskogstad committed on Jul 12, 2025

Commit

788251f

verified ·

1 Parent(s): 7186a85

Upload app_ultimate.py with huggingface_hub

Browse files

Files changed (1) hide show

app_ultimate.py +457 -120

app_ultimate.py CHANGED Viewed

@@ -616,155 +616,169 @@ else:
     semantic_analyzer = None
     health_monitor = None
-# Comprehensive API Discovery Configuration
-DEEP_API_CONFIG = {
     "Skolverket": {
         "name": "🇸🇪 Skolverket",
         "description": "Swedish National Agency for Education",
-        "base_url": "https://api.skolverket.se",
-        "discovery_patterns": [
-            r'/planned-educations/v\d+/.*',
-            r'/skolenhetsregister/v\d+/.*',
-            r'/syllabus/v\d+/.*',
-            r'/susa-navet/v\d+/.*'
-        ],
-        "known_roots": [
-            "/planned-educations/v3/",
-            "/skolenhetsregister/v2/",
-            "/syllabus/v1/",
-            "/susa-navet/v1/"
         ],
-        "explore_depth": 3,
-        "rate_limit": None,
-        "auth": None
     },
     "SCB": {
-        "name": "🇸🇪 Statistics Sweden",
         "description": "Swedish National Statistics Office",
-        "base_url": "https://api.scb.se",
-        "discovery_patterns": [
-            r'/OV0104/v\d+/doris/.*',
-            r'/\w+/v\d+/.*'
-        ],
-        "known_roots": [
-            "/OV0104/v1/doris/",
-            "/OV0104/v1/doris/sv/ssd/"
         ],
-        "explore_depth": 5,
-        "rate_limit": {"requests": 10, "per_seconds": 10},
-        "auth": None
     },
     "Kolada": {
         "name": "🇸🇪 Kolada",
         "description": "Municipal Key Performance Indicators",
-        "base_url": "https://api.kolada.se",
-        "discovery_patterns": [
-            r'/v\d+/.*'
-        ],
-        "known_roots": [
-            "/v2/"
         ],
-        "explore_depth": 3,
-        "rate_limit": None,
-        "auth": None
     },
     "Eurostat": {
         "name": "🇪🇺 Eurostat",
         "description": "European Union Statistics",
-        "base_url": "https://ec.europa.eu/eurostat",
-        "discovery_patterns": [
-            r'/api/dissemination/statistics/\d+\.\d+/data/.*'
-        ],
-        "known_roots": [
-            "/api/dissemination/statistics/1.0/data/"
         ],
-        "explore_depth": 2,
-        "rate_limit": None,
-        "auth": None
     },
     "WHO": {
         "name": "🌍 WHO",
         "description": "World Health Organization",
-        "base_url": "https://ghoapi.azureedge.net",
-        "discovery_patterns": [
-            r'/api/.*'
-        ],
-        "known_roots": [
-            "/api/"
         ],
-        "explore_depth": 3,
-        "rate_limit": None,
-        "auth": None
     },
     "OECD": {
         "name": "🌍 OECD",
         "description": "Organisation for Economic Co-operation and Development",
-        "base_url": "https://sdmx.oecd.org",
-        "discovery_patterns": [
-            r'/public/rest/data/.*'
-        ],
-        "known_roots": [
-            "/public/rest/data/"
         ],
-        "explore_depth": 3,
-        "rate_limit": None,
-        "auth": None
     },
     "WorldBank": {
         "name": "🌍 World Bank",
         "description": "International Financial Institution",
-        "base_url": "https://api.worldbank.org",
-        "discovery_patterns": [
-            r'/v\d+/.*'
-        ],
-        "known_roots": [
-            "/v2/"
         ],
-        "explore_depth": 4,
-        "rate_limit": None,
-        "auth": None
     },
     "Riksbanken": {
         "name": "🇸🇪 Riksbanken",
         "description": "Swedish Central Bank",
-        "base_url": "https://api.riksbank.se",
-        "discovery_patterns": [
-            r'/swea/v\d+/.*'
-        ],
-        "known_roots": [
-            "/swea/v1/"
         ],
-        "explore_depth": 3,
-        "rate_limit": {"requests": 5, "per_seconds": 60},
-        "auth": None
     },
     "Swecris": {
         "name": "🇸🇪 Swecris",
         "description": "Swedish Research Council Database",
-        "base_url": "https://swecris-api.vr.se",
-        "discovery_patterns": [
-            r'/v\d+/.*'
-        ],
-        "known_roots": [
-            "/v1/"
         ],
-        "explore_depth": 2,
-        "rate_limit": None,
-        "auth": {"type": "Bearer", "token": "VRSwecrisAPI2025-1"}
     },
     "CSN": {
         "name": "🇸🇪 CSN",
         "description": "Swedish Board of Student Finance",
-        "base_url": "https://statistik.csn.se",
-        "discovery_patterns": [
-            r'/PXWeb/api/v\d+/.*'
-        ],
-        "known_roots": [
-            "/PXWeb/api/v1/sv/CSNstat/"
         ],
-        "explore_depth": 4,
-        "rate_limit": None,
-        "auth": None
     }
 }
@@ -1155,20 +1169,350 @@ def backup_database(backup_path=None):
     except Exception as e:
         return None
-class DeepEndpointDiscoverer:
-    """Advanced endpoint discovery with recursive exploration"""
     def __init__(self):
         self.session = requests.Session()
         self.session.headers.update({
-            'User-Agent': 'Ultimate-Data-Harvester/2.0 (Deep Discovery & Research Purpose)'
         })
-        self.discovered_endpoints = set()
-        self.discovery_cache = {}
-    def discover_all_endpoints(self, api_name: str, progress_callback=None) -> List[Dict]:
-        """Discover all possible endpoints for an API with deep exploration"""
-        config = DEEP_API_CONFIG.get(api_name)
         if not config:
             return []
@@ -2098,19 +2442,12 @@ class UltimateDataHarvester:
             "total_sessions": total_sessions
         }
-# Initialize components
 if 'harvester' not in st.session_state:
-    st.session_state.harvester = UltimateDataHarvester()
-if 'discovery_status' not in st.session_state:
-    st.session_state.discovery_status = {}
-if 'current_session' not in st.session_state:
-    st.session_state.current_session = None
-if 'last_session_info' not in st.session_state:
-    last_session = st.session_state.harvester.session_manager.get_last_session()
-    st.session_state.last_session_info = last_session
 # Enhanced Header
 st.markdown("""

     semantic_analyzer = None
     health_monitor = None
+# Simplified API configuration: one entry per data provider, keyed by a short
+# provider id. Each entry carries:
+#   "name"        - display label (used in the progress messages)
+#   "description" - human-readable description of the provider
+#   "endpoints"   - list of request specs with "url", "headers", "method"
+#                   ("GET" or "POST") and, for POST specs, a JSON "data" body
+#   "rate_limit"  - None, or {"requests": N, "per_seconds": S}
+SIMPLIFIED_API_CONFIG = {
     "Skolverket": {
         "name": "🇸🇪 Skolverket",
         "description": "Swedish National Agency for Education",
+        "endpoints": [
+            {
+                "url": "https://api.skolverket.se/planned-educations/v3",
+                "headers": {"Accept": "application/vnd.skolverket.plannededucations.api.v3.hal+json"},
+                "method": "GET"
+            },
+            {
+                "url": "https://api.skolverket.se/skolenhetsregister/v2/skolenhet",
+                "headers": {"Accept": "application/json"},
+                "method": "GET"
+            }
         ],
+        "rate_limit": None
     },
     "SCB": {
+        "name": "🇸🇪 Statistics Sweden",
         "description": "Swedish National Statistics Office",
+        "endpoints": [
+            {
+                "url": "https://api.scb.se/OV0104/v1/doris/sv/ssd/START/BE/BE0101/BE0101A/BefolkningNy",
+                "headers": {"Content-Type": "application/json"},
+                "method": "POST",
+                "data": {
+                    "query": [
+                        {"code": "Region", "selection": {"filter": "item", "values": ["00"]}},
+                        {"code": "Civilstand", "selection": {"filter": "item", "values": ["TOT"]}},
+                        {"code": "Alder", "selection": {"filter": "item", "values": ["tot"]}},
+                        {"code": "Kon", "selection": {"filter": "item", "values": ["1", "2"]}},
+                        {"code": "ContentsCode", "selection": {"filter": "item", "values": ["BE0101N1"]}},
+                        {"code": "Tid", "selection": {"filter": "item", "values": ["2023"]}}
+                    ],
+                    "response": {"format": "json"}
+                }
+            }
         ],
+        "rate_limit": {"requests": 10, "per_seconds": 10}
     },
     "Kolada": {
         "name": "🇸🇪 Kolada",
         "description": "Municipal Key Performance Indicators",
+        "endpoints": [
+            {
+                "url": "https://api.kolada.se/v2/municipality",
+                "headers": {"Accept": "application/json"},
+                "method": "GET"
+            },
+            {
+                "url": "https://api.kolada.se/v2/kpi",
+                "headers": {"Accept": "application/json"},
+                "method": "GET"
+            }
         ],
+        "rate_limit": None
     },
     "Eurostat": {
         "name": "🇪🇺 Eurostat",
         "description": "European Union Statistics",
+        "endpoints": [
+            {
+                "url": "https://ec.europa.eu/eurostat/api/dissemination/statistics/1.0/data/demo_pjan?format=JSON&lang=en&geo=EU27_2020&age=TOTAL&sex=T&time=2023",
+                "headers": {"Accept": "application/json"},
+                "method": "GET"
+            }
         ],
+        "rate_limit": None
     },
     "WHO": {
         "name": "🌍 WHO",
         "description": "World Health Organization",
+        "endpoints": [
+            {
+                "url": "https://ghoapi.azureedge.net/api/WHOSIS_000001",
+                "headers": {"Accept": "application/json"},
+                "method": "GET"
+            },
+            {
+                "url": "https://ghoapi.azureedge.net/api/Dimension",
+                "headers": {"Accept": "application/json"},
+                "method": "GET"
+            }
         ],
+        "rate_limit": None
     },
     "OECD": {
         "name": "🌍 OECD",
         "description": "Organisation for Economic Co-operation and Development",
+        "endpoints": [
+            {
+                "url": "https://sdmx.oecd.org/public/rest/data/OECD.SDD.NAD,DSD_NAMAIN1@DF_QNA,1.0/AUS.B1GQ.C.Q?format=jsondata",
+                "headers": {"Accept": "application/vnd.sdmx.data+json;version=1.0.0"},
+                "method": "GET"
+            }
         ],
+        "rate_limit": None
     },
     "WorldBank": {
         "name": "🌍 World Bank",
         "description": "International Financial Institution",
+        "endpoints": [
+            {
+                "url": "https://api.worldbank.org/v2/country?format=json&per_page=50",
+                "headers": {"Accept": "application/json"},
+                "method": "GET"
+            },
+            {
+                "url": "https://api.worldbank.org/v2/indicator/SP.POP.TOTL?format=json&date=2023&per_page=50",
+                "headers": {"Accept": "application/json"},
+                "method": "GET"
+            }
         ],
+        "rate_limit": None
     },
     "Riksbanken": {
         "name": "🇸🇪 Riksbanken",
         "description": "Swedish Central Bank",
+        "endpoints": [
+            {
+                "url": "https://api.riksbank.se/swea/v1/Observations/SEKEURPMI/2023-01-01/2023-12-31",
+                "headers": {"Accept": "application/json"},
+                "method": "GET"
+            }
         ],
+        "rate_limit": {"requests": 5, "per_seconds": 60}
     },
     "Swecris": {
         "name": "🇸🇪 Swecris",
         "description": "Swedish Research Council Database",
+        "endpoints": [
+            {
+                "url": "https://swecris-api.vr.se/v1/projects?size=50",
+                "headers": {
+                    "Accept": "application/json",
+                    # NOTE(review): bearer token is hard-coded in source; if it is
+                    # not meant to be public, load it from configuration instead.
+                    "Authorization": "Bearer VRSwecrisAPI2025-1"
+                },
+                "method": "GET"
+            }
         ],
+        "rate_limit": None
     },
     "CSN": {
         "name": "🇸🇪 CSN",
         "description": "Swedish Board of Student Finance",
+        "endpoints": [
+            {
+                "url": "https://statistik.csn.se/PXWeb/api/v1/sv/CSNstat/StudiebidragGymnasieskola/SS0101B1.px",
+                "headers": {"Content-Type": "application/json"},
+                "method": "POST",
+                "data": {
+                    "query": [
+                        {"code": "Region", "selection": {"filter": "item", "values": ["00"]}},
+                        {"code": "ContentsCode", "selection": {"filter": "item", "values": ["SS0101B1"]}},
+                        {"code": "Tid", "selection": {"filter": "item", "values": ["2023"]}}
+                    ],
+                    "response": {"format": "json"}
+                }
+            }
         ],
+        "rate_limit": None
     }
 }
     except Exception as e:
         return None
+class SimplifiedDataHarvester:
+    """Simplified data harvester - one function to fetch from all APIs"""
     def __init__(self):
+        """Create the shared HTTP session and the empty result/error stores."""
         self.session = requests.Session()
         self.session.headers.update({
+            'User-Agent': 'Simplified-Data-Harvester/1.0 (Research & Analysis)'
         })
+        # Both dicts are keyed by API name and filled in by fetch_all_apis().
+        self.results = {}
+        self.errors = {}
+    def fetch_all_apis(self, progress_callback=None) -> Dict:
+        """Fetch data from every API in SIMPLIFIED_API_CONFIG in a single pass.
+
+        Args:
+            progress_callback: Optional callable invoked with a status string
+                at the start, around each API, and once at the end.
+
+        Returns:
+            Dict with "results" (API name -> output of _fetch_api_data),
+            "errors" (API name -> error message), "session_id" and a
+            "summary" holding totals and the success rate in percent.
+        """
+        def notify(message):
+            if progress_callback:
+                progress_callback(message)
+        # Start every run from a clean slate: the same instance can be used for
+        # several runs, and stale entries would otherwise skew the summary.
+        self.results = {}
+        self.errors = {}
+        session_id = f"simplified_{int(time.time())}"
+        total_apis = len(SIMPLIFIED_API_CONFIG)
+        completed = 0
+        notify(f"🚀 Starting comprehensive data collection from {total_apis} APIs...")
+        for api_name, config in SIMPLIFIED_API_CONFIG.items():
+            notify(f"🔄 Fetching from {config['name']}...")
+            try:
+                api_results = self._fetch_api_data(api_name, config, session_id)
+                # _fetch_api_data records per-endpoint failures instead of
+                # raising, so an API whose every request failed has to be
+                # detected here or it would be reported as a success.
+                endpoints = api_results.get("endpoints", [])
+                if endpoints and all(ep.get("status") == "error" for ep in endpoints):
+                    raise RuntimeError(endpoints[0].get("error") or "all endpoints failed")
+                self.results[api_name] = api_results
+                progress = ((completed + 1) / total_apis) * 100
+                notify(f"✅ {config['name']} completed ({progress:.1f}%)")
+            except Exception as e:
+                # Best-effort boundary: one failing API must not stop the run.
+                self.errors[api_name] = str(e)
+                notify(f"❌ {config['name']} failed: {str(e)[:50]}...")
+            completed += 1
+            # Pause between APIs on both the success and the failure path.
+            rate_limit = config.get('rate_limit')
+            if rate_limit:
+                time.sleep(rate_limit['per_seconds'] / rate_limit['requests'])
+            else:
+                time.sleep(0.5)  # Default delay between APIs
+        successful = len(self.results)
+        failed = len(self.errors)
+        notify(f"🎉 Collection complete! ✅ {successful} successful, ❌ {failed} failed")
+        return {
+            "results": self.results,
+            "errors": self.errors,
+            "session_id": session_id,
+            "summary": {
+                "total_apis": total_apis,
+                "successful": successful,
+                "failed": failed,
+                # Guard against an empty configuration.
+                "success_rate": (successful / total_apis) * 100 if total_apis else 0.0
+            }
+        }
+    def _fetch_api_data(self, api_name: str, config: Dict, session_id: str) -> Dict:
+        """Request every endpoint configured for one API and summarise the outcome.
+
+        Each endpoint is fetched, decoded, reduced to its payload, stored via
+        _save_data_to_db and described by one entry in the returned
+        "endpoints" list. A failing endpoint yields an entry with status
+        "error" instead of aborting the remaining endpoints.
+        """
+        summary = {
+            "api_name": api_name,
+            "endpoints": [],
+            "total_records": 0,
+            "total_size": 0
+        }
+        for position, spec in enumerate(config['endpoints'], start=1):
+            try:
+                started_at = time.time()
+                wants_post = spec.get('method', 'GET').upper() == 'POST'
+                if wants_post:
+                    response = self.session.post(
+                        spec['url'],
+                        headers=spec.get('headers', {}),
+                        json=spec.get('data', {}),
+                        timeout=30
+                    )
+                else:
+                    # Anything that is not POST is sent as a GET request.
+                    response = self.session.get(
+                        spec['url'],
+                        headers=spec.get('headers', {}),
+                        timeout=30
+                    )
+                response.raise_for_status()
+                decoded = self._process_response(response, api_name)
+                elapsed_ms = int((time.time() - started_at) * 1000)
+                payload = self._extract_api_data(decoded, api_name)
+                n_records = self._count_records(payload)
+                n_bytes = len(response.content)
+                # Endpoints are stored under a positional label, not their URL.
+                self._save_data_to_db(
+                    api_name, f"endpoint_{position}", payload, session_id,
+                    elapsed_ms, n_records, n_bytes, "success"
+                )
+                summary["endpoints"].append({
+                    "endpoint_url": spec['url'],
+                    "status": "success",
+                    "records": n_records,
+                    "size_bytes": n_bytes,
+                    "duration_ms": elapsed_ms,
+                    "data_preview": self._create_data_preview(payload)
+                })
+                summary["total_records"] += n_records
+                summary["total_size"] += n_bytes
+            except Exception as e:
+                summary["endpoints"].append({
+                    "endpoint_url": spec['url'],
+                    "status": "error",
+                    "error": str(e),
+                    "records": 0,
+                    "size_bytes": 0,
+                    "duration_ms": 0
+                })
+        return summary
+    def _process_response(self, response, api_name: str):
+        """Decode a response body according to its Content-Type header.
+
+        JSON content types are parsed as JSON and XML content types are
+        converted with _xml_to_dict. For any other (or missing) content type
+        JSON is attempted first; if the body is not valid JSON the raw text is
+        returned as {"raw_content": <text>}.
+        """
+        content_type = response.headers.get('content-type', '').lower()
+        if 'json' in content_type:
+            return response.json()
+        if 'xml' in content_type:
+            return self._xml_to_dict(response.text)
+        try:
+            return response.json()  # Try JSON first
+        except ValueError:
+            # JSON decode errors derive from ValueError; catching only that
+            # keeps KeyboardInterrupt/SystemExit from being swallowed.
+            return {"raw_content": response.text}
+    def _xml_to_dict(self, xml_text: str) -> Dict:
+        """Parse an XML document into nested Python data.
+
+        Returns the result of _element_to_dict for the root element, or
+        {"raw_xml": xml_text} when the text cannot be parsed as XML.
+        """
+        import xml.etree.ElementTree as ET
+        try:
+            root = ET.fromstring(xml_text)
+        except (ET.ParseError, TypeError, ValueError):
+            # Malformed or unsupported input: hand back the raw text rather
+            # than failing the whole fetch. Interrupts are not swallowed.
+            return {"raw_xml": xml_text}
+        return self._element_to_dict(root)
+    def _element_to_dict(self, element) -> Any:
+        """Convert an ElementTree element into plain Python data.
+
+        A leaf element with text and no attributes collapses to its stripped
+        text. Every other element becomes a dict holding its attributes, its
+        stripped text under 'text' (when present) and one entry per child
+        tag; repeated child tags are gathered into a list.
+        """
+        text = (element.text or '').strip()
+        # Only attribute-free leaves may collapse to a bare string; collapsing
+        # a leaf that also has attributes would silently discard them.
+        if text and len(element) == 0 and not element.attrib:
+            return text
+        result = dict(element.attrib)
+        if text:
+            result['text'] = text
+        for child in element:
+            child_data = self._element_to_dict(child)
+            if child.tag in result:
+                # Second occurrence of a tag: promote the entry to a list.
+                if not isinstance(result[child.tag], list):
+                    result[child.tag] = [result[child.tag]]
+                result[child.tag].append(child_data)
+            else:
+                result[child.tag] = child_data
+        return result
+    def _extract_api_data(self, data: Any, api_name: str) -> Any:
+        """Return the payload portion of a decoded response for the given API.
+
+        For dict responses the first key present from the API's preference
+        list is returned. World Bank list responses yield their second
+        element, or the first when the second is empty. Anything else is
+        returned unchanged.
+        """
+        if api_name == "WorldBank":
+            if isinstance(data, list) and len(data) > 1:
+                return data[1] or data[0]
+            return data
+        # Keys to look for, in priority order, per API.
+        preferred_keys = {
+            "Skolverket": ("_embedded", "skolenheter"),
+            "SCB": ("data", "variables"),
+            "Kolada": ("values",),
+            "Eurostat": ("value", "data"),
+            "WHO": ("value", "fact"),
+            "OECD": ("data",),
+            "Riksbanken": ("observations", "data"),
+            "Swecris": ("items", "projects"),
+            "CSN": ("data", "variables"),
+        }.get(api_name, ())
+        if isinstance(data, dict):
+            for key in preferred_keys:
+                if key in data:
+                    return data[key]
+        return data
+    def _count_records(self, data: Any) -> int:
+        """Estimate how many records a payload holds.
+
+        Lists count their items. Dicts report the length of their first
+        non-empty list value, or 1 when they have none. Any other value
+        counts as 1 when truthy and 0 otherwise.
+        """
+        if isinstance(data, list):
+            return len(data)
+        if not isinstance(data, dict):
+            return int(bool(data))
+        # First non-empty list value is taken to be the record array.
+        list_sizes = (len(value) for value in data.values()
+                      if isinstance(value, list) and len(value) > 0)
+        return next(list_sizes, 1)
+    def _create_data_preview(self, data: Any) -> Dict:
+        """Build a small summary of a payload for display.
+
+        The summary always has "type" and "sample". Lists add "length" and
+        sample at most three items; dicts add up to ten "keys" and sample
+        their first entry; other values sample the first 200 characters of
+        their string form.
+        """
+        summary = {
+            "type": type(data).__name__,
+            "sample": None
+        }
+        if isinstance(data, list):
+            summary["length"] = len(data)
+            summary["sample"] = data if len(data) <= 3 else data[:3]
+            return summary
+        if isinstance(data, dict):
+            summary["keys"] = list(data)[:10]
+            if data:
+                leading_key = next(iter(data))
+                summary["sample"] = {leading_key: data[leading_key]}
+            return summary
+        summary["sample"] = str(data)[:200]
+        return summary
+    def _save_data_to_db(self, api_name: str, endpoint_path: str, data: Any,
+                        session_id: str, fetch_duration: int, record_count: int,
+                        data_size: int, status: str, error_message: str = None):
+        """Persist one fetched payload in the harvested_data table.
+
+        The payload is serialised to JSON and hashed; a row with the same
+        hash is treated as a duplicate and skipped. When data_size exceeds
+        1024 the serialised text is stored gzip-compressed instead of as
+        plain text. If the full insert fails, a reduced insert (fewer
+        columns, text capped at 10000 characters) is attempted. Storage is
+        best-effort: database failures are logged, not raised.
+        """
+        import gzip
+        import logging
+        log = logging.getLogger(__name__)
+        # Serialise before opening the database so that the fallback insert
+        # below can never reference an unbound data_str/data_hash.
+        try:
+            data_str = json.dumps(data, sort_keys=True, default=str)
+        except (TypeError, ValueError) as exc:
+            log.warning("Not storing %s/%s: payload is not JSON-serialisable (%s)",
+                        api_name, endpoint_path, exc)
+            return
+        data_hash = hashlib.sha256(data_str.encode()).hexdigest()
+        # Compress if large
+        raw_data = data_str
+        raw_data_compressed = None
+        if data_size > 1024:
+            try:
+                raw_data_compressed = gzip.compress(data_str.encode('utf-8'))
+                raw_data = None
+            except (OSError, ValueError):
+                raw_data_compressed = None  # keep the uncompressed text
+        conn = sqlite3.connect(DB_PATH)
+        try:
+            cursor = conn.cursor()
+            # Check if data exists
+            cursor.execute('SELECT id FROM harvested_data WHERE data_hash = ?', (data_hash,))
+            if cursor.fetchone():
+                return  # Skip duplicate
+            cursor.execute('''
+                INSERT INTO harvested_data
+                (api_name, endpoint_path, data_hash, raw_data, raw_data_compressed,
+                 record_count, data_size_bytes, fetch_duration_ms, status,
+                 error_message, session_id, data_format)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+            ''', (
+                api_name, endpoint_path, data_hash, raw_data, raw_data_compressed,
+                record_count, data_size, fetch_duration, status, error_message,
+                session_id, self._detect_data_format(data)
+            ))
+            conn.commit()
+        except sqlite3.Error as primary_error:
+            # Fallback to basic schema
+            try:
+                conn.rollback()  # discard the failed statement's transaction
+                conn.execute('''
+                    INSERT OR REPLACE INTO harvested_data
+                    (api_name, endpoint_path, data_hash, raw_data, record_count,
+                     data_size_bytes, fetch_duration_ms, status, session_id)
+                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+                ''', (
+                    api_name, endpoint_path, data_hash, data_str[:10000],  # Limit size
+                    record_count, data_size, fetch_duration, status, session_id
+                ))
+                conn.commit()
+            except sqlite3.Error as fallback_error:
+                log.warning("Could not store %s/%s: %s (fallback: %s)",
+                            api_name, endpoint_path, primary_error, fallback_error)
+        finally:
+            conn.close()
+    def _detect_data_format(self, data: Any) -> str:
+        """Detect data format"""
+        if isinstance(data, dict):
+            if "_embedded" in data or "_links" in data:
+                return "HAL+JSON"
+            elif "query" in data or "variables" in data:
+                return "PX-Web"
+            else:
+                return "JSON"
+        elif isinstance(data, list):
+            return "JSON-Array"
+        else:
+            return "Unknown"
         if not config:
             return []
             "total_sessions": total_sessions
         }
+# Initialize simplified components once per Streamlit session: each key is
+# only set when it is not already present in st.session_state.
 if 'harvester' not in st.session_state:
+    st.session_state.harvester = SimplifiedDataHarvester()
+# presumably holds the output of the most recent fetch run - TODO confirm
+if 'last_results' not in st.session_state:
+    st.session_state.last_results = None
 # Enhanced Header
 st.markdown("""