isakskogstad committed on
Commit
788251f
·
verified ·
1 Parent(s): 7186a85

Upload app_ultimate.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app_ultimate.py +457 -120
app_ultimate.py CHANGED
@@ -616,155 +616,169 @@ else:
616
  semantic_analyzer = None
617
  health_monitor = None
618
 
619
- # Comprehensive API Discovery Configuration
620
- DEEP_API_CONFIG = {
621
  "Skolverket": {
622
  "name": "🇸🇪 Skolverket",
623
  "description": "Swedish National Agency for Education",
624
- "base_url": "https://api.skolverket.se",
625
- "discovery_patterns": [
626
- r'/planned-educations/v\d+/.*',
627
- r'/skolenhetsregister/v\d+/.*',
628
- r'/syllabus/v\d+/.*',
629
- r'/susa-navet/v\d+/.*'
630
- ],
631
- "known_roots": [
632
- "/planned-educations/v3/",
633
- "/skolenhetsregister/v2/",
634
- "/syllabus/v1/",
635
- "/susa-navet/v1/"
636
  ],
637
- "explore_depth": 3,
638
- "rate_limit": None,
639
- "auth": None
640
  },
641
  "SCB": {
642
- "name": "🇸🇪 Statistics Sweden",
643
  "description": "Swedish National Statistics Office",
644
- "base_url": "https://api.scb.se",
645
- "discovery_patterns": [
646
- r'/OV0104/v\d+/doris/.*',
647
- r'/\w+/v\d+/.*'
648
- ],
649
- "known_roots": [
650
- "/OV0104/v1/doris/",
651
- "/OV0104/v1/doris/sv/ssd/"
 
 
 
 
 
 
 
 
 
652
  ],
653
- "explore_depth": 5,
654
- "rate_limit": {"requests": 10, "per_seconds": 10},
655
- "auth": None
656
  },
657
  "Kolada": {
658
  "name": "🇸🇪 Kolada",
659
  "description": "Municipal Key Performance Indicators",
660
- "base_url": "https://api.kolada.se",
661
- "discovery_patterns": [
662
- r'/v\d+/.*'
663
- ],
664
- "known_roots": [
665
- "/v2/"
 
 
 
 
 
666
  ],
667
- "explore_depth": 3,
668
- "rate_limit": None,
669
- "auth": None
670
  },
671
  "Eurostat": {
672
  "name": "🇪🇺 Eurostat",
673
  "description": "European Union Statistics",
674
- "base_url": "https://ec.europa.eu/eurostat",
675
- "discovery_patterns": [
676
- r'/api/dissemination/statistics/\d+\.\d+/data/.*'
677
- ],
678
- "known_roots": [
679
- "/api/dissemination/statistics/1.0/data/"
680
  ],
681
- "explore_depth": 2,
682
- "rate_limit": None,
683
- "auth": None
684
  },
685
  "WHO": {
686
  "name": "🌍 WHO",
687
  "description": "World Health Organization",
688
- "base_url": "https://ghoapi.azureedge.net",
689
- "discovery_patterns": [
690
- r'/api/.*'
691
- ],
692
- "known_roots": [
693
- "/api/"
 
 
 
 
 
694
  ],
695
- "explore_depth": 3,
696
- "rate_limit": None,
697
- "auth": None
698
  },
699
  "OECD": {
700
  "name": "🌍 OECD",
701
  "description": "Organisation for Economic Co-operation and Development",
702
- "base_url": "https://sdmx.oecd.org",
703
- "discovery_patterns": [
704
- r'/public/rest/data/.*'
705
- ],
706
- "known_roots": [
707
- "/public/rest/data/"
708
  ],
709
- "explore_depth": 3,
710
- "rate_limit": None,
711
- "auth": None
712
  },
713
  "WorldBank": {
714
  "name": "🌍 World Bank",
715
  "description": "International Financial Institution",
716
- "base_url": "https://api.worldbank.org",
717
- "discovery_patterns": [
718
- r'/v\d+/.*'
719
- ],
720
- "known_roots": [
721
- "/v2/"
 
 
 
 
 
722
  ],
723
- "explore_depth": 4,
724
- "rate_limit": None,
725
- "auth": None
726
  },
727
  "Riksbanken": {
728
  "name": "🇸🇪 Riksbanken",
729
  "description": "Swedish Central Bank",
730
- "base_url": "https://api.riksbank.se",
731
- "discovery_patterns": [
732
- r'/swea/v\d+/.*'
733
- ],
734
- "known_roots": [
735
- "/swea/v1/"
736
  ],
737
- "explore_depth": 3,
738
- "rate_limit": {"requests": 5, "per_seconds": 60},
739
- "auth": None
740
  },
741
  "Swecris": {
742
  "name": "🇸🇪 Swecris",
743
  "description": "Swedish Research Council Database",
744
- "base_url": "https://swecris-api.vr.se",
745
- "discovery_patterns": [
746
- r'/v\d+/.*'
747
- ],
748
- "known_roots": [
749
- "/v1/"
 
 
 
750
  ],
751
- "explore_depth": 2,
752
- "rate_limit": None,
753
- "auth": {"type": "Bearer", "token": "VRSwecrisAPI2025-1"}
754
  },
755
  "CSN": {
756
  "name": "🇸🇪 CSN",
757
  "description": "Swedish Board of Student Finance",
758
- "base_url": "https://statistik.csn.se",
759
- "discovery_patterns": [
760
- r'/PXWeb/api/v\d+/.*'
761
- ],
762
- "known_roots": [
763
- "/PXWeb/api/v1/sv/CSNstat/"
 
 
 
 
 
 
 
 
764
  ],
765
- "explore_depth": 4,
766
- "rate_limit": None,
767
- "auth": None
768
  }
769
  }
770
 
@@ -1155,20 +1169,350 @@ def backup_database(backup_path=None):
1155
  except Exception as e:
1156
  return None
1157
 
1158
- class DeepEndpointDiscoverer:
1159
- """Advanced endpoint discovery with recursive exploration"""
1160
 
1161
  def __init__(self):
1162
  self.session = requests.Session()
1163
  self.session.headers.update({
1164
- 'User-Agent': 'Ultimate-Data-Harvester/2.0 (Deep Discovery & Research Purpose)'
1165
  })
1166
- self.discovered_endpoints = set()
1167
- self.discovery_cache = {}
 
 
 
 
 
 
 
 
 
1168
 
1169
- def discover_all_endpoints(self, api_name: str, progress_callback=None) -> List[Dict]:
1170
- """Discover all possible endpoints for an API with deep exploration"""
1171
- config = DEEP_API_CONFIG.get(api_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1172
  if not config:
1173
  return []
1174
 
@@ -2098,19 +2442,12 @@ class UltimateDataHarvester:
2098
  "total_sessions": total_sessions
2099
  }
2100
 
2101
- # Initialize components
2102
  if 'harvester' not in st.session_state:
2103
- st.session_state.harvester = UltimateDataHarvester()
2104
-
2105
- if 'discovery_status' not in st.session_state:
2106
- st.session_state.discovery_status = {}
2107
-
2108
- if 'current_session' not in st.session_state:
2109
- st.session_state.current_session = None
2110
 
2111
- if 'last_session_info' not in st.session_state:
2112
- last_session = st.session_state.harvester.session_manager.get_last_session()
2113
- st.session_state.last_session_info = last_session
2114
 
2115
  # Enhanced Header
2116
  st.markdown("""
 
616
  semantic_analyzer = None
617
  health_monitor = None
618
 
619
+ # Simplified API Configuration - Real working endpoints
620
# Registry of the APIs the harvester queries, keyed by a short API id.
# Every entry carries:
#   "name"        - display label (emoji flag + title)
#   "description" - one-line description of the data provider
#   "endpoints"   - list of request specs: "url", "headers", "method" and,
#                   for POST requests, a JSON body under "data"
#   "rate_limit"  - None, or {"requests": N, "per_seconds": S}
SIMPLIFIED_API_CONFIG = {
    "Skolverket": {
        "name": "🇸🇪 Skolverket",
        "description": "Swedish National Agency for Education",
        "endpoints": [
            {
                "url": "https://api.skolverket.se/planned-educations/v3",
                # Vendor-specific HAL media type is requested for this endpoint.
                "headers": {"Accept": "application/vnd.skolverket.plannededucations.api.v3.hal+json"},
                "method": "GET"
            },
            {
                "url": "https://api.skolverket.se/skolenhetsregister/v2/skolenhet",
                "headers": {"Accept": "application/json"},
                "method": "GET"
            }
        ],
        "rate_limit": None
    },
    "SCB": {
        "name": "🇸🇪 Statistics Sweden",
        "description": "Swedish National Statistics Office",
        "endpoints": [
            {
                "url": "https://api.scb.se/OV0104/v1/doris/sv/ssd/START/BE/BE0101/BE0101A/BefolkningNy",
                "headers": {"Content-Type": "application/json"},
                "method": "POST",
                # NOTE(review): the selection codes/values below are not validated
                # anywhere in this file - confirm them against the table metadata.
                "data": {
                    "query": [
                        {"code": "Region", "selection": {"filter": "item", "values": ["00"]}},
                        {"code": "Civilstand", "selection": {"filter": "item", "values": ["TOT"]}},
                        {"code": "Alder", "selection": {"filter": "item", "values": ["tot"]}},
                        {"code": "Kon", "selection": {"filter": "item", "values": ["1", "2"]}},
                        {"code": "ContentsCode", "selection": {"filter": "item", "values": ["BE0101N1"]}},
                        {"code": "Tid", "selection": {"filter": "item", "values": ["2023"]}}
                    ],
                    "response": {"format": "json"}
                }
            }
        ],
        "rate_limit": {"requests": 10, "per_seconds": 10}
    },
    "Kolada": {
        "name": "🇸🇪 Kolada",
        "description": "Municipal Key Performance Indicators",
        "endpoints": [
            {
                "url": "https://api.kolada.se/v2/municipality",
                "headers": {"Accept": "application/json"},
                "method": "GET"
            },
            {
                "url": "https://api.kolada.se/v2/kpi",
                "headers": {"Accept": "application/json"},
                "method": "GET"
            }
        ],
        "rate_limit": None
    },
    "Eurostat": {
        "name": "🇪🇺 Eurostat",
        "description": "European Union Statistics",
        "endpoints": [
            {
                # All filters are passed in the query string of the URL.
                "url": "https://ec.europa.eu/eurostat/api/dissemination/statistics/1.0/data/demo_pjan?format=JSON&lang=en&geo=EU27_2020&age=TOTAL&sex=T&time=2023",
                "headers": {"Accept": "application/json"},
                "method": "GET"
            }
        ],
        "rate_limit": None
    },
    "WHO": {
        "name": "🌍 WHO",
        "description": "World Health Organization",
        "endpoints": [
            {
                "url": "https://ghoapi.azureedge.net/api/WHOSIS_000001",
                "headers": {"Accept": "application/json"},
                "method": "GET"
            },
            {
                "url": "https://ghoapi.azureedge.net/api/Dimension",
                "headers": {"Accept": "application/json"},
                "method": "GET"
            }
        ],
        "rate_limit": None
    },
    "OECD": {
        "name": "🌍 OECD",
        "description": "Organisation for Economic Co-operation and Development",
        "endpoints": [
            {
                "url": "https://sdmx.oecd.org/public/rest/data/OECD.SDD.NAD,DSD_NAMAIN1@DF_QNA,1.0/AUS.B1GQ.C.Q?format=jsondata",
                "headers": {"Accept": "application/vnd.sdmx.data+json;version=1.0.0"},
                "method": "GET"
            }
        ],
        "rate_limit": None
    },
    "WorldBank": {
        "name": "🌍 World Bank",
        "description": "International Financial Institution",
        "endpoints": [
            {
                "url": "https://api.worldbank.org/v2/country?format=json&per_page=50",
                "headers": {"Accept": "application/json"},
                "method": "GET"
            },
            {
                "url": "https://api.worldbank.org/v2/indicator/SP.POP.TOTL?format=json&date=2023&per_page=50",
                "headers": {"Accept": "application/json"},
                "method": "GET"
            }
        ],
        "rate_limit": None
    },
    "Riksbanken": {
        "name": "🇸🇪 Riksbanken",
        "description": "Swedish Central Bank",
        "endpoints": [
            {
                "url": "https://api.riksbank.se/swea/v1/Observations/SEKEURPMI/2023-01-01/2023-12-31",
                "headers": {"Accept": "application/json"},
                "method": "GET"
            }
        ],
        "rate_limit": {"requests": 5, "per_seconds": 60}
    },
    "Swecris": {
        "name": "🇸🇪 Swecris",
        "description": "Swedish Research Council Database",
        "endpoints": [
            {
                "url": "https://swecris-api.vr.se/v1/projects?size=50",
                "headers": {
                    "Accept": "application/json",
                    # NOTE(review): bearer token is hard-coded in source - confirm it
                    # is a public/shared key; otherwise load it from configuration.
                    "Authorization": "Bearer VRSwecrisAPI2025-1"
                },
                "method": "GET"
            }
        ],
        "rate_limit": None
    },
    "CSN": {
        "name": "🇸🇪 CSN",
        "description": "Swedish Board of Student Finance",
        "endpoints": [
            {
                "url": "https://statistik.csn.se/PXWeb/api/v1/sv/CSNstat/StudiebidragGymnasieskola/SS0101B1.px",
                "headers": {"Content-Type": "application/json"},
                "method": "POST",
                # NOTE(review): same query-body shape as the SCB entry; the table
                # path and selection values are unverified here - confirm.
                "data": {
                    "query": [
                        {"code": "Region", "selection": {"filter": "item", "values": ["00"]}},
                        {"code": "ContentsCode", "selection": {"filter": "item", "values": ["SS0101B1"]}},
                        {"code": "Tid", "selection": {"filter": "item", "values": ["2023"]}}
                    ],
                    "response": {"format": "json"}
                }
            }
        ],
        "rate_limit": None
    }
}
784
 
 
1169
  except Exception as e:
1170
  return None
1171
 
1172
+ class SimplifiedDataHarvester:
1173
+ """Simplified data harvester - one function to fetch from all APIs"""
1174
 
1175
def __init__(self):
    """Set up the shared HTTP session and the empty result/error stores."""
    http = requests.Session()
    http.headers.update(
        {'User-Agent': 'Simplified-Data-Harvester/1.0 (Research & Analysis)'}
    )
    self.session = http
    # Both dicts are keyed by API id and filled by fetch_all_apis.
    self.results, self.errors = {}, {}
1182
+
1183
def fetch_all_apis(self, progress_callback=None) -> Dict:
    """Fetch every API listed in SIMPLIFIED_API_CONFIG and summarise the run.

    Args:
        progress_callback: Optional callable taking one status string. It is
            invoked at the start, before and after each API, and at the end.

    Returns:
        Dict with "results" (API id -> per-API result), "errors"
        (API id -> error text), "session_id" and a "summary" holding
        "total_apis", "successful", "failed" and "success_rate" (percent).
    """
    def notify(message: str) -> None:
        if progress_callback:
            progress_callback(message)

    # Start every run with fresh containers: the harvester instance is reused
    # between runs, and stale entries would otherwise inflate the counts.
    # Rebinding (instead of .clear()) keeps dicts returned earlier intact.
    self.results = {}
    self.errors = {}

    session_id = f"simplified_{int(time.time())}"
    total_apis = len(SIMPLIFIED_API_CONFIG)
    completed = 0

    notify(f"🚀 Starting comprehensive data collection from {total_apis} APIs...")

    for api_name, config in SIMPLIFIED_API_CONFIG.items():
        notify(f"🔄 Fetching from {config['name']}...")

        try:
            api_results = self._fetch_api_data(api_name, config, session_id)
        except Exception as e:
            # Best effort: one failing API must not abort the whole run.
            self.errors[api_name] = str(e)
            completed += 1
            notify(f"❌ {config['name']} failed: {str(e)[:50]}...")
            continue

        self.results[api_name] = api_results
        completed += 1
        progress = (completed / total_apis) * 100
        notify(f"✅ {config['name']} completed ({progress:.1f}%)")

        # Pause before the next API: spread the configured budget evenly, or
        # use the default delay when no (or a malformed) limit is configured.
        delay = 0.5
        rate_limit = config.get('rate_limit')
        if rate_limit:
            try:
                delay = rate_limit['per_seconds'] / rate_limit['requests']
            except (KeyError, TypeError, ZeroDivisionError):
                delay = 0.5
        time.sleep(delay)

    successful = len(self.results)
    failed = len(self.errors)
    notify(f"🎉 Collection complete! ✅ {successful} successful, ❌ {failed} failed")

    return {
        "results": self.results,
        "errors": self.errors,
        "session_id": session_id,
        "summary": {
            "total_apis": total_apis,
            "successful": successful,
            "failed": failed,
            # An empty configuration has no meaningful rate; report 0.0
            # instead of dividing by zero.
            "success_rate": (successful / total_apis) * 100 if total_apis else 0.0
        }
    }
1235
+
1236
+ def _fetch_api_data(self, api_name: str, config: Dict, session_id: str) -> Dict:
1237
+ """Fetch data from all endpoints for a specific API"""
1238
+ api_results = {
1239
+ "api_name": api_name,
1240
+ "endpoints": [],
1241
+ "total_records": 0,
1242
+ "total_size": 0
1243
+ }
1244
+
1245
+ for i, endpoint in enumerate(config['endpoints']):
1246
+ try:
1247
+ start_time = time.time()
1248
+
1249
+ # Make request
1250
+ if endpoint.get('method', 'GET').upper() == 'POST':
1251
+ response = self.session.post(
1252
+ endpoint['url'],
1253
+ headers=endpoint.get('headers', {}),
1254
+ json=endpoint.get('data', {}),
1255
+ timeout=30
1256
+ )
1257
+ else:
1258
+ response = self.session.get(
1259
+ endpoint['url'],
1260
+ headers=endpoint.get('headers', {}),
1261
+ timeout=30
1262
+ )
1263
+
1264
+ response.raise_for_status()
1265
+
1266
+ # Process response
1267
+ data = self._process_response(response, api_name)
1268
+ fetch_duration = int((time.time() - start_time) * 1000)
1269
+
1270
+ # Extract meaningful data
1271
+ processed_data = self._extract_api_data(data, api_name)
1272
+ record_count = self._count_records(processed_data)
1273
+ data_size = len(response.content)
1274
+
1275
+ # Save to database
1276
+ endpoint_path = f"endpoint_{i+1}"
1277
+ self._save_data_to_db(
1278
+ api_name, endpoint_path, processed_data, session_id,
1279
+ fetch_duration, record_count, data_size, "success"
1280
+ )
1281
+
1282
+ endpoint_result = {
1283
+ "endpoint_url": endpoint['url'],
1284
+ "status": "success",
1285
+ "records": record_count,
1286
+ "size_bytes": data_size,
1287
+ "duration_ms": fetch_duration,
1288
+ "data_preview": self._create_data_preview(processed_data)
1289
+ }
1290
+
1291
+ api_results["endpoints"].append(endpoint_result)
1292
+ api_results["total_records"] += record_count
1293
+ api_results["total_size"] += data_size
1294
+
1295
+ except Exception as e:
1296
+ endpoint_result = {
1297
+ "endpoint_url": endpoint['url'],
1298
+ "status": "error",
1299
+ "error": str(e),
1300
+ "records": 0,
1301
+ "size_bytes": 0,
1302
+ "duration_ms": 0
1303
+ }
1304
+ api_results["endpoints"].append(endpoint_result)
1305
+
1306
+ return api_results
1307
+
1308
+ def _process_response(self, response, api_name: str):
1309
+ """Process API response based on content type"""
1310
+ content_type = response.headers.get('content-type', '').lower()
1311
+
1312
+ if 'json' in content_type:
1313
+ return response.json()
1314
+ elif 'xml' in content_type:
1315
+ return self._xml_to_dict(response.text)
1316
+ else:
1317
+ try:
1318
+ return response.json() # Try JSON first
1319
+ except:
1320
+ return {"raw_content": response.text}
1321
+
1322
+ def _xml_to_dict(self, xml_text: str) -> Dict:
1323
+ """Convert XML to dictionary"""
1324
+ try:
1325
+ import xml.etree.ElementTree as ET
1326
+ root = ET.fromstring(xml_text)
1327
+ return self._element_to_dict(root)
1328
+ except:
1329
+ return {"raw_xml": xml_text}
1330
+
1331
+ def _element_to_dict(self, element) -> Dict:
1332
+ """Convert XML element to dictionary"""
1333
+ result = {}
1334
+
1335
+ if element.attrib:
1336
+ result.update(element.attrib)
1337
+
1338
+ if element.text and element.text.strip():
1339
+ if len(element) == 0:
1340
+ return element.text.strip()
1341
+ result['text'] = element.text.strip()
1342
+
1343
+ for child in element:
1344
+ child_data = self._element_to_dict(child)
1345
+ if child.tag in result:
1346
+ if not isinstance(result[child.tag], list):
1347
+ result[child.tag] = [result[child.tag]]
1348
+ result[child.tag].append(child_data)
1349
+ else:
1350
+ result[child.tag] = child_data
1351
+
1352
+ return result
1353
+
1354
+ def _extract_api_data(self, data: Any, api_name: str) -> Any:
1355
+ """Extract meaningful data from API response based on API type"""
1356
+ if api_name == "Skolverket":
1357
+ if isinstance(data, dict):
1358
+ if "_embedded" in data:
1359
+ return data["_embedded"]
1360
+ elif "skolenheter" in data:
1361
+ return data["skolenheter"]
1362
+ return data
1363
+
1364
+ elif api_name == "SCB":
1365
+ if isinstance(data, dict):
1366
+ return data.get("data", data.get("variables", data))
1367
+
1368
+ elif api_name == "Kolada":
1369
+ if isinstance(data, dict):
1370
+ return data.get("values", data)
1371
+
1372
+ elif api_name == "Eurostat":
1373
+ if isinstance(data, dict):
1374
+ return data.get("value", data.get("data", data))
1375
+
1376
+ elif api_name == "WHO":
1377
+ if isinstance(data, dict):
1378
+ return data.get("value", data.get("fact", data))
1379
+
1380
+ elif api_name == "OECD":
1381
+ if isinstance(data, dict):
1382
+ if "data" in data:
1383
+ return data["data"]
1384
+ return data
1385
+
1386
+ elif api_name == "WorldBank":
1387
+ if isinstance(data, list) and len(data) > 1:
1388
+ return data[1] if data[1] else data[0]
1389
+ return data
1390
+
1391
+ elif api_name == "Riksbanken":
1392
+ if isinstance(data, dict):
1393
+ return data.get("observations", data.get("data", data))
1394
+
1395
+ elif api_name == "Swecris":
1396
+ if isinstance(data, dict):
1397
+ return data.get("items", data.get("projects", data))
1398
+
1399
+ elif api_name == "CSN":
1400
+ if isinstance(data, dict):
1401
+ return data.get("data", data.get("variables", data))
1402
+
1403
+ return data
1404
+
1405
+ def _count_records(self, data: Any) -> int:
1406
+ """Count records in the data"""
1407
+ if isinstance(data, list):
1408
+ return len(data)
1409
+ elif isinstance(data, dict):
1410
+ # Try to find arrays that represent records
1411
+ for key, value in data.items():
1412
+ if isinstance(value, list) and len(value) > 0:
1413
+ return len(value)
1414
+ return 1
1415
+ else:
1416
+ return 1 if data else 0
1417
+
1418
+ def _create_data_preview(self, data: Any) -> Dict:
1419
+ """Create a preview of the data for display"""
1420
+ preview = {
1421
+ "type": type(data).__name__,
1422
+ "sample": None
1423
+ }
1424
+
1425
+ if isinstance(data, list):
1426
+ preview["length"] = len(data)
1427
+ preview["sample"] = data[:3] if len(data) > 3 else data
1428
+ elif isinstance(data, dict):
1429
+ preview["keys"] = list(data.keys())[:10]
1430
+ if data:
1431
+ first_key = list(data.keys())[0]
1432
+ preview["sample"] = {first_key: data[first_key]}
1433
+ else:
1434
+ preview["sample"] = str(data)[:200]
1435
+
1436
+ return preview
1437
+
1438
+ def _save_data_to_db(self, api_name: str, endpoint_path: str, data: Any,
1439
+ session_id: str, fetch_duration: int, record_count: int,
1440
+ data_size: int, status: str, error_message: str = None):
1441
+ """Save data to database with optimization"""
1442
+ import gzip
1443
+
1444
+ conn = sqlite3.connect(DB_PATH)
1445
+ cursor = conn.cursor()
1446
+
1447
+ try:
1448
+ # Create data hash for deduplication
1449
+ data_str = json.dumps(data, sort_keys=True, default=str)
1450
+ data_hash = hashlib.sha256(data_str.encode()).hexdigest()
1451
+
1452
+ # Check if data exists
1453
+ cursor.execute('SELECT id FROM harvested_data WHERE data_hash = ?', (data_hash,))
1454
+ if cursor.fetchone():
1455
+ return # Skip duplicate
1456
+
1457
+ # Compress if large
1458
+ raw_data_compressed = None
1459
+ raw_data = None
1460
+
1461
+ if data_size > 1024:
1462
+ try:
1463
+ raw_data_compressed = gzip.compress(data_str.encode('utf-8'))
1464
+ except:
1465
+ raw_data = data_str
1466
+ else:
1467
+ raw_data = data_str
1468
+
1469
+ # Insert data
1470
+ cursor.execute('''
1471
+ INSERT INTO harvested_data
1472
+ (api_name, endpoint_path, data_hash, raw_data, raw_data_compressed,
1473
+ record_count, data_size_bytes, fetch_duration_ms, status,
1474
+ error_message, session_id, data_format)
1475
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
1476
+ ''', (
1477
+ api_name, endpoint_path, data_hash, raw_data, raw_data_compressed,
1478
+ record_count, data_size, fetch_duration, status, error_message,
1479
+ session_id, self._detect_data_format(data)
1480
+ ))
1481
+
1482
+ conn.commit()
1483
+
1484
+ except Exception as e:
1485
+ # Fallback to basic schema
1486
+ try:
1487
+ cursor.execute('''
1488
+ INSERT OR REPLACE INTO harvested_data
1489
+ (api_name, endpoint_path, data_hash, raw_data, record_count,
1490
+ data_size_bytes, fetch_duration_ms, status, session_id)
1491
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
1492
+ ''', (
1493
+ api_name, endpoint_path, data_hash, data_str[:10000], # Limit size
1494
+ record_count, data_size, fetch_duration, status, session_id
1495
+ ))
1496
+ conn.commit()
1497
+ except:
1498
+ pass # Silent fail
1499
+
1500
+ finally:
1501
+ conn.close()
1502
+
1503
+ def _detect_data_format(self, data: Any) -> str:
1504
+ """Detect data format"""
1505
+ if isinstance(data, dict):
1506
+ if "_embedded" in data or "_links" in data:
1507
+ return "HAL+JSON"
1508
+ elif "query" in data or "variables" in data:
1509
+ return "PX-Web"
1510
+ else:
1511
+ return "JSON"
1512
+ elif isinstance(data, list):
1513
+ return "JSON-Array"
1514
+ else:
1515
+ return "Unknown"
1516
  if not config:
1517
  return []
1518
 
 
2442
  "total_sessions": total_sessions
2443
  }
2444
 
2445
+ # Initialize simplified components
2446
# Create the harvester only when the session state has no 'harvester' entry yet.
# NOTE(review): the key is unchanged from the previous harvester class, so a
# session that already holds an older instance would keep it - confirm this
# cannot happen in practice.
if 'harvester' not in st.session_state:
    st.session_state.harvester = SimplifiedDataHarvester()

# Initialise 'last_results' to None when absent; presumably it later holds the
# return value of the most recent fetch - verify against the UI code.
if 'last_results' not in st.session_state:
    st.session_state.last_results = None
 
2451
 
2452
  # Enhanced Header
2453
  st.markdown("""