isakskogstad commited on
Commit
7186a85
Β·
verified Β·
1 Parent(s): 284efb4

Upload app_ultimate.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app_ultimate.py +571 -43
app_ultimate.py CHANGED
@@ -769,11 +769,18 @@ DEEP_API_CONFIG = {
769
  }
770
 
771
  def init_enhanced_database():
772
- """Initialize enhanced SQLite database with comprehensive schema"""
773
  conn = sqlite3.connect(DB_PATH)
774
  cursor = conn.cursor()
775
 
776
- # Enhanced endpoints table
 
 
 
 
 
 
 
777
  cursor.execute('''
778
  CREATE TABLE IF NOT EXISTS discovered_endpoints (
779
  id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -781,41 +788,71 @@ def init_enhanced_database():
781
  endpoint_path TEXT NOT NULL,
782
  full_url TEXT NOT NULL,
783
  discovery_method TEXT,
784
- depth_level INTEGER,
785
  parent_endpoint TEXT,
786
  endpoint_type TEXT,
787
  last_checked TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
788
  is_active BOOLEAN DEFAULT 1,
789
  response_format TEXT,
790
  parameters_schema TEXT,
 
 
 
791
  UNIQUE(api_name, endpoint_path)
792
  )
793
  ''')
794
 
795
- # Enhanced data storage table
 
 
 
 
 
 
796
  cursor.execute('''
797
  CREATE TABLE IF NOT EXISTS harvested_data (
798
  id INTEGER PRIMARY KEY AUTOINCREMENT,
799
  api_name TEXT NOT NULL,
800
  endpoint_path TEXT NOT NULL,
801
- data_hash TEXT UNIQUE,
802
- raw_data TEXT,
803
- processed_data TEXT,
804
- record_count INTEGER,
805
- data_size_bytes INTEGER,
 
 
806
  fetch_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
807
- fetch_duration_ms INTEGER,
808
  status TEXT DEFAULT 'success',
809
  error_message TEXT,
810
- session_id TEXT
 
 
 
 
 
 
 
 
 
811
  )
812
  ''')
813
 
814
- # Session management table
 
 
 
 
 
 
 
 
 
 
815
  cursor.execute('''
816
  CREATE TABLE IF NOT EXISTS harvest_sessions (
817
  id INTEGER PRIMARY KEY AUTOINCREMENT,
818
- session_id TEXT UNIQUE,
819
  session_name TEXT,
820
  started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
821
  last_activity TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
@@ -825,14 +862,26 @@ def init_enhanced_database():
825
  successful_fetches INTEGER DEFAULT 0,
826
  failed_fetches INTEGER DEFAULT 0,
827
  total_records INTEGER DEFAULT 0,
 
828
  session_status TEXT DEFAULT 'active',
829
  current_api TEXT,
830
  current_endpoint TEXT,
831
- session_config TEXT
 
 
 
 
 
832
  )
833
  ''')
834
 
835
- # Discovery progress table
 
 
 
 
 
 
836
  cursor.execute('''
837
  CREATE TABLE IF NOT EXISTS discovery_progress (
838
  id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -843,13 +892,269 @@ def init_enhanced_database():
843
  endpoints_found INTEGER DEFAULT 0,
844
  depth_reached INTEGER DEFAULT 0,
845
  discovery_status TEXT DEFAULT 'running',
846
- discovery_config TEXT
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
847
  )
848
  ''')
849
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
850
  conn.commit()
851
  conn.close()
852
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
853
  class DeepEndpointDiscoverer:
854
  """Advanced endpoint discovery with recursive exploration"""
855
 
@@ -1535,7 +1840,9 @@ class UltimateDataHarvester:
1535
  def _save_harvested_data(self, api_name: str, endpoint_path: str, data: Any,
1536
  session_id: str, fetch_duration: int, record_count: int,
1537
  data_size: int, status: str = "success", error_message: str = None):
1538
- """Save harvested data with AI-enhanced intelligent categorization"""
 
 
1539
  conn = sqlite3.connect(DB_PATH)
1540
  cursor = conn.cursor()
1541
 
@@ -1543,6 +1850,19 @@ class UltimateDataHarvester:
1543
  data_str = json.dumps(data, sort_keys=True, default=str)
1544
  data_hash = hashlib.sha256(data_str.encode()).hexdigest()
1545
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1546
  # AI Quality Assessment
1547
  quality_assessment = {}
1548
  if ai_quality_assessor and status == "success":
@@ -1561,20 +1881,57 @@ class UltimateDataHarvester:
1561
  api_name, fetch_duration, success_rate, data_size
1562
  )
1563
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1564
  try:
1565
  cursor.execute('''
1566
- INSERT OR REPLACE INTO harvested_data
1567
- (api_name, endpoint_path, data_hash, raw_data, processed_data,
 
1568
  record_count, data_size_bytes, fetch_duration_ms, status,
1569
- error_message, session_id, quality_score, health_score, similar_datasets)
1570
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
 
1571
  ''', (
1572
- api_name, endpoint_path, data_hash, data_str,
1573
- json.dumps(data, default=str), record_count, data_size,
1574
- fetch_duration, status, error_message, session_id,
1575
  quality_assessment.get('ai_quality_score', 0.0),
1576
  health_info.get('health_score', 0.0),
1577
- json.dumps(similar_datasets[:3], default=str) # Top 3 similar datasets
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1578
  ))
1579
 
1580
  conn.commit()
@@ -1583,27 +1940,45 @@ class UltimateDataHarvester:
1583
  if quality_assessment and st.session_state.get('show_ai_insights', True):
1584
  self._display_ai_insights(api_name, quality_assessment, health_info, similar_datasets)
1585
 
1586
- except sqlite3.IntegrityError:
1587
- pass # Data already exists
1588
- except sqlite3.OperationalError:
1589
- # Handle case where AI columns don't exist yet - add them
1590
- self._upgrade_database_schema()
1591
- # Retry with basic data
1592
- cursor.execute('''
1593
- INSERT OR REPLACE INTO harvested_data
1594
- (api_name, endpoint_path, data_hash, raw_data, processed_data,
1595
- record_count, data_size_bytes, fetch_duration_ms, status,
1596
- error_message, session_id)
1597
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
1598
- ''', (
1599
- api_name, endpoint_path, data_hash, data_str,
1600
- json.dumps(data, default=str), record_count, data_size,
1601
- fetch_duration, status, error_message, session_id
1602
- ))
1603
- conn.commit()
1604
  finally:
1605
  conn.close()
1606
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1607
  def _display_ai_insights(self, api_name: str, quality_assessment: Dict,
1608
  health_info: Dict, similar_datasets: List[Dict]):
1609
  """Display AI-powered insights in real-time"""
@@ -2062,6 +2437,159 @@ with tab3:
2062
  finally:
2063
  conn.close()
2064
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2065
  # AI Enhancement Panel
2066
  st.markdown("---")
2067
  with st.expander("πŸ€– AI Enhancement Status", expanded=False):
 
769
  }
770
 
771
  def init_enhanced_database():
772
+ """Initialize optimized SQLite database with comprehensive schema and performance enhancements"""
773
  conn = sqlite3.connect(DB_PATH)
774
  cursor = conn.cursor()
775
 
776
+ # Enable WAL mode for better concurrency and performance
777
+ cursor.execute('PRAGMA journal_mode=WAL')
778
+ cursor.execute('PRAGMA synchronous=NORMAL')
779
+ cursor.execute('PRAGMA cache_size=10000')
780
+ cursor.execute('PRAGMA temp_store=MEMORY')
781
+ cursor.execute('PRAGMA mmap_size=268435456') # 256MB
782
+
783
+ # Enhanced endpoints table with better indexing
784
  cursor.execute('''
785
  CREATE TABLE IF NOT EXISTS discovered_endpoints (
786
  id INTEGER PRIMARY KEY AUTOINCREMENT,
 
788
  endpoint_path TEXT NOT NULL,
789
  full_url TEXT NOT NULL,
790
  discovery_method TEXT,
791
+ depth_level INTEGER DEFAULT 0,
792
  parent_endpoint TEXT,
793
  endpoint_type TEXT,
794
  last_checked TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
795
  is_active BOOLEAN DEFAULT 1,
796
  response_format TEXT,
797
  parameters_schema TEXT,
798
+ estimated_records INTEGER DEFAULT 0,
799
+ last_fetch_status TEXT,
800
+ creation_date DATE DEFAULT (date('now')),
801
  UNIQUE(api_name, endpoint_path)
802
  )
803
  ''')
804
 
805
+ # Create indexes for endpoints table
806
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_endpoints_api_name ON discovered_endpoints(api_name)')
807
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_endpoints_active ON discovered_endpoints(is_active)')
808
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_endpoints_last_checked ON discovered_endpoints(last_checked)')
809
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_endpoints_depth ON discovered_endpoints(depth_level)')
810
+
811
+ # Optimized data storage table with compression and partitioning support
812
  cursor.execute('''
813
  CREATE TABLE IF NOT EXISTS harvested_data (
814
  id INTEGER PRIMARY KEY AUTOINCREMENT,
815
  api_name TEXT NOT NULL,
816
  endpoint_path TEXT NOT NULL,
817
+ data_hash TEXT UNIQUE NOT NULL,
818
+ raw_data_compressed BLOB,
819
+ processed_data_compressed BLOB,
820
+ raw_data_size INTEGER,
821
+ processed_data_size INTEGER,
822
+ record_count INTEGER DEFAULT 0,
823
+ data_size_bytes INTEGER DEFAULT 0,
824
  fetch_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
825
+ fetch_duration_ms INTEGER DEFAULT 0,
826
  status TEXT DEFAULT 'success',
827
  error_message TEXT,
828
+ session_id TEXT,
829
+ quality_score REAL DEFAULT 0.0,
830
+ health_score REAL DEFAULT 0.0,
831
+ similar_datasets TEXT DEFAULT '[]',
832
+ data_format TEXT,
833
+ api_version TEXT,
834
+ fetch_date DATE DEFAULT (date('now')),
835
+ last_accessed TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
836
+ access_count INTEGER DEFAULT 1,
837
+ CHECK (status IN ('success', 'error', 'partial', 'timeout'))
838
  )
839
  ''')
840
 
841
+ # Create comprehensive indexes for data table
842
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_api_name ON harvested_data(api_name)')
843
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_fetch_date ON harvested_data(fetch_date)')
844
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_status ON harvested_data(status)')
845
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_session ON harvested_data(session_id)')
846
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_hash ON harvested_data(data_hash)')
847
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_quality ON harvested_data(quality_score)')
848
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_records ON harvested_data(record_count)')
849
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_size ON harvested_data(data_size_bytes)')
850
+
851
+ # Enhanced session management table
852
  cursor.execute('''
853
  CREATE TABLE IF NOT EXISTS harvest_sessions (
854
  id INTEGER PRIMARY KEY AUTOINCREMENT,
855
+ session_id TEXT UNIQUE NOT NULL,
856
  session_name TEXT,
857
  started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
858
  last_activity TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
 
862
  successful_fetches INTEGER DEFAULT 0,
863
  failed_fetches INTEGER DEFAULT 0,
864
  total_records INTEGER DEFAULT 0,
865
+ total_data_size INTEGER DEFAULT 0,
866
  session_status TEXT DEFAULT 'active',
867
  current_api TEXT,
868
  current_endpoint TEXT,
869
+ session_config TEXT,
870
+ error_count INTEGER DEFAULT 0,
871
+ avg_fetch_time REAL DEFAULT 0.0,
872
+ session_type TEXT DEFAULT 'manual',
873
+ priority INTEGER DEFAULT 1,
874
+ CHECK (session_status IN ('active', 'paused', 'completed', 'failed', 'cancelled'))
875
  )
876
  ''')
877
 
878
+ # Create indexes for sessions table
879
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_sessions_status ON harvest_sessions(session_status)')
880
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_sessions_started ON harvest_sessions(started_at)')
881
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_sessions_activity ON harvest_sessions(last_activity)')
882
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_sessions_priority ON harvest_sessions(priority)')
883
+
884
+ # Enhanced discovery progress table
885
  cursor.execute('''
886
  CREATE TABLE IF NOT EXISTS discovery_progress (
887
  id INTEGER PRIMARY KEY AUTOINCREMENT,
 
892
  endpoints_found INTEGER DEFAULT 0,
893
  depth_reached INTEGER DEFAULT 0,
894
  discovery_status TEXT DEFAULT 'running',
895
+ discovery_config TEXT,
896
+ errors_encountered INTEGER DEFAULT 0,
897
+ success_rate REAL DEFAULT 0.0,
898
+ estimated_total INTEGER DEFAULT 0,
899
+ CHECK (discovery_status IN ('running', 'completed', 'failed', 'paused'))
900
+ )
901
+ ''')
902
+
903
+ # Create indexes for discovery table
904
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_discovery_api ON discovery_progress(api_name)')
905
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_discovery_status ON discovery_progress(discovery_status)')
906
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_discovery_started ON discovery_progress(started_at)')
907
+
908
+ # Data quality and metadata table
909
+ cursor.execute('''
910
+ CREATE TABLE IF NOT EXISTS data_quality_metrics (
911
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
912
+ data_id INTEGER REFERENCES harvested_data(id),
913
+ api_name TEXT NOT NULL,
914
+ completeness_score REAL DEFAULT 0.0,
915
+ consistency_score REAL DEFAULT 0.0,
916
+ accuracy_score REAL DEFAULT 0.0,
917
+ timeliness_score REAL DEFAULT 0.0,
918
+ overall_quality REAL DEFAULT 0.0,
919
+ anomalies_detected INTEGER DEFAULT 0,
920
+ anomaly_details TEXT,
921
+ validation_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
922
+ validation_rules_version TEXT DEFAULT '1.0'
923
  )
924
  ''')
925
 
926
+ # Create quality metrics indexes
927
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_quality_api ON data_quality_metrics(api_name)')
928
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_quality_overall ON data_quality_metrics(overall_quality)')
929
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_quality_timestamp ON data_quality_metrics(validation_timestamp)')
930
+
931
+ # API performance tracking table
932
+ cursor.execute('''
933
+ CREATE TABLE IF NOT EXISTS api_performance_log (
934
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
935
+ api_name TEXT NOT NULL,
936
+ endpoint_path TEXT NOT NULL,
937
+ response_time_ms INTEGER,
938
+ response_size_bytes INTEGER,
939
+ http_status_code INTEGER,
940
+ success BOOLEAN,
941
+ error_type TEXT,
942
+ timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
943
+ date_only DATE DEFAULT (date('now'))
944
+ )
945
+ ''')
946
+
947
+ # Create performance indexes
948
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_perf_api_date ON api_performance_log(api_name, date_only)')
949
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_perf_success ON api_performance_log(success)')
950
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_perf_response_time ON api_performance_log(response_time_ms)')
951
+
952
+ # Data archival management table
953
+ cursor.execute('''
954
+ CREATE TABLE IF NOT EXISTS data_archive_log (
955
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
956
+ original_data_id INTEGER,
957
+ archive_path TEXT,
958
+ archive_format TEXT DEFAULT 'gzip',
959
+ archived_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
960
+ original_size INTEGER,
961
+ compressed_size INTEGER,
962
+ compression_ratio REAL,
963
+ checksum TEXT,
964
+ retention_date DATE,
965
+ archive_status TEXT DEFAULT 'active'
966
+ )
967
+ ''')
968
+
969
+ # Create views for common queries
970
+ cursor.execute('''
971
+ CREATE VIEW IF NOT EXISTS v_api_summary AS
972
+ SELECT
973
+ api_name,
974
+ COUNT(*) as total_fetches,
975
+ COUNT(CASE WHEN status = 'success' THEN 1 END) as successful_fetches,
976
+ SUM(record_count) as total_records,
977
+ SUM(data_size_bytes) as total_data_size,
978
+ AVG(fetch_duration_ms) as avg_fetch_time,
979
+ AVG(quality_score) as avg_quality_score,
980
+ MAX(fetch_timestamp) as last_fetch,
981
+ MIN(fetch_timestamp) as first_fetch
982
+ FROM harvested_data
983
+ GROUP BY api_name
984
+ ''')
985
+
986
+ cursor.execute('''
987
+ CREATE VIEW IF NOT EXISTS v_session_summary AS
988
+ SELECT
989
+ session_id,
990
+ session_name,
991
+ session_status,
992
+ started_at,
993
+ completed_at,
994
+ total_endpoints,
995
+ processed_endpoints,
996
+ successful_fetches,
997
+ failed_fetches,
998
+ total_records,
999
+ total_data_size,
1000
+ CASE
1001
+ WHEN total_endpoints > 0 THEN
1002
+ ROUND((processed_endpoints * 100.0) / total_endpoints, 2)
1003
+ ELSE 0
1004
+ END as completion_percentage,
1005
+ CASE
1006
+ WHEN processed_endpoints > 0 THEN
1007
+ ROUND((successful_fetches * 100.0) / processed_endpoints, 2)
1008
+ ELSE 0
1009
+ END as success_percentage
1010
+ FROM harvest_sessions
1011
+ ''')
1012
+
1013
+ # Enable automatic statistics collection
1014
+ cursor.execute('PRAGMA optimize')
1015
+
1016
  conn.commit()
1017
  conn.close()
1018
 
1019
+ # Database optimization and maintenance functions
1020
+ def optimize_database():
1021
+ """Perform database optimization and maintenance"""
1022
+ conn = sqlite3.connect(DB_PATH)
1023
+ cursor = conn.cursor()
1024
+
1025
+ try:
1026
+ # Update statistics
1027
+ cursor.execute('ANALYZE')
1028
+
1029
+ # Vacuum if necessary (reclaim space)
1030
+ cursor.execute('PRAGMA auto_vacuum=INCREMENTAL')
1031
+ cursor.execute('PRAGMA incremental_vacuum')
1032
+
1033
+ # Optimize query planner
1034
+ cursor.execute('PRAGMA optimize')
1035
+
1036
+ conn.commit()
1037
+ return True
1038
+ except Exception as e:
1039
+ return False
1040
+ finally:
1041
+ conn.close()
1042
+
1043
+ def get_database_stats():
1044
+ """Get comprehensive database statistics"""
1045
+ conn = sqlite3.connect(DB_PATH)
1046
+ cursor = conn.cursor()
1047
+
1048
+ try:
1049
+ stats = {}
1050
+
1051
+ # Basic table counts
1052
+ tables = ['discovered_endpoints', 'harvested_data', 'harvest_sessions', 'discovery_progress']
1053
+ for table in tables:
1054
+ cursor.execute(f'SELECT COUNT(*) FROM {table}')
1055
+ stats[f'{table}_count'] = cursor.fetchone()[0]
1056
+
1057
+ # Database size
1058
+ cursor.execute('PRAGMA page_count')
1059
+ page_count = cursor.fetchone()[0]
1060
+ cursor.execute('PRAGMA page_size')
1061
+ page_size = cursor.fetchone()[0]
1062
+ stats['database_size_mb'] = round((page_count * page_size) / (1024 * 1024), 2)
1063
+
1064
+ # Data quality stats
1065
+ cursor.execute('SELECT AVG(quality_score), AVG(health_score) FROM harvested_data WHERE status = "success"')
1066
+ quality_stats = cursor.fetchone()
1067
+ stats['avg_quality_score'] = round(quality_stats[0] or 0, 3)
1068
+ stats['avg_health_score'] = round(quality_stats[1] or 0, 3)
1069
+
1070
+ # Recent activity
1071
+ cursor.execute('''
1072
+ SELECT COUNT(*) FROM harvested_data
1073
+ WHERE fetch_timestamp > datetime('now', '-24 hours')
1074
+ ''')
1075
+ stats['recent_fetches_24h'] = cursor.fetchone()[0]
1076
+
1077
+ return stats
1078
+
1079
+ finally:
1080
+ conn.close()
1081
+
1082
+ def compress_old_data(days_old=30):
1083
+ """Compress data older than specified days"""
1084
+ import gzip
1085
+ import json
1086
+
1087
+ conn = sqlite3.connect(DB_PATH)
1088
+ cursor = conn.cursor()
1089
+
1090
+ try:
1091
+ # Find old data to compress
1092
+ cursor.execute('''
1093
+ SELECT id, raw_data, processed_data
1094
+ FROM harvested_data
1095
+ WHERE fetch_timestamp < datetime('now', '-{} days')
1096
+ AND raw_data_compressed IS NULL
1097
+ '''.format(days_old))
1098
+
1099
+ old_records = cursor.fetchall()
1100
+ compressed_count = 0
1101
+
1102
+ for record_id, raw_data, processed_data in old_records:
1103
+ try:
1104
+ # Compress raw data
1105
+ raw_compressed = None
1106
+ if raw_data:
1107
+ raw_compressed = gzip.compress(raw_data.encode('utf-8'))
1108
+
1109
+ # Compress processed data
1110
+ processed_compressed = None
1111
+ if processed_data:
1112
+ processed_compressed = gzip.compress(processed_data.encode('utf-8'))
1113
+
1114
+ # Update record with compressed data
1115
+ cursor.execute('''
1116
+ UPDATE harvested_data
1117
+ SET raw_data_compressed = ?,
1118
+ processed_data_compressed = ?,
1119
+ raw_data = NULL,
1120
+ processed_data = NULL,
1121
+ raw_data_size = ?,
1122
+ processed_data_size = ?
1123
+ WHERE id = ?
1124
+ ''', (
1125
+ raw_compressed,
1126
+ processed_compressed,
1127
+ len(raw_data) if raw_data else 0,
1128
+ len(processed_data) if processed_data else 0,
1129
+ record_id
1130
+ ))
1131
+
1132
+ compressed_count += 1
1133
+
1134
+ except Exception as e:
1135
+ continue # Skip problematic records
1136
+
1137
+ conn.commit()
1138
+ return compressed_count
1139
+
1140
+ finally:
1141
+ conn.close()
1142
+
1143
+ def backup_database(backup_path=None):
1144
+ """Create a backup of the database"""
1145
+ import shutil
1146
+ from datetime import datetime
1147
+
1148
+ if backup_path is None:
1149
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
1150
+ backup_path = f"backup_harvester_{timestamp}.db"
1151
+
1152
+ try:
1153
+ shutil.copy2(DB_PATH, backup_path)
1154
+ return backup_path
1155
+ except Exception as e:
1156
+ return None
1157
+
1158
  class DeepEndpointDiscoverer:
1159
  """Advanced endpoint discovery with recursive exploration"""
1160
 
 
1840
  def _save_harvested_data(self, api_name: str, endpoint_path: str, data: Any,
1841
  session_id: str, fetch_duration: int, record_count: int,
1842
  data_size: int, status: str = "success", error_message: str = None):
1843
+ """Save harvested data with optimized storage and AI-enhanced analysis"""
1844
+ import gzip
1845
+
1846
  conn = sqlite3.connect(DB_PATH)
1847
  cursor = conn.cursor()
1848
 
 
1850
  data_str = json.dumps(data, sort_keys=True, default=str)
1851
  data_hash = hashlib.sha256(data_str.encode()).hexdigest()
1852
 
1853
+ # Check if this data already exists
1854
+ cursor.execute('SELECT id FROM harvested_data WHERE data_hash = ?', (data_hash,))
1855
+ if cursor.fetchone():
1856
+ # Update access count and last accessed time
1857
+ cursor.execute('''
1858
+ UPDATE harvested_data
1859
+ SET access_count = access_count + 1, last_accessed = CURRENT_TIMESTAMP
1860
+ WHERE data_hash = ?
1861
+ ''', (data_hash,))
1862
+ conn.commit()
1863
+ conn.close()
1864
+ return
1865
+
1866
  # AI Quality Assessment
1867
  quality_assessment = {}
1868
  if ai_quality_assessor and status == "success":
 
1881
  api_name, fetch_duration, success_rate, data_size
1882
  )
1883
 
1884
+ # Determine data format
1885
+ data_format = self._detect_data_format(data)
1886
+
1887
+ # Compress data if it's large
1888
+ raw_data_compressed = None
1889
+ processed_data_compressed = None
1890
+ raw_data = None
1891
+ processed_data = None
1892
+
1893
+ if data_size > 1024: # Compress if larger than 1KB
1894
+ try:
1895
+ raw_data_compressed = gzip.compress(data_str.encode('utf-8'))
1896
+ processed_data_compressed = gzip.compress(json.dumps(data, default=str).encode('utf-8'))
1897
+ except:
1898
+ # Fallback to uncompressed storage
1899
+ raw_data = data_str
1900
+ processed_data = json.dumps(data, default=str)
1901
+ else:
1902
+ raw_data = data_str
1903
+ processed_data = json.dumps(data, default=str)
1904
+
1905
  try:
1906
  cursor.execute('''
1907
+ INSERT INTO harvested_data
1908
+ (api_name, endpoint_path, data_hash, raw_data_compressed, processed_data_compressed,
1909
+ raw_data, processed_data, raw_data_size, processed_data_size,
1910
  record_count, data_size_bytes, fetch_duration_ms, status,
1911
+ error_message, session_id, quality_score, health_score, similar_datasets,
1912
+ data_format, access_count)
1913
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
1914
  ''', (
1915
+ api_name, endpoint_path, data_hash, raw_data_compressed, processed_data_compressed,
1916
+ raw_data, processed_data, len(data_str), len(json.dumps(data, default=str)),
1917
+ record_count, data_size, fetch_duration, status, error_message, session_id,
1918
  quality_assessment.get('ai_quality_score', 0.0),
1919
  health_info.get('health_score', 0.0),
1920
+ json.dumps(similar_datasets[:3], default=str),
1921
+ data_format, 1
1922
+ ))
1923
+
1924
+ # Log API performance
1925
+ cursor.execute('''
1926
+ INSERT INTO api_performance_log
1927
+ (api_name, endpoint_path, response_time_ms, response_size_bytes,
1928
+ http_status_code, success, error_type)
1929
+ VALUES (?, ?, ?, ?, ?, ?, ?)
1930
+ ''', (
1931
+ api_name, endpoint_path, fetch_duration, data_size,
1932
+ 200 if status == "success" else 500,
1933
+ status == "success",
1934
+ error_message if status != "success" else None
1935
  ))
1936
 
1937
  conn.commit()
 
1940
  if quality_assessment and st.session_state.get('show_ai_insights', True):
1941
  self._display_ai_insights(api_name, quality_assessment, health_info, similar_datasets)
1942
 
1943
+ except sqlite3.OperationalError as e:
1944
+ # Handle database schema updates
1945
+ if "no such column" in str(e):
1946
+ self._upgrade_database_schema()
1947
+ # Retry with basic data structure
1948
+ cursor.execute('''
1949
+ INSERT OR REPLACE INTO harvested_data
1950
+ (api_name, endpoint_path, data_hash, raw_data, processed_data,
1951
+ record_count, data_size_bytes, fetch_duration_ms, status,
1952
+ error_message, session_id)
1953
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
1954
+ ''', (
1955
+ api_name, endpoint_path, data_hash, raw_data or data_str,
1956
+ processed_data or json.dumps(data, default=str), record_count, data_size,
1957
+ fetch_duration, status, error_message, session_id
1958
+ ))
1959
+ conn.commit()
 
1960
  finally:
1961
  conn.close()
1962
 
1963
+ def _detect_data_format(self, data: Any) -> str:
1964
+ """Detect the format of the data"""
1965
+ if isinstance(data, dict):
1966
+ if "_embedded" in data or "_links" in data:
1967
+ return "HAL+JSON"
1968
+ elif "dataSets" in data or "structure" in data:
1969
+ return "SDMX-JSON"
1970
+ else:
1971
+ return "JSON"
1972
+ elif isinstance(data, list):
1973
+ return "JSON-Array"
1974
+ elif isinstance(data, str):
1975
+ if data.strip().startswith('<'):
1976
+ return "XML"
1977
+ else:
1978
+ return "Text"
1979
+ else:
1980
+ return "Unknown"
1981
+
1982
  def _display_ai_insights(self, api_name: str, quality_assessment: Dict,
1983
  health_info: Dict, similar_datasets: List[Dict]):
1984
  """Display AI-powered insights in real-time"""
 
2437
  finally:
2438
  conn.close()
2439
 
2440
+ # Database Management Section
2441
+ with st.expander("πŸ—„οΈ Database Management & Statistics", expanded=False):
2442
+ st.markdown("**Database Performance & Maintenance Tools**")
2443
+
2444
+ col1, col2, col3, col4 = st.columns(4)
2445
+
2446
+ with col1:
2447
+ if st.button("πŸ“Š Get Database Stats", use_container_width=True):
2448
+ with st.spinner("Analyzing database..."):
2449
+ stats = get_database_stats()
2450
+
2451
+ st.markdown("**Database Statistics:**")
2452
+ for key, value in stats.items():
2453
+ formatted_key = key.replace('_', ' ').title()
2454
+ if 'size_mb' in key:
2455
+ st.metric(formatted_key, f"{value} MB")
2456
+ elif 'score' in key:
2457
+ st.metric(formatted_key, f"{value:.3f}")
2458
+ else:
2459
+ st.metric(formatted_key, value)
2460
+
2461
+ with col2:
2462
+ if st.button("πŸ”§ Optimize Database", use_container_width=True):
2463
+ with st.spinner("Optimizing database..."):
2464
+ success = optimize_database()
2465
+ if success:
2466
+ st.success("βœ… Database optimized successfully!")
2467
+ else:
2468
+ st.error("❌ Database optimization failed")
2469
+
2470
+ with col3:
2471
+ if st.button("πŸ—œοΈ Compress Old Data", use_container_width=True):
2472
+ with st.spinner("Compressing old data..."):
2473
+ compressed_count = compress_old_data(days_old=7) # Compress data older than 7 days
2474
+ if compressed_count > 0:
2475
+ st.success(f"βœ… Compressed {compressed_count} old records")
2476
+ else:
2477
+ st.info("ℹ️ No old data found to compress")
2478
+
2479
+ with col4:
2480
+ if st.button("πŸ’Ύ Create Backup", use_container_width=True):
2481
+ with st.spinner("Creating backup..."):
2482
+ backup_path = backup_database()
2483
+ if backup_path:
2484
+ st.success(f"βœ… Backup created: {backup_path}")
2485
+ # Offer download
2486
+ try:
2487
+ with open(backup_path, 'rb') as f:
2488
+ st.download_button(
2489
+ label="⬇️ Download Backup",
2490
+ data=f.read(),
2491
+ file_name=backup_path,
2492
+ mime="application/x-sqlite3"
2493
+ )
2494
+ except:
2495
+ pass
2496
+ else:
2497
+ st.error("❌ Backup creation failed")
2498
+
2499
+ # Enhanced database insights
2500
+ st.markdown("---")
2501
+
2502
+ try:
2503
+ conn = sqlite3.connect(DB_PATH)
2504
+
2505
+ # Show recent activity summary
2506
+ col1, col2 = st.columns(2)
2507
+
2508
+ with col1:
2509
+ st.markdown("**πŸ“ˆ Recent Activity (Last 24h)**")
2510
+ df_recent = pd.read_sql_query('''
2511
+ SELECT api_name, COUNT(*) as fetches, SUM(record_count) as records
2512
+ FROM harvested_data
2513
+ WHERE fetch_timestamp > datetime('now', '-1 day')
2514
+ GROUP BY api_name
2515
+ ORDER BY fetches DESC
2516
+ ''', conn)
2517
+
2518
+ if not df_recent.empty:
2519
+ st.dataframe(df_recent, use_container_width=True)
2520
+ else:
2521
+ st.info("No recent activity")
2522
+
2523
+ with col2:
2524
+ st.markdown("**🎯 Data Quality Overview**")
2525
+ df_quality = pd.read_sql_query('''
2526
+ SELECT
2527
+ api_name,
2528
+ ROUND(AVG(quality_score), 3) as avg_quality,
2529
+ ROUND(AVG(health_score), 3) as avg_health,
2530
+ COUNT(*) as total_records
2531
+ FROM harvested_data
2532
+ WHERE status = 'success' AND quality_score > 0
2533
+ GROUP BY api_name
2534
+ ORDER BY avg_quality DESC
2535
+ ''', conn)
2536
+
2537
+ if not df_quality.empty:
2538
+ st.dataframe(df_quality, use_container_width=True)
2539
+ else:
2540
+ st.info("No quality data available")
2541
+
2542
+ conn.close()
2543
+
2544
+ except Exception as e:
2545
+ st.error(f"Database error: {e}")
2546
+
2547
+ # Storage efficiency metrics
2548
+ st.markdown("**πŸ’Ύ Storage Efficiency**")
2549
+
2550
+ try:
2551
+ conn = sqlite3.connect(DB_PATH)
2552
+ cursor = conn.cursor()
2553
+
2554
+ # Calculate compression ratios
2555
+ cursor.execute('''
2556
+ SELECT
2557
+ COUNT(*) as total_records,
2558
+ COUNT(CASE WHEN raw_data_compressed IS NOT NULL THEN 1 END) as compressed_records,
2559
+ SUM(data_size_bytes) as total_original_size,
2560
+ SUM(CASE WHEN raw_data_compressed IS NOT NULL THEN raw_data_size ELSE data_size_bytes END) as effective_size
2561
+ FROM harvested_data
2562
+ ''')
2563
+
2564
+ storage_stats = cursor.fetchone()
2565
+
2566
+ if storage_stats and storage_stats[0] > 0:
2567
+ total_records, compressed_records, original_size, effective_size = storage_stats
2568
+
2569
+ col1, col2, col3, col4 = st.columns(4)
2570
+
2571
+ with col1:
2572
+ st.metric("Total Records", total_records)
2573
+
2574
+ with col2:
2575
+ st.metric("Compressed Records", compressed_records)
2576
+
2577
+ with col3:
2578
+ compression_ratio = 0
2579
+ if original_size and effective_size:
2580
+ compression_ratio = (1 - effective_size / original_size) * 100
2581
+ st.metric("Compression Ratio", f"{compression_ratio:.1f}%")
2582
+
2583
+ with col4:
2584
+ space_saved = (original_size - effective_size) if original_size and effective_size else 0
2585
+ space_saved_mb = space_saved / (1024 * 1024)
2586
+ st.metric("Space Saved", f"{space_saved_mb:.2f} MB")
2587
+
2588
+ conn.close()
2589
+
2590
+ except Exception as e:
2591
+ st.warning(f"Could not calculate storage metrics: {e}")
2592
+
2593
  # AI Enhancement Panel
2594
  st.markdown("---")
2595
  with st.expander("πŸ€– AI Enhancement Status", expanded=False):