Spaces:
Sleeping
Sleeping
Upload app_ultimate.py with huggingface_hub
Browse files- app_ultimate.py +571 -43
app_ultimate.py
CHANGED
|
@@ -769,11 +769,18 @@ DEEP_API_CONFIG = {
|
|
| 769 |
}
|
| 770 |
|
| 771 |
def init_enhanced_database():
|
| 772 |
-
"""Initialize
|
| 773 |
conn = sqlite3.connect(DB_PATH)
|
| 774 |
cursor = conn.cursor()
|
| 775 |
|
| 776 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 777 |
cursor.execute('''
|
| 778 |
CREATE TABLE IF NOT EXISTS discovered_endpoints (
|
| 779 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
@@ -781,41 +788,71 @@ def init_enhanced_database():
|
|
| 781 |
endpoint_path TEXT NOT NULL,
|
| 782 |
full_url TEXT NOT NULL,
|
| 783 |
discovery_method TEXT,
|
| 784 |
-
depth_level INTEGER,
|
| 785 |
parent_endpoint TEXT,
|
| 786 |
endpoint_type TEXT,
|
| 787 |
last_checked TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
| 788 |
is_active BOOLEAN DEFAULT 1,
|
| 789 |
response_format TEXT,
|
| 790 |
parameters_schema TEXT,
|
|
|
|
|
|
|
|
|
|
| 791 |
UNIQUE(api_name, endpoint_path)
|
| 792 |
)
|
| 793 |
''')
|
| 794 |
|
| 795 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 796 |
cursor.execute('''
|
| 797 |
CREATE TABLE IF NOT EXISTS harvested_data (
|
| 798 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 799 |
api_name TEXT NOT NULL,
|
| 800 |
endpoint_path TEXT NOT NULL,
|
| 801 |
-
data_hash TEXT UNIQUE,
|
| 802 |
-
|
| 803 |
-
|
| 804 |
-
|
| 805 |
-
|
|
|
|
|
|
|
| 806 |
fetch_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
| 807 |
-
fetch_duration_ms INTEGER,
|
| 808 |
status TEXT DEFAULT 'success',
|
| 809 |
error_message TEXT,
|
| 810 |
-
session_id TEXT
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 811 |
)
|
| 812 |
''')
|
| 813 |
|
| 814 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 815 |
cursor.execute('''
|
| 816 |
CREATE TABLE IF NOT EXISTS harvest_sessions (
|
| 817 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 818 |
-
session_id TEXT UNIQUE,
|
| 819 |
session_name TEXT,
|
| 820 |
started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
| 821 |
last_activity TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
|
@@ -825,14 +862,26 @@ def init_enhanced_database():
|
|
| 825 |
successful_fetches INTEGER DEFAULT 0,
|
| 826 |
failed_fetches INTEGER DEFAULT 0,
|
| 827 |
total_records INTEGER DEFAULT 0,
|
|
|
|
| 828 |
session_status TEXT DEFAULT 'active',
|
| 829 |
current_api TEXT,
|
| 830 |
current_endpoint TEXT,
|
| 831 |
-
session_config TEXT
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 832 |
)
|
| 833 |
''')
|
| 834 |
|
| 835 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 836 |
cursor.execute('''
|
| 837 |
CREATE TABLE IF NOT EXISTS discovery_progress (
|
| 838 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
@@ -843,13 +892,269 @@ def init_enhanced_database():
|
|
| 843 |
endpoints_found INTEGER DEFAULT 0,
|
| 844 |
depth_reached INTEGER DEFAULT 0,
|
| 845 |
discovery_status TEXT DEFAULT 'running',
|
| 846 |
-
discovery_config TEXT
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 847 |
)
|
| 848 |
''')
|
| 849 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 850 |
conn.commit()
|
| 851 |
conn.close()
|
| 852 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 853 |
class DeepEndpointDiscoverer:
|
| 854 |
"""Advanced endpoint discovery with recursive exploration"""
|
| 855 |
|
|
@@ -1535,7 +1840,9 @@ class UltimateDataHarvester:
|
|
| 1535 |
def _save_harvested_data(self, api_name: str, endpoint_path: str, data: Any,
|
| 1536 |
session_id: str, fetch_duration: int, record_count: int,
|
| 1537 |
data_size: int, status: str = "success", error_message: str = None):
|
| 1538 |
-
"""Save harvested data with AI-enhanced
|
|
|
|
|
|
|
| 1539 |
conn = sqlite3.connect(DB_PATH)
|
| 1540 |
cursor = conn.cursor()
|
| 1541 |
|
|
@@ -1543,6 +1850,19 @@ class UltimateDataHarvester:
|
|
| 1543 |
data_str = json.dumps(data, sort_keys=True, default=str)
|
| 1544 |
data_hash = hashlib.sha256(data_str.encode()).hexdigest()
|
| 1545 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1546 |
# AI Quality Assessment
|
| 1547 |
quality_assessment = {}
|
| 1548 |
if ai_quality_assessor and status == "success":
|
|
@@ -1561,20 +1881,57 @@ class UltimateDataHarvester:
|
|
| 1561 |
api_name, fetch_duration, success_rate, data_size
|
| 1562 |
)
|
| 1563 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1564 |
try:
|
| 1565 |
cursor.execute('''
|
| 1566 |
-
INSERT
|
| 1567 |
-
(api_name, endpoint_path, data_hash,
|
|
|
|
| 1568 |
record_count, data_size_bytes, fetch_duration_ms, status,
|
| 1569 |
-
error_message, session_id, quality_score, health_score, similar_datasets
|
| 1570 |
-
|
|
|
|
| 1571 |
''', (
|
| 1572 |
-
api_name, endpoint_path, data_hash,
|
| 1573 |
-
json.dumps(data, default=str),
|
| 1574 |
-
fetch_duration, status, error_message, session_id,
|
| 1575 |
quality_assessment.get('ai_quality_score', 0.0),
|
| 1576 |
health_info.get('health_score', 0.0),
|
| 1577 |
-
json.dumps(similar_datasets[:3], default=str)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1578 |
))
|
| 1579 |
|
| 1580 |
conn.commit()
|
|
@@ -1583,27 +1940,45 @@ class UltimateDataHarvester:
|
|
| 1583 |
if quality_assessment and st.session_state.get('show_ai_insights', True):
|
| 1584 |
self._display_ai_insights(api_name, quality_assessment, health_info, similar_datasets)
|
| 1585 |
|
| 1586 |
-
except sqlite3.
|
| 1587 |
-
|
| 1588 |
-
|
| 1589 |
-
|
| 1590 |
-
|
| 1591 |
-
|
| 1592 |
-
|
| 1593 |
-
|
| 1594 |
-
|
| 1595 |
-
|
| 1596 |
-
|
| 1597 |
-
|
| 1598 |
-
|
| 1599 |
-
|
| 1600 |
-
|
| 1601 |
-
|
| 1602 |
-
|
| 1603 |
-
conn.commit()
|
| 1604 |
finally:
|
| 1605 |
conn.close()
|
| 1606 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1607 |
def _display_ai_insights(self, api_name: str, quality_assessment: Dict,
|
| 1608 |
health_info: Dict, similar_datasets: List[Dict]):
|
| 1609 |
"""Display AI-powered insights in real-time"""
|
|
@@ -2062,6 +2437,159 @@ with tab3:
|
|
| 2062 |
finally:
|
| 2063 |
conn.close()
|
| 2064 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2065 |
# AI Enhancement Panel
|
| 2066 |
st.markdown("---")
|
| 2067 |
with st.expander("π€ AI Enhancement Status", expanded=False):
|
|
|
|
| 769 |
}
|
| 770 |
|
| 771 |
def init_enhanced_database():
|
| 772 |
+
"""Initialize optimized SQLite database with comprehensive schema and performance enhancements"""
|
| 773 |
conn = sqlite3.connect(DB_PATH)
|
| 774 |
cursor = conn.cursor()
|
| 775 |
|
| 776 |
+
# Enable WAL mode for better concurrency and performance
|
| 777 |
+
cursor.execute('PRAGMA journal_mode=WAL')
|
| 778 |
+
cursor.execute('PRAGMA synchronous=NORMAL')
|
| 779 |
+
cursor.execute('PRAGMA cache_size=10000')
|
| 780 |
+
cursor.execute('PRAGMA temp_store=MEMORY')
|
| 781 |
+
cursor.execute('PRAGMA mmap_size=268435456') # 256MB
|
| 782 |
+
|
| 783 |
+
# Enhanced endpoints table with better indexing
|
| 784 |
cursor.execute('''
|
| 785 |
CREATE TABLE IF NOT EXISTS discovered_endpoints (
|
| 786 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
|
|
| 788 |
endpoint_path TEXT NOT NULL,
|
| 789 |
full_url TEXT NOT NULL,
|
| 790 |
discovery_method TEXT,
|
| 791 |
+
depth_level INTEGER DEFAULT 0,
|
| 792 |
parent_endpoint TEXT,
|
| 793 |
endpoint_type TEXT,
|
| 794 |
last_checked TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
| 795 |
is_active BOOLEAN DEFAULT 1,
|
| 796 |
response_format TEXT,
|
| 797 |
parameters_schema TEXT,
|
| 798 |
+
estimated_records INTEGER DEFAULT 0,
|
| 799 |
+
last_fetch_status TEXT,
|
| 800 |
+
creation_date DATE DEFAULT (date('now')),
|
| 801 |
UNIQUE(api_name, endpoint_path)
|
| 802 |
)
|
| 803 |
''')
|
| 804 |
|
| 805 |
+
# Create indexes for endpoints table
|
| 806 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_endpoints_api_name ON discovered_endpoints(api_name)')
|
| 807 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_endpoints_active ON discovered_endpoints(is_active)')
|
| 808 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_endpoints_last_checked ON discovered_endpoints(last_checked)')
|
| 809 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_endpoints_depth ON discovered_endpoints(depth_level)')
|
| 810 |
+
|
| 811 |
+
# Optimized data storage table with compression and partitioning support
|
| 812 |
cursor.execute('''
|
| 813 |
CREATE TABLE IF NOT EXISTS harvested_data (
|
| 814 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 815 |
api_name TEXT NOT NULL,
|
| 816 |
endpoint_path TEXT NOT NULL,
|
| 817 |
+
data_hash TEXT UNIQUE NOT NULL,
|
| 818 |
+
raw_data_compressed BLOB,
|
| 819 |
+
processed_data_compressed BLOB,
|
| 820 |
+
raw_data_size INTEGER,
|
| 821 |
+
processed_data_size INTEGER,
|
| 822 |
+
record_count INTEGER DEFAULT 0,
|
| 823 |
+
data_size_bytes INTEGER DEFAULT 0,
|
| 824 |
fetch_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
| 825 |
+
fetch_duration_ms INTEGER DEFAULT 0,
|
| 826 |
status TEXT DEFAULT 'success',
|
| 827 |
error_message TEXT,
|
| 828 |
+
session_id TEXT,
|
| 829 |
+
quality_score REAL DEFAULT 0.0,
|
| 830 |
+
health_score REAL DEFAULT 0.0,
|
| 831 |
+
similar_datasets TEXT DEFAULT '[]',
|
| 832 |
+
data_format TEXT,
|
| 833 |
+
api_version TEXT,
|
| 834 |
+
fetch_date DATE DEFAULT (date('now')),
|
| 835 |
+
last_accessed TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
| 836 |
+
access_count INTEGER DEFAULT 1,
|
| 837 |
+
CHECK (status IN ('success', 'error', 'partial', 'timeout'))
|
| 838 |
)
|
| 839 |
''')
|
| 840 |
|
| 841 |
+
# Create comprehensive indexes for data table
|
| 842 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_api_name ON harvested_data(api_name)')
|
| 843 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_fetch_date ON harvested_data(fetch_date)')
|
| 844 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_status ON harvested_data(status)')
|
| 845 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_session ON harvested_data(session_id)')
|
| 846 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_hash ON harvested_data(data_hash)')
|
| 847 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_quality ON harvested_data(quality_score)')
|
| 848 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_records ON harvested_data(record_count)')
|
| 849 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_size ON harvested_data(data_size_bytes)')
|
| 850 |
+
|
| 851 |
+
# Enhanced session management table
|
| 852 |
cursor.execute('''
|
| 853 |
CREATE TABLE IF NOT EXISTS harvest_sessions (
|
| 854 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 855 |
+
session_id TEXT UNIQUE NOT NULL,
|
| 856 |
session_name TEXT,
|
| 857 |
started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
| 858 |
last_activity TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
|
|
|
| 862 |
successful_fetches INTEGER DEFAULT 0,
|
| 863 |
failed_fetches INTEGER DEFAULT 0,
|
| 864 |
total_records INTEGER DEFAULT 0,
|
| 865 |
+
total_data_size INTEGER DEFAULT 0,
|
| 866 |
session_status TEXT DEFAULT 'active',
|
| 867 |
current_api TEXT,
|
| 868 |
current_endpoint TEXT,
|
| 869 |
+
session_config TEXT,
|
| 870 |
+
error_count INTEGER DEFAULT 0,
|
| 871 |
+
avg_fetch_time REAL DEFAULT 0.0,
|
| 872 |
+
session_type TEXT DEFAULT 'manual',
|
| 873 |
+
priority INTEGER DEFAULT 1,
|
| 874 |
+
CHECK (session_status IN ('active', 'paused', 'completed', 'failed', 'cancelled'))
|
| 875 |
)
|
| 876 |
''')
|
| 877 |
|
| 878 |
+
# Create indexes for sessions table
|
| 879 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_sessions_status ON harvest_sessions(session_status)')
|
| 880 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_sessions_started ON harvest_sessions(started_at)')
|
| 881 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_sessions_activity ON harvest_sessions(last_activity)')
|
| 882 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_sessions_priority ON harvest_sessions(priority)')
|
| 883 |
+
|
| 884 |
+
# Enhanced discovery progress table
|
| 885 |
cursor.execute('''
|
| 886 |
CREATE TABLE IF NOT EXISTS discovery_progress (
|
| 887 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
|
|
| 892 |
endpoints_found INTEGER DEFAULT 0,
|
| 893 |
depth_reached INTEGER DEFAULT 0,
|
| 894 |
discovery_status TEXT DEFAULT 'running',
|
| 895 |
+
discovery_config TEXT,
|
| 896 |
+
errors_encountered INTEGER DEFAULT 0,
|
| 897 |
+
success_rate REAL DEFAULT 0.0,
|
| 898 |
+
estimated_total INTEGER DEFAULT 0,
|
| 899 |
+
CHECK (discovery_status IN ('running', 'completed', 'failed', 'paused'))
|
| 900 |
+
)
|
| 901 |
+
''')
|
| 902 |
+
|
| 903 |
+
# Create indexes for discovery table
|
| 904 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_discovery_api ON discovery_progress(api_name)')
|
| 905 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_discovery_status ON discovery_progress(discovery_status)')
|
| 906 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_discovery_started ON discovery_progress(started_at)')
|
| 907 |
+
|
| 908 |
+
# Data quality and metadata table
|
| 909 |
+
cursor.execute('''
|
| 910 |
+
CREATE TABLE IF NOT EXISTS data_quality_metrics (
|
| 911 |
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 912 |
+
data_id INTEGER REFERENCES harvested_data(id),
|
| 913 |
+
api_name TEXT NOT NULL,
|
| 914 |
+
completeness_score REAL DEFAULT 0.0,
|
| 915 |
+
consistency_score REAL DEFAULT 0.0,
|
| 916 |
+
accuracy_score REAL DEFAULT 0.0,
|
| 917 |
+
timeliness_score REAL DEFAULT 0.0,
|
| 918 |
+
overall_quality REAL DEFAULT 0.0,
|
| 919 |
+
anomalies_detected INTEGER DEFAULT 0,
|
| 920 |
+
anomaly_details TEXT,
|
| 921 |
+
validation_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
| 922 |
+
validation_rules_version TEXT DEFAULT '1.0'
|
| 923 |
)
|
| 924 |
''')
|
| 925 |
|
| 926 |
+
# Create quality metrics indexes
|
| 927 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_quality_api ON data_quality_metrics(api_name)')
|
| 928 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_quality_overall ON data_quality_metrics(overall_quality)')
|
| 929 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_quality_timestamp ON data_quality_metrics(validation_timestamp)')
|
| 930 |
+
|
| 931 |
+
# API performance tracking table
|
| 932 |
+
cursor.execute('''
|
| 933 |
+
CREATE TABLE IF NOT EXISTS api_performance_log (
|
| 934 |
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 935 |
+
api_name TEXT NOT NULL,
|
| 936 |
+
endpoint_path TEXT NOT NULL,
|
| 937 |
+
response_time_ms INTEGER,
|
| 938 |
+
response_size_bytes INTEGER,
|
| 939 |
+
http_status_code INTEGER,
|
| 940 |
+
success BOOLEAN,
|
| 941 |
+
error_type TEXT,
|
| 942 |
+
timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
| 943 |
+
date_only DATE DEFAULT (date('now'))
|
| 944 |
+
)
|
| 945 |
+
''')
|
| 946 |
+
|
| 947 |
+
# Create performance indexes
|
| 948 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_perf_api_date ON api_performance_log(api_name, date_only)')
|
| 949 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_perf_success ON api_performance_log(success)')
|
| 950 |
+
cursor.execute('CREATE INDEX IF NOT EXISTS idx_perf_response_time ON api_performance_log(response_time_ms)')
|
| 951 |
+
|
| 952 |
+
# Data archival management table
|
| 953 |
+
cursor.execute('''
|
| 954 |
+
CREATE TABLE IF NOT EXISTS data_archive_log (
|
| 955 |
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 956 |
+
original_data_id INTEGER,
|
| 957 |
+
archive_path TEXT,
|
| 958 |
+
archive_format TEXT DEFAULT 'gzip',
|
| 959 |
+
archived_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
| 960 |
+
original_size INTEGER,
|
| 961 |
+
compressed_size INTEGER,
|
| 962 |
+
compression_ratio REAL,
|
| 963 |
+
checksum TEXT,
|
| 964 |
+
retention_date DATE,
|
| 965 |
+
archive_status TEXT DEFAULT 'active'
|
| 966 |
+
)
|
| 967 |
+
''')
|
| 968 |
+
|
| 969 |
+
# Create views for common queries
|
| 970 |
+
cursor.execute('''
|
| 971 |
+
CREATE VIEW IF NOT EXISTS v_api_summary AS
|
| 972 |
+
SELECT
|
| 973 |
+
api_name,
|
| 974 |
+
COUNT(*) as total_fetches,
|
| 975 |
+
COUNT(CASE WHEN status = 'success' THEN 1 END) as successful_fetches,
|
| 976 |
+
SUM(record_count) as total_records,
|
| 977 |
+
SUM(data_size_bytes) as total_data_size,
|
| 978 |
+
AVG(fetch_duration_ms) as avg_fetch_time,
|
| 979 |
+
AVG(quality_score) as avg_quality_score,
|
| 980 |
+
MAX(fetch_timestamp) as last_fetch,
|
| 981 |
+
MIN(fetch_timestamp) as first_fetch
|
| 982 |
+
FROM harvested_data
|
| 983 |
+
GROUP BY api_name
|
| 984 |
+
''')
|
| 985 |
+
|
| 986 |
+
cursor.execute('''
|
| 987 |
+
CREATE VIEW IF NOT EXISTS v_session_summary AS
|
| 988 |
+
SELECT
|
| 989 |
+
session_id,
|
| 990 |
+
session_name,
|
| 991 |
+
session_status,
|
| 992 |
+
started_at,
|
| 993 |
+
completed_at,
|
| 994 |
+
total_endpoints,
|
| 995 |
+
processed_endpoints,
|
| 996 |
+
successful_fetches,
|
| 997 |
+
failed_fetches,
|
| 998 |
+
total_records,
|
| 999 |
+
total_data_size,
|
| 1000 |
+
CASE
|
| 1001 |
+
WHEN total_endpoints > 0 THEN
|
| 1002 |
+
ROUND((processed_endpoints * 100.0) / total_endpoints, 2)
|
| 1003 |
+
ELSE 0
|
| 1004 |
+
END as completion_percentage,
|
| 1005 |
+
CASE
|
| 1006 |
+
WHEN processed_endpoints > 0 THEN
|
| 1007 |
+
ROUND((successful_fetches * 100.0) / processed_endpoints, 2)
|
| 1008 |
+
ELSE 0
|
| 1009 |
+
END as success_percentage
|
| 1010 |
+
FROM harvest_sessions
|
| 1011 |
+
''')
|
| 1012 |
+
|
| 1013 |
+
# Enable automatic statistics collection
|
| 1014 |
+
cursor.execute('PRAGMA optimize')
|
| 1015 |
+
|
| 1016 |
conn.commit()
|
| 1017 |
conn.close()
|
| 1018 |
|
| 1019 |
+
# Database optimization and maintenance functions
|
| 1020 |
+
def optimize_database():
|
| 1021 |
+
"""Perform database optimization and maintenance"""
|
| 1022 |
+
conn = sqlite3.connect(DB_PATH)
|
| 1023 |
+
cursor = conn.cursor()
|
| 1024 |
+
|
| 1025 |
+
try:
|
| 1026 |
+
# Update statistics
|
| 1027 |
+
cursor.execute('ANALYZE')
|
| 1028 |
+
|
| 1029 |
+
# Vacuum if necessary (reclaim space)
|
| 1030 |
+
cursor.execute('PRAGMA auto_vacuum=INCREMENTAL')
|
| 1031 |
+
cursor.execute('PRAGMA incremental_vacuum')
|
| 1032 |
+
|
| 1033 |
+
# Optimize query planner
|
| 1034 |
+
cursor.execute('PRAGMA optimize')
|
| 1035 |
+
|
| 1036 |
+
conn.commit()
|
| 1037 |
+
return True
|
| 1038 |
+
except Exception as e:
|
| 1039 |
+
return False
|
| 1040 |
+
finally:
|
| 1041 |
+
conn.close()
|
| 1042 |
+
|
| 1043 |
+
def get_database_stats():
|
| 1044 |
+
"""Get comprehensive database statistics"""
|
| 1045 |
+
conn = sqlite3.connect(DB_PATH)
|
| 1046 |
+
cursor = conn.cursor()
|
| 1047 |
+
|
| 1048 |
+
try:
|
| 1049 |
+
stats = {}
|
| 1050 |
+
|
| 1051 |
+
# Basic table counts
|
| 1052 |
+
tables = ['discovered_endpoints', 'harvested_data', 'harvest_sessions', 'discovery_progress']
|
| 1053 |
+
for table in tables:
|
| 1054 |
+
cursor.execute(f'SELECT COUNT(*) FROM {table}')
|
| 1055 |
+
stats[f'{table}_count'] = cursor.fetchone()[0]
|
| 1056 |
+
|
| 1057 |
+
# Database size
|
| 1058 |
+
cursor.execute('PRAGMA page_count')
|
| 1059 |
+
page_count = cursor.fetchone()[0]
|
| 1060 |
+
cursor.execute('PRAGMA page_size')
|
| 1061 |
+
page_size = cursor.fetchone()[0]
|
| 1062 |
+
stats['database_size_mb'] = round((page_count * page_size) / (1024 * 1024), 2)
|
| 1063 |
+
|
| 1064 |
+
# Data quality stats
|
| 1065 |
+
cursor.execute('SELECT AVG(quality_score), AVG(health_score) FROM harvested_data WHERE status = "success"')
|
| 1066 |
+
quality_stats = cursor.fetchone()
|
| 1067 |
+
stats['avg_quality_score'] = round(quality_stats[0] or 0, 3)
|
| 1068 |
+
stats['avg_health_score'] = round(quality_stats[1] or 0, 3)
|
| 1069 |
+
|
| 1070 |
+
# Recent activity
|
| 1071 |
+
cursor.execute('''
|
| 1072 |
+
SELECT COUNT(*) FROM harvested_data
|
| 1073 |
+
WHERE fetch_timestamp > datetime('now', '-24 hours')
|
| 1074 |
+
''')
|
| 1075 |
+
stats['recent_fetches_24h'] = cursor.fetchone()[0]
|
| 1076 |
+
|
| 1077 |
+
return stats
|
| 1078 |
+
|
| 1079 |
+
finally:
|
| 1080 |
+
conn.close()
|
| 1081 |
+
|
| 1082 |
+
def compress_old_data(days_old=30):
|
| 1083 |
+
"""Compress data older than specified days"""
|
| 1084 |
+
import gzip
|
| 1085 |
+
import json
|
| 1086 |
+
|
| 1087 |
+
conn = sqlite3.connect(DB_PATH)
|
| 1088 |
+
cursor = conn.cursor()
|
| 1089 |
+
|
| 1090 |
+
try:
|
| 1091 |
+
# Find old data to compress
|
| 1092 |
+
cursor.execute('''
|
| 1093 |
+
SELECT id, raw_data, processed_data
|
| 1094 |
+
FROM harvested_data
|
| 1095 |
+
WHERE fetch_timestamp < datetime('now', '-{} days')
|
| 1096 |
+
AND raw_data_compressed IS NULL
|
| 1097 |
+
'''.format(days_old))
|
| 1098 |
+
|
| 1099 |
+
old_records = cursor.fetchall()
|
| 1100 |
+
compressed_count = 0
|
| 1101 |
+
|
| 1102 |
+
for record_id, raw_data, processed_data in old_records:
|
| 1103 |
+
try:
|
| 1104 |
+
# Compress raw data
|
| 1105 |
+
raw_compressed = None
|
| 1106 |
+
if raw_data:
|
| 1107 |
+
raw_compressed = gzip.compress(raw_data.encode('utf-8'))
|
| 1108 |
+
|
| 1109 |
+
# Compress processed data
|
| 1110 |
+
processed_compressed = None
|
| 1111 |
+
if processed_data:
|
| 1112 |
+
processed_compressed = gzip.compress(processed_data.encode('utf-8'))
|
| 1113 |
+
|
| 1114 |
+
# Update record with compressed data
|
| 1115 |
+
cursor.execute('''
|
| 1116 |
+
UPDATE harvested_data
|
| 1117 |
+
SET raw_data_compressed = ?,
|
| 1118 |
+
processed_data_compressed = ?,
|
| 1119 |
+
raw_data = NULL,
|
| 1120 |
+
processed_data = NULL,
|
| 1121 |
+
raw_data_size = ?,
|
| 1122 |
+
processed_data_size = ?
|
| 1123 |
+
WHERE id = ?
|
| 1124 |
+
''', (
|
| 1125 |
+
raw_compressed,
|
| 1126 |
+
processed_compressed,
|
| 1127 |
+
len(raw_data) if raw_data else 0,
|
| 1128 |
+
len(processed_data) if processed_data else 0,
|
| 1129 |
+
record_id
|
| 1130 |
+
))
|
| 1131 |
+
|
| 1132 |
+
compressed_count += 1
|
| 1133 |
+
|
| 1134 |
+
except Exception as e:
|
| 1135 |
+
continue # Skip problematic records
|
| 1136 |
+
|
| 1137 |
+
conn.commit()
|
| 1138 |
+
return compressed_count
|
| 1139 |
+
|
| 1140 |
+
finally:
|
| 1141 |
+
conn.close()
|
| 1142 |
+
|
| 1143 |
+
def backup_database(backup_path=None):
|
| 1144 |
+
"""Create a backup of the database"""
|
| 1145 |
+
import shutil
|
| 1146 |
+
from datetime import datetime
|
| 1147 |
+
|
| 1148 |
+
if backup_path is None:
|
| 1149 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 1150 |
+
backup_path = f"backup_harvester_{timestamp}.db"
|
| 1151 |
+
|
| 1152 |
+
try:
|
| 1153 |
+
shutil.copy2(DB_PATH, backup_path)
|
| 1154 |
+
return backup_path
|
| 1155 |
+
except Exception as e:
|
| 1156 |
+
return None
|
| 1157 |
+
|
| 1158 |
class DeepEndpointDiscoverer:
|
| 1159 |
"""Advanced endpoint discovery with recursive exploration"""
|
| 1160 |
|
|
|
|
| 1840 |
def _save_harvested_data(self, api_name: str, endpoint_path: str, data: Any,
|
| 1841 |
session_id: str, fetch_duration: int, record_count: int,
|
| 1842 |
data_size: int, status: str = "success", error_message: str = None):
|
| 1843 |
+
"""Save harvested data with optimized storage and AI-enhanced analysis"""
|
| 1844 |
+
import gzip
|
| 1845 |
+
|
| 1846 |
conn = sqlite3.connect(DB_PATH)
|
| 1847 |
cursor = conn.cursor()
|
| 1848 |
|
|
|
|
| 1850 |
data_str = json.dumps(data, sort_keys=True, default=str)
|
| 1851 |
data_hash = hashlib.sha256(data_str.encode()).hexdigest()
|
| 1852 |
|
| 1853 |
+
# Check if this data already exists
|
| 1854 |
+
cursor.execute('SELECT id FROM harvested_data WHERE data_hash = ?', (data_hash,))
|
| 1855 |
+
if cursor.fetchone():
|
| 1856 |
+
# Update access count and last accessed time
|
| 1857 |
+
cursor.execute('''
|
| 1858 |
+
UPDATE harvested_data
|
| 1859 |
+
SET access_count = access_count + 1, last_accessed = CURRENT_TIMESTAMP
|
| 1860 |
+
WHERE data_hash = ?
|
| 1861 |
+
''', (data_hash,))
|
| 1862 |
+
conn.commit()
|
| 1863 |
+
conn.close()
|
| 1864 |
+
return
|
| 1865 |
+
|
| 1866 |
# AI Quality Assessment
|
| 1867 |
quality_assessment = {}
|
| 1868 |
if ai_quality_assessor and status == "success":
|
|
|
|
| 1881 |
api_name, fetch_duration, success_rate, data_size
|
| 1882 |
)
|
| 1883 |
|
| 1884 |
+
# Determine data format
|
| 1885 |
+
data_format = self._detect_data_format(data)
|
| 1886 |
+
|
| 1887 |
+
# Compress data if it's large
|
| 1888 |
+
raw_data_compressed = None
|
| 1889 |
+
processed_data_compressed = None
|
| 1890 |
+
raw_data = None
|
| 1891 |
+
processed_data = None
|
| 1892 |
+
|
| 1893 |
+
if data_size > 1024: # Compress if larger than 1KB
|
| 1894 |
+
try:
|
| 1895 |
+
raw_data_compressed = gzip.compress(data_str.encode('utf-8'))
|
| 1896 |
+
processed_data_compressed = gzip.compress(json.dumps(data, default=str).encode('utf-8'))
|
| 1897 |
+
except:
|
| 1898 |
+
# Fallback to uncompressed storage
|
| 1899 |
+
raw_data = data_str
|
| 1900 |
+
processed_data = json.dumps(data, default=str)
|
| 1901 |
+
else:
|
| 1902 |
+
raw_data = data_str
|
| 1903 |
+
processed_data = json.dumps(data, default=str)
|
| 1904 |
+
|
| 1905 |
try:
|
| 1906 |
cursor.execute('''
|
| 1907 |
+
INSERT INTO harvested_data
|
| 1908 |
+
(api_name, endpoint_path, data_hash, raw_data_compressed, processed_data_compressed,
|
| 1909 |
+
raw_data, processed_data, raw_data_size, processed_data_size,
|
| 1910 |
record_count, data_size_bytes, fetch_duration_ms, status,
|
| 1911 |
+
error_message, session_id, quality_score, health_score, similar_datasets,
|
| 1912 |
+
data_format, access_count)
|
| 1913 |
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
| 1914 |
''', (
|
| 1915 |
+
api_name, endpoint_path, data_hash, raw_data_compressed, processed_data_compressed,
|
| 1916 |
+
raw_data, processed_data, len(data_str), len(json.dumps(data, default=str)),
|
| 1917 |
+
record_count, data_size, fetch_duration, status, error_message, session_id,
|
| 1918 |
quality_assessment.get('ai_quality_score', 0.0),
|
| 1919 |
health_info.get('health_score', 0.0),
|
| 1920 |
+
json.dumps(similar_datasets[:3], default=str),
|
| 1921 |
+
data_format, 1
|
| 1922 |
+
))
|
| 1923 |
+
|
| 1924 |
+
# Log API performance
|
| 1925 |
+
cursor.execute('''
|
| 1926 |
+
INSERT INTO api_performance_log
|
| 1927 |
+
(api_name, endpoint_path, response_time_ms, response_size_bytes,
|
| 1928 |
+
http_status_code, success, error_type)
|
| 1929 |
+
VALUES (?, ?, ?, ?, ?, ?, ?)
|
| 1930 |
+
''', (
|
| 1931 |
+
api_name, endpoint_path, fetch_duration, data_size,
|
| 1932 |
+
200 if status == "success" else 500,
|
| 1933 |
+
status == "success",
|
| 1934 |
+
error_message if status != "success" else None
|
| 1935 |
))
|
| 1936 |
|
| 1937 |
conn.commit()
|
|
|
|
| 1940 |
if quality_assessment and st.session_state.get('show_ai_insights', True):
|
| 1941 |
self._display_ai_insights(api_name, quality_assessment, health_info, similar_datasets)
|
| 1942 |
|
| 1943 |
+
except sqlite3.OperationalError as e:
|
| 1944 |
+
# Handle database schema updates
|
| 1945 |
+
if "no such column" in str(e):
|
| 1946 |
+
self._upgrade_database_schema()
|
| 1947 |
+
# Retry with basic data structure
|
| 1948 |
+
cursor.execute('''
|
| 1949 |
+
INSERT OR REPLACE INTO harvested_data
|
| 1950 |
+
(api_name, endpoint_path, data_hash, raw_data, processed_data,
|
| 1951 |
+
record_count, data_size_bytes, fetch_duration_ms, status,
|
| 1952 |
+
error_message, session_id)
|
| 1953 |
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
| 1954 |
+
''', (
|
| 1955 |
+
api_name, endpoint_path, data_hash, raw_data or data_str,
|
| 1956 |
+
processed_data or json.dumps(data, default=str), record_count, data_size,
|
| 1957 |
+
fetch_duration, status, error_message, session_id
|
| 1958 |
+
))
|
| 1959 |
+
conn.commit()
|
|
|
|
| 1960 |
finally:
|
| 1961 |
conn.close()
|
| 1962 |
|
| 1963 |
+
def _detect_data_format(self, data: Any) -> str:
|
| 1964 |
+
"""Detect the format of the data"""
|
| 1965 |
+
if isinstance(data, dict):
|
| 1966 |
+
if "_embedded" in data or "_links" in data:
|
| 1967 |
+
return "HAL+JSON"
|
| 1968 |
+
elif "dataSets" in data or "structure" in data:
|
| 1969 |
+
return "SDMX-JSON"
|
| 1970 |
+
else:
|
| 1971 |
+
return "JSON"
|
| 1972 |
+
elif isinstance(data, list):
|
| 1973 |
+
return "JSON-Array"
|
| 1974 |
+
elif isinstance(data, str):
|
| 1975 |
+
if data.strip().startswith('<'):
|
| 1976 |
+
return "XML"
|
| 1977 |
+
else:
|
| 1978 |
+
return "Text"
|
| 1979 |
+
else:
|
| 1980 |
+
return "Unknown"
|
| 1981 |
+
|
| 1982 |
def _display_ai_insights(self, api_name: str, quality_assessment: Dict,
|
| 1983 |
health_info: Dict, similar_datasets: List[Dict]):
|
| 1984 |
"""Display AI-powered insights in real-time"""
|
|
|
|
| 2437 |
finally:
|
| 2438 |
conn.close()
|
| 2439 |
|
| 2440 |
+
# Database Management Section
# NOTE(review): the emoji in the UI labels below appear mojibake-garbled in this
# rendering of the source; they are preserved exactly as displayed — confirm
# against the original UTF-8 file before shipping.
with st.expander("ποΈ Database Management & Statistics", expanded=False):
    st.markdown("**Database Performance & Maintenance Tools**")

    col1, col2, col3, col4 = st.columns(4)

    with col1:
        # One-click snapshot of table sizes / quality scores.
        if st.button("π Get Database Stats", use_container_width=True):
            with st.spinner("Analyzing database..."):
                stats = get_database_stats()

                st.markdown("**Database Statistics:**")
                for key, value in stats.items():
                    formatted_key = key.replace('_', ' ').title()
                    # Format by metric kind: sizes in MB, scores to 3 decimals.
                    if 'size_mb' in key:
                        st.metric(formatted_key, f"{value} MB")
                    elif 'score' in key:
                        st.metric(formatted_key, f"{value:.3f}")
                    else:
                        st.metric(formatted_key, value)

    with col2:
        if st.button("π§ Optimize Database", use_container_width=True):
            with st.spinner("Optimizing database..."):
                success = optimize_database()
                if success:
                    st.success("β Database optimized successfully!")
                else:
                    st.error("β Database optimization failed")

    with col3:
        if st.button("ποΈ Compress Old Data", use_container_width=True):
            with st.spinner("Compressing old data..."):
                # Compress data older than 7 days.
                compressed_count = compress_old_data(days_old=7)
                if compressed_count > 0:
                    st.success(f"β Compressed {compressed_count} old records")
                else:
                    st.info("βΉοΈ No old data found to compress")

    with col4:
        if st.button("πΎ Create Backup", use_container_width=True):
            with st.spinner("Creating backup..."):
                backup_path = backup_database()
                if backup_path:
                    st.success(f"β Backup created: {backup_path}")
                    # Offer the freshly created backup for download (best-effort:
                    # a read failure must not hide the success message above).
                    try:
                        from pathlib import Path  # local import: module header not in view
                        with open(backup_path, 'rb') as f:
                            st.download_button(
                                label="β¬οΈ Download Backup",
                                data=f.read(),
                                # FIX: suggest only the file name, not the full
                                # server-side path, as the browser download name.
                                file_name=Path(backup_path).name,
                                mime="application/x-sqlite3"
                            )
                    except OSError as exc:
                        # FIX: was a bare `except: pass` that silently swallowed
                        # every error; surface it without failing the backup.
                        st.warning(f"Backup created but download unavailable: {exc}")
                else:
                    st.error("β Backup creation failed")

    # Enhanced database insights
    st.markdown("---")

    conn = None
    try:
        conn = sqlite3.connect(DB_PATH)

        # Show recent activity summary
        col1, col2 = st.columns(2)

        with col1:
            st.markdown("**π Recent Activity (Last 24h)**")
            df_recent = pd.read_sql_query('''
                SELECT api_name, COUNT(*) as fetches, SUM(record_count) as records
                FROM harvested_data
                WHERE fetch_timestamp > datetime('now', '-1 day')
                GROUP BY api_name
                ORDER BY fetches DESC
            ''', conn)

            if not df_recent.empty:
                st.dataframe(df_recent, use_container_width=True)
            else:
                st.info("No recent activity")

        with col2:
            st.markdown("**π― Data Quality Overview**")
            df_quality = pd.read_sql_query('''
                SELECT
                    api_name,
                    ROUND(AVG(quality_score), 3) as avg_quality,
                    ROUND(AVG(health_score), 3) as avg_health,
                    COUNT(*) as total_records
                FROM harvested_data
                WHERE status = 'success' AND quality_score > 0
                GROUP BY api_name
                ORDER BY avg_quality DESC
            ''', conn)

            if not df_quality.empty:
                st.dataframe(df_quality, use_container_width=True)
            else:
                st.info("No quality data available")
    except Exception as e:
        st.error(f"Database error: {e}")
    finally:
        # FIX: close() used to sit inside the try body, leaking the connection
        # whenever a query raised; always release it here instead.
        if conn is not None:
            conn.close()

    # Storage efficiency metrics
    st.markdown("**πΎ Storage Efficiency**")

    conn = None
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()

        # Calculate compression ratios
        cursor.execute('''
            SELECT
                COUNT(*) as total_records,
                COUNT(CASE WHEN raw_data_compressed IS NOT NULL THEN 1 END) as compressed_records,
                SUM(data_size_bytes) as total_original_size,
                SUM(CASE WHEN raw_data_compressed IS NOT NULL THEN raw_data_size ELSE data_size_bytes END) as effective_size
            FROM harvested_data
        ''')

        storage_stats = cursor.fetchone()

        if storage_stats and storage_stats[0] > 0:
            total_records, compressed_records, original_size, effective_size = storage_stats

            col1, col2, col3, col4 = st.columns(4)

            with col1:
                st.metric("Total Records", total_records)

            with col2:
                st.metric("Compressed Records", compressed_records)

            with col3:
                # Percentage of bytes eliminated by compression (0 when the
                # aggregates are NULL/zero, e.g. an empty table).
                compression_ratio = 0
                if original_size and effective_size:
                    compression_ratio = (1 - effective_size / original_size) * 100
                st.metric("Compression Ratio", f"{compression_ratio:.1f}%")

            with col4:
                space_saved = (original_size - effective_size) if original_size and effective_size else 0
                space_saved_mb = space_saved / (1024 * 1024)
                st.metric("Space Saved", f"{space_saved_mb:.2f} MB")
    except Exception as e:
        st.warning(f"Could not calculate storage metrics: {e}")
    finally:
        # FIX: same connection-leak pattern as above — release unconditionally.
        if conn is not None:
            conn.close()
|
| 2592 |
+
|
| 2593 |
# AI Enhancement Panel
|
| 2594 |
st.markdown("---")
|
| 2595 |
with st.expander("π€ AI Enhancement Status", expanded=False):
|