Spaces:

isakskogstad
/

api-data-fetcher

Sleeping

App Files Files Community

isakskogstad committed on Jul 12, 2025

Commit

7186a85

verified ·

1 Parent(s): 284efb4

Upload app_ultimate.py with huggingface_hub

Browse files

Files changed (1) hide show

app_ultimate.py +571 -43

app_ultimate.py CHANGED Viewed

@@ -769,11 +769,18 @@ DEEP_API_CONFIG = {
 }
 def init_enhanced_database():
-    """Initialize enhanced SQLite database with comprehensive schema"""
     conn = sqlite3.connect(DB_PATH)
     cursor = conn.cursor()
-    # Enhanced endpoints table
     cursor.execute('''
         CREATE TABLE IF NOT EXISTS discovered_endpoints (
             id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -781,41 +788,71 @@ def init_enhanced_database():
             endpoint_path TEXT NOT NULL,
             full_url TEXT NOT NULL,
             discovery_method TEXT,
-            depth_level INTEGER,
             parent_endpoint TEXT,
             endpoint_type TEXT,
             last_checked TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
             is_active BOOLEAN DEFAULT 1,
             response_format TEXT,
             parameters_schema TEXT,
             UNIQUE(api_name, endpoint_path)
         )
     ''')
-    # Enhanced data storage table
     cursor.execute('''
         CREATE TABLE IF NOT EXISTS harvested_data (
             id INTEGER PRIMARY KEY AUTOINCREMENT,
             api_name TEXT NOT NULL,
             endpoint_path TEXT NOT NULL,
-            data_hash TEXT UNIQUE,
-            raw_data TEXT,
-            processed_data TEXT,
-            record_count INTEGER,
-            data_size_bytes INTEGER,
             fetch_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-            fetch_duration_ms INTEGER,
             status TEXT DEFAULT 'success',
             error_message TEXT,
-            session_id TEXT
         )
     ''')
-    # Session management table
     cursor.execute('''
         CREATE TABLE IF NOT EXISTS harvest_sessions (
             id INTEGER PRIMARY KEY AUTOINCREMENT,
-            session_id TEXT UNIQUE,
             session_name TEXT,
             started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
             last_activity TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
@@ -825,14 +862,26 @@ def init_enhanced_database():
             successful_fetches INTEGER DEFAULT 0,
             failed_fetches INTEGER DEFAULT 0,
             total_records INTEGER DEFAULT 0,
             session_status TEXT DEFAULT 'active',
             current_api TEXT,
             current_endpoint TEXT,
-            session_config TEXT
         )
     ''')
-    # Discovery progress table
     cursor.execute('''
         CREATE TABLE IF NOT EXISTS discovery_progress (
             id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -843,13 +892,269 @@ def init_enhanced_database():
             endpoints_found INTEGER DEFAULT 0,
             depth_reached INTEGER DEFAULT 0,
             discovery_status TEXT DEFAULT 'running',
-            discovery_config TEXT
         )
     ''')
     conn.commit()
     conn.close()
 class DeepEndpointDiscoverer:
     """Advanced endpoint discovery with recursive exploration"""
@@ -1535,7 +1840,9 @@ class UltimateDataHarvester:
     def _save_harvested_data(self, api_name: str, endpoint_path: str, data: Any,
                            session_id: str, fetch_duration: int, record_count: int,
                            data_size: int, status: str = "success", error_message: str = None):
-        """Save harvested data with AI-enhanced intelligent categorization"""
         conn = sqlite3.connect(DB_PATH)
         cursor = conn.cursor()
@@ -1543,6 +1850,19 @@ class UltimateDataHarvester:
         data_str = json.dumps(data, sort_keys=True, default=str)
         data_hash = hashlib.sha256(data_str.encode()).hexdigest()
         # AI Quality Assessment
         quality_assessment = {}
         if ai_quality_assessor and status == "success":
@@ -1561,20 +1881,57 @@ class UltimateDataHarvester:
                 api_name, fetch_duration, success_rate, data_size
             )
         try:
             cursor.execute('''
-                INSERT OR REPLACE INTO harvested_data
-                (api_name, endpoint_path, data_hash, raw_data, processed_data,
                  record_count, data_size_bytes, fetch_duration_ms, status,
-                 error_message, session_id, quality_score, health_score, similar_datasets)
-                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
             ''', (
-                api_name, endpoint_path, data_hash, data_str,
-                json.dumps(data, default=str), record_count, data_size,
-                fetch_duration, status, error_message, session_id,
                 quality_assessment.get('ai_quality_score', 0.0),
                 health_info.get('health_score', 0.0),
-                json.dumps(similar_datasets[:3], default=str)  # Top 3 similar datasets
             ))
             conn.commit()
@@ -1583,27 +1940,45 @@ class UltimateDataHarvester:
             if quality_assessment and st.session_state.get('show_ai_insights', True):
                 self._display_ai_insights(api_name, quality_assessment, health_info, similar_datasets)
-        except sqlite3.IntegrityError:
-            pass  # Data already exists
-        except sqlite3.OperationalError:
-            # Handle case where AI columns don't exist yet - add them
-            self._upgrade_database_schema()
-            # Retry with basic data
-            cursor.execute('''
-                INSERT OR REPLACE INTO harvested_data
-                (api_name, endpoint_path, data_hash, raw_data, processed_data,
-                 record_count, data_size_bytes, fetch_duration_ms, status,
-                 error_message, session_id)
-                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
-            ''', (
-                api_name, endpoint_path, data_hash, data_str,
-                json.dumps(data, default=str), record_count, data_size,
-                fetch_duration, status, error_message, session_id
-            ))
-            conn.commit()
         finally:
             conn.close()
     def _display_ai_insights(self, api_name: str, quality_assessment: Dict,
                            health_info: Dict, similar_datasets: List[Dict]):
         """Display AI-powered insights in real-time"""
@@ -2062,6 +2437,159 @@ with tab3:
     finally:
         conn.close()
 # AI Enhancement Panel
 st.markdown("---")
 with st.expander("🤖 AI Enhancement Status", expanded=False):

 }
 def init_enhanced_database():
+    """Initialize optimized SQLite database with comprehensive schema and performance enhancements"""
     conn = sqlite3.connect(DB_PATH)
     cursor = conn.cursor()
+    # Enable WAL mode for better concurrency and performance
+    cursor.execute('PRAGMA journal_mode=WAL')
+    cursor.execute('PRAGMA synchronous=NORMAL')
+    cursor.execute('PRAGMA cache_size=10000')
+    cursor.execute('PRAGMA temp_store=MEMORY')
+    cursor.execute('PRAGMA mmap_size=268435456')  # 256MB
+    # Enhanced endpoints table with better indexing
     cursor.execute('''
         CREATE TABLE IF NOT EXISTS discovered_endpoints (
             id INTEGER PRIMARY KEY AUTOINCREMENT,
             endpoint_path TEXT NOT NULL,
             full_url TEXT NOT NULL,
             discovery_method TEXT,
+            depth_level INTEGER DEFAULT 0,
             parent_endpoint TEXT,
             endpoint_type TEXT,
             last_checked TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
             is_active BOOLEAN DEFAULT 1,
             response_format TEXT,
             parameters_schema TEXT,
+            estimated_records INTEGER DEFAULT 0,
+            last_fetch_status TEXT,
+            creation_date DATE DEFAULT (date('now')),
             UNIQUE(api_name, endpoint_path)
         )
     ''')
+    # Create indexes for endpoints table
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_endpoints_api_name ON discovered_endpoints(api_name)')
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_endpoints_active ON discovered_endpoints(is_active)')
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_endpoints_last_checked ON discovered_endpoints(last_checked)')
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_endpoints_depth ON discovered_endpoints(depth_level)')
+    # Optimized data storage table with compression and partitioning support
     cursor.execute('''
         CREATE TABLE IF NOT EXISTS harvested_data (
             id INTEGER PRIMARY KEY AUTOINCREMENT,
             api_name TEXT NOT NULL,
             endpoint_path TEXT NOT NULL,
+            data_hash TEXT UNIQUE NOT NULL,
+            raw_data_compressed BLOB,
+            processed_data_compressed BLOB,
+            raw_data_size INTEGER,
+            processed_data_size INTEGER,
+            record_count INTEGER DEFAULT 0,
+            data_size_bytes INTEGER DEFAULT 0,
             fetch_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+            fetch_duration_ms INTEGER DEFAULT 0,
             status TEXT DEFAULT 'success',
             error_message TEXT,
+            session_id TEXT,
+            quality_score REAL DEFAULT 0.0,
+            health_score REAL DEFAULT 0.0,
+            similar_datasets TEXT DEFAULT '[]',
+            data_format TEXT,
+            api_version TEXT,
+            fetch_date DATE DEFAULT (date('now')),
+            last_accessed TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+            access_count INTEGER DEFAULT 1,
+            CHECK (status IN ('success', 'error', 'partial', 'timeout'))
         )
     ''')
+    # Create comprehensive indexes for data table
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_api_name ON harvested_data(api_name)')
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_fetch_date ON harvested_data(fetch_date)')
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_status ON harvested_data(status)')
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_session ON harvested_data(session_id)')
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_hash ON harvested_data(data_hash)')
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_quality ON harvested_data(quality_score)')
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_records ON harvested_data(record_count)')
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_size ON harvested_data(data_size_bytes)')
+    # Enhanced session management table
     cursor.execute('''
         CREATE TABLE IF NOT EXISTS harvest_sessions (
             id INTEGER PRIMARY KEY AUTOINCREMENT,
+            session_id TEXT UNIQUE NOT NULL,
             session_name TEXT,
             started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
             last_activity TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
             successful_fetches INTEGER DEFAULT 0,
             failed_fetches INTEGER DEFAULT 0,
             total_records INTEGER DEFAULT 0,
+            total_data_size INTEGER DEFAULT 0,
             session_status TEXT DEFAULT 'active',
             current_api TEXT,
             current_endpoint TEXT,
+            session_config TEXT,
+            error_count INTEGER DEFAULT 0,
+            avg_fetch_time REAL DEFAULT 0.0,
+            session_type TEXT DEFAULT 'manual',
+            priority INTEGER DEFAULT 1,
+            CHECK (session_status IN ('active', 'paused', 'completed', 'failed', 'cancelled'))
         )
     ''')
+    # Create indexes for sessions table
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_sessions_status ON harvest_sessions(session_status)')
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_sessions_started ON harvest_sessions(started_at)')
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_sessions_activity ON harvest_sessions(last_activity)')
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_sessions_priority ON harvest_sessions(priority)')
+    # Enhanced discovery progress table
     cursor.execute('''
         CREATE TABLE IF NOT EXISTS discovery_progress (
             id INTEGER PRIMARY KEY AUTOINCREMENT,
             endpoints_found INTEGER DEFAULT 0,
             depth_reached INTEGER DEFAULT 0,
             discovery_status TEXT DEFAULT 'running',
+            discovery_config TEXT,
+            errors_encountered INTEGER DEFAULT 0,
+            success_rate REAL DEFAULT 0.0,
+            estimated_total INTEGER DEFAULT 0,
+            CHECK (discovery_status IN ('running', 'completed', 'failed', 'paused'))
+        )
+    ''')
+    # Create indexes for discovery table
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_discovery_api ON discovery_progress(api_name)')
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_discovery_status ON discovery_progress(discovery_status)')
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_discovery_started ON discovery_progress(started_at)')
+    # Data quality and metadata table
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS data_quality_metrics (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            data_id INTEGER REFERENCES harvested_data(id),
+            api_name TEXT NOT NULL,
+            completeness_score REAL DEFAULT 0.0,
+            consistency_score REAL DEFAULT 0.0,
+            accuracy_score REAL DEFAULT 0.0,
+            timeliness_score REAL DEFAULT 0.0,
+            overall_quality REAL DEFAULT 0.0,
+            anomalies_detected INTEGER DEFAULT 0,
+            anomaly_details TEXT,
+            validation_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+            validation_rules_version TEXT DEFAULT '1.0'
         )
     ''')
+    # Create quality metrics indexes
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_quality_api ON data_quality_metrics(api_name)')
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_quality_overall ON data_quality_metrics(overall_quality)')
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_quality_timestamp ON data_quality_metrics(validation_timestamp)')
+    # API performance tracking table
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS api_performance_log (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            api_name TEXT NOT NULL,
+            endpoint_path TEXT NOT NULL,
+            response_time_ms INTEGER,
+            response_size_bytes INTEGER,
+            http_status_code INTEGER,
+            success BOOLEAN,
+            error_type TEXT,
+            timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+            date_only DATE DEFAULT (date('now'))
+        )
+    ''')
+    # Create performance indexes
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_perf_api_date ON api_performance_log(api_name, date_only)')
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_perf_success ON api_performance_log(success)')
+    cursor.execute('CREATE INDEX IF NOT EXISTS idx_perf_response_time ON api_performance_log(response_time_ms)')
+    # Data archival management table
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS data_archive_log (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            original_data_id INTEGER,
+            archive_path TEXT,
+            archive_format TEXT DEFAULT 'gzip',
+            archived_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+            original_size INTEGER,
+            compressed_size INTEGER,
+            compression_ratio REAL,
+            checksum TEXT,
+            retention_date DATE,
+            archive_status TEXT DEFAULT 'active'
+        )
+    ''')
+    # Create views for common queries
+    cursor.execute('''
+        CREATE VIEW IF NOT EXISTS v_api_summary AS
+        SELECT
+            api_name,
+            COUNT(*) as total_fetches,
+            COUNT(CASE WHEN status = 'success' THEN 1 END) as successful_fetches,
+            SUM(record_count) as total_records,
+            SUM(data_size_bytes) as total_data_size,
+            AVG(fetch_duration_ms) as avg_fetch_time,
+            AVG(quality_score) as avg_quality_score,
+            MAX(fetch_timestamp) as last_fetch,
+            MIN(fetch_timestamp) as first_fetch
+        FROM harvested_data
+        GROUP BY api_name
+    ''')
+    cursor.execute('''
+        CREATE VIEW IF NOT EXISTS v_session_summary AS
+        SELECT
+            session_id,
+            session_name,
+            session_status,
+            started_at,
+            completed_at,
+            total_endpoints,
+            processed_endpoints,
+            successful_fetches,
+            failed_fetches,
+            total_records,
+            total_data_size,
+            CASE
+                WHEN total_endpoints > 0 THEN
+                    ROUND((processed_endpoints * 100.0) / total_endpoints, 2)
+                ELSE 0
+            END as completion_percentage,
+            CASE
+                WHEN processed_endpoints > 0 THEN
+                    ROUND((successful_fetches * 100.0) / processed_endpoints, 2)
+                ELSE 0
+            END as success_percentage
+        FROM harvest_sessions
+    ''')
+    # Enable automatic statistics collection
+    cursor.execute('PRAGMA optimize')
     conn.commit()
     conn.close()
+# Database optimization and maintenance functions
+def optimize_database():
+    """Run routine SQLite maintenance on the database at ``DB_PATH``.
+
+    Steps, in order: refresh planner statistics (``ANALYZE``), reclaim
+    free pages, then run ``PRAGMA optimize``.
+
+    Returns:
+        bool: True when every step succeeded; False when SQLite reported
+        an error (the error is logged instead of being raised).
+    """
+    import logging
+
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    try:
+        # Update statistics
+        cursor.execute('ANALYZE')
+        # Reclaim space. PRAGMA incremental_vacuum only does anything when
+        # the file already uses incremental auto-vacuum, and a database
+        # created with auto_vacuum=NONE (0) only switches mode after a
+        # full VACUUM rebuilds the file.
+        cursor.execute('PRAGMA auto_vacuum')
+        auto_vacuum_mode = cursor.fetchone()[0]
+        if auto_vacuum_mode != 2:  # 2 == INCREMENTAL
+            cursor.execute('PRAGMA auto_vacuum=INCREMENTAL')
+        if auto_vacuum_mode == 0:  # 0 == NONE
+            conn.commit()  # VACUUM cannot run inside an open transaction
+            cursor.execute('VACUUM')
+        else:
+            # Drain the cursor so the pragma is stepped to completion
+            # rather than stopping after its first step.
+            cursor.execute('PRAGMA incremental_vacuum').fetchall()
+        # Optimize query planner
+        cursor.execute('PRAGMA optimize')
+        conn.commit()
+        return True
+    except sqlite3.Error:
+        logging.getLogger(__name__).exception("Database optimization failed")
+        return False
+    finally:
+        conn.close()
+def get_database_stats():
+    """Collect row counts, file size and quality averages for the database.
+
+    Returns:
+        dict: ``<table>_count`` for each of the four core tables,
+        ``database_size_mb``, ``avg_quality_score``, ``avg_health_score``
+        and ``recent_fetches_24h``.
+
+    Raises:
+        sqlite3.Error: if any of the queries fails (for example when a
+        table does not exist yet).
+    """
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    try:
+        stats = {}
+        # Basic table counts. The names come from this fixed tuple, never
+        # from user input, so interpolating them into the SQL is safe.
+        tables = ('discovered_endpoints', 'harvested_data', 'harvest_sessions', 'discovery_progress')
+        for table in tables:
+            cursor.execute(f'SELECT COUNT(*) FROM {table}')
+            stats[f'{table}_count'] = cursor.fetchone()[0]
+        # Database size = number of pages * bytes per page
+        cursor.execute('PRAGMA page_count')
+        page_count = cursor.fetchone()[0]
+        cursor.execute('PRAGMA page_size')
+        page_size = cursor.fetchone()[0]
+        stats['database_size_mb'] = round((page_count * page_size) / (1024 * 1024), 2)
+        # Data quality stats. The literal is single-quoted: in SQL a
+        # double-quoted token is an identifier, and SQLite only falls back
+        # to treating it as a string when that legacy quirk is enabled.
+        cursor.execute("SELECT AVG(quality_score), AVG(health_score) FROM harvested_data WHERE status = 'success'")
+        avg_quality, avg_health = cursor.fetchone()
+        # AVG() yields NULL (None) when no row matches
+        stats['avg_quality_score'] = round(avg_quality or 0, 3)
+        stats['avg_health_score'] = round(avg_health or 0, 3)
+        # Recent activity
+        cursor.execute('''
+            SELECT COUNT(*) FROM harvested_data
+            WHERE fetch_timestamp > datetime('now', '-24 hours')
+        ''')
+        stats['recent_fetches_24h'] = cursor.fetchone()[0]
+        return stats
+    finally:
+        conn.close()
+def compress_old_data(days_old=30):
+    """Gzip the text payloads of ``harvested_data`` rows older than a cutoff.
+
+    For every matching row the ``raw_data`` / ``processed_data`` text is
+    gzip-compressed into ``raw_data_compressed`` /
+    ``processed_data_compressed``, the text columns are set to NULL and the
+    original character lengths are stored in ``raw_data_size`` /
+    ``processed_data_size``. Rows that fail are logged and skipped.
+
+    Args:
+        days_old: Minimum age in days (int or float) of rows to compress.
+
+    Returns:
+        int: Number of rows that were compressed.
+
+    Raises:
+        TypeError, ValueError: if ``days_old`` is not numeric.
+        sqlite3.Error: if the initial SELECT or the final commit fails.
+    """
+    # NOTE(review): this reads the raw_data / processed_data columns, which
+    # the CREATE TABLE for harvested_data shown in this change does not
+    # declare -- confirm that a schema upgrade adds them.
+    import gzip
+    import logging
+
+    # Coerce to a number first so arbitrary text can never reach the SQL;
+    # the value is then bound as a parameter instead of being formatted in.
+    age_modifier = f'-{float(days_old)} days'
+
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    try:
+        # Find old data to compress. Rows with nothing left to compress are
+        # excluded so they are not re-selected and re-counted on every run.
+        cursor.execute('''
+            SELECT id, raw_data, processed_data
+            FROM harvested_data
+            WHERE fetch_timestamp < datetime('now', ?)
+            AND raw_data_compressed IS NULL
+            AND (raw_data IS NOT NULL OR processed_data IS NOT NULL)
+        ''', (age_modifier,))
+        old_records = cursor.fetchall()
+        compressed_count = 0
+        for record_id, raw_data, processed_data in old_records:
+            try:
+                raw_compressed = (
+                    gzip.compress(raw_data.encode('utf-8')) if raw_data else None
+                )
+                processed_compressed = (
+                    gzip.compress(processed_data.encode('utf-8')) if processed_data else None
+                )
+                # Update record with compressed data
+                cursor.execute('''
+                    UPDATE harvested_data
+                    SET raw_data_compressed = ?,
+                        processed_data_compressed = ?,
+                        raw_data = NULL,
+                        processed_data = NULL,
+                        raw_data_size = ?,
+                        processed_data_size = ?
+                    WHERE id = ?
+                ''', (
+                    raw_compressed,
+                    processed_compressed,
+                    len(raw_data) if raw_data else 0,
+                    len(processed_data) if processed_data else 0,
+                    record_id
+                ))
+            except Exception:
+                # Best effort: one bad row must not abort the whole pass,
+                # but leave a trace instead of failing silently.
+                logging.getLogger(__name__).exception(
+                    "Could not compress harvested_data row %s", record_id
+                )
+                continue
+            compressed_count += 1
+        conn.commit()
+        return compressed_count
+    finally:
+        conn.close()
+def backup_database(backup_path=None):
+    """Write a consistent snapshot of the database at ``DB_PATH`` to a file.
+
+    The copy is made with SQLite's online backup API rather than a plain
+    file copy: ``init_enhanced_database`` switches the database to WAL
+    journaling, so recently committed data may live in the ``-wal`` side
+    file and a copy of the main file alone could be stale or inconsistent.
+
+    Args:
+        backup_path: Destination file path. When None, a timestamped
+            ``backup_harvester_YYYYMMDD_HHMMSS.db`` name in the current
+            working directory is used.
+
+    Returns:
+        The backup path on success, or None if the source database does
+        not exist or the backup failed.
+    """
+    import os
+    from datetime import datetime
+    if backup_path is None:
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        backup_path = f"backup_harvester_{timestamp}.db"
+    # sqlite3.connect() would silently create a missing source file, so
+    # check first to keep "no database" a failure as it was with a file copy.
+    if not os.path.isfile(DB_PATH):
+        return None
+    try:
+        source = sqlite3.connect(DB_PATH)
+        try:
+            target = sqlite3.connect(backup_path)
+            try:
+                source.backup(target)
+            finally:
+                target.close()
+        finally:
+            source.close()
+        return backup_path
+    except (sqlite3.Error, OSError):
+        return None
 class DeepEndpointDiscoverer:
     """Advanced endpoint discovery with recursive exploration"""
     def _save_harvested_data(self, api_name: str, endpoint_path: str, data: Any,
                            session_id: str, fetch_duration: int, record_count: int,
                            data_size: int, status: str = "success", error_message: str = None):
+        """Save harvested data with optimized storage and AI-enhanced analysis"""
+        import gzip
         conn = sqlite3.connect(DB_PATH)
         cursor = conn.cursor()
         data_str = json.dumps(data, sort_keys=True, default=str)
         data_hash = hashlib.sha256(data_str.encode()).hexdigest()
+        # Check if this data already exists
+        cursor.execute('SELECT id FROM harvested_data WHERE data_hash = ?', (data_hash,))
+        if cursor.fetchone():
+            # Update access count and last accessed time
+            cursor.execute('''
+                UPDATE harvested_data
+                SET access_count = access_count + 1, last_accessed = CURRENT_TIMESTAMP
+                WHERE data_hash = ?
+            ''', (data_hash,))
+            conn.commit()
+            conn.close()
+            return
         # AI Quality Assessment
         quality_assessment = {}
         if ai_quality_assessor and status == "success":
                 api_name, fetch_duration, success_rate, data_size
             )
+        # Determine data format
+        data_format = self._detect_data_format(data)
+        # Compress data if it's large
+        raw_data_compressed = None
+        processed_data_compressed = None
+        raw_data = None
+        processed_data = None
+        if data_size > 1024:  # Compress if larger than 1KB
+            try:
+                raw_data_compressed = gzip.compress(data_str.encode('utf-8'))
+                processed_data_compressed = gzip.compress(json.dumps(data, default=str).encode('utf-8'))
+            except:
+                # Fallback to uncompressed storage
+                raw_data = data_str
+                processed_data = json.dumps(data, default=str)
+        else:
+            raw_data = data_str
+            processed_data = json.dumps(data, default=str)
         try:
             cursor.execute('''
+                INSERT INTO harvested_data
+                (api_name, endpoint_path, data_hash, raw_data_compressed, processed_data_compressed,
+                 raw_data, processed_data, raw_data_size, processed_data_size,
                  record_count, data_size_bytes, fetch_duration_ms, status,
+                 error_message, session_id, quality_score, health_score, similar_datasets,
+                 data_format, access_count)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
             ''', (
+                api_name, endpoint_path, data_hash, raw_data_compressed, processed_data_compressed,
+                raw_data, processed_data, len(data_str), len(json.dumps(data, default=str)),
+                record_count, data_size, fetch_duration, status, error_message, session_id,
                 quality_assessment.get('ai_quality_score', 0.0),
                 health_info.get('health_score', 0.0),
+                json.dumps(similar_datasets[:3], default=str),
+                data_format, 1
+            ))
+            # Log API performance
+            cursor.execute('''
+                INSERT INTO api_performance_log
+                (api_name, endpoint_path, response_time_ms, response_size_bytes,
+                 http_status_code, success, error_type)
+                VALUES (?, ?, ?, ?, ?, ?, ?)
+            ''', (
+                api_name, endpoint_path, fetch_duration, data_size,
+                200 if status == "success" else 500,
+                status == "success",
+                error_message if status != "success" else None
             ))
             conn.commit()
             if quality_assessment and st.session_state.get('show_ai_insights', True):
                 self._display_ai_insights(api_name, quality_assessment, health_info, similar_datasets)
+        except sqlite3.OperationalError as e:
+            # Handle database schema updates
+            if "no such column" in str(e):
+                self._upgrade_database_schema()
+                # Retry with basic data structure
+                cursor.execute('''
+                    INSERT OR REPLACE INTO harvested_data
+                    (api_name, endpoint_path, data_hash, raw_data, processed_data,
+                     record_count, data_size_bytes, fetch_duration_ms, status,
+                     error_message, session_id)
+                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                ''', (
+                    api_name, endpoint_path, data_hash, raw_data or data_str,
+                    processed_data or json.dumps(data, default=str), record_count, data_size,
+                    fetch_duration, status, error_message, session_id
+                ))
+                conn.commit()
         finally:
             conn.close()
+    def _detect_data_format(self, data: Any) -> str:
+        """Label a payload by its Python type and a few marker keys.
+
+        Returns one of ``"HAL+JSON"``, ``"SDMX-JSON"``, ``"JSON"``,
+        ``"JSON-Array"``, ``"XML"``, ``"Text"`` or ``"Unknown"``.
+        """
+        if isinstance(data, list):
+            return "JSON-Array"
+
+        if isinstance(data, str):
+            # A leading '<' (ignoring surrounding whitespace) is taken as markup
+            looks_like_markup = data.strip().startswith('<')
+            return "XML" if looks_like_markup else "Text"
+
+        if not isinstance(data, dict):
+            return "Unknown"
+
+        # Mappings: the HAL markers are checked before the SDMX ones
+        if any(marker in data for marker in ("_embedded", "_links")):
+            return "HAL+JSON"
+        if any(marker in data for marker in ("dataSets", "structure")):
+            return "SDMX-JSON"
+        return "JSON"
     def _display_ai_insights(self, api_name: str, quality_assessment: Dict,
                            health_info: Dict, similar_datasets: List[Dict]):
         """Display AI-powered insights in real-time"""
     finally:
         conn.close()
+# Database Management Section
+# Streamlit panel with four maintenance actions (stats, optimize, compress,
+# backup) followed by read-only activity, quality and storage summaries.
+with st.expander("🗄️ Database Management & Statistics", expanded=False):
+    st.markdown("**Database Performance & Maintenance Tools**")
+    col1, col2, col3, col4 = st.columns(4)
+    with col1:
+        if st.button("📊 Get Database Stats", use_container_width=True):
+            with st.spinner("Analyzing database..."):
+                stats = get_database_stats()
+                st.markdown("**Database Statistics:**")
+                for key, value in stats.items():
+                    formatted_key = key.replace('_', ' ').title()
+                    if 'size_mb' in key:
+                        st.metric(formatted_key, f"{value} MB")
+                    elif 'score' in key:
+                        st.metric(formatted_key, f"{value:.3f}")
+                    else:
+                        st.metric(formatted_key, value)
+    with col2:
+        if st.button("🔧 Optimize Database", use_container_width=True):
+            with st.spinner("Optimizing database..."):
+                success = optimize_database()
+                if success:
+                    st.success("✅ Database optimized successfully!")
+                else:
+                    st.error("❌ Database optimization failed")
+    with col3:
+        if st.button("🗜️ Compress Old Data", use_container_width=True):
+            with st.spinner("Compressing old data..."):
+                compressed_count = compress_old_data(days_old=7)  # Compress data older than 7 days
+                if compressed_count > 0:
+                    st.success(f"✅ Compressed {compressed_count} old records")
+                else:
+                    st.info("ℹ️ No old data found to compress")
+    with col4:
+        if st.button("💾 Create Backup", use_container_width=True):
+            with st.spinner("Creating backup..."):
+                backup_path = backup_database()
+                if backup_path:
+                    st.success(f"✅ Backup created: {backup_path}")
+                    # Offer download. Only the file read is guarded, and a
+                    # failure is reported instead of silently hiding the button.
+                    try:
+                        with open(backup_path, 'rb') as f:
+                            backup_bytes = f.read()
+                    except OSError as e:
+                        st.warning(f"Backup was created but could not be read for download: {e}")
+                    else:
+                        st.download_button(
+                            label="⬇️ Download Backup",
+                            data=backup_bytes,
+                            file_name=backup_path,
+                            mime="application/x-sqlite3"
+                        )
+                else:
+                    st.error("❌ Backup creation failed")
+    # Enhanced database insights
+    st.markdown("---")
+    try:
+        conn = sqlite3.connect(DB_PATH)
+        # try/finally so the connection is released even when a query raises
+        try:
+            # Show recent activity summary
+            col1, col2 = st.columns(2)
+            with col1:
+                st.markdown("**📈 Recent Activity (Last 24h)**")
+                df_recent = pd.read_sql_query('''
+                    SELECT api_name, COUNT(*) as fetches, SUM(record_count) as records
+                    FROM harvested_data
+                    WHERE fetch_timestamp > datetime('now', '-1 day')
+                    GROUP BY api_name
+                    ORDER BY fetches DESC
+                ''', conn)
+                if not df_recent.empty:
+                    st.dataframe(df_recent, use_container_width=True)
+                else:
+                    st.info("No recent activity")
+            with col2:
+                st.markdown("**🎯 Data Quality Overview**")
+                df_quality = pd.read_sql_query('''
+                    SELECT
+                        api_name,
+                        ROUND(AVG(quality_score), 3) as avg_quality,
+                        ROUND(AVG(health_score), 3) as avg_health,
+                        COUNT(*) as total_records
+                    FROM harvested_data
+                    WHERE status = 'success' AND quality_score > 0
+                    GROUP BY api_name
+                    ORDER BY avg_quality DESC
+                ''', conn)
+                if not df_quality.empty:
+                    st.dataframe(df_quality, use_container_width=True)
+                else:
+                    st.info("No quality data available")
+        finally:
+            conn.close()
+    except Exception as e:
+        st.error(f"Database error: {e}")
+    # Storage efficiency metrics
+    st.markdown("**💾 Storage Efficiency**")
+    try:
+        conn = sqlite3.connect(DB_PATH)
+        try:
+            cursor = conn.cursor()
+            # Calculate compression ratios. For compressed rows the effective
+            # size is the stored blob length; raw_data_size holds the
+            # uncompressed length and would always report ~0% savings.
+            cursor.execute('''
+                SELECT
+                    COUNT(*) as total_records,
+                    COUNT(CASE WHEN raw_data_compressed IS NOT NULL THEN 1 END) as compressed_records,
+                    SUM(data_size_bytes) as total_original_size,
+                    SUM(CASE WHEN raw_data_compressed IS NOT NULL THEN LENGTH(raw_data_compressed) ELSE data_size_bytes END) as effective_size
+                FROM harvested_data
+            ''')
+            storage_stats = cursor.fetchone()
+            if storage_stats and storage_stats[0] > 0:
+                total_records, compressed_records, original_size, effective_size = storage_stats
+                col1, col2, col3, col4 = st.columns(4)
+                with col1:
+                    st.metric("Total Records", total_records)
+                with col2:
+                    st.metric("Compressed Records", compressed_records)
+                with col3:
+                    compression_ratio = 0
+                    # SUM() is NULL (None) when every summed value is NULL
+                    if original_size and effective_size:
+                        compression_ratio = (1 - effective_size / original_size) * 100
+                    st.metric("Compression Ratio", f"{compression_ratio:.1f}%")
+                with col4:
+                    space_saved = (original_size - effective_size) if original_size and effective_size else 0
+                    space_saved_mb = space_saved / (1024 * 1024)
+                    st.metric("Space Saved", f"{space_saved_mb:.2f} MB")
+        finally:
+            conn.close()
+    except Exception as e:
+        st.warning(f"Could not calculate storage metrics: {e}")
 # AI Enhancement Panel
 st.markdown("---")
 with st.expander("🤖 AI Enhancement Status", expanded=False):