File size: 32,422 Bytes
18b60c9
 
 
 
 
 
 
 
 
 
 
 
 
45efc9d
 
 
 
 
 
 
 
 
18b60c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c55a4b7
18b60c9
 
 
c55a4b7
18b60c9
 
9b8d982
18b60c9
c55a4b7
 
 
18b60c9
 
 
 
 
c55a4b7
18b60c9
 
 
c55a4b7
 
 
 
18b60c9
 
 
 
 
c55a4b7
18b60c9
 
 
 
c55a4b7
 
 
18b60c9
 
 
 
 
c55a4b7
18b60c9
 
9b8d982
18b60c9
c55a4b7
 
 
18b60c9
 
 
 
 
c55a4b7
18b60c9
 
9b8d982
18b60c9
c55a4b7
 
 
18b60c9
 
 
 
 
c55a4b7
18b60c9
 
c55a4b7
9b8d982
c55a4b7
 
 
18b60c9
 
 
 
 
c55a4b7
18b60c9
 
9b8d982
18b60c9
c55a4b7
 
 
18b60c9
 
 
 
 
c55a4b7
18b60c9
 
9b8d982
 
c55a4b7
9b8d982
c55a4b7
 
 
18b60c9
 
 
 
 
c55a4b7
18b60c9
 
9b8d982
18b60c9
 
 
 
c55a4b7
 
 
 
18b60c9
 
 
 
 
c55a4b7
18b60c9
 
c55a4b7
 
 
 
 
 
18b60c9
 
 
 
 
 
c55a4b7
18b60c9
 
 
c55a4b7
 
 
 
 
 
 
18b60c9
 
 
 
 
c55a4b7
18b60c9
 
c55a4b7
 
 
 
18b60c9
c55a4b7
18b60c9
c55a4b7
 
 
 
 
 
18b60c9
 
 
c55a4b7
18b60c9
 
 
c55a4b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18b60c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c55a4b7
 
18b60c9
 
 
c55a4b7
18b60c9
c55a4b7
 
18b60c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c55a4b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18b60c9
 
 
 
 
 
 
 
 
 
 
 
 
 
c55a4b7
 
 
 
18b60c9
 
 
 
 
c55a4b7
 
 
18b60c9
 
 
 
 
 
c55a4b7
 
18b60c9
c55a4b7
18b60c9
c55a4b7
18b60c9
c55a4b7
 
 
 
 
 
 
 
 
 
 
 
 
 
18b60c9
c55a4b7
 
 
 
 
18b60c9
c55a4b7
18b60c9
 
c55a4b7
 
 
 
18b60c9
c55a4b7
 
 
18b60c9
 
 
 
 
c55a4b7
 
 
 
 
 
 
18b60c9
 
 
 
 
c55a4b7
18b60c9
 
 
 
c55a4b7
 
18b60c9
 
c55a4b7
18b60c9
 
c55a4b7
18b60c9
 
c55a4b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18b60c9
 
 
c55a4b7
 
 
 
 
 
 
18b60c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c55a4b7
18b60c9
 
c55a4b7
18b60c9
 
c55a4b7
 
 
 
 
 
 
 
 
 
18b60c9
 
c55a4b7
18b60c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1281e4d
 
 
 
 
 
 
 
 
 
 
 
18b60c9
1281e4d
 
 
 
18b60c9
1281e4d
18b60c9
 
 
 
 
1281e4d
18b60c9
 
1281e4d
18b60c9
 
1281e4d
18b60c9
 
1281e4d
 
 
 
 
18b60c9
45efc9d
 
1281e4d
 
45efc9d
 
 
 
 
 
 
 
 
 
 
 
 
1281e4d
 
45efc9d
18b60c9
 
 
 
 
 
 
 
 
 
 
 
 
 
45efc9d
1281e4d
 
45efc9d
 
1281e4d
45efc9d
 
 
1281e4d
45efc9d
 
 
1281e4d
45efc9d
1281e4d
 
 
18b60c9
45efc9d
 
18b60c9
 
c55a4b7
45efc9d
 
 
 
18b60c9
45efc9d
 
c55a4b7
45efc9d
 
c55a4b7
 
45efc9d
 
 
c55a4b7
45efc9d
 
c55a4b7
45efc9d
 
 
 
 
18b60c9
 
c55a4b7
 
18b60c9
45efc9d
18b60c9
 
 
 
 
c55a4b7
18b60c9
c55a4b7
 
 
 
18b60c9
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
import streamlit as st
import requests
import pandas as pd
import json
import time
from datetime import datetime
import sqlite3
import hashlib
import gzip
from typing import Dict, Any, List
import warnings
warnings.filterwarnings('ignore')

# Optional plotly import with fallback.
# NOTE: the original called st.warning() here, but st.set_page_config()
# further down must be the *first* Streamlit command executed in a script
# run — emitting a Streamlit element during import violates that contract.
# We therefore only record availability in the flag; the UI can surface a
# warning later, after page config has been applied.
PLOTLY_AVAILABLE = False
try:
    import plotly.express as px
    PLOTLY_AVAILABLE = True
except ImportError:
    PLOTLY_AVAILABLE = False

# Global ML availability flag
ML_AVAILABLE = False

# AI/ML imports for enhanced functionality — the flag only flips on when
# *both* libraries import cleanly.
try:
    from transformers import pipeline
    from sentence_transformers import SentenceTransformer
except ImportError:
    ML_AVAILABLE = False
else:
    ML_AVAILABLE = True

# Enhanced Page Configuration
# NOTE: st.set_page_config must be the first Streamlit command executed in
# the script run; keep it ahead of any st.* element calls.
st.set_page_config(
    page_title="Simplified Data Harvester",
    page_icon="πŸš€",
    layout="wide",
    initial_sidebar_state="collapsed"
)

# Enhanced CSS with modern, professional styling.
# Injected as raw HTML — unsafe_allow_html=True is required for the <style>
# tag to take effect. Streamlit offers no first-class theming hook for these
# selectors, hence the markdown workaround.
st.markdown("""
<style>
    .main > div {
        padding-top: 1rem;
    }
    .stApp {
        background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
        color: #2c3e50;
    }
    .title-container {
        text-align: center;
        padding: 2rem 0;
        background: rgba(255, 255, 255, 0.9);
        border-radius: 15px;
        margin-bottom: 2rem;
        border: 1px solid rgba(52, 73, 94, 0.1);
        box-shadow: 0 4px 12px rgba(0, 0, 0, 0.05);
    }
    .api-grid {
        display: grid;
        grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
        gap: 1rem;
        margin: 1rem 0;
    }
    .api-card {
        background: rgba(255, 255, 255, 0.9);
        border-radius: 10px;
        padding: 1rem;
        border: 1px solid rgba(52, 73, 94, 0.15);
        text-align: center;
    }
    .main-button {
        background: linear-gradient(135deg, #3498db, #2980b9);
        color: white;
        border: none;
        border-radius: 12px;
        padding: 1rem 2rem;
        font-size: 1.2rem;
        font-weight: bold;
        cursor: pointer;
        width: 100%;
        margin: 1rem 0;
    }
    .main-button:hover {
        background: linear-gradient(135deg, #2980b9, #1f4e79);
        transform: translateY(-2px);
        box-shadow: 0 8px 25px rgba(52, 152, 219, 0.3);
    }
</style>
""", unsafe_allow_html=True)

# Database Configuration — SQLite file created in the working directory.
DB_PATH = "simplified_harvester.db"

# API Configuration - Tested and verified working endpoints
#
# Each top-level key is an internal API identifier mapping to:
#   name        - display label shown in the UI (flag emoji + short name)
#   description - one-line human-readable summary
#   endpoints   - list of endpoint dicts consumed by SimplifiedDataHarvester:
#       url           - full request URL (query parameters inlined)
#       headers       - request headers (Accept, Authorization, ...)
#       method        - HTTP verb; "GET" here, "POST" is also supported
#                       by the fetcher
#       data_path     - where the payload lives in the JSON response:
#                       "" (whole body), a single key, a dotted path
#                       ("body._embedded.schoolUnits"), or a list index
#                       in bracket notation ("[1]")
#       key_field     - field that uniquely identifies a record
#       requires_auth - optional; endpoint needs the bundled bearer token
#       note          - optional free-form caveat for maintainers
SIMPLIFIED_API_CONFIG = {
    "Skolverket": {
        "name": "πŸ‡ΈπŸ‡ͺ Skolverket",
        "description": "Swedish National Agency for Education - School Units",
        "endpoints": [
            {
                "url": "https://api.skolverket.se/planned-educations/v3/compact-school-units?coordinateSystemType=WGS84&page=0&size=20",
                "headers": {"Accept": "application/vnd.skolverket.plannededucations.api.v3.hal+json"},
                "method": "GET",
                "data_path": "body._embedded.schoolUnits",
                "key_field": "schoolUnitCode"
            }
        ]
    },
    "SCB": {
        "name": "πŸ‡ΈπŸ‡ͺ Statistics Sweden", 
        "description": "Swedish National Statistics Office - Population Metadata",
        "endpoints": [
            {
                "url": "https://api.scb.se/OV0104/v1/doris/sv/ssd/START/BE/BE0101/BE0101A/BefolkningNy",
                "headers": {"Accept": "application/json"},
                "method": "GET",
                "data_path": "variables",
                "key_field": "code"
            }
        ]
    },
    "Kolada": {
        "name": "πŸ‡ΈπŸ‡ͺ Kolada",
        "description": "Municipal Key Performance Indicators - Municipalities",
        "endpoints": [
            {
                "url": "https://api.kolada.se/v2/municipality",
                "headers": {"Accept": "application/json"},
                "method": "GET",
                "data_path": "values",
                "key_field": "id"
            }
        ]
    },
    "Eurostat": {
        "name": "πŸ‡ͺπŸ‡Ί Eurostat",
        "description": "European Union Statistics - Population Density",
        "endpoints": [
            {
                "url": "https://ec.europa.eu/eurostat/api/dissemination/statistics/1.0/data/demo_r_d3dens?format=JSON&lang=EN",
                "headers": {"Accept": "application/json"},
                "method": "GET",
                "data_path": "value",
                "key_field": "index"
            }
        ]
    },
    "WHO": {
        "name": "🌍 WHO",
        "description": "World Health Organization - Country Dimensions",
        "endpoints": [
            {
                "url": "https://ghoapi.azureedge.net/api/DIMENSION/COUNTRY/DimensionValues",
                "headers": {"Accept": "application/json"},
                "method": "GET",
                "data_path": "value",
                "key_field": "Code"
            }
        ]
    },
    "OECD": {
        "name": "🌍 OECD",
        "description": "Organisation for Economic Co-operation and Development - Economic Data",
        "endpoints": [
            {
                "url": "https://sdmx.oecd.org/public/rest/data/OECD.SDD.NAD,DSD_NAMAIN1@DF_QNA_EXPENDITURE_GROWTH_OECD/?format=jsondata",
                "headers": {"Accept": "application/json"},
                "method": "GET",
                "data_path": "data.dataSets",
                "key_field": "series"
            }
        ]
    },
    "WorldBank": {
        "name": "🌍 World Bank",
        "description": "International Financial Institution - Population Data",
        "endpoints": [
            {
                # World Bank wraps results as [metadata, data]; "[1]" selects
                # the data array.
                "url": "https://api.worldbank.org/v2/country/all/indicator/SP.POP.TOTL?format=json&per_page=50&date=2023",
                "headers": {"Accept": "application/json"},
                "method": "GET",
                "data_path": "[1]",
                "key_field": "countryiso3code"
            }
        ]
    },
    "Riksbanken": {
        "name": "πŸ‡ΈπŸ‡ͺ Riksbanken",
        "description": "Swedish Central Bank - Exchange Rates",
        "endpoints": [
            {
                "url": "https://api.riksbank.se/swea/v1/Observations/Latest/ByGroup/130",
                "headers": {
                    "Accept": "application/json"
                },
                "method": "GET",
                "data_path": "",
                "key_field": "seriesId"
            }
        ]
    },
    "Swecris": {
        "name": "πŸ‡ΈπŸ‡ͺ Swecris",
        "description": "Swedish Research Council Database - Research Projects",
        "endpoints": [
            {
                "url": "https://swecris-api.vr.se/v1/projects?size=20",
                "headers": {
                    "Accept": "application/json",
                    "Authorization": "Bearer VRSwecrisAPI2025-1"
                },
                "method": "GET",
                "data_path": "content",
                "key_field": "projectId",
                "requires_auth": True
            }
        ]
    },
    "CSN": {
        "name": "πŸ‡ΈπŸ‡ͺ CSN",
        "description": "Swedish Board of Student Finance - Statistics Portal",
        "endpoints": [
            {
                "url": "https://statistik.csn.se/PXWeb/api/v1/sv/CSNstat",
                "headers": {"Accept": "application/json"},
                "method": "GET",
                "data_path": "",
                "key_field": "id",
                "note": "PX-Web format requires specific POST queries for data"
            }
        ]
    }
}

def init_database(db_path=None):
    """Initialize optimized SQLite database with proper indexing and compression support.

    Creates the ``harvested_data`` and ``fetch_sessions`` tables (idempotent
    via ``CREATE ... IF NOT EXISTS``), applies performance PRAGMAs, and builds
    the supporting indexes.

    Args:
        db_path: Optional database file path. Defaults to the module-level
            ``DB_PATH`` so existing no-argument callers are unaffected.

    Fix vs. original: the connection is now closed in a ``finally`` block, so
    a failing statement no longer leaks the SQLite handle.
    """
    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        cursor = conn.cursor()

        # Enable WAL mode for better concurrent access
        cursor.execute('PRAGMA journal_mode=WAL')
        cursor.execute('PRAGMA synchronous=NORMAL')
        cursor.execute('PRAGMA cache_size=10000')
        cursor.execute('PRAGMA temp_store=memory')

        # Create optimized table structure: one row per deduplicated fetch.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS harvested_data (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                api_name TEXT NOT NULL,
                endpoint_url TEXT NOT NULL,
                data_hash TEXT UNIQUE NOT NULL,
                raw_data TEXT,
                compressed_data BLOB,
                processed_data TEXT,
                record_count INTEGER DEFAULT 0,
                data_size_bytes INTEGER DEFAULT 0,
                compression_ratio REAL,
                fetch_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                fetch_duration_ms INTEGER DEFAULT 0,
                status TEXT DEFAULT 'success',
                session_id TEXT,
                data_path TEXT,
                key_field TEXT,
                error_message TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')

        # Create performance indexes for the common query patterns.
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_api_name ON harvested_data(api_name)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_timestamp ON harvested_data(fetch_timestamp)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_status ON harvested_data(status)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_session ON harvested_data(session_id)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_data_hash ON harvested_data(data_hash)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_api_timestamp ON harvested_data(api_name, fetch_timestamp)')

        # Create sessions table for better tracking of harvesting runs.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS fetch_sessions (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                session_id TEXT UNIQUE NOT NULL,
                session_name TEXT,
                started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                completed_at TIMESTAMP,
                total_apis INTEGER DEFAULT 0,
                successful_fetches INTEGER DEFAULT 0,
                failed_fetches INTEGER DEFAULT 0,
                total_records INTEGER DEFAULT 0,
                total_size_bytes INTEGER DEFAULT 0,
                status TEXT DEFAULT 'running'
            )
        ''')

        cursor.execute('CREATE INDEX IF NOT EXISTS idx_session_id ON fetch_sessions(session_id)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_session_started ON fetch_sessions(started_at)')

        conn.commit()
    finally:
        # Always release the handle, even when schema creation fails.
        conn.close()

class SimplifiedDataHarvester:
    """Simplified data harvester - one function to fetch from all APIs.

    Results and errors accumulate on the instance across calls; callers
    typically create a fresh instance per harvesting run. All persistence
    goes through the module-level ``DB_PATH`` SQLite database.

    Fixes vs. original:
      * ``_process_response`` catches ``ValueError`` (the JSON-decode
        failure) instead of a bare ``except:`` that also swallowed
        ``KeyboardInterrupt``/``SystemExit``.
      * The error-logging insert in ``_save_data_to_db`` uses
        ``INSERT OR IGNORE`` so two errors in the same second (which share
        the second-resolution ``error_{timestamp}`` hash) no longer violate
        the UNIQUE constraint inside the exception handler.
    """

    def __init__(self):
        # Shared HTTP session: connection pooling plus a polite User-Agent.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Simplified-Data-Harvester/1.0 (Research & Analysis)'
        })
        self.results = {}  # api_name -> per-API result dict
        self.errors = {}   # api_name -> error message string

    def fetch_all_apis(self, progress_callback=None) -> Dict:
        """One function to fetch data from all APIs automatically.

        Args:
            progress_callback: Optional callable taking a single status
                string; invoked as each API starts, finishes, or fails.

        Returns:
            Dict with ``results``, ``errors``, ``session_id`` and a
            ``summary`` of counts and the overall success rate (percent).
        """
        session_id = f"simplified_{int(time.time())}"
        total_apis = len(SIMPLIFIED_API_CONFIG)
        completed = 0

        if progress_callback:
            progress_callback(f"πŸš€ Starting data collection from {total_apis} APIs...")

        for api_name, config in SIMPLIFIED_API_CONFIG.items():
            if progress_callback:
                progress_callback(f"πŸ”„ Fetching from {config['name']}...")

            try:
                api_results = self._fetch_api_data(api_name, config, session_id)
                self.results[api_name] = api_results
                completed += 1

                if progress_callback:
                    progress = (completed / total_apis) * 100
                    progress_callback(f"βœ… {config['name']} completed ({progress:.1f}%)")

                time.sleep(0.5)  # Respectful delay between upstream services

            except Exception as e:
                self.errors[api_name] = str(e)
                if progress_callback:
                    progress_callback(f"❌ {config['name']} failed: {str(e)[:50]}...")
                completed += 1

        if progress_callback:
            successful = len(self.results)
            failed = len(self.errors)
            progress_callback(f"πŸŽ‰ Collection complete! βœ… {successful} successful, ❌ {failed} failed")

        return {
            "results": self.results,
            "errors": self.errors,
            "session_id": session_id,
            "summary": {
                "total_apis": total_apis,
                "successful": len(self.results),
                "failed": len(self.errors),
                "success_rate": (len(self.results) / total_apis) * 100
            }
        }

    def _fetch_api_data(self, api_name: str, config: Dict, session_id: str) -> Dict:
        """Fetch data from all endpoints for a specific API.

        Per-endpoint failures are captured in the returned structure rather
        than raised, so one bad endpoint does not abort the API's other
        endpoints.
        """
        api_results = {
            "api_name": api_name,
            "endpoints": [],
            "total_records": 0,
            "total_size": 0
        }

        for i, endpoint in enumerate(config['endpoints']):
            try:
                start_time = time.time()

                # Make request (POST carries an optional JSON body from config)
                if endpoint.get('method', 'GET').upper() == 'POST':
                    response = self.session.post(
                        endpoint['url'],
                        headers=endpoint.get('headers', {}),
                        json=endpoint.get('data', {}),
                        timeout=30
                    )
                else:
                    response = self.session.get(
                        endpoint['url'],
                        headers=endpoint.get('headers', {}),
                        timeout=30
                    )

                response.raise_for_status()

                # Decode body and time the round-trip in milliseconds.
                data = self._process_response(response)
                fetch_duration = int((time.time() - start_time) * 1000)

                # Extract meaningful data using endpoint configuration
                processed_data = self._extract_api_data(data, api_name, endpoint)
                record_count = self._count_records(processed_data)
                data_size = len(response.content)

                # Save to database with enhanced metadata
                self._save_data_to_db(
                    api_name, endpoint['url'], data, processed_data, session_id,
                    fetch_duration, record_count, data_size, "success", endpoint
                )

                endpoint_result = {
                    "endpoint_url": endpoint['url'],
                    "status": "success",
                    "records": record_count,
                    "size_bytes": data_size,
                    "duration_ms": fetch_duration,
                    "data_preview": str(processed_data)[:200] + "..." if len(str(processed_data)) > 200 else str(processed_data)
                }

                api_results["endpoints"].append(endpoint_result)
                api_results["total_records"] += record_count
                api_results["total_size"] += data_size

            except Exception as e:
                endpoint_result = {
                    "endpoint_url": endpoint['url'],
                    "status": "error", 
                    "error": str(e),
                    "records": 0,
                    "size_bytes": 0,
                    "duration_ms": 0
                }
                api_results["endpoints"].append(endpoint_result)

        return api_results

    def _process_response(self, response):
        """Decode an HTTP response, preferring JSON.

        Falls back to wrapping the raw text in ``{"raw_content": ...}`` when
        the body is not valid JSON.
        """
        content_type = response.headers.get('content-type', '').lower()

        if 'json' in content_type:
            return response.json()
        else:
            try:
                return response.json()  # Try JSON first even without the header
            except ValueError:  # json.JSONDecodeError subclasses ValueError
                return {"raw_content": response.text}

    def _extract_api_data(self, data: Any, api_name: str, endpoint_config: Dict) -> Any:
        """Extract meaningful data from API response using data_path configuration.

        Supported ``data_path`` forms: "" (return data untouched), "[N]"
        (index into a top-level list), "a.b.c" (dotted dict traversal),
        or a single dict key. On any failure the original data is returned.
        """
        try:
            data_path = endpoint_config.get('data_path', '')

            if not data_path:
                return data

            # Handle nested path extraction
            current_data = data

            # Handle array index notation like "[1]"
            if data_path.startswith('[') and data_path.endswith(']'):
                index = int(data_path[1:-1])
                if isinstance(current_data, list) and len(current_data) > index:
                    return current_data[index]
                else:
                    return current_data

            # Handle dot notation like "body._embedded.schoolUnits"
            if '.' in data_path:
                path_parts = data_path.split('.')
                for part in path_parts:
                    if isinstance(current_data, dict):
                        # Missing key deliberately falls back to the current
                        # value instead of failing the extraction.
                        current_data = current_data.get(part, current_data)
                    else:
                        break
                return current_data

            # Handle simple key extraction (missing key -> whole dict)
            if isinstance(current_data, dict):
                return current_data.get(data_path, current_data)

            return current_data

        except Exception as e:
            st.warning(f"Data extraction failed for {api_name}: {str(e)}")
            return data

    def _count_records(self, data: Any) -> int:
        """Count records: list length, first list value's length for dicts
        (else 1), and 1/0 for truthy/falsy scalars."""
        if isinstance(data, list):
            return len(data)
        elif isinstance(data, dict):
            for key, value in data.items():
                if isinstance(value, list) and len(value) > 0:
                    return len(value)
            return 1
        else:
            return 1 if data else 0

    def _save_data_to_db(self, api_name: str, endpoint_url: str, raw_data: Any, 
                        processed_data: Any, session_id: str, fetch_duration: int, 
                        record_count: int, data_size: int, status: str, endpoint_config: Dict):
        """Save data to database with compression and enhanced metadata.

        Deduplicates via a SHA-256 hash of the canonical raw JSON. Payloads
        over 512 bytes are gzip-compressed when that saves >20% space.
        Failures are logged to the same table with status='error' rather
        than propagated.
        """
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()

        try:
            # Create data hash for deduplication (canonical, compact JSON)
            raw_data_str = json.dumps(raw_data, sort_keys=True, default=str, separators=(',', ':'))
            processed_data_str = json.dumps(processed_data, sort_keys=True, default=str, separators=(',', ':'))
            data_hash = hashlib.sha256(raw_data_str.encode()).hexdigest()

            # Check if data exists
            cursor.execute('SELECT id FROM harvested_data WHERE data_hash = ?', (data_hash,))
            if cursor.fetchone():
                return  # Skip duplicate

            # Implement smart compression: store either raw text OR the
            # gzip blob, never both.
            raw_data_final = None
            compressed_data = None
            compression_ratio = 1.0

            if data_size > 512:  # Compress data larger than 512 bytes
                try:
                    compressed_data = gzip.compress(raw_data_str.encode('utf-8'))
                    compression_ratio = len(compressed_data) / len(raw_data_str.encode('utf-8'))

                    # Only use compression if it saves significant space
                    if compression_ratio < 0.8:
                        raw_data_final = None
                    else:
                        raw_data_final = raw_data_str
                        compressed_data = None
                        compression_ratio = 1.0
                except Exception:
                    raw_data_final = raw_data_str
                    compressed_data = None
                    compression_ratio = 1.0
            else:
                raw_data_final = raw_data_str

            # Extract endpoint metadata
            data_path = endpoint_config.get('data_path', '')
            key_field = endpoint_config.get('key_field', '')

            # Insert data with enhanced metadata
            cursor.execute('''
                INSERT INTO harvested_data 
                (api_name, endpoint_url, data_hash, raw_data, compressed_data, processed_data,
                 record_count, data_size_bytes, compression_ratio, fetch_duration_ms, 
                 status, session_id, data_path, key_field)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                api_name, endpoint_url, data_hash, raw_data_final, compressed_data, 
                processed_data_str, record_count, data_size, compression_ratio,
                fetch_duration, status, session_id, data_path, key_field
            ))

            conn.commit()

        except Exception as e:
            # Log error but don't fail the entire operation. INSERT OR IGNORE
            # because the second-resolution synthetic hash can repeat within
            # one second (UNIQUE constraint on data_hash).
            cursor.execute('''
                INSERT OR IGNORE INTO harvested_data 
                (api_name, endpoint_url, data_hash, status, error_message, session_id)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (api_name, endpoint_url, f"error_{int(time.time())}", "error", str(e), session_id))
            conn.commit()

        finally:
            conn.close()

def get_database_stats():
    """Get enhanced database statistics.

    Returns:
        dict: aggregate metrics over ``harvested_data`` —
            total_records (successful fetch rows), active_apis,
            total_data_records, total_size_mb, avg_compression,
            error_count, avg_fetch_time_ms, latest_session (id or None),
            latest_session_fetches.

    Note:
        Status values are passed as bound parameters rather than the
        previous double-quoted literals (``status = "success"``): in
        SQLite a double-quoted token is an identifier first and only
        falls back to a string literal, so a column named ``success``
        would silently change the comparison.
    """
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    
    try:
        # One scan of the table gathers every success-filtered aggregate
        # (previously five separate full-table queries).
        cursor.execute('''
            SELECT COUNT(*),
                   COUNT(DISTINCT api_name),
                   COALESCE(SUM(record_count), 0),
                   COALESCE(SUM(data_size_bytes), 0),
                   AVG(fetch_duration_ms)
            FROM harvested_data
            WHERE status = ?
        ''', ('success',))
        (total_records, active_apis, total_data_records,
         total_size_bytes, avg_fetch_time) = cursor.fetchone()
        avg_fetch_time = avg_fetch_time or 0  # AVG is NULL over an empty set
        
        cursor.execute('SELECT AVG(compression_ratio) FROM harvested_data WHERE compression_ratio IS NOT NULL')
        avg_compression = cursor.fetchone()[0] or 1.0
        
        cursor.execute('SELECT COUNT(*) FROM harvested_data WHERE status = ?', ('error',))
        error_count = cursor.fetchone()[0]
        
        # Most recently active session and how many fetches it recorded.
        cursor.execute('''
            SELECT session_id, COUNT(*) as fetches, MAX(fetch_timestamp) as latest
            FROM harvested_data 
            WHERE session_id IS NOT NULL 
            GROUP BY session_id 
            ORDER BY latest DESC 
            LIMIT 1
        ''')
        latest_session = cursor.fetchone()
        
        return {
            "total_records": total_records,
            "active_apis": active_apis,
            "total_data_records": total_data_records,
            "total_size_mb": round(total_size_bytes / (1024 * 1024), 2),
            "avg_compression": round(avg_compression, 3),
            "error_count": error_count,
            "avg_fetch_time_ms": round(avg_fetch_time, 1),
            "latest_session": latest_session[0] if latest_session else None,
            "latest_session_fetches": latest_session[1] if latest_session else 0
        }
    finally:
        conn.close()

# Initialize database
init_database()

# Seed session state with lazily-constructed defaults so a page rerun
# neither recreates the harvester nor wipes previously shown results.
_SESSION_DEFAULTS = {
    'harvester': SimplifiedDataHarvester,  # factory: only called when the key is absent
    'last_results': lambda: None,
}
for _key, _factory in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _factory()

# Header
# Page banner rendered as raw HTML (hence unsafe_allow_html=True).
# NOTE(review): the .title-container class is presumably styled by CSS
# injected earlier in this file — confirm if the styling block moves.
st.markdown("""
<div class="title-container">
    <h1 style="font-size: 2.5rem; margin: 0; color: #2c3e50;">
        πŸš€ Simplified Data Harvester
    </h1>
    <p style="font-size: 1.1rem; margin: 0.5rem 0 0 0; color: #34495e;">
        One-Click Data Collection from All APIs
    </p>
    <p style="font-size: 0.95rem; margin: 0.3rem 0 0 0; color: #7f8c8d;">
        Automatic data fetching from 10 international sources with smart database storage
    </p>
</div>
""", unsafe_allow_html=True)

# Display ML status — choose the banner widget and its message together,
# then render with a single call.
if ML_AVAILABLE:
    _banner, _message = st.success, "πŸ€– **AI Enhanced** - Quality assessment and analysis active"
else:
    _banner, _message = st.info, "πŸ“Š **Standard Mode** - Core functionality available"
_banner(_message)

# Main Action Section
st.markdown("### πŸš€ Data Collection")

# Show API status
col1, col2 = st.columns([2, 1])

with col1:
    st.markdown("**Available APIs:**")
    
    # Display APIs in a compact format.  Only the config payload is
    # needed, so iterate .values() (the dict key was bound but unused).
    for config in SIMPLIFIED_API_CONFIG.values():
        st.markdown(f"βœ… **{config['name']}** - {config['description']}")

with col2:
    # Enhanced database stats — best-effort: on any failure fall back to
    # zeroed metrics plus an error banner instead of breaking the page.
    try:
        stats = get_database_stats()
        st.metric("πŸ“Š Fetch Records", f"{stats.get('total_records', 0):,}")
        st.metric("🌍 Active APIs", stats.get('active_apis', 0))
        st.metric("πŸ“„ Data Records", f"{stats.get('total_data_records', 0):,}")
        st.metric("πŸ’Ύ Storage (MB)", f"{stats.get('total_size_mb', 0)}")
        
        # Only surface the compression metric when it actually saved space.
        if stats.get('avg_compression', 1.0) < 1.0:
            compression_pct = (1 - stats.get('avg_compression', 1.0)) * 100
            st.metric("πŸ—œοΈ Compression", f"{compression_pct:.1f}% saved")
        
        if stats.get('error_count', 0) > 0:
            st.metric("⚠️ Errors", stats.get('error_count', 0))
            
    except Exception as e:
        st.metric("πŸ“Š Total Records", "0")
        st.metric("🌍 Active APIs", "0")
        st.error(f"Stats error: {str(e)}")

st.markdown("---")

# Single button to fetch all data
col1, col2, col3 = st.columns([1, 2, 1])

with col2:
    if st.button("πŸš€ FETCH ALL DATA FROM ALL APIS", 
                type="primary", 
                use_container_width=True,
                help="Automatically collect data from all 10 APIs and save to database"):
        
        # Progress tracking widgets: a text placeholder and a bar.
        status_container = st.empty()
        progress_bar = st.progress(0)
        
        def update_progress(message):
            # Callback handed to the harvester for per-API progress text.
            status_container.text(message)
        
        # Execute the one-click data collection.  `results` is pre-bound
        # so the post-collection guard below does not need the previous
        # fragile `'results' not in locals()` check.
        results = None
        with st.spinner("πŸ”„ Collecting data from all APIs..."):
            try:
                results = st.session_state.harvester.fetch_all_apis(update_progress)
                st.session_state.last_results = results
                
                progress_bar.progress(1.0)
                status_container.success("βœ… Collection completed!")
                
            except Exception as e:
                progress_bar.progress(0.0)
                status_container.error(f"❌ Collection failed: {str(e)}")
                st.error("Data collection encountered an error. Please try again.")
                st.stop()  # Stop execution here if collection failed
        
        # Show results only if collection was successful
        if not results:
            st.error("No results to display")
            st.stop()
        
        summary = results.get('summary', {})
        
        # Success metrics
        col1, col2, col3, col4 = st.columns(4)
        
        with col1:
            st.metric("βœ… Successful APIs", summary.get('successful', 0))
        
        with col2:
            st.metric("❌ Failed APIs", summary.get('failed', 0))
        
        with col3:
            st.metric("πŸ“Š Success Rate", f"{summary.get('success_rate', 0):.1f}%")
        
        with col4:
            try:
                total_records = sum(api_data.get('total_records', 0) for api_data in results.get('results', {}).values())
                st.metric("πŸ“„ Total Records", f"{total_records:,}")
            except Exception:
                st.metric("πŸ“„ Total Records", "Error")
        
        # Simplified results summary
        st.markdown("### πŸ“‹ Results Summary")
        
        try:
            api_results = results.get('results', {})
            if api_results:
                st.success(f"βœ… Successfully processed {len(api_results)} APIs")
                
                # Show just API names and record counts in a simple format
                for api_name, api_data in list(api_results.items())[:5]:
                    records = api_data.get('total_records', 0)
                    size_mb = api_data.get('total_size', 0) / (1024 * 1024)
                    st.write(f"β€’ **{SIMPLIFIED_API_CONFIG[api_name]['name']}**: {records:,} records ({size_mb:.2f} MB)")
                
                if len(api_results) > 5:
                    st.info(f"... and {len(api_results) - 5} more APIs processed successfully")
            else:
                st.warning("No results to display")
                
        except Exception:
            # Best-effort display: data is already persisted, so a
            # rendering failure here must not abort the page.
            st.warning("Results summary unavailable - check database viewer below for stored data")
        
        # Error details — fixed: use .get() so a missing 'errors' key cannot
        # raise KeyError (consistent with the optional-key style above), and
        # fall back to the raw api_name for keys not in the config.
        errors = results.get('errors') or {}
        if errors:
            st.markdown("### ❌ Error Details")
            for api_name, error in errors.items():
                api_label = SIMPLIFIED_API_CONFIG.get(api_name, {}).get('name', api_name)
                st.error(f"**{api_label}:** {error}")

# Show last results if available
if st.session_state.last_results:
    st.markdown("---")
    st.markdown("### πŸ“Š Quick Analytics")
    
    results = st.session_state.last_results
    
    # Deliberately lightweight analytics: no aggregation or charting so
    # this section cannot hang the page on large result sets.
    try:
        api_results = results.get('results')
        if api_results:
            st.success("πŸ“Š Analytics: Data collection completed successfully!")
            
            # Basic info only
            st.info(f"🌍 Processed {len(api_results)} APIs with data storage in database")
            
            # Charts are intentionally skipped either way; only the hint
            # text differs depending on plotly availability.
            hint = (
                "πŸ“ˆ Charts available - check database viewer for detailed analysis"
                if PLOTLY_AVAILABLE
                else "πŸ“Š Database contains all collected data for analysis"
            )
            st.info(hint)
                
    except Exception:
        st.info("βœ… Data collection was successful - check database viewer below")

# Simplified Database viewer
with st.expander("πŸ—„οΈ Database Viewer"):
    conn = None
    try:
        conn = sqlite3.connect(DB_PATH)
        
        # Simple count query first
        cursor = conn.cursor()
        cursor.execute('SELECT COUNT(*) FROM harvested_data')
        total_count = cursor.fetchone()[0]
        
        if total_count > 0:
            st.success(f"πŸ“Š Database contains {total_count} records")
            
            # Show only recent 10 records to prevent overload
            df = pd.read_sql_query('''
                SELECT 
                    api_name as "API",
                    record_count as "Records", 
                    status as "Status",
                    SUBSTR(fetch_timestamp, 1, 16) as "Time"
                FROM harvested_data 
                ORDER BY fetch_timestamp DESC 
                LIMIT 10
            ''', conn)
            
            st.dataframe(df, use_container_width=True)
            
            if total_count > 10:
                st.info(f"Showing latest 10 records. Total: {total_count} records in database.")
        else:
            st.info("No data in database yet. Run the data collection first!")
            
    except Exception:
        # Viewer is best-effort; never crash the page over a display query.
        st.warning("Database viewer temporarily unavailable")
    finally:
        # BUG FIX: close() previously ran only on the success path, so any
        # exception above leaked the SQLite connection.
        if conn is not None:
            conn.close()

# Footer
# Static closing banner rendered as raw HTML (hence unsafe_allow_html=True);
# purely informational, no interactive widgets.
st.markdown("---")
st.markdown("""
<div style="text-align: center; padding: 1rem; background: rgba(255,255,255,0.8); border-radius: 10px; color: #2c3e50;">
    <p><strong>πŸš€ Optimized Data Harvester</strong> - Research-verified APIs with smart storage</p>
    <p style="font-size: 0.9rem; color: #7f8c8d;">
        βœ… 10 Tested APIs β€’ πŸ—œοΈ Data compression β€’ πŸ“Š Performance metrics β€’ πŸ” Enhanced analytics
    </p>
    <p style="font-size: 0.8rem; color: #95a5a6;">
        Featuring proper data extraction, optimized database with indexing, and compression for efficient storage
    </p>
</div>
""", unsafe_allow_html=True)