isakskogstad committed on
Commit
eabfed0
·
verified ·
1 Parent(s): 2d4d24a

Upload app_ultimate.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app_ultimate.py +585 -7
app_ultimate.py CHANGED
@@ -20,6 +20,20 @@ import pickle
20
  from urllib.parse import urljoin, urlparse
21
  import threading
22
  from pathlib import Path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  # Enhanced Page Configuration
25
  st.set_page_config(
@@ -131,6 +145,439 @@ DB_PATH = "ultimate_data_harvester.db"
131
  SESSION_PATH = "harvester_session.pkl"
132
  ENDPOINTS_CACHE = "discovered_endpoints.json"
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  # Comprehensive API Discovery Configuration
135
  DEEP_API_CONFIG = {
136
  "Skolverket": {
@@ -1050,7 +1497,7 @@ class UltimateDataHarvester:
1050
  def _save_harvested_data(self, api_name: str, endpoint_path: str, data: Any,
1051
  session_id: str, fetch_duration: int, record_count: int,
1052
  data_size: int, status: str = "success", error_message: str = None):
1053
- """Save harvested data with intelligent categorization"""
1054
  conn = sqlite3.connect(DB_PATH)
1055
  cursor = conn.cursor()
1056
 
@@ -1058,22 +1505,114 @@ class UltimateDataHarvester:
1058
  data_str = json.dumps(data, sort_keys=True, default=str)
1059
  data_hash = hashlib.sha256(data_str.encode()).hexdigest()
1060
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1061
  try:
1062
  cursor.execute('''
1063
  INSERT OR REPLACE INTO harvested_data
1064
  (api_name, endpoint_path, data_hash, raw_data, processed_data,
1065
  record_count, data_size_bytes, fetch_duration_ms, status,
1066
- error_message, session_id)
1067
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
1068
  ''', (
1069
  api_name, endpoint_path, data_hash, data_str,
1070
  json.dumps(data, default=str), record_count, data_size,
1071
- fetch_duration, status, error_message, session_id
 
 
 
1072
  ))
1073
 
1074
  conn.commit()
 
 
 
 
 
1075
  except sqlite3.IntegrityError:
1076
  pass # Data already exists
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1077
  finally:
1078
  conn.close()
1079
 
@@ -1263,7 +1802,16 @@ st.markdown("### πŸš€ Operations")
1263
  tab1, tab2, tab3 = st.tabs(["πŸ” Deep Discovery", "πŸ“Š Data Harvesting", "πŸ“ˆ Analytics"])
1264
 
1265
  with tab1:
1266
- st.markdown("**Discover all possible endpoints from API sources**")
 
 
 
 
 
 
 
 
 
1267
 
1268
  # API Selection for Discovery
1269
  selected_apis_discovery = st.multiselect(
@@ -1467,13 +2015,43 @@ with tab3:
1467
  finally:
1468
  conn.close()
1469
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1470
  # Footer
1471
  st.markdown("---")
1472
  st.markdown("""
1473
  <div style="text-align: center; padding: 1rem; opacity: 0.9;">
1474
- <p><strong>πŸš€ Ultimate Data Harvester</strong> - Deep discovery, session resumption, intelligent storage</p>
1475
  <p style="font-size: 0.9rem;">
1476
- πŸ” Recursive endpoint discovery β€’ 🎯 Session management β€’ πŸ’Ύ Smart database storage β€’ πŸ“Š Real-time analytics
1477
  </p>
1478
  </div>
1479
  """, unsafe_allow_html=True)
 
20
  from urllib.parse import urljoin, urlparse
21
  import threading
22
  from pathlib import Path
23
+ import numpy as np
24
+ from sklearn.ensemble import IsolationForest
25
+ from sklearn.metrics.pairwise import cosine_similarity
26
+ import warnings
27
+ warnings.filterwarnings('ignore')
28
+
29
+ # AI/ML Imports for enhanced functionality
30
+ try:
31
+ from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
32
+ from sentence_transformers import SentenceTransformer
33
+ ML_AVAILABLE = True
34
+ except ImportError:
35
+ ML_AVAILABLE = False
36
+ st.warning("⚠️ ML libraries not available. Some AI features will be disabled.")
37
 
38
  # Enhanced Page Configuration
39
  st.set_page_config(
 
145
  SESSION_PATH = "harvester_session.pkl"
146
  ENDPOINTS_CACHE = "discovered_endpoints.json"
147
 
148
+ # AI Enhancement Classes
149
class AIDataQualityAssessor:
    """AI-powered data quality assessment using transformers.

    Combines a DistilBERT text-classification pipeline (repurposed as a rough
    text-quality score) with rule-based completeness/consistency/structure
    checks and simple anomaly heuristics. Degrades gracefully to the
    rule-based checks alone when the ML stack is unavailable.
    """

    def __init__(self):
        # Model handles stay None until _initialize_models succeeds.
        self.quality_model = None
        self.embeddings_model = None
        self._initialize_models()

    def _initialize_models(self):
        """Initialize AI models for quality assessment.

        Bug fix: the original assigned ``ML_AVAILABLE = False`` in the except
        clause without a ``global`` declaration, which made the name local to
        this function and turned the ``if ML_AVAILABLE`` check into an
        UnboundLocalError on every call. Declaring it global restores the
        intended "disable ML after a failed model load" behavior.
        """
        global ML_AVAILABLE
        if ML_AVAILABLE:
            try:
                # Quality classifier (sentiment model used as a proxy score).
                self.quality_model = pipeline(
                    "text-classification",
                    model="distilbert-base-uncased-finetuned-sst-2-english",
                    return_all_scores=True
                )

                # Embeddings model for similarity features.
                self.embeddings_model = SentenceTransformer('all-MiniLM-L6-v2')

                st.success("βœ… AI models loaded successfully!")
            except Exception as e:
                st.warning(f"⚠️ Failed to load AI models: {e}")
                ML_AVAILABLE = False

    def assess_data_quality(self, data: Any, api_name: str) -> Dict:
        """Comprehensive AI-powered data quality assessment.

        Returns a dict with AI/completeness/consistency/structure scores,
        detected anomalies, an overall letter grade and recommendations.
        Falls back to _basic_quality_assessment when ML is unavailable or
        the AI path raises.
        """
        if not ML_AVAILABLE or not self.quality_model:
            return self._basic_quality_assessment(data, api_name)

        try:
            # Convert data to text for analysis.
            text_data = self._data_to_text(data)

            # AI quality scoring; input truncated to 512 characters.
            ai_scores = self.quality_model(text_data[:512])
            quality_score = max([score['score'] for score in ai_scores[0]])

            # Rule-based quality metrics.
            completeness = self._check_completeness(data)
            consistency = self._check_consistency(data, api_name)
            structure_quality = self._assess_structure(data)

            # Heuristic anomaly detection.
            anomalies = self._detect_anomalies(data)

            return {
                "ai_quality_score": round(quality_score, 3),
                "completeness_score": completeness,
                "consistency_score": consistency,
                "structure_score": structure_quality,
                "anomaly_count": len(anomalies),
                "anomalies": anomalies[:5],  # Top 5 anomalies
                "overall_grade": self._calculate_overall_grade(
                    quality_score, completeness, consistency, structure_quality
                ),
                "recommendations": self._generate_quality_recommendations(
                    quality_score, completeness, consistency, anomalies
                )
            }

        except Exception as e:
            st.warning(f"AI quality assessment failed: {e}")
            return self._basic_quality_assessment(data, api_name)

    def _data_to_text(self, data: Any) -> str:
        """Convert any data format to text for AI analysis (bounded length)."""
        if isinstance(data, str):
            return data
        elif isinstance(data, dict):
            return json.dumps(data, ensure_ascii=False)[:1000]
        elif isinstance(data, list):
            return str(data)[:1000]
        else:
            return str(data)[:1000]

    def _check_completeness(self, data: Any) -> float:
        """Fraction of non-null, non-empty fields (averaged over list items)."""
        if isinstance(data, dict):
            total_fields = len(data)
            complete_fields = sum(1 for v in data.values() if v is not None and v != "")
            return complete_fields / total_fields if total_fields > 0 else 0.0
        elif isinstance(data, list):
            if not data:
                return 0.0
            if isinstance(data[0], dict):
                return np.mean([self._check_completeness(item) for item in data])
            return 1.0
        return 1.0 if data is not None else 0.0

    def _check_consistency(self, data: Any, api_name: str) -> float:
        """Jaccard key-overlap of list records against the first record (sampled)."""
        consistency_score = 1.0

        if isinstance(data, list):
            if len(data) > 1:
                # Check if all items share a similar key structure.
                first_item = data[0] if data else {}
                if isinstance(first_item, dict):
                    first_keys = set(first_item.keys())
                    consistency_scores = []
                    for item in data[1:6]:  # Only sample the next 5 items
                        if isinstance(item, dict):
                            item_keys = set(item.keys())
                            similarity = len(first_keys & item_keys) / len(first_keys | item_keys)
                            consistency_scores.append(similarity)

                    if consistency_scores:
                        consistency_score = np.mean(consistency_scores)

        return consistency_score

    def _assess_structure(self, data: Any) -> float:
        """Heuristic 0-1 structure score; dicts score highest, scalars lowest."""
        if isinstance(data, dict):
            score = 0.8  # Base score for dictionary
            if len(data) > 0:
                score += 0.1
            if any(isinstance(v, (dict, list)) for v in data.values()):
                score += 0.1  # Bonus for nested structure
            return min(score, 1.0)
        elif isinstance(data, list):
            return 0.9 if data else 0.5
        else:
            return 0.6  # Basic scalar data

    def _detect_anomalies(self, data: Any) -> List[str]:
        """Rule-based anomaly scan: nulls, oversized strings/numbers, mixed types."""
        anomalies = []

        if isinstance(data, dict):
            for key, value in data.items():
                if value is None:
                    anomalies.append(f"Null value in field: {key}")
                elif isinstance(value, str) and len(value) > 1000:
                    anomalies.append(f"Unusually long string in field: {key}")
                elif isinstance(value, (int, float)) and abs(value) > 1e10:
                    anomalies.append(f"Extreme numeric value in field: {key}")

        elif isinstance(data, list):
            if len(data) > 10000:
                anomalies.append(f"Very large dataset: {len(data)} items")

            # Check for inconsistent element types (sampled).
            if data:
                first_type = type(data[0])
                if not all(isinstance(item, first_type) for item in data[:10]):
                    anomalies.append("Inconsistent data types in list")

        return anomalies

    def _calculate_overall_grade(self, ai_score: float, completeness: float,
                                 consistency: float, structure: float) -> str:
        """Letter grade from the unweighted mean of the four sub-scores."""
        overall_score = (ai_score + completeness + consistency + structure) / 4

        if overall_score >= 0.9:
            return "A+ (Excellent)"
        elif overall_score >= 0.8:
            return "A (Very Good)"
        elif overall_score >= 0.7:
            return "B (Good)"
        elif overall_score >= 0.6:
            return "C (Fair)"
        else:
            return "D (Poor)"

    def _generate_quality_recommendations(self, ai_score: float, completeness: float,
                                          consistency: float, anomalies: List[str]) -> List[str]:
        """Threshold-based improvement hints; defaults to an all-clear message."""
        recommendations = []

        if ai_score < 0.7:
            recommendations.append("πŸ“ Consider data validation and cleaning")

        if completeness < 0.8:
            recommendations.append("πŸ” Investigate missing data fields")

        if consistency < 0.8:
            recommendations.append("βš™οΈ Standardize data format across records")

        if len(anomalies) > 3:
            recommendations.append("🚨 Multiple anomalies detected - requires investigation")

        if not recommendations:
            recommendations.append("βœ… Data quality is good - no immediate action needed")

        return recommendations

    def _basic_quality_assessment(self, data: Any, api_name: str) -> Dict:
        """Rule-based fallback assessment when no AI models are loaded."""
        return {
            "ai_quality_score": 0.0,
            "completeness_score": self._check_completeness(data),
            "consistency_score": 0.8,  # Default when consistency cannot be scored
            "structure_score": self._assess_structure(data),
            "anomaly_count": 0,
            "anomalies": [],
            "overall_grade": "C (Basic Assessment)",
            "recommendations": ["Install ML libraries for advanced AI assessment"]
        }
354
+
355
class SemanticDataAnalyzer:
    """Semantic analysis and similarity detection.

    Keeps an in-memory map of previously seen dataset embeddings and scores
    new payloads against it via cosine similarity.
    """

    def __init__(self):
        # Embedding model handle plus the store of past embeddings.
        self.embeddings_model = None
        self.stored_embeddings = {}
        self._initialize_model()

    def _initialize_model(self):
        """Load the sentence-transformer model when the ML stack is present."""
        if not ML_AVAILABLE:
            return
        try:
            self.embeddings_model = SentenceTransformer('all-MiniLM-L6-v2')
        except Exception as e:
            st.warning(f"Failed to load embeddings model: {e}")

    def find_similar_datasets(self, new_data: Any, api_name: str, threshold: float = 0.85) -> List[Dict]:
        """Return previously seen datasets whose similarity exceeds *threshold*.

        Also records the new payload's embedding (keyed by API name plus a
        timestamp) so future calls can match against it. Returns an empty
        list when no embedding model is loaded or the analysis raises.
        """
        if not self.embeddings_model:
            return []

        try:
            # Embed a flattened text rendering of the payload.
            embedding = self.embeddings_model.encode([self._data_to_text(new_data)])

            # Score against every stored embedding.
            matches = []
            for key, stored in self.stored_embeddings.items():
                score = cosine_similarity(embedding, [stored])[0][0]
                if score > threshold:
                    matches.append({
                        "dataset": key,
                        "similarity": float(score),
                        "api_name": key.split("_")[0] if "_" in key else "unknown",
                    })

            # Remember this dataset's embedding for future comparisons.
            stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            self.stored_embeddings[f"{api_name}_{stamp}"] = embedding[0]

            return sorted(matches, key=lambda m: m['similarity'], reverse=True)

        except Exception as e:
            st.warning(f"Semantic analysis failed: {e}")
            return []

    def _data_to_text(self, data: Any) -> str:
        """Flatten arbitrary data into a short text snippet for embedding."""
        if isinstance(data, str):
            return data[:500]
        if isinstance(data, dict):
            # Summarize the first ten key/value pairs.
            parts = [f"{k}: {str(v)[:100]}" for k, v in list(data.items())[:10]]
            return " | ".join(parts)
        if isinstance(data, list) and data:
            return str(data[0])[:500]
        return str(data)[:500]
416
+
417
class APIHealthMonitor:
    """Intelligent API health monitoring with anomaly detection.

    Tracks per-API measurement history, derives a weighted health score,
    flags outliers with an IsolationForest, and classifies the recent trend.
    """

    def __init__(self):
        # IsolationForest flags outlier (response_time, success_rate, size) triples.
        self.anomaly_detector = IsolationForest(contamination=0.1, random_state=42)
        self.health_history = {}
        self.is_trained = False

    def monitor_api_health(self, api_name: str, response_time: float,
                           success_rate: float, data_size: int) -> Dict:
        """Record one measurement and return a full health report for the API."""
        sample = {
            "response_time": response_time,
            "success_rate": success_rate,
            "data_size": data_size,
            "timestamp": time.time()
        }

        # Append to the per-API history, capped at the last 50 samples.
        history = self.health_history.setdefault(api_name, [])
        history.append(sample)
        if len(history) > 50:
            self.health_history[api_name] = history[-50:]

        score = self._calculate_health_score(sample)

        # Run anomaly detection only once enough history has accumulated.
        anomaly = 0.0
        if len(self.health_history[api_name]) >= 10:
            anomaly = self._detect_performance_anomaly(api_name, sample)

        tips = self._generate_health_recommendations(sample, score, anomaly)

        return {
            "health_score": score,
            "status": self._get_health_status(score),
            "anomaly_score": anomaly,
            "is_anomaly": anomaly < -0.5,
            "recommendations": tips,
            "trend": self._calculate_trend(api_name),
            "metrics": sample
        }

    def _calculate_health_score(self, metrics: Dict) -> float:
        """Weighted 0-1 score from latency, success rate and payload size."""
        latency_part = max(0, 1 - (metrics["response_time"] / 10000))  # 10s ceiling
        success_part = metrics["success_rate"]
        size_part = min(1.0, metrics["data_size"] / 1000000)  # 1MB reference
        combined = latency_part * 0.4 + success_part * 0.5 + size_part * 0.1
        return max(0, min(1, combined))

    def _detect_performance_anomaly(self, api_name: str, current_metrics: Dict) -> float:
        """IsolationForest decision score for the newest sample (0.0 on failure)."""
        try:
            past = self.health_history[api_name][:-1]  # exclude current sample
            features = [[h["response_time"], h["success_rate"], h["data_size"]]
                        for h in past]

            if len(features) >= 5:
                self.anomaly_detector.fit(features)
                probe = [[current_metrics["response_time"],
                          current_metrics["success_rate"],
                          current_metrics["data_size"]]]
                return float(self.anomaly_detector.decision_function(probe)[0])

        except Exception as e:
            st.warning(f"Anomaly detection failed: {e}")

        return 0.0

    def _get_health_status(self, health_score: float) -> str:
        """Map a numeric score onto a colored status label."""
        if health_score >= 0.9:
            return "🟒 Excellent"
        if health_score >= 0.7:
            return "🟑 Good"
        if health_score >= 0.5:
            return "🟠 Fair"
        return "πŸ”΄ Poor"

    def _generate_health_recommendations(self, metrics: Dict, health_score: float,
                                         anomaly_score: float) -> List[str]:
        """Produce actionable advice for the current measurement."""
        advice = []

        if metrics["response_time"] > 5000:
            advice.append("⏱️ High response time detected - consider caching")
        if metrics["success_rate"] < 0.9:
            advice.append("❌ Low success rate - check API status")
        if anomaly_score < -0.5:
            advice.append("🚨 Performance anomaly detected - investigate")
        if health_score < 0.6:
            advice.append("⚠️ Overall poor health - consider alternatives")

        return advice or ["βœ… API performing well"]

    def _calculate_trend(self, api_name: str) -> str:
        """Classify the slope of the last five health scores."""
        history = self.health_history.get(api_name)
        if not history or len(history) < 5:
            return "πŸ“Š Insufficient data"

        scores = [self._calculate_health_score(m) for m in history[-5:]]

        if len(scores) >= 3:
            # Least-squares slope over the recent window.
            slope = np.polyfit(range(len(scores)), scores, 1)[0]
            if slope > 0.02:
                return "πŸ“ˆ Improving"
            if slope < -0.02:
                return "πŸ“‰ Declining"
            return "➑️ Stable"

        return "πŸ“Š Monitoring"
570
+
571
# Initialize AI components once at import time. All three helpers stay None
# when the ML stack could not be imported; every call site checks for None.
ai_quality_assessor = AIDataQualityAssessor() if ML_AVAILABLE else None
semantic_analyzer = SemanticDataAnalyzer() if ML_AVAILABLE else None
health_monitor = APIHealthMonitor() if ML_AVAILABLE else None
580
+
581
  # Comprehensive API Discovery Configuration
582
  DEEP_API_CONFIG = {
583
  "Skolverket": {
 
1497
  def _save_harvested_data(self, api_name: str, endpoint_path: str, data: Any,
1498
  session_id: str, fetch_duration: int, record_count: int,
1499
  data_size: int, status: str = "success", error_message: str = None):
1500
+ """Save harvested data with AI-enhanced intelligent categorization"""
1501
  conn = sqlite3.connect(DB_PATH)
1502
  cursor = conn.cursor()
1503
 
 
1505
  data_str = json.dumps(data, sort_keys=True, default=str)
1506
  data_hash = hashlib.sha256(data_str.encode()).hexdigest()
1507
 
1508
+ # AI Quality Assessment
1509
+ quality_assessment = {}
1510
+ if ai_quality_assessor and status == "success":
1511
+ quality_assessment = ai_quality_assessor.assess_data_quality(data, api_name)
1512
+
1513
+ # Semantic Similarity Analysis
1514
+ similar_datasets = []
1515
+ if semantic_analyzer and status == "success":
1516
+ similar_datasets = semantic_analyzer.find_similar_datasets(data, api_name)
1517
+
1518
+ # API Health Monitoring
1519
+ health_info = {}
1520
+ if health_monitor:
1521
+ success_rate = 1.0 if status == "success" else 0.0
1522
+ health_info = health_monitor.monitor_api_health(
1523
+ api_name, fetch_duration, success_rate, data_size
1524
+ )
1525
+
1526
  try:
1527
  cursor.execute('''
1528
  INSERT OR REPLACE INTO harvested_data
1529
  (api_name, endpoint_path, data_hash, raw_data, processed_data,
1530
  record_count, data_size_bytes, fetch_duration_ms, status,
1531
+ error_message, session_id, quality_score, health_score, similar_datasets)
1532
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
1533
  ''', (
1534
  api_name, endpoint_path, data_hash, data_str,
1535
  json.dumps(data, default=str), record_count, data_size,
1536
+ fetch_duration, status, error_message, session_id,
1537
+ quality_assessment.get('ai_quality_score', 0.0),
1538
+ health_info.get('health_score', 0.0),
1539
+ json.dumps(similar_datasets[:3], default=str) # Top 3 similar datasets
1540
  ))
1541
 
1542
  conn.commit()
1543
+
1544
+ # Display AI insights if available
1545
+ if quality_assessment and st.session_state.get('show_ai_insights', True):
1546
+ self._display_ai_insights(api_name, quality_assessment, health_info, similar_datasets)
1547
+
1548
  except sqlite3.IntegrityError:
1549
  pass # Data already exists
1550
+ except sqlite3.OperationalError:
1551
+ # Handle case where AI columns don't exist yet - add them
1552
+ self._upgrade_database_schema()
1553
+ # Retry with basic data
1554
+ cursor.execute('''
1555
+ INSERT OR REPLACE INTO harvested_data
1556
+ (api_name, endpoint_path, data_hash, raw_data, processed_data,
1557
+ record_count, data_size_bytes, fetch_duration_ms, status,
1558
+ error_message, session_id)
1559
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
1560
+ ''', (
1561
+ api_name, endpoint_path, data_hash, data_str,
1562
+ json.dumps(data, default=str), record_count, data_size,
1563
+ fetch_duration, status, error_message, session_id
1564
+ ))
1565
+ conn.commit()
1566
+ finally:
1567
+ conn.close()
1568
+
1569
    def _display_ai_insights(self, api_name: str, quality_assessment: Dict,
                             health_info: Dict, similar_datasets: List[Dict]):
        """Render a collapsible Streamlit panel with AI-derived metrics.

        Shows quality grade/completeness, health status/trend, anomaly and
        similarity counts, then up to three recommendations and two similar
        datasets. No-op when *quality_assessment* is empty.
        """
        if quality_assessment:
            with st.expander(f"πŸ€– AI Insights for {api_name}", expanded=False):
                # Three-column metric layout: quality | health | anomalies.
                col1, col2, col3 = st.columns(3)

                with col1:
                    st.metric("Quality Grade", quality_assessment.get('overall_grade', 'N/A'))
                    st.metric("Completeness", f"{quality_assessment.get('completeness_score', 0):.2f}")

                with col2:
                    # Health metrics are optional (monitor may be disabled).
                    if health_info:
                        st.metric("Health Status", health_info.get('status', 'Unknown'))
                        st.metric("Performance Trend", health_info.get('trend', 'N/A'))

                with col3:
                    st.metric("Anomalies", quality_assessment.get('anomaly_count', 0))
                    if similar_datasets:
                        st.metric("Similar Datasets", len(similar_datasets))

                # Recommendations (top 3 only, to keep the panel compact).
                recommendations = quality_assessment.get('recommendations', [])
                if recommendations:
                    st.write("**🎯 Recommendations:**")
                    for rec in recommendations[:3]:
                        st.write(f"β€’ {rec}")

                # Similar datasets (top 2).
                if similar_datasets:
                    st.write("**πŸ” Similar Datasets Found:**")
                    for sim in similar_datasets[:2]:
                        st.write(f"β€’ {sim['dataset']} (similarity: {sim['similarity']:.2f})")
1603
+ def _upgrade_database_schema(self):
1604
+ """Upgrade database schema to include AI columns"""
1605
+ conn = sqlite3.connect(DB_PATH)
1606
+ cursor = conn.cursor()
1607
+
1608
+ try:
1609
+ # Add AI enhancement columns
1610
+ cursor.execute('ALTER TABLE harvested_data ADD COLUMN quality_score REAL DEFAULT 0.0')
1611
+ cursor.execute('ALTER TABLE harvested_data ADD COLUMN health_score REAL DEFAULT 0.0')
1612
+ cursor.execute('ALTER TABLE harvested_data ADD COLUMN similar_datasets TEXT DEFAULT "[]"')
1613
+ conn.commit()
1614
+ except sqlite3.OperationalError:
1615
+ pass # Columns already exist
1616
  finally:
1617
  conn.close()
1618
 
 
1802
  tab1, tab2, tab3 = st.tabs(["πŸ” Deep Discovery", "πŸ“Š Data Harvesting", "πŸ“ˆ Analytics"])
1803
 
1804
  with tab1:
1805
    # --- AI-enhanced discovery header ---
    st.markdown("**πŸ€– AI-Enhanced Deep Discovery - Find all endpoints with intelligent analysis**")

    # AI Settings: two side-by-side toggles for the AI features.
    col1, col2 = st.columns(2)
    with col1:
        enable_ai_insights = st.checkbox("πŸ€– Enable AI Quality Assessment", value=True, key="enable_ai")
    with col2:
        show_similarity = st.checkbox("πŸ” Show Semantic Similarity", value=True, key="enable_similarity")

    # Persist the toggle so _save_harvested_data can read it when saving.
    st.session_state['show_ai_insights'] = enable_ai_insights
1815
 
1816
  # API Selection for Discovery
1817
  selected_apis_discovery = st.multiselect(
 
2015
  finally:
2016
  conn.close()
2017
 
2018
# AI Enhancement Panel: shows which AI subsystems are active.
if ML_AVAILABLE:
    st.markdown("---")
    with st.expander("πŸ€– AI Enhancement Status", expanded=False):
        col1, col2, col3 = st.columns(3)

        with col1:
            st.markdown("**🎯 Quality Assessment**")
            if ai_quality_assessor and ai_quality_assessor.quality_model:
                st.success("βœ… Active - DistilBERT")
            else:
                st.error("❌ Not Available")

        with col2:
            st.markdown("**πŸ” Semantic Analysis**")
            if semantic_analyzer and semantic_analyzer.embeddings_model:
                st.success("βœ… Active - MiniLM-L6-v2")
            else:
                st.error("❌ Not Available")

        with col3:
            st.markdown("**πŸ“Š Health Monitoring**")
            if health_monitor:
                st.success("βœ… Active - Isolation Forest")
            else:
                st.error("❌ Not Available")

        # NOTE(review): hasattr() is always True here because __init__ sets the
        # attribute unconditionally; likely intended to check the truthiness of
        # ai_quality_assessor.quality_model instead — confirm before changing.
        if ai_quality_assessor and hasattr(ai_quality_assessor, 'quality_model'):
            st.info("πŸ’‘ AI models are loaded and ready for enhanced data analysis!")
2047
+
2048
# Footer: static HTML banner rendered with unsafe_allow_html (trusted literal).
st.markdown("---")
st.markdown("""
<div style="text-align: center; padding: 1rem; opacity: 0.9;">
<p><strong>πŸš€ Ultimate Data Harvester with AI</strong> - Deep discovery, session resumption, intelligent storage</p>
<p style="font-size: 0.9rem;">
πŸ” Recursive endpoint discovery β€’ πŸ€– AI quality assessment β€’ 🎯 Session management β€’ πŸ’Ύ Smart database storage β€’ πŸ“Š Real-time analytics
</p>
</div>
""", unsafe_allow_html=True)