#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
LIAR Benchmark via Hugging Face Space API
==========================================

Runs the LIAR benchmark against the remote SysCRED instance on HF Space.
This uses the full ML pipeline (PyTorch, Transformers) running in the cloud.

Usage:
    python run_liar_benchmark_remote.py --sample 100
    python run_liar_benchmark_remote.py --split test --url https://your-space.hf.space

(c) Dominique S. Loyer - PhD Thesis Prototype
"""

import argparse
import json
import time
import sys
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Any, Optional

import requests

try:
    import pandas as pd
    HAS_PANDAS = True
except ImportError:
    HAS_PANDAS = False

try:
    from sklearn.metrics import (
        accuracy_score, precision_score, recall_score, f1_score,
        confusion_matrix, classification_report
    )
    HAS_SKLEARN = True
except ImportError:
    HAS_SKLEARN = False
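
# Note: pandas and scikit-learn are optional dependencies. Without pandas,
# per-statement results are not exported to CSV (see save_results); without
# scikit-learn, only a plain binary accuracy is computed (see _calculate_metrics).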

# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from syscred.liar_dataset import LIARDataset, LiarStatement


class RemoteLIARBenchmark:
    """
    Benchmark runner using remote HF Space API.
    """

    # Default HF Space URL
    DEFAULT_API_URL = "https://domloyer-syscred.hf.space"
    SYSCRED_THRESHOLD = 0.5  # Below = Fake, Above = Real
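
    # Fields read from the /api/verify JSON response (see evaluate_statement):
    # 'scoreCredibilite' (credibility score compared against the 0.35/0.5/0.65
    # thresholds used below), 'analyseNLP' (dict whose 'sentiment' key is set
    # when the ML pipeline ran), and 'graphRAG' (dict with 'context_score'
    # and 'has_history').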

    def __init__(
        self,
        api_url: Optional[str] = None,
        data_dir: Optional[str] = None,
        timeout: int = 60
    ):
        """
        Initialize remote benchmark.

        Args:
            api_url: HF Space API URL
            data_dir: Path to LIAR dataset
            timeout: Request timeout in seconds
        """
        print("=" * 60)
        print("SysCRED LIAR Benchmark (Remote HF Space)")
        print("=" * 60)

        self.api_url = (api_url or self.DEFAULT_API_URL).rstrip('/')
        self.timeout = timeout

        # Test connection
        print(f"\n[Remote] API URL: {self.api_url}")
        self._test_connection()

        # Load dataset
        self.dataset = LIARDataset(data_dir)
        self.results: List[Dict] = []
        print("[Remote] Ready.\n")

    def _test_connection(self):
        """Test API connectivity."""
        try:
            response = requests.get(f"{self.api_url}/api/health", timeout=10)
            if response.status_code == 200:
                print("[Remote] ✓ API connection successful")
            else:
                print(f"[Remote] ⚠ API returned status {response.status_code}")
        except requests.exceptions.ConnectionError:
            print("[Remote] ⚠ Could not connect to API (may be sleeping)")
            print("[Remote] The first request will wake it up...")
        except Exception as e:
            print(f"[Remote] ⚠ Connection test failed: {e}")

    def _call_api(self, text: str) -> Dict[str, Any]:
        """Call the SysCRED API."""
        try:
            response = requests.post(
                f"{self.api_url}/api/verify",
                json={"input": text},
                timeout=self.timeout,
                headers={"Content-Type": "application/json"}
            )
            if response.status_code == 200:
                return response.json()
            else:
                return {"error": f"HTTP {response.status_code}: {response.text[:100]}"}
        except requests.exceptions.Timeout:
            return {"error": "Request timeout"}
        except requests.exceptions.ConnectionError:
            return {"error": "Connection error"}
        except Exception as e:
            return {"error": str(e)}

    def _syscred_to_binary(self, score: float) -> str:
        """Convert SysCRED score to binary label."""
        return "Real" if score >= self.SYSCRED_THRESHOLD else "Fake"

    def _syscred_to_ternary(self, score: float) -> str:
        """Convert SysCRED score to ternary label."""
        if score >= 0.65:
            return "True"
        elif score >= 0.35:
            return "Mixed"
        else:
            return "False"

    def evaluate_statement(self, statement: LiarStatement) -> Dict[str, Any]:
        """Evaluate a single statement via API."""
        start_time = time.time()

        result = {
            'id': statement.id,
            'statement': statement.statement[:200],
            'ground_truth_6way': statement.label.name,
            'ground_truth_binary': statement.binary_label,
            'ground_truth_ternary': statement.ternary_label,
            'speaker': statement.speaker,
            'party': statement.party,
            'syscred_score': 0.5,
            'predicted_binary': 'Unknown',
            'predicted_ternary': 'Unknown',
            'binary_correct': False,
            'ternary_correct': False,
            'processing_time': 0,
            'error': None,
            'ml_used': False
        }

        # Call remote API
        api_result = self._call_api(statement.statement)

        if 'error' not in api_result:
            score = api_result.get('scoreCredibilite', 0.5)
            result['syscred_score'] = score
            result['predicted_binary'] = self._syscred_to_binary(score)
            result['predicted_ternary'] = self._syscred_to_ternary(score)
            result['binary_correct'] = (result['predicted_binary'] == result['ground_truth_binary'])
            result['ternary_correct'] = (result['predicted_ternary'] == result['ground_truth_ternary'])

            # Check if ML was used
            nlp = api_result.get('analyseNLP', {})
            result['ml_used'] = nlp.get('sentiment') is not None

            # GraphRAG info
            graphrag = api_result.get('graphRAG', {})
            result['graph_context_score'] = graphrag.get('context_score')
            result['graph_has_history'] = graphrag.get('has_history', False)
        else:
            result['error'] = api_result['error']

        result['processing_time'] = time.time() - start_time
        return result

    def run_benchmark(
        self,
        split: str = "test",
        sample_size: Optional[int] = None,
        verbose: bool = False
    ) -> Dict[str, Any]:
        """Run benchmark against remote API."""
        print(f"\n[Remote] Running on {split} split via HF Space API...")

        statements = self.dataset.load_split(split)

        if sample_size:
            import random
            statements = random.sample(statements, min(sample_size, len(statements)))
            print(f"[Remote] Using sample of {len(statements)} statements")

        total = len(statements)
        self.results = []
        ml_used_count = 0
        start_time = time.time()

        for i, stmt in enumerate(statements):
            if verbose or (i + 1) % 10 == 0:
                print(f"[{i+1}/{total}] Processing: {stmt.statement[:50]}...")

            result = self.evaluate_statement(stmt)
            self.results.append(result)

            if result.get('ml_used'):
                ml_used_count += 1

            if verbose and not result.get('error'):
                symbol = "✅" if result['binary_correct'] else "❌"
                ml = "🧠" if result['ml_used'] else "📊"
                print(f" -> Score: {result['syscred_score']:.2f} {ml} | "
                      f"Pred: {result['predicted_binary']} | "
                      f"True: {result['ground_truth_binary']} {symbol}")

            # Rate limiting - be nice to the API
            if i < total - 1:
                time.sleep(0.5)

        elapsed = time.time() - start_time

        metrics = self._calculate_metrics()
        metrics['elapsed_time'] = elapsed
        metrics['statements_per_second'] = total / elapsed if elapsed > 0 else 0
        metrics['ml_used_percentage'] = (ml_used_count / total * 100) if total > 0 else 0
        metrics['api_url'] = self.api_url

        return metrics

    def _calculate_metrics(self) -> Dict[str, Any]:
        """Calculate evaluation metrics."""
        if not self.results:
            return {'error': 'No results'}

        valid_results = [r for r in self.results if r['error'] is None]
        error_count = len(self.results) - len(valid_results)

        if not valid_results:
            return {'error': 'All evaluations failed'}

        metrics = {
            'total_statements': len(self.results),
            'successful_evaluations': len(valid_results),
            'error_count': error_count,
            'error_rate': error_count / len(self.results)
        }

        y_true_binary = [r['ground_truth_binary'] for r in valid_results]
        y_pred_binary = [r['predicted_binary'] for r in valid_results]
        y_true_ternary = [r['ground_truth_ternary'] for r in valid_results]
        y_pred_ternary = [r['predicted_ternary'] for r in valid_results]

        if HAS_SKLEARN:
            metrics['binary'] = {
                'accuracy': accuracy_score(y_true_binary, y_pred_binary),
                'precision': precision_score(y_true_binary, y_pred_binary, pos_label='Fake', zero_division=0),
                'recall': recall_score(y_true_binary, y_pred_binary, pos_label='Fake', zero_division=0),
                'f1': f1_score(y_true_binary, y_pred_binary, pos_label='Fake', zero_division=0),
                'confusion_matrix': confusion_matrix(y_true_binary, y_pred_binary, labels=['Fake', 'Real']).tolist()
            }
            metrics['ternary'] = {
                'accuracy': accuracy_score(y_true_ternary, y_pred_ternary),
                'macro_f1': f1_score(y_true_ternary, y_pred_ternary, average='macro', zero_division=0),
            }
        else:
            correct_binary = sum(1 for r in valid_results if r['binary_correct'])
            metrics['binary'] = {'accuracy': correct_binary / len(valid_results)}

        scores = [r['syscred_score'] for r in valid_results]
        metrics['score_distribution'] = {
            'mean': sum(scores) / len(scores),
            'min': min(scores),
            'max': max(scores),
        }

        return metrics

    def print_results(self, metrics: Dict[str, Any]) -> None:
        """Print benchmark results."""
        print("\n" + "=" * 60)
        print("LIAR BENCHMARK RESULTS (Remote HF Space)")
        print("=" * 60)

        print(f"\n🌐 API: {metrics.get('api_url', 'N/A')}")
        print(f"🧠 ML Models Used: {metrics.get('ml_used_percentage', 0):.1f}%")

        print(f"\n📊 Overview:")
        print(f" Total: {metrics.get('total_statements', 0)}")
        print(f" Success: {metrics.get('successful_evaluations', 0)}")
        print(f" Errors: {metrics.get('error_count', 0)}")
        print(f" Time: {metrics.get('elapsed_time', 0):.1f}s")

        if 'binary' in metrics:
            print(f"\n📈 Binary Classification:")
            b = metrics['binary']
            print(f" Accuracy: {b.get('accuracy', 0):.2%}")
            print(f" Precision: {b.get('precision', 0):.2%}")
            print(f" Recall: {b.get('recall', 0):.2%}")
            print(f" F1-Score: {b.get('f1', 0):.2f}")

        print("\n" + "=" * 60)

    def save_results(self, output_path: str, metrics: Dict[str, Any]) -> None:
        """Save results."""
        output = Path(output_path)
        output.parent.mkdir(parents=True, exist_ok=True)

        if HAS_PANDAS and self.results:
            df = pd.DataFrame(self.results)
            csv_path = output.with_suffix('.csv')
            df.to_csv(csv_path, index=False)
            print(f"[Remote] Results: {csv_path}")

        json_path = output.with_suffix('.json')
        with open(json_path, 'w') as f:
            json.dump({
                'timestamp': datetime.now().isoformat(),
                'dataset': 'LIAR',
                'mode': 'remote',
                'metrics': metrics
            }, f, indent=2, default=str)
        print(f"[Remote] Metrics: {json_path}")


def main():
    parser = argparse.ArgumentParser(description='LIAR benchmark via HF Space API')
    parser.add_argument('--url', type=str, default=None,
                        help='HF Space API URL')
    parser.add_argument('--split', type=str, default='test',
                        choices=['train', 'valid', 'test'])
    parser.add_argument('--sample', type=int, default=None,
                        help='Number of statements to sample')
    parser.add_argument('--data-dir', type=str, default=None)
    parser.add_argument('--output', type=str, default=None)
    parser.add_argument('--verbose', '-v', action='store_true')
    parser.add_argument('--timeout', type=int, default=60)
    args = parser.parse_args()

    benchmark = RemoteLIARBenchmark(
        api_url=args.url,
        data_dir=args.data_dir,
        timeout=args.timeout
    )

    try:
        metrics = benchmark.run_benchmark(
            split=args.split,
            sample_size=args.sample,
            verbose=args.verbose
        )
        benchmark.print_results(metrics)

        output = args.output or f"liar_benchmark_remote_{args.split}.csv"
        benchmark.save_results(output, metrics)
    except FileNotFoundError as e:
        print(f"\n❌ Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
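
# Example run (output filenames derive from --output or the split name):
#   python run_liar_benchmark_remote.py --split test --sample 50 --verbose
# writes liar_benchmark_remote_test.csv (per-statement rows, if pandas is
# installed) and liar_benchmark_remote_test.json (aggregate metrics).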