syscred_duplicate

Sleeping

File size: 13,651 Bytes

8e97fc5

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
LIAR Benchmark via Hugging Face Space API
==========================================
Runs the LIAR benchmark against the remote SysCRED instance on HF Space.
This uses the full ML pipeline (PyTorch, Transformers) running in the cloud.

Usage:
    python run_liar_benchmark_remote.py --sample 100
    python run_liar_benchmark_remote.py --split test --url https://your-space.hf.space

(c) Dominique S. Loyer - PhD Thesis Prototype
"""

import argparse
import json
import time
import sys
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Any, Optional
import requests

try:
    import pandas as pd
    HAS_PANDAS = True
except ImportError:
    HAS_PANDAS = False

try:
    from sklearn.metrics import (
        accuracy_score, precision_score, recall_score, f1_score,
        confusion_matrix, classification_report
    )
    HAS_SKLEARN = True
except ImportError:
    HAS_SKLEARN = False

# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from syscred.liar_dataset import LIARDataset, LiarStatement


class RemoteLIARBenchmark:
    """
    Benchmark runner using remote HF Space API.
    """
    
    # Default HF Space URL
    DEFAULT_API_URL = "https://domloyer-syscred.hf.space"
    
    SYSCRED_THRESHOLD = 0.5  # Below = Fake, Above = Real
    
    def __init__(
        self,
        api_url: Optional[str] = None,
        data_dir: Optional[str] = None,
        timeout: int = 60
    ):
        """
        Initialize remote benchmark.
        
        Args:
            api_url: HF Space API URL
            data_dir: Path to LIAR dataset
            timeout: Request timeout in seconds
        """
        print("=" * 60)
        print("SysCRED LIAR Benchmark (Remote HF Space)")
        print("=" * 60)
        
        self.api_url = (api_url or self.DEFAULT_API_URL).rstrip('/')
        self.timeout = timeout
        
        # Test connection
        print(f"\n[Remote] API URL: {self.api_url}")
        self._test_connection()
        
        # Load dataset
        self.dataset = LIARDataset(data_dir)
        self.results: List[Dict] = []
        
        print("[Remote] Ready.\n")
    
    def _test_connection(self):
        """Test API connectivity."""
        try:
            response = requests.get(f"{self.api_url}/api/health", timeout=10)
            if response.status_code == 200:
                print("[Remote] ✓ API connection successful")
            else:
                print(f"[Remote] ⚠ API returned status {response.status_code}")
        except requests.exceptions.ConnectionError:
            print("[Remote] ⚠ Could not connect to API (may be sleeping)")
            print("[Remote] The first request will wake it up...")
        except Exception as e:
            print(f"[Remote] ⚠ Connection test failed: {e}")
    
    def _call_api(self, text: str) -> Dict[str, Any]:
        """Call the SysCRED API."""
        try:
            response = requests.post(
                f"{self.api_url}/api/verify",
                json={"input": text},
                timeout=self.timeout,
                headers={"Content-Type": "application/json"}
            )
            
            if response.status_code == 200:
                return response.json()
            else:
                return {"error": f"HTTP {response.status_code}: {response.text[:100]}"}
                
        except requests.exceptions.Timeout:
            return {"error": "Request timeout"}
        except requests.exceptions.ConnectionError:
            return {"error": "Connection error"}
        except Exception as e:
            return {"error": str(e)}
    
    def _syscred_to_binary(self, score: float) -> str:
        """Convert SysCRED score to binary label."""
        return "Real" if score >= self.SYSCRED_THRESHOLD else "Fake"
    
    def _syscred_to_ternary(self, score: float) -> str:
        """Convert SysCRED score to ternary label."""
        if score >= 0.65:
            return "True"
        elif score >= 0.35:
            return "Mixed"
        else:
            return "False"
    
    def evaluate_statement(self, statement: LiarStatement) -> Dict[str, Any]:
        """Evaluate a single statement via API."""
        start_time = time.time()
        
        result = {
            'id': statement.id,
            'statement': statement.statement[:200],
            'ground_truth_6way': statement.label.name,
            'ground_truth_binary': statement.binary_label,
            'ground_truth_ternary': statement.ternary_label,
            'speaker': statement.speaker,
            'party': statement.party,
            'syscred_score': 0.5,
            'predicted_binary': 'Unknown',
            'predicted_ternary': 'Unknown',
            'binary_correct': False,
            'ternary_correct': False,
            'processing_time': 0,
            'error': None,
            'ml_used': False
        }
        
        # Call remote API
        api_result = self._call_api(statement.statement)
        
        if 'error' not in api_result:
            score = api_result.get('scoreCredibilite', 0.5)
            result['syscred_score'] = score
            result['predicted_binary'] = self._syscred_to_binary(score)
            result['predicted_ternary'] = self._syscred_to_ternary(score)
            
            result['binary_correct'] = (result['predicted_binary'] == result['ground_truth_binary'])
            result['ternary_correct'] = (result['predicted_ternary'] == result['ground_truth_ternary'])
            
            # Check if ML was used
            nlp = api_result.get('analyseNLP', {})
            result['ml_used'] = nlp.get('sentiment') is not None
            
            # GraphRAG info
            graphrag = api_result.get('graphRAG', {})
            result['graph_context_score'] = graphrag.get('context_score')
            result['graph_has_history'] = graphrag.get('has_history', False)
        else:
            result['error'] = api_result['error']
        
        result['processing_time'] = time.time() - start_time
        
        return result
    
    def run_benchmark(
        self,
        split: str = "test",
        sample_size: Optional[int] = None,
        verbose: bool = False
    ) -> Dict[str, Any]:
        """Run benchmark against remote API."""
        print(f"\n[Remote] Running on {split} split via HF Space API...")
        
        statements = self.dataset.load_split(split)
        
        if sample_size:
            import random
            statements = random.sample(statements, min(sample_size, len(statements)))
            print(f"[Remote] Using sample of {len(statements)} statements")
        
        total = len(statements)
        self.results = []
        ml_used_count = 0
        
        start_time = time.time()
        
        for i, stmt in enumerate(statements):
            if verbose or (i + 1) % 10 == 0:
                print(f"[{i+1}/{total}] Processing: {stmt.statement[:50]}...")
            
            result = self.evaluate_statement(stmt)
            self.results.append(result)
            
            if result.get('ml_used'):
                ml_used_count += 1
            
            if verbose and not result.get('error'):
                symbol = "✅" if result['binary_correct'] else "❌"
                ml = "🧠" if result['ml_used'] else "📊"
                print(f"  -> Score: {result['syscred_score']:.2f} {ml} | "
                      f"Pred: {result['predicted_binary']} | "
                      f"True: {result['ground_truth_binary']} {symbol}")
            
            # Rate limiting - be nice to the API
            if i < total - 1:
                time.sleep(0.5)
        
        elapsed = time.time() - start_time
        
        metrics = self._calculate_metrics()
        metrics['elapsed_time'] = elapsed
        metrics['statements_per_second'] = total / elapsed if elapsed > 0 else 0
        metrics['ml_used_percentage'] = (ml_used_count / total * 100) if total > 0 else 0
        metrics['api_url'] = self.api_url
        
        return metrics
    
    def _calculate_metrics(self) -> Dict[str, Any]:
        """Calculate evaluation metrics."""
        if not self.results:
            return {'error': 'No results'}
        
        valid_results = [r for r in self.results if r['error'] is None]
        error_count = len(self.results) - len(valid_results)
        
        if not valid_results:
            return {'error': 'All evaluations failed'}
        
        metrics = {
            'total_statements': len(self.results),
            'successful_evaluations': len(valid_results),
            'error_count': error_count,
            'error_rate': error_count / len(self.results)
        }
        
        y_true_binary = [r['ground_truth_binary'] for r in valid_results]
        y_pred_binary = [r['predicted_binary'] for r in valid_results]
        
        y_true_ternary = [r['ground_truth_ternary'] for r in valid_results]
        y_pred_ternary = [r['predicted_ternary'] for r in valid_results]
        
        if HAS_SKLEARN:
            metrics['binary'] = {
                'accuracy': accuracy_score(y_true_binary, y_pred_binary),
                'precision': precision_score(y_true_binary, y_pred_binary, pos_label='Fake', zero_division=0),
                'recall': recall_score(y_true_binary, y_pred_binary, pos_label='Fake', zero_division=0),
                'f1': f1_score(y_true_binary, y_pred_binary, pos_label='Fake', zero_division=0),
                'confusion_matrix': confusion_matrix(y_true_binary, y_pred_binary, labels=['Fake', 'Real']).tolist()
            }
            
            metrics['ternary'] = {
                'accuracy': accuracy_score(y_true_ternary, y_pred_ternary),
                'macro_f1': f1_score(y_true_ternary, y_pred_ternary, average='macro', zero_division=0),
            }
        else:
            correct_binary = sum(1 for r in valid_results if r['binary_correct'])
            metrics['binary'] = {'accuracy': correct_binary / len(valid_results)}
        
        scores = [r['syscred_score'] for r in valid_results]
        metrics['score_distribution'] = {
            'mean': sum(scores) / len(scores),
            'min': min(scores),
            'max': max(scores),
        }
        
        return metrics
    
    def print_results(self, metrics: Dict[str, Any]) -> None:
        """Print benchmark results."""
        print("\n" + "=" * 60)
        print("LIAR BENCHMARK RESULTS (Remote HF Space)")
        print("=" * 60)
        
        print(f"\n🌐 API: {metrics.get('api_url', 'N/A')}")
        print(f"🧠 ML Models Used: {metrics.get('ml_used_percentage', 0):.1f}%")
        
        print(f"\n📊 Overview:")
        print(f"  Total: {metrics.get('total_statements', 0)}")
        print(f"  Success: {metrics.get('successful_evaluations', 0)}")
        print(f"  Errors: {metrics.get('error_count', 0)}")
        print(f"  Time: {metrics.get('elapsed_time', 0):.1f}s")
        
        if 'binary' in metrics:
            print(f"\n📈 Binary Classification:")
            b = metrics['binary']
            print(f"  Accuracy:  {b.get('accuracy', 0):.2%}")
            print(f"  Precision: {b.get('precision', 0):.2%}")
            print(f"  Recall:    {b.get('recall', 0):.2%}")
            print(f"  F1-Score:  {b.get('f1', 0):.2f}")
        
        print("\n" + "=" * 60)
    
    def save_results(self, output_path: str, metrics: Dict[str, Any]) -> None:
        """Save results."""
        output = Path(output_path)
        output.parent.mkdir(parents=True, exist_ok=True)
        
        if HAS_PANDAS and self.results:
            df = pd.DataFrame(self.results)
            csv_path = output.with_suffix('.csv')
            df.to_csv(csv_path, index=False)
            print(f"[Remote] Results: {csv_path}")
        
        json_path = output.with_suffix('.json')
        with open(json_path, 'w') as f:
            json.dump({
                'timestamp': datetime.now().isoformat(),
                'dataset': 'LIAR',
                'mode': 'remote',
                'metrics': metrics
            }, f, indent=2, default=str)
        print(f"[Remote] Metrics: {json_path}")


def main():
    parser = argparse.ArgumentParser(description='LIAR benchmark via HF Space API')
    parser.add_argument('--url', type=str, default=None,
                       help='HF Space API URL')
    parser.add_argument('--split', type=str, default='test',
                       choices=['train', 'valid', 'test'])
    parser.add_argument('--sample', type=int, default=None,
                       help='Number of statements to sample')
    parser.add_argument('--data-dir', type=str, default=None)
    parser.add_argument('--output', type=str, default=None)
    parser.add_argument('--verbose', '-v', action='store_true')
    parser.add_argument('--timeout', type=int, default=60)
    
    args = parser.parse_args()
    
    benchmark = RemoteLIARBenchmark(
        api_url=args.url,
        data_dir=args.data_dir,
        timeout=args.timeout
    )
    
    try:
        metrics = benchmark.run_benchmark(
            split=args.split,
            sample_size=args.sample,
            verbose=args.verbose
        )
        
        benchmark.print_results(metrics)
        
        output = args.output or f"liar_benchmark_remote_{args.split}.csv"
        benchmark.save_results(output, metrics)
        
    except FileNotFoundError as e:
        print(f"\n❌ Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()