#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
LIAR Benchmark Runner - SysCRED
================================

Scientific evaluation of SysCRED on the LIAR benchmark dataset.

Usage:
    python run_liar_benchmark.py --split test
    python run_liar_benchmark.py --sample 100 --verbose
    python run_liar_benchmark.py --split test --output results/liar_benchmark.csv

(c) Dominique S. Loyer - PhD Thesis Prototype
"""

import argparse
import json
import time
import sys
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Any, Optional
from collections import Counter

# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

try:
    import pandas as pd
    HAS_PANDAS = True
except ImportError:
    HAS_PANDAS = False
    print("[Warning] pandas not installed. CSV export will be limited.")

try:
    from sklearn.metrics import (
        accuracy_score, precision_score, recall_score, f1_score,
        confusion_matrix, classification_report
    )
    HAS_SKLEARN = True
except ImportError:
    HAS_SKLEARN = False
    print("[Warning] sklearn not installed. Using basic metrics.")

from syscred.liar_dataset import LIARDataset, LiarStatement
from syscred.verification_system import CredibilityVerificationSystem
from syscred import config


class LIARBenchmark:
    """
    Benchmark runner for evaluating SysCRED on the LIAR dataset.
    """

    # Map a SysCRED score to a binary label
    SYSCRED_THRESHOLD = 0.5  # score >= threshold -> Real, otherwise Fake

    def __init__(
        self,
        data_dir: Optional[str] = None,
        load_ml: bool = True,
        use_graphrag: bool = True
    ):
        """
        Initialize the benchmark.

        Args:
            data_dir: Path to the LIAR dataset directory
            load_ml: Whether to load ML models
            use_graphrag: Whether to use GraphRAG context
        """
        print("=" * 60)
        print("SysCRED LIAR Benchmark Runner")
        print("=" * 60)

        # Load dataset
        self.dataset = LIARDataset(data_dir)

        # Initialize SysCRED
        print("\n[Benchmark] Initializing SysCRED...")
        self.system = CredibilityVerificationSystem(
            ontology_base_path=str(config.Config.ONTOLOGY_BASE_PATH),
            ontology_data_path=str(config.Config.ONTOLOGY_DATA_PATH),
            load_ml_models=load_ml,
            google_api_key=config.Config.GOOGLE_FACT_CHECK_API_KEY
        )
        self.use_graphrag = use_graphrag
        self.results: List[Dict] = []
        print("[Benchmark] System ready.\n")

    def _syscred_to_binary(self, score: float) -> str:
        """Convert a SysCRED score to a binary label."""
        return "Real" if score >= self.SYSCRED_THRESHOLD else "Fake"

    def _syscred_to_ternary(self, score: float) -> str:
        """Convert a SysCRED score to a ternary label."""
        if score >= 0.65:
            return "True"
        elif score >= 0.35:
            return "Mixed"
        else:
            return "False"

    def evaluate_statement(self, statement: LiarStatement) -> Dict[str, Any]:
        """
        Evaluate a single statement.

        Args:
            statement: LiarStatement to evaluate

        Returns:
            Result dictionary with the prediction and ground truth
        """
        start_time = time.time()

        result = {
            'id': statement.id,
            'statement': statement.statement[:200],
            'ground_truth_6way': statement.label.name,
            'ground_truth_binary': statement.binary_label,
            'ground_truth_ternary': statement.ternary_label,
            'speaker': statement.speaker,
            'party': statement.party,
            'syscred_score': 0.5,
            'predicted_binary': 'Unknown',
            'predicted_ternary': 'Unknown',
            'binary_correct': False,
            'ternary_correct': False,
            'processing_time': 0,
            'error': None
        }

        try:
            # Run SysCRED analysis on the statement text
            # Note: LIAR statements are short claims, not URLs
            report = self.system.verify_information(statement.statement)

            if 'error' not in report:
                score = report.get('scoreCredibilite', 0.5)
                result['syscred_score'] = score
                result['predicted_binary'] = self._syscred_to_binary(score)
                result['predicted_ternary'] = self._syscred_to_ternary(score)

                # Check correctness
                result['binary_correct'] = (result['predicted_binary'] == result['ground_truth_binary'])
                result['ternary_correct'] = (result['predicted_ternary'] == result['ground_truth_ternary'])

                # Add extra details if available
                if 'analyseNLP' in report:
                    result['sentiment'] = report['analyseNLP'].get('sentiment', {})
                    result['bias'] = report['analyseNLP'].get('bias_analysis', {})
            else:
                result['error'] = report['error']
        except Exception as e:
            result['error'] = str(e)

        result['processing_time'] = time.time() - start_time
        return result

    def run_benchmark(
        self,
        split: str = "test",
        sample_size: Optional[int] = None,
        verbose: bool = False
    ) -> Dict[str, Any]:
        """
        Run the full benchmark on a dataset split.

        Args:
            split: 'train', 'valid', or 'test'
            sample_size: If set, only evaluate this many statements
            verbose: Print progress for each statement

        Returns:
            Dictionary with metrics and detailed results
        """
        print(f"\n[Benchmark] Running on {split} split...")

        # Load dataset
        statements = self.dataset.load_split(split)

        if sample_size:
            import random
            statements = random.sample(statements, min(sample_size, len(statements)))
            print(f"[Benchmark] Using sample of {len(statements)} statements")

        total = len(statements)
        self.results = []

        # Progress tracking
        start_time = time.time()

        for i, stmt in enumerate(statements):
            if verbose or (i + 1) % 50 == 0:
                print(f"[{i+1}/{total}] Processing: {stmt.statement[:50]}...")

            result = self.evaluate_statement(stmt)
            self.results.append(result)

            if verbose:
                symbol = "✅" if result['binary_correct'] else "❌"
                print(f"  -> Score: {result['syscred_score']:.2f} | "
                      f"Pred: {result['predicted_binary']} | "
                      f"True: {result['ground_truth_binary']} {symbol}")

        elapsed = time.time() - start_time

        # Calculate metrics
        metrics = self._calculate_metrics()
        metrics['elapsed_time'] = elapsed
        metrics['statements_per_second'] = total / elapsed if elapsed > 0 else 0

        return metrics

    def _calculate_metrics(self) -> Dict[str, Any]:
        """Calculate evaluation metrics from results."""
        if not self.results:
            return {'error': 'No results to evaluate'}

        # Filter successful evaluations
        valid_results = [r for r in self.results if r['error'] is None]
        error_count = len(self.results) - len(valid_results)

        if not valid_results:
            return {'error': 'All evaluations failed'}

        metrics = {
            'total_statements': len(self.results),
            'successful_evaluations': len(valid_results),
            'error_count': error_count,
            'error_rate': error_count / len(self.results)
        }

        # Extract labels
        y_true_binary = [r['ground_truth_binary'] for r in valid_results]
        y_pred_binary = [r['predicted_binary'] for r in valid_results]
        y_true_ternary = [r['ground_truth_ternary'] for r in valid_results]
        y_pred_ternary = [r['predicted_ternary'] for r in valid_results]

        # Binary metrics
        if HAS_SKLEARN:
            metrics['binary'] = {
                'accuracy': accuracy_score(y_true_binary, y_pred_binary),
                'precision': precision_score(y_true_binary, y_pred_binary, pos_label='Fake', zero_division=0),
                'recall': recall_score(y_true_binary, y_pred_binary, pos_label='Fake', zero_division=0),
                'f1': f1_score(y_true_binary, y_pred_binary, pos_label='Fake', zero_division=0),
                'confusion_matrix': confusion_matrix(y_true_binary, y_pred_binary, labels=['Fake', 'Real']).tolist()
            }
            metrics['ternary'] = {
                'accuracy': accuracy_score(y_true_ternary, y_pred_ternary),
                'macro_f1': f1_score(y_true_ternary, y_pred_ternary, average='macro', zero_division=0),
                'confusion_matrix': confusion_matrix(y_true_ternary, y_pred_ternary,
                                                     labels=['False', 'Mixed', 'True']).tolist()
            }

            # Detailed classification report
            metrics['classification_report'] = classification_report(
                y_true_binary, y_pred_binary,
                target_names=['Fake', 'Real'],
                output_dict=True
            )
        else:
            # Basic metrics without sklearn
            correct_binary = sum(1 for r in valid_results if r['binary_correct'])
            correct_ternary = sum(1 for r in valid_results if r['ternary_correct'])
            metrics['binary'] = {
                'accuracy': correct_binary / len(valid_results),
                'correct': correct_binary,
                'incorrect': len(valid_results) - correct_binary
            }
            metrics['ternary'] = {
                'accuracy': correct_ternary / len(valid_results),
                'correct': correct_ternary,
                'incorrect': len(valid_results) - correct_ternary
            }

        # Score distribution
        scores = [r['syscred_score'] for r in valid_results]
        metrics['score_distribution'] = {
            'mean': sum(scores) / len(scores),
            'min': min(scores),
            'max': max(scores),
            'median': sorted(scores)[len(scores) // 2]
        }

        # Per-party analysis
        party_results = {}
        for party in ['republican', 'democrat']:
            # Guard against missing party metadata (None or empty string)
            party_items = [r for r in valid_results if (r['party'] or '').lower() == party]
            if party_items:
                party_correct = sum(1 for r in party_items if r['binary_correct'])
                party_results[party] = {
                    'count': len(party_items),
                    'accuracy': party_correct / len(party_items)
                }
        metrics['per_party'] = party_results

        return metrics

    def print_results(self, metrics: Dict[str, Any]) -> None:
        """Pretty-print benchmark results."""
        print("\n" + "=" * 60)
        print("LIAR BENCHMARK RESULTS")
        print("=" * 60)

        print(f"\n📊 Overview:")
        print(f"   Total Statements: {metrics.get('total_statements', 0)}")
        print(f"   Successful:       {metrics.get('successful_evaluations', 0)}")
        print(f"   Errors:           {metrics.get('error_count', 0)} ({metrics.get('error_rate', 0):.1%})")
        print(f"   Processing Time:  {metrics.get('elapsed_time', 0):.1f}s")
        print(f"   Speed:            {metrics.get('statements_per_second', 0):.2f} stmt/sec")

        if 'binary' in metrics:
            print(f"\n📈 Binary Classification (Fake vs Real):")
            b = metrics['binary']
            print(f"   Accuracy:  {b.get('accuracy', 0):.2%}")
            print(f"   Precision: {b.get('precision', 0):.2%}")
            print(f"   Recall:    {b.get('recall', 0):.2%}")
            print(f"   F1-Score:  {b.get('f1', 0):.2f}")

            if 'confusion_matrix' in b:
                cm = b['confusion_matrix']
                print(f"\n   Confusion Matrix:")
                print(f"              Pred Fake  Pred Real")
                print(f"   True Fake  {cm[0][0]:5d}      {cm[0][1]:5d}")
                print(f"   True Real  {cm[1][0]:5d}      {cm[1][1]:5d}")

        if 'ternary' in metrics:
            print(f"\n📊 Ternary Classification (False/Mixed/True):")
            t = metrics['ternary']
            print(f"   Accuracy: {t.get('accuracy', 0):.2%}")
            print(f"   Macro F1: {t.get('macro_f1', 0):.2f}")

        if 'per_party' in metrics:
            print(f"\n🏛️ Per-Party Analysis:")
            for party, data in metrics['per_party'].items():
                print(f"   {party.capitalize()}: {data['accuracy']:.2%} accuracy ({data['count']} samples)")

        if 'score_distribution' in metrics:
            print(f"\n📉 Score Distribution:")
            sd = metrics['score_distribution']
            print(f"   Mean:   {sd['mean']:.3f}")
            print(f"   Median: {sd['median']:.3f}")
            print(f"   Range:  [{sd['min']:.3f}, {sd['max']:.3f}]")

        print("\n" + "=" * 60)

    def save_results(self, output_path: str, metrics: Dict[str, Any]) -> None:
        """Save results to files."""
        output = Path(output_path)
        output.parent.mkdir(parents=True, exist_ok=True)

        # Save detailed results as CSV
        if HAS_PANDAS and self.results:
            df = pd.DataFrame(self.results)
            csv_path = output.with_suffix('.csv')
            df.to_csv(csv_path, index=False)
            print(f"[Benchmark] Results saved to: {csv_path}")

        # Save metrics as JSON
        json_path = output.with_suffix('.json')
        with open(json_path, 'w') as f:
            json.dump({
                'timestamp': datetime.now().isoformat(),
                'dataset': 'LIAR',
                'metrics': metrics,
                'config': {
                    'threshold': self.SYSCRED_THRESHOLD,
                    'use_graphrag': self.use_graphrag,
                    'weights': dict(self.system.weights)
                }
            }, f, indent=2, default=str)
        print(f"[Benchmark] Metrics saved to: {json_path}")


def main():
    parser = argparse.ArgumentParser(description='Run LIAR benchmark on SysCRED')
    parser.add_argument('--split', type=str, default='test',
                        choices=['train', 'valid', 'test'],
                        help='Dataset split to evaluate')
    parser.add_argument('--sample', type=int, default=None,
                        help='Number of statements to sample (for quick testing)')
    parser.add_argument('--data-dir', type=str, default=None,
                        help='Path to LIAR dataset directory')
    parser.add_argument('--output', type=str, default=None,
                        help='Output path for results (CSV/JSON)')
    parser.add_argument('--no-ml', action='store_true',
                        help='Disable ML models for faster testing')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Print details for each statement')

    args = parser.parse_args()

    # Run benchmark
    benchmark = LIARBenchmark(
        data_dir=args.data_dir,
        load_ml=not args.no_ml
    )

    try:
        metrics = benchmark.run_benchmark(
            split=args.split,
            sample_size=args.sample,
            verbose=args.verbose
        )
        benchmark.print_results(metrics)

        if args.output:
            benchmark.save_results(args.output, metrics)
        else:
            # Default output path
            default_output = Path(__file__).parent / f"liar_benchmark_{args.split}.csv"
            benchmark.save_results(str(default_output), metrics)
    except FileNotFoundError as e:
        print(f"\n❌ Error: {e}")
        print("\nTo download the LIAR dataset:")
        print("  1. wget https://www.cs.ucsb.edu/~william/data/liar_dataset.zip")
        print("  2. unzip liar_dataset.zip -d 02_Code/syscred/datasets/liar/")
        sys.exit(1)


if __name__ == "__main__":
    main()
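
# Example invocations (assuming the LIAR TSV files are present under
# 02_Code/syscred/datasets/liar/, as described in the download hint above):
#   python run_liar_benchmark.py --sample 20 --no-ml --verbose
#   python run_liar_benchmark.py --split test --output results/liar_benchmark.csv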