syscred_duplicate / syscred /run_liar_benchmark_remote.py
Dominique Loyer
Deploy SysCRED v2.3.1 - GraphRAG + LIAR benchmark + TREC integration
8e97fc5
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
LIAR Benchmark via Hugging Face Space API
==========================================
Runs the LIAR benchmark against the remote SysCRED instance on HF Space.
This uses the full ML pipeline (PyTorch, Transformers) running in the cloud.
Usage:
python run_liar_benchmark_remote.py --sample 100
python run_liar_benchmark_remote.py --split test --url https://your-space.hf.space
(c) Dominique S. Loyer - PhD Thesis Prototype
"""
import argparse
import json
import time
import sys
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Any, Optional
import requests
try:
import pandas as pd
HAS_PANDAS = True
except ImportError:
HAS_PANDAS = False
try:
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix, classification_report
)
HAS_SKLEARN = True
except ImportError:
HAS_SKLEARN = False
# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from syscred.liar_dataset import LIARDataset, LiarStatement
class RemoteLIARBenchmark:
    """
    Benchmark runner using remote HF Space API.

    Every LIAR statement is POSTed to ``{api_url}/api/verify``. The
    ``scoreCredibilite`` field of the JSON answer is mapped to a binary and a
    ternary label, which are compared with the ground-truth labels carried by
    the statement object. Per-statement rows are kept in ``self.results``.
    """

    # Default HF Space URL
    DEFAULT_API_URL = "https://domloyer-syscred.hf.space"
    # Scores >= threshold are labelled "Real", scores below it "Fake".
    SYSCRED_THRESHOLD = 0.5

    def __init__(
        self,
        api_url: Optional[str] = None,
        data_dir: Optional[str] = None,
        timeout: int = 60
    ):
        """
        Initialize remote benchmark.

        Prints a banner, probes the API health endpoint (best effort, never
        raises) and creates the dataset wrapper.

        Args:
            api_url: HF Space API URL; falls back to ``DEFAULT_API_URL``.
                A trailing slash is stripped.
            data_dir: Path to LIAR dataset, passed through to ``LIARDataset``.
            timeout: Request timeout in seconds for each verify call.
        """
        print("=" * 60)
        print("SysCRED LIAR Benchmark (Remote HF Space)")
        print("=" * 60)
        self.api_url = (api_url or self.DEFAULT_API_URL).rstrip('/')
        self.timeout = timeout
        # Test connection
        print(f"\n[Remote] API URL: {self.api_url}")
        self._test_connection()
        # Load dataset
        self.dataset = LIARDataset(data_dir)
        self.results: List[Dict] = []
        print("[Remote] Ready.\n")

    def _test_connection(self):
        """Probe ``/api/health`` and print the outcome; failures are only reported."""
        try:
            response = requests.get(f"{self.api_url}/api/health", timeout=10)
            if response.status_code == 200:
                print("[Remote] ✓ API connection successful")
            else:
                print(f"[Remote] ⚠ API returned status {response.status_code}")
        except requests.exceptions.ConnectionError:
            print("[Remote] ⚠ Could not connect to API (may be sleeping)")
            print("[Remote] The first request will wake it up...")
        except Exception as e:
            # Deliberately broad: the probe is informational and must not
            # prevent the benchmark from starting.
            print(f"[Remote] ⚠ Connection test failed: {e}")

    def _call_api(self, text: str) -> Dict[str, Any]:
        """
        Call the SysCRED verify endpoint for one statement.

        Args:
            text: Statement text sent as the ``input`` field.

        Returns:
            The decoded JSON object on success, otherwise a dict with a
            single ``error`` key describing the failure. Never raises.
        """
        try:
            response = requests.post(
                f"{self.api_url}/api/verify",
                json={"input": text},
                timeout=self.timeout,
                headers={"Content-Type": "application/json"},
            )
            if response.status_code != 200:
                return {"error": f"HTTP {response.status_code}: {response.text[:100]}"}
            payload = response.json()
        except requests.exceptions.Timeout:
            return {"error": "Request timeout"}
        except requests.exceptions.ConnectionError:
            return {"error": "Connection error"}
        except Exception as e:
            # Also covers a body that is not valid JSON.
            return {"error": str(e)}
        # Callers use dict methods on the payload; a JSON list/string/number
        # would otherwise crash them.
        if not isinstance(payload, dict):
            return {"error": f"Unexpected response type: {type(payload).__name__}"}
        return payload

    @staticmethod
    def _extract_score(api_result: Dict[str, Any]) -> Optional[float]:
        """Return ``scoreCredibilite`` as a float, or None when absent or not numeric."""
        raw = api_result.get('scoreCredibilite')
        # bool is a subclass of int; a JSON true/false is not a score.
        if raw is None or isinstance(raw, bool):
            return None
        try:
            value = float(raw)
        except (TypeError, ValueError):
            return None
        # NaN compares unequal to itself and would fall through every threshold.
        if value != value:
            return None
        return value

    def _syscred_to_binary(self, score: float) -> str:
        """Convert SysCRED score to binary label ("Real" at or above the threshold)."""
        return "Real" if score >= self.SYSCRED_THRESHOLD else "Fake"

    def _syscred_to_ternary(self, score: float) -> str:
        """Convert SysCRED score to ternary label: >= 0.65 True, >= 0.35 Mixed, else False."""
        if score >= 0.65:
            return "True"
        if score >= 0.35:
            return "Mixed"
        return "False"

    def evaluate_statement(self, statement: "LiarStatement") -> Dict[str, Any]:
        """
        Evaluate a single statement via API.

        Args:
            statement: Object exposing ``id``, ``statement``, ``label.name``,
                ``binary_label``, ``ternary_label``, ``speaker`` and ``party``.

        Returns:
            A result row. ``error`` is None on success; on failure it holds
            the reason and the prediction fields keep their placeholder
            values (score 0.5, labels 'Unknown'). All rows share the same
            keys so they can be tabulated directly.
        """
        start_time = time.perf_counter()
        result = {
            'id': statement.id,
            'statement': statement.statement[:200],
            'ground_truth_6way': statement.label.name,
            'ground_truth_binary': statement.binary_label,
            'ground_truth_ternary': statement.ternary_label,
            'speaker': statement.speaker,
            'party': statement.party,
            'syscred_score': 0.5,
            'predicted_binary': 'Unknown',
            'predicted_ternary': 'Unknown',
            'binary_correct': False,
            'ternary_correct': False,
            'processing_time': 0,
            'error': None,
            'ml_used': False,
            'graph_context_score': None,
            'graph_has_history': False,
        }
        # Call remote API
        api_result = self._call_api(statement.statement)
        api_error = api_result.get('error')
        score = None if api_error is not None else self._extract_score(api_result)

        if api_error is not None:
            result['error'] = api_error
        elif score is None:
            # Without a usable score there is no prediction; counting a
            # made-up neutral 0.5 as "Real" would distort the metrics.
            result['error'] = "Missing or non-numeric 'scoreCredibilite' in API response"
        else:
            result['syscred_score'] = score
            result['predicted_binary'] = self._syscred_to_binary(score)
            result['predicted_ternary'] = self._syscred_to_ternary(score)
            result['binary_correct'] = (result['predicted_binary'] == result['ground_truth_binary'])
            result['ternary_correct'] = (result['predicted_ternary'] == result['ground_truth_ternary'])
            # Sub-objects may be absent or JSON null; treat both as empty.
            nlp = api_result.get('analyseNLP')
            if not isinstance(nlp, dict):
                nlp = {}
            # Check if ML was used
            result['ml_used'] = nlp.get('sentiment') is not None
            # GraphRAG info
            graphrag = api_result.get('graphRAG')
            if not isinstance(graphrag, dict):
                graphrag = {}
            result['graph_context_score'] = graphrag.get('context_score')
            result['graph_has_history'] = graphrag.get('has_history', False)

        result['processing_time'] = time.perf_counter() - start_time
        return result

    def run_benchmark(
        self,
        split: str = "test",
        sample_size: Optional[int] = None,
        verbose: bool = False,
        seed: Optional[int] = None
    ) -> Dict[str, Any]:
        """
        Run benchmark against remote API.

        Args:
            split: Dataset split name handed to ``self.dataset.load_split``.
            sample_size: Number of statements to draw at random; ``None`` or
                ``0`` evaluates the whole split. Values larger than the split
                are capped.
            verbose: Print every statement and its outcome instead of a
                progress line every 10 statements.
            seed: Optional seed making the random sample reproducible. When
                omitted the module-level ``random`` generator is used.

        Returns:
            The metrics dict from ``_calculate_metrics`` extended with
            ``elapsed_time``, ``statements_per_second``,
            ``ml_used_percentage`` and ``api_url``.

        Raises:
            ValueError: If ``sample_size`` is negative.
        """
        if sample_size is not None and sample_size < 0:
            raise ValueError(f"sample_size must be non-negative, got {sample_size}")

        print(f"\n[Remote] Running on {split} split via HF Space API...")
        statements = self.dataset.load_split(split)
        if sample_size:
            import random
            rng = random.Random(seed) if seed is not None else random
            statements = rng.sample(statements, min(sample_size, len(statements)))
            print(f"[Remote] Using sample of {len(statements)} statements")

        total = len(statements)
        self.results = []
        ml_used_count = 0
        start_time = time.perf_counter()
        for position, stmt in enumerate(statements, start=1):
            if verbose or position % 10 == 0:
                print(f"[{position}/{total}] Processing: {stmt.statement[:50]}...")
            result = self.evaluate_statement(stmt)
            self.results.append(result)
            if result.get('ml_used'):
                ml_used_count += 1
            if verbose and not result.get('error'):
                symbol = "✅" if result['binary_correct'] else "❌"
                ml = "🧠" if result['ml_used'] else "📊"
                print(f" -> Score: {result['syscred_score']:.2f} {ml} | "
                      f"Pred: {result['predicted_binary']} | "
                      f"True: {result['ground_truth_binary']} {symbol}")
            # Rate limiting - be nice to the API (no pause after the last one)
            if position < total:
                time.sleep(0.5)
        elapsed = time.perf_counter() - start_time

        metrics = self._calculate_metrics()
        metrics['elapsed_time'] = elapsed
        metrics['statements_per_second'] = total / elapsed if elapsed > 0 else 0
        metrics['ml_used_percentage'] = (ml_used_count / total * 100) if total > 0 else 0
        metrics['api_url'] = self.api_url
        return metrics

    def _calculate_metrics(self) -> Dict[str, Any]:
        """
        Calculate evaluation metrics from ``self.results``.

        Returns:
            ``{'error': 'No results'}`` when nothing was evaluated. Otherwise
            a dict with the counts and error rate; when every evaluation
            failed it additionally carries ``error`` and no score metrics.
            With scikit-learn available the binary metrics use 'Fake' as the
            positive class; without it only accuracies are reported.
        """
        if not self.results:
            return {'error': 'No results'}

        total = len(self.results)
        valid_results = [r for r in self.results if r['error'] is None]
        error_count = total - len(valid_results)
        metrics: Dict[str, Any] = {
            'total_statements': total,
            'successful_evaluations': len(valid_results),
            'error_count': error_count,
            'error_rate': error_count / total,
        }
        if not valid_results:
            # Keep the counts so the report shows how many requests failed.
            metrics['error'] = 'All evaluations failed'
            return metrics

        y_true_binary = [r['ground_truth_binary'] for r in valid_results]
        y_pred_binary = [r['predicted_binary'] for r in valid_results]
        y_true_ternary = [r['ground_truth_ternary'] for r in valid_results]
        y_pred_ternary = [r['predicted_ternary'] for r in valid_results]
        if HAS_SKLEARN:
            metrics['binary'] = {
                'accuracy': accuracy_score(y_true_binary, y_pred_binary),
                'precision': precision_score(y_true_binary, y_pred_binary, pos_label='Fake', zero_division=0),
                'recall': recall_score(y_true_binary, y_pred_binary, pos_label='Fake', zero_division=0),
                'f1': f1_score(y_true_binary, y_pred_binary, pos_label='Fake', zero_division=0),
                'confusion_matrix': confusion_matrix(y_true_binary, y_pred_binary, labels=['Fake', 'Real']).tolist()
            }
            metrics['ternary'] = {
                'accuracy': accuracy_score(y_true_ternary, y_pred_ternary),
                'macro_f1': f1_score(y_true_ternary, y_pred_ternary, average='macro', zero_division=0),
            }
        else:
            n_valid = len(valid_results)
            correct_binary = sum(1 for r in valid_results if r['binary_correct'])
            correct_ternary = sum(1 for r in valid_results if r['ternary_correct'])
            metrics['binary'] = {'accuracy': correct_binary / n_valid}
            metrics['ternary'] = {'accuracy': correct_ternary / n_valid}

        scores = [r['syscred_score'] for r in valid_results]
        metrics['score_distribution'] = {
            'mean': sum(scores) / len(scores),
            'min': min(scores),
            'max': max(scores),
        }
        return metrics

    def print_results(self, metrics: Dict[str, Any]) -> None:
        """
        Print benchmark results.

        Args:
            metrics: Dict as returned by ``run_benchmark``. Metrics that were
                not computed are omitted rather than shown as zero.
        """
        print("\n" + "=" * 60)
        print("LIAR BENCHMARK RESULTS (Remote HF Space)")
        print("=" * 60)
        print(f"\n🌐 API: {metrics.get('api_url', 'N/A')}")
        print(f"🧠 ML Models Used: {metrics.get('ml_used_percentage', 0):.1f}%")
        print(f"\n📊 Overview:")
        print(f" Total: {metrics.get('total_statements', 0)}")
        print(f" Success: {metrics.get('successful_evaluations', 0)}")
        print(f" Errors: {metrics.get('error_count', 0)}")
        print(f" Time: {metrics.get('elapsed_time', 0):.1f}s")
        if 'error' in metrics:
            print(f"\n⚠ {metrics['error']}")
        if 'binary' in metrics:
            print(f"\n📈 Binary Classification:")
            b = metrics['binary']
            print(f" Accuracy: {b.get('accuracy', 0):.2%}")
            # Precision/recall/F1 exist only when scikit-learn computed them.
            if 'precision' in b:
                print(f" Precision: {b['precision']:.2%}")
            if 'recall' in b:
                print(f" Recall: {b['recall']:.2%}")
            if 'f1' in b:
                print(f" F1-Score: {b['f1']:.2f}")
        if 'ternary' in metrics:
            print(f"\n📈 Ternary Classification:")
            t = metrics['ternary']
            print(f" Accuracy: {t.get('accuracy', 0):.2%}")
            if 'macro_f1' in t:
                print(f" Macro F1: {t['macro_f1']:.2f}")
        print("\n" + "=" * 60)

    def save_results(self, output_path: str, metrics: Dict[str, Any]) -> None:
        """
        Save per-statement results (CSV) and metrics (JSON).

        Both files are derived from ``output_path`` by replacing its suffix
        with ``.csv`` / ``.json``; missing parent directories are created.
        The CSV is written with pandas when available and with the standard
        ``csv`` module otherwise. It is skipped when there are no results.

        Args:
            output_path: Base path for the two output files.
            metrics: Metrics dict stored under the ``metrics`` key of the JSON.
        """
        output = Path(output_path)
        output.parent.mkdir(parents=True, exist_ok=True)

        if self.results:
            csv_path = output.with_suffix('.csv')
            if HAS_PANDAS:
                pd.DataFrame(self.results).to_csv(csv_path, index=False)
            else:
                import csv
                # Union of keys in first-seen order, in case rows differ.
                fieldnames = list(dict.fromkeys(
                    key for row in self.results for key in row
                ))
                with open(csv_path, 'w', newline='', encoding='utf-8') as f:
                    writer = csv.DictWriter(f, fieldnames=fieldnames)
                    writer.writeheader()
                    writer.writerows(self.results)
            print(f"[Remote] Results: {csv_path}")

        json_path = output.with_suffix('.json')
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump({
                'timestamp': datetime.now().isoformat(),
                'dataset': 'LIAR',
                'mode': 'remote',
                'metrics': metrics
            }, f, indent=2, default=str)
        print(f"[Remote] Metrics: {json_path}")
def main():
    """
    Command-line entry point: run the remote LIAR benchmark and save the output.

    Parses the options, builds a ``RemoteLIARBenchmark``, runs it on the
    requested split, prints the metrics and writes the CSV/JSON files.
    Exits with status 1 when a ``FileNotFoundError`` is raised while setting
    up or running the benchmark, and with argparse's usage error (status 2)
    for out-of-range numeric options.
    """

    def int_at_least(minimum: int):
        """Build an argparse ``type`` callable accepting integers >= minimum."""
        def parse(raw: str) -> int:
            try:
                value = int(raw)
            except ValueError:
                raise argparse.ArgumentTypeError(f"invalid int value: {raw!r}")
            if value < minimum:
                raise argparse.ArgumentTypeError(
                    f"must be an integer >= {minimum}, got {value}"
                )
            return value
        return parse

    parser = argparse.ArgumentParser(description='LIAR benchmark via HF Space API')
    parser.add_argument('--url', type=str, default=None,
                        help='HF Space API URL')
    parser.add_argument('--split', type=str, default='test',
                        choices=['train', 'valid', 'test'])
    # Rejected up front: a negative sample size would otherwise only fail
    # after the API probe and the dataset load.
    parser.add_argument('--sample', type=int_at_least(0), default=None,
                        help='Number of statements to sample (0 or omitted = full split)')
    parser.add_argument('--data-dir', type=str, default=None)
    parser.add_argument('--output', type=str, default=None)
    parser.add_argument('--verbose', '-v', action='store_true')
    parser.add_argument('--timeout', type=int_at_least(1), default=60,
                        help='Request timeout in seconds')
    args = parser.parse_args()

    try:
        # Constructed inside the try because the constructor creates the
        # dataset wrapper, which may itself raise FileNotFoundError
        # (assumption: depends on LIARDataset's implementation).
        benchmark = RemoteLIARBenchmark(
            api_url=args.url,
            data_dir=args.data_dir,
            timeout=args.timeout
        )
        metrics = benchmark.run_benchmark(
            split=args.split,
            sample_size=args.sample,
            verbose=args.verbose
        )
        benchmark.print_results(metrics)
        output = args.output or f"liar_benchmark_remote_{args.split}.csv"
        benchmark.save_results(output, metrics)
    except FileNotFoundError as e:
        print(f"\n❌ Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()