DominiqueLoyer
committed on
Commit
·
e70050b
1 Parent(s):
1ae34e8
Deploy SysCRED with PyTorch
- Dockerfile +37 -0
- README.md +21 -6
- requirements.txt +31 -0
- syscred/README.md +1 -0
- syscred/SysCRED_Documentation.md +659 -0
- syscred/__init__.py +55 -0
- syscred/api_clients.py +560 -0
- syscred/backend_app.py +363 -0
- syscred/benchmark_data.json +92 -0
- syscred/config.py +291 -0
- syscred/database.py +54 -0
- syscred/debug_factcheck.py +43 -0
- syscred/debug_graph_json.py +58 -0
- syscred/debug_init.py +33 -0
- syscred/debug_local_server.py +25 -0
- syscred/diagnose_imports.py +37 -0
- syscred/eval_metrics.py +349 -0
- syscred/graph_rag.py +171 -0
- syscred/ir_engine.py +410 -0
- syscred/ontology_manager.py +509 -0
- syscred/requirements-light.txt +31 -0
- syscred/requirements.txt +34 -0
- syscred/requirements_light.txt +19 -0
- syscred/run_benchmark.py +135 -0
- syscred/run_trec_benchmark.py +414 -0
- syscred/save_to_notes.sh +121 -0
- syscred/seo_analyzer.py +610 -0
- syscred/setup.py +65 -0
- syscred/static/index.html +850 -0
- syscred/static/js/d3.min.js +0 -0
- syscred/test_graphrag.py +87 -0
- syscred/test_phase1.py +28 -0
- syscred/test_phase2.py +55 -0
- syscred/test_suite.py +64 -0
- syscred/test_trec_integration.py +271 -0
- syscred/trec_dataset.py +409 -0
- syscred/trec_retriever.py +446 -0
- syscred/verification_system.py +926 -0
Dockerfile
ADDED
@@ -0,0 +1,37 @@
# SysCRED Docker Configuration for Hugging Face Spaces
# Full version with PyTorch and Transformers
FROM python:3.10-slim

WORKDIR /app

ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PYTHONPATH=/app
ENV SYSCRED_LOAD_ML_MODELS=true

# Install system dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements (full version with ML)
COPY requirements.txt /app/requirements.txt

# Install dependencies (includes PyTorch, Transformers)
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY syscred/ /app/syscred/

# Create user for HF Spaces (required)
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user
ENV PATH=/home/user/.local/bin:$PATH

WORKDIR /app

EXPOSE 7860

# Run with HF Spaces port (7860)
CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--workers", "2", "--timeout", "300", "syscred.backend_app:app"]
README.md
CHANGED
@@ -1,12 +1,27 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: SysCRED - Système de Vérification de Crédibilité
+emoji: 🔍
+colorFrom: purple
+colorTo: blue
 sdk: docker
 pinned: false
 license: mit
-
+app_port: 7860
 ---
 
-
+# SysCRED - Credibility Verification System
+
+A hybrid neuro-symbolic system for credibility verification and fact-checking.
+
+## Features
+- 🔍 URL and text credibility analysis
+- 🧠 NLP-based coherence analysis with Transformers
+- 📊 SEO and source reputation scoring
+- 🌐 Knowledge graph visualization with D3.js
+- 🔗 Ontology-based reasoning with RDFLib
+
+## Author
+**Dominique S. Loyer** - UQAM
+
+## Usage
+Enter a URL or paste text to analyze its credibility score based on multiple factors.
requirements.txt
ADDED
@@ -0,0 +1,31 @@
# SysCRED - Full Requirements for Hugging Face Spaces
# Système Hybride de Vérification de Crédibilité
# (c) Dominique S. Loyer

# === Core Dependencies ===
requests>=2.28.0
beautifulsoup4>=4.11.0
python-whois>=0.8.0

# === RDF/Ontology ===
rdflib>=6.0.0

# === Machine Learning (Full) ===
transformers>=4.30.0
torch>=2.0.0
numpy>=1.24.0
sentence-transformers>=2.2.0

# === Explainability ===
lime>=0.2.0

# === Web Backend ===
flask>=2.3.0
flask-cors>=4.0.0
python-dotenv>=1.0.0
pandas>=2.0.0

# === Production/Database ===
gunicorn>=20.1.0
psycopg2-binary>=2.9.0
flask-sqlalchemy>=3.0.0
syscred/README.md
ADDED
@@ -0,0 +1 @@
# syscred
syscred/SysCRED_Documentation.md
ADDED
@@ -0,0 +1,659 @@
# 🔬 SysCRED - Complete Documentation

## Neuro-Symbolic Credibility Verification System

> **Version:** 2.0
> **Author:** Dominique S. Loyer
> **Citation Key:** `loyerModelingHybridSystem2025`
> **DOI:** [10.5281/zenodo.17943226](https://doi.org/10.5281/zenodo.17943226)
> **Last updated:** January 2026

---

## 📋 Table of Contents

1. [Overview](#overview)
2. [System architecture](#system-architecture)
3. [Modules and files](#modules-and-files)
4. [Installation and configuration](#installation-and-configuration)
5. [Commands and usage](#commands-and-usage)
6. [Design choices](#design-choices)
7. [Completed improvements](#completed-improvements)
8. [Future improvements](#future-improvements)
9. [API Reference](#api-reference)
10. [OWL Ontology](#owl-ontology)

---

## Overview

### What is SysCRED?

SysCRED (System for CREdibility Detection) is a **hybrid neuro-symbolic system** designed to automatically assess the credibility of online information. It combines:

- A **symbolic approach** (explicit, transparent, explainable rules)
- A **neural approach** (NLP models for sentiment, bias, entities)
- An **OWL ontology** (traceability and semantic reasoning)

### Project philosophy

The system is designed as a **doctoral research prototype** built on these principles:

1. **Explainability (xAI)**: every decision can be traced and justified
2. **Hybridity**: combines the best of rules and ML
3. **Reproducibility**: open-source code, complete documentation
4. **Modularity**: each component is independent and testable

---

## System architecture

```
┌─────────────────────────────────────────────────────────────────┐
│                           SysCRED v2.0                           │
├─────────────────────────────────────────────────────────────────┤
│  ┌───────────────┐    ┌───────────────┐    ┌───────────────┐    │
│  │     INPUT     │    │   External    │    │    OUTPUT     │    │
│  │  URL / Text   │────│     APIs      │────│    Report     │    │
│  └───────────────┘    └───────────────┘    └───────────────┘    │
│          │                    │                    ▲             │
│          ▼                    ▼                    │             │
│  ┌─────────────────────────────────────────────────────┐        │
│  │               VERIFICATION SYSTEM                    │        │
│  │   ┌─────────────────┐      ┌─────────────────┐       │        │
│  │   │   RULE-BASED    │      │  NLP ANALYSIS   │       │        │
│  │   │  • Reputation   │      │  • Sentiment    │       │        │
│  │   │  • Domain age   │      │  • NER          │       │        │
│  │   │  • Fact-check   │      │  • Bias         │       │        │
│  │   │  • Markers      │      │  • Coherence    │       │        │
│  │   └─────────────────┘      └─────────────────┘       │        │
│  │                        ↓                              │        │
│  │           ┌─────────────────────────┐                 │        │
│  │           │    SCORE CALCULATION    │                 │        │
│  │           │    (hybrid weighting)   │                 │        │
│  │           └─────────────────────────┘                 │        │
│  └─────────────────────────────────────────────────────┘        │
│                            │                                     │
│                            ▼                                     │
│  ┌─────────────────────────────────────────────────────┐        │
│  │           ONTOLOGY MANAGER (OWL/RDF)                 │        │
│  │           Traceability and reasoning                 │        │
│  └─────────────────────────────────────────────────────┘        │
└─────────────────────────────────────────────────────────────────┘
```

### Processing flow

1. **Input** → URL or raw text
2. **Retrieval** → web content (if URL)
3. **Preprocessing** → text cleaning
4. **External data** → WHOIS, fact-check APIs
5. **Rule-based analysis** → linguistic markers, reputation
6. **NLP analysis** → sentiment, bias, entities
7. **Score computation** → hybrid weighting (0-1)
8. **Report generation** → structured JSON
9. **Ontology persistence** → RDF triples

---

## Modules and files

### Project structure

```
syscred/
├── __init__.py                    # Package init
├── config.py                      # Centralized configuration
├── verification_system.py         # Main system
├── api_clients.py                 # External API clients
├── ontology_manager.py            # OWL/RDF management
├── seo_analyzer.py                # SEO/PageRank analysis
├── backend_app.py                 # Flask REST API
├── eval_metrics.py                # Evaluation metrics
├── ir_engine.py                   # Retrieval engine
├── requirements.txt               # Python dependencies
├── setup.py                       # Package installation
├── syscred_kaggle.ipynb           # Kaggle notebook
├── syscred_colab.ipynb            # Colab notebook (with Drive)
└── kaggle_to_gdrive_backup.ipynb  # Notebook backup
```

### Module descriptions

#### `config.py` - Centralized configuration

**Purpose:** Centralize all system parameters in a single file.

**Classes:**

- `Config` - base configuration
- `DevelopmentConfig` - local development
- `ProductionConfig` - production
- `TestingConfig` - tests (ML disabled)

**Key parameters:**

| Parameter | Description | Default value |
|-----------|-------------|---------------|
| `HOST` | Server address | `0.0.0.0` |
| `PORT` | Server port | `5000` |
| `DEBUG` | Debug mode | `true` |
| `LOAD_ML_MODELS` | Load ML models | `true` |
| `WEB_FETCH_TIMEOUT` | HTTP timeout (sec) | `10` |

**Score weights:**

```python
SCORE_WEIGHTS = {
    'source_reputation': 0.25,     # Source reputation
    'domain_age': 0.10,            # Domain age
    'sentiment_neutrality': 0.15,  # Neutrality of tone
    'entity_presence': 0.15,       # Presence of verifiable entities
    'coherence': 0.15,             # Textual coherence
    'fact_check': 0.20             # Fact-check results
}
```
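A minimal sketch of how these weights might be combined into the overall score, assuming each component has already been normalized to [0, 1]; the authoritative aggregation lives in `calculate_overall_score()` and may differ in detail:

```python
# Illustrative weighted aggregation, not the exact implementation
# found in verification_system.py.
SCORE_WEIGHTS = {
    'source_reputation': 0.25,
    'domain_age': 0.10,
    'sentiment_neutrality': 0.15,
    'entity_presence': 0.15,
    'coherence': 0.15,
    'fact_check': 0.20,
}

def weighted_credibility(component_scores: dict) -> float:
    """Combine per-factor scores (each in [0, 1]) into a single 0-1 score."""
    total = sum(SCORE_WEIGHTS.values())
    return sum(SCORE_WEIGHTS[k] * component_scores.get(k, 0.0)
               for k in SCORE_WEIGHTS) / total

# Example: a reputable, coherent article with no conclusive fact-check hits.
print(weighted_credibility({
    'source_reputation': 1.0, 'domain_age': 0.8, 'sentiment_neutrality': 0.7,
    'entity_presence': 0.6, 'coherence': 0.9, 'fact_check': 0.5,
}))  # 0.76
```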
**Environment variables:**

```bash
export SYSCRED_ENV=production        # Environment (dev/prod/testing)
export SYSCRED_PORT=8080             # Custom port
export SYSCRED_GOOGLE_API_KEY=xxx    # Google Fact Check key
export SYSCRED_LOAD_ML=false         # Disable ML
```

---

#### `verification_system.py` - Main system

**Purpose:** Main credibility verification pipeline.

**Main class:** `CredibilityVerificationSystem`

**Main methods:**

| Method | Description |
|--------|-------------|
| `__init__()` | Initializes the system, loads the models |
| `verify_information(input)` | Main verification pipeline |
| `rule_based_analysis(text, data)` | Symbolic analysis |
| `nlp_analysis(text)` | NLP (ML) analysis |
| `calculate_overall_score()` | Computes the final score |
| `generate_report()` | Generates the JSON report |

**ML models used:**

| Model | Usage |
|-------|-------|
| `distilbert-base-uncased-finetuned-sst-2-english` | Sentiment |
| `dbmdz/bert-large-cased-finetuned-conll03-english` | NER |
| `bert-base-uncased` | Bias detection (placeholder) |
| `LIME` | Prediction explanations |

---

#### `api_clients.py` - External API clients

**Purpose:** Abstract all interactions with external APIs.

**Main class:** `ExternalAPIClients`

**Integrated APIs:**

| API | Method | Description |
|-----|--------|-------------|
| Web Content | `fetch_web_content()` | Fetches and parses HTML |
| WHOIS | `whois_lookup()` | Domain age and registrar |
| Google Fact Check | `google_fact_check()` | Fact verification |
| Source Reputation | `get_source_reputation()` | Internal database |
| CommonCrawl | `estimate_backlinks()` | Backlink estimation |

**Data classes:**

- `WebContent` - parsed web content
- `DomainInfo` - WHOIS information
- `FactCheckResult` - fact-check result
- `ExternalData` - aggregated data

---

#### `ontology_manager.py` - OWL/RDF management

**Purpose:** Semantic traceability with an OWL ontology.

**Features:**

- Loading of the base ontology (.ttl)
- Addition of RDF triples for each evaluation
- Persistence of the accumulated data
- SPARQL queries

**Ontology used:**

- Format: Turtle (.ttl)
- Namespace: `http://syscred.uqam.ca/ontology#`
- Concepts: `Evaluation`, `Source`, `CredibilityScore`, `Evidence`

---

#### `backend_app.py` - Flask API

**Purpose:** Expose SysCRED through a REST API.

**Endpoints:**

| Endpoint | Method | Description |
|----------|--------|-------------|
| `/api/verify` | POST | Main verification |
| `/api/seo` | POST | SEO analysis only |
| `/api/ontology/stats` | GET | Ontology statistics |
| `/api/health` | GET | Health check |
| `/api/config` | GET | Current configuration |

**Example request:**

```bash
curl -X POST http://localhost:5000/api/verify \
  -H "Content-Type: application/json" \
  -d '{"input_data": "https://example.com/article"}'
```
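The same request can also be issued from Python; this is a small illustrative client (not part of the package), assuming the Flask server above is reachable on port 5000:

```python
import requests

def verify(input_data: str, base_url: str = "http://localhost:5000") -> dict:
    """POST to the /api/verify endpoint documented above and return the JSON report."""
    resp = requests.post(
        f"{base_url}/api/verify",
        json={"input_data": input_data},
        timeout=60,
    )
    resp.raise_for_status()
    return resp.json()

report = verify("https://example.com/article")
# Field names follow the report structure described in the API Reference below.
print(report.get("scoreCredibilite"), report.get("resumeAnalyse"))
```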
---

## Installation and configuration

### Prerequisites

- Python 3.8+
- pip
- Git

### Local installation

```bash
# Clone the repository
git clone https://github.com/DominiqueLoyer/syscred.git
cd syscred

# Create a virtual environment
python -m venv venv
source venv/bin/activate  # Linux/Mac
# or: venv\Scripts\activate  # Windows

# Install the dependencies
pip install -r requirements.txt

# Install the package in development mode
pip install -e .
```

### Installing the dependencies

```bash
# Main dependencies
pip install transformers torch numpy
pip install flask flask-cors
pip install rdflib owlrl
pip install requests beautifulsoup4

# Optional dependencies
pip install python-whois  # For WHOIS
pip install lime          # For ML explanations
```

### requirements.txt file

```
transformers>=4.30.0
torch>=2.0.0
numpy>=1.24.0
flask>=2.3.0
flask-cors>=4.0.0
rdflib>=6.3.0
owlrl>=6.0.0
requests>=2.31.0
beautifulsoup4>=4.12.0
python-whois>=0.8.0
lime>=0.2.0
```

---

## Commands and usage

### Starting the Flask API

```bash
# Development mode
cd /path/to/syscred
python backend_app.py

# With environment variables
SYSCRED_PORT=8080 SYSCRED_DEBUG=true python backend_app.py

# Production mode
SYSCRED_ENV=production python backend_app.py
```

### Testing the system from the command line

```bash
# Direct module test
python -m syscred.verification_system

# Test with a custom input
python -c "
from syscred.verification_system import CredibilityVerificationSystem
sys = CredibilityVerificationSystem(load_ml_models=False)
result = sys.verify_information('https://www.lemonde.fr')
print(result['scoreCredibilite'])
"
```

### Usage in Kaggle/Colab

Open the notebook `syscred_kaggle.ipynb` or `syscred_colab.ipynb`:

```python
# Cell 1: installation
!pip install transformers torch rdflib requests beautifulsoup4

# Cell 2: import and test
from syscred import CredibilityVerificationSystem
sys = CredibilityVerificationSystem()
result = sys.verify_information("https://example.com")
```

### REST API - Examples

```bash
# Verify a URL
curl -X POST http://localhost:5000/api/verify \
  -H "Content-Type: application/json" \
  -d '{"input_data": "https://www.bbc.com/article"}'

# Verify text
curl -X POST http://localhost:5000/api/verify \
  -H "Content-Type: application/json" \
  -d '{"input_data": "This is a verified news report."}'

# Health check
curl http://localhost:5000/api/health

# Get the configuration
curl http://localhost:5000/api/config
```

---

## Design choices

### Why a hybrid neuro-symbolic approach?

| Approach | Strengths | Weaknesses |
|----------|-----------|------------|
| **Rules** | Transparent, explainable, fast | Rigid, limited coverage |
| **ML/NLP** | Flexible, handles complex patterns | Black box, needs data |
| **Hybrid** | Combines both! | More complex |

**Decision:** Use rules for clear-cut cases (known reputation, linguistic markers) and ML for the nuances (sentiment, bias).

### Why these weights?

The default weights reflect the relative importance of each factor according to the literature:

```python
SCORE_WEIGHTS = {
    'source_reputation': 0.25,   # Most important: known source
    'fact_check': 0.20,          # External verification
    'sentiment_neutrality': 0.15,
    'entity_presence': 0.15,
    'coherence': 0.15,
    'domain_age': 0.10           # Less important on its own
}
```

### Why LIME for explainability?

- **Local Interpretable Model-agnostic Explanations**
- Works with any model
- Produces understandable explanations
- A recognized academic standard

### Why OWL/RDF?

- **Traceability**: every evaluation is recorded
- **Reasoning**: automatic inference is possible (OWL-RL)
- **Interoperability**: W3C standard, SPARQL-compatible
- **Publication**: linked data

---

## Completed improvements

### Version 2.0 (January 2026)

1. **Centralized configuration** (`config.py`)
   - Environment variables
   - dev/prod/testing profiles
   - Configurable weights

2. **Refactored API clients** (`api_clients.py`)
   - Typed data classes
   - Robust error handling
   - Real WHOIS lookup

3. **Kaggle/Colab notebooks**
   - `syscred_kaggle.ipynb` - Kaggle version
   - `syscred_colab.ipynb` - version with Google Drive
   - "Open in" badges for convenience

4. **Fix for the `NameError: result` bug**
   - Local variable in the RDF section
   - Fallback when there is no result

5. **Professional README**
   - Zenodo DOI badge
   - Quick start
   - Documented API endpoints

6. **Kaggle→Drive backup notebook**
   - `kaggle_to_gdrive_backup.ipynb`
   - Automatic backup

---

## Future improvements

### Short term (next few months)

- [ ] **Real Google Fact Check API** - integrate the API key
- [ ] **CommonCrawl backlinks** - real backlink analysis
- [ ] **More sources** - extend `SOURCE_REPUTATIONS`
- [ ] **Unit tests** - >80% coverage

### Medium term (6-12 months)

- [ ] **Fine-tuned bias model** - train on real data
- [ ] **Redis cache** - result caching
- [ ] **Modern web interface** - React/Vue frontend
- [ ] **Docker** - containerization

### Long term (thesis)

- [ ] **Formal evaluation** - benchmark dataset
- [ ] **Multilingual** - native French support
- [ ] **Knowledge graph** - Neo4j integration
- [ ] **Continual learning** - feedback loop

---

## API Reference

### Class `CredibilityVerificationSystem`

```python
class CredibilityVerificationSystem:
    def __init__(
        self,
        google_api_key: Optional[str] = None,
        ontology_base_path: Optional[str] = None,
        ontology_data_path: Optional[str] = None,
        load_ml_models: bool = True
    ):
        """
        Initialize the credibility verification system.

        Args:
            google_api_key: API key for Google Fact Check
            ontology_base_path: Path to base ontology TTL
            ontology_data_path: Path to store data
            load_ml_models: Whether to load ML models
        """

    def verify_information(self, input_data: str) -> Dict[str, Any]:
        """
        Main pipeline to verify credibility.

        Args:
            input_data: URL or text to verify

        Returns:
            Complete evaluation report with:
            - idRapport: Unique report ID
            - scoreCredibilite: 0.0-1.0
            - resumeAnalyse: French summary
            - detailsScore: Score breakdown
            - reglesAppliquees: Rule-based results
            - analyseNLP: NLP analysis results
        """
```
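A short usage sketch built from the signature and report keys documented above (illustrative only; `detailsScore` is assumed here to be a flat mapping of factor names to values):

```python
from syscred import CredibilityVerificationSystem

# Lightweight instantiation: skip the Transformer models for a quick check.
system = CredibilityVerificationSystem(load_ml_models=False)
report = system.verify_information("https://www.lemonde.fr")

# The keys below follow the report structure listed in the docstring.
print(report["idRapport"])
print(f"Credibility score: {report['scoreCredibilite']:.2f}")
for factor, value in report["detailsScore"].items():
    print(f"  {factor}: {value}")
```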
### Class `Config`

```python
class Config:
    # Paths
    BASE_DIR: Path
    ONTOLOGY_BASE_PATH: Path
    ONTOLOGY_DATA_PATH: Path

    # Server
    HOST: str = "0.0.0.0"
    PORT: int = 5000
    DEBUG: bool = True

    # API keys
    GOOGLE_FACT_CHECK_API_KEY: Optional[str]

    # ML models
    LOAD_ML_MODELS: bool = True
    SENTIMENT_MODEL: str
    NER_MODEL: str

    # Weights
    SCORE_WEIGHTS: Dict[str, float]
    CREDIBILITY_THRESHOLDS: Dict[str, float]
    SOURCE_REPUTATIONS: Dict[str, str]

    @classmethod
    def load_external_reputations(cls, filepath: str) -> None:
        """Load reputations from a JSON file."""

    @classmethod
    def update_weights(cls, new_weights: Dict[str, float]) -> None:
        """Update the score weights."""

    @classmethod
    def to_dict(cls) -> Dict:
        """Export the configuration as a dictionary."""
```
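A hedged usage sketch for the classmethods above; the argument shapes (a full six-key weight mapping, a domain-to-reputation JSON file) are assumptions, so check `config.py` for the exact contract:

```python
from syscred.config import Config

# Illustrative re-weighting: emphasize fact-checking over source reputation.
# (Assumes update_weights accepts a full mapping over the six factors above.)
Config.update_weights({
    'source_reputation': 0.20,
    'fact_check': 0.25,
    'sentiment_neutrality': 0.15,
    'entity_presence': 0.15,
    'coherence': 0.15,
    'domain_age': 0.10,
})

# Optionally load extra source reputations from a JSON file
# (hypothetical file name; assumed to map domain -> 'High'/'Medium'/'Low').
Config.load_external_reputations("reputations.json")

# Inspect the effective configuration as a plain dictionary.
print(Config.to_dict())
```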
---

## OWL Ontology

### Conceptual structure

```
syscred:Evaluation
  └── syscred:evaluates   → syscred:Information
  └── syscred:hasScore    → xsd:float
  └── syscred:hasEvidence → syscred:Evidence
  └── syscred:generatedAt → xsd:dateTime

syscred:Information
  └── syscred:hasSource   → syscred:Source
  └── syscred:hasContent  → xsd:string

syscred:Source
  └── syscred:hasDomain     → xsd:string
  └── syscred:hasReputation → syscred:ReputationLevel
  └── syscred:hasDomainAge  → xsd:integer

syscred:Evidence
  └── syscred:type   → xsd:string (Linguistic, FactCheck, etc.)
  └── syscred:value  → xsd:string
  └── syscred:impact → xsd:float
```

### Example of generated triples

```turtle
@prefix syscred: <http://syscred.uqam.ca/ontology#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

syscred:eval_1705890000 a syscred:Evaluation ;
    syscred:evaluates syscred:info_lemonde_article ;
    syscred:hasScore "0.85"^^xsd:float ;
    syscred:generatedAt "2026-01-21T13:40:00"^^xsd:dateTime ;
    syscred:hasEvidence syscred:evidence_1 .

syscred:evidence_1 a syscred:Evidence ;
    syscred:type "SourceReputation" ;
    syscred:value "High" ;
    syscred:impact "0.25"^^xsd:float .
```
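As an illustration of the SPARQL support mentioned for `ontology_manager.py`, this standalone sketch queries triples of the shape shown above with RDFLib directly; it is not the OntologyManager API itself, and the data file name is hypothetical:

```python
from rdflib import Graph

# Load previously saved evaluation triples (hypothetical file name).
g = Graph()
g.parse("syscred_data.ttl", format="turtle")

# Retrieve every evaluation and its credibility score, highest first.
query = """
PREFIX syscred: <http://syscred.uqam.ca/ontology#>
SELECT ?evaluation ?score WHERE {
    ?evaluation a syscred:Evaluation ;
                syscred:hasScore ?score .
}
ORDER BY DESC(?score)
"""
for evaluation, score in g.query(query):
    print(evaluation, float(score))
```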
---

## Utility scripts

### Backup script for Obsidian/Notion

Create this script in `/Users/bk280625/documents041025/MonCode/`:

```bash
#!/bin/bash
# save_syscred_docs.sh
# Usage: ./save_syscred_docs.sh

DOC_SOURCE="/Users/bk280625/documents041025/MonCode/syscred/SysCRED_Documentation.md"
OBSIDIAN_VAULT="/Users/bk280625/Documents/Obsidian/PhD"
DATE=$(date +%Y%m%d)

# Copy to Obsidian
cp "$DOC_SOURCE" "$OBSIDIAN_VAULT/SysCRED_Documentation_$DATE.md"
echo "✅ Copied to Obsidian: $OBSIDIAN_VAULT"

# Open in Obsidian (Mac)
open "obsidian://open?vault=PhD&file=SysCRED_Documentation_$DATE"

# For Notion: use the API or copy manually
# Notion has no direct import of local files
echo "📋 For Notion: copy the contents of $DOC_SOURCE"
echo "   Or use: https://notion.so/import"
```

---

## References

- Loyer, D. S. (2025). *Modeling and Hybrid System for Verification of Sources Credibility*. UQAM.
- Loyer, D. S. (2025). *Ontology of a Verification System for Liability of the Information*. DIC-9335.

---

*Documentation generated on January 21, 2026*
*SysCRED v2.0 - Dominique S. Loyer - UQAM*
syscred/__init__.py
ADDED
@@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-
"""
SysCRED - Système Neuro-Symbolique de Vérification de Crédibilité
===================================================================

PhD Thesis Prototype - (c) Dominique S. Loyer
Citation Key: loyerModelingHybridSystem2025

Modules:
- api_clients: Web scraping, WHOIS, Fact Check APIs
- ir_engine: BM25, QLD, TF-IDF, PRF (from TREC)
- trec_retriever: Evidence retrieval for fact-checking (NEW v2.3)
- trec_dataset: TREC AP88-90 data loader (NEW v2.3)
- seo_analyzer: SEO analysis, PageRank estimation
- eval_metrics: MAP, NDCG, P@K, Recall, MRR
- ontology_manager: RDFLib integration
- verification_system: Main credibility pipeline
- graph_rag: GraphRAG for contextual memory
"""

__version__ = "2.3.0"
__author__ = "Dominique S. Loyer"
__citation__ = "loyerModelingHybridSystem2025"

# Core classes
from syscred.verification_system import CredibilityVerificationSystem
from syscred.api_clients import ExternalAPIClients
from syscred.ontology_manager import OntologyManager
from syscred.seo_analyzer import SEOAnalyzer
from syscred.ir_engine import IREngine
from syscred.eval_metrics import EvaluationMetrics

# TREC Integration (NEW - Feb 2026)
from syscred.trec_retriever import TRECRetriever, Evidence, RetrievalResult
from syscred.trec_dataset import TRECDataset, TRECTopic

# Convenience alias
SysCRED = CredibilityVerificationSystem

__all__ = [
    # Core
    'CredibilityVerificationSystem',
    'SysCRED',
    'ExternalAPIClients',
    'OntologyManager',
    'SEOAnalyzer',
    'IREngine',
    'EvaluationMetrics',
    # TREC (NEW)
    'TRECRetriever',
    'TRECDataset',
    'TRECTopic',
    'Evidence',
    'RetrievalResult',
]
syscred/api_clients.py
ADDED
@@ -0,0 +1,560 @@
# -*- coding: utf-8 -*-
"""
API Clients Module - SysCRED
============================
Handles all external API calls for the credibility verification system.

Integrated APIs:
- Web content fetching (requests + BeautifulSoup)
- WHOIS lookup for domain age
- Google Fact Check Tools API
- Backlinks estimation via CommonCrawl

(c) Dominique S. Loyer - PhD Thesis Prototype
Citation Key: loyerModelingHybridSystem2025
"""

import requests
from urllib.parse import urlparse
from datetime import datetime, timedelta
from typing import Optional, List, Dict, Any
from dataclasses import dataclass
import re
import json
from functools import lru_cache

# Optional imports with fallbacks
try:
    from bs4 import BeautifulSoup
    HAS_BS4 = True
except ImportError:
    HAS_BS4 = False
    print("Warning: BeautifulSoup not installed. Run: pip install beautifulsoup4")

try:
    import whois
    HAS_WHOIS = True
except ImportError:
    HAS_WHOIS = False
    print("Warning: python-whois not installed. Run: pip install python-whois")


# --- Data Classes for Structured Results ---

@dataclass
class WebContent:
    """Represents fetched web content."""
    url: str
    title: Optional[str]
    text_content: str
    meta_description: Optional[str]
    meta_keywords: List[str]
    links: List[str]
    fetch_timestamp: str
    success: bool
    error: Optional[str] = None


@dataclass
class DomainInfo:
    """Represents domain WHOIS information."""
    domain: str
    creation_date: Optional[datetime]
    expiration_date: Optional[datetime]
    registrar: Optional[str]
    age_days: Optional[int]
    success: bool
    error: Optional[str] = None


@dataclass
class FactCheckResult:
    """Represents a single fact-check claim review."""
    claim: str
    claimant: Optional[str]
    rating: str
    publisher: str
    url: str
    review_date: Optional[str]


@dataclass
class ExternalData:
    """Combined external data for credibility analysis."""
    fact_checks: List[FactCheckResult]
    source_reputation: str
    domain_age_days: Optional[int]
    domain_info: Optional[DomainInfo]
    related_articles: List[Dict[str, str]]
    backlinks_count: int
    backlinks_sample: List[Dict[str, str]]


class ExternalAPIClients:
    """
    Central class for all external API integrations.
    Replaces simulated functions with real API calls.
    """

    def __init__(self, google_api_key: Optional[str] = None):
        """
        Initialize API clients.

        Args:
            google_api_key: API key for Google Fact Check Tools API (optional)
        """
        self.google_api_key = google_api_key
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9,fr;q=0.8',
            'Referer': 'https://www.google.com/',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1'
        })

        # Reputation database (can be extended or loaded from file)
        self.known_reputations = {
            # High credibility sources
            'lemonde.fr': 'High',
            'nytimes.com': 'High',
            'reuters.com': 'High',
            'bbc.com': 'High',
            'theguardian.com': 'High',
            'apnews.com': 'High',
            'nature.com': 'High',
            'sciencedirect.com': 'High',
            'scholar.google.com': 'High',
            'factcheck.org': 'High',
            'snopes.com': 'High',
            'politifact.com': 'High',
            # Medium credibility
            'wikipedia.org': 'Medium',
            'medium.com': 'Medium',
            'huffpost.com': 'Medium',
            # Low credibility (known misinformation spreaders)
            'infowars.com': 'Low',
            'naturalnews.com': 'Low',
        }

    def fetch_web_content(self, url: str, timeout: int = 10) -> WebContent:
        """
        Fetch and parse web content from a URL.

        Args:
            url: The URL to fetch
            timeout: Request timeout in seconds

        Returns:
            WebContent dataclass with extracted information
        """
        timestamp = datetime.now().isoformat()

        if not HAS_BS4:
            return WebContent(
                url=url, title=None, text_content="",
                meta_description=None, meta_keywords=[],
                links=[], fetch_timestamp=timestamp,
                success=False, error="BeautifulSoup not installed"
            )

        try:
            try:
                response = self.session.get(url, timeout=timeout, allow_redirects=True)
                response.raise_for_status()
            except (requests.exceptions.SSLError, requests.exceptions.ConnectionError):
                print(f"[SysCRED] SSL/Connection error for {url}. Retrying without verification...")
                # Suppress warnings for unverified HTTPS request
                import urllib3
                urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
                response = self.session.get(url, timeout=timeout, allow_redirects=True, verify=False)
                response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract title
            title = soup.title.string.strip() if soup.title else None

            # Extract meta description
            meta_desc = soup.find('meta', attrs={'name': 'description'})
            meta_description = meta_desc.get('content', '') if meta_desc else None

            # Extract meta keywords
            meta_kw = soup.find('meta', attrs={'name': 'keywords'})
            meta_keywords = []
            if meta_kw and meta_kw.get('content'):
                meta_keywords = [k.strip() for k in meta_kw.get('content', '').split(',')]

            # Remove script and style elements
            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                element.decompose()

            # Extract main text content
            text_content = soup.get_text(separator=' ', strip=True)
            # Clean up excessive whitespace
            text_content = re.sub(r'\s+', ' ', text_content)

            # Extract links
            links = []
            for a_tag in soup.find_all('a', href=True)[:50]:  # Limit to 50 links
                href = a_tag['href']
                if href.startswith('http'):
                    links.append(href)

            return WebContent(
                url=url,
                title=title,
                text_content=text_content[:10000],  # Limit text size
                meta_description=meta_description,
                meta_keywords=meta_keywords,
                links=links,
                fetch_timestamp=timestamp,
                success=True
            )

        except requests.exceptions.Timeout:
            return WebContent(
                url=url, title=None, text_content="",
                meta_description=None, meta_keywords=[], links=[],
                fetch_timestamp=timestamp, success=False,
                error=f"Timeout after {timeout}s"
            )
        except requests.exceptions.RequestException as e:
            return WebContent(
                url=url, title=None, text_content="",
                meta_description=None, meta_keywords=[], links=[],
                fetch_timestamp=timestamp, success=False,
                error=str(e)
            )
        except Exception as e:
            return WebContent(
                url=url, title=None, text_content="",
                meta_description=None, meta_keywords=[], links=[],
                fetch_timestamp=timestamp, success=False,
                error=f"Parsing error: {str(e)}"
            )

    @lru_cache(maxsize=128)
    def whois_lookup(self, url_or_domain: str) -> DomainInfo:
        """
        Perform WHOIS lookup to get domain registration information.

        Args:
            url_or_domain: URL or domain name

        Returns:
            DomainInfo dataclass with domain details
        """
        # Extract domain from URL if needed
        if url_or_domain.startswith('http'):
            domain = urlparse(url_or_domain).netloc
        else:
            domain = url_or_domain

        # Remove 'www.' prefix
        if domain.startswith('www.'):
            domain = domain[4:]

        if not HAS_WHOIS:
            return DomainInfo(
                domain=domain,
                creation_date=None, expiration_date=None,
                registrar=None, age_days=None,
                success=False, error="python-whois not installed"
            )

        try:
            w = whois.whois(domain)

            # Handle creation_date (can be a list or single value)
            creation_date = w.creation_date
            if isinstance(creation_date, list):
                creation_date = creation_date[0]

            # Handle expiration_date
            expiration_date = w.expiration_date
            if isinstance(expiration_date, list):
                expiration_date = expiration_date[0]

            # Calculate age in days
            age_days = None
            if creation_date:
                if isinstance(creation_date, datetime):
                    age_days = (datetime.now() - creation_date).days

            return DomainInfo(
                domain=domain,
                creation_date=creation_date,
                expiration_date=expiration_date,
                registrar=w.registrar,
                age_days=age_days,
                success=True
            )

        except Exception as e:
            return DomainInfo(
                domain=domain,
                creation_date=None, expiration_date=None,
                registrar=None, age_days=None,
                success=False, error=str(e)
            )

    def google_fact_check(self, query: str, language: str = "fr") -> List[FactCheckResult]:
        """
        Query Google Fact Check Tools API.

        Args:
            query: The claim or text to check
            language: Language code (default: French)

        Returns:
            List of FactCheckResult objects
        """
        results = []

        if not self.google_api_key:
            print("[Info] Google Fact Check API key not configured. Using simulation.")
            return self._simulate_fact_check(query)

        try:
            api_url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"
            params = {
                'key': self.google_api_key,
                'query': query[:200],  # API has character limit
                # 'languageCode': language  # Removed to allow all languages (e.g. English queries)
            }

            response = self.session.get(api_url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()

            claims = data.get('claims', [])
            for claim in claims[:5]:  # Limit to 5 results
                text = claim.get('text', '')
                claimant = claim.get('claimant')

                for review in claim.get('claimReview', []):
                    results.append(FactCheckResult(
                        claim=text,
                        claimant=claimant,
                        rating=review.get('textualRating', 'Unknown'),
                        publisher=review.get('publisher', {}).get('name', 'Unknown'),
                        url=review.get('url', ''),
                        review_date=review.get('reviewDate')
                    ))

            return results

        except Exception as e:
            print(f"[Warning] Google Fact Check API error: {e}")
            return self._simulate_fact_check(query)

    def _simulate_fact_check(self, query: str) -> List[FactCheckResult]:
        """Fallback simulation when API is not available."""
        # Check for known misinformation patterns
        misinformation_keywords = [
            'conspiracy', 'hoax', 'fake', 'miracle cure', 'they don\'t want you to know',
            'mainstream media lies', 'deep state', 'plandemic'
        ]

        query_lower = query.lower()
        for keyword in misinformation_keywords:
            if keyword in query_lower:
                return [FactCheckResult(
                    claim=f"Text contains potential misinformation marker: '{keyword}'",
                    claimant=None,
                    rating="Needs Verification",
                    publisher="SysCRED Heuristic",
                    url="",
                    review_date=datetime.now().isoformat()
                )]

        return []  # No fact checks found

    @lru_cache(maxsize=128)
    def get_source_reputation(self, url: str) -> str:
        """
        Get reputation score for a source/domain.

        Args:
            url: URL or domain to check

        Returns:
            Reputation level: 'High', 'Medium', 'Low', or 'Unknown'
        """
        if url.startswith('http'):
            domain = urlparse(url).netloc
        else:
            domain = url

        # Remove www prefix
        if domain.startswith('www.'):
            domain = domain[4:]

        # Check known reputations
        for known_domain, reputation in self.known_reputations.items():
            if domain.endswith(known_domain) or known_domain in domain:
                return reputation

        # Heuristics for unknown domains
        # Academic domains tend to be more credible
        if domain.endswith('.edu') or domain.endswith('.gov') or domain.endswith('.ac.uk'):
            return 'High'

        # Personal sites and free hosting are less credible
        if any(x in domain for x in ['.blogspot.', '.wordpress.', '.wix.', '.weebly.']):
            return 'Low'

        return 'Unknown'

    def estimate_backlinks(self, url: str) -> Dict[str, Any]:
        """
        Estimate relative authority/backlinks based on available signals.

        Since real backlink databases (Ahrefs, Moz) are paid/proprietary,
        we use a composite heuristic based on:
        1. Domain age (older domains tend to have more backlinks)
        2. Known reputation (High reputation sources imply high backlinks)
        3. Google Fact Check mentions (as a proxy for visibility in fact-checks)
        """
        domain = urlparse(url).netloc
        if domain.startswith('www.'):
            domain = domain[4:]

        # 1. Base Score from Reputation
        reputation = self.get_source_reputation(domain)
        base_count = 0
        if reputation == 'High':
            base_count = 10000  # High authority
        elif reputation == 'Medium':
            base_count = 1000   # Medium authority
        elif reputation == 'Low':
            base_count = 50     # Low authority
        else:
            base_count = 100    # Unknown

        # 2. Multiplier from Domain Age
        age_multiplier = 1.0
        domain_info = self.whois_lookup(domain)
        if domain_info.success and domain_info.age_days:
            # Add 10% for every year of age, max 5x
            years = domain_info.age_days / 365
            age_multiplier = min(5.0, 1.0 + (years * 0.1))

        estimated_count = int(base_count * age_multiplier)

        # 3. Adjust for specific TLDs
        if domain.endswith('.edu') or domain.endswith('.gov'):
            estimated_count *= 2

        return {
            'estimated_count': estimated_count,
            'sample_backlinks': [],  # Real sample requires SERP API
            'method': 'heuristic_v2.1',
            'note': 'Estimated from domain age and reputation (Proxy)'
        }

    def fetch_external_data(self, input_data: str, fc_query: str = None) -> ExternalData:
        """
        Main method to fetch all external data for credibility analysis.
        This replaces the simulated fetch_external_data function.

        Args:
            input_data: URL or text to analyze

        Returns:
            ExternalData with all gathered information
        """
        from urllib.parse import urlparse

        # Determine if input is URL
        is_url = False
        try:
            result = urlparse(input_data)
            is_url = all([result.scheme, result.netloc])
        except:
            pass

        # Initialize results
        domain_age_days = None
        domain_info = None
        source_reputation = 'Unknown'
        fact_checks = []
        backlinks_data = {'estimated_count': 0, 'sample_backlinks': []}

        if is_url:
            # Get domain information
            domain_info = self.whois_lookup(input_data)
            if domain_info.success:
                domain_age_days = domain_info.age_days

            # Get source reputation
            source_reputation = self.get_source_reputation(input_data)

            # Get backlink estimation
            backlinks_data = self.estimate_backlinks(input_data)

        # Perform fact check on the content/URL
        # Use provided query or fall back to input_data
        query_to_use = fc_query if fc_query else input_data
        fact_checks = self.google_fact_check(query_to_use)

        return ExternalData(
            fact_checks=fact_checks,
            source_reputation=source_reputation,
            domain_age_days=domain_age_days,
            domain_info=domain_info,
            related_articles=[],  # TODO: Implement related article search
            backlinks_count=backlinks_data.get('estimated_count', 0),
            backlinks_sample=backlinks_data.get('sample_backlinks', [])
        )


# --- Testing ---
if __name__ == "__main__":
    print("=== Testing ExternalAPIClients ===\n")

    client = ExternalAPIClients()

    # Test 1: Web content fetching
    print("Test 1: Fetching web content from Le Monde...")
    content = client.fetch_web_content("https://www.lemonde.fr")
    print(f"  Success: {content.success}")
    print(f"  Title: {content.title}")
| 528 |
+
print(f" Text length: {len(content.text_content)} chars")
|
| 529 |
+
print(f" Links found: {len(content.links)}")
|
| 530 |
+
print()
|
| 531 |
+
|
| 532 |
+
# Test 2: WHOIS lookup
|
| 533 |
+
print("Test 2: WHOIS lookup for lemonde.fr...")
|
| 534 |
+
domain_info = client.whois_lookup("https://www.lemonde.fr")
|
| 535 |
+
print(f" Success: {domain_info.success}")
|
| 536 |
+
print(f" Domain: {domain_info.domain}")
|
| 537 |
+
print(f" Age: {domain_info.age_days} days")
|
| 538 |
+
print(f" Registrar: {domain_info.registrar}")
|
| 539 |
+
print()
|
| 540 |
+
|
| 541 |
+
# Test 3: Source reputation
|
| 542 |
+
print("Test 3: Source reputation checks...")
|
| 543 |
+
test_urls = [
|
| 544 |
+
"https://www.nytimes.com/article",
|
| 545 |
+
"https://www.infowars.com/post",
|
| 546 |
+
"https://random-blog.wordpress.com"
|
| 547 |
+
]
|
| 548 |
+
for url in test_urls:
|
| 549 |
+
rep = client.get_source_reputation(url)
|
| 550 |
+
print(f" {url}: {rep}")
|
| 551 |
+
print()
|
| 552 |
+
|
| 553 |
+
# Test 4: Full external data
|
| 554 |
+
print("Test 4: Full external data fetch...")
|
| 555 |
+
external_data = client.fetch_external_data("https://www.bbc.com/news")
|
| 556 |
+
print(f" Source reputation: {external_data.source_reputation}")
|
| 557 |
+
print(f" Domain age: {external_data.domain_age_days} days")
|
| 558 |
+
print(f" Fact checks found: {len(external_data.fact_checks)}")
|
| 559 |
+
|
| 560 |
+
print("\n=== Tests Complete ===")
|
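The backlink count returned above is only a proxy; the arithmetic reduces to a reputation base times an age multiplier times a TLD bonus. The numbers below are hypothetical and simply trace that computation, without any WHOIS or API call:

# Hypothetical trace of the estimate_backlinks heuristic (no network access).
base_count = 10000                                        # 'High' reputation tier
age_days = 10 * 365                                       # assume a ten-year-old domain
age_multiplier = min(5.0, 1.0 + (age_days / 365) * 0.1)   # 1.0 + 10 * 0.1 = 2.0 (capped at 5.0)
estimated_count = int(base_count * age_multiplier)        # 20000
estimated_count *= 2                                      # .edu/.gov bonus -> 40000
print(estimated_count)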
syscred/backend_app.py
ADDED
@@ -0,0 +1,363 @@
# -*- coding: utf-8 -*-
"""
SysCRED Backend API - Flask Server
===================================
REST API for the credibility verification system.

Endpoints:
- POST /api/verify - Verify URL or text credibility
- POST /api/seo - Get SEO analysis only
- GET /api/ontology/stats - Get ontology statistics
- GET /api/health - Health check
- GET /api/config - View current configuration

(c) Dominique S. Loyer - PhD Thesis Prototype
"""

import sys
import os
import traceback
from flask import Flask, request, jsonify, send_from_directory
from flask_cors import CORS

# Add syscred package to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Import SysCRED modules
try:
    from syscred.verification_system import CredibilityVerificationSystem
    from syscred.seo_analyzer import SEOAnalyzer
    from syscred.ontology_manager import OntologyManager
    from syscred.config import config, Config
    from syscred.database import init_db, db, AnalysisResult
    SYSCRED_AVAILABLE = True
    print("[SysCRED Backend] Modules imported successfully")
except ImportError as e:
    SYSCRED_AVAILABLE = False
    print(f"[SysCRED Backend] Warning: Could not import modules: {e}")
    # Define dummy init_db to prevent crash
    def init_db(app): pass

    # Fallback config
    class Config:
        HOST = "0.0.0.0"
        PORT = 5000
        DEBUG = True
        ONTOLOGY_BASE_PATH = None
        ONTOLOGY_DATA_PATH = None
        LOAD_ML_MODELS = True
        GOOGLE_FACT_CHECK_API_KEY = None
    config = Config()

# --- Initialize Flask App ---
app = Flask(__name__)
CORS(app)  # Enable CORS for frontend

# Initialize Database
try:
    init_db(app)  # [NEW] Setup DB connection
except Exception as e:
    print(f"[SysCRED Backend] Warning: DB init failed: {e}")

# --- Initialize SysCRED System ---
credibility_system = None
seo_analyzer = None

def initialize_system():
    """Initialize the credibility system (lazy loading)."""
    global credibility_system, seo_analyzer

    if not SYSCRED_AVAILABLE:
        print("[SysCRED Backend] Cannot initialize - modules not available")
        return False

    try:
        # Initialize SEO analyzer (lightweight)
        seo_analyzer = SEOAnalyzer()
        print("[SysCRED Backend] SEO Analyzer initialized")

        # Initialize full system (may take time to load ML models)
        print("[SysCRED Backend] Initializing credibility system (loading ML models)...")
        ontology_base = str(config.ONTOLOGY_BASE_PATH) if config.ONTOLOGY_BASE_PATH else None
        ontology_data = str(config.ONTOLOGY_DATA_PATH) if config.ONTOLOGY_DATA_PATH else None
        credibility_system = CredibilityVerificationSystem(
            ontology_base_path=ontology_base if ontology_base and os.path.exists(ontology_base) else None,
            ontology_data_path=ontology_data,
            load_ml_models=config.LOAD_ML_MODELS,
            google_api_key=config.GOOGLE_FACT_CHECK_API_KEY
        )
        print("[SysCRED Backend] System initialized successfully!")
        return True

    except Exception as e:
        print(f"[SysCRED Backend] Error initializing system: {e}")
        traceback.print_exc()
        return False

# --- API Routes ---

@app.route('/')
def index():
    """Serve the frontend."""
    return send_from_directory('static', 'index.html')


@app.route('/api/health', methods=['GET'])
def health_check():
    """Health check endpoint."""
    return jsonify({
        'status': 'healthy',
        'syscred_available': SYSCRED_AVAILABLE,
        'system_initialized': credibility_system is not None,
        'seo_analyzer_ready': seo_analyzer is not None
    })


@app.route('/api/verify', methods=['POST'])
def verify_endpoint():
    """
    Main verification endpoint.

    Request JSON:
    {
        "input_data": "URL or text to verify",
        "include_seo": true/false (optional, default true),
        "include_pagerank": true/false (optional, default true)
    }
    """
    global credibility_system

    # Lazy initialization
    if credibility_system is None:
        if not initialize_system():
            return jsonify({
                'error': 'System initialization failed. Check server logs.'
            }), 503

    # Validate request
    if not request.is_json:
        return jsonify({'error': 'Request must be JSON'}), 400

    data = request.get_json()
    input_data = data.get('input_data', '').strip()

    if not input_data:
        return jsonify({'error': "'input_data' is required"}), 400

    include_seo = data.get('include_seo', True)
    include_pagerank = data.get('include_pagerank', True)

    print(f"[SysCRED Backend] Verifying: {input_data[:100]}...")

    try:
        # Run main verification
        result = credibility_system.verify_information(input_data)

        if 'error' in result:
            return jsonify(result), 400

        # Add SEO analysis if requested and it's a URL
        if include_seo and credibility_system.is_url(input_data):
            try:
                web_content = credibility_system.api_clients.fetch_web_content(input_data)
                if web_content.success:
                    seo_result = seo_analyzer.analyze_seo(
                        url=input_data,
                        title=web_content.title,
                        meta_description=web_content.meta_description,
                        text_content=web_content.text_content
                    )
                    result['seoAnalysis'] = {
                        'titleLength': seo_result.title_length,
                        'titleHasKeywords': seo_result.title_has_keywords,
                        'metaDescriptionLength': seo_result.meta_description_length,
                        'wordCount': seo_result.word_count,
                        'readabilityScore': round(seo_result.readability_score, 2),
                        'seoScore': round(seo_result.seo_score, 2),
                        'topKeywords': list(seo_result.keyword_density.keys())
                    }
            except Exception as e:
                print(f"[SysCRED Backend] SEO analysis error: {e}")
                result['seoAnalysis'] = {'error': str(e)}

        # Add PageRank estimation if requested
        if include_pagerank and credibility_system.is_url(input_data):
            try:
                external_data = credibility_system.api_clients.fetch_external_data(input_data)
                pr_result = seo_analyzer.estimate_pagerank(
                    url=input_data,
                    domain_age_days=external_data.domain_age_days,
                    source_reputation=external_data.source_reputation
                )
                result['pageRankEstimation'] = {
                    'estimatedPR': round(pr_result.estimated_pr, 3),
                    'confidence': round(pr_result.confidence, 2),
                    'factors': pr_result.factors,
                    'explanation': pr_result.explanation_text
                }
            except Exception as e:
                print(f"[SysCRED Backend] PageRank estimation error: {e}")
                result['pageRankEstimation'] = {'error': str(e)}

        print(f"[SysCRED Backend] Score: {result.get('scoreCredibilite', 'N/A')}")

        # [NEW] Persist to Database
        try:
            new_analysis = AnalysisResult(
                url=input_data[:500],
                credibility_score=result.get('scoreCredibilite', 0.5),
                summary=result.get('resumeAnalyse', ''),
                source_reputation=result.get('detailsScore', {}).get('factors', [{}])[0].get('value')
            )
            db.session.add(new_analysis)
            db.session.commit()
            print(f"[SysCRED-DB] Result saved. ID: {new_analysis.id}")
        except Exception as e:
            print(f"[SysCRED-DB] Save failed: {e}")

        return jsonify(result), 200

    except Exception as e:
        print(f"[SysCRED Backend] Error: {e}")
        traceback.print_exc()
        return jsonify({'error': f'Internal error: {str(e)}'}), 500


@app.route('/api/seo', methods=['POST'])
def seo_endpoint():
    """
    SEO-only analysis endpoint (faster, no ML models needed).

    Request JSON:
    {
        "url": "URL to analyze"
    }
    """
    global seo_analyzer

    if seo_analyzer is None:
        seo_analyzer = SEOAnalyzer()

    if not request.is_json:
        return jsonify({'error': 'Request must be JSON'}), 400

    data = request.get_json()
    url = data.get('url', '').strip()

    if not url or not url.startswith('http'):
        return jsonify({'error': 'Valid URL is required'}), 400

    try:
        # Fetch content
        from syscred.api_clients import ExternalAPIClients
        api_client = ExternalAPIClients()

        web_content = api_client.fetch_web_content(url)
        if not web_content.success:
            return jsonify({'error': f'Failed to fetch URL: {web_content.error}'}), 400

        # SEO analysis
        seo_result = seo_analyzer.analyze_seo(
            url=url,
            title=web_content.title,
            meta_description=web_content.meta_description,
            text_content=web_content.text_content
        )

        # IR metrics
        ir_metrics = seo_analyzer.get_ir_metrics(web_content.text_content)

        # PageRank estimation
        external_data = api_client.fetch_external_data(url)
        pr_result = seo_analyzer.estimate_pagerank(
            url=url,
            domain_age_days=external_data.domain_age_days,
            source_reputation=external_data.source_reputation
        )

        return jsonify({
            'url': url,
            'title': web_content.title,
            'seo': {
                'titleLength': seo_result.title_length,
                'metaDescriptionLength': seo_result.meta_description_length,
                'wordCount': seo_result.word_count,
                'readabilityScore': round(seo_result.readability_score, 2),
                'seoScore': round(seo_result.seo_score, 2),
                'keywordDensity': seo_result.keyword_density
            },
            'irMetrics': {
                'documentLength': ir_metrics.document_length,
                'topTerms': ir_metrics.top_terms[:5],
                'avgTermFrequency': round(ir_metrics.avg_term_frequency, 4)
            },
            'pageRank': {
                'estimated': round(pr_result.estimated_pr, 3),
                'confidence': round(pr_result.confidence, 2),
                'factors': pr_result.factors
            },
            'domain': {
                'reputation': external_data.source_reputation,
                'ageDays': external_data.domain_age_days
            }
        }), 200

    except Exception as e:
        print(f"[SysCRED Backend] SEO endpoint error: {e}")
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500


@app.route('/api/ontology/graph', methods=['GET'])
def ontology_graph():
    """Get ontology graph data for D3.js."""
    global credibility_system

    if credibility_system and credibility_system.ontology_manager:
        graph_data = credibility_system.ontology_manager.get_graph_json()
        return jsonify(graph_data), 200
    else:
        # Return empty graph rather than 400 to avoid breaking frontend
        return jsonify({'nodes': [], 'links': []}), 200


@app.route('/api/ontology/stats', methods=['GET'])
def ontology_stats():
    """Get ontology statistics."""
    global credibility_system

    if credibility_system and credibility_system.ontology_manager:
        stats = credibility_system.ontology_manager.get_statistics()
        return jsonify(stats), 200
    else:
        return jsonify({
            'error': 'Ontology not loaded',
            'base_triples': 0,
            'data_triples': 0
        }), 200


# --- Main ---
if __name__ == '__main__':
    print("=" * 60)
    print("SysCRED Backend API Server")
    print("(c) Dominique S. Loyer - PhD Thesis Prototype")
    print("=" * 60)
    print()

    # Initialize system at startup
    print("[SysCRED Backend] Pre-initializing system...")
    initialize_system()

    print()
    print("[SysCRED Backend] Starting Flask server...")
    print("[SysCRED Backend] Endpoints:")
    print("  - POST /api/verify - Full credibility verification")
    print("  - POST /api/seo - SEO analysis only (faster)")
    print("  - GET /api/ontology/stats - Ontology statistics")
    print("  - GET /api/health - Health check")
    print()

    app.run(host='0.0.0.0', port=5001, debug=True)
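For a quick check of the /api/verify route, a minimal client call might look like the sketch below, assuming the server is running locally on port 5001 as in the __main__ block above; the request fields follow the schema documented in verify_endpoint, and scoreCredibilite is read from the JSON response when it is present:

import requests

# Minimal sketch of a client call; host/port and timeout are assumptions.
resp = requests.post(
    "http://localhost:5001/api/verify",
    json={"input_data": "https://www.bbc.com/news", "include_seo": True, "include_pagerank": True},
    timeout=300,
)
print(resp.status_code)
print(resp.json().get("scoreCredibilite"))  # overall credibility score, when returned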
syscred/benchmark_data.json
ADDED
@@ -0,0 +1,92 @@
[
  {
    "url": "https://www.lemonde.fr",
    "label": "High",
    "expected_score_range": [0.7, 1.0],
    "category": "News (General)"
  },
  {
    "url": "https://www.bbc.com",
    "label": "High",
    "expected_score_range": [0.7, 1.0],
    "category": "News (International)"
  },
  {
    "url": "https://www.nature.com",
    "label": "High",
    "expected_score_range": [0.8, 1.0],
    "category": "Science"
  },
  {
    "url": "https://www.who.int",
    "label": "High",
    "expected_score_range": [0.8, 1.0],
    "category": "Health/Institution"
  },
  {
    "url": "https://www.reuters.com",
    "label": "High",
    "expected_score_range": [0.7, 1.0],
    "category": "News (Agency)"
  },
  {
    "url": "https://www.infowars.com",
    "label": "Low",
    "expected_score_range": [0.0, 0.4],
    "category": "Conspiracy"
  },
  {
    "url": "https://www.naturalnews.com",
    "label": "Low",
    "expected_score_range": [0.0, 0.4],
    "category": "Pseudoscience"
  },
  {
    "url": "https://truthsocial.com",
    "label": "Low",
    "expected_score_range": [0.0, 0.5],
    "category": "Social/Biased"
  },
  {
    "url": "https://www.theflatearthsociety.org",
    "label": "Low",
    "expected_score_range": [0.0, 0.4],
    "category": "Conspiracy"
  },
  {
    "url": "https://beforeitsnews.com",
    "label": "Low",
    "expected_score_range": [0.0, 0.4],
    "category": "Fake News"
  }
]
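Each benchmark entry pairs a URL with an expected score interval, so a harness only needs to compare a computed score against [low, high]. The sketch below assumes a hypothetical score_url callable standing in for the real verification pipeline:

import json

def score_url(url: str) -> float:
    # Hypothetical scorer; in SysCRED the score would come from the verification system.
    return 0.8

with open("syscred/benchmark_data.json") as f:
    benchmark = json.load(f)

for entry in benchmark:
    low, high = entry["expected_score_range"]
    score = score_url(entry["url"])
    status = "PASS" if low <= score <= high else "FAIL"
    print(f"{entry['url']}: {score:.2f} expected [{low}, {high}] -> {status}")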
syscred/config.py
ADDED
@@ -0,0 +1,291 @@
# -*- coding: utf-8 -*-
"""
SysCRED Configuration
=====================
Centralized configuration for the credibility verification system.

Usage:
    from syscred.config import Config

    # Access settings
    config = Config()
    port = config.PORT

    # Or with environment variables
    # export SYSCRED_GOOGLE_API_KEY=your_key
    # export SYSCRED_PORT=8080

(c) Dominique S. Loyer - PhD Thesis Prototype
"""

import os
from pathlib import Path
from typing import Dict, Optional
from dotenv import load_dotenv

# Load variables from .env (project root)
# Path: .../systemFactChecking/02_Code/syscred/config.py
# Root .env is at .../systemFactChecking/.env (3 levels up)
current_path = Path(__file__).resolve()
env_path = current_path.parent.parent.parent / '.env'

if not env_path.exists():
    print(f"[Config] WARNING: .env not found at {env_path}")
    # Try alternate location (sometimes CWD matters)
    env_path = Path.cwd().parent / '.env'

load_dotenv(dotenv_path=env_path)
print(f"[Config] Loading .env from {env_path}")
print(f"[Config] SYSCRED_GOOGLE_API_KEY loaded: {'Yes' if os.environ.get('SYSCRED_GOOGLE_API_KEY') else 'No'}")


class Config:
    """
    Centralized configuration for SysCRED.

    Values can be overridden by environment variables
    prefixed with SYSCRED_.
    """

    # === Paths ===
    BASE_DIR = Path(__file__).parent.parent
    ONTOLOGY_BASE_PATH = BASE_DIR / "sysCRED_onto26avrtil.ttl"
    ONTOLOGY_DATA_PATH = BASE_DIR / "ontology" / "sysCRED_data.ttl"

    # === Flask server ===
    HOST = os.getenv("SYSCRED_HOST", "0.0.0.0")
    PORT = int(os.getenv("SYSCRED_PORT", "5000"))
    DEBUG = os.getenv("SYSCRED_DEBUG", "true").lower() == "true"

    # === API keys ===
    GOOGLE_FACT_CHECK_API_KEY = os.getenv("SYSCRED_GOOGLE_API_KEY")
    DATABASE_URL = os.getenv("DATABASE_URL")  # [NEW] Read DB URL from env

    # === ML models ===
    # Support both SYSCRED_LOAD_ML and SYSCRED_LOAD_ML_MODELS (for Render)
    LOAD_ML_MODELS = os.getenv("SYSCRED_LOAD_ML_MODELS", os.getenv("SYSCRED_LOAD_ML", "true")).lower() == "true"
    SENTIMENT_MODEL = "distilbert-base-uncased-finetuned-sst-2-english"
    NER_MODEL = "dbmdz/bert-large-cased-finetuned-conll03-english"

    # === Timeouts ===
    WEB_FETCH_TIMEOUT = int(os.getenv("SYSCRED_TIMEOUT", "10"))

    # === TREC IR Configuration (NEW - Feb 2026) ===
    TREC_INDEX_PATH = os.getenv("SYSCRED_TREC_INDEX", None)    # Lucene/Pyserini index
    TREC_CORPUS_PATH = os.getenv("SYSCRED_TREC_CORPUS", None)  # JSONL corpus
    TREC_TOPICS_PATH = os.getenv("SYSCRED_TREC_TOPICS", None)  # Topics directory
    TREC_QRELS_PATH = os.getenv("SYSCRED_TREC_QRELS", None)    # Qrels directory

    # BM25 parameters (optimized on AP88-90)
    BM25_K1 = float(os.getenv("SYSCRED_BM25_K1", "0.9"))
    BM25_B = float(os.getenv("SYSCRED_BM25_B", "0.4"))

    # PRF (Pseudo-Relevance Feedback) settings
    ENABLE_PRF = os.getenv("SYSCRED_ENABLE_PRF", "true").lower() == "true"
    PRF_TOP_DOCS = int(os.getenv("SYSCRED_PRF_TOP_DOCS", "3"))
    PRF_EXPANSION_TERMS = int(os.getenv("SYSCRED_PRF_TERMS", "10"))

    # === Score weights ===
    SCORE_WEIGHTS = {
        'source_reputation': 0.25,
        'domain_age': 0.10,
        'sentiment_neutrality': 0.15,
        'entity_presence': 0.15,
        'coherence': 0.15,
        'fact_check': 0.20
    }

    # === Credibility thresholds ===
    CREDIBILITY_THRESHOLDS = {
        'HIGH': 0.7,
        'MEDIUM': 0.4,
        'LOW': 0.0
    }

    # === Reputation database ===
    # Sources can be extended or loaded from an external file
    SOURCE_REPUTATIONS: Dict[str, str] = {
        # === HIGH CREDIBILITY ===
        # International media
        'lemonde.fr': 'High',
        'nytimes.com': 'High',
        'reuters.com': 'High',
        'bbc.com': 'High',
        'bbc.co.uk': 'High',
        'theguardian.com': 'High',
        'apnews.com': 'High',
        'afp.com': 'High',
        'france24.com': 'High',

        # Canadian media
        'cbc.ca': 'High',
        'radio-canada.ca': 'High',
        'lapresse.ca': 'High',
        'ledevoir.com': 'High',
        'theglobeandmail.com': 'High',

        # Academic sources
        'nature.com': 'High',
        'sciencedirect.com': 'High',
        'scholar.google.com': 'High',
        'pubmed.ncbi.nlm.nih.gov': 'High',
        'jstor.org': 'High',
        'springer.com': 'High',
        'ieee.org': 'High',
        'acm.org': 'High',
        'arxiv.org': 'High',

        # Fact-checkers
        'factcheck.org': 'High',
        'snopes.com': 'High',
        'politifact.com': 'High',
        'fullfact.org': 'High',
        'checknews.fr': 'High',

        # Institutions
        'who.int': 'High',
        'un.org': 'High',
        'europa.eu': 'High',
        'canada.ca': 'High',
        'gouv.fr': 'High',
        'gouv.qc.ca': 'High',

        # === MEDIUM CREDIBILITY ===
        'wikipedia.org': 'Medium',
        'medium.com': 'Medium',
        'huffpost.com': 'Medium',
        'buzzfeed.com': 'Medium',
        'vice.com': 'Medium',
        'slate.com': 'Medium',
        'theconversation.com': 'Medium',

        # === LOW CREDIBILITY ===
        'infowars.com': 'Low',
        'naturalnews.com': 'Low',
        'breitbart.com': 'Low',
        'dailystormer.su': 'Low',
        'beforeitsnews.com': 'Low',
        'worldtruth.tv': 'Low',
        'yournewswire.com': 'Low',
    }

    # === Misinformation patterns ===
    MISINFORMATION_KEYWORDS = [
        'conspiracy', 'hoax', 'fake news', 'miracle cure',
        "they don't want you to know", 'mainstream media lies',
        'deep state', 'plandemic', 'wake up sheeple',
        'big pharma cover-up', 'government conspiracy',
        'censored truth', 'what they hide'
    ]

    @classmethod
    def load_external_reputations(cls, filepath: str) -> None:
        """
        Load additional reputations from a JSON file.

        Args:
            filepath: Path to a JSON file with the format:
                      {"domain.com": "High", "autre.com": "Low"}
        """
        import json
        try:
            with open(filepath, 'r') as f:
                external_reps = json.load(f)
            cls.SOURCE_REPUTATIONS.update(external_reps)
            print(f"[Config] Loaded {len(external_reps)} external reputations")
        except Exception as e:
            print(f"[Config] Could not load external reputations: {e}")

    @classmethod
    def update_weights(cls, new_weights: Dict[str, float]) -> None:
        """
        Update the score weights.

        Args:
            new_weights: Dictionary with the new weights
        """
        cls.SCORE_WEIGHTS.update(new_weights)
        # Normalize so that the weights sum to 1
        total = sum(cls.SCORE_WEIGHTS.values())
        cls.SCORE_WEIGHTS = {k: v/total for k, v in cls.SCORE_WEIGHTS.items()}
        print(f"[Config] Updated weights: {cls.SCORE_WEIGHTS}")

    @classmethod
    def to_dict(cls) -> Dict:
        """Export the current configuration as a dictionary."""
        return {
            'host': cls.HOST,
            'port': cls.PORT,
            'debug': cls.DEBUG,
            'google_api_configured': cls.GOOGLE_FACT_CHECK_API_KEY is not None,
            'ml_models_enabled': cls.LOAD_ML_MODELS,
            'score_weights': cls.SCORE_WEIGHTS,
            'known_sources_count': len(cls.SOURCE_REPUTATIONS),
            'ontology_base': str(cls.ONTOLOGY_BASE_PATH),
            'ontology_data': str(cls.ONTOLOGY_DATA_PATH),
        }

    @classmethod
    def print_config(cls) -> None:
        """Print the current configuration."""
        print("=" * 50)
        print("SysCRED Configuration")
        print("=" * 50)
        for key, value in cls.to_dict().items():
            print(f"  {key}: {value}")
        print("=" * 50)


# === Per-environment configuration ===

class DevelopmentConfig(Config):
    """Configuration for local development."""
    DEBUG = True
    LOAD_ML_MODELS = True


class ProductionConfig(Config):
    """Configuration for production."""
    DEBUG = False
    LOAD_ML_MODELS = True
    HOST = "0.0.0.0"


class TestingConfig(Config):
    """Configuration for tests."""
    DEBUG = True
    LOAD_ML_MODELS = False  # Faster for tests
    WEB_FETCH_TIMEOUT = 5


# Automatic configuration selection
def get_config() -> Config:
    """
    Return the appropriate configuration for the environment.

    Environment variable: SYSCRED_ENV (development, production, testing)
    """
    env = os.getenv("SYSCRED_ENV", "development").lower()

    configs = {
        'development': DevelopmentConfig,
        'production': ProductionConfig,
        'testing': TestingConfig,
    }

    return configs.get(env, DevelopmentConfig)


# Default instance
config = get_config()


if __name__ == "__main__":
    # Configuration test
    config.print_config()

    print("\n=== Source Reputations Sample ===")
    for domain, rep in list(config.SOURCE_REPUTATIONS.items())[:10]:
        print(f"  {domain}: {rep}")
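Because every setting is read from os.getenv at import time, environment overrides must be in place before syscred.config is imported. A minimal sketch of that override mechanism, with placeholder values:

import os

# Placeholder values; set these before importing syscred.config.
os.environ["SYSCRED_ENV"] = "testing"          # selects TestingConfig (ML models disabled)
os.environ["SYSCRED_PORT"] = "8080"
os.environ["SYSCRED_GOOGLE_API_KEY"] = "your_key_here"

from syscred.config import config
config.print_config()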
syscred/database.py
ADDED
@@ -0,0 +1,54 @@
# -*- coding: utf-8 -*-
"""
Database Manager for SysCRED
============================
Handles connection to Supabase (PostgreSQL) and defines models.
"""

import os
from flask_sqlalchemy import SQLAlchemy
from datetime import datetime

# Initialize SQLAlchemy
db = SQLAlchemy()

class AnalysisResult(db.Model):
    """Stores the result of a credibility analysis."""
    __tablename__ = 'analysis_results'

    id = db.Column(db.Integer, primary_key=True)
    url = db.Column(db.String(500), nullable=False)
    credibility_score = db.Column(db.Float, nullable=False)
    summary = db.Column(db.Text)
    created_at = db.Column(db.DateTime, default=datetime.utcnow)

    # Metadata stored as JSON if supported, or simplified columns
    source_reputation = db.Column(db.String(50))
    fact_check_count = db.Column(db.Integer, default=0)

    def to_dict(self):
        return {
            'id': self.id,
            'url': self.url,
            'score': self.credibility_score,
            'summary': self.summary,
            'created_at': self.created_at.isoformat(),
            'source_reputation': self.source_reputation
        }

def init_db(app):
    """Initialize the database with the Flask app."""
    # Fallback to sqlite for local dev if no DATABASE_URL
    db_url = os.environ.get('DATABASE_URL')
    if db_url and db_url.startswith("postgres://"):
        db_url = db_url.replace("postgres://", "postgresql://", 1)

    app.config['SQLALCHEMY_DATABASE_URI'] = db_url or 'sqlite:///syscred.db'
    app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False

    db.init_app(app)

    # Create tables if they don't exist (basic migration)
    with app.app_context():
        db.create_all()
        print("[SysCRED-DB] Database tables initialized.")
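Outside the Flask API, the persisted rows can be read back under the same app context; a minimal sketch, assuming the SQLite fallback used by init_db when DATABASE_URL is unset:

from flask import Flask
from syscred.database import init_db, db, AnalysisResult

app = Flask(__name__)
init_db(app)  # falls back to sqlite:///syscred.db without DATABASE_URL

with app.app_context():
    # Five most recent analyses, serialized via to_dict()
    recent = AnalysisResult.query.order_by(AnalysisResult.created_at.desc()).limit(5).all()
    for row in recent:
        print(row.to_dict())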
syscred/debug_factcheck.py
ADDED
@@ -0,0 +1,43 @@
import os
import requests
from dotenv import load_dotenv

# Load environment variables
load_dotenv(dotenv_path='/Users/bk280625/documents041025/MonCode/syscred/.env')

API_KEY = os.getenv('SYSCRED_GOOGLE_API_KEY')
print(f"Loaded API Key: {API_KEY[:5] + '...' + API_KEY[-5:] if API_KEY else 'None'}")

if not API_KEY:
    print("❌ Error: API Key not found in .env")
    exit(1)

query = "La terre est plate"
url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"
params = {
    'key': API_KEY,
    'query': query,
}

print(f"\nSending request for query: '{query}'...")
try:
    response = requests.get(url, params=params)
    print(f"Status Code: {response.status_code}")

    if response.status_code == 200:
        data = response.json()
        claims = data.get('claims', [])
        print(f"✅ Success! Found {len(claims)} claims.")
        for i, claim in enumerate(claims[:3]):
            print(f"\n--- Result {i+1} ---")
            print(f"Claim: {claim.get('text')}")
            print(f"Claimant: {claim.get('claimant')}")
            reviews = claim.get('claimReview', [])
            if reviews:
                print(f"Rating: {reviews[0].get('textualRating')}")
                print(f"URL: {reviews[0].get('url')}")
    else:
        print(f"❌ API Error: {response.text}")

except Exception as e:
    print(f"❌ Connection Error: {e}")
syscred/debug_graph_json.py
ADDED
@@ -0,0 +1,58 @@
import sys
from pathlib import Path
import json

# Add project root to path (one level up from this script)
sys.path.append(str(Path(__file__).parent.parent))

from syscred.ontology_manager import OntologyManager
from syscred.config import config

def debug_graph():
    print("=== Debugging Ontology Graph Extraction ===")

    # Initialize manager
    base_path = str(config.ONTOLOGY_BASE_PATH)
    data_path = str(config.ONTOLOGY_DATA_PATH)

    print(f"Loading data from: {data_path}")
    manager = OntologyManager(base_ontology_path=base_path, data_path=data_path)

    # Get stats
    stats = manager.get_statistics()
    print(f"Total Triples: {stats['total_triples']}")
    print(f"Evaluations: {stats.get('total_evaluations', 'N/A')}")

    # Try getting graph JSON
    print("\nExtracting Graph JSON...")
    graph_data = manager.get_graph_json()

    nodes = graph_data.get('nodes', [])
    links = graph_data.get('links', [])

    print(f"Nodes found: {len(nodes)}")
    print(f"Links found: {len(links)}")

    if len(nodes) > 0:
        print("\n--- Sample Nodes ---")
        for n in nodes[:3]:
            print(json.dumps(n, indent=2))
    else:
        print("\n❌ No nodes found! Checking latest report query...")
        # Manually run the query to see what's wrong
        query = """
        PREFIX cred: <http://www.dic9335.uqam.ca/ontologies/credibility-verification#>
        SELECT ?report ?timestamp WHERE {
            ?report a cred:RapportEvaluation .
            ?report cred:completionTimestamp ?timestamp .
        }
        ORDER BY DESC(?timestamp)
        LIMIT 5
        """
        print(f"Running SPARQL:\n{query}")
        results = manager.data_graph.query(query)
        for row in results:
            print(f"Found Report: {row.report} at {row.timestamp}")

if __name__ == "__main__":
    debug_graph()
syscred/debug_init.py
ADDED
@@ -0,0 +1,33 @@
import sys
import os
import traceback

# Setup path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from syscred.verification_system import CredibilityVerificationSystem
from syscred.config import config
from syscred.seo_analyzer import SEOAnalyzer

print("=== DEBUG INITIALIZATION ===")
try:
    print("[1] Config check:")
    print(f"    Base Ontology: {config.ONTOLOGY_BASE_PATH}")
    print(f"    Data Path: {config.ONTOLOGY_DATA_PATH}")

    print("\n[2] Initializing SEO Analyzer...")
    seo = SEOAnalyzer()
    print("    OK")

    print("\n[3] Initializing Verification System...")
    system = CredibilityVerificationSystem(
        ontology_base_path=config.ONTOLOGY_BASE_PATH,
        ontology_data_path=config.ONTOLOGY_DATA_PATH,
        load_ml_models=False  # Disable ML for basic init test
    )
    print("    OK - System initialized successfully.")

except Exception as e:
    print(f"\n❌ FATAL ERROR: {e}")
    traceback.print_exc()
syscred/debug_local_server.py
ADDED
@@ -0,0 +1,25 @@
import requests
import json

url = "http://localhost:5001/api/verify"
payload = {
    "input_data": "la terre est plate",
    "include_seo": True
}
headers = {'Content-Type': 'application/json'}

try:
    print(f"Sending POST to {url} with payload: {payload}")
    response = requests.post(url, json=payload, headers=headers)
    print(f"Status: {response.status_code}")

    if response.status_code == 200:
        data = response.json()
        print("\n--- JSON RESPONSE PARTIAL ---")
        facts = data.get('reglesAppliquees', {}).get('fact_checking', [])
        print(f"Fact Checks Count: {len(facts)}")
        print("Fact Checks Items:", json.dumps(facts, indent=2, ensure_ascii=False))
    else:
        print("Error:", response.text)
except Exception as e:
    print(f"Connection failed: {e}")
syscred/diagnose_imports.py
ADDED
@@ -0,0 +1,37 @@
import sys
import os
import traceback

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

print("--- DIAGNOSTIC START ---")
try:
    print("[1] Importing config...")
    from syscred.config import config
    print("    OK")
except Exception:
    traceback.print_exc()

try:
    print("[2] Importing database...")
    from syscred.database import init_db
    print("    OK")
except Exception:
    traceback.print_exc()

try:
    print("[3] Importing ontology_manager...")
    from syscred.ontology_manager import OntologyManager
    print("    OK")
except Exception:
    traceback.print_exc()

try:
    print("[4] Importing verification_system...")
    from syscred.verification_system import CredibilityVerificationSystem
    print("    OK")
except Exception:
    traceback.print_exc()

print("--- DIAGNOSTIC END ---")
syscred/eval_metrics.py
ADDED
|
@@ -0,0 +1,349 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Evaluation Metrics Module - SysCRED
|
| 4 |
+
====================================
|
| 5 |
+
Information Retrieval evaluation metrics for TREC-style experiments.
|
| 6 |
+
|
| 7 |
+
Metrics:
|
| 8 |
+
- MAP (Mean Average Precision)
|
| 9 |
+
- NDCG (Normalized Discounted Cumulative Gain)
|
| 10 |
+
- P@K (Precision at K)
|
| 11 |
+
- Recall@K
|
| 12 |
+
- MRR (Mean Reciprocal Rank)
|
| 13 |
+
|
| 14 |
+
Based on pytrec_eval for official TREC evaluation.
|
| 15 |
+
|
| 16 |
+
(c) Dominique S. Loyer - PhD Thesis Prototype
|
| 17 |
+
Citation Key: loyerEvaluationModelesRecherche2025
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
import math
|
| 21 |
+
from typing import Dict, List, Tuple, Any
|
| 22 |
+
from collections import defaultdict
|
| 23 |
+
|
| 24 |
+
# Check for pytrec_eval
|
| 25 |
+
try:
|
| 26 |
+
import pytrec_eval
|
| 27 |
+
HAS_PYTREC_EVAL = True
|
| 28 |
+
except ImportError:
|
| 29 |
+
HAS_PYTREC_EVAL = False
|
| 30 |
+
print("[EvalMetrics] pytrec_eval not installed. Using built-in metrics.")
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class EvaluationMetrics:
|
| 34 |
+
"""
|
| 35 |
+
IR Evaluation metrics using pytrec_eval or built-in implementations.
|
| 36 |
+
|
| 37 |
+
Supports TREC-style evaluation with:
|
| 38 |
+
- Official pytrec_eval (if available)
|
| 39 |
+
- Fallback pure-Python implementations
|
| 40 |
+
"""
|
| 41 |
+
|
| 42 |
+
def __init__(self):
|
| 43 |
+
"""Initialize the metrics calculator."""
|
| 44 |
+
self.use_pytrec = HAS_PYTREC_EVAL
|
| 45 |
+
|
| 46 |
+
# --- Built-in Metric Implementations ---
|
| 47 |
+
|
| 48 |
+
@staticmethod
|
| 49 |
+
def precision_at_k(retrieved: List[str], relevant: set, k: int) -> float:
|
| 50 |
+
"""
|
| 51 |
+
Calculate Precision@K.
|
| 52 |
+
|
| 53 |
+
P@K = |relevant ∩ retrieved[:k]| / k
|
| 54 |
+
"""
|
| 55 |
+
if k <= 0:
|
| 56 |
+
return 0.0
|
| 57 |
+
retrieved_k = retrieved[:k]
|
| 58 |
+
relevant_retrieved = len([d for d in retrieved_k if d in relevant])
|
| 59 |
+
return relevant_retrieved / k
|
| 60 |
+
|
| 61 |
+
@staticmethod
|
| 62 |
+
def recall_at_k(retrieved: List[str], relevant: set, k: int) -> float:
|
| 63 |
+
"""
|
| 64 |
+
Calculate Recall@K.
|
| 65 |
+
|
| 66 |
+
R@K = |relevant ∩ retrieved[:k]| / |relevant|
|
| 67 |
+
"""
|
| 68 |
+
if not relevant:
|
| 69 |
+
return 0.0
|
| 70 |
+
retrieved_k = retrieved[:k]
|
| 71 |
+
relevant_retrieved = len([d for d in retrieved_k if d in relevant])
|
| 72 |
+
return relevant_retrieved / len(relevant)
|
| 73 |
+
|
| 74 |
+
@staticmethod
|
| 75 |
+
def average_precision(retrieved: List[str], relevant: set) -> float:
|
| 76 |
+
"""
|
| 77 |
+
Calculate Average Precision for a single query.
|
| 78 |
+
|
| 79 |
+
AP = (1/|relevant|) × Σ (P@k × rel(k))
|
| 80 |
+
"""
|
| 81 |
+
if not relevant:
|
| 82 |
+
return 0.0
|
| 83 |
+
|
| 84 |
+
hits = 0
|
| 85 |
+
sum_precision = 0.0
|
| 86 |
+
|
| 87 |
+
for i, doc in enumerate(retrieved):
|
| 88 |
+
if doc in relevant:
|
| 89 |
+
hits += 1
|
| 90 |
+
sum_precision += hits / (i + 1)
|
| 91 |
+
|
| 92 |
+
return sum_precision / len(relevant)
|
| 93 |
+
|
| 94 |
+
@staticmethod
|
| 95 |
+
def dcg_at_k(retrieved: List[str], relevance: Dict[str, int], k: int) -> float:
|
| 96 |
+
"""
|
| 97 |
+
Calculate DCG@K (Discounted Cumulative Gain).
|
| 98 |
+
|
| 99 |
+
DCG@K = Σ (2^rel(i) - 1) / log2(i + 2)
|
| 100 |
+
"""
|
| 101 |
+
dcg = 0.0
|
| 102 |
+
for i, doc in enumerate(retrieved[:k]):
|
| 103 |
+
rel = relevance.get(doc, 0)
|
| 104 |
+
dcg += (2 ** rel - 1) / math.log2(i + 2)
|
| 105 |
+
return dcg
|
| 106 |
+
|
| 107 |
+
@staticmethod
|
| 108 |
+
def ndcg_at_k(retrieved: List[str], relevance: Dict[str, int], k: int) -> float:
|
| 109 |
+
"""
|
| 110 |
+
Calculate NDCG@K (Normalized DCG).
|
| 111 |
+
|
| 112 |
+
NDCG@K = DCG@K / IDCG@K
|
| 113 |
+
"""
|
| 114 |
+
dcg = EvaluationMetrics.dcg_at_k(retrieved, relevance, k)
|
| 115 |
+
|
| 116 |
+
# Calculate IDCG (ideal DCG)
|
| 117 |
+
sorted_rels = sorted(relevance.values(), reverse=True)[:k]
|
| 118 |
+
idcg = 0.0
|
| 119 |
+
for i, rel in enumerate(sorted_rels):
|
| 120 |
+
idcg += (2 ** rel - 1) / math.log2(i + 2)
|
| 121 |
+
|
| 122 |
+
return dcg / idcg if idcg > 0 else 0.0
|
| 123 |
+
|
| 124 |
+
@staticmethod
|
| 125 |
+
def reciprocal_rank(retrieved: List[str], relevant: set) -> float:
|
| 126 |
+
"""
|
| 127 |
+
Calculate Reciprocal Rank.
|
| 128 |
+
|
| 129 |
+
RR = 1 / rank of first relevant document
|
| 130 |
+
"""
|
| 131 |
+
for i, doc in enumerate(retrieved):
|
| 132 |
+
if doc in relevant:
|
| 133 |
+
return 1.0 / (i + 1)
|
| 134 |
+
return 0.0
|
| 135 |
+
|
| 136 |
+
# --- TREC-Style Evaluation ---
|
| 137 |
+
|
| 138 |
+
def evaluate_run(
|
| 139 |
+
self,
|
| 140 |
+
run: Dict[str, List[Tuple[str, float]]],
|
| 141 |
+
qrels: Dict[str, Dict[str, int]],
|
| 142 |
+
metrics: List[str] = None
|
| 143 |
+
) -> Dict[str, Dict[str, float]]:
|
| 144 |
+
"""
|
| 145 |
+
Evaluate a run against qrels (relevance judgments).
|
| 146 |
+
|
| 147 |
+
Args:
|
| 148 |
+
run: {query_id: [(doc_id, score), ...]}
|
| 149 |
+
qrels: {query_id: {doc_id: relevance}}
|
| 150 |
+
metrics: List of metrics to compute
|
| 151 |
+
['map', 'ndcg', 'P_5', 'P_10', 'recall_100']
|
| 152 |
+
|
| 153 |
+
Returns:
|
| 154 |
+
{query_id: {metric: value}}
|
| 155 |
+
"""
|
| 156 |
+
if metrics is None:
|
| 157 |
+
metrics = ['map', 'ndcg', 'P_5', 'P_10', 'P_20', 'recall_100', 'recip_rank']
|
| 158 |
+
|
| 159 |
+
if self.use_pytrec and HAS_PYTREC_EVAL:
|
| 160 |
+
return self._evaluate_pytrec(run, qrels, metrics)
|
| 161 |
+
else:
|
| 162 |
+
return self._evaluate_builtin(run, qrels, metrics)
|
| 163 |
+
|
| 164 |
+
def _evaluate_pytrec(
|
| 165 |
+
self,
|
| 166 |
+
run: Dict[str, List[Tuple[str, float]]],
|
| 167 |
+
qrels: Dict[str, Dict[str, int]],
|
| 168 |
+
metrics: List[str]
|
| 169 |
+
) -> Dict[str, Dict[str, float]]:
|
| 170 |
+
"""Evaluate using pytrec_eval."""
|
| 171 |
+
# Convert run format for pytrec_eval
|
| 172 |
+
pytrec_run = {}
|
| 173 |
+
for qid, docs in run.items():
|
| 174 |
+
pytrec_run[qid] = {doc_id: score for doc_id, score in docs}
|
| 175 |
+
|
| 176 |
+
# Create evaluator
|
| 177 |
+
evaluator = pytrec_eval.RelevanceEvaluator(qrels, set(metrics))
|
| 178 |
+
|
| 179 |
+
# Evaluate
|
| 180 |
+
results = evaluator.evaluate(pytrec_run)
|
| 181 |
+
|
| 182 |
+
return results
|
| 183 |
+
|
| 184 |
+
def _evaluate_builtin(
|
| 185 |
+
self,
|
| 186 |
+
run: Dict[str, List[Tuple[str, float]]],
|
| 187 |
+
qrels: Dict[str, Dict[str, int]],
|
| 188 |
+
metrics: List[str]
|
| 189 |
+
) -> Dict[str, Dict[str, float]]:
|
| 190 |
+
"""Evaluate using built-in implementations."""
|
| 191 |
+
results = {}
|
| 192 |
+
|
| 193 |
+
for qid, docs_scores in run.items():
|
| 194 |
+
if qid not in qrels:
|
| 195 |
+
continue
|
| 196 |
+
|
| 197 |
+
q_results = {}
|
| 198 |
+
retrieved = [doc_id for doc_id, _ in docs_scores]
|
| 199 |
+
relevance = qrels[qid]
|
| 200 |
+
relevant = set(doc_id for doc_id, rel in relevance.items() if rel > 0)
|
| 201 |
+
|
| 202 |
+
for metric in metrics:
|
| 203 |
+
if metric == 'map':
|
| 204 |
+
q_results['map'] = self.average_precision(retrieved, relevant)
|
| 205 |
+
elif metric == 'ndcg':
|
| 206 |
+
q_results['ndcg'] = self.ndcg_at_k(retrieved, relevance, 1000)
|
| 207 |
+
elif metric.startswith('ndcg_cut_'):
|
| 208 |
+
k = int(metric.split('_')[-1])
|
| 209 |
+
q_results[metric] = self.ndcg_at_k(retrieved, relevance, k)
|
| 210 |
+
elif metric.startswith('P_'):
|
| 211 |
+
k = int(metric.split('_')[-1])
|
| 212 |
+
q_results[metric] = self.precision_at_k(retrieved, relevant, k)
|
| 213 |
+
elif metric.startswith('recall_'):
|
| 214 |
+
k = int(metric.split('_')[-1])
|
| 215 |
+
q_results[metric] = self.recall_at_k(retrieved, relevant, k)
|
| 216 |
+
elif metric == 'recip_rank':
|
| 217 |
+
q_results['recip_rank'] = self.reciprocal_rank(retrieved, relevant)
|
| 218 |
+
|
| 219 |
+
results[qid] = q_results
|
| 220 |
+
|
| 221 |
+
return results
|
| 222 |
+
|
| 223 |
+
def compute_aggregate(
|
| 224 |
+
self,
|
| 225 |
+
results: Dict[str, Dict[str, float]]
|
| 226 |
+
) -> Dict[str, float]:
|
| 227 |
+
"""
|
| 228 |
+
Compute aggregate metrics across all queries.
|
| 229 |
+
|
| 230 |
+
Returns mean values for each metric.
|
| 231 |
+
"""
|
| 232 |
+
if not results:
|
| 233 |
+
return {}
|
| 234 |
+
|
| 235 |
+
aggregated = defaultdict(list)
|
| 236 |
+
for qid, metrics in results.items():
|
| 237 |
+
for metric, value in metrics.items():
|
| 238 |
+
aggregated[metric].append(value)
|
| 239 |
+
|
| 240 |
+
return {metric: sum(values) / len(values)
|
| 241 |
+
for metric, values in aggregated.items()}
|
| 242 |
+
|
| 243 |
+
def format_results(
|
| 244 |
+
self,
|
| 245 |
+
results: Dict[str, Dict[str, float]],
|
| 246 |
+
include_per_query: bool = False
|
| 247 |
+
) -> str:
|
| 248 |
+
"""Format results as a readable string."""
|
| 249 |
+
lines = []
|
| 250 |
+
|
| 251 |
+
# Aggregate
|
| 252 |
+
agg = self.compute_aggregate(results)
|
| 253 |
+
lines.append("=" * 50)
|
| 254 |
+
lines.append("AGGREGATE METRICS")
|
| 255 |
+
lines.append("=" * 50)
|
| 256 |
+
for metric, value in sorted(agg.items()):
|
| 257 |
+
lines.append(f" {metric:20s}: {value:.4f}")
|
| 258 |
+
|
| 259 |
+
# Per-query (optional)
|
| 260 |
+
if include_per_query:
|
| 261 |
+
lines.append("")
|
| 262 |
+
lines.append("=" * 50)
|
| 263 |
+
lines.append("PER-QUERY METRICS")
|
| 264 |
+
lines.append("=" * 50)
|
| 265 |
+
for qid in sorted(results.keys()):
|
| 266 |
+
lines.append(f"\nQuery {qid}:")
|
| 267 |
+
for metric, value in sorted(results[qid].items()):
|
| 268 |
+
lines.append(f" {metric:20s}: {value:.4f}")
|
| 269 |
+
|
| 270 |
+
return '\n'.join(lines)
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
def parse_qrels_file(filepath: str) -> Dict[str, Dict[str, int]]:
|
| 274 |
+
"""
|
| 275 |
+
Parse a TREC qrels file.
|
| 276 |
+
|
| 277 |
+
Format: query_id 0 doc_id relevance
|
| 278 |
+
"""
|
| 279 |
+
qrels = defaultdict(dict)
|
| 280 |
+
with open(filepath, 'r') as f:
|
| 281 |
+
for line in f:
|
| 282 |
+
parts = line.strip().split()
|
| 283 |
+
if len(parts) >= 4:
|
| 284 |
+
qid, _, docid, rel = parts[:4]
|
| 285 |
+
qrels[qid][docid] = int(rel)
|
| 286 |
+
return dict(qrels)
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
def parse_run_file(filepath: str) -> Dict[str, List[Tuple[str, float]]]:
|
| 290 |
+
"""
|
| 291 |
+
Parse a TREC run file.
|
| 292 |
+
|
| 293 |
+
Format: query_id Q0 doc_id rank score run_tag
|
| 294 |
+
"""
|
| 295 |
+
run = defaultdict(list)
|
| 296 |
+
with open(filepath, 'r') as f:
|
| 297 |
+
for line in f:
|
| 298 |
+
parts = line.strip().split()
|
| 299 |
+
if len(parts) >= 5:
|
| 300 |
+
qid, _, docid, rank, score = parts[:5]
|
| 301 |
+
run[qid].append((docid, float(score)))
|
| 302 |
+
|
| 303 |
+
# Sort by score descending
|
| 304 |
+
for qid in run:
|
| 305 |
+
run[qid].sort(key=lambda x: x[1], reverse=True)
|
| 306 |
+
|
| 307 |
+
return dict(run)
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
# --- Testing ---
|
| 311 |
+
if __name__ == "__main__":
|
| 312 |
+
print("=" * 60)
|
| 313 |
+
print("SysCRED Evaluation Metrics - Tests")
|
| 314 |
+
print("=" * 60)
|
| 315 |
+
|
| 316 |
+
metrics = EvaluationMetrics()
|
| 317 |
+
print(f"\nUsing pytrec_eval: {metrics.use_pytrec}")
|
| 318 |
+
|
| 319 |
+
# Test data
|
| 320 |
+
retrieved = ['doc1', 'doc2', 'doc3', 'doc4', 'doc5', 'doc6', 'doc7', 'doc8', 'doc9', 'doc10']
|
| 321 |
+
relevant = {'doc1', 'doc3', 'doc5', 'doc8'}
|
| 322 |
+
relevance = {'doc1': 2, 'doc3': 1, 'doc5': 2, 'doc8': 1}
|
| 323 |
+
|
| 324 |
+
print("\n--- Built-in Metrics Tests ---")
|
| 325 |
+
print(f"P@5: {metrics.precision_at_k(retrieved, relevant, 5):.4f}")
|
| 326 |
+
print(f"P@10: {metrics.precision_at_k(retrieved, relevant, 10):.4f}")
|
| 327 |
+
print(f"R@5: {metrics.recall_at_k(retrieved, relevant, 5):.4f}")
|
| 328 |
+
print(f"R@10: {metrics.recall_at_k(retrieved, relevant, 10):.4f}")
|
| 329 |
+
print(f"AP: {metrics.average_precision(retrieved, relevant):.4f}")
|
| 330 |
+
print(f"NDCG@10: {metrics.ndcg_at_k(retrieved, relevance, 10):.4f}")
|
| 331 |
+
print(f"RR: {metrics.reciprocal_rank(retrieved, relevant):.4f}")
|
| 332 |
+
|
| 333 |
+
# Test run evaluation
|
| 334 |
+
print("\n--- Run Evaluation Test ---")
|
| 335 |
+
run = {
|
| 336 |
+
'Q1': [(doc, 10-i) for i, doc in enumerate(retrieved)],
|
| 337 |
+
'Q2': [('doc2', 10), ('doc1', 9), ('doc4', 8), ('doc3', 7)]
|
| 338 |
+
}
|
| 339 |
+
qrels = {
|
| 340 |
+
'Q1': relevance,
|
| 341 |
+
'Q2': {'doc1': 1, 'doc3': 2}
|
| 342 |
+
}
|
| 343 |
+
|
| 344 |
+
results = metrics.evaluate_run(run, qrels)
|
| 345 |
+
print(metrics.format_results(results))
|
| 346 |
+
|
| 347 |
+
print("\n" + "=" * 60)
|
| 348 |
+
print("Tests complete!")
|
| 349 |
+
print("=" * 60)
|
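For reference, a minimal usage sketch of the evaluation module above, assuming the package is importable as `syscred.eval_metrics`; the qrels and run file names below are placeholder paths, not files from this commit:

```python
# Minimal sketch: evaluate a TREC-style run against relevance judgments.
from syscred.eval_metrics import EvaluationMetrics, parse_qrels_file, parse_run_file

metrics = EvaluationMetrics()

# qrels: {query_id: {doc_id: relevance}}, run: {query_id: [(doc_id, score), ...]}
qrels = parse_qrels_file("qrels.example.txt")   # placeholder path
run = parse_run_file("run.bm25.example.txt")    # placeholder path

# Per-query metrics, then the mean over all queries (MAP, NDCG, P@10, MRR).
per_query = metrics.evaluate_run(run, qrels, metrics=['map', 'ndcg', 'P_10', 'recip_rank'])
print(metrics.format_results(per_query, include_per_query=False))
print(metrics.compute_aggregate(per_query))
```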
syscred/graph_rag.py
ADDED
|
@@ -0,0 +1,171 @@
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
GraphRAG Module - SysCRED
|
| 4 |
+
=========================
|
| 5 |
+
Retrieves context from the Knowledge Graph to enhance verification.
|
| 6 |
+
Transforms "Passive" Graph into "Active" Context.
|
| 7 |
+
|
| 8 |
+
(c) Dominique S. Loyer - PhD Thesis Prototype
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from typing import List, Dict, Any, Optional
|
| 12 |
+
from syscred.ontology_manager import OntologyManager
|
| 13 |
+
|
| 14 |
+
class GraphRAG:
|
| 15 |
+
"""
|
| 16 |
+
Retrieval Augmented Generation using the Semantic Knowledge Graph.
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
def __init__(self, ontology_manager: OntologyManager):
|
| 20 |
+
self.om = ontology_manager
|
| 21 |
+
|
| 22 |
+
def get_context(self, domain: str, keywords: Optional[List[str]] = None) -> Dict[str, Any]:
|
| 23 |
+
"""
|
| 24 |
+
Retrieve context for a specific verification task.
|
| 25 |
+
|
| 26 |
+
Args:
|
| 27 |
+
domain: The domain being analyzed (e.g., 'lemonde.fr')
|
| 28 |
+
keywords: List of keywords from the claim, used to retrieve similar past claims
|
| 29 |
+
|
| 30 |
+
Returns:
|
| 31 |
+
Dictionary with natural language context strings.
|
| 32 |
+
"""
|
| 33 |
+
if not self.om:
|
| 34 |
+
return {"graph_context": "No ontology manager available."}
|
| 35 |
+
|
| 36 |
+
context_parts = []
|
| 37 |
+
|
| 38 |
+
# 1. Source History
|
| 39 |
+
source_history = self._get_source_history(domain)
|
| 40 |
+
if source_history:
|
| 41 |
+
context_parts.append(source_history)
|
| 42 |
+
|
| 43 |
+
# 2. Pattern Matching (Similar Claims)
|
| 44 |
+
similar_uris = []
|
| 45 |
+
if keywords:
|
| 46 |
+
similar_result = self._find_similar_claims(keywords)
|
| 47 |
+
if similar_result["text"]:
|
| 48 |
+
context_parts.append(similar_result["text"])
|
| 49 |
+
similar_uris = similar_result["uris"]
|
| 50 |
+
|
| 51 |
+
full_context = "\n\n".join(context_parts) if context_parts else "No prior knowledge found in the graph."
|
| 52 |
+
|
| 53 |
+
return {
|
| 54 |
+
"full_text": full_context,
|
| 55 |
+
"source_history": source_history,
|
| 56 |
+
"similar_uris": similar_uris # [NEW] Return URIs for linking
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
def _get_source_history(self, domain: str) -> str:
|
| 60 |
+
"""
|
| 61 |
+
Query the graph for all previous evaluations of this domain.
|
| 62 |
+
"""
|
| 63 |
+
if not domain:
|
| 64 |
+
return ""
|
| 65 |
+
|
| 66 |
+
# We reuse the specific query logic but tailored for retrieval
|
| 67 |
+
query = """
|
| 68 |
+
PREFIX cred: <https://github.com/DominiqueLoyer/systemFactChecking#>
|
| 69 |
+
|
| 70 |
+
SELECT ?score ?level ?timestamp
|
| 71 |
+
WHERE {
|
| 72 |
+
?info cred:informationURL ?url .
|
| 73 |
+
?request cred:concernsInformation ?info .
|
| 74 |
+
?report cred:isReportOf ?request .
|
| 75 |
+
?report cred:credibilityScoreValue ?score .
|
| 76 |
+
?report cred:assignsCredibilityLevel ?level .
|
| 77 |
+
?report cred:completionTimestamp ?timestamp .
|
| 78 |
+
FILTER(CONTAINS(STR(?url), "%s"))
|
| 79 |
+
}
|
| 80 |
+
ORDER BY DESC(?timestamp)
|
| 81 |
+
LIMIT 5
|
| 82 |
+
""" % domain
|
| 83 |
+
|
| 84 |
+
results = []
|
| 85 |
+
try:
|
| 86 |
+
combined = self.om.base_graph + self.om.data_graph
|
| 87 |
+
for row in combined.query(query):
|
| 88 |
+
results.append({
|
| 89 |
+
"score": float(row.score),
|
| 90 |
+
"level": str(row.level).split('#')[-1],
|
| 91 |
+
"date": str(row.timestamp).split('T')[0]
|
| 92 |
+
})
|
| 93 |
+
except Exception as e:
|
| 94 |
+
print(f"[GraphRAG] Query error: {e}")
|
| 95 |
+
return ""
|
| 96 |
+
|
| 97 |
+
if not results:
|
| 98 |
+
return f"The graph contains no previous evaluations for {domain}."
|
| 99 |
+
|
| 100 |
+
# Summarize
|
| 101 |
+
count = len(results)
|
| 102 |
+
avg_score = sum(r['score'] for r in results) / count
|
| 103 |
+
last_verdict = results[0]['level']
|
| 104 |
+
|
| 105 |
+
summary = (
|
| 106 |
+
f"Graph Memory for '{domain}':\n"
|
| 107 |
+
f"- Analyzed {count} times previously.\n"
|
| 108 |
+
f"- Average Credibility Score: {avg_score:.2f} / 1.0\n"
|
| 109 |
+
f"- Most recent verdict ({results[0]['date']}): {last_verdict}.\n"
|
| 110 |
+
)
|
| 111 |
+
|
| 112 |
+
return summary
|
| 113 |
+
|
| 114 |
+
def _find_similar_claims(self, keywords: List[str]) -> Dict[str, Any]:
|
| 115 |
+
"""
|
| 116 |
+
Find evaluation history for content containing specific keywords.
|
| 117 |
+
Returns dict with 'text' (for LLM) and 'uris' (for Graph linking).
|
| 118 |
+
"""
|
| 119 |
+
if not keywords:
|
| 120 |
+
return {"text": "", "uris": []}
|
| 121 |
+
|
| 122 |
+
# Build REGEX filter for keywords (OR logic)
|
| 123 |
+
# e.g., (fake|hoax|conspiracy)
|
| 124 |
+
clean_kws = [k for k in keywords if len(k) > 3] # Skip short words
|
| 125 |
+
if not clean_kws:
|
| 126 |
+
return {"text": "", "uris": []}
|
| 127 |
+
|
| 128 |
+
regex_pattern = "|".join(clean_kws)
|
| 129 |
+
|
| 130 |
+
query = """
|
| 131 |
+
PREFIX cred: <https://github.com/DominiqueLoyer/systemFactChecking#>
|
| 132 |
+
|
| 133 |
+
SELECT ?report ?content ?score ?level ?timestamp
|
| 134 |
+
WHERE {
|
| 135 |
+
?info cred:informationContent ?content .
|
| 136 |
+
?request cred:concernsInformation ?info .
|
| 137 |
+
?report cred:isReportOf ?request .
|
| 138 |
+
?report cred:credibilityScoreValue ?score .
|
| 139 |
+
?report cred:assignsCredibilityLevel ?level .
|
| 140 |
+
?report cred:completionTimestamp ?timestamp .
|
| 141 |
+
FILTER(REGEX(?content, "%s", "i"))
|
| 142 |
+
}
|
| 143 |
+
ORDER BY DESC(?timestamp)
|
| 144 |
+
LIMIT 3
|
| 145 |
+
""" % regex_pattern
|
| 146 |
+
|
| 147 |
+
results = []
|
| 148 |
+
try:
|
| 149 |
+
combined = self.om.base_graph + self.om.data_graph
|
| 150 |
+
for row in combined.query(query):
|
| 151 |
+
results.append({
|
| 152 |
+
"uri": str(row.report),
|
| 153 |
+
"content": str(row.content)[:100] + "...",
|
| 154 |
+
"score": float(row.score),
|
| 155 |
+
"verdict": str(row.level).split('#')[-1]
|
| 156 |
+
})
|
| 157 |
+
except Exception as e:
|
| 158 |
+
print(f"[GraphRAG] Similar claims error: {e}")
|
| 159 |
+
return {"text": "", "uris": []}
|
| 160 |
+
|
| 161 |
+
if not results:
|
| 162 |
+
return {"text": "", "uris": []}
|
| 163 |
+
|
| 164 |
+
lines = [f"Found {len(results)} similar claims in history:"]
|
| 165 |
+
for r in results:
|
| 166 |
+
lines.append(f"- \"{r['content']}\" ({r['verdict']}, Score: {r['score']:.2f})")
|
| 167 |
+
|
| 168 |
+
return {
|
| 169 |
+
"text": "\n".join(lines),
|
| 170 |
+
"uris": [r['uri'] for r in results]
|
| 171 |
+
}
|
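For illustration, a short sketch of how GraphRAG is wired on top of the OntologyManager defined later in this commit; the ontology file paths and keywords are placeholders:

```python
# Sketch: retrieve graph memory for a domain before running verification.
from syscred.ontology_manager import OntologyManager
from syscred.graph_rag import GraphRAG

om = OntologyManager(base_ontology_path="sysCRED_onto.ttl",  # placeholder path
                     data_path="sysCRED_data.ttl")           # placeholder path
rag = GraphRAG(om)

context = rag.get_context(domain="lemonde.fr", keywords=["vaccine", "hoax"])
print(context["full_text"])      # natural-language summary injected into the verifier
print(context["similar_uris"])   # report URIs later linked back via rdfs:seeAlso
```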
syscred/ir_engine.py
ADDED
|
@@ -0,0 +1,410 @@
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
IR Engine Module - SysCRED
|
| 4 |
+
===========================
|
| 5 |
+
Information Retrieval engine extracted from TREC AP88-90 project.
|
| 6 |
+
|
| 7 |
+
Features:
|
| 8 |
+
- TF-IDF calculation (custom and via Pyserini)
|
| 9 |
+
- BM25 scoring (via Lucene/Pyserini)
|
| 10 |
+
- Query Likelihood Dirichlet (QLD)
|
| 11 |
+
- Pseudo-Relevance Feedback (PRF)
|
| 12 |
+
- Porter Stemming integration
|
| 13 |
+
|
| 14 |
+
Based on: TREC_AP88-90_5juin2025.py
|
| 15 |
+
(c) Dominique S. Loyer - PhD Thesis Prototype
|
| 16 |
+
Citation Key: loyerEvaluationModelesRecherche2025
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import re
|
| 20 |
+
import math
|
| 21 |
+
from typing import Dict, List, Tuple, Optional, Any
|
| 22 |
+
from dataclasses import dataclass
|
| 23 |
+
from collections import Counter
|
| 24 |
+
|
| 25 |
+
# Check for optional dependencies
|
| 26 |
+
try:
|
| 27 |
+
import nltk
|
| 28 |
+
from nltk.corpus import stopwords
|
| 29 |
+
from nltk.stem import PorterStemmer
|
| 30 |
+
from nltk.tokenize import word_tokenize
|
| 31 |
+
HAS_NLTK = True
|
| 32 |
+
except ImportError:
|
| 33 |
+
HAS_NLTK = False
|
| 34 |
+
|
| 35 |
+
try:
|
| 36 |
+
from pyserini.search.lucene import LuceneSearcher
|
| 37 |
+
HAS_PYSERINI = True
|
| 38 |
+
except ImportError:
|
| 39 |
+
HAS_PYSERINI = False
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# --- Data Classes ---
|
| 43 |
+
|
| 44 |
+
@dataclass
|
| 45 |
+
class SearchResult:
|
| 46 |
+
"""A single search result."""
|
| 47 |
+
doc_id: str
|
| 48 |
+
score: float
|
| 49 |
+
rank: int
|
| 50 |
+
snippet: Optional[str] = None
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@dataclass
|
| 54 |
+
class SearchResponse:
|
| 55 |
+
"""Complete search response."""
|
| 56 |
+
query_id: str
|
| 57 |
+
query_text: str
|
| 58 |
+
results: List[SearchResult]
|
| 59 |
+
model: str # 'bm25', 'qld', 'tfidf'
|
| 60 |
+
total_hits: int
|
| 61 |
+
search_time_ms: float
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class IREngine:
|
| 65 |
+
"""
|
| 66 |
+
Information Retrieval engine with multiple scoring methods.
|
| 67 |
+
|
| 68 |
+
Supports:
|
| 69 |
+
- Built-in TF-IDF/BM25 (no dependencies)
|
| 70 |
+
- Pyserini/Lucene BM25 and QLD (if pyserini installed)
|
| 71 |
+
- Query expansion with Pseudo-Relevance Feedback
|
| 72 |
+
"""
|
| 73 |
+
|
| 74 |
+
# BM25 default parameters
|
| 75 |
+
BM25_K1 = 0.9
|
| 76 |
+
BM25_B = 0.4
|
| 77 |
+
|
| 78 |
+
def __init__(self, index_path: str = None, use_stemming: bool = True):
|
| 79 |
+
"""
|
| 80 |
+
Initialize the IR engine.
|
| 81 |
+
|
| 82 |
+
Args:
|
| 83 |
+
index_path: Path to Lucene/Pyserini index (optional)
|
| 84 |
+
use_stemming: Whether to apply Porter stemming
|
| 85 |
+
"""
|
| 86 |
+
self.index_path = index_path
|
| 87 |
+
self.use_stemming = use_stemming
|
| 88 |
+
self.searcher = None
|
| 89 |
+
|
| 90 |
+
# Initialize NLTK components
|
| 91 |
+
if HAS_NLTK:
|
| 92 |
+
try:
|
| 93 |
+
self.stopwords = set(stopwords.words('english'))
|
| 94 |
+
self.stemmer = PorterStemmer() if use_stemming else None
|
| 95 |
+
except LookupError:
|
| 96 |
+
print("[IREngine] Downloading NLTK resources...")
|
| 97 |
+
nltk.download('stopwords', quiet=True)
|
| 98 |
+
nltk.download('punkt', quiet=True)
|
| 99 |
+
nltk.download('punkt_tab', quiet=True)
|
| 100 |
+
self.stopwords = set(stopwords.words('english'))
|
| 101 |
+
self.stemmer = PorterStemmer() if use_stemming else None
|
| 102 |
+
else:
|
| 103 |
+
# Fallback stopwords
|
| 104 |
+
self.stopwords = {
|
| 105 |
+
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
|
| 106 |
+
'for', 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are',
|
| 107 |
+
'were', 'been', 'be', 'have', 'has', 'had', 'do', 'does',
|
| 108 |
+
'did', 'will', 'would', 'could', 'should', 'may', 'might',
|
| 109 |
+
'must', 'shall', 'can', 'need', 'this', 'that', 'these',
|
| 110 |
+
'those', 'it', 'its', 'they', 'them', 'he', 'she', 'him',
|
| 111 |
+
'her', 'his', 'we', 'you', 'i', 'my', 'your', 'our', 'their'
|
| 112 |
+
}
|
| 113 |
+
self.stemmer = None
|
| 114 |
+
|
| 115 |
+
# Initialize Pyserini searcher if available
|
| 116 |
+
if HAS_PYSERINI and index_path:
|
| 117 |
+
try:
|
| 118 |
+
self.searcher = LuceneSearcher(index_path)
|
| 119 |
+
print(f"[IREngine] Pyserini searcher initialized with index: {index_path}")
|
| 120 |
+
except Exception as e:
|
| 121 |
+
print(f"[IREngine] Failed to initialize Pyserini: {e}")
|
| 122 |
+
|
| 123 |
+
def preprocess(self, text: str) -> str:
|
| 124 |
+
"""
|
| 125 |
+
Preprocess text with tokenization, stopword removal, and optional stemming.
|
| 126 |
+
|
| 127 |
+
This matches the TREC preprocessing pipeline.
|
| 128 |
+
"""
|
| 129 |
+
if not isinstance(text, str):
|
| 130 |
+
return ""
|
| 131 |
+
|
| 132 |
+
text = text.lower()
|
| 133 |
+
|
| 134 |
+
if HAS_NLTK:
|
| 135 |
+
try:
|
| 136 |
+
tokens = word_tokenize(text)
|
| 137 |
+
except LookupError:
|
| 138 |
+
# Fallback tokenization
|
| 139 |
+
tokens = re.findall(r'\b[a-z]+\b', text)
|
| 140 |
+
else:
|
| 141 |
+
tokens = re.findall(r'\b[a-z]+\b', text)
|
| 142 |
+
|
| 143 |
+
# Filter stopwords and non-alpha
|
| 144 |
+
filtered = [t for t in tokens if t.isalpha() and t not in self.stopwords]
|
| 145 |
+
|
| 146 |
+
# Apply stemming if enabled
|
| 147 |
+
if self.stemmer:
|
| 148 |
+
filtered = [self.stemmer.stem(t) for t in filtered]
|
| 149 |
+
|
| 150 |
+
return ' '.join(filtered)
|
| 151 |
+
|
| 152 |
+
def calculate_tf(self, tokens: List[str]) -> Dict[str, float]:
|
| 153 |
+
"""Calculate term frequency."""
|
| 154 |
+
if not tokens:
|
| 155 |
+
return {}
|
| 156 |
+
counts = Counter(tokens)
|
| 157 |
+
total = len(tokens)
|
| 158 |
+
return {term: count / total for term, count in counts.items()}
|
| 159 |
+
|
| 160 |
+
def calculate_bm25_score(
|
| 161 |
+
self,
|
| 162 |
+
query_terms: List[str],
|
| 163 |
+
doc_terms: List[str],
|
| 164 |
+
doc_length: int,
|
| 165 |
+
avg_doc_length: float,
|
| 166 |
+
doc_freq: Dict[str, int],
|
| 167 |
+
corpus_size: int
|
| 168 |
+
) -> float:
|
| 169 |
+
"""
|
| 170 |
+
Calculate BM25 score for a document.
|
| 171 |
+
|
| 172 |
+
BM25(D, Q) = Σ IDF(qi) × (f(qi,D) × (k1 + 1)) / (f(qi,D) + k1 × (1 - b + b × |D|/avgdl))
|
| 173 |
+
"""
|
| 174 |
+
doc_term_counts = Counter(doc_terms)
|
| 175 |
+
score = 0.0
|
| 176 |
+
|
| 177 |
+
for term in query_terms:
|
| 178 |
+
if term not in doc_term_counts:
|
| 179 |
+
continue
|
| 180 |
+
|
| 181 |
+
tf = doc_term_counts[term]
|
| 182 |
+
df = doc_freq.get(term, 1)
|
| 183 |
+
idf = math.log((corpus_size - df + 0.5) / (df + 0.5) + 1)
|
| 184 |
+
|
| 185 |
+
numerator = tf * (self.BM25_K1 + 1)
|
| 186 |
+
denominator = tf + self.BM25_K1 * (1 - self.BM25_B + self.BM25_B * doc_length / avg_doc_length)
|
| 187 |
+
|
| 188 |
+
score += idf * (numerator / denominator)
|
| 189 |
+
|
| 190 |
+
return score
|
| 191 |
+
|
| 192 |
+
def search_pyserini(
|
| 193 |
+
self,
|
| 194 |
+
query: str,
|
| 195 |
+
model: str = 'bm25',
|
| 196 |
+
k: int = 100,
|
| 197 |
+
query_id: str = "Q1"
|
| 198 |
+
) -> SearchResponse:
|
| 199 |
+
"""
|
| 200 |
+
Search using Pyserini/Lucene.
|
| 201 |
+
|
| 202 |
+
Args:
|
| 203 |
+
query: Query text
|
| 204 |
+
model: 'bm25' or 'qld'
|
| 205 |
+
k: Number of results
|
| 206 |
+
query_id: Query identifier
|
| 207 |
+
"""
|
| 208 |
+
import time
|
| 209 |
+
start = time.time()
|
| 210 |
+
|
| 211 |
+
if not self.searcher:
|
| 212 |
+
raise RuntimeError("Pyserini searcher not initialized. Provide index_path.")
|
| 213 |
+
|
| 214 |
+
# Configure similarity
|
| 215 |
+
if model == 'bm25':
|
| 216 |
+
self.searcher.set_bm25(k1=self.BM25_K1, b=self.BM25_B)
|
| 217 |
+
elif model == 'qld':
|
| 218 |
+
self.searcher.set_qld()
|
| 219 |
+
else:
|
| 220 |
+
self.searcher.set_bm25()
|
| 221 |
+
|
| 222 |
+
# Preprocess query
|
| 223 |
+
processed_query = self.preprocess(query)
|
| 224 |
+
|
| 225 |
+
# Search
|
| 226 |
+
hits = self.searcher.search(processed_query, k=k)
|
| 227 |
+
|
| 228 |
+
results = []
|
| 229 |
+
for i, hit in enumerate(hits):
|
| 230 |
+
results.append(SearchResult(
|
| 231 |
+
doc_id=hit.docid,
|
| 232 |
+
score=hit.score,
|
| 233 |
+
rank=i + 1
|
| 234 |
+
))
|
| 235 |
+
|
| 236 |
+
elapsed = (time.time() - start) * 1000
|
| 237 |
+
|
| 238 |
+
return SearchResponse(
|
| 239 |
+
query_id=query_id,
|
| 240 |
+
query_text=query,
|
| 241 |
+
results=results,
|
| 242 |
+
model=model,
|
| 243 |
+
total_hits=len(results),
|
| 244 |
+
search_time_ms=elapsed
|
| 245 |
+
)
|
| 246 |
+
|
| 247 |
+
def pseudo_relevance_feedback(
|
| 248 |
+
self,
|
| 249 |
+
query: str,
|
| 250 |
+
top_docs_texts: List[str],
|
| 251 |
+
num_expansion_terms: int = 10
|
| 252 |
+
) -> str:
|
| 253 |
+
"""
|
| 254 |
+
Expand query using Pseudo-Relevance Feedback (PRF).
|
| 255 |
+
|
| 256 |
+
Uses top-k retrieved documents to find expansion terms.
|
| 257 |
+
"""
|
| 258 |
+
query_tokens = self.preprocess(query).split()
|
| 259 |
+
|
| 260 |
+
# Collect terms from top documents
|
| 261 |
+
expansion_candidates = Counter()
|
| 262 |
+
for doc_text in top_docs_texts:
|
| 263 |
+
doc_tokens = self.preprocess(doc_text).split()
|
| 264 |
+
# Count terms not in original query
|
| 265 |
+
for token in doc_tokens:
|
| 266 |
+
if token not in query_tokens:
|
| 267 |
+
expansion_candidates[token] += 1
|
| 268 |
+
|
| 269 |
+
# Get top expansion terms
|
| 270 |
+
expansion_terms = [term for term, _ in expansion_candidates.most_common(num_expansion_terms)]
|
| 271 |
+
|
| 272 |
+
# Create expanded query
|
| 273 |
+
expanded_query = query + ' ' + ' '.join(expansion_terms)
|
| 274 |
+
|
| 275 |
+
return expanded_query
|
| 276 |
+
|
| 277 |
+
def format_trec_run(
|
| 278 |
+
self,
|
| 279 |
+
responses: List[SearchResponse],
|
| 280 |
+
run_tag: str
|
| 281 |
+
) -> str:
|
| 282 |
+
"""
|
| 283 |
+
Format results in TREC run file format.
|
| 284 |
+
|
| 285 |
+
Format: query_id Q0 doc_id rank score run_tag
|
| 286 |
+
"""
|
| 287 |
+
lines = []
|
| 288 |
+
for response in responses:
|
| 289 |
+
for result in response.results:
|
| 290 |
+
lines.append(
|
| 291 |
+
f"{response.query_id} Q0 {result.doc_id} "
|
| 292 |
+
f"{result.rank} {result.score:.6f} {run_tag}"
|
| 293 |
+
)
|
| 294 |
+
return '\n'.join(lines)
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
# --- Kaggle/Colab Utilities ---
|
| 298 |
+
|
| 299 |
+
def setup_kaggle_environment():
|
| 300 |
+
"""Setup environment for Kaggle notebooks."""
|
| 301 |
+
import subprocess
|
| 302 |
+
import sys
|
| 303 |
+
|
| 304 |
+
print("=" * 60)
|
| 305 |
+
print("SysCRED - Kaggle Environment Setup")
|
| 306 |
+
print("=" * 60)
|
| 307 |
+
|
| 308 |
+
# Check for GPU/TPU
|
| 309 |
+
import torch
|
| 310 |
+
if torch.cuda.is_available():
|
| 311 |
+
print(f"✓ GPU available: {torch.cuda.get_device_name(0)}")
|
| 312 |
+
else:
|
| 313 |
+
print("✗ No GPU detected")
|
| 314 |
+
|
| 315 |
+
# Install required packages
|
| 316 |
+
packages = [
|
| 317 |
+
'pyserini',
|
| 318 |
+
'transformers',
|
| 319 |
+
'pytrec_eval',
|
| 320 |
+
'nltk',
|
| 321 |
+
'rdflib'
|
| 322 |
+
]
|
| 323 |
+
|
| 324 |
+
print("\nInstalling packages...")
|
| 325 |
+
for pkg in packages:
|
| 326 |
+
try:
|
| 327 |
+
subprocess.run(
|
| 328 |
+
[sys.executable, '-m', 'pip', 'install', '-q', pkg],
|
| 329 |
+
check=True,
|
| 330 |
+
capture_output=True
|
| 331 |
+
)
|
| 332 |
+
print(f" ✓ {pkg}")
|
| 333 |
+
except Exception:
|
| 334 |
+
print(f" ✗ {pkg} - install failed")
|
| 335 |
+
|
| 336 |
+
# Download NLTK resources
|
| 337 |
+
import nltk
|
| 338 |
+
for resource in ['stopwords', 'punkt', 'punkt_tab', 'wordnet']:
|
| 339 |
+
try:
|
| 340 |
+
nltk.download(resource, quiet=True)
|
| 341 |
+
except Exception:
|
| 342 |
+
pass
|
| 343 |
+
|
| 344 |
+
print("\n✓ Environment setup complete")
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
def load_kaggle_dataset(dataset_path: str) -> str:
|
| 348 |
+
"""
|
| 349 |
+
Load a Kaggle dataset.
|
| 350 |
+
|
| 351 |
+
Args:
|
| 352 |
+
dataset_path: Path like '/kaggle/input/trec-ap88-90'
|
| 353 |
+
"""
|
| 354 |
+
import os
|
| 355 |
+
|
| 356 |
+
if os.path.exists(dataset_path):
|
| 357 |
+
print(f"✓ Dataset found: {dataset_path}")
|
| 358 |
+
return dataset_path
|
| 359 |
+
else:
|
| 360 |
+
print(f"✗ Dataset not found: {dataset_path}")
|
| 361 |
+
print("Make sure to add the dataset to your Kaggle notebook.")
|
| 362 |
+
return None
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
# --- Testing ---
|
| 366 |
+
if __name__ == "__main__":
|
| 367 |
+
print("=" * 60)
|
| 368 |
+
print("SysCRED IR Engine - Tests")
|
| 369 |
+
print("=" * 60)
|
| 370 |
+
|
| 371 |
+
engine = IREngine(use_stemming=True)
|
| 372 |
+
|
| 373 |
+
# Test preprocessing
|
| 374 |
+
print("\n1. Testing preprocessing...")
|
| 375 |
+
sample = "Information Retrieval systems help users find relevant documents."
|
| 376 |
+
processed = engine.preprocess(sample)
|
| 377 |
+
print(f" Original: {sample}")
|
| 378 |
+
print(f" Processed: {processed}")
|
| 379 |
+
|
| 380 |
+
# Test BM25
|
| 381 |
+
print("\n2. Testing BM25 calculation...")
|
| 382 |
+
query_terms = engine.preprocess("information retrieval").split()
|
| 383 |
+
doc_terms = engine.preprocess(sample).split()
|
| 384 |
+
|
| 385 |
+
score = engine.calculate_bm25_score(
|
| 386 |
+
query_terms=query_terms,
|
| 387 |
+
doc_terms=doc_terms,
|
| 388 |
+
doc_length=len(doc_terms),
|
| 389 |
+
avg_doc_length=10,
|
| 390 |
+
doc_freq={'inform': 5, 'retriev': 3},
|
| 391 |
+
corpus_size=100
|
| 392 |
+
)
|
| 393 |
+
print(f" BM25 Score: {score:.4f}")
|
| 394 |
+
|
| 395 |
+
# Test PRF
|
| 396 |
+
print("\n3. Testing Pseudo-Relevance Feedback...")
|
| 397 |
+
expanded = engine.pseudo_relevance_feedback(
|
| 398 |
+
query="information retrieval",
|
| 399 |
+
top_docs_texts=[
|
| 400 |
+
"Information retrieval is finding relevant documents in a collection.",
|
| 401 |
+
"Search engines use retrieval models like BM25 and TF-IDF.",
|
| 402 |
+
"Query expansion improves retrieval effectiveness."
|
| 403 |
+
]
|
| 404 |
+
)
|
| 405 |
+
print(f" Original query: information retrieval")
|
| 406 |
+
print(f" Expanded query: {expanded}")
|
| 407 |
+
|
| 408 |
+
print("\n" + "=" * 60)
|
| 409 |
+
print("Tests complete!")
|
| 410 |
+
print("=" * 60)
|
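A small sketch of the built-in (index-free) BM25 path of the IR engine above; the average document length and document frequencies passed in are illustrative assumptions, since no corpus statistics ship with this commit:

```python
# Sketch: score one document against a query with the built-in BM25 implementation.
from syscred.ir_engine import IREngine

engine = IREngine(use_stemming=True)

query_terms = engine.preprocess("information retrieval evaluation").split()
doc_terms = engine.preprocess("TREC evaluates information retrieval systems on AP88-90.").split()

score = engine.calculate_bm25_score(
    query_terms=query_terms,
    doc_terms=doc_terms,
    doc_length=len(doc_terms),
    avg_doc_length=12.0,                                   # illustrative corpus average
    doc_freq={'inform': 40, 'retriev': 25, 'evalu': 30},   # illustrative document frequencies
    corpus_size=1000,                                      # illustrative corpus size
)
print(f"BM25 score: {score:.4f}")
```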
syscred/ontology_manager.py
ADDED
|
@@ -0,0 +1,509 @@
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Ontology Manager Module - SysCRED
|
| 4 |
+
==================================
|
| 5 |
+
Manages the RDF ontology for the credibility verification system.
|
| 6 |
+
Handles reading, writing, and querying of semantic triplets.
|
| 7 |
+
|
| 8 |
+
(c) Dominique S. Loyer - PhD Thesis Prototype
|
| 9 |
+
Citation Key: loyerModelingHybridSystem2025
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from typing import Optional, List, Dict, Any
|
| 13 |
+
from datetime import datetime
|
| 14 |
+
from dataclasses import dataclass
|
| 15 |
+
import os
|
| 16 |
+
|
| 17 |
+
# RDFLib imports with fallback
|
| 18 |
+
try:
|
| 19 |
+
from rdflib import Graph, Namespace, Literal, URIRef, BNode
|
| 20 |
+
from rdflib.namespace import RDF, RDFS, OWL, XSD
|
| 21 |
+
HAS_RDFLIB = True
|
| 22 |
+
except ImportError:
|
| 23 |
+
HAS_RDFLIB = False
|
| 24 |
+
print("Warning: rdflib not installed. Run: pip install rdflib")
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@dataclass
|
| 28 |
+
class EvaluationRecord:
|
| 29 |
+
"""Represents a stored evaluation from the ontology."""
|
| 30 |
+
evaluation_id: str
|
| 31 |
+
url_or_text: str
|
| 32 |
+
score: float
|
| 33 |
+
level: str
|
| 34 |
+
timestamp: str
|
| 35 |
+
fact_checks: List[str]
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class OntologyManager:
|
| 39 |
+
"""
|
| 40 |
+
Manages the credibility ontology using RDFLib.
|
| 41 |
+
|
| 42 |
+
Handles:
|
| 43 |
+
- Loading base ontology
|
| 44 |
+
- Adding evaluation triplets
|
| 45 |
+
- Querying historical data
|
| 46 |
+
- Exporting enriched ontology
|
| 47 |
+
"""
|
| 48 |
+
|
| 49 |
+
# Namespace for the credibility ontology
|
| 50 |
+
CRED_NS = "https://github.com/DominiqueLoyer/systemFactChecking#"
|
| 51 |
+
|
| 52 |
+
def __init__(self, base_ontology_path: Optional[str] = None, data_path: Optional[str] = None):
|
| 53 |
+
"""
|
| 54 |
+
Initialize the ontology manager.
|
| 55 |
+
|
| 56 |
+
Args:
|
| 57 |
+
base_ontology_path: Path to the base ontology TTL file
|
| 58 |
+
data_path: Path to store/load accumulated data triplets
|
| 59 |
+
"""
|
| 60 |
+
if not HAS_RDFLIB:
|
| 61 |
+
raise ImportError("rdflib is required. Install with: pip install rdflib")
|
| 62 |
+
|
| 63 |
+
self.base_path = base_ontology_path
|
| 64 |
+
self.data_path = data_path
|
| 65 |
+
|
| 66 |
+
# Create namespace
|
| 67 |
+
self.cred = Namespace(self.CRED_NS)
|
| 68 |
+
|
| 69 |
+
# Initialize graphs
|
| 70 |
+
self.base_graph = Graph()
|
| 71 |
+
self.data_graph = Graph()
|
| 72 |
+
|
| 73 |
+
# Bind prefixes for nicer serialization
|
| 74 |
+
self._bind_prefixes(self.base_graph)
|
| 75 |
+
self._bind_prefixes(self.data_graph)
|
| 76 |
+
|
| 77 |
+
# Load ontology files if they exist
|
| 78 |
+
if base_ontology_path and os.path.exists(base_ontology_path):
|
| 79 |
+
self.load_base_ontology(base_ontology_path)
|
| 80 |
+
|
| 81 |
+
if data_path and os.path.exists(data_path):
|
| 82 |
+
self.load_data_graph(data_path)
|
| 83 |
+
|
| 84 |
+
# Counter for generating unique IDs
|
| 85 |
+
self._evaluation_counter = 0
|
| 86 |
+
|
| 87 |
+
def _bind_prefixes(self, graph: Graph):
|
| 88 |
+
"""Bind common prefixes to a graph."""
|
| 89 |
+
graph.bind("cred", self.cred)
|
| 90 |
+
graph.bind("owl", OWL)
|
| 91 |
+
graph.bind("rdf", RDF)
|
| 92 |
+
graph.bind("rdfs", RDFS)
|
| 93 |
+
graph.bind("xsd", XSD)
|
| 94 |
+
|
| 95 |
+
def load_base_ontology(self, path: str) -> bool:
|
| 96 |
+
"""Load the base ontology from a TTL file."""
|
| 97 |
+
try:
|
| 98 |
+
self.base_graph.parse(path, format='turtle')
|
| 99 |
+
print(f"[OntologyManager] Loaded base ontology: {len(self.base_graph)} triples")
|
| 100 |
+
return True
|
| 101 |
+
except Exception as e:
|
| 102 |
+
print(f"[OntologyManager] Error loading base ontology: {e}")
|
| 103 |
+
return False
|
| 104 |
+
|
| 105 |
+
def load_data_graph(self, path: str) -> bool:
|
| 106 |
+
"""Load accumulated data triplets."""
|
| 107 |
+
try:
|
| 108 |
+
self.data_graph.parse(path, format='turtle')
|
| 109 |
+
print(f"[OntologyManager] Loaded data graph: {len(self.data_graph)} triples")
|
| 110 |
+
return True
|
| 111 |
+
except Exception as e:
|
| 112 |
+
print(f"[OntologyManager] Error loading data graph: {e}")
|
| 113 |
+
return False
|
| 114 |
+
|
| 115 |
+
def add_evaluation_triplets(self, report: Dict[str, Any]) -> str:
|
| 116 |
+
"""
|
| 117 |
+
Add triplets for a new credibility evaluation.
|
| 118 |
+
|
| 119 |
+
Args:
|
| 120 |
+
report: The evaluation report dictionary from CredibilityVerificationSystem
|
| 121 |
+
|
| 122 |
+
Returns:
|
| 123 |
+
The URI of the created RapportEvaluation individual
|
| 124 |
+
"""
|
| 125 |
+
timestamp = datetime.now()
|
| 126 |
+
timestamp_str = timestamp.strftime("%Y%m%d_%H%M%S")
|
| 127 |
+
self._evaluation_counter += 1
|
| 128 |
+
|
| 129 |
+
# Create URIs for new individuals
|
| 130 |
+
report_uri = self.cred[f"Report_{timestamp_str}_{self._evaluation_counter}"]
|
| 131 |
+
request_uri = self.cred[f"Request_{timestamp_str}_{self._evaluation_counter}"]
|
| 132 |
+
info_uri = self.cred[f"Info_{timestamp_str}_{self._evaluation_counter}"]
|
| 133 |
+
|
| 134 |
+
# Get data from report
|
| 135 |
+
score = report.get('scoreCredibilite', 0.5)
|
| 136 |
+
input_data = report.get('informationEntree', '')
|
| 137 |
+
summary = report.get('resumeAnalyse', '')
|
| 138 |
+
|
| 139 |
+
# Determine credibility level based on score
|
| 140 |
+
if score >= 0.7:
|
| 141 |
+
level_uri = self.cred.Niveau_Haut
|
| 142 |
+
info_class = self.cred.InformationHauteCredibilite
|
| 143 |
+
elif score >= 0.4:
|
| 144 |
+
level_uri = self.cred.Niveau_Moyen
|
| 145 |
+
info_class = self.cred.InformationMoyenneCredibilite
|
| 146 |
+
else:
|
| 147 |
+
level_uri = self.cred.Niveau_Bas
|
| 148 |
+
info_class = self.cred.InformationFaibleCredibilite
|
| 149 |
+
|
| 150 |
+
# Add Information triplets
|
| 151 |
+
self.data_graph.add((info_uri, RDF.type, self.cred.InformationSoumise))
|
| 152 |
+
self.data_graph.add((info_uri, RDF.type, info_class))
|
| 153 |
+
self.data_graph.add((info_uri, self.cred.informationContent,
|
| 154 |
+
Literal(input_data[:500], datatype=XSD.string)))
|
| 155 |
+
|
| 156 |
+
# Check if it's a URL
|
| 157 |
+
if input_data.startswith('http'):
|
| 158 |
+
self.data_graph.add((info_uri, self.cred.informationURL,
|
| 159 |
+
Literal(input_data, datatype=XSD.anyURI)))
|
| 160 |
+
|
| 161 |
+
# Add Request triplets
|
| 162 |
+
self.data_graph.add((request_uri, RDF.type, self.cred.RequeteEvaluation))
|
| 163 |
+
self.data_graph.add((request_uri, self.cred.concernsInformation, info_uri))
|
| 164 |
+
self.data_graph.add((request_uri, self.cred.submissionTimestamp,
|
| 165 |
+
Literal(timestamp.isoformat(), datatype=XSD.dateTime)))
|
| 166 |
+
self.data_graph.add((request_uri, self.cred.requestStatus,
|
| 167 |
+
Literal("Completed", datatype=XSD.string)))
|
| 168 |
+
|
| 169 |
+
# Add Report triplets
|
| 170 |
+
self.data_graph.add((report_uri, RDF.type, self.cred.RapportEvaluation))
|
| 171 |
+
self.data_graph.add((report_uri, self.cred.isReportOf, request_uri))
|
| 172 |
+
self.data_graph.add((report_uri, self.cred.credibilityScoreValue,
|
| 173 |
+
Literal(float(score), datatype=XSD.float)))
|
| 174 |
+
self.data_graph.add((report_uri, self.cred.assignsCredibilityLevel, level_uri))
|
| 175 |
+
self.data_graph.add((report_uri, self.cred.completionTimestamp,
|
| 176 |
+
Literal(timestamp.isoformat(), datatype=XSD.dateTime)))
|
| 177 |
+
self.data_graph.add((report_uri, self.cred.reportSummary,
|
| 178 |
+
Literal(summary, datatype=XSD.string)))
|
| 179 |
+
|
| 180 |
+
# Add NLP results if available
|
| 181 |
+
nlp_results = report.get('analyseNLP', {})
|
| 182 |
+
if nlp_results:
|
| 183 |
+
nlp_result_uri = self.cred[f"NLPResult_{timestamp_str}_{self._evaluation_counter}"]
|
| 184 |
+
self.data_graph.add((nlp_result_uri, RDF.type, self.cred.ResultatNLP))
|
| 185 |
+
self.data_graph.add((report_uri, self.cred.includesNLPResult, nlp_result_uri))
|
| 186 |
+
|
| 187 |
+
sentiment = nlp_results.get('sentiment', {})
|
| 188 |
+
if sentiment:
|
| 189 |
+
self.data_graph.add((nlp_result_uri, self.cred.sentimentScore,
|
| 190 |
+
Literal(float(sentiment.get('score', 0.5)), datatype=XSD.float)))
|
| 191 |
+
|
| 192 |
+
coherence = nlp_results.get('coherence_score')
|
| 193 |
+
if coherence is not None:
|
| 194 |
+
self.data_graph.add((nlp_result_uri, self.cred.coherenceScore,
|
| 195 |
+
Literal(float(coherence), datatype=XSD.float)))
|
| 196 |
+
|
| 197 |
+
# Add source analysis if available
|
| 198 |
+
rules = report.get('reglesAppliquees', {})
|
| 199 |
+
source_analysis = rules.get('source_analysis', {})
|
| 200 |
+
if source_analysis:
|
| 201 |
+
source_uri = self.cred[f"SourceAnalysis_{timestamp_str}_{self._evaluation_counter}"]
|
| 202 |
+
self.data_graph.add((source_uri, RDF.type, self.cred.InfoSourceAnalyse))
|
| 203 |
+
self.data_graph.add((report_uri, self.cred.includesSourceAnalysis, source_uri))
|
| 204 |
+
|
| 205 |
+
reputation = source_analysis.get('reputation', 'Unknown')
|
| 206 |
+
self.data_graph.add((source_uri, self.cred.sourceAnalyzedReputation,
|
| 207 |
+
Literal(reputation, datatype=XSD.string)))
|
| 208 |
+
|
| 209 |
+
domain_age = source_analysis.get('domain_age_days')
|
| 210 |
+
if domain_age is not None:
|
| 211 |
+
self.data_graph.add((source_uri, self.cred.sourceMentionsCount,
|
| 212 |
+
Literal(int(domain_age), datatype=XSD.integer)))
|
| 213 |
+
|
| 214 |
+
# Add fact check results
|
| 215 |
+
fact_checks = rules.get('fact_checking', [])
|
| 216 |
+
for i, fc in enumerate(fact_checks):
|
| 217 |
+
evidence_uri = self.cred[f"Evidence_{timestamp_str}_{self._evaluation_counter}_{i}"]
|
| 218 |
+
self.data_graph.add((evidence_uri, RDF.type, self.cred.PreuveFactuelle))
|
| 219 |
+
self.data_graph.add((report_uri, self.cred.basedOnEvidence, evidence_uri))
|
| 220 |
+
|
| 221 |
+
self.data_graph.add((evidence_uri, self.cred.evidenceClaim,
|
| 222 |
+
Literal(fc.get('claim', ''), datatype=XSD.string)))
|
| 223 |
+
self.data_graph.add((evidence_uri, self.cred.evidenceVerdict,
|
| 224 |
+
Literal(fc.get('rating', ''), datatype=XSD.string)))
|
| 225 |
+
self.data_graph.add((evidence_uri, self.cred.evidenceSource,
|
| 226 |
+
Literal(fc.get('publisher', ''), datatype=XSD.string)))
|
| 227 |
+
if fc.get('url'):
|
| 228 |
+
self.data_graph.add((evidence_uri, self.cred.evidenceURL,
|
| 229 |
+
Literal(fc.get('url', ''), datatype=XSD.anyURI)))
|
| 230 |
+
|
| 231 |
+
# [NEW] Link similar claims found by GraphRAG
|
| 232 |
+
similar_uris = report.get('similar_claims_uris', [])
|
| 233 |
+
for sim_uri_str in similar_uris:
|
| 234 |
+
try:
|
| 235 |
+
sim_uri = URIRef(sim_uri_str)
|
| 236 |
+
self.data_graph.add((report_uri, RDFS.seeAlso, sim_uri))
|
| 237 |
+
except Exception as e:
|
| 238 |
+
print(f"[Ontology] Error linking similar URI {sim_uri_str}: {e}")
|
| 239 |
+
|
| 240 |
+
print(f"[OntologyManager] Added evaluation triplets. Report: {report_uri}")
|
| 241 |
+
return str(report_uri)
|
| 242 |
+
|
| 243 |
+
def query_source_history(self, url: str) -> List[EvaluationRecord]:
|
| 244 |
+
"""
|
| 245 |
+
Query all previous evaluations for a URL/domain.
|
| 246 |
+
|
| 247 |
+
Args:
|
| 248 |
+
url: URL to search for
|
| 249 |
+
|
| 250 |
+
Returns:
|
| 251 |
+
List of EvaluationRecord for this source
|
| 252 |
+
"""
|
| 253 |
+
results = []
|
| 254 |
+
|
| 255 |
+
# SPARQL query to find all evaluations for this URL
|
| 256 |
+
query = """
|
| 257 |
+
PREFIX cred: <https://github.com/DominiqueLoyer/systemFactChecking#>
|
| 258 |
+
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
|
| 259 |
+
|
| 260 |
+
SELECT ?report ?score ?level ?timestamp ?content
|
| 261 |
+
WHERE {
|
| 262 |
+
?info cred:informationURL ?url .
|
| 263 |
+
?request cred:concernsInformation ?info .
|
| 264 |
+
?report cred:isReportOf ?request .
|
| 265 |
+
?report cred:credibilityScoreValue ?score .
|
| 266 |
+
?report cred:assignsCredibilityLevel ?level .
|
| 267 |
+
?report cred:completionTimestamp ?timestamp .
|
| 268 |
+
?info cred:informationContent ?content .
|
| 269 |
+
FILTER(CONTAINS(STR(?url), "%s"))
|
| 270 |
+
}
|
| 271 |
+
ORDER BY DESC(?timestamp)
|
| 272 |
+
""" % url
|
| 273 |
+
|
| 274 |
+
try:
|
| 275 |
+
# Query combined graph (base + data)
|
| 276 |
+
combined = self.base_graph + self.data_graph
|
| 277 |
+
for row in combined.query(query):
|
| 278 |
+
results.append(EvaluationRecord(
|
| 279 |
+
evaluation_id=str(row.report),
|
| 280 |
+
url_or_text=str(row.content) if row.content else url,
|
| 281 |
+
score=float(row.score),
|
| 282 |
+
level=str(row.level).split('#')[-1],
|
| 283 |
+
timestamp=str(row.timestamp),
|
| 284 |
+
fact_checks=[]
|
| 285 |
+
))
|
| 286 |
+
except Exception as e:
|
| 287 |
+
print(f"[OntologyManager] Query error: {e}")
|
| 288 |
+
|
| 289 |
+
return results
|
| 290 |
+
|
| 291 |
+
def get_statistics(self) -> Dict[str, Any]:
|
| 292 |
+
"""Get statistics about the ontology data."""
|
| 293 |
+
stats = {
|
| 294 |
+
'base_triples': len(self.base_graph),
|
| 295 |
+
'data_triples': len(self.data_graph),
|
| 296 |
+
'total_triples': len(self.base_graph) + len(self.data_graph),
|
| 297 |
+
}
|
| 298 |
+
|
| 299 |
+
# Count evaluations
|
| 300 |
+
query = """
|
| 301 |
+
PREFIX cred: <https://github.com/DominiqueLoyer/systemFactChecking#>
|
| 302 |
+
SELECT (COUNT(?report) as ?count) WHERE {
|
| 303 |
+
?report a cred:RapportEvaluation .
|
| 304 |
+
}
|
| 305 |
+
"""
|
| 306 |
+
try:
|
| 307 |
+
for row in self.data_graph.query(query):
|
| 308 |
+
stats['total_evaluations'] = int(row.count)
|
| 309 |
+
except Exception:
|
| 310 |
+
stats['total_evaluations'] = 0
|
| 311 |
+
|
| 312 |
+
return stats
|
| 313 |
+
|
| 314 |
+
def get_graph_json(self) -> Dict[str, List]:
|
| 315 |
+
"""
|
| 316 |
+
Convert ontology data into D3.js JSON format (Nodes & Links).
|
| 317 |
+
"""
|
| 318 |
+
nodes = []
|
| 319 |
+
links = []
|
| 320 |
+
added_nodes = set()
|
| 321 |
+
|
| 322 |
+
# Get the latest report ID
|
| 323 |
+
latest_query = """
|
| 324 |
+
PREFIX cred: <https://github.com/DominiqueLoyer/systemFactChecking#>
|
| 325 |
+
SELECT ?report ?timestamp WHERE {
|
| 326 |
+
?report a cred:RapportEvaluation .
|
| 327 |
+
?report cred:completionTimestamp ?timestamp .
|
| 328 |
+
}
|
| 329 |
+
ORDER BY DESC(?timestamp)
|
| 330 |
+
LIMIT 1
|
| 331 |
+
"""
|
| 332 |
+
latest_report = None
|
| 333 |
+
try:
|
| 334 |
+
for row in self.data_graph.query(latest_query):
|
| 335 |
+
latest_report = row.report
|
| 336 |
+
except Exception:
|
| 337 |
+
pass
|
| 338 |
+
|
| 339 |
+
if not latest_report:
|
| 340 |
+
return {'nodes': [], 'links': []}
|
| 341 |
+
|
| 342 |
+
# Helper to add node if unique
|
| 343 |
+
def add_node(uri, label, type_class, group):
|
| 344 |
+
if str(uri) not in added_nodes:
|
| 345 |
+
nodes.append({
|
| 346 |
+
'id': str(uri),
|
| 347 |
+
'name': str(label),
|
| 348 |
+
'group': group,
|
| 349 |
+
'type': str(type_class).split('#')[-1]
|
| 350 |
+
})
|
| 351 |
+
added_nodes.add(str(uri))
|
| 352 |
+
|
| 353 |
+
# Add Central Node (Report)
|
| 354 |
+
add_node(latest_report, "Latest Report", "cred:RapportEvaluation", 1)
|
| 355 |
+
|
| 356 |
+
# Query triples related to this report (Level 1)
|
| 357 |
+
related_query = """
|
| 358 |
+
PREFIX cred: <https://github.com/DominiqueLoyer/systemFactChecking#>
|
| 359 |
+
SELECT ?p ?o ?oType ?oLabel WHERE {
|
| 360 |
+
<%s> ?p ?o .
|
| 361 |
+
OPTIONAL { ?o a ?oType } .
|
| 362 |
+
OPTIONAL { ?o cred:evidenceSnippet ?oLabel } .
|
| 363 |
+
OPTIONAL { ?o cred:sourceAnalyzedReputation ?oLabel } .
|
| 364 |
+
}
|
| 365 |
+
""" % str(latest_report)
|
| 366 |
+
|
| 367 |
+
try:
|
| 368 |
+
# Level 1: Report -> Components
|
| 369 |
+
for row in self.data_graph.query(related_query):
|
| 370 |
+
p = row.p
|
| 371 |
+
o = row.o
|
| 372 |
+
|
| 373 |
+
# Skip generic system triples like rdf:type, but allow rdfs:seeAlso
|
| 374 |
+
if str(p) == str(RDF.type): continue
|
| 375 |
+
if 'Literal' in str(type(o)): continue # Skip basic literals
|
| 376 |
+
|
| 377 |
+
# Determine Group/Color
|
| 378 |
+
o_type = str(row.oType) if row.oType else "Unknown"
|
| 379 |
+
group = 2 # Default gray
|
| 380 |
+
if 'High' in o_type or 'Supporting' in o_type: group = 3 # Green (Positive)
|
| 381 |
+
if 'Low' in o_type or 'Refuting' in o_type: group = 4 # Red (Negative)
|
| 382 |
+
if 'Rapport' in o_type: group = 1 # Purple (Hub)
|
| 383 |
+
if 'SourceAnalysis' in o_type: group = 5 # Blue (Source)
|
| 384 |
+
if str(p) == str(RDFS.seeAlso): group = 7 # Orange for similar claims
|
| 385 |
+
|
| 386 |
+
# Add Target Node (Level 1)
|
| 387 |
+
o_label = row.oLabel if row.oLabel else str(o).split('#')[-1]
|
| 388 |
+
add_node(o, o_label, o_type, group)
|
| 389 |
+
|
| 390 |
+
# Add Link L1
|
| 391 |
+
link_type = 'primary'
|
| 392 |
+
if str(p) == str(RDFS.seeAlso):
|
| 393 |
+
link_type = 'similar' # Special dash style for similar claims?
|
| 394 |
+
|
| 395 |
+
links.append({
|
| 396 |
+
'source': str(latest_report),
|
| 397 |
+
'target': str(o),
|
| 398 |
+
'value': 2,
|
| 399 |
+
'type': link_type
|
| 400 |
+
})
|
| 401 |
+
|
| 402 |
+
# Level 2: Component -> Details (Recursive enrich)
|
| 403 |
+
# Specifically for SourceAnalysis and Evidence
|
| 404 |
+
l2_query = """
|
| 405 |
+
SELECT ?p2 ?o2 ?o2Type WHERE {
|
| 406 |
+
<%s> ?p2 ?o2 .
|
| 407 |
+
OPTIONAL { ?o2 a ?o2Type } .
|
| 408 |
+
FILTER(isURI(?o2))
|
| 409 |
+
}""" % str(o)
|
| 410 |
+
|
| 411 |
+
for row2 in self.data_graph.query(l2_query):
|
| 412 |
+
o2 = row2.o2
|
| 413 |
+
if str(row2.p2) == str(RDF.type): continue
|
| 414 |
+
|
| 415 |
+
o2_label = str(o2).split('#')[-1]
|
| 416 |
+
add_node(o2, o2_label, "Detail", 6) # Group 6 for leaf nodes
|
| 417 |
+
|
| 418 |
+
links.append({
|
| 419 |
+
'source': str(o),
|
| 420 |
+
'target': str(o2),
|
| 421 |
+
'value': 1,
|
| 422 |
+
'type': 'secondary'
|
| 423 |
+
})
|
| 424 |
+
|
| 425 |
+
except Exception as e:
|
| 426 |
+
print(f"Graph query error: {e}")
|
| 427 |
+
|
| 428 |
+
return {'nodes': nodes, 'links': links}
|
| 429 |
+
|
| 430 |
+
def export_to_ttl(self, output_path: str, include_base: bool = False) -> bool:
|
| 431 |
+
"""
|
| 432 |
+
Export the ontology to a TTL file.
|
| 433 |
+
|
| 434 |
+
Args:
|
| 435 |
+
output_path: Path to write the TTL file
|
| 436 |
+
include_base: If True, include base ontology in export
|
| 437 |
+
|
| 438 |
+
Returns:
|
| 439 |
+
True if successful
|
| 440 |
+
"""
|
| 441 |
+
try:
|
| 442 |
+
if include_base:
|
| 443 |
+
combined = self.base_graph + self.data_graph
|
| 444 |
+
combined.serialize(destination=output_path, format='turtle')
|
| 445 |
+
else:
|
| 446 |
+
self.data_graph.serialize(destination=output_path, format='turtle')
|
| 447 |
+
|
| 448 |
+
print(f"[OntologyManager] Exported to: {output_path}")
|
| 449 |
+
return True
|
| 450 |
+
except Exception as e:
|
| 451 |
+
print(f"[OntologyManager] Export error: {e}")
|
| 452 |
+
return False
|
| 453 |
+
|
| 454 |
+
def save_data(self) -> bool:
|
| 455 |
+
"""Save the data graph to its configured path."""
|
| 456 |
+
if self.data_path:
|
| 457 |
+
return self.export_to_ttl(self.data_path, include_base=False)
|
| 458 |
+
return False
|
| 459 |
+
|
| 460 |
+
|
| 461 |
+
# --- Testing ---
|
| 462 |
+
if __name__ == "__main__":
|
| 463 |
+
print("=== Testing OntologyManager ===\n")
|
| 464 |
+
|
| 465 |
+
# Test with base ontology
|
| 466 |
+
base_path = "/Users/bk280625/documents041025/MonCode/sysCRED_onto26avrtil.ttl"
|
| 467 |
+
data_path = "/Users/bk280625/documents041025/MonCode/ontology/sysCRED_data.ttl"
|
| 468 |
+
|
| 469 |
+
manager = OntologyManager(base_ontology_path=base_path, data_path=None)
|
| 470 |
+
|
| 471 |
+
# Test adding evaluation
|
| 472 |
+
sample_report = {
|
| 473 |
+
'scoreCredibilite': 0.72,
|
| 474 |
+
'informationEntree': 'https://www.lemonde.fr/article/test',
|
| 475 |
+
'resumeAnalyse': "L'analyse suggère une crédibilité MOYENNE à ÉLEVÉE.",
|
| 476 |
+
'analyseNLP': {
|
| 477 |
+
'sentiment': {'label': 'POSITIVE', 'score': 0.85},
|
| 478 |
+
'coherence_score': 0.78
|
| 479 |
+
},
|
| 480 |
+
'reglesAppliquees': {
|
| 481 |
+
'source_analysis': {
|
| 482 |
+
'reputation': 'High',
|
| 483 |
+
'domain_age_days': 9000
|
| 484 |
+
},
|
| 485 |
+
'fact_checking': [
|
| 486 |
+
{'claim': 'Article verified by fact-checkers', 'rating': 'True'}
|
| 487 |
+
]
|
| 488 |
+
}
|
| 489 |
+
}
|
| 490 |
+
|
| 491 |
+
print("Test 1: Adding evaluation triplets...")
|
| 492 |
+
report_uri = manager.add_evaluation_triplets(sample_report)
|
| 493 |
+
print(f" Created: {report_uri}")
|
| 494 |
+
print()
|
| 495 |
+
|
| 496 |
+
# Test statistics
|
| 497 |
+
print("Test 2: Getting statistics...")
|
| 498 |
+
stats = manager.get_statistics()
|
| 499 |
+
for key, value in stats.items():
|
| 500 |
+
print(f" {key}: {value}")
|
| 501 |
+
print()
|
| 502 |
+
|
| 503 |
+
# Export test
|
| 504 |
+
print("Test 3: Exporting data graph...")
|
| 505 |
+
os.makedirs(os.path.dirname(data_path), exist_ok=True)
|
| 506 |
+
manager.export_to_ttl(data_path)
|
| 507 |
+
print(f" Exported to: {data_path}")
|
| 508 |
+
|
| 509 |
+
print("\n=== Tests Complete ===")
|
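A short sketch of how one verification report is persisted into the RDF graph and queried back with the manager above; the report keys mirror those consumed by add_evaluation_triplets(), while the paths and values themselves are illustrative:

```python
# Sketch: persist an evaluation report as RDF triples, then query the source history.
from syscred.ontology_manager import OntologyManager

om = OntologyManager(base_ontology_path="sysCRED_onto.ttl",  # placeholder path
                     data_path="sysCRED_data.ttl")           # placeholder path

report = {
    'scoreCredibilite': 0.81,
    'informationEntree': 'https://www.example.org/article',
    'resumeAnalyse': 'High credibility: reputable source, no refuting fact-checks.',
    'analyseNLP': {'sentiment': {'label': 'NEUTRAL', 'score': 0.55}, 'coherence_score': 0.9},
    'reglesAppliquees': {'source_analysis': {'reputation': 'High'}, 'fact_checking': []},
}

report_uri = om.add_evaluation_triplets(report)   # creates Report_/Request_/Info_ individuals
om.save_data()                                    # serialize the data graph to data_path

for rec in om.query_source_history("example.org"):
    print(rec.timestamp, rec.score, rec.level)
```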
syscred/requirements-light.txt
ADDED
|
@@ -0,0 +1,31 @@
|
| 1 |
+
# SysCRED - Light Requirements (for Render Free Tier)
|
| 2 |
+
# Système Hybride de Vérification de Crédibilité
|
| 3 |
+
# (c) Dominique S. Loyer
|
| 4 |
+
#
|
| 5 |
+
# NOTE: ML features (embeddings) disabled for memory constraints
|
| 6 |
+
# For full ML support, use Railway, Fly.io, or Google Cloud Run
|
| 7 |
+
|
| 8 |
+
# === Core Dependencies ===
|
| 9 |
+
requests>=2.28.0
|
| 10 |
+
beautifulsoup4>=4.11.0
|
| 11 |
+
python-whois>=0.8.0
|
| 12 |
+
|
| 13 |
+
# === RDF/Ontology ===
|
| 14 |
+
rdflib>=6.0.0
|
| 15 |
+
|
| 16 |
+
# === Data Processing (lightweight) ===
|
| 17 |
+
numpy>=1.24.0
|
| 18 |
+
pandas>=2.0.0
|
| 19 |
+
|
| 20 |
+
# === Web Backend ===
|
| 21 |
+
flask>=2.3.0
|
| 22 |
+
flask-cors>=4.0.0
|
| 23 |
+
python-dotenv>=1.0.0
|
| 24 |
+
|
| 25 |
+
# === Production/Database ===
|
| 26 |
+
gunicorn>=20.1.0
|
| 27 |
+
psycopg2-binary>=2.9.0
|
| 28 |
+
flask-sqlalchemy>=3.0.0
|
| 29 |
+
|
| 30 |
+
# === Development/Testing ===
|
| 31 |
+
pytest>=7.0.0
|
syscred/requirements.txt
ADDED
|
@@ -0,0 +1,34 @@
|
| 1 |
+
# SysCRED - Requirements
|
| 2 |
+
# Système Hybride de Vérification de Crédibilité
|
| 3 |
+
# (c) Dominique S. Loyer
|
| 4 |
+
|
| 5 |
+
# === Core Dependencies ===
|
| 6 |
+
requests>=2.28.0
|
| 7 |
+
beautifulsoup4>=4.11.0
|
| 8 |
+
python-whois>=0.8.0
|
| 9 |
+
|
| 10 |
+
# === RDF/Ontology ===
|
| 11 |
+
rdflib>=6.0.0
|
| 12 |
+
|
| 13 |
+
# === Machine Learning ===
|
| 14 |
+
transformers>=4.30.0
|
| 15 |
+
torch>=2.0.0
|
| 16 |
+
numpy>=1.24.0
|
| 17 |
+
sentence-transformers>=2.2.0
|
| 18 |
+
|
| 19 |
+
# === Explainability ===
|
| 20 |
+
lime>=0.2.0
|
| 21 |
+
|
| 22 |
+
# === Web Backend ===
|
| 23 |
+
flask>=2.3.0
|
| 24 |
+
flask-cors>=4.0.0
|
| 25 |
+
python-dotenv>=1.0.0
|
| 26 |
+
pandas>=2.0.0
|
| 27 |
+
|
| 28 |
+
# === Production/Database ===
|
| 29 |
+
gunicorn>=20.1.0
|
| 30 |
+
psycopg2-binary>=2.9.0
|
| 31 |
+
flask-sqlalchemy>=3.0.0
|
| 32 |
+
|
| 33 |
+
# === Development/Testing ===
|
| 34 |
+
pytest>=7.0.0
|
syscred/requirements_light.txt
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SysCRED - Requirements (Light Version for Render Free Tier)
|
| 2 |
+
# Sans ML models - Mode heuristique uniquement
|
| 3 |
+
# (c) Dominique S. Loyer
|
| 4 |
+
|
| 5 |
+
# === Core Dependencies ===
|
| 6 |
+
requests>=2.28.0
|
| 7 |
+
beautifulsoup4>=4.11.0
|
| 8 |
+
python-whois>=0.8.0
|
| 9 |
+
|
| 10 |
+
# === RDF/Ontology ===
|
| 11 |
+
rdflib>=6.0.0
|
| 12 |
+
|
| 13 |
+
# === Web Backend ===
|
| 14 |
+
flask>=2.3.0
|
| 15 |
+
flask-cors>=4.0.0
|
| 16 |
+
python-dotenv>=1.0.0
|
| 17 |
+
|
| 18 |
+
# === Production ===
|
| 19 |
+
gunicorn>=20.1.0
|
syscred/run_benchmark.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import json
|
| 3 |
+
import time
|
| 4 |
+
import os
|
| 5 |
+
import sys
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Dict, List
|
| 8 |
+
import pandas as pd
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
|
| 11 |
+
# Add project root to path (one level up from this script)
|
| 12 |
+
sys.path.append(str(Path(__file__).parent.parent))
|
| 13 |
+
|
| 14 |
+
from syscred.verification_system import CredibilityVerificationSystem
|
| 15 |
+
from syscred.config import config
|
| 16 |
+
|
| 17 |
+
def run_benchmark():
|
| 18 |
+
print("="*60)
|
| 19 |
+
print(" SysCRED v2.1 - Scientific Evaluation Benchmark ")
|
| 20 |
+
print("="*60)
|
| 21 |
+
|
| 22 |
+
# Load Benchmark Data
|
| 23 |
+
data_path = Path(__file__).parent / "benchmark_data.json"
|
| 24 |
+
if not data_path.exists():
|
| 25 |
+
print(f"❌ Error: {data_path} not found.")
|
| 26 |
+
return
|
| 27 |
+
|
| 28 |
+
with open(data_path, 'r') as f:
|
| 29 |
+
dataset = json.load(f)
|
| 30 |
+
|
| 31 |
+
print(f"Loaded {len(dataset)} test cases.\n")
|
| 32 |
+
|
| 33 |
+
# Initialize System with Full Capabilities
|
| 34 |
+
print("Initializing SysCRED (ML Models + Google API)...")
|
| 35 |
+
system = CredibilityVerificationSystem(
|
| 36 |
+
ontology_base_path=str(config.ONTOLOGY_BASE_PATH),
|
| 37 |
+
ontology_data_path=str(config.ONTOLOGY_DATA_PATH),
|
| 38 |
+
load_ml_models=True, # Use full ML for benchmark
|
| 39 |
+
google_api_key=config.GOOGLE_FACT_CHECK_API_KEY
|
| 40 |
+
)
|
| 41 |
+
print("System ready.\n")
|
| 42 |
+
|
| 43 |
+
results = []
|
| 44 |
+
|
| 45 |
+
# Run Evaluation
|
| 46 |
+
for i, item in enumerate(dataset):
|
| 47 |
+
url = item['url']
|
| 48 |
+
label = item['label']
|
| 49 |
+
print(f"[{i+1}/{len(dataset)}] Analyzing: {url} (Expected: {label})...")
|
| 50 |
+
|
| 51 |
+
start_time = time.time()
|
| 52 |
+
try:
|
| 53 |
+
# Run analysis
|
| 54 |
+
# We treat empty text fallbacks as valid logic path
|
| 55 |
+
report = system.verify_information(url)
|
| 56 |
+
score = report.get('score_credibilite', 0.5)
|
| 57 |
+
|
| 58 |
+
# Determine System Verdict
|
| 59 |
+
sys_verdict = "High" if score >= 0.55 else "Low"
|
| 60 |
+
|
| 61 |
+
# Compare
|
| 62 |
+
match = (sys_verdict == label) or (label == "High" and sys_verdict == "High") or (label == "Low" and sys_verdict == "Low")
|
| 63 |
+
# Handling Medium? For binary benchmark, we assume simplified threshold.
|
| 64 |
+
# Or we can map:
|
| 65 |
+
# High (>=0.7)
|
| 66 |
+
# Medium (0.4-0.7)
|
| 67 |
+
# Low (<0.4)
|
| 68 |
+
|
| 69 |
+
# Simple Binary Metric for Precision/Recall:
|
| 70 |
+
# Positive Class = "High Credibility"
|
| 71 |
+
|
| 72 |
+
results.append({
|
| 73 |
+
"url": url,
|
| 74 |
+
"expected": label,
|
| 75 |
+
"score": score,
|
| 76 |
+
"system_verdict": sys_verdict,
|
| 77 |
+
"match": match,
|
| 78 |
+
"time": time.time() - start_time,
|
| 79 |
+
"error": None
|
| 80 |
+
})
|
| 81 |
+
print(f" -> Score: {score:.2f} | Verdict: {sys_verdict} | match: {'✅' if match else '❌'}")
|
| 82 |
+
|
| 83 |
+
except Exception as e:
|
| 84 |
+
print(f" -> ❌ Error: {e}")
|
| 85 |
+
results.append({
|
| 86 |
+
"url": url,
|
| 87 |
+
"expected": label,
|
| 88 |
+
"score": 0,
|
| 89 |
+
"system_verdict": "Error",
|
| 90 |
+
"match": False,
|
| 91 |
+
"time": time.time() - start_time,
|
| 92 |
+
"error": str(e)
|
| 93 |
+
})
|
| 94 |
+
|
| 95 |
+
# Calculate Metrics
|
| 96 |
+
print("\n" + "="*60)
|
| 97 |
+
print("RESULTS SUMMARY")
|
| 98 |
+
print("="*60)
|
| 99 |
+
|
| 100 |
+
df = pd.DataFrame(results)
|
| 101 |
+
|
| 102 |
+
# Logic for metrics
|
| 103 |
+
# TP: System=High, Expected=High
|
| 104 |
+
# FP: System=High, Expected=Low
|
| 105 |
+
# TN: System=Low, Expected=Low
|
| 106 |
+
# FN: System=Low, Expected=High
|
| 107 |
+
|
| 108 |
+
tp = len(df[(df['system_verdict'] == 'High') & (df['expected'] == 'High')])
|
| 109 |
+
fp = len(df[(df['system_verdict'] == 'High') & (df['expected'] == 'Low')])
|
| 110 |
+
tn = len(df[(df['system_verdict'] == 'Low') & (df['expected'] == 'Low')])
|
| 111 |
+
fn = len(df[(df['system_verdict'] == 'Low') & (df['expected'] == 'High')])
|
| 112 |
+
|
| 113 |
+
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
|
| 114 |
+
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
|
| 115 |
+
accuracy = (tp + tn) / len(df) if len(df) > 0 else 0
|
| 116 |
+
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
|
| 117 |
+
|
| 118 |
+
print(f"Total Cases: {len(df)}")
|
| 119 |
+
print(f"Accuracy: {accuracy:.2%}")
|
| 120 |
+
print(f"Precision: {precision:.2%}")
|
| 121 |
+
print(f"Recall: {recall:.2%}")
|
| 122 |
+
print(f"F1-Score: {f1:.2f}")
|
| 123 |
+
|
| 124 |
+
print("\nConfusion Matrix:")
|
| 125 |
+
print(f" | Pred High | Pred Low")
|
| 126 |
+
print(f"True High | {tp} | {fn}")
|
| 127 |
+
print(f"True Low | {fp} | {tn}")
|
| 128 |
+
|
| 129 |
+
# Save detailed report
|
| 130 |
+
report_path = Path(__file__).parent / "benchmark_results.csv"
|
| 131 |
+
df.to_csv(report_path, index=False)
|
| 132 |
+
print(f"\nDetailed CSV Saved to: {report_path}")
|
| 133 |
+
|
| 134 |
+
if __name__ == "__main__":
|
| 135 |
+
run_benchmark()
|
syscred/run_trec_benchmark.py
ADDED
|
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
TREC Benchmark Script - SysCRED
|
| 4 |
+
================================
|
| 5 |
+
Run TREC-style evaluation on the fact-checking system.
|
| 6 |
+
|
| 7 |
+
This script:
|
| 8 |
+
1. Loads TREC AP88-90 topics and qrels
|
| 9 |
+
2. Runs retrieval with multiple models (BM25, QLD, TF-IDF)
|
| 10 |
+
3. Evaluates using pytrec_eval metrics
|
| 11 |
+
4. Generates comparison tables and visualizations
|
| 12 |
+
|
| 13 |
+
Usage:
|
| 14 |
+
python run_trec_benchmark.py --index /path/to/index --qrels /path/to/qrels
|
| 15 |
+
|
| 16 |
+
(c) Dominique S. Loyer - PhD Thesis Prototype
|
| 17 |
+
Citation Key: loyerEvaluationModelesRecherche2025
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
import os
|
| 21 |
+
import sys
|
| 22 |
+
import json
|
| 23 |
+
import argparse
|
| 24 |
+
import time
|
| 25 |
+
from pathlib import Path
|
| 26 |
+
from typing import Dict, List, Any, Tuple
|
| 27 |
+
from collections import defaultdict
|
| 28 |
+
|
| 29 |
+
# Add parent directory to path
|
| 30 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 31 |
+
|
| 32 |
+
from syscred.trec_retriever import TRECRetriever, RetrievalResult
|
| 33 |
+
from syscred.trec_dataset import TRECDataset, SAMPLE_TOPICS
|
| 34 |
+
from syscred.eval_metrics import EvaluationMetrics
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class TRECBenchmark:
|
| 38 |
+
"""
|
| 39 |
+
TREC-style benchmark runner for SysCRED.
|
| 40 |
+
|
| 41 |
+
Runs multiple retrieval configurations and compares performance
|
| 42 |
+
using standard IR metrics.
|
| 43 |
+
"""
|
| 44 |
+
|
| 45 |
+
# Configurations to test
|
| 46 |
+
CONFIGURATIONS = [
|
| 47 |
+
{"name": "BM25", "model": "bm25", "prf": False},
|
| 48 |
+
{"name": "BM25+PRF", "model": "bm25", "prf": True},
|
| 49 |
+
{"name": "QLD", "model": "qld", "prf": False},
|
| 50 |
+
{"name": "QLD+PRF", "model": "qld", "prf": True},
|
| 51 |
+
]
|
| 52 |
+
|
| 53 |
+
# Metrics to evaluate
|
| 54 |
+
METRICS = ["map", "ndcg", "P_10", "P_20", "recall_100", "recip_rank"]
|
| 55 |
+
|
| 56 |
+
def __init__(
|
| 57 |
+
self,
|
| 58 |
+
index_path: str = None,
|
| 59 |
+
corpus_path: str = None,
|
| 60 |
+
topics_path: str = None,
|
| 61 |
+
qrels_path: str = None,
|
| 62 |
+
output_dir: str = None
|
| 63 |
+
):
|
| 64 |
+
"""
|
| 65 |
+
Initialize the benchmark runner.
|
| 66 |
+
|
| 67 |
+
Args:
|
| 68 |
+
index_path: Path to Lucene index
|
| 69 |
+
corpus_path: Path to JSONL corpus
|
| 70 |
+
topics_path: Path to TREC topics
|
| 71 |
+
qrels_path: Path to TREC qrels
|
| 72 |
+
output_dir: Directory for output files
|
| 73 |
+
"""
|
| 74 |
+
self.index_path = index_path
|
| 75 |
+
self.corpus_path = corpus_path
|
| 76 |
+
self.topics_path = topics_path
|
| 77 |
+
self.qrels_path = qrels_path
|
| 78 |
+
self.output_dir = Path(output_dir) if output_dir else Path("benchmark_results")
|
| 79 |
+
|
| 80 |
+
# Create output directory
|
| 81 |
+
self.output_dir.mkdir(parents=True, exist_ok=True)
|
| 82 |
+
|
| 83 |
+
# Initialize components
|
| 84 |
+
self.dataset = TRECDataset(
|
| 85 |
+
topics_dir=topics_path,
|
| 86 |
+
qrels_dir=qrels_path,
|
| 87 |
+
corpus_path=corpus_path
|
| 88 |
+
)
|
| 89 |
+
|
| 90 |
+
self.retriever = TRECRetriever(
|
| 91 |
+
index_path=index_path,
|
| 92 |
+
corpus_path=corpus_path,
|
| 93 |
+
use_stemming=True
|
| 94 |
+
)
|
| 95 |
+
|
| 96 |
+
self.metrics = EvaluationMetrics()
|
| 97 |
+
|
| 98 |
+
# Results storage
|
| 99 |
+
self.results: Dict[str, Dict[str, Any]] = {}
|
| 100 |
+
|
| 101 |
+
def load_data(self):
|
| 102 |
+
"""Load topics and qrels."""
|
| 103 |
+
print("\n" + "=" * 60)
|
| 104 |
+
print("Loading TREC Data")
|
| 105 |
+
print("=" * 60)
|
| 106 |
+
|
| 107 |
+
# Load topics
|
| 108 |
+
if self.topics_path:
|
| 109 |
+
self.dataset.load_topics(self.topics_path)
|
| 110 |
+
else:
|
| 111 |
+
# Use sample topics
|
| 112 |
+
print("[Benchmark] Using sample topics (no topics file provided)")
|
| 113 |
+
self.dataset.topics = SAMPLE_TOPICS.copy()
|
| 114 |
+
|
| 115 |
+
# Load qrels
|
| 116 |
+
if self.qrels_path:
|
| 117 |
+
self.dataset.load_qrels(self.qrels_path)
|
| 118 |
+
else:
|
| 119 |
+
print("[Benchmark] No qrels provided - evaluation will be limited")
|
| 120 |
+
|
| 121 |
+
# Load corpus if available
|
| 122 |
+
if self.corpus_path:
|
| 123 |
+
self.dataset.load_corpus_jsonl(self.corpus_path)
|
| 124 |
+
|
| 125 |
+
stats = self.dataset.get_statistics()
|
| 126 |
+
print(f"\nDataset Statistics:")
|
| 127 |
+
for key, value in stats.items():
|
| 128 |
+
print(f" {key}: {value}")
|
| 129 |
+
|
| 130 |
+
def run_configuration(
|
| 131 |
+
self,
|
| 132 |
+
config: Dict[str, Any],
|
| 133 |
+
query_type: str = "short",
|
| 134 |
+
k: int = 100
|
| 135 |
+
) -> Tuple[str, Dict[str, Any]]:
|
| 136 |
+
"""
|
| 137 |
+
Run a single retrieval configuration.
|
| 138 |
+
|
| 139 |
+
Returns:
|
| 140 |
+
(run_tag, results_dict)
|
| 141 |
+
"""
|
| 142 |
+
config_name = config["name"]
|
| 143 |
+
model = config["model"]
|
| 144 |
+
use_prf = config["prf"]
|
| 145 |
+
|
| 146 |
+
run_tag = f"syscred_{config_name}_{query_type}"
|
| 147 |
+
|
| 148 |
+
print(f"\n--- Running: {run_tag} ---")
|
| 149 |
+
|
| 150 |
+
queries = self.dataset.get_topic_queries(query_type)
|
| 151 |
+
|
| 152 |
+
if not queries:
|
| 153 |
+
print(f" No queries available!")
|
| 154 |
+
return run_tag, {}
|
| 155 |
+
|
| 156 |
+
# Run retrieval
|
| 157 |
+
start_time = time.time()
|
| 158 |
+
|
| 159 |
+
all_results = []
|
| 160 |
+
run_lines = []
|
| 161 |
+
|
| 162 |
+
for topic_id, query_text in queries.items():
|
| 163 |
+
result = self.retriever.retrieve_evidence(
|
| 164 |
+
claim=query_text,
|
| 165 |
+
k=k,
|
| 166 |
+
model=model,
|
| 167 |
+
use_prf=use_prf
|
| 168 |
+
)
|
| 169 |
+
|
| 170 |
+
for evidence in result.evidences:
|
| 171 |
+
all_results.append({
|
| 172 |
+
"topic_id": topic_id,
|
| 173 |
+
"doc_id": evidence.doc_id,
|
| 174 |
+
"score": evidence.score,
|
| 175 |
+
"rank": evidence.rank
|
| 176 |
+
})
|
| 177 |
+
run_lines.append(
|
| 178 |
+
f"{topic_id} Q0 {evidence.doc_id} {evidence.rank} {evidence.score:.6f} {run_tag}"
|
| 179 |
+
)
|
| 180 |
+
|
| 181 |
+
elapsed = time.time() - start_time
|
| 182 |
+
|
| 183 |
+
# Save run file
|
| 184 |
+
run_file = self.output_dir / f"{run_tag}.run"
|
| 185 |
+
with open(run_file, 'w') as f:
|
| 186 |
+
f.write("\n".join(run_lines))
|
| 187 |
+
|
| 188 |
+
print(f" Queries: {len(queries)}")
|
| 189 |
+
print(f" Total results: {len(all_results)}")
|
| 190 |
+
print(f" Time: {elapsed:.2f}s")
|
| 191 |
+
print(f" Saved: {run_file}")
|
| 192 |
+
|
| 193 |
+
return run_tag, {
|
| 194 |
+
"config": config,
|
| 195 |
+
"query_type": query_type,
|
| 196 |
+
"results": all_results,
|
| 197 |
+
"run_file": str(run_file),
|
| 198 |
+
"elapsed_time": elapsed
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
def evaluate_run(self, run_tag: str, results: Dict[str, Any]) -> Dict[str, float]:
|
| 202 |
+
"""
|
| 203 |
+
Evaluate a run using pytrec_eval.
|
| 204 |
+
|
| 205 |
+
Returns dictionary of metric -> value (aggregated across queries).
|
| 206 |
+
"""
|
| 207 |
+
if not self.dataset.qrels:
|
| 208 |
+
print(f" [Skip evaluation - no qrels]")
|
| 209 |
+
return {}
|
| 210 |
+
|
| 211 |
+
# Convert results to pytrec format: {query_id: [(doc_id, score), ...]}
|
| 212 |
+
run = defaultdict(list)
|
| 213 |
+
for r in results["results"]:
|
| 214 |
+
run[r["topic_id"]].append((r["doc_id"], r["score"]))
|
| 215 |
+
|
| 216 |
+
# Sort each query's results by score descending
|
| 217 |
+
for qid in run:
|
| 218 |
+
run[qid].sort(key=lambda x: x[1], reverse=True)
|
| 219 |
+
|
| 220 |
+
# Convert qrels to pytrec format
|
| 221 |
+
qrels = {}
|
| 222 |
+
for topic_id, docs in self.dataset.qrels.items():
|
| 223 |
+
qrels[topic_id] = {doc_id: rel for doc_id, rel in docs.items()}
|
| 224 |
+
|
| 225 |
+
# Evaluate
|
| 226 |
+
try:
|
| 227 |
+
per_query_results = self.metrics.evaluate_run(dict(run), qrels, self.METRICS)
|
| 228 |
+
# Aggregate results across queries
|
| 229 |
+
aggregated = self.metrics.compute_aggregate(per_query_results)
|
| 230 |
+
return aggregated
|
| 231 |
+
except Exception as e:
|
| 232 |
+
print(f" [Evaluation error: {e}]")
|
| 233 |
+
return {}
|
| 234 |
+
|
| 235 |
+
def run_full_benchmark(self, query_types: List[str] = None, k: int = 100):
|
| 236 |
+
"""
|
| 237 |
+
Run the complete benchmark suite.
|
| 238 |
+
|
| 239 |
+
Args:
|
| 240 |
+
query_types: List of query types to test ("short", "long")
|
| 241 |
+
k: Number of results per query
|
| 242 |
+
"""
|
| 243 |
+
if query_types is None:
|
| 244 |
+
query_types = ["short", "long"]
|
| 245 |
+
|
| 246 |
+
print("\n" + "=" * 60)
|
| 247 |
+
print("TREC Benchmark - SysCRED")
|
| 248 |
+
print("=" * 60)
|
| 249 |
+
|
| 250 |
+
# Load data
|
| 251 |
+
self.load_data()
|
| 252 |
+
|
| 253 |
+
# Run all configurations
|
| 254 |
+
print("\n" + "=" * 60)
|
| 255 |
+
print("Running Retrieval Experiments")
|
| 256 |
+
print("=" * 60)
|
| 257 |
+
|
| 258 |
+
for query_type in query_types:
|
| 259 |
+
for config in self.CONFIGURATIONS:
|
| 260 |
+
run_tag, results = self.run_configuration(
|
| 261 |
+
config, query_type, k
|
| 262 |
+
)
|
| 263 |
+
|
| 264 |
+
if results:
|
| 265 |
+
self.results[run_tag] = results
|
| 266 |
+
|
| 267 |
+
# Evaluate
|
| 268 |
+
metrics = self.evaluate_run(run_tag, results)
|
| 269 |
+
self.results[run_tag]["metrics"] = metrics
|
| 270 |
+
|
| 271 |
+
# Generate report
|
| 272 |
+
self.generate_report()
|
| 273 |
+
|
| 274 |
+
return self.results
|
| 275 |
+
|
| 276 |
+
def generate_report(self):
|
| 277 |
+
"""Generate summary report."""
|
| 278 |
+
print("\n" + "=" * 60)
|
| 279 |
+
print("Benchmark Results Summary")
|
| 280 |
+
print("=" * 60)
|
| 281 |
+
|
| 282 |
+
# Table header
|
| 283 |
+
header = ["Configuration", "Query", "MAP", "NDCG", "P@10", "MRR", "Time(s)"]
|
| 284 |
+
print("\n" + " | ".join(f"{h:^12}" for h in header))
|
| 285 |
+
print("-" * 100)
|
| 286 |
+
|
| 287 |
+
# Table rows
|
| 288 |
+
for run_tag, data in self.results.items():
|
| 289 |
+
metrics = data.get("metrics", {})
|
| 290 |
+
|
| 291 |
+
row = [
|
| 292 |
+
data["config"]["name"][:12],
|
| 293 |
+
data["query_type"][:5],
|
| 294 |
+
f"{metrics.get('map', 0):.4f}",
|
| 295 |
+
f"{metrics.get('ndcg', 0):.4f}",
|
| 296 |
+
f"{metrics.get('P_10', 0):.4f}",
|
| 297 |
+
f"{metrics.get('recip_rank', 0):.4f}",
|
| 298 |
+
f"{data.get('elapsed_time', 0):.2f}"
|
| 299 |
+
]
|
| 300 |
+
print(" | ".join(f"{v:^12}" for v in row))
|
| 301 |
+
|
| 302 |
+
# Save detailed results
|
| 303 |
+
results_file = self.output_dir / "benchmark_results.json"
|
| 304 |
+
|
| 305 |
+
# Make results JSON serializable
|
| 306 |
+
serializable_results = {}
|
| 307 |
+
for run_tag, data in self.results.items():
|
| 308 |
+
serializable_results[run_tag] = {
|
| 309 |
+
"config": data["config"],
|
| 310 |
+
"query_type": data["query_type"],
|
| 311 |
+
"metrics": data.get("metrics", {}),
|
| 312 |
+
"elapsed_time": data.get("elapsed_time", 0),
|
| 313 |
+
"num_results": len(data.get("results", []))
|
| 314 |
+
}
|
| 315 |
+
|
| 316 |
+
with open(results_file, 'w') as f:
|
| 317 |
+
json.dump(serializable_results, f, indent=2)
|
| 318 |
+
|
| 319 |
+
print(f"\nDetailed results saved to: {results_file}")
|
| 320 |
+
|
| 321 |
+
# Generate LaTeX table
|
| 322 |
+
self._generate_latex_table()
|
| 323 |
+
|
| 324 |
+
def _generate_latex_table(self):
|
| 325 |
+
"""Generate LaTeX table for paper."""
|
| 326 |
+
latex_file = self.output_dir / "results_table.tex"
|
| 327 |
+
|
| 328 |
+
lines = [
|
| 329 |
+
r"\begin{table}[ht]",
|
| 330 |
+
r"\centering",
|
| 331 |
+
r"\caption{TREC AP88-90 Retrieval Results}",
|
| 332 |
+
r"\label{tab:trec-results}",
|
| 333 |
+
r"\begin{tabular}{l|l|cccc}",
|
| 334 |
+
r"\toprule",
|
| 335 |
+
r"Model & Query & MAP & NDCG & P@10 & MRR \\",
|
| 336 |
+
r"\midrule"
|
| 337 |
+
]
|
| 338 |
+
|
| 339 |
+
for run_tag, data in self.results.items():
|
| 340 |
+
metrics = data.get("metrics", {})
|
| 341 |
+
row = (
|
| 342 |
+
f"{data['config']['name']} & {data['query_type']} & "
|
| 343 |
+
f"{metrics.get('map', 0):.4f} & "
|
| 344 |
+
f"{metrics.get('ndcg', 0):.4f} & "
|
| 345 |
+
f"{metrics.get('P_10', 0):.4f} & "
|
| 346 |
+
f"{metrics.get('recip_rank', 0):.4f} \\\\"
|
| 347 |
+
)
|
| 348 |
+
lines.append(row)
|
| 349 |
+
|
| 350 |
+
lines.extend([
|
| 351 |
+
r"\bottomrule",
|
| 352 |
+
r"\end{tabular}",
|
| 353 |
+
r"\end{table}"
|
| 354 |
+
])
|
| 355 |
+
|
| 356 |
+
with open(latex_file, 'w') as f:
|
| 357 |
+
f.write("\n".join(lines))
|
| 358 |
+
|
| 359 |
+
print(f"LaTeX table saved to: {latex_file}")
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
def main():
|
| 363 |
+
"""Main entry point."""
|
| 364 |
+
parser = argparse.ArgumentParser(
|
| 365 |
+
description="Run TREC benchmark for SysCRED"
|
| 366 |
+
)
|
| 367 |
+
parser.add_argument(
|
| 368 |
+
"--index", "-i",
|
| 369 |
+
help="Path to Lucene index"
|
| 370 |
+
)
|
| 371 |
+
parser.add_argument(
|
| 372 |
+
"--corpus", "-c",
|
| 373 |
+
help="Path to JSONL corpus"
|
| 374 |
+
)
|
| 375 |
+
parser.add_argument(
|
| 376 |
+
"--topics", "-t",
|
| 377 |
+
help="Path to TREC topics file/directory"
|
| 378 |
+
)
|
| 379 |
+
parser.add_argument(
|
| 380 |
+
"--qrels", "-q",
|
| 381 |
+
help="Path to TREC qrels file/directory"
|
| 382 |
+
)
|
| 383 |
+
parser.add_argument(
|
| 384 |
+
"--output", "-o",
|
| 385 |
+
default="benchmark_results",
|
| 386 |
+
help="Output directory for results"
|
| 387 |
+
)
|
| 388 |
+
parser.add_argument(
|
| 389 |
+
"--k",
|
| 390 |
+
type=int,
|
| 391 |
+
default=100,
|
| 392 |
+
help="Number of results per query"
|
| 393 |
+
)
|
| 394 |
+
|
| 395 |
+
args = parser.parse_args()
|
| 396 |
+
|
| 397 |
+
# Run benchmark
|
| 398 |
+
benchmark = TRECBenchmark(
|
| 399 |
+
index_path=args.index,
|
| 400 |
+
corpus_path=args.corpus,
|
| 401 |
+
topics_path=args.topics,
|
| 402 |
+
qrels_path=args.qrels,
|
| 403 |
+
output_dir=args.output
|
| 404 |
+
)
|
| 405 |
+
|
| 406 |
+
results = benchmark.run_full_benchmark(k=args.k)
|
| 407 |
+
|
| 408 |
+
print("\n" + "=" * 60)
|
| 409 |
+
print("Benchmark Complete!")
|
| 410 |
+
print("=" * 60)
|
| 411 |
+
|
| 412 |
+
|
| 413 |
+
if __name__ == "__main__":
|
| 414 |
+
main()
|
syscred/save_to_notes.sh
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
# ============================================
|
| 3 |
+
# save_to_notes.sh
|
| 4 |
+
# Script pour sauvegarder la documentation vers Obsidian et Notion
|
| 5 |
+
#
|
| 6 |
+
# Usage: ./save_to_notes.sh [chemin_fichier_optionnel]
|
| 7 |
+
#
|
| 8 |
+
# Par défaut: Sauvegarde SysCRED_Documentation.md
|
| 9 |
+
# ============================================
|
| 10 |
+
|
| 11 |
+
# Configuration - MODIFIEZ CES CHEMINS SELON VOTRE SETUP
|
| 12 |
+
OBSIDIAN_VAULT="${OBSIDIAN_VAULT:-/Users/bk280625/documents041025/Obsidian_UQAM25_bk051225}"
|
| 13 |
+
NOTION_CLIPBOARD=true # true = copie dans le presse-papiers pour Notion
|
| 14 |
+
|
| 15 |
+
# Couleurs pour output
|
| 16 |
+
GREEN='\033[0;32m'
|
| 17 |
+
BLUE='\033[0;34m'
|
| 18 |
+
YELLOW='\033[1;33m'
|
| 19 |
+
NC='\033[0m' # No Color
|
| 20 |
+
|
| 21 |
+
# Date pour le versioning
|
| 22 |
+
DATE=$(date +%Y%m%d)
|
| 23 |
+
DATETIME=$(date +"%Y-%m-%d %H:%M")
|
| 24 |
+
|
| 25 |
+
# Fichier source (argument ou défaut)
|
| 26 |
+
if [ -n "$1" ]; then
|
| 27 |
+
DOC_SOURCE="$1"
|
| 28 |
+
else
|
| 29 |
+
DOC_SOURCE="/Users/bk280625/documents041025/MonCode/syscred/SysCRED_Documentation.md"
|
| 30 |
+
fi
|
| 31 |
+
|
| 32 |
+
# Vérifier que le fichier existe
|
| 33 |
+
if [ ! -f "$DOC_SOURCE" ]; then
|
| 34 |
+
echo -e "${YELLOW}⚠️ Fichier non trouvé: $DOC_SOURCE${NC}"
|
| 35 |
+
exit 1
|
| 36 |
+
fi
|
| 37 |
+
|
| 38 |
+
# Nom du fichier sans chemin
|
| 39 |
+
FILENAME=$(basename "$DOC_SOURCE" .md)
|
| 40 |
+
|
| 41 |
+
echo -e "${BLUE}📝 Sauvegarde de: $DOC_SOURCE${NC}"
|
| 42 |
+
echo " Date: $DATETIME"
|
| 43 |
+
echo ""
|
| 44 |
+
|
| 45 |
+
# ============================================
|
| 46 |
+
# 1. OBSIDIAN
|
| 47 |
+
# ============================================
|
| 48 |
+
echo -e "${BLUE}📚 OBSIDIAN${NC}"
|
| 49 |
+
|
| 50 |
+
# Créer le dossier Obsidian s'il n'existe pas
|
| 51 |
+
if [ ! -d "$OBSIDIAN_VAULT" ]; then
|
| 52 |
+
echo " ⚠️ Vault Obsidian non trouvé: $OBSIDIAN_VAULT"
|
| 53 |
+
echo " Création du dossier..."
|
| 54 |
+
mkdir -p "$OBSIDIAN_VAULT"
|
| 55 |
+
fi
|
| 56 |
+
|
| 57 |
+
# Copier le fichier avec date
|
| 58 |
+
OBSIDIAN_FILE="$OBSIDIAN_VAULT/${FILENAME}.md"
|
| 59 |
+
cp "$DOC_SOURCE" "$OBSIDIAN_FILE"
|
| 60 |
+
|
| 61 |
+
if [ -f "$OBSIDIAN_FILE" ]; then
|
| 62 |
+
echo -e " ${GREEN}✅ Copié: $OBSIDIAN_FILE${NC}"
|
| 63 |
+
|
| 64 |
+
# Ouvrir dans Obsidian (Mac uniquement)
|
| 65 |
+
if [[ "$OSTYPE" == "darwin"* ]]; then
|
| 66 |
+
# Encoder le nom de fichier pour l'URL
|
| 67 |
+
ENCODED_FILE=$(echo "$FILENAME" | sed 's/ /%20/g')
|
| 68 |
+
VAULT_NAME=$(basename "$OBSIDIAN_VAULT")
|
| 69 |
+
|
| 70 |
+
# Ouvrir Obsidian avec le fichier
|
| 71 |
+
open "obsidian://open?vault=$VAULT_NAME&file=$ENCODED_FILE" 2>/dev/null
|
| 72 |
+
echo " 📖 Ouvert dans Obsidian"
|
| 73 |
+
fi
|
| 74 |
+
else
|
| 75 |
+
echo " ❌ Échec de copie"
|
| 76 |
+
fi
|
| 77 |
+
|
| 78 |
+
echo ""
|
| 79 |
+
|
| 80 |
+
# ============================================
|
| 81 |
+
# 2. NOTION (via presse-papiers)
|
| 82 |
+
# ============================================
|
| 83 |
+
echo -e "${BLUE}📋 NOTION${NC}"
|
| 84 |
+
|
| 85 |
+
if [ "$NOTION_CLIPBOARD" = true ]; then
|
| 86 |
+
# Copier le contenu dans le presse-papiers
|
| 87 |
+
if [[ "$OSTYPE" == "darwin"* ]]; then
|
| 88 |
+
# macOS
|
| 89 |
+
cat "$DOC_SOURCE" | pbcopy
|
| 90 |
+
echo -e " ${GREEN}✅ Contenu copié dans le presse-papiers${NC}"
|
| 91 |
+
echo " 📝 Pour coller dans Notion:"
|
| 92 |
+
echo " 1. Ouvrez Notion"
|
| 93 |
+
echo " 2. Créez une nouvelle page"
|
| 94 |
+
echo " 3. Cmd+V pour coller"
|
| 95 |
+
elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
|
| 96 |
+
# Linux avec xclip
|
| 97 |
+
if command -v xclip &> /dev/null; then
|
| 98 |
+
cat "$DOC_SOURCE" | xclip -selection clipboard
|
| 99 |
+
echo -e " ${GREEN}✅ Contenu copié dans le presse-papiers${NC}"
|
| 100 |
+
else
|
| 101 |
+
echo " ⚠️ xclip non installé (sudo apt install xclip)"
|
| 102 |
+
fi
|
| 103 |
+
fi
|
| 104 |
+
fi
|
| 105 |
+
|
| 106 |
+
echo ""
|
| 107 |
+
|
| 108 |
+
# ============================================
|
| 109 |
+
# 3. RÉSUMÉ
|
| 110 |
+
# ============================================
|
| 111 |
+
echo -e "${GREEN}================================${NC}"
|
| 112 |
+
echo -e "${GREEN}✨ Sauvegarde terminée!${NC}"
|
| 113 |
+
echo -e "${GREEN}================================${NC}"
|
| 114 |
+
echo ""
|
| 115 |
+
echo "Fichiers:"
|
| 116 |
+
echo " • Original: $DOC_SOURCE"
|
| 117 |
+
echo " • Obsidian: $OBSIDIAN_FILE"
|
| 118 |
+
echo " • Notion: 📋 (presse-papiers)"
|
| 119 |
+
echo ""
|
| 120 |
+
echo "Taille: $(wc -c < "$DOC_SOURCE" | tr -d ' ') octets"
|
| 121 |
+
echo "Lignes: $(wc -l < "$DOC_SOURCE" | tr -d ' ')"
|
syscred/seo_analyzer.py
ADDED
|
@@ -0,0 +1,610 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
SEO Analyzer Module - SysCRED
|
| 4 |
+
==============================
|
| 5 |
+
Provides SEO analysis and Information Retrieval metrics for credibility assessment.
|
| 6 |
+
|
| 7 |
+
Implements:
|
| 8 |
+
- TF-IDF calculation
|
| 9 |
+
- BM25 scoring
|
| 10 |
+
- PageRank estimation/explanation
|
| 11 |
+
- SEO meta tag analysis
|
| 12 |
+
- Backlink quality assessment
|
| 13 |
+
|
| 14 |
+
(c) Dominique S. Loyer - PhD Thesis Prototype
|
| 15 |
+
Citation Key: loyerModelingHybridSystem2025
|
| 16 |
+
|
| 17 |
+
Note sur la scalabilité:
|
| 18 |
+
- Pour des corpus de grande taille, envisager Cython ou Rust pour TF-IDF/BM25
|
| 19 |
+
- Les calculs matriciels peuvent bénéficier de NumPy optimisé ou de bibliothèques C
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
import math
|
| 23 |
+
import re
|
| 24 |
+
from typing import List, Dict, Tuple, Optional, Any
|
| 25 |
+
from dataclasses import dataclass
|
| 26 |
+
from collections import Counter
|
| 27 |
+
from urllib.parse import urlparse
|
| 28 |
+
|
| 29 |
+
try:
|
| 30 |
+
import numpy as np
|
| 31 |
+
HAS_NUMPY = True
|
| 32 |
+
except ImportError:
|
| 33 |
+
HAS_NUMPY = False
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# --- Data Classes ---
|
| 37 |
+
|
| 38 |
+
@dataclass
|
| 39 |
+
class SEOAnalysis:
|
| 40 |
+
"""Results of SEO analysis for a webpage."""
|
| 41 |
+
url: str
|
| 42 |
+
title_length: int
|
| 43 |
+
title_has_keywords: bool
|
| 44 |
+
meta_description_length: int
|
| 45 |
+
has_meta_keywords: bool
|
| 46 |
+
heading_structure: Dict[str, int] # h1, h2, h3 counts
|
| 47 |
+
word_count: int
|
| 48 |
+
keyword_density: Dict[str, float]
|
| 49 |
+
readability_score: float
|
| 50 |
+
seo_score: float # Overall 0-1 score
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@dataclass
|
| 54 |
+
class PageRankExplanation:
|
| 55 |
+
"""Explainable PageRank estimation."""
|
| 56 |
+
url: str
|
| 57 |
+
estimated_pr: float
|
| 58 |
+
factors: List[Dict[str, Any]]
|
| 59 |
+
explanation_text: str
|
| 60 |
+
confidence: float
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
@dataclass
|
| 64 |
+
class IRMetrics:
|
| 65 |
+
"""Information Retrieval metrics for a document."""
|
| 66 |
+
tf_idf_scores: Dict[str, float]
|
| 67 |
+
bm25_score: float
|
| 68 |
+
top_terms: List[Tuple[str, float]]
|
| 69 |
+
document_length: int
|
| 70 |
+
avg_term_frequency: float
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class SEOAnalyzer:
|
| 74 |
+
"""
|
| 75 |
+
Analyze SEO factors and compute IR metrics for credibility assessment.
|
| 76 |
+
|
| 77 |
+
This module helps explain WHY a URL might rank well (or poorly) in search engines,
|
| 78 |
+
which is a factor in its credibility assessment.
|
| 79 |
+
"""
|
| 80 |
+
|
| 81 |
+
# BM25 parameters (classic values)
|
| 82 |
+
BM25_K1 = 1.5 # Term frequency saturation
|
| 83 |
+
BM25_B = 0.75 # Length normalization
|
| 84 |
+
|
| 85 |
+
# Stopwords (expandable)
|
| 86 |
+
STOPWORDS = {
|
| 87 |
+
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
|
| 88 |
+
'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
|
| 89 |
+
'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
|
| 90 |
+
'could', 'should', 'may', 'might', 'must', 'shall', 'can', 'need',
|
| 91 |
+
'this', 'that', 'these', 'those', 'it', 'its', 'they', 'them',
|
| 92 |
+
'he', 'she', 'him', 'her', 'his', 'my', 'your', 'our', 'their',
|
| 93 |
+
'what', 'which', 'who', 'whom', 'when', 'where', 'why', 'how',
|
| 94 |
+
'all', 'each', 'every', 'both', 'few', 'more', 'most', 'other',
|
| 95 |
+
'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
|
| 96 |
+
'than', 'too', 'very', 'just', 'also', 'now', 'here', 'there',
|
| 97 |
+
# French stopwords
|
| 98 |
+
'le', 'la', 'les', 'un', 'une', 'des', 'du', 'de', 'et', 'ou',
|
| 99 |
+
'mais', 'donc', 'car', 'ni', 'que', 'qui', 'quoi', 'dont', 'où',
|
| 100 |
+
'ce', 'cette', 'ces', 'mon', 'ma', 'mes', 'ton', 'ta', 'tes',
|
| 101 |
+
'son', 'sa', 'ses', 'notre', 'nos', 'votre', 'vos', 'leur', 'leurs',
|
| 102 |
+
'je', 'tu', 'il', 'elle', 'nous', 'vous', 'ils', 'elles', 'on',
|
| 103 |
+
'est', 'sont', 'être', 'avoir', 'fait', 'faire', 'dit', 'dire',
|
| 104 |
+
'plus', 'moins', 'très', 'bien', 'tout', 'tous', 'toute', 'toutes',
|
| 105 |
+
'pour', 'par', 'sur', 'sous', 'avec', 'sans', 'dans', 'en', 'au', 'aux'
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
def __init__(self):
|
| 109 |
+
"""Initialize the SEO analyzer."""
|
| 110 |
+
# Reference corpus statistics (can be updated with real data)
|
| 111 |
+
self.avg_doc_length = 500 # Average document length in words
|
| 112 |
+
self.corpus_size = 1000 # Number of documents in reference corpus
|
| 113 |
+
# IDF values for common terms (placeholder - would be computed from real corpus)
|
| 114 |
+
self.idf_cache = {}
|
| 115 |
+
|
| 116 |
+
def tokenize(self, text: str, remove_stopwords: bool = True) -> List[str]:
|
| 117 |
+
"""
|
| 118 |
+
Tokenize text into words.
|
| 119 |
+
|
| 120 |
+
Args:
|
| 121 |
+
text: Input text
|
| 122 |
+
remove_stopwords: Whether to remove stopwords
|
| 123 |
+
|
| 124 |
+
Returns:
|
| 125 |
+
List of tokens
|
| 126 |
+
"""
|
| 127 |
+
if not text:
|
| 128 |
+
return []
|
| 129 |
+
|
| 130 |
+
# Lowercase and extract words
|
| 131 |
+
text = text.lower()
|
| 132 |
+
tokens = re.findall(r'\b[a-zA-ZÀ-ÿ]{2,}\b', text)
|
| 133 |
+
|
| 134 |
+
if remove_stopwords:
|
| 135 |
+
tokens = [t for t in tokens if t not in self.STOPWORDS]
|
| 136 |
+
|
| 137 |
+
return tokens
|
| 138 |
+
|
| 139 |
+
def calculate_tf(self, tokens: List[str]) -> Dict[str, float]:
|
| 140 |
+
"""
|
| 141 |
+
Calculate Term Frequency for each token.
|
| 142 |
+
|
| 143 |
+
TF(t) = (count of t in document) / (total terms in document)
|
| 144 |
+
"""
|
| 145 |
+
if not tokens:
|
| 146 |
+
return {}
|
| 147 |
+
|
| 148 |
+
term_counts = Counter(tokens)
|
| 149 |
+
total_terms = len(tokens)
|
| 150 |
+
|
| 151 |
+
return {term: count / total_terms for term, count in term_counts.items()}
|
| 152 |
+
|
| 153 |
+
def calculate_idf(self, term: str, doc_frequency: int = None) -> float:
|
| 154 |
+
"""
|
| 155 |
+
Calculate Inverse Document Frequency.
|
| 156 |
+
|
| 157 |
+
IDF(t) = log(N / (1 + df(t)))
|
| 158 |
+
|
| 159 |
+
Args:
|
| 160 |
+
term: The term to calculate IDF for
|
| 161 |
+
doc_frequency: Number of documents containing the term
|
| 162 |
+
(if None, use heuristic based on term length)
|
| 163 |
+
"""
|
| 164 |
+
if term in self.idf_cache:
|
| 165 |
+
return self.idf_cache[term]
|
| 166 |
+
|
| 167 |
+
if doc_frequency is None:
|
| 168 |
+
# Heuristic: shorter common words appear in more documents
|
| 169 |
+
if len(term) <= 3:
|
| 170 |
+
doc_frequency = self.corpus_size * 0.5
|
| 171 |
+
elif len(term) <= 5:
|
| 172 |
+
doc_frequency = self.corpus_size * 0.3
|
| 173 |
+
elif len(term) <= 8:
|
| 174 |
+
doc_frequency = self.corpus_size * 0.1
|
| 175 |
+
else:
|
| 176 |
+
doc_frequency = self.corpus_size * 0.05
|
| 177 |
+
|
| 178 |
+
idf = math.log(self.corpus_size / (1 + doc_frequency))
|
| 179 |
+
self.idf_cache[term] = idf
|
| 180 |
+
return idf
|
| 181 |
+
|
| 182 |
+
def calculate_tf_idf(self, text: str) -> Dict[str, float]:
|
| 183 |
+
"""
|
| 184 |
+
Calculate TF-IDF scores for all terms in a document.
|
| 185 |
+
|
| 186 |
+
TF-IDF(t,d) = TF(t,d) × IDF(t)
|
| 187 |
+
|
| 188 |
+
Args:
|
| 189 |
+
text: Document text
|
| 190 |
+
|
| 191 |
+
Returns:
|
| 192 |
+
Dictionary of term -> TF-IDF score
|
| 193 |
+
"""
|
| 194 |
+
tokens = self.tokenize(text)
|
| 195 |
+
tf_scores = self.calculate_tf(tokens)
|
| 196 |
+
|
| 197 |
+
tf_idf = {}
|
| 198 |
+
for term, tf in tf_scores.items():
|
| 199 |
+
idf = self.calculate_idf(term)
|
| 200 |
+
tf_idf[term] = tf * idf
|
| 201 |
+
|
| 202 |
+
return tf_idf
|
| 203 |
+
|
| 204 |
+
def calculate_bm25(
|
| 205 |
+
self,
|
| 206 |
+
query: str,
|
| 207 |
+
document: str,
|
| 208 |
+
k1: float = None,
|
| 209 |
+
b: float = None
|
| 210 |
+
) -> float:
|
| 211 |
+
"""
|
| 212 |
+
Calculate BM25 relevance score between query and document.
|
| 213 |
+
|
| 214 |
+
BM25(D, Q) = Σ IDF(qi) × (f(qi,D) × (k1 + 1)) / (f(qi,D) + k1 × (1 - b + b × |D|/avgdl))
|
| 215 |
+
|
| 216 |
+
Args:
|
| 217 |
+
query: Query string
|
| 218 |
+
document: Document text
|
| 219 |
+
k1: Term frequency saturation parameter
|
| 220 |
+
b: Length normalization parameter
|
| 221 |
+
|
| 222 |
+
Returns:
|
| 223 |
+
BM25 score
|
| 224 |
+
"""
|
| 225 |
+
k1 = k1 or self.BM25_K1
|
| 226 |
+
b = b or self.BM25_B
|
| 227 |
+
|
| 228 |
+
query_tokens = self.tokenize(query)
|
| 229 |
+
doc_tokens = self.tokenize(document, remove_stopwords=False)
|
| 230 |
+
|
| 231 |
+
if not query_tokens or not doc_tokens:
|
| 232 |
+
return 0.0
|
| 233 |
+
|
| 234 |
+
doc_length = len(doc_tokens)
|
| 235 |
+
doc_term_counts = Counter(doc_tokens)
|
| 236 |
+
|
| 237 |
+
score = 0.0
|
| 238 |
+
for term in query_tokens:
|
| 239 |
+
if term not in doc_term_counts:
|
| 240 |
+
continue
|
| 241 |
+
|
| 242 |
+
tf = doc_term_counts[term]
|
| 243 |
+
idf = self.calculate_idf(term)
|
| 244 |
+
|
| 245 |
+
numerator = tf * (k1 + 1)
|
| 246 |
+
denominator = tf + k1 * (1 - b + b * doc_length / self.avg_doc_length)
|
| 247 |
+
|
| 248 |
+
score += idf * (numerator / denominator)
|
| 249 |
+
|
| 250 |
+
return score
|
| 251 |
+
|
| 252 |
+
def analyze_seo(
|
| 253 |
+
self,
|
| 254 |
+
url: str,
|
| 255 |
+
title: Optional[str],
|
| 256 |
+
meta_description: Optional[str],
|
| 257 |
+
text_content: str,
|
| 258 |
+
headings: Dict[str, List[str]] = None
|
| 259 |
+
) -> SEOAnalysis:
|
| 260 |
+
"""
|
| 261 |
+
Perform comprehensive SEO analysis.
|
| 262 |
+
|
| 263 |
+
Args:
|
| 264 |
+
url: Page URL
|
| 265 |
+
title: Page title
|
| 266 |
+
meta_description: Meta description
|
| 267 |
+
text_content: Main text content
|
| 268 |
+
headings: Dictionary of heading levels (h1, h2, etc.) and their texts
|
| 269 |
+
|
| 270 |
+
Returns:
|
| 271 |
+
SEOAnalysis with all metrics
|
| 272 |
+
"""
|
| 273 |
+
tokens = self.tokenize(text_content)
|
| 274 |
+
word_count = len(tokens)
|
| 275 |
+
|
| 276 |
+
# Title analysis
|
| 277 |
+
title_length = len(title) if title else 0
|
| 278 |
+
title_tokens = self.tokenize(title) if title else []
|
| 279 |
+
|
| 280 |
+
# Check if title contains main keywords from content
|
| 281 |
+
content_top_terms = Counter(tokens).most_common(10)
|
| 282 |
+
title_has_keywords = any(
|
| 283 |
+
term in title_tokens
|
| 284 |
+
for term, _ in content_top_terms[:5]
|
| 285 |
+
) if title_tokens else False
|
| 286 |
+
|
| 287 |
+
# Meta description analysis
|
| 288 |
+
meta_length = len(meta_description) if meta_description else 0
|
| 289 |
+
|
| 290 |
+
# Heading structure
|
| 291 |
+
headings = headings or {}
|
| 292 |
+
heading_structure = {
|
| 293 |
+
'h1': len(headings.get('h1', [])),
|
| 294 |
+
'h2': len(headings.get('h2', [])),
|
| 295 |
+
'h3': len(headings.get('h3', []))
|
| 296 |
+
}
|
| 297 |
+
|
| 298 |
+
# Keyword density (top 5 terms)
|
| 299 |
+
keyword_density = {}
|
| 300 |
+
for term, count in Counter(tokens).most_common(5):
|
| 301 |
+
keyword_density[term] = count / word_count if word_count > 0 else 0
|
| 302 |
+
|
| 303 |
+
# Readability score (simple metric based on average word/sentence length)
|
| 304 |
+
sentences = re.split(r'[.!?]+', text_content)
|
| 305 |
+
avg_sentence_length = word_count / len(sentences) if sentences else 0
|
| 306 |
+
|
| 307 |
+
# Convert to readability score (0-1, where 1 is optimal ~15-20 words/sentence)
|
| 308 |
+
if 15 <= avg_sentence_length <= 20:
|
| 309 |
+
readability_score = 1.0
|
| 310 |
+
elif 10 <= avg_sentence_length <= 25:
|
| 311 |
+
readability_score = 0.8
|
| 312 |
+
elif 5 <= avg_sentence_length <= 30:
|
| 313 |
+
readability_score = 0.6
|
| 314 |
+
else:
|
| 315 |
+
readability_score = 0.4
|
| 316 |
+
|
| 317 |
+
# Overall SEO score
|
| 318 |
+
seo_factors = []
|
| 319 |
+
|
| 320 |
+
# Title score (optimal: 50-60 chars)
|
| 321 |
+
if 50 <= title_length <= 60:
|
| 322 |
+
seo_factors.append(1.0)
|
| 323 |
+
elif 30 <= title_length <= 70:
|
| 324 |
+
seo_factors.append(0.7)
|
| 325 |
+
else:
|
| 326 |
+
seo_factors.append(0.3)
|
| 327 |
+
|
| 328 |
+
# Meta description (optimal: 150-160 chars)
|
| 329 |
+
if 150 <= meta_length <= 160:
|
| 330 |
+
seo_factors.append(1.0)
|
| 331 |
+
elif 100 <= meta_length <= 200:
|
| 332 |
+
seo_factors.append(0.7)
|
| 333 |
+
else:
|
| 334 |
+
seo_factors.append(0.3)
|
| 335 |
+
|
| 336 |
+
# Has exactly one H1
|
| 337 |
+
seo_factors.append(1.0 if heading_structure['h1'] == 1 else 0.5)
|
| 338 |
+
|
| 339 |
+
# Content length (optimal: 300+ words)
|
| 340 |
+
if word_count >= 1000:
|
| 341 |
+
seo_factors.append(1.0)
|
| 342 |
+
elif word_count >= 500:
|
| 343 |
+
seo_factors.append(0.8)
|
| 344 |
+
elif word_count >= 300:
|
| 345 |
+
seo_factors.append(0.6)
|
| 346 |
+
else:
|
| 347 |
+
seo_factors.append(0.3)
|
| 348 |
+
|
| 349 |
+
seo_score = sum(seo_factors) / len(seo_factors) if seo_factors else 0.5
|
| 350 |
+
|
| 351 |
+
return SEOAnalysis(
|
| 352 |
+
url=url,
|
| 353 |
+
title_length=title_length,
|
| 354 |
+
title_has_keywords=title_has_keywords,
|
| 355 |
+
meta_description_length=meta_length,
|
| 356 |
+
has_meta_keywords=bool(keyword_density),
|
| 357 |
+
heading_structure=heading_structure,
|
| 358 |
+
word_count=word_count,
|
| 359 |
+
keyword_density=keyword_density,
|
| 360 |
+
readability_score=readability_score,
|
| 361 |
+
seo_score=seo_score
|
| 362 |
+
)
|
| 363 |
+
|
| 364 |
+
def estimate_pagerank(
|
| 365 |
+
self,
|
| 366 |
+
url: str,
|
| 367 |
+
backlinks: List[Dict[str, Any]] = None,
|
| 368 |
+
domain_age_days: int = None,
|
| 369 |
+
source_reputation: str = None
|
| 370 |
+
) -> PageRankExplanation:
|
| 371 |
+
"""
|
| 372 |
+
Estimate and explain PageRank-like score.
|
| 373 |
+
|
| 374 |
+
This is NOT the actual Google PageRank, but an explainable approximation
|
| 375 |
+
based on available factors that contribute to search ranking.
|
| 376 |
+
|
| 377 |
+
PageRank Formula (simplified):
|
| 378 |
+
PR(A) = (1-d) + d × Σ (PR(Ti) / C(Ti))
|
| 379 |
+
|
| 380 |
+
Where:
|
| 381 |
+
- d = damping factor (0.85)
|
| 382 |
+
- Ti = pages pointing to A
|
| 383 |
+
- C(Ti) = number of outgoing links from Ti
|
| 384 |
+
|
| 385 |
+
Args:
|
| 386 |
+
url: Target URL
|
| 387 |
+
backlinks: List of backlink information
|
| 388 |
+
domain_age_days: Age of the domain in days
|
| 389 |
+
source_reputation: Known reputation level
|
| 390 |
+
|
| 391 |
+
Returns:
|
| 392 |
+
PageRankExplanation with estimated score and factors
|
| 393 |
+
"""
|
| 394 |
+
d = 0.85 # Damping factor
|
| 395 |
+
base_pr = (1 - d) # Starting PageRank
|
| 396 |
+
|
| 397 |
+
factors = []
|
| 398 |
+
pr_contributions = []
|
| 399 |
+
|
| 400 |
+
# Factor 1: Domain Age
|
| 401 |
+
if domain_age_days is not None:
|
| 402 |
+
if domain_age_days > 365 * 5: # > 5 years
|
| 403 |
+
age_contribution = 0.3
|
| 404 |
+
age_description = "Domaine ancien (5+ ans) - forte confiance"
|
| 405 |
+
elif domain_age_days > 365 * 2: # > 2 years
|
| 406 |
+
age_contribution = 0.2
|
| 407 |
+
age_description = "Domaine établi (2-5 ans) - bonne confiance"
|
| 408 |
+
elif domain_age_days > 365: # > 1 year
|
| 409 |
+
age_contribution = 0.1
|
| 410 |
+
age_description = "Domaine récent (1-2 ans) - confiance modérée"
|
| 411 |
+
else:
|
| 412 |
+
age_contribution = 0.0
|
| 413 |
+
age_description = "Domaine très récent (<1 an) - confiance faible"
|
| 414 |
+
|
| 415 |
+
factors.append({
|
| 416 |
+
'name': 'Domain Age',
|
| 417 |
+
'value': f"{domain_age_days} days ({domain_age_days/365:.1f} years)",
|
| 418 |
+
'contribution': age_contribution,
|
| 419 |
+
'description': age_description
|
| 420 |
+
})
|
| 421 |
+
pr_contributions.append(age_contribution)
|
| 422 |
+
|
| 423 |
+
# Factor 2: Source Reputation
|
| 424 |
+
if source_reputation:
|
| 425 |
+
if source_reputation == 'High':
|
| 426 |
+
rep_contribution = 0.3
|
| 427 |
+
rep_description = "Source réputée - équivalent à beaucoup de backlinks de qualité"
|
| 428 |
+
elif source_reputation == 'Medium':
|
| 429 |
+
rep_contribution = 0.15
|
| 430 |
+
rep_description = "Source connue - équivalent à quelques backlinks de qualité"
|
| 431 |
+
else:
|
| 432 |
+
rep_contribution = 0.0
|
| 433 |
+
rep_description = "Source inconnue ou peu fiable - pas de boost de réputation"
|
| 434 |
+
|
| 435 |
+
factors.append({
|
| 436 |
+
'name': 'Source Reputation',
|
| 437 |
+
'value': source_reputation,
|
| 438 |
+
'contribution': rep_contribution,
|
| 439 |
+
'description': rep_description
|
| 440 |
+
})
|
| 441 |
+
pr_contributions.append(rep_contribution)
|
| 442 |
+
|
| 443 |
+
# Factor 3: Backlinks (if available)
|
| 444 |
+
backlinks = backlinks or []
|
| 445 |
+
if backlinks:
|
| 446 |
+
# Estimate backlink contribution
|
| 447 |
+
high_quality_count = sum(1 for bl in backlinks if bl.get('quality', 'low') == 'high')
|
| 448 |
+
medium_quality_count = sum(1 for bl in backlinks if bl.get('quality', 'low') == 'medium')
|
| 449 |
+
|
| 450 |
+
# Each high-quality backlink contributes more
|
| 451 |
+
backlink_contribution = min(0.3, high_quality_count * 0.05 + medium_quality_count * 0.02)
|
| 452 |
+
|
| 453 |
+
factors.append({
|
| 454 |
+
'name': 'Backlinks',
|
| 455 |
+
'value': f"{len(backlinks)} total ({high_quality_count} high quality)",
|
| 456 |
+
'contribution': backlink_contribution,
|
| 457 |
+
'description': f"Liens entrants détectés - contribution au classement"
|
| 458 |
+
})
|
| 459 |
+
pr_contributions.append(backlink_contribution)
|
| 460 |
+
|
| 461 |
+
# Factor 4: Domain type (TLD)
|
| 462 |
+
parsed = urlparse(url)
|
| 463 |
+
domain = parsed.netloc
|
| 464 |
+
|
| 465 |
+
if domain.endswith('.edu') or domain.endswith('.gov'):
|
| 466 |
+
tld_contribution = 0.2
|
| 467 |
+
tld_description = "Domaine .edu/.gov - haute autorité institutionnelle"
|
| 468 |
+
elif domain.endswith('.ac.uk') or domain.endswith('.gouv.fr'):
|
| 469 |
+
tld_contribution = 0.15
|
| 470 |
+
tld_description = "Domaine académique/gouvernemental - bonne autorité"
|
| 471 |
+
elif domain.endswith('.org'):
|
| 472 |
+
tld_contribution = 0.05
|
| 473 |
+
tld_description = "Domaine .org - légère autorité"
|
| 474 |
+
else:
|
| 475 |
+
tld_contribution = 0.0
|
| 476 |
+
tld_description = "Domaine commercial standard"
|
| 477 |
+
|
| 478 |
+
factors.append({
|
| 479 |
+
'name': 'Domain Type (TLD)',
|
| 480 |
+
'value': domain,
|
| 481 |
+
'contribution': tld_contribution,
|
| 482 |
+
'description': tld_description
|
| 483 |
+
})
|
| 484 |
+
pr_contributions.append(tld_contribution)
|
| 485 |
+
|
| 486 |
+
# Calculate final estimated PageRank
|
| 487 |
+
total_contribution = sum(pr_contributions)
|
| 488 |
+
estimated_pr = base_pr + d * total_contribution
|
| 489 |
+
estimated_pr = min(1.0, max(0.0, estimated_pr)) # Clamp to [0, 1]
|
| 490 |
+
|
| 491 |
+
# Generate explanation
|
| 492 |
+
explanation_parts = [
|
| 493 |
+
f"PageRank estimé: {estimated_pr:.3f}",
|
| 494 |
+
f"",
|
| 495 |
+
f"Formule: PR = (1-d) + d × Σ(contributions)",
|
| 496 |
+
f" PR = {base_pr:.2f} + {d:.2f} × {total_contribution:.2f}",
|
| 497 |
+
f"",
|
| 498 |
+
f"Facteurs contributifs:"
|
| 499 |
+
]
|
| 500 |
+
|
| 501 |
+
for factor in factors:
|
| 502 |
+
explanation_parts.append(
|
| 503 |
+
f" • {factor['name']}: +{factor['contribution']:.2f} - {factor['description']}"
|
| 504 |
+
)
|
| 505 |
+
|
| 506 |
+
# Confidence based on how many factors we have data for
|
| 507 |
+
confidence = min(1.0, len([f for f in factors if f['contribution'] > 0]) / 4)
|
| 508 |
+
|
| 509 |
+
return PageRankExplanation(
|
| 510 |
+
url=url,
|
| 511 |
+
estimated_pr=estimated_pr,
|
| 512 |
+
factors=factors,
|
| 513 |
+
explanation_text="\n".join(explanation_parts),
|
| 514 |
+
confidence=confidence
|
| 515 |
+
)
|
| 516 |
+
|
| 517 |
+
def get_ir_metrics(self, text: str, query: str = None) -> IRMetrics:
|
| 518 |
+
"""
|
| 519 |
+
        Get comprehensive IR metrics for a document.

        Args:
            text: Document text
            query: Optional query for BM25 calculation

        Returns:
            IRMetrics with TF-IDF, BM25, and other metrics
        """
        tokens = self.tokenize(text)
        tf_idf = self.calculate_tf_idf(text)

        # Top terms by TF-IDF
        top_terms = sorted(tf_idf.items(), key=lambda x: x[1], reverse=True)[:10]

        # BM25 score (if query provided)
        bm25_score = 0.0
        if query:
            bm25_score = self.calculate_bm25(query, text)

        # Average term frequency
        tf = self.calculate_tf(tokens)
        avg_tf = sum(tf.values()) / len(tf) if tf else 0

        return IRMetrics(
            tf_idf_scores=tf_idf,
            bm25_score=bm25_score,
            top_terms=top_terms,
            document_length=len(tokens),
            avg_term_frequency=avg_tf
        )


# --- Testing ---
if __name__ == "__main__":
    print("=" * 60)
    print("SysCRED SEO Analyzer - Tests")
    print("=" * 60 + "\n")

    analyzer = SEOAnalyzer()

    # Test 1: TF-IDF
    print("1. Testing TF-IDF calculation...")
    sample_text = """
    The credibility of online information is crucial in today's digital age.
    Fact-checking organizations help verify claims and identify misinformation.
    Source reputation and domain age are important credibility factors.
    """
    tf_idf = analyzer.calculate_tf_idf(sample_text)
    top_5 = sorted(tf_idf.items(), key=lambda x: x[1], reverse=True)[:5]
    print("   Top 5 TF-IDF terms:")
    for term, score in top_5:
        print(f"     {term}: {score:.4f}")
    print()

    # Test 2: BM25
    print("2. Testing BM25 scoring...")
    query = "credibility verification"
    bm25_score = analyzer.calculate_bm25(query, sample_text)
    print(f"   Query: '{query}'")
    print(f"   BM25 Score: {bm25_score:.4f}")
    print()

    # Test 3: SEO Analysis
    print("3. Testing SEO analysis...")
    seo = analyzer.analyze_seo(
        url="https://example.com/article",
        title="Understanding Online Credibility - A Complete Guide",
        meta_description="Learn about the key factors that determine the credibility of online information sources.",
        text_content=sample_text
    )
    print(f"   Title length: {seo.title_length} chars")
    print(f"   Meta description length: {seo.meta_description_length} chars")
    print(f"   Word count: {seo.word_count}")
    print(f"   SEO Score: {seo.seo_score:.2f}")
    print()

    # Test 4: PageRank Estimation
    print("4. Testing PageRank estimation...")
    pr = analyzer.estimate_pagerank(
        url="https://www.lemonde.fr/article",
        domain_age_days=9125,  # ~25 years
        source_reputation="High"
    )
    print(f"   Estimated PageRank: {pr.estimated_pr:.3f}")
    print(f"   Confidence: {pr.confidence:.2f}")
    print("\n   Explanation:")
    print("   " + pr.explanation_text.replace("\n", "\n   "))

    print("\n" + "=" * 60)
    print("Tests complete!")
    print("=" * 60)
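For quick reference, here is a minimal usage sketch of the get_ir_metrics() method documented above. The import path is an assumption based on the package layout in this commit; the field names (top_terms, bm25_score, document_length, avg_term_frequency) are those carried by the IRMetrics object returned above.

from syscred.seo_analyzer import SEOAnalyzer  # assumed module path for the analyzer shown above

analyzer = SEOAnalyzer()
metrics = analyzer.get_ir_metrics(
    text="Fact-checking organizations help verify claims and identify misinformation.",
    query="credibility verification",  # optional; without it bm25_score stays 0.0
)
print(metrics.top_terms[:3])                        # highest-weighted TF-IDF terms
print(round(metrics.bm25_score, 4))                 # BM25 relevance of the text to the query
print(metrics.document_length, metrics.avg_term_frequency)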
syscred/setup.py
ADDED
|
@@ -0,0 +1,65 @@
# -*- coding: utf-8 -*-
"""
SysCRED - Système de Vérification de Crédibilité
=================================================
PhD Thesis Prototype - Neuro-Symbolic Credibility Verification

(c) Dominique S. Loyer
Citation Key: loyerModelingHybridSystem2025
"""

from setuptools import setup, find_packages

setup(
    name="syscred",
    version="2.0.0",
    author="Dominique S. Loyer",
    author_email="loyer.dominique_sebastien@courrier.uqam.ca",
    description="Neuro-Symbolic Credibility Verification System",
    long_description=open("README.md").read() if __import__("os").path.exists("README.md") else "",
    long_description_content_type="text/markdown",
    url="https://github.com/DominiqueLoyer/syscred",
    packages=find_packages(),
    python_requires=">=3.9",
    install_requires=[
        "requests>=2.28.0",
        "beautifulsoup4>=4.11.0",
        "rdflib>=6.0.0",
        "nltk>=3.7",
    ],
    extras_require={
        "ml": [
            "torch>=2.0.0",
            "transformers>=4.30.0",
            "numpy>=1.23.0,<2.0",
        ],
        "ir": [
            "pyserini>=0.21.0",
            "pytrec_eval>=0.5",
        ],
        "web": [
            "flask>=2.0.0",
            "flask-cors>=3.0.0",
        ],
        "full": [
            "torch>=2.0.0",
            "transformers>=4.30.0",
            "numpy>=1.23.0,<2.0",
            "pyserini>=0.21.0",
            "pytrec_eval>=0.5",
            "flask>=2.0.0",
            "flask-cors>=3.0.0",
            "lime>=0.2.0",
        ],
    },
    classifiers=[
        "Development Status :: 4 - Beta",
        "Intended Audience :: Science/Research",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
    ],
    keywords="credibility verification nlp ontology information-retrieval",
)
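The extras_require groups above can be installed selectively from a source checkout, for example (standard pip syntax, shown here for convenience):

    pip install -e .              # core only (requests, beautifulsoup4, rdflib, nltk)
    pip install -e ".[ml]"        # adds PyTorch / Transformers
    pip install -e ".[full]"      # everything, including Flask, Pyserini and LIME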
syscred/static/index.html
ADDED
|
@@ -0,0 +1,850 @@
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="fr">
|
| 3 |
+
|
| 4 |
+
<head>
|
| 5 |
+
<meta charset="UTF-8">
|
| 6 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 7 |
+
<title>SysCRED - Vérification de Crédibilité</title>
|
| 8 |
+
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
|
| 9 |
+
<script src="/static/js/d3.min.js"></script>
|
| 10 |
+
<style>
|
| 11 |
+
.graph-container {
|
| 12 |
+
width: 100%;
|
| 13 |
+
height: 500px;
|
| 14 |
+
min-height: 500px;
|
| 15 |
+
background: rgba(0, 0, 0, 0.2);
|
| 16 |
+
border-radius: 12px;
|
| 17 |
+
border: 1px solid rgba(255, 255, 255, 0.1);
|
| 18 |
+
position: relative;
|
| 19 |
+
display: block;
|
| 20 |
+
/* Force display */
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
* {
|
| 24 |
+
margin: 0;
|
| 25 |
+
padding: 0;
|
| 26 |
+
box-sizing: border-box;
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
body {
|
| 30 |
+
font-family: 'Inter', sans-serif;
|
| 31 |
+
background: linear-gradient(135deg, #0f0f23 0%, #1a1a3e 50%, #0d0d1f 100%);
|
| 32 |
+
min-height: 100vh;
|
| 33 |
+
color: #e0e0e0;
|
| 34 |
+
padding: 2rem;
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
.container {
|
| 38 |
+
max-width: 900px;
|
| 39 |
+
margin: 0 auto;
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
header {
|
| 43 |
+
text-align: center;
|
| 44 |
+
margin-bottom: 3rem;
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
h1 {
|
| 48 |
+
font-size: 2.5rem;
|
| 49 |
+
font-weight: 700;
|
| 50 |
+
background: linear-gradient(135deg, #00d4ff, #7c3aed, #f472b6);
|
| 51 |
+
-webkit-background-clip: text;
|
| 52 |
+
-webkit-text-fill-color: transparent;
|
| 53 |
+
background-clip: text;
|
| 54 |
+
margin-bottom: 0.5rem;
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
.subtitle {
|
| 58 |
+
color: #8b8ba7;
|
| 59 |
+
font-size: 1.1rem;
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
.search-box {
|
| 63 |
+
background: rgba(255, 255, 255, 0.03);
|
| 64 |
+
backdrop-filter: blur(20px);
|
| 65 |
+
border: 1px solid rgba(255, 255, 255, 0.08);
|
| 66 |
+
border-radius: 20px;
|
| 67 |
+
padding: 2rem;
|
| 68 |
+
margin-bottom: 2rem;
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
.input-group {
|
| 72 |
+
display: flex;
|
| 73 |
+
gap: 1rem;
|
| 74 |
+
align-items: center;
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
input[type="text"] {
|
| 78 |
+
flex: 1;
|
| 79 |
+
padding: 1rem 1.5rem;
|
| 80 |
+
font-size: 1rem;
|
| 81 |
+
border: 2px solid rgba(124, 58, 237, 0.3);
|
| 82 |
+
border-radius: 12px;
|
| 83 |
+
background: rgba(0, 0, 0, 0.3);
|
| 84 |
+
color: #fff;
|
| 85 |
+
transition: all 0.3s ease;
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
input[type="text"]:focus {
|
| 89 |
+
outline: none;
|
| 90 |
+
border-color: #7c3aed;
|
| 91 |
+
box-shadow: 0 0 20px rgba(124, 58, 237, 0.3);
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
input[type="text"]::placeholder {
|
| 95 |
+
color: #6b6b8a;
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
button {
|
| 99 |
+
padding: 1rem 2rem;
|
| 100 |
+
font-size: 1rem;
|
| 101 |
+
font-weight: 600;
|
| 102 |
+
border: none;
|
| 103 |
+
border-radius: 12px;
|
| 104 |
+
background: linear-gradient(135deg, #7c3aed, #a855f7);
|
| 105 |
+
color: white;
|
| 106 |
+
cursor: pointer;
|
| 107 |
+
transition: all 0.3s ease;
|
| 108 |
+
white-space: nowrap;
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
button:hover {
|
| 112 |
+
transform: translateY(-2px);
|
| 113 |
+
box-shadow: 0 10px 30px rgba(124, 58, 237, 0.4);
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
button:disabled {
|
| 117 |
+
opacity: 0.6;
|
| 118 |
+
cursor: not-allowed;
|
| 119 |
+
transform: none;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
.results {
|
| 123 |
+
display: none;
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
.results.visible {
|
| 127 |
+
display: block;
|
| 128 |
+
animation: fadeIn 0.5s ease;
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
@keyframes fadeIn {
|
| 132 |
+
from {
|
| 133 |
+
opacity: 0;
|
| 134 |
+
transform: translateY(20px);
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
to {
|
| 138 |
+
opacity: 1;
|
| 139 |
+
transform: translateY(0);
|
| 140 |
+
}
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
.score-card {
|
| 144 |
+
background: rgba(255, 255, 255, 0.03);
|
| 145 |
+
backdrop-filter: blur(20px);
|
| 146 |
+
border: 1px solid rgba(255, 255, 255, 0.08);
|
| 147 |
+
border-radius: 20px;
|
| 148 |
+
padding: 2rem;
|
| 149 |
+
text-align: center;
|
| 150 |
+
margin-bottom: 2rem;
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
.score-value {
|
| 154 |
+
font-size: 4rem;
|
| 155 |
+
font-weight: 700;
|
| 156 |
+
margin: 1rem 0;
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
.score-high {
|
| 160 |
+
color: #22c55e;
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
.score-medium {
|
| 164 |
+
color: #eab308;
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
.score-low {
|
| 168 |
+
color: #ef4444;
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
.score-label {
|
| 172 |
+
font-size: 1.2rem;
|
| 173 |
+
color: #8b8ba7;
|
| 174 |
+
margin-bottom: 1rem;
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
.credibility-badge {
|
| 178 |
+
display: inline-block;
|
| 179 |
+
padding: 0.5rem 1.5rem;
|
| 180 |
+
border-radius: 50px;
|
| 181 |
+
font-weight: 600;
|
| 182 |
+
font-size: 0.9rem;
|
| 183 |
+
text-transform: uppercase;
|
| 184 |
+
letter-spacing: 1px;
|
| 185 |
+
}
|
| 186 |
+
|
| 187 |
+
.badge-high {
|
| 188 |
+
background: rgba(34, 197, 94, 0.2);
|
| 189 |
+
color: #22c55e;
|
| 190 |
+
border: 1px solid #22c55e;
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
+
.badge-medium {
|
| 194 |
+
background: rgba(234, 179, 8, 0.2);
|
| 195 |
+
color: #eab308;
|
| 196 |
+
border: 1px solid #eab308;
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
.badge-low {
|
| 200 |
+
background: rgba(239, 68, 68, 0.2);
|
| 201 |
+
color: #ef4444;
|
| 202 |
+
border: 1px solid #ef4444;
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
.details-grid {
|
| 206 |
+
display: grid;
|
| 207 |
+
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
|
| 208 |
+
gap: 1rem;
|
| 209 |
+
margin-bottom: 2rem;
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
.detail-card {
|
| 213 |
+
background: rgba(255, 255, 255, 0.03);
|
| 214 |
+
border: 1px solid rgba(255, 255, 255, 0.08);
|
| 215 |
+
border-radius: 12px;
|
| 216 |
+
padding: 1.5rem;
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
.detail-label {
|
| 220 |
+
font-size: 0.85rem;
|
| 221 |
+
color: #8b8ba7;
|
| 222 |
+
margin-bottom: 0.5rem;
|
| 223 |
+
}
|
| 224 |
+
|
| 225 |
+
.detail-value {
|
| 226 |
+
font-size: 1.1rem;
|
| 227 |
+
font-weight: 600;
|
| 228 |
+
color: #fff;
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
.summary-box {
|
| 232 |
+
background: rgba(124, 58, 237, 0.1);
|
| 233 |
+
border: 1px solid rgba(124, 58, 237, 0.3);
|
| 234 |
+
border-radius: 12px;
|
| 235 |
+
padding: 1.5rem;
|
| 236 |
+
margin-bottom: 2rem;
|
| 237 |
+
}
|
| 238 |
+
|
| 239 |
+
.summary-title {
|
| 240 |
+
font-weight: 600;
|
| 241 |
+
margin-bottom: 0.5rem;
|
| 242 |
+
color: #a855f7;
|
| 243 |
+
}
|
| 244 |
+
|
| 245 |
+
.loading {
|
| 246 |
+
text-align: center;
|
| 247 |
+
padding: 3rem;
|
| 248 |
+
display: none;
|
| 249 |
+
}
|
| 250 |
+
|
| 251 |
+
.loading.visible {
|
| 252 |
+
display: block;
|
| 253 |
+
}
|
| 254 |
+
|
| 255 |
+
.spinner {
|
| 256 |
+
width: 50px;
|
| 257 |
+
height: 50px;
|
| 258 |
+
border: 3px solid rgba(124, 58, 237, 0.2);
|
| 259 |
+
border-top-color: #7c3aed;
|
| 260 |
+
border-radius: 50%;
|
| 261 |
+
animation: spin 1s linear infinite;
|
| 262 |
+
margin: 0 auto 1rem;
|
| 263 |
+
}
|
| 264 |
+
|
| 265 |
+
@keyframes spin {
|
| 266 |
+
to {
|
| 267 |
+
transform: rotate(360deg);
|
| 268 |
+
}
|
| 269 |
+
}
|
| 270 |
+
|
| 271 |
+
.error {
|
| 272 |
+
background: rgba(239, 68, 68, 0.1);
|
| 273 |
+
border: 1px solid rgba(239, 68, 68, 0.3);
|
| 274 |
+
border-radius: 12px;
|
| 275 |
+
padding: 1.5rem;
|
| 276 |
+
color: #ef4444;
|
| 277 |
+
display: none;
|
| 278 |
+
}
|
| 279 |
+
|
| 280 |
+
.error.visible {
|
| 281 |
+
display: block;
|
| 282 |
+
}
|
| 283 |
+
|
| 284 |
+
footer {
|
| 285 |
+
text-align: center;
|
| 286 |
+
margin-top: 3rem;
|
| 287 |
+
color: #6b6b8a;
|
| 288 |
+
font-size: 0.9rem;
|
| 289 |
+
}
|
| 290 |
+
|
| 291 |
+
footer a {
|
| 292 |
+
color: #7c3aed;
|
| 293 |
+
text-decoration: none;
|
| 294 |
+
}
|
| 295 |
+
|
| 296 |
+
/* Node Details Overlay */
|
| 297 |
+
.node-details-overlay {
|
| 298 |
+
position: absolute;
|
| 299 |
+
top: 20px;
|
| 300 |
+
right: 20px;
|
| 301 |
+
background: rgba(15, 15, 35, 0.95);
|
| 302 |
+
border: 1px solid rgba(124, 58, 237, 0.3);
|
| 303 |
+
border-radius: 12px;
|
| 304 |
+
padding: 1.5rem;
|
| 305 |
+
width: 300px;
|
| 306 |
+
display: none;
|
| 307 |
+
backdrop-filter: blur(10px);
|
| 308 |
+
z-index: 100;
|
| 309 |
+
box-shadow: 0 10px 30px rgba(0,0,0,0.5);
|
| 310 |
+
pointer-events: auto;
|
| 311 |
+
}
|
| 312 |
+
.node-details-overlay.visible {
|
| 313 |
+
display: block;
|
| 314 |
+
animation: fadeIn 0.3s ease;
|
| 315 |
+
}
|
| 316 |
+
.close-btn {
|
| 317 |
+
position: absolute;
|
| 318 |
+
top: 10px;
|
| 319 |
+
right: 15px;
|
| 320 |
+
background: none;
|
| 321 |
+
border: none;
|
| 322 |
+
color: #8b8ba7;
|
| 323 |
+
font-size: 1.5rem;
|
| 324 |
+
cursor: pointer;
|
| 325 |
+
padding: 0;
|
| 326 |
+
line-height: 1;
|
| 327 |
+
width: auto;
|
| 328 |
+
height: auto;
|
| 329 |
+
box-shadow: none;
|
| 330 |
+
}
|
| 331 |
+
.close-btn:hover {
|
| 332 |
+
color: #fff;
|
| 333 |
+
transform: none;
|
| 334 |
+
box-shadow: none;
|
| 335 |
+
}
|
| 336 |
+
</style>
|
| 337 |
+
</head>
|
| 338 |
+
|
| 339 |
+
<body>
|
| 340 |
+
<div class="container">
|
| 341 |
+
<header>
|
| 342 |
+
<h1>🔬 SysCRED</h1>
|
| 343 |
+
<p class="subtitle">Système Neuro-Symbolique de Vérification de Crédibilité</p>
|
| 344 |
+
</header>
|
| 345 |
+
|
| 346 |
+
<div class="search-box">
|
| 347 |
+
<div class="input-group">
|
| 348 |
+
<input type="text" id="urlInput" placeholder="Entrez une URL à analyser (ex: https://www.lemonde.fr)"
|
| 349 |
+
autofocus>
|
| 350 |
+
<button id="analyzeBtn" onclick="analyzeUrl()">
|
| 351 |
+
🔍 Analyser
|
| 352 |
+
</button>
|
| 353 |
+
</div>
|
| 354 |
+
</div>
|
| 355 |
+
|
| 356 |
+
<div class="loading" id="loading">
|
| 357 |
+
<div class="spinner"></div>
|
| 358 |
+
<p>Analyse en cours...</p>
|
| 359 |
+
</div>
|
| 360 |
+
|
| 361 |
+
<div class="error" id="error"></div>
|
| 362 |
+
|
| 363 |
+
<div class="results" id="results">
|
| 364 |
+
<div class="score-card">
|
| 365 |
+
<div class="score-label">Score de Crédibilité</div>
|
| 366 |
+
<div class="score-value" id="scoreValue">0.00</div>
|
| 367 |
+
<div class="credibility-badge" id="credibilityBadge">-</div>
|
| 368 |
+
</div>
|
| 369 |
+
|
| 370 |
+
<div class="summary-box">
|
| 371 |
+
<div class="summary-title">📋 Résumé de l'analyse</div>
|
| 372 |
+
<p id="summary">-</p>
|
| 373 |
+
</div>
|
| 374 |
+
|
| 375 |
+
<div class="details-grid" id="detailsGrid"></div>
|
| 376 |
+
|
| 377 |
+
<div class="graph-section" style="margin-top: 3rem;">
|
| 378 |
+
<div class="summary-title" style="margin-bottom: 2rem; color: #60a5fa;">🕸️ Réseau Neuro-Symbolique
|
| 379 |
+
(Ontologie)</div>
|
| 380 |
+
<!-- Debug link -->
|
| 381 |
+
<small style="color: #666; cursor: pointer;"
|
| 382 |
+
onclick="alert('D3 Loaded: ' + (typeof d3 !== 'undefined'))">Debug: Vérifier D3</small>
|
| 383 |
+
|
| 384 |
+
<div id="cy" class="graph-container"></div>
|
| 385 |
+
</div>
|
| 386 |
+
</div>
|
| 387 |
+
|
| 388 |
+
<footer>
|
| 389 |
+
<p>SysCRED v2.0 - Prototype de recherche doctorale</p>
|
| 390 |
+
<p>© Dominique S. Loyer - UQAM | <a href="https://doi.org/10.5281/zenodo.17943226" target="_blank">DOI:
|
| 391 |
+
10.5281/zenodo.17943226</a></p>
|
| 392 |
+
</footer>
|
| 393 |
+
</div>
|
| 394 |
+
|
| 395 |
+
<script>
|
| 396 |
+
const API_URL = 'http://localhost:5001';
|
| 397 |
+
|
| 398 |
+
async function analyzeUrl() {
|
| 399 |
+
const urlInput = document.getElementById('urlInput');
|
| 400 |
+
const loading = document.getElementById('loading');
|
| 401 |
+
const results = document.getElementById('results');
|
| 402 |
+
const error = document.getElementById('error');
|
| 403 |
+
const btn = document.getElementById('analyzeBtn');
|
| 404 |
+
|
| 405 |
+
const inputData = urlInput.value.trim();
|
| 406 |
+
|
| 407 |
+
if (!inputData) {
|
| 408 |
+
alert('Veuillez entrer une URL');
|
| 409 |
+
return;
|
| 410 |
+
}
|
| 411 |
+
|
| 412 |
+
// Reset UI
|
| 413 |
+
results.classList.remove('visible');
|
| 414 |
+
error.classList.remove('visible');
|
| 415 |
+
loading.classList.add('visible');
|
| 416 |
+
btn.disabled = true;
|
| 417 |
+
|
| 418 |
+
try {
|
| 419 |
+
const response = await fetch(`${API_URL}/api/verify`, {
|
| 420 |
+
method: 'POST',
|
| 421 |
+
headers: {
|
| 422 |
+
'Content-Type': 'application/json',
|
| 423 |
+
},
|
| 424 |
+
body: JSON.stringify({
|
| 425 |
+
input_data: inputData,
|
| 426 |
+
include_seo: true,
|
| 427 |
+
include_pagerank: true
|
| 428 |
+
})
|
| 429 |
+
});
|
| 430 |
+
|
| 431 |
+
const data = await response.json();
|
| 432 |
+
|
| 433 |
+
if (!response.ok) {
|
| 434 |
+
throw new Error(data.error || 'Erreur lors de l\'analyse');
|
| 435 |
+
}
|
| 436 |
+
|
| 437 |
+
displayResults(data);
|
| 438 |
+
|
| 439 |
+
} catch (err) {
|
| 440 |
+
error.textContent = `❌ Erreur: ${err.message}`;
|
| 441 |
+
error.classList.add('visible');
|
| 442 |
+
} finally {
|
| 443 |
+
loading.classList.remove('visible');
|
| 444 |
+
btn.disabled = false;
|
| 445 |
+
}
|
| 446 |
+
}
|
| 447 |
+
|
| 448 |
+
function displayResults(data) {
|
| 449 |
+
const results = document.getElementById('results');
|
| 450 |
+
const scoreValue = document.getElementById('scoreValue');
|
| 451 |
+
const credibilityBadge = document.getElementById('credibilityBadge');
|
| 452 |
+
const summary = document.getElementById('summary');
|
| 453 |
+
const detailsGrid = document.getElementById('detailsGrid');
|
| 454 |
+
|
| 455 |
+
// Score
|
| 456 |
+
const score = data.scoreCredibilite || 0;
|
| 457 |
+
scoreValue.textContent = score.toFixed(2);
|
| 458 |
+
|
| 459 |
+
// Conditional Display: Hide Score Card if TEXT input, show if URL
|
| 460 |
+
const isUrl = data.informationEntree && data.informationEntree.startsWith('http');
|
| 461 |
+
const scoreCard = document.querySelector('.score-card');
|
| 462 |
+
|
| 463 |
+
if (isUrl) {
|
| 464 |
+
scoreCard.style.display = 'block';
|
| 465 |
+
// Color based on score
|
| 466 |
+
scoreValue.className = 'score-value';
|
| 467 |
+
credibilityBadge.className = 'credibility-badge';
|
| 468 |
+
|
| 469 |
+
if (score >= 0.7) {
|
| 470 |
+
scoreValue.classList.add('score-high');
|
| 471 |
+
credibilityBadge.classList.add('badge-high');
|
| 472 |
+
credibilityBadge.textContent = '✓ Crédibilité Élevée';
|
| 473 |
+
} else if (score >= 0.4) {
|
| 474 |
+
scoreValue.classList.add('score-medium');
|
| 475 |
+
credibilityBadge.classList.add('badge-medium');
|
| 476 |
+
credibilityBadge.textContent = '⚠ Crédibilité Moyenne';
|
| 477 |
+
} else {
|
| 478 |
+
scoreValue.classList.add('score-low');
|
| 479 |
+
credibilityBadge.classList.add('badge-low');
|
| 480 |
+
credibilityBadge.textContent = '✗ Crédibilité Faible';
|
| 481 |
+
}
|
| 482 |
+
} else {
|
| 483 |
+
// Hide score card for text queries as requested
|
| 484 |
+
scoreCard.style.display = 'none';
|
| 485 |
+
}
|
| 486 |
+
|
| 487 |
+
// Summary
|
| 488 |
+
summary.textContent = data.resumeAnalyse || 'Aucun résumé disponible';
|
| 489 |
+
|
| 490 |
+
// Build details HTML
|
| 491 |
+
let detailsHTML = '';
|
| 492 |
+
|
| 493 |
+
// Source reputation from rule analysis
|
| 494 |
+
const ruleResults = data.reglesAppliquees || {};
|
| 495 |
+
const sourceAnalysis = ruleResults.source_analysis || {};
|
| 496 |
+
|
| 497 |
+
if (sourceAnalysis.reputation) {
|
| 498 |
+
const repColor = sourceAnalysis.reputation === 'High' ? '#22c55e' :
|
| 499 |
+
sourceAnalysis.reputation === 'Low' ? '#ef4444' : '#eab308';
|
| 500 |
+
detailsHTML += `
|
| 501 |
+
<div class="detail-card">
|
| 502 |
+
<div class="detail-label">🏛️ Réputation Source</div>
|
| 503 |
+
<div class="detail-value" style="color: ${repColor}">${sourceAnalysis.reputation}</div>
|
| 504 |
+
</div>
|
| 505 |
+
`;
|
| 506 |
+
}
|
| 507 |
+
|
| 508 |
+
if (sourceAnalysis.domain_age_days) {
|
| 509 |
+
const years = (sourceAnalysis.domain_age_days / 365).toFixed(1);
|
| 510 |
+
detailsHTML += `
|
| 511 |
+
<div class="detail-card">
|
| 512 |
+
<div class="detail-label">📅 Âge du Domaine</div>
|
| 513 |
+
<div class="detail-value">${years} ans</div>
|
| 514 |
+
</div>
|
| 515 |
+
`;
|
| 516 |
+
}
|
| 517 |
+
|
| 518 |
+
// NLP analysis
|
| 519 |
+
const nlpAnalysis = data.analyseNLP || {};
|
| 520 |
+
|
| 521 |
+
if (nlpAnalysis.sentiment) {
|
| 522 |
+
detailsHTML += `
|
| 523 |
+
<div class="detail-card">
|
| 524 |
+
<div class="detail-label">💭 Sentiment</div>
|
| 525 |
+
<div class="detail-value">${nlpAnalysis.sentiment.label} (${(nlpAnalysis.sentiment.score * 100).toFixed(0)}%)</div>
|
| 526 |
+
</div>
|
| 527 |
+
`;
|
| 528 |
+
}
|
| 529 |
+
|
| 530 |
+
if (nlpAnalysis.coherence_score !== null && nlpAnalysis.coherence_score !== undefined) {
|
| 531 |
+
detailsHTML += `
|
| 532 |
+
<div class="detail-card">
|
| 533 |
+
<div class="detail-label">📊 Cohérence</div>
|
| 534 |
+
<div class="detail-value">${(nlpAnalysis.coherence_score * 100).toFixed(0)}%</div>
|
| 535 |
+
</div>
|
| 536 |
+
`;
|
| 537 |
+
}
|
| 538 |
+
|
| 539 |
+
// Add PageRank if available
|
| 540 |
+
if (data.pageRankEstimation && data.pageRankEstimation.estimatedPR) {
|
| 541 |
+
detailsHTML += `
|
| 542 |
+
<div class="detail-card">
|
| 543 |
+
<div class="detail-label">📈 PageRank Estimé</div>
|
| 544 |
+
<div class="detail-value">${data.pageRankEstimation.estimatedPR.toFixed(3)}</div>
|
| 545 |
+
</div>
|
| 546 |
+
`;
|
| 547 |
+
}
|
| 548 |
+
|
| 549 |
+
// Add SEO score if available
|
| 550 |
+
if (data.seoAnalysis && data.seoAnalysis.seoScore) {
|
| 551 |
+
detailsHTML += `
|
| 552 |
+
<div class="detail-card">
|
| 553 |
+
<div class="detail-label">🔍 Score SEO</div>
|
| 554 |
+
<div class="detail-value">${data.seoAnalysis.seoScore}</div>
|
| 555 |
+
</div>
|
| 556 |
+
`;
|
| 557 |
+
}
|
| 558 |
+
|
| 559 |
+
// Fact checks
|
| 560 |
+
const factChecks = ruleResults.fact_checking || [];
|
| 561 |
+
if (factChecks.length > 0) {
|
| 562 |
+
// Add a header for fact checks
|
| 563 |
+
detailsHTML += `
|
| 564 |
+
<div style="grid-column: 1 / -1; margin-top: 1rem; margin-bottom: 0.5rem; font-weight: 600; color: #f472b6;">
|
| 565 |
+
🕵️ Fact-Checks Trouvés (${factChecks.length})
|
| 566 |
+
</div>
|
| 567 |
+
`;
|
| 568 |
+
|
| 569 |
+
factChecks.forEach(fc => {
|
| 570 |
+
detailsHTML += `
|
| 571 |
+
<div class="detail-card" style="grid-column: 1 / -1; border-color: rgba(244, 114, 182, 0.3);">
|
| 572 |
+
<div class="detail-label">🔍 ${fc.publisher || 'Source inconnue'}</div>
|
| 573 |
+
<div class="detail-value" style="font-size: 1rem; margin-bottom: 0.5rem;">"${fc.claim}"</div>
|
| 574 |
+
<div style="display: flex; justify-content: space-between; align-items: center;">
|
| 575 |
+
<span style="color: #f472b6; font-weight: 700;">Verdict: ${fc.rating}</span>
|
| 576 |
+
<a href="${fc.url}" target="_blank" style="color: #a855f7; text-decoration: none; font-size: 0.9rem;">Lire le rapport →</a>
|
| 577 |
+
</div>
|
| 578 |
+
</div>
|
| 579 |
+
`;
|
| 580 |
+
});
|
| 581 |
+
}
|
| 582 |
+
|
| 583 |
+
detailsGrid.innerHTML = detailsHTML;
|
| 584 |
+
|
| 585 |
+
results.classList.add('visible');
|
| 586 |
+
|
| 587 |
+
// Fetch and render graph with slight delay to ensure DOM is ready
|
| 588 |
+
requestAnimationFrame(() => {
|
| 589 |
+
renderD3Graph();
|
| 590 |
+
});
|
| 591 |
+
}
|
| 592 |
+
|
| 593 |
+
async function renderD3Graph() {
|
| 594 |
+
logDebug("Starting renderD3Graph...");
|
| 595 |
+
const container = document.getElementById('cy');
|
| 596 |
+
|
| 597 |
+
// Check if D3 is loaded
|
| 598 |
+
if (typeof d3 === 'undefined') {
|
| 599 |
+
container.innerHTML = '<p class="error visible">Erreur: D3.js n\'a pas pu être chargé.</p>';
|
| 600 |
+
logDebug("ERROR: D3 undefined");
|
| 601 |
+
return;
|
| 602 |
+
}
|
| 603 |
+
|
| 604 |
+
try {
|
| 605 |
+
container.innerHTML = '<div class="spinner"></div>'; // Loading state
|
| 606 |
+
logDebug("Fetching graph data...");
|
| 607 |
+
|
| 608 |
+
const response = await fetch(`${API_URL}/api/ontology/graph`);
|
| 609 |
+
const data = await response.json();
|
| 610 |
+
|
| 611 |
+
container.innerHTML = ''; // Clear loading
|
| 612 |
+
logDebug(`Data received. Nodes: ${data.nodes ? data.nodes.length : 0}, Links: ${data.links ? data.links.length : 0}`);
|
| 613 |
+
|
| 614 |
+
if (!data.nodes || data.nodes.length === 0) {
|
| 615 |
+
container.innerHTML = '<p style="text-align:center; padding:2rem; color:#6b6b8a; width:100%; display:flex; justify-content:center; align-items:center; height:100%;">Ayçune donnée ontologique disponible.</p>';
|
| 616 |
+
return;
|
| 617 |
+
}
|
| 618 |
+
|
| 619 |
+
// Get dimensions
|
| 620 |
+
const width = container.clientWidth || 800;
|
| 621 |
+
const height = container.clientHeight || 500;
|
| 622 |
+
logDebug(`Container size: ${width}x${height}`);
|
| 623 |
+
|
| 624 |
+
const svg = d3.select(container).append("svg")
|
| 625 |
+
.attr("width", "100%")
|
| 626 |
+
.attr("height", "100%")
|
| 627 |
+
.attr("viewBox", [-width / 2, -height / 2, width, height])
|
| 628 |
+
.style("background-color", "rgba(0,0,0,0.2)"); // Visible background
|
| 629 |
+
|
| 630 |
+
// ADDED: Overlay for details
|
| 631 |
+
const overlay = document.createElement('div');
|
| 632 |
+
overlay.id = 'nodeDetails';
|
| 633 |
+
overlay.className = 'node-details-overlay';
|
| 634 |
+
overlay.innerHTML = `
|
| 635 |
+
<button class="close-btn" onclick="document.getElementById('nodeDetails').classList.remove('visible')">×</button>
|
| 636 |
+
<h3 id="detailTitle" style="color:#fff; margin-bottom:0.5rem; font-size:1.1rem; border-bottom:1px solid rgba(255,255,255,0.1); padding-bottom:0.5rem;"></h3>
|
| 637 |
+
<div id="detailBody" style="font-size:0.9rem; color:#ccc; line-height:1.5;"></div>
|
| 638 |
+
`;
|
| 639 |
+
container.appendChild(overlay);
|
| 640 |
+
|
| 641 |
+
logDebug("SVG created. Starting simulation...");
|
| 642 |
+
|
| 643 |
+
// Colors: 1=Purple(Report), 2=Gray(Unknown), 3=Green(Good), 4=Red(Bad)
|
| 644 |
+
const color = d3.scaleOrdinal()
|
| 645 |
+
.domain([1, 2, 3, 4])
|
| 646 |
+
.range(["#8b5cf6", "#94a3b8", "#22c55e", "#ef4444"]);
|
| 647 |
+
|
| 648 |
+
const simulation = d3.forceSimulation(data.nodes)
|
| 649 |
+
.force("link", d3.forceLink(data.links).id(d => d.id).distance(120))
|
| 650 |
+
.force("charge", d3.forceManyBody().strength(-400))
|
| 651 |
+
.force("center", d3.forceCenter(0, 0));
|
| 652 |
+
|
| 653 |
+
// ADDED: Container click to close overlay
|
| 654 |
+
svg.on("click", () => {
|
| 655 |
+
document.getElementById('nodeDetails').classList.remove('visible');
|
| 656 |
+
node.attr("stroke", "#fff").attr("stroke-width", 1.5);
|
| 657 |
+
});
|
| 658 |
+
|
| 659 |
+
// Arrow marker
|
| 660 |
+
svg.append("defs").selectAll("marker")
|
| 661 |
+
.data(["end"])
|
| 662 |
+
.join("marker")
|
| 663 |
+
.attr("id", "arrow")
|
| 664 |
+
.attr("viewBox", "0 -5 10 10")
|
| 665 |
+
.attr("refX", 22)
|
| 666 |
+
.attr("refY", 0)
|
| 667 |
+
.attr("markerWidth", 6)
|
| 668 |
+
.attr("markerHeight", 6)
|
| 669 |
+
.attr("orient", "auto")
|
| 670 |
+
.append("path")
|
| 671 |
+
.attr("fill", "#64748b")
|
| 672 |
+
.attr("d", "M0,-5L10,0L0,5");
|
| 673 |
+
|
| 674 |
+
const link = svg.append("g")
|
| 675 |
+
.selectAll("line")
|
| 676 |
+
.data(data.links)
|
| 677 |
+
.join("line")
|
| 678 |
+
.attr("stroke", "#475569")
|
| 679 |
+
.attr("stroke-opacity", 0.6)
|
| 680 |
+
.attr("stroke-width", 2)
|
| 681 |
+
.attr("marker-end", "url(#arrow)");
|
| 682 |
+
|
| 683 |
+
const node = svg.append("g")
|
| 684 |
+
.selectAll("circle")
|
| 685 |
+
.data(data.nodes)
|
| 686 |
+
.join("circle")
|
| 687 |
+
.attr("r", d => d.group === 1 ? 18 : 8)
|
| 688 |
+
.attr("fill", d => color(d.group))
|
| 689 |
+
.attr("stroke", "#fff")
|
| 690 |
+
.attr("stroke-width", 1.5)
|
| 691 |
+
.style("cursor", "pointer")
|
| 692 |
+
.call(drag(simulation))
|
| 693 |
+
.on("click", (event, d) => {
|
| 694 |
+
event.stopPropagation(); // Stop background click
|
| 695 |
+
showNodeDetails(d);
|
| 696 |
+
|
| 697 |
+
// Highlight selected
|
| 698 |
+
node.attr("stroke", "#fff").attr("stroke-width", 1.5);
|
| 699 |
+
d3.select(event.currentTarget).attr("stroke", "#f43f5e").attr("stroke-width", 3);
|
| 700 |
+
});
|
| 701 |
+
|
| 702 |
+
// Labels
|
| 703 |
+
const text = svg.append("g")
|
| 704 |
+
.selectAll("text")
|
| 705 |
+
.data(data.nodes)
|
| 706 |
+
.join("text")
|
| 707 |
+
.text(d => d.name.length > 20 ? d.name.substring(0, 20) + "..." : d.name)
|
| 708 |
+
.attr("font-size", "11px")
|
| 709 |
+
.attr("fill", "#e0e0e0")
|
| 710 |
+
.attr("dx", 12)
|
| 711 |
+
.attr("dy", 4)
|
| 712 |
+
.style("pointer-events", "none")
|
| 713 |
+
.style("text-shadow", "0 1px 2px black");
|
| 714 |
+
|
| 715 |
+
// Tooltip
|
| 716 |
+
node.append("title").text(d => `${d.name}\n(${d.type})`);
|
| 717 |
+
|
| 718 |
+
simulation.on("tick", () => {
|
| 719 |
+
link
|
| 720 |
+
.attr("x1", d => d.source.x)
|
| 721 |
+
.attr("y1", d => d.source.y)
|
| 722 |
+
.attr("x2", d => d.target.x)
|
| 723 |
+
.attr("y2", d => d.target.y);
|
| 724 |
+
|
| 725 |
+
node
|
| 726 |
+
.attr("cx", d => d.x)
|
| 727 |
+
.attr("cy", d => d.y);
|
| 728 |
+
|
| 729 |
+
text
|
| 730 |
+
.attr("x", d => d.x)
|
| 731 |
+
.attr("y", d => d.y);
|
| 732 |
+
});
|
| 733 |
+
|
| 734 |
+
// Zoom
|
| 735 |
+
svg.call(d3.zoom().scaleExtent([0.1, 4]).on("zoom", (e) => {
|
| 736 |
+
svg.selectAll('g').attr('transform', e.transform);
|
| 737 |
+
}));
|
| 738 |
+
|
| 739 |
+
logDebug("Graph rendered successfully.");
|
| 740 |
+
|
| 741 |
+
} catch (err) {
|
| 742 |
+
console.error("D3 Graph error:", err);
|
| 743 |
+
const container = document.getElementById('cy');
|
| 744 |
+
if (container) container.innerHTML = `<p class="error visible">Erreur graphique: ${err.message}</p>`;
|
| 745 |
+
logDebug(`ERROR EXCEPTION: ${err.message}`);
|
| 746 |
+
}
|
| 747 |
+
}
|
| 748 |
+
|
| 749 |
+
function testD3() {
|
| 750 |
+
logDebug("Starting Static Test...");
|
| 751 |
+
const container = document.getElementById('cy');
|
| 752 |
+
container.innerHTML = '';
|
| 753 |
+
|
| 754 |
+
const width = container.clientWidth || 800;
|
| 755 |
+
const height = container.clientHeight || 500;
|
| 756 |
+
|
| 757 |
+
logDebug(`Container: ${width}x${height}`);
|
| 758 |
+
|
| 759 |
+
try {
|
| 760 |
+
const svg = d3.select(container).append("svg")
|
| 761 |
+
.attr("width", "100%")
|
| 762 |
+
.attr("height", "100%")
|
| 763 |
+
.attr("viewBox", [-width / 2, -height / 2, width, height])
|
| 764 |
+
.style("background-color", "#222");
|
| 765 |
+
|
| 766 |
+
svg.append("circle")
|
| 767 |
+
.attr("r", 50)
|
| 768 |
+
.attr("fill", "red")
|
| 769 |
+
.attr("cx", 0)
|
| 770 |
+
.attr("cy", 0);
|
| 771 |
+
|
| 772 |
+
svg.append("text")
|
| 773 |
+
.text("D3 WORKS")
|
| 774 |
+
.attr("fill", "white")
|
| 775 |
+
.attr("x", 0)
|
| 776 |
+
.attr("y", 5)
|
| 777 |
+
.attr("text-anchor", "middle");
|
| 778 |
+
|
| 779 |
+
logDebug("Static Test Complete. You should see a red circle.");
|
| 780 |
+
} catch (e) {
|
| 781 |
+
logDebug("Static Test ERROR: " + e.message);
|
| 782 |
+
alert("Static Test Failed: " + e.message);
|
| 783 |
+
}
|
| 784 |
+
}
|
| 785 |
+
|
| 786 |
+
|
| 787 |
+
// --- Helper Functions ---
|
| 788 |
+
|
| 789 |
+
function logDebug(msg) {
|
| 790 |
+
console.log(`[SysCRED Debug] ${msg}`);
|
| 791 |
+
}
|
| 792 |
+
|
| 793 |
+
function drag(simulation) {
|
| 794 |
+
function dragstarted(event) {
|
| 795 |
+
if (!event.active) simulation.alphaTarget(0.3).restart();
|
| 796 |
+
event.subject.fx = event.subject.x;
|
| 797 |
+
event.subject.fy = event.subject.y;
|
| 798 |
+
}
|
| 799 |
+
|
| 800 |
+
function dragged(event) {
|
| 801 |
+
event.subject.fx = event.x;
|
| 802 |
+
event.subject.fy = event.y;
|
| 803 |
+
}
|
| 804 |
+
|
| 805 |
+
function dragended(event) {
|
| 806 |
+
if (!event.active) simulation.alphaTarget(0);
|
| 807 |
+
event.subject.fx = null;
|
| 808 |
+
event.subject.fy = null;
|
| 809 |
+
}
|
| 810 |
+
|
| 811 |
+
return d3.drag()
|
| 812 |
+
.on("start", dragstarted)
|
| 813 |
+
.on("drag", dragged)
|
| 814 |
+
.on("end", dragended);
|
| 815 |
+
}
|
| 816 |
+
|
| 817 |
+
function showNodeDetails(d) {
|
| 818 |
+
const overlay = document.getElementById('nodeDetails');
|
| 819 |
+
const title = document.getElementById('detailTitle');
|
| 820 |
+
const body = document.getElementById('detailBody');
|
| 821 |
+
|
| 822 |
+
if(!overlay) return;
|
| 823 |
+
|
| 824 |
+
title.textContent = d.name;
|
| 825 |
+
|
| 826 |
+
let typeColor = "#94a3b8";
|
| 827 |
+
if(d.group === 1) typeColor = "#8b5cf6"; // Report
|
| 828 |
+
if(d.group === 3) typeColor = "#22c55e"; // Good
|
| 829 |
+
if(d.group === 4) typeColor = "#ef4444"; // Bad
|
| 830 |
+
|
| 831 |
+
body.innerHTML = `
|
| 832 |
+
<div style="margin-bottom:0.5rem">
|
| 833 |
+
<span style="background:${typeColor}; color:white; padding:2px 6px; border-radius:4px; font-size:0.75rem;">${d.type || 'Unknown Type'}</span>
|
| 834 |
+
</div>
|
| 835 |
+
<div><strong>URI:</strong> <br><span style="font-family:monospace; color:#a855f7; word-break:break-all;">${d.id}</span></div>
|
| 836 |
+
`;
|
| 837 |
+
|
| 838 |
+
overlay.classList.add('visible');
|
| 839 |
+
}
|
| 840 |
+
|
| 841 |
+
// Allow Enter key to trigger analysis
|
| 842 |
+
document.getElementById('urlInput').addEventListener('keypress', function (e) {
|
| 843 |
+
if (e.key === 'Enter') {
|
| 844 |
+
analyzeUrl();
|
| 845 |
+
}
|
| 846 |
+
});
|
| 847 |
+
</script>
|
| 848 |
+
</body>
|
| 849 |
+
|
| 850 |
+
</html>
|
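For reference, the same /api/verify call that the page above issues with fetch() can be reproduced from Python. This is a sketch assuming the backend is reachable at the URL hard-coded in the page (http://localhost:5001); adjust the host and port to your deployment. The request and response fields are taken from the JavaScript above.

import requests  # already a core dependency of the project

resp = requests.post(
    "http://localhost:5001/api/verify",   # assumption: local dev server, matching API_URL above
    json={
        "input_data": "https://www.lemonde.fr",
        "include_seo": True,
        "include_pagerank": True,
    },
    timeout=120,
)
resp.raise_for_status()
report = resp.json()
print(report.get("scoreCredibilite"), report.get("resumeAnalyse"))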
syscred/static/js/d3.min.js
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
syscred/test_graphrag.py
ADDED
|
@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
"""
Test Script for GraphRAG
========================
Verifies that the GraphRAG module can correctly:
1. Connect to an in-memory ontology.
2. Retrieve context for a domain that has history.
"""

import sys
import os

# Add parent directory to path to allow imports
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from syscred.ontology_manager import OntologyManager
from syscred.graph_rag import GraphRAG

def test_graphrag_retrieval():
    print("=== Testing GraphRAG Retrieval Logic ===\n")

    # 1. Setup In-Memory Ontology
    print("[1] Initializing in-memory Ontology...")
    om = OntologyManager(base_ontology_path=None, data_path=None)

    # 2. Add Fake History (Memory)
    print("[2] Injecting test memory for 'lemonde.fr'...")
    fake_report = {
        'scoreCredibilite': 0.95,
        'informationEntree': 'https://www.lemonde.fr/article/test',
        'resumeAnalyse': "Reliable source.",
        'reglesAppliquees': {
            'source_analysis': {'reputation': 'High', 'domain': 'lemonde.fr'}
        }
    }
    # Add it 3 times to simulate history
    om.add_evaluation_triplets(fake_report)
    om.add_evaluation_triplets(fake_report)
    om.add_evaluation_triplets(fake_report)
    print("   -> Added 3 evaluation records.")

    # 3. Initialize GraphRAG
    rag = GraphRAG(om)

    # 4. Query Context
    domain = "lemonde.fr"
    print(f"\n[3] Querying GraphRAG for domain: '{domain}'...")
    context = rag.get_context(domain)

    print("\n--- Result Context (Domain History) ---")
    print(context['full_text'])
    print("---------------------------------------\n")

    # 5. Validation 1 (History)
    if "Analyzed 3 times" in context['full_text']:
        print("✅ SUCCESS: GraphRAG correctly remembered the history.")
    else:
        print("❌ FAILURE: GraphRAG did not return the expected history count.")

    # 6. Test Similar Claims (New Feature)
    print(f"\n[4] Testing 'Similar Claims' for keywords: ['lemonde', 'fake']...")
    # The previous injection didn't use 'fake', let's check what it finds or if we need to inject more
    # Our fake_report had content: 'https://www.lemonde.fr/article/test'
    # The new logic searches regex in 'informationContent'

    # Let's add a specifically claim-like entry
    fake_claim = {
        'scoreCredibilite': 0.1,
        'informationEntree': 'The earth is flat and fake',
        'resumeAnalyse': "False claim.",
        'reglesAppliquees': {'source_analysis': {'reputation': 'Low'}}
    }
    om.add_evaluation_triplets(fake_claim)

    # Search for 'flat'
    similar_context = rag.get_context("unknown.com", keywords=["flat", "earth"])
    print("\n--- Result Context (Similar Claims) ---")
    print(similar_context['full_text'])
    print("---------------------------------------\n")

    if "Found 1 similar claims" in similar_context['full_text'] or "The earth is flat" in similar_context['full_text']:
        print("✅ SUCCESS: GraphRAG found similar claims by keywords.")
    else:
        print("❌ FAILURE: GraphRAG did not find the injected similar claim.")

if __name__ == "__main__":
    test_graphrag_retrieval()
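The test above only exercises the public get_context() interface. As an illustration of the kind of keyword lookup over RDF literals it relies on, here is a hypothetical, self-contained sketch using rdflib (a core dependency). It is not the actual syscred.graph_rag implementation, and the namespace and property names are invented for the example.

import re
from rdflib import Graph, Literal, Namespace

EX = Namespace("http://example.org/syscred#")  # hypothetical namespace, for illustration only

g = Graph()
g.add((EX["eval1"], EX.informationContent, Literal("The earth is flat and fake")))
g.add((EX["eval2"], EX.informationContent, Literal("https://www.lemonde.fr/article/test")))

def find_similar_claims(graph, keywords):
    """Return stored contents matching any keyword (case-insensitive)."""
    pattern = re.compile("|".join(map(re.escape, keywords)), re.IGNORECASE)
    return [str(o) for _, _, o in graph.triples((None, EX.informationContent, None))
            if pattern.search(str(o))]

print(find_similar_claims(g, ["flat", "earth"]))  # -> ['The earth is flat and fake']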
syscred/test_phase1.py
ADDED
|
@@ -0,0 +1,28 @@
import sys
import os

# Add project root to path
sys.path.insert(0, os.getcwd())

from syscred.api_clients import ExternalAPIClients

def test_backlinks():
    client = ExternalAPIClients()

    test_urls = [
        "https://www.lemonde.fr",          # High + Old
        "https://www.infowars.com",        # Low + Old
        "https://example.com",             # Unknown + Old
        "https://new-suspicious-site.xyz"  # Unknown + New (likely)
    ]

    print("=== Testing Backlink Estimation Heuristic ===")
    for url in test_urls:
        print(f"\nTesting: {url}")
        res = client.estimate_backlinks(url)
        print(f"  Count: {res['estimated_count']}")
        print(f"  Method: {res['method']}")
        print(f"  Note: {res['note']}")

if __name__ == "__main__":
    test_backlinks()
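The expected behaviour (more estimated backlinks for high-reputation, long-established domains) can be pictured with a hypothetical sketch like the one below. This is not the actual ExternalAPIClients.estimate_backlinks() code; only the output shape ({'estimated_count', 'method', 'note'}) and the ordering asserted by the test suite are taken from this repository.

def estimate_backlinks_sketch(reputation: str, domain_age_days: int) -> dict:
    # Hypothetical heuristic: reputation sets a base volume, domain age scales it.
    base = {"High": 100_000, "Medium": 10_000, "Unknown": 1_000, "Low": 500}.get(reputation, 1_000)
    age_factor = min(domain_age_days / 3650, 1.0)  # saturates at ~10 years
    return {
        "estimated_count": int(base * (0.2 + 0.8 * age_factor)),
        "method": "heuristic_v2.1",
        "note": "Illustrative estimate from reputation and domain age (no external backlink API).",
    }

print(estimate_backlinks_sketch("High", 9125))  # established, reputable domain
print(estimate_backlinks_sketch("Low", 9125))   # established but low-reputation domain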
syscred/test_phase2.py
ADDED
|
@@ -0,0 +1,55 @@
import sys
import os

# Add project root to path
sys.path.insert(0, os.path.dirname(os.getcwd()))  # Assumes running from syscred/

try:
    from syscred.verification_system import CredibilityVerificationSystem
except ImportError:
    # Just in case of path issues
    sys.path.append(os.getcwd())
    from verification_system import CredibilityVerificationSystem

def test_nlp_fallbacks():
    print("=== Testing NLP Hybrid Fallbacks ===")

    # Initialize without loading standard ML (to test our new hybrid logic)
    # Note: verification_system uses HAS_ML flag, but we want to test specific methods
    syscred = CredibilityVerificationSystem(load_ml_models=False)

    # Test 1: Coherence
    print("\n[Test 1] Coherence")
    coherent_text = "The quick brown fox jumps over the lazy dog. The dog was not amused. It barked loudly."
    incoherent_text = "The quick brown fox. Banana republic creates clouds. Jump over the moon."

    score1 = syscred._calculate_coherence(coherent_text)
    score2 = syscred._calculate_coherence(incoherent_text)

    print(f"  Coherent Text Score: {score1}")
    print(f"  Incoherent Text Score: {score2}")

    if score1 > score2:
        print("  ✓ Coherence logic working (Metric discriminates)")
    else:
        print("  ! Coherence scores inconclusive (Might be heuristic limitations)")

    # Test 2: Bias
    print("\n[Test 2] Bias")
    neutral_text = "The government announced a new policy today regarding taxation."
    biased_text = "The corrupt regime stands accused of treason against the people by radical idiots."

    res1 = syscred._analyze_bias(neutral_text)
    res2 = syscred._analyze_bias(biased_text)

    print(f"  Neutral: {res1['label']} (Score: {res1['score']:.2f})")
    print(f"  Biased: {res2['label']} (Score: {res2['score']:.2f})")
    print(f"  Method Used: {res1.get('method', 'Unknown')}")

    if res2['score'] > res1['score']:
        print("  ✓ Bias detection working")
    else:
        print("  ! Bias detection inconclusive")

if __name__ == "__main__":
    test_nlp_fallbacks()
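As an illustration of the kind of lightweight fallback exercised by Test 1 above, here is a hypothetical coherence heuristic based on average sentence length (very short, choppy sentences score low). The real _calculate_coherence() in verification_system.py may use a different formula; this is only a sketch.

import re

def coherence_sketch(text: str) -> float:
    sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
    if not sentences:
        return 0.0
    avg_words = sum(len(s.split()) for s in sentences) / len(sentences)
    return round(min(avg_words / 12.0, 1.0), 3)  # clamp to [0, 1]

print(coherence_sketch("The quick brown fox jumps over the lazy dog. The dog was not amused. It barked loudly."))
print(coherence_sketch("The quick brown fox. Banana republic creates clouds. Jump over the moon."))  # lower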
syscred/test_suite.py
ADDED
|
@@ -0,0 +1,64 @@
import unittest
import sys
import os

# Point to parent directory (MonCode) so we can import 'syscred' package
# Current file is in MonCode/syscred/test_suite.py
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from syscred.verification_system import CredibilityVerificationSystem
from syscred.api_clients import ExternalAPIClients

class TestSysCRED(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        print("\n[TestSysCRED] Setting up system...")
        cls.system = CredibilityVerificationSystem(load_ml_models=False)
        cls.client = cls.system.api_clients

    def test_backlink_estimation_heuristic(self):
        """Test that backlink estimation respects reputation."""
        lemonde = self.client.estimate_backlinks("https://www.lemonde.fr")
        infowars = self.client.estimate_backlinks("https://infowars.com")

        self.assertGreater(lemonde['estimated_count'], infowars['estimated_count'],
                           "High reputation should have more backlinks than Low")
        self.assertEqual(lemonde['method'], 'heuristic_v2.1')

    def test_coherence_heuristic(self):
        """Test coherence scoring heuristic."""
        good_text = "This is a coherent sentence. It follows logically."
        bad_text = "This is. Random words. Banana. Cloud."

        score_good = self.system._calculate_coherence(good_text)
        score_bad = self.system._calculate_coherence(bad_text)

        self.assertTrue(0 <= score_good <= 1)
        # Note: Heuristic using sentence length variance might be sensitive
        # bad_text has very short sentences, so average length is small -> penalty
        # good_text has normal length
        self.assertGreaterEqual(score_good, score_bad, "Coherent text should score >= incoherent")

    def test_bias_heuristic(self):
        """Test bias detection heuristic."""
        neutral = "The economy grew by 2%."
        biased = "The radical corrupt regime is destroying us!"

        res_neutral = self.system._analyze_bias(neutral)
        res_biased = self.system._analyze_bias(biased)

        self.assertLess(res_neutral['score'], res_biased['score'])
        self.assertIn("biased", res_biased['label'].lower())

    def test_full_pipeline(self):
        """Test the full verification pipeline (integration test)."""
        input_data = "https://www.example.com"
        result = self.system.verify_information(input_data)

        self.assertIn('scoreCredibilite', result)
        self.assertIn('resumeAnalyse', result)
        self.assertIsNotNone(result['scoreCredibilite'])

if __name__ == '__main__':
    unittest.main()
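Similarly, the bias test above is consistent with a simple lexicon-based fallback. The sketch below is hypothetical (the real _analyze_bias() may differ); it only reproduces the contract checked by test_bias_heuristic: a loaded-language text scores higher and its label contains "biased".

LOADED_TERMS = {"corrupt", "regime", "radical", "treason", "idiots", "destroying"}

def analyze_bias_sketch(text: str) -> dict:
    words = [w.strip(".,!?;:").lower() for w in text.split()]
    hits = sum(1 for w in words if w in LOADED_TERMS)
    score = min(hits / 5.0, 1.0)
    label = "Potentially biased" if score >= 0.2 else "Neutral"
    return {"label": label, "score": round(score, 2), "method": "lexicon_fallback_sketch"}

print(analyze_bias_sketch("The economy grew by 2%."))                       # Neutral, low score
print(analyze_bias_sketch("The radical corrupt regime is destroying us!"))  # biased, higher score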
syscred/test_trec_integration.py
ADDED
|
@@ -0,0 +1,271 @@
# -*- coding: utf-8 -*-
"""
Test TREC Integration - SysCRED
================================
Integration tests for TREC AP88-90 evidence retrieval.

Tests:
1. TRECRetriever initialization
2. Evidence retrieval
3. Integration with VerificationSystem
4. Batch retrieval
5. Metrics evaluation

(c) Dominique S. Loyer - PhD Thesis Prototype
Citation Key: loyerEvaluationModelesRecherche2025
"""

import sys
import unittest
from pathlib import Path

# Add parent to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from syscred.trec_retriever import TRECRetriever, Evidence, RetrievalResult
from syscred.trec_dataset import TRECDataset, TRECTopic, SAMPLE_TOPICS
from syscred.eval_metrics import EvaluationMetrics
from syscred.ir_engine import IREngine


class TestTRECRetriever(unittest.TestCase):
    """Tests for TRECRetriever class."""

    @classmethod
    def setUpClass(cls):
        """Set up retriever with sample corpus."""
        cls.retriever = TRECRetriever(use_stemming=True, enable_prf=False)

        # Add sample corpus for testing
        cls.retriever.corpus = {
            "AP880101-0001": {
                "text": "Climate change is primarily caused by human activities, particularly the burning of fossil fuels.",
                "title": "Climate Science Report"
            },
            "AP880101-0002": {
                "text": "The Earth's temperature has risen significantly over the past century due to greenhouse gas emissions.",
                "title": "Global Warming Study"
            },
            "AP880102-0001": {
                "text": "Natural climate variations have occurred throughout Earth's history, including ice ages.",
                "title": "Climate History"
            },
            "AP880102-0002": {
                "text": "Renewable energy sources like solar and wind can help reduce carbon emissions significantly.",
                "title": "Green Energy Solutions"
            },
            "AP880103-0001": {
                "text": "Scientific consensus supports the theory that humans are the primary cause of recent climate change.",
                "title": "IPCC Summary"
            },
            "AP890215-0001": {
                "text": "The presidential election campaign focused on economic issues and foreign policy.",
                "title": "Election Coverage"
            },
            "AP890216-0001": {
                "text": "Stock markets rose sharply after positive economic indicators were released.",
                "title": "Financial News"
            },
        }

    def test_retriever_initialization(self):
        """Test that retriever initializes correctly."""
        self.assertIsNotNone(self.retriever)
        self.assertIsNotNone(self.retriever.ir_engine)
        self.assertEqual(len(self.retriever.corpus), 7)

    def test_evidence_retrieval(self):
        """Test evidence retrieval for a claim."""
        result = self.retriever.retrieve_evidence(
            claim="Climate change is caused by human activities",
            k=3
        )

        self.assertIsInstance(result, RetrievalResult)
        self.assertGreater(len(result.evidences), 0)
        self.assertLessEqual(len(result.evidences), 3)

        # Check first evidence
        first = result.evidences[0]
        self.assertIsInstance(first, Evidence)
        self.assertTrue(first.doc_id.startswith("AP"))
        self.assertGreater(first.score, 0)
        self.assertEqual(first.rank, 1)

    def test_batch_retrieval(self):
        """Test batch evidence retrieval."""
        claims = [
            "Climate change is real",
            "Stock markets and economy",
            "Presidential election"
        ]

        results = self.retriever.batch_retrieve(claims, k=2)

        self.assertEqual(len(results), 3)
        for result in results:
            self.assertIsInstance(result, RetrievalResult)

    def test_statistics(self):
        """Test statistics collection."""
        # Run a query first
        self.retriever.retrieve_evidence("test query", k=2)

        stats = self.retriever.get_statistics()

        self.assertIn("queries_processed", stats)
        self.assertIn("corpus_size", stats)
        self.assertGreater(stats["queries_processed"], 0)


class TestTRECDataset(unittest.TestCase):
    """Tests for TRECDataset class."""

    def test_sample_topics(self):
        """Test sample topics availability."""
        self.assertIsNotNone(SAMPLE_TOPICS)
        self.assertGreater(len(SAMPLE_TOPICS), 0)

        # Check structure
        for topic_id, topic in SAMPLE_TOPICS.items():
            self.assertIsInstance(topic, TRECTopic)
            self.assertTrue(topic.title)

    def test_dataset_initialization(self):
        """Test dataset initialization."""
        dataset = TRECDataset()
        self.assertIsNotNone(dataset)
        self.assertEqual(len(dataset.topics), 0)
        self.assertEqual(len(dataset.qrels), 0)

    def test_topic_query_generation(self):
        """Test query generation from topics."""
        dataset = TRECDataset()
        dataset.topics = SAMPLE_TOPICS.copy()

        short_queries = dataset.get_topic_queries(query_type="short")
        long_queries = dataset.get_topic_queries(query_type="long")

        self.assertEqual(len(short_queries), len(SAMPLE_TOPICS))
        self.assertEqual(len(long_queries), len(SAMPLE_TOPICS))


class TestEvaluationMetrics(unittest.TestCase):
    """Tests for EvaluationMetrics class."""

    def setUp(self):
        self.metrics = EvaluationMetrics()

    def test_precision_at_k(self):
        """Test P@K calculation."""
        retrieved = ["doc1", "doc2", "doc3", "doc4", "doc5"]
        relevant = {"doc1", "doc3", "doc5"}

        p_at_3 = self.metrics.precision_at_k(retrieved, relevant, k=3)
        self.assertAlmostEqual(p_at_3, 2/3)  # doc1 and doc3 in top 3

        p_at_5 = self.metrics.precision_at_k(retrieved, relevant, k=5)
        self.assertAlmostEqual(p_at_5, 3/5)

    def test_recall_at_k(self):
        """Test R@K calculation."""
        retrieved = ["doc1", "doc2", "doc3", "doc4", "doc5"]
        relevant = {"doc1", "doc3", "doc5", "doc7"}  # 4 relevant, doc7 not retrieved

        r_at_5 = self.metrics.recall_at_k(retrieved, relevant, k=5)
        self.assertAlmostEqual(r_at_5, 3/4)  # 3 of 4 relevant docs retrieved

    def test_average_precision(self):
        """Test AP calculation."""
        retrieved = ["doc1", "doc2", "doc3", "doc4"]
        relevant = {"doc1", "doc3"}

        ap = self.metrics.average_precision(retrieved, relevant)
        # AP = (1/2) * (1/1 + 2/3) = 0.5 * 1.667 = 0.833
        expected = (1.0 + 2/3) / 2
        self.assertAlmostEqual(ap, expected, places=4)

    def test_reciprocal_rank(self):
        """Test MRR calculation."""
        retrieved = ["doc2", "doc3", "doc1", "doc4"]
        relevant = {"doc1"}

        rr = self.metrics.reciprocal_rank(retrieved, relevant)
        self.assertAlmostEqual(rr, 1/3)  # doc1 is at rank 3


class TestIREngine(unittest.TestCase):
    """Tests for IREngine class."""

    def setUp(self):
        self.engine = IREngine(use_stemming=True)

    def test_preprocessing(self):
        """Test text preprocessing."""
        text = "The quick brown fox JUMPS over the lazy dog!"
        processed = self.engine.preprocess(text)

        # Should be lowercase, no common stopwords
        self.assertNotIn("the", processed)
        self.assertTrue(processed.islower())
        # Should contain content words
        self.assertIn("quick", processed)
        self.assertIn("brown", processed)

    def test_tfidf_calculation(self):
        """Test TF-IDF scoring (basic)."""
        # This tests the internal TF-IDF if pyserini not available
        self.assertIsNotNone(self.engine)


class TestVerificationSystemIntegration(unittest.TestCase):
    """Integration tests with VerificationSystem."""

    @classmethod
    def setUpClass(cls):
        """Initialize system without ML models for speed."""
|
| 227 |
+
try:
|
| 228 |
+
from syscred.verification_system import CredibilityVerificationSystem
|
| 229 |
+
cls.system = CredibilityVerificationSystem(load_ml_models=False)
|
| 230 |
+
cls.skip = False
|
| 231 |
+
except Exception as e:
|
| 232 |
+
print(f"Skipping integration tests: {e}")
|
| 233 |
+
cls.skip = True
|
| 234 |
+
|
| 235 |
+
def test_system_has_retriever(self):
|
| 236 |
+
"""Test that system has TREC retriever."""
|
| 237 |
+
if self.skip:
|
| 238 |
+
self.skipTest("VerificationSystem not available")
|
| 239 |
+
|
| 240 |
+
# Retriever might be None if no corpus configured
|
| 241 |
+
self.assertTrue(hasattr(self.system, 'trec_retriever'))
|
| 242 |
+
|
| 243 |
+
def test_retrieve_evidence_method(self):
|
| 244 |
+
"""Test retrieve_evidence method."""
|
| 245 |
+
if self.skip:
|
| 246 |
+
self.skipTest("VerificationSystem not available")
|
| 247 |
+
|
| 248 |
+
# Should return empty list if no corpus
|
| 249 |
+
evidences = self.system.retrieve_evidence("test claim")
|
| 250 |
+
self.assertIsInstance(evidences, list)
|
| 251 |
+
|
| 252 |
+
def test_verify_with_evidence_method(self):
|
| 253 |
+
"""Test verify_with_evidence method."""
|
| 254 |
+
if self.skip:
|
| 255 |
+
self.skipTest("VerificationSystem not available")
|
| 256 |
+
|
| 257 |
+
result = self.system.verify_with_evidence("Climate change is real")
|
| 258 |
+
|
| 259 |
+
self.assertIn('claim', result)
|
| 260 |
+
self.assertIn('evidences', result)
|
| 261 |
+
self.assertIn('verification_verdict', result)
|
| 262 |
+
self.assertIn('confidence', result)
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
if __name__ == "__main__":
|
| 266 |
+
print("=" * 60)
|
| 267 |
+
print("SysCRED TREC Integration Tests")
|
| 268 |
+
print("=" * 60)
|
| 269 |
+
|
| 270 |
+
# Run with verbosity
|
| 271 |
+
unittest.main(verbosity=2)
|
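Note: the assertions in TestEvaluationMetrics follow the standard TREC definitions (P@k is the number of relevant documents in the top k divided by k; AP averages precision at each rank where a relevant document appears). The sketch below is an illustrative reference implementation of that AP formula only; it is not the project's EvaluationMetrics code from syscred/eval_metrics.py.

# Illustrative AP sketch (assumed formula, not the repo's implementation)
def average_precision_sketch(retrieved, relevant):
    hits, precision_sum = 0, 0.0
    for rank, doc_id in enumerate(retrieved, start=1):
        if doc_id in relevant:
            hits += 1
            precision_sum += hits / rank  # precision at each relevant rank
    return precision_sum / len(relevant) if relevant else 0.0

# average_precision_sketch(["doc1", "doc2", "doc3", "doc4"], {"doc1", "doc3"})
# -> (1/1 + 2/3) / 2 ≈ 0.8333, the value asserted in test_average_precision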
syscred/trec_dataset.py
ADDED
@@ -0,0 +1,409 @@
# -*- coding: utf-8 -*-
"""
TREC Dataset Module - SysCRED
==============================
Loader and utilities for TREC AP88-90 dataset.

Handles:
- Topic/Query parsing
- Qrels (relevance judgments) loading
- Document corpus loading
- TREC run file generation

Based on: TREC_AP88-90_5juin2025.py
(c) Dominique S. Loyer - PhD Thesis Prototype
Citation Key: loyerEvaluationModelesRecherche2025
"""

import os
import re
import json
import tarfile
from typing import Dict, List, Tuple, Optional, Set
from dataclasses import dataclass, field
from pathlib import Path


@dataclass
class TRECTopic:
    """A TREC topic (query)."""
    topic_id: str
    title: str  # Short query
    description: str  # Long description
    narrative: str = ""  # Full narrative (optional)

    @property
    def short_query(self) -> str:
        return self.title

    @property
    def long_query(self) -> str:
        return f"{self.title} {self.description}".strip()


@dataclass
class TRECQrel:
    """A relevance judgment."""
    topic_id: str
    doc_id: str
    relevance: int  # 0=not relevant, 1=relevant, 2+=highly relevant


@dataclass
class TRECDocument:
    """A document from the corpus."""
    doc_id: str
    text: str
    title: str = ""
    date: str = ""
    source: str = ""


class TRECDataset:
    """
    TREC AP88-90 Dataset loader and manager.

    Provides utilities for:
    - Loading topics (queries)
    - Loading qrels (relevance judgments)
    - Loading document corpus
    - Creating TREC-format run files

    Usage:
        dataset = TRECDataset(base_path="/path/to/trec")
        topics = dataset.load_topics()
        qrels = dataset.load_qrels()
    """

    # Standard TREC file patterns
    TOPIC_PATTERN = r"topics\.\d+\.txt"
    QREL_PATTERN = r"qrels\.\d+\.txt"

    def __init__(
        self,
        base_path: Optional[str] = None,
        topics_dir: Optional[str] = None,
        qrels_dir: Optional[str] = None,
        corpus_path: Optional[str] = None
    ):
        """
        Initialize the dataset loader.

        Args:
            base_path: Base path containing TREC data
            topics_dir: Path to topics directory (overrides base_path)
            qrels_dir: Path to qrels directory (overrides base_path)
            corpus_path: Path to corpus file (AP.tar or JSONL)
        """
        self.base_path = Path(base_path) if base_path else None
        self.topics_dir = Path(topics_dir) if topics_dir else None
        self.qrels_dir = Path(qrels_dir) if qrels_dir else None
        self.corpus_path = Path(corpus_path) if corpus_path else None

        # Loaded data
        self.topics: Dict[str, TRECTopic] = {}
        self.qrels: Dict[str, Dict[str, int]] = {}  # topic_id -> {doc_id: relevance}
        self.documents: Dict[str, TRECDocument] = {}

        # Statistics
        self.stats = {
            "topics_loaded": 0,
            "qrels_loaded": 0,
            "docs_loaded": 0
        }

    def load_topics(self, topics_path: Optional[str] = None) -> Dict[str, TRECTopic]:
        """
        Load TREC topics from file(s).

        Supports standard TREC topic format with <top>, <num>, <title>, <desc>, <narr> tags.
        """
        search_path = Path(topics_path) if topics_path else self.topics_dir or self.base_path

        if not search_path or not search_path.exists():
            print(f"[TRECDataset] Topics path not found: {search_path}")
            return {}

        topic_files = []
        if search_path.is_file():
            topic_files = [search_path]
        else:
            topic_files = list(search_path.glob("topics*.txt"))

        for topic_file in topic_files:
            self._parse_topic_file(topic_file)

        self.stats["topics_loaded"] = len(self.topics)
        print(f"[TRECDataset] Loaded {len(self.topics)} topics from {len(topic_files)} files")

        return self.topics

    def _parse_topic_file(self, file_path: Path):
        """Parse a single TREC topic file."""
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()

            # Find all <top>...</top> blocks
            for top_match in re.finditer(r"<top>(.*?)</top>", content, re.DOTALL):
                topic_content = top_match.group(1)

                # Extract fields
                num_match = re.search(r"<num>\s*(?:Number:)?\s*(\d+)", topic_content, re.IGNORECASE)
                if not num_match:
                    continue

                topic_id = num_match.group(1).strip()

                title_match = re.search(r"<title>\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                title = title_match.group(1).strip() if title_match else ""

                desc_match = re.search(r"<desc>\s*(?:Description:)?\s*(.*?)\s*(?=<narr>|<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                desc = desc_match.group(1).strip() if desc_match else ""

                narr_match = re.search(r"<narr>\s*(?:Narrative:)?\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
                narr = narr_match.group(1).strip() if narr_match else ""

                if topic_id and title:
                    self.topics[topic_id] = TRECTopic(
                        topic_id=topic_id,
                        title=title,
                        description=desc,
                        narrative=narr
                    )
        except Exception as e:
            print(f"[TRECDataset] Error parsing {file_path}: {e}")

    def load_qrels(self, qrels_path: Optional[str] = None) -> Dict[str, Dict[str, int]]:
        """
        Load TREC qrels (relevance judgments).

        Format: topic_id 0 doc_id relevance
        """
        search_path = Path(qrels_path) if qrels_path else self.qrels_dir or self.base_path

        if not search_path or not search_path.exists():
            print(f"[TRECDataset] Qrels path not found: {search_path}")
            return {}

        qrel_files = []
        if search_path.is_file():
            qrel_files = [search_path]
        else:
            qrel_files = list(search_path.glob("qrels*.txt")) + list(search_path.glob("*.qrels"))

        total_qrels = 0
        for qrel_file in qrel_files:
            count = self._parse_qrel_file(qrel_file)
            total_qrels += count

        self.stats["qrels_loaded"] = total_qrels
        print(f"[TRECDataset] Loaded {total_qrels} qrels from {len(qrel_files)} files")

        return self.qrels

    def _parse_qrel_file(self, file_path: Path) -> int:
        """Parse a single qrel file. Returns count of qrels loaded."""
        count = 0
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    parts = line.strip().split()
                    if len(parts) >= 4:
                        topic_id = parts[0]
                        doc_id = parts[2]
                        relevance = int(parts[3])

                        if topic_id not in self.qrels:
                            self.qrels[topic_id] = {}

                        self.qrels[topic_id][doc_id] = relevance
                        count += 1
        except Exception as e:
            print(f"[TRECDataset] Error parsing {file_path}: {e}")

        return count

    def load_corpus_jsonl(self, jsonl_path: Optional[str] = None) -> Dict[str, TRECDocument]:
        """
        Load corpus from JSONL format.

        Expected format: {"id": "...", "contents": "...", "title": "..."}
        """
        path = Path(jsonl_path) if jsonl_path else self.corpus_path

        if not path or not path.exists():
            print(f"[TRECDataset] Corpus path not found: {path}")
            return {}

        try:
            with open(path, 'r', encoding='utf-8') as f:
                for line in f:
                    doc = json.loads(line.strip())
                    doc_id = doc.get('id', doc.get('docid', ''))
                    text = doc.get('contents', doc.get('text', ''))
                    title = doc.get('title', '')

                    if doc_id:
                        self.documents[doc_id] = TRECDocument(
                            doc_id=doc_id,
                            text=text,
                            title=title
                        )

            self.stats["docs_loaded"] = len(self.documents)
            print(f"[TRECDataset] Loaded {len(self.documents)} documents")

        except Exception as e:
            print(f"[TRECDataset] Error loading corpus: {e}")

        return self.documents

    def get_relevant_docs(self, topic_id: str) -> Set[str]:
        """Get set of relevant document IDs for a topic."""
        if topic_id not in self.qrels:
            return set()

        return {
            doc_id for doc_id, rel in self.qrels[topic_id].items()
            if rel > 0
        }

    def get_topic_queries(self, query_type: str = "short") -> Dict[str, str]:
        """
        Get dictionary of topic_id -> query text.

        Args:
            query_type: "short" (title only) or "long" (title + description)
        """
        if query_type == "short":
            return {tid: t.short_query for tid, t in self.topics.items()}
        else:
            return {tid: t.long_query for tid, t in self.topics.items()}

    @staticmethod
    def format_trec_run(
        results: List[Tuple[str, str, float, int]],  # (topic_id, doc_id, score, rank)
        run_tag: str
    ) -> str:
        """
        Format results as TREC run file.

        Output format: topic_id Q0 doc_id rank score run_tag
        """
        lines = []
        for topic_id, doc_id, score, rank in results:
            lines.append(f"{topic_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}")
        return "\n".join(lines)

    @staticmethod
    def save_trec_run(
        results: List[Tuple[str, str, float, int]],
        run_tag: str,
        output_path: str
    ):
        """Save results to TREC run file."""
        run_content = TRECDataset.format_trec_run(results, run_tag)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(run_content)
        print(f"[TRECDataset] Saved run file: {output_path}")

    def get_statistics(self) -> Dict[str, int]:
        """Get dataset statistics."""
        return {
            "topics": len(self.topics),
            "qrels_topics": len(self.qrels),
            "total_qrels": sum(len(q) for q in self.qrels.values()),
            "documents": len(self.documents)
        }


# --- Sample Topics for Testing (AP88-90 subset) ---

SAMPLE_TOPICS = {
    "51": TRECTopic(
        topic_id="51",
        title="Airbus Subsidies",
        description="How much government money has been used to support Airbus aircraft manufacturing?",
        narrative="A relevant document will contain information on subsidies or other financial support from government sources to Airbus."
    ),
    "52": TRECTopic(
        topic_id="52",
        title="Japanese Auto Sales",
        description="How have Japanese automobile sales fared in the U.S.?",
        narrative="A relevant document will report on sales figures, trends, or market share of Japanese automobile manufacturers in the United States."
    ),
    "53": TRECTopic(
        topic_id="53",
        title="Leveraged Buyouts",
        description="What are the effects of leveraged buyouts on companies and industries?",
        narrative="Relevant documents discuss the impact of LBOs on corporate structure, employment, or industry dynamics."
    ),
    "54": TRECTopic(
        topic_id="54",
        title="Satellite Launches",
        description="What are the commercial applications of satellite launches?",
        narrative="A relevant document will discuss commercial satellite launches and their business applications."
    ),
    "55": TRECTopic(
        topic_id="55",
        title="Insider Trading",
        description="What individuals or companies have been accused or convicted of insider trading?",
        narrative="A relevant document will identify specific cases of insider trading allegations or convictions."
    ),
}


def create_sample_dataset() -> TRECDataset:
    """Create a sample dataset for testing."""
    dataset = TRECDataset()
    dataset.topics = SAMPLE_TOPICS.copy()

    # Add sample qrels
    dataset.qrels = {
        "51": {"AP880212-0001": 1, "AP880215-0003": 1, "AP880301-0010": 0},
        "52": {"AP890102-0020": 1, "AP890115-0045": 1},
        "53": {"AP880325-0100": 1},
    }

    return dataset


# --- Testing ---

if __name__ == "__main__":
    print("=" * 60)
    print("SysCRED TREC Dataset - Test Suite")
    print("=" * 60)

    # Create sample dataset
    dataset = create_sample_dataset()

    print(f"\n1. Sample Topics: {len(dataset.topics)}")
    for tid, topic in list(dataset.topics.items())[:3]:
        print(f"   {tid}: {topic.title}")
        print(f"      Short: {topic.short_query}")
        print(f"      Long: {topic.long_query[:80]}...")

    print(f"\n2. Sample Qrels:")
    for tid, docs in dataset.qrels.items():
        print(f"   Topic {tid}: {len(docs)} judgments")

    print(f"\n3. Query dictionaries:")
    short_queries = dataset.get_topic_queries("short")
    long_queries = dataset.get_topic_queries("long")
    print(f"   Short queries: {len(short_queries)}")
    print(f"   Long queries: {len(long_queries)}")

    print(f"\n4. Relevant docs for topic 51:")
    relevant = dataset.get_relevant_docs("51")
    print(f"   {relevant}")

    print(f"\n5. Statistics:")
    stats = dataset.get_statistics()
    for key, value in stats.items():
        print(f"   {key}: {value}")

    print("\n" + "=" * 60)
    print("Tests complete!")
    print("=" * 60)
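Note: a minimal usage sketch for the module above. The topics and queries come from SAMPLE_TOPICS; the doc IDs and scores in fake_results are illustrative placeholders, not real AP88-90 retrieval output.

# Illustrative usage of TRECDataset (placeholder results, run from the repo root)
from syscred.trec_dataset import create_sample_dataset, TRECDataset

dataset = create_sample_dataset()
queries = dataset.get_topic_queries(query_type="short")   # {"51": "Airbus Subsidies", ...}

fake_results = [("51", "AP880212-0001", 12.5, 1), ("51", "AP880215-0003", 9.1, 2)]
print(TRECDataset.format_trec_run(fake_results, run_tag="syscred_bm25"))
# 51 Q0 AP880212-0001 1 12.500000 syscred_bm25
# 51 Q0 AP880215-0003 2 9.100000 syscred_bm25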
syscred/trec_retriever.py
ADDED
@@ -0,0 +1,446 @@
# -*- coding: utf-8 -*-
"""
TREC Retriever Module - SysCRED
================================
Information Retrieval component based on TREC AP88-90 methodology.

This module bridges the classic IR evaluation framework (TREC)
with the neuro-symbolic credibility verification system.

Features:
- BM25, TF-IDF, QLD scoring
- Pyserini/Lucene integration (optional)
- Evidence retrieval for fact-checking
- PRF (Pseudo-Relevance Feedback) query expansion

Based on: TREC_AP88-90_5juin2025.py
(c) Dominique S. Loyer - PhD Thesis Prototype
Citation Key: loyerEvaluationModelesRecherche2025
"""

import os
import json
import time
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass, field
from pathlib import Path

from syscred.ir_engine import IREngine, SearchResult, SearchResponse


@dataclass
class Evidence:
    """
    A piece of evidence retrieved for fact-checking.

    Represents a document or passage that can support or refute a claim.
    """
    doc_id: str
    text: str
    score: float
    rank: int
    source: str = ""
    retrieval_model: str = "bm25"
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        return {
            "doc_id": self.doc_id,
            "text": self.text[:500] + "..." if len(self.text) > 500 else self.text,
            "score": round(self.score, 4),
            "rank": self.rank,
            "source": self.source,
            "model": self.retrieval_model
        }


@dataclass
class RetrievalResult:
    """Complete result from evidence retrieval."""
    query: str
    evidences: List[Evidence]
    total_retrieved: int
    search_time_ms: float
    model_used: str
    expanded_query: Optional[str] = None


class TRECRetriever:
    """
    TREC-style retriever for evidence gathering in fact-checking.

    This class wraps the IREngine to provide a fact-checking oriented
    interface for retrieving evidence documents.

    Usage:
        retriever = TRECRetriever(index_path="/path/to/lucene/index")
        result = retriever.retrieve_evidence("Climate change is caused by humans", k=10)
        for evidence in result.evidences:
            print(f"{evidence.rank}. [{evidence.score:.4f}] {evidence.text[:100]}...")
    """

    # Retrieval configuration
    DEFAULT_K = 10
    DEFAULT_MODEL = "bm25"

    # BM25 parameters (optimized on AP88-90)
    BM25_K1 = 0.9
    BM25_B = 0.4

    def __init__(
        self,
        index_path: Optional[str] = None,
        corpus_path: Optional[str] = None,
        use_stemming: bool = True,
        enable_prf: bool = True,
        prf_top_docs: int = 3,
        prf_expansion_terms: int = 10
    ):
        """
        Initialize the TREC retriever.

        Args:
            index_path: Path to Lucene/Pyserini index (optional)
            corpus_path: Path to JSONL corpus for in-memory search
            use_stemming: Whether to apply Porter stemming
            enable_prf: Enable Pseudo-Relevance Feedback
            prf_top_docs: Number of top docs for PRF
            prf_expansion_terms: Number of expansion terms from PRF
        """
        self.index_path = index_path
        self.corpus_path = corpus_path
        self.enable_prf = enable_prf
        self.prf_top_docs = prf_top_docs
        self.prf_expansion_terms = prf_expansion_terms

        # Initialize IR engine
        self.ir_engine = IREngine(
            index_path=index_path,
            use_stemming=use_stemming
        )

        # In-memory corpus (for lightweight mode)
        self.corpus: Dict[str, Dict[str, str]] = {}
        if corpus_path and os.path.exists(corpus_path):
            self._load_corpus(corpus_path)

        # Statistics
        self.stats = {
            "queries_processed": 0,
            "total_search_time_ms": 0,
            "avg_results_per_query": 0
        }

        print(f"[TRECRetriever] Initialized with index={index_path}, stemming={use_stemming}")

    def _load_corpus(self, corpus_path: str):
        """Load JSONL corpus into memory for lightweight search."""
        print(f"[TRECRetriever] Loading corpus from {corpus_path}...")
        try:
            with open(corpus_path, 'r', encoding='utf-8') as f:
                for line in f:
                    doc = json.loads(line.strip())
                    self.corpus[doc['id']] = {
                        'text': doc.get('contents', doc.get('text', '')),
                        'title': doc.get('title', '')
                    }
            print(f"[TRECRetriever] Loaded {len(self.corpus)} documents")
        except Exception as e:
            print(f"[TRECRetriever] Failed to load corpus: {e}")

    def retrieve_evidence(
        self,
        claim: str,
        k: int = None,
        model: str = None,
        use_prf: bool = None
    ) -> RetrievalResult:
        """
        Retrieve evidence documents for a given claim.

        This is the main method for fact-checking integration.

        Args:
            claim: The claim or statement to verify
            k: Number of evidence documents to retrieve
            model: Retrieval model ('bm25', 'qld', 'tfidf')
            use_prf: Override PRF setting for this query

        Returns:
            RetrievalResult with list of Evidence objects
        """
        start_time = time.time()

        k = k or self.DEFAULT_K
        model = model or self.DEFAULT_MODEL
        use_prf = use_prf if use_prf is not None else self.enable_prf

        # Preprocess the claim
        processed_claim = self.ir_engine.preprocess(claim)

        # Try Pyserini first, fall back to in-memory
        if self.ir_engine.searcher:
            response = self._search_pyserini(processed_claim, model, k)
        else:
            response = self._search_in_memory(processed_claim, k)

        # Apply PRF if enabled
        expanded_query = None
        if use_prf and len(response.results) >= self.prf_top_docs:
            expanded_query = self._apply_prf(claim, response.results[:self.prf_top_docs])
            if expanded_query != claim:
                # Re-search with expanded query
                processed_expanded = self.ir_engine.preprocess(expanded_query)
                if self.ir_engine.searcher:
                    response = self._search_pyserini(processed_expanded, model, k)
                else:
                    response = self._search_in_memory(processed_expanded, k)

        # Convert to Evidence objects
        evidences = []
        for result in response.results:
            doc_text = self._get_document_text(result.doc_id)
            evidences.append(Evidence(
                doc_id=result.doc_id,
                text=doc_text,
                score=result.score,
                rank=result.rank,
                source="TREC-AP88-90" if "AP" in result.doc_id else "Unknown",
                retrieval_model=model
            ))

        search_time = (time.time() - start_time) * 1000

        # Update statistics
        self.stats["queries_processed"] += 1
        self.stats["total_search_time_ms"] += search_time

        return RetrievalResult(
            query=claim,
            evidences=evidences,
            total_retrieved=len(evidences),
            search_time_ms=search_time,
            model_used=model,
            expanded_query=expanded_query
        )

    def _search_pyserini(self, query: str, model: str, k: int) -> SearchResponse:
        """Search using Pyserini/Lucene."""
        return self.ir_engine.search_pyserini(
            query=query,
            model=model,
            k=k
        )

    def _search_in_memory(self, query: str, k: int) -> SearchResponse:
        """
        Lightweight in-memory BM25 search.

        Used when Pyserini is not available.
        """
        start_time = time.time()

        if not self.corpus:
            return SearchResponse(
                query_id="Q1",
                query_text=query,
                results=[],
                model="bm25_memory",
                total_hits=0,
                search_time_ms=0
            )

        query_terms = query.split()

        # Calculate document frequencies
        doc_freq = {}
        for term in query_terms:
            doc_freq[term] = sum(
                1 for doc in self.corpus.values()
                if term in self.ir_engine.preprocess(doc['text'])
            )

        # Calculate average document length
        total_length = sum(
            len(self.ir_engine.preprocess(doc['text']).split())
            for doc in self.corpus.values()
        )
        avg_doc_length = total_length / len(self.corpus) if self.corpus else 1

        # Score all documents
        scores = []
        for doc_id, doc in self.corpus.items():
            doc_text = self.ir_engine.preprocess(doc['text'])
            doc_terms = doc_text.split()

            score = self.ir_engine.calculate_bm25_score(
                query_terms=query_terms,
                doc_terms=doc_terms,
                doc_length=len(doc_terms),
                avg_doc_length=avg_doc_length,
                doc_freq=doc_freq,
                corpus_size=len(self.corpus)
            )

            if score > 0:
                scores.append((doc_id, score))

        # Sort and take top k
        scores.sort(key=lambda x: x[1], reverse=True)
        top_k = scores[:k]

        results = [
            SearchResult(doc_id=doc_id, score=score, rank=i+1)
            for i, (doc_id, score) in enumerate(top_k)
        ]

        return SearchResponse(
            query_id="Q1",
            query_text=query,
            results=results,
            model="bm25_memory",
            total_hits=len(results),
            search_time_ms=(time.time() - start_time) * 1000
        )

    def _apply_prf(self, original_query: str, top_results: List[SearchResult]) -> str:
        """Apply Pseudo-Relevance Feedback."""
        top_docs_texts = [
            self._get_document_text(r.doc_id)
            for r in top_results
        ]

        return self.ir_engine.pseudo_relevance_feedback(
            query=original_query,
            top_docs_texts=top_docs_texts,
            num_expansion_terms=self.prf_expansion_terms
        )

    def _get_document_text(self, doc_id: str) -> str:
        """Get document text from corpus or index."""
        if doc_id in self.corpus:
            return self.corpus[doc_id]['text']

        # Try Pyserini doc lookup
        if self.ir_engine.searcher:
            try:
                doc = self.ir_engine.searcher.doc(doc_id)
                if doc:
                    return doc.raw()
            except:
                pass

        return f"[Document {doc_id} text not available]"

    def batch_retrieve(
        self,
        claims: List[str],
        k: int = None,
        model: str = None
    ) -> List[RetrievalResult]:
        """
        Retrieve evidence for multiple claims.

        Useful for benchmark evaluation.
        """
        results = []
        for claim in claims:
            result = self.retrieve_evidence(claim, k=k, model=model)
            results.append(result)
        return results

    def get_statistics(self) -> Dict[str, Any]:
        """Get retrieval statistics."""
        avg_time = 0
        if self.stats["queries_processed"] > 0:
            avg_time = self.stats["total_search_time_ms"] / self.stats["queries_processed"]

        return {
            "queries_processed": self.stats["queries_processed"],
            "total_search_time_ms": round(self.stats["total_search_time_ms"], 2),
            "avg_search_time_ms": round(avg_time, 2),
            "corpus_size": len(self.corpus),
            "has_pyserini_index": self.ir_engine.searcher is not None
        }


# --- Integration with VerificationSystem ---

def create_retriever_for_syscred(
    config: Optional[Any] = None
) -> TRECRetriever:
    """
    Factory function to create a TRECRetriever for SysCRED integration.

    Uses configuration from syscred.config if available.
    """
    index_path = None
    corpus_path = None

    if config:
        index_path = getattr(config, 'TREC_INDEX_PATH', None)
        corpus_path = getattr(config, 'TREC_CORPUS_PATH', None)

    # Try default paths
    default_corpus = Path(__file__).parent.parent / "benchmarks" / "ap_corpus.jsonl"
    if default_corpus.exists():
        corpus_path = str(default_corpus)

    return TRECRetriever(
        index_path=index_path,
        corpus_path=corpus_path,
        use_stemming=True,
        enable_prf=True
    )


# --- Testing ---

if __name__ == "__main__":
    print("=" * 60)
    print("SysCRED TREC Retriever - Test Suite")
    print("=" * 60)

    # Initialize without index (in-memory mode)
    retriever = TRECRetriever(use_stemming=True, enable_prf=False)

    # Add some test documents to corpus
    retriever.corpus = {
        "DOC001": {"text": "Climate change is primarily caused by human activities, particularly the burning of fossil fuels.", "title": "Climate Science"},
        "DOC002": {"text": "The Earth's temperature has risen significantly over the past century due to greenhouse gas emissions.", "title": "Global Warming"},
        "DOC003": {"text": "Natural climate variations have occurred throughout Earth's history.", "title": "Climate History"},
        "DOC004": {"text": "Renewable energy sources like solar and wind can help reduce carbon emissions.", "title": "Green Energy"},
        "DOC005": {"text": "Scientific consensus supports anthropogenic climate change theory.", "title": "IPCC Report"},
    }

    print("\n1. Testing evidence retrieval...")
    result = retriever.retrieve_evidence(
        claim="Climate change is caused by human activities",
        k=3
    )

    print(f"\n   Query: {result.query}")
    print(f"   Model: {result.model_used}")
    print(f"   Search time: {result.search_time_ms:.2f} ms")
    print(f"   Results found: {result.total_retrieved}")

    for evidence in result.evidences:
        print(f"\n   Rank {evidence.rank} [{evidence.score:.4f}]:")
        print(f"   {evidence.text[:100]}...")

    print("\n2. Testing batch retrieval...")
    claims = [
        "Climate change is real",
        "Renewable energy reduces emissions"
    ]
    batch_results = retriever.batch_retrieve(claims, k=2)
    print(f"   Processed {len(batch_results)} claims")

    print("\n3. Statistics:")
    stats = retriever.get_statistics()
    for key, value in stats.items():
        print(f"   {key}: {value}")

    print("\n" + "=" * 60)
    print("Tests complete!")
    print("=" * 60)
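Note: a minimal sketch of the in-memory mode demonstrated in the __main__ block above. It assumes syscred.ir_engine.IREngine is importable and that no Lucene/Pyserini index is configured, so retrieval falls back to the built-in BM25 scoring; the two documents are illustrative.

# Illustrative use of the lightweight (no-index) retrieval path
from syscred.trec_retriever import TRECRetriever

retriever = TRECRetriever(use_stemming=True, enable_prf=False)  # no Lucene index
retriever.corpus = {
    "DOC001": {"text": "Climate change is primarily caused by human activities.", "title": "Climate Science"},
    "DOC002": {"text": "Renewable energy sources can help reduce carbon emissions.", "title": "Green Energy"},
}
result = retriever.retrieve_evidence("Climate change is caused by human activities", k=2)
for ev in result.evidences:
    print(ev.rank, round(ev.score, 4), ev.doc_id)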
syscred/verification_system.py
ADDED
@@ -0,0 +1,926 @@
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Verification System Module - SysCRED v2.0
|
| 4 |
+
==========================================
|
| 5 |
+
Main credibility verification system with real API integration.
|
| 6 |
+
Refactored from sys-cred-Python-27avril2025.py
|
| 7 |
+
|
| 8 |
+
(c) Dominique S. Loyer - PhD Thesis Prototype
|
| 9 |
+
Citation Key: loyerModelingHybridSystem2025
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import re
|
| 13 |
+
import json
|
| 14 |
+
import datetime
|
| 15 |
+
from typing import Optional, Dict, Any, List
|
| 16 |
+
from urllib.parse import urlparse
|
| 17 |
+
|
| 18 |
+
# Transformers and ML
|
| 19 |
+
try:
|
| 20 |
+
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
|
| 21 |
+
import numpy as np
|
| 22 |
+
import torch
|
| 23 |
+
from lime.lime_text import LimeTextExplainer
|
| 24 |
+
HAS_ML = True
|
| 25 |
+
except ImportError:
|
| 26 |
+
HAS_ML = False
|
| 27 |
+
print("Warning: ML libraries not fully installed. Run: pip install transformers torch lime numpy")
|
| 28 |
+
|
| 29 |
+
try:
|
| 30 |
+
from sentence_transformers import SentenceTransformer, util
|
| 31 |
+
HAS_SBERT = True
|
| 32 |
+
except ImportError:
|
| 33 |
+
HAS_SBERT = False
|
| 34 |
+
print("Warning: sentence-transformers not installed. Semantic coherence will use heuristics.")
|
| 35 |
+
|
| 36 |
+
# Local imports
|
| 37 |
+
from syscred.api_clients import ExternalAPIClients, WebContent, ExternalData
|
| 38 |
+
from syscred.ontology_manager import OntologyManager
|
| 39 |
+
from syscred.seo_analyzer import SEOAnalyzer
|
| 40 |
+
from syscred.graph_rag import GraphRAG # [NEW] GraphRAG
|
| 41 |
+
from syscred.trec_retriever import TRECRetriever, Evidence, RetrievalResult # [NEW] TREC Integration
|
| 42 |
+
from syscred import config
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class CredibilityVerificationSystem:
|
| 46 |
+
"""
|
| 47 |
+
Système neuro-symbolique de vérification de crédibilité.
|
| 48 |
+
|
| 49 |
+
Combine:
|
| 50 |
+
- Analyse basée sur des règles (symbolique, transparent)
|
| 51 |
+
- Analyse NLP/IA (apprentissage automatique)
|
| 52 |
+
- Ontologie OWL pour la traçabilité
|
| 53 |
+
- APIs externes pour les données réelles
|
| 54 |
+
"""
|
| 55 |
+
|
| 56 |
+
def __init__(
|
| 57 |
+
self,
|
| 58 |
+
google_api_key: Optional[str] = None,
|
| 59 |
+
ontology_base_path: Optional[str] = None,
|
| 60 |
+
ontology_data_path: Optional[str] = None,
|
| 61 |
+
load_ml_models: bool = True
|
| 62 |
+
):
|
| 63 |
+
"""
|
| 64 |
+
Initialize the credibility verification system.
|
| 65 |
+
|
| 66 |
+
Args:
|
| 67 |
+
google_api_key: API key for Google Fact Check (optional)
|
| 68 |
+
ontology_base_path: Path to base ontology TTL file
|
| 69 |
+
ontology_data_path: Path to store accumulated data
|
| 70 |
+
load_ml_models: Whether to load ML models (disable for testing)
|
| 71 |
+
"""
|
| 72 |
+
print("[SysCRED] Initializing Credibility Verification System v2.0...")
|
| 73 |
+
|
| 74 |
+
# Initialize API clients
|
| 75 |
+
self.api_clients = ExternalAPIClients(google_api_key=google_api_key)
|
| 76 |
+
print("[SysCRED] API clients initialized")
|
| 77 |
+
|
| 78 |
+
# Initialize ontology manager
|
| 79 |
+
self.ontology_manager = None
|
| 80 |
+
if ontology_base_path or ontology_data_path:
|
| 81 |
+
try:
|
| 82 |
+
self.ontology_manager = OntologyManager(
|
| 83 |
+
base_ontology_path=ontology_base_path,
|
| 84 |
+
data_path=ontology_data_path
|
| 85 |
+
)
|
| 86 |
+
self.graph_rag = GraphRAG(self.ontology_manager) # [NEW] Init GraphRAG
|
| 87 |
+
print("[SysCRED] Ontology manager & GraphRAG initialized")
|
| 88 |
+
except Exception as e:
|
| 89 |
+
print(f"[SysCRED] Ontology manager disabled: {e}")
|
| 90 |
+
self.graph_rag = None
|
| 91 |
+
else:
|
| 92 |
+
self.graph_rag = None
|
| 93 |
+
|
| 94 |
+
# [NEW] Initialize TREC Retriever for evidence gathering
|
| 95 |
+
self.trec_retriever = None
|
| 96 |
+
try:
|
| 97 |
+
self.trec_retriever = TRECRetriever(
|
| 98 |
+
index_path=config.Config.TREC_INDEX_PATH,
|
| 99 |
+
corpus_path=config.Config.TREC_CORPUS_PATH,
|
| 100 |
+
use_stemming=True,
|
| 101 |
+
enable_prf=config.Config.ENABLE_PRF,
|
| 102 |
+
prf_top_docs=config.Config.PRF_TOP_DOCS,
|
| 103 |
+
prf_expansion_terms=config.Config.PRF_EXPANSION_TERMS
|
| 104 |
+
)
|
| 105 |
+
print("[SysCRED] TREC Retriever initialized for evidence gathering")
|
| 106 |
+
except Exception as e:
|
| 107 |
+
print(f"[SysCRED] TREC Retriever disabled: {e}")
|
| 108 |
+
|
| 109 |
+
# Initialize ML models
|
| 110 |
+
self.sentiment_pipeline = None
|
| 111 |
+
self.ner_pipeline = None
|
| 112 |
+
self.bias_tokenizer = None
|
| 113 |
+
self.bias_model = None
|
| 114 |
+
self.coherence_model = None
|
| 115 |
+
self.explainer = None
|
| 116 |
+
|
| 117 |
+
if load_ml_models and HAS_ML:
|
| 118 |
+
self._load_ml_models()
|
| 119 |
+
|
| 120 |
+
# Weights for score calculation (configurable)
|
| 121 |
+
# Weights for score calculation (Loaded from Config)
|
| 122 |
+
self.weights = config.Config.SCORE_WEIGHTS
|
| 123 |
+
print(f"[SysCRED] Using weights: {self.weights}")
|
| 124 |
+
|
| 125 |
+
print("[SysCRED] System ready!")
|
| 126 |
+
|
| 127 |
+
def _load_ml_models(self):
|
| 128 |
+
"""Load ML models for NLP analysis."""
|
| 129 |
+
print("[SysCRED] Loading ML models (this may take a moment)...")
|
| 130 |
+
|
| 131 |
+
try:
|
| 132 |
+
# Sentiment analysis
|
| 133 |
+
self.sentiment_pipeline = pipeline(
|
| 134 |
+
"sentiment-analysis",
|
| 135 |
+
model="distilbert-base-uncased-finetuned-sst-2-english"
|
| 136 |
+
)
|
| 137 |
+
print("[SysCRED] ✓ Sentiment model loaded")
|
| 138 |
+
except Exception as e:
|
| 139 |
+
print(f"[SysCRED] ✗ Sentiment model failed: {e}")
|
| 140 |
+
|
| 141 |
+
try:
|
| 142 |
+
# NER pipeline
|
| 143 |
+
self.ner_pipeline = pipeline("ner", grouped_entities=True)
|
| 144 |
+
print("[SysCRED] ✓ NER model loaded")
|
| 145 |
+
except Exception as e:
|
| 146 |
+
print(f"[SysCRED] ✗ NER model failed: {e}")
|
| 147 |
+
|
| 148 |
+
try:
|
| 149 |
+
# Bias detection - Specialized model
|
| 150 |
+
# Using 'd4data/bias-detection-model' or fallback to generic
|
| 151 |
+
bias_model_name = "d4data/bias-detection-model"
|
| 152 |
+
self.bias_tokenizer = AutoTokenizer.from_pretrained(bias_model_name)
|
| 153 |
+
self.bias_model = AutoModelForSequenceClassification.from_pretrained(bias_model_name)
|
| 154 |
+
print("[SysCRED] ✓ Bias model loaded (d4data)")
|
| 155 |
+
except Exception as e:
|
| 156 |
+
print(f"[SysCRED] ✗ Bias model failed: {e}. Using heuristics.")
|
| 157 |
+
|
| 158 |
+
try:
|
| 159 |
+
# Semantic Coherence
|
| 160 |
+
if HAS_SBERT:
|
| 161 |
+
self.coherence_model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 162 |
+
print("[SysCRED] ✓ Coherence model loaded (SBERT)")
|
| 163 |
+
except Exception as e:
|
| 164 |
+
print(f"[SysCRED] ✗ Coherence model failed: {e}")
|
| 165 |
+
|
| 166 |
+
try:
|
| 167 |
+
# LIME explainer
|
| 168 |
+
self.explainer = LimeTextExplainer(class_names=['NEGATIVE', 'POSITIVE'])
|
| 169 |
+
print("[SysCRED] ✓ LIME explainer loaded")
|
| 170 |
+
except Exception as e:
|
| 171 |
+
print(f"[SysCRED] ✗ LIME explainer failed: {e}")
|
| 172 |
+
|
| 173 |
+
def is_url(self, text: str) -> bool:
|
| 174 |
+
"""Check if a string is a valid URL."""
|
| 175 |
+
try:
|
| 176 |
+
result = urlparse(text)
|
| 177 |
+
return all([result.scheme, result.netloc])
|
| 178 |
+
except ValueError:
|
| 179 |
+
return False
|
| 180 |
+
|
| 181 |
+
def preprocess(self, text: str) -> str:
|
| 182 |
+
"""Clean and normalize text for analysis."""
|
| 183 |
+
if not isinstance(text, str):
|
| 184 |
+
return ""
|
| 185 |
+
|
| 186 |
+
# Remove URLs
|
| 187 |
+
text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
|
| 188 |
+
# Normalize whitespace
|
| 189 |
+
text = re.sub(r'\s+', ' ', text)
|
| 190 |
+
# Keep basic punctuation
|
| 191 |
+
text = re.sub(r'[^\w\s\.\?,!]', '', text)
|
| 192 |
+
|
| 193 |
+
return text.lower().strip()
|
| 194 |
+
|
| 195 |
+
def rule_based_analysis(self, text: str, external_data: ExternalData) -> Dict[str, Any]:
|
| 196 |
+
"""
|
| 197 |
+
Perform rule-based analysis using symbolic reasoning.
|
| 198 |
+
|
| 199 |
+
Args:
|
| 200 |
+
text: Preprocessed text to analyze
|
| 201 |
+
external_data: Data from external APIs
|
| 202 |
+
|
| 203 |
+
Returns:
|
| 204 |
+
Dictionary with rule-based analysis results
|
| 205 |
+
"""
|
| 206 |
+
results = {
|
| 207 |
+
'linguistic_markers': {},
|
| 208 |
+
'source_analysis': {},
|
| 209 |
+
'timeliness_flags': [],
|
| 210 |
+
'fact_checking': []
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
# 1. Linguistic markers
|
| 214 |
+
sensational_words = [
|
| 215 |
+
'shocking', 'revealed', 'conspiracy', 'amazing', 'secret',
|
| 216 |
+
'breakthrough', 'miracle', 'unbelievable', 'exclusive', 'urgent'
|
| 217 |
+
]
|
| 218 |
+
certainty_words = [
|
| 219 |
+
'verified', 'authentic', 'credible', 'proven', 'fact',
|
| 220 |
+
'confirmed', 'official', 'legitimate', 'established'
|
| 221 |
+
]
|
| 222 |
+
doubt_words = [
|
| 223 |
+
'hoax', 'false', 'fake', 'unproven', 'rumor', 'allegedly',
|
| 224 |
+
'claim', 'debunked', 'misleading', 'disputed'
|
| 225 |
+
]
|
| 226 |
+
|
| 227 |
+
text_lower = text.lower()
|
| 228 |
+
results['linguistic_markers']['sensationalism'] = sum(
|
| 229 |
+
1 for word in sensational_words if word in text_lower
|
| 230 |
+
)
|
| 231 |
+
results['linguistic_markers']['certainty'] = sum(
|
| 232 |
+
1 for word in certainty_words if word in text_lower
|
| 233 |
+
)
|
| 234 |
+
results['linguistic_markers']['doubt'] = sum(
|
| 235 |
+
1 for word in doubt_words if word in text_lower
|
| 236 |
+
)
|
| 237 |
+
|
| 238 |
+
# 2. Source analysis from external data
|
| 239 |
+
results['source_analysis']['reputation'] = external_data.source_reputation
|
| 240 |
+
results['source_analysis']['domain_age_days'] = external_data.domain_age_days
|
| 241 |
+
|
| 242 |
+
if external_data.domain_info:
|
| 243 |
+
results['source_analysis']['registrar'] = external_data.domain_info.registrar
|
| 244 |
+
results['source_analysis']['domain'] = external_data.domain_info.domain
|
| 245 |
+
|
| 246 |
+
# 3. Timeliness flags
|
| 247 |
+
if external_data.domain_age_days is not None:
|
| 248 |
+
if external_data.domain_age_days < 180:
|
| 249 |
+
results['timeliness_flags'].append('Source domain is relatively new (<6 months)')
|
| 250 |
+
elif external_data.domain_age_days < 365:
|
| 251 |
+
results['timeliness_flags'].append('Source domain is less than 1 year old')
|
| 252 |
+
|
| 253 |
+
# 4. Fact checking results
|
| 254 |
+
for fc in external_data.fact_checks:
|
| 255 |
+
results['fact_checking'].append({
|
| 256 |
+
'claim': fc.claim,
|
| 257 |
+
'rating': fc.rating,
|
| 258 |
+
'publisher': fc.publisher,
|
| 259 |
+
'url': fc.url
|
| 260 |
+
})
|
| 261 |
+
|
| 262 |
+
return results
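# Mini example of the linguistic-marker counting above (standalone sketch; the word lists
# are the same ones defined in rule_based_analysis, and the text is already lower-cased):
sample = "shocking secret revealed about the so-called official report"
sensational = ['shocking', 'revealed', 'conspiracy', 'amazing', 'secret',
               'breakthrough', 'miracle', 'unbelievable', 'exclusive', 'urgent']
certainty = ['verified', 'authentic', 'credible', 'proven', 'fact',
             'confirmed', 'official', 'legitimate', 'established']
print(sum(1 for w in sensational if w in sample))  # -> 3 (shocking, secret, revealed)
print(sum(1 for w in certainty if w in sample))    # -> 1 (official)
# Note: plain substring matching, so e.g. "secretary" would also count as "secret".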
|
| 263 |
+
|
| 264 |
+
def nlp_analysis(self, text: str) -> Dict[str, Any]:
|
| 265 |
+
"""
|
| 266 |
+
Perform NLP-based analysis using ML models.
|
| 267 |
+
|
| 268 |
+
Args:
|
| 269 |
+
text: Preprocessed text to analyze
|
| 270 |
+
|
| 271 |
+
Returns:
|
| 272 |
+
Dictionary with NLP analysis results
|
| 273 |
+
"""
|
| 274 |
+
results = {
|
| 275 |
+
'sentiment': None,
|
| 276 |
+
'sentiment_explanation': None,
|
| 277 |
+
'bias_analysis': {'score': None, 'label': 'Unavailable'},
|
| 278 |
+
'named_entities': [],
|
| 279 |
+
'coherence_score': None
|
| 280 |
+
}
|
| 281 |
+
|
| 282 |
+
if not text:
|
| 283 |
+
results['sentiment'] = {'label': 'Neutral', 'score': 0.5}
|
| 284 |
+
return results
|
| 285 |
+
|
| 286 |
+
# 1. Sentiment analysis with LIME explanation
|
| 287 |
+
if self.sentiment_pipeline:
|
| 288 |
+
try:
|
| 289 |
+
main_pred = self.sentiment_pipeline(text[:512])[0]
|
| 290 |
+
results['sentiment'] = main_pred
|
| 291 |
+
|
| 292 |
+
if self.explainer:
|
| 293 |
+
def predict_proba(texts):
|
| 294 |
+
if isinstance(texts, str):
|
| 295 |
+
texts = [texts]
|
| 296 |
+
predictions = self.sentiment_pipeline(list(texts))
|
| 297 |
+
probs = []
|
| 298 |
+
for pred in predictions:
|
| 299 |
+
if pred['label'] == 'POSITIVE':
|
| 300 |
+
probs.append([1 - pred['score'], pred['score']])
|
| 301 |
+
else:
|
| 302 |
+
probs.append([pred['score'], 1 - pred['score']])
|
| 303 |
+
return np.array(probs)
|
| 304 |
+
|
| 305 |
+
explanation = self.explainer.explain_instance(
|
| 306 |
+
text[:512], predict_proba, num_features=6
|
| 307 |
+
)
|
| 308 |
+
results['sentiment_explanation'] = explanation.as_list()
|
| 309 |
+
except Exception as e:
|
| 310 |
+
print(f"[NLP] Sentiment error: {e}")
|
| 311 |
+
results['sentiment'] = {'label': 'Error', 'score': 0.0}
|
| 312 |
+
|
| 313 |
+
# 2. Bias analysis
|
| 314 |
+
results['bias_analysis'] = self._analyze_bias(text)
|
| 315 |
+
|
| 316 |
+
# 3. Named Entity Recognition
|
| 317 |
+
if self.ner_pipeline:
|
| 318 |
+
try:
|
| 319 |
+
entities = self.ner_pipeline(text[:512])
|
| 320 |
+
results['named_entities'] = entities
|
| 321 |
+
except Exception as e:
|
| 322 |
+
print(f"[NLP] NER error: {e}")
|
| 323 |
+
|
| 324 |
+
# 4. Semantic Coherence
|
| 325 |
+
results['coherence_score'] = self._calculate_coherence(text)
|
| 326 |
+
|
| 327 |
+
return results
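# Shape expected by the LIME wrapper above (standalone sketch): LimeTextExplainer needs an
# (n_samples, n_classes) probability array ordered like its class_names, here
# [NEGATIVE, POSITIVE].
import numpy as np

def to_probs(pred):  # mirrors predict_proba(): a POSITIVE prediction keeps its score in column 1
    if pred['label'] == 'POSITIVE':
        return [1 - pred['score'], pred['score']]
    return [pred['score'], 1 - pred['score']]

print(np.array([to_probs({'label': 'POSITIVE', 'score': 0.98}),
                to_probs({'label': 'NEGATIVE', 'score': 0.75})]))
# -> approximately [[0.02 0.98]
#                   [0.75 0.25]]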
|
| 328 |
+
|
| 329 |
+
def _analyze_bias(self, text: str) -> Dict[str, Any]:
|
| 330 |
+
"""Analyze text for bias using ML or heuristics."""
|
| 331 |
+
# Method 1: ML Model
|
| 332 |
+
if self.bias_model and self.bias_tokenizer:
|
| 333 |
+
try:
|
| 334 |
+
inputs = self.bias_tokenizer(
|
| 335 |
+
text[:512], return_tensors="pt",
|
| 336 |
+
truncation=True, max_length=512, padding=True
|
| 337 |
+
)
|
| 338 |
+
with torch.no_grad():
|
| 339 |
+
logits = self.bias_model(**inputs).logits
|
| 340 |
+
probs = torch.softmax(logits, dim=1)[0]
|
| 341 |
+
# Label mapping depends on model, usually [Non-biased, Biased]
|
| 342 |
+
bias_score = probs[1].item()
|
| 343 |
+
|
| 344 |
+
label = " biased" if bias_score > 0.5 else "Non-biased"
|
| 345 |
+
return {'score': bias_score, 'label': label, 'method': 'ML (d4data)'}
|
| 346 |
+
except Exception as e:
|
| 347 |
+
print(f"[NLP] ML Bias error: {e}")
|
| 348 |
+
|
| 349 |
+
# Method 2: Heuristics
|
| 350 |
+
biased_words = [
|
| 351 |
+
'radical', 'extremist', 'disgraceful', 'shameful', 'corrupt',
|
| 352 |
+
'insane', 'idiot', 'disaster', 'propaganda', 'dictator',
|
| 353 |
+
'puppet', 'regime', 'tyrant', 'treason', 'traitor'
|
| 354 |
+
]
|
| 355 |
+
text_lower = text.lower()
|
| 356 |
+
count = sum(1 for w in biased_words if w in text_lower)
|
| 357 |
+
score = min(1.0, count * 0.15)
|
| 358 |
+
label = "Potentially Biased" if score > 0.3 else "Neutral"
|
| 359 |
+
return {'score': score, 'label': label, 'method': 'Heuristic'}
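# Worked example of the heuristic fallback above (standalone sketch; demo list is a subset
# of biased_words, and the 0.15-per-hit rule is the same):
demo_words = ['radical', 'extremist', 'corrupt', 'regime', 'tyrant']
demo_text = "a corrupt regime run by a tyrant"
hits = sum(1 for w in demo_words if w in demo_text)   # -> 3
demo_score = min(1.0, hits * 0.15)                    # -> 0.45
print(round(demo_score, 2), "Potentially Biased" if demo_score > 0.3 else "Neutral")
# -> 0.45 Potentially Biased   (with only two hits the score would be 0.30,
#    which is not > 0.3, so the label would fall back to "Neutral")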
|
| 360 |
+
|
| 361 |
+
def _calculate_coherence(self, text: str) -> float:
|
| 362 |
+
"""Calculate semantic coherence score."""
|
| 363 |
+
sentences = re.split(r'[.!?]+', text)
|
| 364 |
+
sentences = [s.strip() for s in sentences if len(s.split()) > 3]
|
| 365 |
+
|
| 366 |
+
if len(sentences) < 2:
|
| 367 |
+
return 0.7 # Default to neutral/good for short text, not perfect 1.0
|
| 368 |
+
|
| 369 |
+
# Method 1: SBERT Semantic Similarity
|
| 370 |
+
if self.coherence_model and HAS_SBERT:
|
| 371 |
+
try:
|
| 372 |
+
embeddings = self.coherence_model.encode(sentences[:10]) # Limit to 10
|
| 373 |
+
sims = []
|
| 374 |
+
for i in range(len(embeddings) - 1):
|
| 375 |
+
sim = util.pytorch_cos_sim(embeddings[i], embeddings[i+1])
|
| 376 |
+
sims.append(sim.item())
|
| 377 |
+
return sum(sims) / len(sims) if sims else 0.5
|
| 378 |
+
except Exception as e:
|
| 379 |
+
print(f"[NLP] SBERT error: {e}")
|
| 380 |
+
|
| 381 |
+
# Method 2: Heuristic (Sentence Length Variance & Repetition)
|
| 382 |
+
lengths = [len(s.split()) for s in sentences]
|
| 383 |
+
avg_len = sum(lengths) / len(lengths)
|
| 384 |
+
variance = sum((l - avg_len) ** 2 for l in lengths) / len(lengths)
|
| 385 |
+
|
| 386 |
+
# High variance in sentence length usually signals choppier, less consistent writing
|
| 387 |
+
score = 0.8
|
| 388 |
+
if variance > 100: score -= 0.2
|
| 389 |
+
if avg_len < 5: score -= 0.2
|
| 390 |
+
|
| 391 |
+
return max(0.0, score)
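# Worked example of the length-variance fallback above (standalone sketch):
lengths = [4, 30]                                           # two sentences, 4 and 30 words
avg = sum(lengths) / len(lengths)                           # 17.0
var = sum((l - avg) ** 2 for l in lengths) / len(lengths)   # 169.0
demo = 0.8
if var > 100:
    demo -= 0.2   # very uneven sentence lengths -> choppier text
if avg < 5:
    demo -= 0.2   # very short sentences on average
print(round(demo, 2))  # -> 0.6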
|
| 392 |
+
|
| 393 |
+
def calculate_overall_score(
|
| 394 |
+
self,
|
| 395 |
+
rule_results: Dict,
|
| 396 |
+
nlp_results: Dict
|
| 397 |
+
) -> float:
|
| 398 |
+
"""
|
| 399 |
+
Calculate the overall credibility score from the configurable weighted metrics.
|
| 400 |
+
"""
|
| 401 |
+
score = 0.5 # Start neutral
|
| 402 |
+
adjustments = 0.0
|
| 403 |
+
total_weight_used = 0.0
|
| 404 |
+
|
| 405 |
+
# 1. Source Reputation (25%)
|
| 406 |
+
w_rep = self.weights.get('source_reputation', 0.25)
|
| 407 |
+
reputation = rule_results['source_analysis'].get('reputation', 'Unknown')
|
| 408 |
+
if reputation != 'Unknown' and "N/A" not in reputation:
|
| 409 |
+
if reputation == 'High':
|
| 410 |
+
adjustments += w_rep * 1.0 # Full boost
|
| 411 |
+
elif reputation == 'Low':
|
| 412 |
+
adjustments -= w_rep * 1.0 # Full penalty
|
| 413 |
+
elif reputation == 'Medium':
|
| 414 |
+
adjustments += w_rep * 0.2 # Slight boost
|
| 415 |
+
total_weight_used += w_rep
|
| 416 |
+
|
| 417 |
+
# 2. Domain Age (10%)
|
| 418 |
+
w_age = self.weights.get('domain_age', 0.10)
|
| 419 |
+
domain_age = rule_results['source_analysis'].get('domain_age_days')
|
| 420 |
+
if domain_age is not None:
|
| 421 |
+
if domain_age > 730: # > 2 years
|
| 422 |
+
adjustments += w_age
|
| 423 |
+
elif domain_age < 90: # < 3 months
|
| 424 |
+
adjustments -= w_age
|
| 425 |
+
total_weight_used += w_age
|
| 426 |
+
|
| 427 |
+
# 3. Fact Check (20%)
|
| 428 |
+
w_fc = self.weights.get('fact_check', 0.20)
|
| 429 |
+
fact_checks = rule_results.get('fact_checking', [])
|
| 430 |
+
if fact_checks:
|
| 431 |
+
fc_score = 0
|
| 432 |
+
for fc in fact_checks:
|
| 433 |
+
rating = fc.get('rating', '').lower()
|
| 434 |
+
if rating in ['true', 'verified', 'correct']:
|
| 435 |
+
fc_score += 1
|
| 436 |
+
elif rating in ['false', 'fake', 'incorrect']:
|
| 437 |
+
fc_score -= 1
|
| 438 |
+
|
| 439 |
+
# Normalize fc_score (-1 to 1) roughly
|
| 440 |
+
if fc_score > 0: adjustments += w_fc
|
| 441 |
+
elif fc_score < 0: adjustments -= w_fc
|
| 442 |
+
total_weight_used += w_fc
|
| 443 |
+
|
| 444 |
+
# 4. Sentiment Neutrality (15%)
|
| 445 |
+
# Extreme sentiment = lower score
|
| 446 |
+
w_sent = self.weights.get('sentiment_neutrality', 0.15)
|
| 447 |
+
sentiment = nlp_results.get('sentiment', {})
|
| 448 |
+
if sentiment:
|
| 449 |
+
s_score = sentiment.get('score', 0.5)
|
| 450 |
+
# If extremely positive or negative (>0.9), penalize
|
| 451 |
+
if s_score > 0.9:
|
| 452 |
+
adjustments -= w_sent * 0.5 # Penalty for extremism
|
| 453 |
+
else:
|
| 454 |
+
adjustments += w_sent * 0.2 # Slight boost for moderation
|
| 455 |
+
total_weight_used += w_sent
|
| 456 |
+
|
| 457 |
+
# 5. Entity Presence (15%)
|
| 458 |
+
# Presence of Named Entities (PER, ORG, LOC) suggests verifiability
|
| 459 |
+
w_ent = self.weights.get('entity_presence', 0.15)
|
| 460 |
+
entities = nlp_results.get('named_entities', [])
|
| 461 |
+
if len(entities) > 0:
|
| 462 |
+
# More entities = better (capped)
|
| 463 |
+
boost = min(1.0, len(entities) * 0.2)
|
| 464 |
+
adjustments += w_ent * boost
|
| 465 |
+
total_weight_used += w_ent
|
| 466 |
+
|
| 467 |
+
# 6. Text Coherence (15%) (inter-sentence consistency)
|
| 468 |
+
w_coh = self.weights.get('coherence', 0.15)
|
| 469 |
+
coherence = nlp_results.get('coherence_score')
|
| 470 |
+
if coherence is not None:
|
| 471 |
+
# Coherence is usually 0.0 to 1.0
|
| 472 |
+
# Center around 0.5: >0.5 improves, <0.5 penalizes
|
| 473 |
+
adjustments += (coherence - 0.5) * w_coh
|
| 474 |
+
total_weight_used += w_coh
|
| 475 |
+
|
| 476 |
+
# Final calculation
|
| 477 |
+
# Base 0.5 + sum of weighted adjustments
|
| 478 |
+
# Adjustments are in range [-weight, +weight]
|
| 479 |
+
|
| 480 |
+
final_score = 0.5 + adjustments
|
| 481 |
+
|
| 482 |
+
return max(0.0, min(1.0, final_score))
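# Worked example (assumes the weight split quoted in the section comments above,
# i.e. 0.25 / 0.10 / 0.20 / 0.15 / 0.15 / 0.15; actual values come from
# config.Config.SCORE_WEIGHTS):
#   High reputation                 +0.25 * 1.0          = +0.250
#   domain age 1100 days (> 2 yrs)  +0.10                = +0.100
#   no fact-check hits                                   =  0.000
#   sentiment score 0.95 (> 0.9)    -0.15 * 0.5          = -0.075
#   4 named entities                +0.15 * 0.8          = +0.120   (boost = min(1.0, 4 * 0.2))
#   coherence 0.70                  (0.70 - 0.5) * 0.15  = +0.030
print(round(0.5 + 0.25 + 0.10 - 0.075 + 0.12 + 0.03, 3))  # -> 0.925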
|
| 483 |
+
|
| 484 |
+
# --- [NEW] TREC Evidence Retrieval Methods ---
|
| 485 |
+
|
| 486 |
+
def retrieve_evidence(
|
| 487 |
+
self,
|
| 488 |
+
claim: str,
|
| 489 |
+
k: int = 10,
|
| 490 |
+
model: str = "bm25"
|
| 491 |
+
) -> List[Dict[str, Any]]:
|
| 492 |
+
"""
|
| 493 |
+
Retrieve evidence documents for a given claim using TREC methodology.
|
| 494 |
+
|
| 495 |
+
This integrates the classic IR evaluation framework (TREC AP88-90)
|
| 496 |
+
with the neuro-symbolic credibility verification system.
|
| 497 |
+
|
| 498 |
+
Args:
|
| 499 |
+
claim: The claim or statement to verify
|
| 500 |
+
k: Number of evidence documents to retrieve
|
| 501 |
+
model: Retrieval model ('bm25', 'qld', 'tfidf')
|
| 502 |
+
|
| 503 |
+
Returns:
|
| 504 |
+
List of evidence dictionaries with doc_id, text, score, rank
|
| 505 |
+
"""
|
| 506 |
+
if not self.trec_retriever:
|
| 507 |
+
return []
|
| 508 |
+
|
| 509 |
+
try:
|
| 510 |
+
result = self.trec_retriever.retrieve_evidence(
|
| 511 |
+
claim=claim,
|
| 512 |
+
k=k,
|
| 513 |
+
model=model
|
| 514 |
+
)
|
| 515 |
+
|
| 516 |
+
# Convert Evidence objects to dictionaries
|
| 517 |
+
evidences = [e.to_dict() for e in result.evidences]
|
| 518 |
+
|
| 519 |
+
# Add to ontology if available
|
| 520 |
+
if self.ontology_manager:
|
| 521 |
+
for e in result.evidences[:3]: # Top 3 only
|
| 522 |
+
self.ontology_manager.add_evidence(
|
| 523 |
+
evidence_id=e.doc_id,
|
| 524 |
+
source=e.source or "trec_corpus",
|
| 525 |
+
content=e.text[:500],
|
| 526 |
+
score=e.score
|
| 527 |
+
)
|
| 528 |
+
|
| 529 |
+
return evidences
|
| 530 |
+
|
| 531 |
+
except Exception as ex:
|
| 532 |
+
print(f"[SysCRED] Evidence retrieval error: {ex}")
|
| 533 |
+
return []
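# Usage sketch (illustrative; assumes `system` is an initialized
# CredibilityVerificationSystem whose TREC retriever loaded successfully, as in the
# __main__ block at the end of this file, and that each evidence dict exposes the
# doc_id / text / score / rank fields described in the docstring):
evidences = system.retrieve_evidence("The Berlin Wall fell in 1989", k=5, model="bm25")
for ev in evidences:
    print(ev.get("rank"), ev.get("doc_id"), round(ev.get("score", 0.0), 2))
# With no retriever configured, retrieve_evidence() simply returns an empty list.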
|
| 534 |
+
|
| 535 |
+
def verify_with_evidence(
|
| 536 |
+
self,
|
| 537 |
+
claim: str,
|
| 538 |
+
k: int = 5
|
| 539 |
+
) -> Dict[str, Any]:
|
| 540 |
+
"""
|
| 541 |
+
Complete fact-checking pipeline with evidence retrieval.
|
| 542 |
+
|
| 543 |
+
Combines:
|
| 544 |
+
1. TREC-style evidence retrieval
|
| 545 |
+
2. NLP analysis of claim
|
| 546 |
+
3. Evidence-claim comparison
|
| 547 |
+
4. Credibility scoring
|
| 548 |
+
|
| 549 |
+
Args:
|
| 550 |
+
claim: The claim to verify
|
| 551 |
+
k: Number of evidence documents
|
| 552 |
+
|
| 553 |
+
Returns:
|
| 554 |
+
Verification result with evidence, analysis, and score
|
| 555 |
+
"""
|
| 556 |
+
result = {
|
| 557 |
+
'claim': claim,
|
| 558 |
+
'evidences': [],
|
| 559 |
+
'nlp_analysis': {},
|
| 560 |
+
'evidence_support_score': 0.0,
|
| 561 |
+
'verification_verdict': 'UNKNOWN',
|
| 562 |
+
'confidence': 0.0
|
| 563 |
+
}
|
| 564 |
+
|
| 565 |
+
# 1. Retrieve evidence
|
| 566 |
+
evidences = self.retrieve_evidence(claim, k=k)
|
| 567 |
+
result['evidences'] = evidences
|
| 568 |
+
|
| 569 |
+
# 2. NLP analysis of claim
|
| 570 |
+
cleaned_claim = self.preprocess(claim)
|
| 571 |
+
result['nlp_analysis'] = self.nlp_analysis(cleaned_claim)
|
| 572 |
+
|
| 573 |
+
# 3. Calculate evidence support score
|
| 574 |
+
if evidences:
|
| 575 |
+
# Use semantic similarity if SBERT available
|
| 576 |
+
if self.coherence_model:
|
| 577 |
+
try:
|
| 578 |
+
claim_embedding = self.coherence_model.encode(claim)
|
| 579 |
+
evidence_texts = [e.get('text', '') for e in evidences]
|
| 580 |
+
evidence_embeddings = self.coherence_model.encode(evidence_texts)
|
| 581 |
+
|
| 582 |
+
from sentence_transformers import util
|
| 583 |
+
similarities = util.pytorch_cos_sim(claim_embedding, evidence_embeddings)[0]
|
| 584 |
+
avg_similarity = similarities.mean().item()
|
| 585 |
+
max_similarity = similarities.max().item()
|
| 586 |
+
|
| 587 |
+
# Evidence support based on similarity
|
| 588 |
+
result['evidence_support_score'] = round(max_similarity, 4)
|
| 589 |
+
result['average_evidence_similarity'] = round(avg_similarity, 4)
|
| 590 |
+
except Exception as e:
|
| 591 |
+
print(f"[SysCRED] Similarity error: {e}")
|
| 592 |
+
# Fallback: use retrieval scores
|
| 593 |
+
result['evidence_support_score'] = evidences[0].get('score', 0) if evidences else 0
|
| 594 |
+
else:
|
| 595 |
+
# Fallback: use retrieval scores
|
| 596 |
+
result['evidence_support_score'] = evidences[0].get('score', 0) if evidences else 0
|
| 597 |
+
|
| 598 |
+
# 4. Determine verdict
|
| 599 |
+
support_score = result['evidence_support_score']
|
| 600 |
+
if support_score > 0.7:
|
| 601 |
+
result['verification_verdict'] = 'SUPPORTED'
|
| 602 |
+
result['confidence'] = support_score
|
| 603 |
+
elif support_score > 0.5:
|
| 604 |
+
result['verification_verdict'] = 'PARTIALLY_SUPPORTED'
|
| 605 |
+
result['confidence'] = support_score
|
| 606 |
+
elif support_score > 0.3:
|
| 607 |
+
result['verification_verdict'] = 'INSUFFICIENT_EVIDENCE'
|
| 608 |
+
result['confidence'] = 0.5
|
| 609 |
+
else:
|
| 610 |
+
result['verification_verdict'] = 'NOT_SUPPORTED'
|
| 611 |
+
result['confidence'] = 1 - support_score
|
| 612 |
+
|
| 613 |
+
return result
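# The verdict bands used above, as a standalone sketch:
def verdict_for(support: float) -> str:
    # mirrors the thresholds in verify_with_evidence
    if support > 0.7:
        return 'SUPPORTED'
    if support > 0.5:
        return 'PARTIALLY_SUPPORTED'
    if support > 0.3:
        return 'INSUFFICIENT_EVIDENCE'
    return 'NOT_SUPPORTED'

print(verdict_for(0.62))  # -> PARTIALLY_SUPPORTED (confidence would be 0.62)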
|
| 614 |
+
|
| 615 |
+
# --- End TREC Evidence Methods ---
|
| 616 |
+
|
| 617 |
+
def generate_report(
|
| 618 |
+
self,
|
| 619 |
+
input_data: str,
|
| 620 |
+
cleaned_text: str,
|
| 621 |
+
rule_results: Dict,
|
| 622 |
+
nlp_results: Dict,
|
| 623 |
+
external_data: ExternalData,
|
| 624 |
+
overall_score: float,
|
| 625 |
+
web_content: Optional[WebContent] = None,
|
| 626 |
+
graph_context: str = "", # [NEW]
|
| 627 |
+
evidences: Optional[List[Dict[str, Any]]] = None # [NEW] TREC evidences
|
| 628 |
+
) -> Dict[str, Any]:
|
| 629 |
+
"""Generate the final evaluation report."""
|
| 630 |
+
|
| 631 |
+
report = {
|
| 632 |
+
'idRapport': f"report_{int(datetime.datetime.now().timestamp())}",
|
| 633 |
+
'informationEntree': input_data,
|
| 634 |
+
'dateGeneration': datetime.datetime.now().isoformat(),
|
| 635 |
+
'scoreCredibilite': round(overall_score, 2),
|
| 636 |
+
'resumeAnalyse': "",
|
| 637 |
+
'detailsScore': {
|
| 638 |
+
'base': 0.5,
|
| 639 |
+
'weights': self.weights,
|
| 640 |
+
'factors': self._get_score_factors(rule_results, nlp_results)
|
| 641 |
+
},
|
| 642 |
+
'sourcesUtilisees': [],
|
| 643 |
+
'reglesAppliquees': rule_results,
|
| 644 |
+
'analyseNLP': {
|
| 645 |
+
'sentiment': nlp_results.get('sentiment'),
|
| 646 |
+
'bias_analysis': nlp_results.get('bias_analysis'),
|
| 647 |
+
'named_entities_count': len(nlp_results.get('named_entities', [])),
|
| 648 |
+
'coherence_score': nlp_results.get('coherence_score'),
|
| 649 |
+
'sentiment_explanation_preview': (nlp_results.get('sentiment_explanation') or [])[:3]
|
| 650 |
+
},
|
| 651 |
+
# [NEW] TREC Evidence section
|
| 652 |
+
'evidences': evidences or [],
|
| 653 |
+
'metadonnees': {}
|
| 654 |
+
}
|
| 655 |
+
|
| 656 |
+
# Add web content metadata if available
|
| 657 |
+
if web_content:
|
| 658 |
+
if web_content.success:
|
| 659 |
+
report['metadonnees']['page_title'] = web_content.title
|
| 660 |
+
report['metadonnees']['meta_description'] = web_content.meta_description
|
| 661 |
+
report['metadonnees']['links_count'] = len(web_content.links)
|
| 662 |
+
else:
|
| 663 |
+
report['metadonnees']['warning'] = f"Content scrape failed: {web_content.error}"
|
| 664 |
+
|
| 665 |
+
# Generate summary
|
| 666 |
+
summary_parts = []
|
| 667 |
+
|
| 668 |
+
if web_content and not web_content.success:
|
| 669 |
+
summary_parts.append(f"⚠️ ATTENTION: Impossible de lire le texte de la page ({web_content.error}). Analyse basée uniquement sur la réputation du domaine.")
|
| 670 |
+
|
| 671 |
+
if overall_score > 0.75:
|
| 672 |
+
summary_parts.append("L'analyse suggère une crédibilité ÉLEVÉE.")
|
| 673 |
+
elif overall_score > 0.55:
|
| 674 |
+
summary_parts.append("L'analyse suggère une crédibilité MOYENNE à ÉLEVÉE.")
|
| 675 |
+
elif overall_score > 0.45:
|
| 676 |
+
summary_parts.append("L'analyse suggère une crédibilité MOYENNE.")
|
| 677 |
+
elif overall_score > 0.25:
|
| 678 |
+
summary_parts.append("L'analyse suggère une crédibilité FAIBLE à MOYENNE.")
|
| 679 |
+
else:
|
| 680 |
+
summary_parts.append("L'analyse suggère une crédibilité FAIBLE.")
|
| 681 |
+
|
| 682 |
+
if external_data.source_reputation != 'Unknown':
|
| 683 |
+
summary_parts.append(f"Réputation source : {external_data.source_reputation}.")
|
| 684 |
+
|
| 685 |
+
if external_data.domain_age_days:
|
| 686 |
+
years = external_data.domain_age_days / 365
|
| 687 |
+
summary_parts.append(f"Âge du domaine : {years:.1f} ans.")
|
| 688 |
+
|
| 689 |
+
if external_data.fact_checks:
|
| 690 |
+
summary_parts.append(f"{len(external_data.fact_checks)} vérification(s) de faits trouvée(s).")
|
| 691 |
+
|
| 692 |
+
report['resumeAnalyse'] = " ".join(summary_parts)
|
| 693 |
+
|
| 694 |
+
# List sources used
|
| 695 |
+
if self.is_url(input_data):
|
| 696 |
+
report['sourcesUtilisees'].append({
|
| 697 |
+
'type': 'Primary URL',
|
| 698 |
+
'url': input_data
|
| 699 |
+
})
|
| 700 |
+
report['sourcesUtilisees'].append({
|
| 701 |
+
'type': 'WHOIS Lookup',
|
| 702 |
+
'status': 'Success' if (external_data.domain_info and external_data.domain_info.success) else 'Failed/N/A'
|
| 703 |
+
})
|
| 704 |
+
report['sourcesUtilisees'].append({
|
| 705 |
+
'type': 'Fact Check API',
|
| 706 |
+
'results_count': len(external_data.fact_checks)
|
| 707 |
+
})
|
| 708 |
+
# [NEW] Add TREC evidence source
|
| 709 |
+
if evidences:
|
| 710 |
+
report['sourcesUtilisees'].append({
|
| 711 |
+
'type': 'TREC Evidence Retrieval',
|
| 712 |
+
'method': 'BM25/TF-IDF',
|
| 713 |
+
'corpus': 'AP88-90',
|
| 714 |
+
'results_count': len(evidences)
|
| 715 |
+
})
|
| 716 |
+
|
| 717 |
+
return report
|
| 718 |
+
|
| 719 |
+
def _get_score_factors(self, rule_results: Dict, nlp_results: Dict) -> List[Dict]:
|
| 720 |
+
"""Get list of factors that influenced the score (For UI)."""
|
| 721 |
+
factors = []
|
| 722 |
+
|
| 723 |
+
# 1. Reputation
|
| 724 |
+
rep = rule_results['source_analysis'].get('reputation')
|
| 725 |
+
if rep and "N/A" not in rep:
|
| 726 |
+
factors.append({
|
| 727 |
+
'factor': 'Source Reputation',
|
| 728 |
+
'value': rep,
|
| 729 |
+
'weight': f"{int(self.weights.get('source_reputation',0)*100)}%",
|
| 730 |
+
'impact': '+' if rep == 'High' else ('-' if rep == 'Low' else '0')
|
| 731 |
+
})
|
| 732 |
+
|
| 733 |
+
# 2. Fact Checks
|
| 734 |
+
if rule_results.get('fact_checking'):
|
| 735 |
+
factors.append({
|
| 736 |
+
'factor': 'Fact Checks',
|
| 737 |
+
'value': f"{len(rule_results['fact_checking'])} found",
|
| 738 |
+
'weight': f"{int(self.weights.get('fact_check',0)*100)}%",
|
| 739 |
+
'impact': 'Variable'
|
| 740 |
+
})
|
| 741 |
+
|
| 742 |
+
# 3. Entities
|
| 743 |
+
n_ent = len(nlp_results.get('named_entities', []))
|
| 744 |
+
if n_ent > 0:
|
| 745 |
+
factors.append({
|
| 746 |
+
'factor': 'Entity Presence',
|
| 747 |
+
'value': f"{n_ent} entities",
|
| 748 |
+
'weight': f"{int(self.weights.get('entity_presence',0)*100)}%",
|
| 749 |
+
'impact': '+'
|
| 750 |
+
})
|
| 751 |
+
|
| 752 |
+
# 4. Sentiment
|
| 753 |
+
sent = nlp_results.get('sentiment', {})
|
| 754 |
+
if sent:
|
| 755 |
+
factors.append({
|
| 756 |
+
'factor': 'Sentiment Neutrality',
|
| 757 |
+
'value': f"{sent.get('label')} ({sent.get('score',0):.2f})",
|
| 758 |
+
'weight': f"{int(self.weights.get('sentiment_neutrality',0)*100)}%",
|
| 759 |
+
'impact': '-' if sent.get('score', 0) > 0.9 else '0'
|
| 760 |
+
})
|
| 761 |
+
|
| 762 |
+
return factors
|
| 763 |
+
|
| 764 |
+
def verify_information(self, input_data: str) -> Dict[str, Any]:
|
| 765 |
+
"""
|
| 766 |
+
Main pipeline to verify credibility of input data.
|
| 767 |
+
|
| 768 |
+
Args:
|
| 769 |
+
input_data: URL or text to verify
|
| 770 |
+
|
| 771 |
+
Returns:
|
| 772 |
+
Complete evaluation report
|
| 773 |
+
"""
|
| 774 |
+
if not isinstance(input_data, str) or not input_data.strip():
|
| 775 |
+
return {"error": "L'entrée doit être une chaîne non vide."}
|
| 776 |
+
|
| 777 |
+
print(f"\n[SysCRED] === Vérification: {input_data[:100]}... ===")
|
| 778 |
+
|
| 779 |
+
# 1. Determine input type and fetch content
|
| 780 |
+
text_to_analyze = ""
|
| 781 |
+
web_content = None
|
| 782 |
+
is_url = self.is_url(input_data)
|
| 783 |
+
|
| 784 |
+
if is_url:
|
| 785 |
+
print("[SysCRED] Fetching web content...")
|
| 786 |
+
web_content = self.api_clients.fetch_web_content(input_data)
|
| 787 |
+
|
| 788 |
+
if web_content.success:
|
| 789 |
+
text_to_analyze = web_content.text_content
|
| 790 |
+
print(f"[SysCRED] ✓ Content fetched: {len(text_to_analyze)} chars")
|
| 791 |
+
else:
|
| 792 |
+
print(f"[SysCRED] ⚠ Fetch failed: {web_content.error}")
|
| 793 |
+
print("[SysCRED] Proceeding with Domain/Metadata analysis only.")
|
| 794 |
+
text_to_analyze = ""
|
| 795 |
+
# Do not return an error here; proceed with metadata-only analysis
|
| 796 |
+
else:
|
| 797 |
+
text_to_analyze = input_data
|
| 798 |
+
|
| 799 |
+
# 2. Preprocess text
|
| 800 |
+
cleaned_text = self.preprocess(text_to_analyze)
|
| 801 |
+
|
| 802 |
+
# Only error on empty text if it wasn't a failed web fetch
|
| 803 |
+
# If web fetch failed, we proceed with empty text to give metadata analysis
|
| 804 |
+
if not cleaned_text and not (is_url and web_content and not web_content.success):
|
| 805 |
+
return {"error": "Le texte est vide après prétraitement."}
|
| 806 |
+
print(f"[SysCRED] Preprocessed text: {len(cleaned_text)} chars")
|
| 807 |
+
|
| 808 |
+
# Determine best query for Fact Checking
|
| 809 |
+
fact_check_query = input_data
|
| 810 |
+
if text_to_analyze and len(text_to_analyze) > 10:
|
| 811 |
+
# Use start of text if available
|
| 812 |
+
fact_check_query = text_to_analyze[:200]
|
| 813 |
+
elif is_url and web_content and web_content.title:
|
| 814 |
+
# Fallback to page title if text is missing (e.g. 403)
|
| 815 |
+
fact_check_query = web_content.title
|
| 816 |
+
|
| 817 |
+
# 3. Fetch external data
|
| 818 |
+
print(f"[SysCRED] Fetching external data (Query: {fact_check_query[:50]}...)...")
|
| 819 |
+
external_data = self.api_clients.fetch_external_data(input_data, fc_query=fact_check_query)
|
| 820 |
+
|
| 821 |
+
# [FIX] Handle text-only input reputation
|
| 822 |
+
if not is_url:
|
| 823 |
+
external_data.source_reputation = "N/A (User Input)"
|
| 824 |
+
|
| 825 |
+
print(f"[SysCRED] ✓ Reputation: {external_data.source_reputation}, Age: {external_data.domain_age_days} days")
|
| 826 |
+
|
| 827 |
+
# 4. Rule-based analysis
|
| 828 |
+
print("[SysCRED] Running rule-based analysis...")
|
| 829 |
+
rule_results = self.rule_based_analysis(cleaned_text, external_data)
|
| 830 |
+
|
| 831 |
+
# 5. NLP analysis
|
| 832 |
+
print("[SysCRED] Running NLP analysis...")
|
| 833 |
+
nlp_results = self.nlp_analysis(cleaned_text)
|
| 834 |
+
|
| 835 |
+
# 6. Calculate score
|
| 836 |
+
overall_score = self.calculate_overall_score(rule_results, nlp_results)
|
| 837 |
+
print(f"[SysCRED] ✓ Credibility score: {overall_score:.2f}")
|
| 838 |
+
|
| 839 |
+
# 7. [NEW] GraphRAG Context Retrieval
|
| 840 |
+
graph_context = ""
|
| 841 |
+
similar_uris = []
|
| 842 |
+
if self.graph_rag and 'source_analysis' in rule_results:
|
| 843 |
+
domain = rule_results['source_analysis'].get('domain', '')
|
| 844 |
+
# Pass keywords for text search if domain is empty or generic
|
| 845 |
+
keywords = []
|
| 846 |
+
if not domain and cleaned_text:
|
| 847 |
+
keywords = cleaned_text.split()[:5] # Simple keyword extraction
|
| 848 |
+
|
| 849 |
+
context = self.graph_rag.get_context(domain, keywords=keywords)
|
| 850 |
+
graph_context = context.get('full_text', '')
|
| 851 |
+
similar_uris = context.get('similar_uris', [])
|
| 852 |
+
|
| 853 |
+
if "Graph Memory" in graph_context:
|
| 854 |
+
print(f"[SysCRED] GraphRAG Context Found: {graph_context.splitlines()[1]}")
|
| 855 |
+
|
| 856 |
+
# 8. Generate report (Updated to include context)
|
| 857 |
+
report = self.generate_report(
|
| 858 |
+
input_data, cleaned_text, rule_results,
|
| 859 |
+
nlp_results, external_data, overall_score, web_content,
|
| 860 |
+
graph_context=graph_context
|
| 861 |
+
)
|
| 862 |
+
|
| 863 |
+
# Add similar URIs to report for ontology linking
|
| 864 |
+
if similar_uris:
|
| 865 |
+
report['similar_claims_uris'] = similar_uris
|
| 866 |
+
|
| 867 |
+
# 9. Save to ontology
|
| 868 |
+
if self.ontology_manager:
|
| 869 |
+
try:
|
| 870 |
+
report_uri = self.ontology_manager.add_evaluation_triplets(report)
|
| 871 |
+
report['ontology_uri'] = report_uri
|
| 872 |
+
self.ontology_manager.save_data()
|
| 873 |
+
except Exception as e:
|
| 874 |
+
print(f"[SysCRED] Ontology save failed: {e}")
|
| 875 |
+
|
| 876 |
+
print("[SysCRED] === Vérification terminée ===\n")
|
| 877 |
+
return report
|
| 878 |
+
|
| 879 |
+
|
| 880 |
+
# --- Main / Testing ---
|
| 881 |
+
if __name__ == "__main__":
|
| 882 |
+
import json
|
| 883 |
+
|
| 884 |
+
print("=" * 60)
|
| 885 |
+
print("SysCRED v2.0 - Système de Vérification de Crédibilité")
|
| 886 |
+
print("(c) Dominique S. Loyer - PhD Thesis Prototype")
|
| 887 |
+
print("=" * 60 + "\n")
|
| 888 |
+
|
| 889 |
+
# Initialize system (without ML models for quick testing)
|
| 890 |
+
system = CredibilityVerificationSystem(
|
| 891 |
+
ontology_base_path="/Users/bk280625/documents041025/MonCode/sysCRED_onto26avrtil.ttl",
|
| 892 |
+
ontology_data_path="/Users/bk280625/documents041025/MonCode/ontology/sysCRED_data.ttl",
|
| 893 |
+
load_ml_models=False # Set to True for full analysis
|
| 894 |
+
)
|
| 895 |
+
|
| 896 |
+
# Test cases
|
| 897 |
+
test_cases = {
|
| 898 |
+
"Test URL Crédible": "https://www.lemonde.fr",
|
| 899 |
+
"Test URL Inconnu": "https://example.com/article",
|
| 900 |
+
"Test Texte Simple": "This is a verified and authentic news report.",
|
| 901 |
+
"Test Texte Suspect": "Shocking conspiracy revealed! They don't want you to know this secret!",
|
| 902 |
+
}
|
| 903 |
+
|
| 904 |
+
results = {}
|
| 905 |
+
for name, test_input in test_cases.items():
|
| 906 |
+
print(f"\n{'='*50}")
|
| 907 |
+
print(f"Test: {name}")
|
| 908 |
+
print('='*50)
|
| 909 |
+
|
| 910 |
+
result = system.verify_information(test_input)
|
| 911 |
+
results[name] = result
|
| 912 |
+
|
| 913 |
+
if 'error' not in result:
|
| 914 |
+
print(f"\nScore: {result['scoreCredibilite']}")
|
| 915 |
+
print(f"Résumé: {result['resumeAnalyse']}")
|
| 916 |
+
else:
|
| 917 |
+
print(f"Erreur: {result['error']}")
|
| 918 |
+
|
| 919 |
+
print("\n" + "="*60)
|
| 920 |
+
print("Résumé des tests:")
|
| 921 |
+
print("="*60)
|
| 922 |
+
for name, result in results.items():
|
| 923 |
+
if 'error' not in result:
|
| 924 |
+
print(f" {name}: Score = {result['scoreCredibilite']:.2f}")
|
| 925 |
+
else:
|
| 926 |
+
print(f" {name}: ERREUR")
|