# syscred_duplicate
#
# Sleeping
#
# File size: 9,829 Bytes

# -*- coding: utf-8 -*-
"""
SysCRED Configuration
=====================
Configuration centralisée pour le système de vérification de crédibilité.

Usage:
    from syscred.config import Config
    
    # Accéder aux paramètres
    config = Config()
    port = config.PORT
    
    # Ou avec variables d'environnement
    # export SYSCRED_GOOGLE_API_KEY=your_key
    # export SYSCRED_PORT=8080

(c) Dominique S. Loyer - PhD Thesis Prototype
"""

import os
from pathlib import Path
from typing import Dict, Optional, Type

from dotenv import load_dotenv

# Load environment variables from the project-root .env file.
# Path of this module: .../systemFactChecking/syscred/config.py
# The root .env sits at .../systemFactChecking/.env (one level above syscred/).
current_path = Path(__file__).resolve()
env_path = current_path.parent.parent / '.env'

if not env_path.exists():
    print(f"[Config] WARNING: .env not found at {env_path}")
    # Fall back to the working directory, then its parent; the first hit wins.
    for alt in (Path.cwd() / '.env', Path.cwd().parent / '.env'):
        if not alt.exists():
            continue
        env_path = alt
        break

# load_dotenv is called even when no candidate exists; env_path then still
# points at the original (missing) location.
load_dotenv(dotenv_path=env_path)
print(f"[Config] Loading .env from {env_path}")
# Report only whether the key is present, never its value.
_key_status = 'Yes' if os.environ.get('SYSCRED_GOOGLE_API_KEY') else 'No'
print(f"[Config] SYSCRED_GOOGLE_API_KEY loaded: {_key_status}")



class Config:
    """
    Centralized configuration for SysCRED.

    Every setting is a class attribute evaluated once, when this class body
    runs at import time. Settings read through ``os.getenv`` can be
    overridden with environment variables prefixed by ``SYSCRED_``; changing
    such a variable after import does not change these attributes.

    Boolean settings are enabled only by the literal string "true"
    (case-insensitive); any other value, including "1" or "yes", disables
    them. Numeric settings are parsed with ``int``/``float`` and therefore
    raise ``ValueError`` at import time when the variable is not a number.
    """

    # === Paths ===
    # BASE_DIR = project root (parent of syscred/)
    BASE_DIR = Path(__file__).parent.parent
    # NOTE(review): "avrtil" looks like a typo for "avril" -- confirm against
    # the real file name on disk before renaming anything.
    ONTOLOGY_BASE_PATH = BASE_DIR / "ontology" / "sysCRED_onto26avrtil.ttl"
    ONTOLOGY_DATA_PATH = BASE_DIR / "ontology" / "sysCRED_data.ttl"

    # === Flask server ===
    # NOTE(review): with no overrides these defaults are 0.0.0.0 with DEBUG
    # on -- presumably handed to the Flask server; confirm this combination
    # is never the effective setting of a publicly reachable deployment.
    HOST = os.getenv("SYSCRED_HOST", "0.0.0.0")
    PORT = int(os.getenv("SYSCRED_PORT", "5000"))
    DEBUG = os.getenv("SYSCRED_DEBUG", "true").lower() == "true"

    # === API keys ===
    GOOGLE_FACT_CHECK_API_KEY = os.getenv("SYSCRED_GOOGLE_API_KEY")
    # SYSCRED_DATABASE_URL wins; the plain DATABASE_URL is the fallback.
    DATABASE_URL = os.getenv("SYSCRED_DATABASE_URL", os.getenv("DATABASE_URL"))  # Standardized env var

    # === ML models ===
    # Support both SYSCRED_LOAD_ML and SYSCRED_LOAD_ML_MODELS (for Render);
    # SYSCRED_LOAD_ML_MODELS takes precedence when both are set.
    LOAD_ML_MODELS = os.getenv("SYSCRED_LOAD_ML_MODELS", os.getenv("SYSCRED_LOAD_ML", "true")).lower() == "true"
    SENTIMENT_MODEL = "distilbert-base-uncased-finetuned-sst-2-english"
    NER_MODEL = "dbmdz/bert-large-cased-finetuned-conll03-english"

    # === Timeouts ===
    WEB_FETCH_TIMEOUT = int(os.getenv("SYSCRED_TIMEOUT", "10"))

    # === TREC IR Configuration (NEW - Feb 2026) ===
    # All four paths are None unless the matching variable is set.
    TREC_INDEX_PATH = os.getenv("SYSCRED_TREC_INDEX", None)  # Lucene/Pyserini index
    TREC_CORPUS_PATH = os.getenv("SYSCRED_TREC_CORPUS", None)  # JSONL corpus
    TREC_TOPICS_PATH = os.getenv("SYSCRED_TREC_TOPICS", None)  # Topics directory
    TREC_QRELS_PATH = os.getenv("SYSCRED_TREC_QRELS", None)  # Qrels directory

    # BM25 Parameters (optimized on AP88-90)
    BM25_K1 = float(os.getenv("SYSCRED_BM25_K1", "0.9"))
    BM25_B = float(os.getenv("SYSCRED_BM25_B", "0.4"))

    # PRF (Pseudo-Relevance Feedback) settings
    ENABLE_PRF = os.getenv("SYSCRED_ENABLE_PRF", "true").lower() == "true"
    PRF_TOP_DOCS = int(os.getenv("SYSCRED_PRF_TOP_DOCS", "3"))
    PRF_EXPANSION_TERMS = int(os.getenv("SYSCRED_PRF_TERMS", "10"))

    # === Score weighting ===
    # Note: Weights should sum to 1.0 for proper normalization
    SCORE_WEIGHTS: Dict[str, float] = {
        'source_reputation': 0.22,    # Was 0.25, reduced for graph_context
        'domain_age': 0.08,           # Was 0.10
        'sentiment_neutrality': 0.13, # Was 0.15
        'entity_presence': 0.13,      # Was 0.15
        'coherence': 0.12,            # Was 0.15
        'fact_check': 0.17,           # Was 0.20
        'graph_context': 0.15         # NEW - Historical knowledge from GraphRAG
    }

    # === Credibility thresholds ===
    CREDIBILITY_THRESHOLDS: Dict[str, float] = {
        'HIGH': 0.7,
        'MEDIUM': 0.4,
        'LOW': 0.0
    }

    # === Reputation database ===
    # Sources can be extended in code or loaded from an external file
    # (see load_external_reputations).
    SOURCE_REPUTATIONS: Dict[str, str] = {
        # === HIGH CREDIBILITY ===
        # International media
        'lemonde.fr': 'High',
        'nytimes.com': 'High',
        'reuters.com': 'High',
        'bbc.com': 'High',
        'bbc.co.uk': 'High',
        'theguardian.com': 'High',
        'apnews.com': 'High',
        'afp.com': 'High',
        'france24.com': 'High',

        # Canadian media
        'cbc.ca': 'High',
        'radio-canada.ca': 'High',
        'lapresse.ca': 'High',
        'ledevoir.com': 'High',
        'theglobeandmail.com': 'High',

        # Academic sources
        'nature.com': 'High',
        'sciencedirect.com': 'High',
        'scholar.google.com': 'High',
        'pubmed.ncbi.nlm.nih.gov': 'High',
        'jstor.org': 'High',
        'springer.com': 'High',
        'ieee.org': 'High',
        'acm.org': 'High',
        'arxiv.org': 'High',

        # Fact-checkers
        'factcheck.org': 'High',
        'snopes.com': 'High',
        'politifact.com': 'High',
        'fullfact.org': 'High',
        'checknews.fr': 'High',

        # Institutions
        'who.int': 'High',
        'un.org': 'High',
        'europa.eu': 'High',
        'canada.ca': 'High',
        'gouv.fr': 'High',
        'gouv.qc.ca': 'High',

        # === MEDIUM CREDIBILITY ===
        'wikipedia.org': 'Medium',
        'medium.com': 'Medium',
        'huffpost.com': 'Medium',
        'buzzfeed.com': 'Medium',
        'vice.com': 'Medium',
        'slate.com': 'Medium',
        'theconversation.com': 'Medium',

        # === LOW CREDIBILITY ===
        'infowars.com': 'Low',
        'naturalnews.com': 'Low',
        'breitbart.com': 'Low',
        'dailystormer.su': 'Low',
        'beforeitsnews.com': 'Low',
        'worldtruth.tv': 'Low',
        'yournewswire.com': 'Low',
    }

    # === Misinformation patterns ===
    MISINFORMATION_KEYWORDS = [
        'conspiracy', 'hoax', 'fake news', 'miracle cure',
        "they don't want you to know", 'mainstream media lies',
        'deep state', 'plandemic', 'wake up sheeple',
        'big pharma cover-up', 'government conspiracy',
        'censored truth', 'what they hide'
    ]

    @classmethod
    def load_external_reputations(cls, filepath: str) -> None:
        """
        Merge additional source reputations from a JSON file.

        Loading is best-effort: any failure (missing file, invalid JSON,
        top-level value that is not an object) is reported on stdout and
        leaves ``SOURCE_REPUTATIONS`` untouched; nothing is raised.

        On success the mapping is updated in place, so the new entries are
        visible through ``Config`` and every subclass that has not replaced
        the attribute. Existing domains are overwritten by the file.

        Args:
            filepath: Path to a UTF-8 JSON file with the format:
                      {"domain.com": "High", "autre.com": "Low"}
        """
        import json
        try:
            # JSON text is UTF-8; do not depend on the platform's locale
            # default encoding.
            with open(filepath, 'r', encoding='utf-8') as f:
                external_reps = json.load(f)
            # dict.update() also accepts sequences of pairs, which would turn
            # a JSON array such as ["ab"] into the bogus entry {'a': 'b'}.
            if not isinstance(external_reps, dict):
                raise ValueError(
                    "expected a JSON object mapping domain to reputation, "
                    f"got {type(external_reps).__name__}"
                )
            cls.SOURCE_REPUTATIONS.update(external_reps)
            print(f"[Config] Loaded {len(external_reps)} external reputations")
        except Exception as e:
            # Deliberately broad: an optional data file must never prevent
            # the application from starting.
            print(f"[Config] Could not load external reputations: {e}")

    @classmethod
    def update_weights(cls, new_weights: Dict[str, float]) -> None:
        """
        Merge new score weights and renormalize so that they sum to 1.

        The result is bound to ``cls`` as a new dictionary. The previous
        dictionary is never modified, so calling this on a subclass leaves
        the weights of ``Config`` and of sibling classes unchanged.

        Args:
            new_weights: Weights to add or replace, keyed by criterion name.

        Raises:
            ValueError: If the merged weights do not sum to a positive
                number. The current weights are left as they were.
        """
        # Work on a copy: cls.SCORE_WEIGHTS may be the dict inherited from a
        # parent class, and updating it in place would leak un-normalized
        # values into that parent.
        merged = dict(cls.SCORE_WEIGHTS)
        merged.update(new_weights)
        total = sum(merged.values())
        if total <= 0:
            raise ValueError(
                f"score weights must sum to a positive number, got {total}"
            )
        # Normalize so that the sum equals 1
        cls.SCORE_WEIGHTS = {k: v / total for k, v in merged.items()}
        print(f"[Config] Updated weights: {cls.SCORE_WEIGHTS}")

    @classmethod
    def to_dict(cls) -> Dict[str, object]:
        """
        Export the current configuration as a dictionary.

        The API key itself is never included, only whether one is set.

        Returns:
            A new dictionary; ``score_weights`` is a copy, so modifying the
            result does not affect the configuration.
        """
        return {
            'host': cls.HOST,
            'port': cls.PORT,
            'debug': cls.DEBUG,
            # Truthiness rather than "is not None": a variable that is set
            # but empty does not count as a configured key.
            'google_api_configured': bool(cls.GOOGLE_FACT_CHECK_API_KEY),
            'ml_models_enabled': cls.LOAD_ML_MODELS,
            'score_weights': dict(cls.SCORE_WEIGHTS),
            'known_sources_count': len(cls.SOURCE_REPUTATIONS),
            'ontology_base': str(cls.ONTOLOGY_BASE_PATH),
            'ontology_data': str(cls.ONTOLOGY_DATA_PATH),
        }

    @classmethod
    def print_config(cls) -> None:
        """Print the current configuration (as given by ``to_dict``) to stdout."""
        print("=" * 50)
        print("SysCRED Configuration")
        print("=" * 50)
        for key, value in cls.to_dict().items():
            print(f"  {key}: {value}")
        print("=" * 50)


# === Configuration par environnement ===

class DevelopmentConfig(Config):
    """
    Configuration for local development.

    Debug mode is always on, regardless of SYSCRED_DEBUG.
    """
    DEBUG = True
    # LOAD_ML_MODELS is intentionally inherited from Config instead of being
    # hard-coded to True: the inherited value still defaults to True, but it
    # honours SYSCRED_LOAD_ML_MODELS / SYSCRED_LOAD_ML, which a hard-coded
    # override would silently ignore.


class ProductionConfig(Config):
    """
    Configuration for production.

    Debug mode is always off and the host is fixed to 0.0.0.0, regardless of
    SYSCRED_DEBUG and SYSCRED_HOST.
    """
    DEBUG = False
    HOST = "0.0.0.0"
    # LOAD_ML_MODELS is intentionally inherited from Config instead of being
    # hard-coded to True: the inherited value still defaults to True, but it
    # honours SYSCRED_LOAD_ML_MODELS / SYSCRED_LOAD_ML, so a deployment can
    # switch the models off through its environment.
    HOST = "0.0.0.0"


class TestingConfig(Config):
    """Configuration for tests: debug on, no ML models, 5-unit fetch timeout."""
    WEB_FETCH_TIMEOUT = 5
    LOAD_ML_MODELS = False  # Faster for tests
    DEBUG = True


# Automatic configuration selection
def get_config() -> Type[Config]:
    """
    Return the configuration class matching the current environment.

    The environment name is read from ``SYSCRED_ENV`` (development,
    production or testing). It is compared case-insensitively and
    surrounding whitespace is ignored. When the variable is unset or empty,
    the development configuration is used. An unrecognized name also falls
    back to the development configuration, and a warning is printed so that
    a typo cannot silently enable debug mode.

    Returns:
        The selected ``Config`` subclass itself (a class, not an instance).
    """
    requested = os.getenv("SYSCRED_ENV", "").strip().lower()
    env = requested or "development"

    configs = {
        'development': DevelopmentConfig,
        'production': ProductionConfig,
        'testing': TestingConfig,
    }

    selected = configs.get(env)
    if selected is None:
        print(
            f"[Config] WARNING: unknown SYSCRED_ENV {env!r}; "
            "falling back to development configuration"
        )
        selected = DevelopmentConfig
    return selected


# Default configuration, selected once at import time. Note that
# get_config() returns a Config subclass, so this name is bound to a class
# object, not to an instance.
config = get_config()


if __name__ == "__main__":
    # Manual smoke test: dump the active configuration, then show the first
    # ten entries of the reputation table.
    config.print_config()

    print("\n=== Source Reputations Sample ===")
    sample = list(config.SOURCE_REPUTATIONS.items())[:10]
    for domain, rep in sample:
        print(f"  {domain}: {rep}")