File size: 9,829 Bytes
e70050b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea9303b
 
e70050b
ea9303b
e70050b
ea9303b
 
 
 
 
 
 
 
 
 
e70050b
 
 
 
 
 
 
 
 
 
 
 
 
ea9303b
e70050b
ea9303b
e70050b
 
 
 
 
 
 
 
 
ea9303b
e70050b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e97fc5
e70050b
8e97fc5
 
 
 
 
 
 
e70050b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
# -*- coding: utf-8 -*-
"""
SysCRED Configuration
=====================
Configuration centralisée pour le système de vérification de crédibilité.

Usage:
    from syscred.config import Config
    
    # Accéder aux paramètres
    config = Config()
    port = config.PORT
    
    # Ou avec variables d'environnement
    # export SYSCRED_GOOGLE_API_KEY=your_key
    # export SYSCRED_PORT=8080

(c) Dominique S. Loyer - PhD Thesis Prototype
"""

import os
from pathlib import Path
from typing import Dict, Optional, Type

from dotenv import load_dotenv

# Load environment variables from the project-root .env file.
# This module lives at .../systemFactChecking/syscred/config.py, so the
# root .env is expected one level above the syscred/ directory.
current_path = Path(__file__).resolve()
env_path = current_path.parent.parent / '.env'

if not env_path.exists():
    print(f"[Config] WARNING: .env not found at {env_path}")
    # Fall back to the first candidate that exists (current working directory,
    # then its parent); env_path is left unchanged when neither is present.
    _fallback_candidates = (Path.cwd() / '.env', Path.cwd().parent / '.env')
    env_path = next(
        (candidate for candidate in _fallback_candidates if candidate.exists()),
        env_path,
    )

# load_dotenv is called even when no file was found; the two messages below
# are printed unconditionally.
load_dotenv(dotenv_path=env_path)
print(f"[Config] Loading .env from {env_path}")
print(f"[Config] SYSCRED_GOOGLE_API_KEY loaded: {'Yes' if os.environ.get('SYSCRED_GOOGLE_API_KEY') else 'No'}")



class Config:
    """
    Centralized configuration for SysCRED.

    Every setting is a class attribute evaluated once, when this class body
    runs at import time. Attributes built with ``os.getenv`` can be overridden
    through ``SYSCRED_``-prefixed environment variables; the others are
    hard-coded defaults.
    """

    # === Paths ===
    # BASE_DIR = project root (parent of syscred/)
    BASE_DIR = Path(__file__).parent.parent
    # NOTE(review): "26avrtil" looks like a typo for "26avril" - confirm the
    # actual file name on disk before touching this literal.
    ONTOLOGY_BASE_PATH = BASE_DIR / "ontology" / "sysCRED_onto26avrtil.ttl"
    ONTOLOGY_DATA_PATH = BASE_DIR / "ontology" / "sysCRED_data.ttl"

    # === Flask server ===
    # NOTE(review): by default the server binds to all interfaces with debug
    # enabled; make sure deployments set SYSCRED_DEBUG=false or use
    # ProductionConfig.
    HOST = os.getenv("SYSCRED_HOST", "0.0.0.0")
    PORT = int(os.getenv("SYSCRED_PORT", "5000"))
    DEBUG = os.getenv("SYSCRED_DEBUG", "true").lower() == "true"

    # === API keys ===
    GOOGLE_FACT_CHECK_API_KEY = os.getenv("SYSCRED_GOOGLE_API_KEY")
    # SYSCRED_DATABASE_URL wins; the unprefixed DATABASE_URL is the fallback.
    DATABASE_URL = os.getenv("SYSCRED_DATABASE_URL", os.getenv("DATABASE_URL"))  # Standardized env var

    # === ML models ===
    # Support both SYSCRED_LOAD_ML and SYSCRED_LOAD_ML_MODELS (for Render);
    # SYSCRED_LOAD_ML_MODELS takes precedence when both are set.
    LOAD_ML_MODELS = os.getenv("SYSCRED_LOAD_ML_MODELS", os.getenv("SYSCRED_LOAD_ML", "true")).lower() == "true"
    SENTIMENT_MODEL = "distilbert-base-uncased-finetuned-sst-2-english"
    NER_MODEL = "dbmdz/bert-large-cased-finetuned-conll03-english"

    # === Timeouts ===
    WEB_FETCH_TIMEOUT = int(os.getenv("SYSCRED_TIMEOUT", "10"))

    # === TREC IR Configuration (NEW - Feb 2026) ===
    TREC_INDEX_PATH = os.getenv("SYSCRED_TREC_INDEX", None)  # Lucene/Pyserini index
    TREC_CORPUS_PATH = os.getenv("SYSCRED_TREC_CORPUS", None)  # JSONL corpus
    TREC_TOPICS_PATH = os.getenv("SYSCRED_TREC_TOPICS", None)  # Topics directory
    TREC_QRELS_PATH = os.getenv("SYSCRED_TREC_QRELS", None)  # Qrels directory

    # BM25 Parameters (optimized on AP88-90)
    BM25_K1 = float(os.getenv("SYSCRED_BM25_K1", "0.9"))
    BM25_B = float(os.getenv("SYSCRED_BM25_B", "0.4"))

    # PRF (Pseudo-Relevance Feedback) settings
    ENABLE_PRF = os.getenv("SYSCRED_ENABLE_PRF", "true").lower() == "true"
    PRF_TOP_DOCS = int(os.getenv("SYSCRED_PRF_TOP_DOCS", "3"))
    PRF_EXPANSION_TERMS = int(os.getenv("SYSCRED_PRF_TERMS", "10"))

    # === Score weighting ===
    # Note: Weights should sum to 1.0 for proper normalization
    # (the defaults below do).
    SCORE_WEIGHTS = {
        'source_reputation': 0.22,    # Was 0.25, reduced for graph_context
        'domain_age': 0.08,           # Was 0.10
        'sentiment_neutrality': 0.13, # Was 0.15
        'entity_presence': 0.13,      # Was 0.15
        'coherence': 0.12,            # Was 0.15
        'fact_check': 0.17,           # Was 0.20
        'graph_context': 0.15         # NEW - Historical knowledge from GraphRAG
    }

    # === Credibility thresholds ===
    CREDIBILITY_THRESHOLDS = {
        'HIGH': 0.7,
        'MEDIUM': 0.4,
        'LOW': 0.0
    }

    # === Reputation database ===
    # Sources can be extended or loaded from an external file
    # (see load_external_reputations).
    SOURCE_REPUTATIONS: Dict[str, str] = {
        # === HIGH CREDIBILITY ===
        # International media
        'lemonde.fr': 'High',
        'nytimes.com': 'High',
        'reuters.com': 'High',
        'bbc.com': 'High',
        'bbc.co.uk': 'High',
        'theguardian.com': 'High',
        'apnews.com': 'High',
        'afp.com': 'High',
        'france24.com': 'High',

        # Canadian media
        'cbc.ca': 'High',
        'radio-canada.ca': 'High',
        'lapresse.ca': 'High',
        'ledevoir.com': 'High',
        'theglobeandmail.com': 'High',

        # Academic sources
        'nature.com': 'High',
        'sciencedirect.com': 'High',
        'scholar.google.com': 'High',
        'pubmed.ncbi.nlm.nih.gov': 'High',
        'jstor.org': 'High',
        'springer.com': 'High',
        'ieee.org': 'High',
        'acm.org': 'High',
        'arxiv.org': 'High',

        # Fact-checkers
        'factcheck.org': 'High',
        'snopes.com': 'High',
        'politifact.com': 'High',
        'fullfact.org': 'High',
        'checknews.fr': 'High',

        # Institutions
        'who.int': 'High',
        'un.org': 'High',
        'europa.eu': 'High',
        'canada.ca': 'High',
        'gouv.fr': 'High',
        'gouv.qc.ca': 'High',

        # === MEDIUM CREDIBILITY ===
        'wikipedia.org': 'Medium',
        'medium.com': 'Medium',
        'huffpost.com': 'Medium',
        'buzzfeed.com': 'Medium',
        'vice.com': 'Medium',
        'slate.com': 'Medium',
        'theconversation.com': 'Medium',

        # === LOW CREDIBILITY ===
        'infowars.com': 'Low',
        'naturalnews.com': 'Low',
        'breitbart.com': 'Low',
        'dailystormer.su': 'Low',
        'beforeitsnews.com': 'Low',
        'worldtruth.tv': 'Low',
        'yournewswire.com': 'Low',
    }

    # === Misinformation patterns ===
    MISINFORMATION_KEYWORDS = [
        'conspiracy', 'hoax', 'fake news', 'miracle cure',
        "they don't want you to know", 'mainstream media lies',
        'deep state', 'plandemic', 'wake up sheeple',
        'big pharma cover-up', 'government conspiracy',
        'censored truth', 'what they hide'
    ]

    @classmethod
    def load_external_reputations(cls, filepath: str) -> None:
        """
        Merge additional source reputations from a JSON file.

        Best-effort: a file that is missing, unreadable, not valid JSON, or
        not a JSON object is reported on stdout and leaves
        ``SOURCE_REPUTATIONS`` untouched; no exception propagates for those
        cases. On success the entries are merged in place into the
        ``SOURCE_REPUTATIONS`` dict, which subclasses share unless they
        define their own.

        Args:
            filepath: Path to a JSON file of the form
                      {"domain.com": "High", "autre.com": "Low"}
        """
        import json
        try:
            # JSON is UTF-8 by specification; do not rely on the locale default.
            with open(filepath, 'r', encoding='utf-8') as f:
                external_reps = json.load(f)
            if not isinstance(external_reps, dict):
                # dict.update() would otherwise accept e.g. a list of 2-char
                # strings and silently create garbage entries.
                raise TypeError(
                    f"expected a JSON object, got {type(external_reps).__name__}"
                )
        except (OSError, ValueError, TypeError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"[Config] Could not load external reputations: {e}")
            return
        cls.SOURCE_REPUTATIONS.update(external_reps)
        print(f"[Config] Loaded {len(external_reps)} external reputations")

    @classmethod
    def update_weights(cls, new_weights: Dict[str, float]) -> None:
        """
        Merge new score weights and renormalize so they sum to 1.

        The merge is done on a copy and the result is bound on ``cls`` only,
        so calling this on a subclass never alters the dict held by a parent
        class, and a failure leaves the current weights intact.

        Args:
            new_weights: Mapping of weight name to its new (un-normalized)
                         value; names not already present are added.

        Raises:
            ZeroDivisionError: If the merged weights sum to zero.
        """
        merged = {**cls.SCORE_WEIGHTS, **new_weights}
        # Normalize so that the sum equals 1.
        total = sum(merged.values())
        cls.SCORE_WEIGHTS = {k: v / total for k, v in merged.items()}
        print(f"[Config] Updated weights: {cls.SCORE_WEIGHTS}")

    @classmethod
    def to_dict(cls) -> Dict:
        """
        Export a summary of the current configuration as a dictionary.

        The API key itself is never included, only whether a non-empty key is
        set. ``score_weights`` is a copy, so mutating the result does not
        change the configuration.
        """
        return {
            'host': cls.HOST,
            'port': cls.PORT,
            'debug': cls.DEBUG,
            # Truthiness, not `is not None`: an empty SYSCRED_GOOGLE_API_KEY
            # is not a usable key.
            'google_api_configured': bool(cls.GOOGLE_FACT_CHECK_API_KEY),
            'ml_models_enabled': cls.LOAD_ML_MODELS,
            'score_weights': dict(cls.SCORE_WEIGHTS),
            'known_sources_count': len(cls.SOURCE_REPUTATIONS),
            'ontology_base': str(cls.ONTOLOGY_BASE_PATH),
            'ontology_data': str(cls.ONTOLOGY_DATA_PATH),
        }

    @classmethod
    def print_config(cls) -> None:
        """Print the summary returned by ``to_dict`` to stdout."""
        print("=" * 50)
        print("SysCRED Configuration")
        print("=" * 50)
        for key, value in cls.to_dict().items():
            print(f"  {key}: {value}")
        print("=" * 50)


# === Configuration par environnement ===

class DevelopmentConfig(Config):
    """
    Configuration preset for local development.

    Forces ``DEBUG`` on. ``LOAD_ML_MODELS`` is deliberately not overridden:
    the value ``Config`` computes from ``SYSCRED_LOAD_ML_MODELS`` /
    ``SYSCRED_LOAD_ML`` (enabled when neither is set) stays in effect.
    Hard-coding ``True`` here made those variables no-ops under this preset,
    which is the one ``get_config`` selects by default.
    """
    DEBUG = True


class ProductionConfig(Config):
    """
    Configuration preset for production.

    Forces ``DEBUG`` off regardless of ``SYSCRED_DEBUG``. ``HOST`` and
    ``LOAD_ML_MODELS`` are deliberately inherited from ``Config`` so that
    ``SYSCRED_HOST`` and ``SYSCRED_LOAD_ML_MODELS`` / ``SYSCRED_LOAD_ML`` are
    honored in production; when those variables are unset the inherited
    values are the same as before ("0.0.0.0" and ``True``).
    """
    DEBUG = False


class TestingConfig(Config):
    """Configuration preset for running tests."""
    # Fixed values (not read from the environment) for this preset.
    WEB_FETCH_TIMEOUT = 5
    LOAD_ML_MODELS = False  # Faster for tests
    DEBUG = True


# Sélection automatique de la configuration
def get_config() -> Type[Config]:
    """
    Return the configuration class matching the current environment.

    The environment name is read from ``SYSCRED_ENV`` (``development``,
    ``production`` or ``testing``); matching is case-insensitive and ignores
    surrounding whitespace. An unset or empty variable selects
    ``development``. An unrecognized value also falls back to
    ``DevelopmentConfig``, but prints a warning, because that preset enables
    debug mode and a typo should not select it silently.

    Returns:
        The selected ``Config`` subclass itself (a class, not an instance).
    """
    env = (os.getenv("SYSCRED_ENV") or "development").strip().lower()

    configs = {
        'development': DevelopmentConfig,
        'production': ProductionConfig,
        'testing': TestingConfig,
    }

    selected = configs.get(env)
    if selected is None:
        print(f"[Config] WARNING: unknown SYSCRED_ENV {env!r}; falling back to development")
        selected = DevelopmentConfig
    return selected


# Default configuration, selected once when this module is imported.
# get_config() returns one of the Config subclasses, so `config` is a class
# (used through its class attributes and classmethods), not an instance.
config = get_config()


if __name__ == "__main__":
    # Manual smoke test: print the active configuration summary, then the
    # first ten entries of the reputation table.
    config.print_config()

    print("\n=== Source Reputations Sample ===")
    sample_domains = list(config.SOURCE_REPUTATIONS)[:10]
    for domain in sample_domains:
        rep = config.SOURCE_REPUTATIONS[domain]
        print(f"  {domain}: {rep}")