D Ф m i И i q ц e L Ф y e r committed on
Commit
1a81e0d
·
1 Parent(s): 34a26a7

Fix: sync working Sandbox version - NER, E-E-A-T functional

Browse files
syscred/eeat_calculator.py CHANGED
@@ -1,41 +1,118 @@
 
1
  # -*- coding: utf-8 -*-
2
  """
3
- E-E-A-T Calculator Module - SysCRED
4
- ====================================
5
- Google Quality Rater Guidelines implementation.
6
 
7
- E-E-A-T Scores:
8
- - Experience: Domain age, content richness
9
- - Expertise: Technical vocabulary, citations
10
- - Authority: Estimated PageRank, backlinks
11
- - Trust: HTTPS, unbiased sentiment
12
-
13
- (c) Dominique S. Loyer - PhD Thesis Prototype
14
  """
15
 
 
 
16
  import re
17
- from typing import Dict, Optional
18
- from urllib.parse import urlparse
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
 
21
  class EEATCalculator:
22
  """
23
- Calculate E-E-A-T scores based on Google Quality Rater Guidelines.
 
 
 
 
 
 
24
  """
25
 
26
- # Technical terms that indicate expertise
27
- TECHNICAL_TERMS = {
28
- 'research', 'study', 'analysis', 'data', 'evidence', 'methodology',
29
- 'peer-reviewed', 'journal', 'university', 'professor', 'dr.', 'phd',
30
- 'statistics', 'experiment', 'hypothesis', 'publication', 'citation',
31
- 'algorithm', 'framework', 'systematic', 'empirical', 'quantitative'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  }
33
 
34
- # Trusted domains (simplified list)
35
- TRUSTED_DOMAINS = {
36
- '.edu', '.gov', '.org', 'reuters.com', 'apnews.com', 'bbc.com',
37
- 'nature.com', 'science.org', 'who.int', 'un.org', 'wikipedia.org',
38
- 'lemonde.fr', 'radio-canada.ca', 'uqam.ca', 'umontreal.ca'
 
39
  }
40
 
41
  def __init__(self):
@@ -44,227 +121,346 @@ class EEATCalculator:
44
 
45
  def calculate(
46
  self,
47
- url: Optional[str] = None,
48
- text: Optional[str] = None,
49
- sentiment_score: float = 0.5,
50
- has_citations: bool = False,
51
- domain_age_years: int = 0
52
- ) -> Dict:
 
 
 
 
53
  """
54
- Calculate E-E-A-T scores.
55
 
56
  Args:
57
  url: Source URL
58
- text: Content text
59
- sentiment_score: 0-1 (0.5 = neutral is best for trust)
60
- has_citations: Whether content has citations
61
- domain_age_years: Estimated domain age
62
-
 
 
 
 
63
  Returns:
64
- {
65
- 'experience': 0.75,
66
- 'expertise': 0.80,
67
- 'authority': 0.65,
68
- 'trust': 0.90,
69
- 'overall': 0.78,
70
- 'details': {...}
71
- }
72
  """
73
- details = {}
74
-
75
- # --- EXPERIENCE ---
76
- experience = 0.5
77
- if domain_age_years >= 10:
78
- experience += 0.3
79
- elif domain_age_years >= 5:
80
- experience += 0.2
81
- elif domain_age_years >= 2:
82
- experience += 0.1
83
-
84
- if text:
85
- word_count = len(text.split())
86
- if word_count >= 1000:
87
- experience += 0.15
88
- elif word_count >= 500:
89
- experience += 0.1
90
-
91
- experience = min(experience, 1.0)
92
- details['experience_factors'] = {
93
- 'domain_age_bonus': domain_age_years >= 2,
94
- 'content_richness': len(text.split()) if text else 0
95
- }
96
 
97
- # --- EXPERTISE ---
98
- expertise = 0.4
99
- tech_count = 0
 
 
 
100
 
101
- if text:
102
- text_lower = text.lower()
103
- for term in self.TECHNICAL_TERMS:
104
- if term in text_lower:
105
- tech_count += 1
106
-
107
- if tech_count >= 5:
108
- expertise += 0.35
109
- elif tech_count >= 3:
110
- expertise += 0.25
111
- elif tech_count >= 1:
112
- expertise += 0.15
113
-
114
- if has_citations:
115
- expertise += 0.2
116
-
117
- expertise = min(expertise, 1.0)
118
- details['expertise_factors'] = {
119
- 'technical_terms_found': tech_count,
120
- 'has_citations': has_citations
121
- }
122
 
123
- # --- AUTHORITY ---
124
- authority = 0.3
 
 
 
125
 
126
- if url:
127
- parsed = urlparse(url)
128
- domain = parsed.netloc.lower()
129
-
130
- for trusted in self.TRUSTED_DOMAINS:
131
- if trusted in domain:
132
- authority += 0.4
133
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
- if parsed.scheme == 'https':
136
- authority += 0.1
137
-
138
- # Check for author indicators in text
139
- if text:
140
- author_patterns = [r'by\s+\w+\s+\w+', r'author:', r'written by', r'par\s+\w+']
141
- for pattern in author_patterns:
142
- if re.search(pattern, text.lower()):
143
- authority += 0.15
144
- break
145
-
146
- authority = min(authority, 1.0)
147
- details['authority_factors'] = {
148
- 'trusted_domain': False,
149
- 'https': url and urlparse(url).scheme == 'https' if url else False
150
- }
151
 
152
- # --- TRUST ---
153
- trust = 0.5
154
-
155
- # Neutral sentiment is best (0.5)
156
- sentiment_deviation = abs(sentiment_score - 0.5)
157
- if sentiment_deviation < 0.1:
158
- trust += 0.3 # Very neutral
159
- elif sentiment_deviation < 0.2:
160
- trust += 0.2
161
- elif sentiment_deviation < 0.3:
162
- trust += 0.1
163
-
164
- if url and urlparse(url).scheme == 'https':
165
- trust += 0.15
166
-
167
- trust = min(trust, 1.0)
168
- details['trust_factors'] = {
169
- 'sentiment_neutrality': 1 - sentiment_deviation * 2,
170
- 'secure_connection': url and 'https' in url if url else False
171
- }
172
 
173
- # --- OVERALL ---
174
- overall = (experience * 0.2 + expertise * 0.3 +
175
- authority * 0.25 + trust * 0.25)
 
 
176
 
177
- return {
178
- 'experience': round(experience, 2),
179
- 'expertise': round(expertise, 2),
180
- 'authority': round(authority, 2),
181
- 'trust': round(trust, 2),
182
- 'overall': round(overall, 2),
183
- 'details': details
184
- }
185
 
186
- def get_explanation(self, scores: Dict) -> str:
187
- """Generate human-readable explanation of E-E-A-T scores."""
 
 
 
 
 
 
 
 
 
 
 
188
  explanations = []
189
 
190
- exp = scores.get('experience', 0)
191
- if exp >= 0.7:
192
- explanations.append("✅ Expérience: Source établie avec contenu riche")
193
- elif exp >= 0.5:
194
- explanations.append("⚠️ Expérience: Source moyennement établie")
195
  else:
196
- explanations.append(" Expérience: Source nouvelle ou contenu limité")
197
 
198
- ext = scores.get('expertise', 0)
199
- if ext >= 0.7:
200
- explanations.append("✅ Expertise: Vocabulaire technique, citations présentes")
201
- elif ext >= 0.5:
202
- explanations.append("⚠️ Expertise: Niveau technique moyen")
203
  else:
204
- explanations.append(" Expertise: Manque de terminologie spécialisée")
205
 
206
- auth = scores.get('authority', 0)
207
- if auth >= 0.7:
208
- explanations.append("✅ Autorité: Domaine reconnu et fiable")
209
- elif auth >= 0.5:
210
- explanations.append("⚠️ Autorité: Niveau d'autorité moyen")
211
  else:
212
- explanations.append(" Autorité: Source non reconnue")
213
 
214
- tr = scores.get('trust', 0)
215
- if tr >= 0.7:
216
- explanations.append("✅ Confiance: Ton neutre, connexion sécurisée")
217
- elif tr >= 0.5:
218
- explanations.append("⚠️ Confiance: Niveau de confiance moyen")
219
  else:
220
- explanations.append(" Confiance: Ton biaisé ou connexion non sécurisée")
221
 
222
  return "\n".join(explanations)
223
 
224
 
225
- # Singleton
226
- _calculator = None
227
-
228
- def get_calculator() -> EEATCalculator:
229
- """Get or create E-E-A-T calculator singleton."""
230
- global _calculator
231
- if _calculator is None:
232
- _calculator = EEATCalculator()
233
- return _calculator
234
-
235
-
236
- # --- Testing ---
237
  if __name__ == "__main__":
238
- print("=" * 60)
239
- print("SysCRED E-E-A-T Calculator - Test")
240
- print("=" * 60)
241
-
242
  calc = EEATCalculator()
243
 
244
- test_url = "https://www.nature.com/articles/example"
245
  test_text = """
246
- A peer-reviewed study published in the journal Nature found evidence
247
- that the new methodology significantly improves research outcomes.
248
- Dr. Smith from Harvard University presented the statistics at the conference.
249
  """
250
 
251
- result = calc.calculate(
 
 
 
 
 
252
  url=test_url,
253
  text=test_text,
254
- sentiment_score=0.5,
255
- has_citations=True,
256
- domain_age_years=15
 
257
  )
258
 
259
- print("\n--- E-E-A-T Scores ---")
260
- print(f" Experience: {result['experience']:.0%}")
261
- print(f" Expertise: {result['expertise']:.0%}")
262
- print(f" Authority: {result['authority']:.0%}")
263
- print(f" Trust: {result['trust']:.0%}")
264
- print(f" ─────────────────")
265
- print(f" OVERALL: {result['overall']:.0%}")
266
-
267
- print("\n--- Explanation ---")
268
- print(calc.get_explanation(result))
269
-
270
- print("\n" + "=" * 60)
 
1
+ #!/usr/bin/env python3
2
  # -*- coding: utf-8 -*-
3
  """
4
+ E-E-A-T Metrics Calculator for SysCRED
5
+ ========================================
6
+ Calculates Google-style E-E-A-T metrics (Experience, Expertise, Authority, Trust).
7
 
8
+ These metrics mirror modern Google ranking signals:
9
+ - Experience: Domain age, content freshness
10
+ - Expertise: Author identification, depth of content
11
+ - Authority: PageRank simulation, citations/backlinks
12
+ - Trust: HTTPS, fact-checks, low bias score
 
 
13
  """
14
 
15
+ from typing import Dict, Any, Optional, List
16
+ from dataclasses import dataclass
17
  import re
18
+ from datetime import datetime
19
+ import logging
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
@dataclass
class EEATScore:
    """Container for the four E-E-A-T component scores.

    The components are plain floats (the calculator documents the intended
    range as 0-1).  ``overall`` and the ``*_pct`` values are computed on
    access from the current component values.
    """
    experience: float  # 0-1
    expertise: float  # 0-1
    authority: float  # 0-1
    trust: float  # 0-1

    @staticmethod
    def _as_percent(value: float) -> str:
        """Format a 0-1 score as a whole-number percentage string."""
        # round() rather than int(): int(0.58 * 100) is 57 because of binary
        # float representation, which would display "57%" for a 0.58 score.
        return f"{round(value * 100)}%"

    @property
    def overall(self) -> float:
        """Weighted average of all E-E-A-T components."""
        # Weights based on Google's emphasis
        weights = {
            'experience': 0.15,
            'expertise': 0.25,
            'authority': 0.35,
            'trust': 0.25
        }
        return (
            self.experience * weights['experience'] +
            self.expertise * weights['expertise'] +
            self.authority * weights['authority'] +
            self.trust * weights['trust']
        )

    # The percentage views are exposed as attributes because the explanation
    # and demo code in this module read ``eeat.experience_pct`` and friends
    # directly; previously they only existed as keys of ``to_dict()``.
    @property
    def experience_pct(self) -> str:
        """Experience score as a percentage string, e.g. ``"75%"``."""
        return self._as_percent(self.experience)

    @property
    def expertise_pct(self) -> str:
        """Expertise score as a percentage string."""
        return self._as_percent(self.expertise)

    @property
    def authority_pct(self) -> str:
        """Authority score as a percentage string."""
        return self._as_percent(self.authority)

    @property
    def trust_pct(self) -> str:
        """Trust score as a percentage string."""
        return self._as_percent(self.trust)

    @property
    def overall_pct(self) -> str:
        """Overall weighted score as a percentage string."""
        return self._as_percent(self.overall)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a dictionary for JSON serialization.

        Returns:
            The four components and ``overall`` rounded to three decimals,
            plus a ``<name>_pct`` string for each of them.
        """
        return {
            'experience': round(self.experience, 3),
            'expertise': round(self.expertise, 3),
            'authority': round(self.authority, 3),
            'trust': round(self.trust, 3),
            'overall': round(self.overall, 3),
            'experience_pct': self.experience_pct,
            'expertise_pct': self.expertise_pct,
            'authority_pct': self.authority_pct,
            'trust_pct': self.trust_pct,
            'overall_pct': self.overall_pct
        }
63
 
64
 
65
  class EEATCalculator:
66
  """
67
+ Calculate E-E-A-T metrics from various signals.
68
+
69
+ Mirrors Google's quality rater guidelines:
70
+ - Experience: Has the author demonstrated real experience?
71
+ - Expertise: Is the content expert-level?
72
+ - Authority: Is the source recognized as authoritative?
73
+ - Trust: Is the source trustworthy?
74
  """
75
 
76
+ # Known authoritative domains
77
+ AUTHORITATIVE_DOMAINS = {
78
+ # News
79
+ 'lemonde.fr': 0.95,
80
+ 'lefigaro.fr': 0.90,
81
+ 'liberation.fr': 0.88,
82
+ 'nytimes.com': 0.95,
83
+ 'washingtonpost.com': 0.93,
84
+ 'theguardian.com': 0.92,
85
+ 'bbc.com': 0.94,
86
+ 'bbc.co.uk': 0.94,
87
+ 'reuters.com': 0.96,
88
+ 'apnews.com': 0.95,
89
+ # Academic
90
+ 'nature.com': 0.98,
91
+ 'science.org': 0.98,
92
+ 'pubmed.ncbi.nlm.nih.gov': 0.97,
93
+ 'scholar.google.com': 0.85,
94
+ # Government
95
+ 'gouv.fr': 0.90,
96
+ 'gov.uk': 0.90,
97
+ 'whitehouse.gov': 0.88,
98
+ 'europa.eu': 0.92,
99
+ # Fact-checkers
100
+ 'snopes.com': 0.88,
101
+ 'factcheck.org': 0.90,
102
+ 'politifact.com': 0.88,
103
+ 'fullfact.org': 0.89,
104
+ # Wikipedia (moderate authority)
105
+ 'wikipedia.org': 0.75,
106
+ 'fr.wikipedia.org': 0.75,
107
+ 'en.wikipedia.org': 0.75,
108
  }
109
 
110
+ # Low-trust domains (misinformation sources)
111
+ LOW_TRUST_DOMAINS = {
112
+ 'infowars.com': 0.1,
113
+ 'breitbart.com': 0.3,
114
+ 'naturalnews.com': 0.15,
115
+ # Add more as needed
116
  }
117
 
118
  def __init__(self):
 
121
 
122
def calculate(
    self,
    url: str,
    text: str,
    nlp_analysis: Optional[Dict[str, Any]] = None,
    pagerank: Optional[float] = None,
    fact_checks: Optional[List[Dict]] = None,
    domain_age_years: Optional[float] = None,
    has_https: bool = True,
    author_identified: bool = False,
    seo_score: Optional[float] = None
) -> EEATScore:
    """
    Compute the four E-E-A-T component scores for one source.

    The domain is extracted from ``url`` once and shared by the authority
    and trust computations; every other signal is forwarded unchanged to
    the component helper that consumes it.

    Args:
        url: Source URL
        text: Article text content
        nlp_analysis: NLP analysis results (sentiment, coherence, bias)
        pagerank: Simulated PageRank score (0-1)
        fact_checks: List of fact-check results
        domain_age_years: Domain age in years (from WHOIS)
        has_https: Whether site uses HTTPS
        author_identified: Whether author is clearly identified
        seo_score: SEO/technical quality score

    Returns:
        EEATScore holding the experience, expertise, authority and trust
        scores.
    """
    source_domain = self._extract_domain(url)

    # Keyword arguments are evaluated in the order written, so the helpers
    # run in the same sequence as the component names read.
    return EEATScore(
        experience=self._calculate_experience(
            domain_age_years, text, nlp_analysis
        ),
        expertise=self._calculate_expertise(
            text, author_identified, nlp_analysis
        ),
        authority=self._calculate_authority(
            source_domain, pagerank, seo_score
        ),
        trust=self._calculate_trust(
            source_domain, has_https, fact_checks, nlp_analysis
        ),
    )
186
+
187
+ def _extract_domain(self, url: str) -> str:
188
+ """Extract domain from URL."""
189
+ import re
190
+ match = re.search(r'https?://(?:www\.)?([^/]+)', url)
191
+ return match.group(1).lower() if match else url.lower()
192
+
193
+ def _calculate_experience(
194
+ self,
195
+ domain_age_years: Optional[float],
196
+ text: str,
197
+ nlp_analysis: Optional[Dict]
198
+ ) -> float:
199
+ """
200
+ Calculate Experience score.
201
+
202
+ Factors:
203
+ - Domain age (longer = more experience)
204
+ - Content freshness (recently updated)
205
+ - First-hand experience indicators in text
206
+ """
207
+ score = 0.5 # Base score
208
+
209
+ # Domain age contribution (max 0.3)
210
+ if domain_age_years is not None:
211
+ age_score = min(domain_age_years / 20, 1.0) * 0.3 # 20 years = max
212
+ score += age_score
213
+ else:
214
+ score += 0.15 # Assume moderate age
215
+
216
+ # Content depth contribution (max 0.2)
217
+ word_count = len(text.split()) if text else 0
218
+ if word_count > 1000:
219
+ score += 0.2
220
+ elif word_count > 500:
221
+ score += 0.15
222
+ elif word_count > 200:
223
+ score += 0.1
224
+
225
+ # First-hand experience indicators (max 0.1)
226
+ experience_indicators = [
227
+ r'\b(j\'ai|je suis|nous avons|I have|we have|in my experience)\b',
228
+ r'\b(interview|entretien|témoignage|witness|firsthand)\b',
229
+ r'\b(sur place|on the ground|eyewitness)\b'
230
+ ]
231
+ for pattern in experience_indicators:
232
+ if re.search(pattern, text, re.IGNORECASE):
233
+ score += 0.03
234
+
235
+ return min(score, 1.0)
236
+
237
+ def _calculate_expertise(
238
+ self,
239
+ text: str,
240
+ author_identified: bool,
241
+ nlp_analysis: Optional[Dict]
242
+ ) -> float:
243
+ """
244
+ Calculate Expertise score.
245
+
246
+ Factors:
247
+ - Author identification
248
+ - Technical depth of content
249
+ - Citation of sources
250
+ - Coherence (from NLP)
251
+ """
252
+ score = 0.4 # Base score
253
+
254
+ # Author identification (0.2)
255
+ if author_identified:
256
+ score += 0.2
257
+
258
+ # Citation indicators (max 0.2)
259
+ citation_patterns = [
260
+ r'\b(selon|according to|d\'après|source:)\b',
261
+ r'\b(étude|study|research|rapport|report)\b',
262
+ r'\b(expert|spécialiste|chercheur|professor|Dr\.)\b',
263
+ r'\[([\d]+)\]', # [1] style citations
264
+ r'https?://[^\s]+' # Links
265
+ ]
266
+ citation_count = 0
267
+ for pattern in citation_patterns:
268
+ citation_count += len(re.findall(pattern, text, re.IGNORECASE))
269
+ score += min(citation_count * 0.02, 0.2)
270
+
271
+ # Coherence from NLP analysis (0.2)
272
+ if nlp_analysis and 'coherence' in nlp_analysis:
273
+ coherence = nlp_analysis['coherence']
274
+ if isinstance(coherence, dict):
275
+ coherence = coherence.get('score', 0.5)
276
+ score += coherence * 0.2
277
+ else:
278
+ score += 0.1 # Assume moderate coherence
279
+
280
+ return min(score, 1.0)
281
+
282
+ def _calculate_authority(
283
+ self,
284
+ domain: str,
285
+ pagerank: Optional[float],
286
+ seo_score: Optional[float]
287
+ ) -> float:
288
+ """
289
+ Calculate Authority score.
290
+
291
+ Factors:
292
+ - Known authoritative domain
293
+ - PageRank simulation
294
+ - SEO/technical quality
295
+ """
296
+ score = 0.3 # Base score
297
+
298
+ # Known domain authority (max 0.5)
299
+ for known_domain, authority in self.AUTHORITATIVE_DOMAINS.items():
300
+ if known_domain in domain:
301
+ score = max(score, authority * 0.5 + 0.3)
302
+ break
303
+
304
+ # Check low-trust domains
305
+ for low_trust_domain, low_score in self.LOW_TRUST_DOMAINS.items():
306
+ if low_trust_domain in domain:
307
+ score = min(score, low_score)
308
+ break
309
+
310
+ # PageRank contribution (max 0.3)
311
+ if pagerank is not None:
312
+ score += pagerank * 0.3
313
+ else:
314
+ score += 0.15 # Assume moderate pagerank
315
+
316
+ # SEO score contribution (max 0.2)
317
+ if seo_score is not None:
318
+ score += seo_score * 0.2
319
+ else:
320
+ score += 0.1
321
+
322
+ return min(score, 1.0)
323
+
324
+ def _calculate_trust(
325
+ self,
326
+ domain: str,
327
+ has_https: bool,
328
+ fact_checks: Optional[List[Dict]],
329
+ nlp_analysis: Optional[Dict]
330
+ ) -> float:
331
+ """
332
+ Calculate Trust score.
333
+
334
+ Factors:
335
+ - HTTPS
336
+ - Fact-check results
337
+ - Bias score (low = better)
338
+ - Known trustworthy domain
339
+ """
340
+ score = 0.4 # Base score
341
+
342
+ # HTTPS (0.1)
343
+ if has_https:
344
+ score += 0.1
345
+
346
+ # Fact-check results (max 0.3)
347
+ if fact_checks:
348
+ positive_checks = sum(1 for fc in fact_checks
349
+ if fc.get('rating', '').lower() in ['true', 'vrai', 'correct'])
350
+ negative_checks = sum(1 for fc in fact_checks
351
+ if fc.get('rating', '').lower() in ['false', 'faux', 'incorrect', 'pants-fire'])
352
 
353
+ if positive_checks > 0:
354
+ score += 0.2
355
+ if negative_checks > 0:
356
+ score -= 0.3
 
 
 
 
 
 
 
 
 
 
 
 
357
 
358
+ # Bias score (max 0.2, lower bias = higher trust)
359
+ if nlp_analysis:
360
+ bias_data = nlp_analysis.get('bias_analysis', {})
361
+ if isinstance(bias_data, dict):
362
+ bias_score = bias_data.get('score', 0.3)
363
+ else:
364
+ bias_score = 0.3
365
+ # Invert: low bias = high trust contribution
366
+ score += (1 - bias_score) * 0.2
367
+ else:
368
+ score += 0.1
 
 
 
 
 
 
 
 
 
369
 
370
+ # Known trustworthy domain (0.1)
371
+ for known_domain in self.AUTHORITATIVE_DOMAINS:
372
+ if known_domain in domain:
373
+ score += 0.1
374
+ break
375
 
376
+ # Known low-trust domain (penalty)
377
+ for low_trust_domain in self.LOW_TRUST_DOMAINS:
378
+ if low_trust_domain in domain:
379
+ score -= 0.3
380
+ break
381
+
382
+ return max(min(score, 1.0), 0.0)
 
383
 
384
def explain_score(self, eeat: "EEATScore", url: str) -> str:
    """
    Generate a human-readable explanation of an E-E-A-T score.

    One line is produced per component (experience, expertise, authority,
    trust).  Each line is graded high (>= 0.8), medium (>= 0.5) or low and
    shows the component's percentage.

    Args:
        eeat: EEATScore instance
        url: Source URL (kept for interface compatibility; the wording
            does not currently depend on it)

    Returns:
        Four lines joined with newlines.
    """
    # The percentage strings are taken from to_dict(), which is where the
    # score container publishes them; reading them as attributes of the
    # score object raised AttributeError.
    pct = eeat.to_dict()

    # (score, percentage, component name, (high, medium, low) wording)
    components = (
        (eeat.experience, pct['experience_pct'], "Expérience", (
            "Source établie depuis longtemps",
            "Source modérément établie",
            "Source récente ou peu connue",
        )),
        (eeat.expertise, pct['expertise_pct'], "Expertise", (
            "Contenu approfondi avec citations",
            "Contenu standard",
            "Manque de profondeur",
        )),
        (eeat.authority, pct['authority_pct'], "Autorité", (
            "Source très citée et reconnue",
            "Source modérément reconnue",
            "Peu de citations externes",
        )),
        (eeat.trust, pct['trust_pct'], "Confiance", (
            "Faits vérifiés, pas de biais",
            "Quelques signaux de confiance",
            "Prudence recommandée",
        )),
    )

    explanations = []
    for value, percent, name, (high, medium, low) in components:
        if value >= 0.8:
            explanations.append(f"✅ **{name} élevée** ({percent}): {high}")
        elif value >= 0.5:
            explanations.append(f"🔶 **{name} moyenne** ({percent}): {medium}")
        else:
            explanations.append(f"⚠️ **{name} faible** ({percent}): {low}")

    return "\n".join(explanations)
432
 
433
 
434
+ # Test
 
 
 
 
 
 
 
 
 
 
 
435
if __name__ == "__main__":
    # Manual smoke test: score one sample article and print the result.
    calc = EEATCalculator()

    test_url = "https://www.lemonde.fr/politique/article/2024/01/06/trump.html"
    test_text = """
    Selon une étude du chercheur Dr. Martin, l'insurrection du 6 janvier 2021
    au Capitol a été un événement marquant. Notre reporter sur place a témoigné
    des événements. Les experts politiques analysent les conséquences.
    """

    nlp_analysis = {
        'coherence': {'score': 0.8},
        'bias_analysis': {'score': 0.2}
    }

    eeat = calc.calculate(
        url=test_url,
        text=test_text,
        nlp_analysis=nlp_analysis,
        pagerank=0.7,
        has_https=True,
        author_identified=True
    )

    # The percentage strings are published by to_dict(); they are not
    # attributes of the score object itself.
    summary = eeat.to_dict()

    print("=== E-E-A-T Scores ===")
    print(f"Experience: {summary['experience_pct']}")
    print(f"Expertise: {summary['expertise_pct']}")
    print(f"Authority: {summary['authority_pct']}")
    print(f"Trust: {summary['trust_pct']}")
    print(f"Overall: {summary['overall_pct']}")
    print("\n=== Explanation ===")
    print(calc.explain_score(eeat, test_url))
 
 
 
 
syscred/ner_analyzer.py CHANGED
@@ -1,198 +1,283 @@
 
1
  # -*- coding: utf-8 -*-
2
  """
3
- NER Analyzer Module - SysCRED
4
- ==============================
5
- Named Entity Recognition for fact-checking enhancement.
6
 
7
- Extracts: PERSON, ORG, GPE, DATE, MISC entities
8
-
9
- (c) Dominique S. Loyer - PhD Thesis Prototype
 
 
 
 
10
  """
11
 
12
- import os
 
13
 
14
- # Check for spaCy
15
  try:
16
  import spacy
 
17
  HAS_SPACY = True
18
  except ImportError:
19
  HAS_SPACY = False
20
- print("[NER] spaCy not installed. NER disabled.")
 
 
21
 
22
 
23
  class NERAnalyzer:
24
  """
25
- Named Entity Recognition using spaCy.
26
 
27
- Supports:
28
- - French (fr_core_news_md)
29
- - English (en_core_web_sm)
30
  """
31
 
32
- # Entity type mapping with icons
33
- ENTITY_ICONS = {
34
- 'PERSON': '👤',
35
- 'PER': '👤',
36
- 'ORG': '🏢',
37
- 'GPE': '📍',
38
- 'LOC': '📍',
39
- 'DATE': '📅',
40
- 'TIME': '🕐',
41
- 'MONEY': '💰',
42
- 'MISC': '🏷️',
43
- 'NORP': '👥',
44
- 'FAC': '🏛️',
45
- 'PRODUCT': '📦',
46
- 'EVENT': '🎉',
47
- 'WORK_OF_ART': '🎨',
48
- 'LAW': '⚖️',
49
- 'LANGUAGE': '🗣️',
50
  }
51
 
52
- def __init__(self, language: str = 'en'):
53
  """
54
  Initialize NER analyzer.
55
 
56
  Args:
57
- language: 'en' or 'fr'
 
58
  """
59
- self.language = language
 
60
  self.nlp = None
61
- self.enabled = False
62
 
63
  if HAS_SPACY:
64
- self._load_model()
65
-
66
- def _load_model(self):
67
- """Load the appropriate spaCy model."""
68
- models = {
69
- 'en': ['en_core_web_sm', 'en_core_web_md'],
70
- 'fr': ['fr_core_news_md', 'fr_core_news_sm']
71
- }
72
-
73
- for model_name in models.get(self.language, models['en']):
74
  try:
75
  self.nlp = spacy.load(model_name)
76
- self.enabled = True
77
- print(f"[NER] Loaded model: {model_name}")
78
- break
79
- except OSError:
80
- continue
81
-
82
- if not self.enabled:
83
- print(f"[NER] No model found for language: {self.language}")
 
 
84
 
85
- def extract_entities(self, text: str) -> dict:
86
  """
87
  Extract named entities from text.
88
 
 
 
 
89
  Returns:
90
- {
91
- 'entities': [
92
- {'text': 'Emmanuel Macron', 'type': 'PERSON', 'icon': '👤'},
93
- ...
94
- ],
95
- 'summary': {
96
- 'PERSON': ['Emmanuel Macron'],
97
- 'ORG': ['UQAM', 'Google'],
98
- ...
99
- }
100
- }
101
  """
102
- if not self.enabled or not text:
103
- return {'entities': [], 'summary': {}}
104
 
 
 
 
 
 
 
 
 
 
105
  doc = self.nlp(text)
106
-
107
- entities = []
108
- summary = {}
109
- seen = set()
110
 
111
  for ent in doc.ents:
112
- # Avoid duplicates
113
- key = (ent.text.lower(), ent.label_)
114
- if key in seen:
115
- continue
116
- seen.add(key)
117
 
118
- entity = {
 
 
 
 
 
 
 
119
  'text': ent.text,
120
- 'type': ent.label_,
121
- 'icon': self.ENTITY_ICONS.get(ent.label_, '🏷️'),
122
  'start': ent.start_char,
123
- 'end': ent.end_char
 
 
 
 
124
  }
125
- entities.append(entity)
126
 
127
- # Group by type
128
- if ent.label_ not in summary:
129
- summary[ent.label_] = []
130
- summary[ent.label_].append(ent.text)
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
- return {
133
- 'entities': entities,
134
- 'summary': summary,
135
- 'count': len(entities)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
- def analyze_for_factcheck(self, text: str) -> dict:
139
  """
140
- Analyze text for fact-checking relevance.
141
 
142
- Returns entities with credibility hints.
 
 
 
 
143
  """
144
- result = self.extract_entities(text)
 
 
 
 
 
 
 
 
 
 
145
 
146
- # Add fact-checking hints
147
- hints = []
 
 
 
148
 
149
- for ent in result.get('entities', []):
150
- if ent['type'] in ['PERSON', 'PER']:
151
- hints.append(f"Verify claims about {ent['text']}")
152
- elif ent['type'] == 'ORG':
153
- hints.append(f"Check {ent['text']} official sources")
154
- elif ent['type'] in ['GPE', 'LOC']:
155
- hints.append(f"Verify location: {ent['text']}")
156
- elif ent['type'] == 'DATE':
157
- hints.append(f"Confirm date: {ent['text']}")
 
 
 
 
 
158
 
159
- result['fact_check_hints'] = hints[:5] # Top 5 hints
 
160
  return result
161
 
162
 
163
- # Singleton instance
164
- _analyzer = None
 
165
 
166
- def get_analyzer(language: str = 'en') -> NERAnalyzer:
167
- """Get or create the NER analyzer singleton."""
168
- global _analyzer
169
- if _analyzer is None:
170
- _analyzer = NERAnalyzer(language)
171
- return _analyzer
172
 
173
 
174
- # --- Testing ---
175
  if __name__ == "__main__":
176
- print("=" * 60)
177
- print("SysCRED NER Analyzer - Test")
178
- print("=" * 60)
179
-
180
- analyzer = NERAnalyzer('en')
181
 
182
  test_text = """
183
- Emmanuel Macron announced today that France will invest €500 million
184
- in AI research. The announcement was made at the UQAM in Montreal, Canada
185
- on February 8, 2026. Google and Microsoft also confirmed their participation.
186
  """
187
 
188
- result = analyzer.analyze_for_factcheck(test_text)
189
-
190
- print("\n--- Entities Found ---")
191
- for ent in result['entities']:
192
- print(f" {ent['icon']} {ent['text']} ({ent['type']})")
193
-
194
- print("\n--- Fact-Check Hints ---")
195
- for hint in result.get('fact_check_hints', []):
196
- print(f" • {hint}")
197
-
198
- print("\n" + "=" * 60)
 
1
+ #!/usr/bin/env python3
2
  # -*- coding: utf-8 -*-
3
  """
4
+ Named Entity Recognition (NER) Analyzer for SysCRED
5
+ ====================================================
6
+ Extracts named entities from text using spaCy.
7
 
8
+ Entities detected:
9
+ - PER: Persons (Donald Trump, Emmanuel Macron)
10
+ - ORG: Organizations (FBI, UN, Google)
11
+ - LOC: Locations (Paris, Capitol)
12
+ - DATE: Dates (January 6, 2021)
13
+ - MONEY: Amounts ($10 million)
14
+ - EVENT: Events (insurrection, election)
15
  """
16
 
17
+ from typing import Dict, List, Any, Optional
18
+ import logging
19
 
20
+ # Try to import spaCy
21
  try:
22
  import spacy
23
+ from spacy.language import Language
24
  HAS_SPACY = True
25
  except ImportError:
26
  HAS_SPACY = False
27
+ spacy = None
28
+
29
+ logger = logging.getLogger(__name__)
30
 
31
 
32
  class NERAnalyzer:
33
  """
34
+ Named Entity Recognition analyzer using spaCy.
35
 
36
+ Supports French (fr_core_news_md) and English (en_core_web_md).
37
+ Falls back to heuristic extraction if spaCy is not available.
 
38
  """
39
 
40
+ # Entity type mappings for display
41
+ ENTITY_LABELS = {
42
+ 'PER': {'fr': 'Personne', 'en': 'Person', 'emoji': '👤'},
43
+ 'PERSON': {'fr': 'Personne', 'en': 'Person', 'emoji': '👤'},
44
+ 'ORG': {'fr': 'Organisation', 'en': 'Organization', 'emoji': '🏢'},
45
+ 'LOC': {'fr': 'Lieu', 'en': 'Location', 'emoji': '📍'},
46
+ 'GPE': {'fr': 'Lieu géopolitique', 'en': 'Geopolitical', 'emoji': '🌍'},
47
+ 'DATE': {'fr': 'Date', 'en': 'Date', 'emoji': '📅'},
48
+ 'TIME': {'fr': 'Heure', 'en': 'Time', 'emoji': '⏰'},
49
+ 'MONEY': {'fr': 'Montant', 'en': 'Money', 'emoji': '💰'},
50
+ 'PERCENT': {'fr': 'Pourcentage', 'en': 'Percent', 'emoji': '📊'},
51
+ 'EVENT': {'fr': 'Événement', 'en': 'Event', 'emoji': '📰'},
52
+ 'PRODUCT': {'fr': 'Produit', 'en': 'Product', 'emoji': '📦'},
53
+ 'LAW': {'fr': 'Loi', 'en': 'Law', 'emoji': '⚖️'},
54
+ 'NORP': {'fr': 'Groupe', 'en': 'Group', 'emoji': '👥'},
55
+ 'MISC': {'fr': 'Divers', 'en': 'Miscellaneous', 'emoji': '🔖'},
 
 
56
  }
57
 
58
def __init__(self, model_name: str = "fr_core_news_md", fallback: bool = True):
    """
    Set up the analyzer and try to load the requested spaCy model.

    When spaCy is not installed, or the model cannot be loaded, the
    analyzer switches to heuristic extraction if ``fallback`` is true;
    otherwise it stays without any extraction backend.

    Args:
        model_name: spaCy model to load (fr_core_news_md, en_core_web_md)
        fallback: If True, use heuristics when spaCy unavailable
    """
    self.model_name = model_name
    self.fallback = fallback
    self.nlp = None
    self.use_heuristics = False

    # No spaCy at all: the only possible backend is the heuristic one.
    if not HAS_SPACY:
        if fallback:
            self.use_heuristics = True
            logger.info("[NER] spaCy not installed. Using heuristic extraction")
        return

    try:
        self.nlp = spacy.load(model_name)
        logger.info(f"[NER] Loaded spaCy model: {model_name}")
    except OSError as e:
        logger.warning(f"[NER] Could not load model {model_name}: {e}")
        if not fallback:
            return
        self.use_heuristics = True
        logger.info("[NER] Using heuristic entity extraction")
84
 
85
def extract_entities(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
    """
    Extract named entities from text.

    Dispatches to the spaCy backend when a model is loaded, otherwise to
    the heuristic backend when it is enabled.

    Args:
        text: Input text to analyze

    Returns:
        Dictionary mapping entity types to lists of entities.
        Each entity has: text, start, end, label, label_display, emoji,
        confidence.  Empty dict for blank input or when no backend is
        available.
    """
    # Nothing to analyze: None, empty, or whitespace-only input.
    if not text or not text.strip():
        return {}

    if self.nlp:
        return self._extract_with_spacy(text)

    if self.use_heuristics:
        return self._extract_with_heuristics(text)

    return {}
105
+
106
+ def _extract_with_spacy(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
107
+ """Extract entities using spaCy NLP."""
108
  doc = self.nlp(text)
109
+ entities: Dict[str, List[Dict[str, Any]]] = {}
 
 
 
110
 
111
  for ent in doc.ents:
112
+ label = ent.label_
 
 
 
 
113
 
114
+ # Get display info
115
+ label_info = self.ENTITY_LABELS.get(label, {
116
+ 'fr': label,
117
+ 'en': label,
118
+ 'emoji': '🔖'
119
+ })
120
+
121
+ entity_data = {
122
  'text': ent.text,
 
 
123
  'start': ent.start_char,
124
+ 'end': ent.end_char,
125
+ 'label': label,
126
+ 'label_display': label_info.get('fr', label),
127
+ 'emoji': label_info.get('emoji', '🔖'),
128
+ 'confidence': 0.85 # spaCy doesn't provide confidence by default
129
  }
 
130
 
131
+ if label not in entities:
132
+ entities[label] = []
133
+
134
+ # Avoid duplicates
135
+ if not any(e['text'].lower() == entity_data['text'].lower() for e in entities[label]):
136
+ entities[label].append(entity_data)
137
+
138
+ return entities
139
+
140
+ def _extract_with_heuristics(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
141
+ """
142
+ Fallback heuristic entity extraction.
143
+ Uses pattern matching for common entities.
144
+ """
145
+ import re
146
+ entities: Dict[str, List[Dict[str, Any]]] = {}
147
 
148
+ # Common patterns
149
+ patterns = {
150
+ 'PER': [
151
+ # Known political figures
152
+ r'\b(Donald Trump|Joe Biden|Emmanuel Macron|Hillary Clinton|Barack Obama|'
153
+ r'Vladimir Putin|Angela Merkel|Justin Trudeau|Boris Johnson)\b',
154
+ ],
155
+ 'ORG': [
156
+ r'\b(FBI|CIA|NSA|ONU|NATO|OTAN|Google|Facebook|Twitter|Meta|'
157
+ r'Amazon|Microsoft|Apple|CNN|BBC|Le Monde|New York Times|'
158
+ r'Parti Républicain|Parti Démocrate|Republican Party|Democratic Party)\b',
159
+ ],
160
+ 'LOC': [
161
+ r'\b(Capitol|White House|Maison Blanche|Kremlin|Élysée|Pentagon|'
162
+ r'New York|Washington|Paris|Londres|Moscou|Berlin|Beijing)\b',
163
+ ],
164
+ 'DATE': [
165
+ r'\b(\d{1,2}\s+(janvier|février|mars|avril|mai|juin|juillet|août|'
166
+ r'septembre|octobre|novembre|décembre)\s+\d{4})\b',
167
+ r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\b',
168
+ r'\b(January|February|March|April|May|June|July|August|'
169
+ r'September|October|November|December)\s+\d{1,2},?\s+\d{4}\b',
170
+ ],
171
+ 'MONEY': [
172
+ r'\$[\d,]+(?:\.\d{2})?(?:\s*(?:million|billion|trillion))?',
173
+ r'[\d,]+(?:\.\d{2})?\s*(?:dollars?|euros?|€|\$)',
174
+ r'[\d,]+\s*(?:million|milliard)s?\s*(?:de\s+)?(?:dollars?|euros?)',
175
+ ],
176
+ 'PERCENT': [
177
+ r'\b\d+(?:\.\d+)?%',
178
+ r'\b\d+(?:\.\d+)?\s*pour\s*cent',
179
+ r'\b\d+(?:\.\d+)?\s*percent',
180
+ ],
181
  }
182
+
183
+ for label, pattern_list in patterns.items():
184
+ label_info = self.ENTITY_LABELS.get(label, {'fr': label, 'emoji': '🔖'})
185
+
186
+ for pattern in pattern_list:
187
+ for match in re.finditer(pattern, text, re.IGNORECASE):
188
+ entity_data = {
189
+ 'text': match.group(),
190
+ 'start': match.start(),
191
+ 'end': match.end(),
192
+ 'label': label,
193
+ 'label_display': label_info.get('fr', label),
194
+ 'emoji': label_info.get('emoji', '🔖'),
195
+ 'confidence': 0.70 # Lower confidence for heuristics
196
+ }
197
+
198
+ if label not in entities:
199
+ entities[label] = []
200
+
201
+ # Avoid duplicates
202
+ if not any(e['text'].lower() == entity_data['text'].lower()
203
+ for e in entities[label]):
204
+ entities[label].append(entity_data)
205
+
206
+ return entities
207
 
208
def get_entity_summary(self, entities: Dict[str, List[Dict[str, Any]]]) -> str:
    """
    Build a human-readable, one-line-per-label summary of extracted entities.

    Args:
        entities: Dictionary of entities as returned by extract_entities().

    Returns:
        A newline-separated string with one "<emoji> <label>: a, b, c" line
        per label (at most five entity texts each), or a fixed French
        message when ``entities`` is empty.
    """
    if not entities:
        return "Aucune entité nommée détectée."

    def render(tag: str, items: List[Dict[str, Any]]) -> str:
        # Labels missing from ENTITY_LABELS are shown under their raw name.
        info = self.ENTITY_LABELS.get(tag, {'fr': tag, 'emoji': '🔖'})
        icon = info.get('emoji', '🔖')
        shown_name = info.get('fr', tag)
        # Only the first five entities of each label are listed.
        names = ', '.join(item['text'] for item in items[:5])
        return f"{icon} {shown_name}: {names}"

    return "\n".join(render(tag, items) for tag, items in entities.items())
231
+
232
def to_frontend_format(self, entities: Dict[str, List[Dict[str, Any]]]) -> List[Dict]:
    """
    Flatten grouped entities into a list ready for display.

    Args:
        entities: Dictionary of entities as returned by extract_entities().

    Returns:
        List of dicts with keys text, type, type_display, emoji, confidence
        and confidence_pct, sorted by confidence in descending order.
        Entities with equal confidence keep their original relative order.
    """
    result = []
    for ent_list in entities.values():
        for ent in ent_list:
            confidence = ent.get('confidence', 0.5)
            result.append({
                'text': ent['text'],
                'type': ent['label'],
                'type_display': ent.get('label_display', ent['label']),
                'emoji': ent.get('emoji', '🔖'),
                'confidence': confidence,
                # round() rather than int(): 0.57 * 100 evaluates to
                # 56.99999999999999, which int() would truncate to "56%".
                'confidence_pct': f"{round(confidence * 100)}%"
            })

    # Sort by confidence (stable, highest first)
    result.sort(key=lambda item: item['confidence'], reverse=True)
    return result
254
 
255
 
256
+ # Singleton instance for easy import
257
+ _ner_analyzer: Optional[NERAnalyzer] = None
258
+
259
 
260
def get_ner_analyzer(model_name: str = "fr_core_news_md") -> NERAnalyzer:
    """
    Return the shared module-level NERAnalyzer, creating it on first use.

    ``model_name`` is only used when the instance is first created; later
    calls return the existing instance whatever model name is passed.
    """
    global _ner_analyzer
    if _ner_analyzer is not None:
        return _ner_analyzer
    _ner_analyzer = NERAnalyzer(model_name=model_name, fallback=True)
    return _ner_analyzer
266
 
267
 
268
+ # Quick test
269
if __name__ == "__main__":
    # Manual smoke test: run the analyzer on a short French sample and print
    # both the text summary and the frontend-formatted entities.
    demo_analyzer = NERAnalyzer(fallback=True)

    test_text = """
    Donald Trump a affirmé que l'insurrection du 6 janvier 2021 au Capitol n'est jamais arrivée.
    Le FBI enquête sur les événements. Le président Joe Biden a condamné ces déclarations à Washington.
    Les dégâts sont estimés à 30 millions de dollars.
    """

    found = demo_analyzer.extract_entities(test_text)

    print("=== Entités détectées ===")
    print(demo_analyzer.get_entity_summary(found))

    print("\n=== Format Frontend ===")
    for item in demo_analyzer.to_frontend_format(found):
        print(f" {item['emoji']} {item['text']} ({item['type_display']}, {item['confidence_pct']})")
 
 
 
 
 
syscred/verification_system.py CHANGED
@@ -33,28 +33,35 @@ except ImportError:
33
  HAS_SBERT = False
34
  print("Warning: sentence-transformers not installed. Semantic coherence will use heuristics.")
35
 
36
- # Local imports
37
- from syscred.api_clients import ExternalAPIClients, WebContent, ExternalData
38
- from syscred.ontology_manager import OntologyManager
39
- from syscred.seo_analyzer import SEOAnalyzer
40
- from syscred.graph_rag import GraphRAG # [NEW] GraphRAG
41
- from syscred.trec_retriever import TRECRetriever, Evidence, RetrievalResult # [NEW] TREC Integration
42
- from syscred import config
43
-
44
- # [NEW] NER and E-E-A-T modules
45
  try:
46
- from syscred.ner_analyzer import NERAnalyzer, get_ner_analyzer
47
- HAS_NER = True
 
 
 
 
48
  except ImportError:
49
- HAS_NER = False
50
- print("[SysCRED] Warning: NER module not available")
 
 
 
 
51
 
 
 
52
  try:
 
53
  from syscred.eeat_calculator import EEATCalculator, EEATScore
54
- HAS_EEAT = True
55
  except ImportError:
56
- HAS_EEAT = False
57
- print("[SysCRED] Warning: E-E-A-T module not available")
 
 
 
 
58
 
59
 
60
  class CredibilityVerificationSystem:
@@ -136,6 +143,18 @@ class CredibilityVerificationSystem:
136
  # Weights for score calculation (Loaded from Config)
137
  self.weights = config.Config.SCORE_WEIGHTS
138
  print(f"[SysCRED] Using weights: {self.weights}")
 
 
 
 
 
 
 
 
 
 
 
 
139
 
140
  print("[SysCRED] System ready!")
141
 
@@ -144,40 +163,47 @@ class CredibilityVerificationSystem:
144
  print("[SysCRED] Loading ML models (this may take a moment)...")
145
 
146
  try:
147
- # Sentiment analysis
148
  self.sentiment_pipeline = pipeline(
149
- "sentiment-analysis",
150
- model="distilbert-base-uncased-finetuned-sst-2-english"
 
 
151
  )
152
- print("[SysCRED] ✓ Sentiment model loaded")
153
  except Exception as e:
154
  print(f"[SysCRED] ✗ Sentiment model failed: {e}")
155
-
156
  try:
157
- # NER pipeline
158
- self.ner_pipeline = pipeline("ner", grouped_entities=True)
159
- print("[SysCRED] ✓ NER model loaded")
 
 
 
 
 
 
160
  except Exception as e:
161
  print(f"[SysCRED] ✗ NER model failed: {e}")
162
-
163
  try:
164
- # Bias detection - Specialized model
165
- # Using 'd4data/bias-detection-model' or fallback to generic
166
- bias_model_name = "d4data/bias-detection-model"
167
  self.bias_tokenizer = AutoTokenizer.from_pretrained(bias_model_name)
168
  self.bias_model = AutoModelForSequenceClassification.from_pretrained(bias_model_name)
169
- print("[SysCRED] ✓ Bias model loaded (d4data)")
170
  except Exception as e:
171
  print(f"[SysCRED] ✗ Bias model failed: {e}. Using heuristics.")
172
 
173
  try:
174
- # Semantic Coherence
175
  if HAS_SBERT:
176
  self.coherence_model = SentenceTransformer('all-MiniLM-L6-v2')
177
- print("[SysCRED] ✓ Coherence model loaded (SBERT)")
178
  except Exception as e:
179
  print(f"[SysCRED] ✗ Coherence model failed: {e}")
180
-
181
  try:
182
  # LIME explainer
183
  self.explainer = LimeTextExplainer(class_names=['NEGATIVE', 'POSITIVE'])
@@ -501,6 +527,26 @@ class CredibilityVerificationSystem:
501
  adjustment_factor = (graph_score - 0.5) * w_graph * confidence
502
  adjustments += adjustment_factor
503
  total_weight_used += w_graph * confidence # Partial weight based on confidence
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
504
 
505
  # Final calculation
506
  # Base 0.5 + sum of weighted adjustments
@@ -657,11 +703,24 @@ class CredibilityVerificationSystem:
657
  ) -> Dict[str, Any]:
658
  """Generate the final evaluation report."""
659
 
 
 
 
 
 
 
 
 
 
 
 
 
660
  report = {
661
  'idRapport': f"report_{int(datetime.datetime.now().timestamp())}",
662
  'informationEntree': input_data,
663
  'dateGeneration': datetime.datetime.now().isoformat(),
664
  'scoreCredibilite': round(overall_score, 2),
 
665
  'resumeAnalyse': "",
666
  'detailsScore': {
667
  'base': 0.5,
@@ -688,8 +747,6 @@ class CredibilityVerificationSystem:
688
  },
689
  # [NEW] TREC Evidence section
690
  'evidences': evidences or [],
691
- # [NEW] TREC IR Metrics for dashboard
692
- 'trec_metrics': self._calculate_trec_metrics(cleaned_text, evidences),
693
  'metadonnees': {}
694
  }
695
 
@@ -756,99 +813,6 @@ class CredibilityVerificationSystem:
756
 
757
  return report
758
 
759
- def _calculate_trec_metrics(self, text: str, evidences: List[Dict[str, Any]] = None) -> Dict[str, float]:
760
- """
761
- Calculate TREC-style IR metrics for display on dashboard.
762
-
763
- Computes:
764
- - Precision: Ratio of relevant retrieved documents
765
- - Recall: Ratio of relevant documents retrieved
766
- - MAP: Mean Average Precision
767
- - NDCG: Normalized Discounted Cumulative Gain
768
- - TF-IDF: Term Frequency-Inverse Document Frequency score
769
- - MRR: Mean Reciprocal Rank
770
- """
771
- import math
772
-
773
- metrics = {
774
- 'precision': 0.0,
775
- 'recall': 0.0,
776
- 'map': 0.0,
777
- 'ndcg': 0.0,
778
- 'tfidf': 0.0,
779
- 'mrr': 0.0
780
- }
781
-
782
- if not text:
783
- return metrics
784
-
785
- # TF-IDF based on text analysis
786
- words = text.lower().split()
787
- if words:
788
- # Simple TF calculation
789
- word_counts = {}
790
- for word in words:
791
- word_counts[word] = word_counts.get(word, 0) + 1
792
-
793
- # Calculate TF-IDF score (simplified)
794
- total_words = len(words)
795
- unique_words = len(word_counts)
796
-
797
- # Term frequency normalized
798
- tf_scores = [count / total_words for count in word_counts.values()]
799
- # IDF approximation based on word distribution
800
- idf_approx = math.log((unique_words + 1) / 2)
801
-
802
- tfidf_sum = sum(tf * idf_approx for tf in tf_scores)
803
- metrics['tfidf'] = min(1.0, tfidf_sum / max(1, unique_words) * 10)
804
-
805
- # If we have evidences, calculate retrieval metrics
806
- if evidences and len(evidences) > 0:
807
- k = len(evidences)
808
-
809
- # For now, assume all retrieved evidences have some relevance
810
- # based on their retrieval scores
811
- scores = [e.get('score', 0) for e in evidences]
812
-
813
- if scores:
814
- avg_score = sum(scores) / len(scores)
815
- max_score = max(scores)
816
-
817
- # Precision at K (proxy: avg relevance score)
818
- metrics['precision'] = min(1.0, avg_score if avg_score <= 1.0 else avg_score / max(1, max_score))
819
-
820
- # Recall (proxy: coverage based on number of evidences)
821
- metrics['recall'] = min(1.0, len(evidences) / 10) # Assuming 10 is target
822
-
823
- # MAP (proxy using score ranking)
824
- ap_sum = 0.0
825
- for i, score in enumerate(sorted(scores, reverse=True)):
826
- ap_sum += (i + 1) / (i + 2) * score if score <= 1.0 else (i + 1) / (i + 2)
827
- metrics['map'] = ap_sum / len(scores) if scores else 0.0
828
-
829
- # NDCG (simplified)
830
- dcg = sum(
831
- (2 ** (score if score <= 1.0 else 1.0) - 1) / math.log2(i + 2)
832
- for i, score in enumerate(scores[:k])
833
- )
834
- ideal_scores = sorted(scores, reverse=True)
835
- idcg = sum(
836
- (2 ** (score if score <= 1.0 else 1.0) - 1) / math.log2(i + 2)
837
- for i, score in enumerate(ideal_scores[:k])
838
- )
839
- metrics['ndcg'] = dcg / idcg if idcg > 0 else 0.0
840
-
841
- # MRR (first relevant result)
842
- for i, score in enumerate(scores):
843
- if (score > 0.5 if score <= 1.0 else score > max_score / 2):
844
- metrics['mrr'] = 1.0 / (i + 1)
845
- break
846
- if metrics['mrr'] == 0 and len(scores) > 0:
847
- metrics['mrr'] = 1.0 # First result
848
-
849
- # Round all values
850
- return {k: round(v, 4) for k, v in metrics.items()}
851
-
852
  def _get_score_factors(self, rule_results: Dict, nlp_results: Dict) -> List[Dict]:
853
  """Get list of factors that influenced the score (For UI)."""
854
  factors = []
@@ -1009,6 +973,40 @@ class CredibilityVerificationSystem:
1009
  print("[SysCRED] Running NLP analysis...")
1010
  nlp_results = self.nlp_analysis(cleaned_text)
1011
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1012
  # 7. Calculate score (Now includes GraphRAG context)
1013
  overall_score = self.calculate_overall_score(rule_results, nlp_results)
1014
  print(f"[SysCRED] ✓ Credibility score: {overall_score:.2f}")
@@ -1020,6 +1018,10 @@ class CredibilityVerificationSystem:
1020
  graph_context=graph_context
1021
  )
1022
 
 
 
 
 
1023
  # Add similar URIs to report for ontology linking
1024
  if similar_uris:
1025
  report['similar_claims_uris'] = similar_uris
 
33
  HAS_SBERT = False
34
  print("Warning: sentence-transformers not installed. Semantic coherence will use heuristics.")
35
 
36
+ # Local imports - Support both syscred.module and relative imports
 
 
 
 
 
 
 
 
37
  try:
38
+ from syscred.api_clients import ExternalAPIClients, WebContent, ExternalData
39
+ from syscred.ontology_manager import OntologyManager
40
+ from syscred.seo_analyzer import SEOAnalyzer
41
+ from syscred.graph_rag import GraphRAG
42
+ from syscred.trec_retriever import TRECRetriever, Evidence, RetrievalResult
43
+ from syscred import config
44
  except ImportError:
45
+ from api_clients import ExternalAPIClients, WebContent, ExternalData
46
+ from ontology_manager import OntologyManager
47
+ from seo_analyzer import SEOAnalyzer
48
+ from graph_rag import GraphRAG
49
+ from trec_retriever import TRECRetriever, Evidence, RetrievalResult
50
+ import config
51
 
52
+ # [NER + E-E-A-T] Imports optionnels - n'interferent pas avec les imports principaux
53
+ HAS_NER_EEAT = False
54
  try:
55
+ from syscred.ner_analyzer import NERAnalyzer
56
  from syscred.eeat_calculator import EEATCalculator, EEATScore
57
+ HAS_NER_EEAT = True
58
  except ImportError:
59
+ try:
60
+ from ner_analyzer import NERAnalyzer
61
+ from eeat_calculator import EEATCalculator, EEATScore
62
+ HAS_NER_EEAT = True
63
+ except ImportError:
64
+ pass
65
 
66
 
67
  class CredibilityVerificationSystem:
 
143
  # Weights for score calculation (Loaded from Config)
144
  self.weights = config.Config.SCORE_WEIGHTS
145
  print(f"[SysCRED] Using weights: {self.weights}")
146
+
147
+ # [NER + E-E-A-T] Initialize analyzers
148
+ self.ner_analyzer = None
149
+ self.eeat_calculator = None
150
+ if HAS_NER_EEAT:
151
+ try:
152
+ self.ner_analyzer = NERAnalyzer()
153
+ self.eeat_calculator = EEATCalculator()
154
+ print("[SysCRED] NER analyzer initialized")
155
+ print("[SysCRED] E-E-A-T calculator initialized")
156
+ except Exception as e:
157
+ print(f"[SysCRED] NER/E-E-A-T init failed: {e}")
158
 
159
  print("[SysCRED] System ready!")
160
 
 
163
  print("[SysCRED] Loading ML models (this may take a moment)...")
164
 
165
  try:
166
+ # Sentiment analysis - modèle ultra-léger
167
  self.sentiment_pipeline = pipeline(
168
+ "sentiment-analysis",
169
+ model="distilbert-base-uncased-finetuned-sst-2-english",
170
+ device=-1,
171
+ model_kwargs={"low_cpu_mem_usage": True}
172
  )
173
+ print("[SysCRED] ✓ Sentiment model loaded (distilbert-base)")
174
  except Exception as e:
175
  print(f"[SysCRED] ✗ Sentiment model failed: {e}")
176
+
177
  try:
178
+ # NER pipeline - modèle plus léger
179
+ self.ner_pipeline = pipeline(
180
+ "ner",
181
+ model="dslim/bert-base-NER",
182
+ grouped_entities=True,
183
+ device=-1,
184
+ model_kwargs={"low_cpu_mem_usage": True}
185
+ )
186
+ print("[SysCRED] ✓ NER model loaded (dslim/bert-base-NER)")
187
  except Exception as e:
188
  print(f"[SysCRED] ✗ NER model failed: {e}")
189
+
190
  try:
191
+ # Bias detection - modèle plus léger si possible
192
+ bias_model_name = "typeform/distilbert-base-uncased-mnli"
 
193
  self.bias_tokenizer = AutoTokenizer.from_pretrained(bias_model_name)
194
  self.bias_model = AutoModelForSequenceClassification.from_pretrained(bias_model_name)
195
+ print("[SysCRED] ✓ Bias model loaded (distilbert-mnli)")
196
  except Exception as e:
197
  print(f"[SysCRED] ✗ Bias model failed: {e}. Using heuristics.")
198
 
199
  try:
200
+ # Semantic Coherence - modèle MiniLM (déjà léger)
201
  if HAS_SBERT:
202
  self.coherence_model = SentenceTransformer('all-MiniLM-L6-v2')
203
+ print("[SysCRED] ✓ Coherence model loaded (SBERT MiniLM)")
204
  except Exception as e:
205
  print(f"[SysCRED] ✗ Coherence model failed: {e}")
206
+
207
  try:
208
  # LIME explainer
209
  self.explainer = LimeTextExplainer(class_names=['NEGATIVE', 'POSITIVE'])
 
527
  adjustment_factor = (graph_score - 0.5) * w_graph * confidence
528
  adjustments += adjustment_factor
529
  total_weight_used += w_graph * confidence # Partial weight based on confidence
530
+
531
+ # 8. [NEW] Linguistic Markers Analysis (sensationalism penalty)
532
+ # Penalize sensational language heavily, reward doubt markers (critical thinking)
533
+ linguistic = rule_results.get('linguistic_markers', {})
534
+ sensationalism_count = linguistic.get('sensationalism', 0)
535
+ doubt_count = linguistic.get('doubt', 0)
536
+ certainty_count = linguistic.get('certainty', 0)
537
+
538
+ # Sensationalism is a strong negative signal
539
+ if sensationalism_count > 0:
540
+ penalty = min(0.20, sensationalism_count * 0.05) # Max 20% penalty
541
+ adjustments -= penalty
542
+
543
+ # Excessive certainty without sources is suspicious
544
+ if certainty_count > 2 and not fact_checks:
545
+ adjustments -= 0.05
546
+
547
+ # Doubt markers indicate critical/questioning tone (slight positive)
548
+ if doubt_count > 0:
549
+ adjustments += min(0.05, doubt_count * 0.02)
550
 
551
  # Final calculation
552
  # Base 0.5 + sum of weighted adjustments
 
703
  ) -> Dict[str, Any]:
704
  """Generate the final evaluation report."""
705
 
706
+ # Determine credibility level
707
+ if overall_score >= 0.75:
708
+ niveau = "Élevée"
709
+ elif overall_score >= 0.55:
710
+ niveau = "Moyenne-Élevée"
711
+ elif overall_score >= 0.45:
712
+ niveau = "Moyenne"
713
+ elif overall_score >= 0.25:
714
+ niveau = "Faible-Moyenne"
715
+ else:
716
+ niveau = "Faible"
717
+
718
  report = {
719
  'idRapport': f"report_{int(datetime.datetime.now().timestamp())}",
720
  'informationEntree': input_data,
721
  'dateGeneration': datetime.datetime.now().isoformat(),
722
  'scoreCredibilite': round(overall_score, 2),
723
+ 'niveauCredibilite': niveau,
724
  'resumeAnalyse': "",
725
  'detailsScore': {
726
  'base': 0.5,
 
747
  },
748
  # [NEW] TREC Evidence section
749
  'evidences': evidences or [],
 
 
750
  'metadonnees': {}
751
  }
752
 
 
813
 
814
  return report
815
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
816
  def _get_score_factors(self, rule_results: Dict, nlp_results: Dict) -> List[Dict]:
817
  """Get list of factors that influenced the score (For UI)."""
818
  factors = []
 
973
  print("[SysCRED] Running NLP analysis...")
974
  nlp_results = self.nlp_analysis(cleaned_text)
975
 
976
+ # 6.5 [NER] Named Entity Recognition
977
+ ner_entities = {}
978
+ if self.ner_analyzer and cleaned_text:
979
+ try:
980
+ ner_entities = self.ner_analyzer.extract_entities(cleaned_text)
981
+ total = sum(len(v) for v in ner_entities.values() if isinstance(v, list))
982
+ print(f"[SysCRED] NER: {total} entites detectees")
983
+ except Exception as e:
984
+ print(f"[SysCRED] NER failed: {e}")
985
+
986
+ # 6.6 [E-E-A-T] Experience-Expertise-Authority-Trust scoring
987
+ eeat_scores = {}
988
+ if self.eeat_calculator:
989
+ try:
990
+ url_for_eeat = input_data if is_url else ""
991
+ domain_age_years = None
992
+ if external_data.domain_age_days:
993
+ domain_age_years = external_data.domain_age_days / 365.0
994
+
995
+ eeat_raw = self.eeat_calculator.calculate(
996
+ url=url_for_eeat,
997
+ text=cleaned_text,
998
+ nlp_analysis=nlp_results,
999
+ fact_checks=rule_results.get('fact_checking', []),
1000
+ domain_age_years=domain_age_years,
1001
+ has_https=input_data.startswith("https://") if is_url else False
1002
+ )
1003
+ eeat_scores = eeat_raw.to_dict() if hasattr(eeat_raw, 'to_dict') else (
1004
+ eeat_raw if isinstance(eeat_raw, dict) else vars(eeat_raw)
1005
+ )
1006
+ print(f"[SysCRED] E-E-A-T score: {eeat_scores.get('overall', 'N/A')}")
1007
+ except Exception as e:
1008
+ print(f"[SysCRED] E-E-A-T failed: {e}")
1009
+
1010
  # 7. Calculate score (Now includes GraphRAG context)
1011
  overall_score = self.calculate_overall_score(rule_results, nlp_results)
1012
  print(f"[SysCRED] ✓ Credibility score: {overall_score:.2f}")
 
1018
  graph_context=graph_context
1019
  )
1020
 
1021
+ # [NER + E-E-A-T] Always include in report (even if empty)
1022
+ report['ner_entities'] = ner_entities
1023
+ report['eeat_scores'] = eeat_scores
1024
+
1025
  # Add similar URIs to report for ontology linking
1026
  if similar_uris:
1027
  report['similar_claims_uris'] = similar_uris