DominiqueLayer committed on
Commit
6043bc8
·
1 Parent(s): ff19e9c

Add NER analyzer and EEAT calculator modules

Browse files
syscred/syscred/eeat_calculator.py ADDED
@@ -0,0 +1,466 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ E-E-A-T Metrics Calculator for SysCRED
5
+ ========================================
6
+ Calculates Google-style E-E-A-T metrics (Experience, Expertise, Authority, Trust).
7
+
8
+ These metrics mirror modern Google ranking signals:
9
+ - Experience: Domain age, content freshness
10
+ - Expertise: Author identification, depth of content
11
+ - Authority: PageRank simulation, citations/backlinks
12
+ - Trust: HTTPS, fact-checks, low bias score
13
+ """
14
+
15
+ from typing import Dict, Any, Optional, List
16
+ from dataclasses import dataclass
17
+ import re
18
+ from datetime import datetime
19
+ import logging
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
@dataclass
class EEATScore:
    """E-E-A-T score container.

    Holds the four component scores (each in [0, 1]) and derives the
    weighted overall score plus the percentage strings used for display.
    """
    experience: float  # 0-1
    expertise: float   # 0-1
    authority: float   # 0-1
    trust: float       # 0-1

    @property
    def overall(self) -> float:
        """Weighted average of all E-E-A-T components."""
        # Weights based on Google's emphasis (authority weighs the most).
        weights = {
            'experience': 0.15,
            'expertise': 0.25,
            'authority': 0.35,
            'trust': 0.25
        }
        return (
            self.experience * weights['experience'] +
            self.expertise * weights['expertise'] +
            self.authority * weights['authority'] +
            self.trust * weights['trust']
        )

    # Percentage accessors. Callers (e.g. score explanations and demo
    # output) read `score.experience_pct` directly; previously these
    # names existed only as keys of the dict returned by to_dict(), so
    # attribute access raised AttributeError.
    @property
    def experience_pct(self) -> str:
        """Experience as an integer percentage string, e.g. '73%'."""
        return f"{int(self.experience * 100)}%"

    @property
    def expertise_pct(self) -> str:
        """Expertise as an integer percentage string."""
        return f"{int(self.expertise * 100)}%"

    @property
    def authority_pct(self) -> str:
        """Authority as an integer percentage string."""
        return f"{int(self.authority * 100)}%"

    @property
    def trust_pct(self) -> str:
        """Trust as an integer percentage string."""
        return f"{int(self.trust * 100)}%"

    @property
    def overall_pct(self) -> str:
        """Overall score as an integer percentage string."""
        return f"{int(self.overall * 100)}%"

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        return {
            'experience': round(self.experience, 3),
            'expertise': round(self.expertise, 3),
            'authority': round(self.authority, 3),
            'trust': round(self.trust, 3),
            'overall': round(self.overall, 3),
            'experience_pct': self.experience_pct,
            'expertise_pct': self.expertise_pct,
            'authority_pct': self.authority_pct,
            'trust_pct': self.trust_pct,
            'overall_pct': self.overall_pct
        }
63
+
64
+
65
class EEATCalculator:
    """
    Calculate E-E-A-T metrics from various signals.

    Mirrors Google's quality rater guidelines:
    - Experience: Has the author demonstrated real experience?
    - Expertise: Is the content expert-level?
    - Authority: Is the source recognized as authoritative?
    - Trust: Is the source trustworthy?
    """

    # Known authoritative domains (value = authority prior in [0, 1]).
    AUTHORITATIVE_DOMAINS = {
        # News
        'lemonde.fr': 0.95,
        'lefigaro.fr': 0.90,
        'liberation.fr': 0.88,
        'nytimes.com': 0.95,
        'washingtonpost.com': 0.93,
        'theguardian.com': 0.92,
        'bbc.com': 0.94,
        'bbc.co.uk': 0.94,
        'reuters.com': 0.96,
        'apnews.com': 0.95,
        # Academic
        'nature.com': 0.98,
        'science.org': 0.98,
        'pubmed.ncbi.nlm.nih.gov': 0.97,
        'scholar.google.com': 0.85,
        # Government
        'gouv.fr': 0.90,
        'gov.uk': 0.90,
        'whitehouse.gov': 0.88,
        'europa.eu': 0.92,
        # Fact-checkers
        'snopes.com': 0.88,
        'factcheck.org': 0.90,
        'politifact.com': 0.88,
        'fullfact.org': 0.89,
        # Wikipedia (moderate authority)
        'wikipedia.org': 0.75,
        'fr.wikipedia.org': 0.75,
        'en.wikipedia.org': 0.75,
    }

    # Low-trust domains (misinformation sources); value = score cap.
    LOW_TRUST_DOMAINS = {
        'infowars.com': 0.1,
        'breitbart.com': 0.3,
        'naturalnews.com': 0.15,
        # Add more as needed
    }

    def __init__(self):
        """Initialize E-E-A-T calculator (stateless; nothing to configure)."""
        pass

    def calculate(
        self,
        url: str,
        text: str,
        nlp_analysis: Optional[Dict[str, Any]] = None,
        pagerank: Optional[float] = None,
        fact_checks: Optional[List[Dict]] = None,
        domain_age_years: Optional[float] = None,
        has_https: bool = True,
        author_identified: bool = False,
        seo_score: Optional[float] = None
    ) -> "EEATScore":
        """
        Calculate E-E-A-T scores from available signals.

        Args:
            url: Source URL
            text: Article text content
            nlp_analysis: NLP analysis results (sentiment, coherence, bias)
            pagerank: Simulated PageRank score (0-1)
            fact_checks: List of fact-check results
            domain_age_years: Domain age in years (from WHOIS)
            has_https: Whether site uses HTTPS
            author_identified: Whether author is clearly identified
            seo_score: SEO/technical quality score

        Returns:
            EEATScore with all component scores
        """
        domain = self._extract_domain(url)

        # Each component is computed independently from its own signals.
        experience = self._calculate_experience(domain_age_years, text, nlp_analysis)
        expertise = self._calculate_expertise(text, author_identified, nlp_analysis)
        authority = self._calculate_authority(domain, pagerank, seo_score)
        trust = self._calculate_trust(domain, has_https, fact_checks, nlp_analysis)

        return EEATScore(
            experience=experience,
            expertise=expertise,
            authority=authority,
            trust=trust
        )

    @staticmethod
    def _domain_matches(domain: str, known: str) -> bool:
        """Return True when *domain* is *known* itself or a subdomain of it.

        A plain substring test ('lemonde.fr' in domain) would let spoofed
        hosts such as 'fakelemonde.fr' inherit the real site's authority,
        so we require an exact match or a dot-separated suffix match.
        """
        return domain == known or domain.endswith('.' + known)

    def _extract_domain(self, url: str) -> str:
        """Extract the host part of *url*, lowercased, with 'www.' stripped.

        Falls back to the lowercased input when it does not look like an
        http(s) URL.
        """
        match = re.search(r'https?://(?:www\.)?([^/]+)', url)
        return match.group(1).lower() if match else url.lower()

    def _calculate_experience(
        self,
        domain_age_years: Optional[float],
        text: str,
        nlp_analysis: Optional[Dict]
    ) -> float:
        """
        Calculate the Experience score.

        Factors:
        - Domain age (longer = more experience; capped at 20 years)
        - Content depth (word count)
        - First-hand experience indicators in the text

        Note: nlp_analysis is currently unused here; the parameter is kept
        so all component calculators share a parallel signature.
        """
        score = 0.5  # Base score

        # Domain age contribution (max 0.3); 20 years of age maxes it out.
        if domain_age_years is not None:
            score += min(domain_age_years / 20, 1.0) * 0.3
        else:
            score += 0.15  # Unknown age: assume moderate

        # Content depth contribution (max 0.2)
        word_count = len(text.split()) if text else 0
        if word_count > 1000:
            score += 0.2
        elif word_count > 500:
            score += 0.15
        elif word_count > 200:
            score += 0.1

        # First-hand experience indicators (+0.03 per matching pattern)
        experience_indicators = [
            r'\b(j\'ai|je suis|nous avons|I have|we have|in my experience)\b',
            r'\b(interview|entretien|témoignage|witness|firsthand)\b',
            r'\b(sur place|on the ground|eyewitness)\b'
        ]
        for pattern in experience_indicators:
            if re.search(pattern, text, re.IGNORECASE):
                score += 0.03

        return min(score, 1.0)

    def _calculate_expertise(
        self,
        text: str,
        author_identified: bool,
        nlp_analysis: Optional[Dict]
    ) -> float:
        """
        Calculate the Expertise score.

        Factors:
        - Author identification
        - Citation of sources (references, experts, links)
        - Coherence (from NLP analysis when available)
        """
        score = 0.4  # Base score

        # Author identification (0.2)
        if author_identified:
            score += 0.2

        # Citation indicators (0.02 per match, capped at 0.2)
        citation_patterns = [
            r'\b(selon|according to|d\'après|source:)\b',
            r'\b(étude|study|research|rapport|report)\b',
            r'\b(expert|spécialiste|chercheur|professor|Dr\.)\b',
            r'\[([\d]+)\]',  # [1] style citations
            r'https?://[^\s]+'  # Links
        ]
        citation_count = sum(
            len(re.findall(pattern, text, re.IGNORECASE))
            for pattern in citation_patterns
        )
        score += min(citation_count * 0.02, 0.2)

        # Coherence from NLP analysis (max 0.2)
        if nlp_analysis and 'coherence' in nlp_analysis:
            coherence = nlp_analysis['coherence']
            if isinstance(coherence, dict):
                coherence = coherence.get('score', 0.5)
            score += coherence * 0.2
        else:
            score += 0.1  # Assume moderate coherence

        return min(score, 1.0)

    def _calculate_authority(
        self,
        domain: str,
        pagerank: Optional[float],
        seo_score: Optional[float]
    ) -> float:
        """
        Calculate the Authority score.

        Factors:
        - Known authoritative domain (exact or subdomain match)
        - PageRank simulation
        - SEO/technical quality
        """
        score = 0.3  # Base score

        # Known domain authority (lifts base up to 0.3 + 0.5*authority)
        for known_domain, authority in self.AUTHORITATIVE_DOMAINS.items():
            if self._domain_matches(domain, known_domain):
                score = max(score, authority * 0.5 + 0.3)
                break

        # Known low-trust domain caps the score
        for low_trust_domain, low_score in self.LOW_TRUST_DOMAINS.items():
            if self._domain_matches(domain, low_trust_domain):
                score = min(score, low_score)
                break

        # PageRank contribution (max 0.3)
        if pagerank is not None:
            score += pagerank * 0.3
        else:
            score += 0.15  # Assume moderate pagerank

        # SEO score contribution (max 0.2)
        if seo_score is not None:
            score += seo_score * 0.2
        else:
            score += 0.1

        return min(score, 1.0)

    def _calculate_trust(
        self,
        domain: str,
        has_https: bool,
        fact_checks: Optional[List[Dict]],
        nlp_analysis: Optional[Dict]
    ) -> float:
        """
        Calculate the Trust score.

        Factors:
        - HTTPS
        - Fact-check results (positive ratings add, negative subtract)
        - Bias score from NLP analysis (low bias = higher trust)
        - Known trustworthy / low-trust domain adjustments
        """
        score = 0.4  # Base score

        # HTTPS (0.1)
        if has_https:
            score += 0.1

        # Fact-check results (+0.2 if any positive, -0.3 if any negative)
        if fact_checks:
            positive_checks = sum(1 for fc in fact_checks
                                  if fc.get('rating', '').lower() in ['true', 'vrai', 'correct'])
            negative_checks = sum(1 for fc in fact_checks
                                  if fc.get('rating', '').lower() in ['false', 'faux', 'incorrect', 'pants-fire'])

            if positive_checks > 0:
                score += 0.2
            if negative_checks > 0:
                score -= 0.3

        # Bias score (max 0.2; inverted so low bias = high trust)
        if nlp_analysis:
            bias_data = nlp_analysis.get('bias_analysis', {})
            if isinstance(bias_data, dict):
                bias_score = bias_data.get('score', 0.3)
            else:
                bias_score = 0.3
            score += (1 - bias_score) * 0.2
        else:
            score += 0.1

        # Known trustworthy domain (+0.1)
        for known_domain in self.AUTHORITATIVE_DOMAINS:
            if self._domain_matches(domain, known_domain):
                score += 0.1
                break

        # Known low-trust domain (-0.3)
        for low_trust_domain in self.LOW_TRUST_DOMAINS:
            if self._domain_matches(domain, low_trust_domain):
                score -= 0.3
                break

        return max(min(score, 1.0), 0.0)

    def explain_score(self, eeat: "EEATScore", url: str) -> str:
        """
        Generate a human-readable (French) explanation of an E-E-A-T score.

        Args:
            eeat: EEATScore instance
            url: Source URL (currently informational; kept for API stability)

        Returns:
            Multi-line explanation string, one line per component
        """
        # Percentage strings come from to_dict(), so this works regardless
        # of whether the score object exposes attribute-level *_pct values.
        pct = eeat.to_dict()

        explanations = []

        # Experience
        if eeat.experience >= 0.8:
            explanations.append(f"✅ **Expérience élevée** ({pct['experience_pct']}): Source établie depuis longtemps")
        elif eeat.experience >= 0.5:
            explanations.append(f"🔶 **Expérience moyenne** ({pct['experience_pct']}): Source modérément établie")
        else:
            explanations.append(f"⚠️ **Expérience faible** ({pct['experience_pct']}): Source récente ou peu connue")

        # Expertise
        if eeat.expertise >= 0.8:
            explanations.append(f"✅ **Expertise élevée** ({pct['expertise_pct']}): Contenu approfondi avec citations")
        elif eeat.expertise >= 0.5:
            explanations.append(f"🔶 **Expertise moyenne** ({pct['expertise_pct']}): Contenu standard")
        else:
            explanations.append(f"⚠️ **Expertise faible** ({pct['expertise_pct']}): Manque de profondeur")

        # Authority
        if eeat.authority >= 0.8:
            explanations.append(f"✅ **Autorité élevée** ({pct['authority_pct']}): Source très citée et reconnue")
        elif eeat.authority >= 0.5:
            explanations.append(f"🔶 **Autorité moyenne** ({pct['authority_pct']}): Source modérément reconnue")
        else:
            explanations.append(f"⚠️ **Autorité faible** ({pct['authority_pct']}): Peu de citations externes")

        # Trust
        if eeat.trust >= 0.8:
            explanations.append(f"✅ **Confiance élevée** ({pct['trust_pct']}): Faits vérifiés, pas de biais")
        elif eeat.trust >= 0.5:
            explanations.append(f"🔶 **Confiance moyenne** ({pct['trust_pct']}): Quelques signaux de confiance")
        else:
            explanations.append(f"⚠️ **Confiance faible** ({pct['trust_pct']}): Prudence recommandée")

        return "\n".join(explanations)
432
+
433
+
434
# Demo / smoke test
if __name__ == "__main__":
    calc = EEATCalculator()

    test_url = "https://www.lemonde.fr/politique/article/2024/01/06/trump.html"
    test_text = """
    Selon une étude du chercheur Dr. Martin, l'insurrection du 6 janvier 2021
    au Capitol a été un événement marquant. Notre reporter sur place a témoigné
    des événements. Les experts politiques analysent les conséquences.
    """

    nlp_analysis = {
        'coherence': {'score': 0.8},
        'bias_analysis': {'score': 0.2}
    }

    eeat = calc.calculate(
        url=test_url,
        text=test_text,
        nlp_analysis=nlp_analysis,
        pagerank=0.7,
        has_https=True,
        author_identified=True
    )

    # Read the percentage strings from to_dict(); the raw dataclass fields
    # are floats, and to_dict() is the stable serialization surface.
    scores = eeat.to_dict()
    print("=== E-E-A-T Scores ===")
    print(f"Experience: {scores['experience_pct']}")
    print(f"Expertise: {scores['expertise_pct']}")
    print(f"Authority: {scores['authority_pct']}")
    print(f"Trust: {scores['trust_pct']}")
    print(f"Overall: {scores['overall_pct']}")
    print("\n=== Explanation ===")
    print(calc.explain_score(eeat, test_url))
syscred/syscred/ner_analyzer.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Named Entity Recognition (NER) Analyzer for SysCRED
5
+ ====================================================
6
+ Extracts named entities from text using spaCy.
7
+
8
+ Entities detected:
9
+ - PER: Persons (Donald Trump, Emmanuel Macron)
10
+ - ORG: Organizations (FBI, UN, Google)
11
+ - LOC: Locations (Paris, Capitol)
12
+ - DATE: Dates (January 6, 2021)
13
+ - MONEY: Amounts ($10 million)
14
+ - EVENT: Events (insurrection, election)
15
+ """
16
+
17
+ from typing import Dict, List, Any, Optional
18
+ import logging
19
+
20
+ # Try to import spaCy
21
+ try:
22
+ import spacy
23
+ from spacy.language import Language
24
+ HAS_SPACY = True
25
+ except ImportError:
26
+ HAS_SPACY = False
27
+ spacy = None
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
class NERAnalyzer:
    """
    Named Entity Recognition analyzer using spaCy.

    Supports French (fr_core_news_md) and English (en_core_web_md).
    Falls back to heuristic (regex) extraction if spaCy is not available.
    """

    # Entity type mappings for display: French/English names + UI emoji.
    ENTITY_LABELS = {
        'PER': {'fr': 'Personne', 'en': 'Person', 'emoji': '👤'},
        'PERSON': {'fr': 'Personne', 'en': 'Person', 'emoji': '👤'},
        'ORG': {'fr': 'Organisation', 'en': 'Organization', 'emoji': '🏢'},
        'LOC': {'fr': 'Lieu', 'en': 'Location', 'emoji': '📍'},
        'GPE': {'fr': 'Lieu géopolitique', 'en': 'Geopolitical', 'emoji': '🌍'},
        'DATE': {'fr': 'Date', 'en': 'Date', 'emoji': '📅'},
        'TIME': {'fr': 'Heure', 'en': 'Time', 'emoji': '⏰'},
        'MONEY': {'fr': 'Montant', 'en': 'Money', 'emoji': '💰'},
        'PERCENT': {'fr': 'Pourcentage', 'en': 'Percent', 'emoji': '📊'},
        'EVENT': {'fr': 'Événement', 'en': 'Event', 'emoji': '📰'},
        'PRODUCT': {'fr': 'Produit', 'en': 'Product', 'emoji': '📦'},
        'LAW': {'fr': 'Loi', 'en': 'Law', 'emoji': '⚖️'},
        'NORP': {'fr': 'Groupe', 'en': 'Group', 'emoji': '👥'},
        'MISC': {'fr': 'Divers', 'en': 'Miscellaneous', 'emoji': '🔖'},
    }

    def __init__(self, model_name: str = "fr_core_news_md", fallback: bool = True):
        """
        Initialize NER analyzer.

        Args:
            model_name: spaCy model to load (fr_core_news_md, en_core_web_md)
            fallback: If True, use regex heuristics when spaCy (or the
                requested model) is unavailable instead of extracting nothing
        """
        self.model_name = model_name
        self.fallback = fallback
        self.nlp = None               # loaded spaCy pipeline, or None
        self.use_heuristics = False   # True => regex fallback path is active

        if HAS_SPACY:
            try:
                self.nlp = spacy.load(model_name)
                logger.info(f"[NER] Loaded spaCy model: {model_name}")
            except OSError as e:
                # Raised when the model package is not downloaded/installed.
                logger.warning(f"[NER] Could not load model {model_name}: {e}")
                if fallback:
                    self.use_heuristics = True
                    logger.info("[NER] Using heuristic entity extraction")
        else:
            if fallback:
                self.use_heuristics = True
                logger.info("[NER] spaCy not installed. Using heuristic extraction")

    def extract_entities(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
        """
        Extract named entities from text.

        Args:
            text: Input text to analyze

        Returns:
            Dictionary mapping entity types to lists of entities.
            Each entity has: text, start, end, label, label_display,
            emoji, confidence. Empty dict for blank input or when no
            extraction backend is available.
        """
        if not text or not text.strip():
            return {}

        if self.nlp:
            return self._extract_with_spacy(text)
        if self.use_heuristics:
            return self._extract_with_heuristics(text)
        return {}

    def _label_info(self, label: str) -> Dict[str, str]:
        """Display metadata (localized names + emoji) for an entity label."""
        return self.ENTITY_LABELS.get(label, {
            'fr': label,
            'en': label,
            'emoji': '🔖'
        })

    def _extract_with_spacy(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
        """Extract entities using the loaded spaCy pipeline."""
        doc = self.nlp(text)
        entities: Dict[str, List[Dict[str, Any]]] = {}
        # Per-label set of lowercased texts: O(1) duplicate detection
        # instead of rescanning the entity list for every new match.
        seen: Dict[str, set] = {}

        for ent in doc.ents:
            label = ent.label_
            key = ent.text.lower()
            if key in seen.setdefault(label, set()):
                continue  # skip case-insensitive duplicates
            seen[label].add(key)

            info = self._label_info(label)
            entities.setdefault(label, []).append({
                'text': ent.text,
                'start': ent.start_char,
                'end': ent.end_char,
                'label': label,
                'label_display': info.get('fr', label),
                'emoji': info.get('emoji', '🔖'),
                'confidence': 0.85  # spaCy doesn't provide confidence by default
            })

        return entities

    def _extract_with_heuristics(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
        """
        Fallback heuristic entity extraction.

        Uses regex pattern matching for a fixed list of well-known names
        (political figures, organizations, places) plus date/money/percent
        formats. Lower confidence than spaCy-based extraction.
        """
        import re
        entities: Dict[str, List[Dict[str, Any]]] = {}
        seen: Dict[str, set] = {}  # per-label lowercased texts (dedup)

        # Common patterns
        patterns = {
            'PER': [
                # Known political figures
                r'\b(Donald Trump|Joe Biden|Emmanuel Macron|Hillary Clinton|Barack Obama|'
                r'Vladimir Putin|Angela Merkel|Justin Trudeau|Boris Johnson)\b',
            ],
            'ORG': [
                r'\b(FBI|CIA|NSA|ONU|NATO|OTAN|Google|Facebook|Twitter|Meta|'
                r'Amazon|Microsoft|Apple|CNN|BBC|Le Monde|New York Times|'
                r'Parti Républicain|Parti Démocrate|Republican Party|Democratic Party)\b',
            ],
            'LOC': [
                r'\b(Capitol|White House|Maison Blanche|Kremlin|Élysée|Pentagon|'
                r'New York|Washington|Paris|Londres|Moscou|Berlin|Beijing)\b',
            ],
            'DATE': [
                r'\b(\d{1,2}\s+(janvier|février|mars|avril|mai|juin|juillet|août|'
                r'septembre|octobre|novembre|décembre)\s+\d{4})\b',
                r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\b',
                r'\b(January|February|March|April|May|June|July|August|'
                r'September|October|November|December)\s+\d{1,2},?\s+\d{4}\b',
            ],
            'MONEY': [
                r'\$[\d,]+(?:\.\d{2})?(?:\s*(?:million|billion|trillion))?',
                r'[\d,]+(?:\.\d{2})?\s*(?:dollars?|euros?|€|\$)',
                r'[\d,]+\s*(?:million|milliard)s?\s*(?:de\s+)?(?:dollars?|euros?)',
            ],
            'PERCENT': [
                r'\b\d+(?:\.\d+)?%',
                r'\b\d+(?:\.\d+)?\s*pour\s*cent',
                r'\b\d+(?:\.\d+)?\s*percent',
            ],
        }

        for label, pattern_list in patterns.items():
            info = self._label_info(label)

            for pattern in pattern_list:
                for match in re.finditer(pattern, text, re.IGNORECASE):
                    matched_text = match.group()
                    key = matched_text.lower()
                    if key in seen.setdefault(label, set()):
                        continue  # skip case-insensitive duplicates
                    seen[label].add(key)

                    entities.setdefault(label, []).append({
                        'text': matched_text,
                        'start': match.start(),
                        'end': match.end(),
                        'label': label,
                        'label_display': info.get('fr', label),
                        'emoji': info.get('emoji', '🔖'),
                        'confidence': 0.70  # Lower confidence for heuristics
                    })

        return entities

    def get_entity_summary(self, entities: Dict[str, List[Dict[str, Any]]]) -> str:
        """
        Generate a human-readable summary of extracted entities.

        Args:
            entities: Dictionary of entities from extract_entities()

        Returns:
            One line per entity type ("<emoji> <type>: a, b, ..."),
            listing at most 5 entities per type.
        """
        if not entities:
            return "Aucune entité nommée détectée."

        lines = []
        for label, ent_list in entities.items():
            info = self._label_info(label)
            names = ', '.join(e['text'] for e in ent_list[:5])  # cap at 5
            lines.append(f"{info.get('emoji', '🔖')} {info.get('fr', label)}: {names}")

        return "\n".join(lines)

    def to_frontend_format(self, entities: Dict[str, List[Dict[str, Any]]]) -> List[Dict]:
        """
        Convert entities to a flat, frontend-friendly format.

        Args:
            entities: Dictionary of entities from extract_entities()

        Returns:
            List of entity dicts (text, type, type_display, emoji,
            confidence, confidence_pct) sorted by confidence descending.
        """
        result = [
            {
                'text': ent['text'],
                'type': ent['label'],
                'type_display': ent.get('label_display', ent['label']),
                'emoji': ent.get('emoji', '🔖'),
                'confidence': ent.get('confidence', 0.5),
                'confidence_pct': f"{int(ent.get('confidence', 0.5) * 100)}%"
            }
            for ent_list in entities.values()
            for ent in ent_list
        ]
        result.sort(key=lambda e: e['confidence'], reverse=True)
        return result
254
+
255
+
256
# Lazily-created module-level singleton so callers can share one analyzer.
_ner_analyzer: Optional[NERAnalyzer] = None


def get_ner_analyzer(model_name: str = "fr_core_news_md") -> NERAnalyzer:
    """Return the shared NERAnalyzer, creating it on first use.

    Note: model_name is only honoured by the call that actually creates
    the instance; later calls return whatever instance already exists.
    """
    global _ner_analyzer
    analyzer = _ner_analyzer
    if analyzer is None:
        analyzer = NERAnalyzer(model_name=model_name, fallback=True)
        _ner_analyzer = analyzer
    return analyzer
266
+
267
+
268
# Quick manual smoke test: run this module directly to see what the
# heuristic (or spaCy, if installed) extractor finds in a sample text.
if __name__ == "__main__":
    analyzer = NERAnalyzer(fallback=True)

    test_text = """
    Donald Trump a affirmé que l'insurrection du 6 janvier 2021 au Capitol n'est jamais arrivée.
    Le FBI enquête sur les événements. Le président Joe Biden a condamné ces déclarations à Washington.
    Les dégâts sont estimés à 30 millions de dollars.
    """

    entities = analyzer.extract_entities(test_text)

    print("=== Entités détectées ===")
    print(analyzer.get_entity_summary(entities))

    print("\n=== Format Frontend ===")
    for entry in analyzer.to_frontend_format(entities):
        print(f" {entry['emoji']} {entry['text']} ({entry['type_display']}, {entry['confidence_pct']})")