D Ф m i И i q ц e L Ф y e r committed on
Commit
fdaeab5
·
2 Parent(s): 6043bc8 1cafc3b

Merge: Add NER/EEAT modules + requirements-distilled.txt

Browse files
syscred/requirements-distilled.txt ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SysCRED - Optimized Requirements with Distilled Models
2
+ # (c) Dominique S. Loyer
3
+ # Uses DISTILLED models for faster loading and lower memory
4
+
5
+ # === Core Dependencies ===
6
+ requests>=2.28.0
7
+ beautifulsoup4>=4.11.0
8
+ python-whois>=0.8.0
9
+
10
+ # === RDF/Ontology ===
11
+ rdflib>=6.0.0
12
+
13
+ # === Machine Learning (CPU-only) ===
14
+ --extra-index-url https://download.pytorch.org/whl/cpu
15
+ torch>=2.0.0
16
+ transformers>=4.30.0
17
+ sentence-transformers>=2.2.0
18
+
19
+ # === Data ===
20
+ numpy>=1.24.0
21
+ pandas>=2.0.0
22
+
23
+ # === Explainability ===
24
+ lime>=0.2.0
25
+
26
+ # === NLP ===
27
+ spacy>=3.5.0
28
+
29
+ # === Web Backend ===
30
+ flask>=2.3.0
31
+ flask-cors>=4.0.0
32
+ python-dotenv>=1.0.0
33
+ gunicorn>=20.1.0
34
+ flask-sqlalchemy>=3.1.0
35
+ scikit-learn>=1.3.0
36
+ scipy>=1.11.0
syscred/syscred/eeat_calculator.py DELETED
@@ -1,466 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- E-E-A-T Metrics Calculator for SysCRED
5
- ========================================
6
- Calculates Google-style E-E-A-T metrics (Experience, Expertise, Authority, Trust).
7
-
8
- These metrics mirror modern Google ranking signals:
9
- - Experience: Domain age, content freshness
10
- - Expertise: Author identification, depth of content
11
- - Authority: PageRank simulation, citations/backlinks
12
- - Trust: HTTPS, fact-checks, low bias score
13
- """
14
-
15
- from typing import Dict, Any, Optional, List
16
- from dataclasses import dataclass
17
- import re
18
- from datetime import datetime
19
- import logging
20
-
21
- logger = logging.getLogger(__name__)
22
-
23
-
24
@dataclass
class EEATScore:
    """Container for the four E-E-A-T component scores.

    Besides the raw component values, the class exposes a weighted
    ``overall`` score and a ``*_pct`` string property for every score
    (e.g. ``experience_pct`` -> ``"57%"``) so callers can display them
    without going through :meth:`to_dict`.
    """
    experience: float  # 0-1
    expertise: float   # 0-1
    authority: float   # 0-1
    trust: float       # 0-1

    @staticmethod
    def _as_pct(value: float) -> str:
        """Format a 0-1 score as a whole-number percentage string."""
        # round() rather than int(): int(0.57 * 100) truncates the float
        # 56.99999999999999 down to 56 and would display "56%".
        return f"{round(value * 100)}%"

    @property
    def overall(self) -> float:
        """Weighted average of all E-E-A-T components."""
        # Weights based on Google's emphasis; they sum to 1.0.
        weights = {
            'experience': 0.15,
            'expertise': 0.25,
            'authority': 0.35,
            'trust': 0.25,
        }
        return (
            self.experience * weights['experience'] +
            self.expertise * weights['expertise'] +
            self.authority * weights['authority'] +
            self.trust * weights['trust']
        )

    @property
    def experience_pct(self) -> str:
        """Experience score as a percentage string."""
        return self._as_pct(self.experience)

    @property
    def expertise_pct(self) -> str:
        """Expertise score as a percentage string."""
        return self._as_pct(self.expertise)

    @property
    def authority_pct(self) -> str:
        """Authority score as a percentage string."""
        return self._as_pct(self.authority)

    @property
    def trust_pct(self) -> str:
        """Trust score as a percentage string."""
        return self._as_pct(self.trust)

    @property
    def overall_pct(self) -> str:
        """Weighted overall score as a percentage string."""
        return self._as_pct(self.overall)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization.

        Returns:
            Mapping with each score rounded to 3 decimals plus the
            matching ``*_pct`` display strings.
        """
        return {
            'experience': round(self.experience, 3),
            'expertise': round(self.expertise, 3),
            'authority': round(self.authority, 3),
            'trust': round(self.trust, 3),
            'overall': round(self.overall, 3),
            'experience_pct': self.experience_pct,
            'expertise_pct': self.expertise_pct,
            'authority_pct': self.authority_pct,
            'trust_pct': self.trust_pct,
            'overall_pct': self.overall_pct,
        }
63
-
64
-
65
class EEATCalculator:
    """
    Calculate E-E-A-T metrics from various signals.

    Mirrors Google's quality rater guidelines:
    - Experience: Has the author demonstrated real experience?
    - Expertise: Is the content expert-level?
    - Authority: Is the source recognized as authoritative?
    - Trust: Is the source trustworthy?

    Domain reputation is looked up by exact host or parent-domain match
    (``news.bbc.com`` matches ``bbc.com``; ``notbbc.com.evil.org`` does not).
    """

    # Known authoritative domains
    AUTHORITATIVE_DOMAINS = {
        # News
        'lemonde.fr': 0.95,
        'lefigaro.fr': 0.90,
        'liberation.fr': 0.88,
        'nytimes.com': 0.95,
        'washingtonpost.com': 0.93,
        'theguardian.com': 0.92,
        'bbc.com': 0.94,
        'bbc.co.uk': 0.94,
        'reuters.com': 0.96,
        'apnews.com': 0.95,
        # Academic
        'nature.com': 0.98,
        'science.org': 0.98,
        'pubmed.ncbi.nlm.nih.gov': 0.97,
        'scholar.google.com': 0.85,
        # Government
        'gouv.fr': 0.90,
        'gov.uk': 0.90,
        'whitehouse.gov': 0.88,
        'europa.eu': 0.92,
        # Fact-checkers
        'snopes.com': 0.88,
        'factcheck.org': 0.90,
        'politifact.com': 0.88,
        'fullfact.org': 0.89,
        # Wikipedia (moderate authority)
        'wikipedia.org': 0.75,
        'fr.wikipedia.org': 0.75,
        'en.wikipedia.org': 0.75,
    }

    # Low-trust domains (misinformation sources)
    LOW_TRUST_DOMAINS = {
        'infowars.com': 0.1,
        'breitbart.com': 0.3,
        'naturalnews.com': 0.15,
        # Add more as needed
    }

    # Fact-check ratings counted as confirming / refuting (compared lowercase)
    _POSITIVE_RATINGS = frozenset({'true', 'vrai', 'correct'})
    _NEGATIVE_RATINGS = frozenset({'false', 'faux', 'incorrect', 'pants-fire'})

    # First-hand experience indicators (French + English)
    _EXPERIENCE_PATTERNS = (
        r'\b(j\'ai|je suis|nous avons|I have|we have|in my experience)\b',
        r'\b(interview|entretien|témoignage|witness|firsthand)\b',
        r'\b(sur place|on the ground|eyewitness)\b',
    )

    # Citation indicators. "source:" and "Dr." end with punctuation, so they
    # must not be followed by \b: a word boundary after ':' or '.' only
    # exists when a word character follows immediately, which would make
    # "Dr. Martin" and "source: AFP" unmatchable.
    _CITATION_PATTERNS = (
        r'\b(?:selon|according to|d\'après)\b|\bsource:',
        r'\b(?:étude|study|research|rapport|report)\b',
        r'\b(?:expert|spécialiste|chercheur|professor)\b|\bDr\.',
        r'\[\d+\]',         # [1] style citations
        r'https?://[^\s]+'  # Links
    )

    # (score attribute, French label, high / medium / low explanation)
    _EXPLANATIONS = (
        ('experience', 'Expérience',
         "Source établie depuis longtemps",
         "Source modérément établie",
         "Source récente ou peu connue"),
        ('expertise', 'Expertise',
         "Contenu approfondi avec citations",
         "Contenu standard",
         "Manque de profondeur"),
        ('authority', 'Autorité',
         "Source très citée et reconnue",
         "Source modérément reconnue",
         "Peu de citations externes"),
        ('trust', 'Confiance',
         "Faits vérifiés, pas de biais",
         "Quelques signaux de confiance",
         "Prudence recommandée"),
    )

    def __init__(self):
        """Initialize E-E-A-T calculator (no per-instance state)."""
        pass

    def calculate(
        self,
        url: str,
        text: str,
        nlp_analysis: Optional[Dict[str, Any]] = None,
        pagerank: Optional[float] = None,
        fact_checks: Optional[List[Dict]] = None,
        domain_age_years: Optional[float] = None,
        has_https: bool = True,
        author_identified: bool = False,
        seo_score: Optional[float] = None
    ) -> "EEATScore":
        """
        Calculate E-E-A-T scores from available signals.

        Args:
            url: Source URL
            text: Article text content
            nlp_analysis: NLP analysis results (sentiment, coherence, bias)
            pagerank: Simulated PageRank score (0-1)
            fact_checks: List of fact-check results
            domain_age_years: Domain age in years (from WHOIS)
            has_https: Whether site uses HTTPS
            author_identified: Whether author is clearly identified
            seo_score: SEO/technical quality score

        Returns:
            EEATScore with all component scores
        """
        domain = self._extract_domain(url)

        return EEATScore(
            experience=self._calculate_experience(
                domain_age_years, text, nlp_analysis
            ),
            expertise=self._calculate_expertise(
                text, author_identified, nlp_analysis
            ),
            authority=self._calculate_authority(domain, pagerank, seo_score),
            trust=self._calculate_trust(
                domain, has_https, fact_checks, nlp_analysis
            ),
        )

    def _extract_domain(self, url: str) -> str:
        """Extract the lowercase host from a URL.

        Userinfo, port, path and query are dropped, as is a leading
        ``www.``. Scheme-less input such as ``lemonde.fr/page`` is accepted.
        If no host can be parsed, the stripped, lowercased input is returned.
        """
        from urllib.parse import urlsplit

        raw = (url or "").strip()
        # urlsplit only recognises a network location after "//".
        candidate = raw if "://" in raw else "//" + raw
        try:
            host = urlsplit(candidate).hostname or ""
        except ValueError:  # e.g. malformed IPv6 brackets
            host = ""
        if not host:
            return raw.lower()

        host = host.lower().rstrip(".")
        if host.startswith("www."):
            host = host[4:]
        return host

    @staticmethod
    def _lookup_domain(domain: str, table: Dict[str, float]) -> Optional[float]:
        """Return the table value for *domain* or its closest parent domain.

        A table entry matches when it equals the host or is a dot-separated
        suffix of it. The longest (most specific) matching entry wins.
        Returns None when nothing matches.
        """
        host = (domain or "").lower()
        best_key = None
        for known in table:
            if host == known or host.endswith("." + known):
                if best_key is None or len(known) > len(best_key):
                    best_key = known
        return table[best_key] if best_key is not None else None

    def _calculate_experience(
        self,
        domain_age_years: Optional[float],
        text: str,
        nlp_analysis: Optional[Dict]
    ) -> float:
        """
        Calculate Experience score.

        Factors:
        - Domain age (longer = more experience), up to 0.3
        - Content length as a proxy for depth, up to 0.2
        - First-hand experience indicators in text, 0.03 per pattern family

        ``nlp_analysis`` is accepted for signature symmetry but not used.
        """
        text = text or ""  # tolerate None so the regex search cannot crash
        score = 0.5  # Base score

        # Domain age contribution (max 0.3); 20 years = max
        if domain_age_years is not None:
            age_ratio = max(0.0, min(domain_age_years / 20, 1.0))
            score += age_ratio * 0.3
        else:
            score += 0.15  # Assume moderate age

        # Content depth contribution (max 0.2)
        word_count = len(text.split())
        if word_count > 1000:
            score += 0.2
        elif word_count > 500:
            score += 0.15
        elif word_count > 200:
            score += 0.1

        # First-hand experience indicators (max ~0.1)
        for pattern in self._EXPERIENCE_PATTERNS:
            if re.search(pattern, text, re.IGNORECASE):
                score += 0.03

        return min(score, 1.0)

    def _calculate_expertise(
        self,
        text: str,
        author_identified: bool,
        nlp_analysis: Optional[Dict]
    ) -> float:
        """
        Calculate Expertise score.

        Factors:
        - Author identification (0.2)
        - Citation of sources (0.02 per hit, max 0.2)
        - Coherence from NLP analysis (up to 0.2; 0.1 assumed if missing)
        """
        text = text or ""
        score = 0.4  # Base score

        if author_identified:
            score += 0.2

        citation_count = sum(
            len(re.findall(pattern, text, re.IGNORECASE))
            for pattern in self._CITATION_PATTERNS
        )
        score += min(citation_count * 0.02, 0.2)

        # Coherence may be a bare number or a dict carrying a 'score' key.
        coherence = None
        if nlp_analysis and 'coherence' in nlp_analysis:
            coherence = nlp_analysis['coherence']
            if isinstance(coherence, dict):
                coherence = coherence.get('score', 0.5)
        if isinstance(coherence, (int, float)):
            score += coherence * 0.2
        else:
            score += 0.1  # Assume moderate coherence

        return min(score, 1.0)

    def _calculate_authority(
        self,
        domain: str,
        pagerank: Optional[float],
        seo_score: Optional[float]
    ) -> float:
        """
        Calculate Authority score.

        Factors:
        - Known authoritative domain (raises the base)
        - Known low-trust domain (caps the base)
        - PageRank simulation (max 0.3; 0.15 assumed if missing)
        - SEO/technical quality (max 0.2; 0.1 assumed if missing)
        """
        score = 0.3  # Base score

        known_authority = self._lookup_domain(domain, self.AUTHORITATIVE_DOMAINS)
        if known_authority is not None:
            score = max(score, known_authority * 0.5 + 0.3)

        low_trust = self._lookup_domain(domain, self.LOW_TRUST_DOMAINS)
        if low_trust is not None:
            score = min(score, low_trust)

        score += pagerank * 0.3 if pagerank is not None else 0.15
        score += seo_score * 0.2 if seo_score is not None else 0.1

        return max(min(score, 1.0), 0.0)

    def _calculate_trust(
        self,
        domain: str,
        has_https: bool,
        fact_checks: Optional[List[Dict]],
        nlp_analysis: Optional[Dict]
    ) -> float:
        """
        Calculate Trust score.

        Factors:
        - HTTPS (+0.1)
        - Fact-check results (+0.2 if any positive, -0.3 if any negative)
        - Bias score (max 0.2, lower bias = higher trust)
        - Known trustworthy domain (+0.1) / low-trust domain (-0.3)
        """
        score = 0.4  # Base score

        if has_https:
            score += 0.1

        if fact_checks:
            # A missing or None rating is treated as "no verdict".
            ratings = [
                str(fc.get('rating') or '').lower()
                for fc in fact_checks
                if isinstance(fc, dict)
            ]
            if any(r in self._POSITIVE_RATINGS for r in ratings):
                score += 0.2
            if any(r in self._NEGATIVE_RATINGS for r in ratings):
                score -= 0.3

        if nlp_analysis:
            bias_data = nlp_analysis.get('bias_analysis', {})
            bias_score = bias_data.get('score', 0.3) if isinstance(bias_data, dict) else 0.3
            if not isinstance(bias_score, (int, float)):
                bias_score = 0.3
            # Invert: low bias = high trust contribution
            score += (1 - bias_score) * 0.2
        else:
            score += 0.1

        if self._lookup_domain(domain, self.AUTHORITATIVE_DOMAINS) is not None:
            score += 0.1

        if self._lookup_domain(domain, self.LOW_TRUST_DOMAINS) is not None:
            score -= 0.3

        return max(min(score, 1.0), 0.0)

    def explain_score(self, eeat: "EEATScore", url: str) -> str:
        """
        Generate human-readable explanation of E-E-A-T score.

        Args:
            eeat: EEATScore instance
            url: Source URL (kept for interface compatibility; not used)

        Returns:
            One line per component (in French), joined with newlines
        """
        # Percent strings come from to_dict(), which always provides the
        # '<component>_pct' keys.
        formatted = eeat.to_dict()

        lines = []
        for attr, label, high_msg, mid_msg, low_msg in self._EXPLANATIONS:
            value = getattr(eeat, attr)
            pct = formatted[f"{attr}_pct"]
            if value >= 0.8:
                lines.append(f"✅ **{label} élevée** ({pct}): {high_msg}")
            elif value >= 0.5:
                lines.append(f"🔶 **{label} moyenne** ({pct}): {mid_msg}")
            else:
                lines.append(f"⚠️ **{label} faible** ({pct}): {low_msg}")

        return "\n".join(lines)
432
-
433
-
434
# Manual smoke test: score a sample French news article and print the result.
if __name__ == "__main__":
    calc = EEATCalculator()

    test_url = "https://www.lemonde.fr/politique/article/2024/01/06/trump.html"
    test_text = """
    Selon une étude du chercheur Dr. Martin, l'insurrection du 6 janvier 2021
    au Capitol a été un événement marquant. Notre reporter sur place a témoigné
    des événements. Les experts politiques analysent les conséquences.
    """

    nlp_analysis = {
        'coherence': {'score': 0.8},
        'bias_analysis': {'score': 0.2}
    }

    eeat = calc.calculate(
        url=test_url,
        text=test_text,
        nlp_analysis=nlp_analysis,
        pagerank=0.7,
        has_https=True,
        author_identified=True
    )

    # The formatted percentages are read from to_dict(), which provides the
    # '*_pct' keys.
    scores = eeat.to_dict()

    print("=== E-E-A-T Scores ===")
    print(f"Experience: {scores['experience_pct']}")
    print(f"Expertise: {scores['expertise_pct']}")
    print(f"Authority: {scores['authority_pct']}")
    print(f"Trust: {scores['trust_pct']}")
    print(f"Overall: {scores['overall_pct']}")
    print("\n=== Explanation ===")
    print(calc.explain_score(eeat, test_url))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
syscred/syscred/ner_analyzer.py DELETED
@@ -1,283 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- Named Entity Recognition (NER) Analyzer for SysCRED
5
- ====================================================
6
- Extracts named entities from text using spaCy.
7
-
8
- Entities detected:
9
- - PER: Persons (Donald Trump, Emmanuel Macron)
10
- - ORG: Organizations (FBI, UN, Google)
11
- - LOC: Locations (Paris, Capitol)
12
- - DATE: Dates (January 6, 2021)
13
- - MONEY: Amounts ($10 million)
14
- - EVENT: Events (insurrection, election)
15
- """
16
-
17
- from typing import Dict, List, Any, Optional
18
- import logging
19
-
20
- # Try to import spaCy
21
- try:
22
- import spacy
23
- from spacy.language import Language
24
- HAS_SPACY = True
25
- except ImportError:
26
- HAS_SPACY = False
27
- spacy = None
28
-
29
- logger = logging.getLogger(__name__)
30
-
31
-
32
class NERAnalyzer:
    """
    Named Entity Recognition analyzer using spaCy.

    Supports French (fr_core_news_md) and English (en_core_web_md).
    Falls back to heuristic (regex) extraction if spaCy or the requested
    model is not available and ``fallback`` is enabled.
    """

    # Entity type mappings for display
    ENTITY_LABELS = {
        'PER': {'fr': 'Personne', 'en': 'Person', 'emoji': '👤'},
        'PERSON': {'fr': 'Personne', 'en': 'Person', 'emoji': '👤'},
        'ORG': {'fr': 'Organisation', 'en': 'Organization', 'emoji': '🏢'},
        'LOC': {'fr': 'Lieu', 'en': 'Location', 'emoji': '📍'},
        'GPE': {'fr': 'Lieu géopolitique', 'en': 'Geopolitical', 'emoji': '🌍'},
        'DATE': {'fr': 'Date', 'en': 'Date', 'emoji': '📅'},
        'TIME': {'fr': 'Heure', 'en': 'Time', 'emoji': '⏰'},
        'MONEY': {'fr': 'Montant', 'en': 'Money', 'emoji': '💰'},
        'PERCENT': {'fr': 'Pourcentage', 'en': 'Percent', 'emoji': '📊'},
        'EVENT': {'fr': 'Événement', 'en': 'Event', 'emoji': '📰'},
        'PRODUCT': {'fr': 'Produit', 'en': 'Product', 'emoji': '📦'},
        'LAW': {'fr': 'Loi', 'en': 'Law', 'emoji': '⚖️'},
        'NORP': {'fr': 'Groupe', 'en': 'Group', 'emoji': '👥'},
        'MISC': {'fr': 'Divers', 'en': 'Miscellaneous', 'emoji': '🔖'},
    }

    # Fixed confidence values: the spaCy path does not read a per-entity
    # score from the model, and heuristics are trusted less than the model.
    _SPACY_CONFIDENCE = 0.85
    _HEURISTIC_CONFIDENCE = 0.70

    # Regex patterns for the heuristic fallback, applied case-insensitively.
    # Numeric patterns start with \d so a stray comma is never an amount.
    _HEURISTIC_PATTERNS = {
        'PER': (
            # Known political figures
            r'\b(Donald Trump|Joe Biden|Emmanuel Macron|Hillary Clinton|Barack Obama|'
            r'Vladimir Putin|Angela Merkel|Justin Trudeau|Boris Johnson)\b',
        ),
        'ORG': (
            r'\b(FBI|CIA|NSA|ONU|NATO|OTAN|Google|Facebook|Twitter|Meta|'
            r'Amazon|Microsoft|Apple|CNN|BBC|Le Monde|New York Times|'
            r'Parti Républicain|Parti Démocrate|Republican Party|Democratic Party)\b',
        ),
        'LOC': (
            r'\b(Capitol|White House|Maison Blanche|Kremlin|Élysée|Pentagon|'
            r'New York|Washington|Paris|Londres|Moscou|Berlin|Beijing)\b',
        ),
        'DATE': (
            r'\b(\d{1,2}\s+(janvier|février|mars|avril|mai|juin|juillet|août|'
            r'septembre|octobre|novembre|décembre)\s+\d{4})\b',
            r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\b',
            r'\b(January|February|March|April|May|June|July|August|'
            r'September|October|November|December)\s+\d{1,2},?\s+\d{4}\b',
        ),
        'MONEY': (
            r'\$\d[\d,]*(?:\.\d{2})?(?:\s*(?:million|billion|trillion))?',
            r'\d[\d,]*(?:\.\d{2})?\s*(?:dollars?|euros?|€|\$)',
            r'\d[\d,]*\s*(?:million|milliard)s?\s*(?:de\s+)?(?:dollars?|euros?)',
        ),
        'PERCENT': (
            r'\b\d+(?:\.\d+)?%',
            r'\b\d+(?:\.\d+)?\s*pour\s*cent',
            r'\b\d+(?:\.\d+)?\s*percent',
        ),
    }

    def __init__(self, model_name: str = "fr_core_news_md", fallback: bool = True):
        """
        Initialize NER analyzer.

        Args:
            model_name: spaCy model to load (fr_core_news_md, en_core_web_md)
            fallback: If True, use heuristics when spaCy or the model is
                unavailable; if False, extraction returns no entities then
        """
        self.model_name = model_name
        self.fallback = fallback
        self.nlp = None
        self.use_heuristics = False

        if HAS_SPACY:
            try:
                self.nlp = spacy.load(model_name)
                logger.info("[NER] Loaded spaCy model: %s", model_name)
            except OSError as e:
                # Raised by spaCy when the model package is not installed.
                logger.warning("[NER] Could not load model %s: %s", model_name, e)
                if fallback:
                    self.use_heuristics = True
                    logger.info("[NER] Using heuristic entity extraction")
        elif fallback:
            self.use_heuristics = True
            logger.info("[NER] spaCy not installed. Using heuristic extraction")

    def extract_entities(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
        """
        Extract named entities from text.

        Args:
            text: Input text to analyze

        Returns:
            Dictionary mapping entity types to lists of entities.
            Each entity has: text, start, end, label, label_display, emoji,
            confidence. Empty dict for blank input or when no backend is
            available.
        """
        if not text or not text.strip():
            return {}

        if self.nlp:
            return self._extract_with_spacy(text)
        if self.use_heuristics:
            return self._extract_with_heuristics(text)
        return {}

    def _add_entity(
        self,
        entities: Dict[str, List[Dict[str, Any]]],
        seen: Dict[str, set],
        label: str,
        text: str,
        start: int,
        end: int,
        confidence: float
    ) -> None:
        """Append an entity under *label* unless its text was already seen.

        Duplicates are detected per label, case-insensitively; the first
        occurrence wins. ``seen`` holds the lowercased texts per label so the
        check is a set lookup instead of a scan of the result list.
        """
        key = text.lower()
        label_seen = seen.setdefault(label, set())
        if key in label_seen:
            return
        label_seen.add(key)

        label_info = self.ENTITY_LABELS.get(label, {})
        entities.setdefault(label, []).append({
            'text': text,
            'start': start,
            'end': end,
            'label': label,
            'label_display': label_info.get('fr', label),
            'emoji': label_info.get('emoji', '🔖'),
            'confidence': confidence,
        })

    def _extract_with_spacy(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
        """Extract entities using the loaded spaCy pipeline."""
        doc = self.nlp(text)
        entities: Dict[str, List[Dict[str, Any]]] = {}
        seen: Dict[str, set] = {}

        for ent in doc.ents:
            self._add_entity(
                entities, seen, ent.label_, ent.text,
                ent.start_char, ent.end_char, self._SPACY_CONFIDENCE,
            )

        return entities

    def _extract_with_heuristics(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
        """
        Fallback heuristic entity extraction.
        Uses case-insensitive pattern matching for common entities.
        """
        import re
        entities: Dict[str, List[Dict[str, Any]]] = {}
        seen: Dict[str, set] = {}

        for label, pattern_list in self._HEURISTIC_PATTERNS.items():
            for pattern in pattern_list:
                for match in re.finditer(pattern, text, re.IGNORECASE):
                    self._add_entity(
                        entities, seen, label, match.group(),
                        match.start(), match.end(), self._HEURISTIC_CONFIDENCE,
                    )

        return entities

    def get_entity_summary(self, entities: Dict[str, List[Dict[str, Any]]]) -> str:
        """
        Generate a human-readable summary of extracted entities.

        Args:
            entities: Dictionary of entities from extract_entities()

        Returns:
            One line per entity type (French labels), listing at most five
            entities each; a fixed French message when there are none
        """
        if not entities:
            return "Aucune entité nommée détectée."

        lines = []
        for label, ent_list in entities.items():
            label_info = self.ENTITY_LABELS.get(label, {})
            emoji = label_info.get('emoji', '🔖')
            label_display = label_info.get('fr', label)

            entity_texts = [e['text'] for e in ent_list[:5]]  # Limit to 5
            lines.append(f"{emoji} {label_display}: {', '.join(entity_texts)}")

        return "\n".join(lines)

    def to_frontend_format(self, entities: Dict[str, List[Dict[str, Any]]]) -> List[Dict]:
        """
        Convert entities to frontend-friendly format.

        Returns:
            Flat list of entities with display info, sorted by descending
            confidence (stable, so ties keep extraction order)
        """
        result = []
        for ent_list in entities.values():
            for ent in ent_list:
                confidence = ent.get('confidence', 0.5)
                result.append({
                    'text': ent['text'],
                    'type': ent['label'],
                    'type_display': ent.get('label_display', ent['label']),
                    'emoji': ent.get('emoji', '🔖'),
                    'confidence': confidence,
                    'confidence_pct': f"{int(confidence * 100)}%"
                })

        # Sort by confidence
        result.sort(key=lambda x: x['confidence'], reverse=True)
        return result
254
-
255
-
256
# Shared analyzer instances for easy import.
# `_ner_analyzer` is the first analyzer created; `_ner_analyzers_by_model`
# caches one analyzer per requested spaCy model name.
_ner_analyzer: Optional["NERAnalyzer"] = None
_ner_analyzers_by_model: Dict[str, "NERAnalyzer"] = {}


def get_ner_analyzer(model_name: str = "fr_core_news_md") -> "NERAnalyzer":
    """Get or create the shared NER analyzer for *model_name*.

    The first call for a given model name builds the analyzer (with the
    heuristic fallback enabled); later calls with the same name return the
    same instance. Requesting a different model name returns an analyzer for
    that model rather than a previously created one for another model.

    Args:
        model_name: spaCy model name to load

    Returns:
        The cached NERAnalyzer for that model
    """
    global _ner_analyzer
    analyzer = _ner_analyzers_by_model.get(model_name)
    if analyzer is None:
        analyzer = NERAnalyzer(model_name=model_name, fallback=True)
        _ner_analyzers_by_model[model_name] = analyzer
    if _ner_analyzer is None:
        _ner_analyzer = analyzer
    return analyzer
266
-
267
-
268
# Manual smoke test: run the analyzer on a short French sample and print
# both the text summary and the frontend-formatted entities.
if __name__ == "__main__":
    demo_analyzer = NERAnalyzer(fallback=True)

    sample = """
    Donald Trump a affirmé que l'insurrection du 6 janvier 2021 au Capitol n'est jamais arrivée.
    Le FBI enquête sur les événements. Le président Joe Biden a condamné ces déclarations à Washington.
    Les dégâts sont estimés à 30 millions de dollars.
    """

    found = demo_analyzer.extract_entities(sample)
    summary = demo_analyzer.get_entity_summary(found)

    print("=== Entités détectées ===")
    print(summary)
    print("\n=== Format Frontend ===")
    for item in demo_analyzer.to_frontend_format(found):
        print(f"  {item['emoji']} {item['text']} ({item['type_display']}, {item['confidence_pct']})")