DominiqueLayer committed on
Commit
6043bc8
·
1 Parent(s): ff19e9c

Add NER analyzer and EEAT calculator modules

Browse files
syscred/syscred/eeat_calculator.py ADDED
@@ -0,0 +1,466 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ E-E-A-T Metrics Calculator for SysCRED
5
+ ========================================
6
+ Calculates Google-style E-E-A-T metrics (Experience, Expertise, Authority, Trust).
7
+
8
+ These metrics mirror modern Google ranking signals:
9
+ - Experience: Domain age, content freshness
10
+ - Expertise: Author identification, depth of content
11
+ - Authority: PageRank simulation, citations/backlinks
12
+ - Trust: HTTPS, fact-checks, low bias score
13
+ """
14
+
15
+ from typing import Dict, Any, Optional, List
16
+ from dataclasses import dataclass
17
+ import re
18
+ from datetime import datetime
19
+ import logging
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
@dataclass
class EEATScore:
    """E-E-A-T score container.

    Holds the four component scores (each in [0, 1]) and derives the
    weighted overall score plus the percentage strings used for display.
    """
    experience: float  # 0-1
    expertise: float   # 0-1
    authority: float   # 0-1
    trust: float       # 0-1

    @property
    def overall(self) -> float:
        """Weighted average of all E-E-A-T components."""
        # Weights based on Google's emphasis (authority weighs the most).
        weights = {
            'experience': 0.15,
            'expertise': 0.25,
            'authority': 0.35,
            'trust': 0.25
        }
        return (
            self.experience * weights['experience'] +
            self.expertise * weights['expertise'] +
            self.authority * weights['authority'] +
            self.trust * weights['trust']
        )

    # Percentage accessors. Callers (e.g. score explanations and demo
    # output) read `score.experience_pct` directly; previously these
    # names existed only as keys of the dict returned by to_dict(), so
    # attribute access raised AttributeError.
    @property
    def experience_pct(self) -> str:
        """Experience as an integer percentage string, e.g. '73%'."""
        return f"{int(self.experience * 100)}%"

    @property
    def expertise_pct(self) -> str:
        """Expertise as an integer percentage string."""
        return f"{int(self.expertise * 100)}%"

    @property
    def authority_pct(self) -> str:
        """Authority as an integer percentage string."""
        return f"{int(self.authority * 100)}%"

    @property
    def trust_pct(self) -> str:
        """Trust as an integer percentage string."""
        return f"{int(self.trust * 100)}%"

    @property
    def overall_pct(self) -> str:
        """Overall score as an integer percentage string."""
        return f"{int(self.overall * 100)}%"

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        return {
            'experience': round(self.experience, 3),
            'expertise': round(self.expertise, 3),
            'authority': round(self.authority, 3),
            'trust': round(self.trust, 3),
            'overall': round(self.overall, 3),
            'experience_pct': self.experience_pct,
            'expertise_pct': self.expertise_pct,
            'authority_pct': self.authority_pct,
            'trust_pct': self.trust_pct,
            'overall_pct': self.overall_pct
        }
63
+
64
+
65
class EEATCalculator:
    """
    Calculate E-E-A-T metrics from various signals.

    Mirrors Google's quality rater guidelines:
    - Experience: Has the author demonstrated real experience?
    - Expertise: Is the content expert-level?
    - Authority: Is the source recognized as authoritative?
    - Trust: Is the source trustworthy?
    """

    # Known authoritative domains (value = authority prior in [0, 1]).
    AUTHORITATIVE_DOMAINS = {
        # News
        'lemonde.fr': 0.95,
        'lefigaro.fr': 0.90,
        'liberation.fr': 0.88,
        'nytimes.com': 0.95,
        'washingtonpost.com': 0.93,
        'theguardian.com': 0.92,
        'bbc.com': 0.94,
        'bbc.co.uk': 0.94,
        'reuters.com': 0.96,
        'apnews.com': 0.95,
        # Academic
        'nature.com': 0.98,
        'science.org': 0.98,
        'pubmed.ncbi.nlm.nih.gov': 0.97,
        'scholar.google.com': 0.85,
        # Government
        'gouv.fr': 0.90,
        'gov.uk': 0.90,
        'whitehouse.gov': 0.88,
        'europa.eu': 0.92,
        # Fact-checkers
        'snopes.com': 0.88,
        'factcheck.org': 0.90,
        'politifact.com': 0.88,
        'fullfact.org': 0.89,
        # Wikipedia (moderate authority)
        'wikipedia.org': 0.75,
        'fr.wikipedia.org': 0.75,
        'en.wikipedia.org': 0.75,
    }

    # Low-trust domains (misinformation sources); value = score cap.
    LOW_TRUST_DOMAINS = {
        'infowars.com': 0.1,
        'breitbart.com': 0.3,
        'naturalnews.com': 0.15,
        # Add more as needed
    }

    def __init__(self):
        """Initialize E-E-A-T calculator (stateless; nothing to configure)."""
        pass

    def calculate(
        self,
        url: str,
        text: str,
        nlp_analysis: Optional[Dict[str, Any]] = None,
        pagerank: Optional[float] = None,
        fact_checks: Optional[List[Dict]] = None,
        domain_age_years: Optional[float] = None,
        has_https: bool = True,
        author_identified: bool = False,
        seo_score: Optional[float] = None
    ) -> "EEATScore":
        """
        Calculate E-E-A-T scores from available signals.

        Args:
            url: Source URL
            text: Article text content
            nlp_analysis: NLP analysis results (sentiment, coherence, bias)
            pagerank: Simulated PageRank score (0-1)
            fact_checks: List of fact-check results
            domain_age_years: Domain age in years (from WHOIS)
            has_https: Whether site uses HTTPS
            author_identified: Whether author is clearly identified
            seo_score: SEO/technical quality score

        Returns:
            EEATScore with all component scores
        """
        domain = self._extract_domain(url)

        # Each component is computed independently from its own signals.
        experience = self._calculate_experience(domain_age_years, text, nlp_analysis)
        expertise = self._calculate_expertise(text, author_identified, nlp_analysis)
        authority = self._calculate_authority(domain, pagerank, seo_score)
        trust = self._calculate_trust(domain, has_https, fact_checks, nlp_analysis)

        return EEATScore(
            experience=experience,
            expertise=expertise,
            authority=authority,
            trust=trust
        )

    @staticmethod
    def _domain_matches(domain: str, known: str) -> bool:
        """Return True when *domain* is *known* itself or a subdomain of it.

        A plain substring test ('lemonde.fr' in domain) would let spoofed
        hosts such as 'fakelemonde.fr' inherit the real site's authority,
        so we require an exact match or a dot-separated suffix match.
        """
        return domain == known or domain.endswith('.' + known)

    def _extract_domain(self, url: str) -> str:
        """Extract the host part of *url*, lowercased, with 'www.' stripped.

        Falls back to the lowercased input when it does not look like an
        http(s) URL.
        """
        match = re.search(r'https?://(?:www\.)?([^/]+)', url)
        return match.group(1).lower() if match else url.lower()

    def _calculate_experience(
        self,
        domain_age_years: Optional[float],
        text: str,
        nlp_analysis: Optional[Dict]
    ) -> float:
        """
        Calculate the Experience score.

        Factors:
        - Domain age (longer = more experience; capped at 20 years)
        - Content depth (word count)
        - First-hand experience indicators in the text

        Note: nlp_analysis is currently unused here; the parameter is kept
        so all component calculators share a parallel signature.
        """
        score = 0.5  # Base score

        # Domain age contribution (max 0.3); 20 years of age maxes it out.
        if domain_age_years is not None:
            score += min(domain_age_years / 20, 1.0) * 0.3
        else:
            score += 0.15  # Unknown age: assume moderate

        # Content depth contribution (max 0.2)
        word_count = len(text.split()) if text else 0
        if word_count > 1000:
            score += 0.2
        elif word_count > 500:
            score += 0.15
        elif word_count > 200:
            score += 0.1

        # First-hand experience indicators (+0.03 per matching pattern)
        experience_indicators = [
            r'\b(j\'ai|je suis|nous avons|I have|we have|in my experience)\b',
            r'\b(interview|entretien|témoignage|witness|firsthand)\b',
            r'\b(sur place|on the ground|eyewitness)\b'
        ]
        for pattern in experience_indicators:
            if re.search(pattern, text, re.IGNORECASE):
                score += 0.03

        return min(score, 1.0)

    def _calculate_expertise(
        self,
        text: str,
        author_identified: bool,
        nlp_analysis: Optional[Dict]
    ) -> float:
        """
        Calculate the Expertise score.

        Factors:
        - Author identification
        - Citation of sources (references, experts, links)
        - Coherence (from NLP analysis when available)
        """
        score = 0.4  # Base score

        # Author identification (0.2)
        if author_identified:
            score += 0.2

        # Citation indicators (0.02 per match, capped at 0.2)
        citation_patterns = [
            r'\b(selon|according to|d\'après|source:)\b',
            r'\b(étude|study|research|rapport|report)\b',
            r'\b(expert|spécialiste|chercheur|professor|Dr\.)\b',
            r'\[([\d]+)\]',  # [1] style citations
            r'https?://[^\s]+'  # Links
        ]
        citation_count = sum(
            len(re.findall(pattern, text, re.IGNORECASE))
            for pattern in citation_patterns
        )
        score += min(citation_count * 0.02, 0.2)

        # Coherence from NLP analysis (max 0.2)
        if nlp_analysis and 'coherence' in nlp_analysis:
            coherence = nlp_analysis['coherence']
            if isinstance(coherence, dict):
                coherence = coherence.get('score', 0.5)
            score += coherence * 0.2
        else:
            score += 0.1  # Assume moderate coherence

        return min(score, 1.0)

    def _calculate_authority(
        self,
        domain: str,
        pagerank: Optional[float],
        seo_score: Optional[float]
    ) -> float:
        """
        Calculate the Authority score.

        Factors:
        - Known authoritative domain (exact or subdomain match)
        - PageRank simulation
        - SEO/technical quality
        """
        score = 0.3  # Base score

        # Known domain authority (lifts base up to 0.3 + 0.5*authority)
        for known_domain, authority in self.AUTHORITATIVE_DOMAINS.items():
            if self._domain_matches(domain, known_domain):
                score = max(score, authority * 0.5 + 0.3)
                break

        # Known low-trust domain caps the score
        for low_trust_domain, low_score in self.LOW_TRUST_DOMAINS.items():
            if self._domain_matches(domain, low_trust_domain):
                score = min(score, low_score)
                break

        # PageRank contribution (max 0.3)
        if pagerank is not None:
            score += pagerank * 0.3
        else:
            score += 0.15  # Assume moderate pagerank

        # SEO score contribution (max 0.2)
        if seo_score is not None:
            score += seo_score * 0.2
        else:
            score += 0.1

        return min(score, 1.0)

    def _calculate_trust(
        self,
        domain: str,
        has_https: bool,
        fact_checks: Optional[List[Dict]],
        nlp_analysis: Optional[Dict]
    ) -> float:
        """
        Calculate the Trust score.

        Factors:
        - HTTPS
        - Fact-check results (positive ratings add, negative subtract)
        - Bias score from NLP analysis (low bias = higher trust)
        - Known trustworthy / low-trust domain adjustments
        """
        score = 0.4  # Base score

        # HTTPS (0.1)
        if has_https:
            score += 0.1

        # Fact-check results (+0.2 if any positive, -0.3 if any negative)
        if fact_checks:
            positive_checks = sum(1 for fc in fact_checks
                                  if fc.get('rating', '').lower() in ['true', 'vrai', 'correct'])
            negative_checks = sum(1 for fc in fact_checks
                                  if fc.get('rating', '').lower() in ['false', 'faux', 'incorrect', 'pants-fire'])

            if positive_checks > 0:
                score += 0.2
            if negative_checks > 0:
                score -= 0.3

        # Bias score (max 0.2; inverted so low bias = high trust)
        if nlp_analysis:
            bias_data = nlp_analysis.get('bias_analysis', {})
            if isinstance(bias_data, dict):
                bias_score = bias_data.get('score', 0.3)
            else:
                bias_score = 0.3
            score += (1 - bias_score) * 0.2
        else:
            score += 0.1

        # Known trustworthy domain (+0.1)
        for known_domain in self.AUTHORITATIVE_DOMAINS:
            if self._domain_matches(domain, known_domain):
                score += 0.1
                break

        # Known low-trust domain (-0.3)
        for low_trust_domain in self.LOW_TRUST_DOMAINS:
            if self._domain_matches(domain, low_trust_domain):
                score -= 0.3
                break

        return max(min(score, 1.0), 0.0)

    def explain_score(self, eeat: "EEATScore", url: str) -> str:
        """
        Generate a human-readable (French) explanation of an E-E-A-T score.

        Args:
            eeat: EEATScore instance
            url: Source URL (currently informational; kept for API stability)

        Returns:
            Multi-line explanation string, one line per component
        """
        # Percentage strings come from to_dict(), so this works regardless
        # of whether the score object exposes attribute-level *_pct values.
        pct = eeat.to_dict()

        explanations = []

        # Experience
        if eeat.experience >= 0.8:
            explanations.append(f"✅ **Expérience élevée** ({pct['experience_pct']}): Source établie depuis longtemps")
        elif eeat.experience >= 0.5:
            explanations.append(f"🔶 **Expérience moyenne** ({pct['experience_pct']}): Source modérément établie")
        else:
            explanations.append(f"⚠️ **Expérience faible** ({pct['experience_pct']}): Source récente ou peu connue")

        # Expertise
        if eeat.expertise >= 0.8:
            explanations.append(f"✅ **Expertise élevée** ({pct['expertise_pct']}): Contenu approfondi avec citations")
        elif eeat.expertise >= 0.5:
            explanations.append(f"🔶 **Expertise moyenne** ({pct['expertise_pct']}): Contenu standard")
        else:
            explanations.append(f"⚠️ **Expertise faible** ({pct['expertise_pct']}): Manque de profondeur")

        # Authority
        if eeat.authority >= 0.8:
            explanations.append(f"✅ **Autorité élevée** ({pct['authority_pct']}): Source très citée et reconnue")
        elif eeat.authority >= 0.5:
            explanations.append(f"🔶 **Autorité moyenne** ({pct['authority_pct']}): Source modérément reconnue")
        else:
            explanations.append(f"⚠️ **Autorité faible** ({pct['authority_pct']}): Peu de citations externes")

        # Trust
        if eeat.trust >= 0.8:
            explanations.append(f"✅ **Confiance élevée** ({pct['trust_pct']}): Faits vérifiés, pas de biais")
        elif eeat.trust >= 0.5:
            explanations.append(f"🔶 **Confiance moyenne** ({pct['trust_pct']}): Quelques signaux de confiance")
        else:
            explanations.append(f"⚠️ **Confiance faible** ({pct['trust_pct']}): Prudence recommandée")

        return "\n".join(explanations)
432
+
433
+
434
# Demo / smoke test
if __name__ == "__main__":
    calc = EEATCalculator()

    test_url = "https://www.lemonde.fr/politique/article/2024/01/06/trump.html"
    test_text = """
    Selon une étude du chercheur Dr. Martin, l'insurrection du 6 janvier 2021
    au Capitol a été un événement marquant. Notre reporter sur place a témoigné
    des événements. Les experts politiques analysent les conséquences.
    """

    nlp_analysis = {
        'coherence': {'score': 0.8},
        'bias_analysis': {'score': 0.2}
    }

    eeat = calc.calculate(
        url=test_url,
        text=test_text,
        nlp_analysis=nlp_analysis,
        pagerank=0.7,
        has_https=True,
        author_identified=True
    )

    # Read the percentage strings from to_dict(); the raw dataclass fields
    # are floats, and to_dict() is the stable serialization surface.
    scores = eeat.to_dict()
    print("=== E-E-A-T Scores ===")
    print(f"Experience: {scores['experience_pct']}")
    print(f"Expertise: {scores['expertise_pct']}")
    print(f"Authority: {scores['authority_pct']}")
    print(f"Trust: {scores['trust_pct']}")
    print(f"Overall: {scores['overall_pct']}")
    print("\n=== Explanation ===")
    print(calc.explain_score(eeat, test_url))
syscred/syscred/ner_analyzer.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Named Entity Recognition (NER) Analyzer for SysCRED
5
+ ====================================================
6
+ Extracts named entities from text using spaCy.
7
+
8
+ Entities detected:
9
+ - PER: Persons (Donald Trump, Emmanuel Macron)
10
+ - ORG: Organizations (FBI, UN, Google)
11
+ - LOC: Locations (Paris, Capitol)
12
+ - DATE: Dates (January 6, 2021)
13
+ - MONEY: Amounts ($10 million)
14
+ - EVENT: Events (insurrection, election)
15
+ """
16
+
17
+ from typing import Dict, List, Any, Optional
18
+ import logging
19
+
20
+ # Try to import spaCy
21
+ try:
22
+ import spacy
23
+ from spacy.language import Language
24
+ HAS_SPACY = True
25
+ except ImportError:
26
+ HAS_SPACY = False
27
+ spacy = None
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
class NERAnalyzer:
    """
    Named Entity Recognition analyzer using spaCy.

    Supports French (fr_core_news_md) and English (en_core_web_md).
    Falls back to heuristic (regex) extraction if spaCy is not available.
    """

    # Entity type mappings for display: French/English names + UI emoji.
    ENTITY_LABELS = {
        'PER': {'fr': 'Personne', 'en': 'Person', 'emoji': '👤'},
        'PERSON': {'fr': 'Personne', 'en': 'Person', 'emoji': '👤'},
        'ORG': {'fr': 'Organisation', 'en': 'Organization', 'emoji': '🏢'},
        'LOC': {'fr': 'Lieu', 'en': 'Location', 'emoji': '📍'},
        'GPE': {'fr': 'Lieu géopolitique', 'en': 'Geopolitical', 'emoji': '🌍'},
        'DATE': {'fr': 'Date', 'en': 'Date', 'emoji': '📅'},
        'TIME': {'fr': 'Heure', 'en': 'Time', 'emoji': '⏰'},
        'MONEY': {'fr': 'Montant', 'en': 'Money', 'emoji': '💰'},
        'PERCENT': {'fr': 'Pourcentage', 'en': 'Percent', 'emoji': '📊'},
        'EVENT': {'fr': 'Événement', 'en': 'Event', 'emoji': '📰'},
        'PRODUCT': {'fr': 'Produit', 'en': 'Product', 'emoji': '📦'},
        'LAW': {'fr': 'Loi', 'en': 'Law', 'emoji': '⚖️'},
        'NORP': {'fr': 'Groupe', 'en': 'Group', 'emoji': '👥'},
        'MISC': {'fr': 'Divers', 'en': 'Miscellaneous', 'emoji': '🔖'},
    }

    def __init__(self, model_name: str = "fr_core_news_md", fallback: bool = True):
        """
        Initialize NER analyzer.

        Args:
            model_name: spaCy model to load (fr_core_news_md, en_core_web_md)
            fallback: If True, use regex heuristics when spaCy (or the
                requested model) is unavailable instead of extracting nothing
        """
        self.model_name = model_name
        self.fallback = fallback
        self.nlp = None               # loaded spaCy pipeline, or None
        self.use_heuristics = False   # True => regex fallback path is active

        if HAS_SPACY:
            try:
                self.nlp = spacy.load(model_name)
                logger.info(f"[NER] Loaded spaCy model: {model_name}")
            except OSError as e:
                # Raised when the model package is not downloaded/installed.
                logger.warning(f"[NER] Could not load model {model_name}: {e}")
                if fallback:
                    self.use_heuristics = True
                    logger.info("[NER] Using heuristic entity extraction")
        else:
            if fallback:
                self.use_heuristics = True
                logger.info("[NER] spaCy not installed. Using heuristic extraction")

    def extract_entities(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
        """
        Extract named entities from text.

        Args:
            text: Input text to analyze

        Returns:
            Dictionary mapping entity types to lists of entities.
            Each entity has: text, start, end, label, label_display,
            emoji, confidence. Empty dict for blank input or when no
            extraction backend is available.
        """
        if not text or not text.strip():
            return {}

        if self.nlp:
            return self._extract_with_spacy(text)
        if self.use_heuristics:
            return self._extract_with_heuristics(text)
        return {}

    def _label_info(self, label: str) -> Dict[str, str]:
        """Display metadata (localized names + emoji) for an entity label."""
        return self.ENTITY_LABELS.get(label, {
            'fr': label,
            'en': label,
            'emoji': '🔖'
        })

    def _extract_with_spacy(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
        """Extract entities using the loaded spaCy pipeline."""
        doc = self.nlp(text)
        entities: Dict[str, List[Dict[str, Any]]] = {}
        # Per-label set of lowercased texts: O(1) duplicate detection
        # instead of rescanning the entity list for every new match.
        seen: Dict[str, set] = {}

        for ent in doc.ents:
            label = ent.label_
            key = ent.text.lower()
            if key in seen.setdefault(label, set()):
                continue  # skip case-insensitive duplicates
            seen[label].add(key)

            info = self._label_info(label)
            entities.setdefault(label, []).append({
                'text': ent.text,
                'start': ent.start_char,
                'end': ent.end_char,
                'label': label,
                'label_display': info.get('fr', label),
                'emoji': info.get('emoji', '🔖'),
                'confidence': 0.85  # spaCy doesn't provide confidence by default
            })

        return entities

    def _extract_with_heuristics(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
        """
        Fallback heuristic entity extraction.

        Uses regex pattern matching for a fixed list of well-known names
        (political figures, organizations, places) plus date/money/percent
        formats. Lower confidence than spaCy-based extraction.
        """
        import re
        entities: Dict[str, List[Dict[str, Any]]] = {}
        seen: Dict[str, set] = {}  # per-label lowercased texts (dedup)

        # Common patterns
        patterns = {
            'PER': [
                # Known political figures
                r'\b(Donald Trump|Joe Biden|Emmanuel Macron|Hillary Clinton|Barack Obama|'
                r'Vladimir Putin|Angela Merkel|Justin Trudeau|Boris Johnson)\b',
            ],
            'ORG': [
                r'\b(FBI|CIA|NSA|ONU|NATO|OTAN|Google|Facebook|Twitter|Meta|'
                r'Amazon|Microsoft|Apple|CNN|BBC|Le Monde|New York Times|'
                r'Parti Républicain|Parti Démocrate|Republican Party|Democratic Party)\b',
            ],
            'LOC': [
                r'\b(Capitol|White House|Maison Blanche|Kremlin|Élysée|Pentagon|'
                r'New York|Washington|Paris|Londres|Moscou|Berlin|Beijing)\b',
            ],
            'DATE': [
                r'\b(\d{1,2}\s+(janvier|février|mars|avril|mai|juin|juillet|août|'
                r'septembre|octobre|novembre|décembre)\s+\d{4})\b',
                r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\b',
                r'\b(January|February|March|April|May|June|July|August|'
                r'September|October|November|December)\s+\d{1,2},?\s+\d{4}\b',
            ],
            'MONEY': [
                r'\$[\d,]+(?:\.\d{2})?(?:\s*(?:million|billion|trillion))?',
                r'[\d,]+(?:\.\d{2})?\s*(?:dollars?|euros?|€|\$)',
                r'[\d,]+\s*(?:million|milliard)s?\s*(?:de\s+)?(?:dollars?|euros?)',
            ],
            'PERCENT': [
                r'\b\d+(?:\.\d+)?%',
                r'\b\d+(?:\.\d+)?\s*pour\s*cent',
                r'\b\d+(?:\.\d+)?\s*percent',
            ],
        }

        for label, pattern_list in patterns.items():
            info = self._label_info(label)

            for pattern in pattern_list:
                for match in re.finditer(pattern, text, re.IGNORECASE):
                    matched_text = match.group()
                    key = matched_text.lower()
                    if key in seen.setdefault(label, set()):
                        continue  # skip case-insensitive duplicates
                    seen[label].add(key)

                    entities.setdefault(label, []).append({
                        'text': matched_text,
                        'start': match.start(),
                        'end': match.end(),
                        'label': label,
                        'label_display': info.get('fr', label),
                        'emoji': info.get('emoji', '🔖'),
                        'confidence': 0.70  # Lower confidence for heuristics
                    })

        return entities

    def get_entity_summary(self, entities: Dict[str, List[Dict[str, Any]]]) -> str:
        """
        Generate a human-readable summary of extracted entities.

        Args:
            entities: Dictionary of entities from extract_entities()

        Returns:
            One line per entity type ("<emoji> <type>: a, b, ..."),
            listing at most 5 entities per type.
        """
        if not entities:
            return "Aucune entité nommée détectée."

        lines = []
        for label, ent_list in entities.items():
            info = self._label_info(label)
            names = ', '.join(e['text'] for e in ent_list[:5])  # cap at 5
            lines.append(f"{info.get('emoji', '🔖')} {info.get('fr', label)}: {names}")

        return "\n".join(lines)

    def to_frontend_format(self, entities: Dict[str, List[Dict[str, Any]]]) -> List[Dict]:
        """
        Convert entities to a flat, frontend-friendly format.

        Args:
            entities: Dictionary of entities from extract_entities()

        Returns:
            List of entity dicts (text, type, type_display, emoji,
            confidence, confidence_pct) sorted by confidence descending.
        """
        result = [
            {
                'text': ent['text'],
                'type': ent['label'],
                'type_display': ent.get('label_display', ent['label']),
                'emoji': ent.get('emoji', '🔖'),
                'confidence': ent.get('confidence', 0.5),
                'confidence_pct': f"{int(ent.get('confidence', 0.5) * 100)}%"
            }
            for ent_list in entities.values()
            for ent in ent_list
        ]
        result.sort(key=lambda e: e['confidence'], reverse=True)
        return result
254
+
255
+
256
# Lazily-created module-level singleton so callers can share one analyzer.
_ner_analyzer: Optional[NERAnalyzer] = None


def get_ner_analyzer(model_name: str = "fr_core_news_md") -> NERAnalyzer:
    """Return the shared NERAnalyzer, creating it on first use.

    Note: model_name is only honoured by the call that actually creates
    the instance; later calls return whatever instance already exists.
    """
    global _ner_analyzer
    analyzer = _ner_analyzer
    if analyzer is None:
        analyzer = NERAnalyzer(model_name=model_name, fallback=True)
        _ner_analyzer = analyzer
    return analyzer
266
+
267
+
268
# Quick manual smoke test: run this module directly to see what the
# heuristic (or spaCy, if installed) extractor finds in a sample text.
if __name__ == "__main__":
    analyzer = NERAnalyzer(fallback=True)

    test_text = """
    Donald Trump a affirmé que l'insurrection du 6 janvier 2021 au Capitol n'est jamais arrivée.
    Le FBI enquête sur les événements. Le président Joe Biden a condamné ces déclarations à Washington.
    Les dégâts sont estimés à 30 millions de dollars.
    """

    entities = analyzer.extract_entities(test_text)

    print("=== Entités détectées ===")
    print(analyzer.get_entity_summary(entities))

    print("\n=== Format Frontend ===")
    for entry in analyzer.to_frontend_format(entities):
        print(f" {entry['emoji']} {entry['text']} ({entry['type_display']}, {entry['confidence_pct']})")