Spaces:

DomLoyer
/

syscred

Running

App Files Files Community

D Ф m i И i q ц e L Ф y e r commited on Feb 10

Commit

fdaeab5

2 Parent(s): 6043bc8 1cafc3b

Merge: Add NER/EEAT modules + requirements-distilled.txt

Browse files

Files changed (3) hide show

syscred/requirements-distilled.txt +36 -0
syscred/syscred/eeat_calculator.py +0 -466
syscred/syscred/ner_analyzer.py +0 -283

syscred/requirements-distilled.txt ADDED Viewed

	@@ -0,0 +1,36 @@

+# SysCRED - Optimized Requirements with Distilled Models
+# (c) Dominique S. Loyer
+# Uses DISTILLED models for faster loading and lower memory
+# === Core Dependencies ===
+requests>=2.28.0
+beautifulsoup4>=4.11.0
+python-whois>=0.8.0
+# === RDF/Ontology ===
+rdflib>=6.0.0
+# === Machine Learning (CPU-only) ===
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch>=2.0.0
+transformers>=4.30.0
+sentence-transformers>=2.2.0
+# === Data ===
+numpy>=1.24.0
+pandas>=2.0.0
+# === Explainability ===
+lime>=0.2.0
+# === NLP ===
+spacy>=3.5.0
+# === Web Backend ===
+flask>=2.3.0
+flask-cors>=4.0.0
+python-dotenv>=1.0.0
+gunicorn>=20.1.0
+flask-sqlalchemy>=3.1.0
+scikit-learn>=1.3.0
+scipy>=1.11.0

syscred/syscred/eeat_calculator.py DELETED Viewed

@@ -1,466 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-E-E-A-T Metrics Calculator for SysCRED
-========================================
-Calculates Google-style E-E-A-T metrics (Experience, Expertise, Authority, Trust).
-These metrics mirror modern Google ranking signals:
-- Experience: Domain age, content freshness
-- Expertise: Author identification, depth of content
-- Authority: PageRank simulation, citations/backlinks
-- Trust: HTTPS, fact-checks, low bias score
-"""
-from typing import Dict, Any, Optional, List
-from dataclasses import dataclass
-import re
-from datetime import datetime
-import logging
-logger = logging.getLogger(__name__)
-@dataclass
-class EEATScore:
-    """E-E-A-T score container."""
-    experience: float  # 0-1
-    expertise: float   # 0-1
-    authority: float   # 0-1
-    trust: float       # 0-1
-    @property
-    def overall(self) -> float:
-        """Weighted average of all E-E-A-T components."""
-        # Weights based on Google's emphasis
-        weights = {
-            'experience': 0.15,
-            'expertise': 0.25,
-            'authority': 0.35,
-            'trust': 0.25
-        }
-        return (
-            self.experience * weights['experience'] +
-            self.expertise * weights['expertise'] +
-            self.authority * weights['authority'] +
-            self.trust * weights['trust']
-        )
-    def to_dict(self) -> Dict[str, Any]:
-        """Convert to dictionary for JSON serialization."""
-        return {
-            'experience': round(self.experience, 3),
-            'expertise': round(self.expertise, 3),
-            'authority': round(self.authority, 3),
-            'trust': round(self.trust, 3),
-            'overall': round(self.overall, 3),
-            'experience_pct': f"{int(self.experience * 100)}%",
-            'expertise_pct': f"{int(self.expertise * 100)}%",
-            'authority_pct': f"{int(self.authority * 100)}%",
-            'trust_pct': f"{int(self.trust * 100)}%",
-            'overall_pct': f"{int(self.overall * 100)}%"
-        }
-class EEATCalculator:
-    """
-    Calculate E-E-A-T metrics from various signals.
-    Mirrors Google's quality rater guidelines:
-    - Experience: Has the author demonstrated real experience?
-    - Expertise: Is the content expert-level?
-    - Authority: Is the source recognized as authoritative?
-    - Trust: Is the source trustworthy?
-    """
-    # Known authoritative domains
-    AUTHORITATIVE_DOMAINS = {
-        # News
-        'lemonde.fr': 0.95,
-        'lefigaro.fr': 0.90,
-        'liberation.fr': 0.88,
-        'nytimes.com': 0.95,
-        'washingtonpost.com': 0.93,
-        'theguardian.com': 0.92,
-        'bbc.com': 0.94,
-        'bbc.co.uk': 0.94,
-        'reuters.com': 0.96,
-        'apnews.com': 0.95,
-        # Academic
-        'nature.com': 0.98,
-        'science.org': 0.98,
-        'pubmed.ncbi.nlm.nih.gov': 0.97,
-        'scholar.google.com': 0.85,
-        # Government
-        'gouv.fr': 0.90,
-        'gov.uk': 0.90,
-        'whitehouse.gov': 0.88,
-        'europa.eu': 0.92,
-        # Fact-checkers
-        'snopes.com': 0.88,
-        'factcheck.org': 0.90,
-        'politifact.com': 0.88,
-        'fullfact.org': 0.89,
-        # Wikipedia (moderate authority)
-        'wikipedia.org': 0.75,
-        'fr.wikipedia.org': 0.75,
-        'en.wikipedia.org': 0.75,
-    }
-    # Low-trust domains (misinformation sources)
-    LOW_TRUST_DOMAINS = {
-        'infowars.com': 0.1,
-        'breitbart.com': 0.3,
-        'naturalnews.com': 0.15,
-        # Add more as needed
-    }
-    def __init__(self):
-        """Initialize E-E-A-T calculator."""
-        pass
-    def calculate(
-        self,
-        url: str,
-        text: str,
-        nlp_analysis: Optional[Dict[str, Any]] = None,
-        pagerank: Optional[float] = None,
-        fact_checks: Optional[List[Dict]] = None,
-        domain_age_years: Optional[float] = None,
-        has_https: bool = True,
-        author_identified: bool = False,
-        seo_score: Optional[float] = None
-    ) -> EEATScore:
-        """
-        Calculate E-E-A-T scores from available signals.
-        Args:
-            url: Source URL
-            text: Article text content
-            nlp_analysis: NLP analysis results (sentiment, coherence, bias)
-            pagerank: Simulated PageRank score (0-1)
-            fact_checks: List of fact-check results
-            domain_age_years: Domain age in years (from WHOIS)
-            has_https: Whether site uses HTTPS
-            author_identified: Whether author is clearly identified
-            seo_score: SEO/technical quality score
-        Returns:
-            EEATScore with all component scores
-        """
-        # Extract domain from URL
-        domain = self._extract_domain(url)
-        # Calculate each component
-        experience = self._calculate_experience(
-            domain_age_years,
-            text,
-            nlp_analysis
-        )
-        expertise = self._calculate_expertise(
-            text,
-            author_identified,
-            nlp_analysis
-        )
-        authority = self._calculate_authority(
-            domain,
-            pagerank,
-            seo_score
-        )
-        trust = self._calculate_trust(
-            domain,
-            has_https,
-            fact_checks,
-            nlp_analysis
-        )
-        return EEATScore(
-            experience=experience,
-            expertise=expertise,
-            authority=authority,
-            trust=trust
-        )
-    def _extract_domain(self, url: str) -> str:
-        """Extract domain from URL."""
-        import re
-        match = re.search(r'https?://(?:www\.)?([^/]+)', url)
-        return match.group(1).lower() if match else url.lower()
-    def _calculate_experience(
-        self,
-        domain_age_years: Optional[float],
-        text: str,
-        nlp_analysis: Optional[Dict]
-    ) -> float:
-        """
-        Calculate Experience score.
-        Factors:
-        - Domain age (longer = more experience)
-        - Content freshness (recently updated)
-        - First-hand experience indicators in text
-        """
-        score = 0.5  # Base score
-        # Domain age contribution (max 0.3)
-        if domain_age_years is not None:
-            age_score = min(domain_age_years / 20, 1.0) * 0.3  # 20 years = max
-            score += age_score
-        else:
-            score += 0.15  # Assume moderate age
-        # Content depth contribution (max 0.2)
-        word_count = len(text.split()) if text else 0
-        if word_count > 1000:
-            score += 0.2
-        elif word_count > 500:
-            score += 0.15
-        elif word_count > 200:
-            score += 0.1
-        # First-hand experience indicators (max 0.1)
-        experience_indicators = [
-            r'\b(j\'ai|je suis|nous avons|I have|we have|in my experience)\b',
-            r'\b(interview|entretien|témoignage|witness|firsthand)\b',
-            r'\b(sur place|on the ground|eyewitness)\b'
-        ]
-        for pattern in experience_indicators:
-            if re.search(pattern, text, re.IGNORECASE):
-                score += 0.03
-        return min(score, 1.0)
-    def _calculate_expertise(
-        self,
-        text: str,
-        author_identified: bool,
-        nlp_analysis: Optional[Dict]
-    ) -> float:
-        """
-        Calculate Expertise score.
-        Factors:
-        - Author identification
-        - Technical depth of content
-        - Citation of sources
-        - Coherence (from NLP)
-        """
-        score = 0.4  # Base score
-        # Author identification (0.2)
-        if author_identified:
-            score += 0.2
-        # Citation indicators (max 0.2)
-        citation_patterns = [
-            r'\b(selon|according to|d\'après|source:)\b',
-            r'\b(étude|study|research|rapport|report)\b',
-            r'\b(expert|spécialiste|chercheur|professor|Dr\.)\b',
-            r'\[([\d]+)\]',  # [1] style citations
-            r'https?://[^\s]+'  # Links
-        ]
-        citation_count = 0
-        for pattern in citation_patterns:
-            citation_count += len(re.findall(pattern, text, re.IGNORECASE))
-        score += min(citation_count * 0.02, 0.2)
-        # Coherence from NLP analysis (0.2)
-        if nlp_analysis and 'coherence' in nlp_analysis:
-            coherence = nlp_analysis['coherence']
-            if isinstance(coherence, dict):
-                coherence = coherence.get('score', 0.5)
-            score += coherence * 0.2
-        else:
-            score += 0.1  # Assume moderate coherence
-        return min(score, 1.0)
-    def _calculate_authority(
-        self,
-        domain: str,
-        pagerank: Optional[float],
-        seo_score: Optional[float]
-    ) -> float:
-        """
-        Calculate Authority score.
-        Factors:
-        - Known authoritative domain
-        - PageRank simulation
-        - SEO/technical quality
-        """
-        score = 0.3  # Base score
-        # Known domain authority (max 0.5)
-        for known_domain, authority in self.AUTHORITATIVE_DOMAINS.items():
-            if known_domain in domain:
-                score = max(score, authority * 0.5 + 0.3)
-                break
-        # Check low-trust domains
-        for low_trust_domain, low_score in self.LOW_TRUST_DOMAINS.items():
-            if low_trust_domain in domain:
-                score = min(score, low_score)
-                break
-        # PageRank contribution (max 0.3)
-        if pagerank is not None:
-            score += pagerank * 0.3
-        else:
-            score += 0.15  # Assume moderate pagerank
-        # SEO score contribution (max 0.2)
-        if seo_score is not None:
-            score += seo_score * 0.2
-        else:
-            score += 0.1
-        return min(score, 1.0)
-    def _calculate_trust(
-        self,
-        domain: str,
-        has_https: bool,
-        fact_checks: Optional[List[Dict]],
-        nlp_analysis: Optional[Dict]
-    ) -> float:
-        """
-        Calculate Trust score.
-        Factors:
-        - HTTPS
-        - Fact-check results
-        - Bias score (low = better)
-        - Known trustworthy domain
-        """
-        score = 0.4  # Base score
-        # HTTPS (0.1)
-        if has_https:
-            score += 0.1
-        # Fact-check results (max 0.3)
-        if fact_checks:
-            positive_checks = sum(1 for fc in fact_checks
-                                  if fc.get('rating', '').lower() in ['true', 'vrai', 'correct'])
-            negative_checks = sum(1 for fc in fact_checks
-                                  if fc.get('rating', '').lower() in ['false', 'faux', 'incorrect', 'pants-fire'])
-            if positive_checks > 0:
-                score += 0.2
-            if negative_checks > 0:
-                score -= 0.3
-        # Bias score (max 0.2, lower bias = higher trust)
-        if nlp_analysis:
-            bias_data = nlp_analysis.get('bias_analysis', {})
-            if isinstance(bias_data, dict):
-                bias_score = bias_data.get('score', 0.3)
-            else:
-                bias_score = 0.3
-            # Invert: low bias = high trust contribution
-            score += (1 - bias_score) * 0.2
-        else:
-            score += 0.1
-        # Known trustworthy domain (0.1)
-        for known_domain in self.AUTHORITATIVE_DOMAINS:
-            if known_domain in domain:
-                score += 0.1
-                break
-        # Known low-trust domain (penalty)
-        for low_trust_domain in self.LOW_TRUST_DOMAINS:
-            if low_trust_domain in domain:
-                score -= 0.3
-                break
-        return max(min(score, 1.0), 0.0)
-    def explain_score(self, eeat: EEATScore, url: str) -> str:
-        """
-        Generate human-readable explanation of E-E-A-T score.
-        Args:
-            eeat: EEATScore instance
-            url: Source URL
-        Returns:
-            Formatted explanation string
-        """
-        domain = self._extract_domain(url)
-        explanations = []
-        # Experience
-        if eeat.experience >= 0.8:
-            explanations.append(f"✅ **Expérience élevée** ({eeat.experience_pct}): Source établie depuis longtemps")
-        elif eeat.experience >= 0.5:
-            explanations.append(f"🔶 **Expérience moyenne** ({eeat.experience_pct}): Source modérément établie")
-        else:
-            explanations.append(f"⚠️ **Expérience faible** ({eeat.experience_pct}): Source récente ou peu connue")
-        # Expertise
-        if eeat.expertise >= 0.8:
-            explanations.append(f"✅ **Expertise élevée** ({eeat.expertise_pct}): Contenu approfondi avec citations")
-        elif eeat.expertise >= 0.5:
-            explanations.append(f"🔶 **Expertise moyenne** ({eeat.expertise_pct}): Contenu standard")
-        else:
-            explanations.append(f"⚠️ **Expertise faible** ({eeat.expertise_pct}): Manque de profondeur")
-        # Authority
-        if eeat.authority >= 0.8:
-            explanations.append(f"✅ **Autorité élevée** ({eeat.authority_pct}): Source très citée et reconnue")
-        elif eeat.authority >= 0.5:
-            explanations.append(f"🔶 **Autorité moyenne** ({eeat.authority_pct}): Source modérément reconnue")
-        else:
-            explanations.append(f"⚠️ **Autorité faible** ({eeat.authority_pct}): Peu de citations externes")
-        # Trust
-        if eeat.trust >= 0.8:
-            explanations.append(f"✅ **Confiance élevée** ({eeat.trust_pct}): Faits vérifiés, pas de biais")
-        elif eeat.trust >= 0.5:
-            explanations.append(f"🔶 **Confiance moyenne** ({eeat.trust_pct}): Quelques signaux de confiance")
-        else:
-            explanations.append(f"⚠️ **Confiance faible** ({eeat.trust_pct}): Prudence recommandée")
-        return "\n".join(explanations)
-# Test
-if __name__ == "__main__":
-    calc = EEATCalculator()
-    test_url = "https://www.lemonde.fr/politique/article/2024/01/06/trump.html"
-    test_text = """
-    Selon une étude du chercheur Dr. Martin, l'insurrection du 6 janvier 2021
-    au Capitol a été un événement marquant. Notre reporter sur place a témoigné
-    des événements. Les experts politiques analysent les conséquences.
-    """
-    nlp_analysis = {
-        'coherence': {'score': 0.8},
-        'bias_analysis': {'score': 0.2}
-    }
-    eeat = calc.calculate(
-        url=test_url,
-        text=test_text,
-        nlp_analysis=nlp_analysis,
-        pagerank=0.7,
-        has_https=True,
-        author_identified=True
-    )
-    print("=== E-E-A-T Scores ===")
-    print(f"Experience: {eeat.experience_pct}")
-    print(f"Expertise:  {eeat.expertise_pct}")
-    print(f"Authority:  {eeat.authority_pct}")
-    print(f"Trust:      {eeat.trust_pct}")
-    print(f"Overall:    {eeat.overall_pct}")
-    print("\n=== Explanation ===")
-    print(calc.explain_score(eeat, test_url))

syscred/syscred/ner_analyzer.py DELETED Viewed

@@ -1,283 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Named Entity Recognition (NER) Analyzer for SysCRED
-====================================================
-Extracts named entities from text using spaCy.
-Entities detected:
-- PER: Persons (Donald Trump, Emmanuel Macron)
-- ORG: Organizations (FBI, UN, Google)
-- LOC: Locations (Paris, Capitol)
-- DATE: Dates (January 6, 2021)
-- MONEY: Amounts ($10 million)
-- EVENT: Events (insurrection, election)
-"""
-from typing import Dict, List, Any, Optional
-import logging
-# Try to import spaCy
-try:
-    import spacy
-    from spacy.language import Language
-    HAS_SPACY = True
-except ImportError:
-    HAS_SPACY = False
-    spacy = None
-logger = logging.getLogger(__name__)
-class NERAnalyzer:
-    """
-    Named Entity Recognition analyzer using spaCy.
-    Supports French (fr_core_news_md) and English (en_core_web_md).
-    Falls back to heuristic extraction if spaCy is not available.
-    """
-    # Entity type mappings for display
-    ENTITY_LABELS = {
-        'PER': {'fr': 'Personne', 'en': 'Person', 'emoji': '👤'},
-        'PERSON': {'fr': 'Personne', 'en': 'Person', 'emoji': '👤'},
-        'ORG': {'fr': 'Organisation', 'en': 'Organization', 'emoji': '🏢'},
-        'LOC': {'fr': 'Lieu', 'en': 'Location', 'emoji': '📍'},
-        'GPE': {'fr': 'Lieu géopolitique', 'en': 'Geopolitical', 'emoji': '🌍'},
-        'DATE': {'fr': 'Date', 'en': 'Date', 'emoji': '📅'},
-        'TIME': {'fr': 'Heure', 'en': 'Time', 'emoji': '⏰'},
-        'MONEY': {'fr': 'Montant', 'en': 'Money', 'emoji': '💰'},
-        'PERCENT': {'fr': 'Pourcentage', 'en': 'Percent', 'emoji': '📊'},
-        'EVENT': {'fr': 'Événement', 'en': 'Event', 'emoji': '📰'},
-        'PRODUCT': {'fr': 'Produit', 'en': 'Product', 'emoji': '📦'},
-        'LAW': {'fr': 'Loi', 'en': 'Law', 'emoji': '⚖️'},
-        'NORP': {'fr': 'Groupe', 'en': 'Group', 'emoji': '👥'},
-        'MISC': {'fr': 'Divers', 'en': 'Miscellaneous', 'emoji': '🔖'},
-    }
-    def __init__(self, model_name: str = "fr_core_news_md", fallback: bool = True):
-        """
-        Initialize NER analyzer.
-        Args:
-            model_name: spaCy model to load (fr_core_news_md, en_core_web_md)
-            fallback: If True, use heuristics when spaCy unavailable
-        """
-        self.model_name = model_name
-        self.fallback = fallback
-        self.nlp = None
-        self.use_heuristics = False
-        if HAS_SPACY:
-            try:
-                self.nlp = spacy.load(model_name)
-                logger.info(f"[NER] Loaded spaCy model: {model_name}")
-            except OSError as e:
-                logger.warning(f"[NER] Could not load model {model_name}: {e}")
-                if fallback:
-                    self.use_heuristics = True
-                    logger.info("[NER] Using heuristic entity extraction")
-        else:
-            if fallback:
-                self.use_heuristics = True
-                logger.info("[NER] spaCy not installed. Using heuristic extraction")
-    def extract_entities(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
-        """
-        Extract named entities from text.
-        Args:
-            text: Input text to analyze
-        Returns:
-            Dictionary mapping entity types to lists of entities
-            Each entity has: text, start, end, label, label_display, emoji, confidence
-        """
-        if not text or len(text.strip()) == 0:
-            return {}
-        if self.nlp:
-            return self._extract_with_spacy(text)
-        elif self.use_heuristics:
-            return self._extract_with_heuristics(text)
-        else:
-            return {}
-    def _extract_with_spacy(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
-        """Extract entities using spaCy NLP."""
-        doc = self.nlp(text)
-        entities: Dict[str, List[Dict[str, Any]]] = {}
-        for ent in doc.ents:
-            label = ent.label_
-            # Get display info
-            label_info = self.ENTITY_LABELS.get(label, {
-                'fr': label,
-                'en': label,
-                'emoji': '🔖'
-            })
-            entity_data = {
-                'text': ent.text,
-                'start': ent.start_char,
-                'end': ent.end_char,
-                'label': label,
-                'label_display': label_info.get('fr', label),
-                'emoji': label_info.get('emoji', '🔖'),
-                'confidence': 0.85  # spaCy doesn't provide confidence by default
-            }
-            if label not in entities:
-                entities[label] = []
-            # Avoid duplicates
-            if not any(e['text'].lower() == entity_data['text'].lower() for e in entities[label]):
-                entities[label].append(entity_data)
-        return entities
-    def _extract_with_heuristics(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
-        """
-        Fallback heuristic entity extraction.
-        Uses pattern matching for common entities.
-        """
-        import re
-        entities: Dict[str, List[Dict[str, Any]]] = {}
-        # Common patterns
-        patterns = {
-            'PER': [
-                # Known political figures
-                r'\b(Donald Trump|Joe Biden|Emmanuel Macron|Hillary Clinton|Barack Obama|'
-                r'Vladimir Putin|Angela Merkel|Justin Trudeau|Boris Johnson)\b',
-            ],
-            'ORG': [
-                r'\b(FBI|CIA|NSA|ONU|NATO|OTAN|Google|Facebook|Twitter|Meta|'
-                r'Amazon|Microsoft|Apple|CNN|BBC|Le Monde|New York Times|'
-                r'Parti Républicain|Parti Démocrate|Republican Party|Democratic Party)\b',
-            ],
-            'LOC': [
-                r'\b(Capitol|White House|Maison Blanche|Kremlin|Élysée|Pentagon|'
-                r'New York|Washington|Paris|Londres|Moscou|Berlin|Beijing)\b',
-            ],
-            'DATE': [
-                r'\b(\d{1,2}\s+(janvier|février|mars|avril|mai|juin|juillet|août|'
-                r'septembre|octobre|novembre|décembre)\s+\d{4})\b',
-                r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\b',
-                r'\b(January|February|March|April|May|June|July|August|'
-                r'September|October|November|December)\s+\d{1,2},?\s+\d{4}\b',
-            ],
-            'MONEY': [
-                r'\$[\d,]+(?:\.\d{2})?(?:\s*(?:million|billion|trillion))?',
-                r'[\d,]+(?:\.\d{2})?\s*(?:dollars?|euros?|€|\$)',
-                r'[\d,]+\s*(?:million|milliard)s?\s*(?:de\s+)?(?:dollars?|euros?)',
-            ],
-            'PERCENT': [
-                r'\b\d+(?:\.\d+)?%',
-                r'\b\d+(?:\.\d+)?\s*pour\s*cent',
-                r'\b\d+(?:\.\d+)?\s*percent',
-            ],
-        }
-        for label, pattern_list in patterns.items():
-            label_info = self.ENTITY_LABELS.get(label, {'fr': label, 'emoji': '🔖'})
-            for pattern in pattern_list:
-                for match in re.finditer(pattern, text, re.IGNORECASE):
-                    entity_data = {
-                        'text': match.group(),
-                        'start': match.start(),
-                        'end': match.end(),
-                        'label': label,
-                        'label_display': label_info.get('fr', label),
-                        'emoji': label_info.get('emoji', '🔖'),
-                        'confidence': 0.70  # Lower confidence for heuristics
-                    }
-                    if label not in entities:
-                        entities[label] = []
-                    # Avoid duplicates
-                    if not any(e['text'].lower() == entity_data['text'].lower()
-                              for e in entities[label]):
-                        entities[label].append(entity_data)
-        return entities
-    def get_entity_summary(self, entities: Dict[str, List[Dict[str, Any]]]) -> str:
-        """
-        Generate a human-readable summary of extracted entities.
-        Args:
-            entities: Dictionary of entities from extract_entities()
-        Returns:
-            Formatted string summary
-        """
-        if not entities:
-            return "Aucune entité nommée détectée."
-        lines = []
-        for label, ent_list in entities.items():
-            label_info = self.ENTITY_LABELS.get(label, {'fr': label, 'emoji': '🔖'})
-            emoji = label_info.get('emoji', '🔖')
-            label_display = label_info.get('fr', label)
-            entity_texts = [e['text'] for e in ent_list[:5]]  # Limit to 5
-            lines.append(f"{emoji} {label_display}: {', '.join(entity_texts)}")
-        return "\n".join(lines)
-    def to_frontend_format(self, entities: Dict[str, List[Dict[str, Any]]]) -> List[Dict]:
-        """
-        Convert entities to frontend-friendly format.
-        Returns:
-            List of entities with all info for display
-        """
-        result = []
-        for label, ent_list in entities.items():
-            for ent in ent_list:
-                result.append({
-                    'text': ent['text'],
-                    'type': ent['label'],
-                    'type_display': ent.get('label_display', ent['label']),
-                    'emoji': ent.get('emoji', '🔖'),
-                    'confidence': ent.get('confidence', 0.5),
-                    'confidence_pct': f"{int(ent.get('confidence', 0.5) * 100)}%"
-                })
-        # Sort by confidence
-        result.sort(key=lambda x: x['confidence'], reverse=True)
-        return result
-# Singleton instance for easy import
-_ner_analyzer: Optional[NERAnalyzer] = None
-def get_ner_analyzer(model_name: str = "fr_core_news_md") -> NERAnalyzer:
-    """Get or create singleton NER analyzer instance."""
-    global _ner_analyzer
-    if _ner_analyzer is None:
-        _ner_analyzer = NERAnalyzer(model_name=model_name, fallback=True)
-    return _ner_analyzer
-# Quick test
-if __name__ == "__main__":
-    analyzer = NERAnalyzer(fallback=True)
-    test_text = """
-    Donald Trump a affirmé que l'insurrection du 6 janvier 2021 au Capitol n'est jamais arrivée.
-    Le FBI enquête sur les événements. Le président Joe Biden a condamné ces déclarations à Washington.
-    Les dégâts sont estimés à 30 millions de dollars.
-    """
-    entities = analyzer.extract_entities(test_text)
-    print("=== Entités détectées ===")
-    print(analyzer.get_entity_summary(entities))
-    print("\n=== Format Frontend ===")
-    for e in analyzer.to_frontend_format(entities):
-        print(f"  {e['emoji']} {e['text']} ({e['type_display']}, {e['confidence_pct']})")