Spaces: DomLoyer/syscred · Running

App Files Community

Dominique Loyer committed on Mar 6

Commit

1a81e0d

1 Parent(s): 34a26a7

Fix: sync working Sandbox version - NER, E-E-A-T functional

Browse files

Files changed (3) hide show

syscred/eeat_calculator.py +406 -210
syscred/ner_analyzer.py +218 -133
syscred/verification_system.py +129 -127

syscred/eeat_calculator.py CHANGED Viewed

@@ -1,41 +1,118 @@
 # -*- coding: utf-8 -*-
 """
-E-E-A-T Calculator Module - SysCRED
-====================================
-Google Quality Rater Guidelines implementation.
-E-E-A-T Scores:
-- Experience: Domain age, content richness
-- Expertise: Technical vocabulary, citations
-- Authority: Estimated PageRank, backlinks
-- Trust: HTTPS, unbiased sentiment
-(c) Dominique S. Loyer - PhD Thesis Prototype
 """
 import re
-from typing import Dict, Optional
-from urllib.parse import urlparse
 class EEATCalculator:
     """
-    Calculate E-E-A-T scores based on Google Quality Rater Guidelines.
     """
-    # Technical terms that indicate expertise
-    TECHNICAL_TERMS = {
-        'research', 'study', 'analysis', 'data', 'evidence', 'methodology',
-        'peer-reviewed', 'journal', 'university', 'professor', 'dr.', 'phd',
-        'statistics', 'experiment', 'hypothesis', 'publication', 'citation',
-        'algorithm', 'framework', 'systematic', 'empirical', 'quantitative'
     }
-    # Trusted domains (simplified list)
-    TRUSTED_DOMAINS = {
-        '.edu', '.gov', '.org', 'reuters.com', 'apnews.com', 'bbc.com',
-        'nature.com', 'science.org', 'who.int', 'un.org', 'wikipedia.org',
-        'lemonde.fr', 'radio-canada.ca', 'uqam.ca', 'umontreal.ca'
     }
     def __init__(self):
@@ -44,227 +121,346 @@ class EEATCalculator:
     def calculate(
         self,
-        url: Optional[str] = None,
-        text: Optional[str] = None,
-        sentiment_score: float = 0.5,
-        has_citations: bool = False,
-        domain_age_years: int = 0
-    ) -> Dict:
         """
-        Calculate E-E-A-T scores.
         Args:
             url: Source URL
-            text: Content text
-            sentiment_score: 0-1 (0.5 = neutral is best for trust)
-            has_citations: Whether content has citations
-            domain_age_years: Estimated domain age
         Returns:
-            {
-                'experience': 0.75,
-                'expertise': 0.80,
-                'authority': 0.65,
-                'trust': 0.90,
-                'overall': 0.78,
-                'details': {...}
-            }
         """
-        details = {}
-        # --- EXPERIENCE ---
-        experience = 0.5
-        if domain_age_years >= 10:
-            experience += 0.3
-        elif domain_age_years >= 5:
-            experience += 0.2
-        elif domain_age_years >= 2:
-            experience += 0.1
-        if text:
-            word_count = len(text.split())
-            if word_count >= 1000:
-                experience += 0.15
-            elif word_count >= 500:
-                experience += 0.1
-        experience = min(experience, 1.0)
-        details['experience_factors'] = {
-            'domain_age_bonus': domain_age_years >= 2,
-            'content_richness': len(text.split()) if text else 0
-        }
-        # --- EXPERTISE ---
-        expertise = 0.4
-        tech_count = 0
-        if text:
-            text_lower = text.lower()
-            for term in self.TECHNICAL_TERMS:
-                if term in text_lower:
-                    tech_count += 1
-            if tech_count >= 5:
-                expertise += 0.35
-            elif tech_count >= 3:
-                expertise += 0.25
-            elif tech_count >= 1:
-                expertise += 0.15
-        if has_citations:
-            expertise += 0.2
-        expertise = min(expertise, 1.0)
-        details['expertise_factors'] = {
-            'technical_terms_found': tech_count,
-            'has_citations': has_citations
-        }
-        # --- AUTHORITY ---
-        authority = 0.3
-        if url:
-            parsed = urlparse(url)
-            domain = parsed.netloc.lower()
-            for trusted in self.TRUSTED_DOMAINS:
-                if trusted in domain:
-                    authority += 0.4
-                    break
-            if parsed.scheme == 'https':
-                authority += 0.1
-        # Check for author indicators in text
-        if text:
-            author_patterns = [r'by\s+\w+\s+\w+', r'author:', r'written by', r'par\s+\w+']
-            for pattern in author_patterns:
-                if re.search(pattern, text.lower()):
-                    authority += 0.15
-                    break
-        authority = min(authority, 1.0)
-        details['authority_factors'] = {
-            'trusted_domain': False,
-            'https': url and urlparse(url).scheme == 'https' if url else False
-        }
-        # --- TRUST ---
-        trust = 0.5
-        # Neutral sentiment is best (0.5)
-        sentiment_deviation = abs(sentiment_score - 0.5)
-        if sentiment_deviation < 0.1:
-            trust += 0.3  # Very neutral
-        elif sentiment_deviation < 0.2:
-            trust += 0.2
-        elif sentiment_deviation < 0.3:
-            trust += 0.1
-        if url and urlparse(url).scheme == 'https':
-            trust += 0.15
-        trust = min(trust, 1.0)
-        details['trust_factors'] = {
-            'sentiment_neutrality': 1 - sentiment_deviation * 2,
-            'secure_connection': url and 'https' in url if url else False
-        }
-        # --- OVERALL ---
-        overall = (experience * 0.2 + expertise * 0.3 +
-                   authority * 0.25 + trust * 0.25)
-        return {
-            'experience': round(experience, 2),
-            'expertise': round(expertise, 2),
-            'authority': round(authority, 2),
-            'trust': round(trust, 2),
-            'overall': round(overall, 2),
-            'details': details
-        }
-    def get_explanation(self, scores: Dict) -> str:
-        """Generate human-readable explanation of E-E-A-T scores."""
         explanations = []
-        exp = scores.get('experience', 0)
-        if exp >= 0.7:
-            explanations.append("✅ Expérience: Source établie avec contenu riche")
-        elif exp >= 0.5:
-            explanations.append("⚠️ Expérience: Source moyennement établie")
         else:
-            explanations.append("❌ Expérience: Source nouvelle ou contenu limité")
-        ext = scores.get('expertise', 0)
-        if ext >= 0.7:
-            explanations.append("✅ Expertise: Vocabulaire technique, citations présentes")
-        elif ext >= 0.5:
-            explanations.append("⚠️ Expertise: Niveau technique moyen")
         else:
-            explanations.append("❌ Expertise: Manque de terminologie spécialisée")
-        auth = scores.get('authority', 0)
-        if auth >= 0.7:
-            explanations.append("✅ Autorité: Domaine reconnu et fiable")
-        elif auth >= 0.5:
-            explanations.append("⚠️ Autorité: Niveau d'autorité moyen")
         else:
-            explanations.append("❌ Autorité: Source non reconnue")
-        tr = scores.get('trust', 0)
-        if tr >= 0.7:
-            explanations.append("✅ Confiance: Ton neutre, connexion sécurisée")
-        elif tr >= 0.5:
-            explanations.append("⚠️ Confiance: Niveau de confiance moyen")
         else:
-            explanations.append("❌ Confiance: Ton biaisé ou connexion non sécurisée")
         return "\n".join(explanations)
-# Singleton
-_calculator = None
-def get_calculator() -> EEATCalculator:
-    """Get or create E-E-A-T calculator singleton."""
-    global _calculator
-    if _calculator is None:
-        _calculator = EEATCalculator()
-    return _calculator
-# --- Testing ---
 if __name__ == "__main__":
-    print("=" * 60)
-    print("SysCRED E-E-A-T Calculator - Test")
-    print("=" * 60)
     calc = EEATCalculator()
-    test_url = "https://www.nature.com/articles/example"
     test_text = """
-    A peer-reviewed study published in the journal Nature found evidence
-    that the new methodology significantly improves research outcomes.
-    Dr. Smith from Harvard University presented the statistics at the conference.
     """
-    result = calc.calculate(
         url=test_url,
         text=test_text,
-        sentiment_score=0.5,
-        has_citations=True,
-        domain_age_years=15
     )
-    print("\n--- E-E-A-T Scores ---")
-    print(f"  Experience: {result['experience']:.0%}")
-    print(f"  Expertise:  {result['expertise']:.0%}")
-    print(f"  Authority:  {result['authority']:.0%}")
-    print(f"  Trust:      {result['trust']:.0%}")
-    print(f"  ─────────────────")
-    print(f"  OVERALL:    {result['overall']:.0%}")
-    print("\n--- Explanation ---")
-    print(calc.get_explanation(result))
-    print("\n" + "=" * 60)

+#!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
+E-E-A-T Metrics Calculator for SysCRED
+========================================
+Calculates Google-style E-E-A-T metrics (Experience, Expertise, Authority, Trust).
+These metrics mirror modern Google ranking signals:
+- Experience: Domain age, content freshness
+- Expertise: Author identification, depth of content
+- Authority: PageRank simulation, citations/backlinks
+- Trust: HTTPS, fact-checks, low bias score
 """
+from typing import Dict, Any, Optional, List
+from dataclasses import dataclass
 import re
+from datetime import datetime
+import logging
+logger = logging.getLogger(__name__)
+@dataclass
+class EEATScore:
+    """E-E-A-T score container."""
+    experience: float  # 0-1
+    expertise: float   # 0-1
+    authority: float   # 0-1
+    trust: float       # 0-1
+    @property
+    def overall(self) -> float:
+        """Weighted average of all E-E-A-T components."""
+        # Weights based on Google's emphasis
+        weights = {
+            'experience': 0.15,
+            'expertise': 0.25,
+            'authority': 0.35,
+            'trust': 0.25
+        }
+        return (
+            self.experience * weights['experience'] +
+            self.expertise * weights['expertise'] +
+            self.authority * weights['authority'] +
+            self.trust * weights['trust']
+        )
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary for JSON serialization."""
+        return {
+            'experience': round(self.experience, 3),
+            'expertise': round(self.expertise, 3),
+            'authority': round(self.authority, 3),
+            'trust': round(self.trust, 3),
+            'overall': round(self.overall, 3),
+            'experience_pct': f"{int(self.experience * 100)}%",
+            'expertise_pct': f"{int(self.expertise * 100)}%",
+            'authority_pct': f"{int(self.authority * 100)}%",
+            'trust_pct': f"{int(self.trust * 100)}%",
+            'overall_pct': f"{int(self.overall * 100)}%"
+        }
 class EEATCalculator:
     """
+    Calculate E-E-A-T metrics from various signals.
+    Mirrors Google's quality rater guidelines:
+    - Experience: Has the author demonstrated real experience?
+    - Expertise: Is the content expert-level?
+    - Authority: Is the source recognized as authoritative?
+    - Trust: Is the source trustworthy?
     """
+    # Known authoritative domains
+    AUTHORITATIVE_DOMAINS = {
+        # News
+        'lemonde.fr': 0.95,
+        'lefigaro.fr': 0.90,
+        'liberation.fr': 0.88,
+        'nytimes.com': 0.95,
+        'washingtonpost.com': 0.93,
+        'theguardian.com': 0.92,
+        'bbc.com': 0.94,
+        'bbc.co.uk': 0.94,
+        'reuters.com': 0.96,
+        'apnews.com': 0.95,
+        # Academic
+        'nature.com': 0.98,
+        'science.org': 0.98,
+        'pubmed.ncbi.nlm.nih.gov': 0.97,
+        'scholar.google.com': 0.85,
+        # Government
+        'gouv.fr': 0.90,
+        'gov.uk': 0.90,
+        'whitehouse.gov': 0.88,
+        'europa.eu': 0.92,
+        # Fact-checkers
+        'snopes.com': 0.88,
+        'factcheck.org': 0.90,
+        'politifact.com': 0.88,
+        'fullfact.org': 0.89,
+        # Wikipedia (moderate authority)
+        'wikipedia.org': 0.75,
+        'fr.wikipedia.org': 0.75,
+        'en.wikipedia.org': 0.75,
     }
+    # Low-trust domains (misinformation sources)
+    LOW_TRUST_DOMAINS = {
+        'infowars.com': 0.1,
+        'breitbart.com': 0.3,
+        'naturalnews.com': 0.15,
+        # Add more as needed
     }
     def __init__(self):
     def calculate(
         self,
+        url: str,
+        text: str,
+        nlp_analysis: Optional[Dict[str, Any]] = None,
+        pagerank: Optional[float] = None,
+        fact_checks: Optional[List[Dict]] = None,
+        domain_age_years: Optional[float] = None,
+        has_https: bool = True,
+        author_identified: bool = False,
+        seo_score: Optional[float] = None
+    ) -> EEATScore:
         """
+        Calculate E-E-A-T scores from available signals.
         Args:
             url: Source URL
+            text: Article text content
+            nlp_analysis: NLP analysis results (sentiment, coherence, bias)
+            pagerank: Simulated PageRank score (0-1)
+            fact_checks: List of fact-check results
+            domain_age_years: Domain age in years (from WHOIS)
+            has_https: Whether site uses HTTPS
+            author_identified: Whether author is clearly identified
+            seo_score: SEO/technical quality score
         Returns:
+            EEATScore with all component scores
         """
+        # Extract domain from URL
+        domain = self._extract_domain(url)
+        # Calculate each component
+        experience = self._calculate_experience(
+            domain_age_years,
+            text,
+            nlp_analysis
+        )
+        expertise = self._calculate_expertise(
+            text,
+            author_identified,
+            nlp_analysis
+        )
+        authority = self._calculate_authority(
+            domain,
+            pagerank,
+            seo_score
+        )
+        trust = self._calculate_trust(
+            domain,
+            has_https,
+            fact_checks,
+            nlp_analysis
+        )
+        return EEATScore(
+            experience=experience,
+            expertise=expertise,
+            authority=authority,
+            trust=trust
+        )
+    def _extract_domain(self, url: str) -> str:
+        """Extract domain from URL."""
+        import re
+        match = re.search(r'https?://(?:www\.)?([^/]+)', url)
+        return match.group(1).lower() if match else url.lower()
+    def _calculate_experience(
+        self,
+        domain_age_years: Optional[float],
+        text: str,
+        nlp_analysis: Optional[Dict]
+    ) -> float:
+        """
+        Calculate Experience score.
+        Factors:
+        - Domain age (longer = more experience)
+        - Content freshness (recently updated)
+        - First-hand experience indicators in text
+        """
+        score = 0.5  # Base score
+        # Domain age contribution (max 0.3)
+        if domain_age_years is not None:
+            age_score = min(domain_age_years / 20, 1.0) * 0.3  # 20 years = max
+            score += age_score
+        else:
+            score += 0.15  # Assume moderate age
+        # Content depth contribution (max 0.2)
+        word_count = len(text.split()) if text else 0
+        if word_count > 1000:
+            score += 0.2
+        elif word_count > 500:
+            score += 0.15
+        elif word_count > 200:
+            score += 0.1
+        # First-hand experience indicators (max 0.1)
+        experience_indicators = [
+            r'\b(j\'ai|je suis|nous avons|I have|we have|in my experience)\b',
+            r'\b(interview|entretien|témoignage|witness|firsthand)\b',
+            r'\b(sur place|on the ground|eyewitness)\b'
+        ]
+        for pattern in experience_indicators:
+            if re.search(pattern, text, re.IGNORECASE):
+                score += 0.03
+        return min(score, 1.0)
+    def _calculate_expertise(
+        self,
+        text: str,
+        author_identified: bool,
+        nlp_analysis: Optional[Dict]
+    ) -> float:
+        """
+        Calculate Expertise score.
+        Factors:
+        - Author identification
+        - Technical depth of content
+        - Citation of sources
+        - Coherence (from NLP)
+        """
+        score = 0.4  # Base score
+        # Author identification (0.2)
+        if author_identified:
+            score += 0.2
+        # Citation indicators (max 0.2)
+        citation_patterns = [
+            r'\b(selon|according to|d\'après|source:)\b',
+            r'\b(étude|study|research|rapport|report)\b',
+            r'\b(expert|spécialiste|chercheur|professor|Dr\.)\b',
+            r'\[([\d]+)\]',  # [1] style citations
+            r'https?://[^\s]+'  # Links
+        ]
+        citation_count = 0
+        for pattern in citation_patterns:
+            citation_count += len(re.findall(pattern, text, re.IGNORECASE))
+        score += min(citation_count * 0.02, 0.2)
+        # Coherence from NLP analysis (0.2)
+        if nlp_analysis and 'coherence' in nlp_analysis:
+            coherence = nlp_analysis['coherence']
+            if isinstance(coherence, dict):
+                coherence = coherence.get('score', 0.5)
+            score += coherence * 0.2
+        else:
+            score += 0.1  # Assume moderate coherence
+        return min(score, 1.0)
+    def _calculate_authority(
+        self,
+        domain: str,
+        pagerank: Optional[float],
+        seo_score: Optional[float]
+    ) -> float:
+        """
+        Calculate Authority score.
+        Factors:
+        - Known authoritative domain
+        - PageRank simulation
+        - SEO/technical quality
+        """
+        score = 0.3  # Base score
+        # Known domain authority (max 0.5)
+        for known_domain, authority in self.AUTHORITATIVE_DOMAINS.items():
+            if known_domain in domain:
+                score = max(score, authority * 0.5 + 0.3)
+                break
+        # Check low-trust domains
+        for low_trust_domain, low_score in self.LOW_TRUST_DOMAINS.items():
+            if low_trust_domain in domain:
+                score = min(score, low_score)
+                break
+        # PageRank contribution (max 0.3)
+        if pagerank is not None:
+            score += pagerank * 0.3
+        else:
+            score += 0.15  # Assume moderate pagerank
+        # SEO score contribution (max 0.2)
+        if seo_score is not None:
+            score += seo_score * 0.2
+        else:
+            score += 0.1
+        return min(score, 1.0)
+    def _calculate_trust(
+        self,
+        domain: str,
+        has_https: bool,
+        fact_checks: Optional[List[Dict]],
+        nlp_analysis: Optional[Dict]
+    ) -> float:
+        """
+        Calculate Trust score.
+        Factors:
+        - HTTPS
+        - Fact-check results
+        - Bias score (low = better)
+        - Known trustworthy domain
+        """
+        score = 0.4  # Base score
+        # HTTPS (0.1)
+        if has_https:
+            score += 0.1
+        # Fact-check results (max 0.3)
+        if fact_checks:
+            positive_checks = sum(1 for fc in fact_checks
+                                  if fc.get('rating', '').lower() in ['true', 'vrai', 'correct'])
+            negative_checks = sum(1 for fc in fact_checks
+                                  if fc.get('rating', '').lower() in ['false', 'faux', 'incorrect', 'pants-fire'])
+            if positive_checks > 0:
+                score += 0.2
+            if negative_checks > 0:
+                score -= 0.3
+        # Bias score (max 0.2, lower bias = higher trust)
+        if nlp_analysis:
+            bias_data = nlp_analysis.get('bias_analysis', {})
+            if isinstance(bias_data, dict):
+                bias_score = bias_data.get('score', 0.3)
+            else:
+                bias_score = 0.3
+            # Invert: low bias = high trust contribution
+            score += (1 - bias_score) * 0.2
+        else:
+            score += 0.1
+        # Known trustworthy domain (0.1)
+        for known_domain in self.AUTHORITATIVE_DOMAINS:
+            if known_domain in domain:
+                score += 0.1
+                break
+        # Known low-trust domain (penalty)
+        for low_trust_domain in self.LOW_TRUST_DOMAINS:
+            if low_trust_domain in domain:
+                score -= 0.3
+                break
+        return max(min(score, 1.0), 0.0)
+    def explain_score(self, eeat: EEATScore, url: str) -> str:
+        """
+        Generate human-readable explanation of E-E-A-T score.
+        Args:
+            eeat: EEATScore instance
+            url: Source URL
+        Returns:
+            Formatted explanation string
+        """
+        domain = self._extract_domain(url)
         explanations = []
+        # Experience
+        if eeat.experience >= 0.8:
+            explanations.append(f"✅ **Expérience élevée** ({eeat.experience_pct}): Source établie depuis longtemps")
+        elif eeat.experience >= 0.5:
+            explanations.append(f"🔶 **Expérience moyenne** ({eeat.experience_pct}): Source modérément établie")
         else:
+            explanations.append(f"⚠️ **Expérience faible** ({eeat.experience_pct}): Source récente ou peu connue")
+        # Expertise
+        if eeat.expertise >= 0.8:
+            explanations.append(f"✅ **Expertise élevée** ({eeat.expertise_pct}): Contenu approfondi avec citations")
+        elif eeat.expertise >= 0.5:
+            explanations.append(f"🔶 **Expertise moyenne** ({eeat.expertise_pct}): Contenu standard")
         else:
+            explanations.append(f"⚠️ **Expertise faible** ({eeat.expertise_pct}): Manque de profondeur")
+        # Authority
+        if eeat.authority >= 0.8:
+            explanations.append(f"✅ **Autorité élevée** ({eeat.authority_pct}): Source très citée et reconnue")
+        elif eeat.authority >= 0.5:
+            explanations.append(f"🔶 **Autorité moyenne** ({eeat.authority_pct}): Source modérément reconnue")
         else:
+            explanations.append(f"⚠️ **Autorité faible** ({eeat.authority_pct}): Peu de citations externes")
+        # Trust
+        if eeat.trust >= 0.8:
+            explanations.append(f"✅ **Confiance élevée** ({eeat.trust_pct}): Faits vérifiés, pas de biais")
+        elif eeat.trust >= 0.5:
+            explanations.append(f"🔶 **Confiance moyenne** ({eeat.trust_pct}): Quelques signaux de confiance")
         else:
+            explanations.append(f"⚠️ **Confiance faible** ({eeat.trust_pct}): Prudence recommandée")
         return "\n".join(explanations)
+# Test
 if __name__ == "__main__":
     calc = EEATCalculator()
+    test_url = "https://www.lemonde.fr/politique/article/2024/01/06/trump.html"
     test_text = """
+    Selon une étude du chercheur Dr. Martin, l'insurrection du 6 janvier 2021
+    au Capitol a été un événement marquant. Notre reporter sur place a témoigné
+    des événements. Les experts politiques analysent les conséquences.
     """
+    nlp_analysis = {
+        'coherence': {'score': 0.8},
+        'bias_analysis': {'score': 0.2}
+    }
+    eeat = calc.calculate(
         url=test_url,
         text=test_text,
+        nlp_analysis=nlp_analysis,
+        pagerank=0.7,
+        has_https=True,
+        author_identified=True
     )
+    print("=== E-E-A-T Scores ===")
+    print(f"Experience: {eeat.experience_pct}")
+    print(f"Expertise:  {eeat.expertise_pct}")
+    print(f"Authority:  {eeat.authority_pct}")
+    print(f"Trust:      {eeat.trust_pct}")
+    print(f"Overall:    {eeat.overall_pct}")
+    print("\n=== Explanation ===")
+    print(calc.explain_score(eeat, test_url))

syscred/ner_analyzer.py CHANGED Viewed

@@ -1,198 +1,283 @@
 # -*- coding: utf-8 -*-
 """
-NER Analyzer Module - SysCRED
-==============================
-Named Entity Recognition for fact-checking enhancement.
-Extracts: PERSON, ORG, GPE, DATE, MISC entities
-(c) Dominique S. Loyer - PhD Thesis Prototype
 """
-import os
-# Check for spaCy
 try:
     import spacy
     HAS_SPACY = True
 except ImportError:
     HAS_SPACY = False
-    print("[NER] spaCy not installed. NER disabled.")
 class NERAnalyzer:
     """
-    Named Entity Recognition using spaCy.
-    Supports:
-    - French (fr_core_news_md)
-    - English (en_core_web_sm)
     """
-    # Entity type mapping with icons
-    ENTITY_ICONS = {
-        'PERSON': '👤',
-        'PER': '👤',
-        'ORG': '🏢',
-        'GPE': '📍',
-        'LOC': '📍',
-        'DATE': '📅',
-        'TIME': '🕐',
-        'MONEY': '💰',
-        'MISC': '🏷️',
-        'NORP': '👥',
-        'FAC': '🏛️',
-        'PRODUCT': '📦',
-        'EVENT': '🎉',
-        'WORK_OF_ART': '🎨',
-        'LAW': '⚖️',
-        'LANGUAGE': '🗣️',
     }
-    def __init__(self, language: str = 'en'):
         """
         Initialize NER analyzer.
         Args:
-            language: 'en' or 'fr'
         """
-        self.language = language
         self.nlp = None
-        self.enabled = False
         if HAS_SPACY:
-            self._load_model()
-    def _load_model(self):
-        """Load the appropriate spaCy model."""
-        models = {
-            'en': ['en_core_web_sm', 'en_core_web_md'],
-            'fr': ['fr_core_news_md', 'fr_core_news_sm']
-        }
-        for model_name in models.get(self.language, models['en']):
             try:
                 self.nlp = spacy.load(model_name)
-                self.enabled = True
-                print(f"[NER] Loaded model: {model_name}")
-                break
-            except OSError:
-                continue
-        if not self.enabled:
-            print(f"[NER] No model found for language: {self.language}")
-    def extract_entities(self, text: str) -> dict:
         """
         Extract named entities from text.
         Returns:
-            {
-                'entities': [
-                    {'text': 'Emmanuel Macron', 'type': 'PERSON', 'icon': '👤'},
-                    ...
-                ],
-                'summary': {
-                    'PERSON': ['Emmanuel Macron'],
-                    'ORG': ['UQAM', 'Google'],
-                    ...
-                }
-            }
         """
-        if not self.enabled or not text:
-            return {'entities': [], 'summary': {}}
         doc = self.nlp(text)
-        entities = []
-        summary = {}
-        seen = set()
         for ent in doc.ents:
-            # Avoid duplicates
-            key = (ent.text.lower(), ent.label_)
-            if key in seen:
-                continue
-            seen.add(key)
-            entity = {
                 'text': ent.text,
-                'type': ent.label_,
-                'icon': self.ENTITY_ICONS.get(ent.label_, '🏷️'),
                 'start': ent.start_char,
-                'end': ent.end_char
             }
-            entities.append(entity)
-            # Group by type
-            if ent.label_ not in summary:
-                summary[ent.label_] = []
-            summary[ent.label_].append(ent.text)
-        return {
-            'entities': entities,
-            'summary': summary,
-            'count': len(entities)
         }
-    def analyze_for_factcheck(self, text: str) -> dict:
         """
-        Analyze text for fact-checking relevance.
-        Returns entities with credibility hints.
         """
-        result = self.extract_entities(text)
-        # Add fact-checking hints
-        hints = []
-        for ent in result.get('entities', []):
-            if ent['type'] in ['PERSON', 'PER']:
-                hints.append(f"Verify claims about {ent['text']}")
-            elif ent['type'] == 'ORG':
-                hints.append(f"Check {ent['text']} official sources")
-            elif ent['type'] in ['GPE', 'LOC']:
-                hints.append(f"Verify location: {ent['text']}")
-            elif ent['type'] == 'DATE':
-                hints.append(f"Confirm date: {ent['text']}")
-        result['fact_check_hints'] = hints[:5]  # Top 5 hints
         return result
-# Singleton instance
-_analyzer = None
-def get_analyzer(language: str = 'en') -> NERAnalyzer:
-    """Get or create the NER analyzer singleton."""
-    global _analyzer
-    if _analyzer is None:
-        _analyzer = NERAnalyzer(language)
-    return _analyzer
-# --- Testing ---
 if __name__ == "__main__":
-    print("=" * 60)
-    print("SysCRED NER Analyzer - Test")
-    print("=" * 60)
-    analyzer = NERAnalyzer('en')
     test_text = """
-    Emmanuel Macron announced today that France will invest €500 million
-    in AI research. The announcement was made at the UQAM in Montreal, Canada
-    on February 8, 2026. Google and Microsoft also confirmed their participation.
     """
-    result = analyzer.analyze_for_factcheck(test_text)
-    print("\n--- Entities Found ---")
-    for ent in result['entities']:
-        print(f"  {ent['icon']} {ent['text']} ({ent['type']})")
-    print("\n--- Fact-Check Hints ---")
-    for hint in result.get('fact_check_hints', []):
-        print(f"  • {hint}")
-    print("\n" + "=" * 60)

+#!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
+Named Entity Recognition (NER) Analyzer for SysCRED
+====================================================
+Extracts named entities from text using spaCy.
+Entities detected:
+- PER: Persons (Donald Trump, Emmanuel Macron)
+- ORG: Organizations (FBI, UN, Google)
+- LOC: Locations (Paris, Capitol)
+- DATE: Dates (January 6, 2021)
+- MONEY: Amounts ($10 million)
+- EVENT: Events (insurrection, election)
 """
+from typing import Dict, List, Any, Optional
+import logging
+# Try to import spaCy
 try:
     import spacy
+    from spacy.language import Language
     HAS_SPACY = True
 except ImportError:
     HAS_SPACY = False
+    spacy = None
+logger = logging.getLogger(__name__)
 class NERAnalyzer:
     """
     Named Entity Recognition analyzer using spaCy.

     Supports French (fr_core_news_md) and English (en_core_web_md) models.
     Falls back to regex-based heuristic extraction when spaCy (or the
     requested model) is not available and ``fallback`` is enabled.

     Note: ``label_display`` is always taken from the French ('fr') entry of
     ``ENTITY_LABELS``, whatever model is loaded.
     """

     # Entity type mappings for display
     ENTITY_LABELS = {
         'PER': {'fr': 'Personne', 'en': 'Person', 'emoji': '👤'},
         'PERSON': {'fr': 'Personne', 'en': 'Person', 'emoji': '👤'},
         'ORG': {'fr': 'Organisation', 'en': 'Organization', 'emoji': '🏢'},
         'LOC': {'fr': 'Lieu', 'en': 'Location', 'emoji': '📍'},
         'GPE': {'fr': 'Lieu géopolitique', 'en': 'Geopolitical', 'emoji': '🌍'},
         'DATE': {'fr': 'Date', 'en': 'Date', 'emoji': '📅'},
         'TIME': {'fr': 'Heure', 'en': 'Time', 'emoji': '⏰'},
         'MONEY': {'fr': 'Montant', 'en': 'Money', 'emoji': '💰'},
         'PERCENT': {'fr': 'Pourcentage', 'en': 'Percent', 'emoji': '📊'},
         'EVENT': {'fr': 'Événement', 'en': 'Event', 'emoji': '📰'},
         'PRODUCT': {'fr': 'Produit', 'en': 'Product', 'emoji': '📦'},
         'LAW': {'fr': 'Loi', 'en': 'Law', 'emoji': '⚖️'},
         'NORP': {'fr': 'Groupe', 'en': 'Group', 'emoji': '👥'},
         'MISC': {'fr': 'Divers', 'en': 'Miscellaneous', 'emoji': '🔖'},
     }

     # Emoji used when a label has no entry in ENTITY_LABELS
     DEFAULT_EMOJI = '🔖'

     # Fixed confidence values attached to every entity of a given backend
     SPACY_CONFIDENCE = 0.85  # spaCy doesn't provide confidence by default
     HEURISTIC_CONFIDENCE = 0.70  # Lower confidence for heuristics

     # Regex patterns used by the heuristic fallback, grouped by entity label.
     # All of them are applied case-insensitively.
     _HEURISTIC_PATTERNS = {
         'PER': (
             # Known political figures
             r'\b(Donald Trump|Joe Biden|Emmanuel Macron|Hillary Clinton|Barack Obama|'
             r'Vladimir Putin|Angela Merkel|Justin Trudeau|Boris Johnson)\b',
         ),
         'ORG': (
             r'\b(FBI|CIA|NSA|ONU|NATO|OTAN|Google|Facebook|Twitter|Meta|'
             r'Amazon|Microsoft|Apple|CNN|BBC|Le Monde|New York Times|'
             r'Parti Républicain|Parti Démocrate|Republican Party|Democratic Party)\b',
         ),
         'LOC': (
             r'\b(Capitol|White House|Maison Blanche|Kremlin|Élysée|Pentagon|'
             r'New York|Washington|Paris|Londres|Moscou|Berlin|Beijing)\b',
         ),
         'DATE': (
             r'\b(\d{1,2}\s+(janvier|février|mars|avril|mai|juin|juillet|août|'
             r'septembre|octobre|novembre|décembre)\s+\d{4})\b',
             r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\b',
             r'\b(January|February|March|April|May|June|July|August|'
             r'September|October|November|December)\s+\d{1,2},?\s+\d{4}\b',
         ),
         'MONEY': (
             # Amounts must start with a digit: a bare "[\d,]+" also matches a
             # lone comma and turned "euros, dollars" into the entity ", dollars".
             r'\$\d[\d,]*(?:\.\d{2})?(?:\s*(?:million|billion|trillion))?',
             r'\d[\d,]*(?:\.\d{2})?\s*(?:dollars?|euros?|€|\$)',
             r'\d[\d,]*\s*(?:million|milliard)s?\s*(?:de\s+)?(?:dollars?|euros?)',
         ),
         'PERCENT': (
             # Accept the French decimal comma and the (optional) space that
             # French typography puts before the percent sign: "3,5 %".
             r'\b\d+(?:[.,]\d+)?\s?%',
             r'\b\d+(?:[.,]\d+)?\s*pour\s*cent',
             r'\b\d+(?:[.,]\d+)?\s*percent',
         ),
     }

     def __init__(self, model_name: str = "fr_core_news_md", fallback: bool = True):
         """
         Initialize NER analyzer.

         Args:
             model_name: spaCy model to load (fr_core_news_md, en_core_web_md)
             fallback: If True, use heuristics when spaCy unavailable
         """
         self.model_name = model_name
         self.fallback = fallback
         self.nlp = None
         self.use_heuristics = False

         if HAS_SPACY:
             try:
                 self.nlp = spacy.load(model_name)
                 logger.info("[NER] Loaded spaCy model: %s", model_name)
             except OSError as e:
                 # Only OSError is handled here; any other error propagates.
                 logger.warning("[NER] Could not load model %s: %s", model_name, e)
                 if fallback:
                     self.use_heuristics = True
                     logger.info("[NER] Using heuristic entity extraction")
         elif fallback:
             self.use_heuristics = True
             logger.info("[NER] spaCy not installed. Using heuristic extraction")

     def extract_entities(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
         """
         Extract named entities from text.

         Args:
             text: Input text to analyze

         Returns:
             Dictionary mapping entity types to lists of entities.
             Each entity has: text, start, end, label, label_display, emoji,
             confidence. Empty dict for empty/blank input or when neither
             spaCy nor the heuristic fallback is enabled.
         """
         if not text or not text.strip():
             return {}
         if self.nlp:
             return self._extract_with_spacy(text)
         if self.use_heuristics:
             return self._extract_with_heuristics(text)
         return {}

     def _build_entity(self, text: str, start: int, end: int,
                       label: str, confidence: float) -> Dict[str, Any]:
         """Build one entity record, resolving display name and emoji for label."""
         label_info = self.ENTITY_LABELS.get(label, {})
         return {
             'text': text,
             'start': start,
             'end': end,
             'label': label,
             'label_display': label_info.get('fr', label),
             'emoji': label_info.get('emoji', self.DEFAULT_EMOJI),
             'confidence': confidence,
         }

     @staticmethod
     def _add_unique(entities: Dict[str, List[Dict[str, Any]]],
                     seen: set, entity: Dict[str, Any]) -> None:
         """Append entity under its label unless the same text (case-insensitive) was already kept."""
         key = (entity['label'], entity['text'].lower())
         if key in seen:
             return
         seen.add(key)
         entities.setdefault(entity['label'], []).append(entity)

     def _extract_with_spacy(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
         """Extract entities using spaCy NLP (first occurrence of each text per label)."""
         doc = self.nlp(text)
         entities: Dict[str, List[Dict[str, Any]]] = {}
         seen: set = set()
         for ent in doc.ents:
             self._add_unique(entities, seen, self._build_entity(
                 ent.text, ent.start_char, ent.end_char,
                 ent.label_, self.SPACY_CONFIDENCE,
             ))
         return entities

     def _extract_with_heuristics(self, text: str) -> Dict[str, List[Dict[str, Any]]]:
         """
         Fallback heuristic entity extraction.

         Applies the regexes of ``_HEURISTIC_PATTERNS`` case-insensitively and
         keeps the first occurrence of each matched text per label.
         """
         import re

         entities: Dict[str, List[Dict[str, Any]]] = {}
         seen: set = set()
         for label, pattern_list in self._HEURISTIC_PATTERNS.items():
             for pattern in pattern_list:
                 for match in re.finditer(pattern, text, re.IGNORECASE):
                     self._add_unique(entities, seen, self._build_entity(
                         match.group(), match.start(), match.end(),
                         label, self.HEURISTIC_CONFIDENCE,
                     ))
         return entities

     def get_entity_summary(self, entities: Dict[str, List[Dict[str, Any]]]) -> str:
         """
         Generate a human-readable summary of extracted entities.

         Args:
             entities: Dictionary of entities from extract_entities()

         Returns:
             One line per label ("<emoji> <label>: a, b, ..."), listing at most
             five entities per label, or a fixed French message when there is
             nothing to report.
         """
         lines = []
         for label, ent_list in (entities or {}).items():
             if not ent_list:
                 # Skip labels with no entity instead of printing "label: "
                 continue
             label_info = self.ENTITY_LABELS.get(label, {})
             emoji = label_info.get('emoji', self.DEFAULT_EMOJI)
             label_display = label_info.get('fr', label)
             entity_texts = [e['text'] for e in ent_list[:5]]  # Limit to 5
             lines.append(f"{emoji} {label_display}: {', '.join(entity_texts)}")
         if not lines:
             return "Aucune entité nommée détectée."
         return "\n".join(lines)

     def to_frontend_format(self, entities: Dict[str, List[Dict[str, Any]]]) -> List[Dict]:
         """
         Convert entities to frontend-friendly format.

         Args:
             entities: Dictionary of entities from extract_entities()

         Returns:
             Flat list of entities (text, type, type_display, emoji,
             confidence, confidence_pct) sorted by decreasing confidence.
         """
         result = []
         for ent_list in entities.values():
             for ent in ent_list:
                 confidence = ent.get('confidence', 0.5)
                 result.append({
                     'text': ent['text'],
                     'type': ent['label'],
                     'type_display': ent.get('label_display', ent['label']),
                     'emoji': ent.get('emoji', self.DEFAULT_EMOJI),
                     'confidence': confidence,
                     # round(), not int(): 0.29 * 100 is 28.999... in floats
                     'confidence_pct': f"{round(confidence * 100)}%"
                 })
         # Sort by confidence (stable: ties keep extraction order)
         result.sort(key=lambda x: x['confidence'], reverse=True)
         return result
+# Singleton instance for easy import
+_ner_analyzer: Optional[NERAnalyzer] = None
+def get_ner_analyzer(model_name: str = "fr_core_news_md") -> NERAnalyzer:
+    """Get or create singleton NER analyzer instance."""
+    global _ner_analyzer
+    if _ner_analyzer is None:
+        _ner_analyzer = NERAnalyzer(model_name=model_name, fallback=True)
+    return _ner_analyzer
+# Quick test
 if __name__ == "__main__":
     # Manual smoke test: run the analyzer on a short French sample and print
     # both the text summary and the flattened frontend representation.
     demo_analyzer = NERAnalyzer(fallback=True)
     sample_text = """
     Donald Trump a affirmé que l'insurrection du 6 janvier 2021 au Capitol n'est jamais arrivée.
     Le FBI enquête sur les événements. Le président Joe Biden a condamné ces déclarations à Washington.
     Les dégâts sont estimés à 30 millions de dollars.
     """
     found = demo_analyzer.extract_entities(sample_text)
     summary = demo_analyzer.get_entity_summary(found)
     print("=== Entités détectées ===")
     print(summary)
     print("\n=== Format Frontend ===")
     frontend_items = demo_analyzer.to_frontend_format(found)
     for item in frontend_items:
         print(f"  {item['emoji']} {item['text']} ({item['type_display']}, {item['confidence_pct']})")

syscred/verification_system.py CHANGED Viewed

@@ -33,28 +33,35 @@ except ImportError:
     HAS_SBERT = False
     print("Warning: sentence-transformers not installed. Semantic coherence will use heuristics.")
-# Local imports
-from syscred.api_clients import ExternalAPIClients, WebContent, ExternalData
-from syscred.ontology_manager import OntologyManager
-from syscred.seo_analyzer import SEOAnalyzer
-from syscred.graph_rag import GraphRAG  # [NEW] GraphRAG
-from syscred.trec_retriever import TRECRetriever, Evidence, RetrievalResult  # [NEW] TREC Integration
-from syscred import config
-# [NEW] NER and E-E-A-T modules
 try:
-    from syscred.ner_analyzer import NERAnalyzer, get_ner_analyzer
-    HAS_NER = True
 except ImportError:
-    HAS_NER = False
-    print("[SysCRED] Warning: NER module not available")
 try:
     from syscred.eeat_calculator import EEATCalculator, EEATScore
-    HAS_EEAT = True
 except ImportError:
-    HAS_EEAT = False
-    print("[SysCRED] Warning: E-E-A-T module not available")
 class CredibilityVerificationSystem:
@@ -136,6 +143,18 @@ class CredibilityVerificationSystem:
         # Weights for score calculation (Loaded from Config)
         self.weights = config.Config.SCORE_WEIGHTS
         print(f"[SysCRED] Using weights: {self.weights}")
         print("[SysCRED] System ready!")
@@ -144,40 +163,47 @@ class CredibilityVerificationSystem:
         print("[SysCRED] Loading ML models (this may take a moment)...")
         try:
-            # Sentiment analysis
             self.sentiment_pipeline = pipeline(
-                "sentiment-analysis",
-                model="distilbert-base-uncased-finetuned-sst-2-english"
             )
-            print("[SysCRED] ✓ Sentiment model loaded")
         except Exception as e:
             print(f"[SysCRED] ✗ Sentiment model failed: {e}")
         try:
-            # NER pipeline
-            self.ner_pipeline = pipeline("ner", grouped_entities=True)
-            print("[SysCRED] ✓ NER model loaded")
         except Exception as e:
             print(f"[SysCRED] ✗ NER model failed: {e}")
         try:
-            # Bias detection - Specialized model
-            # Using 'd4data/bias-detection-model' or fallback to generic
-            bias_model_name = "d4data/bias-detection-model"
             self.bias_tokenizer = AutoTokenizer.from_pretrained(bias_model_name)
             self.bias_model = AutoModelForSequenceClassification.from_pretrained(bias_model_name)
-            print("[SysCRED] ✓ Bias model loaded (d4data)")
         except Exception as e:
             print(f"[SysCRED] ✗ Bias model failed: {e}. Using heuristics.")
         try:
-            # Semantic Coherence
             if HAS_SBERT:
                 self.coherence_model = SentenceTransformer('all-MiniLM-L6-v2')
-                print("[SysCRED] ✓ Coherence model loaded (SBERT)")
         except Exception as e:
             print(f"[SysCRED] ✗ Coherence model failed: {e}")
         try:
             # LIME explainer
             self.explainer = LimeTextExplainer(class_names=['NEGATIVE', 'POSITIVE'])
@@ -501,6 +527,26 @@ class CredibilityVerificationSystem:
             adjustment_factor = (graph_score - 0.5) * w_graph * confidence
             adjustments += adjustment_factor
             total_weight_used += w_graph * confidence  # Partial weight based on confidence
         # Final calculation
         # Base 0.5 + sum of weighted adjustments
@@ -657,11 +703,24 @@ class CredibilityVerificationSystem:
     ) -> Dict[str, Any]:
         """Generate the final evaluation report."""
         report = {
             'idRapport': f"report_{int(datetime.datetime.now().timestamp())}",
             'informationEntree': input_data,
             'dateGeneration': datetime.datetime.now().isoformat(),
             'scoreCredibilite': round(overall_score, 2),
             'resumeAnalyse': "",
             'detailsScore': {
                 'base': 0.5,
@@ -688,8 +747,6 @@ class CredibilityVerificationSystem:
             },
             # [NEW] TREC Evidence section
             'evidences': evidences or [],
-            # [NEW] TREC IR Metrics for dashboard
-            'trec_metrics': self._calculate_trec_metrics(cleaned_text, evidences),
             'metadonnees': {}
         }
@@ -756,99 +813,6 @@ class CredibilityVerificationSystem:
         return report
-    def _calculate_trec_metrics(self, text: str, evidences: List[Dict[str, Any]] = None) -> Dict[str, float]:
-        """
-        Calculate TREC-style IR metrics for display on dashboard.
-        Computes:
-        - Precision: Ratio of relevant retrieved documents
-        - Recall: Ratio of relevant documents retrieved
-        - MAP: Mean Average Precision
-        - NDCG: Normalized Discounted Cumulative Gain
-        - TF-IDF: Term Frequency-Inverse Document Frequency score
-        - MRR: Mean Reciprocal Rank
-        """
-        import math
-        metrics = {
-            'precision': 0.0,
-            'recall': 0.0,
-            'map': 0.0,
-            'ndcg': 0.0,
-            'tfidf': 0.0,
-            'mrr': 0.0
-        }
-        if not text:
-            return metrics
-        # TF-IDF based on text analysis
-        words = text.lower().split()
-        if words:
-            # Simple TF calculation
-            word_counts = {}
-            for word in words:
-                word_counts[word] = word_counts.get(word, 0) + 1
-            # Calculate TF-IDF score (simplified)
-            total_words = len(words)
-            unique_words = len(word_counts)
-            # Term frequency normalized
-            tf_scores = [count / total_words for count in word_counts.values()]
-            # IDF approximation based on word distribution
-            idf_approx = math.log((unique_words + 1) / 2)
-            tfidf_sum = sum(tf * idf_approx for tf in tf_scores)
-            metrics['tfidf'] = min(1.0, tfidf_sum / max(1, unique_words) * 10)
-        # If we have evidences, calculate retrieval metrics
-        if evidences and len(evidences) > 0:
-            k = len(evidences)
-            # For now, assume all retrieved evidences have some relevance
-            # based on their retrieval scores
-            scores = [e.get('score', 0) for e in evidences]
-            if scores:
-                avg_score = sum(scores) / len(scores)
-                max_score = max(scores)
-                # Precision at K (proxy: avg relevance score)
-                metrics['precision'] = min(1.0, avg_score if avg_score <= 1.0 else avg_score / max(1, max_score))
-                # Recall (proxy: coverage based on number of evidences)
-                metrics['recall'] = min(1.0, len(evidences) / 10)  # Assuming 10 is target
-                # MAP (proxy using score ranking)
-                ap_sum = 0.0
-                for i, score in enumerate(sorted(scores, reverse=True)):
-                    ap_sum += (i + 1) / (i + 2) * score if score <= 1.0 else (i + 1) / (i + 2)
-                metrics['map'] = ap_sum / len(scores) if scores else 0.0
-                # NDCG (simplified)
-                dcg = sum(
-                    (2 ** (score if score <= 1.0 else 1.0) - 1) / math.log2(i + 2)
-                    for i, score in enumerate(scores[:k])
-                )
-                ideal_scores = sorted(scores, reverse=True)
-                idcg = sum(
-                    (2 ** (score if score <= 1.0 else 1.0) - 1) / math.log2(i + 2)
-                    for i, score in enumerate(ideal_scores[:k])
-                )
-                metrics['ndcg'] = dcg / idcg if idcg > 0 else 0.0
-                # MRR (first relevant result)
-                for i, score in enumerate(scores):
-                    if (score > 0.5 if score <= 1.0 else score > max_score / 2):
-                        metrics['mrr'] = 1.0 / (i + 1)
-                        break
-                if metrics['mrr'] == 0 and len(scores) > 0:
-                    metrics['mrr'] = 1.0  # First result
-        # Round all values
-        return {k: round(v, 4) for k, v in metrics.items()}
     def _get_score_factors(self, rule_results: Dict, nlp_results: Dict) -> List[Dict]:
         """Get list of factors that influenced the score (For UI)."""
         factors = []
@@ -1009,6 +973,40 @@ class CredibilityVerificationSystem:
         print("[SysCRED] Running NLP analysis...")
         nlp_results = self.nlp_analysis(cleaned_text)
         # 7. Calculate score (Now includes GraphRAG context)
         overall_score = self.calculate_overall_score(rule_results, nlp_results)
         print(f"[SysCRED] ✓ Credibility score: {overall_score:.2f}")
@@ -1020,6 +1018,10 @@ class CredibilityVerificationSystem:
             graph_context=graph_context
         )
         # Add similar URIs to report for ontology linking
         if similar_uris:
             report['similar_claims_uris'] = similar_uris

     HAS_SBERT = False
     print("Warning: sentence-transformers not installed. Semantic coherence will use heuristics.")
+# Local imports - Support both syscred.module and relative imports
 try:
+    from syscred.api_clients import ExternalAPIClients, WebContent, ExternalData
+    from syscred.ontology_manager import OntologyManager
+    from syscred.seo_analyzer import SEOAnalyzer
+    from syscred.graph_rag import GraphRAG
+    from syscred.trec_retriever import TRECRetriever, Evidence, RetrievalResult
+    from syscred import config
 except ImportError:
+    from api_clients import ExternalAPIClients, WebContent, ExternalData
+    from ontology_manager import OntologyManager
+    from seo_analyzer import SEOAnalyzer
+    from graph_rag import GraphRAG
+    from trec_retriever import TRECRetriever, Evidence, RetrievalResult
+    import config
+# [NER + E-E-A-T] Imports optionnels - n'interferent pas avec les imports principaux
+HAS_NER_EEAT = False
 try:
+    from syscred.ner_analyzer import NERAnalyzer
     from syscred.eeat_calculator import EEATCalculator, EEATScore
+    HAS_NER_EEAT = True
 except ImportError:
+    try:
+        from ner_analyzer import NERAnalyzer
+        from eeat_calculator import EEATCalculator, EEATScore
+        HAS_NER_EEAT = True
+    except ImportError:
+        pass
 class CredibilityVerificationSystem:
         # Weights for score calculation (Loaded from Config)
         self.weights = config.Config.SCORE_WEIGHTS
         print(f"[SysCRED] Using weights: {self.weights}")
+        # [NER + E-E-A-T] Initialize analyzers
+        self.ner_analyzer = None
+        self.eeat_calculator = None
+        if HAS_NER_EEAT:
+            try:
+                self.ner_analyzer = NERAnalyzer()
+                self.eeat_calculator = EEATCalculator()
+                print("[SysCRED] NER analyzer initialized")
+                print("[SysCRED] E-E-A-T calculator initialized")
+            except Exception as e:
+                print(f"[SysCRED] NER/E-E-A-T init failed: {e}")
         print("[SysCRED] System ready!")
         print("[SysCRED] Loading ML models (this may take a moment)...")
         try:
+            # Sentiment analysis - modèle ultra-léger
             self.sentiment_pipeline = pipeline(
+                "sentiment-analysis",
+                model="distilbert-base-uncased-finetuned-sst-2-english",
+                device=-1,
+                model_kwargs={"low_cpu_mem_usage": True}
             )
+            print("[SysCRED] ✓ Sentiment model loaded (distilbert-base)")
         except Exception as e:
             print(f"[SysCRED] ✗ Sentiment model failed: {e}")
         try:
+            # NER pipeline - modèle plus léger
+            self.ner_pipeline = pipeline(
+                "ner",
+                model="dslim/bert-base-NER",
+                grouped_entities=True,
+                device=-1,
+                model_kwargs={"low_cpu_mem_usage": True}
+            )
+            print("[SysCRED] ✓ NER model loaded (dslim/bert-base-NER)")
         except Exception as e:
             print(f"[SysCRED] ✗ NER model failed: {e}")
         try:
+            # Bias detection - modèle plus léger si possible
+            bias_model_name = "typeform/distilbert-base-uncased-mnli"
             self.bias_tokenizer = AutoTokenizer.from_pretrained(bias_model_name)
             self.bias_model = AutoModelForSequenceClassification.from_pretrained(bias_model_name)
+            print("[SysCRED] ✓ Bias model loaded (distilbert-mnli)")
         except Exception as e:
             print(f"[SysCRED] ✗ Bias model failed: {e}. Using heuristics.")
         try:
+            # Semantic Coherence - modèle MiniLM (déjà léger)
             if HAS_SBERT:
                 self.coherence_model = SentenceTransformer('all-MiniLM-L6-v2')
+                print("[SysCRED] ✓ Coherence model loaded (SBERT MiniLM)")
         except Exception as e:
             print(f"[SysCRED] ✗ Coherence model failed: {e}")
         try:
             # LIME explainer
             self.explainer = LimeTextExplainer(class_names=['NEGATIVE', 'POSITIVE'])
             adjustment_factor = (graph_score - 0.5) * w_graph * confidence
             adjustments += adjustment_factor
             total_weight_used += w_graph * confidence  # Partial weight based on confidence
+        # 8. [NEW] Linguistic Markers Analysis (sensationalism penalty)
+        # Penalize sensational language heavily, reward doubt markers (critical thinking)
+        linguistic = rule_results.get('linguistic_markers', {})
+        sensationalism_count = linguistic.get('sensationalism', 0)
+        doubt_count = linguistic.get('doubt', 0)
+        certainty_count = linguistic.get('certainty', 0)
+        # Sensationalism is a strong negative signal
+        if sensationalism_count > 0:
+            penalty = min(0.20, sensationalism_count * 0.05)  # Max 20% penalty
+            adjustments -= penalty
+        # Excessive certainty without sources is suspicious
+        if certainty_count > 2 and not fact_checks:
+            adjustments -= 0.05
+        # Doubt markers indicate critical/questioning tone (slight positive)
+        if doubt_count > 0:
+            adjustments += min(0.05, doubt_count * 0.02)
         # Final calculation
         # Base 0.5 + sum of weighted adjustments
     ) -> Dict[str, Any]:
         """Generate the final evaluation report."""
+        # Determine credibility level
+        if overall_score >= 0.75:
+            niveau = "Élevée"
+        elif overall_score >= 0.55:
+            niveau = "Moyenne-Élevée"
+        elif overall_score >= 0.45:
+            niveau = "Moyenne"
+        elif overall_score >= 0.25:
+            niveau = "Faible-Moyenne"
+        else:
+            niveau = "Faible"
         report = {
             'idRapport': f"report_{int(datetime.datetime.now().timestamp())}",
             'informationEntree': input_data,
             'dateGeneration': datetime.datetime.now().isoformat(),
             'scoreCredibilite': round(overall_score, 2),
+            'niveauCredibilite': niveau,
             'resumeAnalyse': "",
             'detailsScore': {
                 'base': 0.5,
             },
             # [NEW] TREC Evidence section
             'evidences': evidences or [],
             'metadonnees': {}
         }
         return report
     def _get_score_factors(self, rule_results: Dict, nlp_results: Dict) -> List[Dict]:
         """Get list of factors that influenced the score (For UI)."""
         factors = []
         print("[SysCRED] Running NLP analysis...")
         nlp_results = self.nlp_analysis(cleaned_text)
+        # 6.5 [NER] Named Entity Recognition
+        ner_entities = {}
+        if self.ner_analyzer and cleaned_text:
+            try:
+                ner_entities = self.ner_analyzer.extract_entities(cleaned_text)
+                total = sum(len(v) for v in ner_entities.values() if isinstance(v, list))
+                print(f"[SysCRED] NER: {total} entites detectees")
+            except Exception as e:
+                print(f"[SysCRED] NER failed: {e}")
+        # 6.6 [E-E-A-T] Experience-Expertise-Authority-Trust scoring
+        eeat_scores = {}
+        if self.eeat_calculator:
+            try:
+                url_for_eeat = input_data if is_url else ""
+                domain_age_years = None
+                if external_data.domain_age_days:
+                    domain_age_years = external_data.domain_age_days / 365.0
+                eeat_raw = self.eeat_calculator.calculate(
+                    url=url_for_eeat,
+                    text=cleaned_text,
+                    nlp_analysis=nlp_results,
+                    fact_checks=rule_results.get('fact_checking', []),
+                    domain_age_years=domain_age_years,
+                    has_https=input_data.startswith("https://") if is_url else False
+                )
+                eeat_scores = eeat_raw.to_dict() if hasattr(eeat_raw, 'to_dict') else (
+                    eeat_raw if isinstance(eeat_raw, dict) else vars(eeat_raw)
+                )
+                print(f"[SysCRED] E-E-A-T score: {eeat_scores.get('overall', 'N/A')}")
+            except Exception as e:
+                print(f"[SysCRED] E-E-A-T failed: {e}")
         # 7. Calculate score (Now includes GraphRAG context)
         overall_score = self.calculate_overall_score(rule_results, nlp_results)
         print(f"[SysCRED] ✓ Credibility score: {overall_score:.2f}")
             graph_context=graph_context
         )
+        # [NER + E-E-A-T] Always include in report (even if empty)
+        report['ner_entities'] = ner_entities
+        report['eeat_scores'] = eeat_scores
         # Add similar URIs to report for ontology linking
         if similar_uris:
             report['similar_claims_uris'] = similar_uris