Spaces:

Tremick
/

PIOE

Runtime error

File size: 4,606 Bytes

4d92cd5

"""
PIOE Credibility Scorer

Evaluates trustworthiness of sources and authors.
"""
from typing import Optional

from ..models import SourceType


class CredibilityScorer:
    """
    Scores credibility based on source type, author history, and content signals.

    The class keeps no instance state; every tuning value lives in a
    class-level constant so it can be inspected or overridden in a subclass.
    """

    # Base credibility scores by source type
    SOURCE_CREDIBILITY = {
        SourceType.ARXIV: 0.95,      # Academic papers - highest trust
        SourceType.GITHUB: 0.8,       # Open source - high trust
        SourceType.RSS: 0.7,          # Varies by feed
        SourceType.SUPERTEAM: 0.85,   # Official platform
        SourceType.REDDIT: 0.5,       # Community - variable
        SourceType.TWITTER: 0.4,      # Social - requires filtering
        SourceType.LINKEDIN: 0.6,     # Professional but noisy
        SourceType.WEB_SCRAPE: 0.5,   # Unknown quality
    }

    # Score used for a source type that has no entry in SOURCE_CREDIBILITY
    DEFAULT_SOURCE_CREDIBILITY = 0.5

    # Neutral author score used when the caller passes None
    DEFAULT_AUTHOR_CREDIBILITY = 0.5

    # Source types for which the engagement count may raise the score
    SOCIAL_SOURCES = (SourceType.REDDIT, SourceType.TWITTER)

    # (exclusive lower bound on engagement, boost), checked from highest down
    ENGAGEMENT_TIERS = ((100, 0.15), (50, 0.1), (20, 0.05))

    # Value of the engagement term when no boost applies
    ENGAGEMENT_BASELINE = 0.5

    # Keyword lists for the content signals. Matching is a plain substring
    # test on the lower-cased text, so e.g. "closes" also matches "discloses".
    DEADLINE_KEYWORDS = ("deadline", "due date", "apply by", "closes")
    ACTION_URL_KEYWORDS = ("apply here", "register at", "sign up")
    REPOST_KEYWORDS = ("repost", "sharing", "fyi", "icymi", "in case you missed")
    REQUIREMENTS_KEYWORDS = ("requirements", "qualifications", "must have")

    # Relative weight of each content signal; the weights sum to 1.0
    SIGNAL_WEIGHTS = {
        "has_deadline": 0.3,
        "has_organization": 0.2,
        "has_action_url": 0.2,
        "is_original": 0.2,
        "has_requirements": 0.1,
    }

    @staticmethod
    def _mentions_any(text_lower: str, keywords) -> bool:
        """Return True if any keyword occurs as a substring of text_lower."""
        return any(kw in text_lower for kw in keywords)

    def score_source(self, source_type: SourceType) -> float:
        """
        Get base credibility score for source type.

        Source types without an entry in SOURCE_CREDIBILITY get
        DEFAULT_SOURCE_CREDIBILITY.
        """
        return self.SOURCE_CREDIBILITY.get(
            source_type, self.DEFAULT_SOURCE_CREDIBILITY
        )

    def score_content_signals(self, text: str, metadata: Optional[dict] = None) -> dict:
        """
        Evaluate content signals that indicate credibility.

        Args:
            text: Content to scan; None or "" is treated as empty text.
            metadata: Optional mapping; the keys "deadline", "organization",
                "url" and "requirements" are read when present.

        Returns:
            Dict of individual signal scores keyed by "has_deadline",
            "has_organization", "has_action_url", "is_original" and
            "has_requirements". Each value is 0.0 or 1.0, except
            "has_organization", which is 0.5 when no organization is given.
        """
        metadata = metadata or {}
        text_lower = text.lower() if text else ""

        def present(field: str, keywords) -> bool:
            # A signal fires on a truthy metadata field or a keyword hit.
            return bool(metadata.get(field)) or self._mentions_any(
                text_lower, keywords
            )

        return {
            # Has deadline (official announcements usually have deadlines)
            "has_deadline": (
                1.0 if present("deadline", self.DEADLINE_KEYWORDS) else 0.0
            ),
            # Has organization/institution (metadata only; absent is neutral)
            "has_organization": 1.0 if metadata.get("organization") else 0.5,
            # Contains action URL
            "has_action_url": (
                1.0 if present("url", self.ACTION_URL_KEYWORDS) else 0.0
            ),
            # Is first announcement (not a repost)
            "is_original": (
                0.0
                if self._mentions_any(text_lower, self.REPOST_KEYWORDS)
                else 1.0
            ),
            # Has specific requirements (detailed = more credible)
            "has_requirements": (
                1.0
                if present("requirements", self.REQUIREMENTS_KEYWORDS)
                else 0.0
            ),
        }

    def calculate_signal_strength(self, signals: dict) -> float:
        """
        Calculate overall signal strength from content signals.
        High signal strength = actionable, official, time-sensitive.

        Args:
            signals: Mapping of signal name to score; names missing from the
                mapping count as 0, and names not in SIGNAL_WEIGHTS are ignored.

        Returns:
            Weighted sum of the signals, rounded to 3 decimal places.
        """
        total = sum(
            signals.get(name, 0) * weight
            for name, weight in self.SIGNAL_WEIGHTS.items()
        )
        return round(total, 3)

    def score(
        self,
        source_type: SourceType,
        text: str = "",
        metadata: Optional[dict] = None,
        author_credibility: Optional[float] = 0.5,
        social_engagement: Optional[int] = 0
    ) -> dict:
        """
        Calculate comprehensive credibility score.

        Args:
            source_type: Origin of the content.
            text: Content to scan for signals.
            metadata: Optional mapping passed to score_content_signals.
            author_credibility: Author trust; None falls back to
                DEFAULT_AUTHOR_CREDIBILITY and other values are clamped to
                [0.0, 1.0] so the combined score cannot leave that range.
            social_engagement: Engagement count; only used for source types in
                SOCIAL_SOURCES. None is treated as 0.

        Returns dict with:
        - source_score: Base source credibility
        - signal_strength: Content actionability
        - signals: Individual content signal scores
        - credibility_score: Combined score
        """
        source_score = self.score_source(source_type)
        content_signals = self.score_content_signals(text, metadata)
        signal_strength = self.calculate_signal_strength(content_signals)

        # Missing author data is common upstream; treat it as neutral instead
        # of failing on `0.1 * None`.
        if author_credibility is None:
            author_credibility = self.DEFAULT_AUTHOR_CREDIBILITY
        author_credibility = min(max(author_credibility, 0.0), 1.0)

        # Social engagement boost (for social sources)
        engagement = social_engagement or 0  # None -> 0, avoids None > int
        engagement_boost = 0.0
        if source_type in self.SOCIAL_SOURCES:
            for threshold, boost in self.ENGAGEMENT_TIERS:
                if engagement > threshold:
                    engagement_boost = boost
                    break

        # Combined credibility:
        # 50% source, 30% signals, 10% author, 10% engagement.
        # The engagement term is ENGAGEMENT_BASELINE plus the boost, so it is
        # 0.5 for non-social sources and at most 0.65 for social ones.
        credibility_score = (
            0.5 * source_score
            + 0.3 * signal_strength
            + 0.1 * author_credibility
            + 0.1 * min(engagement_boost + self.ENGAGEMENT_BASELINE, 1.0)
        )

        return {
            "source_score": round(source_score, 3),
            "signal_strength": signal_strength,
            "signals": content_signals,
            "credibility_score": round(credibility_score, 3)
        }