File size: 4,606 Bytes
4d92cd5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""
PIOE Credibility Scorer

Evaluates trustworthiness of sources and authors.
"""
from typing import Optional

from ..models import SourceType


class CredibilityScorer:
    """
    Scores credibility based on source type, author history, and content signals.

    The scorer keeps no per-instance state; every lookup table is a class-level
    constant, so subclasses can tune the scoring by overriding them.
    """

    # Base credibility scores by source type
    SOURCE_CREDIBILITY = {
        SourceType.ARXIV: 0.95,      # Academic papers - highest trust
        SourceType.GITHUB: 0.8,       # Open source - high trust
        SourceType.RSS: 0.7,          # Varies by feed
        SourceType.SUPERTEAM: 0.85,   # Official platform
        SourceType.REDDIT: 0.5,       # Community - variable
        SourceType.TWITTER: 0.4,      # Social - requires filtering
        SourceType.LINKEDIN: 0.6,     # Professional but noisy
        SourceType.WEB_SCRAPE: 0.5,   # Unknown quality
    }

    # Score returned for a source type that is not listed in SOURCE_CREDIBILITY
    DEFAULT_SOURCE_SCORE = 0.5

    # Weight of each content signal in the overall signal strength (sums to 1.0)
    SIGNAL_WEIGHTS = {
        "has_deadline": 0.3,
        "has_organization": 0.2,
        "has_action_url": 0.2,
        "is_original": 0.2,
        "has_requirements": 0.1,
    }

    # Lower-case phrases searched for (as substrings) in the lower-cased text
    DEADLINE_KEYWORDS = ("deadline", "due date", "apply by", "closes")
    ACTION_KEYWORDS = ("apply here", "register at", "sign up")
    REPOST_KEYWORDS = ("repost", "sharing", "fyi", "icymi", "in case you missed")
    REQUIREMENT_KEYWORDS = ("requirements", "qualifications", "must have")

    # Source types for which social engagement can raise the score
    SOCIAL_SOURCES = (SourceType.REDDIT, SourceType.TWITTER)

    # (exclusive engagement threshold, boost) pairs, highest threshold first
    ENGAGEMENT_BOOSTS = ((100, 0.15), (50, 0.1), (20, 0.05))

    def __init__(self):
        """Create a scorer; there is no instance state to set up."""

    def score_source(self, source_type: SourceType) -> float:
        """
        Get base credibility score for source type.

        Source types missing from SOURCE_CREDIBILITY get DEFAULT_SOURCE_SCORE.
        """
        return self.SOURCE_CREDIBILITY.get(source_type, self.DEFAULT_SOURCE_SCORE)

    def score_content_signals(self, text: str, metadata: Optional[dict] = None) -> dict:
        """
        Evaluate content signals that indicate credibility.

        Args:
            text: Content to scan; an empty or None value is treated as "".
            metadata: Optional mapping; the keys read are "deadline",
                "organization", "url" and "requirements". A truthy value
                switches the corresponding signal on regardless of the text.

        Returns:
            Dict mapping each signal name to its individual score
            (1.0 or 0.0, except "has_organization" which is 1.0 or 0.5).
        """
        metadata = metadata or {}
        text_lower = text.lower() if text else ""

        def mentions(keywords) -> bool:
            # Plain substring search, so a keyword also matches inside longer words.
            return any(kw in text_lower for kw in keywords)

        return {
            # Has deadline (official announcements usually have deadlines)
            "has_deadline": (
                1.0 if metadata.get("deadline") or mentions(self.DEADLINE_KEYWORDS) else 0.0
            ),
            # Has organization/institution; a missing one is neutral (0.5), not a penalty
            "has_organization": 1.0 if metadata.get("organization") else 0.5,
            # Contains action URL
            "has_action_url": (
                1.0 if metadata.get("url") or mentions(self.ACTION_KEYWORDS) else 0.0
            ),
            # Is first announcement (not a repost)
            "is_original": 0.0 if mentions(self.REPOST_KEYWORDS) else 1.0,
            # Has specific requirements (detailed = more credible)
            "has_requirements": (
                1.0
                if metadata.get("requirements") or mentions(self.REQUIREMENT_KEYWORDS)
                else 0.0
            ),
        }

    def calculate_signal_strength(self, signals: dict) -> float:
        """
        Calculate overall signal strength from content signals.
        High signal strength = actionable, official, time-sensitive.

        Args:
            signals: Mapping of signal name to score, as produced by
                score_content_signals. Missing signals count as 0 and
                names without a weight are ignored.

        Returns:
            Weighted sum of the signals, rounded to 3 decimals.
        """
        total = sum(
            signals.get(name, 0) * weight
            for name, weight in self.SIGNAL_WEIGHTS.items()
        )
        return round(total, 3)

    def score(
        self,
        source_type: SourceType,
        text: str = "",
        metadata: Optional[dict] = None,
        author_credibility: float = 0.5,
        social_engagement: int = 0
    ) -> dict:
        """
        Calculate comprehensive credibility score.

        Args:
            source_type: Where the content came from.
            text: Content body scanned for credibility signals.
            metadata: Optional metadata forwarded to score_content_signals.
            author_credibility: Author trust; clamped to [0, 1], and None
                falls back to the neutral 0.5.
            social_engagement: Engagement count; only used for the source
                types in SOCIAL_SOURCES. None is treated as 0.

        Returns dict with:
        - source_score: Base source credibility
        - signal_strength: Content actionability
        - signals: Individual content signal scores
        - credibility_score: Combined score
        """
        source_score = self.score_source(source_type)
        content_signals = self.score_content_signals(text, metadata)
        signal_strength = self.calculate_signal_strength(content_signals)

        # Keep the author component inside [0, 1] so a bad upstream value
        # cannot push the combined score out of range.
        if author_credibility is None:
            author_credibility = 0.5
        author_score = min(max(author_credibility, 0.0), 1.0)

        # Social engagement boost (for social sources)
        engagement_boost = 0.0
        if source_type in self.SOCIAL_SOURCES:
            engagement = social_engagement or 0
            for threshold, boost in self.ENGAGEMENT_BOOSTS:
                if engagement > threshold:
                    engagement_boost = boost
                    break

        # Combined credibility:
        # 50% source, 30% signals, 10% author, 10% engagement
        # The engagement term starts from a neutral 0.5 baseline.
        credibility_score = (
            0.5 * source_score +
            0.3 * signal_strength +
            0.1 * author_score +
            0.1 * min(engagement_boost + 0.5, 1.0)
        )

        return {
            "source_score": round(source_score, 3),
            "signal_strength": signal_strength,
            "signals": content_signals,
            "credibility_score": round(credibility_score, 3)
        }