File size: 7,793 Bytes
198ccb0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
"""Predictive intervals for sentiment analysis using Beta distribution."""

import math
import logging
from typing import List, Dict, Tuple, Optional
import pandas as pd

# Configure the root logger at INFO level as a side effect of importing this
# module, then create the module-level logger named after this module.
# NOTE(review): logging.basicConfig() at import time affects the host
# application's logging setup when it has not configured logging itself;
# consider moving this to the application entry point.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def calculate_predictive_interval(
    positive_count: int,
    negative_count: int,
    neutral_count: int = 0,
    confidence_level: float = 0.95
) -> float:
    """
    Calculate lower bound of predictive interval for positive comment ratio.

    The proportion of positive comments is modelled with a Beta(a, b)
    distribution and the lower bound is taken as a normal approximation
    around its mean. With few comments the standard deviation is large, so
    the bound is pulled down, which accounts for small-sample uncertainty.

    Formula:
        a = 1 + u (positive comments)
        b = 1 + d (negative + neutral comments)
        mean = a / (a + b)
        variance = a * b / ((a + b)^2 * (a + b + 1))
        Lower bound = mean - z_score * sqrt(variance), clamped to [0, 1]

    Args:
        positive_count: Number of positive comments (must be >= 0)
        negative_count: Number of negative comments (must be >= 0)
        neutral_count: Number of neutral comments (default: 0, must be >= 0)
        confidence_level: One-sided confidence level strictly between 0 and 1
            (0.95 for 95%, 0.99 for 99%). The levels 0.90, 0.95 and 0.99 use
            the rounded z-scores 1.28, 1.65 and 2.33; any other level in
            (0, 1) uses the exact standard-normal quantile. A level outside
            (0, 1) falls back to the 95% z-score and logs a warning.

    Returns:
        Lower bound of predictive interval (0.0 to 1.0)

    Raises:
        ValueError: If any of the counts is negative.

    Example:
        >>> # 80 positive, 20 negative out of 100 comments
        >>> lower_bound = calculate_predictive_interval(80, 20)
        >>> print(f"Lower bound: {lower_bound:.3f}")
        Lower bound: 0.728
    """
    # Local import keeps this block self-contained; NormalDist is stdlib.
    import statistics

    if positive_count < 0 or negative_count < 0 or neutral_count < 0:
        # Negative counts would otherwise surface as ZeroDivisionError, a
        # "math domain error" from sqrt, or a silently meaningless result.
        raise ValueError("comment counts must be non-negative")

    u = positive_count
    d = negative_count + neutral_count

    # Beta distribution parameters (uniform Beta(1, 1) prior plus the counts)
    a = 1 + u
    b = 1 + d

    # Mean of Beta distribution
    mean = a / (a + b)

    # Variance of Beta distribution
    variance = (a * b) / ((a + b) ** 2 * (a + b + 1))
    std_dev = math.sqrt(variance)

    # One-sided z-scores. The tabulated (rounded) values are kept so results
    # for the common levels stay exactly what existing callers already get.
    z_scores = {
        0.90: 1.28,
        0.95: 1.65,
        0.99: 2.33
    }
    default_z = z_scores[0.95]

    z_score = z_scores.get(confidence_level)
    if z_score is None:
        if 0.0 < confidence_level < 1.0:
            # Untabulated level: use the exact quantile instead of silently
            # treating every unknown level as 95%.
            z_score = statistics.NormalDist().inv_cdf(confidence_level)
        else:
            # Keep the historical best-effort fallback, but make it visible.
            logging.getLogger(__name__).warning(
                "confidence_level %r is outside (0, 1); using z=%s",
                confidence_level,
                default_z,
            )
            z_score = default_z

    # Lower bound of predictive interval
    lower_bound = mean - z_score * std_dev

    # Ensure the result stays within [0, 1]
    return max(0.0, min(1.0, lower_bound))


def rank_by_predictive_interval(
    data: List[Dict],
    positive_key: str = "positive_count",
    negative_key: str = "negative_count",
    neutral_key: str = "neutral_count",
    confidence_level: float = 0.95
) -> List[Dict]:
    """
    Rank items by the lower bound of their predictive interval.

    Every input dict is shallow-copied and extended with three fields:
    'predictive_interval' (the lower bound), 'total_comments' (sum of the
    three counts) and 'positive_ratio' (positive / total, or 0.0 when there
    are no comments). The input dicts themselves are not modified.

    Args:
        data: List of dictionaries with sentiment counts
        positive_key: Key for positive count in data dict
        negative_key: Key for negative count in data dict
        neutral_key: Key for neutral count in data dict
        confidence_level: Confidence level for interval

    Returns:
        New list of enriched dicts, highest predictive interval first.
        Counts missing from a dict are treated as 0.

    Example:
        >>> data = [
        ...     {"id": 1, "positive_count": 80, "negative_count": 20},
        ...     {"id": 2, "positive_count": 1, "negative_count": 0},
        ... ]
        >>> ranked = rank_by_predictive_interval(data)
        >>> ranked[0]["id"]  # First item has higher interval
        1
    """
    def _enrich(entry: Dict) -> Dict:
        # Missing counts default to zero.
        n_pos = entry.get(positive_key, 0)
        n_neg = entry.get(negative_key, 0)
        n_neu = entry.get(neutral_key, 0)

        bound = calculate_predictive_interval(
            positive_count=n_pos,
            negative_count=n_neg,
            neutral_count=n_neu,
            confidence_level=confidence_level,
        )
        total = n_pos + n_neg + n_neu

        enriched = entry.copy()
        enriched["predictive_interval"] = bound
        enriched["total_comments"] = total
        if total > 0:
            enriched["positive_ratio"] = n_pos / total
        else:
            enriched["positive_ratio"] = 0.0
        return enriched

    # sorted() is stable, so ties keep their input order.
    return sorted(
        [_enrich(entry) for entry in data],
        key=lambda row: row["predictive_interval"],
        reverse=True,
    )


def calculate_intervals_for_dataframe(
    df: pd.DataFrame,
    positive_col: str = "positive_count",
    negative_col: str = "negative_count",
    neutral_col: str = "neutral_count",
    confidence_level: float = 0.95
) -> pd.DataFrame:
    """
    Add a predictive-interval lower bound to every row of a DataFrame.

    The input frame is copied first, so the caller's DataFrame is left
    unchanged. A count column that is absent from the frame is treated as 0
    for every row.

    Args:
        df: DataFrame with sentiment counts
        positive_col: Column name for positive counts
        negative_col: Column name for negative counts
        neutral_col: Column name for neutral counts
        confidence_level: Confidence level

    Returns:
        Copy of the DataFrame with an added 'predictive_interval' column

    Example:
        >>> df = pd.DataFrame({
        ...     "positive_count": [80, 1],
        ...     "negative_count": [20, 0]
        ... })
        >>> df_with_intervals = calculate_intervals_for_dataframe(df)
        >>> "predictive_interval" in df_with_intervals.columns
        True
    """
    def _interval_for(record) -> float:
        # `record` is one row; .get() supplies 0 for a missing column.
        return calculate_predictive_interval(
            positive_count=record.get(positive_col, 0),
            negative_count=record.get(negative_col, 0),
            neutral_count=record.get(neutral_col, 0),
            confidence_level=confidence_level,
        )

    annotated = df.copy()
    annotated["predictive_interval"] = annotated.apply(_interval_for, axis=1)
    return annotated


def get_top_positive_by_interval(
    data: List[Dict],
    top_k: int = 10,
    min_comments: int = 1,
    **kwargs
) -> List[Dict]:
    """
    Get top K items ranked by predictive interval.

    Items with fewer than `min_comments` comments in total are dropped
    before ranking. The total is computed with the same count keys that are
    forwarded to rank_by_predictive_interval, so custom `positive_key`,
    `negative_key` and `neutral_key` values are honoured by the filter too.

    Args:
        data: List of dictionaries with sentiment counts
        top_k: Number of top items to return; a value <= 0 yields []
        min_comments: Minimum number of comments required
        **kwargs: Additional arguments for rank_by_predictive_interval
            (positive_key, negative_key, neutral_key, confidence_level)

    Returns:
        Top K items sorted by predictive interval (highest first)

    Example:
        >>> data = [
        ...     {"id": 1, "positive_count": 80, "negative_count": 20},
        ...     {"id": 2, "positive_count": 1, "negative_count": 0},
        ... ]
        >>> top = get_top_positive_by_interval(data, top_k=1)
        >>> len(top)
        1
    """
    # Nothing to return; also avoids ranked[:-k] surprising a caller that
    # passes a negative top_k.
    if top_k <= 0:
        return []

    # Use the caller's key names (if any) so the filter and the ranking
    # look at the same fields.
    positive_key = kwargs.get("positive_key", "positive_count")
    negative_key = kwargs.get("negative_key", "negative_count")
    neutral_key = kwargs.get("neutral_key", "neutral_count")

    # Filter by minimum comments
    filtered = [
        item for item in data
        if (item.get(positive_key, 0) +
            item.get(negative_key, 0) +
            item.get(neutral_key, 0)) >= min_comments
    ]

    # Rank by predictive interval
    ranked = rank_by_predictive_interval(filtered, **kwargs)

    # Return top K
    return ranked[:top_k]


def get_top_negative_by_interval(
    data: List[Dict],
    top_k: int = 10,
    min_comments: int = 1,
    **kwargs
) -> List[Dict]:
    """
    Get top K items ranked by negative sentiment (lowest predictive interval).

    Items with fewer than `min_comments` comments in total are dropped
    before ranking. The total is computed with the same count keys that are
    forwarded to rank_by_predictive_interval, so custom `positive_key`,
    `negative_key` and `neutral_key` values are honoured by the filter too.

    NOTE(review): the ranking uses the lower bound of the *positive* ratio,
    which is also low for items with very few comments. Raising
    `min_comments` limits that effect; confirm this is the intended
    definition of "most negative".

    Args:
        data: List of dictionaries with sentiment counts
        top_k: Number of top items to return; a value <= 0 yields []
        min_comments: Minimum number of comments required
        **kwargs: Additional arguments for rank_by_predictive_interval
            (positive_key, negative_key, neutral_key, confidence_level)

    Returns:
        Top K items with lowest predictive intervals (most negative first)
    """
    # ranked[-0:] is the whole list, so top_k == 0 used to return every
    # item instead of none; handle non-positive values explicitly.
    if top_k <= 0:
        return []

    # Use the caller's key names (if any) so the filter and the ranking
    # look at the same fields.
    positive_key = kwargs.get("positive_key", "positive_count")
    negative_key = kwargs.get("negative_key", "negative_count")
    neutral_key = kwargs.get("neutral_key", "neutral_count")

    # Filter by minimum comments
    filtered = [
        item for item in data
        if (item.get(positive_key, 0) +
            item.get(negative_key, 0) +
            item.get(neutral_key, 0)) >= min_comments
    ]

    # Rank by predictive interval (descending)
    ranked = rank_by_predictive_interval(filtered, **kwargs)

    # Bottom K of the descending ranking, reversed so the lowest comes first
    return ranked[-top_k:][::-1]