| | """Predictive intervals for sentiment analysis using Beta distribution.""" |
| |
|
| | import math |
| | import logging |
| | from typing import List, Dict, Tuple, Optional |
| | import pandas as pd |
| |
|
| | logging.basicConfig(level=logging.INFO) |
| | logger = logging.getLogger(__name__) |
| |
|
| |
|
| | def calculate_predictive_interval( |
| | positive_count: int, |
| | negative_count: int, |
| | neutral_count: int = 0, |
| | confidence_level: float = 0.95 |
| | ) -> float: |
| | """ |
| | Calculate lower bound of predictive interval for positive comment ratio. |
| | |
| | Uses Beta distribution to model the proportion of positive comments. |
| | This accounts for uncertainty when sample size is small. |
| | |
| | Formula: |
| | a = 1 + u (positive comments) |
| | b = 1 + d (negative + neutral comments) |
| | Lower bound = mean - z_score * std_dev |
| | |
| | Args: |
| | positive_count: Number of positive comments |
| | negative_count: Number of negative comments |
| | neutral_count: Number of neutral comments (default: 0) |
| | confidence_level: Confidence level (0.95 for 95%, 0.99 for 99%) |
| | |
| | Returns: |
| | Lower bound of predictive interval (0.0 to 1.0) |
| | |
| | Example: |
| | >>> # 80 positive, 20 negative out of 100 comments |
| | >>> lower_bound = calculate_predictive_interval(80, 20) |
| | >>> print(f"Lower bound: {lower_bound:.3f}") |
| | Lower bound: 0.742 |
| | """ |
| | u = positive_count |
| | d = negative_count + neutral_count |
| | |
| | |
| | a = 1 + u |
| | b = 1 + d |
| | |
| | |
| | mean = a / (a + b) |
| | |
| | |
| | variance = (a * b) / ((a + b) ** 2 * (a + b + 1)) |
| | std_dev = math.sqrt(variance) |
| | |
| | |
| | |
| | |
| | z_scores = { |
| | 0.90: 1.28, |
| | 0.95: 1.65, |
| | 0.99: 2.33 |
| | } |
| | z_score = z_scores.get(confidence_level, 1.65) |
| | |
| | |
| | lower_bound = mean - z_score * std_dev |
| | |
| | |
| | lower_bound = max(0.0, min(1.0, lower_bound)) |
| | |
| | return lower_bound |
| |
|
| |
|
| | def rank_by_predictive_interval( |
| | data: List[Dict], |
| | positive_key: str = "positive_count", |
| | negative_key: str = "negative_count", |
| | neutral_key: str = "neutral_count", |
| | confidence_level: float = 0.95 |
| | ) -> List[Dict]: |
| | """ |
| | Rank items by predictive interval lower bound. |
| | |
| | This is useful for ranking news articles or categories by positive |
| | sentiment while accounting for sample size uncertainty. |
| | |
| | Args: |
| | data: List of dictionaries with sentiment counts |
| | positive_key: Key for positive count in data dict |
| | negative_key: Key for negative count in data dict |
| | neutral_key: Key for neutral count in data dict |
| | confidence_level: Confidence level for interval |
| | |
| | Returns: |
| | List of dictionaries sorted by predictive interval (descending) |
| | Each dict includes 'predictive_interval' field |
| | |
| | Example: |
| | >>> data = [ |
| | ... {"id": 1, "positive_count": 80, "negative_count": 20}, |
| | ... {"id": 2, "positive_count": 1, "negative_count": 0}, |
| | ... ] |
| | >>> ranked = rank_by_predictive_interval(data) |
| | >>> ranked[0]["id"] # First item has higher interval |
| | 1 |
| | """ |
| | results = [] |
| | |
| | for item in data: |
| | positive = item.get(positive_key, 0) |
| | negative = item.get(negative_key, 0) |
| | neutral = item.get(neutral_key, 0) |
| | |
| | interval = calculate_predictive_interval( |
| | positive_count=positive, |
| | negative_count=negative, |
| | neutral_count=neutral, |
| | confidence_level=confidence_level |
| | ) |
| | |
| | |
| | result = item.copy() |
| | result["predictive_interval"] = interval |
| | result["total_comments"] = positive + negative + neutral |
| | result["positive_ratio"] = positive / (positive + negative + neutral) if (positive + negative + neutral) > 0 else 0.0 |
| | |
| | results.append(result) |
| | |
| | |
| | results.sort(key=lambda x: x["predictive_interval"], reverse=True) |
| | |
| | return results |
| |
|
| |
|
| | def calculate_intervals_for_dataframe( |
| | df: pd.DataFrame, |
| | positive_col: str = "positive_count", |
| | negative_col: str = "negative_count", |
| | neutral_col: str = "neutral_count", |
| | confidence_level: float = 0.95 |
| | ) -> pd.DataFrame: |
| | """ |
| | Calculate predictive intervals for DataFrame. |
| | |
| | Args: |
| | df: DataFrame with sentiment counts |
| | positive_col: Column name for positive counts |
| | negative_col: Column name for negative counts |
| | neutral_col: Column name for neutral counts |
| | confidence_level: Confidence level |
| | |
| | Returns: |
| | DataFrame with added 'predictive_interval' column |
| | |
| | Example: |
| | >>> df = pd.DataFrame({ |
| | ... "positive_count": [80, 1], |
| | ... "negative_count": [20, 0] |
| | ... }) |
| | >>> df_with_intervals = calculate_intervals_for_dataframe(df) |
| | >>> "predictive_interval" in df_with_intervals.columns |
| | True |
| | """ |
| | df = df.copy() |
| | |
| | df["predictive_interval"] = df.apply( |
| | lambda row: calculate_predictive_interval( |
| | positive_count=row.get(positive_col, 0), |
| | negative_count=row.get(negative_col, 0), |
| | neutral_count=row.get(neutral_col, 0), |
| | confidence_level=confidence_level |
| | ), |
| | axis=1 |
| | ) |
| | |
| | return df |
| |
|
| |
|
| | def get_top_positive_by_interval( |
| | data: List[Dict], |
| | top_k: int = 10, |
| | min_comments: int = 1, |
| | **kwargs |
| | ) -> List[Dict]: |
| | """ |
| | Get top K items ranked by predictive interval. |
| | |
| | Args: |
| | data: List of dictionaries with sentiment counts |
| | top_k: Number of top items to return |
| | min_comments: Minimum number of comments required |
| | **kwargs: Additional arguments for rank_by_predictive_interval |
| | |
| | Returns: |
| | Top K items sorted by predictive interval |
| | |
| | Example: |
| | >>> data = [ |
| | ... {"id": 1, "positive_count": 80, "negative_count": 20}, |
| | ... {"id": 2, "positive_count": 1, "negative_count": 0}, |
| | ... ] |
| | >>> top = get_top_positive_by_interval(data, top_k=1) |
| | >>> len(top) |
| | 1 |
| | """ |
| | |
| | filtered = [ |
| | item for item in data |
| | if (item.get("positive_count", 0) + |
| | item.get("negative_count", 0) + |
| | item.get("neutral_count", 0)) >= min_comments |
| | ] |
| | |
| | |
| | ranked = rank_by_predictive_interval(filtered, **kwargs) |
| | |
| | |
| | return ranked[:top_k] |
| |
|
| |
|
| | def get_top_negative_by_interval( |
| | data: List[Dict], |
| | top_k: int = 10, |
| | min_comments: int = 1, |
| | **kwargs |
| | ) -> List[Dict]: |
| | """ |
| | Get top K items ranked by negative sentiment (lowest predictive interval). |
| | |
| | Args: |
| | data: List of dictionaries with sentiment counts |
| | top_k: Number of top items to return |
| | min_comments: Minimum number of comments required |
| | **kwargs: Additional arguments for rank_by_predictive_interval |
| | |
| | Returns: |
| | Top K items with lowest predictive intervals (most negative) |
| | """ |
| | |
| | filtered = [ |
| | item for item in data |
| | if (item.get("positive_count", 0) + |
| | item.get("negative_count", 0) + |
| | item.get("neutral_count", 0)) >= min_comments |
| | ] |
| | |
| | |
| | ranked = rank_by_predictive_interval(filtered, **kwargs) |
| | |
| | |
| | return ranked[-top_k:][::-1] |
| |
|
| |
|
| |
|
| |
|