multilabel-news-classifier / analysis /predictive_intervals.py
Solareva Taisia
chore(release): initial public snapshot
198ccb0
"""Predictive intervals for sentiment analysis using Beta distribution."""
import math
import logging
from typing import List, Dict, Tuple, Optional
import pandas as pd
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def calculate_predictive_interval(
positive_count: int,
negative_count: int,
neutral_count: int = 0,
confidence_level: float = 0.95
) -> float:
"""
Calculate lower bound of predictive interval for positive comment ratio.
Uses Beta distribution to model the proportion of positive comments.
This accounts for uncertainty when sample size is small.
Formula:
a = 1 + u (positive comments)
b = 1 + d (negative + neutral comments)
Lower bound = mean - z_score * std_dev
Args:
positive_count: Number of positive comments
negative_count: Number of negative comments
neutral_count: Number of neutral comments (default: 0)
confidence_level: Confidence level (0.95 for 95%, 0.99 for 99%)
Returns:
Lower bound of predictive interval (0.0 to 1.0)
Example:
>>> # 80 positive, 20 negative out of 100 comments
>>> lower_bound = calculate_predictive_interval(80, 20)
>>> print(f"Lower bound: {lower_bound:.3f}")
Lower bound: 0.742
"""
u = positive_count
d = negative_count + neutral_count
# Beta distribution parameters
a = 1 + u
b = 1 + d
# Mean of Beta distribution
mean = a / (a + b)
# Variance of Beta distribution
variance = (a * b) / ((a + b) ** 2 * (a + b + 1))
std_dev = math.sqrt(variance)
# Z-score for confidence level
# 95% confidence: z = 1.65 (one-sided)
# 99% confidence: z = 2.33 (one-sided)
z_scores = {
0.90: 1.28,
0.95: 1.65,
0.99: 2.33
}
z_score = z_scores.get(confidence_level, 1.65)
# Lower bound of predictive interval
lower_bound = mean - z_score * std_dev
# Ensure non-negative and within [0, 1]
lower_bound = max(0.0, min(1.0, lower_bound))
return lower_bound
def rank_by_predictive_interval(
data: List[Dict],
positive_key: str = "positive_count",
negative_key: str = "negative_count",
neutral_key: str = "neutral_count",
confidence_level: float = 0.95
) -> List[Dict]:
"""
Rank items by predictive interval lower bound.
This is useful for ranking news articles or categories by positive
sentiment while accounting for sample size uncertainty.
Args:
data: List of dictionaries with sentiment counts
positive_key: Key for positive count in data dict
negative_key: Key for negative count in data dict
neutral_key: Key for neutral count in data dict
confidence_level: Confidence level for interval
Returns:
List of dictionaries sorted by predictive interval (descending)
Each dict includes 'predictive_interval' field
Example:
>>> data = [
... {"id": 1, "positive_count": 80, "negative_count": 20},
... {"id": 2, "positive_count": 1, "negative_count": 0},
... ]
>>> ranked = rank_by_predictive_interval(data)
>>> ranked[0]["id"] # First item has higher interval
1
"""
results = []
for item in data:
positive = item.get(positive_key, 0)
negative = item.get(negative_key, 0)
neutral = item.get(neutral_key, 0)
interval = calculate_predictive_interval(
positive_count=positive,
negative_count=negative,
neutral_count=neutral,
confidence_level=confidence_level
)
# Create new dict with interval
result = item.copy()
result["predictive_interval"] = interval
result["total_comments"] = positive + negative + neutral
result["positive_ratio"] = positive / (positive + negative + neutral) if (positive + negative + neutral) > 0 else 0.0
results.append(result)
# Sort by predictive interval (descending)
results.sort(key=lambda x: x["predictive_interval"], reverse=True)
return results
def calculate_intervals_for_dataframe(
df: pd.DataFrame,
positive_col: str = "positive_count",
negative_col: str = "negative_count",
neutral_col: str = "neutral_count",
confidence_level: float = 0.95
) -> pd.DataFrame:
"""
Calculate predictive intervals for DataFrame.
Args:
df: DataFrame with sentiment counts
positive_col: Column name for positive counts
negative_col: Column name for negative counts
neutral_col: Column name for neutral counts
confidence_level: Confidence level
Returns:
DataFrame with added 'predictive_interval' column
Example:
>>> df = pd.DataFrame({
... "positive_count": [80, 1],
... "negative_count": [20, 0]
... })
>>> df_with_intervals = calculate_intervals_for_dataframe(df)
>>> "predictive_interval" in df_with_intervals.columns
True
"""
df = df.copy()
df["predictive_interval"] = df.apply(
lambda row: calculate_predictive_interval(
positive_count=row.get(positive_col, 0),
negative_count=row.get(negative_col, 0),
neutral_count=row.get(neutral_col, 0),
confidence_level=confidence_level
),
axis=1
)
return df
def get_top_positive_by_interval(
data: List[Dict],
top_k: int = 10,
min_comments: int = 1,
**kwargs
) -> List[Dict]:
"""
Get top K items ranked by predictive interval.
Args:
data: List of dictionaries with sentiment counts
top_k: Number of top items to return
min_comments: Minimum number of comments required
**kwargs: Additional arguments for rank_by_predictive_interval
Returns:
Top K items sorted by predictive interval
Example:
>>> data = [
... {"id": 1, "positive_count": 80, "negative_count": 20},
... {"id": 2, "positive_count": 1, "negative_count": 0},
... ]
>>> top = get_top_positive_by_interval(data, top_k=1)
>>> len(top)
1
"""
# Filter by minimum comments
filtered = [
item for item in data
if (item.get("positive_count", 0) +
item.get("negative_count", 0) +
item.get("neutral_count", 0)) >= min_comments
]
# Rank by predictive interval
ranked = rank_by_predictive_interval(filtered, **kwargs)
# Return top K
return ranked[:top_k]
def get_top_negative_by_interval(
data: List[Dict],
top_k: int = 10,
min_comments: int = 1,
**kwargs
) -> List[Dict]:
"""
Get top K items ranked by negative sentiment (lowest predictive interval).
Args:
data: List of dictionaries with sentiment counts
top_k: Number of top items to return
min_comments: Minimum number of comments required
**kwargs: Additional arguments for rank_by_predictive_interval
Returns:
Top K items with lowest predictive intervals (most negative)
"""
# Filter by minimum comments
filtered = [
item for item in data
if (item.get("positive_count", 0) +
item.get("negative_count", 0) +
item.get("neutral_count", 0)) >= min_comments
]
# Rank by predictive interval
ranked = rank_by_predictive_interval(filtered, **kwargs)
# Return bottom K (most negative)
return ranked[-top_k:][::-1] # Reverse to get most negative first