Spaces:

klzn
/

sentimentstream-worker

Running

sentimentstream-worker / backend /app /core /sampling.py

GitHub Action

deploy: worker release from GitHub

8ff1b66 15 days ago

4.5 kB

	"""
	Moduł do obliczania statystycznej wielkości próbki.

	Implementuje wzory statystyczne dla próbkowania populacji.
	"""

	import math
	from dataclasses import dataclass

	from app.core.config import settings


	# Z-scores (two-sided standard-normal critical values) keyed by confidence
	# level. Used as the Z term of Cochran's sample-size formula.
	Z_SCORES: dict[float, float] = {
	    0.90: 1.645,  # 90% confidence
	    0.95: 1.96,  # 95% confidence
	    0.99: 2.576,  # 99% confidence
	}


	@dataclass
	class SamplePlan:
	    """Sampling plan for a single game's reviews.

	    Attributes:
	        top_helpful: Number of most-helpful reviews.
	        statistical_sample: Size of the statistical sample.
	        positive_count: How many positive reviews to fetch (stratified).
	        negative_count: How many negative reviews to fetch (stratified).
	        total: Total number of reviews to fetch.
	    """

	    top_helpful: int
	    statistical_sample: int
	    positive_count: int
	    negative_count: int
	    total: int


	def calculate_sample_size(
	population: int,
	confidence_level: float \| None = None,
	margin_of_error: float \| None = None,
	) -> int:
	"""
	Oblicza minimalną wielkość próbki dla danej populacji.
	Wykorzystuje wzór Cochrana z korektą dla populacji skończonej.
	"""
	if confidence_level is None:
	confidence_level = settings.sample_confidence_level
	if margin_of_error is None:
	margin_of_error = settings.sample_margin_of_error

	# 1. Pobieramy Z-score (np. 1.96 dla 95% ufności).
	# Mówi on, jak bardzo wynik może odbiegać od średniej w jednostkach odchylenia standardowego.
	z = Z_SCORES.get(confidence_level, 1.96)

	# 2. Zakładamy p=0.5 (maksymalna zmienność).
	# To daje nam najbezpieczniejszą (największą) wielkość próbki.
	p = 0.5

	# 3. Wzór Cochrana dla nieskończonej populacji:
	# n0 = (Z^2 * p * (1-p)) / e^2
	# Wyjaśnienie: Z kwadrat razy zmienność, podzielone przez kwadrat błędu.
	n_0 = (z ** 2 * p * (1 - p)) / (margin_of_error ** 2)

	# 4. Korekta dla populacji skończonej (Steam ma policzalną liczbę recenzji):
	# n = n0 / (1 + (n0 - 1) / N)
	# Wyjaśnienie: Zmniejszamy próbkę, bo wiemy dokładnie, ile osób (recenzji) jest w "całym świecie" tej gry.
	n = n_0 / (1 + (n_0 - 1) / population)

	# Zaokrąglamy w górę do pełnej recenzji
	return math.ceil(n)


	def create_sample_plan(
	    total_reviews: int,
	    positive_reviews: int,
	    negative_reviews: int,
	) -> SamplePlan:
	    """Build a sampling plan that combines two approaches.

	    The plan is a fixed number of top-helpful reviews
	    (``settings.sample_top_helpful``) plus a statistical sample that is
	    split between positive and negative reviews in proportion to their
	    share of ``total_reviews`` (stratified sampling).

	    Args:
	        total_reviews: Total number of reviews for the game.
	        positive_reviews: Number of positive reviews available.
	        negative_reviews: Number of negative reviews available.

	    Returns:
	        A ``SamplePlan`` where ``statistical_sample`` is the capped sample
	        budget, ``positive_count`` / ``negative_count`` are the per-stratum
	        counts (never above what is available), and ``total`` is
	        ``top_helpful + positive_count + negative_count``.
	    """
	    top_helpful = settings.sample_top_helpful
	    max_reviews = settings.sample_max_reviews
	    minority_min = settings.sample_minority_min

	    if total_reviews > 0:
	        # Sample size needed for a statistically reliable result, and the
	        # share of each sentiment in the whole population.
	        statistical_sample = calculate_sample_size(total_reviews)
	        pos_ratio = positive_reviews / total_reviews
	        neg_ratio = negative_reviews / total_reviews
	    else:
	        # Nothing to sample; the size formula is only defined for a
	        # positive population, so it is skipped here.
	        statistical_sample = 0
	        pos_ratio = 0.5
	        neg_ratio = 0.5

	    # Stay within the configured review limit. The lower clamp keeps the
	    # budget non-negative if top_helpful alone exceeds the limit.
	    statistical_sample = max(0, min(statistical_sample, max_reviews - top_helpful))

	    # Split the sample proportionally (stratified sampling). Rounding both
	    # strata up can overshoot the budget by one; this is corrected below.
	    pos_target = math.ceil(statistical_sample * pos_ratio)
	    neg_target = math.ceil(statistical_sample * neg_ratio)

	    # Minority protection: raise a small stratum to minority_min when more
	    # reviews of that kind are available than its proportional share.
	    if pos_target < minority_min and positive_reviews > pos_target:
	        pos_target = min(minority_min, positive_reviews)

	    if neg_target < minority_min and negative_reviews > neg_target:
	        neg_target = min(minority_min, negative_reviews)

	    # Trim the larger stratum by the overshoot, but never below
	    # minority_min. The sum can therefore still exceed statistical_sample
	    # when the floor prevents a full reduction.
	    excess = pos_target + neg_target - statistical_sample
	    if excess > 0:
	        if pos_target > neg_target:
	            pos_target = max(pos_target - excess, minority_min)
	        else:
	            neg_target = max(neg_target - excess, minority_min)

	    # Final cap by actual availability.
	    positive_count = min(pos_target, positive_reviews)
	    negative_count = min(neg_target, negative_reviews)

	    return SamplePlan(
	        top_helpful=top_helpful,
	        statistical_sample=statistical_sample,
	        positive_count=positive_count,
	        negative_count=negative_count,
	        # Top-helpful reviews plus both strata of the statistical sample.
	        total=top_helpful + positive_count + negative_count,
	    )