Spaces:

ric912
/

customer-feedback-intelligence-demo

Sleeping

customer-feedback-intelligence-demo / src /feedback_intelligence /data /imdb.py

Richard CHEAM

Deploy customer feedback intelligence demo

73b0303 about 2 months ago

3.03 kB

	"""Load and summarize IMDb reviews from local files."""

	from __future__ import annotations

	import random
	from collections import Counter
	from pathlib import Path
	from statistics import mean, median
	from typing import Literal

	from feedback_intelligence.types import ReviewRecord

	# Names of the dataset splits a caller may request. The chosen value is used
	# verbatim as a sub-directory name under the IMDb base path when loading.
	Split = Literal["train", "test"]


	def load_local_imdb_reviews(
	    base_path: Path,
	    split: Split,
	    sample_size: int | None = None,
	    seed: int = 42,
	) -> list[ReviewRecord]:
	    """Load a label-balanced, reproducible set of reviews from a local IMDb tree.

	    Reviews are read from ``base_path / split / "pos"`` and
	    ``base_path / split / "neg"``; every ``*.txt`` file there is one review.

	    Args:
	        base_path: Root directory that contains the per-split folders.
	        split: Name of the split folder to read.
	        sample_size: Total number of reviews wanted. Half of them are drawn
	            from each label, so an odd value yields ``sample_size - 1``
	            records. ``None`` loads every file.
	        seed: Seed of the private random generator that drives both the
	            sampling and the final shuffle.

	    Returns:
	        The positive and negative records combined and shuffled with the
	        seeded generator.

	    Raises:
	        ValueError: If ``sample_size`` is smaller than 2, or a label folder
	            holds fewer files than the per-label quota.
	        FileNotFoundError: If a label folder is missing or contains no
	            ``*.txt`` files.
	    """
	    label_map = {"pos": "positive", "neg": "negative"}

	    # The per-label quota is the same for every folder, so compute and
	    # validate it once, before any filesystem access.
	    per_label = None
	    if sample_size is not None:
	        per_label = sample_size // 2
	        if per_label <= 0:
	            raise ValueError("sample_size must be at least 2 for balanced sampling.")

	    rng = random.Random(seed)
	    records_by_label: dict[str, list[ReviewRecord]] = {}

	    # "pos" is processed before "neg": the generator is shared, so the order
	    # of the draws is part of what makes a given seed reproducible.
	    for category, label in label_map.items():
	        category_path = base_path / split / category
	        if not category_path.is_dir():
	            raise FileNotFoundError(f"IMDb directory not found: {category_path}")

	        # Sort first so sampling does not depend on directory listing order.
	        files = sorted(category_path.glob("*.txt"))
	        if not files:
	            raise FileNotFoundError(f"No review files found in: {category_path}")

	        if per_label is None:
	            selected = files
	        else:
	            if per_label > len(files):
	                raise ValueError(
	                    f"Requested {per_label} files from {category_path}, "
	                    f"but only {len(files)} are available."
	                )
	            selected = sorted(rng.sample(files, per_label))

	        records_by_label[category] = [
	            _record_from_path(path=file_path, split=split, label=label)
	            for file_path in selected
	        ]

	    records = records_by_label["pos"] + records_by_label["neg"]
	    rng.shuffle(records)
	    return records


	def summarize_reviews(records: list[ReviewRecord]) -> dict[str, object]:
	    """Build a small statistics report about a batch of reviews.

	    The report has three keys: ``rows`` (number of records),
	    ``label_distribution`` (label -> count) and ``word_count`` (min, median,
	    mean and max of the records' ``word_count``). The median is truncated to
	    an integer and the mean is rounded to two decimals. An empty input yields
	    zeros and an empty distribution.
	    """
	    # Start from the "nothing to report" shape and fill it in when data exists.
	    summary: dict[str, object] = {
	        "rows": 0,
	        "label_distribution": {},
	        "word_count": {"min": 0, "median": 0, "mean": 0, "max": 0},
	    }
	    if not records:
	        return summary

	    lengths = [item.word_count for item in records]
	    label_tally = Counter(item.label for item in records)
	    length_stats = {
	        "min": min(lengths),
	        "median": int(median(lengths)),
	        "mean": round(mean(lengths), 2),
	        "max": max(lengths),
	    }

	    summary.update(
	        rows=len(records),
	        label_distribution=dict(label_tally),
	        word_count=length_stats,
	    )
	    return summary


	def _record_from_path(path: Path, split: str, label: str) -> ReviewRecord:
	    """Turn one review file into a ``ReviewRecord``.

	    The file stem is cut at underscores: the leading piece becomes the review
	    id and, only when the stem has exactly two pieces, the second piece is
	    parsed as an integer score. Any other stem shape leaves the score as
	    ``None``. The file content is read as UTF-8 and stored as the review text.
	    """
	    review_id, separator, remainder = path.stem.partition("_")

	    # Exactly two underscore-separated pieces <=> one separator was found and
	    # nothing after it contains another underscore.
	    if separator and "_" not in remainder:
	        score = int(remainder)
	    else:
	        score = None

	    return ReviewRecord(
	        review_id=review_id,
	        text=path.read_text(encoding="utf-8"),
	        label=label,
	        split=split,
	        source="local-imdb",
	        score=score,
	    )