Spaces:

ric912
/

customer-feedback-intelligence-demo

Sleeping

customer-feedback-intelligence-demo / src /feedback_intelligence /data /amazon_reviews.py

Richard CHEAM

Deploy customer feedback intelligence demo

73b0303 2 months ago

4.64 kB

	"""Load Amazon polarity reviews from Hugging Face datasets."""

	from __future__ import annotations

	import random
	from collections import Counter
	from statistics import mean, median
	from typing import Any, Iterable, Literal

	from feedback_intelligence.types import ReviewRecord

	# Dataset splits accepted by the loader in this module.
	AmazonSplit = Literal["train", "test"]
	# Default Hugging Face dataset identifier; callers may override it via ``dataset_name``.
	AMAZON_POLARITY_DATASET = "SetFit/amazon_polarity"
	# Maps the dataset's integer ``label`` values to the string labels stored on each record.
	AMAZON_LABEL_MAP = {0: "negative", 1: "positive"}


	def load_amazon_polarity_reviews(
	    split: AmazonSplit,
	    sample_size: int | None = None,
	    seed: int = 42,
	    include_title: bool = True,
	    dataset_name: str = AMAZON_POLARITY_DATASET,
	    shuffle_buffer_size: int = 10_000,
	    load_dataset_fn=None,
	) -> list[ReviewRecord]:
	    """Load Amazon polarity reviews, optionally as a label-balanced sample.

	    Args:
	        split: Dataset split to read.
	        sample_size: Total number of records wanted. ``None`` loads the whole
	            split without sampling. Otherwise ``sample_size // 2`` rows are
	            collected per label, so an odd value yields ``sample_size - 1``
	            records.
	        seed: Seed passed to the streaming shuffle (when available) and used
	            for the final in-memory shuffle of the sampled records.
	        include_title: Whether the review title is prepended to the body text.
	        dataset_name: Dataset identifier passed to the loader; also stored as
	            the ``source`` of every record.
	        shuffle_buffer_size: ``buffer_size`` passed to the streaming shuffle.
	        load_dataset_fn: Optional replacement for ``datasets.load_dataset``
	            (e.g. a test double). When ``None`` the real loader is imported
	            lazily, so ``datasets`` is only required when actually used.

	    Returns:
	        The converted review records. In sampling mode the list contains
	        ``sample_size // 2`` records of each label in seeded-shuffled order.

	    Raises:
	        ValueError: If ``sample_size`` is smaller than 2, or if the stream is
	            exhausted before enough rows of both labels were collected.
	    """
	    if load_dataset_fn is None:
	        from datasets import load_dataset as load_dataset_fn

	    if sample_size is None:
	        # Full load: materialise every row of the split.
	        dataset = load_dataset_fn(dataset_name, split=split)
	        rows = list(dataset)
	        return _rows_to_review_records(
	            rows=rows,
	            split=split,
	            include_title=include_title,
	            source=dataset_name,
	        )

	    per_label = sample_size // 2
	    if per_label <= 0:
	        raise ValueError("sample_size must be at least 2 for balanced sampling.")

	    # Stream so that only the rows needed for the sample are pulled.
	    dataset = load_dataset_fn(dataset_name, split=split, streaming=True)
	    # Injected loaders may return a plain iterable without ``shuffle``.
	    if hasattr(dataset, "shuffle"):
	        dataset = dataset.shuffle(seed=seed, buffer_size=shuffle_buffer_size)

	    sampled_rows = _sample_balanced_rows(dataset=dataset, per_label=per_label)
	    if len(sampled_rows) < per_label * 2:
	        raise ValueError(
	            f"Unable to collect {per_label} rows per label from {dataset_name} split={split}."
	        )

	    records = _rows_to_review_records(
	        rows=sampled_rows,
	        split=split,
	        include_title=include_title,
	        source=dataset_name,
	    )
	    # Rows were collected in stream order; reshuffle with a seeded RNG so the
	    # returned order is reproducible for a given seed and row sequence.
	    rng = random.Random(seed)
	    rng.shuffle(records)
	    return records


	def summarize_reviews(records: list[ReviewRecord]) -> dict[str, object]:
	    """Build a small summary of a review collection for CLI inspection.

	    The result holds the row count, a label -> count mapping, and the
	    min / median (truncated to int) / mean (rounded to 2 decimals) / max of
	    the records' ``word_count`` values. An empty input yields an all-zero
	    summary with an empty label distribution.
	    """
	    if records:
	        lengths = [item.word_count for item in records]
	        label_tally = Counter(item.label for item in records)
	        length_stats = {
	            "min": min(lengths),
	            "median": int(median(lengths)),
	            "mean": round(mean(lengths), 2),
	            "max": max(lengths),
	        }
	        return {
	            "rows": len(records),
	            "label_distribution": dict(label_tally),
	            "word_count": length_stats,
	        }

	    # Nothing to summarise: report zeros rather than failing on min()/mean().
	    return {
	        "rows": 0,
	        "label_distribution": {},
	        "word_count": {"min": 0, "median": 0, "mean": 0, "max": 0},
	    }


	def _sample_balanced_rows(
	    dataset: Iterable[dict[str, Any]],
	    per_label: int,
	) -> list[dict[str, Any]]:
	    """Collect up to ``per_label`` rows for each of the labels 0 and 1.

	    Rows are taken in iteration order and copied into plain dicts. Rows with
	    any other label, or whose label quota is already filled, are skipped.
	    Iteration stops as soon as a row is accepted that fills the last open
	    quota; if the input runs out first, fewer rows are returned.
	    """
	    # Remaining quota per accepted label; other labels are never collected.
	    remaining = {0: per_label, 1: per_label}
	    picked: list[dict[str, Any]] = []

	    for candidate in dataset:
	        label_value = int(candidate["label"])
	        if remaining.get(label_value, 0) <= 0:
	            continue
	        picked.append(dict(candidate))
	        remaining[label_value] -= 1
	        if max(remaining.values()) <= 0:
	            break

	    return picked


	def _rows_to_review_records(
	    rows: list[dict[str, Any]],
	    split: str,
	    include_title: bool,
	    source: str,
	) -> list[ReviewRecord]:
	    """Convert raw rows into review records, preserving their order.

	    Each row's position in ``rows`` is forwarded as ``row_index`` to
	    ``_review_record_from_row``.
	    """
	    converted: list[ReviewRecord] = []
	    for position, raw_row in enumerate(rows):
	        record = _review_record_from_row(
	            row=raw_row,
	            row_index=position,
	            split=split,
	            include_title=include_title,
	            source=source,
	        )
	        converted.append(record)
	    return converted


	def _review_record_from_row(
	    row: dict[str, Any],
	    row_index: int,
	    split: str,
	    include_title: bool,
	    source: str,
	) -> ReviewRecord:
	    """Build a single ``ReviewRecord`` from one raw dataset row.

	    Args:
	        row: Raw row. Must contain ``label``; the id is read from
	            ``review_id`` or ``id``, the title from ``title`` and the body
	            from ``content`` or ``text``.
	        row_index: Position of the row, used to synthesise an id when the row
	            carries none.
	        split: Split name stored on the record and used in synthesised ids.
	        include_title: Whether the title is combined with the body text.
	        source: Value stored as the record's ``source``.

	    Returns:
	        The record with its label mapped through ``AMAZON_LABEL_MAP``.

	    Raises:
	        KeyError: If ``label`` is missing or not a key of ``AMAZON_LABEL_MAP``.
	        ValueError: If the row yields no text after stripping whitespace.
	    """
	    raw_label = int(row["label"])
	    label = AMAZON_LABEL_MAP[raw_label]

	    # Test for absence explicitly: a plain ``or`` chain would throw away a
	    # legitimate identifier of 0 and replace it with the synthetic fallback.
	    review_id = f"{split}-{row_index}"
	    for id_key in ("review_id", "id"):
	        candidate = row.get(id_key)
	        if candidate is not None and candidate != "":
	            review_id = str(candidate)
	            break

	    # A null title must become "", not the literal text "None" that
	    # ``str(None)`` would otherwise prepend to the review.
	    raw_title = row.get("title")
	    title = "" if raw_title is None else str(raw_title).strip()
	    body = str(row.get("content") or row.get("text") or "").strip()
	    text = _combine_title_and_body(title=title, body=body, include_title=include_title)
	    if not text:
	        raise ValueError(f"Amazon review row {review_id} does not contain text.")

	    return ReviewRecord(
	        review_id=review_id,
	        text=text,
	        label=label,
	        split=split,
	        source=source,
	    )


	def _combine_title_and_body(title: str, body: str, include_title: bool) -> str:
	    """Return the review text, optionally prefixed by its title.

	    When titles are enabled and a title exists, the result is the title
	    followed by a blank line and the body, or just the title if the body is
	    empty. In every other case the body is returned unchanged.
	    """
	    if not (include_title and title):
	        return body
	    return f"{title}\n\n{body}" if body else title