"""Sample bundles of sentences from a frame of sentence rows."""

from __future__ import annotations

import random
from typing import Any, Callable

import pandas as pd

# Converts one frame row into a sentence dict. The bundle builder reads the
# "text", "lang_iso2" and "lang_iso3" keys from the returned dict.
RowToSentence = Callable[[pd.Series], dict[str, Any]]


def _nonempty_frame(frame: pd.DataFrame, *, text_column: str = "text") -> pd.DataFrame:
    """Return the rows of *frame* whose text column holds non-blank text.

    Rows whose text is missing (``None``/``NaN``) or consists only of
    whitespace are dropped.

    Raises:
        RuntimeError: If *text_column* is not a column of *frame*.
    """
    if text_column not in frame.columns:
        raise RuntimeError(f"Expected a {text_column!r} column in the cache frame.")
    text = frame[text_column]
    # ``astype(str)`` renders missing values as "nan"/"None", which would look
    # non-empty, so missing values have to be excluded explicitly.
    has_text = text.notna() & text.astype(str).str.strip().ne("")
    return frame[has_text]


def _candidate_groups(
    frame: pd.DataFrame,
    *,
    group_column: str,
    text_column: str,
    allowed_groups: set[str] | frozenset[str] | None,
) -> tuple[pd.DataFrame, list[Any]]:
    """Return the usable rows of *frame* and the distinct truthy group values.

    Raises:
        RuntimeError: If no usable group value remains after filtering.
    """
    candidate_frame = _nonempty_frame(frame, text_column=text_column)
    if allowed_groups is not None:
        candidate_frame = candidate_frame[candidate_frame[group_column].isin(allowed_groups)]
    distinct_groups = [
        value
        for value in candidate_frame[group_column].dropna().unique().tolist()
        if value
    ]
    if not distinct_groups:
        raise RuntimeError(f"No usable values were found in {group_column!r}.")
    return candidate_frame, distinct_groups


def build_sentence_bundle(rows: pd.DataFrame, row_to_sentence: RowToSentence) -> dict[str, Any]:
    """Convert *rows* into sentences and merge them into one bundle dict.

    The bundle starts from the first sentence's keys and then overrides or adds
    ``text`` and ``raw_text`` (the non-empty sentence texts joined by a blank
    line), ``sentences``, ``lang_count``, ``langs`` and ``lang_iso3s``.

    Raises:
        RuntimeError: If *rows* yields no sentences.
    """
    sentences = [row_to_sentence(row) for _, row in rows.iterrows()]
    if not sentences:
        raise RuntimeError("Unable to sample a sentence bundle.")
    combined_text = "\n\n".join(
        sentence["text"] for sentence in sentences if sentence.get("text")
    )
    primary = sentences[0]
    return {
        **primary,
        "text": combined_text,
        "raw_text": combined_text,
        "sentences": sentences,
        "lang_count": len(sentences),
        "langs": [sentence.get("lang_iso2", "") for sentence in sentences],
        "lang_iso3s": [sentence.get("lang_iso3", "") for sentence in sentences],
    }


def sample_single_group_bundle(
    frame: pd.DataFrame,
    *,
    group_column: str,
    row_to_sentence: RowToSentence,
    attempts: int = 8,
    min_sentences: int = 1,
    max_sentences: int = 3,
    multi_sentence_probability: float = 0.55,
    text_column: str = "text",
    allowed_groups: set[str] | frozenset[str] | None = None,
) -> dict[str, Any]:
    """Sample a bundle of sentences that all come from one random group.

    Each attempt picks a random group. With probability
    *multi_sentence_probability* (and when the group has more than one row) a
    multi-sentence sample is drawn; otherwise *min_sentences* rows are used.
    The sample size never exceeds the number of rows in the group.

    Args:
        frame: Source rows; must contain *text_column* and *group_column*.
        group_column: Column whose values define the groups.
        row_to_sentence: Converts a row into a sentence dict.
        attempts: Number of tries before giving up (at least one is made).
        min_sentences: Lower bound on the sample size (clamped to >= 1).
        max_sentences: Upper bound on the sample size (clamped to >= minimum).
        multi_sentence_probability: Chance of drawing a multi-sentence sample.
        text_column: Column holding the sentence text.
        allowed_groups: If given, only these group values are considered.

    Raises:
        RuntimeError: If no usable group exists or every attempt produced a
            bundle with empty text.
    """
    candidate_frame, distinct_groups = _candidate_groups(
        frame,
        group_column=group_column,
        text_column=text_column,
        allowed_groups=allowed_groups,
    )
    min_sentences = max(1, int(min_sentences))
    max_sentences = max(min_sentences, int(max_sentences))

    for _ in range(max(1, attempts)):
        group = random.choice(distinct_groups)
        group_rows = candidate_frame[candidate_frame[group_column] == group]
        if group_rows.empty:
            continue
        available = len(group_rows)
        sample_size = min_sentences
        if available > 1 and random.random() < multi_sentence_probability:
            upper = min(max_sentences, available)
            # Prefer at least two sentences, never fewer than min_sentences,
            # and never more than the group can provide.
            lower = min(max(min_sentences, min(2, max_sentences)), upper)
            sample_size = random.randint(lower, upper)
        rows = group_rows.sample(n=min(sample_size, available))
        bundle = build_sentence_bundle(rows, row_to_sentence)
        if bundle["text"]:
            return bundle
    raise RuntimeError(f"Unable to sample a random bundle from {group_column!r}.")


def sample_multi_group_bundle(
    frame: pd.DataFrame,
    *,
    group_column: str,
    row_to_sentence: RowToSentence,
    min_groups: int = 2,
    max_groups: int = 3,
    min_sentences_per_group: int = 1,
    max_sentences_per_group: int = 2,
    text_column: str = "text",
    allowed_groups: set[str] | frozenset[str] | None = None,
) -> dict[str, Any]:
    """Sample sentences from several random groups and merge them into a bundle.

    A random number of groups between *min_groups* and *max_groups* is chosen
    (fewer when not enough distinct groups exist). From each chosen group a
    random number of rows between *min_sentences_per_group* and
    *max_sentences_per_group* is drawn, limited by the rows the group has.

    Args:
        frame: Source rows; must contain *text_column* and *group_column*.
        group_column: Column whose values define the groups.
        row_to_sentence: Converts a row into a sentence dict.
        min_groups: Lower bound on the number of groups (clamped to >= 1).
        max_groups: Upper bound on the number of groups (clamped to >= minimum).
        min_sentences_per_group: Lower bound on rows per group (clamped to >= 1).
        max_sentences_per_group: Upper bound on rows per group (clamped to >=
            minimum).
        text_column: Column holding the sentence text.
        allowed_groups: If given, only these group values are considered.

    Raises:
        RuntimeError: If no usable group exists or no rows could be sampled.
    """
    candidate_frame, distinct_groups = _candidate_groups(
        frame,
        group_column=group_column,
        text_column=text_column,
        allowed_groups=allowed_groups,
    )
    min_groups = max(1, int(min_groups))
    max_groups = max(min_groups, int(max_groups))
    min_sentences_per_group = max(1, int(min_sentences_per_group))
    max_sentences_per_group = max(min_sentences_per_group, int(max_sentences_per_group))

    group_count = random.randint(min_groups, max_groups)
    random.shuffle(distinct_groups)
    chosen_groups = distinct_groups[: min(group_count, len(distinct_groups))]

    rows: list[pd.Series] = []
    for group in chosen_groups:
        group_rows = candidate_frame[candidate_frame[group_column] == group]
        if group_rows.empty:
            continue
        upper = min(max_sentences_per_group, len(group_rows))
        # A group smaller than the requested minimum contributes what it has;
        # without the clamp randint() would be given an empty range.
        lower = min(min_sentences_per_group, upper)
        sampled_rows = group_rows.sample(n=random.randint(lower, upper))
        rows.extend(row for _, row in sampled_rows.iterrows())
    if not rows:
        raise RuntimeError(f"Unable to sample a multi-group bundle from {group_column!r}.")
    combined_rows = pd.DataFrame(rows)
    return build_sentence_bundle(combined_rows, row_to_sentence)