from __future__ import annotations

import random
from typing import Any, Callable

import pandas as pd


# Signature of the converter the samplers apply to each sampled row: it takes
# one DataFrame row (a ``pd.Series``) and returns a sentence dict.
# ``build_sentence_bundle`` reads the "text", "lang_iso2" and "lang_iso3" keys
# of that dict via ``.get``, so all of them are optional.
RowToSentence = Callable[[pd.Series], dict[str, Any]]


def _nonempty_frame(frame: pd.DataFrame, *, text_column: str = "text") -> pd.DataFrame:
    """Return the rows of *frame* whose text column holds non-blank text.

    A row is kept when its value in ``text_column`` is not missing and its
    string form contains at least one non-whitespace character.

    Args:
        frame: Source frame; it is not modified.
        text_column: Name of the column holding the sentence text.

    Returns:
        The filtered rows, with the original index labels preserved.

    Raises:
        RuntimeError: If ``text_column`` is not a column of ``frame``.
    """
    if text_column not in frame.columns:
        raise RuntimeError(f"Expected a {text_column!r} column in the cache frame.")

    text = frame[text_column]
    # Missing values must be excluded explicitly: ``astype(str)`` would turn
    # None/NaN into the strings "None"/"nan", which look like real text.
    has_text = text.notna() & text.astype(str).str.strip().ne("")
    return frame[has_text]


def build_sentence_bundle(rows: pd.DataFrame, row_to_sentence: RowToSentence) -> dict[str, Any]:
    """Convert every row to a sentence dict and merge them into one bundle.

    The bundle starts from the first sentence's fields and then overrides or
    adds: "text" and "raw_text" (the truthy sentence texts joined by a blank
    line), "sentences" (all converted dicts, in row order), "lang_count"
    (number of sentences), and "langs" / "lang_iso3s" (each sentence's
    "lang_iso2" / "lang_iso3", or "" when the key is absent).

    Raises:
        RuntimeError: If ``rows`` yields no rows.
    """
    converted: list[dict[str, Any]] = []
    for _, record in rows.iterrows():
        converted.append(row_to_sentence(record))

    if len(converted) == 0:
        raise RuntimeError("Unable to sample a sentence bundle.")

    # Sentences whose text is missing or empty do not contribute to the
    # joined text, but they still count towards the per-sentence lists.
    text_parts = []
    iso2_codes = []
    iso3_codes = []
    for item in converted:
        if item.get("text"):
            text_parts.append(item["text"])
        iso2_codes.append(item.get("lang_iso2", ""))
        iso3_codes.append(item.get("lang_iso3", ""))
    joined = "\n\n".join(text_parts)

    overrides = {
        "text": joined,
        "raw_text": joined,
        "sentences": converted,
        "lang_count": len(converted),
        "langs": iso2_codes,
        "lang_iso3s": iso3_codes,
    }
    return {**converted[0], **overrides}


def sample_single_group_bundle(
    frame: pd.DataFrame,
    *,
    group_column: str,
    row_to_sentence: RowToSentence,
    attempts: int = 8,
    min_sentences: int = 1,
    max_sentences: int = 3,
    multi_sentence_probability: float = 0.55,
    text_column: str = "text",
    allowed_groups: set[str] | frozenset[str] | None = None,
) -> dict[str, Any]:
    """Sample a bundle of sentences that all come from one random group.

    Rows with blank text are discarded first. A group is then picked at
    random; with probability ``multi_sentence_probability`` (and only if the
    group has more than one row) between ``max(2, min_sentences)`` and
    ``max_sentences`` rows are drawn, otherwise ``min_sentences`` rows are
    drawn. Every count is capped by the number of rows the group has.

    Args:
        frame: Source rows.
        group_column: Column whose values define the groups. Missing and
            falsy values are not treated as groups.
        row_to_sentence: Converter applied to each sampled row.
        attempts: Number of tries (at least one) before giving up when the
            produced bundle has empty text.
        min_sentences: Lower bound on sentences per bundle (floored at 1).
        max_sentences: Upper bound on sentences per bundle (raised to
            ``min_sentences`` if smaller).
        multi_sentence_probability: Chance of drawing more than one sentence.
        text_column: Column holding the sentence text.
        allowed_groups: If given, only rows whose group is in this set are
            considered.

    Returns:
        The bundle produced by ``build_sentence_bundle`` for the sampled rows.

    Raises:
        RuntimeError: If no usable group exists, or if every attempt yields a
            bundle with empty text.
    """
    candidate_frame = _nonempty_frame(frame, text_column=text_column)
    if allowed_groups is not None:
        candidate_frame = candidate_frame[candidate_frame[group_column].isin(allowed_groups)]
    distinct_groups = [value for value in candidate_frame[group_column].dropna().unique().tolist() if value]
    if not distinct_groups:
        raise RuntimeError(f"No usable values were found in {group_column!r}.")

    min_sentences = max(1, int(min_sentences))
    max_sentences = max(min_sentences, int(max_sentences))

    for _ in range(max(1, attempts)):
        group = random.choice(distinct_groups)
        group_rows = candidate_frame[candidate_frame[group_column] == group]
        available = len(group_rows)
        if available == 0:
            continue

        sample_size = min(min_sentences, available)
        if available > 1 and random.random() < multi_sentence_probability:
            upper = min(max_sentences, available)
            # A multi-sentence draw wants at least two rows, but it must also
            # respect min_sentences; both are capped by what is available.
            lower = min(max(2, min_sentences), upper)
            sample_size = random.randint(lower, upper)
        rows = group_rows.sample(n=sample_size)
        bundle = build_sentence_bundle(rows, row_to_sentence)
        # The converter may yield sentences without text; retry in that case.
        if bundle["text"]:
            return bundle

    raise RuntimeError(f"Unable to sample a random bundle from {group_column!r}.")


def sample_multi_group_bundle(
    frame: pd.DataFrame,
    *,
    group_column: str,
    row_to_sentence: RowToSentence,
    min_groups: int = 2,
    max_groups: int = 3,
    min_sentences_per_group: int = 1,
    max_sentences_per_group: int = 2,
    text_column: str = "text",
    allowed_groups: set[str] | frozenset[str] | None = None,
) -> dict[str, Any]:
    """Sample sentences from several random groups and combine them.

    Rows with blank text are discarded first. Between ``min_groups`` and
    ``max_groups`` distinct groups are chosen at random (fewer if the data
    does not contain that many), and from each chosen group between
    ``min_sentences_per_group`` and ``max_sentences_per_group`` rows are
    drawn, capped by the number of rows the group has.

    Args:
        frame: Source rows.
        group_column: Column whose values define the groups. Missing and
            falsy values are not treated as groups.
        row_to_sentence: Converter applied to each sampled row.
        min_groups: Lower bound on the number of groups (floored at 1).
        max_groups: Upper bound on the number of groups (raised to
            ``min_groups`` if smaller).
        min_sentences_per_group: Lower bound on rows per group (floored at 1).
        max_sentences_per_group: Upper bound on rows per group (raised to
            ``min_sentences_per_group`` if smaller).
        text_column: Column holding the sentence text.
        allowed_groups: If given, only rows whose group is in this set are
            considered.

    Returns:
        The bundle produced by ``build_sentence_bundle`` for the sampled
        rows, in group-selection order.

    Raises:
        RuntimeError: If no usable group exists or no rows could be sampled.
    """
    candidate_frame = _nonempty_frame(frame, text_column=text_column)
    if allowed_groups is not None:
        candidate_frame = candidate_frame[candidate_frame[group_column].isin(allowed_groups)]
    distinct_groups = [value for value in candidate_frame[group_column].dropna().unique().tolist() if value]
    if not distinct_groups:
        raise RuntimeError(f"No usable values were found in {group_column!r}.")

    min_groups = max(1, int(min_groups))
    max_groups = max(min_groups, int(max_groups))
    min_sentences_per_group = max(1, int(min_sentences_per_group))
    max_sentences_per_group = max(min_sentences_per_group, int(max_sentences_per_group))

    group_count = random.randint(min_groups, max_groups)
    random.shuffle(distinct_groups)
    chosen_groups = distinct_groups[: min(group_count, len(distinct_groups))]

    rows: list[pd.Series] = []
    for group in chosen_groups:
        group_rows = candidate_frame[candidate_frame[group_column] == group]
        if group_rows.empty:
            continue
        upper = min(max_sentences_per_group, len(group_rows))
        # Clamp the lower bound too: a group with fewer rows than
        # min_sentences_per_group would otherwise hand randint an empty range
        # and raise ValueError.
        lower = min(min_sentences_per_group, upper)
        row_count = random.randint(lower, upper)
        sampled_rows = group_rows.sample(n=row_count)
        rows.extend(row for _, row in sampled_rows.iterrows())

    if not rows:
        raise RuntimeError(f"Unable to sample a multi-group bundle from {group_column!r}.")

    combined_rows = pd.DataFrame(rows)
    return build_sentence_bundle(combined_rows, row_to_sentence)