from __future__ import annotations

from pathlib import Path

import pandas as pd

from .config import LABEL2ID
from .text import compact_for_key, normalize_label, normalize_text

# Placeholder strings (compared after lower-casing) that count as "no usable
# text"; rows whose chosen text matches one of these are dropped.
BAD_TEXT_VALUES = {"", "x", "-", ".", "n/a", "na", "none", "null"}

# Raw columns tried, in priority order, when picking the text of a row.
_TEXT_SOURCE_COLUMNS = ("perbaikan", "textTranslated", "text")

# Columns of the prepared dataset. Passing them explicitly to the DataFrame
# constructor keeps the schema (and the CSV header) intact even when no row
# survives filtering.
_OUTPUT_COLUMNS = [
    "text",
    "label",
    "label_name",
    "kategori",
    "stars",
    "source_column",
]


def load_binary_dataset(path: str | Path) -> pd.DataFrame:
    """Load a prepared binary dataset from a CSV file.

    The ``text`` column is passed through ``normalize_text``, ``label`` is
    cast to ``int`` and ``label_name`` is passed through ``normalize_label``.
    Rows whose label name is not a key of ``LABEL2ID`` or whose text is empty
    after normalization are removed.

    Args:
        path: Location of the CSV file.

    Returns:
        The cleaned frame with a fresh 0..n-1 index.

    Raises:
        ValueError: If any of the columns ``text``, ``label`` or
            ``label_name`` is missing from the file.
    """
    df = pd.read_csv(path)

    required = {"text", "label", "label_name"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Dataset is missing required columns: {sorted(missing)}")

    df = df.copy()
    df["text"] = df["text"].map(normalize_text)
    df["label"] = df["label"].astype(int)
    df["label_name"] = df["label_name"].map(normalize_label)

    df = df[df["label_name"].isin(LABEL2ID)]
    df = df[df["text"].str.len() > 0]
    return df.reset_index(drop=True)


def _pick_text(row: pd.Series) -> tuple[str, str]:
    """Return ``(text, source_column)`` for the first non-empty text column.

    Columns are tried in ``_TEXT_SOURCE_COLUMNS`` order. When none of them
    yields a truthy normalized value, the result of the last column is
    returned so the caller can still classify it as bad text.
    """
    for source_column in _TEXT_SOURCE_COLUMNS:
        text = normalize_text(row.get(source_column))
        if text:
            break
    return text, source_column


def prepare_binary_dataset(
    input_path: str | Path,
    output_path: str | Path,
    *,
    sheet_name: str = "Data",
    dedupe: bool = True,
) -> tuple[pd.DataFrame, dict]:
    """Build the binary dataset CSV from a raw Excel sheet.

    Each raw row is processed as follows:

    1. The ``sentimen`` value is normalized with ``normalize_label``. Rows
       labelled ``"Netral"`` and rows whose label is not a key of
       ``LABEL2ID`` are dropped (and counted separately).
    2. The text is taken from the first of ``perbaikan``,
       ``textTranslated``, ``text`` that is non-empty after
       ``normalize_text``. Rows whose text (lower-cased) is in
       ``BAD_TEXT_VALUES`` are dropped.
    3. When ``dedupe`` is true, rows whose ``compact_for_key(text)`` was
       already seen are dropped.

    The surviving rows are written to ``output_path`` as CSV (parent
    directories are created as needed).

    Args:
        input_path: Excel workbook to read.
        output_path: Destination CSV path.
        sheet_name: Name of the worksheet holding the raw data.
        dedupe: Whether to drop rows with an already-seen text key.

    Returns:
        A ``(df, summary)`` pair: the prepared frame and a dict with the
        input/output file names, the original row count, per-reason drop
        counters, ``kept_rows`` and the per-label counts under ``labels``.
    """
    input_path = Path(input_path)
    output_path = Path(output_path)

    raw = pd.read_excel(input_path, sheet_name=sheet_name)
    raw.columns = [normalize_text(c) for c in raw.columns]

    rows: list[dict] = []
    seen: set[str] = set()
    summary = {
        "input_file": input_path.name,
        "output_file": str(output_path).replace("\\", "/"),
        "original_rows": int(len(raw)),
        "dropped_netral": 0,
        "dropped_other_label": 0,
        "dropped_bad_text": 0,
        "dropped_duplicates": 0,
    }

    for _, row in raw.iterrows():
        label_name = normalize_label(row.get("sentimen"))
        if label_name == "Netral":
            summary["dropped_netral"] += 1
            continue
        if label_name not in LABEL2ID:
            summary["dropped_other_label"] += 1
            continue

        text, source_column = _pick_text(row)
        if text.lower() in BAD_TEXT_VALUES:
            summary["dropped_bad_text"] += 1
            continue

        key = compact_for_key(text)
        if dedupe and key in seen:
            summary["dropped_duplicates"] += 1
            continue
        # The key is recorded even when dedupe is off; it is simply unused.
        seen.add(key)

        rows.append(
            {
                "text": text,
                "label": LABEL2ID[label_name],
                "label_name": label_name,
                "kategori": normalize_text(row.get("kategori")),
                "stars": normalize_text(row.get("stars")),
                "source_column": source_column,
            }
        )

    # Explicit columns: with zero surviving rows a bare DataFrame(rows) would
    # have no columns, making the label count below raise KeyError and the
    # written CSV header-less.
    df = pd.DataFrame(rows, columns=_OUTPUT_COLUMNS)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)

    summary["kept_rows"] = int(len(df))
    summary["labels"] = df["label_name"].value_counts().to_dict()
    return df, summary