File size: 3,028 Bytes
4ce2b3e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from __future__ import annotations

from pathlib import Path

import pandas as pd

from .config import LABEL2ID
from .text import compact_for_key, normalize_label, normalize_text


# Placeholder strings that carry no usable text. Entries are lower-case because
# prepare_binary_dataset compares against ``text.lower()``; the empty string is
# included so rows with no text at all are dropped by the same check.
BAD_TEXT_VALUES = {"", "x", "-", ".", "n/a", "na", "none", "null"}


def load_binary_dataset(path: str | Path) -> pd.DataFrame:
    """Read a prepared binary-sentiment CSV and return a cleaned frame.

    The CSV must provide the ``text``, ``label`` and ``label_name`` columns.
    ``text`` is passed through ``normalize_text``, ``label`` is cast to
    ``int`` and ``label_name`` is passed through ``normalize_label``. Rows
    whose label name is not a key of ``LABEL2ID`` or whose text is empty are
    discarded, and the index is renumbered from zero.

    Args:
        path: Location of the CSV file.

    Returns:
        The filtered DataFrame with a fresh ``RangeIndex``.

    Raises:
        ValueError: If any of the required columns is absent.
    """
    frame = pd.read_csv(path)

    absent = sorted({"text", "label", "label_name"}.difference(frame.columns))
    if absent:
        raise ValueError(f"Dataset is missing required columns: {absent}")

    # ``assign`` works on a copy, so the frame returned by ``read_csv`` is
    # never mutated; the keyword values are evaluated in the order written.
    frame = frame.assign(
        text=frame["text"].map(normalize_text),
        label=frame["label"].astype(int),
        label_name=frame["label_name"].map(normalize_label),
    )

    # Filter in two steps: the text-length mask is computed only on rows that
    # already passed the label filter.
    known_label = frame["label_name"].isin(LABEL2ID)
    frame = frame[known_label]
    has_text = frame["text"].str.len() > 0
    frame = frame[has_text]

    return frame.reset_index(drop=True)


def prepare_binary_dataset(
    input_path: str | Path,
    output_path: str | Path,
    *,
    sheet_name: str = "Data",
    dedupe: bool = True,
) -> tuple[pd.DataFrame, dict]:
    """Convert a raw Excel sheet into a binary-sentiment CSV.

    Each row of the sheet is processed as follows:

    1. The ``sentimen`` cell is normalised with ``normalize_label``. Rows
       labelled ``"Netral"`` and rows whose label is not a key of
       ``LABEL2ID`` are dropped (counted separately).
    2. The text is taken from the first of ``perbaikan``,
       ``textTranslated`` and ``text`` that is non-empty after
       ``normalize_text``.
    3. Rows whose lower-cased text is in ``BAD_TEXT_VALUES`` are dropped.
    4. When ``dedupe`` is true, rows whose ``compact_for_key(text)`` has
       already been seen are dropped.

    The surviving rows are written to ``output_path`` (parent directories
    are created as needed).

    Args:
        input_path: Excel workbook to read.
        output_path: Destination CSV path.
        sheet_name: Name of the worksheet holding the data.
        dedupe: Whether to drop rows with a duplicate text key.

    Returns:
        A ``(df, summary)`` tuple. ``df`` has the columns ``text``,
        ``label``, ``label_name``, ``kategori``, ``stars`` and
        ``source_column`` (also when it is empty). ``summary`` records the
        input/output file names, the original row count, one counter per
        drop reason, ``kept_rows`` and the per-label counts under
        ``labels``.
    """
    # Candidate text columns in priority order.
    text_sources = ("perbaikan", "textTranslated", "text")
    # Explicit schema so that an empty result still has every column.
    output_columns = [
        "text",
        "label",
        "label_name",
        "kategori",
        "stars",
        "source_column",
    ]

    input_path = Path(input_path)
    output_path = Path(output_path)
    raw = pd.read_excel(input_path, sheet_name=sheet_name)
    raw.columns = [normalize_text(c) for c in raw.columns]

    rows: list[dict] = []
    seen: set[str] = set()
    summary = {
        "input_file": input_path.name,
        # Forward slashes keep the recorded path identical across platforms.
        "output_file": str(output_path).replace("\\", "/"),
        "original_rows": int(len(raw)),
        "dropped_netral": 0,
        "dropped_other_label": 0,
        "dropped_bad_text": 0,
        "dropped_duplicates": 0,
    }

    for _, row in raw.iterrows():
        label_name = normalize_label(row.get("sentimen"))
        if label_name == "Netral":
            summary["dropped_netral"] += 1
            continue
        if label_name not in LABEL2ID:
            summary["dropped_other_label"] += 1
            continue

        # Take the first non-empty candidate. If all are empty the loop ends
        # with ``source_column == "text"`` and ``text == ""``, which the
        # placeholder check below rejects.
        # NOTE: a placeholder such as "-" in an earlier column is non-empty,
        # so it stops the fallback and the row is dropped as bad text.
        text = ""
        source_column = text_sources[-1]
        for source_column in text_sources:
            text = normalize_text(row.get(source_column))
            if text:
                break

        if text.lower() in BAD_TEXT_VALUES:
            summary["dropped_bad_text"] += 1
            continue

        if dedupe:
            key = compact_for_key(text)
            if key in seen:
                summary["dropped_duplicates"] += 1
                continue
            seen.add(key)

        rows.append(
            {
                "text": text,
                "label": LABEL2ID[label_name],
                "label_name": label_name,
                "kategori": normalize_text(row.get("kategori")),
                "stars": normalize_text(row.get("stars")),
                "source_column": source_column,
            }
        )

    # Passing ``columns`` guarantees the schema even when ``rows`` is empty.
    # Without it an all-dropped input produced a column-less frame, a
    # header-less CSV and a KeyError on ``df["label_name"]`` below.
    df = pd.DataFrame(rows, columns=output_columns)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    summary["kept_rows"] = int(len(df))
    summary["labels"] = df["label_name"].value_counts().to_dict()
    return df, summary