Spaces:

seedflora
/

matchaSentiment

Running

App Files Community

matchaSentiment / src / matcha_sentiment / data.py

seedflora

Initial Space upload from matchaSentiment repo

4ce2b3e verified 6 days ago

raw

history blame contribute delete

3.03 kB

	from __future__ import annotations

	from pathlib import Path

	import pandas as pd

	from .config import LABEL2ID
	from .text import compact_for_key, normalize_label, normalize_text


	# Placeholder strings treated as "no usable text". prepare_binary_dataset
	# compares the lowercased candidate text against this set, so entries are
	# kept in lowercase; the empty string covers rows with no text at all.
	BAD_TEXT_VALUES = {
	    "",
	    "x",
	    "-",
	    ".",
	    "n/a",
	    "na",
	    "none",
	    "null",
	}


	def load_binary_dataset(path: str | Path) -> pd.DataFrame:
	    """Load a prepared binary-sentiment CSV and keep only the usable rows.

	    The ``text`` column is passed through ``normalize_text`` and the
	    ``label_name`` column through ``normalize_label``. Rows whose normalized
	    label name is not a key of ``LABEL2ID``, or whose normalized text is
	    empty, are dropped. ``label`` is cast to ``int`` on the surviving rows.

	    Args:
	        path: Location of the CSV file. It must provide the columns
	            ``text``, ``label`` and ``label_name``; any other columns are
	            passed through unchanged.

	    Returns:
	        The filtered frame with a fresh ``0..n-1`` index.

	    Raises:
	        ValueError: If a required column is missing. Errors raised by
	            ``astype(int)`` for kept rows whose label is not integer-like
	            also propagate to the caller.
	    """
	    df = pd.read_csv(path)

	    required = {"text", "label", "label_name"}
	    missing = required - set(df.columns)
	    if missing:
	        raise ValueError(f"Dataset is missing required columns: {sorted(missing)}")

	    df["text"] = df["text"].map(normalize_text)
	    df["label_name"] = df["label_name"].map(normalize_label)

	    has_known_label = df["label_name"].isin(LABEL2ID)
	    has_text = df["text"].str.len() > 0
	    df = df.loc[has_known_label & has_text].reset_index(drop=True)

	    # Cast only after filtering: a blank or malformed label on a row that is
	    # being discarded anyway must not abort the whole load.
	    df["label"] = df["label"].astype(int)
	    return df


	def prepare_binary_dataset(
	    input_path: str | Path,
	    output_path: str | Path,
	    *,
	    sheet_name: str = "Data",
	    dedupe: bool = True,
	) -> tuple[pd.DataFrame, dict]:
	    """Convert the raw Excel sheet into a cleaned binary-sentiment CSV.

	    For every row of the sheet the function:

	    1. normalizes the ``sentimen`` value with ``normalize_label`` and skips
	       the row when it is ``"Netral"`` or not a key of ``LABEL2ID``;
	    2. takes the first non-empty normalized text among the columns
	       ``perbaikan``, ``textTranslated`` and ``text`` (in that order);
	    3. skips the row when the lowercased text is in ``BAD_TEXT_VALUES``;
	    4. when ``dedupe`` is true, skips the row if ``compact_for_key(text)``
	       was already produced by an earlier kept row.

	    The kept rows are written to ``output_path`` as CSV (parent directories
	    are created as needed).

	    Args:
	        input_path: Excel workbook to read.
	        output_path: Destination CSV path.
	        sheet_name: Worksheet to read from the workbook.
	        dedupe: Whether to drop rows whose compacted text repeats.

	    Returns:
	        A ``(df, summary)`` pair. ``df`` holds the kept rows with columns
	        ``text``, ``label``, ``label_name``, ``kategori``, ``stars`` and
	        ``source_column``. ``summary`` records the input/output file names,
	        the original row count, one counter per drop reason, ``kept_rows``
	        and the per-label counts under ``labels``.
	    """
	    input_path = Path(input_path)
	    output_path = Path(output_path)
	    raw = pd.read_excel(input_path, sheet_name=sheet_name)
	    # NOTE(review): headers are rewritten with normalize_text while the lookups
	    # below use literal names such as "textTranslated" -- this assumes
	    # normalize_text leaves such names unchanged; confirm against .text.
	    raw.columns = [normalize_text(c) for c in raw.columns]

	    # Candidate text columns, in order of preference.
	    text_columns = ("perbaikan", "textTranslated", "text")
	    output_columns = [
	        "text",
	        "label",
	        "label_name",
	        "kategori",
	        "stars",
	        "source_column",
	    ]

	    rows: list[dict] = []
	    seen: set[str] = set()
	    summary = {
	        "input_file": input_path.name,
	        "output_file": str(output_path).replace("\\", "/"),
	        "original_rows": int(len(raw)),
	        "dropped_netral": 0,
	        "dropped_other_label": 0,
	        "dropped_bad_text": 0,
	        "dropped_duplicates": 0,
	    }

	    for _, row in raw.iterrows():
	        label_name = normalize_label(row.get("sentimen"))
	        if label_name == "Netral":
	            summary["dropped_netral"] += 1
	            continue
	        if label_name not in LABEL2ID:
	            summary["dropped_other_label"] += 1
	            continue

	        # Stop at the first column that yields text; if none does, the loop
	        # ends on "text" with an empty value, which is rejected just below
	        # because "" is in BAD_TEXT_VALUES.
	        for source_column in text_columns:
	            text = normalize_text(row.get(source_column))
	            if text:
	                break

	        if text.lower() in BAD_TEXT_VALUES:
	            summary["dropped_bad_text"] += 1
	            continue

	        if dedupe:
	            key = compact_for_key(text)
	            if key in seen:
	                summary["dropped_duplicates"] += 1
	                continue
	            seen.add(key)

	        rows.append(
	            {
	                "text": text,
	                "label": LABEL2ID[label_name],
	                "label_name": label_name,
	                "kategori": normalize_text(row.get("kategori")),
	                "stars": normalize_text(row.get("stars")),
	                "source_column": source_column,
	            }
	        )

	    # Explicit columns keep the schema (and the CSV header) intact when every
	    # row was dropped; without them the "label_name" lookup below would raise
	    # KeyError on an empty result.
	    df = pd.DataFrame(rows, columns=output_columns)
	    output_path.parent.mkdir(parents=True, exist_ok=True)
	    df.to_csv(output_path, index=False)
	    summary["kept_rows"] = int(len(df))
	    summary["labels"] = df["label_name"].value_counts().to_dict()
	    return df, summary