Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import sys | |
| from pathlib import Path | |
| sys.path.append(str(Path(__file__).resolve().parents[1] / "src")) | |
| from matcha_sentiment.data import prepare_binary_dataset | |
# Project root: the directory one level above the folder containing this script.
ROOT = Path(__file__).resolve().parents[1]


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the dataset preparation script.

    Args:
        argv: Argument strings to parse, without the program name. When
            ``None`` (the default) argparse reads ``sys.argv[1:]``, which is
            the behaviour of a plain ``parse_args()`` call. Passing an
            explicit list lets the parser be driven programmatically.

    Returns:
        A namespace with the attributes ``input``, ``output``, ``sheet`` and
        ``keep_duplicates``.
    """
    parser = argparse.ArgumentParser(
        description="Prepare binary Indonesian sentiment dataset.",
    )
    parser.add_argument(
        "--input",
        default=str(ROOT / "Matchaya Gandaria City + IKUYO (done; need recheck).xlsx"),
        help="Input Excel file.",
    )
    parser.add_argument(
        "--output",
        default=str(ROOT / "data" / "processed" / "matcha_sentiment_binary.csv"),
        help="Output CSV path.",
    )
    parser.add_argument("--sheet", default="Data", help="Excel sheet name.")
    # Opt-in flag: the attribute is False unless the switch is given.
    parser.add_argument(
        "--keep-duplicates",
        action="store_true",
        help="Keep duplicate cleaned texts. Default removes duplicates.",
    )
    return parser.parse_args(argv)
def main() -> None:
    """Run the dataset preparation end to end.

    Parses the CLI options, hands them to ``prepare_binary_dataset``, writes
    the returned summary as ``summary.json`` next to the output file, and
    echoes the summary and the number of saved rows to stdout.
    """
    options = parse_args()
    remove_duplicates = not options.keep_duplicates

    frame, stats = prepare_binary_dataset(
        options.input,
        options.output,
        sheet_name=options.sheet,
        dedupe=remove_duplicates,
    )

    # The summary always lands in the output file's directory under a fixed name.
    summary_file = Path(options.output).with_name("summary.json")
    # Serialize once; the same text goes to disk and to stdout.
    rendered = json.dumps(stats, indent=2, ensure_ascii=False)
    summary_file.write_text(rendered, encoding="utf-8")

    print(rendered)
    print(f"Saved {len(frame)} rows to {options.output}")


if __name__ == "__main__":
    main()