"""Filter BhashaBench-Krishi (English) to dairy/poultry questions only. Reads: data/raw/bbk_english.parquet Writes: data/processed/bbk_dairy_poultry.parquet data/processed/bbk_dairy_poultry.csv (for human inspection) Usage: python scripts/extract_dairy_split.py """ from pathlib import Path import pandas as pd ROOT = Path(__file__).resolve().parent.parent SRC = ROOT / "data" / "raw" / "bbk_english.parquet" OUT_DIR = ROOT / "data" / "processed" OUT_PARQUET = OUT_DIR / "bbk_dairy_poultry.parquet" OUT_CSV = OUT_DIR / "bbk_dairy_poultry.csv" KEEP_PATTERN = r"dairy|poultry" def main() -> None: if not SRC.exists(): raise SystemExit(f"Missing {SRC}. Run scripts/fetch_bbk.py first.") df = pd.read_parquet(SRC) mask = df["subject_domain"].str.lower().str.contains(KEEP_PATTERN, regex=True, na=False) sub = df.loc[mask].reset_index(drop=True) OUT_DIR.mkdir(parents=True, exist_ok=True) sub.to_parquet(OUT_PARQUET, index=False) sub.to_csv(OUT_CSV, index=False) print(f"Source rows: {len(df)}") print(f"Filtered rows: {len(sub)} (subject_domain matches /{KEEP_PATTERN}/i)") print() print("subject_domain breakdown:") print(sub["subject_domain"].value_counts().to_string()) print() print("question_level breakdown:") print(sub["question_level"].value_counts().to_string()) print() print(f"Wrote: {OUT_PARQUET}") print(f"Wrote: {OUT_CSV}") if __name__ == "__main__": main()