Spaces:
Sleeping
Sleeping
"""Filter BhashaBench-Krishi (English) to dairy/poultry questions only.

Reads:  data/raw/bbk_english.parquet
Writes: data/processed/bbk_dairy_poultry.parquet
        data/processed/bbk_dairy_poultry.csv  (for human inspection)

Usage:
    python scripts/extract_dairy_split.py
"""
| from pathlib import Path | |
| import pandas as pd | |
# Repository root: this file's directory is expected to sit one level below it.
ROOT = Path(__file__).resolve().parent.parent

# Input table to be filtered.
SRC = ROOT.joinpath("data", "raw", "bbk_english.parquet")

# Output directory and the two files written into it.
OUT_DIR = ROOT.joinpath("data", "processed")
OUT_PARQUET = OUT_DIR.joinpath("bbk_dairy_poultry.parquet")
OUT_CSV = OUT_DIR.joinpath("bbk_dairy_poultry.csv")

# Regex alternation matched against the lower-cased ``subject_domain`` values.
KEEP_PATTERN = r"dairy|poultry"
def main() -> None:
    """Extract the dairy/poultry subset of the source table and report on it.

    Loads the parquet file at ``SRC``, keeps the rows whose
    ``subject_domain`` matches ``KEEP_PATTERN`` (the values are lower-cased
    first, and missing values never match), writes the subset to
    ``OUT_PARQUET`` and ``OUT_CSV``, and prints the row counts together with
    per-``subject_domain`` and per-``question_level`` breakdowns.

    Raises:
        SystemExit: If ``SRC`` does not exist, or if the loaded table lacks
            a column this script reads.
    """
    if not SRC.exists():
        raise SystemExit(f"Missing {SRC}. Run scripts/fetch_bbk.py first.")

    df = pd.read_parquet(SRC)

    # Check every column used below *before* anything is written, so a schema
    # mismatch cannot leave fresh output files behind and then die with a
    # bare KeyError halfway through the report.
    missing = [
        col for col in ("subject_domain", "question_level")
        if col not in df.columns
    ]
    if missing:
        raise SystemExit(
            f"{SRC} is missing required column(s): {', '.join(missing)}"
        )

    # Lower-casing makes the match case-insensitive; na=False drops rows
    # whose subject_domain is missing instead of propagating NaN into the mask.
    mask = df["subject_domain"].str.lower().str.contains(KEEP_PATTERN, regex=True, na=False)
    sub = df.loc[mask].reset_index(drop=True)

    OUT_DIR.mkdir(parents=True, exist_ok=True)
    sub.to_parquet(OUT_PARQUET, index=False)
    sub.to_csv(OUT_CSV, index=False)

    print(f"Source rows: {len(df)}")
    print(f"Filtered rows: {len(sub)} (subject_domain matches /{KEEP_PATTERN}/i)")
    print()
    print("subject_domain breakdown:")
    print(sub["subject_domain"].value_counts().to_string())
    print()
    print("question_level breakdown:")
    print(sub["question_level"].value_counts().to_string())
    print()
    print(f"Wrote: {OUT_PARQUET}")
    print(f"Wrote: {OUT_CSV}")
# Script entry point: run the extraction only when this file is executed
# directly, never as a side effect of importing it.
if __name__ == "__main__":
    main()