amul-ai-eval / scripts /extract_dairy_split.py
bpHigh's picture
HF Space: add charts tab
74e6b83
Raw
History Blame Contribute Delete
1.48 kB
"""Filter BhashaBench-Krishi (English) to dairy/poultry questions only.
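
Rows are kept when their ``subject_domain`` value contains "dairy" or
"poultry" (matched case-insensitively).

Reads:   data/raw/bbk_english.parquet
Writes:  data/processed/bbk_dairy_poultry.parquet
         data/processed/bbk_dairy_poultry.csv (for human inspection)

Usage:
    python scripts/extract_dairy_split.py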
"""
from pathlib import Path
import pandas as pd
# Repository root: two directory levels above this script's resolved location.
ROOT = Path(__file__).resolve().parent.parent

# Input parquet; main() tells the user to run scripts/fetch_bbk.py if it is missing.
SRC = ROOT.joinpath("data", "raw", "bbk_english.parquet")

# Output directory and the two files written into it (same rows, two formats).
OUT_DIR = ROOT.joinpath("data", "processed")
OUT_PARQUET = OUT_DIR.joinpath("bbk_dairy_poultry.parquet")
OUT_CSV = OUT_DIR.joinpath("bbk_dairy_poultry.csv")

# Regex applied to the lower-cased ``subject_domain`` column to select rows.
KEEP_PATTERN = r"dairy|poultry"
def main() -> None:
    """Extract the dairy/poultry subset of the source parquet and report on it.

    Loads ``SRC``, keeps the rows whose lower-cased ``subject_domain`` matches
    ``KEEP_PATTERN``, writes that subset to ``OUT_PARQUET`` and ``OUT_CSV``
    (creating ``OUT_DIR`` if needed), then prints row counts and value counts
    for ``subject_domain`` and ``question_level``.

    Raises:
        SystemExit: if ``SRC`` does not exist.
    """
    if not SRC.exists():
        raise SystemExit(f"Missing {SRC}. Run scripts/fetch_bbk.py first.")

    frame = pd.read_parquet(SRC)

    # Lower-case before matching so the lowercase pattern hits any casing;
    # na=False makes rows with a missing subject_domain count as non-matches.
    domains = frame["subject_domain"].str.lower()
    keep = domains.str.contains(KEEP_PATTERN, regex=True, na=False)
    subset = frame.loc[keep].reset_index(drop=True)

    # Files are written before the summary is printed.
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    subset.to_parquet(OUT_PARQUET, index=False)
    subset.to_csv(OUT_CSV, index=False)

    print(f"Source rows: {len(frame)}")
    print(f"Filtered rows: {len(subset)} (subject_domain matches /{KEEP_PATTERN}/i)")

    breakdowns = (
        ("subject_domain", "subject_domain breakdown:"),
        ("question_level", "question_level breakdown:"),
    )
    for column, heading in breakdowns:
        print()
        print(heading)
        print(subset[column].value_counts().to_string())

    print()
    for written in (OUT_PARQUET, OUT_CSV):
        print(f"Wrote: {written}")


if __name__ == "__main__":
    main()