Spaces:
Sleeping
Sleeping
| """CSV validation and batch export (no Gradio).""" | |
| from __future__ import annotations | |
| import os | |
| import tempfile | |
| from pathlib import Path | |
| from typing import Callable, Optional | |
| import pandas as pd | |
| from inference import classify_batch | |
# Largest CSV, in bytes, that validate_csv_path accepts (200 MiB).
MAX_BYTES = 200 * 1024 * 1024


def validate_csv_path(path: Optional[str]) -> tuple[bool, str]:
    """Check whether *path* is a CSV file this module can process.

    The checks run in order and stop at the first failure: a path was
    given, it is a regular file, its size can be read, the size does not
    exceed ``MAX_BYTES``, the header row parses, and a ``text`` column is
    present.

    Args:
        path: Filesystem path of the candidate CSV, or ``None``.

    Returns:
        ``(True, "")`` when every check passes, otherwise
        ``(False, message)``. The message is the empty string when *path*
        is falsy (``None`` or ``""``).
    """
    if not path:
        # A falsy path is rejected with an empty message.
        return False, ""
    if not os.path.isfile(path):
        return False, "Not a file"
    try:
        size = os.path.getsize(path)
    except OSError as exc:
        print("validate_csv_path size:", exc)
        return False, "Could not read file"
    if size > MAX_BYTES:
        # Build the limit from MAX_BYTES so the message always matches the
        # cap that is actually enforced (it used to say "20 MB" while the
        # cap was 200 MiB).
        limit_mb = MAX_BYTES // (1024 * 1024)
        return False, f"File exceeds {limit_mb} MB (only the text column is used)."
    try:
        # nrows=0 parses only the header row, so no data rows are loaded.
        df = pd.read_csv(path, nrows=0)
    except Exception as exc:
        print("validate_csv_path read_csv:", exc)
        return False, "Could not parse CSV header"
    if "text" not in df.columns:
        return False, 'CSV must include a column named exactly "text".'
    return True, ""
def export_path_for_source(source_path: str) -> str:
    """Return a results-file path inside a newly created temp directory.

    The file name is the stem of *source_path* followed by
    ``_v3_teacher_results.csv``. Each call creates a new directory with
    ``tempfile.mkdtemp`` (prefix ``bill_csv_``); this function does not
    remove it. The file itself is not created here.

    Args:
        source_path: Path of the input CSV; only its stem is used.

    Returns:
        The output path as a string.
    """
    result_filename = Path(source_path).stem + "_v3_teacher_results.csv"
    scratch_dir = Path(tempfile.mkdtemp(prefix="bill_csv_"))
    return str(scratch_dir.joinpath(result_filename))
def run_csv_inference(
    csv_path: str,
    progress_cb: Optional[Callable[[int, int], None]] = None,
) -> str:
    """Classify the ``text`` column of a CSV and write an augmented copy.

    The whole CSV is loaded, every ``text`` cell is turned into a string
    (missing values become ``""``), and the strings are sent to
    ``classify_batch`` in batches of 32. Each returned row is unpacked
    into three values, stored in the new columns ``v3_teacher_results``,
    ``v3_teacher_label`` and ``v3_teacher_prob``. The frame is then
    written, without the index, to the path from
    ``export_path_for_source``.

    Args:
        csv_path: Path of the input CSV; it must contain a ``text`` column.
        progress_cb: Optional callback passed unchanged to
            ``classify_batch``. NOTE(review): the two int arguments are
            presumably (done, total) -- confirm against ``classify_batch``.

    Returns:
        Path of the CSV file that was written.
    """
    frame = pd.read_csv(csv_path)

    # Normalise every cell to a string; missing values become "".
    cleaned = []
    for cell in frame["text"].tolist():
        cleaned.append("" if pd.isna(cell) else str(cell))

    predictions = classify_batch(cleaned, batch_size=32, progress_cb=progress_cb)

    if predictions:
        # Transpose the list of 3-value rows into three parallel sequences.
        results, labels, probs = zip(*predictions)
        per_column = (results, labels, probs)
    else:
        per_column = ((), (), ())

    column_names = ("v3_teacher_results", "v3_teacher_label", "v3_teacher_prob")
    for column, values in zip(column_names, per_column):
        frame[column] = list(values)

    destination = export_path_for_source(csv_path)
    frame.to_csv(destination, index=False)
    return destination