# Spaces: Sleeping
"""
Batch-process every document in the DataRef folder.

Runs `python -m guichetoi.inference` in a subprocess for each file to avoid
import issues.
"""
import json
import logging
import shutil
import subprocess
import sys
from collections import defaultdict
from pathlib import Path
# Configure root logging once for the script: timestamp, padded level, message.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-7s %(message)s",
)
# Module-wide logger used by every step of the batch run.
log = logging.getLogger("batch_process")
# Lower-cased file suffixes that count as processable documents.
_DOCUMENT_SUFFIXES = frozenset(
    {".png", ".jpg", ".jpeg", ".pdf", ".bmp", ".tif", ".tiff"}
)

# Wall-clock budget, in seconds, for one inference subprocess.
_INFERENCE_TIMEOUT_S = 300

# Extracted fields echoed to the log after each successfully parsed document.
_KEY_FIELDS = (
    "Reference_Urbanisme",
    "DLPI",
    "cabinet_conseil",
    "nb_log_totale",
    "Nb_log_pro",
    "Nb_log_res",
)


def _find_documents(root):
    """Return the sorted regular files under *root* with a supported suffix."""
    # rglob("*") also yields directories; a folder named "scans.pdf" must not
    # be handed to the inference CLI, hence the is_file() check.
    return sorted(
        path
        for path in root.rglob("*")
        if path.is_file() and path.suffix.lower() in _DOCUMENT_SUFFIXES
    )


def _move_file(src, dest):
    """Move *src* to *dest*, overwriting *dest* if it already exists."""
    try:
        src.replace(dest)
    except OSError:
        # A plain rename can fail (e.g. across filesystems): copy, then make
        # a best-effort attempt to remove the original.
        shutil.copy(src, dest)
        try:
            src.unlink()
        except OSError as exc:
            log.warning(" Could not remove %s after copying: %s", src, exc)


def _process_document(file_path, processed_dir):
    """Run the inference CLI on one document and return its parsed result.

    The CLI output is read from ``outputs/<stem>_result.json``; that file is
    moved into *processed_dir* before being parsed.

    Returns:
        The decoded JSON object (a ``dict``), or ``None`` after logging the
        reason when the CLI fails, produces no output file, or produces
        output that is not a JSON object.

    Raises:
        subprocess.TimeoutExpired: the CLI exceeded ``_INFERENCE_TIMEOUT_S``.
    """
    # Use the running interpreter so the child sees the same environment
    # (and therefore the same installed `guichetoi` package) as this script.
    cmd = [
        sys.executable or "python",
        "-m",
        "guichetoi.inference",
        "--image",
        str(file_path),
        "--device",
        "cpu",
    ]
    completed = subprocess.run(
        cmd, capture_output=True, text=True, timeout=_INFERENCE_TIMEOUT_S
    )
    if completed.returncode != 0:
        log.error(
            " ERROR: CLI returned code %s: %s",
            completed.returncode,
            completed.stderr[:200],
        )
        return None

    # NOTE: the result name is derived from the stem only, so two inputs that
    # share a stem (e.g. "a.png" and "sub/a.pdf") map to the same result file
    # and the later one overwrites the earlier one in processed_dir.
    result_file = Path("outputs") / f"{file_path.stem}_result.json"
    if not result_file.exists():
        log.error(" ERROR: Output file not created: %s", result_file)
        return None

    dest_file = processed_dir / result_file.name
    _move_file(result_file, dest_file)

    try:
        with open(dest_file, "r", encoding="utf-8") as f:
            output_data = json.load(f)
    except json.JSONDecodeError as exc:
        log.error(" ERROR: Failed to parse JSON output: %s", exc)
        return None

    if not isinstance(output_data, dict):
        log.error(
            " ERROR: Unexpected JSON output (expected an object, got %s)",
            type(output_data).__name__,
        )
        return None
    return output_data


def _log_key_fields(fields):
    """Log ``name=value`` for each key field present in *fields*."""
    if not isinstance(fields, dict):
        return
    parts = []
    for name in _KEY_FIELDS:
        if name not in fields:
            continue
        entry = fields[name]
        # Tolerate entries that are not {"value": ...} mappings instead of
        # failing a document that has already been processed successfully.
        value = entry.get("value", "?") if isinstance(entry, dict) else "?"
        parts.append(f"{name}={value}")
    if parts:
        log.info(" → Extracted: %s", ", ".join(parts))


def main():
    """Process every document under ``DataRef`` and write a batch summary.

    Each supported file is passed to the inference CLI in a subprocess. The
    per-document JSON results are collected in ``processed_dataref`` and an
    aggregate ``batch_dataref_results.json`` (count, statistics, all results)
    is written to the same folder. Failures on individual documents are
    logged and counted under ``errors``; they never abort the batch.

    Returns:
        None. When the ``DataRef`` directory is missing, an error is logged
        and nothing is written.
    """
    dataref_dir = Path("DataRef")
    if not dataref_dir.is_dir():
        log.error("DataRef directory not found: %s", dataref_dir)
        return

    files = _find_documents(dataref_dir)
    log.info("Found %d document(s) in DataRef", len(files))

    results = []
    stats = defaultdict(int)

    # Destination for the per-document JSON results and the batch summary.
    processed_dir = Path("processed_dataref")
    processed_dir.mkdir(parents=True, exist_ok=True)

    for index, file_path in enumerate(files, 1):
        rel_path = file_path.relative_to(dataref_dir)
        log.info("[%d/%d] Processing: %s", index, len(files), rel_path)

        try:
            output_data = _process_document(file_path, processed_dir)
        except subprocess.TimeoutExpired:
            log.error(
                " ERROR: Processing timed out (>%ds)", _INFERENCE_TIMEOUT_S
            )
            output_data = None
        except Exception as exc:
            # Batch boundary: one bad document must not stop the others.
            log.error(" ERROR: %s", exc)
            output_data = None

        if output_data is None:
            stats["errors"] += 1
            continue

        # From here on the document counts as processed; nothing below may
        # raise, so a document is never counted as both a success and an error.
        results.append(output_data)
        stats["total"] += 1
        if "doc_class" in output_data:
            stats[f"class_{output_data['doc_class']}"] += 1
        if output_data.get("fields"):
            stats["with_fields"] += 1
        _log_key_fields(output_data.get("fields"))

    # Save the aggregate results next to the per-document files.
    output_file = processed_dir / "batch_dataref_results.json"
    summary = {
        "total_processed": len(results),
        "statistics": dict(stats),
        "results": results,
    }
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    separator = "=" * 60
    log.info("\n%s", separator)
    log.info("Batch processing complete!")
    log.info(" Total: %s", stats["total"])
    log.info(" With fields extracted: %s", stats["with_fields"])
    log.info(" Errors: %s", stats["errors"])
    log.info(" Results saved to: %s", output_file)
    log.info("%s", separator)
# Run the batch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()