Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Load PDF/DOCX legal documents into the database with full text + sections. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import os | |
| import sys | |
| from pathlib import Path | |
| from typing import Any, Dict, List | |
| PROJECT_ROOT = Path(__file__).resolve().parents[2] | |
| BACKEND_DIR = PROJECT_ROOT / "backend" | |
| # Only add BACKEND_DIR to sys.path (not hue_portal subdirectory) | |
| # Django needs to find hue_portal package (which is in backend/hue_portal) | |
| if str(BACKEND_DIR) not in sys.path: | |
| sys.path.insert(0, str(BACKEND_DIR)) | |
| os.environ.setdefault("DJANGO_SETTINGS_MODULE", "hue_portal.hue_portal.settings") | |
| import django | |
| django.setup() | |
| from django.core.management import call_command # noqa: E402 | |
| def parse_manifest(path: Path) -> List[Dict[str, Any]]: | |
| data = json.loads(path.read_text(encoding="utf-8")) | |
| if not isinstance(data, list): | |
| raise ValueError("Manifest must be a list of document entries.") | |
| return data | |
| def ingest_document(root: Path, entry: Dict[str, Any], dry_run: bool = False) -> None: | |
| source_file = root / entry["source_file"] | |
| if not source_file.exists(): | |
| raise FileNotFoundError(source_file) | |
| if dry_run: | |
| print(f"▶ (dry-run) Would ingest {entry['code']} from {source_file}") | |
| return | |
| args = { | |
| "file": str(source_file), | |
| "code": entry["code"], | |
| "title": entry.get("title"), | |
| "doc_type": entry.get("doc_type", "other"), | |
| "summary": entry.get("summary", ""), | |
| "issued_by": entry.get("issued_by", ""), | |
| "issued_at": entry.get("issued_at"), | |
| "source_url": entry.get("source_url", ""), | |
| "metadata": json.dumps(entry.get("metadata", {})), | |
| } | |
| print(f"▶ Loading {entry['code']} from {source_file}") | |
| call_command("load_legal_document", **args) | |
| def main(): | |
| parser = argparse.ArgumentParser(description="Load legal documents into DB.") | |
| parser.add_argument( | |
| "--manifest", | |
| type=Path, | |
| default=Path(__file__).with_name("legal_documents_manifest.json"), | |
| help="Path to JSON manifest describing documents.", | |
| ) | |
| parser.add_argument( | |
| "--root", | |
| type=Path, | |
| default=PROJECT_ROOT, | |
| help="Root directory for relative source_file paths.", | |
| ) | |
| parser.add_argument("--dry-run", action="store_true", help="Parse files without DB writes.") | |
| args = parser.parse_args() | |
| manifest = parse_manifest(args.manifest) | |
| for entry in manifest: | |
| ingest_document(args.root, entry, dry_run=args.dry_run) | |
| if __name__ == "__main__": | |
| main() | |