davidtran999 committed on
Commit
d23adb1
·
verified ·
1 Parent(s): 2e60707

Upload backend/scripts/load_legal_documents.py with huggingface_hub

Browse files
backend/scripts/load_legal_documents.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Load PDF/DOCX legal documents into the database with full text + sections.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import argparse
9
+ import json
10
+ import os
11
+ import sys
12
+ from pathlib import Path
13
+ from typing import Any, Dict, List
14
+
15
# Repository root: three levels up from this file (backend/scripts/<this file>).
PROJECT_ROOT = Path(__file__).resolve().parents[2]
BACKEND_DIR = PROJECT_ROOT.joinpath("backend")

# Put only BACKEND_DIR on sys.path (not its hue_portal subdirectory) so that
# the hue_portal package, which lives in backend/hue_portal, is importable.
_backend_path = str(BACKEND_DIR)
if _backend_path not in sys.path:
    sys.path.insert(0, _backend_path)

# setdefault keeps any DJANGO_SETTINGS_MODULE already set in the environment.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "hue_portal.hue_portal.settings")

# Django is imported only after the path and settings module are configured.
import django

django.setup()
26
+
27
+ from django.core.management import call_command # noqa: E402
28
+
29
+
30
def parse_manifest(path: Path) -> List[Dict[str, Any]]:
    """Read the JSON manifest at *path* and return its document entries.

    The manifest must be a JSON array of objects. Every object must contain
    the ``code`` and ``source_file`` keys, because ``ingest_document`` reads
    both unconditionally. Validating them here makes a malformed manifest fail
    before any document is processed, instead of part-way through a load.

    Args:
        path: Location of the UTF-8 encoded JSON manifest.

    Returns:
        The decoded list of entry dictionaries, unchanged.

    Raises:
        ValueError: If the top-level value is not a list, if an entry is not
            an object, or if an entry lacks a required key. Invalid JSON
            raises ``json.JSONDecodeError``, which is a ``ValueError`` subclass.
        OSError: If the manifest file cannot be read.
    """
    data = json.loads(path.read_text(encoding="utf-8"))
    if not isinstance(data, list):
        raise ValueError("Manifest must be a list of document entries.")

    required_keys = ("code", "source_file")
    for index, entry in enumerate(data):
        if not isinstance(entry, dict):
            raise ValueError(
                f"Manifest entry #{index} must be an object, "
                f"got {type(entry).__name__}."
            )
        missing = [key for key in required_keys if key not in entry]
        if missing:
            raise ValueError(
                f"Manifest entry #{index} is missing required key(s): "
                f"{', '.join(missing)}."
            )
    return data
35
+
36
+
37
def ingest_document(root: Path, entry: Dict[str, Any], dry_run: bool = False) -> None:
    """Load one manifest entry through the ``load_legal_document`` command.

    Args:
        root: Directory against which ``entry["source_file"]`` is resolved.
        entry: Manifest entry. ``source_file`` and ``code`` are required;
            ``title``, ``doc_type``, ``summary``, ``issued_by``, ``issued_at``,
            ``source_url`` and ``metadata`` are optional.
        dry_run: When true, only check that the source file exists and print
            what would be ingested; the management command is not called.

    Raises:
        FileNotFoundError: If the resolved source path is not an existing
            regular file.
        KeyError: If ``source_file`` or ``code`` is missing from *entry*.
    """
    source_file = root / entry["source_file"]
    # is_file() rather than exists(): a directory at this path would pass an
    # existence check and then be handed to the command as if it were a document.
    if not source_file.is_file():
        raise FileNotFoundError(source_file)

    code = entry["code"]
    if dry_run:
        print(f"▶ (dry-run) Would ingest {code} from {source_file}")
        return

    # Keyword arguments forwarded to the management command.
    # NOTE(review): how the command treats a None title/issued_at is not
    # visible from this script; confirm against the command implementation.
    args = {
        "file": str(source_file),
        "code": code,
        "title": entry.get("title"),
        "doc_type": entry.get("doc_type", "other"),
        "summary": entry.get("summary", ""),
        "issued_by": entry.get("issued_by", ""),
        "issued_at": entry.get("issued_at"),
        "source_url": entry.get("source_url", ""),
        # Metadata is passed as a JSON string, not as a dict.
        "metadata": json.dumps(entry.get("metadata", {})),
    }
    print(f"▶ Loading {code} from {source_file}")
    call_command("load_legal_document", **args)
59
+
60
+
61
def main():
    """Command-line entry point: ingest every document listed in a manifest.

    Options:
        --manifest  JSON manifest path; defaults to
                    ``legal_documents_manifest.json`` next to this script.
        --root      Base directory for relative ``source_file`` paths;
                    defaults to ``PROJECT_ROOT``.
        --dry-run   Passed through to ``ingest_document`` as ``dry_run``.

    Entries are processed in manifest order; the first exception raised by
    parsing or ingestion propagates and stops the run.
    """
    default_manifest = Path(__file__).with_name("legal_documents_manifest.json")

    cli = argparse.ArgumentParser(description="Load legal documents into DB.")
    cli.add_argument(
        "--manifest",
        type=Path,
        default=default_manifest,
        help="Path to JSON manifest describing documents.",
    )
    cli.add_argument(
        "--root",
        type=Path,
        default=PROJECT_ROOT,
        help="Root directory for relative source_file paths.",
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Parse files without DB writes.",
    )
    options = cli.parse_args()

    for document in parse_manifest(options.manifest):
        ingest_document(options.root, document, dry_run=options.dry_run)
81
+
82
+
83
+ if __name__ == "__main__":
84
+ main()
85
+