davidtran999 committed on
Commit
e1ecd91
·
verified ·
1 Parent(s): 99f4190

Upload backend/core/services/legal_ingestion.py with huggingface_hub

Browse files
backend/core/services/legal_ingestion.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utilities to ingest uploaded legal documents into persistent storage.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import hashlib
8
+ from dataclasses import dataclass
9
+ from datetime import datetime, date
10
+ from io import BytesIO
11
+ from typing import BinaryIO, Dict, Optional
12
+ from pathlib import Path
13
+ import re
14
+
15
+ from django.conf import settings
16
+ from django.core.files.base import ContentFile
17
+ from django.db import transaction
18
+ from django.utils import timezone
19
+
20
+ from hue_portal.core.models import (
21
+ LegalDocument,
22
+ LegalSection,
23
+ LegalDocumentImage,
24
+ IngestionJob,
25
+ )
26
+ from hue_portal.core.etl.legal_document_loader import load_legal_document
27
+ from hue_portal.core.tasks import process_ingestion_job
28
+
29
+
30
@dataclass
class LegalIngestionResult:
    """Outcome of ingesting a single uploaded legal document."""

    # The persisted document row (created or updated).
    document: LegalDocument
    # True when the document row was newly created, False when an existing
    # row with the same code was updated in place.
    created: bool
    # Number of LegalSection rows written for this document.
    sections_count: int
    # Number of LegalDocumentImage rows written for this document.
    images_count: int
36
+
37
+
38
+ def _parse_date(value: Optional[str | date]) -> Optional[date]:
39
+ if isinstance(value, date):
40
+ return value
41
+ if not value:
42
+ return None
43
+ for fmt in ("%Y-%m-%d", "%d/%m/%Y"):
44
+ try:
45
+ return datetime.strptime(value, fmt).date()
46
+ except ValueError:
47
+ continue
48
+ return None
49
+
50
+
51
+ def _sha256(data: bytes) -> str:
52
+ digest = hashlib.sha256()
53
+ digest.update(data)
54
+ return digest.hexdigest()
55
+
56
+
57
+ def _normalize_text(text: str) -> str:
58
+ cleaned = re.sub(r"\s+", "", text or "")
59
+ return cleaned.lower()
60
+
61
+
62
# Vietnamese keywords used to infer a document type from the head of the
# extracted text; scanned in declaration order, first match wins.
DOC_TYPE_KEYWORDS = {
    "decision": ["quyết định"],
    "circular": ["thông tư"],
    "guideline": ["hướng dẫn"],
    "plan": ["kế hoạch"],
}
68
+
69
+
70
def _auto_fill_metadata(
    *, text: str, title: str, issued_by: str, issued_at: Optional[date], doc_type: str
) -> tuple[str, str, Optional[date], str]:
    """Best-effort extraction of missing metadata from the document text.

    Only blank/placeholder fields are filled in; caller-supplied values win.
    Only the first 2000 characters of *text* are inspected.

    Returns the (possibly updated) ``(title, issued_by, issued_at, doc_type)``.
    """
    head = (text or "")[:2000]

    # Issuing body: a ministry ("BỘ ...") or a people's committee
    # ("ỦY BAN NHÂN DÂN ..."), whichever appears first.
    if not issued_by:
        m = re.search(r"(BỘ\s+[A-ZÂĂÊÔƠƯ\s]+|ỦY BAN\s+NHÂN DÂN\s+[^\n]+)", head, re.IGNORECASE)
        if m is not None:
            issued_by = m.group(0).strip()

    # Issue date: numeric "d/m/yyyy" (slashes or dashes) takes precedence;
    # otherwise fall back to the spelled-out "ngày D tháng M năm Y" form.
    if not issued_at:
        m = re.search(r"(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})", head) or re.search(
            r"ngày\s+(\d{1,2})\s+tháng\s+(\d{1,2})\s+năm\s+(\d{4})",
            head,
            re.IGNORECASE,
        )
        if m is not None:
            day, month, year = m.groups()
            issued_at = _parse_date(f"{year}-{int(month):02d}-{int(day):02d}")

    # Document type: keyword scan over DOC_TYPE_KEYWORDS, first hit wins.
    if doc_type == "other":
        lowered = head.lower()
        hit = next(
            (
                dtype
                for dtype, words in DOC_TYPE_KEYWORDS.items()
                if any(word in lowered for word in words)
            ),
            None,
        )
        if hit is not None:
            doc_type = hit

    # Title: refill when empty, or when it is just the bare type keyword
    # (i.e. a placeholder rather than a real title).
    placeholder = DOC_TYPE_KEYWORDS.get(doc_type, [title])[0] if doc_type != "other" else ""
    if not title or title == placeholder:
        m = re.search(r"(QUYẾT ĐỊNH|THÔNG TƯ|HƯỚNG DẪN|KẾ HOẠCH)[^\n]+", head, re.IGNORECASE)
        if m is not None:
            title = m.group(0).strip().title()

    return title, issued_by, issued_at, doc_type
109
+
110
+
111
def ingest_uploaded_document(
    *,
    file_obj: BinaryIO,
    filename: str,
    metadata: Dict,
) -> LegalIngestionResult:
    """
    Ingest an uploaded PDF/DOCX file, storing the raw file, parsed sections,
    and extracted images.

    Args:
        file_obj: Binary file-like object positioned at start.
        filename: Original filename; used as the stored file name under the
            document's code folder.
        metadata: dict containing code, title, doc_type, summary, issued_by,
            issued_at, source_url, extra_metadata.

    Returns:
        LegalIngestionResult describing the persisted document.

    Raises:
        ValueError: if the document code is missing, or the extracted content
            duplicates an existing document with a different code.
    """
    # Guard against both a missing key and an explicit None value.
    code = (metadata.get("code") or "").strip()
    if not code:
        raise ValueError("Document code is required.")

    title = metadata.get("title") or code
    doc_type = metadata.get("doc_type", "other")
    issued_at = _parse_date(metadata.get("issued_at"))
    summary = metadata.get("summary", "")
    issued_by = metadata.get("issued_by", "")
    source_url = metadata.get("source_url", "")
    extra_metadata = metadata.get("metadata") or {}

    # Read the whole upload once; rewind so callers can reuse the handle.
    file_bytes = file_obj.read()
    if hasattr(file_obj, "seek"):
        file_obj.seek(0)
    checksum = _sha256(file_bytes)
    mime_type = metadata.get("mime_type") or getattr(file_obj, "content_type", "")
    size = len(file_bytes)

    extracted = load_legal_document(BytesIO(file_bytes), filename=filename)
    title, issued_by, issued_at, doc_type = _auto_fill_metadata(
        text=extracted.text, title=title, issued_by=issued_by, issued_at=issued_at, doc_type=doc_type
    )
    # Content-based (whitespace/case-insensitive) checksum for duplicate detection.
    normalized_text = _normalize_text(extracted.text)
    content_checksum = _sha256(normalized_text.encode("utf-8"))

    duplicate = (
        LegalDocument.objects.filter(content_checksum=content_checksum)
        .exclude(code=code)
        .first()
    )
    if duplicate:
        raise ValueError(f"Nội dung trùng với văn bản hiện có: {duplicate.code}")

    with transaction.atomic():
        doc, created = LegalDocument.objects.get_or_create(
            code=code,
            defaults={
                "title": title,
                "doc_type": doc_type,
                "summary": summary,
                "issued_by": issued_by,
                "issued_at": issued_at,
                "source_url": source_url,
                "metadata": extra_metadata,
            },
        )

        # Update metadata even when the document already existed (keep latest info).
        doc.title = title
        doc.doc_type = doc_type
        doc.summary = summary
        doc.issued_by = issued_by
        doc.issued_at = issued_at
        doc.source_url = source_url
        doc.metadata = extra_metadata
        doc.page_count = extracted.page_count
        doc.raw_text = extracted.text
        doc.raw_text_ocr = extracted.ocr_text or ""
        doc.file_checksum = checksum
        doc.content_checksum = content_checksum
        doc.file_size = size
        doc.mime_type = mime_type
        doc.original_filename = filename
        doc.updated_at = timezone.now()

        # Save the raw binary under "<code>/<original filename>".
        # (Previously the name was hard-coded to a "(unknown)" placeholder,
        # losing the uploaded file's real name in storage.)
        content = ContentFile(file_bytes)
        storage_name = f"{code}/{filename}"
        doc.uploaded_file.save(storage_name, content, save=False)
        doc.source_file = doc.uploaded_file.name
        doc.save()

        # Replace sections wholesale; the order index restarts at 1.
        doc.sections.all().delete()
        sections = []
        for idx, section in enumerate(extracted.sections, start=1):
            sections.append(
                LegalSection(
                    document=doc,
                    section_code=section.code,
                    section_title=section.title,
                    level=section.level,
                    order=idx,
                    content=section.content,
                    excerpt=section.content[:400],
                    page_start=section.page_start,
                    page_end=section.page_end,
                    is_ocr=section.is_ocr,
                    metadata=section.metadata or {},
                )
            )
        LegalSection.objects.bulk_create(sections, batch_size=200)

        # Replace extracted images wholesale.
        doc.images.all().delete()
        images = []
        for idx, image in enumerate(extracted.images, start=1):
            image_content = ContentFile(image.data)
            image_name = f"{code}/img_{idx}.{image.extension}"
            img_instance = LegalDocumentImage(
                document=doc,
                page_number=image.page_number,
                description=image.description,
                width=image.width,
                height=image.height,
                checksum=_sha256(image.data),
            )
            img_instance.image.save(image_name, image_content, save=False)
            images.append(img_instance)
        LegalDocumentImage.objects.bulk_create(images, batch_size=100)

    return LegalIngestionResult(
        document=doc,
        created=created,
        sections_count=len(sections),
        images_count=len(images),
    )
243
+
244
+
245
def enqueue_ingestion_job(*, file_obj, filename: str, metadata: Dict) -> IngestionJob:
    """
    Persist the uploaded file to a temporary job folder and enqueue Celery
    processing.

    Args:
        file_obj: Uploaded file (Django UploadedFile or plain binary file-like).
        filename: Client-supplied filename; only its base name is used on disk.
        metadata: Arbitrary job metadata; its "code" entry is copied to the job.

    Returns:
        The created IngestionJob with ``storage_path`` pointing at the
        persisted temporary file.
    """
    job = IngestionJob.objects.create(
        code=metadata.get("code", ""),
        filename=filename,
        metadata=metadata,
        status=IngestionJob.STATUS_PENDING,
    )

    temp_dir = Path(settings.MEDIA_ROOT) / "ingestion_jobs" / str(job.id)
    temp_dir.mkdir(parents=True, exist_ok=True)
    # Security: strip any directory components from the client-supplied
    # filename so it cannot escape the job folder (e.g. "../../etc/passwd").
    safe_name = Path(filename).name or "upload.bin"
    temp_path = temp_dir / safe_name

    if hasattr(file_obj, "seek"):
        file_obj.seek(0)
    with temp_path.open("wb") as dest:
        if hasattr(file_obj, "chunks"):
            # Django UploadedFile: stream in chunks to bound memory use.
            for chunk in file_obj.chunks():
                dest.write(chunk)
        else:
            dest.write(file_obj.read())

    job.storage_path = str(temp_path)
    job.save(update_fields=["storage_path"])
    process_ingestion_job.delay(str(job.id))
    return job
276
+