fix metadata
Browse files- app/law_document_chunker.py +79 -72
app/law_document_chunker.py
CHANGED
|
@@ -263,20 +263,15 @@ class LawDocumentChunker:
|
|
| 263 |
|
| 264 |
def _process_document_recursive(self, content: str, vanbanid: int,
|
| 265 |
document_title: str) -> List[ChunkMetadata]:
|
| 266 |
-
"""Xử lý văn bản theo cấu trúc phân cấp."""
|
| 267 |
lines = content.split('\n')
|
| 268 |
chunks = []
|
| 269 |
-
|
| 270 |
-
# Stack để theo dõi các chunks theo thứ tự xuất hiện
|
| 271 |
-
# Mỗi item là (chunk_id, level, level_value, content)
|
| 272 |
-
chunk_stack = []
|
| 273 |
-
|
| 274 |
current_chunk_content = ""
|
| 275 |
-
current_level =
|
| 276 |
current_level_value = None
|
| 277 |
current_parent = None
|
| 278 |
-
|
| 279 |
-
# Định nghĩa thứ tự ưu tiên của các level (số càng nhỏ càng cao)
|
| 280 |
level_priority = {
|
| 281 |
"PHAN": 1,
|
| 282 |
"PHU_LUC": 1,
|
|
@@ -287,54 +282,91 @@ class LawDocumentChunker:
|
|
| 287 |
"DIEM": 6,
|
| 288 |
"CONTENT": 7
|
| 289 |
}
|
| 290 |
-
|
| 291 |
for line in lines:
|
| 292 |
-
level, level_value,
|
| 293 |
-
|
| 294 |
-
#
|
| 295 |
-
if level
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
|
|
|
|
|
|
|
|
|
| 301 |
if current_chunk_content.strip():
|
| 302 |
metadata = self._create_chunk_metadata(
|
| 303 |
current_chunk_content.strip(),
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
vanbanid,
|
| 308 |
document_title,
|
| 309 |
chunk_stack
|
| 310 |
)
|
| 311 |
chunks.append(metadata)
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
logger.debug(f"[CHUNKER] Created chunk: {metadata.id[:8]}... Level: {current_level}, Parent: {current_parent}")
|
| 316 |
-
|
| 317 |
-
# Tìm parent cho level mới TRƯỚC KHI tạo chunk mới
|
| 318 |
-
current_parent = self._find_parent_for_level(chunk_stack, level, level_priority)
|
| 319 |
-
logger.debug(f"[CHUNKER] Found parent for {level}: {current_parent}")
|
| 320 |
-
|
| 321 |
-
# Bắt đầu chunk mới
|
| 322 |
-
current_chunk_content = line + "\n"
|
| 323 |
current_level = level
|
| 324 |
current_level_value = level_value
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 325 |
else:
|
| 326 |
-
#
|
| 327 |
current_chunk_content += line + "\n"
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
if len(current_chunk_content) > self.CHUNK_SIZE:
|
| 331 |
-
# Chia chunk hiện tại
|
| 332 |
sub_chunks = self._split_into_chunks(current_chunk_content, self.CHUNK_SIZE, self.CHUNK_OVERLAP)
|
| 333 |
-
|
| 334 |
-
for i, sub_chunk in enumerate(sub_chunks):
|
| 335 |
metadata = self._create_chunk_metadata(
|
| 336 |
sub_chunk.strip(),
|
| 337 |
-
current_level,
|
| 338 |
current_level_value,
|
| 339 |
current_parent,
|
| 340 |
vanbanid,
|
|
@@ -342,18 +374,13 @@ class LawDocumentChunker:
|
|
| 342 |
chunk_stack
|
| 343 |
)
|
| 344 |
chunks.append(metadata)
|
| 345 |
-
|
| 346 |
-
# Thêm vào stack NGAY LẬP TỨC
|
| 347 |
-
chunk_stack.append((metadata.id, current_level, current_level_value, sub_chunk.strip()))
|
| 348 |
-
logger.debug(f"[CHUNKER] Created sub-chunk: {metadata.id[:8]}... Level: {current_level}, Parent: {current_parent}")
|
| 349 |
-
|
| 350 |
current_chunk_content = ""
|
| 351 |
-
|
| 352 |
# Lưu chunk cuối cùng
|
| 353 |
-
if current_chunk_content.strip():
|
| 354 |
metadata = self._create_chunk_metadata(
|
| 355 |
current_chunk_content.strip(),
|
| 356 |
-
current_level,
|
| 357 |
current_level_value,
|
| 358 |
current_parent,
|
| 359 |
vanbanid,
|
|
@@ -361,22 +388,13 @@ class LawDocumentChunker:
|
|
| 361 |
chunk_stack
|
| 362 |
)
|
| 363 |
chunks.append(metadata)
|
| 364 |
-
|
| 365 |
-
# Thêm vào stack NGAY LẬP TỨC
|
| 366 |
-
chunk_stack.append((metadata.id, current_level, current_level_value, current_chunk_content.strip()))
|
| 367 |
-
logger.debug(f"[CHUNKER] Created final chunk: {metadata.id[:8]}... Level: {current_level}, Parent: {current_parent}")
|
| 368 |
-
|
| 369 |
-
# Debug: Kiểm tra kết quả
|
| 370 |
root_count = sum(1 for chunk in chunks if chunk.cha is None)
|
| 371 |
logger.info(f"[CHUNKER] Created {len(chunks)} chunks, {root_count} root chunks")
|
| 372 |
-
|
| 373 |
-
# Debug: Log chi tiết từng chunk
|
| 374 |
-
for i, chunk in enumerate(chunks[:10]): # Log 10 chunks đầu tiên
|
| 375 |
logger.debug(f"[CHUNKER] Chunk {i+1}: {chunk.content[:100]}... -> Parent: {chunk.cha}")
|
| 376 |
-
|
| 377 |
if len(chunks) > 10:
|
| 378 |
logger.debug(f"[CHUNKER] ... and {len(chunks) - 10} more chunks")
|
| 379 |
-
|
| 380 |
return chunks
|
| 381 |
|
| 382 |
def _find_parent_for_level(self, chunk_stack: List[Tuple[str, str, Optional[str], str]],
|
|
@@ -385,20 +403,9 @@ class LawDocumentChunker:
|
|
| 385 |
Tìm parent gần nhất có level cao hơn (priority thấp hơn) cho level hiện tại.
|
| 386 |
"""
|
| 387 |
current_priority = level_priority.get(current_level, 999)
|
| 388 |
-
|
| 389 |
-
# Tìm từ cuối stack (gần nhất) đến đầu stack
|
| 390 |
for chunk_id, level, level_value, content in reversed(chunk_stack):
|
| 391 |
-
# Kiểm tra cả priority và quan hệ hợp lệ
|
| 392 |
if level_priority.get(level, 999) < current_priority:
|
| 393 |
-
|
| 394 |
-
if current_level == "KHOAN" and level == "DIEU":
|
| 395 |
-
return chunk_id
|
| 396 |
-
elif current_level == "DIEM" and level == "KHOAN":
|
| 397 |
-
return chunk_id
|
| 398 |
-
elif current_level == "CONTENT":
|
| 399 |
-
# Content có thể có parent là bất kỳ level nào
|
| 400 |
-
return chunk_id
|
| 401 |
-
|
| 402 |
return None
|
| 403 |
|
| 404 |
async def _create_embeddings_for_chunks(self, chunks: List[ChunkMetadata]) -> int:
|
|
|
|
| 263 |
|
| 264 |
def _process_document_recursive(self, content: str, vanbanid: int,
|
| 265 |
document_title: str) -> List[ChunkMetadata]:
|
| 266 |
+
"""Xử lý văn bản theo cấu trúc phân cấp."""
|
| 267 |
lines = content.split('\n')
|
| 268 |
chunks = []
|
| 269 |
+
chunk_stack = [] # (chunk_id, level, level_value, content)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
current_chunk_content = ""
|
| 271 |
+
current_level = None
|
| 272 |
current_level_value = None
|
| 273 |
current_parent = None
|
| 274 |
+
current_level_priority = None
|
|
|
|
| 275 |
level_priority = {
|
| 276 |
"PHAN": 1,
|
| 277 |
"PHU_LUC": 1,
|
|
|
|
| 282 |
"DIEM": 6,
|
| 283 |
"CONTENT": 7
|
| 284 |
}
|
| 285 |
+
preamble_done = False
|
| 286 |
for line in lines:
|
| 287 |
+
level, level_value, _ = self._detect_structure_level(line)
|
| 288 |
+
line_priority = level_priority.get(level, 7)
|
| 289 |
+
# Nếu là dòng đầu tiên hoặc preamble
|
| 290 |
+
if not preamble_done and (level == "CONTENT" or not level_value):
|
| 291 |
+
current_chunk_content += line + "\n"
|
| 292 |
+
current_level = "CONTENT"
|
| 293 |
+
current_level_value = None
|
| 294 |
+
current_parent = None
|
| 295 |
+
current_level_priority = 7
|
| 296 |
+
continue
|
| 297 |
+
if not preamble_done and (level != "CONTENT" and level_value):
|
| 298 |
+
# Kết thúc preamble
|
| 299 |
if current_chunk_content.strip():
|
| 300 |
metadata = self._create_chunk_metadata(
|
| 301 |
current_chunk_content.strip(),
|
| 302 |
+
"CONTENT",
|
| 303 |
+
None,
|
| 304 |
+
None,
|
| 305 |
vanbanid,
|
| 306 |
document_title,
|
| 307 |
chunk_stack
|
| 308 |
)
|
| 309 |
chunks.append(metadata)
|
| 310 |
+
chunk_stack.append((metadata.id, "CONTENT", None, current_chunk_content.strip()))
|
| 311 |
+
preamble_done = True
|
| 312 |
+
current_chunk_content = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
current_level = level
|
| 314 |
current_level_value = level_value
|
| 315 |
+
current_level_priority = line_priority
|
| 316 |
+
current_parent = self._find_parent_for_level(chunk_stack, level, level_priority)
|
| 317 |
+
current_chunk_content += line + "\n"
|
| 318 |
+
continue
|
| 319 |
+
# Nếu gặp level mới
|
| 320 |
+
if level != "CONTENT" and level_value:
|
| 321 |
+
if current_level is not None and current_level_priority is not None and line_priority <= current_level_priority:
|
| 322 |
+
# Kết thúc chunk hiện tại
|
| 323 |
+
if current_chunk_content.strip():
|
| 324 |
+
metadata = self._create_chunk_metadata(
|
| 325 |
+
current_chunk_content.strip(),
|
| 326 |
+
str(current_level),
|
| 327 |
+
current_level_value,
|
| 328 |
+
current_parent,
|
| 329 |
+
vanbanid,
|
| 330 |
+
document_title,
|
| 331 |
+
chunk_stack
|
| 332 |
+
)
|
| 333 |
+
chunks.append(metadata)
|
| 334 |
+
chunk_stack.append((metadata.id, str(current_level), current_level_value, current_chunk_content.strip()))
|
| 335 |
+
# Bắt đầu chunk mới
|
| 336 |
+
current_parent = self._find_parent_for_level(chunk_stack, level, level_priority)
|
| 337 |
+
current_chunk_content = line + "\n"
|
| 338 |
+
current_level = level
|
| 339 |
+
current_level_value = level_value
|
| 340 |
+
current_level_priority = line_priority
|
| 341 |
+
else:
|
| 342 |
+
# Level mới nhưng priority cao hơn (ví dụ: Mục trong Chương)
|
| 343 |
+
if current_chunk_content.strip() and current_level is not None:
|
| 344 |
+
metadata = self._create_chunk_metadata(
|
| 345 |
+
current_chunk_content.strip(),
|
| 346 |
+
str(current_level),
|
| 347 |
+
current_level_value,
|
| 348 |
+
current_parent,
|
| 349 |
+
vanbanid,
|
| 350 |
+
document_title,
|
| 351 |
+
chunk_stack
|
| 352 |
+
)
|
| 353 |
+
chunks.append(metadata)
|
| 354 |
+
chunk_stack.append((metadata.id, str(current_level), current_level_value, current_chunk_content.strip()))
|
| 355 |
+
current_parent = self._find_parent_for_level(chunk_stack, level, level_priority)
|
| 356 |
+
current_chunk_content = line + "\n"
|
| 357 |
+
current_level = level
|
| 358 |
+
current_level_value = level_value
|
| 359 |
+
current_level_priority = line_priority
|
| 360 |
else:
|
| 361 |
+
# CONTENT nối vào chunk hiện tại
|
| 362 |
current_chunk_content += line + "\n"
|
| 363 |
+
# Nếu chunk quá lớn thì chia nhỏ
|
| 364 |
+
if len(current_chunk_content) > self.CHUNK_SIZE and current_level is not None:
|
|
|
|
|
|
|
| 365 |
sub_chunks = self._split_into_chunks(current_chunk_content, self.CHUNK_SIZE, self.CHUNK_OVERLAP)
|
| 366 |
+
for sub_chunk in sub_chunks:
|
|
|
|
| 367 |
metadata = self._create_chunk_metadata(
|
| 368 |
sub_chunk.strip(),
|
| 369 |
+
str(current_level),
|
| 370 |
current_level_value,
|
| 371 |
current_parent,
|
| 372 |
vanbanid,
|
|
|
|
| 374 |
chunk_stack
|
| 375 |
)
|
| 376 |
chunks.append(metadata)
|
| 377 |
+
chunk_stack.append((metadata.id, str(current_level), current_level_value, sub_chunk.strip()))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
current_chunk_content = ""
|
|
|
|
| 379 |
# Lưu chunk cuối cùng
|
| 380 |
+
if current_chunk_content.strip() and current_level is not None:
|
| 381 |
metadata = self._create_chunk_metadata(
|
| 382 |
current_chunk_content.strip(),
|
| 383 |
+
str(current_level),
|
| 384 |
current_level_value,
|
| 385 |
current_parent,
|
| 386 |
vanbanid,
|
|
|
|
| 388 |
chunk_stack
|
| 389 |
)
|
| 390 |
chunks.append(metadata)
|
| 391 |
+
chunk_stack.append((metadata.id, str(current_level), current_level_value, current_chunk_content.strip()))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 392 |
root_count = sum(1 for chunk in chunks if chunk.cha is None)
|
| 393 |
logger.info(f"[CHUNKER] Created {len(chunks)} chunks, {root_count} root chunks")
|
| 394 |
+
for i, chunk in enumerate(chunks[:10]):
|
|
|
|
|
|
|
| 395 |
logger.debug(f"[CHUNKER] Chunk {i+1}: {chunk.content[:100]}... -> Parent: {chunk.cha}")
|
|
|
|
| 396 |
if len(chunks) > 10:
|
| 397 |
logger.debug(f"[CHUNKER] ... and {len(chunks) - 10} more chunks")
|
|
|
|
| 398 |
return chunks
|
| 399 |
|
| 400 |
def _find_parent_for_level(self, chunk_stack: List[Tuple[str, str, Optional[str], str]],
|
|
|
|
| 403 |
Tìm parent gần nhất có level cao hơn (priority thấp hơn) cho level hiện tại.
|
| 404 |
"""
|
| 405 |
current_priority = level_priority.get(current_level, 999)
|
|
|
|
|
|
|
| 406 |
for chunk_id, level, level_value, content in reversed(chunk_stack):
|
|
|
|
| 407 |
if level_priority.get(level, 999) < current_priority:
|
| 408 |
+
return chunk_id
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 409 |
return None
|
| 410 |
|
| 411 |
async def _create_embeddings_for_chunks(self, chunks: List[ChunkMetadata]) -> int:
|