VietCat committed on
Commit
8efb617
·
1 Parent(s): 9219e02

fix metadata

Browse files
Files changed (1) hide show
  1. app/law_document_chunker.py +79 -72
app/law_document_chunker.py CHANGED
@@ -263,20 +263,15 @@ class LawDocumentChunker:
263
 
264
  def _process_document_recursive(self, content: str, vanbanid: int,
265
  document_title: str) -> List[ChunkMetadata]:
266
- """Xử lý văn bản theo cấu trúc phân cấp."""##
267
  lines = content.split('\n')
268
  chunks = []
269
-
270
- # Stack để theo dõi các chunks theo thứ tự xuất hiện
271
- # Mỗi item là (chunk_id, level, level_value, content)
272
- chunk_stack = []
273
-
274
  current_chunk_content = ""
275
- current_level = "CONTENT"
276
  current_level_value = None
277
  current_parent = None
278
-
279
- # Định nghĩa thứ tự ưu tiên của các level (số càng nhỏ càng cao)
280
  level_priority = {
281
  "PHAN": 1,
282
  "PHU_LUC": 1,
@@ -287,54 +282,91 @@ class LawDocumentChunker:
287
  "DIEM": 6,
288
  "CONTENT": 7
289
  }
290
-
291
  for line in lines:
292
- level, level_value, level_content = self._detect_structure_level(line)
293
-
294
- # Debug logging
295
- if level != "CONTENT" and level_value:
296
- logger.debug(f"[CHUNKER] Line: '{line.strip()}' -> Level: {level}, Value: {level_value}")
297
-
298
- # Nếu phát hiện cấp độ mới
299
- if level != "CONTENT" and level_value:
300
- # Lưu chunk hiện tại nếu có
 
 
 
301
  if current_chunk_content.strip():
302
  metadata = self._create_chunk_metadata(
303
  current_chunk_content.strip(),
304
- current_level,
305
- current_level_value,
306
- current_parent,
307
  vanbanid,
308
  document_title,
309
  chunk_stack
310
  )
311
  chunks.append(metadata)
312
-
313
- # Thêm vào stack NGAY LẬP TỨC
314
- chunk_stack.append((metadata.id, current_level, current_level_value, current_chunk_content.strip()))
315
- logger.debug(f"[CHUNKER] Created chunk: {metadata.id[:8]}... Level: {current_level}, Parent: {current_parent}")
316
-
317
- # Tìm parent cho level mới TRƯỚC KHI tạo chunk mới
318
- current_parent = self._find_parent_for_level(chunk_stack, level, level_priority)
319
- logger.debug(f"[CHUNKER] Found parent for {level}: {current_parent}")
320
-
321
- # Bắt đầu chunk mới
322
- current_chunk_content = line + "\n"
323
  current_level = level
324
  current_level_value = level_value
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
  else:
326
- # Thêm vào chunk hiện tại
327
  current_chunk_content += line + "\n"
328
-
329
- # Kiểm tra nếu chunk quá lớn
330
- if len(current_chunk_content) > self.CHUNK_SIZE:
331
- # Chia chunk hiện tại
332
  sub_chunks = self._split_into_chunks(current_chunk_content, self.CHUNK_SIZE, self.CHUNK_OVERLAP)
333
-
334
- for i, sub_chunk in enumerate(sub_chunks):
335
  metadata = self._create_chunk_metadata(
336
  sub_chunk.strip(),
337
- current_level,
338
  current_level_value,
339
  current_parent,
340
  vanbanid,
@@ -342,18 +374,13 @@ class LawDocumentChunker:
342
  chunk_stack
343
  )
344
  chunks.append(metadata)
345
-
346
- # Thêm vào stack NGAY LẬP TỨC
347
- chunk_stack.append((metadata.id, current_level, current_level_value, sub_chunk.strip()))
348
- logger.debug(f"[CHUNKER] Created sub-chunk: {metadata.id[:8]}... Level: {current_level}, Parent: {current_parent}")
349
-
350
  current_chunk_content = ""
351
-
352
  # Lưu chunk cuối cùng
353
- if current_chunk_content.strip():
354
  metadata = self._create_chunk_metadata(
355
  current_chunk_content.strip(),
356
- current_level,
357
  current_level_value,
358
  current_parent,
359
  vanbanid,
@@ -361,22 +388,13 @@ class LawDocumentChunker:
361
  chunk_stack
362
  )
363
  chunks.append(metadata)
364
-
365
- # Thêm vào stack NGAY LẬP TỨC
366
- chunk_stack.append((metadata.id, current_level, current_level_value, current_chunk_content.strip()))
367
- logger.debug(f"[CHUNKER] Created final chunk: {metadata.id[:8]}... Level: {current_level}, Parent: {current_parent}")
368
-
369
- # Debug: Kiểm tra kết quả
370
  root_count = sum(1 for chunk in chunks if chunk.cha is None)
371
  logger.info(f"[CHUNKER] Created {len(chunks)} chunks, {root_count} root chunks")
372
-
373
- # Debug: Log chi tiết từng chunk
374
- for i, chunk in enumerate(chunks[:10]): # Log 10 chunks đầu tiên
375
  logger.debug(f"[CHUNKER] Chunk {i+1}: {chunk.content[:100]}... -> Parent: {chunk.cha}")
376
-
377
  if len(chunks) > 10:
378
  logger.debug(f"[CHUNKER] ... and {len(chunks) - 10} more chunks")
379
-
380
  return chunks
381
 
382
  def _find_parent_for_level(self, chunk_stack: List[Tuple[str, str, Optional[str], str]],
@@ -385,20 +403,9 @@ class LawDocumentChunker:
385
  Tìm parent gần nhất có level cao hơn (priority thấp hơn) cho level hiện tại.
386
  """
387
  current_priority = level_priority.get(current_level, 999)
388
-
389
- # Tìm từ cuối stack (gần nhất) đến đầu stack
390
  for chunk_id, level, level_value, content in reversed(chunk_stack):
391
- # Kiểm tra cả priority và quan hệ hợp lệ
392
  if level_priority.get(level, 999) < current_priority:
393
- # Kiểm tra quan hệ hợp lệ: Điều -> Khoản -> Điểm
394
- if current_level == "KHOAN" and level == "DIEU":
395
- return chunk_id
396
- elif current_level == "DIEM" and level == "KHOAN":
397
- return chunk_id
398
- elif current_level == "CONTENT":
399
- # Content có thể có parent là bất kỳ level nào
400
- return chunk_id
401
-
402
  return None
403
 
404
  async def _create_embeddings_for_chunks(self, chunks: List[ChunkMetadata]) -> int:
 
263
 
264
  def _process_document_recursive(self, content: str, vanbanid: int,
265
  document_title: str) -> List[ChunkMetadata]:
266
+ """Xử lý văn bản theo cấu trúc phân cấp."""
267
  lines = content.split('\n')
268
  chunks = []
269
+ chunk_stack = [] # (chunk_id, level, level_value, content)
 
 
 
 
270
  current_chunk_content = ""
271
+ current_level = None
272
  current_level_value = None
273
  current_parent = None
274
+ current_level_priority = None
 
275
  level_priority = {
276
  "PHAN": 1,
277
  "PHU_LUC": 1,
 
282
  "DIEM": 6,
283
  "CONTENT": 7
284
  }
285
+ preamble_done = False
286
  for line in lines:
287
+ level, level_value, _ = self._detect_structure_level(line)
288
+ line_priority = level_priority.get(level, 7)
289
+ # Nếu là dòng đầu tiên hoặc preamble
290
+ if not preamble_done and (level == "CONTENT" or not level_value):
291
+ current_chunk_content += line + "\n"
292
+ current_level = "CONTENT"
293
+ current_level_value = None
294
+ current_parent = None
295
+ current_level_priority = 7
296
+ continue
297
+ if not preamble_done and (level != "CONTENT" and level_value):
298
+ # Kết thúc preamble
299
  if current_chunk_content.strip():
300
  metadata = self._create_chunk_metadata(
301
  current_chunk_content.strip(),
302
+ "CONTENT",
303
+ None,
304
+ None,
305
  vanbanid,
306
  document_title,
307
  chunk_stack
308
  )
309
  chunks.append(metadata)
310
+ chunk_stack.append((metadata.id, "CONTENT", None, current_chunk_content.strip()))
311
+ preamble_done = True
312
+ current_chunk_content = ""
 
 
 
 
 
 
 
 
313
  current_level = level
314
  current_level_value = level_value
315
+ current_level_priority = line_priority
316
+ current_parent = self._find_parent_for_level(chunk_stack, level, level_priority)
317
+ current_chunk_content += line + "\n"
318
+ continue
319
+ # Nếu gặp level mới
320
+ if level != "CONTENT" and level_value:
321
+ if current_level is not None and current_level_priority is not None and line_priority <= current_level_priority:
322
+ # Kết thúc chunk hiện tại
323
+ if current_chunk_content.strip():
324
+ metadata = self._create_chunk_metadata(
325
+ current_chunk_content.strip(),
326
+ str(current_level),
327
+ current_level_value,
328
+ current_parent,
329
+ vanbanid,
330
+ document_title,
331
+ chunk_stack
332
+ )
333
+ chunks.append(metadata)
334
+ chunk_stack.append((metadata.id, str(current_level), current_level_value, current_chunk_content.strip()))
335
+ # Bắt đầu chunk mới
336
+ current_parent = self._find_parent_for_level(chunk_stack, level, level_priority)
337
+ current_chunk_content = line + "\n"
338
+ current_level = level
339
+ current_level_value = level_value
340
+ current_level_priority = line_priority
341
+ else:
342
+ # Level mới nhưng priority cao hơn (ví dụ: Mục trong Chương)
343
+ if current_chunk_content.strip() and current_level is not None:
344
+ metadata = self._create_chunk_metadata(
345
+ current_chunk_content.strip(),
346
+ str(current_level),
347
+ current_level_value,
348
+ current_parent,
349
+ vanbanid,
350
+ document_title,
351
+ chunk_stack
352
+ )
353
+ chunks.append(metadata)
354
+ chunk_stack.append((metadata.id, str(current_level), current_level_value, current_chunk_content.strip()))
355
+ current_parent = self._find_parent_for_level(chunk_stack, level, level_priority)
356
+ current_chunk_content = line + "\n"
357
+ current_level = level
358
+ current_level_value = level_value
359
+ current_level_priority = line_priority
360
  else:
361
+ # CONTENT nối vào chunk hiện tại
362
  current_chunk_content += line + "\n"
363
+ # Nếu chunk quá lớn thì chia nhỏ
364
+ if len(current_chunk_content) > self.CHUNK_SIZE and current_level is not None:
 
 
365
  sub_chunks = self._split_into_chunks(current_chunk_content, self.CHUNK_SIZE, self.CHUNK_OVERLAP)
366
+ for sub_chunk in sub_chunks:
 
367
  metadata = self._create_chunk_metadata(
368
  sub_chunk.strip(),
369
+ str(current_level),
370
  current_level_value,
371
  current_parent,
372
  vanbanid,
 
374
  chunk_stack
375
  )
376
  chunks.append(metadata)
377
+ chunk_stack.append((metadata.id, str(current_level), current_level_value, sub_chunk.strip()))
 
 
 
 
378
  current_chunk_content = ""
 
379
  # Lưu chunk cuối cùng
380
+ if current_chunk_content.strip() and current_level is not None:
381
  metadata = self._create_chunk_metadata(
382
  current_chunk_content.strip(),
383
+ str(current_level),
384
  current_level_value,
385
  current_parent,
386
  vanbanid,
 
388
  chunk_stack
389
  )
390
  chunks.append(metadata)
391
+ chunk_stack.append((metadata.id, str(current_level), current_level_value, current_chunk_content.strip()))
 
 
 
 
 
392
  root_count = sum(1 for chunk in chunks if chunk.cha is None)
393
  logger.info(f"[CHUNKER] Created {len(chunks)} chunks, {root_count} root chunks")
394
+ for i, chunk in enumerate(chunks[:10]):
 
 
395
  logger.debug(f"[CHUNKER] Chunk {i+1}: {chunk.content[:100]}... -> Parent: {chunk.cha}")
 
396
  if len(chunks) > 10:
397
  logger.debug(f"[CHUNKER] ... and {len(chunks) - 10} more chunks")
 
398
  return chunks
399
 
400
  def _find_parent_for_level(self, chunk_stack: List[Tuple[str, str, Optional[str], str]],
 
403
  Tìm parent gần nhất có level cao hơn (priority thấp hơn) cho level hiện tại.
404
  """
405
  current_priority = level_priority.get(current_level, 999)
 
 
406
  for chunk_id, level, level_value, content in reversed(chunk_stack):
 
407
  if level_priority.get(level, 999) < current_priority:
408
+ return chunk_id
 
 
 
 
 
 
 
 
409
  return None
410
 
411
  async def _create_embeddings_for_chunks(self, chunks: List[ChunkMetadata]) -> int: