VietCat committed on
Commit
811b7b0
·
1 Parent(s): f5552f4

fix metadata

Browse files
Files changed (1) hide show
  1. app/law_document_chunker.py +53 -80
app/law_document_chunker.py CHANGED
@@ -133,10 +133,9 @@ class LawDocumentChunker:
133
 
134
  def _create_chunk_metadata(self, content: str, level: str, level_value: Optional[str],
135
  parent_id: Optional[str], vanbanid: int,
136
- document_title: str, chunk_stack: List[Tuple[str, str, Optional[str], str]] = []) -> ChunkMetadata:
137
  """Tạo metadata cho chunk."""
138
  chunk_id = str(uuid.uuid4())
139
-
140
  metadata = ChunkMetadata(
141
  id=chunk_id,
142
  content=content,
@@ -144,7 +143,6 @@ class LawDocumentChunker:
144
  cha=parent_id,
145
  document_title=document_title
146
  )
147
-
148
  # Điền metadata từ chunk hiện tại
149
  if level == "DIEU" and level_value:
150
  metadata.article_number = int(level_value) if level_value.isdigit() else None
@@ -153,85 +151,39 @@ class LawDocumentChunker:
153
  metadata.clause_number = level_value
154
  elif level == "DIEM" and level_value:
155
  metadata.sub_clause_letter = level_value
156
-
157
  # Điền metadata từ parent chunks nếu có
158
  logger.debug(f"[CHUNKER] Creating chunk with level: {level}, parent_id: {parent_id}, stack_size: {len(chunk_stack)}")
159
- if chunk_stack and parent_id:
160
- self._fill_metadata_from_parents(metadata, chunk_stack, parent_id)
161
  else:
162
- logger.debug(f"[CHUNKER] Skipping metadata fill - no parent_id or empty stack")
163
-
164
- # Debug final metadata
165
  logger.debug(f"[CHUNKER] Final metadata for chunk {chunk_id[:8]}... - Level: {level}, Article: {metadata.article_number}, Clause: {metadata.clause_number}, Point: {metadata.sub_clause_letter}")
166
-
167
  return metadata
168
 
169
- def _fill_metadata_from_parents(self, metadata: ChunkMetadata, chunk_stack: List[Tuple[str, str, Optional[str], str]], parent_id: str):
170
  """
171
- Điền metadata từ parent chunks (Điều, Khoản) nếu chunk hiện tại có cha hoặc ông là Điều/Khoản.
172
  """
173
- if not parent_id:
174
- return
175
-
176
- # Debug logging
177
- logger.debug(f"[CHUNKER] Filling metadata for chunk with parent_id: {parent_id}")
178
- logger.debug(f"[CHUNKER] Chunk stack has {len(chunk_stack)} items")
179
-
180
- # Tìm parent chunk trong stack
181
- parent_chunk = None
182
- for chunk_id, level, level_value, content in chunk_stack:
183
- if chunk_id == parent_id:
184
- parent_chunk = (level, level_value, content)
185
- break
186
-
187
- if not parent_chunk:
188
- logger.warning(f"[CHUNKER] Parent chunk {parent_id} not found in stack")
189
  return
190
-
191
- parent_level, parent_value, parent_content = parent_chunk
192
-
193
- # Điền metadata từ parent trực tiếp
194
- if parent_level == "DIEU" and parent_value:
195
- if not metadata.article_number:
196
- metadata.article_number = int(parent_value) if parent_value.isdigit() else None
197
- logger.debug(f"[CHUNKER] Set article_number from parent: {metadata.article_number}")
198
- if not metadata.article_title:
199
- first_line = parent_content.split('\n')[0].strip() if parent_content else ""
200
- metadata.article_title = first_line
201
- logger.debug(f"[CHUNKER] Set article_title from parent: {metadata.article_title}")
202
-
203
- elif parent_level == "KHOAN" and parent_value:
204
- if not metadata.clause_number:
205
- metadata.clause_number = parent_value
206
- logger.debug(f"[CHUNKER] Set clause_number from parent: {metadata.clause_number}")
207
-
208
- # Tìm grandparent (ông) nếu cần
209
- # Tìm parent của parent trong stack
210
- grandparent_id = None
211
- for chunk_id, level, level_value, content in chunk_stack:
212
- if chunk_id == parent_id:
213
- # Tìm parent của chunk này
214
- for cid, lvl, lv, cont in reversed(chunk_stack):
215
- if cid == chunk_id:
216
- break
217
- if lvl == "DIEU" and parent_level == "KHOAN":
218
- grandparent_id = cid
219
- break
220
- break
221
-
222
- if grandparent_id:
223
- # Điền metadata từ grandparent
224
- for chunk_id, level, level_value, content in chunk_stack:
225
- if chunk_id == grandparent_id:
226
- if level == "DIEU" and level_value:
227
- if not metadata.article_number:
228
- metadata.article_number = int(level_value) if level_value.isdigit() else None
229
- logger.debug(f"[CHUNKER] Set article_number from grandparent: {metadata.article_number}")
230
- if not metadata.article_title:
231
- first_line = content.split('\n')[0].strip() if content else ""
232
- metadata.article_title = first_line
233
- logger.debug(f"[CHUNKER] Set article_title from grandparent: {metadata.article_title}")
234
- break
235
 
236
  def _split_into_chunks(self, text: str, chunk_size: int, overlap: int) -> List[str]:
237
  """Chia text thành các chunk với overlap."""
@@ -267,6 +219,7 @@ class LawDocumentChunker:
267
  lines = content.split('\n')
268
  chunks = []
269
  chunk_stack = [] # (chunk_id, level, level_value, content)
 
270
  current_chunk_content = ""
271
  current_level = None
272
  current_level_value = None
@@ -304,10 +257,12 @@ class LawDocumentChunker:
304
  None,
305
  vanbanid,
306
  document_title,
307
- chunk_stack
 
308
  )
309
  chunks.append(metadata)
310
  chunk_stack.append((metadata.id, "CONTENT", None, current_chunk_content.strip()))
 
311
  preamble_done = True
312
  current_chunk_content = ""
313
  current_level = level
@@ -328,10 +283,12 @@ class LawDocumentChunker:
328
  current_parent,
329
  vanbanid,
330
  document_title,
331
- chunk_stack
 
332
  )
333
  chunks.append(metadata)
334
  chunk_stack.append((metadata.id, str(current_level), current_level_value, current_chunk_content.strip()))
 
335
  # Bắt đầu chunk mới
336
  current_parent = self._find_parent_for_level(chunk_stack, level, level_priority)
337
  current_chunk_content = line + "\n"
@@ -348,10 +305,12 @@ class LawDocumentChunker:
348
  current_parent,
349
  vanbanid,
350
  document_title,
351
- chunk_stack
 
352
  )
353
  chunks.append(metadata)
354
  chunk_stack.append((metadata.id, str(current_level), current_level_value, current_chunk_content.strip()))
 
355
  current_parent = self._find_parent_for_level(chunk_stack, level, level_priority)
356
  current_chunk_content = line + "\n"
357
  current_level = level
@@ -371,10 +330,12 @@ class LawDocumentChunker:
371
  current_parent,
372
  vanbanid,
373
  document_title,
374
- chunk_stack
 
375
  )
376
  chunks.append(metadata)
377
  chunk_stack.append((metadata.id, str(current_level), current_level_value, sub_chunk.strip()))
 
378
  current_chunk_content = ""
379
  # Lưu chunk cuối cùng
380
  if current_chunk_content.strip() and current_level is not None:
@@ -385,10 +346,12 @@ class LawDocumentChunker:
385
  current_parent,
386
  vanbanid,
387
  document_title,
388
- chunk_stack
 
389
  )
390
  chunks.append(metadata)
391
  chunk_stack.append((metadata.id, str(current_level), current_level_value, current_chunk_content.strip()))
 
392
  root_count = sum(1 for chunk in chunks if chunk.cha is None)
393
  logger.info(f"[CHUNKER] Created {len(chunks)} chunks, {root_count} root chunks")
394
  for i, chunk in enumerate(chunks[:10]):
@@ -400,12 +363,22 @@ class LawDocumentChunker:
400
  def _find_parent_for_level(self, chunk_stack: List[Tuple[str, str, Optional[str], str]],
401
  current_level: str, level_priority: Dict[str, int]) -> Optional[str]:
402
  """
403
- Tìm parent gần nhất có level cao hơn (priority thấp hơn) cho level hiện tại.
404
  """
405
  current_priority = level_priority.get(current_level, 999)
 
 
 
 
 
 
406
  for chunk_id, level, level_value, content in reversed(chunk_stack):
407
  if level_priority.get(level, 999) < current_priority:
408
- return chunk_id
 
 
 
 
409
  return None
410
 
411
  async def _create_embeddings_for_chunks(self, chunks: List[ChunkMetadata]) -> int:
 
133
 
134
  def _create_chunk_metadata(self, content: str, level: str, level_value: Optional[str],
135
  parent_id: Optional[str], vanbanid: int,
136
+ document_title: str, chunk_stack: List[Tuple[str, str, Optional[str], str]], chunk_dict: dict) -> 'ChunkMetadata':
137
  """Tạo metadata cho chunk."""
138
  chunk_id = str(uuid.uuid4())
 
139
  metadata = ChunkMetadata(
140
  id=chunk_id,
141
  content=content,
 
143
  cha=parent_id,
144
  document_title=document_title
145
  )
 
146
  # Điền metadata từ chunk hiện tại
147
  if level == "DIEU" and level_value:
148
  metadata.article_number = int(level_value) if level_value.isdigit() else None
 
151
  metadata.clause_number = level_value
152
  elif level == "DIEM" and level_value:
153
  metadata.sub_clause_letter = level_value
 
154
  # Điền metadata từ parent chunks nếu có
155
  logger.debug(f"[CHUNKER] Creating chunk with level: {level}, parent_id: {parent_id}, stack_size: {len(chunk_stack)}")
156
+ if chunk_dict is not None and parent_id:
157
+ self._fill_metadata_from_parents(metadata, parent_id, chunk_dict)
158
  else:
159
+ logger.debug(f"[CHUNKER] Skipping metadata fill - no parent_id or chunk_dict")
 
 
160
  logger.debug(f"[CHUNKER] Final metadata for chunk {chunk_id[:8]}... - Level: {level}, Article: {metadata.article_number}, Clause: {metadata.clause_number}, Point: {metadata.sub_clause_letter}")
 
161
  return metadata
162
 
163
+ def _fill_metadata_from_parents(self, metadata: ChunkMetadata, parent_id: str, chunk_dict: Dict[str, ChunkMetadata]):
164
  """
165
+ Điền metadata từ parent ancestor (cha, ông, ...), sử dụng dict id->chunk.
166
  """
167
+ parent = chunk_dict.get(parent_id)
168
+ if not parent:
169
+ logger.warning(f"[CHUNKER] Parent chunk {parent_id} not found in chunk_dict")
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  return
171
+ # Điền từ cha
172
+ if parent.article_number and not metadata.article_number:
173
+ metadata.article_number = parent.article_number
174
+ if parent.article_title and not metadata.article_title:
175
+ metadata.article_title = parent.article_title
176
+ if parent.clause_number and not metadata.clause_number:
177
+ metadata.clause_number = parent.clause_number
178
+ if parent.sub_clause_letter and not metadata.sub_clause_letter:
179
+ metadata.sub_clause_letter = parent.sub_clause_letter
180
+ # Nếu cha Khoản, tìm ông là Điều
181
+ if parent.clause_number and not metadata.article_number:
182
+ grandparent = chunk_dict.get(parent.cha) if parent.cha else None
183
+ if grandparent and grandparent.article_number:
184
+ metadata.article_number = grandparent.article_number
185
+ if grandparent and grandparent.article_title:
186
+ metadata.article_title = grandparent.article_title
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
 
188
  def _split_into_chunks(self, text: str, chunk_size: int, overlap: int) -> List[str]:
189
  """Chia text thành các chunk với overlap."""
 
219
  lines = content.split('\n')
220
  chunks = []
221
  chunk_stack = [] # (chunk_id, level, level_value, content)
222
+ chunk_dict = {} # id -> ChunkMetadata
223
  current_chunk_content = ""
224
  current_level = None
225
  current_level_value = None
 
257
  None,
258
  vanbanid,
259
  document_title,
260
+ chunk_stack,
261
+ chunk_dict
262
  )
263
  chunks.append(metadata)
264
  chunk_stack.append((metadata.id, "CONTENT", None, current_chunk_content.strip()))
265
+ chunk_dict[metadata.id] = metadata
266
  preamble_done = True
267
  current_chunk_content = ""
268
  current_level = level
 
283
  current_parent,
284
  vanbanid,
285
  document_title,
286
+ chunk_stack,
287
+ chunk_dict
288
  )
289
  chunks.append(metadata)
290
  chunk_stack.append((metadata.id, str(current_level), current_level_value, current_chunk_content.strip()))
291
+ chunk_dict[metadata.id] = metadata
292
  # Bắt đầu chunk mới
293
  current_parent = self._find_parent_for_level(chunk_stack, level, level_priority)
294
  current_chunk_content = line + "\n"
 
305
  current_parent,
306
  vanbanid,
307
  document_title,
308
+ chunk_stack,
309
+ chunk_dict
310
  )
311
  chunks.append(metadata)
312
  chunk_stack.append((metadata.id, str(current_level), current_level_value, current_chunk_content.strip()))
313
+ chunk_dict[metadata.id] = metadata
314
  current_parent = self._find_parent_for_level(chunk_stack, level, level_priority)
315
  current_chunk_content = line + "\n"
316
  current_level = level
 
330
  current_parent,
331
  vanbanid,
332
  document_title,
333
+ chunk_stack,
334
+ chunk_dict
335
  )
336
  chunks.append(metadata)
337
  chunk_stack.append((metadata.id, str(current_level), current_level_value, sub_chunk.strip()))
338
+ chunk_dict[metadata.id] = metadata
339
  current_chunk_content = ""
340
  # Lưu chunk cuối cùng
341
  if current_chunk_content.strip() and current_level is not None:
 
346
  current_parent,
347
  vanbanid,
348
  document_title,
349
+ chunk_stack,
350
+ chunk_dict
351
  )
352
  chunks.append(metadata)
353
  chunk_stack.append((metadata.id, str(current_level), current_level_value, current_chunk_content.strip()))
354
+ chunk_dict[metadata.id] = metadata
355
  root_count = sum(1 for chunk in chunks if chunk.cha is None)
356
  logger.info(f"[CHUNKER] Created {len(chunks)} chunks, {root_count} root chunks")
357
  for i, chunk in enumerate(chunks[:10]):
 
363
  def _find_parent_for_level(self, chunk_stack: List[Tuple[str, str, Optional[str], str]],
364
  current_level: str, level_priority: Dict[str, int]) -> Optional[str]:
365
  """
366
+ Tìm parent gần nhất có level cao hơn (priority thấp hơn) cho level hiện tại, kiểm tra hợp lệ cha-con.
367
  """
368
  current_priority = level_priority.get(current_level, 999)
369
+ valid_parents = {
370
+ "MUC": ["CHUONG", "PHAN"],
371
+ "DIEU": ["MUC", "CHUONG", "PHAN"],
372
+ "CHUONG": ["PHAN"],
373
+ # Các level khác giữ nguyên logic cũ
374
+ }
375
  for chunk_id, level, level_value, content in reversed(chunk_stack):
376
  if level_priority.get(level, 999) < current_priority:
377
+ if current_level in valid_parents:
378
+ if level in valid_parents[current_level]:
379
+ return chunk_id
380
+ else:
381
+ return chunk_id
382
  return None
383
 
384
  async def _create_embeddings_for_chunks(self, chunks: List[ChunkMetadata]) -> int: