fix metadata
Browse files - app/law_document_chunker.py +53 -80
app/law_document_chunker.py
CHANGED
|
@@ -133,10 +133,9 @@ class LawDocumentChunker:
|
|
| 133 |
|
| 134 |
def _create_chunk_metadata(self, content: str, level: str, level_value: Optional[str],
|
| 135 |
parent_id: Optional[str], vanbanid: int,
|
| 136 |
-
document_title: str, chunk_stack: List[Tuple[str, str, Optional[str], str]]
|
| 137 |
"""Tạo metadata cho chunk."""
|
| 138 |
chunk_id = str(uuid.uuid4())
|
| 139 |
-
|
| 140 |
metadata = ChunkMetadata(
|
| 141 |
id=chunk_id,
|
| 142 |
content=content,
|
|
@@ -144,7 +143,6 @@ class LawDocumentChunker:
|
|
| 144 |
cha=parent_id,
|
| 145 |
document_title=document_title
|
| 146 |
)
|
| 147 |
-
|
| 148 |
# Điền metadata từ chunk hiện tại
|
| 149 |
if level == "DIEU" and level_value:
|
| 150 |
metadata.article_number = int(level_value) if level_value.isdigit() else None
|
|
@@ -153,85 +151,39 @@ class LawDocumentChunker:
|
|
| 153 |
metadata.clause_number = level_value
|
| 154 |
elif level == "DIEM" and level_value:
|
| 155 |
metadata.sub_clause_letter = level_value
|
| 156 |
-
|
| 157 |
# Điền metadata từ parent chunks nếu có
|
| 158 |
logger.debug(f"[CHUNKER] Creating chunk with level: {level}, parent_id: {parent_id}, stack_size: {len(chunk_stack)}")
|
| 159 |
-
if
|
| 160 |
-
self._fill_metadata_from_parents(metadata,
|
| 161 |
else:
|
| 162 |
-
logger.debug(f"[CHUNKER] Skipping metadata fill - no parent_id or
|
| 163 |
-
|
| 164 |
-
# Debug final metadata
|
| 165 |
logger.debug(f"[CHUNKER] Final metadata for chunk {chunk_id[:8]}... - Level: {level}, Article: {metadata.article_number}, Clause: {metadata.clause_number}, Point: {metadata.sub_clause_letter}")
|
| 166 |
-
|
| 167 |
return metadata
|
| 168 |
|
| 169 |
-
def _fill_metadata_from_parents(self, metadata: ChunkMetadata,
|
| 170 |
"""
|
| 171 |
-
Điền metadata từ parent
|
| 172 |
"""
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
# Debug logging
|
| 177 |
-
logger.debug(f"[CHUNKER] Filling metadata for chunk with parent_id: {parent_id}")
|
| 178 |
-
logger.debug(f"[CHUNKER] Chunk stack has {len(chunk_stack)} items")
|
| 179 |
-
|
| 180 |
-
# Tìm parent chunk trong stack
|
| 181 |
-
parent_chunk = None
|
| 182 |
-
for chunk_id, level, level_value, content in chunk_stack:
|
| 183 |
-
if chunk_id == parent_id:
|
| 184 |
-
parent_chunk = (level, level_value, content)
|
| 185 |
-
break
|
| 186 |
-
|
| 187 |
-
if not parent_chunk:
|
| 188 |
-
logger.warning(f"[CHUNKER] Parent chunk {parent_id} not found in stack")
|
| 189 |
return
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
if
|
| 205 |
-
metadata.
|
| 206 |
-
logger.debug(f"[CHUNKER] Set clause_number from parent: {metadata.clause_number}")
|
| 207 |
-
|
| 208 |
-
# Tìm grandparent (ông) nếu cần
|
| 209 |
-
# Tìm parent của parent trong stack
|
| 210 |
-
grandparent_id = None
|
| 211 |
-
for chunk_id, level, level_value, content in chunk_stack:
|
| 212 |
-
if chunk_id == parent_id:
|
| 213 |
-
# Tìm parent của chunk này
|
| 214 |
-
for cid, lvl, lv, cont in reversed(chunk_stack):
|
| 215 |
-
if cid == chunk_id:
|
| 216 |
-
break
|
| 217 |
-
if lvl == "DIEU" and parent_level == "KHOAN":
|
| 218 |
-
grandparent_id = cid
|
| 219 |
-
break
|
| 220 |
-
break
|
| 221 |
-
|
| 222 |
-
if grandparent_id:
|
| 223 |
-
# Điền metadata từ grandparent
|
| 224 |
-
for chunk_id, level, level_value, content in chunk_stack:
|
| 225 |
-
if chunk_id == grandparent_id:
|
| 226 |
-
if level == "DIEU" and level_value:
|
| 227 |
-
if not metadata.article_number:
|
| 228 |
-
metadata.article_number = int(level_value) if level_value.isdigit() else None
|
| 229 |
-
logger.debug(f"[CHUNKER] Set article_number from grandparent: {metadata.article_number}")
|
| 230 |
-
if not metadata.article_title:
|
| 231 |
-
first_line = content.split('\n')[0].strip() if content else ""
|
| 232 |
-
metadata.article_title = first_line
|
| 233 |
-
logger.debug(f"[CHUNKER] Set article_title from grandparent: {metadata.article_title}")
|
| 234 |
-
break
|
| 235 |
|
| 236 |
def _split_into_chunks(self, text: str, chunk_size: int, overlap: int) -> List[str]:
|
| 237 |
"""Chia text thành các chunk với overlap."""
|
|
@@ -267,6 +219,7 @@ class LawDocumentChunker:
|
|
| 267 |
lines = content.split('\n')
|
| 268 |
chunks = []
|
| 269 |
chunk_stack = [] # (chunk_id, level, level_value, content)
|
|
|
|
| 270 |
current_chunk_content = ""
|
| 271 |
current_level = None
|
| 272 |
current_level_value = None
|
|
@@ -304,10 +257,12 @@ class LawDocumentChunker:
|
|
| 304 |
None,
|
| 305 |
vanbanid,
|
| 306 |
document_title,
|
| 307 |
-
chunk_stack
|
|
|
|
| 308 |
)
|
| 309 |
chunks.append(metadata)
|
| 310 |
chunk_stack.append((metadata.id, "CONTENT", None, current_chunk_content.strip()))
|
|
|
|
| 311 |
preamble_done = True
|
| 312 |
current_chunk_content = ""
|
| 313 |
current_level = level
|
|
@@ -328,10 +283,12 @@ class LawDocumentChunker:
|
|
| 328 |
current_parent,
|
| 329 |
vanbanid,
|
| 330 |
document_title,
|
| 331 |
-
chunk_stack
|
|
|
|
| 332 |
)
|
| 333 |
chunks.append(metadata)
|
| 334 |
chunk_stack.append((metadata.id, str(current_level), current_level_value, current_chunk_content.strip()))
|
|
|
|
| 335 |
# Bắt đầu chunk mới
|
| 336 |
current_parent = self._find_parent_for_level(chunk_stack, level, level_priority)
|
| 337 |
current_chunk_content = line + "\n"
|
|
@@ -348,10 +305,12 @@ class LawDocumentChunker:
|
|
| 348 |
current_parent,
|
| 349 |
vanbanid,
|
| 350 |
document_title,
|
| 351 |
-
chunk_stack
|
|
|
|
| 352 |
)
|
| 353 |
chunks.append(metadata)
|
| 354 |
chunk_stack.append((metadata.id, str(current_level), current_level_value, current_chunk_content.strip()))
|
|
|
|
| 355 |
current_parent = self._find_parent_for_level(chunk_stack, level, level_priority)
|
| 356 |
current_chunk_content = line + "\n"
|
| 357 |
current_level = level
|
|
@@ -371,10 +330,12 @@ class LawDocumentChunker:
|
|
| 371 |
current_parent,
|
| 372 |
vanbanid,
|
| 373 |
document_title,
|
| 374 |
-
chunk_stack
|
|
|
|
| 375 |
)
|
| 376 |
chunks.append(metadata)
|
| 377 |
chunk_stack.append((metadata.id, str(current_level), current_level_value, sub_chunk.strip()))
|
|
|
|
| 378 |
current_chunk_content = ""
|
| 379 |
# Lưu chunk cuối cùng
|
| 380 |
if current_chunk_content.strip() and current_level is not None:
|
|
@@ -385,10 +346,12 @@ class LawDocumentChunker:
|
|
| 385 |
current_parent,
|
| 386 |
vanbanid,
|
| 387 |
document_title,
|
| 388 |
-
chunk_stack
|
|
|
|
| 389 |
)
|
| 390 |
chunks.append(metadata)
|
| 391 |
chunk_stack.append((metadata.id, str(current_level), current_level_value, current_chunk_content.strip()))
|
|
|
|
| 392 |
root_count = sum(1 for chunk in chunks if chunk.cha is None)
|
| 393 |
logger.info(f"[CHUNKER] Created {len(chunks)} chunks, {root_count} root chunks")
|
| 394 |
for i, chunk in enumerate(chunks[:10]):
|
|
@@ -400,12 +363,22 @@ class LawDocumentChunker:
|
|
| 400 |
def _find_parent_for_level(self, chunk_stack: List[Tuple[str, str, Optional[str], str]],
|
| 401 |
current_level: str, level_priority: Dict[str, int]) -> Optional[str]:
|
| 402 |
"""
|
| 403 |
-
Tìm parent gần nhất có level cao hơn (priority thấp hơn) cho level hiện tại.
|
| 404 |
"""
|
| 405 |
current_priority = level_priority.get(current_level, 999)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
for chunk_id, level, level_value, content in reversed(chunk_stack):
|
| 407 |
if level_priority.get(level, 999) < current_priority:
|
| 408 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 409 |
return None
|
| 410 |
|
| 411 |
async def _create_embeddings_for_chunks(self, chunks: List[ChunkMetadata]) -> int:
|
|
|
|
| 133 |
|
| 134 |
def _create_chunk_metadata(self, content: str, level: str, level_value: Optional[str],
|
| 135 |
parent_id: Optional[str], vanbanid: int,
|
| 136 |
+
document_title: str, chunk_stack: List[Tuple[str, str, Optional[str], str]], chunk_dict: dict) -> 'ChunkMetadata':
|
| 137 |
"""Tạo metadata cho chunk."""
|
| 138 |
chunk_id = str(uuid.uuid4())
|
|
|
|
| 139 |
metadata = ChunkMetadata(
|
| 140 |
id=chunk_id,
|
| 141 |
content=content,
|
|
|
|
| 143 |
cha=parent_id,
|
| 144 |
document_title=document_title
|
| 145 |
)
|
|
|
|
| 146 |
# Điền metadata từ chunk hiện tại
|
| 147 |
if level == "DIEU" and level_value:
|
| 148 |
metadata.article_number = int(level_value) if level_value.isdigit() else None
|
|
|
|
| 151 |
metadata.clause_number = level_value
|
| 152 |
elif level == "DIEM" and level_value:
|
| 153 |
metadata.sub_clause_letter = level_value
|
|
|
|
| 154 |
# Điền metadata từ parent chunks nếu có
|
| 155 |
logger.debug(f"[CHUNKER] Creating chunk with level: {level}, parent_id: {parent_id}, stack_size: {len(chunk_stack)}")
|
| 156 |
+
if chunk_dict is not None and parent_id:
|
| 157 |
+
self._fill_metadata_from_parents(metadata, parent_id, chunk_dict)
|
| 158 |
else:
|
| 159 |
+
logger.debug(f"[CHUNKER] Skipping metadata fill - no parent_id or chunk_dict")
|
|
|
|
|
|
|
| 160 |
logger.debug(f"[CHUNKER] Final metadata for chunk {chunk_id[:8]}... - Level: {level}, Article: {metadata.article_number}, Clause: {metadata.clause_number}, Point: {metadata.sub_clause_letter}")
|
|
|
|
| 161 |
return metadata
|
| 162 |
|
| 163 |
+
def _fill_metadata_from_parents(self, metadata: ChunkMetadata, parent_id: str, chunk_dict: Dict[str, ChunkMetadata]):
|
| 164 |
"""
|
| 165 |
+
Điền metadata từ parent và ancestor (cha, ông, ...), sử dụng dict id->chunk.
|
| 166 |
"""
|
| 167 |
+
parent = chunk_dict.get(parent_id)
|
| 168 |
+
if not parent:
|
| 169 |
+
logger.warning(f"[CHUNKER] Parent chunk {parent_id} not found in chunk_dict")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
return
|
| 171 |
+
# Điền từ cha
|
| 172 |
+
if parent.article_number and not metadata.article_number:
|
| 173 |
+
metadata.article_number = parent.article_number
|
| 174 |
+
if parent.article_title and not metadata.article_title:
|
| 175 |
+
metadata.article_title = parent.article_title
|
| 176 |
+
if parent.clause_number and not metadata.clause_number:
|
| 177 |
+
metadata.clause_number = parent.clause_number
|
| 178 |
+
if parent.sub_clause_letter and not metadata.sub_clause_letter:
|
| 179 |
+
metadata.sub_clause_letter = parent.sub_clause_letter
|
| 180 |
+
# Nếu cha là Khoản, tìm ông là Điều
|
| 181 |
+
if parent.clause_number and not metadata.article_number:
|
| 182 |
+
grandparent = chunk_dict.get(parent.cha) if parent.cha else None
|
| 183 |
+
if grandparent and grandparent.article_number:
|
| 184 |
+
metadata.article_number = grandparent.article_number
|
| 185 |
+
if grandparent and grandparent.article_title:
|
| 186 |
+
metadata.article_title = grandparent.article_title
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
|
| 188 |
def _split_into_chunks(self, text: str, chunk_size: int, overlap: int) -> List[str]:
|
| 189 |
"""Chia text thành các chunk với overlap."""
|
|
|
|
| 219 |
lines = content.split('\n')
|
| 220 |
chunks = []
|
| 221 |
chunk_stack = [] # (chunk_id, level, level_value, content)
|
| 222 |
+
chunk_dict = {} # id -> ChunkMetadata
|
| 223 |
current_chunk_content = ""
|
| 224 |
current_level = None
|
| 225 |
current_level_value = None
|
|
|
|
| 257 |
None,
|
| 258 |
vanbanid,
|
| 259 |
document_title,
|
| 260 |
+
chunk_stack,
|
| 261 |
+
chunk_dict
|
| 262 |
)
|
| 263 |
chunks.append(metadata)
|
| 264 |
chunk_stack.append((metadata.id, "CONTENT", None, current_chunk_content.strip()))
|
| 265 |
+
chunk_dict[metadata.id] = metadata
|
| 266 |
preamble_done = True
|
| 267 |
current_chunk_content = ""
|
| 268 |
current_level = level
|
|
|
|
| 283 |
current_parent,
|
| 284 |
vanbanid,
|
| 285 |
document_title,
|
| 286 |
+
chunk_stack,
|
| 287 |
+
chunk_dict
|
| 288 |
)
|
| 289 |
chunks.append(metadata)
|
| 290 |
chunk_stack.append((metadata.id, str(current_level), current_level_value, current_chunk_content.strip()))
|
| 291 |
+
chunk_dict[metadata.id] = metadata
|
| 292 |
# Bắt đầu chunk mới
|
| 293 |
current_parent = self._find_parent_for_level(chunk_stack, level, level_priority)
|
| 294 |
current_chunk_content = line + "\n"
|
|
|
|
| 305 |
current_parent,
|
| 306 |
vanbanid,
|
| 307 |
document_title,
|
| 308 |
+
chunk_stack,
|
| 309 |
+
chunk_dict
|
| 310 |
)
|
| 311 |
chunks.append(metadata)
|
| 312 |
chunk_stack.append((metadata.id, str(current_level), current_level_value, current_chunk_content.strip()))
|
| 313 |
+
chunk_dict[metadata.id] = metadata
|
| 314 |
current_parent = self._find_parent_for_level(chunk_stack, level, level_priority)
|
| 315 |
current_chunk_content = line + "\n"
|
| 316 |
current_level = level
|
|
|
|
| 330 |
current_parent,
|
| 331 |
vanbanid,
|
| 332 |
document_title,
|
| 333 |
+
chunk_stack,
|
| 334 |
+
chunk_dict
|
| 335 |
)
|
| 336 |
chunks.append(metadata)
|
| 337 |
chunk_stack.append((metadata.id, str(current_level), current_level_value, sub_chunk.strip()))
|
| 338 |
+
chunk_dict[metadata.id] = metadata
|
| 339 |
current_chunk_content = ""
|
| 340 |
# Lưu chunk cuối cùng
|
| 341 |
if current_chunk_content.strip() and current_level is not None:
|
|
|
|
| 346 |
current_parent,
|
| 347 |
vanbanid,
|
| 348 |
document_title,
|
| 349 |
+
chunk_stack,
|
| 350 |
+
chunk_dict
|
| 351 |
)
|
| 352 |
chunks.append(metadata)
|
| 353 |
chunk_stack.append((metadata.id, str(current_level), current_level_value, current_chunk_content.strip()))
|
| 354 |
+
chunk_dict[metadata.id] = metadata
|
| 355 |
root_count = sum(1 for chunk in chunks if chunk.cha is None)
|
| 356 |
logger.info(f"[CHUNKER] Created {len(chunks)} chunks, {root_count} root chunks")
|
| 357 |
for i, chunk in enumerate(chunks[:10]):
|
|
|
|
| 363 |
def _find_parent_for_level(self, chunk_stack: List[Tuple[str, str, Optional[str], str]],
|
| 364 |
current_level: str, level_priority: Dict[str, int]) -> Optional[str]:
|
| 365 |
"""
|
| 366 |
+
Tìm parent gần nhất có level cao hơn (priority thấp hơn) cho level hiện tại, kiểm tra hợp lệ cha-con.
|
| 367 |
"""
|
| 368 |
current_priority = level_priority.get(current_level, 999)
|
| 369 |
+
valid_parents = {
|
| 370 |
+
"MUC": ["CHUONG", "PHAN"],
|
| 371 |
+
"DIEU": ["MUC", "CHUONG", "PHAN"],
|
| 372 |
+
"CHUONG": ["PHAN"],
|
| 373 |
+
# Các level khác giữ nguyên logic cũ
|
| 374 |
+
}
|
| 375 |
for chunk_id, level, level_value, content in reversed(chunk_stack):
|
| 376 |
if level_priority.get(level, 999) < current_priority:
|
| 377 |
+
if current_level in valid_parents:
|
| 378 |
+
if level in valid_parents[current_level]:
|
| 379 |
+
return chunk_id
|
| 380 |
+
else:
|
| 381 |
+
return chunk_id
|
| 382 |
return None
|
| 383 |
|
| 384 |
async def _create_embeddings_for_chunks(self, chunks: List[ChunkMetadata]) -> int:
|