fix metadata
Browse files- app/law_document_chunker.py +13 -0
app/law_document_chunker.py
CHANGED
|
@@ -131,6 +131,15 @@ class LawDocumentChunker:
|
|
| 131 |
logger.error(f"[CHUNKER] Error in _detect_structure_level for line '{line}': {e}")
|
| 132 |
return "CONTENT", None, None
|
| 133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
def _create_chunk_metadata(self, content: str, level: str, level_value: Optional[str],
|
| 135 |
parent_id: Optional[str], vanbanid: int,
|
| 136 |
document_title: str, chunk_stack: List[Tuple[str, str, Optional[str], str]], chunk_dict: dict) -> 'ChunkMetadata':
|
|
@@ -157,6 +166,10 @@ class LawDocumentChunker:
|
|
| 157 |
self._fill_metadata_from_parents(metadata, parent_id, chunk_dict)
|
| 158 |
else:
|
| 159 |
logger.debug(f"[CHUNKER] Skipping metadata fill - no parent_id or chunk_dict")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
logger.debug(f"[CHUNKER] Final metadata for chunk {chunk_id[:8]}... - Level: {level}, Article: {metadata.article_number}, Clause: {metadata.clause_number}, Point: {metadata.sub_clause_letter}")
|
| 161 |
return metadata
|
| 162 |
|
|
|
|
| 131 |
logger.error(f"[CHUNKER] Error in _detect_structure_level for line '{line}': {e}")
|
| 132 |
return "CONTENT", None, None
|
| 133 |
|
| 134 |
+
def _build_structure_summary(self, article_number, clause_number, sub_clause_letter):
|
| 135 |
+
if sub_clause_letter and clause_number and article_number:
|
| 136 |
+
return f"Điểm {sub_clause_letter} Khoản {clause_number} Điều {article_number}"
|
| 137 |
+
elif clause_number and article_number:
|
| 138 |
+
return f"Khoản {clause_number} Điều {article_number}"
|
| 139 |
+
elif article_number:
|
| 140 |
+
return f"Điều {article_number}"
|
| 141 |
+
return ""
|
| 142 |
+
|
| 143 |
def _create_chunk_metadata(self, content: str, level: str, level_value: Optional[str],
|
| 144 |
parent_id: Optional[str], vanbanid: int,
|
| 145 |
document_title: str, chunk_stack: List[Tuple[str, str, Optional[str], str]], chunk_dict: dict) -> 'ChunkMetadata':
|
|
|
|
| 166 |
self._fill_metadata_from_parents(metadata, parent_id, chunk_dict)
|
| 167 |
else:
|
| 168 |
logger.debug(f"[CHUNKER] Skipping metadata fill - no parent_id or chunk_dict")
|
| 169 |
+
# Gán context_summary theo format pháp lý
|
| 170 |
+
metadata.context_summary = self._build_structure_summary(
|
| 171 |
+
metadata.article_number, metadata.clause_number, metadata.sub_clause_letter
|
| 172 |
+
)
|
| 173 |
logger.debug(f"[CHUNKER] Final metadata for chunk {chunk_id[:8]}... - Level: {level}, Article: {metadata.article_number}, Clause: {metadata.clause_number}, Point: {metadata.sub_clause_letter}")
|
| 174 |
return metadata
|
| 175 |
|