fix metadata)
Browse files- app/law_document_chunker.py +80 -105
- app/supabase_db.py +2 -1
app/law_document_chunker.py
CHANGED
|
@@ -177,108 +177,61 @@ class LawDocumentChunker:
|
|
| 177 |
logger.debug(f"[CHUNKER] Filling metadata for chunk with parent_id: {parent_id}")
|
| 178 |
logger.debug(f"[CHUNKER] Chunk stack has {len(chunk_stack)} items")
|
| 179 |
|
| 180 |
-
# Tìm
|
| 181 |
-
|
| 182 |
-
ancestors = []
|
| 183 |
-
|
| 184 |
-
# Tìm tất cả chunks Điều và Khoản xuất hiện trước chunk hiện tại
|
| 185 |
for chunk_id, level, level_value, content in chunk_stack:
|
| 186 |
-
# Dừng khi gặp chunk hiện tại
|
| 187 |
if chunk_id == parent_id:
|
|
|
|
| 188 |
break
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
break
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
elif level == "KHOAN" and level_value:
|
| 221 |
-
if not metadata.clause_number: # Chỉ điền nếu chưa có
|
| 222 |
-
metadata.clause_number = level_value
|
| 223 |
-
logger.debug(f"[CHUNKER] Set clause_number: {metadata.clause_number}")
|
| 224 |
-
elif level == "DIEM" and level_value:
|
| 225 |
-
if not metadata.sub_clause_letter: # Chỉ điền nếu chưa có
|
| 226 |
-
metadata.sub_clause_letter = level_value
|
| 227 |
-
logger.debug(f"[CHUNKER] Set sub_clause_letter: {metadata.sub_clause_letter}")
|
| 228 |
-
|
| 229 |
-
async def _create_context_summary_with_llm(self, content: str, metadata: ChunkMetadata) -> str:
|
| 230 |
-
"""
|
| 231 |
-
Tạo context_summary bằng LLM theo format: "Structure: LEVEL | Semantic: SUMMARY"
|
| 232 |
-
"""
|
| 233 |
-
try:
|
| 234 |
-
# Tạo LEVEL từ metadata
|
| 235 |
-
level_parts = []
|
| 236 |
-
if metadata.sub_clause_letter:
|
| 237 |
-
level_parts.append(f"Điểm {metadata.sub_clause_letter}")
|
| 238 |
-
if metadata.clause_number:
|
| 239 |
-
level_parts.append(f"Khoản {metadata.clause_number}")
|
| 240 |
-
if metadata.article_number:
|
| 241 |
-
level_parts.append(f"Điều {metadata.article_number}")
|
| 242 |
-
|
| 243 |
-
level = " ".join(reversed(level_parts)) if level_parts else "Nội dung"
|
| 244 |
-
|
| 245 |
-
# Gọi LLM để tóm tắt chủ đề
|
| 246 |
-
summary_prompt = f"""
|
| 247 |
-
Tóm tắt ngắn gọn chủ đề chính của đoạn văn bản sau trong 1-2 câu:
|
| 248 |
-
|
| 249 |
-
{content[:500]}...
|
| 250 |
-
|
| 251 |
-
Trả về chỉ nội dung tóm tắt, không có thêm text nào khác.
|
| 252 |
-
"""
|
| 253 |
-
|
| 254 |
-
# Sử dụng GeminiClient với RequestLimitManager
|
| 255 |
-
from .gemini_client import GeminiClient
|
| 256 |
-
|
| 257 |
-
gemini_client = GeminiClient()
|
| 258 |
-
summary_response = gemini_client.generate_text(
|
| 259 |
-
prompt=summary_prompt
|
| 260 |
-
)
|
| 261 |
-
|
| 262 |
-
summary = summary_response.strip() if summary_response else "Không có tóm tắt"
|
| 263 |
-
|
| 264 |
-
# Tạo context_summary theo format yêu cầu
|
| 265 |
-
context_summary = f"Structure: {level} | Semantic: {summary}"
|
| 266 |
-
|
| 267 |
-
return context_summary
|
| 268 |
-
|
| 269 |
-
except Exception as e:
|
| 270 |
-
logger.error(f"[CHUNKER] Error creating context_summary with LLM: {e}")
|
| 271 |
-
# Fallback nếu LLM lỗi
|
| 272 |
-
level_parts = []
|
| 273 |
-
if metadata.sub_clause_letter:
|
| 274 |
-
level_parts.append(f"Điểm {metadata.sub_clause_letter}")
|
| 275 |
-
if metadata.clause_number:
|
| 276 |
-
level_parts.append(f"Khoản {metadata.clause_number}")
|
| 277 |
-
if metadata.article_number:
|
| 278 |
-
level_parts.append(f"Điều {metadata.article_number}")
|
| 279 |
-
|
| 280 |
-
level = " ".join(reversed(level_parts)) if level_parts else "Nội dung"
|
| 281 |
-
return f"Structure: {level} | Semantic: Không có tóm tắt"
|
| 282 |
|
| 283 |
def _split_into_chunks(self, text: str, chunk_size: int, overlap: int) -> List[str]:
|
| 284 |
"""Chia text thành các chunk với overlap."""
|
|
@@ -435,8 +388,16 @@ class LawDocumentChunker:
|
|
| 435 |
|
| 436 |
# Tìm từ cuối stack (gần nhất) đến đầu stack
|
| 437 |
for chunk_id, level, level_value, content in reversed(chunk_stack):
|
|
|
|
| 438 |
if level_priority.get(level, 999) < current_priority:
|
| 439 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 440 |
|
| 441 |
return None
|
| 442 |
|
|
@@ -447,19 +408,33 @@ class LawDocumentChunker:
|
|
| 447 |
success_count = 0
|
| 448 |
failed_count = 0
|
| 449 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 450 |
for i, chunk in enumerate(chunks, 1):
|
| 451 |
try:
|
| 452 |
-
# Tạo embedding
|
| 453 |
-
embedding = await self.embedding_client.create_embedding(chunk.content)
|
| 454 |
|
| 455 |
-
# Tạo context_summary bằng LLM
|
| 456 |
-
context_summary = await self._create_context_summary_with_llm(chunk.content, chunk)
|
| 457 |
|
| 458 |
# Chuẩn bị data cho Supabase
|
| 459 |
chunk_dict = {
|
| 460 |
'id': chunk.id,
|
| 461 |
'content': chunk.content,
|
| 462 |
-
'embedding': embedding
|
| 463 |
'vanbanid': chunk.vanbanid,
|
| 464 |
'cha': chunk.cha,
|
| 465 |
'document_title': chunk.document_title,
|
|
@@ -467,7 +442,7 @@ class LawDocumentChunker:
|
|
| 467 |
'article_title': chunk.article_title,
|
| 468 |
'clause_number': chunk.clause_number,
|
| 469 |
'sub_clause_letter': chunk.sub_clause_letter,
|
| 470 |
-
'context_summary': context_summary
|
| 471 |
}
|
| 472 |
|
| 473 |
# Lưu ngay lập tức vào Supabase
|
|
|
|
| 177 |
logger.debug(f"[CHUNKER] Filling metadata for chunk with parent_id: {parent_id}")
|
| 178 |
logger.debug(f"[CHUNKER] Chunk stack has {len(chunk_stack)} items")
|
| 179 |
|
| 180 |
+
# Tìm parent chunk trong stack
|
| 181 |
+
parent_chunk = None
|
|
|
|
|
|
|
|
|
|
| 182 |
for chunk_id, level, level_value, content in chunk_stack:
|
|
|
|
| 183 |
if chunk_id == parent_id:
|
| 184 |
+
parent_chunk = (level, level_value, content)
|
| 185 |
break
|
| 186 |
+
|
| 187 |
+
if not parent_chunk:
|
| 188 |
+
logger.warning(f"[CHUNKER] Parent chunk {parent_id} not found in stack")
|
| 189 |
+
return
|
| 190 |
+
|
| 191 |
+
parent_level, parent_value, parent_content = parent_chunk
|
| 192 |
+
|
| 193 |
+
# Điền metadata từ parent trực tiếp
|
| 194 |
+
if parent_level == "DIEU" and parent_value:
|
| 195 |
+
if not metadata.article_number:
|
| 196 |
+
metadata.article_number = int(parent_value) if parent_value.isdigit() else None
|
| 197 |
+
logger.debug(f"[CHUNKER] Set article_number from parent: {metadata.article_number}")
|
| 198 |
+
if not metadata.article_title:
|
| 199 |
+
first_line = parent_content.split('\n')[0].strip() if parent_content else ""
|
| 200 |
+
metadata.article_title = first_line
|
| 201 |
+
logger.debug(f"[CHUNKER] Set article_title from parent: {metadata.article_title}")
|
| 202 |
+
|
| 203 |
+
elif parent_level == "KHOAN" and parent_value:
|
| 204 |
+
if not metadata.clause_number:
|
| 205 |
+
metadata.clause_number = parent_value
|
| 206 |
+
logger.debug(f"[CHUNKER] Set clause_number from parent: {metadata.clause_number}")
|
| 207 |
+
|
| 208 |
+
# Tìm grandparent (ông) nếu cần
|
| 209 |
+
# Tìm parent của parent trong stack
|
| 210 |
+
grandparent_id = None
|
| 211 |
+
for chunk_id, level, level_value, content in chunk_stack:
|
| 212 |
+
if chunk_id == parent_id:
|
| 213 |
+
# Tìm parent của chunk này
|
| 214 |
+
for cid, lvl, lv, cont in reversed(chunk_stack):
|
| 215 |
+
if cid == chunk_id:
|
| 216 |
break
|
| 217 |
+
if lvl == "DIEU" and parent_level == "KHOAN":
|
| 218 |
+
grandparent_id = cid
|
| 219 |
+
break
|
| 220 |
+
break
|
| 221 |
+
|
| 222 |
+
if grandparent_id:
|
| 223 |
+
# Điền metadata từ grandparent
|
| 224 |
+
for chunk_id, level, level_value, content in chunk_stack:
|
| 225 |
+
if chunk_id == grandparent_id:
|
| 226 |
+
if level == "DIEU" and level_value:
|
| 227 |
+
if not metadata.article_number:
|
| 228 |
+
metadata.article_number = int(level_value) if level_value.isdigit() else None
|
| 229 |
+
logger.debug(f"[CHUNKER] Set article_number from grandparent: {metadata.article_number}")
|
| 230 |
+
if not metadata.article_title:
|
| 231 |
+
first_line = content.split('\n')[0].strip() if content else ""
|
| 232 |
+
metadata.article_title = first_line
|
| 233 |
+
logger.debug(f"[CHUNKER] Set article_title from grandparent: {metadata.article_title}")
|
| 234 |
+
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
|
| 236 |
def _split_into_chunks(self, text: str, chunk_size: int, overlap: int) -> List[str]:
|
| 237 |
"""Chia text thành các chunk với overlap."""
|
|
|
|
| 388 |
|
| 389 |
# Tìm từ cuối stack (gần nhất) đến đầu stack
|
| 390 |
for chunk_id, level, level_value, content in reversed(chunk_stack):
|
| 391 |
+
# Kiểm tra cả priority và quan hệ hợp lệ
|
| 392 |
if level_priority.get(level, 999) < current_priority:
|
| 393 |
+
# Kiểm tra quan hệ hợp lệ: Điều -> Khoản -> Điểm
|
| 394 |
+
if current_level == "KHOAN" and level == "DIEU":
|
| 395 |
+
return chunk_id
|
| 396 |
+
elif current_level == "DIEM" and level == "KHOAN":
|
| 397 |
+
return chunk_id
|
| 398 |
+
elif current_level == "CONTENT":
|
| 399 |
+
# Content có thể có parent là bất kỳ level nào
|
| 400 |
+
return chunk_id
|
| 401 |
|
| 402 |
return None
|
| 403 |
|
|
|
|
| 408 |
success_count = 0
|
| 409 |
failed_count = 0
|
| 410 |
|
| 411 |
+
# Debug: Log chi tiết metadata của từng chunk
|
| 412 |
+
logger.info(f"[CHUNKER] === DETAILED METADATA ANALYSIS ===")
|
| 413 |
+
for i, chunk in enumerate(chunks[:20]): # Log 20 chunks đầu tiên
|
| 414 |
+
logger.info(f"[CHUNKER] Chunk {i+1}:")
|
| 415 |
+
logger.info(f" - ID: {chunk.id[:8]}...")
|
| 416 |
+
logger.info(f" - Content: {chunk.content[:100]}...")
|
| 417 |
+
logger.info(f" - Parent: {chunk.cha}")
|
| 418 |
+
logger.info(f" - Article: {chunk.article_number}")
|
| 419 |
+
logger.info(f" - Article Title: {chunk.article_title}")
|
| 420 |
+
logger.info(f" - Clause: {chunk.clause_number}")
|
| 421 |
+
logger.info(f" - Point: {chunk.sub_clause_letter}")
|
| 422 |
+
logger.info(f" - Document: {chunk.document_title}")
|
| 423 |
+
logger.info(f" ---")
|
| 424 |
+
|
| 425 |
for i, chunk in enumerate(chunks, 1):
|
| 426 |
try:
|
| 427 |
+
# # Tạo embedding
|
| 428 |
+
# embedding = await self.embedding_client.create_embedding(chunk.content)
|
| 429 |
|
| 430 |
+
# # Tạo context_summary bằng LLM
|
| 431 |
+
# context_summary = await self._create_context_summary_with_llm(chunk.content, chunk)
|
| 432 |
|
| 433 |
# Chuẩn bị data cho Supabase
|
| 434 |
chunk_dict = {
|
| 435 |
'id': chunk.id,
|
| 436 |
'content': chunk.content,
|
| 437 |
+
'embedding': [], # Empty embedding for testing
|
| 438 |
'vanbanid': chunk.vanbanid,
|
| 439 |
'cha': chunk.cha,
|
| 440 |
'document_title': chunk.document_title,
|
|
|
|
| 442 |
'article_title': chunk.article_title,
|
| 443 |
'clause_number': chunk.clause_number,
|
| 444 |
'sub_clause_letter': chunk.sub_clause_letter,
|
| 445 |
+
'context_summary': f"Structure: Test | Semantic: Test" # Test context_summary
|
| 446 |
}
|
| 447 |
|
| 448 |
# Lưu ngay lập tức vào Supabase
|
app/supabase_db.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
from typing import Any, Dict, List, Optional
|
|
|
|
| 2 |
from supabase.client import create_client, Client
|
| 3 |
from loguru import logger
|
| 4 |
|
|
@@ -192,7 +193,7 @@ class SupabaseClient:
|
|
| 192 |
logger.info("[SUPABASE] Fetching all document chunks")
|
| 193 |
|
| 194 |
# Đếm tổng số records trước
|
| 195 |
-
count_response = self.client.table('document_chunks').select('*', count=
|
| 196 |
total_count = count_response.count if hasattr(count_response, 'count') else 'unknown'
|
| 197 |
logger.info(f"[SUPABASE] Total records in table: {total_count}")
|
| 198 |
|
|
|
|
| 1 |
from typing import Any, Dict, List, Optional
|
| 2 |
+
from postgrest.types import CountMethod
|
| 3 |
from supabase.client import create_client, Client
|
| 4 |
from loguru import logger
|
| 5 |
|
|
|
|
| 193 |
logger.info("[SUPABASE] Fetching all document chunks")
|
| 194 |
|
| 195 |
# Đếm tổng số records trước
|
| 196 |
+
count_response = self.client.table('document_chunks').select('*', count=CountMethod.exact).execute()
|
| 197 |
total_count = count_response.count if hasattr(count_response, 'count') else 'unknown'
|
| 198 |
logger.info(f"[SUPABASE] Total records in table: {total_count}")
|
| 199 |
|