fix metadata
Browse files
- app/gemini_client.py +3 -3
- app/law_document_chunker.py +25 -9
- app/main.py +1 -0
app/gemini_client.py
CHANGED
|
@@ -103,7 +103,7 @@ class GeminiClient:
|
|
| 103 |
logger.error(f"[GEMINI] Error counting tokens: {e}")
|
| 104 |
return 0
|
| 105 |
|
| 106 |
-
def create_embedding(self, text: str, model: Optional[str] = None) -> list:
|
| 107 |
last_error = None
|
| 108 |
max_retries = 3
|
| 109 |
|
|
@@ -117,13 +117,13 @@ class GeminiClient:
|
|
| 117 |
if not use_model:
|
| 118 |
raise ValueError("No model specified for embedding")
|
| 119 |
|
| 120 |
-
logger.info(f"[GEMINI][EMBEDDING] Using model={use_model} (requested={model}, default={default_model})")
|
| 121 |
|
| 122 |
configure(api_key=key)
|
| 123 |
response = embed_content(
|
| 124 |
model=use_model,
|
| 125 |
content=text,
|
| 126 |
-
task_type=
|
| 127 |
)
|
| 128 |
|
| 129 |
self.limit_manager.log_request(key, use_model, success=True)
|
|
|
|
| 103 |
logger.error(f"[GEMINI] Error counting tokens: {e}")
|
| 104 |
return 0
|
| 105 |
|
| 106 |
+
def create_embedding(self, text: str, model: Optional[str] = None, task_type: str = "retrieval_query") -> list:
|
| 107 |
last_error = None
|
| 108 |
max_retries = 3
|
| 109 |
|
|
|
|
| 117 |
if not use_model:
|
| 118 |
raise ValueError("No model specified for embedding")
|
| 119 |
|
| 120 |
+
logger.info(f"[GEMINI][EMBEDDING] Using model={use_model} (requested={model}, default={default_model}), task_type={task_type}")
|
| 121 |
|
| 122 |
configure(api_key=key)
|
| 123 |
response = embed_content(
|
| 124 |
model=use_model,
|
| 125 |
content=text,
|
| 126 |
+
task_type=task_type
|
| 127 |
)
|
| 128 |
|
| 129 |
self.limit_manager.log_request(key, use_model, success=True)
|
app/law_document_chunker.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import re
|
| 2 |
import os
|
| 3 |
import uuid
|
| 4 |
-
from typing import List, Dict, Optional, Tuple
|
| 5 |
from dataclasses import dataclass
|
| 6 |
from loguru import logger
|
| 7 |
from .supabase_db import SupabaseClient
|
|
@@ -30,6 +30,7 @@ class LawDocumentChunker:
|
|
| 30 |
settings = get_settings()
|
| 31 |
self.supabase_client = SupabaseClient(settings.supabase_url, settings.supabase_key)
|
| 32 |
self.embedding_client = EmbeddingClient()
|
|
|
|
| 33 |
|
| 34 |
# Regex patterns cho các cấp độ cấu trúc - SỬA LẠI ĐỂ CHÍNH XÁC HƠN
|
| 35 |
# Đảm bảo mỗi pattern có đúng số group
|
|
@@ -417,17 +418,14 @@ class LawDocumentChunker:
|
|
| 417 |
|
| 418 |
for i, chunk in enumerate(chunks, 1):
|
| 419 |
try:
|
| 420 |
-
#
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
# # Tạo context_summary bằng LLM
|
| 424 |
-
# context_summary = await self._create_context_summary_with_llm(chunk.content, chunk)
|
| 425 |
|
| 426 |
# Chuẩn bị data cho Supabase
|
| 427 |
chunk_dict = {
|
| 428 |
'id': chunk.id,
|
| 429 |
'content': chunk.content,
|
| 430 |
-
'embedding': [0.0] * 768, #
|
| 431 |
'vanbanid': chunk.vanbanid,
|
| 432 |
'cha': chunk.cha,
|
| 433 |
'document_title': chunk.document_title,
|
|
@@ -435,7 +433,7 @@ class LawDocumentChunker:
|
|
| 435 |
'article_title': chunk.article_title,
|
| 436 |
'clause_number': chunk.clause_number,
|
| 437 |
'sub_clause_letter': chunk.sub_clause_letter,
|
| 438 |
-
'context_summary': f"Structure:
|
| 439 |
}
|
| 440 |
|
| 441 |
# Lưu ngay lập tức vào Supabase
|
|
@@ -508,4 +506,22 @@ class LawDocumentChunker:
|
|
| 508 |
|
| 509 |
except Exception as e:
|
| 510 |
logger.error(f"[CHUNKER] Error processing document {document_id}: {e}") ##
|
| 511 |
-
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import re
|
| 2 |
import os
|
| 3 |
import uuid
|
| 4 |
+
from typing import List, Dict, Optional, Tuple, Any
|
| 5 |
from dataclasses import dataclass
|
| 6 |
from loguru import logger
|
| 7 |
from .supabase_db import SupabaseClient
|
|
|
|
| 30 |
settings = get_settings()
|
| 31 |
self.supabase_client = SupabaseClient(settings.supabase_url, settings.supabase_key)
|
| 32 |
self.embedding_client = EmbeddingClient()
|
| 33 |
+
self.llm_client: Optional[Any] = None
|
| 34 |
|
| 35 |
# Regex patterns cho các cấp độ cấu trúc - SỬA LẠI ĐỂ CHÍNH XÁC HƠN
|
| 36 |
# Đảm bảo mỗi pattern có đúng số group
|
|
|
|
| 418 |
|
| 419 |
for i, chunk in enumerate(chunks, 1):
|
| 420 |
try:
|
| 421 |
+
# Tạo embedding
|
| 422 |
+
embedding = await self.embedding_client.create_embedding(chunk.content, task_type="retrieval_document")
|
|
|
|
|
|
|
|
|
|
| 423 |
|
| 424 |
# Chuẩn bị data cho Supabase
|
| 425 |
chunk_dict = {
|
| 426 |
'id': chunk.id,
|
| 427 |
'content': chunk.content,
|
| 428 |
+
'embedding': embedding if embedding is not None else [0.0] * 768, # Sử dụng embedding thực tế nếu có
|
| 429 |
'vanbanid': chunk.vanbanid,
|
| 430 |
'cha': chunk.cha,
|
| 431 |
'document_title': chunk.document_title,
|
|
|
|
| 433 |
'article_title': chunk.article_title,
|
| 434 |
'clause_number': chunk.clause_number,
|
| 435 |
'sub_clause_letter': chunk.sub_clause_letter,
|
| 436 |
+
'context_summary': f"Structure: {chunk.context_summary}|Semantic: {chunk.content}"
|
| 437 |
}
|
| 438 |
|
| 439 |
# Lưu ngay lập tức vào Supabase
|
|
|
|
| 506 |
|
| 507 |
except Exception as e:
|
| 508 |
logger.error(f"[CHUNKER] Error processing document {document_id}: {e}") ##
|
| 509 |
+
return False
|
| 510 |
+
|
| 511 |
+
async def _create_semantic_summary_with_llm(self, chunk_content: str) -> str:
    """
    Generate a short (1-2 sentence) semantic summary of a law chunk via the LLM.

    This is a best-effort helper: it does not raise, and it returns an empty
    string whenever a summary cannot be produced.

    Args:
        chunk_content: Raw text of the chunk to summarise.

    Returns:
        The text returned by ``self.llm_client.generate_text`` with the
        surrounding whitespace stripped, or ``""`` when no LLM client is
        attached, the chunk is empty/blank, the LLM returns nothing, or the
        LLM call fails.
    """
    # A single getattr covers both "attribute never assigned" and "set to None".
    llm_client = getattr(self, "llm_client", None)
    if llm_client is None:
        logger.warning("[CHUNKER] llm_client chưa được gán, bỏ qua semantic summary.")
        return ""

    # Nothing to summarise: skip the LLM round trip. This also keeps a None
    # chunk from raising AttributeError before the try block is reached.
    content = chunk_content.strip() if chunk_content else ""
    if not content:
        return ""

    prompt = (
        "Tóm tắt thật ngắn gọn, súc tích nội dung luật sau (1-2 câu, không lặp lại tiêu đề, không giải thích):\n"
        f"{content}"
    )
    try:
        summary = await llm_client.generate_text(prompt)
        return summary.strip() if summary else ""
    except Exception as e:
        # Deliberate catch-all: a failed summary must not abort chunk processing.
        logger.error(f"[CHUNKER] Lỗi khi sinh semantic summary bằng LLM: {e}")
        return ""
|
app/main.py
CHANGED
|
@@ -77,6 +77,7 @@ reranker = Reranker()
|
|
| 77 |
|
| 78 |
# Khởi tạo LawDocumentChunker
|
| 79 |
law_chunker = LawDocumentChunker()
|
|
|
|
| 80 |
|
| 81 |
logger.info("[STARTUP] Mount health router...")
|
| 82 |
app.include_router(health_router)
|
|
|
|
| 77 |
|
| 78 |
# Khởi tạo LawDocumentChunker
|
| 79 |
law_chunker = LawDocumentChunker()
|
| 80 |
+
law_chunker.llm_client = llm_client
|
| 81 |
|
| 82 |
logger.info("[STARTUP] Mount health router...")
|
| 83 |
app.include_router(health_router)
|