Spaces:

VietCat
/

RAGSample

Sleeping

App Files Community

VietCat committed on Jun 20, 2025

Commit

7bc48c0

1 Parent(s): 2db7d31

fix log and download file

Browse files

Files changed (4) hide show

app.py +19 -1
rag_core/business.py +1 -1
rag_core/chunker.py +77 -60
rag_core/llm.py +2 -1

app.py CHANGED Viewed

@@ -3,6 +3,9 @@ from rag_core.business import answer_query, rescan_index
 from ui import app_ui
 import gradio as gr
 import logging
 logging.info("🚀 Khởi động ứng dụng FastAPI...")
@@ -20,7 +23,22 @@ async def rescan_api():
     logging.info("♻️ API /rescan được gọi")
     return rescan_index()
 # Mount Gradio UI vào FastAPI tại root
 app = gr.mount_gradio_app(app, app_ui, path="")
-logging.info("✅ Gradio UI đã mount vào root /")

 from ui import app_ui
 import gradio as gr
 import logging
+from fastapi.responses import JSONResponse
+import json
+import os
 logging.info("🚀 Khởi động ứng dụng FastAPI...")
     logging.info("♻️ API /rescan được gọi")
     return rescan_index()
+@app.get("/get_structure_file")
+def get_structure_file():
+    path = "faiss_index/chunk_structure.json"
+    if os.path.exists(path):
+        try:
+            with open(path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+            return JSONResponse(content=data)
+        except Exception as e:
+            logging.error(f"❌ Lỗi đọc file JSON: {e}")
+            return {"error": f"Lỗi đọc file: {str(e)}"}
+    else:
+        logging.warning("⚠️ File chunk_structure.json không tồn tại.")
+        return {"error": "File không tồn tại."}
 # Mount Gradio UI vào FastAPI tại root
 app = gr.mount_gradio_app(app, app_ui, path="")
+logging.info("✅ Gradio UI đã mount vào root /")

rag_core/business.py CHANGED Viewed

@@ -66,7 +66,7 @@ def answer_query(query: str) -> str:
         prompt = (
             "Bạn là một trợ lý AI có kiến thức pháp luật, hãy trả lời câu hỏi dựa trên các đoạn luật sau. "
-            "Chỉ sử dụng thông tin có trong các đoạn, không tự đoán. Nếu không có kết quả, hãy trả lời là không có kết quả.\n"
         )
         prompt += "\n\n".join(docs)
         prompt += f"\n\nCâu hỏi: {query}\nTrả lời:"

         prompt = (
             "Bạn là một trợ lý AI có kiến thức pháp luật, hãy trả lời câu hỏi dựa trên các đoạn luật sau. "
+            "Chỉ sử dụng thông tin có trong các đoạn, không tự đoán.\n"
         )
         prompt += "\n\n".join(docs)
         prompt += f"\n\nCâu hỏi: {query}\nTrả lời:"

rag_core/chunker.py CHANGED Viewed

@@ -1,74 +1,91 @@
 import re
-import os
 import json
 import logging
-from typing import List
-from rag_core.utils import log_timed
-@log_timed("chunking văn bản luật")
-def chunk_legal_text(text: str) -> List[str]:
-    # Mẫu regex cho cấp chương, điều, khoản, điểm
-    chapter_pattern = r"(Chương\s+[IVXLC]+\s+.+?)(?=(?:\nChương\s+[IVXLC]+\s+)|\Z)"
-    article_pattern = r"(Điều\s+\d+\..+?)(?=(?:\nĐiều\s+\d+\.|\nChương\s+[IVXLC]+|\Z))"
-    clause_pattern = r"(?:\n|\A)(\d+\..+?)(?=(?:\n\d+\.|\n[a-zA-Z]\)|\nĐiều\s+\d+\.|\nChương\s+[IVXLC]+|\Z))"
-    point_pattern = r"(?:\n|\A)([a-zA-Z][\)|\.].+?)(?=(?:\n[a-zA-Z][\)|\.]|\n\d+\.|\nĐiều\s+\d+\.|\nChương\s+[IVXLC]+|\Z))"
-    chapters = []
-    for chapter_match in re.finditer(chapter_pattern, text, flags=re.DOTALL):
-        chapter_text = chapter_match.group(1).strip()
-        chapter = {
-            "title": chapter_text.split("\n")[0],
-            "articles": []
-        }
-        article_block = chapter_text[len(chapter["title"]):].strip()
-        for article_match in re.finditer(article_pattern, article_block, flags=re.DOTALL):
-            article_text = article_match.group(1).strip()
-            article = {
-                "title": article_text.split("\n")[0],
                 "clauses": []
             }
-            clause_block = article_text[len(article["title"]):].strip()
-            clauses = re.findall(clause_pattern, clause_block, flags=re.DOTALL)
-            if clauses:
-                for clause_text in clauses:
-                    clause = {
-                        "title": clause_text.split("\n")[0],
-                        "points": []
-                    }
-                    point_block = clause_text[len(clause["title"]):].strip()
-                    points = re.findall(point_pattern, point_block, flags=re.DOTALL)
-                    if points:
-                        clause["points"] = [p.strip() for p in points if len(p.strip()) > 10]
-                    article["clauses"].append(clause)
             else:
-                # Nếu không có clause, thêm luôn toàn bộ nội dung
-                article["clauses"] = []
-            chapter["articles"].append(article)
-        chapters.append(chapter)
-    # Flatten để tạo chunks
     chunks = []
-    for chapter in chapters:
-        for article in chapter.get("articles", []):
-            if not article.get("clauses"):
-                chunks.append(article["title"])
                 continue
-            for clause in article.get("clauses", []):
-                if not clause.get("points"):
-                    chunks.append(clause["title"])
-                else:
-                    for point in clause.get("points"):
-                        chunks.append(point)
-    # Log cấu trúc nested ra file JSON
-    json_path_local = "faiss_index/chunk_structure.json"
-    os.makedirs(os.path.dirname(json_path_local), exist_ok=True)
-    with open(json_path_local, "w", encoding="utf-8") as f:
-        json.dump(chapters, f, indent=2, ensure_ascii=False)
-    logging.info(f"✅ Đã ghi cấu trúc nested vào {json_path_local}")
-    # Gợi ý đường dẫn tải cho Hugging Face Spaces
-    download_link = "/file/faiss_index/chunk_structure.json"
-    logging.info(f"📎 Link tải JSON cấu trúc: {download_link}")
     return chunks

 import re
 import json
+import os
 import logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+SECTION_RE = re.compile(r"^\s*(Điều\s+\d+[A-Z]?)\.?\s*(.*)")
+CLAUSE_RE = re.compile(r"^\s*(\d+)\.?\s+(.*)")
+POINT_RE = re.compile(r"^\s*([a-zA-Z])\)\s+(.*)")
+PUBLIC_CHUNK_JSON_PATH = "faiss_index/chunk_structure.json"
+def chunk_legal_text(text):
+    logging.info("📑 Bắt đầu chunk văn bản luật...")
+    articles = []
+    current_article = None
+    current_clause = None
+    for line in text.splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        sec_match = SECTION_RE.match(line)
+        clause_match = CLAUSE_RE.match(line)
+        point_match = POINT_RE.match(line)
+        if sec_match:
+            if current_article:
+                articles.append(current_article)
+            current_article = {
+                "article": sec_match.group(1),
+                "title": sec_match.group(2),
                 "clauses": []
             }
+            current_clause = None
+        elif clause_match and current_article:
+            current_clause = {
+                "clause": clause_match.group(1),
+                "text": clause_match.group(2),
+                "points": []
+            }
+            current_article["clauses"].append(current_clause)
+        elif point_match and current_clause:
+            current_clause["points"].append({
+                "point": point_match.group(1),
+                "text": point_match.group(2)
+            })
+        elif current_clause:
+            if current_clause["points"]:
+                current_clause["points"][-1]["text"] += " " + line
             else:
+                current_clause["text"] += " " + line
+        elif current_article:
+            if current_article["clauses"]:
+                current_article["clauses"][-1]["text"] += " " + line
+    if current_article:
+        articles.append(current_article)
+    logging.info(f"🔎 Đã phân tích được {len(articles)} điều luật")
     chunks = []
+    for article in articles:
+        article_header = f"{article['article']}. {article['title']}"
+        if not article.get("clauses"):
+            chunks.append(article_header)
+            continue
+        for clause in article.get("clauses", []):
+            clause_header = f"{article['article']}.{clause['clause']}: {clause['text']}"
+            if not clause.get("points"):
+                chunks.append(f"{article_header}\n{clause_header}")
                 continue
+            for point in clause.get("points", []):
+                chunks.append(f"{article_header}\n{clause_header}\n{point['point']}) {point['text']}")
+    try:
+        os.makedirs(os.path.dirname(PUBLIC_CHUNK_JSON_PATH), exist_ok=True)
+        with open(PUBLIC_CHUNK_JSON_PATH, "w", encoding="utf-8") as f:
+            json.dump(articles, f, ensure_ascii=False, indent=2)
+        logging.info(f"✅ Đã ghi cấu trúc nested vào {PUBLIC_CHUNK_JSON_PATH}")
+        if os.path.exists(PUBLIC_CHUNK_JSON_PATH):
+            logging.info("📁 File chunk_structure.json đã được tạo thành công và có thể truy cập công khai.")
+        else:
+            logging.warning("⚠️ File chunk_structure.json không tồn tại sau khi ghi.")
+    except Exception as e:
+        logging.error(f"❌ Lỗi khi ghi file JSON: {e}")
     return chunks

rag_core/llm.py CHANGED Viewed

@@ -2,7 +2,7 @@ import requests
 import logging
 import time
-LLM_ENDPOINT = "https://vietcat-gemma34b.hf.space/analyze"
 def generate_answer(prompt: str) -> str:
     max_retries = 3
@@ -10,6 +10,7 @@ def generate_answer(prompt: str) -> str:
     for attempt in range(1, max_retries + 1):
         try:
             logging.info(f"📡 Gửi request đến LLM (lần {attempt}, timeout={timeout}s)...")
             response = requests.post(
                 LLM_ENDPOINT,

 import logging
 import time
+LLM_ENDPOINT = "https://vietcat-gemma34b.hf.space/purechat"
 def generate_answer(prompt: str) -> str:
     max_retries = 3
     for attempt in range(1, max_retries + 1):
         try:
+            logging.info(f"📡 Gửi request đến LLM tại {LLM_ENDPOINT}")
             logging.info(f"📡 Gửi request đến LLM (lần {attempt}, timeout={timeout}s)...")
             response = requests.post(
                 LLM_ENDPOINT,