Spaces:

TradaAI
/

Chatopus

Running

App Files Files Community

VietCat committed on Jul 5, 2025

Commit

35f4ffd

1 Parent(s): 55d95bd

add data viewer

Browse files

Files changed (2) hide show

app/law_document_chunker.py +36 -14
app/main.py +75 -8

app/law_document_chunker.py CHANGED Viewed

@@ -183,12 +183,27 @@ class LawDocumentChunker:
         """Xử lý văn bản theo cấu trúc phân cấp."""
         lines = content.split('\n')
         chunks = []
-        parent_stack = []  # Stack để theo dõi parent IDs
-        current_parent = None
         current_chunk_content = ""
         current_level = "CONTENT"
         current_level_value = None
         for line in lines:
             level, level_value, level_content = self._detect_structure_level(line)
@@ -207,18 +222,11 @@ class LawDocumentChunker:
                     )
                     chunks.append(metadata)
-                    # Cập nhật parent stack
-                    if level in ["PHAN", "PHU_LUC", "CHUONG", "MUC"]:
-                        # Cấp độ cao, reset stack
-                        parent_stack = [metadata.id]
-                        current_parent = metadata.id
-                    elif level == "DIEU":
-                        # Điều thuộc về cấp độ cao nhất hiện tại
-                        current_parent = parent_stack[-1] if parent_stack else None
-                        parent_stack.append(metadata.id)
-                    elif level in ["KHOAN", "DIEM"]:
-                        # Khoản/Điểm thuộc về Điều hiện tại
-                        current_parent = parent_stack[-1] if parent_stack else None
                 # Bắt đầu chunk mới
                 current_chunk_content = line + "\n"
@@ -260,6 +268,20 @@ class LawDocumentChunker:
         logger.info(f"[CHUNKER] Created {len(chunks)} chunks from document")
         return chunks
     async def _create_embeddings_for_chunks(self, chunks: List[ChunkMetadata]) -> int:
         """Tạo embeddings cho các chunks và lưu ngay lập tức vào Supabase."""

         """Xử lý văn bản theo cấu trúc phân cấp."""
         lines = content.split('\n')
         chunks = []
+        # Stack để theo dõi các chunks theo thứ tự xuất hiện
+        # Mỗi item là (chunk_id, level, level_value)
+        chunk_stack = []
         current_chunk_content = ""
         current_level = "CONTENT"
         current_level_value = None
+        current_parent = None
+        # Định nghĩa thứ tự ưu tiên của các level (số càng nhỏ càng cao)
+        level_priority = {
+            "PHAN": 1,
+            "PHU_LUC": 1,
+            "CHUONG": 2,
+            "MUC": 3,
+            "DIEU": 4,
+            "KHOAN": 5,
+            "DIEM": 6,
+            "CONTENT": 7
+        }
         for line in lines:
             level, level_value, level_content = self._detect_structure_level(line)
                     )
                     chunks.append(metadata)
+                    # Thêm vào stack
+                    chunk_stack.append((metadata.id, current_level, current_level_value))
+                # Tìm parent cho level mới
+                current_parent = self._find_parent_for_level(chunk_stack, level, level_priority)
                 # Bắt đầu chunk mới
                 current_chunk_content = line + "\n"
         logger.info(f"[CHUNKER] Created {len(chunks)} chunks from document")
         return chunks
+    def _find_parent_for_level(self, chunk_stack: List[Tuple[str, str, Optional[str]]],
+                              current_level: str, level_priority: Dict[str, int]) -> Optional[str]:
+        """
+        Return the id of the nearest earlier chunk that outranks ``current_level``.
+
+        ``chunk_stack`` holds ``(chunk_id, level, level_value)`` tuples in the
+        order the chunks were emitted. It is scanned from the most recent entry
+        backwards, and the first entry whose priority number is strictly
+        smaller than that of ``current_level`` is taken as the parent.
+        Levels missing from ``level_priority`` are ranked 999, i.e. below
+        every known level. Returns ``None`` when no entry outranks the level.
+        """
+        threshold = level_priority.get(current_level, 999)
+        # Newest entries first, so the closest enclosing structure wins.
+        outranking_ids = (
+            candidate_id
+            for candidate_id, candidate_level, _ in reversed(chunk_stack)
+            if level_priority.get(candidate_level, 999) < threshold
+        )
+        return next(outranking_ids, None)
     async def _create_embeddings_for_chunks(self, chunks: List[ChunkMetadata]) -> int:
         """Tạo embeddings cho các chunks và lưu ngay lập tức vào Supabase."""

app/main.py CHANGED Viewed

@@ -2,7 +2,7 @@ from fastapi import FastAPI, Request, HTTPException, Depends
 from fastapi.middleware.cors import CORSMiddleware
 from loguru import logger
 import json
-from typing import Dict, Any, List
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
 import os
@@ -698,7 +698,7 @@ async def update_all_documents():
 @timing_decorator_async
 async def view_all_document_chunks():
     """
-    API xem toàn bộ dữ liệu trong bảng document_chunks.
     """
     try:
         logger.info("[API] Starting view all document chunks")
@@ -710,7 +710,7 @@ async def view_all_document_chunks():
         total_chunks = len(chunks_data)
         unique_documents = len(set(chunk.get('vanbanid') for chunk in chunks_data if chunk.get('vanbanid')))
-        # Nhóm theo vanbanid để thống kê và tổ chức data
         chunks_by_document = {}
         for chunk in chunks_data:
             vanbanid = chunk.get('vanbanid')
@@ -720,7 +720,7 @@ async def view_all_document_chunks():
         # Thống kê chi tiết
         document_stats = []
-        grouped_data = []
         for vanbanid, chunks in chunks_by_document.items():
             # Thống kê
@@ -730,12 +730,14 @@ async def view_all_document_chunks():
                 "document_title": chunks[0].get('document_title', 'Unknown') if chunks else 'Unknown'
             })
-            # Nhóm data theo vanbanid
-            grouped_data.append({
                 "vanbanid": vanbanid,
                 "document_title": chunks[0].get('document_title', 'Unknown') if chunks else 'Unknown',
                 "chunk_count": len(chunks),
-                "chunks": chunks
             })
         return {
@@ -746,13 +748,78 @@ async def view_all_document_chunks():
                 "unique_documents": unique_documents,
                 "document_stats": document_stats
             },
-            "data": grouped_data
         }
     except Exception as e:
         logger.error(f"[API] Error in view_all_document_chunks: {e}")
         raise HTTPException(status_code=500, detail=f"Lỗi: {str(e)}")
 @app.get("/api/document-chunks/status")
 @timing_decorator_async
 async def get_document_chunks_status():

 from fastapi.middleware.cors import CORSMiddleware
 from loguru import logger
 import json
+from typing import Dict, Any, List, Optional
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
 import os
 @timing_decorator_async
 async def view_all_document_chunks():
     """
+    API xem toàn bộ dữ liệu trong bảng document_chunks theo cấu trúc cây.
     """
     try:
         logger.info("[API] Starting view all document chunks")
         total_chunks = len(chunks_data)
         unique_documents = len(set(chunk.get('vanbanid') for chunk in chunks_data if chunk.get('vanbanid')))
+        # Nhóm theo vanbanid
         chunks_by_document = {}
         for chunk in chunks_data:
             vanbanid = chunk.get('vanbanid')
         # Thống kê chi tiết
         document_stats = []
+        hierarchical_data = []
         for vanbanid, chunks in chunks_by_document.items():
             # Thống kê
                 "document_title": chunks[0].get('document_title', 'Unknown') if chunks else 'Unknown'
             })
+            # Tạo cấu trúc cây cho từng văn bản
+            tree_structure = build_chunk_tree(chunks)
+            hierarchical_data.append({
                 "vanbanid": vanbanid,
                 "document_title": chunks[0].get('document_title', 'Unknown') if chunks else 'Unknown',
                 "chunk_count": len(chunks),
+                "chunks": tree_structure
             })
         return {
                 "unique_documents": unique_documents,
                 "document_stats": document_stats
             },
+            "data": hierarchical_data
         }
     except Exception as e:
         logger.error(f"[API] Error in view_all_document_chunks: {e}")
         raise HTTPException(status_code=500, detail=f"Lỗi: {str(e)}")
+def build_chunk_tree(chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """
+    Build a nested tree from a flat list of chunk rows.
+
+    Each row must have an ``id`` key; its parent is named by the optional
+    ``cha`` key. Every row becomes a node that copies a fixed set of fields,
+    keeps the original row under ``data`` and collects its descendants under
+    ``children``.
+
+    A node is returned as a root when ``cha`` is ``None``, when ``cha`` names
+    an id that is not in ``chunks``, or when attaching it would close a
+    parent cycle (including a row that names itself as parent).
+
+    Parents are resolved through an id index built before any linking, so the
+    result does not depend on whether a parent row appears before or after
+    its children, and the build takes one pass per row instead of one tree
+    search per row. Roots and children keep the order of ``chunks``.
+    """
+    # Pass 1: create every node up front so parents can be found by id
+    # regardless of row order. On duplicate ids the first row is the one
+    # that later rows attach to.
+    nodes: List[Dict[str, Any]] = []
+    node_by_id: Dict[Any, Dict[str, Any]] = {}
+    for chunk in chunks:
+        node = {
+            "id": chunk['id'],
+            "content": chunk.get('content', ''),
+            "vanbanid": chunk.get('vanbanid'),
+            "cha": chunk.get('cha'),
+            "document_title": chunk.get('document_title', ''),
+            "article_number": chunk.get('article_number'),
+            "article_title": chunk.get('article_title', ''),
+            "clause_number": chunk.get('clause_number', ''),
+            "sub_clause_letter": chunk.get('sub_clause_letter', ''),
+            "context_summary": chunk.get('context_summary', ''),
+            "data": chunk,  # the complete original row
+            "children": []
+        }
+        nodes.append(node)
+        node_by_id.setdefault(node["id"], node)
+    # Pass 2: link each node under its parent, or keep it as a root.
+    root_chunks: List[Dict[str, Any]] = []
+    linked_parent: Dict[Any, Any] = {}  # chunk id -> id it was attached under
+    for node in nodes:
+        chunk_id = node["id"]
+        parent_id = node["cha"]
+        parent_node = None if parent_id is None else node_by_id.get(parent_id)
+        # Only the indexed node of an id can ever receive children, so only
+        # it can take part in a cycle.
+        is_indexed = node_by_id[chunk_id] is node
+        if parent_node is not None and is_indexed:
+            # Walk up the links made so far; reaching this node's own id
+            # means attaching it would create a cycle, which would hide the
+            # nodes from the roots and make the result non-serializable.
+            ancestor = parent_id
+            while ancestor is not None:
+                if ancestor == chunk_id:
+                    parent_node = None
+                    break
+                ancestor = linked_parent.get(ancestor)
+        if parent_node is None:
+            root_chunks.append(node)
+        else:
+            parent_node["children"].append(node)
+            if is_indexed:
+                linked_parent[chunk_id] = parent_id
+    return root_chunks
+def find_node_in_tree(nodes: List[Dict[str, Any]], target_id: str) -> Optional[Dict[str, Any]]:
+    """
+    Return the first node whose ``id`` equals ``target_id``, or ``None``.
+
+    The search is depth-first in pre-order: each node is checked before its
+    ``children``, and siblings are visited in list order. A node without a
+    ``children`` key is treated as a leaf.
+    """
+    # Explicit stack; items are pushed reversed so that pop() yields them
+    # in their original left-to-right order.
+    pending = list(reversed(nodes))
+    while pending:
+        candidate = pending.pop()
+        if candidate["id"] == target_id:
+            return candidate
+        pending.extend(reversed(candidate.get("children", [])))
+    return None
 @app.get("/api/document-chunks/status")
 @timing_decorator_async
 async def get_document_chunks_status():