File size: 1,906 Bytes
74b76f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import json
from pathlib import Path
from langchain_core.documents import Document

# Base directory, hard-coded as an absolute Windows path; the script only works
# as-is on a machine where this directory exists.
BASE_DIR = Path(r"D:\Storage\rag_project")
# "data" subdirectory of BASE_DIR; test_single_file opens its JSON inputs from here.
DATA_DIR = BASE_DIR / "data"

def test_single_file(filename: str):
    """Load one JSON file from DATA_DIR and turn its sections into Documents.

    The file is parsed with ``json.load`` and iterated as a sequence of
    chapter dicts. For each chapter the keys ``id``, ``Index``,
    ``level1_items`` and ``contents`` are read; every entry of ``contents``
    becomes one ``Document`` whose ``page_content`` is the entry's
    ``content`` and whose metadata records the source file, the chapter id
    and title, a 1-based ``section_id`` of the form ``<chunk_id>.<n>``, and
    the entry's ``title``.

    A short summary (chapter, chunk and document counts, plus a preview of
    the first document when there is one) is printed to stdout.

    Args:
        filename: Name of the JSON file, resolved relative to ``DATA_DIR``.

    Returns:
        The list of ``Document`` objects built from the file; empty when no
        chapter has any ``contents`` entries.

    Raises:
        FileNotFoundError: If the file does not exist under ``DATA_DIR``.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    json_path = DATA_DIR / filename
    print(f"\n{'='*60}")
    print(f" TEST FILE: {filename}")

    with open(json_path, "r", encoding="utf-8") as f:
        chapters = json.load(f)

    print(f" Chapters: {len(chapters)}")

    # Build the chunks the same way as the Colab version (per the original
    # note): one chunk per chapter, keyed by that chapter's id.
    all_chunks = [
        {
            "chunk_id": chap.get("id"),
            "title": chap.get("Index"),
            "level1_items": chap.get("level1_items", []),
            # `or []` also covers an explicit JSON null, which the .get()
            # default alone would pass through as None and break iteration.
            "contents": chap.get("contents") or [],
        }
        for chap in chapters
    ]

    print(f" Chunks: {len(all_chunks)}")

    # Create one Document per section of every chunk.
    docs = []
    for chunk in all_chunks:
        for position, section in enumerate(chunk["contents"], start=1):
            docs.append(
                Document(
                    page_content=section.get("content", ""),
                    metadata={
                        "source_file": filename,
                        "chunk_id": chunk["chunk_id"],
                        "chunk_title": chunk["title"],
                        "section_id": f"{chunk['chunk_id']}.{position}",
                        "section_title": section.get("title", ""),
                    },
                )
            )

    print(f" Documents: {len(docs)}")

    # Preview the first document only when one exists; indexing docs[0] on an
    # empty list would raise IndexError.
    if docs:
        sample = docs[0]
        print(" Mẫu doc 0:")
        print(f"  Title: {sample.metadata['chunk_title']}")
        # Single quotes inside the f-string: reusing the outer quote type is
        # a SyntaxError on Python versions before 3.12.
        print(f"  Section: {sample.metadata['section_id']}")
        print(f"  Content: {sample.page_content[:100]}...")
    return docs

if __name__ == "__main__":
    # Run the check on each data file in turn, in a fixed order.
    for data_file in (
        "NHIKHOA2.json",
        "BoYTe200_v3.json",
        "PHACDODIEUTRI_2016.json",
    ):
        test_single_file(data_file)