File size: 2,086 Bytes
74b76f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import os
import sys
import json
from pathlib import Path
from langchain_core.documents import Document

# Project root. Defaults to the original hard-coded location; set the
# RAG_PROJECT_DIR environment variable to run the script on another machine.
BASE_DIR = Path(os.environ.get('RAG_PROJECT_DIR', r'D:\Storage\rag_project'))
# Put <BASE_DIR>/src first on the module search path.
sys.path.insert(0, str(BASE_DIR / 'src'))

print(" Đường dẫn Python search:")
print(f"  - BASE_DIR: {BASE_DIR}")

# Every *.json file directly inside data/ is loaded into all_docs.
DATA_DIR = BASE_DIR / 'data'
all_docs = []


def section_metadata(chapter, section, position, source_name):
    """Build the metadata dict stored on the Document of one section.

    Args:
        chapter: Chapter dict from the JSON file; its "id" and "index" keys
            are read, each falling back to "unknown" when absent.
        section: Section dict taken from the chapter's "contents" list; its
            "title" key is read, falling back to "unknown" when absent.
        position: 1-based position of the section inside the chapter, used as
            the suffix of "sectionid".
        source_name: File name of the JSON file the chapter was read from.

    Returns:
        A dict with the keys "source_file", "chunkid", "sectionid", "title"
        and "sectiontitle".
    """
    chunk_id = chapter.get("id", "unknown")
    return {
        "source_file": source_name,
        "chunkid": chunk_id,
        "sectionid": f"{chunk_id}.{position}",
        "title": chapter.get("index", "unknown"),
        "sectiontitle": section.get("title", "unknown"),
    }


print("\n TẤT CẢ JSON TRONG DATA:")
# Sorted so the order of all_docs is reproducible between runs; glob() itself
# yields entries in whatever order the filesystem reports them.
json_files = sorted(DATA_DIR.glob("*.json"))
for json_file in json_files:
    print(f"   {json_file.name}")

# Turn every section of every chapter of every file into one Document.
total_chunks = 0
for json_file in json_files:
    print(f"\n Đang load {json_file.name}...")

    with open(json_file, 'r', encoding='utf-8') as f:
        chapters = json.load(f)

    file_chunks = []
    skipped = 0
    for chap in chapters:
        # `or []` also covers an explicit JSON null stored under "contents".
        sections = chap.get("contents") or []
        for position, section in enumerate(sections, start=1):
            content = section.get("content")
            if content is None:
                # No text for this section: count it and move on instead of
                # aborting the whole load. `position` still advances, so the
                # sectionid of the following sections is unaffected.
                skipped += 1
                continue
            file_chunks.append(
                Document(
                    page_content=content,
                    metadata=section_metadata(
                        chap, section, position, json_file.name
                    ),
                )
            )

    all_docs.extend(file_chunks)
    total_chunks += len(file_chunks)
    print(f"   {json_file.name}: {len(file_chunks)} chunks")
    if skipped:
        print(f"   Bỏ qua {skipped} section thiếu 'content'")

print("\n TỔNG KẾT:")
print(f" Tổng chunks từ {len(json_files)} files: {total_chunks}")
if all_docs:
    # Show the first Document as a sample; the header is printed only when
    # there is a sample to show.
    print(" Chunk mẫu 1:")
    doc = all_docs[0]
    print(f"  File: {doc.metadata['source_file']}")
    print(f"  Content: {doc.page_content[:150]}...")
    print(f"  Title: {doc.metadata['title']}")

print("\n READY CHO RAG - FAISS + LLM!")