File size: 2,086 Bytes
74b76f3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 | import os
import sys
import json
from pathlib import Path
from langchain_core.documents import Document
def section_records(chapters, source_name):
    """Flatten parsed chapter dicts into ``(page_content, metadata)`` pairs.

    Each chapter contributes one pair per entry of its ``"contents"`` list.
    The metadata carries the source file name, the chapter id, a
    ``"<chapter id>.<1-based position>"`` section id, the chapter title
    (read from the chapter's ``"index"`` field) and the section title.

    Args:
        chapters: Iterable of chapter dicts as decoded from one JSON file.
        source_name: File name recorded under ``"source_file"``.

    Returns:
        A list of ``(content, metadata_dict)`` tuples, in input order.
        Sections without a ``"content"`` value are reported and skipped;
        their position is still counted so the remaining section ids do
        not shift.
    """
    records = []
    for chapter in chapters:
        chunk_id = chapter.get("id", "unknown")
        chapter_title = chapter.get("index", "unknown")
        # `or []` also covers a chapter whose "contents" is an explicit null.
        sections = chapter.get("contents") or []
        for position, section in enumerate(sections, start=1):
            section_id = f"{chunk_id}.{position}"
            content = section.get("content")
            if content is None:
                # One malformed section should not abort the whole load.
                print(f" Skipping section {section_id} in {source_name}: no 'content'")
                continue
            records.append((
                content,
                {
                    "source_file": source_name,
                    "chunkid": chunk_id,
                    "sectionid": section_id,
                    "title": chapter_title,
                    # Same fallback the chapter-level fields use.
                    "sectiontitle": section.get("title", "unknown"),
                },
            ))
    return records


# Project root; its `src` directory is put first on the import search path.
BASE_DIR = Path(r'D:\Storage\rag_project')
sys.path.insert(0, str(BASE_DIR / 'src'))
print(" Đường dẫn Python search:")
print(f" - BASE_DIR: {BASE_DIR}")

# Discover every JSON file directly inside data/.
DATA_DIR = BASE_DIR / 'data'
all_docs = []
print("\n TẤT CẢ JSON TRONG DATA:")
# Sorted so the load order (and therefore all_docs) is reproducible;
# glob itself gives no ordering guarantee.
json_files = sorted(DATA_DIR.glob("*.json"))
for json_file in json_files:
    print(f" {json_file.name}")

# Load every JSON file and turn each section into a Document.
total_chunks = 0
for json_file in json_files:
    print(f"\n Đang load {json_file.name}...")
    with open(json_file, 'r', encoding='utf-8') as f:
        chapters = json.load(f)
    file_chunks = [
        Document(page_content=content, metadata=metadata)
        for content, metadata in section_records(chapters, json_file.name)
    ]
    all_docs.extend(file_chunks)
    total_chunks += len(file_chunks)
    print(f" {json_file.name}: {len(file_chunks)} chunks")

# Summary, plus a preview of the first loaded chunk when there is one.
print("\n TỔNG KẾT:")
print(f" Tổng chunks từ {len(json_files)} files: {total_chunks}")
print(" Chunk mẫu 1:")
if all_docs:
    doc = all_docs[0]
    print(f" File: {doc.metadata['source_file']}")
    print(f" Content: {doc.page_content[:150]}...")
    print(f" Title: {doc.metadata['title']}")
print("\n READY CHO RAG - FAISS + LLM!")
|