File size: 1,906 Bytes
74b76f3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 | import json
from pathlib import Path
from langchain_core.documents import Document
# Root folder of the RAG project, given as a Windows-style absolute path.
BASE_DIR = Path(r"D:\Storage\rag_project")
# Sub-folder from which test_single_file loads its JSON input by file name.
DATA_DIR = BASE_DIR.joinpath("data")
def test_single_file(filename: str) -> "list[Document]":
    """Load one chapter JSON file, turn it into Documents and print a summary.

    The file is read from ``DATA_DIR / filename`` and is iterated as a
    sequence of chapter mappings.  Each chapter becomes one chunk, and every
    entry of a chunk's ``contents`` becomes one ``Document`` whose
    ``section_id`` is ``"<chunk_id>.<1-based position in contents>"``.

    Args:
        filename: Name of the JSON file inside ``DATA_DIR``.

    Returns:
        The Documents built from the file, in chapter order and then section
        order.  The list is empty when no chapter has any section.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    json_path = DATA_DIR / filename

    print(f"\n{'='*60}")
    print(f" TEST FILE: {filename}")

    with open(json_path, "r", encoding="utf-8") as f:
        chapters = json.load(f)
    print(f" Chapters: {len(chapters)}")

    # Build the chunks exactly as in Colab: one chunk per sample, keyed by
    # the sample's id.
    all_chunks = [
        {
            "chunk_id": chap.get("id"),
            "title": chap.get("Index"),
            "level1_items": chap.get("level1_items", []),
            # `or []` also covers an explicit JSON null, which the .get()
            # default would pass through as None and break the loop below.
            "contents": chap.get("contents") or [],
        }
        for chap in chapters
    ]
    print(f" Chunks: {len(all_chunks)}")

    # Build one Document per section of every chunk.
    docs = []
    for chunk in all_chunks:
        for i, section in enumerate(chunk["contents"]):
            doc = Document(
                page_content=section.get("content", ""),
                metadata={
                    "source_file": filename,
                    "chunk_id": chunk["chunk_id"],
                    "chunk_title": chunk["title"],
                    "section_id": f"{chunk['chunk_id']}.{i+1}",
                    "section_title": section.get("title", ""),
                },
            )
            docs.append(doc)
    print(f" Documents: {len(docs)}")

    # Show a sample only when one exists; indexing docs[0] on an empty list
    # would raise IndexError for a file whose chapters have no sections.
    if docs:
        sample = docs[0]
        print(" Mẫu doc 0:")
        print(f" Title: {sample.metadata['chunk_title']}")
        # Single quotes inside the f-string: reusing the outer double quotes
        # is a SyntaxError on Python versions before 3.12.
        print(f" Section: {sample.metadata['section_id']}")
        print(f" Content: {sample.page_content[:100]}...")
    return docs
if __name__ == "__main__":
    # Run the check on each data file in turn.
    for json_name in (
        "NHIKHOA2.json",
        "BoYTe200_v3.json",
        "PHACDODIEUTRI_2016.json",
    ):
        test_single_file(json_name)
|