# MedChat / test / test_data_loader.py
# huydt11502
# Add RAG integration: Flask API server, disease selector, evaluation system with improved case generation
# 74b76f3
import os
import sys
import json
from pathlib import Path
from langchain_core.documents import Document
# Project root. Defaults to the original developer path; set the
# RAG_PROJECT_DIR environment variable to run this script against a
# checkout that lives somewhere else.
BASE_DIR = Path(os.environ.get("RAG_PROJECT_DIR", r'D:\Storage\rag_project'))
# Put the project's src/ directory first on the module search path.
sys.path.insert(0, str(BASE_DIR / 'src'))
print(" Đường dẫn Python search:")
print(f" - BASE_DIR: {BASE_DIR}")
# Discover every JSON file directly under data/.
DATA_DIR = BASE_DIR / 'data'
all_docs = []
print("\n TẤT CẢ JSON TRONG DATA:")
# Path.glob yields matches in filesystem order, which differs between
# machines; sort so the load order (and therefore the first sample chunk
# reported later in this script) is reproducible.
json_files = sorted(DATA_DIR.glob("*.json"))
for json_file in json_files:
    print(f" {json_file.name}")
# Load every discovered JSON file and turn its chapter sections into Documents.
total_chunks = 0
for json_file in json_files:
    print(f"\n Đang load {json_file.name}...")
    chapters = json.loads(json_file.read_text(encoding='utf-8'))

    # Same chunking scheme as the original notebook: one Document per entry
    # in a chapter's "contents" list.
    file_chunks = []
    for chapter in chapters:
        chunk_id = chapter.get("id", "unknown")
        chapter_title = chapter.get("index", "unknown")
        sections = chapter.get("contents", [])
        # Section ids are "<chunkid>.<n>", with n counted from 1.
        file_chunks.extend(
            Document(
                page_content=section["content"],
                metadata={
                    "source_file": json_file.name,
                    "chunkid": chunk_id,
                    "sectionid": f"{chunk_id}.{position}",
                    "title": chapter_title,
                    "sectiontitle": section["title"],
                },
            )
            for position, section in enumerate(sections, start=1)
        )

    # Fold this file's documents into the global collection and report.
    all_docs.extend(file_chunks)
    total_chunks += len(file_chunks)
    print(f" {json_file.name}: {len(file_chunks)} chunks")
# Final report: overall totals, then a preview of the first loaded chunk.
print("\n TỔNG KẾT:")
file_count = len(json_files)
print(f" Tổng chunks từ {file_count} files: {total_chunks}")
print(" Chunk mẫu 1:")
if all_docs:
    sample = all_docs[0]
    sample_meta = sample.metadata
    print(f" File: {sample_meta['source_file']}")
    # Only the first 150 characters of the content are shown.
    print(f" Content: {sample.page_content[:150]}...")
    print(f" Title: {sample_meta['title']}")
print("\n READY CHO RAG - FAISS + LLM!")