huydt11502
Add RAG integration: Flask API server, disease selector, evaluation system with improved case generation
74b76f3 | import json | |
| from pathlib import Path | |
| from langchain_core.documents import Document | |
# Root folder of the RAG project on the author's machine (absolute Windows path).
BASE_DIR = Path(r"D:\Storage\rag_project")
# Folder that holds the chapter JSON files read by test_single_file().
DATA_DIR = BASE_DIR / "data"
def test_single_file(filename):
    """Load one chapter JSON file from DATA_DIR and turn it into Documents.

    The file is parsed as JSON and iterated as a sequence of chapter dicts.
    Each chapter becomes one chunk (keys ``id``, ``Index``, ``level1_items``,
    ``contents``), and every entry of a chunk's ``contents`` becomes one
    ``Document`` whose text is the entry's ``content`` field.  Progress
    counts and a preview of the first document are printed along the way.

    Args:
        filename: Name of the JSON file, resolved relative to ``DATA_DIR``.

    Returns:
        The list of ``Document`` objects built from the file.  The list is
        empty when no chapter has any section; in that case the preview of
        the first document is skipped instead of raising ``IndexError``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    json_path = DATA_DIR / filename
    print(f"\n{'='*60}")
    print(f" TEST FILE: {filename}")

    with open(json_path, 'r', encoding='utf-8') as f:
        chapters = json.load(f)
    print(f" Chapters: {len(chapters)}")

    # Build the chunks exactly as in the Colab version (per the original
    # note): one chunk per chapter, identified by the chapter's "id".
    all_chunks = [
        {
            "chunk_id": chap.get("id"),
            "title": chap.get("Index"),
            "level1_items": chap.get("level1_items", []),
            # `or []` also covers an explicit JSON null, which .get()'s
            # default would let through and which would break enumerate().
            "contents": chap.get("contents") or [],
        }
        for chap in chapters
    ]
    print(f" Chunks: {len(all_chunks)}")

    # Create one Document per section of every chunk.
    docs = []
    for chunk in all_chunks:
        for position, section in enumerate(chunk["contents"], start=1):
            docs.append(
                Document(
                    page_content=section.get("content", ""),
                    metadata={
                        "source_file": filename,
                        "chunk_id": chunk["chunk_id"],
                        "chunk_title": chunk["title"],
                        # Section ids are 1-based: "<chunk_id>.<position>".
                        "section_id": f"{chunk['chunk_id']}.{position}",
                        "section_title": section.get("title", ""),
                    },
                )
            )
    print(f" Documents: {len(docs)}")

    # Preview the first document only when there is one.
    if docs:
        first = docs[0]
        print(" Mẫu doc 0:")
        print(f" Title: {first.metadata['chunk_title']}")
        # Single quotes inside the f-string: reusing the outer quote type is
        # a SyntaxError on Python versions before 3.12.
        print(f" Section: {first.metadata['section_id']}")
        print(f" Content: {first.page_content[:100]}...")
    return docs
if __name__ == "__main__":
    # Test each data file in turn.
    for data_file in (
        "NHIKHOA2.json",
        "BoYTe200_v3.json",
        "PHACDODIEUTRI_2016.json",
    ):
        test_single_file(data_file)