huydt11502
Add RAG integration: Flask API server, disease selector, evaluation system with improved case generation
74b76f3 | import json | |
| from pathlib import Path | |
| """ | |
Chunk counts per source file (one chunk per chapter id):

    BoYTe200_v3.json:        157 chunks
    NHIKHOA2.json:            47 chunks
    PHACDODIEUTRI_2016.json: 156 chunks
    TOTAL:                   360 logical chunks

Each chunk is built from one chapter record:

    chunk = {
        "chunk_id": chap["id"],
        "title": chap["Index"],
        "level1_items": chap["level1_items"],
        "contents": [  # array of sections
            {"title": "Section 1", "content": "..."},
            {"title": "Section 2", "content": "..."},
            ...
        ]
    }
| """ | |
# Root folder of the RAG project; adjust this to the correct path on your machine.
BASE_DIR = Path(r"D:\Storage\rag_project")
# Folder scanned for the chapter JSON files.
DATA_DIR = BASE_DIR.joinpath("data")
def load_chapters(json_path: Path):
    """Parse a UTF-8 encoded JSON file and return its content.

    Args:
        json_path: Location of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for that file.
    """
    with open(json_path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)
def _build_chunks(chapters):
    """Convert raw chapter records into the chunk dictionaries reported by ``main``."""
    return [
        {
            "chunk_id": chap.get("id"),
            "title": chap.get("Index"),
            "level1_items": chap.get("level1_items", []),
            # `or []` also covers an explicit JSON null, which a .get() default
            # would pass through as None and break len() further down.
            "contents": chap.get("contents") or [],
        }
        for chap in chapters
    ]


def _print_sample(sample):
    """Print one chunk's identifying fields and a preview of its first section."""
    print("\n MẪU CHUNK ĐẦU TIÊN:")
    print(" chunk_id:", sample["chunk_id"])
    print(" title :", sample["title"])
    print(" level1_items:", sample["level1_items"])
    print(" Số sections trong contents:", len(sample["contents"]))
    if sample["contents"]:
        first_section = sample["contents"][0]
        print(" ➜ Section 1 title :", first_section.get("title"))
        # A missing or null "content" is previewed as an empty string.
        print(" ➜ Section 1 content preview:",
              (first_section.get("content") or "")[:150], "...")


def main():
    """Print a chunk-count report for every ``*.json`` file in ``DATA_DIR``.

    For each file the chapters are loaded with ``load_chapters``, turned into
    chunk dictionaries (one per chapter record), counted, and the first chunk
    is printed as a sample. A grand total over all files is printed last.

    Returns:
        None. If ``DATA_DIR`` contains no ``*.json`` file, a message is
        printed and the function returns early.
    """
    print(" BASE_DIR:", BASE_DIR)
    print(" DATA_DIR:", DATA_DIR, "\n")

    # The order in which glob yields entries is not guaranteed; sort so the
    # report is reproducible from run to run.
    json_files = sorted(DATA_DIR.glob("*.json"))
    if not json_files:
        print(" Không tìm thấy file .json nào trong data/")
        return

    total_chunks = 0
    for json_file in json_files:
        print("==============================")
        print(f" FILE: {json_file.name}")

        all_chunks = _build_chunks(load_chapters(json_file))
        num_chunks = len(all_chunks)
        total_chunks += num_chunks
        print(f" Số chunk (theo id) trong file này: {num_chunks}")

        # Show the first chunk of the file as a sample.
        if all_chunks:
            _print_sample(all_chunks[0])
        print()

    print("====================================")
    print(" TỔNG SỐ CHUNK (theo id) TỪ TẤT CẢ FILE:", total_chunks)
# Run the report only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()