File size: 2,411 Bytes
74b76f3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 | import json
from pathlib import Path
"""
BoYTe200_v3.json: 157 chunks
NHIKHOA2.json: 47 chunks
PHACDODIEUTRI_2016.json: 156 chunks
TOTAL: 360 logical chunks

chunk = {
    "chunk_id": chap["id"],
    "title": chap["Index"],
    "level1_items": chap["level1_items"],
    "contents": [  # array of sections
        {"title": "Section 1", "content": "..."},
        {"title": "Section 2", "content": "..."},
        ...
    ]
}
"""
# Project root; adjust this to the correct path on your machine.
BASE_DIR = Path(r"D:\Storage\rag_project")
# Data folder that main() scans for *.json files.
DATA_DIR = BASE_DIR.joinpath("data")
def load_chapters(json_path: Path):
    """Load and return the parsed JSON content of ``json_path``.

    The file is decoded as ``utf-8-sig`` so that a leading UTF-8 byte-order
    mark is skipped instead of making the JSON parser fail with
    "Unexpected UTF-8 BOM". Files without a BOM decode exactly as plain
    UTF-8, so existing inputs are unaffected.

    Args:
        json_path: Location of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for that file. The caller in
        this script iterates over it as a sequence of chapter records.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(json_path, encoding="utf-8-sig") as f:
        return json.load(f)
def _to_chunk(chap):
    """Project one chapter record onto the chunk dict used by this script."""
    return {
        "chunk_id": chap.get("id"),
        "title": chap.get("Index"),
        "level1_items": chap.get("level1_items", []),
        # `or []` also covers an explicit JSON null, which `.get(key, [])`
        # would pass through as None and then break len() below.
        "contents": chap.get("contents") or [],
    }


def main():
    """Print a chunk-count report for every ``*.json`` file in ``DATA_DIR``.

    For each file the chapters are loaded with ``load_chapters``, converted to
    chunk dicts, counted, and the first chunk is printed as a sample (id,
    title, level-1 items, number of sections, and a 150-character preview of
    the first section). A grand total over all files is printed at the end.

    Returns:
        None. Returns early, after printing a notice, when no JSON file is
        found.
    """
    print(" BASE_DIR:", BASE_DIR)
    print(" DATA_DIR:", DATA_DIR, "\n")

    # Sorted so the report order is reproducible; glob order is not guaranteed.
    json_files = sorted(DATA_DIR.glob("*.json"))
    if not json_files:
        print(" Không tìm thấy file .json nào trong data/")
        return

    total_chunks = 0
    for json_file in json_files:
        print("==============================")
        print(f" FILE: {json_file.name}")

        all_chunks = [_to_chunk(chap) for chap in load_chapters(json_file)]
        num_chunks = len(all_chunks)
        total_chunks += num_chunks
        print(f" Số chunk (theo id) trong file này: {num_chunks}")

        # Print the first chunk of the file as a sample.
        if all_chunks:
            sample = all_chunks[0]
            print("\n MẪU CHUNK ĐẦU TIÊN:")
            print(" chunk_id:", sample["chunk_id"])
            print(" title :", sample["title"])
            print(" level1_items:", sample["level1_items"])
            print(" Số sections trong contents:", len(sample["contents"]))
            if sample["contents"]:
                first_section = sample["contents"][0]
                print(" ➜ Section 1 title :", first_section.get("title"))
                # A missing or null content is previewed as an empty string.
                preview = (first_section.get("content") or "")[:150]
                print(" ➜ Section 1 content preview:", preview, "...")
        # NOTE(review): assumed to be a per-file blank separator line; the
        # reviewed copy had its indentation stripped -- confirm placement.
        print()

    print("====================================")
    print(" TỔNG SỐ CHUNK (theo id) TỪ TẤT CẢ FILE:", total_chunks)
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|