Spaces:
Sleeping
Sleeping
| # chunking.py | |
| import os | |
| import json | |
# Default number of whitespace-separated words per chunk.
CHUNK_SIZE = 500

# Root of the input tree that is walked for ".txt" files.
RAW_DIR = "data/raw"

# Directory that receives one JSON file per chunk.
CHUNK_DIR = "chunks"

# Runs at import time so the output directory exists before any chunk is
# written; an already-existing directory is not an error.
os.makedirs(CHUNK_DIR, exist_ok=True)
def split_into_chunks(text, chunk_size=CHUNK_SIZE):
    """Lazily yield consecutive word groups of *text* as strings.

    The text is split on whitespace; each yielded string holds up to
    ``chunk_size`` words re-joined with single spaces. The final chunk may
    be shorter. Nothing is yielded for text that contains no words.

    Args:
        text: String to split.
        chunk_size: Maximum number of words per chunk.

    Yields:
        One space-joined chunk at a time, in original word order.
    """
    tokens = text.split()
    starts = range(0, len(tokens), chunk_size)
    yield from (" ".join(tokens[start:start + chunk_size]) for start in starts)
def process_file(path, rel_path):
    """Split one raw text file into word chunks and write each as a JSON file.

    Each chunk is written to ``CHUNK_DIR/<chunk id>.json`` together with its
    source path, volume, chapter and index, and a one-line summary is printed
    once the file is done.

    Args:
        path: Path used to open the text file (read as UTF-8).
        rel_path: Path of the same file relative to the raw-data directory,
            e.g. "Том 1/Розділ 1.txt". Its first component is recorded as
            the volume and its second component, minus the extension, as
            the chapter.

    Raises:
        IndexError: If ``rel_path`` has fewer than two path components.
    """
    with open(path, "r", encoding="utf-8") as src:
        text = src.read()

    # Expected layout: "<volume dir>/<chapter file>.txt"
    parts = rel_path.split(os.sep)
    volume = parts[0]  # e.g. "Том 1"
    chapter = os.path.splitext(parts[1])[0]  # e.g. "Розділ 1"

    # Materialized because the chunk count is reported at the end.
    chunks = list(split_into_chunks(text))

    for idx, ch in enumerate(chunks):
        # Chunk ID doubles as the output file name. Spaces are stripped from
        # the chapter part only; the volume part is used verbatim.
        chunk_id = f"{volume}_chapter{chapter.replace(' ', '')}_chunk{idx}"

        out = {
            "id": chunk_id,
            "text": ch,
            "source": rel_path,
            "volume": volume,
            "chapter": chapter,
            "chunk_index": idx,
        }

        # Open the output inside a context manager so the handle is flushed
        # and closed deterministically instead of leaking one per chunk.
        with open(f"{CHUNK_DIR}/{chunk_id}.json", "w", encoding="utf-8") as dst:
            json.dump(out, dst, ensure_ascii=False)

    print(f"Processed {rel_path}: {len(chunks)} chunks")
def main():
    """Chunk every ".txt" file found anywhere under RAW_DIR.

    Walks the directory tree rooted at RAW_DIR and calls ``process_file``
    for each file whose name ends with ".txt", passing both its full path
    and its path relative to RAW_DIR.
    """
    for dirpath, _dirnames, filenames in os.walk(RAW_DIR):
        for name in filenames:
            # Only plain-text sources are chunked; skip everything else.
            if not name.endswith(".txt"):
                continue
            source_path = os.path.join(dirpath, name)
            process_file(source_path, os.path.relpath(source_path, RAW_DIR))
# Run the chunking pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()