rag-qa-novel / chunking.py
ariidorosh's picture
Upload 7 files
f8e98a1 verified
# chunking.py
import os
import json
# Directory that is walked for the raw ``.txt`` source files.
RAW_DIR = "data/raw"
# Directory that receives one JSON file per generated chunk.
CHUNK_DIR = "chunks"
# Maximum number of whitespace-separated words per chunk.
CHUNK_SIZE = 500

# Create the output directory at import time; an existing one is not an error.
os.makedirs(name=CHUNK_DIR, exist_ok=True)
def split_into_chunks(text, chunk_size=CHUNK_SIZE):
    """Yield consecutive pieces of *text*, each at most *chunk_size* words long.

    Words are obtained with ``str.split()`` (any run of whitespace is a
    separator) and every piece is re-joined with single spaces, so the
    original whitespace is not preserved. The final piece may be shorter
    than *chunk_size*.
    """
    tokens = text.split()
    total = len(tokens)
    for start in range(0, total, chunk_size):
        window = tokens[start: start + chunk_size]
        yield " ".join(window)
def process_file(path, rel_path):
    """Split one text file into word chunks and write each chunk as JSON.

    Args:
        path: Path of the text file to read (decoded as UTF-8).
        rel_path: Path of the same file relative to the raw-data root,
            e.g. "Том 1/Розділ 1.txt". Its first directory component is
            used as the volume and the file name without extension as the
            chapter.

    Every chunk is written to ``CHUNK_DIR/<chunk_id>.json`` as an object
    with the keys ``id``, ``text``, ``source``, ``volume``, ``chapter`` and
    ``chunk_index``. A one-line summary is printed when the file is done.
    """
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()

    # "Том 1/Розділ 1.txt" -> ["Том 1", "Розділ 1.txt"]
    parts = rel_path.split(os.sep)
    # A file lying directly in the root has no volume directory; indexing
    # parts[1] unconditionally used to raise IndexError for such files.
    volume = parts[0] if len(parts) > 1 else ""
    # Take the chapter from the file name itself (the last component) so
    # that deeper nesting does not turn a directory name into the chapter
    # and make sibling files overwrite each other's chunks.
    chapter = os.path.splitext(parts[-1])[0]  # "Розділ 1"
    chapter_tag = chapter.replace(" ", "")

    chunks = list(split_into_chunks(text))
    for idx, ch in enumerate(chunks):
        # Identifier used both as the record id and as the file name.
        # Only the spaces of the chapter are removed; the volume is kept
        # verbatim so existing ids stay unchanged.
        chunk_id = f"{volume}_chapter{chapter_tag}_chunk{idx}"
        out = {
            "id": chunk_id,
            "text": ch,
            "source": rel_path,
            "volume": volume,
            "chapter": chapter,
            "chunk_index": idx,
        }
        out_path = os.path.join(CHUNK_DIR, f"{chunk_id}.json")
        # Use a context manager so the handle is flushed and closed
        # deterministically instead of leaking one open file per chunk.
        with open(out_path, "w", encoding="utf-8") as out_file:
            json.dump(out, out_file, ensure_ascii=False)
    print(f"Processed {rel_path}: {len(chunks)} chunks")
def main():
    """Walk RAW_DIR recursively and chunk every file whose name ends in ".txt".

    Each matching file is handed to ``process_file`` together with its path
    relative to RAW_DIR.
    """
    for folder, _subdirs, names in os.walk(RAW_DIR):
        for name in names:
            if not name.endswith(".txt"):
                continue
            source_path = os.path.join(folder, name)
            relative = os.path.relpath(source_path, RAW_DIR)
            process_file(source_path, relative)


if __name__ == "__main__":
    main()