Spaces:

ariidorosh
/

rag-qa-novel

Sleeping

rag-qa-novel / chunking.py

Upload 7 files

f8e98a1 verified 4 months ago

1.69 kB

	# chunking.py
	import os
	import json

	# Root directory that main() walks looking for raw ".txt" input files.
	RAW_DIR = "data/raw"
	# Directory that receives one JSON file per generated chunk.
	CHUNK_DIR = "chunks"
	# Default number of whitespace-separated words placed in each chunk.
	CHUNK_SIZE = 500

	# Create the output directory as soon as the module is loaded; this is a
	# no-op when the directory already exists.
	os.makedirs(CHUNK_DIR, exist_ok=True)


	def split_into_chunks(text, chunk_size=CHUNK_SIZE):
	    """Lazily yield consecutive word groups taken from *text*.

	    The text is tokenised on arbitrary whitespace, so the original spacing
	    and line breaks are not preserved: every yielded chunk is its words
	    re-joined with single spaces.

	    Args:
	        text: String to split.
	        chunk_size: Maximum number of words per chunk; the final chunk may
	            be shorter.

	    Yields:
	        One string per group of up to ``chunk_size`` words, in order.
	    """
	    tokens = text.split()
	    starts = range(0, len(tokens), chunk_size)
	    yield from (" ".join(tokens[begin:begin + chunk_size]) for begin in starts)


	def process_file(path, rel_path):
	    """Split one raw text file into word chunks and write each as JSON.

	    Every chunk is written to ``<CHUNK_DIR>/<chunk_id>.json`` with the keys
	    ``id``, ``text``, ``source``, ``volume``, ``chapter`` and
	    ``chunk_index``. A one-line summary is printed once the file is done.

	    Args:
	        path: Filesystem path of the text file to read (decoded as UTF-8).
	        rel_path: Path of the same file relative to the raw-data root, in
	            the form ``<volume>/<chapter>.txt``, e.g. "Том 1/Розділ 1.txt"
	            (i.e. "Volume 1/Chapter 1.txt"). It is split on ``os.sep``.

	    Raises:
	        IndexError: If ``rel_path`` has no directory component, because the
	            chapter name is taken from its second path component.
	    """
	    with open(path, "r", encoding="utf-8") as f:
	        text = f.read()

	    # First component is the volume directory, second is the chapter file
	    # name, whose extension is dropped.
	    parts = rel_path.split(os.sep)
	    volume = parts[0]
	    chapter = os.path.splitext(parts[1])[0]

	    # Materialised so the chunk count can be reported after writing.
	    chunks = list(split_into_chunks(text))

	    # Loop-invariant part of the ID. Only the chapter name has its spaces
	    # removed; the volume name is used verbatim in IDs and file names.
	    chapter_tag = chapter.replace(" ", "")

	    for idx, ch in enumerate(chunks):
	        # Unique ID built from volume, chapter and chunk position.
	        chunk_id = f"{volume}_chapter{chapter_tag}_chunk{idx}"

	        out = {
	            "id": chunk_id,
	            "text": ch,
	            "source": rel_path,
	            "volume": volume,
	            "chapter": chapter,
	            "chunk_index": idx,
	        }

	        # Use a context manager so each output file is flushed and closed
	        # right away instead of leaking one open handle per chunk.
	        out_path = os.path.join(CHUNK_DIR, f"{chunk_id}.json")
	        with open(out_path, "w", encoding="utf-8") as out_file:
	            # ensure_ascii=False keeps non-ASCII text readable in the JSON.
	            json.dump(out, out_file, ensure_ascii=False)

	    print(f"Processed {rel_path}: {len(chunks)} chunks")


	def main():
	    """Walk RAW_DIR and chunk every ".txt" file found beneath it.

	    Each matching file is handed to process_file() together with its path
	    relative to RAW_DIR. Files located directly in RAW_DIR (with no volume
	    directory above them) are reported and skipped.
	    """
	    for root, _dirs, files in os.walk(RAW_DIR):
	        for file in files:
	            if not file.endswith(".txt"):
	                continue

	            full_path = os.path.join(root, file)
	            rel_path = os.path.relpath(full_path, RAW_DIR)

	            # process_file() reads the volume and chapter from the first two
	            # path components. A file sitting directly in RAW_DIR has only
	            # one, which would raise IndexError and abort the whole run.
	            if os.sep not in rel_path:
	                print(f"Skipping {rel_path}: expected <volume>/<chapter>.txt")
	                continue

	            process_file(full_path, rel_path)


	# Run the pipeline only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()