# ai/chunker.py — text extraction (PDF / plain text) and sentence-based chunking utilities.
import re
import fitz
from PyPDF2 import PdfReader
from haystack import Document
from haystack.components.preprocessors import DocumentSplitter
import nltk
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
def extract_text_from_file(file_path: str) -> str:
print(f"DEBUG: file_path = {file_path}")
print(f"DEBUG: file_path ends with = {file_path.split('.')[-1]}")
if file_path.lower().endswith(".pdf"):
reader = PdfReader(file_path)
print(f"DEBUG: number of pages = {len(reader.pages)}")
full_text = ""
for i, page in enumerate(reader.pages):
text = page.extract_text()
print(f"DEBUG: page {i} text preview = {repr(text[:100]) if text else 'None'}")
if text:
full_text += text + " "
full_text = re.sub(r"\s+", " ", full_text).strip()
print(f"DEBUG: total extracted length = {len(full_text)}")
print(f"DEBUG: first 200 chars = {repr(full_text[:200])}")
return full_text
else:
try:
with open(file_path, "r", encoding="utf-8") as f:
return f.read()
except UnicodeDecodeError:
with open(file_path, "r", encoding="latin-1") as f:
return f.read()
def chunk_text(text: str, source: str = "upload", split_length: int = 6, split_overlap: int = 2) -> list[str]:
    """Split *text* into overlapping sentence-based chunks.

    Wraps the text in a Haystack ``Document`` tagged with ``source`` metadata
    and runs it through a sentence ``DocumentSplitter``.

    Args:
        text: The text to chunk; whitespace-only input yields no chunks.
        source: Value stored under the ``"source"`` metadata key.
        split_length: Number of sentences per chunk.
        split_overlap: Number of sentences shared between adjacent chunks.

    Returns:
        The chunk contents, in document order; empty list for blank input.
    """
    if not text.strip():
        return []
    splitter = DocumentSplitter(
        split_by="sentence",
        split_length=split_length,
        split_overlap=split_overlap,
    )
    # warm_up() loads the sentence tokenizer before the first run.
    splitter.warm_up()
    input_docs = [Document(content=text, meta={"source": source})]
    output = splitter.run(documents=input_docs)
    return [doc.content for doc in output["documents"]]