# ai/chunker.py — text extraction (PDF / plain text) and sentence-based chunking utilities.
import re
import fitz
from PyPDF2 import PdfReader
from haystack import Document
from haystack.components.preprocessors import DocumentSplitter
import nltk
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
def extract_text_from_file(file_path: str) -> str:
print(f"DEBUG: file_path = {file_path}")
print(f"DEBUG: file_path ends with = {file_path.split('.')[-1]}")
if file_path.lower().endswith(".pdf"):
reader = PdfReader(file_path)
print(f"DEBUG: number of pages = {len(reader.pages)}")
full_text = ""
for i, page in enumerate(reader.pages):
text = page.extract_text()
print(f"DEBUG: page {i} text preview = {repr(text[:100]) if text else 'None'}")
if text:
full_text += text + " "
full_text = re.sub(r"\s+", " ", full_text).strip()
print(f"DEBUG: total extracted length = {len(full_text)}")
print(f"DEBUG: first 200 chars = {repr(full_text[:200])}")
return full_text
else:
try:
with open(file_path, "r", encoding="utf-8") as f:
return f.read()
except UnicodeDecodeError:
with open(file_path, "r", encoding="latin-1") as f:
return f.read()
def chunk_text(text: str, source: str = "upload", split_length: int = 6, split_overlap: int = 2) -> list[str]:
    """Split *text* into overlapping sentence-based chunks.

    Wraps the text in a Haystack ``Document`` tagged with ``source`` metadata
    and runs it through a sentence ``DocumentSplitter``.

    Args:
        text: The text to chunk; whitespace-only input yields no chunks.
        source: Value stored under the ``"source"`` metadata key.
        split_length: Number of sentences per chunk.
        split_overlap: Number of sentences shared between adjacent chunks.

    Returns:
        The chunk contents, in document order; empty list for blank input.
    """
    if not text.strip():
        return []
    splitter = DocumentSplitter(
        split_by="sentence",
        split_length=split_length,
        split_overlap=split_overlap,
    )
    # warm_up() loads the sentence tokenizer before the first run.
    splitter.warm_up()
    input_docs = [Document(content=text, meta={"source": source})]
    output = splitter.run(documents=input_docs)
    return [doc.content for doc in output["documents"]]