Spaces:
Running
Running
| import re | |
| import fitz | |
| from PyPDF2 import PdfReader | |
| from haystack import Document | |
| from haystack.components.preprocessors import DocumentSplitter | |
| import nltk | |
# Fetch the NLTK "punkt" tokenizer resources once, at import time, without
# console output. NOTE(review): presumably these back the sentence splitting
# done in chunk_text() -- confirm against the installed haystack version.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource, quiet=True)
def extract_text_from_file(file_path: str) -> str:
    """Return the textual content of a PDF or plain-text file.

    Paths ending in ``.pdf`` (case-insensitive) are opened with ``PdfReader``;
    the text of every page that yields any is joined, runs of whitespace are
    collapsed to single spaces, and the result is stripped.

    Any other path is read as text: UTF-8 is tried first and, if decoding
    fails, the file is re-read as Latin-1. The content is returned unmodified
    in that case (no whitespace normalisation).

    Diagnostic ``DEBUG:`` lines are printed to stdout along the way.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text (possibly empty for a PDF with no extractable text).
    """
    print(f"DEBUG: file_path = {file_path}")
    print(f"DEBUG: file_path ends with = {file_path.split('.')[-1]}")

    # Plain-text path first: anything that is not a PDF is simply decoded.
    if not file_path.lower().endswith(".pdf"):
        try:
            with open(file_path, "r", encoding="utf-8") as handle:
                return handle.read()
        except UnicodeDecodeError:
            # Latin-1 maps every byte to a character, so this read cannot
            # fail with a decode error.
            with open(file_path, "r", encoding="latin-1") as handle:
                return handle.read()

    reader = PdfReader(file_path)
    print(f"DEBUG: number of pages = {len(reader.pages)}")

    page_texts = []
    for i, page in enumerate(reader.pages):
        text = page.extract_text()
        print(f"DEBUG: page {i} text preview = {repr(text[:100]) if text else 'None'}")
        if text:
            page_texts.append(text)

    # Pages are separated by a space; the regex then squeezes every run of
    # whitespace (including newlines) down to a single space.
    full_text = re.sub(r"\s+", " ", " ".join(page_texts)).strip()
    print(f"DEBUG: total extracted length = {len(full_text)}")
    print(f"DEBUG: first 200 chars = {repr(full_text[:200])}")
    return full_text
def chunk_text(
    text: str,
    source: str = "upload",
    split_length: int = 6,
    split_overlap: int = 2,
) -> list[str]:
    """Split ``text`` into chunks using haystack's ``DocumentSplitter``.

    The text is wrapped in a single ``Document`` tagged with ``source`` and
    passed through a splitter configured with ``split_by="sentence"``.

    Args:
        text: The text to split.
        source: Value stored under the ``"source"`` key of the document's
            metadata.
        split_length: Forwarded to ``DocumentSplitter`` as ``split_length``.
        split_overlap: Forwarded to ``DocumentSplitter`` as ``split_overlap``.

    Returns:
        The ``content`` of each document produced by the splitter, in order;
        an empty list when ``text`` is empty or whitespace-only.
    """
    has_content = bool(text.strip())
    if not has_content:
        return []

    wrapped = Document(content=text, meta={"source": source})

    sentence_splitter = DocumentSplitter(
        split_by="sentence",
        split_length=split_length,
        split_overlap=split_overlap,
    )
    sentence_splitter.warm_up()

    split_output = sentence_splitter.run(documents=[wrapped])

    chunks = []
    for piece in split_output["documents"]:
        chunks.append(piece.content)
    return chunks