"""Extract plain text from PDF, DOCX and TXT files and split it into chunks.

Run as a script to smoke-test extraction and chunking on a single file.
"""

import sys
from typing import List, Optional

try:
    from docx import Document
except ImportError:
    # python-docx is only needed by load_docx_text; keep the module importable
    # without it so the other loaders and chunk_text stay usable.
    Document = None

try:
    import fitz  # PyMuPDF
except Exception:
    # fall back to pymupdf module name if present
    try:
        import pymupdf as fitz
    except ImportError:
        # Only load_pdf_text needs PyMuPDF; it reports the missing dependency.
        fitz = None


def load_pdf_text(file_path: str) -> str:
    """Return the text of every page of a PDF, joined by newlines.

    Pages that yield no text are skipped and the result is stripped of
    leading/trailing whitespace.  Any failure (missing PyMuPDF, unreadable
    file, ...) is printed and reported as an empty string.
    """
    try:
        if fitz is None:
            raise ImportError("PyMuPDF is not installed")
        doc = fitz.open(file_path)
        try:
            pages = []
            for page in doc:
                try:
                    page_text = page.get_text()
                except Exception:
                    # try alternate name for older versions
                    page_text = page.getText() if hasattr(page, 'getText') else ''
                if page_text:
                    pages.append(page_text)
        finally:
            # Always release the document, even when a page fails to parse.
            # Closing is best-effort: a failure here must not mask the result.
            try:
                doc.close()
            except Exception:
                pass
        return "\n".join(pages).strip()
    except Exception as e:
        print(f"Error reading PDF {file_path}: {e}")
        return ""


def load_docx_text(file_path: str) -> str:
    """Return the non-empty paragraphs of a DOCX file, joined by newlines.

    Any failure (missing python-docx, unreadable file, ...) is printed and
    reported as an empty string.
    """
    try:
        if Document is None:
            raise ImportError("python-docx is not installed")
        doc = Document(file_path)
        paragraphs = [p.text for p in doc.paragraphs if p.text]
        return "\n".join(paragraphs).strip()
    except Exception as e:
        print(f"Error reading DOCX {file_path}: {e}")
        return ""


def load_txt_text(file_path: str) -> str:
    """Return the contents of a UTF-8 text file, or "" if it cannot be read.

    The file is decoded with ``utf-8-sig`` so that a leading byte-order mark,
    if present, does not end up as a stray character in the returned text.
    """
    try:
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            return f.read()
    except Exception as e:
        print(f"Error reading TXT {file_path}: {e}")
        return ""


def extract_text_from_path(path: str) -> Optional[str]:
    """Dispatch to the loader matching the file extension (case-insensitive).

    Returns the extracted text (possibly "" when the loader failed), or
    ``None`` when the extension is not .pdf, .docx or .txt.
    """
    lowered = path.lower()
    if lowered.endswith('.pdf'):
        return load_pdf_text(path)
    if lowered.endswith('.docx'):
        return load_docx_text(path)
    if lowered.endswith('.txt'):
        return load_txt_text(path)
    return None


def chunk_text(text: str, chunk_size: int = 500, overlap: int = 100) -> List[str]:
    """Split ``text`` into windows of ``chunk_size`` characters.

    Consecutive chunks share ``overlap`` characters.  The final chunk may be
    shorter than ``chunk_size``; chunking stops as soon as a chunk reaches the
    end of the text, so no chunk is a pure suffix of the one before it.

    Raises:
        ValueError: if ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size`` (the window would
            never advance).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    chunks = []
    step = chunk_size - overlap
    text_length = len(text)
    start = 0
    while start < text_length:
        end = min(start + chunk_size, text_length)
        chunks.append(text[start:end])
        if end == text_length:
            # Everything after this point is already contained in this chunk.
            break
        start += step
    return chunks


def main(argv: Optional[List[str]] = None) -> int:
    """Command-line smoke test: extract a file, chunk it, print a preview.

    ``argv`` holds the arguments after the program name and defaults to
    ``sys.argv[1:]``.  Returns the process exit code (0 on success, 1 on bad
    arguments or when no text could be extracted).
    """
    args = sys.argv[1:] if argv is None else list(argv)

    def usage():
        print('Usage: python src/index_docs.py PATH [chunk_size]')

    if not args:
        usage()
        return 1

    path = args[0]
    chunk_size = 500
    if len(args) > 1:
        try:
            chunk_size = int(args[1])
        except ValueError:
            print(f'chunk_size must be an integer, got {args[1]!r}')
            usage()
            return 1

    print(f'Testing extraction for: {path}')
    text = extract_text_from_path(path)
    if not text:
        print('No text extracted or unsupported file type.')
        return 1

    print('Characters extracted:', len(text))
    try:
        chunks = chunk_text(text, chunk_size=chunk_size)
    except ValueError as exc:
        # e.g. chunk_size not larger than the default overlap
        print(f'Cannot chunk text: {exc}')
        return 1

    print('Chunks produced:', len(chunks))
    if chunks:
        preview = 300
        print('\n--- First chunk preview ---')
        print(chunks[0][:preview])
        print('\n--- Second chunk preview ---')
        print(chunks[1][:preview] if len(chunks) > 1 else '')
    return 0


if __name__ == '__main__':
    sys.exit(main())