| | from typing import Optional |
| | from docx import Document |
| | try: |
| | import fitz |
| | except Exception: |
| | |
| | import pymupdf as fitz |
| |
|
def load_pdf_text(file_path: str) -> str:
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF, passed to ``fitz.open``.

    Returns:
        The text of all non-empty pages joined by newlines, with leading and
        trailing whitespace stripped. If the file cannot be opened or read,
        the error is printed and an empty string is returned.
    """
    try:
        doc = fitz.open(file_path)
        try:
            page_texts = []
            for page in doc:
                try:
                    page_text = page.get_text()
                except Exception:
                    # Fall back to the camelCase spelling when the
                    # snake_case call fails; give up on the page otherwise.
                    page_text = page.getText() if hasattr(page, 'getText') else ''
                if page_text:
                    page_texts.append(page_text)
        finally:
            # Always release the document, even when extraction fails
            # part-way through the pages.
            try:
                doc.close()
            except Exception:
                # Closing is best-effort: a failure here must not mask the
                # extracted text or the original error.
                pass
        return "\n".join(page_texts).strip()
    except Exception as e:
        print(f"Error reading PDF {file_path}: {e}")
        return ""
| |
|
| |
|
def load_docx_text(file_path: str) -> str:
    """Return the non-empty paragraphs of a DOCX file as one string.

    Args:
        file_path: Path of the document, passed to ``Document``.

    Returns:
        The text of every paragraph that has text, joined by newlines and
        stripped of surrounding whitespace. On any error the problem is
        printed and an empty string is returned.
    """
    try:
        document = Document(file_path)
        lines = []
        for paragraph in document.paragraphs:
            # Paragraphs without text are skipped entirely.
            if paragraph.text:
                lines.append(paragraph.text)
        joined = "\n".join(lines)
    except Exception as e:
        print(f"Error reading DOCX {file_path}: {e}")
        return ""
    return joined.strip()
| |
|
| |
|
def load_txt_text(file_path: str) -> str:
    """Read a UTF-8 text file and return its full contents.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file contents exactly as read (no stripping). If the file cannot
        be opened or decoded, the error is printed and an empty string is
        returned.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as source:
            contents = source.read()
    except Exception as e:
        print(f"Error reading TXT {file_path}: {e}")
        return ""
    else:
        return contents
| |
|
| |
|
def extract_text_from_path(path: str) -> Optional[str]:
    """Extract text from *path* using the loader that matches its extension.

    The extension comparison is case-insensitive. ``.pdf``, ``.docx`` and
    ``.txt`` are recognised.

    Args:
        path: Path of the file to read.

    Returns:
        Whatever the matching loader returns, or ``None`` when the path does
        not end in one of the recognised extensions.
    """
    # Everything after the last dot of the lowercased path is the extension;
    # an empty separator means the path contains no dot at all.
    _, dot, extension = path.lower().rpartition('.')
    if not dot:
        return None

    if extension == 'pdf':
        return load_pdf_text(path)
    elif extension == 'docx':
        return load_docx_text(path)
    elif extension == 'txt':
        return load_txt_text(path)
    else:
        return None
| |
|
| |
|
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 100) -> list:
    """Split *text* into fixed-size character chunks that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    The final chunk may be shorter than ``chunk_size``. A negative
    ``overlap`` leaves gaps between chunks instead of repeating text.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must be
            smaller than ``chunk_size``.

    Returns:
        The list of chunks in order; empty when *text* is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``. Either case would make the start
            offset stop advancing, so chunking would never finish.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    step = chunk_size - overlap
    # Slicing clips at the end of the string, so the last chunk needs no
    # special casing.
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
| |
|
| |
|
if __name__ == '__main__':
    # Manual smoke test: extract text from one file, chunk it, and print a
    # short summary with previews of the first two chunks.
    import sys

    def usage():
        """Print the command-line synopsis."""
        print('Usage: python src/index_docs.py <path-to-file-or-folder> [chunk_size]')

    if len(sys.argv) < 2:
        usage()
        sys.exit(1)

    path = sys.argv[1]

    chunk_size = 500
    if len(sys.argv) > 2:
        try:
            chunk_size = int(sys.argv[2])
        except ValueError:
            print(f'Invalid chunk_size: {sys.argv[2]!r} (expected a positive integer)')
            usage()
            sys.exit(1)
    if chunk_size <= 0:
        print(f'Invalid chunk_size: {chunk_size} (expected a positive integer)')
        usage()
        sys.exit(1)

    # The default overlap of 100 only works when it is smaller than the chunk
    # size; for small chunk sizes keep the same 1:5 ratio instead so the
    # chunker always advances.
    overlap = 100 if chunk_size > 100 else chunk_size // 5

    print(f'Testing extraction for: {path}')
    text = extract_text_from_path(path)
    if not text:
        # Covers both an unsupported extension (None) and an empty result.
        print('No text extracted or unsupported file type.')
        sys.exit(1)

    print('Characters extracted:', len(text))
    chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
    print('Chunks produced:', len(chunks))
    if chunks:
        preview = 300
        print('\n--- First chunk preview ---')
        print(chunks[0][:preview])
        print('\n--- Second chunk preview ---')
        print(chunks[1][:preview] if len(chunks) > 1 else '<none>')
| |
|