rag / index_docs.py
jessica45's picture
updated rag
5f04d6e verified
from typing import Optional
from docx import Document
try:
import fitz # PyMuPDF
except Exception:
# fall back to pymupdf module name if present
import pymupdf as fitz
def load_pdf_text(file_path: str) -> str:
    """Extract the text of every page of a PDF as a single string.

    Pages are read in document order. Non-empty page texts are joined with
    newlines and the result is stripped of leading/trailing whitespace.

    Args:
        file_path: Path of the PDF file, opened through ``fitz.open``.

    Returns:
        The extracted text, or ``""`` when the file cannot be opened or
        read. Errors are reported on stdout instead of being raised.
    """
    try:
        doc = fitz.open(file_path)
        try:
            page_texts = []
            for page in doc:
                try:
                    page_text = page.get_text()
                except Exception:
                    # Older versions use the camelCase method name; when that
                    # is missing too, the page simply contributes no text.
                    page_text = page.getText() if hasattr(page, 'getText') else ''
                if page_text:
                    page_texts.append(page_text)
        finally:
            # Always release the document, including when a page fails
            # mid-iteration. A failure while closing is deliberately ignored
            # so it cannot hide the extracted text or the original error.
            try:
                doc.close()
            except Exception:
                pass
        return "\n".join(page_texts).strip()
    except Exception as e:
        print(f"Error reading PDF {file_path}: {e}")
        return ""
def load_docx_text(file_path: str) -> str:
    """Return the non-empty paragraphs of a DOCX file, one per line.

    Args:
        file_path: Path of the document, passed to ``Document``.

    Returns:
        The paragraph texts joined with newlines and stripped of
        leading/trailing whitespace, or ``""`` if anything goes wrong
        (the error is printed to stdout).
    """
    try:
        non_empty = []
        for paragraph in Document(file_path).paragraphs:
            content = paragraph.text
            # Empty paragraphs are dropped so they do not become blank lines.
            if content:
                non_empty.append(content)
        joined = "\n".join(non_empty)
        return joined.strip()
    except Exception as e:
        print(f"Error reading DOCX {file_path}: {e}")
        return ""
def load_txt_text(file_path: str) -> str:
    """Read a UTF-8 text file and return its contents unchanged.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The full file contents (no whitespace is stripped), or ``""`` if
        the file cannot be opened or decoded. The error is printed to
        stdout rather than raised.
    """
    contents = ""
    try:
        with open(file_path, encoding='utf-8') as handle:
            contents = handle.read()
    except Exception as e:
        print(f"Error reading TXT {file_path}: {e}")
        # Discard anything partially assigned; callers get an empty string.
        contents = ""
    return contents
def extract_text_from_path(path: str) -> Optional[str]:
    """Extract text from a file, choosing the loader by file extension.

    The extension check is case-insensitive. ``.pdf``, ``.docx`` and
    ``.txt`` are handled by ``load_pdf_text``, ``load_docx_text`` and
    ``load_txt_text`` respectively.

    Args:
        path: Path of the file to read.

    Returns:
        Whatever the selected loader returns, or ``None`` when the
        extension is not one of the supported three.
    """
    lowered = path.lower()
    if lowered.endswith('.pdf'):
        loader = load_pdf_text
    elif lowered.endswith('.docx'):
        loader = load_docx_text
    elif lowered.endswith('.txt'):
        loader = load_txt_text
    else:
        return None
    # The loader receives the original path, not the lower-cased copy.
    return loader(path)
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 100) -> list:
    """Split text into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    The final chunk may be shorter than ``chunk_size`` and can consist only
    of characters already present in the previous chunk.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The list of chunks in order; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # A step of zero or less would never reach the end of the text, and a
    # negative overlap would silently skip characters between chunks.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )
    step = chunk_size - overlap
    # Slicing clamps at the end of the string, so no explicit min() is needed.
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
if __name__ == '__main__':
    # Manual smoke test: extract text from one path, chunk it, and print a
    # short report with previews of the first two chunks.
    import sys

    # Passed to chunk_text explicitly so the chunk_size check below always
    # matches the overlap that is actually used.
    overlap = 100

    def usage():
        print('Usage: python src/index_docs.py <path-to-file-or-folder> [chunk_size]')

    if len(sys.argv) < 2:
        usage()
        sys.exit(1)

    path = sys.argv[1]
    chunk_size = 500
    if len(sys.argv) > 2:
        try:
            chunk_size = int(sys.argv[2])
        except ValueError:
            print(f'chunk_size must be an integer, got {sys.argv[2]!r}')
            usage()
            sys.exit(1)

    # chunk_text advances by chunk_size - overlap per chunk; if that step is
    # not positive the chunking loop cannot make progress.
    if chunk_size <= overlap:
        print(f'chunk_size must be greater than the overlap ({overlap}).')
        sys.exit(1)

    print(f'Testing extraction for: {path}')
    text = extract_text_from_path(path)
    # Covers both None (unsupported extension) and "" (loader failure/empty file).
    if not text:
        print('No text extracted or unsupported file type.')
        sys.exit(1)

    print('Characters extracted:', len(text))
    chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
    print('Chunks produced:', len(chunks))
    if chunks:
        preview = 300
        print('\n--- First chunk preview ---')
        print(chunks[0][:preview])
        print('\n--- Second chunk preview ---')
        print(chunks[1][:preview] if len(chunks) > 1 else '<none>')