Chatbot-Documents / src /parsers.py
amitcoolll's picture
Initial RAG document chatbot deployment
c4233b7
raw
history blame contribute delete
545 Bytes
from typing import List, Tuple
from pypdf import PdfReader
from docx import Document
def read_pdf(path: str) -> List[Tuple[int, str]]:
    """Extract the text of every non-empty page in a PDF file.

    Args:
        path: Location of the PDF, handed straight to ``PdfReader``.

    Returns:
        A list of ``(page_number, text)`` pairs in document order. Page
        numbers are 1-based. Pages whose extracted text is missing or
        blank after stripping whitespace are left out, so the numbers in
        the result may have gaps.
    """
    extracted: List[Tuple[int, str]] = []
    for page_no, page in enumerate(PdfReader(path).pages, start=1):
        raw = page.extract_text()
        # A falsy result (None or "") is treated as an empty page.
        content = raw.strip() if raw else ""
        if not content:
            continue
        extracted.append((page_no, content))
    return extracted
def read_docx(path: str) -> List[Tuple[int, str]]:
    """Extract the paragraph text of a Word document as one pseudo-page.

    Args:
        path: Location of the .docx file, handed straight to ``Document``.

    Returns:
        ``[(1, text)]`` where ``text`` is every non-blank paragraph joined
        with newlines and stripped, or an empty list when the document has
        no non-blank paragraph text. The whole document is always reported
        as page 1, mirroring the ``(page_number, text)`` shape of
        ``read_pdf``.
    """
    document = Document(path)
    # Only body paragraphs are read; whitespace-only paragraphs are dropped.
    kept = [para.text for para in document.paragraphs if para.text.strip()]
    combined = "\n".join(kept).strip()
    if not combined:
        return []
    return [(1, combined)]