HRMS_Chatbot / ingestion.py
neerajkalyank's picture
Update ingestion.py
152d5f0 verified
raw
history blame contribute delete
317 Bytes
from pypdf import PdfReader
import docx2txt
def read_file(file):
    """Extract the text content of a file object.

    The format is chosen from the extension of ``file.name``
    (matched case-insensitively):

    * ``.pdf``  -- the text of every page read through ``PdfReader``, joined
      with a single space; a page whose ``extract_text()`` yields nothing
      contributes an empty string.
    * ``.docx`` -- whatever ``docx2txt.process`` returns for the file.
    * anything else -- the result of ``file.read()``, decoded as UTF-8 when
      it is bytes and returned unchanged when it is already ``str``.

    Args:
        file: Object exposing a ``name`` string and a ``read()`` method
            (presumably an uploaded-file object -- confirm against caller).

    Returns:
        The extracted text.

    Raises:
        UnicodeDecodeError: If the fallback branch reads bytes that are not
            valid UTF-8.
    """
    # Normalise case so "REPORT.PDF" / "Resume.DOCX" are routed to the right
    # parser instead of falling through to the UTF-8 decoder.
    filename = file.name.lower()

    if filename.endswith(".pdf"):
        reader = PdfReader(file)
        # extract_text() may return None/"" for a page; substitute "" so the
        # join never sees a non-string.
        return " ".join(page.extract_text() or "" for page in reader.pages)

    if filename.endswith(".docx"):
        return docx2txt.process(file)

    content = file.read()
    # A handle opened in text mode already yields str, which has no .decode().
    if isinstance(content, str):
        return content
    return content.decode("utf-8")