Pujan-Dev's picture
fixed the debug
139a872
raw
history blame contribute delete
708 Bytes
from pypdf import PdfReader
import docx
from io import BytesIO
import logging
from fastapi import HTTPException
def parse_docx(file: BytesIO):
    """Return the text of a .docx document, one paragraph per line.

    Args:
        file: Binary stream holding the document, handed to ``docx.Document``.

    Returns:
        Each entry of the document's ``paragraphs`` rendered as its ``text``
        followed by a newline, concatenated in order. An empty string when
        the document has no paragraphs.
    """
    document = docx.Document(file)
    # Every paragraph, including the last, is terminated by "\n".
    return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)
def parse_pdf(file: BytesIO):
    """Extract the text of every page of a PDF, concatenated in page order.

    Args:
        file: Binary stream holding the PDF, handed to ``PdfReader``.

    Returns:
        The ``extract_text()`` result of each page joined with no separator.
        An empty string when the reader exposes no pages.

    Raises:
        HTTPException: With status code 500 when opening the PDF or
            extracting text raises any exception. The original error is
            chained as ``__cause__``.
    """
    try:
        reader = PdfReader(file)
        return "".join(page.extract_text() for page in reader.pages)
    except Exception as e:
        # logging.exception keeps the same message but also records the
        # traceback, so the failing call can be found from the logs.
        logging.exception("Error while processing PDF: %s", e)
        raise HTTPException(
            status_code=500, detail="Error processing PDF file") from e
def parse_txt(file: BytesIO):
    """Read a plain-text stream and return its contents as a string.

    Args:
        file: Binary stream whose remaining bytes are read in full.

    Returns:
        The bytes decoded as UTF-8. A leading UTF-8 byte-order mark, if
        present, is dropped instead of surfacing as ``"\\ufeff"`` at the
        start of the text.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    raw = file.read()
    # "utf-8-sig" decodes exactly like "utf-8" except that it strips one
    # leading BOM, which some editors prepend to text files.
    return raw.decode("utf-8-sig")