import logging
from io import BytesIO

import docx
from fastapi import HTTPException
from pypdf import PdfReader


def parse_docx(file: BytesIO) -> str:
    """Return the text of a DOCX document.

    The text of every paragraph exposed by ``Document.paragraphs`` is
    concatenated in order, each followed by a newline.

    Args:
        file: Binary stream containing the DOCX data.

    Returns:
        The concatenated paragraph text (empty string if there are no
        paragraphs).
    """
    doc = docx.Document(file)
    # join() avoids the repeated string re-allocation of a `+=` loop.
    return "".join(para.text + "\n" for para in doc.paragraphs)


def parse_pdf(file: BytesIO) -> str:
    """Return the text extracted from every page of a PDF.

    Page texts are concatenated in page order with no separator.

    Args:
        file: Binary stream containing the PDF data.

    Returns:
        The concatenated text of all pages.

    Raises:
        HTTPException: With status 500 if reading the PDF or extracting
            text fails for any reason; the underlying error is logged and
            chained as the cause.
    """
    try:
        doc = PdfReader(file)
        # Defensive `or ""`: a page that yields no text (None or empty)
        # contributes nothing instead of breaking the concatenation.
        return "".join(page.extract_text() or "" for page in doc.pages)
    except Exception as e:
        logging.error("Error while processing PDF: %s", e)
        raise HTTPException(
            status_code=500, detail="Error processing PDF file"
        ) from e


def parse_txt(file: BytesIO) -> str:
    """Return the remaining contents of *file* decoded as UTF-8.

    Args:
        file: Binary stream containing UTF-8 encoded text.

    Returns:
        The decoded text.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    return file.read().decode("utf-8")