# import fitz  # PyMuPDF
import logging
from io import BytesIO

import docx
from fastapi import HTTPException
from pypdf import PdfReader


def parse_docx(file: BytesIO) -> str:
    """Return the text of a .docx document, one line per paragraph.

    Only ``Document.paragraphs`` is read; nothing else in the document is
    visited by this function.

    Args:
        file: Binary stream holding the .docx content.

    Returns:
        The text of every paragraph, each followed by a newline.
    """
    document = docx.Document(file)
    # Join once instead of growing a string with ``+=`` inside the loop.
    return "".join(f"{paragraph.text}\n" for paragraph in document.paragraphs)


def parse_pdf(file: BytesIO) -> str:
    """Return the extracted text of every page of a PDF.

    Page texts are concatenated directly, with no separator between pages.

    Args:
        file: Binary stream holding the PDF content.

    Returns:
        The extracted text of all pages, in page order.

    Raises:
        HTTPException: With status 500 when opening the PDF or extracting
            text fails for any reason. The underlying error is logged and
            attached as ``__cause__``.
    """
    try:
        reader = PdfReader(file)
        return "".join(page.extract_text() for page in reader.pages)
    except Exception as exc:
        # Log with the traceback; the client only receives a generic message.
        logging.exception("Error while processing PDF: %s", exc)
        raise HTTPException(
            status_code=500, detail="Error processing PDF file"
        ) from exc


def parse_txt(file: BytesIO) -> str:
    """Return the content of a UTF-8 encoded text stream.

    Args:
        file: Binary stream holding UTF-8 text.

    Returns:
        The decoded text, without a leading byte-order mark.

    Raises:
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # "utf-8-sig" decodes exactly like "utf-8" but drops a leading BOM, which
    # would otherwise surface as a stray U+FEFF at the start of the text.
    return file.read().decode("utf-8-sig")


def end_symbol_for_NP_text(text: str) -> str:
    """Strip *text* and make sure it ends with a danda ("।", U+0964).

    NOTE(review): "NP" presumably stands for Nepali -- confirm with callers.

    Args:
        text: Input text; surrounding whitespace is removed first.

    Returns:
        The stripped text, with "।" appended when it did not already end
        with one. An empty or whitespace-only input therefore yields "।".
    """
    text = text.strip()
    if not text.endswith("।"):
        text += "।"
    return text