from io import BytesIO

import docx
import fitz


def _extract_pdf_text(contents: bytes) -> str:
    """Return the concatenated text of every page of an in-memory PDF."""
    pdf = fitz.open(stream=contents, filetype="pdf")
    try:
        return "".join(page.get_text() for page in pdf)
    finally:
        # Always release the document, even when a page fails to parse;
        # otherwise every upload leaves an open document behind.
        pdf.close()


def _extract_docx_text(contents: bytes) -> str:
    """Return the paragraphs of an in-memory DOCX, one per line."""
    doc = docx.Document(BytesIO(contents))
    # Each paragraph is newline-terminated, including the last one.
    return "".join(para.text + "\n" for para in doc.paragraphs)


def _decode_txt(contents: bytes) -> str:
    """Decode plain-text bytes as UTF-8, dropping a leading BOM if present."""
    try:
        # "utf-8-sig" is plain UTF-8 that also strips a leading byte-order
        # mark. Without it the BOM (U+FEFF) would stay in the returned text,
        # and because U+FEFF is not whitespace a BOM-only file would slip past
        # the emptiness check.
        return contents.decode("utf-8-sig")
    except UnicodeDecodeError as e:
        raise ValueError("TXT file must be UTF-8 encoded.") from e


async def extract_text(file) -> str:
    """Extract the text content of an uploaded PDF, DOCX, or TXT file.

    The format is chosen from the file name's extension (case-insensitive);
    the bytes themselves are not sniffed.

    Args:
        file: Upload object exposing a ``filename`` attribute (may be ``None``
            or empty) and an awaitable ``read()`` that returns the file's
            bytes. NOTE(review): presumably a FastAPI/Starlette ``UploadFile``
            -- confirm against the caller.

    Returns:
        The extracted text. DOCX output has one newline-terminated line per
        paragraph; TXT output is the decoded content without any leading BOM.

    Raises:
        ValueError: If the upload is empty, the extension is not ``.pdf``,
            ``.docx`` or ``.txt``, a TXT file is not valid UTF-8, or no
            non-whitespace text could be extracted.

    Exceptions raised by the PDF/DOCX parsers themselves (for example on a
    corrupt file) are not translated and propagate to the caller unchanged.
    """
    filename = (file.filename or "").lower()
    contents = await file.read()

    if not contents:
        raise ValueError("Uploaded file is empty.")

    if filename.endswith(".pdf"):
        text = _extract_pdf_text(contents)
        empty_message = "Could not extract text from PDF."
    elif filename.endswith(".docx"):
        text = _extract_docx_text(contents)
        empty_message = "Could not extract text from DOCX."
    elif filename.endswith(".txt"):
        text = _decode_txt(contents)
        empty_message = "Uploaded TXT file is empty."
    else:
        raise ValueError("Unsupported file type. Please upload PDF, DOCX, or TXT.")

    # A file that parses but yields only whitespace is treated as a failure.
    if not text.strip():
        raise ValueError(empty_message)
    return text