from __future__ import annotations

from typing import Optional
import io
import logging

logger = logging.getLogger(__name__)

# Optional third-party parsers: each import is attempted once at module load.
# On failure the name is bound to None and the matching *_AVAILABLE flag is
# cleared so read_uploaded_text() can decline that format instead of crashing.
try:
    from docx import Document  # type: ignore
    DOCX_AVAILABLE = True
except Exception:  # pragma: no cover
    Document = None  # type: ignore
    DOCX_AVAILABLE = False
    logger.info("python-docx not available - .docx support disabled")

try:
    import PyPDF2  # type: ignore
    PDF_AVAILABLE = True
except Exception:
    PyPDF2 = None  # type: ignore
    PDF_AVAILABLE = False
    logger.info("PyPDF2 not available - .pdf support disabled")


def read_uploaded_text(file) -> Optional[str]:
    """Read text from a Streamlit UploadedFile. Supports .txt, .docx, and .pdf.

    The format is chosen from the (case-insensitive) file-name extension.

    Args:
        file: Upload-like object exposing a ``name`` string and a
            ``getvalue()`` method returning the raw bytes, or ``None``.

    Returns:
        The extracted text, or ``None`` when ``file`` is ``None``, the
        extension is not supported, the parser library needed for the format
        is not installed, or reading raised an exception (which is logged
        with its traceback rather than propagated).
    """
    if file is None:
        return None

    name = file.name.lower()
    logger.info("Attempting to read file: %s", file.name)

    try:
        if name.endswith(".txt"):
            # "utf-8-sig" strips a leading UTF-8 byte-order mark, which plain
            # "utf-8" would leave in the text as a stray "\ufeff" character.
            # Undecodable bytes are still dropped, as before.
            text = file.getvalue().decode("utf-8-sig", errors="ignore")
            logger.info("Successfully read .txt file: %d characters", len(text))
            return text

        if name.endswith(".docx"):
            if not DOCX_AVAILABLE:
                logger.warning("python-docx not installed. Cannot read .docx files.")
                logger.info("Install with: pip install python-docx")
                return None

            doc = Document(io.BytesIO(file.getvalue()))  # type: ignore
            # Only body paragraphs are read; whitespace-only ones are skipped.
            text = "\n".join(p.text for p in doc.paragraphs if p.text.strip())
            logger.info("Successfully read .docx file: %d characters", len(text))
            return text

        if name.endswith(".pdf"):
            if not PDF_AVAILABLE:
                logger.warning("PyPDF2 not installed. Cannot read .pdf files.")
                logger.info("Install with: pip install PyPDF2")
                return None

            pdf_reader = PyPDF2.PdfReader(io.BytesIO(file.getvalue()))  # type: ignore
            # Defensive: substitute "" for a page that yields no text so a
            # single None result cannot make join() raise and discard the
            # text of every other page.
            text = "\n".join(page.extract_text() or "" for page in pdf_reader.pages)
            logger.info("Successfully read .pdf file: %d characters", len(text))
            return text

        logger.warning("Unsupported file type: %s", name)
        return None

    except Exception as e:
        # Best-effort boundary: callers receive None instead of an exception;
        # the traceback goes to the log.
        logger.error("Error reading file %s: %s", file.name, e, exc_info=True)
        return None