Spaces:
Runtime error
Runtime error
| from __future__ import annotations | |
| from typing import Optional | |
| import io | |
| import logging | |
| logger = logging.getLogger(__name__) | |
# Optional dependency: python-docx backs the .docx branch of the reader below.
# If the import fails for any reason, Document is stubbed with None and
# DOCX_AVAILABLE is set to False so callers can check before using it.
try:
    from docx import Document  # type: ignore
except Exception:  # pragma: no cover
    Document = None  # type: ignore
    DOCX_AVAILABLE = False
    logger.info("python-docx not available - .docx support disabled")
else:
    DOCX_AVAILABLE = True
# Optional dependency: PyPDF2 backs the .pdf branch of the reader below.
# If the import fails for any reason, PyPDF2 is stubbed with None and
# PDF_AVAILABLE is set to False so callers can check before using it.
try:
    import PyPDF2  # type: ignore
except Exception:
    PyPDF2 = None  # type: ignore
    PDF_AVAILABLE = False
    logger.info("PyPDF2 not available - .pdf support disabled")
else:
    PDF_AVAILABLE = True
def _decode_txt(data: bytes) -> str:
    """Decode raw .txt bytes as UTF-8, dropping a leading BOM and bad bytes."""
    # "utf-8-sig" decodes exactly like "utf-8" but also strips a leading
    # byte-order mark, which would otherwise surface in the returned text as
    # an invisible "\ufeff" character.
    return data.decode("utf-8-sig", errors="ignore")


def _extract_docx_text(data: bytes) -> str:
    """Join the non-blank paragraphs of a .docx payload with newlines."""
    doc = Document(io.BytesIO(data))  # type: ignore
    return "\n".join(p.text for p in doc.paragraphs if p.text.strip())


def _extract_pdf_text(data: bytes) -> str:
    """Join the extracted text of every page of a .pdf payload with newlines."""
    pdf_reader = PyPDF2.PdfReader(io.BytesIO(data))  # type: ignore
    # Defensive: should a page yield None instead of a string, treat it as an
    # empty page so join() cannot raise and discard the whole document.
    return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)


def read_uploaded_text(file) -> Optional[str]:
    """Read text from a Streamlit UploadedFile. Supports .txt, .docx, and .pdf.

    The format is chosen from the (case-insensitive) suffix of ``file.name``
    and the content is taken from ``file.getvalue()``.

    Args:
        file: Uploaded file object exposing ``name`` and ``getvalue()``,
            or ``None``.

    Returns:
        The extracted text, or ``None`` when ``file`` is ``None``, the file
        type is unsupported, the library needed for the format is not
        available, or reading fails. Failures are logged, not raised.
    """
    if file is None:
        return None

    name = file.name.lower()
    logger.info("Attempting to read file: %s", file.name)

    try:
        if name.endswith(".txt"):
            text = _decode_txt(file.getvalue())
            logger.info("Successfully read .txt file: %d characters", len(text))
            return text

        if name.endswith(".docx"):
            if not DOCX_AVAILABLE:
                logger.warning("python-docx not installed. Cannot read .docx files.")
                logger.info("Install with: pip install python-docx")
                return None
            text = _extract_docx_text(file.getvalue())
            logger.info("Successfully read .docx file: %d characters", len(text))
            return text

        if name.endswith(".pdf"):
            if not PDF_AVAILABLE:
                logger.warning("PyPDF2 not installed. Cannot read .pdf files.")
                logger.info("Install with: pip install PyPDF2")
                return None
            text = _extract_pdf_text(file.getvalue())
            logger.info("Successfully read .pdf file: %d characters", len(text))
            return text

        logger.warning("Unsupported file type: %s", name)
        return None
    except Exception as e:
        # Best-effort reader: any parsing failure is logged with its traceback
        # and reported to the caller as None rather than propagated.
        logger.exception("Error reading file %s: %s", file.name, e)
        return None