File size: 2,939 Bytes
7498f2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from __future__ import annotations
from typing import Optional
import io
import logging

logger = logging.getLogger(__name__)

# Optional document backends. Each import is attempted once at module load
# and the outcome is recorded in a *_AVAILABLE flag. When an import fails the
# library name is still bound (to None) so later references do not raise
# NameError.
try:
    from docx import Document  # type: ignore
except Exception:  # pragma: no cover
    DOCX_AVAILABLE = False
    Document = None  # type: ignore
    logger.info("python-docx not available - .docx support disabled")
else:
    DOCX_AVAILABLE = True

try:
    import PyPDF2  # type: ignore
except Exception:
    PDF_AVAILABLE = False
    PyPDF2 = None  # type: ignore
    logger.info("PyPDF2 not available - .pdf support disabled")
else:
    PDF_AVAILABLE = True


def _decode_txt(data: bytes) -> str:
    """Decode raw .txt bytes as UTF-8, dropping a leading BOM and undecodable bytes."""
    # "utf-8-sig" behaves like "utf-8" but strips a leading byte-order mark,
    # which would otherwise surface as a stray "\ufeff" at the start of the text.
    return data.decode("utf-8-sig", errors="ignore")


def _extract_docx_text(data: bytes) -> str:
    """Return the non-blank paragraphs of a .docx payload joined by newlines."""
    doc = Document(io.BytesIO(data))  # type: ignore
    # Only doc.paragraphs is read; whitespace-only paragraphs are skipped.
    return "\n".join(p.text for p in doc.paragraphs if p.text.strip())


def _extract_pdf_text(data: bytes) -> str:
    """Return the text of every page of a PDF payload joined by newlines."""
    reader = PyPDF2.PdfReader(io.BytesIO(data))  # type: ignore
    # Defensive: if a page yields no text (None), treat it as empty so the
    # join cannot raise TypeError and discard the text of every other page.
    return "\n".join(page.extract_text() or "" for page in reader.pages)


def read_uploaded_text(file) -> Optional[str]:
    """Read text from a Streamlit UploadedFile. Supports .txt, .docx, and .pdf.

    The format is chosen from the (case-insensitive) file name suffix.

    Args:
        file: Uploaded file object exposing ``.name`` and ``.getvalue()``
            (returning the raw bytes), or ``None``.

    Returns:
        The extracted text, or ``None`` when ``file`` is ``None``, the suffix
        is not supported, the library needed for the format is not installed,
        or reading fails for any reason (the error is logged, not raised).
    """
    if file is None:
        return None

    name = file.name.lower()
    logger.info("Attempting to read file: %s", file.name)

    try:
        if name.endswith(".txt"):
            kind = ".txt"
            text = _decode_txt(file.getvalue())

        elif name.endswith(".docx"):
            if not DOCX_AVAILABLE:
                logger.warning("python-docx not installed. Cannot read .docx files.")
                logger.info("Install with: pip install python-docx")
                return None
            kind = ".docx"
            text = _extract_docx_text(file.getvalue())

        elif name.endswith(".pdf"):
            if not PDF_AVAILABLE:
                logger.warning("PyPDF2 not installed. Cannot read .pdf files.")
                logger.info("Install with: pip install PyPDF2")
                return None
            kind = ".pdf"
            text = _extract_pdf_text(file.getvalue())

        else:
            logger.warning("Unsupported file type: %s", name)
            return None

        logger.info("Successfully read %s file: %d characters", kind, len(text))
        return text

    except Exception as e:
        # Best-effort reader: any parsing failure is logged with its traceback
        # and reported to the caller as None rather than propagated.
        logger.exception("Error reading file %s: %s", file.name, e)
        return None