File size: 5,330 Bytes
1ae86a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
"""
PDF Processing Module
Handles PDF file upload, text extraction, and preprocessing.
"""

import PyPDF2
import io
import re
from typing import Optional, List
import streamlit as st

class PDFProcessor:
    """Validate uploaded PDF files and turn them into cleaned plain text.

    User-facing problems are reported through Streamlit (``st.error`` /
    ``st.warning``) and signalled to the caller by a ``False`` / ``None``
    return value rather than by raising.
    """

    # Default upload size cap in bytes (10MB).
    DEFAULT_MAX_FILE_SIZE = 10 * 1024 * 1024
    # Page count above which a "large PDF" warning is shown.
    LARGE_PDF_PAGE_THRESHOLD = 100
    # Matches every character that is not a word character, whitespace or
    # one of the basic punctuation marks . , ! ? ; : - ( )
    _DISALLOWED_CHARS = re.compile(r'[^\w\s\.\,\!\?\;\:\-\(\)]')

    def __init__(self, max_file_size: int = DEFAULT_MAX_FILE_SIZE):
        """
        Create a processor.

        Args:
            max_file_size: Largest accepted upload size in bytes
                (defaults to 10MB).
        """
        self.max_file_size = max_file_size

    def validate_pdf(self, uploaded_file) -> bool:
        """
        Validate uploaded PDF file

        Checks that a file was supplied, that its size does not exceed
        ``self.max_file_size`` and that its reported MIME type is
        ``application/pdf``. Each failure is reported with ``st.error``.

        Args:
            uploaded_file: Streamlit uploaded file object (must expose
                ``size`` and ``type``), or None when nothing was uploaded

        Returns:
            bool: True if valid, False otherwise
        """
        # Nothing to validate; report it instead of failing on attribute access.
        if uploaded_file is None:
            st.error("Please upload a valid PDF file")
            return False

        # Check file size (a file exactly at the limit is accepted)
        if uploaded_file.size > self.max_file_size:
            size_mb = uploaded_file.size / 1024 / 1024
            # Derive the displayed limit from the configured value so the
            # message stays correct for non-default limits ("10" by default).
            limit_mb = self.max_file_size / 1024 / 1024
            st.error(f"File size ({size_mb:.1f}MB) exceeds limit ({limit_mb:g}MB)")
            return False

        # Check file type
        if uploaded_file.type != "application/pdf":
            st.error("Please upload a valid PDF file")
            return False

        return True

    def extract_text_from_pdf(self, uploaded_file) -> Optional[str]:
        """
        Extract text content from uploaded PDF file

        Pages are processed independently: a page whose extraction raises is
        skipped and reported in a warning, and pages yielding only whitespace
        are ignored. Every kept page is followed by a newline in the result.

        Args:
            uploaded_file: Streamlit uploaded file object (must support
                ``seek`` and ``read``)

        Returns:
            str: Extracted text content or None if extraction fails
                (encrypted, empty, unreadable or text-free PDF)
        """
        try:
            # Reset file pointer so the whole file is read even if it was
            # consumed earlier
            uploaded_file.seek(0)

            # Create a PDF reader object over an in-memory copy of the upload
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.read()))

            # Check if PDF is encrypted
            if pdf_reader.is_encrypted:
                st.error("❌ Cannot process encrypted PDF files. Please upload an unencrypted PDF.")
                return None

            # Check number of pages
            num_pages = len(pdf_reader.pages)
            if num_pages == 0:
                st.error("❌ PDF file appears to be empty or corrupted.")
                return None

            if num_pages > self.LARGE_PDF_PAGE_THRESHOLD:
                st.warning(f"⚠️ Large PDF detected ({num_pages} pages). Processing may take longer.")

            # Extract text from all pages
            page_texts = []
            failed_pages = []  # 1-based page numbers, for user-facing messages

            for page_num, page in enumerate(pdf_reader.pages, start=1):
                try:
                    page_text = page.extract_text()
                    if page_text.strip():  # Only add non-empty pages
                        page_texts.append(page_text)
                except Exception:
                    # Best effort: one bad page must not abort the document.
                    failed_pages.append(page_num)

            # Single join instead of repeated string concatenation.
            text_content = "".join(f"{page_text}\n" for page_text in page_texts)

            # Report failed pages (list them only when there are a few)
            if failed_pages:
                if len(failed_pages) < 5:
                    st.warning(f"⚠️ Could not extract text from pages: {', '.join(map(str, failed_pages))}")
                else:
                    st.warning(f"⚠️ Could not extract text from {len(failed_pages)} pages")

            if not text_content.strip():
                st.error("❌ No readable text content found in the PDF file. The PDF might contain only images or scanned content.")
                return None

            # Check if extracted text is too short
            if len(text_content.strip()) < 100:
                st.warning("⚠️ Very little text was extracted. The PDF might contain mostly images or have formatting issues.")

            return text_content

        except PyPDF2.errors.PdfReadError as e:
            st.error(f"❌ Invalid or corrupted PDF file: {e}")
            return None
        except MemoryError:
            st.error("❌ PDF file is too large to process. Please try a smaller file.")
            return None
        except Exception as e:
            # Top-level boundary: surface anything unexpected to the user
            # instead of crashing the app.
            st.error(f"❌ Unexpected error processing PDF file: {e}")
            return None

    def preprocess_text(self, text: str) -> str:
        """
        Clean and preprocess extracted text

        Replaces every character other than word characters, whitespace and
        ``. , ! ? ; : - ( )`` with a space, then collapses all whitespace runs
        (including newlines) to single spaces and trims both ends.

        Args:
            text: Raw extracted text

        Returns:
            str: Cleaned and preprocessed text ("" for empty or None input)
        """
        if not text:
            return ""

        # Remove special characters that might interfere with processing
        cleaned = self._DISALLOWED_CHARS.sub(' ', text)

        # split() with no arguments drops leading/trailing whitespace and
        # splits on any whitespace run, so one join normalizes all spacing.
        return ' '.join(cleaned.split())

    def process_pdf(self, uploaded_file) -> Optional[str]:
        """
        Complete PDF processing pipeline

        Runs validation, text extraction and preprocessing in that order and
        stops at the first step that fails.

        Args:
            uploaded_file: Streamlit uploaded file object

        Returns:
            str: Processed text content or None if processing fails
        """
        if not self.validate_pdf(uploaded_file):
            return None

        # Extract text
        raw_text = self.extract_text_from_pdf(uploaded_file)
        if raw_text is None:
            return None

        # Preprocess text
        processed_text = self.preprocess_text(raw_text)

        if len(processed_text) < 50:
            st.warning("The extracted text is very short. Please check if the PDF contains readable text.")

        return processed_text