File size: 12,877 Bytes
6a70d5b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
import os
import io
import sqlite3
import pandas as pd
from typing import List, Dict, Any
from pathlib import Path

# Document processing libraries
import PyPDF2
import pdfplumber
from docx import Document
import pytesseract
from PIL import Image

# ML libraries
from sentence_transformers import SentenceTransformer

from config import Config

class DocumentProcessor:
    """Handle document processing for various file types"""
    
    def __init__(self, config: Config = None):
        """Create a processor, load the embedding model and set up OCR.

        Args:
            config: Settings object. When omitted (or otherwise falsy) a
                fresh ``Config()`` is created and used instead.
        """
        if not config:
            config = Config()
        self.config = config

        # The sentence-transformer model is loaded eagerly, at construction
        model_name = self.config.EMBEDDING_MODEL
        print(f"Loading embedding model: {model_name}")
        self.embedding_model = SentenceTransformer(model_name)

        # Apply the Tesseract settings from the config
        self._setup_tesseract()
    
    def _setup_tesseract(self):
        """Setup Tesseract OCR configuration"""
        try:
            if os.path.exists(self.config.TESSERACT_CMD):
                pytesseract.pytesseract.tesseract_cmd = self.config.TESSERACT_CMD
            print("✅ Tesseract OCR configured successfully")
        except Exception as e:
            print(f"⚠️ Tesseract setup warning: {e}")
    
    def extract_text_from_pdf(self, file_path: str) -> str:
        """Extract text from PDF using multiple methods"""
        text = ""
        
        try:
            # Primary method: pdfplumber
            with pdfplumber.open(file_path) as pdf:
                for page_num, page in enumerate(pdf.pages):
                    try:
                        page_text = page.extract_text()
                        if page_text and page_text.strip():
                            text += f"\n[Page {page_num + 1}]\n{page_text}\n"
                    except Exception as e:
                        print(f"Warning: Could not extract text from page {page_num + 1}: {e}")
                        
        except Exception as e:
            print(f"pdfplumber failed, trying PyPDF2: {e}")
            
            # Fallback method: PyPDF2
            try:
                with open(file_path, 'rb') as file:
                    pdf_reader = PyPDF2.PdfReader(file)
                    for page_num, page in enumerate(pdf_reader.pages):
                        try:
                            page_text = page.extract_text()
                            if page_text and page_text.strip():
                                text += f"\n[Page {page_num + 1}]\n{page_text}\n"
                        except Exception as e:
                            print(f"Warning: Could not extract text from page {page_num + 1}: {e}")
            except Exception as e:
                print(f"PyPDF2 also failed: {e}")
                raise ValueError(f"Could not extract text from PDF: {e}")
        
        if not text.strip():
            raise ValueError("No text content found in PDF")
            
        return text
    
    def extract_text_from_docx(self, file_path: str) -> str:
        """Extract text from Word document"""
        try:
            doc = Document(file_path)
            text = ""
            
            # Extract paragraph text
            for para_num, paragraph in enumerate(doc.paragraphs):
                if paragraph.text.strip():
                    text += f"{paragraph.text}\n"
            
            # Extract table text if any
            for table_num, table in enumerate(doc.tables):
                text += f"\n[Table {table_num + 1}]\n"
                for row in table.rows:
                    row_text = " | ".join([cell.text.strip() for cell in row.cells])
                    if row_text.strip():
                        text += f"{row_text}\n"
            
            if not text.strip():
                raise ValueError("No text content found in Word document")
                
            return text
            
        except Exception as e:
            raise ValueError(f"Could not process Word document: {e}")
    
    def extract_text_from_image(self, image_data: bytes) -> str:
        """Extract text from image using OCR"""
        try:
            # Open image
            image = Image.open(io.BytesIO(image_data))
            
            # Convert to RGB if necessary
            if image.mode != 'RGB':
                image = image.convert('RGB')
            
            # Perform OCR
            text = pytesseract.image_to_string(
                image,
                lang=self.config.OCR_LANGUAGE,
                config='--psm 6'  # Uniform block of text
            )
            
            if not text.strip():
                # Try different PSM mode
                text = pytesseract.image_to_string(
                    image,
                    lang=self.config.OCR_LANGUAGE,
                    config='--psm 3'  # Fully automatic page segmentation
                )
            
            return text.strip()
            
        except Exception as e:
            raise ValueError(f"OCR failed: {e}")
    
    def extract_text_from_csv(self, file_path: str) -> str:
        """Extract text from CSV file"""
        try:
            # Try different encodings
            encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
            df = None
            
            for encoding in encodings:
                try:
                    df = pd.read_csv(file_path, encoding=encoding)
                    break
                except UnicodeDecodeError:
                    continue
            
            if df is None:
                raise ValueError("Could not read CSV with any supported encoding")
            
            # Convert DataFrame to text
            text = f"CSV Data from: {Path(file_path).name}\n\n"
            text += f"Shape: {df.shape[0]} rows, {df.shape[1]} columns\n\n"
            
            # Add column information
            text += "Columns:\n"
            for col in df.columns:
                text += f"- {col}\n"
            text += "\n"
            
            # Add sample data (first few rows)
            text += "Sample Data:\n"
            text += df.head(10).to_string(index=False) + "\n\n"
            
            # Add summary statistics for numeric columns
            numeric_cols = df.select_dtypes(include=['number']).columns
            if len(numeric_cols) > 0:
                text += "Numeric Summary:\n"
                text += df[numeric_cols].describe().to_string() + "\n\n"
            
            return text
            
        except Exception as e:
            raise ValueError(f"Could not process CSV file: {e}")
    
    def extract_text_from_db(self, file_path: str) -> str:
        """Extract text from SQLite database"""
        try:
            conn = sqlite3.connect(file_path)
            text = f"SQLite Database: {Path(file_path).name}\n\n"
            
            # Get all table names
            cursor = conn.cursor()
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
            tables = cursor.fetchall()
            
            if not tables:
                raise ValueError("No tables found in database")
            
            text += f"Tables found: {len(tables)}\n\n"
            
            for table_name_tuple in tables:
                table_name = table_name_tuple[0]
                text += f"=== Table: {table_name} ===\n"
                
                try:
                    # Get table schema
                    cursor.execute(f"PRAGMA table_info({table_name})")
                    columns = cursor.fetchall()
                    
                    text += "Columns:\n"
                    for col in columns:
                        text += f"- {col[1]} ({col[2]})\n"
                    
                    # Get row count
                    cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
                    row_count = cursor.fetchone()[0]
                    text += f"Row count: {row_count}\n\n"
                    
                    # Get sample data
                    df = pd.read_sql_query(f"SELECT * FROM {table_name} LIMIT 10", conn)
                    text += "Sample Data:\n"
                    text += df.to_string(index=False) + "\n\n"
                    
                except Exception as e:
                    text += f"Error reading table {table_name}: {e}\n\n"
            
            conn.close()
            return text
            
        except Exception as e:
            raise ValueError(f"Could not process SQLite database: {e}")
    
    def chunk_text(self, text: str, metadata: Dict[str, Any] = None) -> List[Dict[str, Any]]:
        """Split text into chunks with overlap and metadata"""
        if not text.strip():
            return []
        
        # Clean text
        text = self._clean_text(text)
        
        chunks = []
        words = text.split()
        
        if len(words) <= self.config.CHUNK_SIZE:
            # If text is smaller than chunk size, return as single chunk
            chunks.append({
                'text': text,
                'metadata': metadata or {},
                'chunk_index': 0,
                'word_count': len(words)
            })
        else:
            # Split into overlapping chunks
            for i in range(0, len(words), self.config.CHUNK_SIZE - self.config.CHUNK_OVERLAP):
                chunk_words = words[i:i + self.config.CHUNK_SIZE]
                chunk_text = " ".join(chunk_words)
                
                chunk_metadata = (metadata or {}).copy()
                chunk_metadata.update({
                    'chunk_index': len(chunks),
                    'word_count': len(chunk_words),
                    'start_word': i,
                    'end_word': i + len(chunk_words)
                })
                
                chunks.append({
                    'text': chunk_text,
                    'metadata': chunk_metadata
                })
                
                # Break if we've covered all words
                if i + self.config.CHUNK_SIZE >= len(words):
                    break
        
        return chunks
    
    def _clean_text(self, text: str) -> str:
        """Clean and normalize text"""
        # Remove excessive whitespace
        import re
        text = re.sub(r'\s+', ' ', text)
        
        # Remove special characters that might cause issues
        text = re.sub(r'[^\w\s\.,!?;:()\-\'"$%&@#]', ' ', text)
        
        # Remove excessive punctuation
        text = re.sub(r'[.]{3,}', '...', text)
        text = re.sub(r'[-]{3,}', '---', text)
        
        return text.strip()
    
    def process_document(self, file_path: str, file_type: str) -> List[str]:
        """Process document based on file type and return text chunks"""
        try:
            # Extract text based on file type
            if file_type.lower() == '.pdf':
                text = self.extract_text_from_pdf(file_path)
            elif file_type.lower() == '.docx':
                text = self.extract_text_from_docx(file_path)
            elif file_type.lower() == '.txt':
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    text = f.read()
            elif file_type.lower() in ['.jpg', '.jpeg', '.png']:
                with open(file_path, 'rb') as f:
                    text = self.extract_text_from_image(f.read())
            elif file_type.lower() == '.csv':
                text = self.extract_text_from_csv(file_path)
            elif file_type.lower() == '.db':
                text = self.extract_text_from_db(file_path)
            else:
                raise ValueError(f"Unsupported file type: {file_type}")
            
            if not text or not text.strip():
                raise ValueError("No text content extracted from file")
            
            # Create metadata
            metadata = {
                'filename': Path(file_path).name,
                'file_type': file_type,
                'file_size': os.path.getsize(file_path)
            }
            
            # Chunk the text
            chunks_data = self.chunk_text(text, metadata)
            
            # Return just the text chunks for backward compatibility
            return [chunk['text'] for chunk in chunks_data]
            
        except Exception as e:
            print(f"Error processing document {file_path}: {e}")
            raise
    
    def get_supported_formats(self) -> Dict[str, str]:
        """Get supported file formats"""
        return {
            '.pdf': 'PDF documents',
            '.docx': 'Microsoft Word documents',
            '.txt': 'Plain text files',
            '.jpg': 'JPEG images (with OCR)',
            '.jpeg': 'JPEG images (with OCR)',
            '.png': 'PNG images (with OCR)',
            '.csv': 'Comma-separated values',
            '.db': 'SQLite databases'
        }