File size: 2,172 Bytes
4d48d5a
 
 
d0d0352
 
4d48d5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import logging
from pathlib import Path

import fitz  # PyMuPDF

logger = logging.getLogger(__name__)

class DocumentProcessor:
    """Simplified document processor for the API service.

    PDFs are read with PyMuPDF. Image extensions are accepted as input but
    no text is extracted from them, because this simplified version has no
    OCR backend.
    """

    def __init__(self):
        """Initialize the document processor"""
        # Accepted extensions: lower-case, including the leading dot, so they
        # compare directly against ``Path(filename).suffix.lower()``.
        self.supported_formats = {'.pdf', '.png', '.jpg', '.jpeg', '.tiff', '.bmp'}

    def process_document(
        self,
        file_data: bytes,
        filename: str,
        use_ocr: bool = False
    ) -> str:
        """
        Extract text from document (PDF or image)

        Args:
            file_data: Raw file content
            filename: Original filename; only its extension is inspected, to
                choose how the content is handled
            use_ocr: Whether to use OCR (not implemented in this simplified version)

        Returns:
            Extracted text as string for PDFs. For a supported image format
            with ``use_ocr`` False, a fixed notice string is returned instead
            of extracted text.

        Raises:
            ValueError: If the file extension is not in ``supported_formats``.
            NotImplementedError: If the file is an image and ``use_ocr`` is True.
            Exception: Any error raised while reading the PDF is logged once
                here and re-raised unchanged.
        """
        try:
            file_ext = Path(filename).suffix.lower()
            logger.info("Processing file: %s with extension: %s", filename, file_ext)

            if file_ext not in self.supported_formats:
                raise ValueError(f"Unsupported file format: {file_ext}")

            if file_ext == '.pdf':
                return self._process_pdf(file_data)

            # Every remaining supported extension is an image format.
            if use_ocr:
                raise NotImplementedError("OCR for images not implemented")
            # NOTE: this is a notice, not extracted content; callers receive it
            # through the normal return value.
            return "Text extraction from images requires OCR to be enabled"

        except Exception as e:
            # Single logging point for all failures, including those raised by
            # _process_pdf, so each error is reported exactly once.
            logger.error("Error processing document: %s", e)
            raise

    def _process_pdf(self, file_data: bytes) -> str:
        """Process PDF to extract text using PyMuPDF

        Args:
            file_data: Raw PDF content, opened from memory as a stream

        Returns:
            The text of every page, in page order, joined by a blank line.

        Errors raised by PyMuPDF propagate to the caller, which logs them.
        """
        with fitz.open(stream=file_data, filetype="pdf") as pdf_doc:
            # The join consumes the generator before the document is closed.
            return "\n\n".join(page.get_text() for page in pdf_doc)