# Source: Hugging Face Spaces - Agents-MCP-Hackathon/pdf_explainer (Space status: Sleeping)
# File size: 14,531 bytes
# Revisions: 4ce49c4, 68c0007

import base64
import os
from typing import Optional, Tuple, List, Dict, Any
from mistralai import Mistral
from src.utils.text_explainer import TextExplainer

class PDFTextExtractor:
    """Extract text and images from PDFs with Mistral AI OCR and explain the text.

    Instances hold three attributes, all set in ``__init__``: ``api_key`` (the
    value of the ``MISTRAL_API_KEY`` environment variable), ``client`` (the
    ``Mistral`` client built from that key) and ``text_explainer`` (the
    ``TextExplainer`` used by ``generate_explanations``).
    """

    def __init__(self):
        """Initialize the PDF text extractor with Mistral AI client.

        Raises:
            ValueError: If ``MISTRAL_API_KEY`` is unset or empty.
        """
        self.api_key = os.environ.get("MISTRAL_API_KEY")
        if not self.api_key:
            raise ValueError("MISTRAL_API_KEY environment variable is required")
        self.client = Mistral(api_key=self.api_key)
        self.text_explainer = TextExplainer()

    def encode_pdf(self, pdf_path: str) -> Optional[str]:
        """
        Encode the PDF file to base64.

        Errors are reported on stdout rather than raised so that callers can
        turn a ``None`` result into a user-facing status message.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            Base64 encoded string or None if error
        """
        try:
            with open(pdf_path, "rb") as pdf_file:
                return base64.b64encode(pdf_file.read()).decode('utf-8')
        except FileNotFoundError:
            print(f"Error: The file {pdf_path} was not found.")
            return None
        except Exception as e:
            print(f"Error encoding PDF: {e}")
            return None

    def extract_text_from_pdf(self, pdf_file) -> Tuple[str, str, List[Dict[str, Any]]]:
        """
        Extract text and images from uploaded PDF using Mistral AI OCR.

        The primary source is the per-page markdown of the OCR response; a
        series of fallback heuristics is tried only when that yields no text.
        Any exception raised while processing is caught and reported through
        the status message.

        Args:
            pdf_file: Gradio file object (anything with a ``name`` attribute
                holding the path) or the path itself

        Returns:
            Tuple of (extracted_text, status_message, images_data). When the
            OCR call succeeds but no text is found, extracted_text is the
            placeholder "No text could be extracted from this PDF.". Each
            entry of images_data is a dict with the keys ``page``,
            ``image_id``, ``top_left_x``, ``top_left_y``, ``bottom_right_x``,
            ``bottom_right_y`` and ``base64``.
        """
        if pdf_file is None:
            return "", "Please upload a PDF file.", []

        try:
            # Get the file path from Gradio file object
            pdf_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file

            # Encode PDF to base64
            base64_pdf = self.encode_pdf(pdf_path)
            if base64_pdf is None:
                return "", "Failed to encode PDF file.", []

            # Process with Mistral OCR; the PDF travels inline as a data URL
            print("🔄 Processing PDF with Mistral OCR...")
            ocr_response = self.client.ocr.process(
                model="mistral-ocr-latest",
                document={
                    "type": "document_url",
                    "document_url": f"data:application/pdf;base64,{base64_pdf}",
                },
                include_image_base64=True,
            )

            self._log_response_structure(ocr_response)

            # Strategy 1: Mistral OCR specific - pages with markdown content and images
            extracted_text, extraction_method, extracted_images = self._extract_from_pages(ocr_response)

            # Try to extract images from other response structures if no images found yet
            if not extracted_images:
                extracted_images = self._extract_top_level_images(ocr_response)

            # Continue with fallback strategies for text extraction
            if not extracted_text:
                extracted_text, extraction_method = self._extract_fallback_text(ocr_response)

            print(f"🎯 Extraction method used: {extraction_method}")
            print(f"📏 Extracted text length: {len(extracted_text)} characters")
            print(f"🖼️ Extracted images: {len(extracted_images)}")

            if extracted_text:
                status = f"✅ Successfully extracted text from PDF ({len(extracted_text)} characters)"
                if extracted_images:
                    status += f" and {len(extracted_images)} image(s)"
            else:
                # generate_explanations() recognises this placeholder by its prefix
                extracted_text = "No text could be extracted from this PDF."
                status = "⚠️ OCR completed but no text was found in response."
                if extracted_images:
                    status = f"✅ Successfully extracted {len(extracted_images)} image(s) from PDF, but no text was found."
                print("❌ No extractable text found in OCR response")

            return extracted_text, status, extracted_images

        except Exception as e:
            error_msg = f"Error processing PDF: {str(e)}"
            print(error_msg)
            return "", f"❌ {error_msg}", []

    @staticmethod
    def _log_response_structure(ocr_response) -> None:
        """Print a truncated description of the OCR response for debugging."""
        print("🔍 Analyzing OCR Response Structure...")
        print(f"  Type: {type(ocr_response)}")
        print(f"  String representation: {str(ocr_response)[:500]}...")

        # Check if it's a simple object with attributes
        if hasattr(ocr_response, '__dict__'):
            print(f"  Object attributes: {list(ocr_response.__dict__.keys())}")
            for key, value in ocr_response.__dict__.items():
                print(f"    {key}: {type(value)} = {str(value)[:100]}...")

        # Check if it has commonly expected attributes
        common_attrs = ['text', 'content', 'result', 'data', 'output', 'extracted_text', 'ocr_text', 'choices', 'message']
        for attr in common_attrs:
            if hasattr(ocr_response, attr):
                value = getattr(ocr_response, attr)
                print(f"  Has '{attr}': {type(value)} = {str(value)[:100]}...")

        # Check if it's iterable but not a string
        try:
            if hasattr(ocr_response, '__iter__') and not isinstance(ocr_response, str):
                # Materialize once: counting and then re-iterating would show
                # no items for a one-shot iterable.
                items = list(ocr_response)
                print(f"  Iterable with {len(items)} items")
                for i, item in enumerate(items[:3]):  # Show first 3 items
                    print(f"    Item {i}: {type(item)} = {str(item)[:100]}...")
        except Exception as e:
            print(f"  Error checking iteration: {e}")

    @staticmethod
    def _image_record(img, page_index: int, image_id: Any) -> Dict[str, Any]:
        """Build the image dict returned to callers from one OCR image object."""
        return {
            'page': page_index,
            'image_id': image_id,
            'top_left_x': getattr(img, 'top_left_x', 0),
            'top_left_y': getattr(img, 'top_left_y', 0),
            'bottom_right_x': getattr(img, 'bottom_right_x', 0),
            'bottom_right_y': getattr(img, 'bottom_right_y', 0),
            'base64': getattr(img, 'image_base64', ''),
        }

    def _extract_from_pages(self, ocr_response) -> Tuple[str, str, List[Dict[str, Any]]]:
        """Collect per-page markdown and images; return (text, method, images)."""
        pages = getattr(ocr_response, 'pages', None)
        if not pages or not isinstance(pages, list):
            return "", "none", []

        page_texts = []
        extracted_images = []

        for i, page in enumerate(pages):
            # Extract text
            if hasattr(page, 'markdown') and page.markdown:
                page_texts.append(page.markdown)
                print(f"✅ Found text in page {i} markdown: {len(page.markdown)} characters")

            # Extract images
            if hasattr(page, 'images') and page.images:
                for j, img in enumerate(page.images):
                    # NOTE(review): a synthetic id is used here even when the
                    # image object carries its own `id`; confirm whether
                    # consumers need the id referenced inside the markdown.
                    image_data = self._image_record(img, i, f"img-{i}-{j}")
                    extracted_images.append(image_data)
                    print(f"✅ Found image in page {i}, image {j}: coordinates ({image_data['top_left_x']}, {image_data['top_left_y']}) to ({image_data['bottom_right_x']}, {image_data['bottom_right_y']})")

        if not page_texts:
            return "", "none", extracted_images

        return "\n\n".join(page_texts), f"pages_markdown_{len(page_texts)}_pages", extracted_images

    def _extract_top_level_images(self, ocr_response) -> List[Dict[str, Any]]:
        """Collect images from a response-level ``images`` attribute, if any."""
        extracted_images = []
        # Check if response has images attribute directly
        if hasattr(ocr_response, 'images') and ocr_response.images:
            for j, img in enumerate(ocr_response.images):
                image_data = self._image_record(img, 0, getattr(img, 'id', f"img-{j}"))
                extracted_images.append(image_data)
                print(f"✅ Found image {j}: coordinates ({image_data['top_left_x']}, {image_data['top_left_y']}) to ({image_data['bottom_right_x']}, {image_data['bottom_right_y']})")
        return extracted_images

    def _extract_fallback_text(self, ocr_response) -> Tuple[str, str]:
        """Return (text, method) from the first fallback strategy that yields text.

        Empty and ``None`` candidates are skipped instead of being stringified,
        so a missing value can never surface as the literal text "None", and a
        strategy that finds nothing does not prevent later ones from running.
        Returns ``("", "none")`` when every strategy comes up empty.
        """
        for value, method in self._fallback_candidates(ocr_response):
            if value:
                return str(value), method
        return "", "none"

    @staticmethod
    def _fallback_candidates(ocr_response):
        """Lazily yield (candidate_value, method_name) pairs in priority order."""
        # Strategy 2: Direct text attribute (fallback)
        if hasattr(ocr_response, 'text'):
            yield ocr_response.text, "direct_text_attribute"

        # Strategy 3: Content attribute (fallback)
        content = getattr(ocr_response, 'content', None)
        if content:
            if isinstance(content, str):
                yield content, "content_attribute_string"
            elif hasattr(content, 'text'):
                yield content.text, "content_text_attribute"
            else:
                yield content, "content_attribute_converted"

        # Strategy 4: Result attribute (fallback)
        result = getattr(ocr_response, 'result', None)
        if result:
            if isinstance(result, str):
                yield result, "result_string"
            elif hasattr(result, 'text'):
                yield result.text, "result_text_attribute"
            elif isinstance(result, dict) and 'text' in result:
                yield result['text'], "result_dict_text"
            else:
                yield result, "result_converted"

        # Strategy 5: Choices attribute (ChatGPT-style response - fallback)
        choices = getattr(ocr_response, 'choices', None)
        if choices and isinstance(choices, list):
            choice = choices[0]
            if hasattr(choice, 'message') and hasattr(choice.message, 'content'):
                yield choice.message.content, "choices_message_content"
            elif hasattr(choice, 'text'):
                yield choice.text, "choices_text"
            else:
                yield choice, "choices_converted"

        # Strategy 6: Dict-like access (fallback); plain dicts also expose .get
        if hasattr(ocr_response, 'get'):
            for key in ['text', 'content', 'result', 'extracted_text', 'ocr_text', 'output']:
                yield ocr_response.get(key), f"dict_key_{key}"

        # Strategy 7: Inspect all attributes for string-like content (fallback)
        if hasattr(ocr_response, '__dict__'):
            for key, value in ocr_response.__dict__.items():
                if isinstance(value, str) and len(value) > 20:  # Likely text content
                    yield value, f"attribute_{key}"
                elif hasattr(value, 'text') and isinstance(value.text, str):
                    yield value.text, f"nested_text_in_{key}"

        # Strategy 8: Convert entire response to string if it seems to contain text (fallback)
        response_str = str(ocr_response)
        if len(response_str) > 50 and not response_str.startswith('<'):  # Not an object reference
            yield response_str, "full_response_string"

    def generate_explanations(self, extracted_text: str) -> str:
        """
        Generate explanations for the extracted text sections.

        Exceptions raised by the explainer are caught and returned as an
        error message instead of propagating.

        Args:
            extracted_text: The extracted text from PDF

        Returns:
            Formatted explanations for all sections, or a short message
            describing why none could be produced
        """
        try:
            if not extracted_text or not extracted_text.strip():
                return "No text available to explain."

            # Placeholder produced by extract_text_from_pdf() when OCR found nothing
            if extracted_text.startswith("No text could be extracted"):
                return "Cannot generate explanations - no text was extracted from the PDF."

            print("🤖 Generating explanations for extracted text...")
            explained_sections = self.text_explainer.explain_all_sections(extracted_text)

            if not explained_sections:
                return "No sections found to explain in the extracted text."

            return self.text_explainer.format_explanations_for_display(explained_sections)

        except Exception as e:
            error_msg = f"Error generating explanations: {str(e)}"
            print(error_msg)
            return f"❌ {error_msg}"