# app.py - PaddleOCR Gradio app; PyMuPDF compatibility patches are applied before PaddleOCR is imported

import os
import subprocess
import sys
import tempfile
import time
import base64
import json

# Best-effort installer for the legacy libssl.so.1.1 runtime library
def fix_ssl_library():
    """Make ``libssl.so.1.1`` available, installing it if it is missing.

    Steps, in order:

    1. Return at once if the library already exists at the system path.
    2. Download the Ubuntu ``libssl1.1`` package with ``wget`` and try to
       install it with ``dpkg -i``.
    3. If ``dpkg -i`` exits non-zero, unpack the package under
       ``/tmp/ssl_extract``, prepend the unpacked library directory to
       ``LD_LIBRARY_PATH`` and preload the libraries into this process.

    Returns:
        bool: ``True`` if the library was already present or the steps above
        ran without raising; ``False`` if any step raised (the error is
        printed, never propagated).
    """
    import ctypes  # local import: only needed for the fallback preload

    try:
        if os.path.exists('/usr/lib/x86_64-linux-gnu/libssl.so.1.1'):
            print("libssl.so.1.1 already exists")
            return True

        print("Attempting to install libssl1.1...")

        # NOTE(review): the package is fetched over plain HTTP and installed
        # without a checksum or signature check - consider pinning a hash.
        subprocess.run([
            'wget', '-q',
            'http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb',
            '-O', '/tmp/libssl1.1.deb'
        ], check=True)

        # No check=True here: a failed install is expected without root and
        # is handled by the extraction fallback below.
        result = subprocess.run([
            'dpkg', '-i', '/tmp/libssl1.1.deb'
        ], capture_output=True, text=True)

        if result.returncode != 0:
            print("dpkg install failed, trying manual extraction...")
            subprocess.run([
                'dpkg', '-x', '/tmp/libssl1.1.deb', '/tmp/ssl_extract'
            ], check=True)

            lib_path = '/tmp/ssl_extract/usr/lib/x86_64-linux-gnu'
            current_ld_path = os.environ.get('LD_LIBRARY_PATH', '')
            if current_ld_path:
                os.environ['LD_LIBRARY_PATH'] = f"{lib_path}:{current_ld_path}"
            else:
                os.environ['LD_LIBRARY_PATH'] = lib_path
            print(f"Set LD_LIBRARY_PATH to: {os.environ['LD_LIBRARY_PATH']}")

            # The dynamic loader reads LD_LIBRARY_PATH when a process starts,
            # so the assignment above reaches child processes but not this
            # interpreter. Preload the libraries by absolute path so that
            # later imports needing these sonames can reuse the loaded
            # copies. libcrypto goes first because libssl depends on it.
            for lib_name in ('libcrypto.so.1.1', 'libssl.so.1.1'):
                lib_file = os.path.join(lib_path, lib_name)
                try:
                    ctypes.CDLL(lib_file, mode=ctypes.RTLD_GLOBAL)
                    print(f"Preloaded {lib_file}")
                except OSError as load_error:
                    # Best effort: keep going and let the caller's import
                    # attempt report the real failure, as before.
                    print(f"Could not preload {lib_file}: {load_error}")

        return True

    except Exception as e:
        print(f"Failed to install libssl1.1: {e}")
        return False

# PyMuPDF compatibility shim: must run before PaddleOCR is imported (it imports fitz itself in order to patch it)
def monkey_patch_pymupdf():
    """Add legacy camelCase aliases to PyMuPDF classes when they are missing.

    The following names are installed only if the class does not already
    define them:

    * ``fitz.Document.pageCount`` - read-only property returning
      ``page_count``.
    * ``fitz.Page.getPixmap`` - forwards to ``get_pixmap``.
    * ``fitz.Page.getText`` - forwards to ``get_text``.

    Both method aliases pass any additional keyword arguments straight
    through, so callers that supply options beyond ``matrix``/``alpha`` or
    ``option`` are not rejected by the shim itself.

    The function prints one status line per alias and is safe to call more
    than once: aliases that already exist are left untouched.
    """
    print("Applying PyMuPDF compatibility patches...")

    # Imported here so the aliases land on the same module object that any
    # other `import fitz` in the process receives.
    import fitz

    # Document.pageCount -> Document.page_count
    if not hasattr(fitz.Document, 'pageCount'):
        def pageCount_property(self):
            return self.page_count

        fitz.Document.pageCount = property(pageCount_property)
        print("✓ Added pageCount compatibility property to PyMuPDF Document class")
    else:
        print("✓ pageCount already exists")

    # Page.getPixmap -> Page.get_pixmap
    if not hasattr(fitz.Page, 'getPixmap'):
        def getPixmap(self, matrix=None, alpha=True, **kwargs):
            # matrix/alpha keep the defaults this shim has always used;
            # everything else is handed to get_pixmap unchanged.
            return self.get_pixmap(matrix=matrix, alpha=alpha, **kwargs)

        fitz.Page.getPixmap = getPixmap
        print("✓ Added getPixmap compatibility method to PyMuPDF Page class")
    else:
        print("✓ getPixmap already exists")

    # Page.getText -> Page.get_text
    if not hasattr(fitz.Page, 'getText'):
        def getText(self, option="text", **kwargs):
            return self.get_text(option, **kwargs)

        fitz.Page.getText = getText
        print("✓ Added getText compatibility method to PyMuPDF Page class")
    else:
        print("✓ getText already exists")

    print("✓ PyMuPDF compatibility patches applied successfully")

def try_paddle_import():
    """Import and return the ``PaddleOCR`` class, applying workarounds first.

    Before the import this runs ``fix_ssl_library()`` (its result is
    ignored) and ``monkey_patch_pymupdf()``, then sets
    ``PADDLE_GIT_DISABLE=1`` in the environment.

    If the import fails with an ``ImportError`` whose message mentions
    ``libssl.so.1.1``, ``paddlepaddle`` is reinstalled at version 2.4.2 with
    pip and the import is retried once.

    Returns:
        The ``PaddleOCR`` class from the ``paddleocr`` package.

    Raises:
        ImportError: The original import error, when the import failed and
            the reinstall fallback did not apply or did not succeed.
        Exception: Anything raised by ``monkey_patch_pymupdf()`` propagates
            unchanged, since it runs outside the ``try`` block.
    """
    import importlib

    # First try the SSL fix
    fix_ssl_library()

    # Apply PyMuPDF compatibility patches BEFORE importing PaddleOCR
    monkey_patch_pymupdf()

    # Must be in the environment before paddle is imported.
    # NOTE(review): what this variable controls is not visible in this file.
    os.environ['PADDLE_GIT_DISABLE'] = '1'

    try:
        from paddleocr import PaddleOCR
        return PaddleOCR
    except ImportError as e:
        if 'libssl.so.1.1' in str(e):
            print("Still having SSL issues, trying alternative PaddlePaddle version...")

            try:
                # Uninstall is best-effort (no check=True): the package may
                # simply not be installed.
                subprocess.run([sys.executable, '-m', 'pip', 'uninstall', 'paddlepaddle', '-y'],
                             capture_output=True)
                subprocess.run([sys.executable, '-m', 'pip', 'install', 'paddlepaddle==2.4.2'],
                             check=True)

                # The failed import may have left fully-initialised
                # submodules of the old install in sys.modules; drop them so
                # the retry loads everything from the new install.
                for module_name in list(sys.modules):
                    if module_name.split('.')[0] in ('paddle', 'paddleocr'):
                        del sys.modules[module_name]
                # Packages were installed while the interpreter was running,
                # so the import finders' directory caches must be refreshed.
                importlib.invalidate_caches()

                from paddleocr import PaddleOCR
                return PaddleOCR
            except Exception as inner_e:
                print(f"Failed to install alternative version: {inner_e}")

        print(f"PaddleOCR import failed: {e}")
        # Bare raise re-raises the original ImportError with its traceback.
        raise

# Import Gradio
import gradio as gr

# Import PyMuPDF at module level; the compatibility patches are applied later, inside try_paddle_import(), and modify these same fitz classes in place
import fitz  # This import will use the patched version

# Build the single module-level OCR engine used by the request handlers.
# Any failure here (import or model loading) is fatal for the app.
print("Attempting to import PaddleOCR...")
try:
    PaddleOCR = try_paddle_import()
    print("Loading PaddleOCR models...")
    ocr = PaddleOCR(lang='en', use_angle_cls=True, show_log=False)
    print("PaddleOCR models loaded successfully!")
except Exception as load_error:
    # Report the cause, then terminate with a non-zero exit status.
    print(f"Failed to load PaddleOCR: {load_error}")
    print("Application will exit - compatibility issue not resolved")
    raise SystemExit(1)

# Startup self-test: open a minimal in-memory PDF and report whether the
# pageCount compatibility alias is visible on a real Document instance.
# The outcome is only printed; a failure never stops the app.
print("Testing monkey patch...")
test_doc = None
try:
    # NOTE(review): io is not used in this block; the import is kept because
    # it binds a module-level name.
    import io
    # Hand-written single-page PDF (one empty 612x792 page).
    pdf_content = b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>\nendobj\nxref\n0 4\n0000000000 65535 f \n0000000010 00000 n \n0000000053 00000 n \n0000000100 00000 n \ntrailer\n<< /Size 4 /Root 1 0 R >>\nstartxref\n179\n%%EOF"
    test_doc = fitz.open(stream=pdf_content, filetype="pdf")

    if hasattr(test_doc, 'pageCount'):
        print(f"✓ Monkey patch successful! pageCount = {test_doc.pageCount}")
    else:
        print("✗ Monkey patch failed - pageCount not found")
        print(f"Available attributes: {[attr for attr in dir(test_doc) if 'count' in attr.lower()]}")
except Exception as e:
    print(f"Monkey patch test failed: {e}")
finally:
    # Close exactly once on every path. Compare against None rather than
    # relying on the document's truthiness.
    if test_doc is not None:
        try:
            test_doc.close()
        except Exception as close_error:
            # A failed close must not abort startup.
            print(f"Monkey patch test failed: {close_error}")
# Request handlers (file upload and API-style requests) and the Gradio interface
def _count_pdf_pages(file_path):
    """Return the page count of the PDF at *file_path*, or 1 if it cannot be read."""
    try:
        print(f"Opening PDF: {file_path}")
        doc = fitz.open(file_path)
        try:
            # Diagnostic output showing which page-count API is available.
            print(f"Document has pageCount attribute: {hasattr(doc, 'pageCount')}")
            print(f"Document has page_count attribute: {hasattr(doc, 'page_count')}")

            if hasattr(doc, 'pageCount'):
                total_pages = doc.pageCount
                print(f"Used pageCount: {total_pages}")
            elif hasattr(doc, 'page_count'):
                total_pages = doc.page_count
                print(f"Used page_count: {total_pages}")
            else:
                total_pages = len(doc)
                print(f"Used len(): {total_pages}")
            return total_pages
        finally:
            # Release the document even if probing the attributes raised.
            doc.close()
    except Exception as e:
        print(f"PDF page counting error: {e}")
        return 1


def process_document(file):
    """Run PaddleOCR on an uploaded file and build the three UI outputs.

    Args:
        file: The upload to process. Either an object exposing the file's
            path as ``.name`` or a plain path string. ``None`` means nothing
            was uploaded.

    Returns:
        tuple[str, str, str]: ``(summary, text, api_response)`` where
        ``summary`` is a Markdown summary, ``text`` is the recognised text
        (one line per kept OCR line) and ``api_response`` is a JSON string.
        For ``None`` input the result is ``("No file uploaded", "", "")``.
        If processing raises, the result is an error message, an empty
        string and a JSON string with ``"success": false``.
    """
    if file is None:
        return "No file uploaded", "", ""

    start_time = time.time()

    try:
        # Accept both an upload object with a .name path and a bare path.
        file_path = file if isinstance(file, str) else file.name
        filename = os.path.basename(file_path)
        print(f"Processing: {filename}")
        print(f"File path: {file_path}")

        # Only PDFs can have more than one page; anything else counts as 1.
        total_pages = 1
        if filename.lower().endswith('.pdf'):
            total_pages = _count_pdf_pages(file_path)

        # Run OCR
        print(f"Running OCR on: {file_path}")
        result = ocr.ocr(file_path, cls=True)

        # Collect recognised lines. Each line is indexed as
        # line[1] == (text, confidence); lines at or below 0.5 confidence
        # are dropped, and pages with no result are not counted.
        kept_lines = []
        pages_processed = 0
        for page_result in result or []:
            if not page_result:
                continue
            pages_processed += 1
            for line in page_result:
                if len(line) >= 2 and line[1][1] > 0.5:
                    kept_lines.append(line[1][0])
        extracted_text = "".join(text + "\n" for text in kept_lines)

        processing_time = time.time() - start_time

        summary = f"""
📄 **File**: {filename}
📊 **Pages Processed**: {pages_processed}/{total_pages}
⏱️ **Processing Time**: {processing_time:.2f} seconds
📝 **Text Length**: {len(extracted_text)} characters
🔧 **OCR Engine**: PaddleOCR
        """

        api_response = json.dumps({
            "success": True,
            "text": extracted_text,
            "filename": filename,
            "pages_processed": pages_processed,
            "total_pages": total_pages,
            "processing_time": processing_time,
            "ocr_engine": "PaddleOCR"
        }, indent=2)

        return summary, extracted_text, api_response

    except Exception as e:
        # Handler boundary: report the failure to the UI instead of raising.
        error_msg = f"Error processing file: {str(e)}"
        print(f"Full error: {e}")
        import traceback
        traceback.print_exc()
        return error_msg, "", json.dumps({"success": False, "error": str(e)})

def process_api_request(api_data):
    """Run OCR on a base64-encoded file described by a JSON request.

    Args:
        api_data: JSON text (or an already-decoded ``dict``) with the keys
            ``"file"`` (base64-encoded file content, required) and
            ``"filename"`` (optional, defaults to ``"unknown.pdf"``). Only
            the filename's extension is used, as the temp-file suffix.

    Returns:
        str: A JSON string. On success it holds ``success``, ``text``,
        ``filename`` and ``ocr_engine``. Every failure - malformed JSON,
        missing ``"file"``, bad base64, OCR errors - yields
        ``{"success": false, "error": <message>}`` rather than raising.
    """
    try:
        data = api_data if isinstance(api_data, dict) else json.loads(api_data)

        if 'file' not in data:
            return json.dumps({"success": False, "error": "No file data provided"})

        # Decode base64 file
        file_data = base64.b64decode(data['file'])
        filename = data.get('filename', 'unknown.pdf')
        suffix = os.path.splitext(filename)[1]

        # delete=False because the OCR engine opens the file by path after
        # this handle is closed; removal is done in the finally below.
        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
        tmp_file_path = tmp_file.name
        try:
            with tmp_file:
                tmp_file.write(file_data)

            # Run OCR
            result = ocr.ocr(tmp_file_path, cls=True)

            # Each line is indexed as line[1][0] == text. An empty or
            # missing result produces empty text instead of an error.
            text = "".join(
                line[1][0] + "\n"
                for page_result in result or []
                if page_result
                for line in page_result
                if len(line) >= 2
            )

            return json.dumps({
                "success": True,
                "text": text,
                "filename": filename,
                "ocr_engine": "PaddleOCR"
            })

        finally:
            # Covers write failures as well as OCR failures.
            os.unlink(tmp_file_path)

    except Exception as e:
        return json.dumps({"success": False, "error": str(e)})

# Build the Gradio UI. `demo` has three tabs: a file-upload tab wired to
# process_document, an API tab with static examples plus a manual test form
# wired to process_api_request, and a static About tab.
with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
    gr.Markdown("# 🏥 PaddleOCR Medical Document Processor")
    gr.Markdown("Upload medical documents (PDF/images) to extract text using PaddleOCR")
    
    # Tab 1: upload a file and display the OCR summary and extracted text.
    with gr.Tab("📄 File Upload"):
        with gr.Row():
            with gr.Column():
                file_input = gr.File(
                    label="Upload Document (PDF, JPG, PNG)",
                    file_types=[".pdf", ".jpg", ".jpeg", ".png"]
                )
                process_btn = gr.Button("🔍 Process Document", variant="primary")
            
            with gr.Column():
                summary_output = gr.Markdown(label="📊 Processing Summary")
        
        with gr.Row():
            text_output = gr.Textbox(
                label="📝 Extracted Text",
                lines=15,
                max_lines=20
            )
        
        # process_document returns three values (summary, text, JSON string).
        # The JSON string is routed to a hidden Textbox created inline here,
        # so it is produced but not displayed in this tab.
        process_btn.click(
            fn=process_document,
            inputs=[file_input],
            outputs=[summary_output, text_output, gr.Textbox(visible=False)]
        )
    
    # Tab 2: static integration notes plus a form for trying a request by hand.
    with gr.Tab("🔌 API Integration"):
        gr.Markdown("### For integration with your Vercel app:")
        # NOTE(review): this endpoint URL is hard-coded here and again in the
        # About tab - keep the two in sync.
        gr.Markdown("**Endpoint**: `https://mbuck17-paddleocr-processor.hf.space/api/predict`")
        gr.Markdown("**Method**: POST")
        gr.Markdown("**Headers**: `Content-Type: application/json`")
        
        with gr.Row():
            with gr.Column():
                gr.Markdown("**Sample Request:**")
                # NOTE(review): the sample shows a JSON object inside "data",
                # while process_api_request calls json.loads on its input,
                # i.e. it expects JSON text - confirm what clients must send.
                gr.Code('''
{
  "data": [
    {
      "file": "base64_encoded_file_data_here",
      "filename": "lab_report.pdf"
    }
  ]
}
                ''', language="json")
            
            with gr.Column():
                gr.Markdown("**Sample Response:**")
                # NOTE(review): process_api_request returns a JSON string, so
                # the item inside "data" is presumably a string rather than
                # the object shown - verify against a live response.
                gr.Code('''
{
  "data": [
    {
      "success": true,
      "text": "Extracted text content...",
      "filename": "lab_report.pdf",
      "ocr_engine": "PaddleOCR"
    }
  ]
}
                ''', language="json")
        
        gr.Markdown("### Test API Request:")
        # Free-text box: the user pastes the request JSON as text.
        api_input = gr.Textbox(
            label="API Request (JSON)",
            placeholder='{"file": "base64_encoded_file_data", "filename": "document.pdf"}',
            lines=5
        )
        api_btn = gr.Button("🧪 Test API Request")
        api_output = gr.Textbox(
            label="API Response (JSON)",
            lines=10
        )
        
        # The text of api_input is passed to process_api_request and the JSON
        # string it returns is shown in api_output.
        api_btn.click(
            fn=process_api_request,
            inputs=[api_input],
            outputs=[api_output]
        )
    
    # Tab 3: static description of the service.
    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
        ### 🎯 Purpose
        This service extracts text from medical documents using PaddleOCR, specifically designed for lab reports and medical forms.
        
        ### 🔧 Integration
        This Hugging Face Space can be integrated with your Vercel app as an external OCR service.
        
        ### 📚 Supported Formats
        - PDF documents (multi-page)
        - JPEG/JPG images
        - PNG images
        
        ### 🚀 Features
        - High accuracy OCR with PaddleOCR
        - Medical document optimization
        - Multi-page PDF support
        - RESTful API integration
        - Free hosting on Hugging Face
        - SSL compatibility fixes included
        
        ### 🔗 Integration URL
        `https://mbuck17-paddleocr-processor.hf.space/api/predict`
        """)

# Start the Gradio server only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    # Bind to all interfaces on port 7860.
    demo.launch(
        server_port=7860,
        server_name="0.0.0.0",
    )