Spaces:
Sleeping
Sleeping
feat: Refactor PDF Text Extractor application structure
Browse files
- Introduced a modular architecture by separating the application into distinct modules: app.py, ui, and utils.
- Implemented a main function in app.py to handle application launch and configuration.
- Added environment variable loading and API key validation.
- Created a .env.example file for environment variable setup guidance.
- Enhanced the UI components and handlers for better user interaction.
- Developed a comprehensive PDF text extraction utility using Mistral AI.
- Added tests for OCR functionality and setup validation.
- Updated .gitignore to exclude environment files and unnecessary artifacts.
- .env.example +5 -0
- .gitignore +47 -0
- app.py +33 -5
- main.py +15 -0
- pdf_text_extractor.py +254 -0
- requirements.txt +0 -0
- tests/test_ocr_direct.py +234 -0
- tests/test_setup.py +62 -0
- ui/__init__.py +15 -0
- ui/components.py +125 -0
- ui/handlers.py +104 -0
- ui/interface.py +177 -0
- utils/__init__.py +4 -0
- utils/config.py +40 -0
- utils/pdf_image_extractor.py +155 -0
.env.example
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Environment variables for PDF Explainer
|
| 2 |
+
# Copy this file to .env and fill in your actual API key
|
| 3 |
+
|
| 4 |
+
# Mistral AI API Key - Get yours from https://console.mistral.ai/
|
| 5 |
+
MISTRAL_API_KEY=your_mistral_api_key_here
|
.gitignore
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Environment variables
|
| 2 |
+
**/.env
|
| 3 |
+
|
| 4 |
+
# Python cache
|
| 5 |
+
**/__pycache__/
|
| 6 |
+
*.py[cod]
|
| 7 |
+
*$py.class
|
| 8 |
+
|
| 9 |
+
# Virtual environment
|
| 10 |
+
**/.venv
|
| 11 |
+
.venv/
|
| 12 |
+
venv/
|
| 13 |
+
env/
|
| 14 |
+
|
| 15 |
+
# IDE files
|
| 16 |
+
.vscode/settings.json
|
| 17 |
+
.idea/
|
| 18 |
+
|
| 19 |
+
# OS files
|
| 20 |
+
.DS_Store
|
| 21 |
+
Thumbs.db
|
| 22 |
+
|
| 23 |
+
# Gradio temporary files
|
| 24 |
+
gradio_cached_examples/
|
| 25 |
+
flagged/
|
| 26 |
+
|
| 27 |
+
# Log files
|
| 28 |
+
*.log
|
| 29 |
+
|
| 30 |
+
# Distribution / packaging
|
| 31 |
+
.Python
|
| 32 |
+
build/
|
| 33 |
+
develop-eggs/
|
| 34 |
+
dist/
|
| 35 |
+
downloads/
|
| 36 |
+
eggs/
|
| 37 |
+
.eggs/
|
| 38 |
+
lib/
|
| 39 |
+
lib64/
|
| 40 |
+
parts/
|
| 41 |
+
sdist/
|
| 42 |
+
var/
|
| 43 |
+
wheels/
|
| 44 |
+
*.egg-info/
|
| 45 |
+
.installed.cfg
|
| 46 |
+
*.egg
|
| 47 |
+
MANIFEST
|
app.py
CHANGED
|
@@ -1,7 +1,35 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PDF Text Extractor Application
|
| 3 |
+
Main entry point for the PDF Text Extractor application.
|
| 4 |
+
"""
|
| 5 |
|
| 6 |
+
import os
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
from ui import create_interface
|
| 9 |
+
from utils.config import check_api_key, get_app_config
|
| 10 |
|
| 11 |
+
def main():
    """Start the PDF Text Extractor web application.

    Steps, in order:
      1. Load variables from a local ``.env`` file into the environment.
      2. Run ``check_api_key()`` (defined in ``utils.config``; presumably it
         validates that the Mistral key is present -- confirm there).
      3. Build the UI via ``create_interface()``.
      4. Read launch settings from ``get_app_config()`` and start the server.
    """
    load_dotenv()
    check_api_key()

    demo = create_interface()
    settings = get_app_config()

    # Only these four settings are forwarded to ``launch``; each must be
    # present in the config mapping.
    launch_options = {
        option: settings[option]
        for option in ("server_port", "debug", "quiet", "max_file_size")
    }
    demo.launch(**launch_options)


if __name__ == "__main__":
    main()
|
main.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
Legacy launcher for the PDF Text Extractor (Gradio + Mistral AI OCR).

This module exists only so that ``python main.py`` keeps working: it
re-exports ``main`` from ``app`` and calls it when executed as a script.
The actual implementation lives in app.py and the ui/ and utils/ packages.
"""

# Re-export the entry point from the modular implementation.
from app import main


# Launch the application only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
pdf_text_extractor.py
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import os
|
| 3 |
+
from typing import Optional, Tuple, List, Dict, Any
|
| 4 |
+
from mistralai import Mistral
|
| 5 |
+
|
| 6 |
+
class PDFTextExtractor:
    """Extract text and embedded images from PDF files using Mistral AI OCR.

    The PDF is sent inline (as a base64 ``data:`` URL) to the
    ``mistral-ocr-latest`` model.  Text is read primarily from
    ``response.pages[*].markdown``; if that yields nothing, a series of
    best-effort fallbacks probes other response shapes.  Progress and
    diagnostics are written to stdout with ``print``.
    """

    # Attributes reported by the diagnostic dump when present on the response.
    _DEBUG_ATTRS = ('text', 'content', 'result', 'data', 'output',
                    'extracted_text', 'ocr_text', 'choices', 'message')

    # Keys probed, in order, when the response offers dict-style ``get``.
    _DICT_TEXT_KEYS = ('text', 'content', 'result', 'extracted_text',
                       'ocr_text', 'output')

    def __init__(self):
        """Create the Mistral client from the ``MISTRAL_API_KEY`` variable.

        Raises:
            ValueError: If ``MISTRAL_API_KEY`` is unset or empty.
        """
        self.api_key = os.environ.get("MISTRAL_API_KEY")
        if not self.api_key:
            raise ValueError("MISTRAL_API_KEY environment variable is required")
        self.client = Mistral(api_key=self.api_key)

    def encode_pdf(self, pdf_path: str) -> Optional[str]:
        """
        Encode the PDF file to base64.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            Base64 encoded string, or None if the file could not be read
            (the reason is printed to stdout).
        """
        try:
            with open(pdf_path, "rb") as pdf_file:
                return base64.b64encode(pdf_file.read()).decode('utf-8')
        except FileNotFoundError:
            print(f"Error: The file {pdf_path} was not found.")
            return None
        except Exception as e:
            # Deliberately broad: callers rely on ``None`` for any failure.
            print(f"Error encoding PDF: {e}")
            return None

    def extract_text_from_pdf(self, pdf_file) -> Tuple[str, str, List[Dict[str, Any]]]:
        """
        Extract text and images from uploaded PDF using Mistral AI OCR.

        Args:
            pdf_file: A path string, or an object exposing the path as
                ``.name`` (e.g. a Gradio file object).

        Returns:
            Tuple of (extracted_text, status_message, images_data).  Each
            entry of ``images_data`` is a dict with the keys ``page``,
            ``image_id``, ``top_left_x``, ``top_left_y``, ``bottom_right_x``,
            ``bottom_right_y`` and ``base64``.  Failures never raise: they
            are reported through ``status_message`` with empty text/images.
        """
        if pdf_file is None:
            return "", "Please upload a PDF file.", []

        try:
            # Accept either a plain path or an object carrying it in ``.name``.
            pdf_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file

            base64_pdf = self.encode_pdf(pdf_path)
            if base64_pdf is None:
                return "", "Failed to encode PDF file.", []

            print("🔄 Processing PDF with Mistral OCR...")
            ocr_response = self.client.ocr.process(
                model="mistral-ocr-latest",
                document={
                    "type": "document_url",
                    "document_url": f"data:application/pdf;base64,{base64_pdf}"
                },
                include_image_base64=True
            )

            self._log_response_structure(ocr_response)

            # Strategy 1: per-page markdown and images.
            extracted_text, extraction_method, extracted_images = (
                self._extract_from_pages(ocr_response)
            )

            # Images attached directly to the response, if pages had none.
            if not extracted_images:
                extracted_images = self._extract_top_level_images(ocr_response)

            # Strategies 2-8: only consulted when the pages yielded no text.
            if not extracted_text:
                extracted_text, extraction_method = (
                    self._extract_fallback_text(ocr_response)
                )

            print(f"🎯 Extraction method used: {extraction_method}")
            print(f"📏 Extracted text length: {len(extracted_text)} characters")
            print(f"🖼️ Extracted images: {len(extracted_images)}")

            if extracted_text:
                status = f"✅ Successfully extracted text from PDF ({len(extracted_text)} characters)"
                if extracted_images:
                    status += f" and {len(extracted_images)} image(s)"
            else:
                extracted_text = "No text could be extracted from this PDF."
                status = "⚠️ OCR completed but no text was found in response."
                if extracted_images:
                    status = f"✅ Successfully extracted {len(extracted_images)} image(s) from PDF, but no text was found."
                print("❌ No extractable text found in OCR response")

            return extracted_text, status, extracted_images

        except Exception as e:
            # Top-level boundary for the UI: report instead of propagating.
            error_msg = f"Error processing PDF: {str(e)}"
            print(error_msg)
            return "", f"❌ {error_msg}", []

    @classmethod
    def _log_response_structure(cls, ocr_response) -> None:
        """Print a diagnostic summary of the OCR response's shape."""
        print("🔍 Analyzing OCR Response Structure...")
        print(f" Type: {type(ocr_response)}")
        print(f" String representation: {str(ocr_response)[:500]}...")

        if hasattr(ocr_response, '__dict__'):
            print(f" Object attributes: {list(ocr_response.__dict__.keys())}")
            for key, value in ocr_response.__dict__.items():
                print(f" {key}: {type(value)} = {str(value)[:100]}...")

        for attr in cls._DEBUG_ATTRS:
            if hasattr(ocr_response, attr):
                value = getattr(ocr_response, attr)
                print(f" Has '{attr}': {type(value)} = {str(value)[:100]}...")

        try:
            if hasattr(ocr_response, '__iter__') and not isinstance(ocr_response, str):
                # Materialize once: counting and then re-iterating would show
                # nothing for a one-shot iterable.
                items = list(ocr_response)
                print(f" Iterable with {len(items)} items")
                for i, item in enumerate(items[:3]):  # Show first 3 items
                    print(f" Item {i}: {type(item)} = {str(item)[:100]}...")
        except Exception as e:
            print(f" Error checking iteration: {e}")

    @staticmethod
    def _image_record(img, page: int, image_id: str) -> Dict[str, Any]:
        """Build the image dict returned to callers from one OCR image object."""
        return {
            'page': page,
            'image_id': image_id,
            'top_left_x': getattr(img, 'top_left_x', 0),
            'top_left_y': getattr(img, 'top_left_y', 0),
            'bottom_right_x': getattr(img, 'bottom_right_x', 0),
            'bottom_right_y': getattr(img, 'bottom_right_y', 0),
            'base64': getattr(img, 'image_base64', '')
        }

    @classmethod
    def _extract_from_pages(cls, ocr_response) -> Tuple[str, str, List[Dict[str, Any]]]:
        """Collect markdown text and images from ``response.pages`` (Strategy 1).

        Returns:
            ``(text, method, images)``; ``text`` is ``""`` and ``method`` is
            ``"none"`` when no page carried markdown.
        """
        images: List[Dict[str, Any]] = []
        pages = getattr(ocr_response, 'pages', None)
        if not (isinstance(pages, list) and pages):
            return "", "none", images

        page_texts = []
        for i, page in enumerate(pages):
            markdown = getattr(page, 'markdown', None)
            if markdown:
                page_texts.append(markdown)
                print(f"✅ Found text in page {i} markdown: {len(markdown)} characters")

            for j, img in enumerate(getattr(page, 'images', None) or []):
                image_data = cls._image_record(img, page=i, image_id=f"img-{i}-{j}")
                images.append(image_data)
                print(f"✅ Found image in page {i}, image {j}: coordinates ({image_data['top_left_x']}, {image_data['top_left_y']}) to ({image_data['bottom_right_x']}, {image_data['bottom_right_y']})")

        if not page_texts:
            return "", "none", images
        return "\n\n".join(page_texts), f"pages_markdown_{len(page_texts)}_pages", images

    @classmethod
    def _extract_top_level_images(cls, ocr_response) -> List[Dict[str, Any]]:
        """Collect images attached directly to the response (reported as page 0)."""
        images = []
        for j, img in enumerate(getattr(ocr_response, 'images', None) or []):
            image_data = cls._image_record(
                img, page=0, image_id=getattr(img, 'id', f"img-{j}")
            )
            images.append(image_data)
            print(f"✅ Found image {j}: coordinates ({image_data['top_left_x']}, {image_data['top_left_y']}) to ({image_data['bottom_right_x']}, {image_data['bottom_right_y']})")
        return images

    @classmethod
    def _fallback_candidates(cls, ocr_response):
        """Yield ``(value, method_name)`` candidates for Strategies 2-8, in order.

        Values are yielded raw; the consumer decides whether one is usable.
        Being a generator, later probes run only if earlier ones are rejected.
        """
        # Strategy 2: direct text attribute
        yield getattr(ocr_response, 'text', None), "direct_text_attribute"

        # Strategy 3: content attribute
        content = getattr(ocr_response, 'content', None)
        if content:
            if isinstance(content, str):
                yield content, "content_attribute_string"
            elif hasattr(content, 'text'):
                yield content.text, "content_text_attribute"
            else:
                yield content, "content_attribute_converted"

        # Strategy 4: result attribute
        result = getattr(ocr_response, 'result', None)
        if result is not None:
            if isinstance(result, str):
                yield result, "result_string"
            elif hasattr(result, 'text'):
                yield result.text, "result_text_attribute"
            elif isinstance(result, dict) and 'text' in result:
                yield result['text'], "result_dict_text"
            else:
                yield result, "result_converted"

        # Strategy 5: choices attribute (chat-completion style response)
        choices = getattr(ocr_response, 'choices', None)
        if isinstance(choices, list) and choices:
            choice = choices[0]
            if hasattr(choice, 'message') and hasattr(choice.message, 'content'):
                yield choice.message.content, "choices_message_content"
            elif hasattr(choice, 'text'):
                yield choice.text, "choices_text"
            else:
                yield choice, "choices_converted"

        # Strategy 6: dict-like access
        getter = getattr(ocr_response, 'get', None)
        if callable(getter):
            for key in cls._DICT_TEXT_KEYS:
                yield getter(key), f"dict_key_{key}"

        # Strategy 7: scan attributes for string-like content
        for key, value in getattr(ocr_response, '__dict__', {}).items():
            if isinstance(value, str) and len(value) > 20:  # Likely text content
                yield value, f"attribute_{key}"
            elif isinstance(getattr(value, 'text', None), str):
                yield value.text, f"nested_text_in_{key}"

        # Strategy 8: whole response as a string, unless it looks like an
        # object reference such as "<Foo object at 0x...>".
        response_str = str(ocr_response)
        if len(response_str) > 50 and not response_str.startswith('<'):
            yield response_str, "full_response_string"

    @classmethod
    def _extract_fallback_text(cls, ocr_response) -> Tuple[str, str]:
        """Return ``(text, method)`` from the first fallback yielding real text.

        A candidate is accepted only if it is truthy, so a present-but-empty
        attribute (``None``, ``""``, ``[]``) neither becomes the literal text
        ``"None"`` nor prevents the remaining strategies from being tried.
        """
        for value, method in cls._fallback_candidates(ocr_response):
            if value:
                return str(value), method
        return "", "none"
|
| 254 |
+
|
requirements.txt
ADDED
|
File without changes
|
tests/test_ocr_direct.py
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Quick OCR Test Script
|
| 3 |
+
Tests the Mistral AI OCR functionality directly without the Gradio interface.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import base64
|
| 7 |
+
import os
|
| 8 |
+
import tempfile
|
| 9 |
+
from mistralai import Mistral
|
| 10 |
+
from dotenv import load_dotenv
|
| 11 |
+
|
| 12 |
+
# Load environment variables
|
| 13 |
+
load_dotenv()
|
| 14 |
+
|
| 15 |
+
def create_simple_pdf_content():
    """Create a minimal one-page PDF in memory for testing.

    The document contains a single page with the Helvetica text
    ``Hello World! Test OCR``.  The stream ``/Length``, the cross-reference
    offsets and ``startxref`` are computed from the bytes actually emitted,
    so the file is structurally consistent rather than relying on
    hand-maintained numbers.

    Returns:
        bytes: The complete PDF file content.
    """
    stream = b"BT\n/F1 12 Tf\n72 720 Td\n(Hello World! Test OCR) Tj\nET"

    # Object bodies in object-number order (object 1 is the catalog).
    objects = [
        b"<<\n/Type /Catalog\n/Pages 2 0 R\n>>",
        b"<<\n/Type /Pages\n/Kids [3 0 R]\n/Count 1\n>>",
        b"<<\n/Type /Page\n/Parent 2 0 R\n/MediaBox [0 0 612 792]\n"
        b"/Contents 4 0 R\n/Resources <<\n/Font <<\n/F1 5 0 R\n>>\n>>\n>>",
        b"<<\n/Length %d\n>>\nstream\n" % len(stream) + stream + b"\nendstream",
        b"<<\n/Type /Font\n/Subtype /Type1\n/BaseFont /Helvetica\n>>",
    ]

    pdf = bytearray(b"%PDF-1.4\n")
    offsets = []
    for number, body in enumerate(objects, start=1):
        offsets.append(len(pdf))  # byte offset of "N 0 obj"
        pdf += b"%d 0 obj\n" % number + body + b"\nendobj\n\n"

    xref_offset = len(pdf)
    entry_count = len(objects) + 1  # +1 for the mandatory free entry 0
    pdf += b"xref\n0 %d\n" % entry_count
    # Each xref entry is exactly 20 bytes, hence the trailing space + newline.
    pdf += b"0000000000 65535 f \n"
    for offset in offsets:
        pdf += b"%010d 00000 n \n" % offset
    pdf += (
        b"trailer\n<<\n/Size %d\n/Root 1 0 R\n>>\nstartxref\n%d\n%%%%EOF"
        % (entry_count, xref_offset)
    )

    return bytes(pdf)
|
| 87 |
+
|
| 88 |
+
def test_mistral_ocr():
    """Send a small generated PDF through Mistral OCR and report the response.

    This is a diagnostic script function rather than an assertion-based
    test: it prints the response structure and tries several ways of
    reading text out of it.

    Returns:
        bool: True if the OCR request completed (even when no text could be
        read from the response); False if ``MISTRAL_API_KEY`` is missing or
        any step raised.
    """
    print("🧪 Starting Mistral OCR Test...")

    # Check API key
    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
        print("❌ MISTRAL_API_KEY environment variable not found")
        print(" Please set it in your .env file or environment")
        return False

    # Confirm presence only: no part of the secret is written to the console
    # or to captured logs.
    print("✅ API key found")

    def pages_markdown(r):
        """Join every page's markdown, or None unless all pages have it."""
        pages = getattr(r, 'pages', None)
        if pages and all(hasattr(p, 'markdown') for p in pages):
            return "\n".join(page.markdown for page in pages)
        return None

    try:
        # Initialize Mistral client
        client = Mistral(api_key=api_key)
        print("✅ Mistral client initialized")

        # Create a simple test PDF
        pdf_content = create_simple_pdf_content()
        base64_pdf = base64.b64encode(pdf_content).decode('utf-8')
        print(f"✅ Test PDF created ({len(pdf_content)} bytes)")

        # Test the OCR endpoint
        print("🔄 Sending OCR request to Mistral...")
        response = client.ocr.process(
            model="mistral-ocr-latest",
            document={
                "type": "document_url",
                "document_url": f"data:application/pdf;base64,{base64_pdf}"
            },
            include_image_base64=True
        )
        print("✅ OCR request completed")

        # Analyze the response
        print("\n🔍 RESPONSE ANALYSIS:")
        print(f"Response type: {type(response)}")
        print(f"Response string: {str(response)[:200]}...")

        if hasattr(response, '__dict__'):
            print(f"Response attributes: {list(response.__dict__.keys())}")
            for key, value in response.__dict__.items():
                print(f" {key}: {type(value)} = {str(value)[:100]}...")

        # Test all possible text extraction methods
        print("\n🎯 TESTING TEXT EXTRACTION METHODS:")

        methods = [
            ("response.pages[].markdown", pages_markdown),
            ("response.text", lambda r: getattr(r, 'text', None)),
            ("response.content", lambda r: getattr(r, 'content', None)),
            ("response.result", lambda r: getattr(r, 'result', None)),
            ("response.data", lambda r: getattr(r, 'data', None)),
            ("response['text']", lambda r: r.get('text') if hasattr(r, 'get') else None),
            ("response['content']", lambda r: r.get('content') if hasattr(r, 'get') else None),
        ]

        extracted_text = None
        successful_method = None

        for method_name, method_func in methods:
            # Every method is tried and reported; only the first hit is kept.
            try:
                result = method_func(response)
                if result:
                    print(f"✅ {method_name}: Found content ({len(str(result))} chars)")
                    print(f" Content: {str(result)[:100]}...")
                    if not extracted_text:  # Use the first successful method
                        extracted_text = str(result)
                        successful_method = method_name
                else:
                    print(f"❌ {method_name}: No content found")
            except Exception as e:
                print(f"❌ {method_name}: Error - {e}")

        if extracted_text:
            print(f"\n🎉 SUCCESSFULLY EXTRACTED TEXT using {successful_method}:")
            print(f"📝 Full extracted text: '{extracted_text}'")
        else:
            print("\n❌ NO TEXT EXTRACTED from any method")

        return True

    except Exception as e:
        print(f"❌ OCR test failed: {e}")
        print(f" Error type: {type(e)}")

        # If it's a 401 error, the API key might be invalid
        if "401" in str(e) or "unauthorized" in str(e).lower():
            print(" This might be an API key issue. Please check your MISTRAL_API_KEY")

        return False
|
| 182 |
+
|
| 183 |
+
def test_api_connectivity():
    """Check that a Mistral client can be constructed from the API key.

    Note that no request is sent: success only means ``MISTRAL_API_KEY`` is
    set and ``Mistral(api_key=...)`` did not raise.

    Returns:
        bool: True if the client was constructed, False if the key is
        missing or construction raised.
    """
    print("🌐 Testing API connectivity...")

    key = os.environ.get("MISTRAL_API_KEY")
    if not key:
        print("❌ No API key found")
        return False

    try:
        # Construction is the only thing exercised; the instance is unused.
        Mistral(api_key=key)
    except Exception as exc:
        print(f"❌ API connectivity test failed: {exc}")
        return False

    print("🔄 Testing API connection...")
    print("✅ Mistral client appears to be working")
    return True
|
| 208 |
+
|
| 209 |
+
def main():
    """Run the client check, then the OCR check, and print follow-up hints.

    The OCR check and the hints are skipped entirely when the client check
    fails.
    """
    divider = "=" * 40

    print("🚀 Mistral OCR Quick Test")
    print(divider)

    # Gate: no point calling the OCR endpoint without a usable client.
    if not test_api_connectivity():
        print("\n❌ Basic connectivity test failed")
        return

    print("\n" + divider)

    ocr_ok = test_mistral_ocr()
    print(
        "\n✅ OCR test completed - check the response analysis above"
        if ocr_ok
        else "\n❌ OCR test failed"
    )

    for hint in (
        "\n💡 Next steps:",
        " 1. If the test worked, run: python main.py",
        " 2. If there were errors, check the API key and try again",
        " 3. Use the response analysis to improve text extraction",
    ):
        print(hint)


if __name__ == "__main__":
    main()
|
tests/test_setup.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Test script for PDF Extractor setup validation
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import sys
|
| 6 |
+
import os
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
|
| 9 |
+
def test_imports():
    """Verify that gradio, mistralai and python-dotenv can be imported.

    Packages are checked in that order and a confirmation is printed after
    each one; the first failure stops the check.

    Returns:
        bool: True if all three imports succeeded, False on the first
        ``ImportError`` (which is printed).
    """
    try:
        import gradio as gr
        print("✅ Gradio imported successfully")

        import mistralai
        print("✅ Mistral AI imported successfully")

        from dotenv import load_dotenv
        print("✅ python-dotenv imported successfully")
    except ImportError as missing:
        print(f"❌ Import error: {missing}")
        return False

    return True
|
| 25 |
+
|
| 26 |
+
def test_environment():
    """Report whether ``MISTRAL_API_KEY`` is available after loading ``.env``.

    The key's value is never printed, only whether it is set.

    Returns:
        bool: True if the variable is set to a non-empty value, else False.
    """
    load_dotenv()

    if not os.environ.get("MISTRAL_API_KEY"):
        print("⚠️ MISTRAL_API_KEY not found in environment")
        print(" Please copy .env.example to .env and add your API key")
        return False

    # Confirm presence only; the secret itself stays out of the output.
    print("✅ MISTRAL_API_KEY environment variable is set")
    return True
|
| 39 |
+
|
| 40 |
+
def main():
    """Run the import and environment checks and print a summary.

    Both checks always run; the environment result is only reported when
    the package imports succeeded.
    """
    rule = "=" * 40

    print("🔍 PDF Extractor Setup Validation")
    print(rule)

    packages_ok = test_imports()
    environment_ok = test_environment()

    print("\n" + rule)

    if not packages_ok:
        print("❌ Package installation incomplete")
        print("📝 Next step: pip install -r requirements.txt")
        return

    print("✅ All packages are properly installed")
    if environment_ok:
        print("✅ Environment is configured correctly")
        print("🚀 Ready to run: python main.py")
    else:
        print("⚠️ Environment needs configuration")
        print("📝 Next step: Set up your .env file")


if __name__ == "__main__":
    main()
|
ui/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""UI components for PDF Text Extractor."""
|
| 2 |
+
from ui.interface import create_interface
|
| 3 |
+
from ui.handlers import copy_text, download_text, process_images_for_display
|
| 4 |
+
from ui.components import (
|
| 5 |
+
create_header, create_upload_section, create_action_button,
|
| 6 |
+
create_text_display, create_action_buttons, create_image_gallery,
|
| 7 |
+
apply_custom_css
|
| 8 |
+
)
|
| 9 |
+
|
| 10 |
+
__all__ = [
|
| 11 |
+
"create_interface", "copy_text", "download_text", "process_images_for_display",
|
| 12 |
+
"create_header", "create_upload_section", "create_action_button",
|
| 13 |
+
"create_text_display", "create_action_buttons", "create_image_gallery",
|
| 14 |
+
"apply_custom_css"
|
| 15 |
+
]
|
ui/components.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
UI components module for PDF Text Extractor.
|
| 3 |
+
Contains functions for creating individual UI components.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import gradio as gr
|
| 7 |
+
from typing import Tuple, List, Dict, Any
|
| 8 |
+
|
| 9 |
+
def create_header() -> gr.Markdown:
    """
    Build the Markdown banner shown at the top of the application.

    The banner carries the title, a one-line description, usage
    instructions and the advertised upload size limit.

    Returns:
        gr.Markdown: Header component
    """
    header_markdown = """
    # 🔍 PDF Text Extractor

    Extract text and images from PDF files using Mistral AI's OCR technology.

    **Instructions:**
    1. Upload a PDF file using the file selector below
    2. Wait for processing to complete
    3. View the extracted text and images
    4. Use the Copy or Download buttons to save the extracted text

    **Supported:** PDF files up to 10MB
    """
    return gr.Markdown(header_markdown)
|
| 29 |
+
|
| 30 |
+
def create_upload_section() -> gr.File:
    """
    Build the upload widget that accepts a single PDF file.

    Returns:
        gr.File: File upload component
    """
    upload_options = {
        "label": "Upload PDF File",
        "file_types": [".pdf"],
        "file_count": "single",
    }
    return gr.File(**upload_options)
|
| 42 |
+
|
| 43 |
+
def create_action_button() -> gr.Button:
    """
    Build the primary button that starts text and image extraction.

    Returns:
        gr.Button: Action button component
    """
    extract_button = gr.Button("Extract Text & Images", variant="primary")
    return extract_button
|
| 51 |
+
|
| 52 |
+
def create_text_display() -> Tuple[gr.Textbox, gr.Textbox]:
    """
    Build the two textboxes used for results: extracted text and status.

    The text box is created first, then the status box, so they render in
    that order inside whatever layout context is active.

    Returns:
        Tuple[gr.Textbox, gr.Textbox]: Text output and status components
    """
    text_options = {
        "label": "Extracted Text",
        "lines": 10,
        "max_lines": 20,
        "placeholder": "Extracted text will appear here...",
        "show_copy_button": True,
    }
    status_options = {
        "label": "Status",
        "lines": 2,
        "placeholder": "Upload a PDF to see status...",
    }

    text_output = gr.Textbox(**text_options)
    status_output = gr.Textbox(**status_options)
    return text_output, status_output
|
| 74 |
+
|
| 75 |
+
def create_image_gallery() -> gr.Gallery:
    """
    Build the gallery that displays images extracted from the PDF.

    The gallery is laid out as a 3-column, 2-row grid and is addressable
    from CSS/JS through the element id ``image_gallery``.

    Returns:
        gr.Gallery: Image gallery component
    """
    gallery_options = {
        "label": "Extracted Images",
        "columns": 3,
        "rows": 2,
        "object_fit": "contain",
        "height": "auto",
        "visible": True,
        "show_label": True,
        "elem_id": "image_gallery",
    }
    return gr.Gallery(**gallery_options)
|
| 92 |
+
|
| 93 |
+
def create_action_buttons() -> Tuple[gr.Button, gr.Button]:
    """
    Build the Copy and Download buttons for the extracted text.

    The copy button is instantiated before the download button, so they
    render in that order in the active layout context.

    Returns:
        Tuple[gr.Button, gr.Button]: Copy and download button components
    """
    return (
        gr.Button("📋 Copy to Clipboard"),
        gr.Button("📥 Download as Text File"),
    )
|
| 104 |
+
|
| 105 |
+
def apply_custom_css() -> gr.HTML:
    """
    Inject the application's custom stylesheet as an HTML component.

    The styles cap the container width, use a monospace font for
    ``.output-markdown`` and centre/shrink gallery captions.

    Returns:
        gr.HTML: HTML component with CSS styles
    """
    stylesheet = """
    <style>
    .gradio-container {
        max-width: 900px !important;
    }
    .output-markdown {
        font-family: 'Courier New', monospace;
    }
    .image-gallery-caption {
        text-align: center;
        font-size: 0.9em;
    }
    </style>
    """
    return gr.HTML(stylesheet)
|
ui/handlers.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Event handlers for UI components.
|
| 3 |
+
Contains functions that handle user interactions with the interface.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import tempfile
|
| 8 |
+
from typing import Optional, List, Dict, Any
|
| 9 |
+
from utils.pdf_image_extractor import PDFImageExtractor
|
| 10 |
+
|
| 11 |
+
def copy_text(text: str) -> str:
    """
    Server-side handler for the Copy button.

    This function performs no clipboard work itself; it simply hands the
    text back untouched.

    Args:
        text: Text to copy to clipboard

    Returns:
        str: The input text (unchanged)
    """
    passthrough = text
    return passthrough
|
| 22 |
+
|
| 23 |
+
def download_text(text: str) -> Optional[str]:
    """
    Handle Download button click.

    Writes the text to ``extracted_text.txt`` inside a freshly created
    temporary directory and returns the file's path.

    Args:
        text: Text to download

    Returns:
        Optional[str]: Path to the created text file or None if text is empty
    """
    if not text:
        return None

    # A private directory per call keeps the friendly download name while
    # guaranteeing that simultaneous requests never overwrite each other's
    # file (a single shared "<tmp>/extracted_text.txt" would).
    export_dir = tempfile.mkdtemp(prefix="pdf_text_")
    file_path = os.path.join(export_dir, "extracted_text.txt")

    with open(file_path, "w", encoding="utf-8") as f:
        f.write(text)

    return file_path
|
| 49 |
+
|
| 50 |
+
def process_images_for_display(images_data: List[Dict[str, Any]], pdf_path: Optional[str] = None) -> List[str]:
    """
    Process images for display in the Gradio gallery.

    Strategies are tried in order and the first one that yields any image
    wins:

    1. Crop the images out of the PDF via PDFImageExtractor.extract_images_from_pdf.
    2. Extract every embedded image from the PDF, truncated to the number of
       entries in ``images_data``.
    3. Decode the ``'base64'`` field of each entry into a file.

    Steps 1 and 2 are only attempted when ``pdf_path`` points to an
    existing file.

    Args:
        images_data: List of image data dictionaries from OCR response
        pdf_path: Path to the original PDF file for image extraction

    Returns:
        List[str]: List of image paths for gallery display (empty when
        ``images_data`` is empty or nothing could be produced)
    """
    # Local import: the module header does not import base64.
    import base64

    if not images_data:
        return []

    # If we have PDF path and bounding box data, extract images from PDF
    if pdf_path and os.path.exists(pdf_path):
        print("🖼️ Extracting images from PDF using bounding box coordinates...")
        extracted_paths = PDFImageExtractor.extract_images_from_pdf(pdf_path, images_data)
        if extracted_paths:
            return extracted_paths

        # Fallback: extract all images from PDF if bounding box extraction failed
        print("🔄 Fallback: Extracting all images from PDF...")
        extracted_paths = PDFImageExtractor.extract_all_images_from_pdf(pdf_path)
        if extracted_paths:
            return extracted_paths[:len(images_data)]  # Limit to expected number of images

    # Fallback: use base64 data from OCR response
    print("🔄 Using base64 image data from OCR response...")
    gallery_images = []
    output_dir = None  # created lazily, only once there is something to write

    for index, img_data in enumerate(images_data):
        try:
            base64_data = img_data.get('base64', '')
            if not base64_data:
                continue

            # Tolerate "data:<mime>;base64,<payload>" values: decoding the
            # prefix along with the payload would silently corrupt the image.
            if isinstance(base64_data, str) and base64_data.startswith("data:") and "," in base64_data:
                base64_data = base64_data.split(",", 1)[1]

            # Decode before touching the filesystem so a bad payload does
            # not leave an empty file behind.
            image_bytes = base64.b64decode(base64_data)

            # One private directory per call: concurrent requests cannot
            # overwrite each other's images.
            if output_dir is None:
                output_dir = tempfile.mkdtemp(prefix="pdf_images_")

            img_path = os.path.join(output_dir, f"extracted_image_fallback_{index}.jpg")
            with open(img_path, "wb") as img_file:
                img_file.write(image_bytes)

            # Add path to gallery list (Gradio Gallery expects a list of paths)
            gallery_images.append(img_path)

        except Exception as e:
            # Best effort: one broken entry must not hide the other images.
            print(f"Error processing image {index}: {str(e)}")

    return gallery_images
|
ui/interface.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Interface creation module for PDF Text Extractor.
|
| 3 |
+
Defines the Gradio interface components and layout.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import gradio as gr
|
| 7 |
+
from pdf_text_extractor import PDFTextExtractor
|
| 8 |
+
from ui.handlers import copy_text, download_text, process_images_for_display
|
| 9 |
+
from ui.components import (
|
| 10 |
+
create_header, create_upload_section, create_action_button,
|
| 11 |
+
create_text_display, create_action_buttons, create_image_gallery, apply_custom_css
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
def create_dummy_interface() -> gr.Blocks:
    """
    Build a placeholder UI for when the API key is not configured.

    The layout mirrors the real application (upload, extract button, text,
    status, copy/download) but every button is non-interactive and the
    text and status boxes explain that MISTRAL_API_KEY must be set.

    Returns:
        gr.Blocks: Gradio interface with disabled functionality
    """
    banner_markdown = """
    # 🔍 PDF Text Extractor

    ⚠️ **API key not configured.** Please set MISTRAL_API_KEY environment variable and restart the application.
    """
    unavailable_text = "API key not configured. Text extraction is unavailable."
    status_text = "❌ MISTRAL_API_KEY environment variable is not set. Please set it and restart the application."

    with gr.Blocks(title="PDF Text Extractor") as interface:
        gr.Markdown(banner_markdown)

        # Upload row
        with gr.Row():
            gr.File(label="Upload PDF", file_types=[".pdf"])

        # Extract button row (disabled)
        with gr.Row():
            gr.Button("Extract Text", variant="primary", interactive=False)

        # Extracted-text row (read-only explanation)
        with gr.Row():
            gr.Textbox(
                label="Extracted Text",
                lines=10,
                value=unavailable_text,
                interactive=False,
            )

        # Status row
        with gr.Row():
            gr.Textbox(label="Status", lines=2, value=status_text)

        # Copy / download row (disabled)
        with gr.Row():
            gr.Button("📋 Copy to Clipboard", interactive=False)
            gr.Button("📥 Download as Text File", interactive=False)

    return interface
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def create_main_interface(extractor: PDFTextExtractor) -> gr.Blocks:
    """
    Build the fully functional application UI around an extractor.

    Args:
        extractor: PDFTextExtractor instance

    Returns:
        gr.Blocks: Gradio interface with full functionality
    """
    def process_pdf_wrapper(pdf_file):
        """Run the extractor (captured from the closure) and prepare gallery images."""
        extracted_text, status, images_data = extractor.extract_text_from_pdf(pdf_file)

        # Normalise the upload to a filesystem path for image extraction:
        # use .name when the value has one, the value itself when truthy,
        # otherwise None.
        if hasattr(pdf_file, 'name'):
            pdf_path = pdf_file.name
        elif pdf_file:
            pdf_path = pdf_file
        else:
            pdf_path = None

        return extracted_text, status, process_images_for_display(images_data, pdf_path)

    # Client-side script for the Copy button: writes the text to the
    # clipboard and shows a toast for two seconds.
    copy_js = """
            function(text) {
                if (text) {
                    navigator.clipboard.writeText(text);
                    // Show a temporary notification
                    var notification = document.createElement('div');
                    notification.textContent = 'Text copied to clipboard!';
                    notification.style.position = 'fixed';
                    notification.style.bottom = '20px';
                    notification.style.left = '50%';
                    notification.style.transform = 'translateX(-50%)';
                    notification.style.padding = '10px 20px';
                    notification.style.background = '#4CAF50';
                    notification.style.color = 'white';
                    notification.style.borderRadius = '4px';
                    notification.style.zIndex = '1000';
                    document.body.appendChild(notification);
                    setTimeout(function() {
                        document.body.removeChild(notification);
                    }, 2000);
                }
                return text;
            }
            """

    with gr.Blocks(title="🔍 PDF Text Extractor", theme=gr.themes.Soft()) as interface:
        # Header
        create_header()

        # File upload
        with gr.Row():
            pdf_input = create_upload_section()

        # Extract button
        with gr.Row():
            submit_btn = create_action_button()

        # Status display
        with gr.Row():
            status_output = gr.Textbox(
                label="Status",
                lines=2,
                placeholder="Upload a PDF to see status...",
            )

        # Tabs: one for the text, one for the images
        with gr.Tabs():
            with gr.TabItem("Extracted Text"):
                text_output = gr.Textbox(
                    label="Extracted Text",
                    lines=15,
                    max_lines=30,
                    placeholder="Extracted text will appear here...",
                    show_copy_button=True,
                )

                # Copy / download buttons sit under the text
                with gr.Row():
                    copy_btn, download_btn = create_action_buttons()

            with gr.TabItem("Extracted Images"):
                image_gallery = create_image_gallery()
                gr.Markdown("Images extracted from the PDF will appear here.")

        # Extraction: upload -> (text, status, gallery)
        submit_btn.click(
            fn=process_pdf_wrapper,
            inputs=[pdf_input],
            outputs=[text_output, status_output, image_gallery],
        )

        # Copy button: the clipboard work happens in the browser via copy_js
        copy_btn.click(
            fn=copy_text,
            inputs=text_output,
            outputs=None,
            js=copy_js,
        )

        # Download button: the handler returns a file path shown in a File component
        download_btn.click(
            fn=download_text,
            inputs=text_output,
            outputs=gr.File(label="Download", elem_id="download_output"),
            show_progress=False,
        )

        # Custom CSS styling
        apply_custom_css()

    return interface
|
| 163 |
+
|
| 164 |
+
def create_interface() -> gr.Blocks:
    """
    Create and configure the Gradio interface.

    Tries to construct a PDFTextExtractor and build the full UI around it.
    If that raises ValueError (presumably what the extractor raises when the
    API key is missing — confirm against PDFTextExtractor), the reason is
    printed and a disabled placeholder UI is returned instead.

    Returns:
        gr.Blocks: Configured Gradio interface
    """
    try:
        extractor = PDFTextExtractor()
        return create_main_interface(extractor)
    except ValueError as e:
        # Surface the real cause: the placeholder UI always says "API key
        # not configured", which would otherwise hide any other ValueError.
        print(f"⚠️ Falling back to the disabled interface: {e}")
        return create_dummy_interface()
|
utils/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Utility functions for PDF Text Extractor."""
|
| 2 |
+
from utils.config import check_api_key, get_app_config
|
| 3 |
+
|
| 4 |
+
__all__ = ["check_api_key", "get_app_config"]
|
utils/config.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration utilities for PDF Text Extractor.
|
| 3 |
+
Contains functions for handling environment variables and app configuration.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
from typing import Dict, Any
|
| 8 |
+
|
| 9 |
+
def check_api_key() -> bool:
    """
    Report whether MISTRAL_API_KEY is present and non-empty in the environment.

    When the key is missing (or empty) a short warning with a setup example
    is printed to stdout, followed by a blank line.

    Returns:
        bool: True if API key is set, False otherwise
    """
    if os.environ.get("MISTRAL_API_KEY"):
        return True

    warning_lines = (
        "⚠️ Warning: MISTRAL_API_KEY environment variable is not set.",
        " Please set it before using the PDF extraction functionality.",
        " Example: export MISTRAL_API_KEY='your-api-key-here'",
    )
    for line in warning_lines:
        print(line)
    print()
    return False
|
| 24 |
+
|
| 25 |
+
def get_app_config() -> Dict[str, Any]:
    """
    Return the application's configuration settings.

    A new dictionary is built on every call, so callers may mutate the
    result freely.

    Returns:
        Dict[str, Any]: Application configuration settings
    """
    config: Dict[str, Any] = {}
    config["server_port"] = 7861       # Use different port to avoid conflicts
    config["debug"] = True             # Enable debug mode for development
    config["quiet"] = False            # Show startup messages
    config["max_file_size"] = "10mb"   # Limit PDF file size
    # To enable external access and public link sharing, also set:
    #   config["server_name"] = "0.0.0.0"  # Allow external access
    #   config["share"] = True             # Create public link
    return config
|
utils/pdf_image_extractor.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PDF Image Extraction utilities.
|
| 3 |
+
Extracts images from PDF using bounding box coordinates.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import tempfile
|
| 8 |
+
from typing import List, Dict, Any, Optional
|
| 9 |
+
import fitz # PyMuPDF
|
| 10 |
+
from PIL import Image
|
| 11 |
+
import base64
|
| 12 |
+
import io
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class PDFImageExtractor:
    """Extract images from a PDF, by bounding box or by pulling embedded images.

    Both public methods are static and best-effort: failures are printed and
    skipped, and an empty list is returned when the PDF cannot be opened.
    Each call writes its images into its own temporary directory so that
    concurrent calls cannot overwrite each other's files.
    """

    @staticmethod
    def _write_base64_image(base64_data: Any, img_path: str) -> None:
        """Decode base64 image data and write it to img_path.

        Decoding happens before the file is opened so that invalid data does
        not leave an empty file behind. A leading ``data:...;base64,`` prefix
        is stripped, since decoding it with the payload corrupts the image.
        """
        if isinstance(base64_data, str) and base64_data.startswith("data:") and "," in base64_data:
            base64_data = base64_data.split(",", 1)[1]
        image_bytes = base64.b64decode(base64_data)
        with open(img_path, "wb") as img_file:
            img_file.write(image_bytes)

    @staticmethod
    def extract_images_from_pdf(pdf_path: str, images_data: List[Dict[str, Any]]) -> List[str]:
        """
        Extract images from PDF using bounding box coordinates.

        For every entry the page given by ``'page'`` (default 0) is rendered
        at 2x zoom, clipped to the rectangle described by ``'top_left_x'``,
        ``'top_left_y'``, ``'bottom_right_x'`` and ``'bottom_right_y'``
        (each defaulting to 0), and saved as a PNG. If rendering an entry
        fails and the entry has a ``'base64'`` field, that data is written
        out instead.

        NOTE(review): the coordinates are passed to fitz.Rect unchanged, so
        they are assumed to already be in the page's coordinate space —
        confirm against the OCR response.

        Args:
            pdf_path: Path to the PDF file
            images_data: List of image data with bounding box coordinates

        Returns:
            List[str]: List of paths to extracted image files
        """
        if not images_data:
            return []

        try:
            # The context manager closes the document even if something
            # unexpected raises part-way through.
            with fitz.open(pdf_path) as pdf_doc:
                extracted_image_paths = []
                output_dir = tempfile.mkdtemp(prefix="pdf_images_")
                page_count = len(pdf_doc)
                zoom = fitz.Matrix(2.0, 2.0)  # 2x zoom for better quality

                for index, img_data in enumerate(images_data):
                    try:
                        page_num = img_data.get('page', 0)

                        # Ensure page number is valid
                        if page_num >= page_count:
                            print(f"Warning: Page {page_num} not found in PDF (max: {page_count-1})")
                            continue

                        page = pdf_doc[page_num]

                        # PyMuPDF uses (x0, y0, x1, y1) format
                        bbox = fitz.Rect(
                            img_data.get('top_left_x', 0),
                            img_data.get('top_left_y', 0),
                            img_data.get('bottom_right_x', 0),
                            img_data.get('bottom_right_y', 0),
                        )

                        # Render only the clipped region and write the PNG
                        # bytes directly (no intermediate image object needed).
                        pix = page.get_pixmap(matrix=zoom, clip=bbox)
                        img_filename = f"extracted_image_page{page_num}_{index}.png"
                        img_path = os.path.join(output_dir, img_filename)
                        with open(img_path, "wb") as img_file:
                            img_file.write(pix.tobytes("png"))

                        extracted_image_paths.append(img_path)
                        print(f"✅ Extracted image {index} from page {page_num}: {img_path}")

                    except Exception as e:
                        print(f"Error extracting image {index}: {str(e)}")

                        # Fallback: try to use base64 data if available
                        base64_data = img_data.get('base64', '')
                        if base64_data:
                            try:
                                img_filename = f"extracted_image_base64_{index}.jpg"
                                img_path = os.path.join(output_dir, img_filename)
                                PDFImageExtractor._write_base64_image(base64_data, img_path)

                                extracted_image_paths.append(img_path)
                                print(f"✅ Used base64 data for image {index}: {img_path}")
                            except Exception as e2:
                                print(f"Error using base64 data for image {index}: {str(e2)}")

                return extracted_image_paths

        except Exception as e:
            print(f"Error opening PDF file: {str(e)}")
            return []

    @staticmethod
    def extract_all_images_from_pdf(pdf_path: str) -> List[str]:
        """
        Extract all images from PDF without using bounding boxes.
        This is a fallback method when no bounding box data is available.

        Every image listed by each page is written as a PNG; images with
        four or more colour channels (treated as CMYK) are converted to RGB
        first.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            List[str]: List of paths to extracted image files
        """
        try:
            # The context manager closes the document even if something
            # unexpected raises part-way through.
            with fitz.open(pdf_path) as pdf_doc:
                extracted_image_paths = []
                output_dir = tempfile.mkdtemp(prefix="pdf_images_")

                for page_num in range(len(pdf_doc)):
                    page = pdf_doc[page_num]

                    for img_index, img in enumerate(page.get_images()):
                        try:
                            xref = img[0]  # first item of the entry is the image's xref
                            pix = fitz.Pixmap(pdf_doc, xref)

                            # GRAY/RGB can be written as PNG directly;
                            # CMYK must be converted to RGB first.
                            if pix.n - pix.alpha >= 4:
                                pix = fitz.Pixmap(fitz.csRGB, pix)
                            img_bytes = pix.tobytes("png")
                            pix = None  # release the pixmap promptly

                            img_filename = f"all_images_page{page_num}_img{img_index}.png"
                            img_path = os.path.join(output_dir, img_filename)
                            with open(img_path, "wb") as f:
                                f.write(img_bytes)

                            extracted_image_paths.append(img_path)

                        except Exception as e:
                            print(f"Error extracting image {img_index} from page {page_num}: {str(e)}")

                return extracted_image_paths

        except Exception as e:
            print(f"Error extracting all images from PDF: {str(e)}")
            return []
|