Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import fitz # PyMuPDF | |
| import requests | |
| import os | |
| import tempfile | |
| import base64 | |
| from typing import Optional, Tuple | |
# OCR.space API configuration
# The key is read from the OCR_API_KEY environment variable. When the variable
# is not set, the placeholder default below is used; extract_text_from_pdf
# compares against that placeholder to detect a missing configuration.
OCR_API_KEY = os.getenv('OCR_API_KEY', 'your_ocr_space_api_key_here')
# URL that extract_text_with_ocr posts the rendered page image to.
OCR_API_URL = 'https://api.ocr.space/parse/image'
def extract_text_with_ocr(pdf_file_path: str) -> str:
    """Extract text from the first page of a PDF using the OCR.space API.

    The first page is rendered to a PNG with a 2.0 x 2.0 scaling matrix,
    base64-encoded and posted to ``OCR_API_URL``. Only the first page is
    processed; later pages are ignored.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        The text reported by the API for the first parsed result. This
        function does not raise: every failure is returned as a message
        instead, prefixed with ``"OCR Error:"`` (the API flagged a processing
        error), ``"OCR API Error:"`` (non-200 HTTP status) or
        ``"OCR processing error:"`` (any exception raised locally).
        ``"No text extracted from OCR"`` is returned when the response holds
        no parsed results.
    """
    try:
        # Convert the first page to PNG bytes. The ``with`` block closes the
        # document even when the page lookup or the rendering raises.
        with fitz.open(pdf_file_path) as doc:
            page = doc[0]  # Get first page
            mat = fitz.Matrix(2.0, 2.0)  # Higher resolution
            pix = page.get_pixmap(matrix=mat)
            img_data = pix.tobytes("png")

        # Encode image to base64
        img_base64 = base64.b64encode(img_data).decode('utf-8')

        # Prepare OCR.space API request
        payload = {
            'apikey': OCR_API_KEY,
            'language': 'eng',
            'isOverlayRequired': False,
            'base64Image': f'data:image/png;base64,{img_base64}',
            'iscreatesearchablepdf': False,
            'issearchablepdfhidetextlayer': False,
        }

        # Make API request
        response = requests.post(OCR_API_URL, data=payload, timeout=60)
        if response.status_code != 200:
            return f"OCR API Error: {response.status_code}"

        result = response.json()
        if result.get('IsErroredOnProcessing', False):
            error_message = result.get('ErrorMessage', 'Unknown error')
            # The message may arrive as a list of strings; join it so the
            # user sees plain text rather than a Python list repr.
            if isinstance(error_message, (list, tuple)):
                error_message = "; ".join(str(part) for part in error_message)
            return f"OCR Error: {error_message}"

        parsed_results = result.get('ParsedResults', [])
        if not parsed_results:
            return "No text extracted from OCR"
        return parsed_results[0].get('ParsedText', 'No text found')
    except Exception as e:
        # Boundary handler: callers expect a message string, never an exception.
        return f"OCR processing error: {str(e)}"
def extract_text_from_pdf(pdf_file) -> Tuple[str, str]:
    """Extract text from an uploaded PDF, falling back to OCR when needed.

    PyMuPDF text extraction is tried first. When it yields 50 characters or
    fewer, the PDF is treated as image-based and ``extract_text_with_ocr`` is
    used instead, provided an OCR API key is configured.

    Args:
        pdf_file: The uploaded file, given either as a filesystem path
            (``str`` / ``os.PathLike``) or as an object whose ``name``
            attribute holds the path. ``None`` means nothing was uploaded.

    Returns:
        A ``(text, status)`` tuple. ``text`` is the extracted text or a
        human-readable error/help message; ``status`` is a short label for
        the status box. Exceptions are converted to messages, not raised.
    """
    # NOTE(review): the leading glyphs in the status labels look mis-encoded
    # in this copy of the file - confirm the intended symbols.
    if pdf_file is None:
        return "No file uploaded", "β Error"

    status = "β Success"
    try:
        # Accept a plain path as well as a file-like wrapper exposing ``.name``.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = os.fspath(pdf_file)
        else:
            pdf_path = pdf_file.name

        # Primary method: PyMuPDF text extraction. The ``with`` block closes
        # the document even if reading a page raises.
        page_sections = []
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc):
                page_text = page.get_text("text")
                if page_text.strip():
                    page_sections.append(
                        f"\n--- Page {page_num + 1} ---\n{page_text}\n"
                    )
        text = "".join(page_sections).strip()

        # If we got meaningful text, return it
        if len(text) > 50:  # Arbitrary threshold
            return text, status

        # If no text or very little text, try OCR fallback
        status = "β οΈ Using OCR (Image-based PDF detected)"

        # The placeholder default means no real API key was configured
        if OCR_API_KEY == 'your_ocr_space_api_key_here':
            return ("No extractable text found. This appears to be an image-based PDF.\n"
                    "To extract text from image-based PDFs, please:\n"
                    "1. Get a free API key from https://ocr.space/ocrapi\n"
                    "2. Set the OCR_API_KEY environment variable\n"
                    "3. Restart the application"), "β OCR Not Configured"

        # Try OCR extraction
        ocr_text = extract_text_with_ocr(pdf_path)

        # Every failure prefix the OCR helper can return, including the
        # HTTP-level "OCR API Error:" one, must be reported as a failure
        # rather than shown as extracted text.
        ocr_failure_prefixes = (
            "OCR Error:",
            "OCR API Error:",
            "OCR processing error:",
        )
        if ocr_text.startswith(ocr_failure_prefixes):
            return f"Primary extraction failed, OCR fallback error:\n{ocr_text}", "β OCR Failed"
        return f"Extracted using OCR:\n\n{ocr_text}", status
    except Exception as e:
        # Complete fallback error handling
        details = str(e)
        lowered = details.lower()
        error_msg = f"Error processing PDF: {details}"

        # Map known failure causes to friendlier messages. All checks are
        # case-insensitive so capitalization differences in the underlying
        # error text do not defeat them.
        if "no such file" in lowered:
            error_msg = "File not found. Please try uploading the PDF again."
        elif "not a pdf" in lowered:
            error_msg = "Invalid file format. Please upload a valid PDF file."
        elif "encrypted" in lowered:
            error_msg = "This PDF is password-protected. Please provide an unlocked PDF."
        elif "corrupted" in lowered:
            error_msg = "This PDF file appears to be corrupted. Please try a different file."
        return error_msg, "β Error"
def clear_output():
    """Return the values that reset the text box and the status box.

    Returns:
        A ``(text, status)`` tuple: an empty string for the extracted-text
        box and the idle label for the status box.
    """
    cleared_text = ""
    idle_status = "π Ready"
    return cleared_text, idle_status
# Create the Gradio interface
# Components are created in display order inside nested layout contexts, so
# the order of the statements below determines the page layout.
with gr.Blocks(title="PDF Text Extraction App", theme=gr.themes.Soft()) as demo:
    # Page heading and feature overview
    gr.Markdown("# π PDF Text Extraction App")
    gr.Markdown("""
    Upload a PDF file to extract its text content.
    **Features:**
    - β Direct text extraction from text-based PDFs
    - π OCR fallback for image-based PDFs (requires OCR.space API key)
    - π Status indicators for extraction method used
    """)
    with gr.Row():
        # Left column: upload control, action buttons, status and OCR notes
        with gr.Column(scale=1):
            pdf_input = gr.File(
                label="π Upload PDF File",
                file_types=[".pdf"],
                type="filepath"
            )
            with gr.Row():
                extract_btn = gr.Button("π Extract Text", variant="primary", size="lg")
                clear_btn = gr.Button("ποΈ Clear", variant="secondary")
            # Status indicator (read-only; filled by the event handlers below)
            status_output = gr.Textbox(
                label="Status",
                value="π Ready",
                interactive=False,
                max_lines=1
            )
            # OCR Configuration info
            gr.Markdown("""
            **OCR Configuration:**
            Set `OCR_API_KEY` environment variable for image-based PDF support.
            Get free API key at: https://ocr.space/ocrapi
            """)
        # Right column: the extracted text
        with gr.Column(scale=2):
            text_output = gr.Textbox(
                label="π Extracted Text",
                lines=25,
                max_lines=50,
                placeholder="Extracted text will appear here...",
                show_copy_button=True
            )
    # Event handlers
    # Both handlers write to the same two outputs: extracted text and status.
    extract_btn.click(
        fn=extract_text_from_pdf,
        inputs=pdf_input,
        outputs=[text_output, status_output]
    )
    clear_btn.click(
        fn=clear_output,
        outputs=[text_output, status_output]
    )
    # Auto-extract when file is uploaded
    pdf_input.change(
        fn=extract_text_from_pdf,
        inputs=pdf_input,
        outputs=[text_output, status_output]
    )
    # Footer
    gr.Markdown("""
    ---
    **Tips:**
    - For best results with image-based PDFs, ensure good image quality
    - Large PDFs may take longer to process
    - OCR works best with clear, high-contrast text
    """)
# Launch the app
if __name__ == "__main__":
    # Server settings gathered in one mapping and passed on as keyword arguments.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": True,
    }
    demo.launch(**launch_options)