Spaces:
Build error
Build error
| import gradio as gr | |
| import PyPDF2 | |
| from pdf2image import convert_from_path | |
| import pytesseract | |
| from PIL import Image | |
| import io | |
| import tempfile | |
| import os | |
def _read_pdf_bytes(pdf_file):
    """Return the uploaded PDF's raw content as ``bytes``.

    Accepts raw ``bytes``/``bytearray``, a readable file-like object, or a
    filesystem path, so the caller does not depend on one upload form.
    """
    if isinstance(pdf_file, (bytes, bytearray)):
        return bytes(pdf_file)
    if hasattr(pdf_file, "read"):
        # Rewind when possible so a partially consumed stream is read in full.
        if hasattr(pdf_file, "seek"):
            pdf_file.seek(0)
        return pdf_file.read()
    # Anything else is treated as a path to a file on disk.
    with open(pdf_file, "rb") as handle:
        return handle.read()


def _ocr_pdf_bytes(pdf_bytes):
    """Run OCR over every page of the PDF and return ``(text, page_count)``."""
    # convert_from_path needs a real file on disk, so spill the bytes to a
    # temporary file; the finally clause removes it even when OCR fails.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
    tmp_path = tmp_file.name
    try:
        with tmp_file:
            tmp_file.write(pdf_bytes)
        images = convert_from_path(tmp_path)
        ocr_text = "".join(
            f"\n--- Page {number} ---\n{pytesseract.image_to_string(image)}\n"
            for number, image in enumerate(images, start=1)
        )
    finally:
        os.unlink(tmp_path)
    return ocr_text, len(images)


def extract_text_from_pdf(pdf_file):
    """
    Extract text from PDF. Uses direct text extraction if available,
    falls back to OCR if the PDF is image-based.

    Args:
        pdf_file: The uploaded PDF as raw bytes, a readable file-like
            object, or a filesystem path. ``None`` means nothing was
            uploaded.

    Returns:
        A ``(result, status)`` tuple. ``result`` is whatever
        ``create_txt_file`` produces for the extracted text, or ``None``
        when there was no upload or processing failed. ``status`` is a
        human-readable message describing what happened.
    """
    if pdf_file is None:
        return None, "Please upload a PDF file first."

    try:
        # Normalise the upload once so both the direct-extraction and the OCR
        # path work from the same complete copy of the data.
        pdf_bytes = _read_pdf_bytes(pdf_file)

        # Try direct text extraction first
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
        # extract_text() may yield None for a page without text; treat as "".
        extracted_text = "".join(
            page.extract_text() or "" for page in pdf_reader.pages
        )

        # Check if we got meaningful text (more than just whitespace)
        if extracted_text.strip():
            status = f"✓ Text extracted directly from PDF ({len(pdf_reader.pages)} pages)\nNo OCR needed - PDF contains searchable text."
            return create_txt_file(extracted_text), status

        # If no text found, use OCR
        status_msg = "⚠ PDF appears to be image-based. Running OCR...\n"
        ocr_text, page_count = _ocr_pdf_bytes(pdf_bytes)
        status_msg += f"✓ OCR completed on {page_count} pages"
        return create_txt_file(ocr_text), status_msg
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return None, f"Error processing PDF: {str(e)}"
def create_txt_file(text):
    """Create a downloadable text file from the extracted text.

    The text is encoded as UTF-8 and written to a new ``.txt`` temporary
    file. The file is intentionally not deleted here, because the caller
    needs it to exist after this function returns in order to serve it.

    Args:
        text: The extracted text to save.

    Returns:
        The filesystem path (``str``) of the written ``.txt`` file.
    """
    # Binary mode keeps the on-disk bytes exactly equal to the UTF-8 encoding
    # (no platform newline translation).
    with tempfile.NamedTemporaryFile(
        mode="wb", suffix=".txt", delete=False
    ) as txt_file:
        txt_file.write(text.encode('utf-8'))
    return txt_file.name
# Assemble the Gradio Blocks UI: intro text, a two-column row (upload and
# button on one side, status and download on the other), usage notes, and
# the button-to-handler wiring.
with gr.Blocks(title="PDF to Text Converter") as demo:
    # Page header and feature summary.
    gr.Markdown(
        value="""
        # 📄 PDF to Text Converter with Smart OCR
        Upload a PDF file and get downloadable text. The app automatically:
        - Extracts text directly if the PDF contains searchable text
        - Uses OCR (Optical Character Recognition) only for image-based PDFs
        """
    )

    with gr.Row():
        # Input side: PDF upload plus the trigger button.
        with gr.Column():
            pdf_input = gr.File(label="Upload PDF File", file_types=[".pdf"], type="binary")
            convert_btn = gr.Button("Convert to Text", variant="primary")

        # Output side: status message and the resulting file.
        with gr.Column():
            status_output = gr.Textbox(label="Status", lines=3, interactive=False)
            txt_output = gr.File(label="Download Text File", type="binary")

    # Usage instructions shown below the row.
    gr.Markdown(
        value="""
        ### How it works:
        1. Upload your PDF file
        2. Click "Convert to Text"
        3. The app will extract text directly if possible, or use OCR if needed
        4. Download the resulting .txt file
        **Note:** OCR may take longer for large PDFs with many pages.
        """
    )

    # The handler returns (file, status); the outputs list follows that order.
    convert_btn.click(
        extract_text_from_pdf,
        pdf_input,
        [txt_output, status_output],
    )


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()