Spaces:

heerjtdev
/

pddle

Sleeping

App Files Files Community

heerjtdev committed on Nov 4, 2025

Commit

58317ea

verified ·

1 Parent(s): 216e98c

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -90

app.py CHANGED Viewed

@@ -1,111 +1,85 @@
 import gradio as gr
-import pytesseract
-from PIL import Image
-from pdf2image import convert_from_path
 import os
-import tempfile
-# ----------------------------------------------------------------------
-# 1. OCR Core Function
-# ----------------------------------------------------------------------
-def perform_ocr_on_pdf(pdf_file_path, language="eng"):
-    """
-    Converts a PDF file to images and performs OCR on each page.
-    Args:
-        pdf_file_path (str): The file path to the uploaded PDF.
-        language (str): The Tesseract language code (e.g., 'eng', 'fra+deu').
-    Returns:
-        str: The combined extracted text from all PDF pages.
     """
     if pdf_file_path is None:
-        return "Please upload a PDF file."
-    extracted_text = []
-    try:
-        # 1. Convert PDF pages to PIL images (requires poppler-utils, installed via Dockerfile)
-        # Setting a high DPI (300) improves OCR accuracy for scanned documents.
-        images = convert_from_path(pdf_file_path, dpi=300)
-        # 2. Iterate through each page image and perform OCR
-        for i, image in enumerate(images):
-            # Using tempfile to save the image is sometimes necessary for pytesseract,
-            # though convert_from_path often returns PIL objects directly.
-            # We'll use the PIL object directly for efficiency.
-            # Perform OCR on the image
-            page_text = pytesseract.image_to_string(image, lang=language)
-            extracted_text.append(f"--- PAGE {i+1} ---\n{page_text}\n")
-        return "\n".join(extracted_text)
-    except pytesseract.TesseractNotFoundError:
-        return "Error: Tesseract is not installed or not in PATH. This should be handled by the Dockerfile."
-    except Exception as e:
-        return f"An error occurred during OCR processing: {str(e)}"
-# ----------------------------------------------------------------------
-# 2. Gradio Interface
-# ----------------------------------------------------------------------
-# Define the supported languages for the dropdown
-LANGUAGES = {
-    "English": "eng",
-    "Spanish": "spa",
-    "French": "fra",
-    "German": "deu",
-    "Japanese": "jpn",
-    "Chinese (Simplified)": "chi_sim"
-}
-# Create the Gradio interface components
 pdf_input = gr.File(
-    label="Upload PDF Document",
     file_types=[".pdf"],
-    type="filepath",
-    interactive=True
 )
-lang_dropdown = gr.Dropdown(
-    label="Select OCR Language",
-    choices=list(LANGUAGES.keys()),
-    value="English",
-    type="value",
-    interactive=True
 )
-ocr_output = gr.Textbox(
-    label="Extracted Text (Output)",
-    lines=25,
-    max_lines=30,
-    show_copy_button=True,
-    placeholder="Extracted text will appear here...",
 )
-# Custom wrapper to map the dropdown name back to the Tesseract code
-def lang_wrapper(file_path, lang_name):
-    lang_code = LANGUAGES.get(lang_name, "eng")
-    return perform_ocr_on_pdf(file_path, lang_code)
-# Create the Gradio Interface
-gr.Interface(
-    fn=lang_wrapper,
-    inputs=[pdf_input, lang_dropdown],
-    outputs=ocr_output,
-    title="PDF Optical Character Recognition (OCR) App",
-    description=(
-        "Upload a PDF file to extract text from it using Tesseract OCR. "
-        "Select the primary language to improve accuracy. "
-        "Note: Requires Tesseract and Poppler system dependencies."
-    ),
-    allow_flagging="never",
-    theme=gr.themes.Soft(primary_hue="blue").set(
-        body_background_fill="#f5f7fa",
-        background_fill_primary="#ffffff",
-        shadow_drop_lg="0 10px 15px -3px rgba(0,0,0,0.1), 0 4px 6px -2px rgba(0,0,0,0.05)",
-    )
-).launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
+from paddleocr import PaddleOCR
 import os
+# --- Configuration ---
+# Initialize PaddleOCR globally for efficiency.
+# Setting 'use_angle_cls=False' disables the text-angle classifier, which speeds up
+# inference and makes it better suited for the default CPU tier on Hugging Face Spaces.
+# We are using the English model (en) for general document parsing.
+# PaddleOCR natively supports taking a PDF file path as input using pymupdf,
+# which it installs as a dependency.
+# Note on 'use_gpu': Hugging Face Spaces typically default to CPU.
+# If you deploy on a paid GPU Space, set this to True.
+try:
+    ocr = PaddleOCR(lang='en', use_angle_cls=False, use_gpu=False)
+except Exception as e:
+    # Fallback initialization in case of deployment issues
+    print(f"Error initializing PaddleOCR: {e}. Attempting default initialization.")
+    ocr = PaddleOCR()
+def process_pdf_for_ocr(pdf_file_path):
+    """
+    Takes a PDF file path, runs PaddleOCR on it, and returns the extracted text.
+    Only the first page's result (result[0]) is included in the returned text.
     """
     if pdf_file_path is None:
+        return "Please upload a PDF file to analyze."
+    print(f"Processing PDF: {pdf_file_path}")
+    # Run OCR inference; the PDF path is passed directly to PaddleOCR.
+    # The result holds one entry per page; only the first entry is used below.
+    result = ocr.ocr(pdf_file_path, cls=False, det=True, rec=True)
+    # --- Post-processing: Format the results into clean text ---
+    extracted_text = []
+    # PaddleOCR result format: list of pages -> list of detection results
+    # Each detection result is: [bounding_box, (text, confidence)]
+    # Check if result is not None and has content
+    if result and result[0] is not None:
+        # Assuming single-page processing for simplicity. `result[0]` is the first page.
+        for line in result[0]:
+            # The text is the first element of the tuple inside the list (line[1][0])
+            text = line[1][0]
+            extracted_text.append(text)
+    if not extracted_text:
+        return "OCR analysis complete, but no readable text was detected on the first page."
+    # Join all detected lines into a single, clean block of text
+    return "\n".join(extracted_text)
+# --- Gradio Interface Setup ---
+# Create the Gradio file-upload component, restricted to .pdf files
 pdf_input = gr.File(
+    label="Upload PDF Document (Only the first page is processed)",
     file_types=[".pdf"],
 )
+# Create a text output box
+text_output = gr.Textbox(
+    label="Extracted Text Results",
+    lines=20,
+    placeholder="The text extracted from the PDF will appear here."
 )
+# Define the Gradio Interface
+iface = gr.Interface(
+    fn=process_pdf_for_ocr,
+    inputs=pdf_input,
+    outputs=text_output,
+    title="PDF OCR Parser using PaddleOCR",
+    description="Upload a PDF file, and this app will use the powerful PaddleOCR system (PP-OCRv3) to extract the text from the document (first page only for quick demo)."
 )
+# Launch the app
+if __name__ == "__main__":
+    iface.launch()