heerjtdev committed on
Commit
58317ea
·
verified ·
1 Parent(s): 216e98c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -90
app.py CHANGED
@@ -1,111 +1,85 @@
1
  import gradio as gr
2
- import pytesseract
3
- from PIL import Image
4
- from pdf2image import convert_from_path
5
  import os
6
- import tempfile
7
 
8
- # ----------------------------------------------------------------------
9
- # 1. OCR Core Function
10
- # ----------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- def perform_ocr_on_pdf(pdf_file_path, language="eng"):
13
- """
14
- Converts a PDF file to images and performs OCR on each page.
15
 
16
- Args:
17
- pdf_file_path (str): The file path to the uploaded PDF.
18
- language (str): The Tesseract language code (e.g., 'eng', 'fra+deu').
19
-
20
- Returns:
21
- str: The combined extracted text from all PDF pages.
22
  """
23
  if pdf_file_path is None:
24
- return "Please upload a PDF file."
25
-
26
- extracted_text = []
27
 
28
- try:
29
- # 1. Convert PDF pages to PIL images (requires poppler-utils, installed via Dockerfile)
30
- # Setting a high DPI (300) improves OCR accuracy for scanned documents.
31
- images = convert_from_path(pdf_file_path, dpi=300)
 
32
 
33
- # 2. Iterate through each page image and perform OCR
34
- for i, image in enumerate(images):
35
- # Using tempfile to save the image is sometimes necessary for pytesseract,
36
- # though convert_from_path often returns PIL objects directly.
37
- # We'll use the PIL object directly for efficiency.
38
-
39
- # Perform OCR on the image
40
- page_text = pytesseract.image_to_string(image, lang=language)
 
 
 
 
 
 
41
 
42
- extracted_text.append(f"--- PAGE {i+1} ---\n{page_text}\n")
43
-
44
- return "\n".join(extracted_text)
 
 
45
 
46
- except pytesseract.TesseractNotFoundError:
47
- return "Error: Tesseract is not installed or not in PATH. This should be handled by the Dockerfile."
48
- except Exception as e:
49
- return f"An error occurred during OCR processing: {str(e)}"
50
 
51
- # ----------------------------------------------------------------------
52
- # 2. Gradio Interface
53
- # ----------------------------------------------------------------------
54
 
55
- # Define the supported languages for the dropdown
56
- LANGUAGES = {
57
- "English": "eng",
58
- "Spanish": "spa",
59
- "French": "fra",
60
- "German": "deu",
61
- "Japanese": "jpn",
62
- "Chinese (Simplified)": "chi_sim"
63
- }
64
-
65
- # Create the Gradio interface components
66
  pdf_input = gr.File(
67
- label="Upload PDF Document",
68
  file_types=[".pdf"],
69
- type="filepath",
70
- interactive=True
71
  )
72
 
73
- lang_dropdown = gr.Dropdown(
74
- label="Select OCR Language",
75
- choices=list(LANGUAGES.keys()),
76
- value="English",
77
- type="value",
78
- interactive=True
79
  )
80
 
81
- ocr_output = gr.Textbox(
82
- label="Extracted Text (Output)",
83
- lines=25,
84
- max_lines=30,
85
- show_copy_button=True,
86
- placeholder="Extracted text will appear here...",
 
87
  )
88
 
89
- # Custom wrapper to map the dropdown name back to the Tesseract code
90
- def lang_wrapper(file_path, lang_name):
91
- lang_code = LANGUAGES.get(lang_name, "eng")
92
- return perform_ocr_on_pdf(file_path, lang_code)
93
-
94
- # Create the Gradio Interface
95
- gr.Interface(
96
- fn=lang_wrapper,
97
- inputs=[pdf_input, lang_dropdown],
98
- outputs=ocr_output,
99
- title="PDF Optical Character Recognition (OCR) App",
100
- description=(
101
- "Upload a PDF file to extract text from it using Tesseract OCR. "
102
- "Select the primary language to improve accuracy. "
103
- "Note: Requires Tesseract and Poppler system dependencies."
104
- ),
105
- allow_flagging="never",
106
- theme=gr.themes.Soft(primary_hue="blue").set(
107
- body_background_fill="#f5f7fa",
108
- background_fill_primary="#ffffff",
109
- shadow_drop_lg="0 10px 15px -3px rgba(0,0,0,0.1), 0 4px 6px -2px rgba(0,0,0,0.05)",
110
- )
111
- ).launch(server_name="0.0.0.0", server_port=7860)
 
1
  import gradio as gr
2
+ from paddleocr import PaddleOCR
 
 
3
  import os
 
4
 
5
+ # --- Configuration ---
6
+ # Initialize PaddleOCR globally for efficiency.
7
+ # Setting 'use_angle_cls=False' speeds up inference,
8
+ # making it better suited for the default CPU tier on Hugging Face Spaces.
9
+ # We are using the English model (en) for general document parsing.
10
+ # PaddleOCR natively supports taking a PDF file path as input using pymupdf,
11
+ # which it installs as a dependency.
12
+
13
+ # Note on 'use_gpu': Hugging Face Spaces typically default to CPU.
14
+ # If you deploy on a paid GPU Space, set this to True.
15
+ try:
16
+ ocr = PaddleOCR(lang='en', use_angle_cls=False, use_gpu=False)
17
+ except Exception as e:
18
+ # Fallback initialization in case of deployment issues
19
+ print(f"Error initializing PaddleOCR: {e}. Attempting default initialization.")
20
+ ocr = PaddleOCR()
21
 
 
 
 
22
 
23
+ def process_pdf_for_ocr(pdf_file_path):
24
+ """
25
+ Takes a PDF file path, runs PaddleOCR on it, and returns the extracted text.
26
+ Only the first page is processed for demonstration and speed on a free tier.
 
 
27
  """
28
  if pdf_file_path is None:
29
+ return "Please upload a PDF file to analyze."
 
 
30
 
31
+ print(f"Processing PDF: {pdf_file_path}")
32
+
33
+ # Run OCR inference. PaddleOCR intelligently handles PDF input.
34
+ # Only the first page's result (result[0]) is read from the output below.
35
+ result = ocr.ocr(pdf_file_path, cls=False, det=True, rec=True)
36
 
37
+ # --- Post-processing: Format the results into clean text ---
38
+
39
+ extracted_text = []
40
+
41
+ # PaddleOCR result format: list of pages -> list of detection results
42
+ # Each detection result is: [bounding_box, (text, confidence)]
43
+
44
+ # Check if result is not None and has content
45
+ if result and result[0] is not None:
46
+ # Assuming single-page processing for simplicity. `result[0]` is the first page.
47
+ for line in result[0]:
48
+ # The text is the first element of the tuple inside the list (line[1][0])
49
+ text = line[1][0]
50
+ extracted_text.append(text)
51
 
52
+ if not extracted_text:
53
+ return "OCR analysis complete, but no readable text was detected on the first page."
54
+
55
+ # Join all detected lines into a single, clean block of text
56
+ return "\n".join(extracted_text)
57
 
 
 
 
 
58
 
59
+ # --- Gradio Interface Setup ---
 
 
60
 
61
+ # Create the Gradio file-upload component for the PDF
 
 
 
 
 
 
 
 
 
 
62
  pdf_input = gr.File(
63
+ label="Upload PDF Document (Only the first page is processed)",
64
  file_types=[".pdf"],
 
 
65
  )
66
 
67
+ # Create a text output box
68
+ text_output = gr.Textbox(
69
+ label="Extracted Text Results",
70
+ lines=20,
71
+ placeholder="The text extracted from the PDF will appear here."
 
72
  )
73
 
74
+ # Define the Gradio Interface
75
+ iface = gr.Interface(
76
+ fn=process_pdf_for_ocr,
77
+ inputs=pdf_input,
78
+ outputs=text_output,
79
+ title="PDF OCR Parser using PaddleOCR",
80
+ description="Upload a PDF file, and this app will use the powerful PaddleOCR system (PP-OCRv3) to extract the text from the document (first page only for quick demo)."
81
  )
82
 
83
+ # Launch the app
84
+ if __name__ == "__main__":
85
+ iface.launch()