heerjtdev committed on
Commit
905c2c1
·
verified ·
1 Parent(s): e40f046

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -62
app.py CHANGED
@@ -1,85 +1,76 @@
1
  import gradio as gr
2
  from paddleocr import PaddleOCR
3
  import os
 
 
4
 
5
- # --- Configuration ---
6
- # Initialize PaddleOCR globally for efficiency.
7
- # Setting 'use_angle_cls=False' and 'use_text_cls=False' speeds up inference,
8
- # making it better suited for the default CPU tier on Hugging Face Spaces.
9
- # We are using the English model (en) for general document parsing.
10
- # PaddleOCR natively supports taking a PDF file path as input using pymupdf,
11
- # which it installs as a dependency.
12
-
13
- # Note on 'use_gpu': Hugging Face Spaces typically default to CPU.
14
- # If you deploy on a paid GPU Space, set this to True.
15
  try:
16
- ocr = PaddleOCR(lang='en', use_angle_cls=False, use_gpu=False)
 
17
  except Exception as e:
18
- # Fallback initialization in case of deployment issues
19
- print(f"Error initializing PaddleOCR: {e}. Attempting default initialization.")
20
- ocr = PaddleOCR()
21
 
22
 
23
- def process_pdf_for_ocr(pdf_file_path):
24
  """
25
- Takes a PDF file path, runs PaddleOCR on it, and returns the extracted text.
26
- Only the first page is processed for demonstration and speed on a free tier.
27
  """
28
- if pdf_file_path is None:
29
- return "Please upload a PDF file to analyze."
30
 
 
31
  print(f"Processing PDF: {pdf_file_path}")
32
-
33
- # Run OCR inference. PaddleOCR intelligently handles PDF input.
34
- # It will extract text from the first page by default.
35
- result = ocr.ocr(pdf_file_path, cls=False, det=True, rec=True)
36
 
37
- # --- Post-processing: Format the results into clean text ---
38
-
39
- extracted_text = []
40
-
41
- # PaddleOCR result format: list of pages -> list of detection results
42
- # Each detection result is: [bounding_box, (text, confidence)]
43
-
44
- # Check if result is not None and has content
45
- if result and result[0] is not None:
46
- # Assuming single-page processing for simplicity. `result[0]` is the first page.
47
- for line in result[0]:
48
- # The text is the first element of the tuple inside the list (line[1][0])
49
- text = line[1][0]
50
- extracted_text.append(text)
51
-
52
- if not extracted_text:
53
- return "OCR analysis complete, but no readable text was detected on the first page."
54
-
55
- # Join all detected lines into a single, clean block of text
56
- return "\n".join(extracted_text)
57
 
 
 
58
 
59
- # --- Gradio Interface Setup ---
60
 
61
- # Create a custom Gradio component for PDF upload
62
- pdf_input = gr.File(
63
- label="Upload PDF Document (Only the first page is processed)",
64
- file_types=[".pdf"],
65
- )
66
 
67
- # Create a text output box
68
- text_output = gr.Textbox(
69
- label="Extracted Text Results",
70
- lines=20,
71
- placeholder="The text extracted from the PDF will appear here."
72
- )
73
 
74
- # Define the Gradio Interface
75
  iface = gr.Interface(
76
  fn=process_pdf_for_ocr,
77
- inputs=pdf_input,
78
- outputs=text_output,
79
- title="PDF OCR Parser using PaddleOCR",
80
- description="Upload a PDF file, and this app will use the powerful PaddleOCR system (PP-OCRv3) to extract the text from the document (first page only for quick demo)."
 
 
 
 
 
 
 
81
  )
82
 
83
- # Launch the app
84
  if __name__ == "__main__":
85
- iface.launch()
 
 
1
  import gradio as gr
2
  from paddleocr import PaddleOCR
3
  import os
4
+ import tempfile
5
+ from pdf2image import convert_from_path
6
 
7
+ # --- Global Initialization (Fixes Applied Here) ---
8
+ # 1. Deprecation fix: 'use_angle_cls' changed to 'use_textline_orientation'
9
+ # 2. Initialization error fix: Removed 'use_gpu=False' as it caused an 'Unknown argument' error.
 
 
 
 
 
 
 
10
  try:
11
+ print("Initializing PaddleOCR (lang='en', use_textline_orientation=False)...")
12
+ ocr = PaddleOCR(lang='en', use_textline_orientation=False)
13
  except Exception as e:
14
+ print(f"Error initializing PaddleOCR: {e}. Falling back to default initialization.")
15
+ ocr = PaddleOCR(lang='en')
 
16
 
17
 
18
+ def process_pdf_for_ocr(pdf_file_obj):
19
  """
20
+ Processes the uploaded PDF file object to perform OCR and extract text.
21
+ The Gradio file object provides the file path automatically.
22
  """
23
+ if pdf_file_obj is None:
24
+ return "Please upload a PDF file."
25
 
26
+ pdf_file_path = pdf_file_obj.name
27
  print(f"Processing PDF: {pdf_file_path}")
 
 
 
 
28
 
29
+ try:
30
+ # --- Function Call Fix Applied Here (Former Line 35) ---
31
+ # The 'cls=False' argument was removed because it is no longer supported
32
+ # and caused the TypeError in PaddleOCR's internal 'predict' function.
33
+ result = ocr.ocr(pdf_file_path, det=True, rec=True)
34
+
35
+ extracted_text_lines = []
36
+
37
+ # Parse the OCR result
38
+ for page_result in result:
39
+ if page_result is None:
40
+ continue
41
+ for line in page_result:
42
+ # line format: [bounding_box, (text, confidence)]
43
+ if isinstance(line, list) and len(line) == 2 and isinstance(line[1], tuple):
44
+ extracted_text_lines.append(line[1][0])
 
 
 
 
45
 
46
+ if not extracted_text_lines:
47
+ return "OCR completed, but no text was found in the document."
48
 
49
+ return "\n".join(extracted_text_lines)
50
 
51
+ except Exception as e:
52
+ # Log the failure and return the message so it appears as the output text
53
+ error_message = f"An error occurred during OCR processing: {e}"
54
+ print(error_message)
55
+ return error_message
56
 
57
+ # --- Gradio Interface Setup ---
 
 
 
 
 
58
 
 
59
  iface = gr.Interface(
60
  fn=process_pdf_for_ocr,
61
+ inputs=gr.File(
62
+ label="Upload PDF for Multi-Page OCR",
63
+ file_types=['.pdf'],
64
+ type="filepath" # Ensure we get the path for PaddleOCR
65
+ ),
66
+ outputs=gr.Textbox(
67
+ label="Extracted Text Content",
68
+ placeholder="Upload a PDF and click Submit to see the OCR results here."
69
+ ),
70
+ title="Fixed PaddleOCR PDF Processor",
71
+ description="This application uses PaddleOCR to extract text from a multi-page PDF file. The previously encountered errors regarding unknown arguments and unsupported keywords have been fixed."
72
  )
73
 
 
74
  if __name__ == "__main__":
75
+ # Listen on all interfaces (0.0.0.0) on port 7860, matching the original deployment logs
76
+ iface.launch(server_name="0.0.0.0", server_port=7860)