lolhaha002 committed on
Commit
6752cd0
·
verified ·
1 Parent(s): 5c23bea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -12
app.py CHANGED
@@ -3,23 +3,44 @@ from pdf2image import convert_from_path
3
  from PIL import Image
4
  import pytesseract
5
 
6
- OCR_LANG = "guj"
 
 
7
 
8
  def extract_gujarati_text(pdf_file, page_number):
9
- images = convert_from_path(pdf_file.name, first_page=page_number, last_page=page_number)
10
- image = images[0]
11
- text = pytesseract.image_to_string(image, lang=OCR_LANG)
12
- return text, image # Returning both OCR text and snapshot
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  with gr.Blocks() as demo:
15
- gr.Markdown("## πŸ“š Gujarati OCR from PDF (with Page Snapshot)")
16
- pdf = gr.File(label="πŸ“€ Upload Gujarati PDF", file_types=[".pdf"])
17
- page = gr.Number(label="πŸ“„ Page Number", minimum=1, value=1, step=1)
18
- button = gr.Button("πŸ” Extract Text")
19
-
20
  with gr.Row():
21
- image_output = gr.Image(label="πŸ–ΌοΈ PDF Page Snapshot")
22
- text_output = gr.Textbox(label="πŸ“ Extracted Gujarati Text", lines=20)
 
 
 
 
 
 
 
 
 
 
23
 
24
  button.click(fn=extract_gujarati_text, inputs=[pdf, page], outputs=[text_output, image_output])
25
 
 
3
  from PIL import Image
4
  import pytesseract
5
 
6
+ # UPDATED: Added 'eng' to support mixed words (e.g., "Statistics", "Mean")
7
+ # The '+' operator tells Tesseract to look for both languages.
8
+ OCR_LANG = "guj+eng"
9
 
10
  def extract_gujarati_text(pdf_file, page_number):
11
+ try:
12
+ # Convert PDF page to image
13
+ images = convert_from_path(pdf_file.name, first_page=page_number, last_page=page_number)
14
+ image = images[0]
15
+
16
+ # UPDATED: added configuration
17
+ # --psm 3: Fully automatic page segmentation, but no OSD. (Good for mixed layouts)
18
+ # --oem 3: Default OCR Engine Mode.
19
+ custom_config = r'--oem 3 --psm 3'
20
+
21
+ text = pytesseract.image_to_string(image, lang=OCR_LANG, config=custom_config)
22
+ return text, image
23
+
24
+ except Exception as e:
25
+ return f"Error: {str(e)}", None
26
 
27
  with gr.Blocks() as demo:
28
+ gr.Markdown("## πŸ“š Gujarati + English OCR (Textbook Friendly)")
29
+ gr.Markdown("Upload your Maths/Stats textbook page. This tool now supports English words mixed with Gujarati.")
30
+
 
 
31
  with gr.Row():
32
+ with gr.Column():
33
+ pdf = gr.File(label="πŸ“€ Upload PDF", file_types=[".pdf"])
34
+ page = gr.Number(label="πŸ“„ Page Number", minimum=1, value=1, step=1)
35
+ button = gr.Button("πŸ” Extract Text", variant="primary")
36
+
37
+ with gr.Column():
38
+ image_output = gr.Image(label="πŸ–ΌοΈ Page Preview", type="pil")
39
+ text_output = gr.Textbox(
40
+ label="πŸ“ Extracted Text (Copyable)",
41
+ lines=20,
42
+ show_copy_button=True # Added a copy button for ease
43
+ )
44
 
45
  button.click(fn=extract_gujarati_text, inputs=[pdf, page], outputs=[text_output, image_output])
46