lolhaha002 committed on
Commit
3aa2d23
·
verified ·
1 Parent(s): 0919904

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -0
app.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from pdf2image import convert_from_path
3
+ from PIL import Image
4
+ import pytesseract
5
+ import os
6
+
7
# Tesseract language code passed to every OCR call ("guj" selects Gujarati).
OCR_LANG = "guj"
9
+
10
def extract_gujarati_text(pdf_file, page_number):
    """Run OCR on a single page of an uploaded PDF and return its text.

    Args:
        pdf_file: The uploaded PDF. Either an object exposing the on-disk
            path as ``.name`` (what this function has always read) or a
            plain path string.
        page_number: 1-based page to read. Integral floats such as ``3.0``
            are accepted, since a numeric UI field may deliver them.

    Returns:
        The text recognised on that page, using the ``OCR_LANG`` language.

    Raises:
        ValueError: If no file was supplied, if ``page_number`` is not a
            whole number >= 1, or if no page could be rendered for it.
    """
    if pdf_file is None:
        raise ValueError("Please upload a PDF file first.")
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Normalise the page number up front so a float like 2.0 never leaks
    # into the converter arguments, and bad input fails with a clear message.
    try:
        page = int(page_number)
    except (TypeError, ValueError, OverflowError):
        raise ValueError(
            f"Page number must be a whole number, got {page_number!r}."
        ) from None
    if page != page_number or page < 1:
        raise ValueError(
            f"Page number must be a whole number >= 1, got {page_number!r}."
        )

    # Rasterise only the requested page rather than the whole document.
    images = convert_from_path(pdf_path, first_page=page, last_page=page)
    if not images:
        # Nothing was rendered, e.g. the page number is past the last page.
        raise ValueError(f"Page {page} is out of range for this PDF.")

    # Hand the in-memory image straight to Tesseract. Writing it to a fixed
    # /tmp/page_N.png first would let concurrent requests overwrite each
    # other's page and would leave the file behind afterwards.
    return pytesseract.image_to_string(images[0], lang=OCR_LANG)
20
+
21
# Single-screen UI: upload a PDF, choose a page, and show the OCR result.
with gr.Blocks() as demo:
    gr.Markdown("## 📚 Gujarati OCR from PDF (Tesseract-powered)")

    pdf = gr.File(label="📤 Upload Gujarati PDF", file_types=[".pdf"])
    # precision=0 asks the component to round to an integer so the callback
    # receives a whole page number.
    page = gr.Number(
        label="📄 Page Number", minimum=1, value=1, step=1, precision=0
    )
    button = gr.Button("🔍 Extract Text")
    output = gr.Textbox(label="📝 Extracted Gujarati Text", lines=20)

    # Clicking the button runs OCR on the chosen page and fills the textbox.
    button.click(fn=extract_gujarati_text, inputs=[pdf, page], outputs=output)

# Start the web server when the script is executed.
demo.launch()