GermanySutherland committed on
Commit
25f374f
·
verified ·
1 Parent(s): fc7a5ba

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -67
app.py CHANGED
@@ -1,80 +1,41 @@
1
  import gradio as gr
2
- from transformers import pipeline, TrOCRProcessor, VisionEncoderDecoderModel
 
3
  from PIL import Image
4
- import fitz # PyMuPDF
5
- import io
6
 
7
- # --- Hugging Face Models ---
8
- # 1. Optical Character Recognition (OCR) model
9
- # This model is specifically trained to read text from images.
10
- try:
11
- processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
12
- model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")
13
- trocr_pipeline = pipeline("image-to-text", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor)
14
- except Exception as e:
15
- print(f"Error loading models: {e}")
16
- trocr_pipeline = None
17
 
18
- # --- Functions ---
19
- def extract_text_from_pdf(pdf_file):
20
- """
21
- Extracts text from a PDF file by rendering each page to an image and
22
- then applying a TrOCR model for text extraction.
23
 
24
- Args:
25
- pdf_file: The uploaded PDF file object from Gradio.
26
 
27
- Returns:
28
- A formatted string of the extracted text.
29
- """
30
- if not trocr_pipeline:
31
- return "Model failed to load. Please check your dependencies."
32
 
33
- extracted_pages = []
34
- # Open the PDF file using PyMuPDF (fitz)
35
- try:
36
- pdf_document = fitz.open(stream=pdf_file.name, filetype="pdf")
37
- except Exception as e:
38
- return f"Error opening PDF: {e}"
39
 
40
- # Loop through each page of the PDF
41
- for page_num in range(pdf_document.page_count):
42
- page = pdf_document.load_page(page_num)
43
 
44
- # Render the page as a high-resolution image (3x zoom of the 72 DPI base, about 216 DPI)
45
- pix = page.get_pixmap(matrix=fitz.Matrix(3, 3))
46
 
47
- # Convert the image to a PIL Image object
48
- img_bytes = pix.tobytes("png")
49
- image = Image.open(io.BytesIO(img_bytes))
50
-
51
- # Apply the TrOCR pipeline to the image
52
- try:
53
- # The pipeline automatically handles the model and tokenizer
54
- extracted_text = trocr_pipeline(image, max_new_tokens=256)[0]['generated_text']
55
- except Exception as e:
56
- extracted_text = f"[OCR Failed on this page: {e}]"
57
-
58
- extracted_pages.append(f"--- Page {page_num + 1} ---\n{extracted_text}\n")
59
-
60
- # Join all page texts into a single string
61
- return "\n".join(extracted_pages)
62
-
63
- # --- Gradio UI ---
64
- with gr.Blocks(title="PDF Text Extractor") as demo:
65
- gr.Markdown("## 📄 AI PDF Text Extractor")
66
- gr.Markdown("Upload a PDF file to extract text from its pages using a powerful OCR model. "
67
- "This tool handles complex layouts, including tilted text, by "
68
- "first converting each page into an image.")
69
-
70
- with gr.Row():
71
- pdf_input = gr.File(label="Upload PDF File", file_types=["pdf"])
72
-
73
  btn = gr.Button("Extract Text")
74
-
75
- text_output = gr.Textbox(label="Extracted Text", lines=20)
76
-
77
- # Set up the button click event
78
- btn.click(fn=extract_text_from_pdf, inputs=pdf_input, outputs=text_output)
79
 
80
  demo.launch()
 
1
  import gradio as gr
2
+ import fitz # PyMuPDF
3
+ from transformers import TrOCRProcessor, VisionEncoderDecoderModel
4
  from PIL import Image
5
+ from textblob import TextBlob
 
6
 
7
+ # Load lightweight Hugging Face OCR model
8
+ processor = TrOCRProcessor.from_pretrained("microsoft/trocr-small-stage1")
9
+ model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-small-stage1")
 
 
 
 
 
 
 
10
 
11
def _open_pdf(source):
    """Open *source* as a PyMuPDF document.

    Gradio hands the callback a different value depending on its version and
    on the ``type`` of the ``File`` component, so every form is accepted:
    raw bytes, a filesystem path, or a file-like object exposing ``read()``.
    """
    import os  # local import keeps this block independent of the file header

    if isinstance(source, (bytes, bytearray)):
        return fitz.open(stream=bytes(source), filetype="pdf")
    if isinstance(source, (str, os.PathLike)):
        return fitz.open(os.fspath(source))
    # Fallback: temp-file wrappers and other binary file-like objects.
    return fitz.open(stream=source.read(), filetype="pdf")


def pdf_to_text(pdf_file):
    """Run OCR on every page of an uploaded PDF and return the corrected text.

    Args:
        pdf_file: Value delivered by the Gradio ``File`` component: raw
            ``bytes``, a path string, or a file-like object with ``read()``.

    Returns:
        The spell-corrected text of each page, joined by blank lines, or
        ``"No PDF uploaded."`` when nothing was provided.
    """
    if not pdf_file:
        return "No PDF uploaded."

    all_text = []
    # The context manager closes the document even if OCR raises midway.
    with _open_pdf(pdf_file) as doc:
        for page in doc:
            # alpha=False keeps the sample buffer at 3 bytes per pixel, which
            # is the layout Image.frombytes("RGB", ...) expects.
            pix = page.get_pixmap(alpha=False)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            # OCR inference.
            # NOTE(review): TrOCR checkpoints are published for single
            # text-line images; feeding a whole page is kept as in the
            # original, but quality on multi-line pages should be verified.
            pixel_values = processor(images=img, return_tensors="pt").pixel_values
            generated_ids = model.generate(pixel_values)
            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

            # Spell correction of the raw OCR output.
            all_text.append(str(TextBlob(text).correct()))

    return "\n\n".join(all_text)


with gr.Blocks() as demo:
    gr.Markdown("## 📄 Robust PDF OCR MVP (Handles Tilted Words)")
    # type="binary" is accepted by both Gradio 3.x and 4.x; the former
    # type="file" value is rejected by Gradio 4 and later.
    pdf_input = gr.File(label="Upload PDF", type="binary", file_types=[".pdf"])
    btn = gr.Button("Extract Text")
    output = gr.Textbox(label="Extracted Text", lines=15)

    btn.click(fn=pdf_to_text, inputs=pdf_input, outputs=output)

demo.launch()