GermanySutherland committed on
Commit
b003c87
·
verified ·
1 Parent(s): d5925a1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -0
app.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline, TrOCRProcessor, VisionEncoderDecoderModel
3
+ from PIL import Image
4
+ import fitz # PyMuPDF
5
+ import io
6
+
7
# --- Hugging Face Models ---
# 1. Optical Character Recognition (OCR) model
# TrOCR checkpoint used to read text out of images (the checkpoint name
# suggests it targets printed rather than handwritten text).
_TROCR_CHECKPOINT = "microsoft/trocr-base-printed"

# Start from "no pipeline" so that a failed load leaves a well-defined
# sentinel which the rest of the app can test before attempting OCR.
trocr_pipeline = None
try:
    processor = TrOCRProcessor.from_pretrained(_TROCR_CHECKPOINT)
    model = VisionEncoderDecoderModel.from_pretrained(_TROCR_CHECKPOINT)
    trocr_pipeline = pipeline(
        "image-to-text",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
    )
except Exception as e:
    # Best effort: report the failure and keep the app importable; the
    # sentinel above stays None in this case.
    print(f"Error loading models: {e}")
17
+
18
+ # --- Functions ---
19
def extract_text_from_pdf(pdf_file):
    """
    Extract text from a PDF by rendering each page to an image and running
    the TrOCR pipeline on that image.

    Args:
        pdf_file: The uploaded PDF handed over by Gradio. May be a
            filesystem path or an object exposing the path as ``.name``;
            ``None`` when nothing has been uploaded.

    Returns:
        A formatted string with one ``--- Page N ---`` section per page.
        If the input is missing, the model did not load, or the PDF cannot
        be opened, a human-readable error message is returned instead.
        A page whose OCR call raises is reported inline in its section.
    """
    if pdf_file is None:
        return "No PDF file uploaded. Please upload a PDF and try again."

    if trocr_pipeline is None:
        return "Model failed to load. Please check your dependencies."

    # Accept both a plain path and a temp-file wrapper carrying the path
    # in ``.name``, so the function does not depend on how the upload
    # component is configured.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Open by filename: ``stream=`` expects the raw PDF bytes, so passing a
    # path there can never succeed.
    try:
        pdf_document = fitz.open(pdf_path, filetype="pdf")
    except Exception as e:
        return f"Error opening PDF: {e}"

    extracted_pages = []
    # The context manager closes the document (and releases the underlying
    # file) even if rendering a page raises.
    with pdf_document:
        for page_num, page in enumerate(pdf_document, start=1):
            # Render the page at 3x zoom; PDF user space is 72 units per
            # inch, so this is roughly 216 DPI.
            pix = page.get_pixmap(matrix=fitz.Matrix(3, 3))

            # Convert the rendered pixmap to a PIL Image via an in-memory PNG
            img_bytes = pix.tobytes("png")
            image = Image.open(io.BytesIO(img_bytes))

            # Apply the TrOCR pipeline to the image; a failure on one page
            # is recorded in the output instead of aborting the whole run.
            try:
                extracted_text = trocr_pipeline(image, max_new_tokens=256)[0]['generated_text']
            except Exception as e:
                extracted_text = f"[OCR Failed on this page: {e}]"

            extracted_pages.append(f"--- Page {page_num} ---\n{extracted_text}\n")

    # Join all page texts into a single string
    return "\n".join(extracted_pages)
62
+
63
+ # --- Gradio UI ---
64
# Build the single-page UI: a PDF upload, a trigger button and a text area
# that receives whatever extract_text_from_pdf returns.
# NOTE(review): the original indentation was lost in this view; the button
# and textbox are assumed to sit below the upload row, not inside it.
with gr.Blocks(title="PDF Text Extractor") as demo:
    gr.Markdown("## 📄 AI PDF Text Extractor")
    gr.Markdown("Upload a PDF file to extract text from its pages using a powerful OCR model. "
                "This tool handles complex layouts, including tilted text, by "
                "first converting each page into an image.")

    with gr.Row():
        # File-type filters must be a category name or a dotted extension;
        # a bare "pdf" is neither, so use ".pdf".
        pdf_input = gr.File(label="Upload PDF File", file_types=[".pdf"])

    btn = gr.Button("Extract Text")

    text_output = gr.Textbox(label="Extracted Text", lines=20)

    # Set up the button click event
    btn.click(fn=extract_text_from_pdf, inputs=pdf_input, outputs=text_output)

demo.launch()