heerjtdev commited on
Commit
5b5cfef
·
verified ·
1 Parent(s): 776d228

Upload nougat.py

Browse files
Files changed (1) hide show
  1. nougat.py +104 -0
nougat.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import gradio as gr
3
+ from transformers import pipeline
4
+ import torch
5
+ from PIL import Image
6
+ import io
7
+ import fitz # PyMuPDF
8
+
9
+ # --- Model Loading ---
10
+ # Nougat is typically used for PDF/document image OCR.
11
+ # The `facebook/nougat-small` model is a good starting point.
12
+ # Using 'facebook/nougat-base' or 'facebook/nougat-large' is more accurate but requires more GPU memory/power.
13
try:
    # Prefer the first CUDA device when one is visible; otherwise run on CPU.
    if torch.cuda.is_available():
        device = "cuda:0"
    else:
        device = "cpu"

    # Wrap the Nougat checkpoint in a Hugging Face "image-to-text" pipeline.
    nougat_pipeline = pipeline(
        "image-to-text",
        model="facebook/nougat-small",
        device=device,
        # Upper bound on the number of tokens generated per call.
        max_new_tokens=1024,
        # NOTE(review): this flag allows code shipped in the model repository
        # to be executed while loading; confirm it is really required for this
        # checkpoint before keeping it.
        trust_remote_code=True,
    )
    print(f"Nougat model loaded successfully on {device}")

except Exception as load_error:
    # Keep the module importable when loading fails: report the problem and
    # leave a None sentinel that nougat_ocr checks before running inference.
    print(f"Error loading Nougat model: {load_error}")
    nougat_pipeline = None
35
+
36
+
37
+ # --- OCR Function ---
38
def nougat_ocr(document):
    """Transcribe an uploaded image, or the first page of a PDF, with Nougat.

    Args:
        document: The upload handed over by the Gradio ``File`` input. Either
            a plain path string or an object exposing the path as ``.name``
            is accepted; ``None`` means nothing was uploaded.

    Returns:
        The text generated by the pipeline, or a human-readable error message
        when something goes wrong. Failures are returned as strings instead of
        being raised so that the UI can display them.
    """
    # Submitting the form without a file hands us None; answer politely
    # instead of crashing on the attribute access below.
    if document is None:
        return "Error: No file uploaded. Please upload an image or a PDF."

    if nougat_pipeline is None:
        return "Error: Nougat model failed to load. Check your Space hardware and dependencies."

    # Accept both a bare path string and a file-like wrapper with ``.name``.
    if isinstance(document, str):
        file_path = str(document)
    else:
        file_path = document.name
    lowered_path = file_path.lower()

    # 1. PDF: render the first page to an RGB image.
    if lowered_path.endswith(".pdf"):
        try:
            # The context manager closes the document even if rendering fails.
            with fitz.open(file_path) as doc:
                if len(doc) == 0:
                    return "Error: PDF contains no pages."

                # Render at a high DPI for better OCR quality.
                pix = doc.load_page(0).get_pixmap(dpi=300)
                img_data = pix.tobytes("png")

            # Normalise to RGB so both input branches feed the model the same
            # image mode.
            image = Image.open(io.BytesIO(img_data)).convert("RGB")
        except Exception as e:
            return f"Error processing PDF: {e}"

    # 2. Raster image: load it and normalise to RGB.
    elif lowered_path.endswith(('.png', '.jpg', '.jpeg', '.webp')):
        try:
            # convert() returns a fully loaded copy, so the file handle can be
            # released as soon as the with-block ends.
            with Image.open(file_path) as raw_image:
                image = raw_image.convert("RGB")
        except Exception as e:
            return f"Error processing image: {e}"
    else:
        return "Error: Unsupported file format. Please upload an image or a PDF."

    # 3. Run OCR inference on the prepared image.
    try:
        output = nougat_pipeline(image)
        # The output is typically a list of dicts: [{'generated_text': '...'}]
        markdown_text = output[0]['generated_text'] if output else "OCR failed to generate text."
        return markdown_text
    except Exception as e:
        return f"An error occurred during OCR: {e}"
83
+
84
+
85
+ # --- Gradio Interface ---
86
# Texts shown at the top of the Gradio page.
title = "🍫 Nougat OCR for Documents"
description = (
    "Upload a single-page document image (PNG/JPG) or a PDF to transcribe it "
    "into Markdown format using the Nougat-small model. "
    "**Note: For multi-page PDFs, only the first page is processed.**"
)

# Single-file upload restricted to images and PDFs.
document_input = gr.File(
    label="Upload Document (Image or PDF)",
    file_types=["image", ".pdf"],
    file_count="single",
)

# The returned string is displayed through a Markdown component.
markdown_output = gr.Markdown(label="Generated Markdown Output")

# Wire the OCR function to its input and output components.
iface = gr.Interface(
    fn=nougat_ocr,
    inputs=document_input,
    outputs=markdown_output,
    title=title,
    description=description,
    allow_flagging="auto",
    theme=gr.themes.Soft(),
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()