Spaces:

heerjtdev
/

nougat

Sleeping

App Files Files Community

heerjtdev commited on Nov 10, 2025

Commit

5b5cfef

verified ·

1 Parent(s): 776d228

Upload nougat.py

Browse files

Files changed (1) hide show

nougat.py +104 -0

nougat.py ADDED Viewed

	@@ -0,0 +1,104 @@

+# app.py
+import gradio as gr
+from transformers import pipeline
+import torch
+from PIL import Image
+import io
+import fitz  # PyMuPDF
+# --- Model Loading ---
+# Nougat is typically used for PDF/document image OCR.
+#The `facebook/nougat-small` model is a good starting point.
+# Using 'facebook/nougat-base' or 'facebook/nougat-large' is more accurate but requires more GPU memory/power.
+try:
+    # Set up the device based on availability
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    # Load the Nougat pipeline
+    # The task is technically 'document-image-to-text' but can be inferred by the model name
+    nougat_pipeline = pipeline(
+        "image-to-text",
+        model="facebook/nougat-small",
+        device=device,
+        # Set max_new_tokens for the output length
+        max_new_tokens=1024,
+        # Set to False to prevent a warning about the model not having an image-to-text pipeline
+        # (The pipeline can still wrap the VisionEncoderDecoder model)
+        trust_remote_code=True
+    )
+    print(f"Nougat model loaded successfully on {device}")
+except Exception as e:
+    # Fallback/error handling for model loading
+    print(f"Error loading Nougat model: {e}")
+    nougat_pipeline = None
+# --- OCR Function ---
+def nougat_ocr(document):
+    """Performs Nougat OCR on a single-page document image or PDF."""
+    if nougat_pipeline is None:
+        return "Error: Nougat model failed to load. Check your Space hardware and dependencies."
+    # Handle File object from Gradio (could be an image or a PDF)
+    file_path = document.name
+    # 1. Convert PDF (or first page of PDF) to an image
+    if file_path.lower().endswith(('.pdf')):
+        try:
+            # Open PDF using PyMuPDF (fitz)
+            doc = fitz.open(file_path)
+            if len(doc) == 0:
+                return "Error: PDF contains no pages."
+            # Render the first page at a high DPI for better OCR
+            page = doc.load_page(0)
+            pix = page.get_pixmap(dpi=300)
+            # Convert pixmap to PIL Image
+            img_data = pix.tobytes("png")
+            image = Image.open(io.BytesIO(img_data))
+            doc.close()
+        except Exception as e:
+            return f"Error processing PDF: {e}"
+    # 2. Handle image file (png, jpg, etc.)
+    elif file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.webp')):
+        image = Image.open(file_path).convert("RGB")
+    else:
+        return "Error: Unsupported file format. Please upload an image or a PDF."
+    # 3. Perform OCR inference
+    try:
+        # Pass the PIL image to the pipeline
+        output = nougat_pipeline(image)
+        # The output is typically a list of dicts: [{'generated_text': '...'}]
+        markdown_text = output[0]['generated_text'] if output else "OCR failed to generate text."
+        return markdown_text
+    except Exception as e:
+        return f"An error occurred during OCR: {e}"
+# --- Gradio Interface ---
+title = "🍫 Nougat OCR for Documents"
+description = "Upload a single-page document image (PNG/JPG) or a PDF to transcribe it into Markdown format using the Nougat-small model. **Note: For multi-page PDFs, only the first page is processed.**"
+iface = gr.Interface(
+    fn=nougat_ocr,
+    inputs=gr.File(
+        label="Upload Document (Image or PDF)",
+        file_types=["image", ".pdf"],
+        file_count="single"
+    ),
+    outputs=gr.Markdown(label="Generated Markdown Output"),
+    title=title,
+    description=description,
+    allow_flagging="auto",
+    theme=gr.themes.Soft()
+)
+if __name__ == "__main__":
+    iface.launch()