# Repository page header captured together with the file (not code):
#   d3p4rt's picture
#   Update app.py
#   b849bb0 verified
import sys
from unittest.mock import MagicMock

# Register stand-in modules under the flash_attn names BEFORE transformers is
# imported, so that any later `import flash_attn` (or of its
# `flash_attn_interface` submodule) resolves to a MagicMock instead of
# requiring the real package to be installed.
# NOTE(review): presumably the model's remote code imports flash_attn
# unconditionally; the model below is loaded with attn_implementation="eager",
# so the stubs should never actually be called — confirm.
sys.modules["flash_attn"] = MagicMock()
sys.modules["flash_attn.flash_attn_interface"] = MagicMock()

# These imports must stay below the stubs above; moving them up would defeat
# the stubbing.
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoProcessor
from PIL import Image
# Hub repository that supplies both the processor and the model weights.
MODEL_ID = "d3p4rt/newtype-cognition"

# Run in float16 on a CUDA device when one is available; otherwise fall back
# to float32 on the CPU.
if torch.cuda.is_available():
    DEVICE, DTYPE = "cuda", torch.float16
else:
    DEVICE, DTYPE = "cpu", torch.float32

# trust_remote_code=True lets the repository's own Python code run locally.
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Load the weights with the eager attention implementation, then move the
# model to the selected device and switch it to evaluation mode.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=DTYPE,
    attn_implementation="eager",
)
model = model.to(DEVICE)
model = model.eval()
# Task prompt tokens offered in the UI dropdown, in display order.
TASK_TOKENS = [
    "<OCR_WITH_REGION>",
    "<CAPTION>",
    "<MORE_DETAILED_CAPTION>",
    "<DETAILED_CAPTION>",
    "<OCR>",
]
def predict(image: Image.Image, task_token: str) -> str:
    """Run one task prompt against an uploaded image and return the decoded text.

    Args:
        image: PIL image supplied by the Gradio image input. Gradio passes
            ``None`` when the form is submitted without an image.
        task_token: Prompt string handed to the processor as ``text``
            (one of ``TASK_TOKENS`` when called from the UI).

    Returns:
        The first decoded sequence produced by beam search, with special
        tokens stripped.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Surface a readable message in the UI instead of an exception raised
        # from deep inside the processor.
        raise gr.Error("Please upload an image before submitting.")

    # Uploaded files may be RGBA, grayscale or palette images.
    # NOTE(review): assumes the processor expects 3-channel RGB input — confirm
    # against the model's preprocessor config.
    if image.mode != "RGB":
        image = image.convert("RGB")

    encoded = processor(text=task_token, images=image, return_tensors="pt")

    # Move every tensor to the model's device. Floating-point tensors (the
    # pixel values) are additionally cast to the model dtype: on CUDA the
    # model runs in float16, and feeding it float32 inputs raises a dtype
    # mismatch. Integer tensors such as token ids keep their dtype.
    inputs = {
        name: (
            tensor.to(DEVICE, DTYPE)
            if torch.is_floating_point(tensor)
            else tensor.to(DEVICE)
        )
        for name, tensor in encoded.items()
    }

    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=128,
            num_beams=3,
            do_sample=False,
            early_stopping=True,
        )

    return processor.batch_decode(out, skip_special_tokens=True)[0]
# Input widgets: the document image (delivered to predict() as a PIL image)
# and the task prompt selector.
_image_input = gr.Image(type="pil", label="Document Image")
_task_input = gr.Dropdown(
    choices=TASK_TOKENS,
    value="<OCR_WITH_REGION>",
    label="Task Token",
)

# Output widget showing the decoded model text.
_text_output = gr.Textbox(label="Output")

# Wire the widgets to predict() and start the web app.
demo = gr.Interface(
    fn=predict,
    inputs=[_image_input, _task_input],
    outputs=_text_output,
    title="newtype-cognition — Document OCR Captioner",
    description="Upload a document image and select a Florence-2 task token. Do not expect question-answering behavior — see model card for limitations.",
)
demo.launch()