"""Gradio demo that runs the ``d3p4rt/newtype-cognition`` model on a document image.

The script loads the processor and model once at import time, exposes a
single ``predict`` function through a ``gr.Interface``, and launches the app.
"""

import sys
from unittest.mock import MagicMock

# Register stand-ins for flash_attn *before* transformers is imported.
# NOTE(review): presumably the model's remote code imports flash_attn
# unconditionally; the stubs let that import succeed while the model itself
# is loaded with attn_implementation="eager" below -- confirm.
sys.modules["flash_attn"] = MagicMock()
sys.modules["flash_attn.flash_attn_interface"] = MagicMock()

import gradio as gr  # noqa: E402  (must come after the flash_attn stubs)
import torch  # noqa: E402
from transformers import AutoModelForCausalLM, AutoProcessor  # noqa: E402
from PIL import Image  # noqa: E402

MODEL_ID = "d3p4rt/newtype-cognition"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision only on GPU; CPU runs in full precision.
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=DTYPE,
    attn_implementation="eager",
).to(DEVICE).eval()

# NOTE(review): every entry here is an empty string, and so is the dropdown
# default below. This looks like the task tokens (angle-bracket tags) were
# stripped somewhere upstream -- restore them from the model card.
TASK_TOKENS = [
    "",
    "",
    "",
    "",
    "",
]


def predict(image: Image.Image, task_token: str) -> str:
    """Generate text for ``image`` using ``task_token`` as the prompt.

    Args:
        image: The uploaded document image. Gradio passes ``None`` when the
            user submits without uploading anything.
        task_token: The prompt string selected in the dropdown.

    Returns:
        The first decoded sequence produced by beam search, with special
        tokens removed.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Surface a readable message in the UI instead of a processor traceback.
        raise gr.Error("Please upload a document image first.")

    # Uploaded files may be RGBA, palette or grayscale; normalise to
    # three-channel RGB before preprocessing.
    if image.mode != "RGB":
        image = image.convert("RGB")

    encoded = processor(text=task_token, images=image, return_tensors="pt")

    # Move everything to the model's device. Floating-point tensors are also
    # cast to the model's dtype: on CUDA the model is float16, and feeding it
    # float32 inputs fails with a dtype mismatch. Integer tensors (token ids)
    # keep their dtype.
    inputs = {
        name: (
            tensor.to(DEVICE, DTYPE)
            if tensor.is_floating_point()
            else tensor.to(DEVICE)
        )
        for name, tensor in encoded.items()
    }

    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=128,
            num_beams=3,
            do_sample=False,
            early_stopping=True,
        )
    return processor.batch_decode(out, skip_special_tokens=True)[0]


demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Document Image"),
        gr.Dropdown(choices=TASK_TOKENS, value="", label="Task Token"),
    ],
    outputs=gr.Textbox(label="Output"),
    title="newtype-cognition — Document OCR Captioner",
    description="Upload a document image and select a Florence-2 task token. Do not expect question-answering behavior — see model card for limitations.",
)

demo.launch()