Spaces:
Sleeping
Sleeping
import importlib.machinery
import sys
from unittest.mock import MagicMock

# Register placeholder modules for the optional `flash_attn` package so that
# an `import flash_attn` issued later (e.g. by model code loaded below)
# succeeds on machines where the package is not installed.
#
# A bare MagicMock refuses dunder lookups such as `__spec__`, which makes
# `importlib.util.find_spec("flash_attn")` raise
# "ValueError: flash_attn.__spec__ is not set" instead of answering. Giving
# every stub a real ModuleSpec keeps such availability probes from crashing.
# NOTE(review): the stubs are registered unconditionally and therefore also
# shadow a genuinely installed flash_attn -- presumably intentional, since the
# model is loaded with the eager attention implementation; confirm.
for _stub_name in ("flash_attn", "flash_attn.flash_attn_interface"):
    _stub_module = MagicMock()
    _stub_module.__spec__ = importlib.machinery.ModuleSpec(_stub_name, None)
    sys.modules[_stub_name] = _stub_module
| import gradio as gr | |
| import torch | |
| from transformers import AutoModelForCausalLM, AutoProcessor | |
| from PIL import Image | |
# Hub repository that supplies both the processor and the model weights.
MODEL_ID = "d3p4rt/newtype-cognition"

# Use half precision only when a CUDA device is present; CPU stays float32.
if torch.cuda.is_available():
    DEVICE, DTYPE = "cuda", torch.float16
else:
    DEVICE, DTYPE = "cpu", torch.float32

# NOTE(review): trust_remote_code=True runs Python code shipped inside the
# model repository. Only keep it for repositories you trust.
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Load once at import time, move to the selected device and switch to
# inference mode. Attention is forced to the plain "eager" implementation.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=DTYPE,
    attn_implementation="eager",
)
model = model.to(DEVICE)
model.eval()
# Prompt tokens offered in the UI dropdown, in display order. The selected
# token is passed verbatim to the processor as the text prompt.
TASK_TOKENS = [
    "<OCR_WITH_REGION>",
    "<CAPTION>",
    "<MORE_DETAILED_CAPTION>",
    "<DETAILED_CAPTION>",
    "<OCR>",
]
def predict(image: Image.Image, task_token: str) -> str:
    """Run the model on one image with the given task token.

    Args:
        image: Picture to analyse, as delivered by the Gradio image input.
        task_token: Prompt string handed to the processor as ``text``
            (one of ``TASK_TOKENS`` when called from the UI).

    Returns:
        The first decoded sequence produced by beam search, with special
        tokens stripped.

    Raises:
        gr.Error: If no image was supplied.
    """
    # Gradio passes None when the form is submitted without an upload.
    if image is None:
        raise gr.Error("Please upload an image before running the model.")

    # Guard for direct callers passing palette/greyscale/alpha images.
    # NOTE(review): presumably the processor expects 3-channel input and the
    # Gradio widget already delivers RGB -- confirm; the conversion is a no-op
    # for RGB images either way.
    if image.mode != "RGB":
        image = image.convert("RGB")

    encoded = processor(text=task_token, images=image, return_tensors="pt")

    # Move everything to the model's device. Floating-point tensors (the
    # pixel values) must additionally match the model dtype: on CUDA the
    # model is float16 while the processor emits float32, which otherwise
    # fails inside generate(). Integer tensors such as token ids keep their
    # dtype.
    inputs = {
        name: (
            tensor.to(DEVICE, DTYPE)
            if tensor.is_floating_point()
            else tensor.to(DEVICE)
        )
        for name, tensor in encoded.items()
    }

    # NOTE(review): max_new_tokens=128 may truncate OCR of dense documents,
    # and skip_special_tokens=True may drop region tokens for
    # "<OCR_WITH_REGION>" -- both kept as-is; confirm against the model card.
    with torch.no_grad():
        generated = model.generate(
            **inputs,
            max_new_tokens=128,
            num_beams=3,
            do_sample=False,
            early_stopping=True,
        )

    return processor.batch_decode(generated, skip_special_tokens=True)[0]
# UI components: an image upload plus a task selector feed `predict`, and
# its string result is shown in a single text box.
_image_input = gr.Image(type="pil", label="Document Image")
_task_input = gr.Dropdown(
    choices=TASK_TOKENS,
    value="<OCR_WITH_REGION>",
    label="Task Token",
)
_text_output = gr.Textbox(label="Output")

demo = gr.Interface(
    fn=predict,
    inputs=[_image_input, _task_input],
    outputs=_text_output,
    title="newtype-cognition — Document OCR Captioner",
    description="Upload a document image and select a Florence-2 task token. Do not expect question-answering behavior — see model card for limitations.",
)

# Start the web server as soon as the script is executed.
demo.launch()