MultiVLM-OCR

Running on Zero

Geraldine committed 22 days ago

Commit

28c56d5

verified ·

1 Parent(s): ccde86d

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -51,12 +51,12 @@ model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 ).to(device).eval()
 MODEL_ID_Y = "rednote-hilab/dots.ocr"
-processor_y = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
 model_y = AutoModelForCausalLM.from_pretrained(
     MODEL_ID_Y,
     attn_implementation="kernels-community/flash-attn2",
     trust_remote_code=True,
-    torch_dtype=torch.float16
 ).to(device).eval()
 MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
@@ -319,6 +319,22 @@ def calc_timeout_duration(*args, **kwargs):
         return 60
 @spaces.GPU(duration=calc_timeout_duration)
 def generate_image(model_name, text, image, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout):
     try:
@@ -359,6 +375,7 @@ def generate_image(model_name, text, image, max_new_tokens, temperature, top_p,
             truncation=True,
             max_length=MAX_INPUT_TOKEN_LENGTH
         ).to(device)
         streamer = TextIteratorStreamer(
             processor.tokenizer if hasattr(processor, "tokenizer") else processor,

 ).to(device).eval()
 MODEL_ID_Y = "rednote-hilab/dots.ocr"
+processor_y = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
 model_y = AutoModelForCausalLM.from_pretrained(
     MODEL_ID_Y,
     attn_implementation="kernels-community/flash-attn2",
     trust_remote_code=True,
+    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
 ).to(device).eval()
 MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
         return 60
+def align_inputs_to_model_dtype(inputs, model):  # Cast floating-point tensors in `inputs` to the model's dtype and return `inputs`.
+    model_dtype = getattr(model, "dtype", None)  # Prefer the model's own `dtype` attribute when it has one.
+    if model_dtype is None:
+        try:
+            model_dtype = next(model.parameters()).dtype  # Fall back to the dtype of the first parameter.
+        except StopIteration:  # `model.parameters()` yielded nothing, so no dtype can be determined.
+            model_dtype = None
+    if model_dtype is None:
+        return inputs  # Nothing to align against; hand the inputs back untouched.
+    for key, value in list(inputs.items()):  # Iterate over a snapshot because entries are reassigned below.
+        if torch.is_tensor(value) and value.is_floating_point():  # Non-tensor and non-floating-point values are left alone.
+            inputs[key] = value.to(dtype=model_dtype)
+    return inputs  # The same mapping object that was passed in, with matching entries replaced.
 @spaces.GPU(duration=calc_timeout_duration)
 def generate_image(model_name, text, image, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout):
     try:
             truncation=True,
             max_length=MAX_INPUT_TOKEN_LENGTH
         ).to(device)
+        inputs = align_inputs_to_model_dtype(inputs, model)
         streamer = TextIteratorStreamer(
             processor.tokenizer if hasattr(processor, "tokenizer") else processor,