MultiVLM-OCR

Running on Zero

App Files Files Community

Geraldine committed 4 days ago

Commit

96a1472

verified ·

1 Parent(s): 67b3520

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -0

app.py CHANGED Viewed

@@ -23,6 +23,7 @@ from transformers import (
     Qwen3VLForConditionalGeneration,
     Qwen2_5_VLForConditionalGeneration,
     AutoModelForCausalLM,
     AutoProcessor,
     AutoModel,
     AutoTokenizer,
@@ -170,6 +171,14 @@ model_m = load_model_with_attention_fallback(
 MODEL_ID_A = "mistralai/Ministral-3-8B-Instruct-2512"
 ALBERT_API_URL = "https://albert.api.etalab.gouv.fr/v1/chat/completions"
 MODEL_MAP = {
     "Nanonets-OCR2-3B": (processor_v, model_v),
     "LightOnOCR-2-1B": (processor_y, model_y),
@@ -177,6 +186,7 @@ MODEL_MAP = {
     "Qwen3-VL-4B-Instruct": (processor_m, model_m),
     "Qwen2-VL-OCR-2B": (processor_x, model_x),
     "Ministral-3-8B-Instruct-2512": (None, MODEL_ID_A),
 }
 MODEL_CHOICES = list(MODEL_MAP.keys())
@@ -578,6 +588,37 @@ def generate_image(model_name, text, image, max_new_tokens, temperature, top_p,
             inputs.pop("token_type_ids", None)
             inputs = {k: v.to(model.device) if hasattr(v, "to") else v for k, v in inputs.items()}
             generation_kwargs = {
                 **inputs,
                 "streamer": streamer,

     Qwen3VLForConditionalGeneration,
     Qwen2_5_VLForConditionalGeneration,
     AutoModelForCausalLM,
+    AutoModelForMultimodalLM,
     AutoProcessor,
     AutoModel,
     AutoTokenizer,
 MODEL_ID_A = "mistralai/Ministral-3-8B-Instruct-2512"
 ALBERT_API_URL = "https://albert.api.etalab.gouv.fr/v1/chat/completions"
+MODEL_ID_G = "google/gemma-4-E4B-it"
+processor_g = AutoProcessor.from_pretrained(MODEL_ID_G)
+model_g = load_model_with_attention_fallback(
+    AutoModelForMultimodalLM,
+    MODEL_ID_G,
+    torch_dtype="auto"
+).to(device).eval()
 MODEL_MAP = {
     "Nanonets-OCR2-3B": (processor_v, model_v),
     "LightOnOCR-2-1B": (processor_y, model_y),
     "Qwen3-VL-4B-Instruct": (processor_m, model_m),
     "Qwen2-VL-OCR-2B": (processor_x, model_x),
     "Ministral-3-8B-Instruct-2512": (None, MODEL_ID_A),
+    "gemma-4-E4B-it": (processor_g, model_g),
 }
 MODEL_CHOICES = list(MODEL_MAP.keys())
             inputs.pop("token_type_ids", None)
             inputs = {k: v.to(model.device) if hasattr(v, "to") else v for k, v in inputs.items()}
+            generation_kwargs = {
+                **inputs,
+                "streamer": streamer,
+                "max_new_tokens": int(max_new_tokens),
+                "do_sample": True,
+                "temperature": float(temperature),
+                "top_p": float(top_p),
+                "top_k": int(top_k),
+                "repetition_penalty": float(repetition_penalty),
+            }
+        elif model_name == "gemma-4-E4B-it":
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "image": image},
+                        {"type": "text", "text": text},
+                    ],
+                }
+            ]
+            inputs = processor.apply_chat_template(
+                messages,
+                tokenize=True,
+                add_generation_prompt=True,
+                return_dict=True,
+                return_tensors="pt",
+                enable_thinking=False,
+            )
+            inputs = {k: v.to(model.device) if hasattr(v, "to") else v for k, v in inputs.items()}
             generation_kwargs = {
                 **inputs,
                 "streamer": streamer,