Spaces:

developer0hye
/

InternVL3-14B

Running on Zero

App Files Files Community

developer0hye committed on May 6, 2025

Commit

bade21f

verified ·

1 Parent(s): 38cd071

Create app.py

Browse files

Files changed (1) hide show

app.py +179 -0

app.py ADDED Viewed

	@@ -0,0 +1,179 @@

+import gradio as gr
+import spaces
+import torch
+import math
+import numpy as np
+import os
+from PIL import Image
+import torchvision.transforms as T
+from torchvision.transforms.functional import InterpolationMode
+from transformers import AutoModel, AutoTokenizer, AutoConfig
+# =============================================================================
+# InternVL‑3 preprocessing utilities (image‑only version)
+# =============================================================================
# Per-channel mean / standard deviation handed to T.Normalize below.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def build_transform(input_size: int = 448):
    """Build the per-tile preprocessing pipeline.

    The pipeline converts the image to RGB when it is in another mode,
    resizes it to ``input_size`` x ``input_size`` with bicubic
    interpolation, converts it to a tensor and normalizes it with
    ``IMAGENET_MEAN`` / ``IMAGENET_STD``. The original author notes that
    this is meant to match InternVL pre-training.

    Args:
        input_size: Edge length, in pixels, of the square output.

    Returns:
        A ``T.Compose`` object applying the steps above in order.
    """

    def _ensure_rgb(img):
        # Leave RGB images untouched; convert every other mode.
        return img if img.mode == "RGB" else img.convert("RGB")

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the tile grid whose shape best matches the image's aspect ratio.

    Args:
        aspect_ratio: Width divided by height of the source image.
        target_ratios: Iterable of ``(cols, rows)`` candidates, scanned in
            the order given.
        width: Source image width, used only for tie-breaking.
        height: Source image height, used only for tie-breaking.
        image_size: Edge length of a single square tile.

    Returns:
        The candidate with the smallest ``|aspect_ratio - cols / rows|``.
        When a later candidate ties with the current best, it replaces it
        only if the image area exceeds half of the area that candidate's
        grid would cover. ``(1, 1)`` is returned when there are no
        candidates.
    """
    image_area = width * height
    chosen = (1, 1)
    smallest_gap = float("inf")
    for candidate in target_ratios:
        cols, rows = candidate[0], candidate[1]
        gap = abs(aspect_ratio - cols / rows)
        if gap < smallest_gap:
            smallest_gap = gap
            chosen = candidate
        elif gap == smallest_gap and image_area > 0.5 * image_size * image_size * cols * rows:
            # Tie: prefer this grid only when the image is large enough
            # to justify the tiles it would use.
            chosen = candidate
    return chosen
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Cut an image into a grid of square tiles.

    Candidate grids are all ``(cols, rows)`` pairs whose product lies in
    ``[min_num, max_num]``, ordered by tile count. The grid selected by
    ``find_closest_aspect_ratio`` fixes the resize target
    (``cols * image_size`` by ``rows * image_size``); the resized image is
    then cropped row by row, left to right.

    Args:
        image: Source image exposing ``size``, ``resize`` and ``crop``.
        min_num: Smallest number of tiles a grid may contain.
        max_num: Largest number of tiles a grid may contain.
        image_size: Edge length of each square tile.
        use_thumbnail: When true and the grid has more than one tile, a
            whole-image ``image_size`` x ``image_size`` resize is appended
            after the tiles.

    Returns:
        List of tile images, optionally followed by the thumbnail.
    """
    src_w, src_h = image.size
    candidates = sorted(
        {
            (c, r)
            for n in range(min_num, max_num + 1)
            for c in range(1, n + 1)
            for r in range(1, n + 1)
            if min_num <= c * r <= max_num
        },
        key=lambda grid: grid[0] * grid[1],
    )
    grid = find_closest_aspect_ratio(src_w / src_h, candidates, src_w, src_h, image_size)
    cols, rows = grid[0], grid[1]

    resized = image.resize((image_size * cols, image_size * rows))

    tiles = []
    for row in range(rows):
        top = row * image_size
        for col in range(cols):
            left = col * image_size
            tiles.append(resized.crop((left, top, left + image_size, top + image_size)))

    if use_thumbnail and cols * rows != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles
def load_image(path: str, input_size: int = 448, max_num: int = 12):
    """Load an image file and turn it into a batch of normalized tiles.

    Args:
        path: Filesystem path of the image to read.
        input_size: Edge length of each square tile.
        max_num: Upper bound on the number of grid tiles (a thumbnail may
            be appended on top of that when the grid has several tiles).

    Returns:
        The transformed tiles stacked along a new leading dimension with
        ``torch.stack`` (described by the original author as shape
        ``(N, 3, H, W)``).
    """
    # Image.open is lazy and keeps the file open; the context manager
    # releases the handle even for multi-frame formats or when decoding
    # fails. convert() returns an independent in-memory copy.
    with Image.open(path) as source:
        img = source.convert("RGB")
    transform = build_transform(input_size)
    tiles = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
    return torch.stack([transform(t) for t in tiles])
+# =============================================================================
+# InternVL‑3‑14B model loading (multi‑GPU aware)
+# =============================================================================
MODEL_ID = "OpenGVLab/InternVL3-14B"


def split_model(model_name: str):
    """Build a device map spreading the language-model layers over GPUs.

    With fewer than two visible GPUs the string ``"auto"`` is returned.
    Otherwise GPU 0 is budgeted as half a device: it hosts the vision
    model, the projector, the embeddings, the final norm and output heads,
    and also the last transformer layer; the remaining layers are assigned
    to GPUs in index order.

    Args:
        model_name: Model identifier or path given to
            ``AutoConfig.from_pretrained`` to read the layer count.

    Returns:
        ``"auto"`` or a dict mapping module names to GPU indices.
    """
    gpu_count = torch.cuda.device_count()
    if gpu_count < 2:
        return "auto"  # let transformers decide

    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    total_layers = config.llm_config.num_hidden_layers  # type: ignore[attr-defined]

    # GPU 0 also carries the vision side, so it counts as 0.5 GPU.
    layers_per_gpu = math.ceil(total_layers / (gpu_count - 0.5))
    quotas = [math.ceil(layers_per_gpu * 0.5)] + [layers_per_gpu] * (gpu_count - 1)

    pinned_to_first_gpu = (
        "vision_model",
        "mlp1",
        "language_model.model.tok_embeddings",
        "language_model.model.embed_tokens",
        "language_model.output",
        "language_model.model.norm",
        "language_model.model.rotary_emb",
        "language_model.lm_head",
    )
    placement = dict.fromkeys(pinned_to_first_gpu, 0)

    final_layer = total_layers - 1
    start = 0
    for gpu_index, quota in enumerate(quotas):
        stop = min(start + quota, total_layers)
        for layer in range(start, stop):
            # The final layer is kept on GPU 0 regardless of its slot.
            target = 0 if layer == final_layer else gpu_index
            placement[f"language_model.model.layers.{layer}"] = target
        start = stop
    return placement
# Decide the GPU placement, then load the model in bfloat16 and switch it
# to evaluation mode. The tokenizer is loaded with the slow implementation.
device_map = split_model(MODEL_ID)
model = AutoModel.from_pretrained(
    MODEL_ID,
    device_map=device_map,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
)
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False, trust_remote_code=True)
+# =============================================================================
+# Inference function (image‑only)
+# =============================================================================
@spaces.GPU
def internvl_inference(image_path: str | None, text_input: str | None = None):
    """Answer a question about an uploaded image with the loaded model.

    Args:
        image_path: Path of the uploaded image, or ``None`` when nothing
            has been uploaded.
        text_input: Optional question; when empty or ``None`` the prompt
            consists of the image placeholder only.

    Returns:
        The value returned by ``model.chat``, or a fixed hint string when
        no image was provided.
    """
    if image_path is None:
        return "Please upload an image first."

    # The prompt always starts with the image placeholder line.
    question = "<image>\n"
    if text_input:
        question = f"<image>\n{text_input}"

    tiles = load_image(image_path, max_num=12)
    pixel_values = tiles.to(torch.bfloat16).cuda()

    generation_config = {"max_new_tokens": 1024, "do_sample": True}
    return model.chat(tokenizer, pixel_values, question, generation_config)
+# =============================================================================
+# Gradio UI (image‑only, Gradio 5 compatible)
+# =============================================================================
# Markdown shown at the top of the page.
DESCRIPTION = (
    "[InternVL 3‑14B demo](https://huggingface.co/OpenGVLab/InternVL3-14B) — "
    "upload an image and ask anything about it."
)

# Fixed-height, scrollable, bordered box for the element with id "output_text".
css = """
#output_text {
  height: 500px;
  overflow: auto;
  border: 1px solid #ccc;
}
"""

with gr.Blocks(css=css, theme="origin") as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        # Inputs on the left: image, question and the submit button.
        with gr.Column(scale=1):
            input_image = gr.Image(label="Upload Image", type="filepath")
            text_input = gr.Textbox(label="Question")
            submit_btn = gr.Button("Submit")

        # Model answer on the right.
        with gr.Column(scale=1):
            output_text = gr.Textbox(label="Model Output", elem_id="output_text")

    # Clicking the button runs inference and writes the result to the output box.
    submit_btn.click(
        fn=internvl_inference,
        inputs=[input_image, text_input],
        outputs=[output_text],
    )


if __name__ == "__main__":
    demo.launch()