Commit 6b1d95e
Parent(s): ebce892

only vlm

Files changed:
- app.py +32 -113
- inference.py +0 -75
- vlm_inference.py +40 -21
app.py
CHANGED
@@ -2,57 +2,22 @@
 import gradio as gr
 import spaces
 import torch
-import tiktoken
-from huggingface_hub import hf_hub_download
 from PIL import Image
 
-from model import GPT, ModelConfig
-from inference import generate_stream
 from vlm_inference import (
-    build_vlm_model,
+    load_vlm_model,
     vlm_infer_stream,
     image_processor,
 )
 
 # =====================================================
-#
+# Load model on CPU (ZeroGPU)
 # =====================================================
-
-TEXT_FILENAME = "model_sft.pt"
-
-text_ckpt_path = hf_hub_download(
-    repo_id=TEXT_REPO_ID,
-    filename=TEXT_FILENAME,
-)
-
-text_state_dict = torch.load(text_ckpt_path, map_location="cpu")
-
-text_config = ModelConfig(
-    embedding_dim=1280,
-    hidden_dim=5120,
-    num_attention_heads=10,
-    layer_count=20,
-    max_sequence_length=2048,
-    rope_theta=1_000_000.0,
-    vocab_size=50257,
-)
-
-text_model = GPT(text_config)
-text_model.load_state_dict(text_state_dict)
-text_model.eval()
-
-tokenizer = tiktoken.get_encoding("gpt2")
-EOS_ID = 50256
-
-
-# =====================================================
-# Vision-Language Model load (CPU)
-# =====================================================
-vlm_model = build_vlm_model()  # CPU load, frozen
+model = load_vlm_model()  # CPU load, eval
 
 
 # =====================================================
-#
+# GPU inference (VLM only)
 # =====================================================
 @spaces.GPU
 def chat_fn(

@@ -63,80 +28,37 @@ def chat_fn(
     top_p,
     top_k,
 ):
-    device = "cuda"
-
-    # ==============================
-    # Text-only route
-    # ==============================
     if image is None:
-        # reset KV cache
-        for block in model_gpu.blocks:
-            block.multihead_attention.reset_cache()
-
-        prompt = (
-            "<user>\n"
-            f"{message}"
-            "<assistant>\n"
-        )
-
-        for tid in generate_stream(
-            model_gpu,
-            input_ids,
-            max_new_tokens=256,
-            temperature=temperature,
-            top_p=top_p if top_p > 0 else None,
-            top_k=top_k if top_k > 0 else None,
-        ):
-            if tid == EOS_ID:
-                break
-            output += tokenizer.decode([tid])
-
-        model_gpu.to("cpu")
-        torch.cuda.empty_cache()
-        return output
-
-    # ==============================
-    # Vision route
-    # ==============================
-    else:
-        model_gpu = vlm_model.to(device)
-
-        image_tensor = image_processor(
-            images=image.convert("RGB"),
-            return_tensors="pt"
-        )["pixel_values"].to(device)
-
-        prompt = (f"{message}")
-
-        def stream():
-            for chunk in vlm_infer_stream(
-                model=model_gpu,
-                image_tensor=image_tensor,
-                prompt=prompt,
-                max_new_tokens=256,
-                temperature=temperature,
-                top_p=top_p if top_p > 0 else None,
-                top_k=top_k if top_k > 0 else None,
-            ):
-                yield chunk
-
-        model_gpu.to("cpu")
-        torch.cuda.empty_cache()
+        return "Please upload an image."
+
+    device = "cuda"
+    model_gpu = model.to(device)
+
+    image_tensor = image_processor(
+        images=image.convert("RGB"),
+        return_tensors="pt"
+    )["pixel_values"].to(device)
+
+    prompt = (
+        f"{message}"
+    )
+
+    def stream():
+        for chunk in vlm_infer_stream(
+            model=model_gpu,
+            image_tensor=image_tensor,
+            prompt=prompt,
+            max_new_tokens=256,
+            temperature=temperature,
+            top_p=top_p if top_p > 0 else None,
+            top_k=top_k if top_k > 0 else None,
+        ):
+            yield chunk
+
+    model_gpu.to("cpu")
+    torch.cuda.empty_cache()
+
+    return stream()
 
 
 # =====================================================

@@ -145,14 +67,11 @@ def chat_fn(
 demo = gr.ChatInterface(
     fn=chat_fn,
     multimodal=True,
-    title="EveryonesGPT
-    description=(
-        "- Text only → fast LLM\n"
-        "- Image + Text → CLIP-VLM\n"
-    ),
+    title="EveryonesGPT Vision (CLIP)",
+    description="Vision-only VLM demo (CLIP ViT-L/14)",
     additional_inputs=[
-        gr.Image(type="pil", label="Image
-        gr.Slider(0.1, 2.0, value=0.
+        gr.Image(type="pil", label="Image"),
+        gr.Slider(0.1, 2.0, value=0.5, step=0.05, label="Temperature"),
         gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top-p"),
         gr.Slider(0, 200, value=0, step=1, label="Top-k"),
     ],
inference.py
DELETED
@@ -1,75 +0,0 @@
-# inference.py
-import torch
-import torch.nn.functional as F
-
-def generate_stream(
-    model,
-    input_ids,
-    max_new_tokens,
-    temperature,
-    top_p=None,
-    top_k=None,
-):
-    """
-    Streaming generation (batch size fixed at 1)
-    - Same logic as GPT.generate
-    - Uses the KV cache
-    - Supports top-k / top-p
-    """
-    model.eval()
-    next_token = None
-
-    with torch.no_grad():
-        for i in range(max_new_tokens):
-
-            # ===== forward =====
-            if i == 0:
-                logits, _ = model(input_ids, None, use_cache=True)
-            else:
-                logits, _ = model(next_token, None, use_cache=True)
-
-            # last token logits
-            last_logits = logits[:, -1, :] / temperature  # [1, vocab]
-
-            # ===== top-k =====
-            if top_k is not None:
-                top_k = min(top_k, last_logits.size(-1))
-                values, _ = torch.topk(last_logits, top_k)
-                min_value = values[:, -1].unsqueeze(-1)
-                last_logits = torch.where(
-                    last_logits < min_value,
-                    torch.full_like(last_logits, float("-inf")),
-                    last_logits,
-                )
-
-            # ===== top-p (nucleus) =====
-            if top_p is not None:
-                sorted_logits, sorted_indices = torch.sort(
-                    last_logits, descending=True
-                )
-                sorted_probs = F.softmax(sorted_logits, dim=-1)
-                cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
-
-                sorted_mask = cumulative_probs > top_p
-                # ★ Important: use clone() here
-                sorted_mask[..., 1:] = sorted_mask[..., :-1].clone()
-                sorted_mask[..., 0] = False
-
-                sorted_logits = torch.where(
-                    sorted_mask,
-                    torch.full_like(sorted_logits, float("-inf")),
-                    sorted_logits,
-                )
-
-                last_logits = torch.zeros_like(last_logits).scatter(
-                    -1, sorted_indices, sorted_logits
-                )
-
-            # ===== sample =====
-            probs = F.softmax(last_logits, dim=-1)
-            next_token = torch.multinomial(probs, num_samples=1)  # [1, 1]
-
-            yield int(next_token.item())
-
-            # concatenate for the next step
-            input_ids = torch.cat([input_ids, next_token], dim=1)
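The deleted generate_stream combined top-k and nucleus (top-p) filtering before sampling, with the clone() call needed when shifting the cumulative-probability mask in place. Below is a small standalone sketch of just that filtering step, run on a toy logits tensor; filter_logits is an illustrative name, not a function from the repo.

# Self-contained sketch of the top-k / top-p filtering applied before sampling.
import torch
import torch.nn.functional as F

def filter_logits(logits: torch.Tensor, top_k=None, top_p=None) -> torch.Tensor:
    if top_k is not None:
        top_k = min(top_k, logits.size(-1))
        kth = torch.topk(logits, top_k).values[..., -1, None]
        logits = logits.masked_fill(logits < kth, float("-inf"))

    if top_p is not None:
        sorted_logits, sorted_idx = torch.sort(logits, descending=True)
        cum_probs = F.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
        mask = cum_probs > top_p
        # shift right (with clone, since the slices overlap) so the first
        # token that crosses the threshold is still kept
        mask[..., 1:] = mask[..., :-1].clone()
        mask[..., 0] = False
        sorted_logits = sorted_logits.masked_fill(mask, float("-inf"))
        # scatter the filtered values back to their original vocabulary positions
        logits = torch.zeros_like(logits).scatter(-1, sorted_idx, sorted_logits)

    return logits

if __name__ == "__main__":
    torch.manual_seed(0)
    logits = torch.randn(1, 10)
    probs = F.softmax(filter_logits(logits, top_k=5, top_p=0.9), dim=-1)
    print(torch.multinomial(probs, num_samples=1))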
vlm_inference.py
CHANGED
@@ -6,13 +6,13 @@ import tiktoken
 from huggingface_hub import hf_hub_download
 from transformers import CLIPVisionModel, CLIPImageProcessor
 
-from model import GPT
+from model import GPT
 
 # =====================================================
 # Constants
 # =====================================================
-
-
+REPO_ID = "HayatoHongo/everyoneschat-checkpoints"
+FILENAME = "checkpoint_015000_vision_pretrained.pt"
 
 VISION_ENCODER = "openai/clip-vit-large-patch14"
 NUM_IMAGE_PATCHES = 256

@@ -24,6 +24,26 @@ tokenizer = tiktoken.get_encoding("gpt2")
 image_processor = CLIPImageProcessor.from_pretrained(VISION_ENCODER)
 
 
+# =====================================================
+# ModelConfig (same as Colab)
+# =====================================================
+from dataclasses import dataclass, fields
+
+@dataclass
+class ModelConfig:
+    input_sequence_length: int
+    max_sequence_length: int
+    embedding_dim: int
+    hidden_dim: int
+    num_attention_heads: int
+    layer_count: int
+    rope_theta: float
+    vocab_size: int
+    device_type: str
+    random_seed_value: int
+    autocast_dtype: torch.dtype
+
+
 # =====================================================
 # VLM wrapper
 # =====================================================

@@ -47,12 +67,13 @@ class VLM(nn.Module):
 
 
 # =====================================================
-#
+# Load model (CPU)
 # =====================================================
-def build_vlm_model():
+def load_vlm_model():
     ckpt_path = hf_hub_download(
-        repo_id=
-        filename=
+        repo_id=REPO_ID,
+        filename=FILENAME,
+        repo_type="model"
     )
 
     checkpoint = torch.load(ckpt_path, map_location="cpu")

@@ -63,10 +84,9 @@ def build_vlm_model():
         torch, config_dict["autocast_dtype"].split(".")[-1]
     )
 
-
-
-
-    })
+    model_config_fields = {f.name for f in fields(ModelConfig)}
+    filtered = {k: v for k, v in config_dict.items() if k in model_config_fields}
+    config = ModelConfig(**filtered)
 
     llm = GPT(config)
     model = VLM(llm)

@@ -77,7 +97,7 @@ def build_vlm_model():
 
 
 # =====================================================
-# Inference helpers
+# Inference helpers (same as Colab)
 # =====================================================
 @torch.no_grad()
 def vlm_prefill(model, image_tensor, input_ids):

@@ -105,12 +125,12 @@ def vlm_next_token(model, input_ids, temperature, top_k, top_p):
     logits = model.llm.vocab_projection(x)[:, -1, :] / temperature
 
     if top_k:
-        v, _ = torch.topk(logits, top_k)
+        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
         logits = torch.where(logits < v[:, -1:], -float("inf"), logits)
 
     if top_p:
         s_logits, s_idx = torch.sort(logits, descending=True)
-        probs =
+        probs = F.softmax(s_logits, dim=-1)
         cum = probs.cumsum(dim=-1)
         mask = cum > top_p
         mask[..., 1:] = mask[..., :-1].clone()

@@ -118,7 +138,7 @@ def vlm_next_token(model, input_ids, temperature, top_k, top_p):
         s_logits[mask] = -float("inf")
         logits = torch.zeros_like(logits).scatter(-1, s_idx, s_logits)
 
-    probs =
+    probs = F.softmax(logits, dim=-1)
     return torch.multinomial(probs, 1)
 
 

@@ -126,15 +146,15 @@ def vlm_infer_stream(
     model,
     image_tensor,
     prompt,
-    max_new_tokens,
-    temperature,
+    max_new_tokens=256,
+    temperature=0.7,
     top_k=None,
     top_p=None,
     stop_ids={50256},
 ):
     device = next(model.parameters()).device
-
     prompt_ids = tokenizer.encode(prompt, allowed_special="all")
+
     input_ids = (
         [PAD_TOKEN_ID] * NUM_IMAGE_PATCHES + prompt_ids
     )

@@ -145,11 +165,10 @@ def vlm_infer_stream(
 
     x = vlm_prefill(model, image_tensor, input_ids)
     logits = model.llm.vocab_projection(x)[:, -1, :] / temperature
-    probs =
+    probs = F.softmax(logits, dim=-1)
    next_token = torch.multinomial(probs, 1)
 
-    acc = []
-    last = ""
+    acc, last = [], ""
 
     for _ in range(max_new_tokens):
         tid = int(next_token.item())
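For reference, a hedged usage sketch of the refactored vlm_inference API as the new app.py exercises it: load the model on CPU, preprocess one image with the CLIP processor, and stream the generated text. The image path is a placeholder, and the sketch assumes model.py and the checkpoint download are available in the working environment.

# Usage sketch of load_vlm_model / image_processor / vlm_infer_stream.
import torch
from PIL import Image
from vlm_inference import load_vlm_model, vlm_infer_stream, image_processor

model = load_vlm_model()  # CPU load, eval mode
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

image = Image.open("example.jpg").convert("RGB")  # placeholder path
image_tensor = image_processor(
    images=image,
    return_tensors="pt",
)["pixel_values"].to(device)

for chunk in vlm_infer_stream(
    model=model,
    image_tensor=image_tensor,
    prompt="Describe the image.",
    max_new_tokens=128,
    temperature=0.7,
    top_p=0.9,
):
    print(chunk, end="", flush=True)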