HayatoHongoEveryonesAI committed on
Commit 81845c2
1 Parent(s): 7657160

initial commit

Files changed (4)
  1. app.py +113 -4
  2. inference.py +75 -0
  3. model.py +413 -0
  4. requirements.txt +4 -0
app.py CHANGED
@@ -1,7 +1,116 @@
 
- def greet(name):
-     return "Hello " + name + "!!"
-
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ # app.py
  import gradio as gr
+ import spaces
+ import torch
+ import tiktoken
+ from huggingface_hub import hf_hub_download
+ from collections import OrderedDict
+
+ from model import GPT, ModelConfig
+ from inference import generate_stream
+
+ # -------------------------
+ # Load the model on CPU (important for ZeroGPU)
+ # -------------------------
+ # Download from Hugging Face
+ model_path = hf_hub_download(
+     repo_id="HayatoHongo/everyoneschat-checkpoints",
+     filename="model.pt"
+ )
+
+ # Load the checkpoint (config + weights)
+ checkpoint = torch.load(model_path, map_location="cpu")
+ state_dict = checkpoint["state_dict"]  # assumed key name; adjust to the actual model.pt layout
+
+ cfg = checkpoint["config"]
+ config = ModelConfig(
+     embedding_dim=cfg["embedding_dim"],
+     hidden_dim=cfg["hidden_dim"],
+     num_attention_heads=cfg["num_attention_heads"],
+     layer_count=cfg["layer_count"],
+     max_sequence_length=cfg["max_sequence_length"],
+     rope_theta=cfg["rope_theta"],
+     vocab_size=cfg["vocab_size"],
+ )
+
+ # Build the model & load the weights
+ model = GPT(config)
+ model.load_state_dict(state_dict)
+ model.eval()
+
+ tokenizer = tiktoken.get_encoding("gpt2")
+ EOS_ID = 50256  # GPT-2 EOS
+
+ # -------------------------
+ # Only the GPU-using function is wrapped for ZeroGPU
+ # -------------------------
+ @spaces.GPU
+ def chat_fn(
+     message,
+     history,
+     temperature,
+     top_p,
+     top_k,
+ ):
+     device = "cuda"
+     model_gpu = model.to(device)
+
+     # Single-turn, so fully reset the KV cache on every call
+     for block in model_gpu.blocks:
+         block.multihead_attention.reset_cache()
+
+     # ---- Very simple prompt formatting ----
+     prompt = (
+         "<user>\n"
+         f"{message}\n"
+         "<assistant>\n"
+     )
+
+     input_ids = torch.tensor(
+         [tokenizer.encode(prompt, allowed_special="all")],
+         device=device
+     )
+
+     output = ""
+
+     with torch.no_grad(), torch.autocast(
+         device_type="cuda",
+         dtype=torch.bfloat16,
+     ):
+         for tid in generate_stream(
+             model_gpu,
+             input_ids,
+             max_new_tokens=256,
+             temperature=temperature,
+             top_p=top_p if top_p > 0 else None,
+             top_k=top_k if top_k > 0 else None,
+         ):
+             if tid == EOS_ID:
+                 break
+             output += tokenizer.decode([tid])
+
+     model_gpu.to("cpu")
+     torch.cuda.empty_cache()
+
+     return output
+
+
+ # -------------------------
+ # UI definition
+ # -------------------------
+ demo = gr.ChatInterface(
+     chat_fn,
+     title="EveryonesGPT Pretrained (No Instruction-tuning). Single-turn English-only demo.",
+     description=(
+         "**Try prompts like:**\n"
+         "- What is the capital city of Japan?\n"
+         "- What is the element symbol of silver?\n"
+         "- Explain AI in simple terms"
+     ),
+     additional_inputs=[
+         gr.Slider(0.1, 2.0, value=0.7, step=0.05, label="Temperature"),
+         gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top-p"),
+         gr.Slider(0, 200, value=0, step=1, label="Top-k"),
+     ],
+ )
+
+ demo.launch()
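
Note: the loader above assumes model.pt is a single torch.save() dict that bundles the config alongside the weights. Below is a minimal sketch of writing a checkpoint in that assumed layout; the key names "config" and "state_dict" are illustrative assumptions and may not match the actual file in HayatoHongo/everyoneschat-checkpoints.

# Hypothetical sketch: save a checkpoint in the layout app.py expects.
# The "config" / "state_dict" keys are assumptions, not confirmed by this commit.
import torch
from model import GPT, ModelConfig

config = ModelConfig(
    embedding_dim=512, hidden_dim=2048, num_attention_heads=8,
    layer_count=8, max_sequence_length=1024, rope_theta=1e6, vocab_size=50257,
)  # illustrative sizes only
model = GPT(config)

torch.save(
    {
        "config": vars(config),          # plain dict, matching the cfg["..."] lookups in app.py
        "state_dict": model.state_dict(),
    },
    "model.pt",
)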
inference.py ADDED
@@ -0,0 +1,75 @@
+ # inference.py
+ import torch
+ import torch.nn.functional as F
+
+ def generate_stream(
+     model,
+     input_ids,
+     max_new_tokens,
+     temperature,
+     top_p=None,
+     top_k=None,
+ ):
+     """
+     Streaming generation (batch size fixed at 1).
+     - Same logic as GPT.generate
+     - Uses the KV cache
+     - Supports top-k / top-p
+     """
+     model.eval()
+     next_token = None
+
+     with torch.no_grad():
+         for i in range(max_new_tokens):
+
+             # ===== forward =====
+             if i == 0:
+                 logits, _ = model(input_ids, None, use_cache=True)
+             else:
+                 logits, _ = model(next_token, None, use_cache=True)
+
+             # last token logits
+             last_logits = logits[:, -1, :] / temperature  # [1, vocab]
+
+             # ===== top-k =====
+             if top_k is not None:
+                 top_k = min(top_k, last_logits.size(-1))
+                 values, _ = torch.topk(last_logits, top_k)
+                 min_value = values[:, -1].unsqueeze(-1)
+                 last_logits = torch.where(
+                     last_logits < min_value,
+                     torch.full_like(last_logits, float("-inf")),
+                     last_logits,
+                 )
+
+             # ===== top-p (nucleus) =====
+             if top_p is not None:
+                 sorted_logits, sorted_indices = torch.sort(
+                     last_logits, descending=True
+                 )
+                 sorted_probs = F.softmax(sorted_logits, dim=-1)
+                 cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
+
+                 sorted_mask = cumulative_probs > top_p
+                 # Important: shift the mask with clone() so the first token
+                 # over the threshold is still kept
+                 sorted_mask[..., 1:] = sorted_mask[..., :-1].clone()
+                 sorted_mask[..., 0] = False
+
+                 sorted_logits = torch.where(
+                     sorted_mask,
+                     torch.full_like(sorted_logits, float("-inf")),
+                     sorted_logits,
+                 )
+
+                 last_logits = torch.zeros_like(last_logits).scatter(
+                     -1, sorted_indices, sorted_logits
+                 )
+
+             # ===== sample =====
+             probs = F.softmax(last_logits, dim=-1)
+             next_token = torch.multinomial(probs, num_samples=1)  # [1, 1]
+
+             yield int(next_token.item())
+
+             # append for the next step
+             input_ids = torch.cat([input_ids, next_token], dim=1)
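
As a quick way to exercise generate_stream outside the Space, here is a minimal sketch that drives it on CPU with a tiny, randomly initialized model; the config values and prompt are illustrative assumptions, not the released checkpoint's.

# Hypothetical smoke test for generate_stream (untrained model, CPU).
import torch
import tiktoken
from model import GPT, ModelConfig
from inference import generate_stream

config = ModelConfig(
    embedding_dim=64, hidden_dim=256, num_attention_heads=4,
    layer_count=2, max_sequence_length=128, rope_theta=1e6, vocab_size=50257,
)  # illustrative sizes only
model = GPT(config).eval()

tokenizer = tiktoken.get_encoding("gpt2")
prompt = "<user>\nHello\n<assistant>\n"
input_ids = torch.tensor([tokenizer.encode(prompt, allowed_special="all")])

# single-turn: start from an empty KV cache
for block in model.blocks:
    block.multihead_attention.reset_cache()

text = ""
for tid in generate_stream(model, input_ids, max_new_tokens=20,
                           temperature=0.7, top_p=0.9, top_k=None):
    if tid == 50256:  # GPT-2 EOS
        break
    text += tokenizer.decode([tid])
print(text)  # gibberish from random weights, but the streaming path runs end-to-end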
model.py ADDED
@@ -0,0 +1,413 @@
+ # model.py
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from dataclasses import dataclass
+
+ @dataclass
+ class ModelConfig:
+     embedding_dim: int
+     hidden_dim: int
+     num_attention_heads: int
+     layer_count: int
+     max_sequence_length: int
+     rope_theta: float
+     vocab_size: int
+
+ # ---- Below: TokenEmbedding / RotaryEmbedding / MHA / FFN / Block / GPT ----
+ # (the code provided earlier, pasted unchanged)
+ # added top-p and top-k filtering in generate function
+ # set vocab_size in config.py
+ # MHA with KV cache + RoPE + PyTorch SDPA.
+ # This traditional implementation is easier to understand, and still efficient in practice.
+ # GQA and MLA are great for long-text inference because they reduce the KV cache size,
+ # but both come with a slight loss increase and no efficiency benefit during the training phase.
+ # The KV cache does not help training speed; the codebase would be simpler without it.
+ # The KV cache supports multi-turn continuation via RoPE with a position offset.
+ # No Dropout. The dataset is large enough, so regularization is not necessary.
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class TokenEmbedding(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.token_embedding_table = nn.Embedding(config.vocab_size, config.embedding_dim)
+         # keep embedding in default dtype (autocast will handle bf16 when enabled)
+
+     def forward(self, input_indices):
+         return self.token_embedding_table(input_indices)
+
+
+ class RotaryEmbedding(nn.Module):
+     def __init__(self, dim, max_seq_len=2048, rope_theta=1e6):
+         super().__init__()
+
+         inv_freq = 1.0 / (rope_theta ** (torch.arange(0, dim, 2) / dim))
+         position_index = torch.arange(max_seq_len)
+         frequency_matrix = torch.einsum('i,j->ij', position_index, inv_freq)
+
+         cosine = torch.cos(frequency_matrix)[None, None, :, :]
+         sine = torch.sin(frequency_matrix)[None, None, :, :]
+
+         self.register_buffer("cos_cached", cosine, persistent=False)
+         self.register_buffer("sin_cached", sine, persistent=False)
+
+     def apply_rotary_emb(self, x, position_offset=0):
+         sequence_length = x.size(2)
+
+         cosine = self.cos_cached[:, :, position_offset:position_offset + sequence_length, :]
+         sine = self.sin_cached[:, :, position_offset:position_offset + sequence_length, :]
+
+         x_even = x[..., 0::2]
+         x_odd = x[..., 1::2]
+
+         rotated_even = x_even * cosine - x_odd * sine
+         rotated_odd = x_odd * cosine + x_even * sine
+
+         rotated = torch.empty_like(x)
+         rotated[..., 0::2] = rotated_even
+         rotated[..., 1::2] = rotated_odd
+
+         return rotated
+
+ class MultiHeadAttention(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+         self.num_heads = config.num_attention_heads
+         self.embed_dim = config.embedding_dim
+         self.head_dim = self.embed_dim // self.num_heads
+
+         # QKV projection
+         self.query_fc = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
+         self.key_fc = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
+         self.value_fc = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
+
+         # Rotary Positional Embedding (RoPE)
+         self.rotary_emb = RotaryEmbedding(
+             dim=self.head_dim,
+             max_seq_len=config.max_sequence_length,
+             rope_theta=config.rope_theta
+         )
+
+         self.output_projection = nn.Linear(self.embed_dim, self.embed_dim)
+
+         self.register_buffer(
+             "causal_mask",
+             torch.tril(torch.ones(
+                 config.max_sequence_length,
+                 config.max_sequence_length,
+                 dtype=torch.bool
+             )),
+             persistent=False
+         )
+
+         # KV cache
+         self.register_buffer("cache_k", None, persistent=False)
+         self.register_buffer("cache_v", None, persistent=False)
+         self.current_pos = 0
+
+     # --------------------------------------------------
+     # router
+     # --------------------------------------------------
+     def forward(self, x, use_cache=False):
+         input_len = x.size(1)
+         if use_cache is False:
+             return self.forward_no_cache(x)
+         elif use_cache is True and input_len > 1:
+             return self.forward_prefill(x)
+         elif use_cache is True and input_len == 1:  # a single-token "Hi" prompt also starts with T == 1
+             return self.forward_cached_decoding(x)
+         else:
+             raise RuntimeError("Unexpected condition in MultiHeadAttention forward")
+
+     # --------------------------------------------------
+     # (1) no cache : training
+     # --------------------------------------------------
+     def forward_no_cache(self, x):
+         B, T, C = x.shape
+
+         Q = self.query_fc(x)
+         K = self.key_fc(x)
+         V = self.value_fc(x)
+
+         Q = Q.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
+         K = K.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
+         V = V.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
+
+         # RoPE : offset = 0
+         Q = self.rotary_emb.apply_rotary_emb(Q, position_offset=0)
+         K = self.rotary_emb.apply_rotary_emb(K, position_offset=0)
+
+         out = F.scaled_dot_product_attention(
+             Q, K, V,
+             attn_mask=None,
+             is_causal=True
+         )
+
+         out = out.transpose(1, 2).contiguous().view(B, T, C)
+         out = self.output_projection(out)
+         return out
+
+     # --------------------------------------------------
+     # (2) prefill : initialize KV cache
+     # --------------------------------------------------
+     def forward_prefill(self, x):
+         B, T, C = x.shape
+
+         Q = self.query_fc(x)
+         K = self.key_fc(x)
+         V = self.value_fc(x)
+
+         Q = Q.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
+         K = K.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
+         V = V.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
+
+         # init cache
+         if self.cache_k is None:
+             self.cache_k = torch.zeros(
+                 B, self.num_heads, self.config.max_sequence_length, self.head_dim,
+                 device=x.device, dtype=K.dtype
+             )
+             self.cache_v = torch.zeros(
+                 B, self.num_heads, self.config.max_sequence_length, self.head_dim,
+                 device=x.device, dtype=V.dtype
+             )
+             self.current_pos = 0
+
+         # RoPE : offset = current_pos (supports multi-turn continuation)
+         Q = self.rotary_emb.apply_rotary_emb(Q, position_offset=self.current_pos)
+         K = self.rotary_emb.apply_rotary_emb(K, position_offset=self.current_pos)
+
+         # prevent overflow
+         if self.current_pos + T > self.config.max_sequence_length:
+             raise RuntimeError("KV cache exceeded max_sequence_length")
+
+         self.cache_k[:, :, self.current_pos:self.current_pos + T, :] = K
+         self.cache_v[:, :, self.current_pos:self.current_pos + T, :] = V
+
+         K = self.cache_k[:, :, :self.current_pos + T, :]
+         V = self.cache_v[:, :, :self.current_pos + T, :]
+
+         attn_mask = self.causal_mask[
+             self.current_pos : self.current_pos + T,
+             : self.current_pos + T
+         ]
+
+         out = F.scaled_dot_product_attention(
+             Q, K, V,
+             attn_mask=attn_mask,
+             is_causal=False
+         )
+
+         self.current_pos += T
+
+         out = out.transpose(1, 2).contiguous().view(B, T, C)
+         out = self.output_projection(out)
+         return out
+
+     # --------------------------------------------------
+     # (3) decode : cached decoding (1 token)
+     # --------------------------------------------------
+     def forward_cached_decoding(self, x):
+         B, T, C = x.shape
+         assert T == 1, "cached decoding expects T==1"
+
+         Q = self.query_fc(x)
+         K = self.key_fc(x)
+         V = self.value_fc(x)
+
+         Q = Q.view(B, 1, self.num_heads, self.head_dim).transpose(1, 2)
+         K = K.view(B, 1, self.num_heads, self.head_dim).transpose(1, 2)
+         V = V.view(B, 1, self.num_heads, self.head_dim).transpose(1, 2)
+
+         # This is not usually needed since prefill should have initialized the cache.
+         # Just in case for the "Hi" scenario, which starts with a single-token input.
+         if self.cache_k is None:
+             self.cache_k = torch.zeros(
+                 B, self.num_heads, self.config.max_sequence_length, self.head_dim,
+                 device=x.device, dtype=K.dtype
+             )
+             self.cache_v = torch.zeros(
+                 B, self.num_heads, self.config.max_sequence_length, self.head_dim,
+                 device=x.device, dtype=V.dtype
+             )
+             self.current_pos = 0
+
+         if self.current_pos + 1 >= self.config.max_sequence_length:
+             raise RuntimeError("KV cache exceeded max_sequence_length")
+
+         # RoPE : offset = current_pos
+         Q = self.rotary_emb.apply_rotary_emb(Q, position_offset=self.current_pos)
+         K = self.rotary_emb.apply_rotary_emb(K, position_offset=self.current_pos)
+
+         self.cache_k[:, :, self.current_pos:self.current_pos + 1, :] = K
+         self.cache_v[:, :, self.current_pos:self.current_pos + 1, :] = V
+
+         K = self.cache_k[:, :, :self.current_pos + 1, :]
+         V = self.cache_v[:, :, :self.current_pos + 1, :]
+
+         out = F.scaled_dot_product_attention(
+             Q, K, V,
+             attn_mask=None,
+             is_causal=False
+         )
+
+         self.current_pos += 1
+
+         out = out.transpose(1, 2).contiguous().view(B, T, C)
+         out = self.output_projection(out)
+         return out
+
+     def reset_cache(self):
+         self.cache_k = None
+         self.cache_v = None
+         self.current_pos = 0
+
+
+
+ class FeedForward(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.net = nn.Sequential(
+             nn.Linear(config.embedding_dim, config.hidden_dim, bias=False),
+             nn.ReLU(),
+             nn.Linear(config.hidden_dim, config.embedding_dim, bias=False),
+         )
+
+     def forward(self, input_tensor):
+         return self.net(input_tensor)
+
+
+ class TransformerBlock(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.layer_norm1 = nn.LayerNorm(config.embedding_dim)
+         self.layer_norm2 = nn.LayerNorm(config.embedding_dim)
+         self.multihead_attention = MultiHeadAttention(config=config)
+         self.feed_forward = FeedForward(config=config)
+
+
+     def forward(self, input_tensor, use_cache=False):
+         normed_input = self.layer_norm1(input_tensor)
+         attention_output = self.multihead_attention(normed_input, use_cache=use_cache)
+         residual_attention = attention_output + input_tensor
+         normed_attention = self.layer_norm2(residual_attention)
+         feedforward_output = self.feed_forward(normed_attention)
+         final_output = feedforward_output + residual_attention
+         return final_output
+
+
+ class VocabularyLogits(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.output_norm = nn.LayerNorm(config.embedding_dim)
+         self.vocab_projection = nn.Linear(config.embedding_dim, config.vocab_size, bias=False)
+
+     def forward(self, transformer_block_output):
+         x = transformer_block_output
+         normalized_output = self.output_norm(x)
+         vocab_logits = self.vocab_projection(normalized_output)
+         return vocab_logits
+
+
+ class GPT(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+         self.token_embedding_layer = TokenEmbedding(config=config)
+         self.blocks = nn.ModuleList([TransformerBlock(config=config) for _ in range(config.layer_count)])
+         self.vocab_projection = VocabularyLogits(config=config)
+         self.criterion = nn.CrossEntropyLoss()
+
+
+     def forward(self, input_indices, target_indices, use_cache=False):
+         token_embeddings = self.token_embedding_layer.forward(input_indices)
+
+         x = token_embeddings
+         for block in self.blocks:
+             x = block(x, use_cache=use_cache)
+         logits = self.vocab_projection(x)
+
+         if target_indices is None:
+             return logits, None
+
+         batch_size, token_len, vocab_size = logits.shape
+         logits_flat = logits.view(batch_size * token_len, vocab_size)
+         targets_flat = target_indices.view(batch_size * token_len)
+         loss = self.criterion(logits_flat, targets_flat)
+         return logits, loss
+
+
+     def generate(self,
+                  input_indices,
+                  max_new_tokens,
+                  temperature=1.0,
+                  use_cache=True,
+                  reset_cache=True,
+                  top_k=None,  # ### NEW ###
+                  top_p=None,  # ### NEW ###
+                  ):
+         self.eval()
+
+         if reset_cache:
+             for block in self.blocks:
+                 block.multihead_attention.reset_cache()
+
+         next_token = None
+
+         for i in range(max_new_tokens):
+             if use_cache:
+                 if i == 0:
+                     logits, _ = self.forward(input_indices, None, use_cache=True)
+                 else:
+                     logits, _ = self.forward(next_token, None, use_cache=True)
+             else:
+                 logits, _ = self.forward(input_indices, None, use_cache=False)
+
+             """ DELETE
+             last_logits = logits[:, -1, :] / temperature
+             probs = F.softmax(last_logits, dim=-1)
+             next_token = torch.multinomial(probs, num_samples=1)
+             """
+
+             ### NEW ###
+             last_logits = logits[:, -1, :] / temperature
+
+             if top_k is not None:
+                 top_k = min(top_k, last_logits.size(-1))
+                 values, _ = torch.topk(last_logits, top_k)
+                 min_value = values[:, -1].unsqueeze(-1)
+                 last_logits = torch.where(
+                     last_logits < min_value,
+                     torch.full_like(last_logits, float("-inf")),
+                     last_logits,
+                 )
+
+             if top_p is not None:
+                 sorted_logits, sorted_indices = torch.sort(last_logits, descending=True)
+                 sorted_probs = F.softmax(sorted_logits, dim=-1)
+                 cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
+
+                 sorted_mask = cumulative_probs > top_p
+                 sorted_mask[..., 1:] = sorted_mask[..., :-1].clone()
+                 sorted_mask[..., 0] = False
+
+                 sorted_logits = torch.where(
+                     sorted_mask,
+                     torch.full_like(sorted_logits, float("-inf")),
+                     sorted_logits,
+                 )
+
+                 last_logits = torch.zeros_like(last_logits).scatter(
+                     -1, sorted_indices, sorted_logits
+                 )
+
+             probs = F.softmax(last_logits, dim=-1)
+             next_token = torch.multinomial(probs, num_samples=1)
+             ### NEW ###
+
+             yield int(next_token.item())
+             input_indices = torch.cat((input_indices, next_token), dim=1)
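
The comments above say the KV cache with a RoPE position offset is meant to reproduce the no-cache forward pass exactly. A small sketch of that consistency check on a tiny random model (the config sizes are illustrative assumptions):

# Hypothetical check: prefill + one-step cached decoding should match the
# no-cache forward pass at the last position.
import torch
from model import GPT, ModelConfig

config = ModelConfig(
    embedding_dim=32, hidden_dim=128, num_attention_heads=4,
    layer_count=2, max_sequence_length=64, rope_theta=1e6, vocab_size=100,
)  # illustrative sizes only
model = GPT(config).eval()
tokens = torch.randint(0, config.vocab_size, (1, 10))

with torch.no_grad():
    # full sequence, no cache
    logits_full, _ = model(tokens, None, use_cache=False)

    # prefill on the first 9 tokens, then decode token 10 through the cache
    for block in model.blocks:
        block.multihead_attention.reset_cache()
    _ = model(tokens[:, :9], None, use_cache=True)
    logits_step, _ = model(tokens[:, 9:], None, use_cache=True)

# expect True up to numerical tolerance
print(torch.allclose(logits_full[:, -1], logits_step[:, -1], atol=1e-4))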
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ torch
+ huggingface_hub
+ tiktoken
+ gradio