Spaces:

QUT-GenAILab
/

model-gateway-api

Sleeping

App Files Files Community

autumnssuns committed 26 days ago

Commit

21bfda5

1 Parent(s): 22af552

✨ Add Gemma 4 E2B model integration and update service to support multiple models

Browse files

Files changed (6) hide show

app.py +2 -2
models/__init__.py +38 -8
models/config.py +39 -0
models/gemma4_e2b.py +72 -0
models/llama.py +57 -88
service.py +16 -5

app.py CHANGED Viewed

@@ -1,10 +1,10 @@
 from typing import Any
 import spaces
-from models.llama import LlamaModel
 import gradio
 from service import generate, list_models
 app = gradio.Server()
@@ -13,7 +13,7 @@ app = gradio.Server()
 @spaces.GPU(duration=10)
 def generate_endpoint(
     messages: list[dict[str, str]],
-    model: str = LlamaModel.MODEL_ID,
     max_tokens: int = 512,
     temperature: float = 0.7,
     top_p: float = 0.9,

 from typing import Any
 import spaces
 import gradio
 from service import generate, list_models
+from models import gemma4_e2b
 app = gradio.Server()
 @spaces.GPU(duration=10)
 def generate_endpoint(
     messages: list[dict[str, str]],
+    model: str = gemma4_e2b.MODEL_ID,
     max_tokens: int = 512,
     temperature: float = 0.7,
     top_p: float = 0.9,

models/__init__.py CHANGED Viewed

@@ -1,14 +1,44 @@
 from typing import Any
-AVAILABLE_MODELS: list[dict[str, Any]] = [
-    {
-        "id": "meta-llama/Llama-3.2-3B-Instruct",
-        "type": "text-generation",
-        "backend": "local",
-        "max_tokens": 4096,
-    },
-]
 def get_available_models() -> list[dict[str, Any]]:

 from typing import Any
+from enum import Enum
+class Model(Enum):
+    LLAMA_3_2_3B_INSTRUCT = (
+        "meta-llama/Llama-3.2-3B-Instruct",
+        "text-generation",
+        "local",
+        4096,
+    )
+    GEMMA_4_E2B = ("google/gemma-4-E2B-it", "text-generation", "local", 4096)
+    def __init__(
+        self,
+        model_id: str,
+        model_type: str,
+        backend: str,
+        max_tokens: int,
+    ):
+        self.model_id = model_id
+        self.model_type = model_type
+        self.backend = backend
+        self.max_tokens = max_tokens
+    def __str__(self):
+        return self.model_id
+    def __repr__(self):
+        return f"Model(id={self.model_id}, type={self.model_type}, backend={self.backend}, max_tokens={self.max_tokens})"
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "id": self.model_id,
+            "type": self.model_type,
+            "backend": self.backend,
+            "max_tokens": self.max_tokens,
+        }
+AVAILABLE_MODELS = [model.to_dict() for model in Model]
 def get_available_models() -> list[dict[str, Any]]:

models/config.py ADDED Viewed

	@@ -0,0 +1,39 @@

+# Common configuration for all models, including device and dtype settings.
+import os
+import torch
+TOKEN = os.getenv("HF_TOKEN")
+QUANTIZE_4_BIT = os.getenv("QUANTIZE_4_BIT", "false").lower() == "true"
+torch_device = "cuda" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.bfloat16 if torch_device in ["cuda", "mps"] else torch.float32
+print(f"Using {torch_device} with dtype {torch_dtype}...")
+model_config = {
+    "torch_dtype": torch_dtype,
+    "device_map": torch_device,
+    "token": TOKEN,
+}
+tokenizer_config = {
+    "token": TOKEN,
+}
+pipeline_config = {
+    "torch_dtype": torch_dtype,
+    "device_map": "auto",
+}
+def enable_quantization():
+    print("Enabling 4-bit quantization for compatible models...")
+    from transformers import BitsAndBytesConfig
+    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
+    model_config["quantization_config"] = quantization_config
+if QUANTIZE_4_BIT:
+    enable_quantization()

models/gemma4_e2b.py ADDED Viewed

	@@ -0,0 +1,72 @@

+from typing import Any
+import torch
+from transformers import AutoProcessor, AutoModelForCausalLM, TextStreamer
+from . import Model
+MODEL_ID = Model.GEMMA_4_E2B.model_id
+# Load model
+processor = AutoProcessor.from_pretrained(MODEL_ID)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID, torch_dtype="auto", device_map="auto"
+)
+print(f"{MODEL_ID} loaded successfully.")
+print(f"Model device: {model.device}")
+def generate(
+    messages: list[dict[str, str]],
+    max_tokens: int = 512,
+    temperature: float = 0.7,
+    top_p: float = 0.9,
+    stop: list[str] | None = None,
+) -> dict[str, Any]:
+    print(f"Generating with {MODEL_ID}...")
+    # Process input
+    text = processor.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,
+        enable_thinking=False,
+    )
+    inputs = processor(text=text, return_tensors="pt").to(model.device)
+    input_len = inputs["input_ids"].shape[-1]
+    streamer = TextStreamer(
+        processor.tokenizer, skip_prompt=True, skip_special_tokens=True
+    )
+    with torch.inference_mode():
+        outputs = model.generate(  # type: ignore
+            **inputs,
+            max_new_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            do_sample=temperature > 0,
+            streamer=streamer,
+        )
+    response = processor.decode(outputs[0][input_len:], skip_special_tokens=False)
+    content = processor.parse_response(response)
+    prompt_tokens = sum(len(msg["content"].split()) for msg in messages)
+    completion_tokens = len(content.split())
+    print(
+        f"Generation complete. Prompt tokens: {prompt_tokens}, Completion tokens: {completion_tokens}"
+    )
+    print(f"Generated content: {content}")
+    return {
+        "model": MODEL_ID,
+        "content": content,
+        "usage": {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": prompt_tokens + completion_tokens,
+        },
+    }

models/llama.py CHANGED Viewed

@@ -1,90 +1,59 @@
 from typing import Any
-import spaces
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-class LlamaModel:
-    _instance: "LlamaModel | None" = None
-    _pipe: Any = None
-    MODEL_ID: str = "meta-llama/Llama-3.2-3B-Instruct"
-    @classmethod
-    def get_instance(cls) -> "LlamaModel":
-        if cls._instance is None:
-            cls._instance = cls()
-        return cls._instance
-    def __init__(self) -> None:
-        if LlamaModel._pipe is not None:
-            return
-        torch_device = "cuda" if torch.cuda.is_available() else "cpu"
-        torch_dtype = (
-            torch.bfloat16 if torch_device in ["cuda", "mps"] else torch.float32
-        )
-        model = AutoModelForCausalLM.from_pretrained(
-            self.MODEL_ID,
-            torch_dtype=torch_dtype,
-            device_map=torch_device,
-        )
-        tokenizer = AutoTokenizer.from_pretrained(self.MODEL_ID)
-        LlamaModel._pipe = pipeline(
-            "text-generation",
-            model=model,
-            tokenizer=tokenizer,
-            torch_dtype=torch_dtype,
-            device_map="auto",
-        )
-    @staticmethod
-    def get_pipe() -> Any:
-        if LlamaModel._pipe is None:
-            LlamaModel.get_instance()
-        return LlamaModel._pipe
-    @staticmethod
-    def generate(
-        messages: list[dict[str, str]],
-        max_tokens: int = 512,
-        temperature: float = 0.7,
-        top_p: float = 0.9,
-        stop: list[str] | None = None,
-    ) -> dict[str, Any]:
-        print(f"Generating with {LlamaModel.MODEL_ID}...")
-        pipe = LlamaModel.get_pipe()
-        outputs = pipe(
-            messages,
-            max_new_tokens=max_tokens,
-            temperature=temperature,
-            top_p=top_p,
-            do_sample=temperature > 0,
-        )
-        content = outputs[0]["generated_text"][-1]["content"]
-        prompt_tokens = sum(len(msg["content"].split()) for msg in messages)
-        completion_tokens = len(content.split())
-        print(
-            f"Generation complete. Prompt tokens: {prompt_tokens}, Completion tokens: {completion_tokens}"
-        )
-        print(f"Generated content: {content}")
-        return {
-            "model": LlamaModel.MODEL_ID,
-            "content": content,
-            "usage": {
-                "prompt_tokens": prompt_tokens,
-                "completion_tokens": completion_tokens,
-                "total_tokens": prompt_tokens + completion_tokens,
-            },
-        }
-# Load the model immediately
-LlamaModel.get_instance()
-print(f"{LlamaModel.MODEL_ID} loaded and ready to generate.")

 from typing import Any
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextStreamer
+from . import config, Model
+MODEL_ID = Model.LLAMA_3_2_3B_INSTRUCT.model_id
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **config.model_config)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, **config.tokenizer_config)
+pipe = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    **config.pipeline_config,
+)
+print(f"{MODEL_ID} loaded successfully.")
+print(f"Model device: {pipe.model.device}")
+def generate(
+    messages: list[dict[str, str]],
+    max_tokens: int = 512,
+    temperature: float = 0.7,
+    top_p: float = 0.9,
+    stop: list[str] | None = None,
+) -> dict[str, Any]:
+    assert pipe.tokenizer is not None, "Tokenizer is not loaded."
+    print(f"Generating with {MODEL_ID}...")
+    streamer = TextStreamer(pipe.tokenizer, skip_prompt=True, skip_special_tokens=True)
+    outputs = pipe(
+        messages,
+        max_new_tokens=max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        do_sample=temperature > 0,
+        # Enable streaming output to console
+        streamer=streamer,
+    )
+    content = outputs[0]["generated_text"][-1]["content"]
+    prompt_tokens = sum(len(msg["content"].split()) for msg in messages)
+    completion_tokens = len(content.split())
+    print(
+        f"Generation complete. Prompt tokens: {prompt_tokens}, Completion tokens: {completion_tokens}"
+    )
+    print(f"Generated content: {content}")
+    return {
+        "model": MODEL_ID,
+        "content": content,
+        "usage": {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": prompt_tokens + completion_tokens,
+        },
+    }

service.py CHANGED Viewed

@@ -1,19 +1,30 @@
 from typing import Any
-from models import get_available_models
-from models.llama import LlamaModel
 def generate(
     messages: list[dict[str, str]],
-    model: str = LlamaModel.MODEL_ID,
     max_tokens: int = 512,
     temperature: float = 0.7,
     top_p: float = 0.9,
     stop: list[str] | None = None,
 ) -> dict[str, Any]:
-    if model == LlamaModel.MODEL_ID:
-        return LlamaModel.generate(
             messages=messages,
             max_tokens=max_tokens,
             temperature=temperature,

 from typing import Any
+from models import get_available_models, Model
 def generate(
     messages: list[dict[str, str]],
+    model: str,
     max_tokens: int = 512,
     temperature: float = 0.7,
     top_p: float = 0.9,
     stop: list[str] | None = None,
 ) -> dict[str, Any]:
+    if model == Model.LLAMA_3_2_3B_INSTRUCT.model_id:
+        from models import llama
+        return llama.generate(
+            messages=messages,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            stop=stop,
+        )
+    if model == Model.GEMMA_4_E2B.model_id:
+        from models import gemma4_e2b
+        return gemma4_e2b.generate(
             messages=messages,
             max_tokens=max_tokens,
             temperature=temperature,