Spaces:

QUT-GenAILab
/

model-gateway-api

Sleeping

App Files Files Community

autumnssuns committed 25 days ago

Commit

e4b3020

1 Parent(s): 21bfda5

✨ Implement lazy loading for models and correct tokens counting

Browse files

Files changed (4) hide show

models/gemma4_e2b.py +27 -11
models/lazy_model.py +94 -0
models/{llama.py → llama3_2_3b_instruct.py} +31 -12
service.py +13 -20

models/gemma4_e2b.py CHANGED Viewed

@@ -2,21 +2,31 @@ from typing import Any
 import torch
 from transformers import AutoProcessor, AutoModelForCausalLM, TextStreamer
-from . import Model
 MODEL_ID = Model.GEMMA_4_E2B.model_id
-# Load model
-processor = AutoProcessor.from_pretrained(MODEL_ID)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, torch_dtype="auto", device_map="auto"
-)
-print(f"{MODEL_ID} loaded successfully.")
-print(f"Model device: {model.device}")
 def generate(
     messages: list[dict[str, str]],
     max_tokens: int = 512,
@@ -24,7 +34,9 @@ def generate(
     top_p: float = 0.9,
     stop: list[str] | None = None,
 ) -> dict[str, Any]:
-    print(f"Generating with {MODEL_ID}...")
     # Process input
     text = processor.apply_chat_template(
@@ -52,9 +64,13 @@ def generate(
     response = processor.decode(outputs[0][input_len:], skip_special_tokens=False)
     content = processor.parse_response(response)
-    prompt_tokens = sum(len(msg["content"].split()) for msg in messages)
-    completion_tokens = len(content.split())
     print(
         f"Generation complete. Prompt tokens: {prompt_tokens}, Completion tokens: {completion_tokens}"

 import torch
 from transformers import AutoProcessor, AutoModelForCausalLM, TextStreamer
+from . import config, Model
+from .lazy_model import LazyModel
 MODEL_ID = Model.GEMMA_4_E2B.model_id
+lazy = LazyModel(MODEL_ID)
+processor = None
+model = None
+@lazy.unload()
+def clean_up():
+    global processor, model
+    del processor
+    del model
+@lazy.load()
+def load():
+    # Populate the module-level `processor` and `model` from MODEL_ID, using
+    # the shared tokenizer/model keyword arguments from `config`.
+    global processor, model
+    processor = AutoProcessor.from_pretrained(MODEL_ID, **config.tokenizer_config)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **config.model_config)
+@lazy.entry()
 def generate(
     messages: list[dict[str, str]],
     max_tokens: int = 512,
     top_p: float = 0.9,
     stop: list[str] | None = None,
 ) -> dict[str, Any]:
+    global processor, model
+    assert processor is not None, "Processor is not initialized."
+    assert model is not None, "Model is not loaded."
     # Process input
     text = processor.apply_chat_template(
     response = processor.decode(outputs[0][input_len:], skip_special_tokens=False)
     content = processor.parse_response(response)
+    if isinstance(content, dict) and "content" in content:
+        content = content["content"]
+    prompt_tokens = len(processor.tokenizer.apply_chat_template(messages))
+    completion_tokens = len(
+        processor.tokenizer.encode(content, add_special_tokens=False)
+    )
     print(
         f"Generation complete. Prompt tokens: {prompt_tokens}, Completion tokens: {completion_tokens}"

models/lazy_model.py ADDED Viewed

	@@ -0,0 +1,94 @@

+from typing import Callable
+import gc
+import torch
+import os
+LAZY_LOAD_ENABLED = os.getenv("LAZY_LOAD", "false").lower() == "true"
+class LazyModel:
+    unload_func = None
+    init_func: Callable | None = None
+    is_loaded = False
+    def __init__(self, model_id: str):
+        self.model_id = model_id
+    def load(self):
+        def decorator(init_func):
+            if not LAZY_LOAD_ENABLED:
+                # Even if eager loading, the model should only be initialized once.
+                if not self.is_loaded:
+                    init_func()
+                    self.is_loaded = True
+                self.init_func = init_func
+                return init_func
+            def wrapper():
+                global current_model
+                if current_model is not None and current_model != self.model_id:
+                    print(
+                        f"Unloading currently loaded model '{current_model}' before loading '{self.model_id}'..."
+                    )
+                    _unload()
+                if current_model == self.model_id and self.is_loaded:
+                    print(
+                        f"Model '{self.model_id}' is already loaded. Skipping initialization."
+                    )
+                    return
+                print(f"Loading model '{self.model_id}'...")
+                init_func()
+                self.is_loaded = True
+                current_model = self
+                print(f"Model '{self.model_id}' loaded successfully.")
+            # Ensure the init_func also loads lazily
+            self.init_func = wrapper
+            return wrapper
+        return decorator
+    def unload(self):
+        # Create a decorator to set the unload callback function for this model. This allows the lazy loading mechanism to call the specified function when unloading the model, ensuring proper cleanup of resources.
+        def decorator(func):
+            self.unload_func = func
+            return func
+        return decorator
+    def entry(self):
+        def decorator(func):
+            def wrapper(*args, **kwargs):
+                if not self.init_func:
+                    raise RuntimeError(
+                        f"Model '{self.model_id}' does not have an initialization function defined."
+                    )
+                # Ensure the model is loaded before executing the main function
+                if self.init_func and not self.is_loaded:
+                    print(f"Model '{self.model_id}' is not loaded. Loading now...")
+                    self.init_func()
+                print(f"Executing main function for model '{self.model_id}'...")
+                return func(*args, **kwargs)
+            return wrapper
+        return decorator
+def _unload():
+    global current_model
+    if current_model and current_model.unload_func:
+        current_model.unload_func()
+    current_model = None
+    # Ensure garbage collection and CUDA cache clearing
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+# Global variable that keeps track of the currently loaded LazyModel instance. This allows the lazy loading mechanism to determine whether a model is already loaded and to unload other models when necessary.
+current_model: LazyModel | None = None

models/{llama.py → llama3_2_3b_instruct.py} RENAMED Viewed

@@ -2,22 +2,39 @@ from typing import Any
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextStreamer
 from . import config, Model
 MODEL_ID = Model.LLAMA_3_2_3B_INSTRUCT.model_id
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **config.model_config)
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, **config.tokenizer_config)
-pipe = pipeline(
-    "text-generation",
-    model=model,
-    tokenizer=tokenizer,
-    **config.pipeline_config,
-)
-print(f"{MODEL_ID} loaded successfully.")
-print(f"Model device: {pipe.model.device}")
 def generate(
     messages: list[dict[str, str]],
     max_tokens: int = 512,
@@ -25,6 +42,8 @@ def generate(
     top_p: float = 0.9,
     stop: list[str] | None = None,
 ) -> dict[str, Any]:
     assert pipe.tokenizer is not None, "Tokenizer is not loaded."
     print(f"Generating with {MODEL_ID}...")
@@ -40,8 +59,8 @@ def generate(
     )
     content = outputs[0]["generated_text"][-1]["content"]
-    prompt_tokens = sum(len(msg["content"].split()) for msg in messages)
-    completion_tokens = len(content.split())
     print(
         f"Generation complete. Prompt tokens: {prompt_tokens}, Completion tokens: {completion_tokens}"

 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextStreamer
 from . import config, Model
+from .lazy_model import LazyModel
 MODEL_ID = Model.LLAMA_3_2_3B_INSTRUCT.model_id
+lazy = LazyModel(MODEL_ID)
+model = None
+tokenizer = None
+pipe = None
+@lazy.unload()
+def clean_up():
+    global model, tokenizer, pipe
+    del model
+    del tokenizer
+    del pipe
+@lazy.load()
+def init():
+    # Populate the module-level `model`, `tokenizer` and `pipe` from MODEL_ID.
+    # The pipeline is built last because it wraps the two objects created
+    # just before it.
+    global model, tokenizer, pipe
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **config.model_config)
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, **config.tokenizer_config)
+    pipe = pipeline(
+        "text-generation",
+        model=model,
+        tokenizer=tokenizer,
+        **config.pipeline_config,
+    )
+@lazy.entry()
 def generate(
     messages: list[dict[str, str]],
     max_tokens: int = 512,
     top_p: float = 0.9,
     stop: list[str] | None = None,
 ) -> dict[str, Any]:
+    global model, tokenizer, pipe
+    assert pipe is not None, "Pipeline is not initialized."
     assert pipe.tokenizer is not None, "Tokenizer is not loaded."
     print(f"Generating with {MODEL_ID}...")
     )
     content = outputs[0]["generated_text"][-1]["content"]
+    prompt_tokens = len(pipe.tokenizer.apply_chat_template(messages))
+    completion_tokens = len(pipe.tokenizer.encode(content, add_special_tokens=False))
     print(
         f"Generation complete. Prompt tokens: {prompt_tokens}, Completion tokens: {completion_tokens}"

service.py CHANGED Viewed

@@ -11,28 +11,21 @@ def generate(
     top_p: float = 0.9,
     stop: list[str] | None = None,
 ) -> dict[str, Any]:
     if model == Model.LLAMA_3_2_3B_INSTRUCT.model_id:
-        from models import llama
-        return llama.generate(
-            messages=messages,
-            max_tokens=max_tokens,
-            temperature=temperature,
-            top_p=top_p,
-            stop=stop,
-        )
     if model == Model.GEMMA_4_E2B.model_id:
-        from models import gemma4_e2b
-        return gemma4_e2b.generate(
-            messages=messages,
-            max_tokens=max_tokens,
-            temperature=temperature,
-            top_p=top_p,
-            stop=stop,
-        )
-    msg = f"Unsupported model: {model}"
-    raise ValueError(msg)
 def list_models() -> dict[str, list[dict[str, Any]]]:

     top_p: float = 0.9,
     stop: list[str] | None = None,
 ) -> dict[str, Any]:
+    # Ensure model exists
+    if model not in [m["id"] for m in get_available_models()]:
+        msg = f"Model '{model}' is not available. Supported models: {[m['id'] for m in get_available_models()]}"
+        raise ValueError(msg)
     if model == Model.LLAMA_3_2_3B_INSTRUCT.model_id:
+        from models.llama3_2_3b_instruct import generate
     if model == Model.GEMMA_4_E2B.model_id:
+        from models.gemma4_e2b import generate
+    return generate(  # type: ignore
+        messages=messages,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        stop=stop,
+    )
 def list_models() -> dict[str, list[dict[str, Any]]]: