Spaces: Running on Zero
Update app.py

app.py CHANGED
@@ -2,50 +2,121 @@ import spaces
 import gradio as gr
 from transformers import pipeline, AutoTokenizer, TextIteratorStreamer, AutoModelForCausalLM
 import torch
-from threading import Thread
+from threading import Thread, Lock, Event
 import os
+import asyncio
+import time
+from datetime import datetime
+import gc

 # Global dictionary to store preloaded models and tokenizers
 LOADED_MODELS = {}
 LOADED_TOKENIZERS = {}
+# Lock for thread-safe model access
+MODEL_LOCK = Lock()
+# Event to signal shutdown to the background reload thread
+SHUTDOWN_EVENT = Event()
+
+def clear_memory():
+    """Clear GPU and CPU memory"""
+    torch.cuda.empty_cache()
+    gc.collect()
+
+def load_single_model(model_name):
+    """Load a single model and tokenizer"""
+    try:
+        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Loading {model_name}...")
+
+        # Load model to CPU with bfloat16 to save memory
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.bfloat16,
+            trust_remote_code=True,
+            token=os.environ.get("token"),
+        )
+
+        # Load tokenizer
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_name,
+            trust_remote_code=True,
+            token=os.environ.get("token")
+        )
+        tokenizer.eos_token = "<|im_end|>"
+
+        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Successfully loaded {model_name}")
+        return model, tokenizer
+    except Exception as e:
+        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Failed to load {model_name}: {e}")
+        return None, None

 def preload_models(model_choices):
     """Preload all models to CPU at startup"""
-    print("Preloading models to CPU...")
-    [old inline per-model loading loop; most deleted lines are clipped in the rendered diff]
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_name,
-            trust_remote_code=True,
-            token=os.environ.get("token")
-        )
-        tokenizer.eos_token = "<|im_end|>"
+    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Preloading models to CPU...")
+
+    with MODEL_LOCK:
+        for model_name in model_choices:
+            model, tokenizer = load_single_model(model_name)
+            if model is not None and tokenizer is not None:
+                LOADED_MODELS[model_name] = model
+                LOADED_TOKENIZERS[model_name] = tokenizer
+
+def reload_models_task(model_choices):
+    """Background task to reload models every 15 minutes"""
+    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Starting model reload task...")
+
+    while not SHUTDOWN_EVENT.is_set():
+        # Wait for 15 minutes (900 seconds); wait() returns True as soon
+        # as SHUTDOWN_EVENT is set, so it doubles as a cancellable sleep
+        if SHUTDOWN_EVENT.wait(900):
+            break
+
+        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Starting periodic model reload...")
+
+        # Load fresh copies into temporary dictionaries before touching
+        # the live ones
+        new_models = {}
+        new_tokenizers = {}
+        for model_name in model_choices:
+            model, tokenizer = load_single_model(model_name)
+            if model is not None and tokenizer is not None:
+                new_models[model_name] = model
+                new_tokenizers[model_name] = tokenizer
+
+        # Replace old models with new ones atomically
+        with MODEL_LOCK:
+            # Drop old entries; iterate over a copy, since deleting from
+            # a dict while iterating it directly raises RuntimeError
+            for model_name in list(LOADED_MODELS):
+                LOADED_MODELS.pop(model_name, None)
+                LOADED_TOKENIZERS.pop(model_name, None)
+
+            # Clear memory
+            clear_memory()
+
+            # Update with new models
+            LOADED_MODELS.update(new_models)
+            LOADED_TOKENIZERS.update(new_tokenizers)
+
+        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Model reload completed")

 @spaces.GPU()
 def get_model_pipeline(model_name):
     """Move selected model to GPU and create pipeline"""
-    [six deleted lines, clipped in the rendered diff]
+    with MODEL_LOCK:
+        if model_name not in LOADED_MODELS:
+            raise ValueError(f"Model {model_name} not found in preloaded models")
+
+        # Get model and tokenizer references
+        model = LOADED_MODELS[model_name]
+        tokenizer = LOADED_TOKENIZERS[model_name]

     # Create pipeline with the GPU model
     pipe = pipeline(
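A note on the reload loop above: `Event.wait(timeout)` blocks for up to `timeout` seconds but returns True immediately once the event is set, so the single `SHUTDOWN_EVENT.wait(900)` call serves as both the 15-minute timer and the shutdown check. A minimal standalone sketch of the same pattern (all names here are illustrative, not from app.py):

from threading import Event, Thread
import time

stop = Event()

def periodic(interval_s: float):
    """Print a tick every interval_s seconds until stop is set."""
    while not stop.is_set():
        # Sleeps up to interval_s, but wakes early (returning True)
        # the moment stop.set() is called from another thread
        if stop.wait(interval_s):
            break
        print("tick")

t = Thread(target=periodic, args=(0.5,), daemon=True)
t.start()
time.sleep(2)   # let a few ticks happen
stop.set()      # cancels the in-progress wait immediately
t.join()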
@@ -134,6 +205,10 @@ model_choices = [
 # Preload all models to CPU at startup
 preload_models(model_choices)

+# Start the background reload task
+reload_thread = Thread(target=reload_models_task, args=(model_choices,), daemon=True)
+reload_thread.start()
+
 # Create Gradio interface
 g = gr.ChatInterface(
     fn=generate,
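The `daemon=True` flag in the hunk above matters for shutdown behavior: the interpreter does not wait for daemon threads, so a stuck reload cannot keep the Space from exiting. A small sketch of the difference, using only the standard library (the worker here is illustrative):

from threading import Thread
import time

def worker():
    # An infinite loop, like a reload task that never checks a stop flag
    while True:
        time.sleep(0.1)

# With daemon=True the process exits as soon as the main thread returns;
# with daemon=False this script would hang forever after the print
t = Thread(target=worker, daemon=True)
t.start()
print("main thread done; daemon worker is torn down with the process")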
@@ -160,4 +235,8 @@ g = gr.ChatInterface(
 )

 if __name__ == "__main__":
-    g.launch()
+    try:
+        g.launch()
+    finally:
+        # Signal the reload thread to stop when the app shuts down
+        SHUTDOWN_EVENT.set()