llaa33219 committed on
Commit
adbe710
·
verified ·
1 Parent(s): 360a4ff

Upload 3 files

Browse files
Files changed (1) hide show
  1. app.py +11 -4
app.py CHANGED
@@ -41,8 +41,11 @@ def calculate_context_length(base_context, multiplier):
41
  return base_context * multipliers.get(multiplier, 2)
42
 
43
 
44
- def load_model_with_extension(model_id, extension_method, new_context_length, rope_type, rope_factor, device="cuda"):
45
- cache_key = f"{model_id}_{extension_method}_{new_context_length}_{rope_type}_{rope_factor}_{device}"
 
 
 
46
 
47
  if cache_key in model_cache:
48
  return model_cache[cache_key]
@@ -74,7 +77,7 @@ def load_model_with_extension(model_id, extension_method, new_context_length, ro
74
  model_id,
75
  config=config,
76
  torch_dtype=torch_dtype,
77
- device_map=device,
78
  low_cpu_mem_usage=True,
79
  trust_remote_code=True
80
  )
@@ -223,6 +226,7 @@ with gr.Blocks(title="Context Window Extender - Chat") as demo:
223
  gr.Markdown("### 💬 Chat with the Model")
224
 
225
  # Conversational chat interface
 
226
  def respond(
227
  message: str,
228
  history: list,
@@ -268,7 +272,10 @@ with gr.Blocks(title="Context Window Extender - Chat") as demo:
268
  model = model_data["model"]
269
  tokenizer = model_data["tokenizer"]
270
 
271
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
 
 
 
272
 
273
  # Stream generation
274
  from transformers import TextIteratorStreamer
 
41
  return base_context * multipliers.get(multiplier, 2)
42
 
43
 
44
+ def load_model_with_extension(model_id, extension_method, new_context_length, rope_type, rope_factor):
45
+ """Load model - CPU by default, ZeroGPU will handle GPU allocation."""
46
+ device = "cpu" # Use CPU, ZeroGPU will move to GPU when needed
47
+
48
+ cache_key = f"{model_id}_{extension_method}_{new_context_length}_{rope_type}_{rope_factor}"
49
 
50
  if cache_key in model_cache:
51
  return model_cache[cache_key]
 
77
  model_id,
78
  config=config,
79
  torch_dtype=torch_dtype,
80
+ device_map="cpu", # Load on CPU, ZeroGPU handles GPU
81
  low_cpu_mem_usage=True,
82
  trust_remote_code=True
83
  )
 
226
  gr.Markdown("### 💬 Chat with the Model")
227
 
228
  # Conversational chat interface
229
+ @spaces.GPU(duration=120)
230
  def respond(
231
  message: str,
232
  history: list,
 
272
  model = model_data["model"]
273
  tokenizer = model_data["tokenizer"]
274
 
275
+ # Move model to GPU for generation
276
+ model = model.to("cuda")
277
+
278
+ inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
279
 
280
  # Stream generation
281
  from transformers import TextIteratorStreamer