Spaces:

llaa33219
/

context-window-extender

Running on Zero

App Files Community

llaa33219 committed on Mar 8

Commit

dd51a85

verified ·

1 Parent(s): 72ad230

Upload 3 files

Browse files

Files changed (2) hide show

app.py +15 -4
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -1,11 +1,12 @@
 import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 model_cache = {}
-def load_model_with_extension(model_id, extension_method, new_context_length, rope_type, rope_factor):
-    cache_key = f"{model_id}_{extension_method}_{new_context_length}_{rope_type}_{rope_factor}"
     if cache_key in model_cache:
         return model_cache[cache_key]
@@ -31,7 +32,16 @@ def load_model_with_extension(model_id, extension_method, new_context_length, ro
                 config.rope_scaling = {"type": "yarn", "factor": rope_factor, "original_max_position_embeddings": original_context}
                 config.rope_theta = original_theta
-    model = AutoModelForCausalLM.from_pretrained(model_id, config=config, torch_dtype=torch.float32, device_map="cpu", low_cpu_mem_usage=True, trust_remote_code=True)
     model.eval()
     result = {"model": model, "tokenizer": tokenizer, "original_context": original_context, "applied_context": new_context_length}
@@ -39,6 +49,7 @@ def load_model_with_extension(model_id, extension_method, new_context_length, ro
     return result
 def generate(model_id, extension_method, new_context_length, rope_type, rope_factor, prompt, max_new_tokens, temperature, top_p):
     if not model_id.strip():
         return "Error: Please enter a model ID"
@@ -54,7 +65,7 @@ def generate(model_id, extension_method, new_context_length, rope_type, rope_fac
     tokenizer = model_data["tokenizer"]
     try:
-        inputs = tokenizer(prompt, return_tensors="pt", truncation=False, padding=False)
         with torch.no_grad():
             outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p, do_sample=temperature > 0, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)
         generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

 import gradio as gr
+import spaces
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 model_cache = {}
+def load_model_with_extension(model_id, extension_method, new_context_length, rope_type, rope_factor, device="cuda"):
+    cache_key = f"{model_id}_{extension_method}_{new_context_length}_{rope_type}_{rope_factor}_{device}"
     if cache_key in model_cache:
         return model_cache[cache_key]
                 config.rope_scaling = {"type": "yarn", "factor": rope_factor, "original_max_position_embeddings": original_context}
                 config.rope_theta = original_theta
+    torch_dtype = torch.float16 if device == "cuda" else torch.float32
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        config=config,
+        torch_dtype=torch_dtype,
+        device_map=device,
+        low_cpu_mem_usage=True,
+        trust_remote_code=True
+    )
     model.eval()
     result = {"model": model, "tokenizer": tokenizer, "original_context": original_context, "applied_context": new_context_length}
     return result
+@spaces.GPU(duration=120)
 def generate(model_id, extension_method, new_context_length, rope_type, rope_factor, prompt, max_new_tokens, temperature, top_p):
     if not model_id.strip():
         return "Error: Please enter a model ID"
     tokenizer = model_data["tokenizer"]
     try:
+        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
         with torch.no_grad():
             outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p, do_sample=temperature > 0, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)
         generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 transformers>=4.35.0
 accelerate>=0.25.0
 huggingface-hub>=0.36.0,<1.0.0

 transformers>=4.35.0
 accelerate>=0.25.0
 huggingface-hub>=0.36.0,<1.0.0
+spaces>=0.0.15