Remostart committed
Commit ffa43a7 · verified · Parent: a3a1052

Update app.py

Files changed (1): app.py (+11 -10)
app.py CHANGED
@@ -4,7 +4,7 @@ from peft import PeftModel
 import torch
 import os
 from huggingface_hub import login
-import spaces  # Required for @spaces.GPU decorator
+import spaces  # Required for ZeroGPU
 
 # Authenticate with Hugging Face
 hf_token = os.getenv("HF_TOKEN")
@@ -12,33 +12,34 @@ login(token=hf_token)
 
 # Model repository IDs
 base_model_id = "meta-llama/Llama-3.2-3B-Instruct"
-peft_model_id = "ubiodee/Plutuslearn-Llama-3.2-3B-Instruct"  # Replace with your fine-tuned model repo (e.g., ubiodee/my-finetuned-model)
+peft_model_id = "ubiodee/Plutuslearn-Llama-3.2-3B-Instruct"  # Replace with your model repo (e.g., ubiodee/my-finetuned-model)
 
 # Load the tokenizer from the fine-tuned model
 tokenizer = AutoTokenizer.from_pretrained(peft_model_id, token=hf_token)
 
-# Load the base model (ZeroGPU handles device placement automatically)
+# Load the base model
 base_model = AutoModelForCausalLM.from_pretrained(
     base_model_id,
     torch_dtype=torch.float16,
-    device_map="auto",  # Let ZeroGPU/accelerate handle GPU placement
+    device_map="auto",  # Use GPU for ZeroGPU
     token=hf_token,
     low_cpu_mem_usage=True,
     trust_remote_code=True
 )
-base_model.resize_token_embeddings(len(tokenizer))  # Fix vocabulary mismatch
+base_model.resize_token_embeddings(len(tokenizer))
 
 # Load the PEFT adapter
 model = PeftModel.from_pretrained(base_model, peft_model_id, token=hf_token)
 
-# Decorate the prediction function with @spaces.GPU to trigger GPU allocation
-@spaces.GPU(duration=120)  # 120s max runtime; adjust if your inferences are longer/shorter (default: 60s)
+# Define the prediction function with GPU support
+@spaces.GPU(duration=120)  # Allocate GPU for 120s per inference
 def predict(text, max_length=100):
     try:
         messages = [{"role": "user", "content": text}]
         inputs = tokenizer.apply_chat_template(messages, return_tensors="pt")
-        # No explicit .to("cuda") needed; device_map="auto" handles it
-        outputs = model.generate(inputs, max_length=max_length)
+        # Explicitly move inputs to GPU
+        inputs = {key: val.to("cuda:0") for key, val in inputs.items()}
+        outputs = model.generate(**inputs, max_length=max_length)
         return tokenizer.decode(outputs[0], skip_special_tokens=True)
     except Exception as e:
         return f"Error during inference: {str(e)}"
@@ -53,7 +54,7 @@ demo = gr.Interface(
     outputs=gr.Textbox(label="Model Output"),
     title="LearnPlutus Demo",
     description="Test the fine-tuned Llama-3.2-3B-Instruct model on ZeroGPU.",
-    flagging_mode="never"  # Updated for Gradio compatibility
+    flagging_mode="never"
 )
 
 # Launch with ZeroGPU settings
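
A note on the resize_token_embeddings() call kept by the second hunk: it exists because the fine-tuned tokenizer can carry more tokens than the base checkpoint (an added PAD token is the usual case), and the embedding matrix must match the tokenizer's vocabulary before the adapter loads. A guarded version, as a sketch; the size check is an illustrative addition, not part of the commit:

# Resize only when the tokenizer and the embedding table actually disagree.
# The length check is an illustrative addition, not part of the commit.
embedding_size = base_model.get_input_embeddings().weight.shape[0]
if len(tokenizer) != embedding_size:
    base_model.resize_token_embeddings(len(tokenizer))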
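
One thing worth flagging in the new predict(): by default, tokenizer.apply_chat_template(..., return_tensors="pt") returns a tensor of input IDs rather than a dict, so the added comprehension over inputs.items() would raise an AttributeError at inference time. A minimal corrected sketch, assuming a transformers version that supports return_dict=True; the add_generation_prompt flag and the switch to max_new_tokens are suggested adjustments, not part of the commit:

@spaces.GPU(duration=120)
def predict(text, max_length=100):
    try:
        messages = [{"role": "user", "content": text}]
        # return_dict=True yields {"input_ids": ..., "attention_mask": ...}
        # instead of a bare tensor, so .items() works as intended.
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        )
        # Follow the model's actual placement instead of hard-coding "cuda:0";
        # device_map="auto" decides where the weights live.
        inputs = {key: val.to(model.device) for key, val in inputs.items()}
        # max_new_tokens counts only generated tokens, so a long prompt
        # cannot silently eat the generation budget.
        outputs = model.generate(**inputs, max_new_tokens=max_length)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        return f"Error during inference: {str(e)}"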
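
For context, the gr.Interface block touched by the last hunk is only partially visible. A hypothetical reconstruction of the surrounding wiring; only the four keyword arguments shown in the hunk are confirmed by the diff, while fn, inputs, and the launch call are assumptions:

import gradio as gr

demo = gr.Interface(
    fn=predict,                             # assumption: wires up the predict() above
    inputs=gr.Textbox(label="Input Text"),  # assumption: a single text input
    outputs=gr.Textbox(label="Model Output"),
    title="LearnPlutus Demo",
    description="Test the fine-tuned Llama-3.2-3B-Instruct model on ZeroGPU.",
    flagging_mode="never",  # replaces the older allow_flagging in recent Gradio
)

if __name__ == "__main__":
    demo.launch()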