Remostart committed
Commit 4b57ecb · verified · 1 Parent(s): ffa43a7

Update app.py

Files changed (1)
  1. app.py +15 -12
app.py CHANGED
@@ -4,7 +4,7 @@ from peft import PeftModel
 import torch
 import os
 from huggingface_hub import login
-import spaces  # Required for ZeroGPU
+import spaces

 # Authenticate with Hugging Face
 hf_token = os.getenv("HF_TOKEN")
@@ -12,7 +12,7 @@ login(token=hf_token)

 # Model repository IDs
 base_model_id = "meta-llama/Llama-3.2-3B-Instruct"
-peft_model_id = "ubiodee/Plutuslearn-Llama-3.2-3B-Instruct"  # Replace with your model repo (e.g., ubiodee/my-finetuned-model)
+peft_model_id = "ubiodee/<your-model-repo>"  # Replace with your model repo

 # Load the tokenizer from the fine-tuned model
 tokenizer = AutoTokenizer.from_pretrained(peft_model_id, token=hf_token)
@@ -21,7 +21,7 @@ tokenizer = AutoTokenizer.from_pretrained(peft_model_id, token=hf_token)
 base_model = AutoModelForCausalLM.from_pretrained(
     base_model_id,
     torch_dtype=torch.float16,
-    device_map="auto",  # Use GPU for ZeroGPU
+    device_map="auto",
     token=hf_token,
     low_cpu_mem_usage=True,
     trust_remote_code=True
@@ -31,14 +31,18 @@ base_model.resize_token_embeddings(len(tokenizer))
 # Load the PEFT adapter
 model = PeftModel.from_pretrained(base_model, peft_model_id, token=hf_token)

-# Define the prediction function with GPU support
-@spaces.GPU(duration=120)  # Allocate GPU for 120s per inference
+# Define the prediction function with proper device handling
+@spaces.GPU(duration=120)
 def predict(text, max_length=100):
     try:
         messages = [{"role": "user", "content": text}]
         inputs = tokenizer.apply_chat_template(messages, return_tensors="pt")
-        # Explicitly move inputs to GPU
-        inputs = {key: val.to("cuda:0") for key, val in inputs.items()}
+        # Move inputs to GPU if they are a dictionary of tensors
+        if isinstance(inputs, dict):
+            inputs = {key: val.to("cuda:0") for key, val in inputs.items()}
+        else:
+            # If inputs is a single tensor (unlikely but for robustness)
+            inputs = inputs.to("cuda:0")
         outputs = model.generate(**inputs, max_length=max_length)
         return tokenizer.decode(outputs[0], skip_special_tokens=True)
     except Exception as e:
@@ -57,11 +61,10 @@ demo = gr.Interface(
     flagging_mode="never"
 )

-# Launch with ZeroGPU settings
+# Launch the app
 demo.launch(
     server_name="0.0.0.0",
     server_port=7860,
-    ssr_mode=False,
-    share=True,  # Enable public URL
-    debug=True  # Enable debug mode for detailed logs
-)
+    share=True,
+    debug=True
+)
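One caveat worth flagging on the new device-handling branch: by default, tokenizer.apply_chat_template(..., return_tensors="pt") returns a bare tensor of token ids rather than a dict, so the else branch above would still hand a plain tensor to model.generate(**inputs), which expects a mapping. A minimal sketch of the same function that sidesteps this, assuming transformers >= 4.38 (where apply_chat_template accepts return_dict=True) and reusing the tokenizer, model, and spaces names from the file:

@spaces.GPU(duration=120)  # ZeroGPU: allocate a GPU for up to 120 s per call
def predict(text, max_length=100):
    try:
        messages = [{"role": "user", "content": text}]
        # return_dict=True yields {"input_ids": ..., "attention_mask": ...},
        # so generate(**inputs) always receives a mapping;
        # add_generation_prompt=True appends the assistant turn header,
        # which instruct models expect before generating.
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        )
        inputs = {key: val.to(model.device) for key, val in inputs.items()}
        # max_new_tokens counts only generated tokens; max_length would also
        # count the prompt and can leave no room for a reply on long inputs.
        outputs = model.generate(**inputs, max_new_tokens=max_length)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        return f"Error: {str(e)}"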
 
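For context, the hunks only show the tail of the gr.Interface block; the components sit outside the diff context, so the wiring below is a hypothetical reconstruction. The Textbox components and labels are assumptions; only fn=predict, flagging_mode="never", and the launch arguments are implied by the code above. flagging_mode is the Gradio 5 name for the older allow_flagging parameter.

import gradio as gr  # assumed near the top of app.py, outside the shown hunks

# Hypothetical reconstruction: input/output components are assumptions,
# only flagging_mode="never" and the launch settings appear in the diff.
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(label="Prompt", lines=4),
    outputs=gr.Textbox(label="Response"),
    flagging_mode="never",
)

demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=True,   # Spaces serve their own public URL, so this has no effect there
    debug=True,
)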