Fix tokenizer size mismatch
app.py CHANGED
@@ -2,35 +2,57 @@ import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from peft import PeftModel
 import torch
+import os
+from huggingface_hub import login
 
-# …
-…
+# Authenticate with Hugging Face
+hf_token = os.getenv("HF_TOKEN")
+login(token=hf_token)
 
-# …
-…
+# Model repository IDs
+base_model_id = "meta-llama/Llama-3.2-3B-Instruct"
+peft_model_id = "ubiodee/Plutuslearn-Llama-3.2-3B-Instruct"
 
-# Load the …
+# Load the tokenizer from the fine-tuned model
+tokenizer = AutoTokenizer.from_pretrained(peft_model_id, token=hf_token)
+
+# Load the base model
 base_model = AutoModelForCausalLM.from_pretrained(
-    …
+    base_model_id,
     torch_dtype=torch.float16,
-    device_map="auto"
+    device_map="auto",
+    token=hf_token,
+    low_cpu_mem_usage=True,
+    trust_remote_code=True
 )
-model = PeftModel.from_pretrained(base_model, model_repo_id)
 
-# …
-…
-…
-…
-…
+# Resize the base model's embeddings to match the fine-tuned tokenizer
+base_model.resize_token_embeddings(len(tokenizer))
+
+# Load the PEFT adapter
+model = PeftModel.from_pretrained(base_model, peft_model_id, token=hf_token)
+
+# Define the prediction function with chat template
+def predict(text, max_length=100):
+    try:
+        messages = [{"role": "user", "content": text}]
+        inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
+        outputs = model.generate(inputs, max_length=max_length)
+        return tokenizer.decode(outputs[0], skip_special_tokens=True)
+    except Exception as e:
+        return f"Error during inference: {str(e)}"
 
-# Create Gradio interface
+# Create Gradio interface for ZeroGPU
 demo = gr.Interface(
     fn=predict,
-    inputs=…
+    inputs=[
+        gr.Textbox(label="Input Text"),
+        gr.Slider(label="Max Length", minimum=50, maximum=500, value=100, step=1)
+    ],
     outputs=gr.Textbox(label="Model Output"),
-    title="…
-    description="Test the fine-tuned model…
+    title="LearnPlutus Demo",
+    description="Test the fine-tuned Llama-3.2-3B-Instruct model on ZeroGPU.",
+    allow_flagging="never"
 )
 
-…
-demo.launch()
+demo.launch(server_name="0.0.0.0", server_port=7860)
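
For reference: the "tokenizer size mismatch" in the commit title is the shape conflict that appears when the tokenizer saved alongside the fine-tuned adapter contains more tokens than the base model's embedding matrix, which is why app.py now calls resize_token_embeddings before attaching the adapter. A minimal standalone check along these lines (an illustrative sketch, not part of app.py; it reuses the repo ids from the diff and assumes HF_TOKEN grants access to the gated Llama repo):

import os
from transformers import AutoModelForCausalLM, AutoTokenizer

base_model_id = "meta-llama/Llama-3.2-3B-Instruct"
peft_model_id = "ubiodee/Plutuslearn-Llama-3.2-3B-Instruct"
hf_token = os.getenv("HF_TOKEN")

# Tokenizer comes from the fine-tuned repo, weights from the base repo.
tokenizer = AutoTokenizer.from_pretrained(peft_model_id, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(base_model_id, token=hf_token)

# The mismatch: tokens added during fine-tuning make len(tokenizer) larger
# than the number of rows in the base model's embedding matrix.
embedding_rows = model.get_input_embeddings().weight.shape[0]
print(f"tokenizer: {len(tokenizer)} tokens, embeddings: {embedding_rows} rows")

if len(tokenizer) != embedding_rows:
    # Resize before PeftModel.from_pretrained, as the updated app.py does;
    # loading the adapter first would fail against the stale embedding shape.
    model.resize_token_embeddings(len(tokenizer))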
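
Once the Space is up, the endpoint can be smoke-tested from Python with gradio_client. The Space id below is a placeholder, since the diff only names the model repos; "/predict" is the endpoint name Gradio derives from the predict function:

from gradio_client import Client

# Hypothetical Space id; substitute the actual owner/name of this Space.
client = Client("ubiodee/your-space-name")

# Arguments mirror the Interface inputs: the text box, then the max-length slider.
result = client.predict("What is Plutus?", 100, api_name="/predict")
print(result)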