Create app.py
app.py
ADDED
import gradio as gr
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from peft import PeftModel
import torch

# Load the base model and tokenizer
max_seq_length = 4096
dtype = None  # None lets Unsloth auto-detect float16/bfloat16
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
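
# Note: the 4-bit load goes through bitsandbytes and expects a CUDA GPU;
# on a CPU-only machine from_pretrained will fail here.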

# Load the LoRA adapters
LORA_ADAPTER_PATH = "Sumit404/Llama-3.2-3B-Instruct-bnb-4bit-finetuned"  # Replace with your repo ID
model = PeftModel.from_pretrained(model, LORA_ADAPTER_PATH)

# Set up the tokenizer and model for inference
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.2",
)
tokenizer.pad_token = tokenizer.eos_token
FastLanguageModel.for_inference(model)  # enable Unsloth's faster inference mode
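
# generate_text builds a chat-formatted prompt from the user's message,
# samples a completion on the GPU, and returns only the assistant's reply.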
def generate_text(prompt):
    messages = [{"role": "user", "content": prompt}]
    # return_dict=True also returns the attention mask alongside input_ids.
    # Building the mask by hand from pad_token_id would be unreliable here:
    # pad_token is set to eos_token, so the template's <|eot_id|> tokens
    # inside the prompt would get masked out.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        padding=True,
        return_dict=True,
    ).to("cuda")

    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=128,  # raise this for longer answers
        use_cache=True,
        temperature=0.6,
        min_p=0.1,
    )

    # The output contains the prompt followed by the completion; decode only
    # the newly generated tokens so the reply comes back without the
    # chat-template scaffolding.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
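
# Optional smoke test (illustrative addition, not part of the original file):
# uncomment to verify the model and chat-template wiring end to end before
# the UI starts. The prompt is arbitrary.
# print(generate_text("What is LoRA fine-tuning?"))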

# Create the Gradio interface
interface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs="text",
    title="Fine-tuned Llama-3.2 Instruct Model",
    description="Ask a question to the fine-tuned model.",
)

# share=True creates a public link (needed when running from Colab;
# a Hugging Face Space serves the app without it)
interface.launch(share=True)
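
Since app.py imports gradio, unsloth, peft, and torch, deploying it as a Hugging Face Space (or running it on a fresh machine) needs a requirements.txt alongside it. The sketch below is an assumption inferred from the imports rather than part of this commit; bitsandbytes and accelerate are listed because the 4-bit load depends on them, and exact version pins are left out.

gradio
torch
unsloth
peft
transformers
bitsandbytes
accelerate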