File size: 2,708 Bytes
9646204
503ff85
 
9646204
3603079
9646204
3603079
503ff85
 
 
27a8bb1
503ff85
 
 
 
5558821
e41281f
27a8bb1
 
3603079
27a8bb1
 
246f6b2
 
 
 
9646204
3603079
503ff85
 
3cbcab1
 
503ff85
 
 
3cbcab1
503ff85
27a8bb1
3cbcab1
503ff85
 
 
 
a8e01ad
503ff85
 
27a8bb1
503ff85
 
 
3cbcab1
503ff85
 
 
 
 
 
 
 
 
 
9646204
3cbcab1
 
3603079
3cbcab1
3603079
3cbcab1
 
c5ec3aa
 
 
9646204
c5ec3aa
 
3603079
9853f62
27a8bb1
9646204
 
 
1205c6a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# --- Model / tokenizer setup (runs once at import time) ---------------------
# Downloads the fine-tuned checkpoint from the Hugging Face Hub on first run
# (cached locally afterwards) and prepares it for CPU-only inference.
print("Downloading the model ...")

# Hub repo id of the fine-tuned model; presumably a LoRA-merged Llama-3.2-3B
# judging by the name and SYSTEM_PROMPT — TODO confirm.
model_name = "samzito12/lora_model3"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# The checkpoint has no dedicated pad token; reuse EOS so padded batches work.
tokenizer.pad_token = tokenizer.eos_token
# Left-pad so, for a decoder-only model, the generated continuation directly
# follows the prompt tokens.
tokenizer.padding_side = "left"

# CPU-only load: full fp32 weights; low_cpu_mem_usage lowers peak RAM while
# the state dict is being materialized.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cpu",
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True
)

print("✅ Downloaded model with CPU optimisations")
# Inference mode: disables dropout and other train-only behavior.
model.eval()

# System prompt prepended (as plain text) to every conversation in chat().
SYSTEM_PROMPT = """You are a helpful AI coding assistant based on Meta's Llama-3.2-3B model. 
Your task is to assist users with programming-related questions: write code snippets, debug code, explain concepts clearly, and provide best practices. 
Always respond in a concise, clear, and friendly manner, and adapt your explanations to the user's level."""


def chat(message, history, temperature=1.5, max_tokens=128):
    """Generate one assistant reply for the Gradio ChatInterface.

    Args:
        message: The user's newest message.
        history: Prior turns as (user_msg, assistant_msg) pairs
            (Gradio tuple-format history — TODO confirm the installed
            Gradio version still passes tuples, not message dicts).
        temperature: Sampling temperature from the UI slider; values <= 0
            fall back to greedy decoding.
        max_tokens: Maximum number of new tokens to generate (UI slider).

    Returns:
        The assistant's reply text with the prompt scaffolding stripped.
    """
    # Build a plain-text conversation prompt: system prompt, prior turns,
    # then the new message with a trailing "Assistant:" cue.
    conversation = f"System: {SYSTEM_PROMPT}\n\n"

    for user_msg, assistant_msg in history:
        conversation += f"User: {user_msg}\nAssistant: {assistant_msg}\n"

    conversation += f"User: {message}\nAssistant:"

    # Tokenize; truncate long histories so the prompt fits the model context.
    inputs = tokenizer(conversation, return_tensors="pt", truncation=True, max_length=1024, padding=True)

    # FIX: previously the slider-backed `temperature` / `max_tokens` arguments
    # were ignored and generation used hard-coded temperature=0.7 /
    # max_new_tokens=256; wire them through instead.
    do_sample = temperature > 0  # temperature=0 with do_sample=True raises in transformers
    generate_kwargs = {
        "max_new_tokens": int(max_tokens),
        "do_sample": do_sample,
        "use_cache": True,
        "pad_token_id": tokenizer.eos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        generate_kwargs["temperature"] = float(temperature)

    with torch.no_grad():
        outputs = model.generate(**inputs, **generate_kwargs)

    # Decode the full sequence (prompt + continuation).
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract only the newest assistant turn: take the text after the last
    # "Assistant:" marker, else fall back to slicing off the prompt prefix.
    if "Assistant:" in full_response:
        response = full_response.split("Assistant:")[-1].strip()
    else:
        response = full_response[len(conversation):].strip()

    return response

# --- UI wiring ---------------------------------------------------------------
# Slider components are forwarded to chat() as its extra positional arguments
# (temperature, max_tokens) via `additional_inputs`.
temperature_slider = gr.Slider(minimum=0, maximum=2, value=0.7, step=0.1, label="Temperature")
max_tokens_slider = gr.Slider(minimum=32, maximum=512, value=128, step=16, label="Max Tokens")

# Each example row is [message, temperature, max_tokens], matching the
# additional_inputs order.
example_prompts = [
    ["What model are you?", 0.7, 128],
    ["Explain machine learning in simple terms", 0.7, 128],
    ["Write a Python function to reverse a string", 0.7, 128],
]

demo = gr.ChatInterface(
    chat,
    title="Your Coding Assistant",
    description="""
    **Model:** This chatbot was fine-tuned to provide a free coding service, designed to assist users in writing, debugging, and optimizing code across various programming languages. 
    """,
    examples=example_prompts,
    additional_inputs=[temperature_slider, max_tokens_slider],
    theme="soft",
)

if __name__ == "__main__":
    demo.launch()