Fu01978 commited on
Commit
3ad9149
·
verified ·
1 Parent(s): ffe33f1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -33
app.py CHANGED
@@ -1,14 +1,24 @@
1
  import gradio as gr
2
- from transformers import AutoTokenizer, AutoModelForCausalLM
3
- import torch
4
 
5
- # Load model and tokenizer
6
- model_name = "unsloth/Llama-3.2-1B-Instruct"
7
- tokenizer = AutoTokenizer.from_pretrained(model_name)
8
- model = AutoModelForCausalLM.from_pretrained(
9
- model_name,
10
- dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
11
- device_map="auto"
 
 
 
 
 
 
 
 
 
 
12
  )
13
 
14
  def chat(message, history):
@@ -30,39 +40,23 @@ def chat(message, history):
30
  # Add current message
31
  messages.append({"role": "user", "content": message})
32
 
33
- # Apply chat template
34
- prompt = tokenizer.apply_chat_template(
35
- messages,
36
- tokenize=False,
37
- add_generation_prompt=True
38
- )
39
-
40
- # Tokenize
41
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
42
-
43
  # Generate response
44
- outputs = model.generate(
45
- **inputs,
46
- max_new_tokens=512,
47
  temperature=0.7,
48
  top_p=0.9,
49
- do_sample=True,
50
- pad_token_id=tokenizer.eos_token_id
51
  )
52
 
53
- # Decode and extract only the new response
54
- response = tokenizer.decode(outputs[0], skip_special_tokens=True)
55
-
56
- # Extract only the assistant's response (after the last prompt)
57
- response = response.split("assistant")[-1].strip()
58
-
59
- return response
60
 
61
  # Create Gradio interface
62
  demo = gr.ChatInterface(
63
  fn=chat,
64
- title="Llama 3.2 1B Instruct Chatbot",
65
- description="Chat with Llama 3.2 1B Instruct model. Ask me anything!",
66
  examples=[
67
  "What is artificial intelligence?",
68
  "Write a short poem about coding",
 
import gradio as gr
from llama_cpp import Llama
import os
import urllib.request

# GGUF model to serve: Llama 3.2 3B Instruct, Q6_K_L quantization.
model_url = "https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q6_K_L.gguf?download=true"
model_path = "model.gguf"

# Download the model once, atomically: fetch into a temp file and rename it
# into place so an interrupted download never leaves a partial model.gguf
# that a later start (whose os.path.exists check would pass) mistakes for a
# complete model.
if not os.path.exists(model_path):
    print("Downloading model...")
    tmp_path = model_path + ".part"
    try:
        urllib.request.urlretrieve(model_url, tmp_path)
        os.replace(tmp_path, model_path)  # atomic rename on POSIX and Windows
    except BaseException:
        # Remove the partial download so the next start retries cleanly.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    print("Model downloaded!")

# Load the GGUF model for CPU inference.
llm = Llama(
    model_path=model_path,
    n_ctx=2048,       # context window (tokens)
    n_threads=4,      # number of CPU threads
    n_gpu_layers=0,   # set to -1 to offload all layers to GPU if available
)
23
 
24
  def chat(message, history):
 
40
  # Add current message
41
  messages.append({"role": "user", "content": message})
42
 
 
 
 
 
 
 
 
 
 
 
43
  # Generate response
44
+ response = llm.create_chat_completion(
45
+ messages=messages,
46
+ max_tokens=512,
47
  temperature=0.7,
48
  top_p=0.9,
49
+ stream=False
 
50
  )
51
 
52
+ # Extract the assistant's response
53
+ return response["choices"][0]["message"]["content"]
 
 
 
 
 
54
 
55
  # Create Gradio interface
56
  demo = gr.ChatInterface(
57
  fn=chat,
58
+ title="Llama 3.2 3B Instruct Chatbot (GGUF)",
59
+ description="Chat with Llama 3.2 3B Instruct model running from GGUF format. Ask me anything!",
60
  examples=[
61
  "What is artificial intelligence?",
62
  "Write a short poem about coding",