bricksandbot committed
Commit 8c04ab8 · verified · 1 Parent(s): ece3728

Deploy Buildsnpper chatbot Gradio interface


- Add Gradio chat interface for the Buildsnpper platform
- Serves the bricksandbotltd/buildsnpper-chatbot-merged model (4-bit NF4 quantization on ZeroGPU), replacing the earlier Q4_K_M GGUF build
- Includes 8 example questions
- Supports conversation history
- Clean, simple UI for customer support (a minimal wiring sketch follows below)
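
The diff below shows only fragments of the Gradio layout (the gr.Blocks header and two gr.Markdown calls); the widget that actually invokes chat(), and the list of example questions, fall outside the visible hunks. As a rough, hedged illustration of how the pieces described above typically fit together, a minimal wiring might look like the sketch below; the example questions and layout here are placeholders, not the Space's actual code.

```python
import gradio as gr


def chat(message, history):
    """Stand-in for the chat() function defined in app.py."""
    return "…"


# Placeholder example questions -- the real Space ships 8 of them, not shown in the diff.
EXAMPLES = [
    "How do I get started with Buildsnpper?",
    "How do I contact support?",
]

# Plausible wiring only: gr.ChatInterface manages the conversation history
# and passes it back into chat() on every turn.
with gr.Blocks(title="Buildsnpper Chatbot", theme=gr.themes.Soft()) as demo:
    gr.Markdown("Buildsnpper customer support chatbot")
    gr.ChatInterface(fn=chat, examples=EXAMPLES)

if __name__ == "__main__":
    demo.launch()
```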

Files changed (2)
  1. app.py +57 -25
  2. requirements.txt +6 -3
app.py CHANGED

```diff
@@ -1,31 +1,41 @@
 #!/usr/bin/env python3
 """
 Gradio interface for Buildsnpper Chatbot.
-Deployed as a HuggingFace Space.
+Deployed as a HuggingFace Space with ZeroGPU and 4-bit quantization.
 """
 import gradio as gr
-from llama_cpp import Llama
-import os
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+import spaces

 # Configuration
-MODEL_REPO = "bricksandbotltd/buildsnpper-chatbot-Q4_K_M"
-MODEL_FILE = "buildsnpper-chatbot-Q4_K_M.gguf"
-
-# Initialize model (loaded once at startup)
-print("Loading model...")
-llm = Llama.from_pretrained(
-    repo_id=MODEL_REPO,
-    filename=MODEL_FILE,
-    n_ctx=2048,  # Context window
-    n_threads=4,  # CPU threads
-    verbose=False
+MODEL_REPO = "bricksandbotltd/buildsnpper-chatbot-merged"
+
+# 4-bit quantization config
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4"
 )
+
+# Initialize model and tokenizer
+print("Loading model and tokenizer with 4-bit quantization...")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_REPO,
+    quantization_config=quantization_config,
+    device_map="auto",
+    trust_remote_code=True
+)
+model.eval()
 print("Model loaded successfully!")


+@spaces.GPU
 def chat(message, history):
     """
-    Process user message and generate response.
+    Process user message and generate response using ZeroGPU.

     Args:
         message: User's input message
@@ -43,16 +53,35 @@ def chat(message, history):
     # Add current message
     messages.append({"role": "user", "content": message})

-    # Generate response
-    response = llm.create_chat_completion(
-        messages=messages,
-        temperature=0.1,
-        top_p=0.9,
-        max_tokens=300,
-        stop=["<|endoftext|>", "<|end|>"]
+    # Format with chat template
+    prompt = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
     )

-    return response['choices'][0]['message']['content']
+    # Tokenize
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+    # Generate response
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=300,
+            temperature=0.1,
+            do_sample=True,
+            top_p=0.9,
+            pad_token_id=tokenizer.eos_token_id,
+        )
+
+    # Decode response
+    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+    # Extract just the assistant's response
+    if "<|assistant|>" in response:
+        response = response.split("<|assistant|>")[-1].strip()
+
+    return response


 # Example questions
@@ -81,6 +110,8 @@ with gr.Blocks(title="Buildsnpper Chatbot", theme=gr.themes.Soft()) as demo:
         - Technical issues

         **Note**: This chatbot is specialized for Buildsnpper platform questions only.
+
+        **Powered by**: ZeroGPU for fast inference
         """
     )

@@ -97,10 +128,11 @@ with gr.Blocks(title="Buildsnpper Chatbot", theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
         ---
-        **Model**: [bricksandbotltd/buildsnpper-chatbot-Q4_K_M](https://huggingface.co/bricksandbotltd/buildsnpper-chatbot-Q4_K_M)
+        **Model**: [bricksandbotltd/buildsnpper-chatbot-merged](https://huggingface.co/bricksandbotltd/buildsnpper-chatbot-merged)
         **Base Model**: microsoft/Phi-4-mini-instruct (3.8B parameters)
         **Fine-tuned**: LoRA on 89 Buildsnpper Q&A pairs
-        **Format**: GGUF Q4_K_M quantized
+        **Quantization**: 4-bit (NF4) with bitsandbytes
+        **Acceleration**: ZeroGPU
         """
     )

```
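
The second hunk's context skips the top of chat() (new lines 42-52), where the messages list is built from the Gradio conversation history. A rough reconstruction is sketched below, assuming Gradio's pair-style history format; the real code may differ, and may also prepend a system prompt that the diff does not show.

```python
# Hedged reconstruction of the elided start of chat(); not taken from the commit.
# Assumes Gradio's pair-style history: a list of [user_text, assistant_text] pairs.
def build_messages(message, history):
    messages = []
    # The real app may prepend a system prompt here; it is not visible in the diff.
    for user_turn, assistant_turn in history:
        messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})
    # This final step is visible in the diff ("Add current message"):
    messages.append({"role": "user", "content": message})
    return messages
```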
 
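
A note on the new @spaces.GPU decorator: on ZeroGPU hardware it attaches a GPU to the process for the duration of each decorated call and releases it afterwards, and it has no effect when the Space is not running on ZeroGPU. The spaces package also accepts a per-call duration hint, shown below with an assumed value that is not part of this commit.

```python
import spaces


@spaces.GPU(duration=120)  # assumed duration hint in seconds; the commit uses the bare decorator
def chat(message, history):
    # generation code as in app.py; the GPU is held only while this call runs
    ...
```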
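One caveat on the response-extraction step: model.generate returns the prompt tokens followed by the newly generated ones, and because the decode uses skip_special_tokens=True, the <|assistant|> marker may already have been stripped from the decoded text, in which case the split never triggers and the whole transcript is returned. A common, template-agnostic alternative (a sketch, not what this commit does) is to decode only the tokens that come after the prompt:

```python
def extract_reply(tokenizer, inputs, outputs):
    """Decode only the newly generated tokens (sketch; `inputs` and `outputs`
    are the tensors produced inside app.py's chat())."""
    prompt_len = inputs["input_ids"].shape[-1]
    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
```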
requirements.txt CHANGED

```diff
@@ -1,3 +1,6 @@
-gradio==4.44.0
-huggingface-hub==0.20.0
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.90-cpu/llama_cpp_python-0.2.90-cp310-cp310-manylinux_2_17_x86_64.whl
+gradio
+transformers
+torch
+accelerate
+bitsandbytes
+spaces
```
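
For orientation, the role each dependency plays in the new, unpinned requirements file (the annotations are editorial, not part of the commit):

```
gradio        # Blocks UI and chat widgets
transformers  # AutoTokenizer / AutoModelForCausalLM
torch         # tensor backend for inference
accelerate    # needed by transformers for device_map="auto"
bitsandbytes  # 4-bit NF4 quantization (expects a CUDA GPU)
spaces        # @spaces.GPU decorator for ZeroGPU
```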