Erik22TY committed
Commit 2170a5b · verified · Parent(s): 8759cb6

Create app.py

Files changed (1):
  app.py  +98 −0
app.py ADDED
@@ -0,0 +1,98 @@

import gradio as gr
import torch
from transformers import AutoTokenizer

# loaders for the different quantization formats
from auto_gptq import AutoGPTQForCausalLM
from awq import AutoAWQForCausalLM

# available models: display name -> (Hugging Face repo id, loader type)
MODEL_OPTIONS = {
    "Llama-3.2-3B": ("meta-llama/Llama-3.2-3B-Instruct", "transformers"),
    "Llama-3.2-1B": ("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
    "Qwen2.5-3B-Instruct": ("Qwen/Qwen2.5-3B-Instruct", "transformers"),
    "Qwen2.5-1.5B-Instruct": ("Qwen/Qwen2.5-1.5B-Instruct", "transformers"),
    "OpenChat-3.5-0106-GPTQ": ("TheBloke/openchat-3.5-0106-GPTQ", "gptq"),
    "Gemma-3-4b-it-GPTQ": ("ISTA-DASLab/gemma-3-4b-it-GPTQ-4b-128g", "gptq"),
    "LLaMA2-7B-GPTQ": ("TheBloke/Llama-2-7B-GPTQ", "gptq"),
    "LLaMA2-7B-AWQ": ("TitanML/llama2-7b-base-4bit-AWQ", "awq"),
    "BTLM-3B-8k-base": ("cerebras/btlm-3b-8k-base", "transformers"),
    "SmolLM3-3B": ("HuggingFaceTB/SmolLM3-3B", "transformers"),
    "StableLM2-1.6B": ("stabilityai/stablelm-2-zephyr-1_6b", "transformers"),
    "Falcon-H1-1.5B-Deep": ("unsloth/Falcon-H1-1.5B-Deep-Instruct", "transformers"),
    "Mistral-7B-Instruct": ("mistralai/Mistral-7B-Instruct-v0.1", "transformers"),
}

# cache of already-loaded (tokenizer, model) pairs, keyed by display name
loaded = {}

SYSTEM_PROMPT = "You are HugginGPT — a helpful assistant that remembers context and follows instructions."

def load_model(model_key):
    """Load (and cache) the tokenizer and model for the selected entry."""
    model_id, mtype = MODEL_OPTIONS[model_key]
    # return the cached pair if this model was already loaded
    if model_key in loaded:
        return loaded[model_key]

    # plain transformers checkpoint
    if mtype == "transformers":
        from transformers import AutoModelForCausalLM
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.float16
        )

    # GPTQ-quantized checkpoint
    elif mtype == "gptq":
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        # quantization settings (bits, group size, act order) are read from the
        # repo's quantize_config.json, so nothing is hard-coded here
        model = AutoGPTQForCausalLM.from_quantized(
            model_id,
            use_safetensors=True,
            device="cuda:0"
        )

    # AWQ-quantized checkpoint
    elif mtype == "awq":
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=False)
        model = AutoAWQForCausalLM.from_quantized(
            model_id,
            fuse_layers=True,
            trust_remote_code=False,
            safetensors=True
        )

    else:
        raise ValueError(f"Unknown loader type '{mtype}' for {model_key}")

    loaded[model_key] = (tokenizer, model)
    return tokenizer, model

def generate_response(message, history, model_choice):
    tokenizer, model = load_model(model_choice)

    # build the prompt: system prompt first, then the running chat history
    # (history arrives as (user, assistant) pairs from gr.ChatInterface)
    context = f"system: {SYSTEM_PROMPT}\n"
    if history:
        for u, a in history:
            context += f"user: {u}\nassistant: {a}\n"
    context += f"user: {message}\nassistant:"

    inputs = tokenizer(context, return_tensors="pt").to(model.device)
    output = model.generate(
        **inputs,
        max_new_tokens=200,
        do_sample=True,
        top_p=0.9,
        temperature=0.8
    )
    # decode everything, then keep only the newly generated assistant turn
    text = tokenizer.decode(output[0], skip_special_tokens=True)
    reply = text.split("assistant:")[-1].strip()
    return reply
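
# --- Optional sketch, not part of the original commit -----------------------
# Most of the instruct models listed above ship a chat template, so instead of
# the hand-rolled "user:/assistant:" prompt built in generate_response, the
# history could be formatted with tokenizer.apply_chat_template. A minimal
# sketch, assuming the selected tokenizer actually defines a chat template;
# build_prompt_with_template is a hypothetical helper, not used by the app:
def build_prompt_with_template(tokenizer, message, history):
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for u, a in history or []:
        messages.append({"role": "user", "content": u})
        messages.append({"role": "assistant", "content": a})
    messages.append({"role": "user", "content": message})
    # returns a single prompt string with the model's own special tokens
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
# -----------------------------------------------------------------------------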

# Gradio UI: a chat interface plus a dropdown to pick the backing model
with gr.Blocks() as demo:
    gr.ChatInterface(
        fn=generate_response,
        title="HugginGPT",
        additional_inputs=[
            gr.Dropdown(choices=list(MODEL_OPTIONS.keys()),
                        value="Llama-3.2-3B",
                        label="Model")
        ]
    )

demo.launch()
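
Two notes that are not part of the commit itself. Going by the imports, the Space's requirements.txt presumably needs gradio, torch, transformers, accelerate (for device_map="auto"), auto-gptq, and autoawq. Separately, because the first chat request otherwise pays the full download-and-load cost, the default model could be warmed up once before launch; a minimal sketch that would sit just before demo.launch() (the try/except and its message are assumptions, not existing code):

# optional warm-up sketch: pre-load the default Dropdown model at startup
# (assumes a GPU is available and the default repo is accessible)
try:
    load_model("Llama-3.2-3B")
except Exception as exc:
    print(f"Warm-up skipped: {exc}")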