Dhairyashil Ghatage committed on
Commit
f04dcd7
·
1 Parent(s): fa06863

add app and model data

Browse files
Files changed (5) hide show
  1. README.md +25 -13
  2. adapters.npz +3 -0
  3. app.py +52 -59
  4. models/phi2.py +138 -0
  5. utils.py +163 -0
README.md CHANGED
@@ -1,13 +1,25 @@
1
- ---
2
- title: Phi 2 QLoRA
3
- emoji: 💬
4
- colorFrom: yellow
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 4.36.1
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Phi 2 QLoRA Chatbot
2
+
3
+ 💬 A fine-tuned chatbot using Microsoft's Phi-2 model and QLoRA technique.
4
+
5
+ ## Overview
6
+
7
+ This project demonstrates a chatbot implementation using:
8
+ - [Gradio](https://gradio.app) for the user interface
9
+ - [Microsoft's Phi-2 model](https://huggingface.co/microsoft/phi-2) as the base language model
10
+ - [OpenAssistant Conversations Dataset (OASST1)](https://huggingface.co/datasets/OpenAssistant/oasst1) for fine-tuning
11
+ - GenAI code assistant
12
+
13
+ ## Fine-tuning
14
+
15
+ The model was fine-tuned using the QLoRA (Quantized Low-Rank Adaptation) technique. This approach allows for efficient fine-tuning of large language models on consumer-grade hardware.
16
+
17
+ Credit for the fine-tuning process goes to the excellent guide by [Deltaaruna](https://medium.com/rahasak/fine-tune-llms-on-your-pc-with-qlora-apple-mlx-c2aedf1f607d).
18
+
19
+ ## Usage
20
+
21
+ Launch the app locally with `python app.py`, then open the local URL that Gradio prints in your browser to ask questions.
22
+
23
+ ## License
24
+
25
+ MIT
adapters.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ceefba0222ee06b0c1d1885f0d57dabcfec25f9173c49409187285a838d5c4db
3
+ size 2629974
app.py CHANGED
@@ -1,63 +1,56 @@
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
3
-
4
- """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
8
-
9
-
10
- def respond(
11
- message,
12
- history: list[tuple[str, str]],
13
- system_message,
14
- max_tokens,
15
- temperature,
16
- top_p,
17
- ):
18
- messages = [{"role": "system", "content": system_message}]
19
-
20
- for val in history:
21
- if val[0]:
22
- messages.append({"role": "user", "content": val[0]})
23
- if val[1]:
24
- messages.append({"role": "assistant", "content": val[1]})
25
-
26
- messages.append({"role": "user", "content": message})
27
-
28
- response = ""
29
-
30
- for message in client.chat_completion(
31
- messages,
32
- max_tokens=max_tokens,
33
- stream=True,
34
- temperature=temperature,
35
- top_p=top_p,
36
- ):
37
- token = message.choices[0].delta.content
38
-
39
- response += token
40
- yield response
41
-
42
- """
43
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
44
- """
45
- demo = gr.ChatInterface(
46
- respond,
47
- additional_inputs=[
48
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
49
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
50
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
51
- gr.Slider(
52
- minimum=0.1,
53
- maximum=1.0,
54
- value=0.95,
55
- step=0.05,
56
- label="Top-p (nucleus sampling)",
57
- ),
58
  ],
 
 
 
59
  )
60
 
61
-
62
- if __name__ == "__main__":
63
- demo.launch()
 
1
  import gradio as gr
2
+ import mlx.core as mx
3
+ import utils
4
+
5
# Load the model and tokenizer
def load_model(model_path, adapter_path):
    """Load the base model/tokenizer and optionally apply LoRA adapter weights.

    Args:
        model_path: Directory containing the converted MLX model files.
        adapter_path: Path to an ``adapters.npz`` file, or a falsy value to
            skip adapter loading entirely.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model, tokenizer, _ = utils.load(model_path)
    if adapter_path:
        try:
            adapter_weights = mx.load(adapter_path)
            # BUG FIX: the old code filtered flat dotted keys (e.g.
            # "model.layers.0...") against model.parameters(), which is a
            # *nested* dict whose top-level keys are module names — so the
            # filter dropped every adapter weight. load_weights with
            # strict=False already tolerates keys absent from the model.
            model.load_weights(list(adapter_weights.items()), strict=False)
            print(f"Loaded adapter weights from {adapter_path}")
        except Exception as e:
            # Best-effort: fall back to the base model if adapters fail.
            print(f"Error loading adapter weights: {str(e)}")
    return model, tokenizer
18
+
19
# Generate response
def generate_response(model, tokenizer, prompt, max_tokens, temperature):
    """Sample up to ``max_tokens`` tokens from the model and decode them.

    Args:
        model: The loaded MLX model.
        tokenizer: Tokenizer providing ``encode``/``decode`` and ``eos_token_id``.
        prompt: The text prompt to condition on.
        max_tokens: Hard cap on the number of generated tokens.
        temperature: Sampling temperature passed through to ``utils.generate``.

    Returns:
        The decoded generated text (without the EOS token).
    """
    prompt_tokens = mx.array(tokenizer.encode(prompt))

    generated_tokens = []
    for token in utils.generate(prompt_tokens, model, temperature):
        token_id = token.item()
        # BUG FIX: stop *before* appending EOS so it does not leak into the
        # decoded output (the old code appended first, then checked).
        if token_id == tokenizer.eos_token_id:
            break
        generated_tokens.append(token_id)
        if len(generated_tokens) >= max_tokens:
            break

    return tokenizer.decode(generated_tokens)
30
+
31
# Inference function
def infer(question, max_tokens, temperature):
    """Wrap *question* in the Q/A prompt template and return the model's answer."""
    qa_prompt = f"Q: {question}\nA:"
    return generate_response(model, tokenizer, qa_prompt, max_tokens, temperature)
36
+
37
# Load the model and tokenizer once at import time so every request reuses them.
model_path = "./phi-2"  # Update this with the actual path to your model
adapter_path = "./adapters.npz"  # Update this with the actual path to your adapters
model, tokenizer = load_model(model_path, adapter_path)

# Create the Gradio interface
iface = gr.Interface(
    fn=infer,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your question here..."),
        gr.Slider(minimum=1, maximum=500, value=100, step=1, label="Max Tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
    ],
    outputs="text",
    title="Fine-tuned Phi-2 Q&A Demo",
    description="Ask a question and get an answer from the fine-tuned Phi-2 model. Finetuned on OASST1 dataset."
)

# BUG FIX: launch only when run as a script; the old code launched
# unconditionally, which blocks any module that merely imports app.py.
if __name__ == "__main__":
    iface.launch()
 
models/phi2.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from dataclasses import dataclass
3
+
4
+ import mlx.core as mx
5
+ import mlx.nn as nn
6
+
7
+ from .base import BaseModelArgs
8
+
9
+
10
@dataclass
class ModelArgs(BaseModelArgs):
    """Phi-2 architecture hyperparameters (defaults match microsoft/phi-2)."""

    n_positions: int = 2048  # maximum sequence length
    vocab_size: int = 51200
    n_embd: int = 2560  # hidden (embedding) size
    n_head: int = 32  # number of attention heads
    n_layer: int = 32  # number of transformer blocks
    rotary_dim: int = 32  # head dimensions that receive rotary embeddings
18
+
19
+
20
class LayerNorm(nn.LayerNorm):
    """LayerNorm computed in float32, with the result cast back to the input dtype."""

    def __call__(self, x: mx.array) -> mx.array:
        original_dtype = x.dtype
        normalized = super().__call__(x.astype(mx.float32))
        return normalized.astype(original_dtype)
23
+
24
+
25
class RoPEAttention(nn.Module):
    """Multi-head self-attention with rotary position embeddings and a KV cache."""

    def __init__(self, dims: int, n_head: int, rotary_dim: int):
        super().__init__()

        self.n_head = n_head

        # Query/key/value projections plus the output projection (`dense`).
        # Attribute names must match the checkpoint's weight keys.
        self.q_proj = nn.Linear(dims, dims)
        self.k_proj = nn.Linear(dims, dims)
        self.v_proj = nn.Linear(dims, dims)
        self.dense = nn.Linear(dims, dims)

        # Rotary embedding applied over `rotary_dim` dimensions of each head.
        self.rope = nn.RoPE(rotary_dim, traditional=False)

    def __call__(self, x, mask=None, cache=None):
        queries, keys, values = self.q_proj(x), self.k_proj(x), self.v_proj(x)

        # Extract some shapes
        n_head = self.n_head
        B, L, D = queries.shape

        # Prepare the queries, keys and values for the attention computation:
        # reshape to (batch, heads, seq, head_dim).
        queries = queries.reshape(B, L, n_head, -1).transpose(0, 2, 1, 3)
        keys = keys.reshape(B, L, n_head, -1).transpose(0, 2, 1, 3)
        values = values.reshape(B, L, n_head, -1).transpose(0, 2, 1, 3)

        # Add RoPE to the queries and keys and combine them with the cache
        if cache is not None:
            # Offset RoPE by the cached sequence length so that positions
            # stay absolute across incremental decode steps.
            key_cache, value_cache = cache
            queries = self.rope(queries, offset=key_cache.shape[2])
            keys = self.rope(keys, offset=key_cache.shape[2])
            keys = mx.concatenate([key_cache, keys], axis=2)
            values = mx.concatenate([value_cache, values], axis=2)
        else:
            queries = self.rope(queries)
            keys = self.rope(keys)

        # Scores are computed in float32 for numerical stability.
        queries = queries.astype(mx.float32)
        keys = keys.astype(mx.float32)

        # Finally perform the attention computation
        scale = math.sqrt(1 / queries.shape[-1])
        scores = (queries * scale) @ keys.transpose(0, 1, 3, 2)
        if mask is not None:
            scores = scores + mask

        scores = mx.softmax(scores, axis=-1).astype(values.dtype)
        # Merge heads back into (batch, seq, dims).
        values_hat = (scores @ values).transpose(0, 2, 1, 3).reshape(B, L, -1)

        # Return the projected output and the updated (keys, values) cache.
        return self.dense(values_hat), (keys, values)
74
+
75
+
76
class MLP(nn.Module):
    """Two-layer feed-forward block with a GELU activation in between."""

    def __init__(self, dim, hidden_dim):
        super().__init__()
        # Attribute names must match the checkpoint's weight keys.
        self.fc1 = nn.Linear(dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, dim)
        self.act = nn.GELU(approx="precise")

    def __call__(self, x) -> mx.array:
        hidden = self.act(self.fc1(x))
        return self.fc2(hidden)
85
+
86
+
87
class ParallelBlock(nn.Module):
    """Phi-2 transformer block: attention and MLP both consume the same
    normalized input and are summed with the residual (parallel layout)."""

    def __init__(self, config: ModelArgs):
        super().__init__()
        dims = config.n_embd
        # MLP hidden size is 4x the embedding size.
        mlp_dims = dims * 4
        self.self_attn = RoPEAttention(dims, config.n_head, config.rotary_dim)
        self.input_layernorm = LayerNorm(dims)
        self.mlp = MLP(dims, mlp_dims)

    def __call__(self, x, mask, cache):
        # A single shared LayerNorm feeds both branches.
        h = self.input_layernorm(x)
        attn_h, cache = self.self_attn(h, mask, cache)
        ff_h = self.mlp(h)
        # Parallel residual: attention(h) + mlp(h) + x.
        return attn_h + ff_h + x, cache
101
+
102
+
103
class Transformer(nn.Module):
    """Token embedding, a stack of ParallelBlocks, and a final LayerNorm."""

    def __init__(self, config: ModelArgs):
        super().__init__()
        self.embed_tokens = nn.Embedding(config.vocab_size, config.n_embd)
        self.layers = [ParallelBlock(config) for i in range(config.n_layer)]
        self.final_layernorm = LayerNorm(config.n_embd)

    def __call__(self, x, mask, cache):
        x = self.embed_tokens(x)
        if cache is None:
            # One (key, value) cache slot per layer, filled on the first pass.
            cache = [None] * len(self.layers)

        # Each layer updates its own cache slot in place.
        for e, layer in enumerate(self.layers):
            x, cache[e] = layer(x, mask, cache[e])
        return self.final_layernorm(x), cache
118
+
119
+
120
class Model(nn.Module):
    """Phi-2 causal language model: transformer backbone plus a linear LM head."""

    def __init__(self, config: ModelArgs):
        super().__init__()
        # Attribute names must match the checkpoint's weight keys.
        self.model = Transformer(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)

    def __call__(
        self,
        x: mx.array,
        mask: mx.array = None,
        cache: list = None,
    ) -> tuple[mx.array, list]:
        """Return ``(logits, cache)`` for token ids *x* of shape (batch, seq).

        BUG FIX: the old code unconditionally reset ``mask = None``, silently
        discarding any caller-supplied mask. Build the causal mask only when
        none was given and a multi-token prompt is being processed; a
        single-token decode step needs no mask. Callers that passed no mask
        see identical behavior.
        """
        if mask is None and x.shape[1] > 1:
            mask = nn.MultiHeadAttention.create_additive_causal_mask(x.shape[1])
            mask = mask.astype(x.dtype)

        y, cache = self.model(x, mask, cache)
        return self.lm_head(y), cache
utils.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright © 2023 Apple Inc.
2
+
3
+ import glob
4
+ import json
5
+ import logging
6
+ from pathlib import Path
7
+ from typing import Generator
8
+
9
+ import mlx.core as mx
10
+ import mlx.nn as nn
11
+ import models.phi2 as phi2
12
+ import transformers
13
+ from huggingface_hub import snapshot_download
14
+ from transformers import AutoTokenizer
15
+
16
# Constants
# Maps the `model_type` field of a Hugging Face config.json to the module
# implementing that architecture; consulted by `_get_classes`.
MODEL_MAPPING = {
    "phi": phi2,
}
20
+
21
+
22
def _get_classes(config: dict):
    """Look up the architecture classes for a model configuration.

    Args:
        config (dict): The model configuration; must contain ``model_type``.

    Returns:
        A tuple ``(Model, ModelArgs)`` from the matching architecture module.

    Raises:
        ValueError: If the ``model_type`` is not in ``MODEL_MAPPING``.
    """
    model_type = config["model_type"]
    if model_type in MODEL_MAPPING:
        arch = MODEL_MAPPING[model_type]
        return arch.Model, arch.ModelArgs

    msg = f"Model type {model_type} not supported."
    logging.error(msg)
    raise ValueError(msg)
40
+
41
+
42
def fetch_from_hub(hf_path: str):
    """Download a model snapshot from the Hugging Face Hub and load its parts.

    Args:
        hf_path (str): Hub repository id to download.

    Returns:
        A tuple of ``(weights dict, config dict, tokenizer)``.

    Raises:
        FileNotFoundError: If the snapshot contains no ``*.safetensors`` files.
    """
    # Only fetch the files actually needed: configs, weights, tokenizer.
    model_path = snapshot_download(
        repo_id=hf_path,
        allow_patterns=["*.json", "*.safetensors", "tokenizer.model"],
    )

    weight_files = glob.glob(f"{model_path}/*.safetensors")
    if not weight_files:
        raise FileNotFoundError("No safetensors found in {}".format(model_path))

    # Merge every shard into a single flat dict.
    weights = {}
    for weight_file in weight_files:
        weights.update(mx.load(weight_file))

    config = transformers.AutoConfig.from_pretrained(hf_path)
    tokenizer = transformers.AutoTokenizer.from_pretrained(hf_path)
    return weights, config.to_dict(), tokenizer
60
+
61
+
62
def make_shards(weights: dict, max_file_size_gibibyte: int = 15):
    """Split a weights dict into shards that each fit within a size budget.

    Args:
        weights (dict): Mapping of parameter name -> array (any object
            exposing an ``nbytes`` attribute).
        max_file_size_gibibyte (int): Upper bound per shard, in GiB. A single
            tensor larger than the budget still gets its own shard.

    Returns:
        A list of dicts whose union is ``weights``; insertion order is
        preserved. Empty input yields an empty list.
    """
    max_file_size_bytes = max_file_size_gibibyte << 30
    shards = []
    shard, shard_size = {}, 0
    for k, v in weights.items():
        # BUG FIX: only flush a *non-empty* shard. The old code appended an
        # empty {} shard whenever the very first tensor alone exceeded the
        # budget (and returned [{}] for empty input).
        if shard and shard_size + v.nbytes > max_file_size_bytes:
            shards.append(shard)
            shard, shard_size = {}, 0
        shard[k] = v
        shard_size += v.nbytes
    if shard:
        shards.append(shard)
    return shards
74
+
75
+
76
def save_model(save_dir: str, weights, tokenizer, config):
    """Write sharded safetensors, tokenizer files, and config.json to *save_dir*.

    Args:
        save_dir (str): Output directory (created if missing).
        weights: Flat dict of parameter name -> array.
        tokenizer: Tokenizer with a ``save_pretrained`` method.
        config: JSON-serializable model configuration.
    """
    output_dir = Path(save_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # 5 GiB shards keep each file comfortably under Hub upload limits.
    shards = make_shards(weights, max_file_size_gibibyte=5)
    shards_count = len(shards)
    if shards_count > 1:
        shard_file_format = "model-{:05d}-of-{:05d}.safetensors"
    else:
        shard_file_format = "model.safetensors"

    for index, shard in enumerate(shards, start=1):
        shard_name = shard_file_format.format(index, shards_count)
        mx.save_safetensors(str(output_dir / shard_name), shard)

    tokenizer.save_pretrained(output_dir)

    with open(output_dir / "config.json", "w") as fid:
        json.dump(config, fid, indent=4)
96
+
97
+
98
def load(path):
    """Load a converted MLX model, its tokenizer, and its args from *path*.

    Args:
        path: Directory containing ``config.json``, tokenizer files, and one
            or more ``*.safetensors`` weight shards.

    Returns:
        A tuple of ``(model, tokenizer, model_args)``.

    Raises:
        FileNotFoundError: If no ``*.safetensors`` files are present.
    """
    model_path = Path(path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Read the model configuration.
    with open(model_path / "config.json", "r") as f:
        config = json.load(f)

    # Resolve the architecture classes and build an (uninitialized) model.
    model_class, model_args_class = _get_classes(config)
    model_args = model_args_class.from_dict(config)
    model = model_class(model_args)

    # Collect every weight shard.
    weight_files = glob.glob(str(model_path / "*.safetensors"))
    if not weight_files:
        raise FileNotFoundError(f"No .safetensors files found in {model_path}")

    weights = {}
    for weight_file in weight_files:
        weights.update(mx.load(weight_file))

    # A quantized checkpoint must have the model quantized *before* the
    # weights are loaded, so parameter shapes match.
    if "quantization" in config:
        print("[INFO] Loading quantized model")
        quantization = config["quantization"]
        nn.quantize(model, quantization["group_size"], quantization["bits"])

    model.load_weights(list(weights.items()))
    return model, tokenizer, model_args
133
+
134
+
135
def generate(
    prompt: mx.array, model: nn.Module, temp: float = 0.0
) -> Generator[mx.array, None, None]:
    """
    Generate text based on the given prompt and model.

    Args:
        prompt (mx.array): The input prompt.
        model (nn.Module): The model to use for generation.
        temp (float): The temperature for sampling. If temp is 0, use max sampling.

    Yields:
        mx.array: One sampled token id per iteration. The generator never
        terminates on its own; the caller decides when to stop.
    """

    def sample(logits: mx.array) -> mx.array:
        # Greedy decoding at temperature 0, otherwise scaled sampling.
        if temp == 0:
            return mx.argmax(logits, axis=-1)
        return mx.random.categorical(logits * (1 / temp))

    tokens = prompt
    cache = None
    while True:
        # Feed only the newest token(s); the KV cache carries the history.
        logits, cache = model(tokens[None], cache=cache)
        last_logits = logits[:, -1, :]
        tokens = sample(last_logits)
        yield tokens