Dhairyashil Ghatage committed on
Commit
f04dcd7
·
1 Parent(s): fa06863

add app and model data

Browse files
Files changed (5) hide show
  1. README.md +25 -13
  2. adapters.npz +3 -0
  3. app.py +52 -59
  4. models/phi2.py +138 -0
  5. utils.py +163 -0
README.md CHANGED
@@ -1,13 +1,25 @@
1
- ---
2
- title: Phi 2 QLoRA
3
- emoji: 💬
4
- colorFrom: yellow
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 4.36.1
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Phi 2 QLoRA Chatbot
2
+
3
+ 💬 A fine-tuned chatbot using Microsoft's Phi-2 model and QLoRA technique.
4
+
5
+ ## Overview
6
+
7
+ This project demonstrates a chatbot implementation using:
8
+ - [Gradio](https://gradio.app) for the user interface
9
+ - [Microsoft's Phi-2 model](https://huggingface.co/microsoft/phi-2) as the base language model
10
+ - [OpenAssistant Conversations Dataset (OASST1)](https://huggingface.co/datasets/OpenAssistant/oasst1) for fine-tuning
11
+ - GenAI code assistant
12
+
13
+ ## Fine-tuning
14
+
15
+ The model was fine-tuned using the QLoRA (Quantized Low-Rank Adaptation) technique. This approach allows for efficient fine-tuning of large language models on consumer-grade hardware.
16
+
17
+ Credit for the fine-tuning process goes to the excellent guide by [Deltaaruna](https://medium.com/rahasak/fine-tune-llms-on-your-pc-with-qlora-apple-mlx-c2aedf1f607d).
18
+
19
+ ## Usage
20
+
21
+ Launch the app locally with `python app.py`, then open the local URL that Gradio prints in your browser to ask questions.
22
+
23
+ ## License
24
+
25
+ MIT
adapters.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ceefba0222ee06b0c1d1885f0d57dabcfec25f9173c49409187285a838d5c4db
3
+ size 2629974
app.py CHANGED
@@ -1,63 +1,56 @@
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
3
-
4
- """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
8
-
9
-
10
- def respond(
11
- message,
12
- history: list[tuple[str, str]],
13
- system_message,
14
- max_tokens,
15
- temperature,
16
- top_p,
17
- ):
18
- messages = [{"role": "system", "content": system_message}]
19
-
20
- for val in history:
21
- if val[0]:
22
- messages.append({"role": "user", "content": val[0]})
23
- if val[1]:
24
- messages.append({"role": "assistant", "content": val[1]})
25
-
26
- messages.append({"role": "user", "content": message})
27
-
28
- response = ""
29
-
30
- for message in client.chat_completion(
31
- messages,
32
- max_tokens=max_tokens,
33
- stream=True,
34
- temperature=temperature,
35
- top_p=top_p,
36
- ):
37
- token = message.choices[0].delta.content
38
-
39
- response += token
40
- yield response
41
-
42
- """
43
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
44
- """
45
- demo = gr.ChatInterface(
46
- respond,
47
- additional_inputs=[
48
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
49
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
50
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
51
- gr.Slider(
52
- minimum=0.1,
53
- maximum=1.0,
54
- value=0.95,
55
- step=0.05,
56
- label="Top-p (nucleus sampling)",
57
- ),
58
  ],
 
 
 
59
  )
60
 
61
-
62
- if __name__ == "__main__":
63
- demo.launch()
 
1
  import gradio as gr
2
+ import mlx.core as mx
3
+ import utils
4
+
5
# Load the model and tokenizer
def load_model(model_path, adapter_path):
    """Load the base model/tokenizer and optionally apply LoRA adapter weights.

    Args:
        model_path: Directory containing the converted MLX model files.
        adapter_path: Path to an ``adapters.npz`` file, or a falsy value to
            skip adapter loading entirely.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model, tokenizer, _ = utils.load(model_path)
    if adapter_path:
        try:
            adapter_weights = mx.load(adapter_path)
            # BUG FIX: the old code filtered flat dotted keys (e.g.
            # "model.layers.0...") against model.parameters(), which is a
            # *nested* dict whose top-level keys are module names — so the
            # filter dropped every adapter weight. load_weights with
            # strict=False already tolerates keys absent from the model.
            model.load_weights(list(adapter_weights.items()), strict=False)
            print(f"Loaded adapter weights from {adapter_path}")
        except Exception as e:
            # Best-effort: fall back to the base model if adapters fail.
            print(f"Error loading adapter weights: {str(e)}")
    return model, tokenizer
18
+
19
# Generate response
def generate_response(model, tokenizer, prompt, max_tokens, temperature):
    """Sample up to ``max_tokens`` tokens from the model and decode them.

    Args:
        model: The loaded MLX model.
        tokenizer: Tokenizer providing ``encode``/``decode`` and ``eos_token_id``.
        prompt: The text prompt to condition on.
        max_tokens: Hard cap on the number of generated tokens.
        temperature: Sampling temperature passed through to ``utils.generate``.

    Returns:
        The decoded generated text (without the EOS token).
    """
    prompt_tokens = mx.array(tokenizer.encode(prompt))

    generated_tokens = []
    for token in utils.generate(prompt_tokens, model, temperature):
        token_id = token.item()
        # BUG FIX: stop *before* appending EOS so it does not leak into the
        # decoded output (the old code appended first, then checked).
        if token_id == tokenizer.eos_token_id:
            break
        generated_tokens.append(token_id)
        if len(generated_tokens) >= max_tokens:
            break

    return tokenizer.decode(generated_tokens)
30
+
31
# Inference function
def infer(question, max_tokens, temperature):
    """Wrap *question* in the Q/A prompt template and return the model's answer."""
    qa_prompt = f"Q: {question}\nA:"
    return generate_response(model, tokenizer, qa_prompt, max_tokens, temperature)
36
+
37
# Load the model and tokenizer once at import time so every request reuses them.
model_path = "./phi-2"  # Update this with the actual path to your model
adapter_path = "./adapters.npz"  # Update this with the actual path to your adapters
model, tokenizer = load_model(model_path, adapter_path)

# Create the Gradio interface
iface = gr.Interface(
    fn=infer,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your question here..."),
        gr.Slider(minimum=1, maximum=500, value=100, step=1, label="Max Tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
    ],
    outputs="text",
    title="Fine-tuned Phi-2 Q&A Demo",
    description="Ask a question and get an answer from the fine-tuned Phi-2 model. Finetuned on OASST1 dataset."
)

# BUG FIX: launch only when run as a script; the old code launched
# unconditionally, which blocks any module that merely imports app.py.
if __name__ == "__main__":
    iface.launch()
 
models/phi2.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from dataclasses import dataclass
3
+
4
+ import mlx.core as mx
5
+ import mlx.nn as nn
6
+
7
+ from .base import BaseModelArgs
8
+
9
+
10
@dataclass
class ModelArgs(BaseModelArgs):
    """Phi-2 architecture hyperparameters (defaults match microsoft/phi-2)."""

    n_positions: int = 2048  # maximum sequence length
    vocab_size: int = 51200
    n_embd: int = 2560  # hidden (embedding) size
    n_head: int = 32  # number of attention heads
    n_layer: int = 32  # number of transformer blocks
    rotary_dim: int = 32  # head dimensions that receive rotary embeddings
18
+
19
+
20
class LayerNorm(nn.LayerNorm):
    """LayerNorm computed in float32, with the result cast back to the input dtype."""

    def __call__(self, x: mx.array) -> mx.array:
        original_dtype = x.dtype
        normalized = super().__call__(x.astype(mx.float32))
        return normalized.astype(original_dtype)
23
+
24
+
25
class RoPEAttention(nn.Module):
    """Multi-head self-attention with rotary position embeddings and a KV cache."""

    def __init__(self, dims: int, n_head: int, rotary_dim: int):
        super().__init__()

        self.n_head = n_head

        # Query/key/value projections plus the output projection (`dense`).
        # Attribute names must match the checkpoint's weight keys.
        self.q_proj = nn.Linear(dims, dims)
        self.k_proj = nn.Linear(dims, dims)
        self.v_proj = nn.Linear(dims, dims)
        self.dense = nn.Linear(dims, dims)

        # Rotary embedding applied over `rotary_dim` dimensions of each head.
        self.rope = nn.RoPE(rotary_dim, traditional=False)

    def __call__(self, x, mask=None, cache=None):
        queries, keys, values = self.q_proj(x), self.k_proj(x), self.v_proj(x)

        # Extract some shapes
        n_head = self.n_head
        B, L, D = queries.shape

        # Prepare the queries, keys and values for the attention computation:
        # reshape to (batch, heads, seq, head_dim).
        queries = queries.reshape(B, L, n_head, -1).transpose(0, 2, 1, 3)
        keys = keys.reshape(B, L, n_head, -1).transpose(0, 2, 1, 3)
        values = values.reshape(B, L, n_head, -1).transpose(0, 2, 1, 3)

        # Add RoPE to the queries and keys and combine them with the cache
        if cache is not None:
            # Offset RoPE by the cached sequence length so that positions
            # stay absolute across incremental decode steps.
            key_cache, value_cache = cache
            queries = self.rope(queries, offset=key_cache.shape[2])
            keys = self.rope(keys, offset=key_cache.shape[2])
            keys = mx.concatenate([key_cache, keys], axis=2)
            values = mx.concatenate([value_cache, values], axis=2)
        else:
            queries = self.rope(queries)
            keys = self.rope(keys)

        # Scores are computed in float32 for numerical stability.
        queries = queries.astype(mx.float32)
        keys = keys.astype(mx.float32)

        # Finally perform the attention computation
        scale = math.sqrt(1 / queries.shape[-1])
        scores = (queries * scale) @ keys.transpose(0, 1, 3, 2)
        if mask is not None:
            scores = scores + mask

        scores = mx.softmax(scores, axis=-1).astype(values.dtype)
        # Merge heads back into (batch, seq, dims).
        values_hat = (scores @ values).transpose(0, 2, 1, 3).reshape(B, L, -1)

        # Return the projected output and the updated (keys, values) cache.
        return self.dense(values_hat), (keys, values)
74
+
75
+
76
class MLP(nn.Module):
    """Two-layer feed-forward block with a GELU activation in between."""

    def __init__(self, dim, hidden_dim):
        super().__init__()
        # Attribute names must match the checkpoint's weight keys.
        self.fc1 = nn.Linear(dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, dim)
        self.act = nn.GELU(approx="precise")

    def __call__(self, x) -> mx.array:
        hidden = self.act(self.fc1(x))
        return self.fc2(hidden)
85
+
86
+
87
class ParallelBlock(nn.Module):
    """Phi-2 transformer block: attention and MLP both consume the same
    normalized input and are summed with the residual (parallel layout)."""

    def __init__(self, config: ModelArgs):
        super().__init__()
        dims = config.n_embd
        # MLP hidden size is 4x the embedding size.
        mlp_dims = dims * 4
        self.self_attn = RoPEAttention(dims, config.n_head, config.rotary_dim)
        self.input_layernorm = LayerNorm(dims)
        self.mlp = MLP(dims, mlp_dims)

    def __call__(self, x, mask, cache):
        # A single shared LayerNorm feeds both branches.
        h = self.input_layernorm(x)
        attn_h, cache = self.self_attn(h, mask, cache)
        ff_h = self.mlp(h)
        # Parallel residual: attention(h) + mlp(h) + x.
        return attn_h + ff_h + x, cache
101
+
102
+
103
class Transformer(nn.Module):
    """Token embedding, a stack of ParallelBlocks, and a final LayerNorm."""

    def __init__(self, config: ModelArgs):
        super().__init__()
        self.embed_tokens = nn.Embedding(config.vocab_size, config.n_embd)
        self.layers = [ParallelBlock(config) for i in range(config.n_layer)]
        self.final_layernorm = LayerNorm(config.n_embd)

    def __call__(self, x, mask, cache):
        x = self.embed_tokens(x)
        if cache is None:
            # One (key, value) cache slot per layer, filled on the first pass.
            cache = [None] * len(self.layers)

        # Each layer updates its own cache slot in place.
        for e, layer in enumerate(self.layers):
            x, cache[e] = layer(x, mask, cache[e])
        return self.final_layernorm(x), cache
118
+
119
+
120
class Model(nn.Module):
    """Phi-2 causal language model: transformer backbone plus a linear LM head."""

    def __init__(self, config: ModelArgs):
        super().__init__()
        # Attribute names must match the checkpoint's weight keys.
        self.model = Transformer(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)

    def __call__(
        self,
        x: mx.array,
        mask: mx.array = None,
        cache: list = None,
    ) -> tuple[mx.array, list]:
        """Return ``(logits, cache)`` for token ids *x* of shape (batch, seq).

        BUG FIX: the old code unconditionally reset ``mask = None``, silently
        discarding any caller-supplied mask. Build the causal mask only when
        none was given and a multi-token prompt is being processed; a
        single-token decode step needs no mask. Callers that passed no mask
        see identical behavior.
        """
        if mask is None and x.shape[1] > 1:
            mask = nn.MultiHeadAttention.create_additive_causal_mask(x.shape[1])
            mask = mask.astype(x.dtype)

        y, cache = self.model(x, mask, cache)
        return self.lm_head(y), cache
utils.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright © 2023 Apple Inc.
2
+
3
+ import glob
4
+ import json
5
+ import logging
6
+ from pathlib import Path
7
+ from typing import Generator
8
+
9
+ import mlx.core as mx
10
+ import mlx.nn as nn
11
+ import models.phi2 as phi2
12
+ import transformers
13
+ from huggingface_hub import snapshot_download
14
+ from transformers import AutoTokenizer
15
+
16
# Constants
# Maps the `model_type` field of a Hugging Face config.json to the module
# implementing that architecture; consulted by `_get_classes`.
MODEL_MAPPING = {
    "phi": phi2,
}
20
+
21
+
22
def _get_classes(config: dict):
    """Look up the architecture classes for a model configuration.

    Args:
        config (dict): The model configuration; must contain ``model_type``.

    Returns:
        A tuple ``(Model, ModelArgs)`` from the matching architecture module.

    Raises:
        ValueError: If the ``model_type`` is not in ``MODEL_MAPPING``.
    """
    model_type = config["model_type"]
    if model_type in MODEL_MAPPING:
        arch = MODEL_MAPPING[model_type]
        return arch.Model, arch.ModelArgs

    msg = f"Model type {model_type} not supported."
    logging.error(msg)
    raise ValueError(msg)
40
+
41
+
42
def fetch_from_hub(hf_path: str):
    """Download a model snapshot from the Hugging Face Hub and load its parts.

    Args:
        hf_path (str): Hub repository id to download.

    Returns:
        A tuple of ``(weights dict, config dict, tokenizer)``.

    Raises:
        FileNotFoundError: If the snapshot contains no ``*.safetensors`` files.
    """
    # Only fetch the files actually needed: configs, weights, tokenizer.
    model_path = snapshot_download(
        repo_id=hf_path,
        allow_patterns=["*.json", "*.safetensors", "tokenizer.model"],
    )

    weight_files = glob.glob(f"{model_path}/*.safetensors")
    if not weight_files:
        raise FileNotFoundError("No safetensors found in {}".format(model_path))

    # Merge every shard into a single flat dict.
    weights = {}
    for weight_file in weight_files:
        weights.update(mx.load(weight_file))

    config = transformers.AutoConfig.from_pretrained(hf_path)
    tokenizer = transformers.AutoTokenizer.from_pretrained(hf_path)
    return weights, config.to_dict(), tokenizer
60
+
61
+
62
def make_shards(weights: dict, max_file_size_gibibyte: int = 15):
    """Split a weights dict into shards that each fit within a size budget.

    Args:
        weights (dict): Mapping of parameter name -> array (any object
            exposing an ``nbytes`` attribute).
        max_file_size_gibibyte (int): Upper bound per shard, in GiB. A single
            tensor larger than the budget still gets its own shard.

    Returns:
        A list of dicts whose union is ``weights``; insertion order is
        preserved. Empty input yields an empty list.
    """
    max_file_size_bytes = max_file_size_gibibyte << 30
    shards = []
    shard, shard_size = {}, 0
    for k, v in weights.items():
        # BUG FIX: only flush a *non-empty* shard. The old code appended an
        # empty {} shard whenever the very first tensor alone exceeded the
        # budget (and returned [{}] for empty input).
        if shard and shard_size + v.nbytes > max_file_size_bytes:
            shards.append(shard)
            shard, shard_size = {}, 0
        shard[k] = v
        shard_size += v.nbytes
    if shard:
        shards.append(shard)
    return shards
74
+
75
+
76
def save_model(save_dir: str, weights, tokenizer, config):
    """Write sharded safetensors, tokenizer files, and config.json to *save_dir*.

    Args:
        save_dir (str): Output directory (created if missing).
        weights: Flat dict of parameter name -> array.
        tokenizer: Tokenizer with a ``save_pretrained`` method.
        config: JSON-serializable model configuration.
    """
    output_dir = Path(save_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # 5 GiB shards keep each file comfortably under Hub upload limits.
    shards = make_shards(weights, max_file_size_gibibyte=5)
    shards_count = len(shards)
    if shards_count > 1:
        shard_file_format = "model-{:05d}-of-{:05d}.safetensors"
    else:
        shard_file_format = "model.safetensors"

    for index, shard in enumerate(shards, start=1):
        shard_name = shard_file_format.format(index, shards_count)
        mx.save_safetensors(str(output_dir / shard_name), shard)

    tokenizer.save_pretrained(output_dir)

    with open(output_dir / "config.json", "w") as fid:
        json.dump(config, fid, indent=4)
96
+
97
+
98
def load(path):
    """Load a converted MLX model, its tokenizer, and its args from *path*.

    Args:
        path: Directory containing ``config.json``, tokenizer files, and one
            or more ``*.safetensors`` weight shards.

    Returns:
        A tuple of ``(model, tokenizer, model_args)``.

    Raises:
        FileNotFoundError: If no ``*.safetensors`` files are present.
    """
    model_path = Path(path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Read the model configuration.
    with open(model_path / "config.json", "r") as f:
        config = json.load(f)

    # Resolve the architecture classes and build an (uninitialized) model.
    model_class, model_args_class = _get_classes(config)
    model_args = model_args_class.from_dict(config)
    model = model_class(model_args)

    # Collect every weight shard.
    weight_files = glob.glob(str(model_path / "*.safetensors"))
    if not weight_files:
        raise FileNotFoundError(f"No .safetensors files found in {model_path}")

    weights = {}
    for weight_file in weight_files:
        weights.update(mx.load(weight_file))

    # A quantized checkpoint must have the model quantized *before* the
    # weights are loaded, so parameter shapes match.
    if "quantization" in config:
        print("[INFO] Loading quantized model")
        quantization = config["quantization"]
        nn.quantize(model, quantization["group_size"], quantization["bits"])

    model.load_weights(list(weights.items()))
    return model, tokenizer, model_args
133
+
134
+
135
def generate(
    prompt: mx.array, model: nn.Module, temp: float = 0.0
) -> Generator[mx.array, None, None]:
    """
    Generate text based on the given prompt and model.

    Args:
        prompt (mx.array): The input prompt.
        model (nn.Module): The model to use for generation.
        temp (float): The temperature for sampling. If temp is 0, use max sampling.

    Yields:
        mx.array: One sampled token id per iteration. The generator never
        terminates on its own; the caller decides when to stop.
    """

    def sample(logits: mx.array) -> mx.array:
        # Greedy decoding at temperature 0, otherwise scaled sampling.
        if temp == 0:
            return mx.argmax(logits, axis=-1)
        return mx.random.categorical(logits * (1 / temp))

    tokens = prompt
    cache = None
    while True:
        # Feed only the newest token(s); the KV cache carries the history.
        logits, cache = model(tokens[None], cache=cache)
        last_logits = logits[:, -1, :]
        tokens = sample(last_logits)
        yield tokens