ogflash committed
Commit 2c2db68 · verified · 1 parent: 0503dd6

Upload 3 files

Files changed (3)
  1. README.md +12 -13
  2. app.py +47 -0
  3. requirements.txt +6 -0
README.md CHANGED
@@ -1,13 +1,12 @@
- ---
- title: QnA Bitnet Lora
- emoji: 💻
- colorFrom: gray
- colorTo: yellow
- sdk: gradio
- sdk_version: 5.38.2
- app_file: app.py
- pinned: false
- license: unknown
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # BitNet QA - LoRA-Fine-Tuned Mistral
+
+ This Space provides a chat interface to a LoRA fine-tuned version of `Mistral-7B-Instruct`, trained on Q&A pairs from the BitNet b1.58 (1-bit LLM) paper.
+
+ Ask any technical question about:
+ - 1-bit vs. FP16 models
+ - BitNet architecture
+ - Inference latency
+ - Memory and energy savings
+ - Edge deployment of LLMs
+
+ Model: [ogflash/mistral-lora-qa-1bit](https://huggingface.co/ogflash/mistral-lora-qa-1bit)
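Once deployed, the Space can also be queried programmatically with `gradio_client`. A minimal sketch, assuming the Space lives at `ogflash/qna-bitnet-lora` (a guess based on the old front-matter title; the actual repo id is not stated in this commit):

```python
from gradio_client import Client

# Hypothetical Space id, inferred from the old README title ("QnA Bitnet Lora");
# substitute the actual <user>/<space> of this deployment.
client = Client("ogflash/qna-bitnet-lora")

# A single-function gr.Interface registers a default "/predict" endpoint
answer = client.predict(
    "How does BitNet b1.58 compare to FP16 Mistral on memory use?",
    api_name="/predict",
)
print(answer)
```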
 
app.py ADDED
@@ -0,0 +1,47 @@
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+ from peft import PeftModel
+ import gradio as gr
+
+ base_model_id = "mistralai/Mistral-7B-Instruct-v0.2"
+ lora_model_id = "ogflash/mistral-lora-qa-1bit"
+
+ tokenizer = AutoTokenizer.from_pretrained(lora_model_id)
+
+ # 4-bit NF4 quantization with double quantization; compute in fp16
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_use_double_quant=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.float16,
+ )
+
+ base_model = AutoModelForCausalLM.from_pretrained(
+     base_model_id,
+     device_map="auto",
+     quantization_config=bnb_config,
+ )
+
+ # Attach the LoRA adapter on top of the quantized base model
+ model = PeftModel.from_pretrained(base_model, lora_model_id)
+
+ def generate_response(user_input):
+     prompt = f"### Instruction:\n{user_input}\n\n### Response:\n"
+     # Follow the model onto whatever device device_map="auto" selected
+     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+     outputs = model.generate(
+         **inputs,
+         max_new_tokens=256,
+         do_sample=True,
+         top_p=0.95,
+         temperature=0.7,
+         pad_token_id=tokenizer.eos_token_id,
+     )
+     # Decode only the newly generated tokens, not the echoed prompt
+     new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
+     return tokenizer.decode(new_tokens, skip_special_tokens=True)
+
+ demo = gr.Interface(
+     fn=generate_response,
+     inputs=gr.Textbox(lines=2, placeholder="Ask something about 1-bit LLMs..."),
+     outputs="text",
+     title="BitNet QA - Mistral LoRA",
+     description="Ask questions related to 1-bit LLMs (BitNet b1.58).",
+ )
+
+ demo.launch()
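A common follow-up once the adapter works is to merge the LoRA weights into the base model so serving no longer needs `peft` at runtime. A minimal sketch, assuming enough memory to hold the base model in fp16 (merging is not supported on the 4-bit-quantized weights app.py loads); the output path is illustrative:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load the base model unquantized in fp16; merge_and_unload() needs
# real weight tensors, not bitsandbytes 4-bit buffers.
base = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    torch_dtype=torch.float16,
)
model = PeftModel.from_pretrained(base, "ogflash/mistral-lora-qa-1bit")

# Fold the LoRA deltas into the base weights and drop the adapter wrapper
merged = model.merge_and_unload()

# "mistral-qa-merged" is an illustrative local path
merged.save_pretrained("mistral-qa-merged")
AutoTokenizer.from_pretrained("ogflash/mistral-lora-qa-1bit").save_pretrained("mistral-qa-merged")
```

The merged checkpoint can then be loaded with a plain `AutoModelForCausalLM.from_pretrained` call, with or without the 4-bit config above.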
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ transformers
+ peft
+ accelerate
+ torch
+ gradio
+ bitsandbytes
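One operational note on this list: bitsandbytes provides its 4-bit kernels on CUDA GPUs, so the quantized load in app.py fails on CPU-only hardware. A hedged device-aware sketch, assuming a float32 CPU fallback is acceptable (the fallback path is illustrative, not part of this commit):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model_id = "mistralai/Mistral-7B-Instruct-v0.2"

if torch.cuda.is_available():
    # GPU path: the same 4-bit NF4 setup app.py uses
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_id, device_map="auto", quantization_config=bnb_config
    )
else:
    # CPU fallback (assumption): skip bitsandbytes and load in float32
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32)
```

Pinning versions in requirements.txt (e.g. the gradio 5.38.2 that the old README front matter declared) would make behavior like this reproducible across rebuilds.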