File size: 4,509 Bytes
0c8c4f5
 
8272482
0c8c4f5
 
319b4d3
0c8c4f5
06e5052
0c8c4f5
1465e91
0c8c4f5
 
 
 
 
 
 
 
 
 
 
06e5052
 
2ca6e84
0c8c4f5
 
319b4d3
0c8c4f5
 
2ca6e84
0c8c4f5
 
 
 
 
 
 
 
 
 
 
06e5052
0c8c4f5
 
 
 
 
 
 
06e5052
 
0c8c4f5
06e5052
0c8c4f5
06e5052
0c8c4f5
06e5052
0c8c4f5
 
 
 
 
 
 
 
 
 
 
06e5052
0c8c4f5
 
 
 
 
06e5052
0c8c4f5
 
 
 
 
 
 
06e5052
0c8c4f5
 
 
 
 
 
 
 
 
 
06e5052
 
 
 
 
 
 
 
 
 
0c8c4f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319b4d3
0010001
8a97208
0010001
 
 
eb443cd
 
4789c94
c928ad3
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# import gradio as gr
# from transformers import AutoModelForCausalLM, AutoTokenizer

# from gpt4all import GPT4All
# model = GPT4All("wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin")

#----------------------------------------------------------------------------------------------------------------------------
# !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
import gradio as gr
import torch
# from datasets import load_dataset
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from trl import SFTTrainer

# -----------------------------------------------------------------------------------------------------------------------------------------------------------------

# --- LoRA adapter hyperparameters -------------------------------------------
# Attention dimension (rank) of the LoRA update.
lora_r: int = 64
# Alpha scaling parameter for LoRA.
lora_alpha: int = 16
# Dropout probability applied inside the LoRA layers.
lora_dropout: float = 0.1

# ------------------------------------------------------------------------------
# bitsandbytes quantization settings
# ------------------------------------------------------------------------------
# NOTE(review): 4-bit bitsandbytes loading presumably needs a CUDA device;
# confirm this combination really runs on the CPU-only target the comments
# in this file describe.

# Load the base model in 4-bit precision.
use_4bit: bool = True

# Name of the torch dtype used for compute with the 4-bit base model.
# float32 was chosen for CPU compatibility.
bnb_4bit_compute_dtype: str = "float32"

# Quantization type: "fp4" or "nf4".
bnb_4bit_quant_type: str = "nf4"

# Nested (double) quantization for the 4-bit base model.
use_nested_quant: bool = False

# Remove device_map, as it's GPU-specific
# device_map = {"": 0}

# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Identifier passed to from_pretrained() for both the model and the tokenizer.
model_name: str = "DR-DRR/Model_001"
# File name of a weight shard in .bin format.
# NOTE(review): not referenced in the visible part of this file; confirm it is
# still needed.
model_basename: str = "pytorch_model-00001-of-00002.bin"

# -------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Resolve the configured dtype name (e.g. "float32") to the torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Bundle the quantization settings into a single bitsandbytes config object
# that is handed to the model loader.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=use_nested_quant,
    load_in_4bit=use_4bit,
)

# Remove GPU-specific check for bfloat16

# Load the base causal-LM with the quantization config. No device_map is
# passed; it was removed for CPU usage.
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)

# Override two fields on the loaded model's config.
# NOTE(review): use_cache = False presumably comes from a fine-tuning recipe;
# confirm it is wanted for this inference-only script.
model.config.pretraining_tp = 1
model.config.use_cache = False

# Load the tokenizer from the same identifier as the model.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
# Pad on the right (original note: fixes an overflow issue seen with fp16
# training) and reuse the end-of-sequence token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# LoRA adapter configuration assembled from the lora_* hyperparameters.
# NOTE(review): not consumed anywhere in the visible part of this file;
# confirm it is still needed.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
)

# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Silence transformers log output below CRITICAL. The original set this twice
# in a row; a single call is sufficient.
logging.set_verbosity(logging.CRITICAL)

# Start-up smoke test: run one fixed prompt through a text-generation pipeline
# and print the generated text. The prompt is wrapped in the
# "<s>[INST] ... [/INST]" template before generation.
prompt = "What is a large language model?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

#---------------------------------------------------------------------------------------------------------------------------------------------------------------------


def generate_text(prompt: str) -> str:
    """Generate a completion for *prompt* and return it as plain text.

    The prompt is wrapped in the ``<s>[INST] ... [/INST]`` template and run
    through a text-generation pipeline built from the module-level ``model``
    and ``tokenizer`` (``max_length=200``).

    Args:
        prompt: Instruction text entered by the user.

    Returns:
        The ``generated_text`` string of the first pipeline result. The raw
        pipeline output is a list of dicts, which is not suitable for a text
        output component, so only the text is returned.
    """
    generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
    outputs = generator(f"<s>[INST] {prompt} [/INST]")
    # Same indexing as the start-up demo: first result, "generated_text" key.
    return outputs[0]["generated_text"]

# Build the Gradio UI around generate_text and start serving it immediately.
# gr.Textbox is used for both input and output: the legacy gr.inputs /
# gr.outputs namespaces are deprecated and absent from newer Gradio releases.
# NOTE: because .launch() is chained, this name is bound to the return value of
# launch(), not to the Interface object itself.
text_generation_interface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(label="Input Text"),
    ],
    outputs=gr.Textbox(label="Generated Text"),
    title="GPT-4 Text Generation",
).launch()



# model_name = ""