Spaces:
Runtime error
Runtime error
File size: 4,509 Bytes
0c8c4f5 8272482 0c8c4f5 319b4d3 0c8c4f5 06e5052 0c8c4f5 1465e91 0c8c4f5 06e5052 2ca6e84 0c8c4f5 319b4d3 0c8c4f5 2ca6e84 0c8c4f5 06e5052 0c8c4f5 06e5052 0c8c4f5 06e5052 0c8c4f5 06e5052 0c8c4f5 06e5052 0c8c4f5 06e5052 0c8c4f5 06e5052 0c8c4f5 06e5052 0c8c4f5 06e5052 0c8c4f5 319b4d3 0010001 8a97208 0010001 eb443cd 4789c94 c928ad3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 | # import gradio as gr
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from gpt4all import GPT4All
# model = GPT4All("wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin")
#----------------------------------------------------------------------------------------------------------------------------
# !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
import gradio as gr
import torch
# from datasets import load_dataset
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from trl import SFTTrainer
# -----------------------------------------------------------------------------------------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# LoRA settings: attention dimension (r), scaling alpha, and layer dropout.
# ------------------------------------------------------------------------------
lora_r, lora_alpha, lora_dropout = 64, 16, 0.1

# ------------------------------------------------------------------------------
# bitsandbytes settings
# ------------------------------------------------------------------------------
use_4bit = True                     # load the base model in 4-bit precision
bnb_4bit_compute_dtype = "float32"  # torch dtype name; float32 chosen for CPU compatibility
bnb_4bit_quant_type = "nf4"         # quantization type: "fp4" or "nf4"
use_nested_quant = False            # nested (double) quantization for 4-bit base models

# No device_map is defined: the GPU-specific mapping ({"": 0}) was removed.

# ------------------------------------------------------------------------------
# Model location
# ------------------------------------------------------------------------------
model_name = "DR-DRR/Model_001"
model_basename = "pytorch_model-00001-of-00002.bin"  # weights are stored in .bin format
# -------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Build the bitsandbytes configuration, then load the model, tokenizer and
# LoRA configuration used by the rest of this script.

# Resolve the dtype name (e.g. "float32") to the torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# 4-bit loading through bitsandbytes needs a CUDA device; without one,
# transformers aborts the load. On a CPU-only host, fall back to a plain
# (unquantized) load in `compute_dtype` instead of crashing at start-up.
# NOTE(review): an unquantized load needs considerably more RAM than the
# 4-bit one -- confirm the host has enough memory for this checkpoint.
if use_4bit and not torch.cuda.is_available():
    _model_load_kwargs = {"torch_dtype": compute_dtype}
else:
    _model_load_kwargs = {"quantization_config": bnb_config}

# Load base model (no device_map: placement is left to transformers).
model = AutoModelForCausalLM.from_pretrained(model_name, **_model_load_kwargs)
# NOTE(review): use_cache=False is normally a training-time setting; this
# script only generates text, so enabling the cache may speed it up -- confirm.
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load the tokenizer from the same repository as the model.
# NOTE: trust_remote_code=True allows code shipped in the model repository to
# run during loading; keep it only for repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

# LoRA configuration built from the module-level lora_* settings.
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)
# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Only let CRITICAL messages from transformers through.
logging.set_verbosity(logging.CRITICAL)

# Start-up smoke test: push one fixed question through a text-generation
# pipeline built on the loaded model and print the first generated text.
prompt = "What is a large language model?"
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]["generated_text"])
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)


def generate_text(prompt):
    """Generate a completion for *prompt* with the loaded model.

    The prompt is wrapped in the ``<s>[INST] ... [/INST]`` template and run
    through a text-generation pipeline capped at ``max_length=200``.

    Args:
        prompt: The user's input text.

    Returns:
        The generated text of the first pipeline result, as a string.
    """
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
    result = pipe(f"<s>[INST] {prompt} [/INST]")
    # The pipeline yields a list of dicts; hand back only the text so a text
    # output shows the completion rather than the repr of the whole list.
    return result[0]["generated_text"]
# Web UI: one text box in, one text box out, backed by generate_text.
# Components come from the top-level gradio namespace (gr.Textbox); the legacy
# gr.inputs / gr.outputs namespaces are not available in current Gradio.
text_generation_interface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(label="Input Text"),
    ],
    outputs=gr.Textbox(label="Generated Text"),
    title="GPT-4 Text Generation",
)
# Launch separately so the name above stays bound to the Interface object
# rather than to the return value of launch().
text_generation_interface.launch()
# model_name = ""
|