Spaces:

DR-Rakshitha
/

wizardlm_api

Runtime error

App Files Files Community

wizardlm_api / app.py

DR-Rakshitha

Update app.py

1465e91 over 2 years ago

raw

history blame

4.51 kB

	# import gradio as gr
	# from transformers import AutoModelForCausalLM, AutoTokenizer

	# from gpt4all import GPT4All
	# model = GPT4All("wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin")

	#----------------------------------------------------------------------------------------------------------------------------
	# !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
	# from datasets import load_dataset
	import gradio as gr
	import torch
	from peft import LoraConfig, PeftModel
	from transformers import (
	    AutoModelForCausalLM,
	    AutoTokenizer,
	    BitsAndBytesConfig,
	    HfArgumentParser,
	    TrainingArguments,
	    pipeline,
	    logging,
	)
	from trl import SFTTrainer

	# -----------------------------------------------------------------------------------------------------------------------------------------------------------------

	################################################################################
	# LoRA parameters
	################################################################################

	# r: attention dimension of the LoRA layers
	# alpha: scaling parameter applied to the LoRA update
	# dropout: dropout probability used inside the LoRA layers
	lora_r, lora_alpha, lora_dropout = 64, 16, 0.1

	################################################################################
	# bitsandbytes parameters
	################################################################################

	# Whether the base model should be loaded in 4-bit precision
	use_4bit = True

	# Name of the torch dtype used for computation with the 4-bit base model;
	# set to float32 (rather than a half-precision type) for CPU compatibility
	bnb_4bit_compute_dtype = "float32"

	# 4-bit quantization scheme: either "fp4" or "nf4"
	bnb_4bit_quant_type = "nf4"

	# Whether to apply nested (double) quantization to the 4-bit base model
	use_nested_quant = False

	# device_map is GPU-specific, so it is intentionally left disabled:
	# device_map = {"": 0}

	# ------------------------------------------------------------------------------
	# Model location: hub repository id and the name of its weight shard
	# (the weights are stored in .bin format)
	# ------------------------------------------------------------------------------
	model_name = "DR-DRR/Model_001"
	model_basename = "pytorch_model-00001-of-00002.bin"

	# -------------------------------------------------------------------------------------------------------------------------------------------------------------------------

	# Load tokenizer and model with QLoRA configuration

	# Resolve the dtype name (e.g. "float32") to the corresponding torch.dtype.
	compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

	bnb_config = BitsAndBytesConfig(
	    load_in_4bit=use_4bit,
	    bnb_4bit_quant_type=bnb_4bit_quant_type,
	    bnb_4bit_compute_dtype=compute_dtype,
	    bnb_4bit_use_double_quant=use_nested_quant,
	)

	# The rest of this script is set up for CPU execution (no device_map,
	# float32 compute dtype), but 4-bit bitsandbytes loading is expected to
	# require a CUDA device with the library versions this script was written
	# for -- NOTE(review): confirm against the installed transformers /
	# bitsandbytes versions. Requesting it on a CPU-only host would abort at
	# load time, so the quantization config is only skipped in exactly that
	# case; every other combination loads as before.
	model_load_kwargs = {}
	if torch.cuda.is_available() or not use_4bit:
	    model_load_kwargs["quantization_config"] = bnb_config

	# Load base model (no device_map: placement is left to the defaults)
	model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_kwargs)
	model.config.use_cache = False
	model.config.pretraining_tp = 1

	# Load the tokenizer that ships with the model repository
	tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
	# Reuse the end-of-sequence token as the padding token.
	tokenizer.pad_token = tokenizer.eos_token
	# Right-side padding; originally chosen to avoid an overflow issue seen
	# with fp16 training.
	tokenizer.padding_side = "right"

	# LoRA configuration built from the hyperparameters defined earlier in
	# this file. It is not consumed by any code visible in this script.
	peft_config = LoraConfig(
	    lora_alpha=lora_alpha,
	    lora_dropout=lora_dropout,
	    r=lora_r,
	    bias="none",
	    task_type="CAUSAL_LM",
	)

	# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------
	# Restrict transformers logging to CRITICAL messages so warnings are
	# hidden. Setting it once is sufficient.
	logging.set_verbosity(logging.CRITICAL)

	# Startup smoke test: run a single generation with the loaded model and
	# print the generated text to stdout.
	prompt = "What is a large language model?"
	pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
	result = pipe(f"<s>[INST] {prompt} [/INST]")
	print(result[0]['generated_text'])


	def generate_text(prompt):
	    """Generate a completion for *prompt* with the loaded model.

	    The prompt is wrapped in the ``<s>[INST] ... [/INST]`` template and run
	    through a text-generation pipeline built from the module-level
	    ``model`` and ``tokenizer`` (``max_length=200``).

	    Args:
	        prompt: The user's input text.

	    Returns:
	        The ``generated_text`` string of the first pipeline result, i.e.
	        plain text suitable for a text output component rather than the
	        raw list of result dicts.
	    """
	    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
	    result = pipe(f"<s>[INST] {prompt} [/INST]")
	    # The pipeline returns a list of dicts; hand back only the text of the
	    # first candidate, matching how the startup demo prints its result.
	    return result[0]["generated_text"]

	# Web UI: a single text box in, a single text box out, backed by
	# generate_text. Components are taken from the top-level gradio namespace
	# (gr.Textbox) instead of the legacy gr.inputs / gr.outputs namespaces,
	# which are not available in current Gradio releases.
	text_generation_interface = gr.Interface(
	    fn=generate_text,
	    inputs=[
	        gr.Textbox(label="Input Text"),
	    ],
	    outputs=gr.Textbox(label="Generated Text"),
	    title="GPT-4 Text Generation",
	)

	# Launch separately so text_generation_interface keeps referring to the
	# Interface object rather than to the return value of launch().
	text_generation_interface.launch()



	# model_name = ""