Spaces:

DR-Rakshitha
/

Medibot

Paused

App Files Files Community

Medibot / app.py

DR-Rakshitha

Update app.py

b0e586c about 2 years ago

raw

history blame contribute delete

5.73 kB

	import gradio as gr
	import os
	import torch
	from datasets import load_dataset
	from transformers import (
	AutoModelForCausalLM,
	AutoTokenizer,
	BitsAndBytesConfig,
	HfArgumentParser,
	TrainingArguments,
	pipeline,
	logging,
	)
	from peft import LoraConfig, PeftModel
	from trl import SFTTrainer

	# Hugging Face Hub identifier of the checkpoint to load. It is passed to both
	# AutoModelForCausalLM.from_pretrained and AutoTokenizer.from_pretrained below.
	model_name = "DR-DRR/Model_001"
	################################################################################
	# QLoRA parameters
	#
	# These three values are forwarded to LoraConfig further down in this file
	# (as r, lora_alpha and lora_dropout respectively).
	################################################################################

	# LoRA attention dimension (the `r` argument of LoraConfig)
	lora_r = 64

	# Alpha parameter for LoRA scaling
	lora_alpha = 16

	# Dropout probability for LoRA layers
	lora_dropout = 0.1

	################################################################################
	# bitsandbytes parameters
	#
	# These values are forwarded to BitsAndBytesConfig further down in this file.
	################################################################################

	# Activate 4-bit precision base model loading (passed as load_in_4bit)
	use_4bit = True

	# Compute dtype for 4-bit base models. This is the *name* of a torch dtype;
	# it is resolved to the dtype object with getattr(torch, ...) before use.
	bnb_4bit_compute_dtype = "float16"

	# Quantization type (fp4 or nf4)
	bnb_4bit_quant_type = "nf4"

	# Activate nested quantization for 4-bit base models (double quantization);
	# passed as bnb_4bit_use_double_quant
	use_nested_quant = False

	################################################################################
	# TrainingArguments parameters
	#
	# NOTE(review): none of the names in this section are referenced anywhere else
	# in the visible file (no TrainingArguments object is built here). They look
	# like leftovers from a fine-tuning script - confirm before relying on them.
	################################################################################

	# Output directory where the model predictions and checkpoints will be stored
	output_dir = "./results"

	# Number of training epochs
	num_train_epochs = 0.1

	# Enable fp16/bf16 training (set bf16 to True with an A100)
	fp16 = False
	bf16 = False

	# Batch size per GPU for training
	per_device_train_batch_size = 4

	# Batch size per GPU for evaluation
	per_device_eval_batch_size = 4

	# Number of update steps to accumulate the gradients for
	gradient_accumulation_steps = 1

	# Enable gradient checkpointing
	gradient_checkpointing = True

	# Maximum gradient norm (gradient clipping)
	max_grad_norm = 0.3

	# Initial learning rate (AdamW optimizer)
	learning_rate = 2e-4

	# Weight decay to apply to all layers except bias/LayerNorm weights
	weight_decay = 0.001

	# Optimizer to use
	optim = "paged_adamw_32bit"

	# Learning rate schedule
	lr_scheduler_type = "cosine"

	# Number of training steps (overrides num_train_epochs)
	max_steps = -1

	# Ratio of steps for a linear warmup (from 0 to learning rate)
	warmup_ratio = 0.03

	# Group sequences into batches with same length
	# Saves memory and speeds up training considerably
	group_by_length = True

	# Save a checkpoint every X update steps
	save_steps = 0

	# Log every X update steps
	logging_steps = 25

	################################################################################
	# SFT parameters
	#
	# NOTE(review): max_seq_length and packing are not referenced elsewhere in the
	# visible file; only device_map is used (by the from_pretrained call below).
	################################################################################

	# Maximum sequence length to use
	max_seq_length = None

	# Pack multiple short examples in the same input sequence to increase efficiency
	packing = False

	# Device placement passed to from_pretrained: load the entire model on GPU 0
	device_map = {"": 0}

	# End of parameters; model loading starts here.

	# Resolve the configured dtype name (e.g. "float16") to the torch dtype object
	# and build the bitsandbytes quantization config used when loading the model.
	compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

	bnb_config = BitsAndBytesConfig(
	    load_in_4bit=use_4bit,
	    bnb_4bit_quant_type=bnb_4bit_quant_type,
	    bnb_4bit_compute_dtype=compute_dtype,
	    bnb_4bit_use_double_quant=use_nested_quant,
	)

	# Check GPU compatibility with bfloat16. The check only prints a hint, so it is
	# skipped when no CUDA device is visible: torch.cuda.get_device_capability()
	# raises in that case and would abort start-up for a purely advisory message.
	if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
	    major, _ = torch.cuda.get_device_capability()
	    if major >= 8:
	        print("=" * 80)
	        print("Your GPU supports bfloat16: accelerate training with bf16=True")
	        print("=" * 80)

	# Load the base model from the Hub with the 4-bit quantization config and the
	# device placement defined above.
	model = AutoModelForCausalLM.from_pretrained(
	model_name,
	quantization_config=bnb_config,
	device_map=device_map
	)
	# NOTE(review): use_cache=False and pretraining_tp=1 look like training-time
	# settings carried over from a fine-tuning script; use_cache=False presumably
	# disables the key/value cache during generation - confirm this is intended
	# for an inference-only app.
	model.config.use_cache = False
	model.config.pretraining_tp = 1

	# Load the tokenizer that belongs to the same checkpoint.
	tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
	# Reuse the end-of-sequence token as the padding token.
	tokenizer.pad_token = tokenizer.eos_token
	tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

	# Build the LoRA configuration from the QLoRA parameters defined above.
	# NOTE(review): peft_config is not referenced anywhere else in the visible
	# file; it appears to be left over from the training script - confirm.
	peft_config = LoraConfig(
	lora_alpha=lora_alpha,
	lora_dropout=lora_dropout,
	r=lora_r,
	bias="none",
	task_type="CAUSAL_LM",
	)

	# End of model setup


	# Specify the local path to the downloaded model file
	# model_path = "wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin"

	# Initialize the model using the local path
	# model = GPT4All(model_path)

	# Marker that closes the instruction part of the prompt template.
	_INST_END = "[/INST]"
	# Marker after which any further generated text is discarded.
	_ANSWER_END = "[/]"
	# Text-generation pipeline, built on first use and then shared by all requests.
	_generation_pipe = None


	def _get_generation_pipe():
	    """Return the shared text-generation pipeline, building it on first use."""
	    global _generation_pipe
	    if _generation_pipe is None:
	        _generation_pipe = pipeline(
	            task="text-generation",
	            model=model,
	            tokenizer=tokenizer,
	            max_length=200,
	        )
	    return _generation_pipe


	def _extract_answer(generated_text):
	    """Cut the model's reply out of the full text returned by the pipeline."""
	    # The output is expected to contain the prompt template, so the reply is
	    # the segment that follows the first [/INST]. If the marker is missing,
	    # fall back to the whole text instead of raising IndexError.
	    segments = generated_text.split(_INST_END)
	    reply = segments[1] if len(segments) > 1 else segments[0]
	    return reply.strip().split(_ANSWER_END)[0]


	def generate_text(prompt):
	    """Generate the bot's reply to a user prompt.

	    The prompt is wrapped in the ``<s>[INST] ... [/INST]`` template, run
	    through the text-generation pipeline (``max_length=200``), and the reply
	    is extracted from the generated text.

	    Args:
	        prompt: The user's input text from the Gradio textbox.

	    Returns:
	        The text generated after ``[/INST]``, truncated at the first ``[/]``.
	    """
	    # Reuse one pipeline across calls rather than rebuilding it per request.
	    pipe = _get_generation_pipe()
	    result = pipe(f"<s>[INST] {prompt} [/INST]")
	    return _extract_answer(result[0]['generated_text'])

	# Build the single-textbox UI around generate_text and start serving it.
	# gr.Textbox is used for both the input and the output component: the
	# gr.inputs / gr.outputs namespaces are deprecated in Gradio 3 and removed in
	# Gradio 4, while gr.Textbox is available in both.
	text_generation_interface = gr.Interface(
	    fn=generate_text,
	    inputs=[
	        gr.Textbox(label="Input Text"),
	    ],
	    outputs=gr.Textbox(label="Generated Text"),
	    title="Medibot Text Generation",
	)
	# Launch separately so the name above refers to the Interface object itself
	# rather than to the return value of launch().
	text_generation_interface.launch()