# wizardlm_api / app.py
import gradio as gr
# from transformers import AutoModelForCausalLM, AutoTokenizer
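# Load the 4-bit quantized WizardLM-13B GGML checkpoint through the gpt4all Python
# bindings. Inference runs on CPU; depending on the gpt4all version, the file is
# downloaded automatically if it is not already cached locally.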
from gpt4all import GPT4All
model = GPT4All("wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin")
# #----------------------------------------------------------------------------------------------------------------------------
# # !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
# # import os
# import torch
# from datasets import load_dataset
# from transformers import (
# AutoModelForCausalLM,
# AutoTokenizer,
# BitsAndBytesConfig,
# HfArgumentParser,
# TrainingArguments,
# pipeline,
# logging,
# )
# from peft import LoraConfig, PeftModel
# from trl import SFTTrainer
# # -----------------------------------------------------------------------------------------------------------------------------------------------------------------
# # LoRA attention dimension
# lora_r = 64
# # Alpha parameter for LoRA scaling
# lora_alpha = 16
# # Dropout probability for LoRA layers
# lora_dropout = 0.1
# ################################################################################
# # bitsandbytes parameters
# ################################################################################
# # Activate 4-bit precision base model loading
# use_4bit = True
# # Compute dtype for 4-bit base models
# bnb_4bit_compute_dtype = "float16"
# # Quantization type (fp4 or nf4)
# bnb_4bit_quant_type = "nf4"
# # Activate nested quantization for 4-bit base models (double quantization)
# use_nested_quant = False
# # Load the entire model on the GPU 0
# device_map = {"": 0}
# #----------------------------------------------------------------------------------------------------------------------------------------------------------------------
# model_name = "DR-DRR/Model_001"
# model_basename = "pytorch_model-00001-of-00002.bin" # the model is in bin format
# #-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# # Load tokenizer and model with QLoRA configuration
# compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
# bnb_config = BitsAndBytesConfig(
# load_in_4bit=use_4bit,
# bnb_4bit_quant_type=bnb_4bit_quant_type,
# bnb_4bit_compute_dtype=compute_dtype,
# bnb_4bit_use_double_quant=use_nested_quant,
# )
# # Check GPU compatibility with bfloat16
# if compute_dtype == torch.float16 and use_4bit:
# major, _ = torch.cuda.get_device_capability()
# if major >= 8:
# print("=" * 80)
# print("Your GPU supports bfloat16: accelerate training with bf16=True")
# print("=" * 80)
# # Load base model
# model = AutoModelForCausalLM.from_pretrained(
# model_name,
# quantization_config=bnb_config,
# device_map=device_map
# )
# model.config.use_cache = False
# model.config.pretraining_tp = 1
# # Load LLaMA tokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training
# # Load LoRA configuration
# peft_config = LoraConfig(
# lora_alpha=lora_alpha,
# lora_dropout=lora_dropout,
# r=lora_r,
# bias="none",
# task_type="CAUSAL_LM",
# )
# #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
# # Ignore warnings
# logging.set_verbosity(logging.CRITICAL)
# Run text generation pipeline with our new model
# prompt = "What is a large language model?"
# pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
# result = pipe(f"<s>[INST] {prompt} [/INST]")
# print(result[0]['generated_text'])
def generate_text(prompt):
    # Generate a completion with the GGML model loaded above.
    result = model.generate(prompt)
    # Earlier transformers-based alternative, kept for reference:
    # pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
    # result = pipe(f"<s>[INST] {prompt} [/INST]")
    # prompt = "What is a large language model?"
    # input_ids = tokenizer.encode(prompt, return_tensors="pt")
    # output = model.generate(input_ids, max_length=200, num_return_sequences=1)
    # result = tokenizer.decode(output[0], skip_special_tokens=True)
    return result
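# A minimal sketch of exposing sampling controls; the max_tokens/temp/top_p keyword
# names are assumptions about this gpt4all version's generate() signature and may need
# adjusting. The helper below is illustrative and is not wired into the interface.
def generate_text_with_params(prompt, max_tokens=200, temp=0.7, top_p=0.4):
    return model.generate(prompt, max_tokens=max_tokens, temp=temp, top_p=top_p)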
text_generation_interface = gr.Interface(
    fn=generate_text,
    inputs=[gr.Textbox(label="Input Text")],
    outputs=gr.Textbox(label="Generated Text"),
    title="WizardLM Text Generation (GPT4All)",
)
text_generation_interface.launch()
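# A sketch of an alternative launch with request queuing, since CPU generation with a
# GGML model is slow and concurrent requests would otherwise block each other; queue()
# is part of the Gradio Interface API, though its options vary across Gradio versions.
# text_generation_interface.queue().launch()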