Spaces: Runtime error
Commit 1832b83 · Parent(s): d048f50
Update app.py

app.py CHANGED
@@ -5,22 +5,107 @@
 # model = GPT4All("wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin")
 
 #----------------------------------------------------------------------------------------------------------------------------
-
+# !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
+# import os
+import torch
+from datasets import load_dataset
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    HfArgumentParser,
+    TrainingArguments,
+    pipeline,
+    logging,
+)
+from peft import LoraConfig, PeftModel
+from trl import SFTTrainer
+# -----------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+# LoRA attention dimension
+lora_r = 64
+
+# Alpha parameter for LoRA scaling
+lora_alpha = 16
+
+# Dropout probability for LoRA layers
+lora_dropout = 0.1
+
+################################################################################
+# bitsandbytes parameters
+################################################################################
+
+# Activate 4-bit precision base model loading
+use_4bit = True
+
+# Compute dtype for 4-bit base models
+bnb_4bit_compute_dtype = "float16"
+
+# Quantization type (fp4 or nf4)
+bnb_4bit_quant_type = "nf4"
+
+# Activate nested quantization for 4-bit base models (double quantization)
+use_nested_quant = False
+
+# Load the entire model on the GPU 0
+device_map = {"": 0}
+
+#----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+model_name = "DR-DRR/Model_001"
+model_basename = "pytorch_model-00001-of-00002.bin" # the model is in bin format
+
+#-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+# Load tokenizer and model with QLoRA configuration
+compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
+
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=use_4bit,
+    bnb_4bit_quant_type=bnb_4bit_quant_type,
+    bnb_4bit_compute_dtype=compute_dtype,
+    bnb_4bit_use_double_quant=use_nested_quant,
+)
+
+# Check GPU compatibility with bfloat16
+if compute_dtype == torch.float16 and use_4bit:
+    major, _ = torch.cuda.get_device_capability()
+    if major >= 8:
+        print("=" * 80)
+        print("Your GPU supports bfloat16: accelerate training with bf16=True")
+        print("=" * 80)
+
+# Load base model
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    quantization_config=bnb_config,
+    device_map=device_map
+)
+model.config.use_cache = False
+model.config.pretraining_tp = 1
+
+# Load LLaMA tokenizer
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+tokenizer.pad_token = tokenizer.eos_token
+tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training
+
+# Load LoRA configuration
+peft_config = LoraConfig(
+    lora_alpha=lora_alpha,
+    lora_dropout=lora_dropout,
+    r=lora_r,
+    bias="none",
+    task_type="CAUSAL_LM",
+)
 
-
-
-
-# Load the model and tokenizer
-model = AutoModelForCausalLM.from_pretrained(model_directory, from_tf=True)
-tokenizer = AutoTokenizer.from_pretrained(model_directory, trust_remote_code=True)
+#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
+# Ignore warnings
+logging.set_verbosity(logging.CRITICAL)
 
-#
+# Run text generation pipeline with our next model
 # prompt = "What is a large language model?"
-#
-
-#
-# generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
-# print(generated_text)
+# pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
+# result = pipe(f"<s>[INST] {prompt} [/INST]")
+# print(result[0]['generated_text'])
 
 # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------
 # Ignore warnings
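The hunk above defines peft_config and imports SFTTrainer, but neither is used anywhere in this commit, so the LoRA settings have no effect at inference time. A minimal sketch, not part of the commit, of how such a configuration is typically applied, assuming peft's standard get_peft_model entry point and reusing the model and peft_config objects defined above:

# Sketch only, not this commit's code: attach the LoRA adapters described by
# peft_config to the quantized base model (typically done before fine-tuning).
# model and peft_config are the objects created in the hunk above.
from peft import get_peft_model

lora_model = get_peft_model(model, peft_config)
lora_model.print_trainable_parameters()  # reports the small fraction of weights LoRA trains

For fine-tuning, the same peft_config could instead be passed to trl's SFTTrainer through its peft_config argument, which is presumably why SFTTrainer is imported.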
@@ -45,13 +130,13 @@ logging.set_verbosity(logging.CRITICAL)
 
 def generate_text(prompt):
     # output = model.generate(input_text)
-
-
+    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
+    result = pipe(f"<s>[INST] {prompt} [/INST]")
     # prompt = "What is a large language model?"
-    input_ids = tokenizer.encode(prompt, return_tensors="pt")
+    # input_ids = tokenizer.encode(prompt, return_tensors="pt")
 
-    output = model.generate(input_ids, max_length=200, num_return_sequences=1)
-    result = tokenizer.decode(output[0], skip_special_tokens=True)
+    # output = model.generate(input_ids, max_length=200, num_return_sequences=1)
+    # result = tokenizer.decode(output[0], skip_special_tokens=True)
     return result
 
 text_generation_interface = gr.Interface(
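Two caveats about the new generate_text. First, a transformers text-generation pipeline returns a list of dicts of the form [{'generated_text': ...}], so returning result hands Gradio a list rather than a string, where the old code returned a decoded string. Second, the pipeline is rebuilt on every call even though the model and tokenizer are already in memory. A minimal sketch of both fixes follows; the gr.Interface(...) call is truncated in this diff, so its completion below is hypothetical, assuming Gradio's standard fn/inputs/outputs arguments:

# Sketch only, not this commit's code: build the pipeline once at module scope
# and return the generated string instead of the raw pipeline output.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)

def generate_text(prompt):
    # "<s>[INST] ... [/INST]" is the Llama-2 chat prompt format
    result = pipe(f"<s>[INST] {prompt} [/INST]")
    return result[0]["generated_text"]  # pipe() returns [{"generated_text": ...}]

# Hypothetical completion of the gr.Interface call truncated in this diff:
text_generation_interface = gr.Interface(
    fn=generate_text,
    inputs="text",
    outputs="text",
)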