Spaces:
Runtime error
Runtime error
| # import gradio as gr | |
| # from transformers import AutoModelForCausalLM, AutoTokenizer | |
| # from gpt4all import GPT4All | |
| # model = GPT4All("wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin") | |
| #---------------------------------------------------------------------------------------------------------------------------- | |
# Notebook-style "!pip install" is not valid syntax in a .py file, so the same
# pinned packages are installed through the running interpreter's pip instead.
# NOTE(review): for a deployed app these pins usually live in requirements.txt;
# confirm against the deployment setup before relying on a runtime install.
import subprocess
import sys

subprocess.check_call(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "-q",
        "accelerate==0.21.0",
        "peft==0.4.0",
        "bitsandbytes==0.40.2",
        "transformers==4.31.0",
        "trl==0.4.7",
    ]
)
import os

import gradio as gr
import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    logging,
    pipeline,
)
from trl import SFTTrainer
| # ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | |
# --- LoRA settings -----------------------------------------------------------
lora_r = 64          # attention dimension (rank) of the LoRA update matrices
lora_alpha = 16      # alpha parameter used for LoRA scaling
lora_dropout = 0.1   # dropout probability applied to the LoRA layers

# --- bitsandbytes settings ---------------------------------------------------
use_4bit = True                      # load the base model in 4-bit precision
bnb_4bit_compute_dtype = "float16"   # name of the torch dtype used for compute
bnb_4bit_quant_type = "nf4"          # quantization type: "fp4" or "nf4"
use_nested_quant = False             # nested (double) quantization on/off

# --- device placement --------------------------------------------------------
# The empty-string key maps the whole model onto device 0.
device_map = {"": 0}
| #---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | |
# Hub repository id of the model to load.
model_name = "DR-DRR/Model_001"
# Name of one weight shard in that repository (the model is in .bin format).
model_basename = "pytorch_model-00001-of-00002.bin"
| #------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | |
# Turn the configured dtype name (e.g. "float16") into the torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantization settings passed to from_pretrained() when loading the model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
)
# Advisory only: when running 4-bit with float16 compute on a GPU whose compute
# capability major version is >= 8, suggest switching to bfloat16.
# The is_available() guard keeps this hint from raising on machines without a
# CUDA device, where get_device_capability() cannot be queried.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)
# Load the base model with the quantization config and device placement
# defined above.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config,
)
# Adjust the loaded config: turn off use_cache and pin pretraining_tp to 1.
model.config.pretraining_tp = 1
model.config.use_cache = False
# Load the tokenizer from the same repository as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Pad on the right (fixes a weird overflow issue with fp16 training) and reuse
# the end-of-sequence token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token
# LoRA adapter configuration built from the settings defined earlier.
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    task_type="CAUSAL_LM",
    bias="none",
)
| #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- | |
# Raise the transformers log threshold to CRITICAL so warnings are not shown.
logging.set_verbosity(logging.CRITICAL)
| # Run text generation pipeline with our next model | |
| # prompt = "What is a large language model?" | |
| # pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200) | |
| # result = pipe(f"<s>[INST] {prompt} [/INST]") | |
| # print(result[0]['generated_text']) | |
# Lazily created text-generation pipeline, shared by every request.
_text_generation_pipe = None


def _get_text_generation_pipe():
    """Return the shared text-generation pipeline, building it on first use."""
    global _text_generation_pipe
    if _text_generation_pipe is None:
        _text_generation_pipe = pipeline(
            task="text-generation",
            model=model,
            tokenizer=tokenizer,
            max_length=200,
        )
    return _text_generation_pipe


def generate_text(prompt):
    """Generate a completion for *prompt* with the loaded model.

    The prompt is wrapped in the ``<s>[INST] ... [/INST]`` template before it
    is passed to the pipeline.

    Args:
        prompt: The user's input text.

    Returns:
        The ``generated_text`` string of the first pipeline result, so the UI
        text box shows plain text rather than the repr of a list of dicts.
    """
    pipe = _get_text_generation_pipe()
    result = pipe(f"<s>[INST] {prompt} [/INST]")
    return result[0]["generated_text"]
# Web UI: a single text box in, a single text box out, backed by generate_text.
# Components come from the top-level gr namespace; the older gr.inputs /
# gr.outputs namespaces are deprecated and absent from newer Gradio releases.
text_generation_interface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(label="Input Text"),
    ],
    outputs=gr.Textbox(label="Generated Text"),
    title="GPT-4 Text Generation",
)
# Launch as a separate statement so the name above stays bound to the
# Interface object rather than to the return value of launch().
text_generation_interface.launch()
| # model_name = "" | |