Spaces:
Runtime error
Runtime error
import gradio as gr
from gpt4all import GPT4All
# from transformers import AutoModelForCausalLM, AutoTokenizer
# File name of the quantised GGML checkpoint handed to GPT4All.
# NOTE(review): presumably GPT4All resolves (and may download) this file by
# name -- confirm against the installed gpt4all version.
_MODEL_FILE = "wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin"

# Module-level model instance; generate_text() reads it on every call.
model = GPT4All(_MODEL_FILE)
| # #---------------------------------------------------------------------------------------------------------------------------- | |
| # # !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 | |
| # # import os | |
| # import torch | |
| # from datasets import load_dataset | |
| # from transformers import ( | |
| # AutoModelForCausalLM, | |
| # AutoTokenizer, | |
| # BitsAndBytesConfig, | |
| # HfArgumentParser, | |
| # TrainingArguments, | |
| # pipeline, | |
| # logging, | |
| # ) | |
| # from peft import LoraConfig, PeftModel | |
| # from trl import SFTTrainer | |
| # # ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | |
| # # LoRA attention dimension | |
| # lora_r = 64 | |
| # # Alpha parameter for LoRA scaling | |
| # lora_alpha = 16 | |
| # # Dropout probability for LoRA layers | |
| # lora_dropout = 0.1 | |
| # ################################################################################ | |
| # # bitsandbytes parameters | |
| # ################################################################################ | |
| # # Activate 4-bit precision base model loading | |
| # use_4bit = True | |
| # # Compute dtype for 4-bit base models | |
| # bnb_4bit_compute_dtype = "float16" | |
| # # Quantization type (fp4 or nf4) | |
| # bnb_4bit_quant_type = "nf4" | |
| # # Activate nested quantization for 4-bit base models (double quantization) | |
| # use_nested_quant = False | |
| # # Load the entire model on the GPU 0 | |
| # device_map = {"": 0} | |
| # #---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | |
| # model_name = "DR-DRR/Model_001" | |
| # model_basename = "pytorch_model-00001-of-00002.bin" # the model is in bin format | |
| # #------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | |
| # # Load tokenizer and model with QLoRA configuration | |
| # compute_dtype = getattr(torch, bnb_4bit_compute_dtype) | |
| # bnb_config = BitsAndBytesConfig( | |
| # load_in_4bit=use_4bit, | |
| # bnb_4bit_quant_type=bnb_4bit_quant_type, | |
| # bnb_4bit_compute_dtype=compute_dtype, | |
| # bnb_4bit_use_double_quant=use_nested_quant, | |
| # ) | |
| # # Check GPU compatibility with bfloat16 | |
| # if compute_dtype == torch.float16 and use_4bit: | |
| # major, _ = torch.cuda.get_device_capability() | |
| # if major >= 8: | |
| # print("=" * 80) | |
| # print("Your GPU supports bfloat16: accelerate training with bf16=True") | |
| # print("=" * 80) | |
| # # Load base model | |
| # model = AutoModelForCausalLM.from_pretrained( | |
| # model_name, | |
| # quantization_config=bnb_config, | |
| # device_map=device_map | |
| # ) | |
| # model.config.use_cache = False | |
| # model.config.pretraining_tp = 1 | |
| # # Load LLaMA tokenizer | |
| # tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) | |
| # tokenizer.pad_token = tokenizer.eos_token | |
| # tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training | |
| # # Load LoRA configuration | |
| # peft_config = LoraConfig( | |
| # lora_alpha=lora_alpha, | |
| # lora_dropout=lora_dropout, | |
| # r=lora_r, | |
| # bias="none", | |
| # task_type="CAUSAL_LM", | |
| # ) | |
| # #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- | |
| # # Ignore warnings | |
| # logging.set_verbosity(logging.CRITICAL) | |
| # Run text generation pipeline with our next model | |
| # prompt = "What is a large language model?" | |
| # pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200) | |
| # result = pipe(f"<s>[INST] {prompt} [/INST]") | |
| # print(result[0]['generated_text']) | |
| # --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | |
| # Ignore warnings | |
| # logging.set_verbosity(logging.CRITICAL) | |
| # Run text generation pipeline with our next model | |
| # prompt = "What is a large language model?" | |
| # pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200) | |
| # result = pipe(f"<s>[INST] {prompt} [/INST]") | |
| # print(result[0]['generated_text']) | |
def generate_text(prompt):
    """Return the module-level model's completion for *prompt*.

    The prompt is forwarded unchanged to ``model.generate`` (no chat/instruction
    template is applied here) and the call's return value is passed straight
    back to the caller.
    """
    return model.generate(prompt)
# Build the Gradio UI: one text input wired to generate_text, one text output.
# gr.Textbox replaces the gr.inputs.Textbox / gr.outputs.Textbox aliases, which
# were deprecated in Gradio 3 and removed in Gradio 4.
# NOTE(review): the title says "GPT-4" but the model loaded in this file is a
# GPT4All WizardLM checkpoint -- confirm the intended label.
text_generation_interface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(label="Input Text"),
    ],
    outputs=gr.Textbox(label="Generated Text"),
    title="GPT-4 Text Generation",
)

# Launch as a separate statement so the name above stays bound to the
# Interface object rather than to launch()'s return value.
text_generation_interface.launch()
| # model_name = "" | |