Update app.py
app.py CHANGED
@@ -1,16 +1,17 @@
 import gradio as gr
 import os
 import torch
-from
+from peft import AutoPeftModelForCausalLM
+from transformers import AutoTokenizer
 from huggingface_hub import spaces
 
-# Get Hugging Face token from environment variables
-HF_TOKEN = os.environ.get('HF_TOKEN')
-
 # Check if we're running in a Hugging Face Space with GPU constraints
 IS_SPACES_ZERO = os.environ.get("SPACES_ZERO_GPU", "0") == "1"
 IS_SPACE = os.environ.get("SPACE_ID", None) is not None
 
+# Get Hugging Face token from environment variables
+HF_TOKEN = os.environ.get('HF_TOKEN')
+
 # Determine device (use GPU if available)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 LOW_MEMORY = os.getenv("LOW_MEMORY", "0") == "1"
@@ -19,31 +20,28 @@ print(f"Using device: {device}")
 print(f"Low memory mode: {LOW_MEMORY}")
 
 # Model configuration
-
-dtype = torch.float16 if device == "cuda" else torch.float32
-load_in_4bit = True  # Enable 4-bit quantization if memory is limited
+load_in_4bit = True  # Use 4-bit quantization if memory is constrained
 
 # Load model and tokenizer with device mapping
+# Replace with the name of your trained model
 model_name = "nafisneehal/chandler_bot"
-model
-model_name
-    max_seq_length=max_seq_length,
-    dtype=dtype,
+model = AutoPeftModelForCausalLM.from_pretrained(
+    model_name,
     load_in_4bit=load_in_4bit,
     device_map="auto" if device == "cuda" else None  # Automatic GPU mapping
 )
-
+tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 # Define prompt structure (update if necessary for your model)
 alpaca_prompt = "{instruction} {input} {output}"
 
-
+instruction = "Chat with me like Chandler"
 
 @spaces.GPU  # Use GPU provided by Hugging Face Spaces if available
 def generate_response(user_input, chat_history):
-    instruction =
-    input_text =
+    instruction = user_input  # Treat the user's message as the instruction
+    input_text = ""  # Any additional input if needed; leave blank otherwise
 
     # Prepare inputs for model inference on the correct device
     inputs = tokenizer(
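The hunk ends inside the tokenizer call, so the rest of generate_response is not part of this diff. For orientation, a minimal sketch of how the function might continue with the AutoPeftModelForCausalLM/AutoTokenizer setup above; the prompt formatting, generation parameters, and Gradio return convention are assumptions, not the committed code. (Note also that the ZeroGPU decorator normally comes from the standalone spaces package, i.e. import spaces, rather than from huggingface_hub.)

# Sketch of a possible continuation -- assumed, not part of the commit.
# Relies on model, tokenizer, alpaca_prompt, and device defined above.
def generate_response(user_input, chat_history):
    instruction = user_input  # treat the user's message as the instruction
    input_text = ""           # no additional input

    # Prepare inputs for model inference on the correct device
    prompt = alpaca_prompt.format(
        instruction=instruction, input=input_text, output=""
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate a reply; max_new_tokens and sampling settings are assumptions
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=128, do_sample=True)

    # Decode only the newly generated tokens, not the echoed prompt
    reply = tokenizer.decode(
        output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )

    chat_history.append((user_input, reply))
    return "", chat_history

With a tuple-style gr.Chatbot, returning an empty string clears the input textbox while the updated chat_history re-renders the conversation.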