Update app.py
app.py CHANGED
@@ -3,6 +3,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 import os
 from huggingface_hub import login
+from peft import PeftModel, PeftConfig
 
 # Login with HF_TOKEN (if available)
 hf_token = os.environ.get("HF_TOKEN")
@@ -15,19 +16,30 @@ if hf_token:
 else:
     st.warning("HF_TOKEN environment variable not set. Some features may be limited.")
 
+# Model and Adapter Configuration
+model_id = "google/gemma-2b-it" # Base model
+adapter_id = "Prajjwalng/gemma_customercare_adapters" #adapter model
+
 # Initialize model and tokenizer (load only once)
 @st.cache_resource
-def load_model():
-
-
-
-
+def load_model(model_id, adapter_id):
+    base_model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        low_cpu_mem_usage=True,
+        return_dict=True,
+        torch_dtype=torch.float16,
+        device_map={"": 0} if torch.cuda.is_available() else "cpu"
+    )
+
+    merged_model = PeftModel.from_pretrained(base_model, adapter_id)
+    tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)
+    return merged_model, tokenizer
 
-
+merged_model, tokenizer = load_model(model_id, adapter_id)
 
 # Function to generate chatbot response using the provided template
 def get_completion(query: str, model, tokenizer) -> str:
-    device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
     prompt_template = f"""
 <start_of_turn>system You are a support chatbot who helps with user queries chatbot who always responds in the style of a professional.\n<end_of_turn>
@@ -45,7 +57,7 @@ def get_completion(query: str, model, tokenizer) -> str:
 
     model_inputs = encodeds.to(device)
 
-    model.to(device)
+    model.to(device)
 
     generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.eos_token_id)
     decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
@@ -53,7 +65,7 @@ def get_completion(query: str, model, tokenizer) -> str:
     return model_response
 
 # Streamlit app
-st.title("Gemma-2b-it
+st.title("Gemma-2b-it Customer Care Chatbot")
 
 # Initialize chat history
 if "messages" not in st.session_state:
@@ -76,7 +88,7 @@ if prompt := st.chat_input("How can I help you?"):
     with st.chat_message("assistant"):
         message_placeholder = st.empty()
         full_response = ""
-        response = get_completion(prompt,
+        response = get_completion(prompt, merged_model, tokenizer)
 
         # Simulate stream of responses with milliseconds delay
         import time
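
For readers skimming the commit: the new load_model() attaches a PEFT adapter to the base Gemma checkpoint rather than loading a standalone fine-tuned model, and @st.cache_resource keeps the pair in memory across Streamlit reruns. A minimal standalone sketch of the same pattern follows; the model and adapter IDs are taken from the diff, and the merge_and_unload() mentioned in the comment is an optional extra this commit does not perform:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

model_id = "google/gemma-2b-it"
adapter_id = "Prajjwalng/gemma_customercare_adapters"

# Load the frozen base weights; fp16 halves memory, and device_map places
# the whole model on GPU 0 when one is available.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map={"": 0} if torch.cuda.is_available() else "cpu",
)

# Attach the fine-tuned adapter weights on top of the base model. Despite
# the variable name "merged_model" in the commit, nothing is merged here;
# an actual merge would be: model = model.merge_and_unload()
model = PeftModel.from_pretrained(base_model, adapter_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Calling merge_and_unload() would bake the adapter into the base weights and drop the PEFT wrapper, trading the ability to swap adapters for slightly faster inference.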
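
The diff shows get_completion() only in fragments (the system turn, the encode/generate/decode lines, and the final return). A plausible end-to-end reconstruction is sketched below; the user and model turns of the prompt and the response-extraction step are assumptions, since those lines are elided from the rendered diff:

def get_completion(query: str, model, tokenizer) -> str:
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # Gemma-style turn markers; only the system turn appears in the diff,
    # so the user and model turns below are assumed.
    prompt_template = f"""
<start_of_turn>system You are a support chatbot who helps with user queries chatbot who always responds in the style of a professional.\n<end_of_turn>
<start_of_turn>user
{query}<end_of_turn>
<start_of_turn>model
"""
    encodeds = tokenizer(prompt_template, return_tensors="pt")
    model_inputs = encodeds.to(device)

    model.to(device)

    generated_ids = model.generate(**model_inputs, max_new_tokens=1000,
                                   do_sample=True,
                                   pad_token_id=tokenizer.eos_token_id)
    decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    # Assumed extraction step (elided in the rendered diff): strip the prompt
    # text from the front of the decoded output so only the reply remains.
    prompt_text = tokenizer.decode(model_inputs["input_ids"][0],
                                   skip_special_tokens=True)
    model_response = decoded[len(prompt_text):].strip()
    return model_response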