Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -8,6 +8,8 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 
 
+# Set the environment variable
+os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
 
 DESCRIPTION = """\
 # Llama 3.2 3B Instruct
@@ -18,10 +20,9 @@ For more details, please check [our post](https://huggingface.co/blog/llama32).
 
 # Access token for the model (if required)
 access_token = os.getenv('HF_TOKEN')
-
 # Download the Base model
 #model_id = "./models/Llama-32-3B-Instruct"
-model_id = "Mikhil-jivus/Llama-32-
+model_id = "Mikhil-jivus/Llama-32-3B-FineTuned-Instruct-v4"
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
@@ -29,15 +30,14 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 #model_id = "nltpt/Llama-3.2-3B-Instruct"
-tokenizer = AutoTokenizer.from_pretrained(model_id
+tokenizer = AutoTokenizer.from_pretrained(model_id)
 tokenizer.padding_side = 'right'
-tokenizer.
-tokenizer.pad_token = "<|finetune_right_pad_id|>"
+tokenizer.pad_token = tokenizer.eos_token
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
-    device_map=
+    device_map=device,
     torch_dtype=torch.bfloat16,
-
+    local_files_only = True
 )
 model.eval()
 
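Note: access_token is read from HF_TOKEN earlier in the file but is not passed to from_pretrained in the lines shown, and local_files_only = True makes loading fail unless the weights are already in the local Hugging Face cache, which may be related to the Space's runtime error. Below is a minimal sketch of the loading step, not part of this commit, assuming the repo may be gated and the token should be forwarded; identifiers reuse the names defined in app.py.

import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

access_token = os.getenv("HF_TOKEN")
model_id = "Mikhil-jivus/Llama-32-3B-FineTuned-Instruct-v4"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Forward the token so gated/private repos can be downloaded; omit
# local_files_only=True so the weights can be fetched on first run.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)
tokenizer.padding_side = "right"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Llama tokenizers ship without a pad token

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=access_token,
    device_map=device,
    torch_dtype=torch.bfloat16,
)
model.eval()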
@@ -63,15 +63,26 @@ def generate(
     )
     conversation.append({"role": "user", "content": message})
 
-
+    # Set pad_token_id if it's not already set
+    if tokenizer.pad_token_id is None:
+        tokenizer.padding_side = 'right'
+        tokenizer.pad_token = tokenizer.eos_token
+
+    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, add_special_tokens=True, return_tensors="pt", padding=True, return_attention_mask=True)
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+    # Ensure attention mask is set
+    #attention_mask = input_ids['attention_mask']
+
     input_ids = input_ids.to(model.device)
+    #attention_mask = attention_mask.to(model.device)
+
+
 
     streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
-
+        input_ids=input_ids,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,
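In the hunk above, apply_chat_template is called with return_tensors="pt" and return_attention_mask=True but without return_dict=True, so it returns a bare tensor; that is why the attention_mask = input_ids['attention_mask'] line stays commented out. A sketch of how the mask could actually be obtained, assuming a transformers version where apply_chat_template accepts return_dict (names reuse those inside generate(); not part of this commit):

    # Ask apply_chat_template for a dict so an attention mask comes back
    # alongside the token ids.
    inputs = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,  # -> {"input_ids": ..., "attention_mask": ...}
    )
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)

    generate_kwargs = dict(
        input_ids=input_ids,
        attention_mask=attention_mask,  # avoids the pad-token/attention-mask warning
        streamer=streamer,
        max_new_tokens=max_new_tokens,
    )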
@@ -79,7 +90,7 @@ def generate(
         top_k=top_k,
         temperature=temperature,
         num_beams=1,
-        repetition_penalty=repetition_penalty
+        repetition_penalty=repetition_penalty
     )
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
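The tail of generate() falls outside the changed hunks. With TextIteratorStreamer, model.generate() runs in the background Thread started above and pushes decoded text into the streamer, which the Gradio callback drains. A typical consumer loop, assumed rather than shown in this diff (names reuse those in app.py):

    # Drain the streamer and yield progressively longer strings to Gradio.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)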