IST199655 committed
Commit e7c3048 · 1 Parent(s): 66e4a39

app.py CHANGED
@@ -5,7 +5,7 @@ from huggingface_hub import InferenceClient
 Copied from inference in colab notebook
 """
 
-from transformers import AutoTokenizer , AutoModelForCausalLM ,
+from transformers import AutoTokenizer , AutoModelForCausalLM , TextStreamer
 import torch
 from threading import Thread
 
@@ -93,22 +93,12 @@ def respond(
             messages.append({"role": "assistant", "content": val[1]})
     messages.append({"role": "user", "content": message})
 
-    # Create a single text prompt from the messages
-    prompt = ""
-    for msg in messages:
-        if msg["role"] == "system":
-            prompt += f"[System]: {msg['content']}\n\n"
-        elif msg["role"] == "user":
-            prompt += f"[User]: {msg['content']}\n\n"
-        elif msg["role"] == "assistant":
-            prompt += f"[Assistant]: {msg['content']}\n\n"
-
     # Tokenize the prompt
-    inputs = tokenizer(
+    inputs = tokenizer(messages, return_tensors="pt", truncation=True)
     input_ids = inputs.input_ids.to("cpu")  # Ensure input is on the CPU
 
     # Generate tokens incrementally
-    streamer =
+    streamer = TextStreamer(tokenizer, skip_prompt=True)
     generation_kwargs = {
         "input_ids": input_ids,
         "max_new_tokens": max_tokens,
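The new tokenization step above passes the raw messages list straight to the tokenizer. For chat-tuned models, the common pattern is to render that list through the tokenizer's chat template first; a minimal sketch of that pattern, using a placeholder checkpoint rather than whatever model this Space actually loads:

from transformers import AutoTokenizer

# Placeholder checkpoint for illustration only; not necessarily the model this Space uses.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

messages = [
    {"role": "system", "content": "You are a friendly chatbot."},
    {"role": "user", "content": "Hello!"},
]

# apply_chat_template renders the role/content dicts into the single prompt
# format the model was fine-tuned on, and tokenizes it when return_tensors is set.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,  # append the assistant header so the model starts a reply
    return_tensors="pt",
)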
@@ -124,8 +114,7 @@ def respond(
     response = ""
     for token in streamer:
         response += token
-        yield response
-        print(response)
+        yield response
 
 
 """
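For context on the loop above: iterating over a streamer with `for token in streamer:` is the pattern transformers supports through TextIteratorStreamer, the iterable counterpart of TextStreamer, with model.generate running on a background thread, which is presumably what the `from threading import Thread` import is for. A minimal sketch under those assumptions, again with a placeholder model:

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Placeholder checkpoint for illustration only.
model_id = "HuggingFaceH4/zephyr-7b-beta"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

input_ids = tokenizer("Hello!", return_tensors="pt").input_ids

# TextIteratorStreamer queues decoded text chunks so the caller can iterate over them.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = {
    "input_ids": input_ids,
    "max_new_tokens": 64,
    "streamer": streamer,
}

# generate() blocks until it finishes, so it runs on a worker thread while the
# main thread consumes partial text from the streamer as it arrives.
Thread(target=model.generate, kwargs=generation_kwargs).start()

response = ""
for token in streamer:
    response += token  # accumulate, then yield partial text from a generator like respond()

The design point here is that TextStreamer prints tokens to stdout as they are generated, while TextIteratorStreamer exposes them as a Python iterator, which is what a streaming Gradio callback needs.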