Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -46,6 +46,8 @@ def load_model_for_zerocpu():
|
|
| 46 |
model_type="llama",
|
| 47 |
gpu_layers=0
|
| 48 |
)
|
|
|
|
|
|
|
| 49 |
tokenizer = AutoTokenizer.from_pretrained(ORIGINAL_MODEL_ID)
|
| 50 |
if tokenizer.pad_token is None:
|
| 51 |
tokenizer.pad_token = tokenizer.eos_token
|
|
@@ -79,16 +81,36 @@ def predict_chat(message: str, history: list):
|
|
| 79 |
yield "Error: Model or tokenizer failed to load. Please check the Space logs for details."
|
| 80 |
return
|
| 81 |
|
| 82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
messages.append({"role": "user", "content": message})
|
| 84 |
|
| 85 |
generated_text = ""
|
| 86 |
start_time = time.time()
|
| 87 |
|
| 88 |
-
# CORRECTED: Check against ctransformers.llm.LLM directly
|
| 89 |
if GGUF_AVAILABLE and isinstance(model, LLM):
|
| 90 |
print("Using GGUF model generation path.")
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
repetition_penalty=1.1,
|
| 93 |
stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"],
|
| 94 |
stream=True
|
|
@@ -96,20 +118,21 @@ def predict_chat(message: str, history: list):
|
|
| 96 |
generated_text += token
|
| 97 |
yield generated_text
|
| 98 |
except Exception as e:
|
| 99 |
-
print(f"Error in GGUF generation: {e}")
|
| 100 |
-
# Fallback to non-streaming generation
|
|
|
|
| 101 |
output = model(
|
| 102 |
prompt_input,
|
| 103 |
max_new_tokens=MAX_NEW_TOKENS,
|
| 104 |
temperature=TEMPERATURE,
|
| 105 |
top_k=TOP_K,
|
| 106 |
top_p=TOP_P,
|
| 107 |
-
|
| 108 |
repetition_penalty=1.1,
|
| 109 |
stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"]
|
| 110 |
)
|
| 111 |
-
|
| 112 |
-
generated_text
|
| 113 |
yield generated_text
|
| 114 |
|
| 115 |
else:
|
|
@@ -117,18 +140,25 @@ def predict_chat(message: str, history: list):
|
|
| 117 |
input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 118 |
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
|
| 119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
outputs = model.generate(
|
| 121 |
inputs,
|
| 122 |
max_length=inputs.shape[-1] + MAX_NEW_TOKENS,
|
| 123 |
temperature=TEMPERATURE,
|
| 124 |
top_k=TOP_K,
|
| 125 |
top_p=TOP_P,
|
| 126 |
-
|
| 127 |
pad_token_id=tokenizer.pad_token_id
|
| 128 |
)
|
| 129 |
generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
|
| 130 |
yield generated_text
|
| 131 |
-
|
| 132 |
end_time = time.time()
|
| 133 |
print(f"Inference Time for this turn: {end_time - start_time:.2f} seconds")
|
| 134 |
|
|
|
|
| 46 |
model_type="llama",
|
| 47 |
gpu_layers=0
|
| 48 |
)
|
| 49 |
+
# ctransformers models are called with a plain prompt string, so the tokenizer is loaded separately and used only to build that prompt.
|
| 50 |
+
# We use the original model's tokenizer for consistency and template application.
|
| 51 |
tokenizer = AutoTokenizer.from_pretrained(ORIGINAL_MODEL_ID)
|
| 52 |
if tokenizer.pad_token is None:
|
| 53 |
tokenizer.pad_token = tokenizer.eos_token
|
|
|
|
| 81 |
yield "Error: Model or tokenizer failed to load. Please check the Space logs for details."
|
| 82 |
return
|
| 83 |
|
| 84 |
+
# Gradio history is already formatted as a list of lists: [[user_msg, bot_msg], ...]
|
| 85 |
+
# We need to convert it to the format expected by the tokenizer's chat template.
|
| 86 |
+
messages = [{"role": "system", "content": "You are a friendly chatbot."}]
|
| 87 |
+
for human, assistant in history:
|
| 88 |
+
messages.append({"role": "user", "content": human})
|
| 89 |
+
messages.append({"role": "assistant", "content": assistant})
|
| 90 |
messages.append({"role": "user", "content": message})
|
| 91 |
|
| 92 |
generated_text = ""
|
| 93 |
start_time = time.time()
|
| 94 |
|
| 95 |
+
# CORRECTED: Check against ctransformers.llm.LLM directly and ensure parameters are correct
|
| 96 |
if GGUF_AVAILABLE and isinstance(model, LLM):
|
| 97 |
print("Using GGUF model generation path.")
|
| 98 |
+
# Apply chat template for GGUF models as well,
|
| 99 |
+
# though ctransformers might expect a simpler string.
|
| 100 |
+
# This can be adjusted if the model has a specific prompt format.
|
| 101 |
+
# For Llama-based models, the tokenizer.apply_chat_template should work.
|
| 102 |
+
prompt_input = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 103 |
+
|
| 104 |
+
try:
|
| 105 |
+
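# do_sample is passed as a keyword argument rather than embedded in the prompt string. NOTE(review): confirm that ctransformers' LLM.__call__ accepts do_sample; its documented sampling arguments are top_k, top_p, temperature and repetition_penalty.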
|
| 106 |
+
# Also, 'stream=True' is crucial for token-by-token output in Gradio
|
| 107 |
+
for token in model(
|
| 108 |
+
prompt_input,
|
| 109 |
+
max_new_tokens=MAX_NEW_TOKENS,
|
| 110 |
+
temperature=TEMPERATURE,
|
| 111 |
+
top_k=TOP_K,
|
| 112 |
+
top_p=TOP_P,
|
| 113 |
+
do_sample=DO_SAMPLE, # Corrected parameter passing
|
| 114 |
repetition_penalty=1.1,
|
| 115 |
stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"],
|
| 116 |
stream=True
|
|
|
|
| 118 |
generated_text += token
|
| 119 |
yield generated_text
|
| 120 |
except Exception as e:
|
| 121 |
+
print(f"Error in GGUF streaming generation: {e}")
|
| 122 |
+
# Fallback to non-streaming generation if streaming fails
|
| 123 |
+
# Ensure the output is processed correctly
|
| 124 |
output = model(
|
| 125 |
prompt_input,
|
| 126 |
max_new_tokens=MAX_NEW_TOKENS,
|
| 127 |
temperature=TEMPERATURE,
|
| 128 |
top_k=TOP_K,
|
| 129 |
top_p=TOP_P,
|
| 130 |
+
do_sample=DO_SAMPLE, # Corrected parameter passing
|
| 131 |
repetition_penalty=1.1,
|
| 132 |
stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"]
|
| 133 |
)
|
| 134 |
+
# If not streaming, the 'output' is the complete string
|
| 135 |
+
generated_text = output
|
| 136 |
yield generated_text
|
| 137 |
|
| 138 |
else:
|
|
|
|
| 140 |
input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 141 |
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
|
| 142 |
|
| 143 |
+
# Hugging Face generation path: the response is generated in full and then yielded once to Gradio (no token streaming here).
|
| 144 |
+
# Note: `model.generate` for Hugging Face `transformers` typically doesn't stream token by token
|
| 145 |
+
# in the same way ctransformers does directly. For true streaming with HF models,
|
| 146 |
+
# you'd often need a custom generation loop or a specific streaming API.
|
| 147 |
+
# For this example, we'll generate the full response and then yield it.
|
| 148 |
+
# If true token-by-token streaming is critical for the HF model,
|
| 149 |
+
# you might need to adjust this part or use a different model.
|
| 150 |
outputs = model.generate(
|
| 151 |
inputs,
|
| 152 |
max_length=inputs.shape[-1] + MAX_NEW_TOKENS,
|
| 153 |
temperature=TEMPERATURE,
|
| 154 |
top_k=TOP_K,
|
| 155 |
top_p=TOP_P,
|
| 156 |
+
do_sample=DO_SAMPLE, # Uncommented for use
|
| 157 |
pad_token_id=tokenizer.pad_token_id
|
| 158 |
)
|
| 159 |
generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
|
| 160 |
yield generated_text
|
| 161 |
+
|
| 162 |
end_time = time.time()
|
| 163 |
print(f"Inference Time for this turn: {end_time - start_time:.2f} seconds")
|
| 164 |
|