Revert "Replace use of transformers library with vLLM for high-speed inference"
Browse files
This reverts commit 56c4132b099df59a010a2b8a2bc9ed88150da502.
- quiz_generator.py +22 -25
- requirements.txt +1 -1
quiz_generator.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import json
|
| 2 |
import spaces
|
| 3 |
-
from
|
| 4 |
|
| 5 |
example_quiz = """
|
| 6 |
{
|
|
@@ -32,14 +32,18 @@ system_prompt = f"""
|
|
| 32 |
Final Answer:{example_quiz}
|
| 33 |
"""
|
| 34 |
|
| 35 |
-
# Initialize
|
| 36 |
model_id = "openai/gpt-oss-20b"
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
| 39 |
dtype="auto",
|
| 40 |
-
|
| 41 |
)
|
| 42 |
|
|
|
|
|
|
|
| 43 |
@spaces.GPU(duration=90)
|
| 44 |
def run_inference(prompt_messages):
|
| 45 |
"""
|
|
@@ -49,25 +53,14 @@ def run_inference(prompt_messages):
|
|
| 49 |
See https://huggingface.co/docs/hub/en/spaces-zerogpu
|
| 50 |
|
| 51 |
:param prompt_messages: The system and user messages submitted to the LLM
|
| 52 |
-
:return:
|
| 53 |
"""
|
| 54 |
-
|
| 55 |
-
prompt = llm.get_tokenizer().apply_chat_template(
|
| 56 |
prompt_messages,
|
| 57 |
-
|
| 58 |
-
add_generation_prompt=True
|
| 59 |
-
)
|
| 60 |
-
|
| 61 |
-
# Set up sampling parameters
|
| 62 |
-
sampling_params = SamplingParams(
|
| 63 |
-
max_tokens=3000,
|
| 64 |
temperature=0.7,
|
| 65 |
-
|
| 66 |
)
|
| 67 |
-
|
| 68 |
-
# Generate response
|
| 69 |
-
outputs = llm.generate([prompt], sampling_params)
|
| 70 |
-
return outputs[0].outputs[0].text
|
| 71 |
|
| 72 |
def to_final_answer(response):
|
| 73 |
"""
|
|
@@ -83,13 +76,17 @@ def to_final_answer(response):
|
|
| 83 |
"""
|
| 84 |
first_json_key = '"questions":'
|
| 85 |
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
if last_marker_idx != -1:
|
| 89 |
-
text = "{" +
|
| 90 |
else:
|
| 91 |
-
# Fallback: use the
|
| 92 |
-
text = response.strip()
|
| 93 |
print('final text:', text)
|
| 94 |
return text
|
| 95 |
|
|
|
|
| 1 |
import json
|
| 2 |
import spaces
|
| 3 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
|
| 4 |
|
| 5 |
example_quiz = """
|
| 6 |
{
|
|
|
|
| 32 |
Final Answer:{example_quiz}
|
| 33 |
"""
|
| 34 |
|
| 35 |
+
# Initialize model and pipeline
|
| 36 |
model_id = "openai/gpt-oss-20b"
|
| 37 |
+
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
| 38 |
+
|
| 39 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 40 |
+
model_id,
|
| 41 |
dtype="auto",
|
| 42 |
+
device_map="auto",
|
| 43 |
)
|
| 44 |
|
| 45 |
+
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
|
| 46 |
+
|
| 47 |
@spaces.GPU(duration=90)
|
| 48 |
def run_inference(prompt_messages):
|
| 49 |
"""
|
|
|
|
| 53 |
See https://huggingface.co/docs/hub/en/spaces-zerogpu
|
| 54 |
|
| 55 |
:param prompt_messages: The system and user messages submitted to the LLM
|
| 56 |
+
:return: All messages returned by the LLM
|
| 57 |
"""
|
| 58 |
+
return pipe(
|
|
|
|
| 59 |
prompt_messages,
|
| 60 |
+
max_new_tokens=3000,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
temperature=0.7,
|
| 62 |
+
do_sample=True,
|
| 63 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
def to_final_answer(response):
|
| 66 |
"""
|
|
|
|
| 76 |
"""
|
| 77 |
first_json_key = '"questions":'
|
| 78 |
|
| 79 |
+
# Code from https://huggingface.co/docs/transformers/en/conversations#textgenerationpipeline
|
| 80 |
+
# The assistant response is always the last in the generated_text array, so -1.
|
| 81 |
+
assistant_response = response[0]["generated_text"][-1]["content"]
|
| 82 |
+
|
| 83 |
+
print('all_generated:', assistant_response)
|
| 84 |
+
last_marker_idx = assistant_response.rfind(first_json_key)
|
| 85 |
if last_marker_idx != -1:
|
| 86 |
+
text = "{" + assistant_response[last_marker_idx:].strip()
|
| 87 |
else:
|
| 88 |
+
# Fallback: use the last response's text
|
| 89 |
+
text = response[-1]["generated_text"].strip()
|
| 90 |
print('final text:', text)
|
| 91 |
return text
|
| 92 |
|
requirements.txt
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
gradio
|
| 2 |
spaces
|
| 3 |
-
|
| 4 |
torch
|
| 5 |
accelerate
|
|
|
|
| 1 |
gradio
|
| 2 |
spaces
|
| 3 |
+
transformers
|
| 4 |
torch
|
| 5 |
accelerate
|