timadair committed
Commit 74aae71 · 1 Parent(s): eabb6a7

Revert "Replace use of transformers library with vLLM for high-speed inference"


This reverts commit 56c4132b099df59a010a2b8a2bc9ed88150da502.

Files changed (2):
  1. quiz_generator.py +22 -25
  2. requirements.txt +1 -1
quiz_generator.py CHANGED
@@ -1,6 +1,6 @@
 import json
 import spaces
-from vllm import LLM, SamplingParams
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 
 example_quiz = """
 {
@@ -32,14 +32,18 @@ system_prompt = f"""
 Final Answer:{example_quiz}
 """
 
-# Initialize vLLM model
+# Initialize model and pipeline
 model_id = "openai/gpt-oss-20b"
-llm = LLM(
-    model=model_id,
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
     dtype="auto",
-    tensor_parallel_size=1,  # Adjust based on your GPU setup
+    device_map="auto",
 )
 
+pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+
 @spaces.GPU(duration=90)
 def run_inference(prompt_messages):
     """
@@ -49,25 +53,14 @@ def run_inference(prompt_messages):
     See https://huggingface.co/docs/hub/en/spaces-zerogpu
 
     :param prompt_messages: The system and user messages submitted to the LLM
-    :return: Generated text from vLLM
+    :return: All messages returned by the LLM
     """
-    # Use the model's chat template to format messages
-    prompt = llm.get_tokenizer().apply_chat_template(
+    return pipe(
         prompt_messages,
-        tokenize=False,
-        add_generation_prompt=True
-    )
-
-    # Set up sampling parameters
-    sampling_params = SamplingParams(
-        max_tokens=3000,
+        max_new_tokens=3000,
         temperature=0.7,
-        top_p=0.9,
+        do_sample=True,
     )
-
-    # Generate response
-    outputs = llm.generate([prompt], sampling_params)
-    return outputs[0].outputs[0].text
 
 def to_final_answer(response):
     """
@@ -83,13 +76,17 @@ def to_final_answer(response):
     """
     first_json_key = '"questions":'
 
-    print('all_generated:', response)
-    last_marker_idx = response.rfind(first_json_key)
+    # Code from https://huggingface.co/docs/transformers/en/conversations#textgenerationpipeline
+    # The assistant response is always the last in the generated_text array, so -1.
+    assistant_response = response[0]["generated_text"][-1]["content"]
+
+    print('all_generated:', assistant_response)
+    last_marker_idx = assistant_response.rfind(first_json_key)
     if last_marker_idx != -1:
-        text = "{" + response[last_marker_idx:].strip()
+        text = "{" + assistant_response[last_marker_idx:].strip()
     else:
-        # Fallback: use the entire response
-        text = response.strip()
+        # Fallback: use the last response's text
+        text = response[-1]["generated_text"].strip()
     print('final text:', text)
     return text
requirements.txt CHANGED
@@ -1,5 +1,5 @@
 gradio
 spaces
-vllm
+transformers
 torch
 accelerate
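A quick way to sanity-check the restored dependency list is to load the same model through transformers directly; a sketch assuming the model fits on the available hardware (the prompt and token budget are arbitrary):

from transformers import pipeline

# Mirrors the setup in quiz_generator.py; device_map="auto" relies on accelerate.
pipe = pipeline("text-generation", model="openai/gpt-oss-20b", device_map="auto")
out = pipe([{"role": "user", "content": "Say hello."}], max_new_tokens=16)
print(out[0]["generated_text"][-1]["content"])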