valonys Cursor committed on
Commit
b02f0c1
·
1 Parent(s): 16cb9ef

Fix inference: switch to OpenAI-compatible /v1/chat/completions endpoint

Browse files

The previous code used the legacy text-generation API with a manually
formatted prompt string. That format does not work reliably with PEFT
adapter models. The /v1/chat/completions endpoint applies the model's
own chat template automatically and streams tokens in standard OpenAI
SSE format (choices[0].delta.content).

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (2) hide show
  1. server.ts +7 -20
  2. src/App.tsx +2 -1
server.ts CHANGED
@@ -6,7 +6,9 @@ const app = express();
6
  const PORT = 7860;
7
 
8
  const MODEL_ID = "amiguel/qwen2.5-7b-instruct-ai_llm-sft";
9
- const API_URL = `https://api-inference.huggingface.co/models/${MODEL_ID}`;
 
 
10
 
11
  app.use(express.json());
12
 
@@ -21,15 +23,6 @@ app.post('/api/chat', async (req: Request, res: Response) => {
21
  return res.status(401).json({ error: "HF_TOKEN environment variable not set." });
22
  }
23
 
24
- // Qwen 2.5 ChatML prompt template
25
- const prompt = messages
26
- .map((m: { role: string; content: string }) => {
27
- if (m.role === 'user') return `<|im_start|>user\n${m.content}<|im_end|>\n`;
28
- if (m.role === 'assistant') return `<|im_start|>assistant\n${m.content}<|im_end|>\n`;
29
- return '';
30
- })
31
- .join('') + '<|im_start|>assistant\n';
32
-
33
  try {
34
  const response = await fetch(API_URL, {
35
  method: 'POST',
@@ -38,17 +31,11 @@ app.post('/api/chat', async (req: Request, res: Response) => {
38
  'Content-Type': 'application/json',
39
  },
40
  body: JSON.stringify({
41
- inputs: prompt,
42
- parameters: {
43
- max_new_tokens: 1024,
44
- return_full_text: false,
45
- do_sample: true,
46
- temperature: 0.7,
47
- },
48
  stream: true,
49
- options: {
50
- wait_for_model: true,
51
- },
52
  }),
53
  });
54
 
 
6
  const PORT = 7860;
7
 
8
  const MODEL_ID = "amiguel/qwen2.5-7b-instruct-ai_llm-sft";
9
+ // OpenAI-compatible chat completions endpoint — handles PEFT adapters,
10
+ // applies the model's own chat template, and streams in standard SSE format.
11
+ const API_URL = `https://api-inference.huggingface.co/models/${MODEL_ID}/v1/chat/completions`;
12
 
13
  app.use(express.json());
14
 
 
23
  return res.status(401).json({ error: "HF_TOKEN environment variable not set." });
24
  }
25
 
 
 
 
 
 
 
 
 
 
26
  try {
27
  const response = await fetch(API_URL, {
28
  method: 'POST',
 
31
  'Content-Type': 'application/json',
32
  },
33
  body: JSON.stringify({
34
+ model: MODEL_ID,
35
+ messages,
36
+ max_tokens: 1024,
 
 
 
 
37
  stream: true,
38
+ temperature: 0.7,
 
 
39
  }),
40
  });
41
 
src/App.tsx CHANGED
@@ -57,7 +57,8 @@ export default function App() {
57
 
58
  try {
59
  const parsed = JSON.parse(jsonStr);
60
- const token = parsed.token?.text || '';
 
61
 
62
  if (token) {
63
  setMessages((prev) => {
 
57
 
58
  try {
59
  const parsed = JSON.parse(jsonStr);
60
+ // OpenAI-compatible SSE: choices[0].delta.content
61
+ const token: string = parsed.choices?.[0]?.delta?.content ?? '';
62
 
63
  if (token) {
64
  setMessages((prev) => {