Spaces:
Sleeping
Sleeping
valonys Cursor committed on
Commit ·
b02f0c1
1
Parent(s): 16cb9ef
Fix inference: switch to OpenAI-compatible /v1/chat/completions endpoint
Browse files
The previous code used the legacy text-generation API with a manually
formatted prompt string. That format does not work reliably with PEFT
adapter models. The /v1/chat/completions endpoint applies the model's
own chat template automatically and streams tokens in standard OpenAI
SSE format (choices[0].delta.content).
Co-authored-by: Cursor <cursoragent@cursor.com>
- server.ts +7 -20
- src/App.tsx +2 -1
server.ts
CHANGED
|
@@ -6,7 +6,9 @@ const app = express();
|
|
| 6 |
const PORT = 7860;
|
| 7 |
|
| 8 |
const MODEL_ID = "amiguel/qwen2.5-7b-instruct-ai_llm-sft";
|
| 9 |
-
|
|
|
|
|
|
|
| 10 |
|
| 11 |
app.use(express.json());
|
| 12 |
|
|
@@ -21,15 +23,6 @@ app.post('/api/chat', async (req: Request, res: Response) => {
|
|
| 21 |
return res.status(401).json({ error: "HF_TOKEN environment variable not set." });
|
| 22 |
}
|
| 23 |
|
| 24 |
-
// Qwen 2.5 ChatML prompt template
|
| 25 |
-
const prompt = messages
|
| 26 |
-
.map((m: { role: string; content: string }) => {
|
| 27 |
-
if (m.role === 'user') return `<|im_start|>user\n${m.content}<|im_end|>\n`;
|
| 28 |
-
if (m.role === 'assistant') return `<|im_start|>assistant\n${m.content}<|im_end|>\n`;
|
| 29 |
-
return '';
|
| 30 |
-
})
|
| 31 |
-
.join('') + '<|im_start|>assistant\n';
|
| 32 |
-
|
| 33 |
try {
|
| 34 |
const response = await fetch(API_URL, {
|
| 35 |
method: 'POST',
|
|
@@ -38,17 +31,11 @@ app.post('/api/chat', async (req: Request, res: Response) => {
|
|
| 38 |
'Content-Type': 'application/json',
|
| 39 |
},
|
| 40 |
body: JSON.stringify({
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
return_full_text: false,
|
| 45 |
-
do_sample: true,
|
| 46 |
-
temperature: 0.7,
|
| 47 |
-
},
|
| 48 |
stream: true,
|
| 49 |
-
|
| 50 |
-
wait_for_model: true,
|
| 51 |
-
},
|
| 52 |
}),
|
| 53 |
});
|
| 54 |
|
|
|
|
| 6 |
const PORT = 7860;
|
| 7 |
|
| 8 |
const MODEL_ID = "amiguel/qwen2.5-7b-instruct-ai_llm-sft";
|
| 9 |
+
// OpenAI-compatible chat completions endpoint — handles PEFT adapters,
|
| 10 |
+
// applies the model's own chat template, and streams in standard SSE format.
|
| 11 |
+
const API_URL = `https://api-inference.huggingface.co/models/${MODEL_ID}/v1/chat/completions`;
|
| 12 |
|
| 13 |
app.use(express.json());
|
| 14 |
|
|
|
|
| 23 |
return res.status(401).json({ error: "HF_TOKEN environment variable not set." });
|
| 24 |
}
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
try {
|
| 27 |
const response = await fetch(API_URL, {
|
| 28 |
method: 'POST',
|
|
|
|
| 31 |
'Content-Type': 'application/json',
|
| 32 |
},
|
| 33 |
body: JSON.stringify({
|
| 34 |
+
model: MODEL_ID,
|
| 35 |
+
messages,
|
| 36 |
+
max_tokens: 1024,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
stream: true,
|
| 38 |
+
temperature: 0.7,
|
|
|
|
|
|
|
| 39 |
}),
|
| 40 |
});
|
| 41 |
|
src/App.tsx
CHANGED
|
@@ -57,7 +57,8 @@ export default function App() {
|
|
| 57 |
|
| 58 |
try {
|
| 59 |
const parsed = JSON.parse(jsonStr);
|
| 60 |
-
|
|
|
|
| 61 |
|
| 62 |
if (token) {
|
| 63 |
setMessages((prev) => {
|
|
|
|
| 57 |
|
| 58 |
try {
|
| 59 |
const parsed = JSON.parse(jsonStr);
|
| 60 |
+
// OpenAI-compatible SSE: choices[0].delta.content
|
| 61 |
+
const token: string = parsed.choices?.[0]?.delta?.content ?? '';
|
| 62 |
|
| 63 |
if (token) {
|
| 64 |
setMessages((prev) => {
|