Spaces:

amiguel
/

expertInsp

Sleeping

valonys Cursor committed 28 days ago

Commit

b02f0c1

1 Parent(s): 16cb9ef

Fix inference: switch to OpenAI-compatible /v1/chat/completions endpoint

The previous code used the legacy text-generation API with a manually
formatted prompt string. That format does not work reliably with PEFT
adapter models. The /v1/chat/completions endpoint applies the model's
own chat template automatically and streams tokens in standard OpenAI
SSE format (choices[0].delta.content).

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (2) hide show

server.ts +7 -20
src/App.tsx +2 -1

server.ts CHANGED Viewed

@@ -6,7 +6,9 @@ const app = express();
 const PORT = 7860;
 const MODEL_ID = "amiguel/qwen2.5-7b-instruct-ai_llm-sft";
-const API_URL = `https://api-inference.huggingface.co/models/${MODEL_ID}`;
 app.use(express.json());
@@ -21,15 +23,6 @@ app.post('/api/chat', async (req: Request, res: Response) => {
     return res.status(401).json({ error: "HF_TOKEN environment variable not set." });
   }
-  // Qwen 2.5 ChatML prompt template
-  const prompt = messages
-    .map((m: { role: string; content: string }) => {
-      if (m.role === 'user') return `<|im_start|>user\n${m.content}<|im_end|>\n`;
-      if (m.role === 'assistant') return `<|im_start|>assistant\n${m.content}<|im_end|>\n`;
-      return '';
-    })
-    .join('') + '<|im_start|>assistant\n';
   try {
     const response = await fetch(API_URL, {
       method: 'POST',
@@ -38,17 +31,11 @@ app.post('/api/chat', async (req: Request, res: Response) => {
         'Content-Type': 'application/json',
       },
       body: JSON.stringify({
-        inputs: prompt,
-        parameters: {
-          max_new_tokens: 1024,
-          return_full_text: false,
-          do_sample: true,
-          temperature: 0.7,
-        },
         stream: true,
-        options: {
-          wait_for_model: true,
-        },
       }),
     });

 const PORT = 7860;
 const MODEL_ID = "amiguel/qwen2.5-7b-instruct-ai_llm-sft";
+// OpenAI-compatible chat completions endpoint — handles PEFT adapters,
+// applies the model's own chat template, and streams in standard SSE format.
+const API_URL = `https://api-inference.huggingface.co/models/${MODEL_ID}/v1/chat/completions`;
 app.use(express.json());
     return res.status(401).json({ error: "HF_TOKEN environment variable not set." });
   }
   try {
     const response = await fetch(API_URL, {
       method: 'POST',
         'Content-Type': 'application/json',
       },
       body: JSON.stringify({
+        model: MODEL_ID,
+        messages,
+        max_tokens: 1024,
         stream: true,
+        temperature: 0.7,
       }),
     });

src/App.tsx CHANGED Viewed

@@ -57,7 +57,8 @@ export default function App() {
             try {
               const parsed = JSON.parse(jsonStr);
-              const token = parsed.token?.text || '';
               if (token) {
                 setMessages((prev) => {

             try {
               const parsed = JSON.parse(jsonStr);
+              // OpenAI-compatible SSE: choices[0].delta.content
+              const token: string = parsed.choices?.[0]?.delta?.content ?? '';
               if (token) {
                 setMessages((prev) => {