Spaces:

bluewinliang
/

zai2api

Paused

App Files Files Community

bluewinliang committed on Aug 12, 2025

Commit

13cb7f1

verified ·

1 Parent(s): 916ea12

Upload proxy_handler.py

Browse files

Files changed (1) hide show

proxy_handler.py +55 -35

proxy_handler.py CHANGED Viewed

@@ -84,8 +84,38 @@ class ProxyHandler:
             comp_id = f"chatcmpl-{uuid.uuid4().hex[:29]}"
             think_open = False
             phase_cur = None
-            # FIX: Buffer to hold the LATEST (aggregate) thinking content.
-            last_thinking_content = ""
             async with self.client.stream("POST", settings.UPSTREAM_URL, json=body, headers=headers) as resp:
                 if resp.status_code != 200:
@@ -102,9 +132,7 @@ class ProxyHandler:
                         payload_str = line[6:]
                         if payload_str == '[DONE]':
-                            # If stream ends but </think> was not sent, send it now.
-                            if think_open:
-                                yield f"data: {json.dumps({'id': comp_id, 'object': 'chat.completion.chunk', 'created': int(time.time()), 'model': req.model, 'choices': [{'index': 0, 'delta': {'content': '</think>'}, 'finish_reason': None}]})}\n\n"
                             yield f"data: {json.dumps({'id': comp_id, 'object': 'chat.completion.chunk', 'created': int(time.time()), 'model': req.model, 'choices': [{'index': 0, 'delta': {}, 'finish_reason': 'stop'}]})}\n\n"; yield "data: [DONE]\n\n"; return
                         try:
                             dat = json.loads(payload_str).get("data", {})
@@ -114,33 +142,21 @@ class ProxyHandler:
                         if new_phase: phase_cur = new_phase
                         if not phase_cur: continue
-                        content = dat.get("edit_content") or dat.get("delta_content")
-                        if not content: continue
-                        # FIX: Implement "wait-then-flush" logic.
                         if phase_cur == "thinking":
-                            # Keep overwriting with the latest content. Only the final, aggregate version will be stored.
-                            last_thinking_content = content
-                        elif phase_cur == "answer":
-                            # When the first answer chunk arrives, flush the buffered thinking content.
-                            if last_thinking_content and settings.SHOW_THINK_TAGS and not think_open:
-                                cleaned_thought = self._clean_thinking_content(last_thinking_content)
-                                if cleaned_thought:
-                                    yield f"data: {json.dumps({'id': comp_id, 'object': 'chat.completion.chunk', 'created': int(time.time()), 'model': req.model, 'choices': [{'index': 0, 'delta': {'content': '<think>'}, 'finish_reason': None}]})}\n\n"
-                                    yield f"data: {json.dumps({'id': comp_id, 'object': 'chat.completion.chunk', 'created': int(time.time()), 'model': req.model, 'choices': [{'index': 0, 'delta': {'content': cleaned_thought}, 'finish_reason': None}]})}\n\n"
-                                think_open = True # Mark as flushed
-                            # Close the think tag if it's open.
-                            if think_open:
-                                yield f"data: {json.dumps({'id': comp_id, 'object': 'chat.completion.chunk', 'created': int(time.time()), 'model': req.model, 'choices': [{'index': 0, 'delta': {'content': '</think>'}, 'finish_reason': None}]})}\n\n"
-                                think_open = False # Ensure it's only closed once
-                            # Process and yield the answer content.
-                            cleaned_answer = self._clean_answer_content(content)
-                            if cleaned_answer:
-                                yield f"data: {json.dumps({'id': comp_id, 'object': 'chat.completion.chunk', 'created': int(time.time()), 'model': req.model, 'choices': [{'index': 0, 'delta': {'content': cleaned_answer}, 'finish_reason': None}]})}\n\n"
         except Exception:
             logger.exception("Stream error"); raise
@@ -148,6 +164,7 @@ class ProxyHandler:
         ck = None
         try:
             body, headers, ck = await self._prep_upstream(req)
             last_thinking_content = ""
             raw_answer_parts = []
             phase_cur = None
@@ -172,14 +189,17 @@ class ProxyHandler:
                         if new_phase: phase_cur = new_phase
                         if not phase_cur: continue
-                        content = dat.get("edit_content") or dat.get("delta_content")
-                        if not content: continue
-                        # This logic correctly captures the last thinking content and all answer parts.
                         if phase_cur == "thinking":
-                            last_thinking_content = content
                         elif phase_cur == "answer":
-                            raw_answer_parts.append(content)
                     else: continue
                     break

             comp_id = f"chatcmpl-{uuid.uuid4().hex[:29]}"
             think_open = False
             phase_cur = None
+            # Buffer for what has been sent to the client to calculate deltas.
+            yielded_think_buffer = ""
+            # Authoritative state of the current full thinking content.
+            current_thinking_content = ""
+            async def yield_delta_content(content_type: str, text: str):  # async generator: yields SSE "data: ..." chunk strings for one upstream update
+                nonlocal think_open, yielded_think_buffer  # tag state and sent-so-far buffer live in the enclosing scope
+                if content_type == "thinking" and settings.SHOW_THINK_TAGS:  # thinking text is emitted only when think tags are enabled
+                    if not think_open:  # first thinking chunk: emit the opening <think> tag once
+                        yield f"data: {json.dumps({'id': comp_id, 'object': 'chat.completion.chunk', 'created': int(time.time()), 'model': req.model, 'choices': [{'index': 0, 'delta': {'content': '<think>'}, 'finish_reason': None}]})}\n\n"
+                        think_open = True
+                    cleaned_full_text = self._clean_thinking_content(text)  # `text` is treated as the full thinking content so far, not a delta
+                    if cleaned_full_text.startswith(yielded_think_buffer):  # new text extends what was already sent
+                        delta_to_send = cleaned_full_text[len(yielded_think_buffer):]  # send only the unsent suffix
+                    else:  # no shared prefix with what was sent: resend the whole cleaned text
+                        delta_to_send = cleaned_full_text
+                    if delta_to_send:  # skip empty deltas
+                        yield f"data: {json.dumps({'id': comp_id, 'object': 'chat.completion.chunk', 'created': int(time.time()), 'model': req.model, 'choices': [{'index': 0, 'delta': {'content': delta_to_send}, 'finish_reason': None}]})}\n\n"
+                    yielded_think_buffer = cleaned_full_text  # remember the latest cleaned text for the next prefix comparison
+                elif content_type == "answer":
+                    if think_open:  # an answer chunk closes a still-open <think> tag before any answer text
+                        yield f"data: {json.dumps({'id': comp_id, 'object': 'chat.completion.chunk', 'created': int(time.time()), 'model': req.model, 'choices': [{'index': 0, 'delta': {'content': '</think>'}, 'finish_reason': None}]})}\n\n"
+                        think_open = False
+                    cleaned_text = self._clean_answer_content(text)  # answer `text` is forwarded as-is after cleaning (no prefix diffing)
+                    if cleaned_text:  # skip chunks that are empty after cleaning
+                        yield f"data: {json.dumps({'id': comp_id, 'object': 'chat.completion.chunk', 'created': int(time.time()), 'model': req.model, 'choices': [{'index': 0, 'delta': {'content': cleaned_text}, 'finish_reason': None}]})}\n\n"
             async with self.client.stream("POST", settings.UPSTREAM_URL, json=body, headers=headers) as resp:
                 if resp.status_code != 200:
                         payload_str = line[6:]
                         if payload_str == '[DONE]':
+                            if think_open: yield f"data: {json.dumps({'id': comp_id, 'object': 'chat.completion.chunk', 'created': int(time.time()), 'model': req.model, 'choices': [{'index': 0, 'delta': {'content': '</think>'}, 'finish_reason': None}]})}\n\n"
                             yield f"data: {json.dumps({'id': comp_id, 'object': 'chat.completion.chunk', 'created': int(time.time()), 'model': req.model, 'choices': [{'index': 0, 'delta': {}, 'finish_reason': 'stop'}]})}\n\n"; yield "data: [DONE]\n\n"; return
                         try:
                             dat = json.loads(payload_str).get("data", {})
                         if new_phase: phase_cur = new_phase
                         if not phase_cur: continue
+                        # Correctly handle delta_content (append) vs edit_content (replace).
                         if phase_cur == "thinking":
+                            if "edit_content" in dat and dat["edit_content"] is not None:
+                                current_thinking_content = dat["edit_content"]
+                            elif "delta_content" in dat and dat["delta_content"] is not None:
+                                current_thinking_content += dat["delta_content"]
+                            async for item in yield_delta_content("thinking", current_thinking_content):
+                                yield item
+                        elif phase_cur == "answer":
+                            content = dat.get("delta_content") or dat.get("edit_content")
+                            if content:
+                                async for item in yield_delta_content("answer", content):
+                                    yield item
         except Exception:
             logger.exception("Stream error"); raise
         ck = None
         try:
             body, headers, ck = await self._prep_upstream(req)
+            # Use a single string to hold the latest full thinking content.
             last_thinking_content = ""
             raw_answer_parts = []
             phase_cur = None
                         if new_phase: phase_cur = new_phase
                         if not phase_cur: continue
                         if phase_cur == "thinking":
+                            if "edit_content" in dat and dat["edit_content"] is not None:
+                                last_thinking_content = dat["edit_content"]
+                            elif "delta_content" in dat and dat["delta_content"] is not None:
+                                # In non-streaming mode only the final, complete thinking text is needed.
+                                # 'edit_content' usually carries it; accumulating 'delta_content' is the fallback.
+                                last_thinking_content += dat["delta_content"]
                         elif phase_cur == "answer":
+                            content = dat.get("delta_content") or dat.get("edit_content")
+                            if content:
+                                raw_answer_parts.append(content)
                     else: continue
                     break