Spaces:

wynai
/

gptoss

Sleeping

App Files Files Community

wynai committed on Aug 14, 2025

Commit

8e8bc41

verified ·

1 Parent(s): 1f185a0

Update main.py

Browse files

Files changed (1) hide show

main.py +78 -88

main.py CHANGED Viewed

@@ -3,7 +3,6 @@ import json
 import uuid
 import base64
 import time
-import asyncio
 from fastapi import FastAPI, Request
 from fastapi.responses import StreamingResponse
 from typing import AsyncGenerator, Dict, Any
@@ -14,23 +13,18 @@ GPT_OSS_URL = "https://api.gpt-oss.com/chatkit"
 AVAILABLE_MODELS = ["gpt-oss-20b", "gpt-oss-120b"]
 def get_cookie():
-    # Tạo user_id ngẫu nhiên mỗi lần gọi
     user_id = str(uuid.uuid4())
-    # Sinh session "giả lập" ngẫu nhiên, encode dạng base64 urlsafe
     session_payload = {
         "hf_access_token": f"hf_oauth_{uuid.uuid4().hex}",
-        "hf_exp": time.time() + 3600,  # hết hạn sau 1 giờ
-        "hf_username": f"user_{uuid.uuid4().hex[:8]}"
     }
     session_json = json.dumps(session_payload, separators=(",", ":"))
     session_b64 = base64.urlsafe_b64encode(session_json.encode()).decode()
-    # Trả về cookie theo định dạng cũ nhưng là dữ liệu random
     return f"user_id={user_id}; session={session_b64}"
 def get_headers(model: str):
-    headers = {
         "authority": "api.gpt-oss.com",
         "accept": "text/event-stream",
         "accept-language": "vi-VN,vi;q=0.9,fr-FR;q=0.8,fr;q=0.7,en-US;q=0.6,en;q=0.5",
@@ -47,9 +41,8 @@ def get_headers(model: str):
         "user-agent": "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Mobile Safari/537.36",
         "x-reasoning-effort": "high",
         "x-selected-model": model,
-        "x-show-reasoning": "true"
     }
-    return headers
 def build_prompt(messages: list):
     prompt = ""
@@ -65,18 +58,44 @@ def build_prompt(messages: list):
     return prompt.strip()
 def build_payload(prompt: str):
-    payload = {
         "op": "threads.create",
         "params": {
             "input": {
                 "text": prompt,
                 "content": [{"type": "input_text", "text": prompt}],
                 "quoted_text": "",
-                "attachments": []
             }
-        }
     }
-    return payload
 async def stream_gpt_oss_response(request_data: Dict[str, Any]) -> AsyncGenerator[str, None]:
     model = request_data["model"]
@@ -85,10 +104,9 @@ async def stream_gpt_oss_response(request_data: Dict[str, Any]) -> AsyncGenerato
     payload = build_payload(prompt)
     thoughts = []
-    text_buffer = ""  # Buffer tích lũy toàn bộ text
-    words_sent = 0  # Số từ đã gửi
     in_assistant = False
-    think_sent = False
     try:
         with requests.post(GPT_OSS_URL, headers=headers, data=json.dumps(payload), stream=True, timeout=120) as r:
@@ -100,76 +118,46 @@ async def stream_gpt_oss_response(request_data: Dict[str, Any]) -> AsyncGenerato
                     data = json.loads(line[5:].strip())
                     event_type = data.get("type")
-                    # Thu thập CoT (thought/recap) như cũ
-                    if event_type == "thread.item_updated" and "update" in data and data["update"].get("type") == "cot.entry_added":
                         entry = data["update"]["entry"]
                         if entry["type"] in ["thought", "recap"]:
                             thoughts.append(entry["content"])
-                    # Khi CoT hoàn tất, gửi một lần
                     elif event_type == "thread.item_done" and data["item"].get("type") == "cot":
                         think_str = " ".join(thoughts).strip()
-                        if think_str and not think_sent:
-                            chunk = {"choices": [{"delta": {"content": f"<think>{think_str}</think> "}, "index": 0, "finish_reason": None}]}
-                            yield f"data: {json.dumps(chunk)}\n\n"
-                            think_sent = True
-                    # Bắt đầu phase assistant message
                     elif event_type == "thread.item_added" and data["item"].get("type") == "assistant_message":
                         in_assistant = True
-                    # Tích lũy text và gửi từng từ
-                    elif event_type == "thread.item_updated" and "update" in data and data["update"].get("type") == "assistant_message.content_part.text_delta":
                         if in_assistant:
                             delta = data["update"]["delta"]
-                            text_buffer += delta
-                            # Tách thành các từ (bao gồm khoảng trắng)
-                            import re
-                            words = re.findall(r'\S+|\s+', text_buffer)
-                            # Gửi các từ mới (chưa được gửi)
-                            while words_sent < len(words):
-                                word = words[words_sent]
-                                # Chỉ gửi từ thực sự (không phải khoảng trắng đơn thuần)
-                                # hoặc nếu là từ cuối cùng trong list hiện tại
-                                if word.strip() or words_sent == len(words) - 1:
-                                    # Nếu là từ thực sự, gửi kèm khoảng trắng tiếp theo nếu có
-                                    if word.strip() and words_sent + 1 < len(words) and words[words_sent + 1].isspace():
-                                        word_to_send = word + words[words_sent + 1]
-                                        words_sent += 2
-                                    else:
-                                        word_to_send = word
-                                        words_sent += 1
-                                    chunk = {"choices": [{"delta": {"content": word_to_send}, "index": 0, "finish_reason": None}]}
-                                    yield f"data: {json.dumps(chunk)}\n\n"
-                                    # Delay giữa các từ
-                                    await asyncio.sleep(0.1)
-                                else:
-                                    words_sent += 1
-                    # Kết thúc assistant message
                     elif event_type == "thread.item_done" and data["item"].get("type") == "assistant_message":
-                        # Gửi bất kỳ từ cuối cùng nào còn lại
-                        import re
-                        words = re.findall(r'\S+|\s+', text_buffer)
-                        while words_sent < len(words):
-                            word = words[words_sent]
-                            if word.strip():  # Chỉ gửi từ có nội dung
-                                chunk = {"choices": [{"delta": {"content": word}, "index": 0, "finish_reason": None}]}
-                                yield f"data: {json.dumps(chunk)}\n\n"
-                            words_sent += 1
                         yield "data: [DONE]\n\n"
                         break
-                except Exception as e:
                     continue
     except Exception as e:
-        yield f"data: {json.dumps({'error': str(e)})}\n\n"
-# Xóa function không cần thiết
 @app.post("/v1/chat/completions")
 async def chat_completions(request: Request):
@@ -179,17 +167,24 @@ async def chat_completions(request: Request):
         return {"error": f"Model must be one of {AVAILABLE_MODELS}"}, 400
     stream = request_data.get("stream", False)
     if stream:
-        return StreamingResponse(stream_gpt_oss_response(request_data), media_type="text/event-stream")
     else:
-        # Non-stream: thu gom toàn bộ response
         headers = get_headers(model)
         prompt = build_prompt(request_data["messages"])
         payload = build_payload(prompt)
-        thoughts = []
-        content = ""
         try:
             with requests.post(GPT_OSS_URL, headers=headers, data=json.dumps(payload), stream=True, timeout=120) as r:
@@ -200,16 +195,12 @@ async def chat_completions(request: Request):
                     try:
                         data = json.loads(line[5:].strip())
                         event_type = data.get("type")
-                        if event_type == "thread.item_updated" and "update" in data and data["update"].get("type") == "cot.entry_added":
                             entry = data["update"]["entry"]
                             if entry["type"] in ["thought", "recap"]:
                                 thoughts.append(entry["content"])
-                        elif event_type == "thread.item_updated" and "update" in data and data["update"].get("type") == "assistant_message.content_part.text_delta":
-                            delta = data["update"]["delta"]
-                            content += delta
                         elif event_type == "thread.item_done" and data["item"].get("type") == "assistant_message":
                             break
                     except Exception:
@@ -218,16 +209,15 @@ async def chat_completions(request: Request):
             return {"error": str(e)}
         think_str = " ".join(thoughts).strip()
-        full_content = f"<think>{think_str}</think> {content}".strip() if think_str else content.strip()
-        response = {
             "choices": [{
                 "message": {"content": full_content},
                 "index": 0,
                 "finish_reason": "stop"
             }]
         }
-        return response
 if __name__ == "__main__":
     import uvicorn

 import uuid
 import base64
 import time
 from fastapi import FastAPI, Request
 from fastapi.responses import StreamingResponse
 from typing import AsyncGenerator, Dict, Any
 AVAILABLE_MODELS = ["gpt-oss-20b", "gpt-oss-120b"]
 def get_cookie():
     user_id = str(uuid.uuid4())
     session_payload = {
         "hf_access_token": f"hf_oauth_{uuid.uuid4().hex}",
+        "hf_exp": time.time() + 3600,
+        "hf_username": f"user_{uuid.uuid4().hex[:8]}",
     }
     session_json = json.dumps(session_payload, separators=(",", ":"))
     session_b64 = base64.urlsafe_b64encode(session_json.encode()).decode()
     return f"user_id={user_id}; session={session_b64}"
 def get_headers(model: str):
+    return {
         "authority": "api.gpt-oss.com",
         "accept": "text/event-stream",
         "accept-language": "vi-VN,vi;q=0.9,fr-FR;q=0.8,fr;q=0.7,en-US;q=0.6,en;q=0.5",
         "user-agent": "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Mobile Safari/537.36",
         "x-reasoning-effort": "high",
         "x-selected-model": model,
+        "x-show-reasoning": "true",
     }
 def build_prompt(messages: list):
     prompt = ""
     return prompt.strip()
 def build_payload(prompt: str):
+    return {
         "op": "threads.create",
         "params": {
             "input": {
                 "text": prompt,
                 "content": [{"type": "input_text", "text": prompt}],
                 "quoted_text": "",
+                "attachments": [],
             }
+        },
+    }
+def _openai_chunk(model: str, delta_content: str = "", role: str = None, finish: str = None):
+    """
+    Trả về 1 chunk theo định dạng OpenAI stream.
+    - role: chỉ gửi 1 lần đầu ("assistant")
+    - finish: "stop" khi kết thúc
+    """
+    obj = {
+        "id": f"chatcmpl-{uuid.uuid4().hex}",
+        "object": "chat.completion.chunk",
+        "created": int(time.time()),
+        "model": model,
+        "choices": [
+            {
+                "index": 0,
+                "delta": {},
+                "finish_reason": None,
+            }
+        ],
     }
+    if role is not None:
+        obj["choices"][0]["delta"]["role"] = role
+    if delta_content:
+        obj["choices"][0]["delta"]["content"] = delta_content
+    if finish is not None:
+        obj["choices"][0]["finish_reason"] = finish
+    return f"data: {json.dumps(obj, ensure_ascii=False)}\n\n"
 async def stream_gpt_oss_response(request_data: Dict[str, Any]) -> AsyncGenerator[str, None]:
     model = request_data["model"]
     payload = build_payload(prompt)
     thoughts = []
     in_assistant = False
+    role_sent = False
+    think_buffer = None  # giữ CoT để gửi ngay trước content
     try:
         with requests.post(GPT_OSS_URL, headers=headers, data=json.dumps(payload), stream=True, timeout=120) as r:
                     data = json.loads(line[5:].strip())
                     event_type = data.get("type")
+                    # Thu thập CoT
+                    if event_type == "thread.item_updated" and data.get("update", {}).get("type") == "cot.entry_added":
                         entry = data["update"]["entry"]
                         if entry["type"] in ["thought", "recap"]:
                             thoughts.append(entry["content"])
                     elif event_type == "thread.item_done" and data["item"].get("type") == "cot":
                         think_str = " ".join(thoughts).strip()
+                        if think_str:
+                            think_buffer = f"<think>{think_str}</think> "
                     elif event_type == "thread.item_added" and data["item"].get("type") == "assistant_message":
                         in_assistant = True
+                        # gửi role 1 lần
+                        if not role_sent:
+                            yield _openai_chunk(model, role="assistant")
+                            role_sent = True
+                        # nếu có CoT, đẩy ra trước khi stream chữ
+                        if think_buffer:
+                            for ch in think_buffer:
+                                yield _openai_chunk(model, delta_content=ch)
+                            think_buffer = None
+                    elif event_type == "thread.item_updated" and data.get("update", {}).get("type") == "assistant_message.content_part.text_delta":
                         if in_assistant:
                             delta = data["update"]["delta"]
+                            # stream từng KÝ TỰ theo chuẩn OpenAI
+                            for ch in delta:
+                                yield _openai_chunk(model, delta_content=ch)
                     elif event_type == "thread.item_done" and data["item"].get("type") == "assistant_message":
+                        # chunk kết thúc
+                        yield _openai_chunk(model, finish="stop")
                         yield "data: [DONE]\n\n"
                         break
+                except Exception:
                     continue
     except Exception as e:
+        err = {"error": str(e)}
+        yield f"data: {json.dumps(err, ensure_ascii=False)}\n\n"
 @app.post("/v1/chat/completions")
 async def chat_completions(request: Request):
         return {"error": f"Model must be one of {AVAILABLE_MODELS}"}, 400
     stream = request_data.get("stream", False)
     if stream:
+        # Thêm header giúp proxy không buffer
+        return StreamingResponse(
+            stream_gpt_oss_response(request_data),
+            media_type="text/event-stream",
+            headers={
+                "Cache-Control": "no-cache",
+                "X-Accel-Buffering": "no",
+                "Connection": "keep-alive",
+            },
+        )
     else:
+        # Non-stream (giữ nguyên)
         headers = get_headers(model)
         prompt = build_prompt(request_data["messages"])
         payload = build_payload(prompt)
+        thoughts, content = [], ""
         try:
             with requests.post(GPT_OSS_URL, headers=headers, data=json.dumps(payload), stream=True, timeout=120) as r:
                     try:
                         data = json.loads(line[5:].strip())
                         event_type = data.get("type")
+                        if event_type == "thread.item_updated" and data.get("update", {}).get("type") == "cot.entry_added":
                             entry = data["update"]["entry"]
                             if entry["type"] in ["thought", "recap"]:
                                 thoughts.append(entry["content"])
+                        elif event_type == "thread.item_updated" and data.get("update", {}).get("type") == "assistant_message.content_part.text_delta":
+                            content += data["update"]["delta"]
                         elif event_type == "thread.item_done" and data["item"].get("type") == "assistant_message":
                             break
                     except Exception:
             return {"error": str(e)}
         think_str = " ".join(thoughts).strip()
+        full_content = (f"<think>{think_str}</think> " if think_str else "") + content.strip()
+        return {
             "choices": [{
                 "message": {"content": full_content},
                 "index": 0,
                 "finish_reason": "stop"
             }]
         }
 if __name__ == "__main__":
     import uvicorn