Spaces:

Vishinka
/

Code_LLM

Sleeping

App Files Files Community

AnatoliiG committed on Jan 18

Commit

afbbaeb

1 Parent(s): c65ca6d

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -16

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import json
 import uvicorn
@@ -11,7 +12,8 @@ from model import engine
 from ui import create_ui
 from utils import get_clean_text
-# --- FastAPI Setup ---
 app = FastAPI()
 app.add_middleware(
     CORSMiddleware,
@@ -22,9 +24,11 @@ app.add_middleware(
 )
-# --- API Endpoints ---
 @app.post("/v1/chat/completions")
 async def chat_completions(request: Request):
     if not engine.llm:
         return JSONResponse(content={"error": "Model not loaded"}, status_code=500)
@@ -45,22 +49,30 @@ async def chat_completions(request: Request):
         temperature = data.get("temperature", config.DEFAULT_TEMP)
         max_tokens = data.get("max_tokens", config.DEFAULT_MAX_TOKENS)
-        output = engine.generate(
-            messages=messages,
-            max_tokens=max_tokens,
-            temperature=temperature,
-            stream=stream,
-        )
         if stream:
-            def iter_content():
-                for chunk in output:
-                    yield f"data: {json.dumps(chunk)}\n\n"
-                yield "data: [DONE]\n\n"
             return StreamingResponse(
-                iter_content(),
                 media_type="text/event-stream",
                 headers={
                     "Cache-Control": "no-cache",
@@ -68,9 +80,20 @@ async def chat_completions(request: Request):
                     "X-Accel-Buffering": "no",
                 },
             )
-        return JSONResponse(content=output)
     except Exception as e:
         return JSONResponse(content={"error": str(e)}, status_code=500)

+import asyncio  # <--- Добавили импорт
 import json
 import uvicorn
 from ui import create_ui
 from utils import get_clean_text
+model_lock = asyncio.Lock()
 app = FastAPI()
 app.add_middleware(
     CORSMiddleware,
 )
 @app.post("/v1/chat/completions")
 async def chat_completions(request: Request):
+    if model_lock.locked():
+        pass
     if not engine.llm:
         return JSONResponse(content={"error": "Model not loaded"}, status_code=500)
         temperature = data.get("temperature", config.DEFAULT_TEMP)
         max_tokens = data.get("max_tokens", config.DEFAULT_MAX_TOKENS)
+        async def iter_content_locked():
+            async with model_lock:
+                try:
+                    output = engine.generate(
+                        messages=messages,
+                        max_tokens=max_tokens,
+                        temperature=temperature,
+                        stream=True,
+                    )
+                    for chunk in output:
+                        if "model" not in chunk:
+                            chunk["model"] = config.REPO_ID
+                        yield f"data: {json.dumps(chunk)}\n\n"
+                        await asyncio.sleep(0)
+                    yield "data: [DONE]\n\n"
+                except Exception as e:
+                    print(f"Streaming error: {e}")
+                    yield f"data: {json.dumps({'error': str(e)})}\n\n"
         if stream:
             return StreamingResponse(
+                iter_content_locked(),
                 media_type="text/event-stream",
                 headers={
                     "Cache-Control": "no-cache",
                     "X-Accel-Buffering": "no",
                 },
             )
+        else:
+            async with model_lock:
+                output = engine.generate(
+                    messages=messages,
+                    max_tokens=max_tokens,
+                    temperature=temperature,
+                    stream=False,
+                )
+            return JSONResponse(content=output)
     except Exception as e:
+        import traceback
+        traceback.print_exc()
         return JSONResponse(content={"error": str(e)}, status_code=500)