Spaces:

Vishinka
/

Code_LLM

Sleeping

App Files Files Community

AnatoliiG committed on Jan 17

Commit

1f21c8c

1 Parent(s): 19b570f

Update app.py

Browse files

Files changed (1) hide show

app.py +103 -0

app.py CHANGED Viewed

	@@ -0,0 +1,103 @@

+import json
+import gradio as gr
+import uvicorn
+from fastapi import FastAPI, Request
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse, RedirectResponse, StreamingResponse
+from gradio import mount_gradio_app
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
+# Hugging Face Hub coordinates of the quantized GGUF weights this app serves.
+REPO_ID = "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF"
+FILENAME = "qwen2.5-coder-7b-instruct-q5_k_m.gguf"
+
+# Resolve the weights file through the Hub client; the result is handed to
+# Llama as its model path.
+print(f"Loading model {REPO_ID}...")
+model_path = hf_hub_download(filename=FILENAME, repo_id=REPO_ID)
+
+# A single module-level Llama instance, created at import time and shared by
+# the HTTP endpoint and the Gradio callback defined later in this file.
+print("Initializing Llama...")
+llm = Llama(model_path=model_path, n_ctx=8192, n_threads=2, verbose=False)
+# ASGI application that exposes the JSON API; the Gradio UI is mounted onto it
+# further down in this file.
+app = FastAPI()
+
+# Allow cross-origin calls from any site. Credentials are disabled on purpose:
+# a wildcard origin must not be combined with credentialed requests, and no
+# cookie- or session-based authentication is visible in this file that would
+# need them.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=False,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+def _invalid_request(message):
+    """Build an HTTP 400 JSON response whose error body carries *message*."""
+    return JSONResponse(
+        status_code=400,
+        content={"error": {"message": message, "type": "invalid_request_error"}},
+    )
+
+
+@app.post("/v1/chat/completions")
+async def chat_completions(request: Request):
+    """Serve a chat completion from the shared ``llm`` instance.
+
+    The JSON request body must be an object with a non-empty ``messages``
+    list. It may also carry ``stream`` (default ``False``), ``temperature``
+    (default ``0.2``) and ``max_tokens`` (default ``2048``).
+
+    Returns:
+        A ``StreamingResponse`` of server-sent events ending with
+        ``data: [DONE]`` when ``stream`` is truthy, otherwise a
+        ``JSONResponse`` wrapping the completion. A malformed body yields an
+        HTTP 400 JSON error instead of an unhandled exception.
+    """
+    try:
+        data = await request.json()
+    except ValueError:
+        # json.JSONDecodeError and UnicodeDecodeError both derive from ValueError.
+        return _invalid_request("Request body must be valid JSON.")
+    if not isinstance(data, dict):
+        return _invalid_request("Request body must be a JSON object.")
+
+    messages = data.get("messages")
+    if not isinstance(messages, list) or not messages:
+        return _invalid_request("'messages' must be a non-empty list.")
+
+    stream = bool(data.get("stream", False))
+    temperature = data.get("temperature", 0.2)
+    max_tokens = data.get("max_tokens", 2048)
+
+    # NOTE(review): this is a synchronous call made from a coroutine, so a
+    # non-streaming completion presumably occupies the event loop until it
+    # finishes - confirm whether it should be moved to a worker thread.
+    output = llm.create_chat_completion(
+        messages=messages, max_tokens=max_tokens, temperature=temperature, stream=stream
+    )
+    if not stream:
+        return JSONResponse(content=output)
+
+    def iter_content():
+        try:
+            for chunk in output:
+                yield f"data: {json.dumps(chunk)}\n\n"
+        except Exception as e:
+            # Log the failure and tell the client, then fall through so the
+            # stream is still terminated with the [DONE] sentinel.
+            print(f"Streaming error: {e}")
+            failure = {"error": {"message": "Generation failed", "type": "server_error"}}
+            yield f"data: {json.dumps(failure)}\n\n"
+        # Deliberately not in a ``finally``: yielding while the generator is
+        # being closed (client disconnect) raises RuntimeError.
+        yield "data: [DONE]\n\n"
+
+    return StreamingResponse(
+        iter_content(),
+        media_type="text/event-stream",
+        headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
+    )
+def gradio_interface(message, history):
+    """Stream the assistant's reply for the Gradio chat UI.
+
+    Args:
+        message: The user's latest message.
+        history: Earlier turns, given either as ``(user, assistant)`` pairs
+            or as ``{"role": ..., "content": ...}`` dicts. ``None`` or an
+            empty list means there is no prior conversation.
+
+    Yields:
+        The reply text accumulated so far, once for every generated chunk
+        that carries content.
+    """
+    messages = [{"role": "system", "content": "You are an expert coding assistant."}]
+    for turn in history or []:
+        if isinstance(turn, dict):
+            # Dict-style history: unpacking a dict would iterate its keys,
+            # so read the fields explicitly and keep only plain text turns.
+            role = turn.get("role")
+            content = turn.get("content")
+            if role in ("user", "assistant") and isinstance(content, str):
+                messages.append({"role": role, "content": content})
+            continue
+        user_text, assistant_text = turn
+        # Either half of a pair may be missing; never send None as content.
+        if user_text is not None:
+            messages.append({"role": "user", "content": user_text})
+        if assistant_text is not None:
+            messages.append({"role": "assistant", "content": assistant_text})
+    messages.append({"role": "user", "content": message})
+
+    response_stream = llm.create_chat_completion(
+        messages=messages, max_tokens=2048, temperature=0.4, stream=True
+    )
+    partial_text = ""
+    for chunk in response_stream:
+        # Some deltas carry no "content" key (or a null one); skip those.
+        piece = chunk["choices"][0]["delta"].get("content")
+        if piece is not None:
+            partial_text += piece
+            yield partial_text
+# Browser chat UI whose replies come from gradio_interface.
+demo = gr.ChatInterface(
+    gradio_interface,
+    description="API endpoint: /v1/chat/completions",
+    title="Qwen 2.5 Coder API",
+)
+
+# Attach the UI to the FastAPI app under /ui and keep using the returned app.
+app = mount_gradio_app(app=app, blocks=demo, path="/ui")
+@app.get("/")
+async def root():
+    """Redirect requests for the site root to the ``/ui`` path."""
+    ui_redirect = RedirectResponse(url="/ui")
+    return ui_redirect
+# Start the server only when this file is executed as a script; it listens on
+# all interfaces at port 7860.
+if __name__ == "__main__":
+    uvicorn.run(app, port=7860, host="0.0.0.0")