oki692 committed on
Commit
6ca3422
·
verified ·
1 Parent(s): 55c5fe9

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +12 -0
  2. main.py +282 -0
  3. requirements.txt +5 -0
  4. system_prompts.py +15 -0
Dockerfile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ WORKDIR /app
4
+
5
+ COPY requirements.txt .
6
+ RUN pip install --no-cache-dir -r requirements.txt
7
+
8
+ COPY . .
9
+
10
+ EXPOSE 7860
11
+
12
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "2"]
main.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Multi-model AI gateway endpoint — HF Spaces compatible.
3
+ Authorization via 'connect' API key header.
4
+ Streaming always enabled. Function calling supported.
5
+ """
6
+
7
import asyncio
import hmac
import json
import os
from typing import AsyncGenerator, Optional

from fastapi import FastAPI, HTTPException, Header, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from openai import OpenAI
from pydantic import BaseModel, Field

from system_prompts import get_system_prompt
18
+
19
+ # ── Config ──────────────────────────────────────────────────────────────────
20
+
21
# Secrets are read from the environment (e.g. Space/container secrets) so that
# they never end up in version control. Any key that was previously committed
# to the repository should be treated as leaked and revoked.

# Bearer token clients must present; falls back to the historical default
# when the CONNECT_KEY environment variable is not set.
CONNECT_KEY = os.environ.get("CONNECT_KEY", "connect")

NVIDIA_BASE_URL = "https://integrate.api.nvidia.com/v1"
# Upstream API key; empty when NVIDIA_API_KEY is not set, in which case
# upstream calls are expected to be rejected — NOTE(review): confirm whether
# failing fast at startup is preferred.
NVIDIA_API_KEY = os.environ.get("NVIDIA_API_KEY", "")
25
+
26
def _thinking_disabled() -> dict:
    """Return a fresh extra-body payload that switches thinking off."""
    return {
        "chat_template_kwargs": {
            "enable_thinking": False,
            "clear_thinking": True,
        }
    }


# Model registry: gateway display name -> upstream model id plus the optional
# extra request body forwarded with every call for that model.
MODELS = {
    "Bielik-11b": {
        "model_id": "speakleash/bielik-11b-v2.6-instruct",
        "extra_body": _thinking_disabled(),
    },
    "GLM-4.7": {
        "model_id": "z-ai/glm4.7",
        "extra_body": _thinking_disabled(),
    },
    "Mistral-Small-4": {
        "model_id": "mistralai/mistral-small-4-119b-2603",
        "extra_body": {},
    },
    "DeepSeek-V3.1": {
        "model_id": "deepseek-ai/deepseek-v3.1",
        "extra_body": {},
    },
    "Kimi-K2": {
        "model_id": "moonshotai/kimi-k2-instruct",
        "extra_body": {},
    },
}
59
+
60
+ # ── FastAPI ──────────────────────────────────────────────────────────────────
61
+
62
# Single synchronous OpenAI-SDK client pointed at the NVIDIA endpoint; shared
# by every request handler in this module.
client = OpenAI(api_key=NVIDIA_API_KEY, base_url=NVIDIA_BASE_URL)

app = FastAPI(
    title="Multi-Model AI Gateway",
    description="Streaming endpoint for Bielik-11b, GLM-4.7, Mistral-Small-4, DeepSeek-V3.1, Kimi-K2",
    version="1.0.0",
)

# CORS is left fully open: any origin, method and header is accepted.
_ANY = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=_ANY,
    allow_methods=_ANY,
    allow_headers=_ANY,
)
76
+
77
+ # ── Auth ─────────────────────────────────────────────────────────────────────
78
+
79
def verify_key(authorization: Optional[str]) -> None:
    """Validate an ``Authorization: Bearer <token>`` header against CONNECT_KEY.

    Args:
        authorization: Raw value of the Authorization header, or None when
            the client did not send one.

    Raises:
        HTTPException: 401 when the header is missing or empty; 403 when the
            scheme is not ``bearer`` (case-insensitive) or the token does not
            match CONNECT_KEY.
    """
    if not authorization:
        raise HTTPException(status_code=401, detail="Missing Authorization header")

    scheme, _, token = authorization.partition(" ")
    # Compare as bytes in constant time: compare_digest rejects non-ASCII str
    # arguments, and a plain != leaks how many leading characters matched.
    token_ok = hmac.compare_digest(
        token.strip().encode("utf-8"),
        CONNECT_KEY.encode("utf-8"),
    )
    if scheme.lower() != "bearer" or not token_ok:
        raise HTTPException(status_code=403, detail="Invalid API key")
86
+
87
+ # ── Schemas ───────────────────────────────────────────────────────────────────
88
+
89
class Message(BaseModel):
    """A single chat message supplied by the client."""

    # Speaker role; accepted as an arbitrary string (no enum validation here).
    role: str
    # Either plain text or a list of parts for multipart content.
    content: str | list
92
+
93
class ToolFunction(BaseModel):
    """Function definition carried inside a tool entry."""

    # Name of the callable function.
    name: str
    # Optional human-readable description.
    description: str | None = None
    # Optional parameter specification, accepted as an arbitrary dict.
    parameters: dict | None = None
97
+
98
class Tool(BaseModel):
    """A tool offered to the model; wraps one function definition."""

    # Tool kind; defaults to "function".
    type: str = "function"
    # The wrapped function definition.
    function: ToolFunction
101
+
102
class ChatRequest(BaseModel):
    """Request body accepted by the ``/chat`` endpoint."""

    # Gateway display name of the target model (required).
    model: str = Field(
        ...,
        description="Model name: Bielik-11b | GLM-4.7 | Mistral-Small-4 | DeepSeek-V3.1 | Kimi-K2",
    )
    # Conversation history.
    messages: list[Message]

    # Function-calling options; omitted from the upstream call when unset.
    tools: list[Tool] | None = None
    tool_choice: str | dict | None = None

    # Sampling parameters; each is forwarded only when it is not None.
    temperature: float | None = None
    max_tokens: int | None = None
    top_p: float | None = None
    presence_penalty: float | None = None
    frequency_penalty: float | None = None

    # Gateway-specific switch for automatic system-prompt injection.
    inject_system_prompt: bool = Field(
        default=True,
        description="Prepend the model-specific system prompt automatically",
    )
116
+
117
+ # ── Stream helper ─────────────────────────────────────────────────────────────
118
+
119
async def stream_nvidia(
    model_name: str,
    messages: list[dict],
    tools: Optional[list[dict]],
    tool_choice,
    kwargs: dict,
    extra_body: dict,
) -> AsyncGenerator[str, None]:
    """Stream a chat completion from the upstream API as SSE ``data:`` lines.

    The module-level ``client`` is synchronous, so both the initial request
    and every subsequent chunk read are executed in the event loop's default
    executor. Iterating the stream directly inside this coroutine would block
    the event loop (and every other in-flight request) while waiting on the
    network.

    Args:
        model_name: Gateway display name; must be a key of ``MODELS``.
        messages: Chat messages forwarded as-is.
        tools: Tool definitions; forwarded only when truthy.
        tool_choice: Forwarded only when it is not None.
        kwargs: Extra completion parameters merged into the call; because
            they are merged last they can override the defaults set here.
        extra_body: Model-specific extra body; forwarded only when truthy.

    Yields:
        One ``data: <json>\\n\\n`` string per upstream chunk, followed by a
        final ``data: [DONE]\\n\\n`` marker.

    Raises:
        KeyError: If ``model_name`` is not registered in ``MODELS``.
    """
    params = {
        "model": MODELS[model_name]["model_id"],
        "messages": messages,
        "stream": True,  # always True
        **kwargs,
    }

    if tools:
        params["tools"] = tools
    if tool_choice is not None:
        params["tool_choice"] = tool_choice
    if extra_body:
        params["extra_body"] = extra_body

    # get_running_loop() is the supported call inside a coroutine;
    # get_event_loop() is deprecated in this context.
    loop = asyncio.get_running_loop()

    def _open_stream():
        return client.chat.completions.create(**params)

    stream = await loop.run_in_executor(None, _open_stream)

    chunks = iter(stream)
    # Sentinel default for next(): a StopIteration must not propagate out of
    # an executor future, so exhaustion is signalled by identity instead.
    exhausted = object()
    while True:
        chunk = await loop.run_in_executor(None, next, chunks, exhausted)
        if chunk is exhausted:
            break
        yield f"data: {json.dumps(chunk.model_dump())}\n\n"

    yield "data: [DONE]\n\n"
155
+
156
+ # ── Endpoints ─────────────────────────────────────────────────────────────────
157
+
158
@app.get("/")
async def root():
    """Return basic service metadata; no authentication is checked here."""
    available = list(MODELS)
    info = {
        "service": "Multi-Model AI Gateway",
        "models": available,
        "auth": "Bearer <connect-key>",
        "docs": "/docs",
    }
    return info
166
+
167
+
168
@app.get("/models")
async def list_models(authorization: Optional[str] = Header(default=None)):
    """List registered models after checking the bearer token.

    Returns a mapping of display name to the upstream ``model_id`` and a
    ``has_thinking`` flag, which is true whenever the model's ``extra_body``
    is non-empty.
    """
    verify_key(authorization)

    catalogue = {}
    for display_name, entry in MODELS.items():
        catalogue[display_name] = {
            "model_id": entry["model_id"],
            "has_thinking": bool(entry["extra_body"]),
        }
    return catalogue
178
+
179
+
180
@app.post("/chat")
async def chat(
    request: ChatRequest,
    authorization: Optional[str] = Header(default=None),
):
    """Validate a chat request and stream the upstream completion as SSE.

    Args:
        request: Parsed request body.
        authorization: Authorization header, checked by ``verify_key``.

    Returns:
        A ``text/event-stream`` StreamingResponse fed by ``stream_nvidia``.

    Raises:
        HTTPException: 401/403 from ``verify_key``; 400 when the requested
            model is not registered in ``MODELS``.
    """
    verify_key(authorization)

    cfg = MODELS.get(request.model)
    if cfg is None:
        raise HTTPException(
            status_code=400,
            detail=f"Unknown model '{request.model}'. Available: {list(MODELS.keys())}",
        )

    # Build messages list
    messages = [m.model_dump() for m in request.messages]

    # Inject the per-model system prompt at position 0 unless the caller
    # already supplied a leading system message.
    if request.inject_system_prompt:
        # get_system_prompt is called with the upstream model id from the
        # registry rather than the gateway display name the client sends.
        system_prompt = get_system_prompt(cfg["model_id"])
        has_system = bool(messages) and messages[0].get("role") == "system"
        # Skip injection when no prompt is registered, so a system message
        # with content None is never sent upstream.
        if system_prompt and not has_system:
            messages.insert(0, {"role": "system", "content": system_prompt})

    # Optional sampling params: forward only those the client actually set.
    optional_fields = (
        "temperature",
        "max_tokens",
        "top_p",
        "presence_penalty",
        "frequency_penalty",
    )
    kwargs = {
        name: value
        for name in optional_fields
        if (value := getattr(request, name)) is not None
    }

    # exclude_none drops unset description/parameters instead of sending
    # explicit nulls upstream.
    tools = (
        [t.model_dump(exclude_none=True) for t in request.tools]
        if request.tools
        else None
    )

    return StreamingResponse(
        stream_nvidia(
            model_name=request.model,
            messages=messages,
            tools=tools,
            tool_choice=request.tool_choice,
            kwargs=kwargs,
            extra_body=cfg["extra_body"],
        ),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "X-Accel-Buffering": "no",
        },
    )
228
+
229
+
230
+ # ── Compatibility: OpenAI-style /v1/chat/completions ──────────────────────────
231
+
232
@app.post("/v1/chat/completions")
async def openai_compat(
    raw: Request,
    authorization: Optional[str] = Header(default=None),
):
    """
    Drop-in OpenAI-compatible endpoint.
    Pass model as one of the gateway model names (e.g. 'Kimi-K2').

    The body is read as raw JSON rather than through a schema. The response
    is always a ``text/event-stream`` stream.

    Raises:
        HTTPException: 401/403 from ``verify_key``; 400 when the body is not
            a JSON object, ``messages`` is not a list, or the model is not
            registered in ``MODELS``.
    """
    verify_key(authorization)

    # Malformed input is a client error, not a server crash.
    try:
        body = await raw.json()
    except ValueError as exc:
        raise HTTPException(status_code=400, detail="Request body must be valid JSON") from exc
    if not isinstance(body, dict):
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")

    model_name = body.get("model", "")
    # The isinstance guard keeps unhashable JSON values (lists/objects) from
    # raising TypeError in the membership test.
    if not isinstance(model_name, str) or model_name not in MODELS:
        raise HTTPException(
            status_code=400,
            detail=f"Unknown model '{model_name}'. Available: {list(MODELS.keys())}",
        )

    cfg = MODELS[model_name]

    messages = body.get("messages", [])
    if not isinstance(messages, list):
        raise HTTPException(status_code=400, detail="'messages' must be a list")

    if body.get("inject_system_prompt", True):
        # get_system_prompt is called with the upstream model id from the
        # registry rather than the gateway display name the client sends.
        system_prompt = get_system_prompt(cfg["model_id"])
        first = messages[0] if messages else None
        has_system = isinstance(first, dict) and first.get("role") == "system"
        # Skip injection when no prompt is registered, so a system message
        # with content None is never sent upstream.
        if system_prompt and not has_system:
            messages.insert(0, {"role": "system", "content": system_prompt})

    # Forward only parameters that are present and non-null, matching /chat.
    optional_fields = (
        "temperature",
        "max_tokens",
        "top_p",
        "presence_penalty",
        "frequency_penalty",
    )
    kwargs = {
        name: body[name]
        for name in optional_fields
        if body.get(name) is not None
    }

    return StreamingResponse(
        stream_nvidia(
            model_name=model_name,
            messages=messages,
            tools=body.get("tools"),
            tool_choice=body.get("tool_choice"),
            kwargs=kwargs,
            extra_body=cfg["extra_body"],
        ),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "X-Accel-Buffering": "no",
        },
    )
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ fastapi>=0.111.0
2
+ uvicorn[standard]>=0.29.0
3
+ openai>=1.35.0
4
+ pydantic>=2.7.0
5
+ httpx>=0.27.0
system_prompts.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ SYSTEM_PROMPTS = {
2
+ "speakleash/bielik-11b-v2.6-instruct": """Jesteś Bielikiem – polskim asystentem AI stworzonym z myślą o użytkownikach mówiących po polsku. Komunikujesz się przede wszystkim w języku polskim, chyba że użytkownik wyraźnie poprosi o inny język. Jesteś precyzyjny, kulturalny i pomocny. Posiadasz szeroką wiedzę o polskiej kulturze, historii i realiach. Odpowiadasz zwięźle, ale wyczerpująco. Nie udajesz innego modelu – jesteś Bielikiem.""",
3
+
4
+ "z-ai/glm4.7": """You are GLM-4.7, a high-capability multilingual assistant developed by Zhipu AI. You excel at reasoning, coding, mathematics, and structured analysis. You think step by step when solving complex problems. You are direct, efficient, and technically precise. You do not over-explain simple things. When writing code, you always include brief inline comments for clarity. You respond in the same language the user writes in.""",
5
+
6
+ "mistralai/mistral-small-4-119b-2603": """You are Mistral, a fast and efficient AI assistant built for practical, real-world tasks. You are concise by default – you give short, sharp answers unless the user asks for depth. You excel at summarization, classification, drafting, and function calling. When tools are available, you proactively use them rather than guessing. You avoid unnecessary filler phrases. You match the user's language and tone.""",
7
+
8
+ "deepseek-ai/deepseek-v3.1": """You are DeepSeek, an advanced AI assistant with strong reasoning and coding abilities. You approach problems methodically: you break down complex questions, consider multiple angles, and provide well-structured responses. For technical tasks – especially code, math, and system design – you go deep and thorough. You are honest about uncertainty. When you don't know something, you say so clearly rather than speculating. You respond in the language the user uses.""",
9
+
10
+ "moonshotai/kimi-k2-instruct": """You are Kimi, an AI assistant by Moonshot AI with an exceptionally long context window and strong document understanding capabilities. You are especially good at reading, analyzing, and synthesizing large amounts of information. You are thoughtful, curious, and thorough. When given documents or long inputs, you summarize key points before diving into details. You communicate in a warm but professional tone. You respond in the language the user writes in.""",
11
+ }
12
+
13
+
14
def get_system_prompt(model_id: str) -> str | None:
    """Look up the system prompt stored under ``model_id``.

    ``SYSTEM_PROMPTS`` is keyed by full upstream model id (for example
    ``"z-ai/glm4.7"``). Returns None when no prompt is registered for the
    given key.
    """
    if model_id in SYSTEM_PROMPTS:
        return SYSTEM_PROMPTS[model_id]
    return None