oki692 committed on
Commit
f2f6faa
·
verified ·
1 Parent(s): 7ff2724

Upload 2 files

Browse files
Files changed (2) hide show
  1. entrypoint.sh +42 -0
  2. proxy.py +167 -0
entrypoint.sh ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
# Entrypoint for the HF Space: validate configuration, start the local
# Ollama server, pull the configured model, then exec the FastAPI proxy
# (proxy.py) on port 7860.
set -e

# Validate required environment variables (set in HF Space Settings -> Variables).
if [ -z "${MODEL}" ] || [ "${MODEL}" = "!TU MUSISZ EDYTOWAC!" ]; then
    echo "BLAD: Zmienna MODEL nie jest ustawiona!"
    echo "Ustaw ja w: HF Space Settings -> Variables -> MODEL"
    echo "Przyklad: deepseek-r1:14b"
    exit 1
fi

if [ -z "${API_KEY}" ] || [ "${API_KEY}" = "!TU MUSISZ EDYTOWAC!" ]; then
    echo "BLAD: Zmienna API_KEY nie jest ustawiona!"
    echo "Ustaw ja w: HF Space Settings -> Variables -> API_KEY"
    exit 1
fi

export OLLAMA_HOST=127.0.0.1:11434
export OLLAMA_NUM_PARALLEL=2
export OLLAMA_MAX_LOADED_MODELS=1

echo "==> Model: ${MODEL}"
echo "==> Starting Ollama..."
ollama serve &

echo "==> Waiting for Ollama..."
ready=0
for i in $(seq 1 30); do
    if curl -sf http://127.0.0.1:11434/api/version > /dev/null 2>&1; then
        echo "==> Ollama ready!"
        ready=1
        break
    fi
    echo "    Waiting... ($i/30)"
    sleep 2
done

# Fail fast instead of attempting to pull against a server that never came up.
if [ "${ready}" -ne 1 ]; then
    echo "ERROR: Ollama did not become ready within 60s" >&2
    exit 1
fi

echo "==> Pulling ${MODEL}..."
# Quote MODEL so model names survive word splitting/globbing,
# e.g. "hf.co/unsloth/GLM-4.7-Flash-GGUF:UD-TQ1_0".
ollama pull "${MODEL}"

echo "==> Starting proxy on :7860..."
# Start proxy with increased keep-alive timeout for long streaming responses.
exec uvicorn proxy:app --host 0.0.0.0 --port 7860 --workers 4 --timeout-keep-alive 600
proxy.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
from fastapi import FastAPI, Request, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
import httpx
import json
import time
import uuid

# FastAPI app exposing both Ollama-native (/api/...) and OpenAI-compatible
# (/v1/...) endpoints that proxy to a local Ollama server, guarded by a
# bearer-token API key.
app = FastAPI()
security = HTTPBearer()

# Runtime configuration from environment variables; the Polish placeholder
# value means "not configured yet" (set in HF Space Settings -> Variables).
API_KEY = os.environ.get("API_KEY", "!TU MUSISZ EDYTOWAC!")
MODEL = os.environ.get("MODEL", "!TU MUSISZ EDYTOWAC!")
OLLAMA_BASE = "http://127.0.0.1:11434"

# Fail fast at import time if either setting is still the placeholder.
if "!TU MUSISZ EDYTOWAC!" in (API_KEY, MODEL):
    raise RuntimeError("Ustaw zmienne API_KEY i MODEL w HF Space Settings -> Variables")
def verify_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """FastAPI dependency: accept the request only when the bearer token equals API_KEY."""
    token = credentials.credentials
    if token == API_KEY:
        return token
    raise HTTPException(status_code=401, detail="Invalid API key")
# --- Ollama Compatible Endpoints (Directly at /api/...) ---

@app.post("/api/chat")
async def ollama_chat(request: Request, key: str = Depends(verify_key)):
    """Proxy Ollama's native /api/chat as a streaming NDJSON response."""
    body = await request.json()
    # Force streaming and pin the served model, whatever the client asked for.
    body["stream"] = True
    body["model"] = MODEL

    async def relay():
        async with httpx.AsyncClient(timeout=600.0) as client:
            async with client.stream("POST", f"{OLLAMA_BASE}/api/chat", json=body) as resp:
                # Ollama's native chat API already includes 'thinking' if supported.
                async for raw in resp.aiter_lines():
                    if raw:
                        yield raw + "\n"

    return StreamingResponse(relay(), media_type="application/x-ndjson")
43
+
@app.post("/api/generate")
async def ollama_generate(request: Request, key: str = Depends(verify_key)):
    """Proxy Ollama's native /api/generate as a streaming NDJSON response."""
    body = await request.json()
    # Force streaming and pin the served model, whatever the client asked for.
    body["stream"] = True
    body["model"] = MODEL

    async def relay():
        async with httpx.AsyncClient(timeout=600.0) as client:
            async with client.stream("POST", f"{OLLAMA_BASE}/api/generate", json=body) as resp:
                async for raw in resp.aiter_lines():
                    if raw:
                        yield raw + "\n"

    return StreamingResponse(relay(), media_type="application/x-ndjson")
58
+
@app.get("/api/tags")
async def ollama_tags(key: str = Depends(verify_key)):
    """Advertise a single synthetic model entry so Ollama clients can discover MODEL.

    Size/digest/details are placeholders — the proxy does not inspect the
    actual model file.
    """
    details = {
        "parent_model": "",
        "format": "gguf",
        "family": "llama",
        "families": ["llama"],
        "parameter_size": "unknown",
        "quantization_level": "unknown",
    }
    entry = {
        "name": MODEL,
        "model": MODEL,
        "modified_at": "2024-01-01T00:00:00Z",
        "size": 0,
        "digest": "sha256:0000000000000000000000000000000000000000000000000000000000000000",
        "details": details,
    }
    return {"models": [entry]}
78
+
@app.get("/api/version")
async def ollama_version():
    """Unauthenticated version probe (lets clients detect an Ollama-style server)."""
    version_info = {"version": "0.1.0-proxy"}
    return version_info
82
+
# --- OpenAI Compatible Endpoints (Maintained for flexibility) ---

@app.post("/v1/chat/completions")
async def chat_completions(request: Request, key: str = Depends(verify_key)):
    """OpenAI-compatible chat endpoint translated onto Ollama's /api/chat.

    Always streams (SSE), regardless of the client's `stream` flag. Ollama's
    'thinking' text is surfaced in the non-standard `reasoning_content` delta
    field; regular text goes in `content`.
    """
    body = await request.json()

    # Translate the OpenAI-style request into an Ollama chat payload.
    ollama_payload = {
        "model": MODEL,
        "messages": body.get("messages", []),
        "stream": True,
        "options": {
            "temperature": body.get("temperature", 0.6),
            "top_p": body.get("top_p", 0.95),
        },
    }
    if "max_tokens" in body:
        ollama_payload["options"]["num_predict"] = body["max_tokens"]

    completion_id = f"chatcmpl-{uuid.uuid4().hex}"
    created = int(time.time())

    async def generate():
        async with httpx.AsyncClient(timeout=600.0) as client:
            async with client.stream("POST", f"{OLLAMA_BASE}/api/chat", json=ollama_payload) as resp:
                async for line in resp.aiter_lines():
                    if not line:
                        continue
                    try:
                        chunk = json.loads(line)
                    except json.JSONDecodeError:
                        # Skip malformed/partial lines instead of aborting the
                        # stream. (Was a bare `except:`, which also swallowed
                        # cancellation inside this async generator.)
                        continue

                    msg = chunk.get("message", {})
                    done = chunk.get("done", False)

                    delta = {}
                    # Map Ollama's 'thinking' channel to OpenAI-style reasoning deltas.
                    thinking = msg.get("thinking")
                    content = msg.get("content")

                    if thinking:
                        delta["reasoning_content"] = thinking
                    if content:
                        delta["content"] = content

                    data = {
                        "id": completion_id,
                        "object": "chat.completion.chunk",
                        "created": created,
                        "model": MODEL,
                        "choices": [{
                            "index": 0,
                            "delta": delta,
                            "finish_reason": "stop" if done else None,
                        }],
                    }
                    yield f"data: {json.dumps(data)}\n\n"
                    if done:
                        break
        yield "data: [DONE]\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")
143
+
@app.get("/v1/models")
async def list_models(key: str = Depends(verify_key)):
    """OpenAI-compatible model listing containing only the configured MODEL."""
    model_entry = {
        "id": MODEL,
        "object": "model",
        "created": int(time.time()),
        "owned_by": "ollama",
    }
    return {"object": "list", "data": [model_entry]}
155
+
# --- Health Check ---

@app.get("/")
@app.get("/health")
async def health():
    """Liveness probe: 'ok' once the local Ollama answers /api/version, else 'starting'."""
    ollama_ok = False
    async with httpx.AsyncClient(timeout=5.0) as client:
        try:
            resp = await client.get(f"{OLLAMA_BASE}/api/version")
        except Exception:
            # Ollama not reachable yet — report 'starting' rather than fail.
            pass
        else:
            ollama_ok = resp.status_code == 200
    return {"status": "ok" if ollama_ok else "starting", "model": MODEL}