d3evil4 committed on
Commit
e536cd5
·
1 Parent(s): 35424c3

feat: huh

Browse files
Files changed (3) hide show
  1. Dockerfile +4 -3
  2. main.py +19 -177
  3. start.sh +46 -0
Dockerfile CHANGED
@@ -2,7 +2,7 @@ FROM ghcr.io/ggml-org/llama.cpp:full
2
 
3
  WORKDIR /app
4
 
5
- RUN apt update && apt install -y python3 python3-pip python3-venv
6
  RUN python3 -m venv /opt/venv
7
  ENV PATH="/opt/venv/bin:$PATH"
8
 
@@ -19,7 +19,8 @@ RUN python3 -c 'from huggingface_hub import hf_hub_download; \
19
 
20
  ENV HF_TOKEN=""
21
 
22
- COPY main.py /app/main.py
 
23
 
24
  ENTRYPOINT []
25
- CMD uvicorn main:app --host 0.0.0.0 --port 7860
 
2
 
3
  WORKDIR /app
4
 
5
+ RUN apt update && apt install -y python3 python3-pip python3-venv curl
6
  RUN python3 -m venv /opt/venv
7
  ENV PATH="/opt/venv/bin:$PATH"
8
 
 
19
 
20
  ENV HF_TOKEN=""
21
 
22
+ COPY main.py start.sh /app/
23
+ RUN chmod +x /app/start.sh
24
 
25
  ENTRYPOINT []
26
+ CMD ["/app/start.sh"]
main.py CHANGED
@@ -1,13 +1,7 @@
1
  from __future__ import annotations
2
 
3
- import asyncio
4
  import logging
5
- import os
6
- import shutil
7
- import subprocess
8
  import sys
9
- import time
10
- from contextlib import asynccontextmanager
11
  from typing import Any, AsyncIterator
12
 
13
  import httpx
@@ -23,92 +17,9 @@ logging.basicConfig(
23
  )
24
  logger = logging.getLogger("gemma4")
25
 
 
26
 
27
- def _find_llama_server() -> str:
28
- candidates = [
29
- "llama-server",
30
- "/llama-server",
31
- "/usr/local/bin/llama-server",
32
- "/usr/bin/llama-server",
33
- ]
34
- for c in candidates:
35
- found = shutil.which(c)
36
- if found:
37
- return found
38
- if os.path.isfile(c) and os.access(c, os.X_OK):
39
- return c
40
- raise RuntimeError(f"llama-server binary not found; searched: {candidates}")
41
-
42
-
43
- LLAMA_BASE = "http://localhost:8080"
44
- LLAMA_CMD = [
45
- _find_llama_server(),
46
- "-m", "/app/gemma-4-E2B-it-UD-Q5_K_XL.gguf",
47
- "--mmproj", "/app/mmproj-BF16.gguf",
48
- "--host", "0.0.0.0",
49
- "--port", "8080",
50
- "-t", "2",
51
- "--cache-type-k", "q8_0",
52
- "--cache-type-v", "iq4_nl",
53
- "-c", "128000",
54
- "-n", "38912",
55
- ]
56
- HEALTH_TIMEOUT = 300
57
- HEALTH_POLL_INTERVAL = 2
58
-
59
- _llama_proc: subprocess.Popen[bytes] | None = None
60
- _http_client: httpx.AsyncClient | None = None
61
-
62
-
63
- async def _wait_for_llama() -> None:
64
- assert _http_client is not None
65
- deadline = time.monotonic() + HEALTH_TIMEOUT
66
- while time.monotonic() < deadline:
67
- try:
68
- resp = await _http_client.get(f"{LLAMA_BASE}/health", timeout=5.0)
69
- if resp.status_code == 200:
70
- logger.info("llama.cpp server is healthy")
71
- return
72
- except httpx.TransportError:
73
- pass
74
- await asyncio.sleep(HEALTH_POLL_INTERVAL)
75
- raise RuntimeError("llama.cpp server did not become healthy within timeout")
76
-
77
-
78
- @asynccontextmanager
79
- async def lifespan(app: FastAPI) -> AsyncIterator[None]:
80
- global _llama_proc, _http_client
81
-
82
- _http_client = httpx.AsyncClient(timeout=httpx.Timeout(300.0, connect=10.0))
83
-
84
- logger.info("Starting llama.cpp server: %s", " ".join(LLAMA_CMD))
85
- _llama_proc = subprocess.Popen(
86
- LLAMA_CMD,
87
- stdout=subprocess.PIPE,
88
- stderr=subprocess.STDOUT,
89
- )
90
-
91
- try:
92
- await _wait_for_llama()
93
- except RuntimeError:
94
- _llama_proc.terminate()
95
- await _http_client.aclose()
96
- raise
97
-
98
- yield
99
-
100
- logger.info("Shutting down llama.cpp server")
101
- if _llama_proc and _llama_proc.poll() is None:
102
- _llama_proc.terminate()
103
- try:
104
- _llama_proc.wait(timeout=10)
105
- except subprocess.TimeoutExpired:
106
- _llama_proc.kill()
107
-
108
- await _http_client.aclose()
109
-
110
-
111
- app = FastAPI(title="Gemma 4 API", version="1.0.0", lifespan=lifespan)
112
 
113
  app.add_middleware(
114
  CORSMiddleware,
@@ -118,6 +29,8 @@ app.add_middleware(
118
  allow_headers=["*"],
119
  )
120
 
 
 
121
 
122
  @app.middleware("http")
123
  async def log_requests(request: Request, call_next):
@@ -127,17 +40,10 @@ async def log_requests(request: Request, call_next):
127
  return response
128
 
129
 
130
- def _client() -> httpx.AsyncClient:
131
- if _http_client is None:
132
- raise HTTPException(status_code=503, detail="Service not initialized")
133
- return _http_client
134
-
135
-
136
  async def _proxy_stream(url: str, payload: dict[str, Any]) -> AsyncIterator[bytes]:
137
- async with _client().stream("POST", url, json=payload) as resp:
138
  if resp.status_code != 200:
139
- body = await resp.aread()
140
- yield body
141
  return
142
  async for chunk in resp.aiter_bytes():
143
  yield chunk
@@ -147,24 +53,6 @@ async def _proxy_stream(url: str, payload: dict[str, Any]) -> AsyncIterator[byte
147
  # Pydantic models
148
  # ---------------------------------------------------------------------------
149
 
150
- class MessageContent(BaseModel):
151
- role: str
152
- content: Any
153
-
154
-
155
- class ChatRequest(BaseModel):
156
- model: str | None = None
157
- messages: list[dict[str, Any]]
158
- max_tokens: int | None = Field(default=None, alias="max_tokens")
159
- temperature: float | None = None
160
- top_p: float | None = None
161
- stream: bool = False
162
- stop: list[str] | None = None
163
- extra: dict[str, Any] = Field(default_factory=dict)
164
-
165
- model_config = {"extra": "allow", "populate_by_name": True}
166
-
167
-
168
  class SimpleChatRequest(BaseModel):
169
  messages: list[dict[str, Any]]
170
  max_tokens: int = 2048
@@ -193,24 +81,17 @@ class VisionRequest(BaseModel):
193
  @app.get("/health")
194
  async def health() -> dict[str, Any]:
195
  try:
196
- resp = await _client().get(f"{LLAMA_BASE}/health", timeout=5.0)
197
  llama_status = resp.json() if resp.status_code == 200 else {"status": "error"}
198
  except httpx.TransportError:
199
  raise HTTPException(status_code=503, detail="llama.cpp server unreachable")
200
-
201
- try:
202
- models_resp = await _client().get(f"{LLAMA_BASE}/v1/models", timeout=5.0)
203
- models = models_resp.json() if models_resp.status_code == 200 else {}
204
- except httpx.TransportError:
205
- models = {}
206
-
207
- return {"status": "ok", "llama": llama_status, "models": models}
208
 
209
 
210
  @app.get("/v1/models")
211
  async def list_models() -> Any:
212
  try:
213
- resp = await _client().get(f"{LLAMA_BASE}/v1/models", timeout=10.0)
214
  except httpx.TransportError as exc:
215
  raise HTTPException(status_code=503, detail=str(exc))
216
  return resp.json()
@@ -219,26 +100,17 @@ async def list_models() -> Any:
219
  @app.post("/v1/chat/completions")
220
  async def chat_completions(request: Request) -> Any:
221
  payload = await request.json()
222
- stream = payload.get("stream", False)
223
-
224
- if stream:
225
  return StreamingResponse(
226
  _proxy_stream(f"{LLAMA_BASE}/v1/chat/completions", payload),
227
  media_type="text/event-stream",
228
  )
229
-
230
  try:
231
- resp = await _client().post(
232
- f"{LLAMA_BASE}/v1/chat/completions",
233
- json=payload,
234
- timeout=300.0,
235
- )
236
  except httpx.TransportError as exc:
237
  raise HTTPException(status_code=503, detail=str(exc))
238
-
239
  if resp.status_code != 200:
240
  raise HTTPException(status_code=resp.status_code, detail=resp.text)
241
-
242
  return resp.json()
243
 
244
 
@@ -250,25 +122,17 @@ async def chat(req: SimpleChatRequest) -> Any:
250
  "temperature": req.temperature,
251
  "stream": req.stream,
252
  }
253
-
254
  if req.stream:
255
  return StreamingResponse(
256
  _proxy_stream(f"{LLAMA_BASE}/v1/chat/completions", payload),
257
  media_type="text/event-stream",
258
  )
259
-
260
  try:
261
- resp = await _client().post(
262
- f"{LLAMA_BASE}/v1/chat/completions",
263
- json=payload,
264
- timeout=300.0,
265
- )
266
  except httpx.TransportError as exc:
267
  raise HTTPException(status_code=503, detail=str(exc))
268
-
269
  if resp.status_code != 200:
270
  raise HTTPException(status_code=resp.status_code, detail=resp.text)
271
-
272
  return resp.json()
273
 
274
 
@@ -280,64 +144,42 @@ async def generate(req: GenerateRequest) -> Any:
280
  "temperature": req.temperature,
281
  "stream": req.stream,
282
  }
283
-
284
  if req.stream:
285
  return StreamingResponse(
286
  _proxy_stream(f"{LLAMA_BASE}/v1/chat/completions", payload),
287
  media_type="text/event-stream",
288
  )
289
-
290
  try:
291
- resp = await _client().post(
292
- f"{LLAMA_BASE}/v1/chat/completions",
293
- json=payload,
294
- timeout=300.0,
295
- )
296
  except httpx.TransportError as exc:
297
  raise HTTPException(status_code=503, detail=str(exc))
298
-
299
  if resp.status_code != 200:
300
  raise HTTPException(status_code=resp.status_code, detail=resp.text)
301
-
302
  return resp.json()
303
 
304
 
305
  @app.post("/vision")
306
  async def vision(req: VisionRequest) -> Any:
307
- image_content: dict[str, Any]
308
  if req.image.startswith("http://") or req.image.startswith("https://"):
309
- image_content = {"type": "image_url", "image_url": {"url": req.image}}
310
  else:
311
  image_content = {
312
  "type": "image_url",
313
  "image_url": {"url": f"data:image/jpeg;base64,{req.image}"},
314
  }
315
-
316
  payload: dict[str, Any] = {
317
- "messages": [
318
- {
319
- "role": "user",
320
- "content": [
321
- {"type": "text", "text": req.prompt},
322
- image_content,
323
- ],
324
- }
325
- ],
326
  "max_tokens": req.max_tokens,
327
  "temperature": req.temperature,
328
  "stream": False,
329
  }
330
-
331
  try:
332
- resp = await _client().post(
333
- f"{LLAMA_BASE}/v1/chat/completions",
334
- json=payload,
335
- timeout=300.0,
336
- )
337
  except httpx.TransportError as exc:
338
  raise HTTPException(status_code=503, detail=str(exc))
339
-
340
  if resp.status_code != 200:
341
  raise HTTPException(status_code=resp.status_code, detail=resp.text)
342
-
343
  return resp.json()
 
1
  from __future__ import annotations
2
 
 
3
  import logging
 
 
 
4
  import sys
 
 
5
  from typing import Any, AsyncIterator
6
 
7
  import httpx
 
17
  )
18
  logger = logging.getLogger("gemma4")
19
 
20
+ LLAMA_BASE = "http://127.0.0.1:8080"
21
 
22
+ app = FastAPI(title="Gemma 4 API", version="1.0.0")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  app.add_middleware(
25
  CORSMiddleware,
 
29
  allow_headers=["*"],
30
  )
31
 
32
+ _client = httpx.AsyncClient(timeout=httpx.Timeout(300.0, connect=10.0))
33
+
34
 
35
  @app.middleware("http")
36
  async def log_requests(request: Request, call_next):
 
40
  return response
41
 
42
 
 
 
 
 
 
 
43
  async def _proxy_stream(url: str, payload: dict[str, Any]) -> AsyncIterator[bytes]:
44
+ async with _client.stream("POST", url, json=payload) as resp:
45
  if resp.status_code != 200:
46
+ yield await resp.aread()
 
47
  return
48
  async for chunk in resp.aiter_bytes():
49
  yield chunk
 
53
  # Pydantic models
54
  # ---------------------------------------------------------------------------
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  class SimpleChatRequest(BaseModel):
57
  messages: list[dict[str, Any]]
58
  max_tokens: int = 2048
 
81
  @app.get("/health")
82
  async def health() -> dict[str, Any]:
83
  try:
84
+ resp = await _client.get(f"{LLAMA_BASE}/health", timeout=5.0)
85
  llama_status = resp.json() if resp.status_code == 200 else {"status": "error"}
86
  except httpx.TransportError:
87
  raise HTTPException(status_code=503, detail="llama.cpp server unreachable")
88
+ return {"status": "ok", "llama": llama_status}
 
 
 
 
 
 
 
89
 
90
 
91
  @app.get("/v1/models")
92
  async def list_models() -> Any:
93
  try:
94
+ resp = await _client.get(f"{LLAMA_BASE}/v1/models", timeout=10.0)
95
  except httpx.TransportError as exc:
96
  raise HTTPException(status_code=503, detail=str(exc))
97
  return resp.json()
 
100
  @app.post("/v1/chat/completions")
101
  async def chat_completions(request: Request) -> Any:
102
  payload = await request.json()
103
+ if payload.get("stream", False):
 
 
104
  return StreamingResponse(
105
  _proxy_stream(f"{LLAMA_BASE}/v1/chat/completions", payload),
106
  media_type="text/event-stream",
107
  )
 
108
  try:
109
+ resp = await _client.post(f"{LLAMA_BASE}/v1/chat/completions", json=payload, timeout=300.0)
 
 
 
 
110
  except httpx.TransportError as exc:
111
  raise HTTPException(status_code=503, detail=str(exc))
 
112
  if resp.status_code != 200:
113
  raise HTTPException(status_code=resp.status_code, detail=resp.text)
 
114
  return resp.json()
115
 
116
 
 
122
  "temperature": req.temperature,
123
  "stream": req.stream,
124
  }
 
125
  if req.stream:
126
  return StreamingResponse(
127
  _proxy_stream(f"{LLAMA_BASE}/v1/chat/completions", payload),
128
  media_type="text/event-stream",
129
  )
 
130
  try:
131
+ resp = await _client.post(f"{LLAMA_BASE}/v1/chat/completions", json=payload, timeout=300.0)
 
 
 
 
132
  except httpx.TransportError as exc:
133
  raise HTTPException(status_code=503, detail=str(exc))
 
134
  if resp.status_code != 200:
135
  raise HTTPException(status_code=resp.status_code, detail=resp.text)
 
136
  return resp.json()
137
 
138
 
 
144
  "temperature": req.temperature,
145
  "stream": req.stream,
146
  }
 
147
  if req.stream:
148
  return StreamingResponse(
149
  _proxy_stream(f"{LLAMA_BASE}/v1/chat/completions", payload),
150
  media_type="text/event-stream",
151
  )
 
152
  try:
153
+ resp = await _client.post(f"{LLAMA_BASE}/v1/chat/completions", json=payload, timeout=300.0)
 
 
 
 
154
  except httpx.TransportError as exc:
155
  raise HTTPException(status_code=503, detail=str(exc))
 
156
  if resp.status_code != 200:
157
  raise HTTPException(status_code=resp.status_code, detail=resp.text)
 
158
  return resp.json()
159
 
160
 
161
  @app.post("/vision")
162
  async def vision(req: VisionRequest) -> Any:
 
163
  if req.image.startswith("http://") or req.image.startswith("https://"):
164
+ image_content: dict[str, Any] = {"type": "image_url", "image_url": {"url": req.image}}
165
  else:
166
  image_content = {
167
  "type": "image_url",
168
  "image_url": {"url": f"data:image/jpeg;base64,{req.image}"},
169
  }
 
170
  payload: dict[str, Any] = {
171
+ "messages": [{
172
+ "role": "user",
173
+ "content": [{"type": "text", "text": req.prompt}, image_content],
174
+ }],
 
 
 
 
 
175
  "max_tokens": req.max_tokens,
176
  "temperature": req.temperature,
177
  "stream": False,
178
  }
 
179
  try:
180
+ resp = await _client.post(f"{LLAMA_BASE}/v1/chat/completions", json=payload, timeout=300.0)
 
 
 
 
181
  except httpx.TransportError as exc:
182
  raise HTTPException(status_code=503, detail=str(exc))
 
183
  if resp.status_code != 200:
184
  raise HTTPException(status_code=resp.status_code, detail=resp.text)
 
185
  return resp.json()
start.sh ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ set -e
3
+
4
+ # Find llama-server binary
5
+ LLAMA_BIN=$(find /usr /app /llama.cpp /usr/local / -maxdepth 6 -name "llama-server" -type f 2>/dev/null | head -1)
6
+
7
+ if [ -z "$LLAMA_BIN" ]; then
8
+ echo "ERROR: llama-server binary not found"
9
+ exit 1
10
+ fi
11
+
12
+ echo "Found llama-server at: $LLAMA_BIN"
13
+
14
+ "$LLAMA_BIN" \
15
+ -m /app/gemma-4-E2B-it-UD-Q5_K_XL.gguf \
16
+ --mmproj /app/mmproj-BF16.gguf \
17
+ --host 127.0.0.1 \
18
+ --port 8080 \
19
+ -t 2 \
20
+ --cache-type-k q8_0 \
21
+ --cache-type-v iq4_nl \
22
+ -c 128000 \
23
+ -n 38912 &
24
+
25
+ LLAMA_PID=$!
26
+ echo "llama-server started (PID $LLAMA_PID)"
27
+
28
+ # Wait up to 5 minutes for llama-server to be healthy
29
+ echo "Waiting for llama-server to be ready..."
30
+ for i in $(seq 1 150); do
31
+ if curl -sf http://127.0.0.1:8080/health > /dev/null 2>&1; then
32
+ echo "llama-server is ready"
33
+ break
34
+ fi
35
+ if ! kill -0 "$LLAMA_PID" 2>/dev/null; then
36
+ echo "ERROR: llama-server process died"
37
+ exit 1
38
+ fi
39
+ if [ "$i" -eq 150 ]; then
40
+ echo "ERROR: llama-server did not become ready in time"
41
+ exit 1
42
+ fi
43
+ sleep 2
44
+ done
45
+
46
+ exec uvicorn main:app --host 0.0.0.0 --port 7860