Spaces:
Sleeping
Sleeping
Dmitry Beresnev committed on
Commit ·
f64a284
1
Parent(s): 7763bf4
fix dockerfile
Browse files
- Dockerfile +6 -4
- app.py +46 -12
Dockerfile
CHANGED
|
@@ -9,9 +9,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
| 9 |
libcurl4-openssl-dev \
|
| 10 |
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
|
| 12 |
-
# Clone and build llama.cpp with
|
| 13 |
WORKDIR /build
|
| 14 |
-
ARG CACHEBUST=
|
| 15 |
RUN git clone https://github.com/ggerganov/llama.cpp.git && \
|
| 16 |
cd llama.cpp && \
|
| 17 |
cmake -B build -DCMAKE_BUILD_TYPE=Release \
|
|
@@ -19,7 +19,8 @@ RUN git clone https://github.com/ggerganov/llama.cpp.git && \
|
|
| 19 |
-DGGML_AVX2=OFF \
|
| 20 |
-DGGML_AVX=OFF \
|
| 21 |
-DGGML_FMA=OFF \
|
| 22 |
-
-DGGML_F16C=OFF
|
|
|
|
| 23 |
cmake --build build --config Release --target llama-server -j1 && \
|
| 24 |
echo "=== Binary dependencies ===" && \
|
| 25 |
ldd build/bin/llama-server || true
|
|
@@ -27,12 +28,13 @@ RUN git clone https://github.com/ggerganov/llama.cpp.git && \
|
|
| 27 |
# Runtime stage
|
| 28 |
FROM debian:bookworm-slim
|
| 29 |
|
| 30 |
-
# Install runtime dependencies
|
| 31 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 32 |
libcurl4 \
|
| 33 |
ca-certificates \
|
| 34 |
libgomp1 \
|
| 35 |
libstdc++6 \
|
|
|
|
| 36 |
&& rm -rf /var/lib/apt/lists/*
|
| 37 |
|
| 38 |
# Copy llama-server binary and all shared libraries from builder
|
|
|
|
| 9 |
libcurl4-openssl-dev \
|
| 10 |
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
|
| 12 |
+
# Clone and build llama.cpp with SSL support for HuggingFace Hub
|
| 13 |
WORKDIR /build
|
| 14 |
+
ARG CACHEBUST=5
|
| 15 |
RUN git clone https://github.com/ggerganov/llama.cpp.git && \
|
| 16 |
cd llama.cpp && \
|
| 17 |
cmake -B build -DCMAKE_BUILD_TYPE=Release \
|
|
|
|
| 19 |
-DGGML_AVX2=OFF \
|
| 20 |
-DGGML_AVX=OFF \
|
| 21 |
-DGGML_FMA=OFF \
|
| 22 |
+
-DGGML_F16C=OFF \
|
| 23 |
+
-DLLAMA_CURL=ON && \
|
| 24 |
cmake --build build --config Release --target llama-server -j1 && \
|
| 25 |
echo "=== Binary dependencies ===" && \
|
| 26 |
ldd build/bin/llama-server || true
|
|
|
|
| 28 |
# Runtime stage
|
| 29 |
FROM debian:bookworm-slim
|
| 30 |
|
| 31 |
+
# Install runtime dependencies including SSL/HTTPS support
|
| 32 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 33 |
libcurl4 \
|
| 34 |
ca-certificates \
|
| 35 |
libgomp1 \
|
| 36 |
libstdc++6 \
|
| 37 |
+
openssl \
|
| 38 |
&& rm -rf /var/lib/apt/lists/*
|
| 39 |
|
| 40 |
# Copy llama-server binary and all shared libraries from builder
|
app.py
CHANGED
|
@@ -523,10 +523,26 @@ async def start_llama_server(model_id: str, port: int) -> tuple[subprocess.Popen
|
|
| 523 |
while elapsed < max_wait_time:
|
| 524 |
# Check if process died
|
| 525 |
if process.poll() is not None:
|
| 526 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 527 |
logger.error(f"llama-server exited with code {process.returncode}")
|
| 528 |
-
logger.error(f"
|
| 529 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 530 |
|
| 531 |
try:
|
| 532 |
# Use aiohttp for async health check
|
|
@@ -596,14 +612,22 @@ async def startup_event():
|
|
| 596 |
model_id = AVAILABLE_MODELS[current_model]
|
| 597 |
port = model_cache._get_next_port()
|
| 598 |
|
| 599 |
-
|
| 600 |
-
|
|
|
|
| 601 |
|
| 602 |
-
|
| 603 |
-
|
| 604 |
|
| 605 |
-
|
| 606 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 607 |
|
| 608 |
|
| 609 |
@app.on_event("shutdown")
|
|
@@ -611,10 +635,20 @@ async def shutdown_event():
|
|
| 611 |
"""Clean shutdown - clear cache and close HTTP session."""
|
| 612 |
logger.info("Application shutdown initiated")
|
| 613 |
|
| 614 |
-
|
| 615 |
-
|
|
|
|
|
|
|
|
|
|
| 616 |
|
| 617 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 618 |
|
| 619 |
|
| 620 |
@app.get(
|
|
|
|
| 523 |
while elapsed < max_wait_time:
|
| 524 |
# Check if process died
|
| 525 |
if process.poll() is not None:
|
| 526 |
+
# Process died - collect output for debugging
|
| 527 |
+
try:
|
| 528 |
+
stdout = process.stdout.read() if process.stdout else ""
|
| 529 |
+
except:
|
| 530 |
+
stdout = "Unable to read stdout"
|
| 531 |
+
|
| 532 |
logger.error(f"llama-server exited with code {process.returncode}")
|
| 533 |
+
logger.error(f"Model ID: {model_id}")
|
| 534 |
+
logger.error(f"Port: {port}")
|
| 535 |
+
logger.error(f"Output:\n{stdout}")
|
| 536 |
+
|
| 537 |
+
# Provide helpful error message
|
| 538 |
+
error_msg = f"llama-server process died (exit code {process.returncode})"
|
| 539 |
+
if "HTTPS is not supported" in str(stdout):
|
| 540 |
+
error_msg += "\n\nHTTPS support is missing. The llama-server binary needs to be rebuilt with CURL/SSL support."
|
| 541 |
+
error_msg += "\nAdd -DLLAMA_CURL=ON to the cmake build flags."
|
| 542 |
+
elif "no usable GPU found" in str(stdout):
|
| 543 |
+
error_msg += "\n\nNote: Running on CPU only (no GPU detected)."
|
| 544 |
+
|
| 545 |
+
raise RuntimeError(error_msg)
|
| 546 |
|
| 547 |
try:
|
| 548 |
# Use aiohttp for async health check
|
|
|
|
| 612 |
model_id = AVAILABLE_MODELS[current_model]
|
| 613 |
port = model_cache._get_next_port()
|
| 614 |
|
| 615 |
+
try:
|
| 616 |
+
process, load_time = await start_llama_server(model_id, port)
|
| 617 |
+
await model_cache.put(current_model, model_id, process, port, load_time)
|
| 618 |
|
| 619 |
+
metrics.startup_time = time.time() - startup_start
|
| 620 |
+
logger.info(f"Started with default model: {current_model} (total startup: {metrics.startup_time:.2f}s)")
|
| 621 |
|
| 622 |
+
# Start preloading in background
|
| 623 |
+
asyncio.create_task(preload_models_background())
|
| 624 |
+
except Exception as e:
|
| 625 |
+
# Clean up on startup failure
|
| 626 |
+
logger.error(f"Startup failed: {e}")
|
| 627 |
+
if http_session:
|
| 628 |
+
await http_session.close()
|
| 629 |
+
model_cache._release_port(port)
|
| 630 |
+
raise
|
| 631 |
|
| 632 |
|
| 633 |
@app.on_event("shutdown")
|
|
|
|
| 635 |
"""Clean shutdown - clear cache and close HTTP session."""
|
| 636 |
logger.info("Application shutdown initiated")
|
| 637 |
|
| 638 |
+
# Clear model cache first
|
| 639 |
+
try:
|
| 640 |
+
await model_cache.clear()
|
| 641 |
+
except Exception as e:
|
| 642 |
+
logger.error(f"Error clearing cache during shutdown: {e}")
|
| 643 |
|
| 644 |
+
# Close HTTP session
|
| 645 |
+
if http_session and not http_session.closed:
|
| 646 |
+
try:
|
| 647 |
+
await http_session.close()
|
| 648 |
+
# Give it a moment to close gracefully
|
| 649 |
+
await asyncio.sleep(0.1)
|
| 650 |
+
except Exception as e:
|
| 651 |
+
logger.error(f"Error closing HTTP session: {e}")
|
| 652 |
|
| 653 |
|
| 654 |
@app.get(
|