Spaces:

NanoBotAIAgent
/

gemma-4-e2b-uncensored-api

Sleeping

Entrypoint with reasoning on for Gemma-4-E2B

8a42673 verified 6 days ago

1.72 kB

	#!/bin/bash
	#
	# Container entrypoint.
	#
	# 1. Launches llama-server in the background on 127.0.0.1:8080 with the
	#    Gemma-4-E2B GGUF model and reasoning ("thinking") enabled.
	# 2. Polls its /health endpoint until the model is actually loaded.
	# 3. Replaces this shell with uvicorn serving `proxy:app` on 0.0.0.0:8000
	#    (`proxy` is defined outside this script; presumably it forwards
	#    requests to llama-server -- verify against proxy.py).
	#
	# Exits non-zero if llama-server dies while starting up. If llama-server is
	# still running but not healthy when the wait budget is used up, a warning is
	# printed and the proxy is started anyway (best effort).
	set -euo pipefail

	readonly MODEL_PATH="/data/model/Gemma-4-E2B-Uncensored-HauhauCS-Aggressive-Q8_K_P.gguf"
	readonly LLAMA_HOST="127.0.0.1"
	readonly LLAMA_PORT=8080
	readonly HEALTH_URL="http://${LLAMA_HOST}:${LLAMA_PORT}/health"
	# Number of one-second polling attempts before giving up on the health check.
	readonly STARTUP_ATTEMPTS=600

	# Detect CPU cores for max threading
	NPROC=$(nproc)
	echo "Using $NPROC threads"

	# Context window = 131072 tokens.
	# --n-predict 25000 sets the default/maximum generated tokens per request.
	#
	# Reasoning ("thinking") is ENABLED BY DEFAULT:
	#   --jinja                 use the model's embedded Gemma-4 chat template
	#   --reasoning on          sets enable_thinking=true in the template kwargs,
	#                           which injects <|think|> and lets the model generate
	#                           <|channel>thought...reasoning...<channel|> blocks
	#   --reasoning-format deepseek
	#                           extracts the <|channel>thought...<channel|> block
	#                           into a separate `reasoning_content` field (same as
	#                           DeepSeek/Qwen3 API format)
	#   --reasoning-budget -1   unrestricted thinking length
	#
	# NOTE: Gemma-4 uses <|channel>thought / <channel|> delimiters (NOT <think>).
	# llama.cpp auto-detects the Gemma-4 template and uses the correct PEG parser.
	/app/llama-server \
	  --model "$MODEL_PATH" \
	  --port "$LLAMA_PORT" \
	  --host "$LLAMA_HOST" \
	  --ctx-size 131072 \
	  --n-predict 25000 \
	  --parallel 1 \
	  --threads "$NPROC" \
	  --threads-batch "$NPROC" \
	  --batch-size 512 \
	  --jinja \
	  --reasoning on \
	  --reasoning-format deepseek \
	  --reasoning-budget -1 &

	LLAMA_PID=$!

	echo "Waiting for llama-server to start..."
	ready=0
	for ((attempt = 1; attempt <= STARTUP_ATTEMPTS; attempt++)); do
	  # `set -e` does not cover background jobs, so detect an early crash
	  # (missing model file, out of memory, bad flag, ...) explicitly instead of
	  # polling a dead server for the whole wait budget.
	  if ! kill -0 "$LLAMA_PID" 2>/dev/null; then
	    status=0
	    wait "$LLAMA_PID" || status=$?
	    echo "llama-server exited during startup (status ${status})" >&2
	    exit 1
	  fi

	  # -f makes curl fail on HTTP errors: /health answers 503 while the model is
	  # still loading, which plain `curl -s` would report as success.
	  # --max-time keeps a stalled connection from blocking the loop.
	  if curl -sf --max-time 2 "$HEALTH_URL" > /dev/null 2>&1; then
	    echo "llama-server is ready!"
	    ready=1
	    break
	  fi
	  sleep 1
	done

	if ((ready == 0)); then
	  echo "Warning: llama-server still not healthy after ${STARTUP_ATTEMPTS} attempts; starting proxy anyway" >&2
	fi

	# exec replaces this shell, so uvicorn becomes the container's main process.
	exec uvicorn proxy:app --host 0.0.0.0 --port 8000 --proxy-headers