# Ollama Modelfile — Phi-4 Multimodal Instruct Q4_K_M
# Optimised for: Intel 11th Gen NUC, 8 GB RAM, CPU-only
#
# Source model : microsoft/Phi-4-multimodal-instruct
# License      : MIT  https://huggingface.co/microsoft/Phi-4-multimodal-instruct/blob/main/LICENSE
# Quantization : Q4_K_M via llama.cpp llama-quantize
# Architecture : phi3 (3.8B LLM backbone + vision/speech adapters in base GGUF)

# Base weights: a local GGUF file. A relative path here is resolved against the
# location of this Modelfile, so keep the .gguf next to it (or use an absolute path).
FROM ./phi4-mm-Q4_K_M.gguf

# ── Context & KV cache ───────────────────────────────────────────────────────
# num_ctx is the context window size in tokens.
# 8192 tokens balances capability vs RAM on 8 GB hardware.
# Lower to 4096 if you observe OOM / heavy swapping.
PARAMETER num_ctx 8192

# ── CPU tuning ───────────────────────────────────────────────────────────────
# Ollama recommends setting num_thread to the number of PHYSICAL cores rather
# than logical (hyper-threaded) threads; oversubscribing tends to slow CPU
# inference instead of speeding it up.
# 11th Gen NUC Core i5/i7 (e.g. i5-1135G7, i7-1165G7): 4 cores / 8 threads.
# Reduce to 2 if the NUC is a Core i3 variant (e.g. i3-1115G4: 2 cores / 4 threads).
PARAMETER num_thread 4

# No discrete GPU — all layers run on CPU.
PARAMETER num_gpu 0

# Flash attention is intentionally not set here: it is not a Modelfile
# PARAMETER, and `ollama create` rejects unknown parameter names. It is
# controlled server-wide via the OLLAMA_FLASH_ATTENTION environment variable;
# leave that unset to keep the server default on this CPU-only machine.

# ── Generation defaults ───────────────────────────────────────────────────────
# Sampling defaults applied when a request does not supply its own options.
# Higher temperature = more varied output; lower = more deterministic.
PARAMETER temperature 0.7
# Nucleus sampling cutoff.
PARAMETER top_p 0.9
# Values above 1.0 penalise repeated tokens.
PARAMETER repeat_penalty 1.1
# Stop sequences: generation halts as soon as any of these chat-format marker
# strings is produced, so the model does not run on into a new turn.
PARAMETER stop "<|end|>"
PARAMETER stop "<|user|>"
PARAMETER stop "<|assistant|>"

# ── System prompt ─────────────────────────────────────────────────────────────
# Default system message baked into the model; it is used whenever a request
# does not provide its own system message.
SYSTEM """You are a helpful, accurate, and concise AI assistant. You excel at reasoning, analysis, writing, coding, and answering questions. Be direct and thorough."""