""" Ollama-compatible API server ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ ⚡ qwen2.5-coder-1.5b → coding, quick replies (port 8080) 🧠 qwen3.5-4b → thinking, hard problems (port 8081) 🌐 gemma3-4b → translation, general chat (port 8082) 🔍 qwen3.5-0.8b → internet queries, news, fast (port 8083) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ NO extra packages — web search uses only requests (already installed) Downloads + server starts run in background — port 7860 binds instantly ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ """ from fastapi import FastAPI, HTTPException, Response from fastapi.responses import StreamingResponse, JSONResponse from pydantic import BaseModel from huggingface_hub import hf_hub_download import subprocess import requests import uvicorn import os import json import time import hashlib import threading import urllib.parse from typing import Optional app = FastAPI() # --------------------------- # MODEL CONFIGS # --------------------------- MODELS = { "qwen2.5-coder-1.5b": { "path": "models/qwen2.5-coder-1.5b.gguf", "repo": "bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF", "file": "Qwen2.5-Coder-1.5B-Instruct-Q4_K_M.gguf", "port": 8080, "param_size": "1.5B", "family": "qwen2.5", "fmt": "chatml", "web_search": False, "threads": 2, "ctx": 8192, "batch": 512, }, "qwen3.5-4b": { "path": "models/qwen3.5-4b.gguf", "repo": "bartowski/Qwen_Qwen3.5-4B-GGUF", "file": "Qwen_Qwen3.5-4B-Q4_K_M.gguf", "port": 8081, "param_size": "4B", "family": "qwen3.5", "fmt": "chatml", "web_search": False, "threads": 2, "ctx": 8192, "batch": 512, }, "gemma3-4b": { "path": "models/gemma3-4b.gguf", "repo": "bartowski/google_gemma-3-4b-it-GGUF", "file": "google_gemma-3-4b-it-Q4_K_M.gguf", "port": 8082, "param_size": "4B", "family": "gemma3", "fmt": "gemma", "web_search": False, "threads": 2, "ctx": 8192, "batch": 512, }, "qwen3.5-0.8b": { "path": "models/qwen3.5-0.8b.gguf", "repo": "bartowski/Qwen_Qwen3.5-0.8B-GGUF", "file": "Qwen_Qwen3.5-0.8B-Q4_K_M.gguf", "port": 8083, "param_size": 
"0.8B", "family": "qwen3.5", "fmt": "chatml", "web_search": True, "threads": 2, "ctx": 8192, "batch": 512, }, } DEFAULT_MODEL = "qwen2.5-coder-1.5b" LLAMA_SERVER = "./llama.cpp/build/bin/llama-server" _server_ready: dict = {k: False for k in MODELS} # --------------------------- # REQUEST MODELS # --------------------------- class ChatRequest(BaseModel): model: str = DEFAULT_MODEL messages: list stream: bool = True options: Optional[dict] = None class GenerateRequest(BaseModel): model: str = DEFAULT_MODEL prompt: str stream: bool = False options: Optional[dict] = None # --------------------------- # WEB SEARCH — pure requests, no extra package # --------------------------- def web_search(query: str, max_results: int = 3) -> str: """ DuckDuckGo search using only the `requests` library. Uses DDG's JSON API — no API key, no extra packages. """ try: # Step 1: get vqd token (DDG requires this for search) encoded = urllib.parse.quote(query) headers = { "User-Agent": "Mozilla/5.0 (compatible; LLM-Search/1.0)", } # Use DDG lite HTML endpoint — most reliable, no JS required resp = requests.get( f"https://html.duckduckgo.com/html/?q={encoded}", headers=headers, timeout=8, ) if resp.status_code != 200: return "" # Parse results from HTML using simple string extraction html = resp.text results = [] # Extract result blocks between
tags import re # Extract titles and snippets titles = re.findall(r'class="result__title"[^>]*>.*?]*>(.*?)', html, re.DOTALL) snippets = re.findall(r'class="result__snippet"[^>]*>(.*?)
', html, re.DOTALL) urls = re.findall(r'class="result__url"[^>]*>(.*?)', html, re.DOTALL) # Clean HTML tags from extracted text def strip_tags(text): return re.sub(r'<[^>]+>', '', text).strip() count = min(max_results, len(titles), len(snippets)) if count == 0: return "" context = "=== Web Search Results ===\n" for i in range(count): title = strip_tags(titles[i]) snippet = strip_tags(snippets[i]) url = strip_tags(urls[i]) if i < len(urls) else "" context += f"\n[{i+1}] {title}\n{snippet}\nSource: {url}\n" context += "\n=== End of Web Results ===\n" return context except Exception as e: print(f" [web_search] error: {e}") return "" def inject_web_context(messages: list) -> list: """Inject DuckDuckGo results as system context before last user message.""" if not messages: return messages last_user = next( (m for m in reversed(messages) if m.get("role") == "user"), None ) if not last_user: return messages user_text = last_user.get("content", "") print(f" [web_search] searching: {user_text[:60]}...") context = web_search(user_text) if not context: print(" [web_search] no results, continuing without web context") return messages print(f" [web_search] injected {len(context)} chars of context") web_system = { "role": "system", "content": ( "You have access to the following real-time web search results. " "Use them to answer the user's question accurately and concisely. " "Always mention the source when using web data. 
" "If the results are not relevant, rely on your own knowledge.\n\n" + context ) } new_messages = [] inserted = False for m in messages: if m is last_user and not inserted: new_messages.append(web_system) inserted = True new_messages.append(m) return new_messages # --------------------------- # PROMPT BUILDER # --------------------------- def build_prompt(messages: list, fmt: str = "chatml") -> str: if fmt == "gemma": prompt = "" for m in messages: role = m.get("role", "user") content = m.get("content", "").strip() if not content: continue if role == "system": prompt += f"user\n[Context] {content}\n" elif role == "user": prompt += f"user\n{content}\n" elif role == "assistant": prompt += f"model\n{content}\n" prompt += "model\n" return prompt # ChatML (Qwen2.5, Qwen3.5) prompt = "" has_system = any(m.get("role") == "system" for m in messages) if not has_system: prompt += "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" for m in messages: role = m.get("role", "user") content = m.get("content", "").strip() if not content: continue if role == "system": prompt += f"<|im_start|>system\n{content}<|im_end|>\n" elif role == "user": prompt += f"<|im_start|>user\n{content}<|im_end|>\n" elif role == "assistant": prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n" prompt += "<|im_start|>assistant\n" return prompt # --------------------------- # MODEL RESOLVER # --------------------------- def resolve_model(name: str) -> str: name = (name or DEFAULT_MODEL).lower().strip() if name in MODELS: return name for key in MODELS: if key in name or name in key: return key return DEFAULT_MODEL # --------------------------- # DOWNLOAD + START (all in background) # --------------------------- def download_model(cfg: dict): if not os.path.exists(cfg["path"]): print(f"Downloading {cfg['file']} ...") downloaded = hf_hub_download(repo_id=cfg["repo"], filename=cfg["file"]) os.system(f"cp '{downloaded}' '{cfg['path']}'") print(f" ✓ saved to {cfg['path']}") def 
start_llama(model_name: str, cfg: dict): download_model(cfg) print(f"Starting llama-server for {model_name} on port {cfg['port']} ...") log = open(f"llama_{model_name}.log", "w") process = subprocess.Popen([ LLAMA_SERVER, "-m", cfg["path"], "--host", "0.0.0.0", "--port", str(cfg["port"]), "-c", str(cfg["ctx"]), "--threads", str(cfg["threads"]), "--batch-size", str(cfg["batch"]), "-ngl", "0", "-np", "1", ], stdout=log, stderr=log) url = f"http://localhost:{cfg['port']}/health" for i in range(90): time.sleep(2) try: r = requests.get(url, timeout=2) if r.status_code == 200: _server_ready[model_name] = True print(f" ✓ {model_name} ready (took ~{(i+1)*2}s)") return process except Exception: pass try: with open(f"llama_{model_name}.log") as lf: lines = [l.strip() for l in lf.read().splitlines() if l.strip()] print(f" [{model_name}] {lines[-1] if lines else 'starting...'}") except Exception: print(f" waiting for {model_name}... ({i+1}/90)") print(f" ✗ {model_name} failed — check llama_{model_name}.log") return None def setup_all(): """All downloads + server starts run here in background. Port 7860 binds instantly.""" os.makedirs("models", exist_ok=True) for name, cfg in MODELS.items(): threading.Thread(target=start_llama, args=(name, cfg), daemon=True).start() # Kick off everything in background immediately — uvicorn binds port 7860 first threading.Thread(target=setup_all, daemon=True).start() # --------------------------- # READINESS GUARD # --------------------------- def wait_for_model(model_key: str, timeout: int = 300): deadline = time.time() + timeout while time.time() < deadline: if _server_ready.get(model_key): return time.sleep(1) raise HTTPException( status_code=503, detail=f"Model '{model_key}' is still loading. Please wait and retry." 
) # --------------------------- # HELPERS # --------------------------- def model_meta(name: str, cfg: dict) -> dict: size = os.path.getsize(cfg["path"]) if os.path.exists(cfg["path"]) else 0 digest = "" if os.path.exists(cfg["path"]): with open(cfg["path"], "rb") as f: digest = hashlib.md5(f.read(65536)).hexdigest() return { "name": name, "model": name, "modified_at": time.strftime("%Y-%m-%dT%H:%M:%SZ"), "size": size, "digest": f"sha256:{digest}", "details": { "format": "gguf", "family": cfg["family"], "families": [cfg["family"]], "parameter_size": cfg["param_size"], "quantization_level": "Q4_K_M", }, } def llama_params(options: Optional[dict], fmt: str = "chatml") -> dict: o = options or {} if fmt == "gemma": default_stop = ["", ""] else: default_stop = ["<|im_end|>", "<|endoftext|>", ""] return { "temperature": o.get("temperature", 0.7), "top_p": o.get("top_p", 0.9), "top_k": o.get("top_k", 40), "repeat_penalty": o.get("repeat_penalty", 1.1), "n_predict": o.get("num_predict", 1024), "stop": o.get("stop", default_stop), } # --------------------------- # ROOT # --------------------------- @app.get("/") def root(): return { "status": "running", "models_ready": dict(_server_ready), "usage": { "⚡ fast coding": "qwen2.5-coder-1.5b", "🧠 thinking": "qwen3.5-4b (add /think to message)", "🌐 translation": "gemma3-4b", "🔍 internet/news": "qwen3.5-0.8b (auto web search every message)", } } # --------------------------- # /health (HEAD for UptimeRobot) # --------------------------- @app.head("/health") def health_head(): return Response(status_code=200) @app.get("/health") def health_get(): return {"status": "ok", "ready": all(_server_ready.values())} # --------------------------- # /api/tags # --------------------------- @app.get("/api/tags") def tags(): return {"models": [model_meta(n, c) for n, c in MODELS.items()]} # --------------------------- # /api/show # --------------------------- @app.post("/api/show") def show(body: dict): key = resolve_model(body.get("name", 
DEFAULT_MODEL)) cfg = MODELS[key] meta = model_meta(key, cfg) meta["modelfile"] = f"FROM {key}\n" meta["parameters"] = "num_ctx 2048\nnum_predict 1024" if cfg["fmt"] == "gemma": meta["template"] = "{{ .Prompt }}" else: meta["template"] = ( "<|im_start|>system\n{{ .System }}<|im_end|>\n" "<|im_start|>user\n{{ .Prompt }}<|im_end|>\n" "<|im_start|>assistant\n" ) return meta # --------------------------- # /api/ps # --------------------------- @app.get("/api/ps") def ps(): running = [] for name, cfg in MODELS.items(): if _server_ready.get(name): m = model_meta(name, cfg) m["expires_at"] = "0001-01-01T00:00:00Z" m["size_vram"] = 0 running.append(m) return {"models": running} # --------------------------- # /api/generate # --------------------------- @app.post("/api/generate") def generate(req: GenerateRequest): key = resolve_model(req.model) cfg = MODELS[key] wait_for_model(key) params = llama_params(req.options, fmt=cfg["fmt"]) params["prompt"] = req.prompt params["stream"] = req.stream r = requests.post( f"http://localhost:{cfg['port']}/completion", json=params, stream=req.stream, timeout=180, ) if not req.stream: text = r.json().get("content", "").strip() return {"model": req.model, "response": text, "done": True, "done_reason": "stop"} def stream_gen(): for line in r.iter_lines(): if not line: continue line = line.decode("utf-8").strip() if line.startswith("data:"): line = line[5:].strip() try: data = json.loads(line) except Exception: continue token = data.get("content", "") done = data.get("stop", False) yield json.dumps({"model": req.model, "response": token, "done": done}) + "\n" if done: break yield json.dumps({"model": req.model, "response": "", "done": True, "done_reason": "stop"}) + "\n" return StreamingResponse(stream_gen(), media_type="application/x-ndjson", headers={"Cache-Control": "no-cache"}) # --------------------------- # /api/chat # --------------------------- @app.post("/api/chat") def chat(req: ChatRequest): key = resolve_model(req.model) cfg = 
MODELS[key] wait_for_model(key) messages = req.messages if cfg.get("web_search", False): messages = inject_web_context(messages) prompt = build_prompt(messages, fmt=cfg["fmt"]) params = llama_params(req.options, fmt=cfg["fmt"]) params["prompt"] = prompt params["stream"] = req.stream r = requests.post( f"http://localhost:{cfg['port']}/completion", json=params, stream=req.stream, timeout=180, ) if not req.stream: text = r.json().get("content", "").strip() return JSONResponse({ "model": req.model, "message": {"role": "assistant", "content": text}, "done": True, "done_reason": "stop", }) def stream_gen(): for line in r.iter_lines(): if not line: continue line = line.decode("utf-8").strip() if line.startswith("data:"): line = line[5:].strip() try: data = json.loads(line) except Exception: continue token = data.get("content", "") done = data.get("stop", False) yield json.dumps({ "model": req.model, "message": {"role": "assistant", "content": token}, "done": done, }) + "\n" if done: break yield json.dumps({"model": req.model, "done": True, "done_reason": "stop"}) + "\n" return StreamingResponse(stream_gen(), media_type="application/x-ndjson", headers={"Cache-Control": "no-cache"}) # --------------------------- # START # --------------------------- if __name__ == "__main__": uvicorn.run(app, host="0.0.0.0", port=7860, workers=1)
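

# ---------------------------
# EXAMPLE CLIENT (illustration only — never executed by the server)
# ---------------------------
# A minimal sketch of how a client can consume this server's streaming
# /api/chat output. `collect_stream` is a hypothetical helper, not part of
# the API above; it only demonstrates the NDJSON line format the streaming
# endpoints emit (one JSON object per line, terminated by `"done": true`).
import json as _json


def collect_stream(ndjson_lines):
    """Accumulate assistant tokens from /api/chat NDJSON lines into one string."""
    text = ""
    for line in ndjson_lines:
        data = _json.loads(line)
        # Each chunk carries a partial token; the final line may omit "message"
        text += data.get("message", {}).get("content", "")
        if data.get("done"):
            break
    return text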