File size: 5,818 Bytes
44522b6
 
 
 
 
 
 
 
 
 
 
 
bfd4088
44522b6
 
 
bfd4088
44522b6
 
 
 
 
 
 
 
 
bfd4088
44522b6
 
 
 
 
 
 
 
 
 
bfd4088
44522b6
 
 
bfd4088
 
44522b6
 
 
 
 
 
 
 
 
bfd4088
44522b6
 
 
 
 
 
 
 
 
bfd4088
 
44522b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bfd4088
 
44522b6
 
 
 
 
 
 
 
bfd4088
 
 
44522b6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
"""
SurvivalAI Pro — HF Space chat interface.

Runs the V1 LoRA-finetuned Phi-3 GGUF (Q4_K_M) locally inside the Space via
llama-cpp-python. The Space is CPU-only; on paid CPU Upgrade hardware we get
~6-10 tokens/sec. The GGUF is fetched from a separate HF model repo at cold
start because the 2.4 GB file exceeds Space repo limits.
"""

import os
from pathlib import Path

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama


# ── Config ───────────────────────────────────────────────────────────────────
# Hugging Face repo and filename of the quantized GGUF weights fetched at startup.
MODEL_REPO     = "Znilsson/survivalai-phi3-gguf"        # private model repo
MODEL_FILENAME = "survivalai-phi3-Q4_K_M.gguf"

# Settings handed to the llama.cpp model constructor.
N_CTX          = 4096                                    # context window size
N_THREADS      = int(os.environ.get("N_THREADS", "4"))  # override via N_THREADS env var
N_BATCH        = 256

# Sampling settings used for every generation request.
MAX_TOKENS     = 400
TEMPERATURE    = 0.7
TOP_P          = 0.9

# Persona/instruction text that gets folded into the first user turn of a
# conversation. NOTE: the dash below used to be a mis-encoded byte sequence
# (UTF-8 read through a legacy code page); it is now a real em dash so the
# model is no longer fed garbage characters.
SYSTEM_MSG = (
    "You are SurvivalAI, an expert survival and civilizational knowledge "
    "assistant. You provide accurate, practical, and potentially life-saving "
    "information about wilderness survival, emergency preparedness, first aid, "
    "food procurement, water purification, shelter construction, navigation, "
    "and rebuilding civilization. Your responses are clear, actionable, and "
    "thorough. The user is in an off-grid context — assume no doctor, no "
    "Poison Control, no internet, no professional help is available. Give "
    "the best answer you can with the knowledge you have."
)

# Phi-3 chat template: a single user turn followed by the opening of the
# assistant turn. STOP_TOKENS end generation as soon as the model emits any
# turn delimiter.
PHI3_TMPL = "<|user|>\n{user}<|end|>\n<|assistant|>\n"
STOP_TOKENS = ["<|end|>", "<|user|>", "<|endoftext|>"]


# ── Model download + load (cold start) ───────────────────────────────────────
# Step 1: make sure the GGUF file is available locally (downloads on a cache miss).
print(f"Fetching {MODEL_FILENAME} from {MODEL_REPO}...")

# Cache under /data when that directory exists (presumably the Space's
# persistent-storage mount — confirm); otherwise fall back to the hub default.
if Path("/data").exists():
    _hub_cache_dir = "/data"
else:
    _hub_cache_dir = None

model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILENAME,
    token=os.environ.get("HF_TOKEN"),  # required if repo is private
    cache_dir=_hub_cache_dir,
)
print(f"Model file: {model_path}")

# Step 2: load the weights into a llama.cpp model shared by all chat requests.
print(f"Loading Llama (n_ctx={N_CTX}, n_threads={N_THREADS})...")
llm = Llama(
    model_path=model_path,
    n_ctx=N_CTX,
    n_threads=N_THREADS,
    n_batch=N_BATCH,
    verbose=False,
)
print("Model loaded. Ready.")


# ── Chat function ────────────────────────────────────────────────────────────
def _history_pairs(history):
    """Normalize chat history into a list of ``(user, assistant)`` string pairs.

    Two history shapes are accepted:

    * pair style  -- ``[(user, assistant), ...]`` (lists or tuples), and
    * message style -- ``[{"role": "user" | "assistant", "content": ...}, ...]``.

    Missing (``None``) texts become empty strings so they are never rendered
    as the literal word "None" inside the prompt.
    """
    pairs = []
    pending_user = None  # user message still waiting for its assistant reply

    for entry in history or []:
        if isinstance(entry, dict):
            role = entry.get("role")
            text = entry.get("content")
            text = "" if text is None else str(text)
            if role == "user":
                # Two user messages in a row: close the first with an empty reply.
                if pending_user is not None:
                    pairs.append((pending_user, ""))
                pending_user = text
            elif role == "assistant":
                if pending_user is None and pairs:
                    # Extra assistant message for the same turn: merge it into
                    # the previous reply rather than inventing an empty user turn.
                    prev_user, prev_reply = pairs[-1]
                    merged = f"{prev_reply}\n{text}" if prev_reply else text
                    pairs[-1] = (prev_user, merged)
                else:
                    pairs.append((pending_user or "", text))
                pending_user = None
            # Any other role is skipped; the system text is injected by
            # build_prompt itself.
        else:
            user_text, assistant_text = entry
            pairs.append((
                "" if user_text is None else str(user_text),
                "" if assistant_text is None else str(assistant_text),
            ))

    if pending_user is not None:
        pairs.append((pending_user, ""))
    return pairs


def build_prompt(history, user_msg):
    """Build a Phi-3 prompt incorporating system message + chat history.

    Phi-3 chat template uses <|system|>, <|user|>, <|assistant|>, <|end|>.
    We collapse the system message into the first user turn for simplicity
    (this is the same approach used during training/eval).

    Args:
        history: Previous turns, either as ``(user, assistant)`` pairs or as
            role/content message dicts. May be empty or ``None``.
        user_msg: The new user message to answer.

    Returns:
        The full prompt string, ending with an open ``<|assistant|>`` turn.
    """
    turns = _history_pairs(history)

    # Fresh conversation: the system message is embedded as a preamble inside
    # the single user turn. The trailing newline of the template is stripped
    # here, exactly as before, so the first-turn prompt stays unchanged.
    if not turns:
        first_user = f"{SYSTEM_MSG}\n\nQuestion: {user_msg}"
        return PHI3_TMPL.format(user=first_user).rstrip("\n")

    parts = []
    # Replay history; only the very first user turn carries the system preamble.
    for i, (u, a) in enumerate(turns):
        if i == 0:
            u = f"{SYSTEM_MSG}\n\nQuestion: {u}"
        parts.append(f"<|user|>\n{u}<|end|>\n<|assistant|>\n{a}<|end|>")
    # Add current turn, leaving the assistant turn open for generation.
    parts.append(f"<|user|>\n{user_msg}<|end|>\n<|assistant|>\n")
    return "\n".join(parts)


def chat_fn(message, history):
    """Generator: yields incremental partial responses for streaming UI.

    Args:
        message: The user's new message.
        history: Prior conversation turns, forwarded to ``build_prompt``.

    Yields:
        The reply accumulated so far (each yield supersedes the previous one).
        If anything fails, the final yield is the text produced so far followed
        by an ``[ERROR: ...]`` note, or just the note when nothing was
        generated yet.
    """
    accum = ""
    try:
        # Prompt construction is inside the try so a malformed history is
        # reported in the chat instead of surfacing as an unhandled error.
        prompt = build_prompt(history, message)
        for chunk in llm(
            prompt,
            max_tokens=MAX_TOKENS,
            temperature=TEMPERATURE,
            top_p=TOP_P,
            stop=STOP_TOKENS,
            stream=True,
        ):
            accum += chunk["choices"][0]["text"]
            yield accum
    except Exception as e:
        # This is the UI boundary: report the failure rather than crash the
        # request, but also leave a trace in the server logs.
        print(f"chat_fn generation failed: {e!r}", flush=True)
        error_note = f"[ERROR: {e}]"
        # Keep whatever was already streamed so the user does not lose it.
        yield f"{accum}\n\n{error_note}" if accum else error_note


# ── UI ───────────────────────────────────────────────────────────────────────
# Clickable starter prompts shown in the chat interface.
EXAMPLES = [
    "I cut my leg badly with an axe in the woods. Walk me through what to do.",
    "How do I find drinkable water if I'm stuck in a forest with no supplies?",
    "It's getting dark and dropping below freezing. How do I build a shelter from what's around?",
    "What edible plants are common in temperate North American forests?",
    "I need to navigate without a compass. How do I find north?",
]

# Markdown blurb rendered under the title. The dashes here were previously
# mis-encoded byte sequences and are now real em dashes.
DESCRIPTION = """
**SurvivalAI Pro** — fine-tuned off-grid survival assistant, running fully on CPU inside this Space.

Built on Phi-3-mini-4k-instruct, fine-tuned on ~150,000 survival-knowledge Q/A pairs covering medical
first aid, water, food, shelter, fire, navigation, signaling, foraging, hunting, and tools.

⚠️ **Prototype — not for clinical or life-critical use.** This model can produce confident-sounding
but incorrect specifics for trap categories like exact drug dosages or precise frequencies. For
survival-skill guidance it scores well; for precise numerical specifics, double-check with an
authoritative reference.
"""

# Chat UI wired to the streaming chat_fn generator.
# NOTE: the title's leading emoji was a mis-encoded byte sequence; it is
# restored as the camping emoji (inferred from the surviving bytes — confirm).
demo = gr.ChatInterface(
    fn          = chat_fn,
    title       = "🏕️ SurvivalAI Pro",
    description = DESCRIPTION,
    examples    = EXAMPLES,
    cache_examples = False,
    theme       = gr.themes.Soft(),
)


if __name__ == "__main__":
    # Turn on request queuing first, then serve on all interfaces at port 7860.
    queued_demo = demo.queue()
    queued_demo.launch(server_name="0.0.0.0", server_port=7860)