File size: 2,102 Bytes
197748f
 
2c0bdd9
 
 
 
 
 
197748f
2e24877
197748f
 
 
 
 
 
2e24877
 
 
 
 
 
197748f
2e24877
197748f
 
 
2e24877
197748f
 
 
 
2e24877
197748f
2c0bdd9
 
 
 
197748f
2c0bdd9
 
 
 
 
197748f
 
 
 
0a7b900
197748f
 
2c0bdd9
0a7b900
 
2c0bdd9
 
197748f
0a7b900
2c0bdd9
 
 
0a7b900
 
2c0bdd9
0a7b900
2c0bdd9
 
 
 
197748f
0a7b900
197748f
2c0bdd9
0a7b900
197748f
0a7b900
197748f
0a7b900
197748f
2e24877
197748f
2e24877
 
 
 
 
 
 
0a7b900
2e24877
197748f
 
 
0a7b900
197748f
2c0bdd9
2e24877
 
 
0a7b900
2e24877
197748f
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import os
import multiprocessing
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import config


# ============================
# Download Model
# ============================

# Optional auth token; hf_hub_download accepts None for public repos.
HF_TOKEN = os.environ.get("HF_TOKEN")

print("Downloading model from Hugging Face Hub...")

# Fetch the GGUF weights and cache them under /tmp so a restart reuses them.
model_path = hf_hub_download(
    token=HF_TOKEN,
    repo_id=config.MODEL_REPO,
    filename=config.MODEL_FILE,
    cache_dir="/tmp/hf_cache",
)

print("Model downloaded successfully:", model_path)


# ============================
# Load Model
# ============================

# Use every core the container exposes for token generation.
CPU_THREADS = multiprocessing.cpu_count()

print("CPU Threads available:", CPU_THREADS)
print("Loading model into memory...")

# Collect construction options first, then build the model in one call.
# use_mmap keeps the GGUF file memory-mapped; verbose=False silences
# llama.cpp's own logging so only our prints appear.
_llama_options = dict(
    model_path=model_path,
    n_ctx=config.CTX_SIZE,
    n_threads=CPU_THREADS,
    n_batch=512,
    use_mmap=True,
    verbose=False,
)
llm = Llama(**_llama_options)

print("Model loaded successfully.")


# ============================
# Prompt Builder
# ============================

SYSTEM_PROMPT = """You are DeepSeek Coder, an expert programming assistant.
Write clean and efficient code.
Only explain when asked.
"""


def build_prompt(message, history):
    """Assemble the flat text prompt sent to the model.

    Args:
        message: The latest user message.
        history: Iterable of (user_msg, assistant_msg) pairs from prior turns.

    Returns:
        The system prompt, a blank line, every prior turn rendered as
        "User: ...\\nAssistant: ...\\n", and finally the new message ending
        with "Assistant:" so the model continues from that point.
    """
    turns = [f"User: {user}\nAssistant: {assistant}\n" for user, assistant in history]
    return SYSTEM_PROMPT + "\n\n" + "".join(turns) + f"User: {message}\nAssistant:"


# ============================
# Generate Response
# ============================

def chat(message, history):
    """Stream the model's reply for a Gradio ChatInterface.

    Args:
        message: The latest user message.
        history: Prior (user, assistant) turns, or None on the first turn.

    Yields:
        The cumulative response text after each generated token, so the UI
        re-renders the growing answer in place.
    """
    prompt = build_prompt(message, history or [])

    token_stream = llm(
        prompt,
        stream=True,
        max_tokens=config.MAX_TOKENS,
        temperature=config.TEMPERATURE,
        top_p=0.95,
    )

    reply = ""
    for chunk in token_stream:
        reply += chunk["choices"][0]["text"]
        yield reply


# ============================
# Launch Gradio ChatInterface
# ============================

# Wire the streaming generator into Gradio's built-in chat UI.
demo = gr.ChatInterface(
    chat,
    title="DeepSeek Coder 1.3B",
    description="Production GGUF model running on llama.cpp",
)

# Bind to all interfaces on the conventional Spaces port.
demo.launch(server_name="0.0.0.0", server_port=7860)