Spaces:
Sleeping
Sleeping
File size: 2,102 Bytes
197748f 2c0bdd9 197748f 2e24877 197748f 2e24877 197748f 2e24877 197748f 2e24877 197748f 2e24877 197748f 2c0bdd9 197748f 2c0bdd9 197748f 0a7b900 197748f 2c0bdd9 0a7b900 2c0bdd9 197748f 0a7b900 2c0bdd9 0a7b900 2c0bdd9 0a7b900 2c0bdd9 197748f 0a7b900 197748f 2c0bdd9 0a7b900 197748f 0a7b900 197748f 0a7b900 197748f 2e24877 197748f 2e24877 0a7b900 2e24877 197748f 0a7b900 197748f 2c0bdd9 2e24877 0a7b900 2e24877 197748f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 | import os
import multiprocessing
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import config
# ============================
# Download Model
# ============================
# Optional Hugging Face access token (required only for gated/private repos);
# None when the env var is unset, which hf_hub_download accepts.
HF_TOKEN = os.environ.get("HF_TOKEN")
print("Downloading model from Hugging Face Hub...")
# Fetch the single GGUF weights file from the Hub and return its local path.
# cache_dir points at /tmp — presumably to avoid a read-only or size-limited
# home directory on the Spaces container (TODO confirm deployment constraint).
model_path = hf_hub_download(
    repo_id=config.MODEL_REPO,
    filename=config.MODEL_FILE,
    token=HF_TOKEN,
    cache_dir="/tmp/hf_cache"
)
print("Model downloaded successfully:", model_path)
# ============================
# Load Model
# ============================
# Use every available core for llama.cpp's CPU inference threads.
CPU_THREADS = multiprocessing.cpu_count()
print("CPU Threads available:", CPU_THREADS)
print("Loading model into memory...")
# Single module-level Llama instance shared by all chat requests.
llm = Llama(
    model_path=model_path,
    n_ctx=config.CTX_SIZE,   # context window size in tokens, from config
    n_threads=CPU_THREADS,
    n_batch=512,             # prompt-evaluation batch size
    use_mmap=True,           # memory-map the weights rather than copying into RAM
    verbose=False            # suppress llama.cpp's per-call logging
)
print("Model loaded successfully.")
# ============================
# Prompt Builder
# ============================
SYSTEM_PROMPT = """You are DeepSeek Coder, an expert programming assistant.
Write clean and efficient code.
Only explain when asked.
"""


def build_prompt(message, history):
    """Build the flat text prompt sent to the model.

    The prompt is the system prompt, the prior conversation rendered as
    alternating ``User:`` / ``Assistant:`` lines, and finally the new user
    message with a trailing ``Assistant:`` cue for the model to complete.

    Args:
        message: The new user message (str).
        history: Prior turns. Accepts both Gradio history formats:
            - legacy "tuples": ``[(user_msg, assistant_msg), ...]``
            - "messages": ``[{"role": ..., "content": ...}, ...]``
            ``None`` is treated as an empty history.

    Returns:
        The full prompt string.
    """
    prompt = SYSTEM_PROMPT + "\n\n"
    for turn in history or []:
        if isinstance(turn, dict):
            # Gradio >= 4 "messages" format: one dict per message.
            # Anything not explicitly from the user is rendered as Assistant.
            role = "User" if turn.get("role") == "user" else "Assistant"
            prompt += f"{role}: {turn.get('content', '')}\n"
        else:
            # Legacy tuple format: one (user, assistant) pair per turn.
            user_msg, assistant_msg = turn
            prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
    prompt += f"User: {message}\nAssistant:"
    return prompt
# ============================
# Generate Response
# ============================
def chat(message, history):
    """Stream a completion for ``message`` given the chat ``history``.

    Generator used by gr.ChatInterface: yields the cumulative response text
    after each token so the UI can render the reply incrementally.
    """
    prompt = build_prompt(message, history or [])
    stream = llm(
        prompt,
        max_tokens=config.MAX_TOKENS,
        temperature=config.TEMPERATURE,
        top_p=0.95,
        stream=True,
    )
    pieces = []
    for chunk in stream:
        pieces.append(chunk["choices"][0]["text"])
        yield "".join(pieces)
# ============================
# Launch Gradio ChatInterface
# ============================
# Because `chat` is a generator, ChatInterface streams partial replies to the UI.
demo = gr.ChatInterface(
    fn=chat,
    title="DeepSeek Coder 1.3B",
    description="Production GGUF model running on llama.cpp"
)
# Bind on all interfaces; 7860 is the port Hugging Face Spaces expects.
demo.launch(
    server_name="0.0.0.0",
    server_port=7860
)