File size: 2,102 Bytes
197748f
 
2c0bdd9
 
 
 
 
 
197748f
2e24877
197748f
 
 
 
 
 
2e24877
 
 
 
 
 
197748f
2e24877
197748f
 
 
2e24877
197748f
 
 
 
2e24877
197748f
2c0bdd9
 
 
 
197748f
2c0bdd9
 
 
 
 
197748f
 
 
 
0a7b900
197748f
 
2c0bdd9
0a7b900
 
2c0bdd9
 
197748f
0a7b900
2c0bdd9
 
 
0a7b900
 
2c0bdd9
0a7b900
2c0bdd9
 
 
 
197748f
0a7b900
197748f
2c0bdd9
0a7b900
197748f
0a7b900
197748f
0a7b900
197748f
2e24877
197748f
2e24877
 
 
 
 
 
 
0a7b900
2e24877
197748f
 
 
0a7b900
197748f
2c0bdd9
2e24877
 
 
0a7b900
2e24877
197748f
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import os
import multiprocessing
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import config


# ============================
# Download Model
# ============================

# Optional auth token; hf_hub_download accepts None for public repos.
HF_TOKEN = os.environ.get("HF_TOKEN")

print("Downloading model from Hugging Face Hub...")

# Fetch the GGUF weights and cache them under /tmp so a restart reuses them.
model_path = hf_hub_download(
    token=HF_TOKEN,
    repo_id=config.MODEL_REPO,
    filename=config.MODEL_FILE,
    cache_dir="/tmp/hf_cache",
)

print("Model downloaded successfully:", model_path)


# ============================
# Load Model
# ============================

# Use every core the container exposes for token generation.
CPU_THREADS = multiprocessing.cpu_count()

print("CPU Threads available:", CPU_THREADS)
print("Loading model into memory...")

# Collect construction options first, then build the model in one call.
# use_mmap keeps the GGUF file memory-mapped; verbose=False silences
# llama.cpp's own logging so only our prints appear.
_llama_options = dict(
    model_path=model_path,
    n_ctx=config.CTX_SIZE,
    n_threads=CPU_THREADS,
    n_batch=512,
    use_mmap=True,
    verbose=False,
)
llm = Llama(**_llama_options)

print("Model loaded successfully.")


# ============================
# Prompt Builder
# ============================

SYSTEM_PROMPT = """You are DeepSeek Coder, an expert programming assistant.
Write clean and efficient code.
Only explain when asked.
"""


def build_prompt(message, history):
    """Assemble the flat text prompt sent to the model.

    Args:
        message: The latest user message.
        history: Iterable of (user_msg, assistant_msg) pairs from prior turns.

    Returns:
        The system prompt, a blank line, every prior turn rendered as
        "User: ...\\nAssistant: ...\\n", and finally the new message ending
        with "Assistant:" so the model continues from that point.
    """
    turns = [f"User: {user}\nAssistant: {assistant}\n" for user, assistant in history]
    return SYSTEM_PROMPT + "\n\n" + "".join(turns) + f"User: {message}\nAssistant:"


# ============================
# Generate Response
# ============================

def chat(message, history):
    """Stream the model's reply for a Gradio ChatInterface.

    Args:
        message: The latest user message.
        history: Prior (user, assistant) turns, or None on the first turn.

    Yields:
        The cumulative response text after each generated token, so the UI
        re-renders the growing answer in place.
    """
    prompt = build_prompt(message, history or [])

    token_stream = llm(
        prompt,
        stream=True,
        max_tokens=config.MAX_TOKENS,
        temperature=config.TEMPERATURE,
        top_p=0.95,
    )

    reply = ""
    for chunk in token_stream:
        reply += chunk["choices"][0]["text"]
        yield reply


# ============================
# Launch Gradio ChatInterface
# ============================

# Wire the streaming generator into Gradio's built-in chat UI.
demo = gr.ChatInterface(
    chat,
    title="DeepSeek Coder 1.3B",
    description="Production GGUF model running on llama.cpp",
)

# Bind to all interfaces on the conventional Spaces port.
demo.launch(server_name="0.0.0.0", server_port=7860)