# Spaces: OrbitMC / hf_dead
# Runtime error
# File size: 8,267 Bytes

import os
import time
import json
import threading
from flask import Flask, request, Response, stream_with_context
from huggingface_hub import hf_hub_download

# --- Config ---
# Local directory that holds the downloaded model file.
MODEL_DIR = "/tmp/models"
# Hugging Face Hub repository and the GGUF file inside it to serve.
REPO = "mradermacher/LFM2-2.6B-Uncensored-X64-GGUF"
FILENAME = "LFM2-2.6B-Uncensored-X64.Q3_K_S.gguf"
# Path at which the model file is expected after download.
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)

# exist_ok=True: do not fail when the directory is already there.
os.makedirs(MODEL_DIR, exist_ok=True)

# --- Download model ---
# The download is skipped whenever a file already exists at MODEL_PATH.
if not os.path.exists(MODEL_PATH):
    print(f"Downloading {FILENAME} ...")
    hf_hub_download(repo_id=REPO, filename=FILENAME, local_dir=MODEL_DIR)
    print("Download complete.")

# --- Load vLLM engine ---
# vLLM is imported here, after the download step, not at the top of the file.
from vllm import LLM, SamplingParams

print("Loading model with vLLM ...")
# The engine is created once at import time and shared by every request.
# NOTE(review): the tokenizer comes from a Llama-2 repository while the weights
# are an LFM2 GGUF file — confirm the tokenizer actually matches this model and
# that the repository is accessible from the deployment environment.
# NOTE(review): `device` and `gpu_memory_utilization=0.0` should be checked
# against the installed vLLM version — TODO confirm they are accepted.
llm_engine = LLM(
    model=MODEL_PATH,
    tokenizer="meta-llama/Llama-2-7b-hf",  # fallback tokenizer for GGUF
    max_model_len=2048,
    dtype="float32",       # CPU needs float32
    device="cpu",
    enforce_eager=True,    # no CUDA graphs on CPU
    gpu_memory_utilization=0.0,
)
print("Model loaded.")

# Flask application object; the routes below are registered on it.
app = Flask(__name__)

# Single-page chat UI returned by the "/" route.
#
# The embedded script POSTs the conversation to /chat and reads the reply as a
# stream of "data: <json>" lines. Notes on the client logic:
#   * Incoming bytes are buffered and only complete lines are parsed, because a
#     network chunk may end in the middle of an event.
#   * Temperature falls back to 0.7 only when the field is not a number, so an
#     explicit 0 is sent as 0.
#   * A second send is refused while a reply is pending (the Enter key bypasses
#     the disabled button), and non-2xx responses are reported as errors.
# This is a regular (non-raw) string, so '\\n' below reaches the browser as the
# JavaScript escape '\n'.
HTML_PAGE = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>LFM2-2.6B Chat</title>
<style>
  * { box-sizing: border-box; margin: 0; padding: 0; }
  body { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; background: #1a1a2e; color: #eee; display: flex; justify-content: center; align-items: center; min-height: 100vh; }
  .container { width: 100%; max-width: 800px; padding: 20px; }
  h1 { text-align: center; margin-bottom: 6px; color: #e94560; font-size: 1.5rem; }
  .subtitle { text-align: center; margin-bottom: 20px; color: #888; font-size: 0.85rem; }
  .chatbox { background: #16213e; border-radius: 12px; padding: 20px; height: 55vh; overflow-y: auto; margin-bottom: 15px; border: 1px solid #0f3460; }
  .msg { margin-bottom: 14px; line-height: 1.6; white-space: pre-wrap; }
  .msg.user { color: #e94560; }
  .msg.user::before { content: "You: "; font-weight: bold; }
  .msg.bot { color: #a8d8ea; }
  .msg.bot::before { content: "AI: "; font-weight: bold; }
  .stats { color: #666; font-size: 0.78rem; margin-top: 4px; }
  .input-row { display: flex; gap: 10px; }
  textarea { flex: 1; padding: 12px; border-radius: 8px; border: 1px solid #0f3460; background: #16213e; color: #eee; font-size: 1rem; resize: none; height: 60px; font-family: inherit; }
  textarea:focus { outline: none; border-color: #e94560; }
  button { padding: 12px 28px; border-radius: 8px; border: none; background: #e94560; color: #fff; font-size: 1rem; cursor: pointer; font-weight: bold; }
  button:hover { background: #c73650; }
  button:disabled { background: #555; cursor: not-allowed; }
  .settings { display: flex; gap: 15px; margin-bottom: 15px; flex-wrap: wrap; align-items: center; }
  .settings label { font-size: 0.85rem; color: #aaa; }
  .settings input { background: #16213e; border: 1px solid #0f3460; color: #eee; padding: 5px 8px; border-radius: 6px; width: 80px; }
</style>
</head>
<body>
<div class="container">
  <h1>LFM2-2.6B Uncensored</h1>
  <p class="subtitle">Running on CPU via vLLM</p>
  <div class="settings">
    <label>Max tokens: <input type="number" id="maxTokens" value="256" min="16" max="2048"></label>
    <label>Temperature: <input type="number" id="temperature" value="0.7" min="0" max="2" step="0.1"></label>
    <label>Top-P: <input type="number" id="topP" value="0.9" min="0" max="1" step="0.05"></label>
    <button onclick="clearChat()" style="padding:5px 14px;font-size:0.85rem;">Clear</button>
  </div>
  <div class="chatbox" id="chatbox"></div>
  <div class="input-row">
    <textarea id="userInput" placeholder="Type your message..." onkeydown="if(event.key==='Enter'&&!event.shiftKey){event.preventDefault();sendMsg();}"></textarea>
    <button id="sendBtn" onclick="sendMsg()">Send</button>
  </div>
</div>
<script>
  const chatbox = document.getElementById('chatbox');
  const userInput = document.getElementById('userInput');
  const sendBtn = document.getElementById('sendBtn');
  let history = [];

  function addMsg(role, text) {
    const div = document.createElement('div');
    div.className = 'msg ' + role;
    div.textContent = text;
    chatbox.appendChild(div);
    chatbox.scrollTop = chatbox.scrollHeight;
    return div;
  }

  function clearChat() { history = []; chatbox.innerHTML = ''; }

  async function sendMsg() {
    // The Enter key can fire while a reply is pending; never run two requests at once.
    if (sendBtn.disabled) return;
    const text = userInput.value.trim();
    if (!text) return;
    userInput.value = '';
    addMsg('user', text);
    history.push({role:'user', content:text});
    sendBtn.disabled = true;

    const botDiv = addMsg('bot', '');
    botDiv.textContent = '';

    // 0 is a valid temperature, so fall back only when the field is not a number.
    const tempVal = parseFloat(document.getElementById('temperature').value);

    try {
      const resp = await fetch('/chat', {
        method: 'POST',
        headers: {'Content-Type':'application/json'},
        body: JSON.stringify({
          messages: history,
          max_tokens: parseInt(document.getElementById('maxTokens').value)||256,
          temperature: Number.isNaN(tempVal) ? 0.7 : tempVal,
          top_p: parseFloat(document.getElementById('topP').value)||0.9
        })
      });
      if (!resp.ok) throw new Error('Server returned HTTP ' + resp.status);
      const reader = resp.body.getReader();
      const decoder = new TextDecoder();
      let full = '';
      let pending = '';

      while (true) {
        const {done, value} = await reader.read();
        if (done) break;
        // A chunk may end mid-event: keep the trailing partial line in the
        // buffer and parse only the lines that are complete.
        pending += decoder.decode(value, {stream:true});
        const lines = pending.split('\\n');
        pending = lines.pop();
        for (const line of lines) {
          if (!line.startsWith('data: ')) continue;
          const d = line.slice(6);
          if (d === '[DONE]') continue;
          try {
            const j = JSON.parse(d);
            if (j.token) { full += j.token; }
            if (j.stats) {
              const s = document.createElement('div');
              s.className = 'stats';
              s.textContent = j.stats;
              botDiv.textContent = full;
              botDiv.appendChild(s);
            } else {
              botDiv.textContent = full;
            }
          } catch(e){}
        }
      }
      history.push({role:'assistant', content:full});
    } catch(e) {
      botDiv.textContent = 'Error: ' + e.message;
    }
    sendBtn.disabled = false;
    chatbox.scrollTop = chatbox.scrollHeight;
  }
</script>
</body>
</html>
"""


def build_prompt(messages):
    """Render a chat history as one prompt string ending in an open assistant turn.

    Each message is a mapping with "role" and "content" keys. Messages whose
    role is "user", "assistant" or "system" become a ``<|role|>`` header line
    followed by the content; messages with any other role are skipped. A final
    ``<|assistant|>`` header is appended so the model continues as the
    assistant.
    """
    recognised = ("user", "assistant", "system")
    segments = []
    for entry in messages:
        # Both keys are read for every message, including skipped ones.
        speaker, body = entry["role"], entry["content"]
        if speaker in recognised:
            segments.append(f"<|{speaker}|>\n{body}\n")
    segments.append("<|assistant|>\n")
    return "".join(segments)


@app.route("/")
def index():
    """Serve the chat page by returning the HTML_PAGE string as the response body."""
    return HTML_PAGE


# One engine instance is shared by every request while the server runs with
# threaded=True, so calls into it are serialised with this lock.
_ENGINE_LOCK = threading.Lock()


def _json_error(message, status=400):
    """Build a JSON ``{"error": message}`` response with the given HTTP status."""
    return Response(
        json.dumps({"error": message}),
        status=status,
        mimetype="application/json",
    )


@app.route("/chat", methods=["POST"])
def chat():
    """Generate a reply for the posted conversation and return it as an event stream.

    Request body (JSON object):
        messages:    list of {"role": ..., "content": ...} objects (default []).
        max_tokens:  integer, clamped to the range 1..2048 (default 256).
        temperature: number (default 0.7).
        top_p:       number (default 0.9).

    Returns:
        A ``text/event-stream`` response carrying, in order, one
        ``{"token": <text>}`` event per engine output, one ``{"stats": <text>}``
        event, and a final ``[DONE]`` marker. Malformed input is rejected with
        a 400 JSON error before any streaming starts.

    Exceptions raised by the engine during generation are not caught here and
    abort the stream.
    """
    # silent=True yields None instead of raising on a missing or invalid body,
    # so every malformed request takes the same 400 path.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return _json_error("Request body must be a JSON object.")

    messages = data.get("messages", [])
    well_formed = isinstance(messages, list) and all(
        isinstance(m, dict) and "role" in m and "content" in m for m in messages
    )
    if not well_formed:
        return _json_error(
            "'messages' must be a list of objects with 'role' and 'content'."
        )

    try:
        max_tokens = int(data.get("max_tokens", 256))
        temperature = float(data.get("temperature", 0.7))
        top_p = float(data.get("top_p", 0.9))
    except (TypeError, ValueError, OverflowError):
        return _json_error("'max_tokens', 'temperature' and 'top_p' must be numbers.")
    # Keep the original upper bound and add a lower one so zero or negative
    # values do not reach the engine.
    max_tokens = max(1, min(max_tokens, 2048))

    prompt = build_prompt(messages)

    try:
        sampling_params = SamplingParams(
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stop=["<|user|>", "<|assistant|>", "<|end|>", "<|endoftext|>"],
        )
    except (TypeError, ValueError) as exc:
        # Out-of-range sampling values are a client error, not a server fault.
        return _json_error(f"Invalid sampling parameters: {exc}")

    def generate():
        # Hold the lock only while the engine is busy; the timer starts after
        # acquisition so time spent waiting for other requests is not counted.
        with _ENGINE_LOCK:
            start = time.perf_counter()
            results = llm_engine.generate([prompt], sampling_params, use_tqdm=False)
            elapsed = time.perf_counter() - start

        token_count = 0
        # This is not incremental streaming: the text of each finished output
        # is sent as a single "token" event.
        for request_output in results:
            completion = request_output.outputs[0]
            token_count = len(completion.token_ids)
            yield f"data: {json.dumps({'token': completion.text})}\n\n"

        tps = token_count / elapsed if elapsed > 0 else 0
        stats = f"{token_count} tokens in {elapsed:.1f}s \u2014 {tps:.2f} tokens/s"
        yield f"data: {json.dumps({'stats': stats})}\n\n"
        yield "data: [DONE]\n\n"

    return Response(stream_with_context(generate()), mimetype="text/event-stream")


if __name__ == "__main__":
    # Run Flask's built-in server on all interfaces; the port is read from the
    # PORT environment variable and falls back to 7860 when it is unset.
    listen_port = int(os.getenv("PORT", 7860))
    app.run(debug=False, threaded=True, host="0.0.0.0", port=listen_port)