Simonc-44's picture
Update app.py
6dfab63 verified
import os, time, threading
# ── MKL / OpenMP tuning BEFORE torch import ───────────────────────────────────
# On HF free CPU (1 vCPU), inter-op parallelism causes contention.
# MKL_NUM_THREADS=1 avoids spawning extra threads inside BLAS kernels.
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
os.environ.setdefault("VECLIB_MAXIMUM_THREADS", "1")
os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
# oneDNN (formerly MKL-DNN) is the main CPU perf backend for PyTorch; keep its verbose logging switched off.
os.environ.setdefault("DNNL_VERBOSE", "0")
import torch
import torch.backends.mkldnn
# Lock threads after env is set
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
from flask import Flask, request, jsonify, Response, send_from_directory, stream_with_context
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, QuantoConfig
# Flask application object; the @app.route handlers below register on it.
app = Flask(__name__)
# System message that build_prompt() places in the system turn of every prompt.
SYSTEM_PROMPT = (
"You are Cygnis-Alpha, a helpful AI assistant created by CygnisAI.\n\n"
"## ABSOLUTE RULES\n"
"1. LANGUAGE: Detect the user's language. You MUST reply 100% in the SAME language as the user's last message. Never switch to English unless the user asks you to.\n"
"2. IDENTITY: Your name is Cygnis-Alpha, created by CygnisAI.\n"
"3. HONESTY: Never invent facts. If you don't know, say it.\n"
"4. FOCUS: Answer only what was asked. No yapping.\n\n"
"## STYLE\n"
"- Tone: Warm, friendly, professional.\n"
"- Length: Be extremely concise (short answers). Only detail if explicitly requested."
)
# Model identifier passed to the tokenizer/model from_pretrained() calls and echoed in API responses.
MODEL_ID = "CygnisAI/Cygnis-Alpha-1.7B-v0.1-Instruct"
FAVICON_SVG = """<svg width="64" height="64" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg">
<defs>
<linearGradient id="cygnis-organic-grad" x1="12" y1="3" x2="12" y2="21" gradientUnits="userSpaceOnUse">
<stop stop-color="#4FACFE"/><stop offset="1" stop-color="#00F2FE"/>
</linearGradient>
<filter id="shadow-pop" x="-30%" y="-30%" width="160%" height="160%">
<feGaussianBlur in="SourceAlpha" stdDeviation="0.8"/>
<feOffset dx="0" dy="1.2" result="offsetblur"/>
<feComponentTransfer><feFuncA type="linear" slope="0.4"/></feComponentTransfer>
<feMerge><feMergeNode/><feMergeNode in="SourceGraphic"/></feMerge>
</filter>
<filter id="inner-glow">
<feOffset dx="0" dy="0.5"/><feGaussianBlur stdDeviation="0.4" result="blur"/>
<feComposite operator="out" in="SourceGraphic" in2="blur" result="inverse"/>
<feFlood flood-color="black" flood-opacity="0.2" result="color"/>
<feComposite operator="in" in="color" in2="inverse" result="shadow"/>
<feComposite operator="over" in="shadow" in2="SourceGraphic"/>
</filter>
</defs>
<g filter="url(#shadow-pop)">
<path d="M12 3C7.03 3 3 7.03 3 12C3 16.97 7.03 21 12 21C16.97 21 21 16.97 21 12C21 7.03 16.97 3 12 3ZM12 19.2C8.02 19.2 4.8 15.98 4.8 12C4.8 8.02 8.02 4.8 12 4.8C15.98 4.8 19.2 8.02 19.2 12C19.2 15.98 15.98 19.2 12 19.2Z" fill="url(#cygnis-organic-grad)" opacity="0.5"/>
<g filter="url(#inner-glow)">
<path d="M19.2 12C19.2 10.5 17.5 9.5 14.5 9.5V14.5C17.5 14.5 19.2 13.5 19.2 12Z" fill="url(#cygnis-organic-grad)"/>
<path d="M12 19.2C10.5 19.2 9.5 17.5 9.5 14.5L14.5 14.5C14.5 17.5 13.5 19.2 12 19.2Z" fill="url(#cygnis-organic-grad)"/>
<path d="M12 4.8C13.5 4.8 14.5 6.5 14.5 9.5H9.5C9.5 6.5 10.5 4.8 12 4.8Z" fill="url(#cygnis-organic-grad)"/>
</g>
<circle cx="12" cy="12" r="3.2" stroke="url(#cygnis-organic-grad)" stroke-width="2.2" stroke-linecap="round"/>
</g>
</svg>"""
HTML_PAGE = """<!DOCTYPE html>
<html lang="fr">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>CygnisAI | Console</title>
<link rel="icon" type="image/svg+xml" href="/favicon.svg">
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700;900&display=swap" rel="stylesheet">
<style>
*, ::before, ::after { box-sizing: border-box; margin: 0; padding: 0; }
body {
font-family: 'Inter', sans-serif;
background-color: #0c0e10;
background-image: radial-gradient(circle at center, #1a1c1e 0%, #0c0e10 100%);
color: #a5e7ff;
min-height: 100vh;
display: flex; flex-direction: column;
align-items: center; justify-content: center;
overflow: hidden; position: relative;
}
.bg-texture { position:absolute; inset:0; background-image:url('https://www.transparenttextures.com/patterns/carbon-fibre.png'); opacity:0.05; pointer-events:none; }
.scanlines { position:fixed; inset:0; background:repeating-linear-gradient(0deg,transparent,transparent 2px,rgba(0,0,0,0.03) 2px,rgba(0,0,0,0.03) 4px); pointer-events:none; z-index:100; }
.content { position:relative; z-index:10; display:flex; flex-direction:column; align-items:center; gap:2.5rem; padding:1.5rem; }
.hero { display:flex; flex-direction:row; align-items:center; justify-content:center; gap:3rem; animation:fadeUp 0.9s cubic-bezier(0.22,1,0.36,1) both; }
.logo-wrap { width:9rem; height:9rem; transition:transform 0.7s; }
.logo-wrap:hover { transform:scale(1.05); }
.logo-wrap svg { width:100%; height:100%; animation:pulse-glow 3s ease-in-out infinite; }
h1 { font-size:clamp(4rem,10vw,8rem); font-weight:900; letter-spacing:-0.06em; line-height:0.85; text-align:left; text-shadow:0 0 40px rgba(165,231,255,0.4); }
h1 span:last-child { opacity:0.8; }
.badge { display:flex; align-items:center; gap:0.5rem; border:1px solid rgba(255,255,255,0.1); border-radius:9999px; padding:0.5rem 1.25rem; background:rgba(255,255,255,0.05); backdrop-filter:blur(8px); font-size:0.75rem; letter-spacing:0.1em; text-transform:uppercase; color:rgba(165,231,255,0.7); animation:fadeUp 0.9s 0.15s cubic-bezier(0.22,1,0.36,1) both; }
.dot { width:0.5rem; height:0.5rem; border-radius:9999px; background:#34d399; animation:blink 1.8s ease-in-out infinite; }
.card { border:1px solid rgba(255,255,255,0.1); border-radius:1rem; background:rgba(255,255,255,0.05); backdrop-filter:blur(8px); padding:1.5rem 2rem; font-size:0.875rem; max-width:30rem; width:100%; text-align:left; animation:fadeUp 0.9s 0.3s cubic-bezier(0.22,1,0.36,1) both; }
.card-label { color:rgba(255,255,255,0.4); text-transform:uppercase; letter-spacing:0.1em; font-size:0.7rem; margin-bottom:1rem; }
.endpoint-row { display:flex; align-items:center; gap:0.75rem; margin-bottom:0.75rem; }
.method { color:#34d399; font-weight:700; }
code { background:rgba(255,255,255,0.1); padding:0.2rem 0.6rem; border-radius:0.4rem; color:#a5e7ff; font-size:0.8rem; }
.hint { color:rgba(255,255,255,0.5); font-size:0.75rem; line-height:1.6; }
.divider { margin-top:0.75rem; padding-top:0.75rem; border-top:1px solid rgba(255,255,255,0.07); }
.quant-badge { display:inline-flex; align-items:center; gap:0.4rem; background:rgba(79,172,254,0.1); border:1px solid rgba(79,172,254,0.25); border-radius:0.4rem; padding:0.2rem 0.6rem; font-size:0.7rem; color:#4FACFE; margin-top:0.5rem; }
@keyframes pulse-glow { 0%,100%{filter:drop-shadow(0 0 20px rgba(34,211,238,0.4))} 50%{filter:drop-shadow(0 0 45px rgba(34,211,238,0.9))} }
@keyframes fadeUp { from{opacity:0;transform:translateY(24px)} to{opacity:1;transform:translateY(0)} }
@keyframes blink { 0%,100%{opacity:1} 50%{opacity:0.3} }
</style>
</head>
<body>
<div class="bg-texture"></div>
<div class="scanlines"></div>
<div class="content">
<div class="hero">
<div class="logo-wrap">
<svg width="64" height="64" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg">
<defs>
<!-- Dégradé Signature Cygnis - Plus vibrant pour le relief -->
<linearGradient id="cygnis-organic-grad" x1="12" y1="3" x2="12" y2="21" gradientUnits="userSpaceOnUse">
<stop stop-color="#4FACFE" />
<stop offset="1" stop-color="#00F2FE" />
</linearGradient>
<!-- Ombre de flottaison pour l'effet 3D -->
<filter id="shadow-pop" x="-30%" y="-30%" width="160%" height="160%">
<feGaussianBlur in="SourceAlpha" stdDeviation="0.8" />
<feOffset dx="0" dy="1.2" result="offsetblur" />
<feComponentTransfer>
<feFuncA type="linear" slope="0.4" />
</feComponentTransfer>
<feMerge>
<feMergeNode />
<feMergeNode in="SourceGraphic" />
</feMerge>
</filter>
<!-- Profondeur interne pour l'effet bombé -->
<filter id="inner-glow">
<feOffset dx="0" dy="0.5" />
<feGaussianBlur stdDeviation="0.4" result="blur" />
<feComposite operator="out" in="SourceGraphic" in2="blur" result="inverse" />
<feFlood flood-color="black" flood-opacity="0.2" result="color" />
<feComposite operator="in" in="color" in2="inverse" result="shadow" />
<feComposite operator="over" in="shadow" in2="SourceGraphic" />
</filter>
</defs>
<g filter="url(#shadow-pop)">
<!-- Anneau extérieur : Plus épais et doux -->
<path d="M12 3C7.03 3 3 7.03 3 12C3 16.97 7.03 21 12 21C16.97 21 21 16.97 21 12C21 7.03 16.97 3 12 3ZM12 19.2C8.02 19.2 4.8 15.98 4.8 12C4.8 8.02 8.02 4.8 12 4.8C15.98 4.8 19.2 8.02 19.2 12C19.2 15.98 15.98 19.2 12 19.2Z"
fill="url(#cygnis-organic-grad)"
opacity="0.5" />
<!-- Segments : Ils touchent le bord mais avec des courbes fluides -->
<g filter="url(#inner-glow)">
<!-- Segment Droite : Incurvé vers l'anneau -->
<path d="M19.2 12C19.2 10.5 17.5 9.5 14.5 9.5V14.5C17.5 14.5 19.2 13.5 19.2 12Z" fill="url(#cygnis-organic-grad)" />
<!-- Segment Bas : Incurvé vers l'anneau -->
<path d="M12 19.2C10.5 19.2 9.5 17.5 9.5 14.5L14.5 14.5C14.5 17.5 13.5 19.2 12 19.2Z" fill="url(#cygnis-organic-grad)" />
<!-- Segment Haut : Incurvé vers l'anneau -->
<path d="M12 4.8C13.5 4.8 14.5 6.5 14.5 9.5H9.5C9.5 6.5 10.5 4.8 12 4.8Z" fill="url(#cygnis-organic-grad)" />
</g>
<!-- Anneau central : Parfaitement rond et intégré -->
<circle cx="12" cy="12" r="3.2" stroke="url(#cygnis-organic-grad)" stroke-width="2.2" stroke-linecap="round" />
</g>
</svg>
</div>
<h1><span>CygnisAI</span><br><span>Console</span></h1>
</div>
<div class="badge"><span class="dot"></span>Cygnis-Alpha &middot; Online</div>
<div class="card">
<p class="card-label">API Endpoints</p>
<div class="endpoint-row"><span class="method">POST</span><code>/generate</code><span class="hint">&nbsp;— JSON complet</span></div>
<div class="endpoint-row"><span class="method">POST</span><code>/generate/stream</code><span class="hint">&nbsp;— SSE token par token</span></div>
<p class="hint">Body: <code>{"prompt": "...", "max_new_tokens": 256, "fast": true}</code></p>
<div class="divider hint">SSE: <code>data: token</code> &nbsp;·&nbsp; <code>data: [STATS]...</code> &nbsp;·&nbsp; <code>data: [DONE]</code></div>
<div><span class="quant-badge">⚡ INT8 quanto</span></div>
</div>
</div>
</body>
</html>"""
# ─── Model globals ─────────────────────────────────────────────────────────────
# Shared state: written by load_model() (started on a background thread below)
# and read by the request handlers.
tokenizer_g = None  # tokenizer, assigned by load_model()
model_g = None  # model, assigned by load_model()
model_ready = False  # set to True by load_model() on success
model_error = None  # description of the load failure, if load_model() raised
quant_mode = "none" # updated after load
def load_model():
    """Load the tokenizer and model into the module globals, then warm up.

    Intended to run on a background thread so the web server can start while
    loading is still in progress. INT8 quantization via quanto is attempted
    first; if that raises, the model is loaded in float32 instead.

    The outcome is reported through module globals, not a return value:

    * ``tokenizer_g`` / ``model_g`` / ``quant_mode`` are set once loading
      succeeds.
    * ``model_ready`` becomes True only after the warmup pass has returned,
      so request handlers never run concurrently with the warmup.
    * ``model_error`` is set to a non-empty description if anything raises.
    """
    global tokenizer_g, model_g, model_ready, model_error, quant_mode
    try:
        print(f"[CygnisAI] Loading {MODEL_ID} ...")
        tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
        if tok.pad_token is None:
            # No dedicated padding token: reuse EOS.
            tok.pad_token = tok.eos_token

        # ── INT8 quantization via quanto ──────────────────────────────────
        # Request int8 weights; any failure falls back to float32 below.
        try:
            qconfig = QuantoConfig(weights="int8")
            mdl = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                quantization_config=qconfig,
                low_cpu_mem_usage=True,
            )
            quant_mode = "int8-quanto"
            print("[CygnisAI] ✅ INT8 quantization loaded.")
        except Exception as qe:
            print(f"[CygnisAI] INT8 failed ({qe}), falling back to float32 …")
            mdl = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                torch_dtype=torch.float32,
                low_cpu_mem_usage=True,
            )
            quant_mode = "float32"
        mdl.eval()

        # ── torch.compile ─────────────────────────────────────────────────
        # Best effort: if wrapping the model fails, serve the uncompiled one.
        try:
            mdl = torch.compile(mdl, mode="reduce-overhead", fullgraph=False)
            print("[CygnisAI] torch.compile OK.")
        except Exception as ce:
            print(f"[CygnisAI] torch.compile skipped: {ce}")

        # _warmup() reads these globals, so publish them first ...
        tokenizer_g = tok
        model_g = mdl
        # ── Warmup: run before the first real request ─────────────────────
        _warmup()
        # ... but only advertise readiness once the warmup has returned;
        # otherwise guard() would let requests in while it is still running.
        model_ready = True
        print(f"[CygnisAI] Model ready. Mode: {quant_mode}")
    except Exception as e:
        # guard()/health() test model_error for truthiness, so it must never
        # be an empty string (str(e) is "" for argument-less exceptions).
        model_error = str(e) or repr(e)
        print(f"[CygnisAI] Load error: {model_error}")
def _warmup():
    """Run one tiny greedy generation on the loaded model.

    Reads the ``tokenizer_g`` and ``model_g`` globals. Any exception is
    printed and swallowed, so a failed warmup never aborts the caller.
    """
    try:
        print("[CygnisAI] Warming up ...")
        encoded = tokenizer_g("Hi", return_tensors="pt")
        warm_kwargs = {
            "max_new_tokens": 4,
            "do_sample": False,
            "use_cache": True,
            "pad_token_id": tokenizer_g.eos_token_id,
        }
        with torch.inference_mode():
            model_g.generate(**encoded, **warm_kwargs)
        print("[CygnisAI] Warmup done — ready to serve.")
    except Exception as exc:
        print(f"[CygnisAI] Warmup error (non-fatal): {exc}")
# Kick off model loading on a daemon thread as soon as the module is imported;
# load_model() reports its outcome through the model globals.
threading.Thread(target=load_model, daemon=True).start()
# ─── Helpers ──────────────────────────────────────────────────────────────────
def build_prompt(user_prompt: str) -> str:
    """Wrap *user_prompt* in the ``<|im_start|>`` / ``<|im_end|>`` chat template.

    The result contains the system turn (``SYSTEM_PROMPT``), the user turn,
    and an opened assistant turn for the model to complete.

    ``user_prompt`` is untrusted request data. Literal ``<|im_start|>`` and
    ``<|im_end|>`` markers inside it are removed so a caller cannot close the
    user turn early and append a forged system or assistant turn.
    """
    markers = ("<|im_start|>", "<|im_end|>")
    cleaned = user_prompt
    # Repeat until stable: deleting one marker can splice a new one together
    # (e.g. "<|im_<|im_end|>end|>" -> "<|im_end|>").
    while any(marker in cleaned for marker in markers):
        for marker in markers:
            cleaned = cleaned.replace(marker, "")
    return (
        f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
        f"<|im_start|>user\n{cleaned}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )
def guard():
    """Report whether the model is able to serve a request.

    Returns:
        ``(True, None)`` when the model is loaded. Otherwise ``(False, resp)``
        where ``resp`` is a ``(json, status)`` pair: status 500 if loading
        failed, 503 while loading is still in progress.
    """
    if model_ready and not model_error:
        return True, None
    if model_error:
        payload, status = {"error": f"Model failed to load: {model_error}"}, 500
    else:
        payload, status = {"error": "Model is loading, retry in a moment."}, 503
    return False, (jsonify(payload), status)
def parse_body():
    """Parse and validate the JSON body of a generation request.

    Returns:
        A 5-tuple ``(prompt, max_tok, temperature, fast, err)``. On success
        ``err`` is None. On invalid input the first four items are None and
        ``err`` is a ``(json, 400)`` pair for the route to return as-is.

    Accepted fields:
        prompt: required; must be non-empty after stripping whitespace.
        max_new_tokens: integer, default 256, clamped to 1..512.
        temperature: number, default 0.7; must be positive and finite when
            sampling is requested (``fast`` false).
        fast: boolean, default True (greedy decoding).
    """
    def bad_request(message):
        return None, None, None, None, (jsonify({"error": message}), 400)

    data = request.get_json(silent=True)
    if not data:
        return bad_request("Request body must be valid JSON.")
    # A JSON array/string/number parses fine but has no .get(); reject it
    # here instead of failing later with AttributeError (HTTP 500).
    if not isinstance(data, dict):
        return bad_request("Request body must be a JSON object.")

    # JSON null must count as "missing", not become the literal text "None".
    raw_prompt = data.get("prompt")
    prompt = "" if raw_prompt is None else str(raw_prompt).strip()
    if not prompt:
        return bad_request("Field 'prompt' is required.")

    try:
        max_tok = int(data.get("max_new_tokens", 256))
    except (TypeError, ValueError, OverflowError):
        return bad_request("Field 'max_new_tokens' must be an integer.")
    # Clamp both ends: zero/negative budgets are as invalid as oversized ones.
    max_tok = max(1, min(max_tok, 512))

    raw_fast = data.get("fast", True)
    if isinstance(raw_fast, str):
        # bool("false") is True; interpret common textual spellings instead.
        fast = raw_fast.strip().lower() not in ("", "0", "false", "no", "off")
    else:
        fast = bool(raw_fast)  # True = greedy (~2× faster)

    try:
        temperature = float(data.get("temperature", 0.7))
    except (TypeError, ValueError):
        return bad_request("Field 'temperature' must be a number.")
    # Temperature is only used when sampling; there it must be a positive,
    # finite value (the chained comparison also rejects NaN and infinity).
    if not fast and not (0 < temperature < float("inf")):
        return bad_request("Field 'temperature' must be a positive number.")

    return prompt, max_tok, temperature, fast, None
def make_gen_kwargs(inputs, max_tok, temperature, fast, streamer=None):
    """Assemble the keyword arguments for ``model_g.generate``.

    Args:
        inputs: tokenizer output; its entries are forwarded unchanged.
        max_tok: value for ``max_new_tokens``.
        temperature: sampling temperature, used only when ``fast`` is false.
        fast: when true, greedy decoding; otherwise sampling with
            ``top_p=0.9`` and ``repetition_penalty=1.15``.
        streamer: optional streamer object, forwarded when truthy.

    Returns:
        A new dict of keyword arguments.
    """
    eos_id = tokenizer_g.eos_token_id
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=max_tok,
        use_cache=True,
        pad_token_id=eos_id,
        eos_token_id=eos_id,
    )
    if fast:
        # Greedy decoding: the fastest path.
        decoding = {"do_sample": False}
    else:
        decoding = {
            "do_sample": True,
            "temperature": temperature,
            "top_p": 0.9,
            "repetition_penalty": 1.15,
        }
    gen_kwargs.update(decoding)
    if streamer:
        gen_kwargs["streamer"] = streamer
    return gen_kwargs
# ─── Routes ───────────────────────────────────────────────────────────────────
@app.route("/favicon.svg")
def favicon_svg():
    """Serve the inline SVG logo as the favicon."""
    svg_response = Response(FAVICON_SVG, mimetype="image/svg+xml")
    return svg_response
@app.route("/favicon.ico")
@app.route("/favicon.png")
def favicon_fallback():
    """Serve a favicon file from this module's directory, else the inline SVG.

    ``favicon.png`` is preferred over ``favicon.ico`` regardless of which of
    the two URLs was requested.
    """
    app_dir = os.path.dirname(os.path.abspath(__file__))
    on_disk = next(
        (
            fname
            for fname in ("favicon.png", "favicon.ico")
            if os.path.exists(os.path.join(app_dir, fname))
        ),
        None,
    )
    if on_disk is None:
        return Response(FAVICON_SVG, mimetype="image/svg+xml")
    return send_from_directory(app_dir, on_disk)
@app.route("/", methods=["GET"])
def home():
    """Serve the static landing page."""
    page = Response(HTML_PAGE, mimetype="text/html")
    return page
@app.route("/health", methods=["GET"])
def health():
    """Report model status: 200 when ready, 503 while loading, 500 on load failure."""
    if model_ready and not model_error:
        return jsonify({"status": "ok", "model": MODEL_ID, "quant": quant_mode})
    if model_error:
        body, status = {"status": "error", "detail": model_error}, 500
    else:
        body, status = {"status": "loading"}, 503
    return jsonify(body), status
# ── /generate ─────────────────────────────────────────────────────────────────
@app.route("/generate", methods=["POST"])
def generate():
    """Run one blocking generation and return the whole completion as JSON.

    Responses:
        200: ``{"response", "model", "quant", "elapsed_sec", "tokens", "tps"}``.
        400 / 500 / 503: passed through from ``parse_body()`` / ``guard()``.
        500: generation raised; the message is returned in ``"error"``.
        504: generation did not finish within the time budget.
    """
    ok, err = guard()
    if not ok:
        return err
    prompt, max_tok, temperature, fast, err = parse_body()
    if err:
        return err

    timeout_s = 120
    inputs = tokenizer_g(build_prompt(prompt), return_tensors="pt")
    kwargs = make_gen_kwargs(inputs, max_tok, temperature, fast)
    # A Python thread cannot be killed. Without a cap, a request that already
    # got its 504 would keep generating in the background; ``max_time`` makes
    # generate() stop by itself shortly after the budget has elapsed.
    kwargs.setdefault("max_time", timeout_s)
    n_prompt = inputs["input_ids"].shape[-1]
    result = {}

    def _infer():
        try:
            with torch.inference_mode():
                out = model_g.generate(**kwargs)
            # Keep only the newly generated ids (drop the echoed prompt).
            new_ids = out[0][n_prompt:]
            text = tokenizer_g.decode(new_ids, skip_special_tokens=True)
            result["text"] = text.split("<|im_end|>")[0].strip() or "Je suis Cygnis-Alpha."
            result["n_tokens"] = len(new_ids)
        except Exception as e:
            result["error"] = str(e)

    t0 = time.time()
    # Daemon, like the streaming route's worker, so an abandoned generation
    # can never block interpreter shutdown.
    worker = threading.Thread(target=_infer, daemon=True)
    worker.start()
    worker.join(timeout=timeout_s)
    elapsed = round(time.time() - t0, 2)

    if worker.is_alive():
        return jsonify({"error": f"Timeout >{timeout_s}s. Reduce max_new_tokens."}), 504
    if "error" in result:
        return jsonify({"error": result["error"]}), 500

    n = result.get("n_tokens", 0)
    tps = round(n / elapsed, 2) if elapsed > 0 else 0
    return jsonify({
        "response": result["text"],
        "model": MODEL_ID, "quant": quant_mode,
        "elapsed_sec": elapsed, "tokens": n, "tps": tps,
    })
# ── /generate/stream (SSE) ───────────────────────────────────────────────────
def _sse_event(payload):
    """Frame *payload* as one Server-Sent Event.

    Each line of the payload gets its own ``data:`` field. Writing a raw
    newline into a single ``data:`` line would end the field (or the whole
    event) early, so multi-line chunks would reach the client corrupted.
    """
    text = str(payload).replace("\r\n", "\n").replace("\r", "\n")
    return "".join(f"data: {line}\n" for line in text.split("\n")) + "\n"


@app.route("/generate/stream", methods=["POST"])
def generate_stream():
    """Stream a generation as Server-Sent Events.

    Emits one event per decoded text chunk, then ``[ERROR] ...`` if something
    failed, then ``[STATS] ...`` and finally ``[DONE]``. Validation failures
    are returned as plain JSON responses by ``guard()`` / ``parse_body()``.
    """
    ok, err = guard()
    if not ok:
        return err
    prompt, max_tok, temperature, fast, err = parse_body()
    if err:
        return err

    inputs = tokenizer_g(build_prompt(prompt), return_tensors="pt")
    streamer = TextIteratorStreamer(tokenizer_g, skip_prompt=True, skip_special_tokens=True, timeout=15.0)
    kwargs = make_gen_kwargs(inputs, max_tok, temperature, fast, streamer=streamer)
    gen_err = {}

    def _run():
        try:
            with torch.inference_mode():
                model_g.generate(**kwargs)
        except Exception as e:
            gen_err["msg"] = str(e) or type(e).__name__

    def event_stream():
        t0, n = time.time(), 0
        worker = threading.Thread(target=_run, daemon=True)
        worker.start()
        try:
            for token in streamer:
                clean = token.replace("<|im_end|>", "")
                if clean:
                    n += 1
                    yield _sse_event(clean)
        except Exception as e:
            # The streamer's timeout surfaces as an exception with an empty
            # message; fall back to the type name so the event is not blank.
            yield _sse_event(f"[ERROR] {str(e) or type(e).__name__}")
        finally:
            # Also runs when the client disconnects (GeneratorExit). Nothing
            # may be yielded from here: yielding while the generator is being
            # closed raises "RuntimeError: generator ignored GeneratorExit".
            worker.join(timeout=5)

        # Trailer: reached only when the stream was not closed by the client.
        elapsed = round(time.time() - t0, 2)
        tps = round(n / elapsed, 2) if elapsed > 0 else 0
        if gen_err:
            yield _sse_event(f"[ERROR] {gen_err['msg']}")
        yield _sse_event(f"[STATS] tokens={n} elapsed={elapsed}s tps={tps} quant={quant_mode}")
        yield _sse_event("[DONE]")

    return Response(
        stream_with_context(event_stream()),
        mimetype="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no", "Connection": "keep-alive"},
    )
# ─── Entry ────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Direct-run entry point: listen on all interfaces, on $PORT when set
    # (7860 otherwise), with Flask's threaded server and debug disabled.
    port = int(os.environ.get("PORT", 7860))
    run_options = {"host": "0.0.0.0", "port": port, "debug": False, "threaded": True}
    app.run(**run_options)