Spaces:
Sleeping
Sleeping
| import os, time, threading | |
| # ── MKL / OpenMP tuning BEFORE torch import ─────────────────────────────────── | |
| # On HF free CPU (1 vCPU), inter-op parallelism causes contention. | |
| # MKL_NUM_THREADS=1 avoids spawning extra threads inside BLAS kernels. | |
| os.environ.setdefault("MKL_NUM_THREADS", "1") | |
| os.environ.setdefault("OMP_NUM_THREADS", "1") | |
| os.environ.setdefault("OPENBLAS_NUM_THREADS", "1") | |
| os.environ.setdefault("VECLIB_MAXIMUM_THREADS", "1") | |
| os.environ.setdefault("NUMEXPR_NUM_THREADS", "1") | |
| # MKL-DNN (oneDNN) is the main CPU perf backend for PyTorch | |
| os.environ.setdefault("DNNL_VERBOSE", "0") | |
| import torch | |
| import torch.backends.mkldnn | |
| # Lock threads after env is set | |
| torch.set_num_threads(1) | |
| torch.set_num_interop_threads(1) | |
| from flask import Flask, request, jsonify, Response, send_from_directory, stream_with_context | |
| from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, QuantoConfig | |
# Flask application object; started by the __main__ entry point of this module.
app = Flask(__name__)

# System message inserted into the "system" turn of every chat prompt built by
# build_prompt(). It is sent to the model verbatim, so any edit changes behavior.
SYSTEM_PROMPT = (
    "You are Cygnis-Alpha, a helpful AI assistant created by CygnisAI.\n\n"
    "## ABSOLUTE RULES\n"
    "1. LANGUAGE: Detect the user's language. You MUST reply 100% in the SAME language as the user's last message. Never switch to English unless the user asks you to.\n"
    "2. IDENTITY: Your name is Cygnis-Alpha, created by CygnisAI.\n"
    "3. HONESTY: Never invent facts. If you don't know, say it.\n"
    "4. FOCUS: Answer only what was asked. No yapping.\n\n"
    "## STYLE\n"
    "- Tone: Warm, friendly, professional.\n"
    "- Length: Be extremely concise (short answers). Only detail if explicitly requested."
)

# Model identifier passed to from_pretrained() and echoed in API responses.
MODEL_ID = "CygnisAI/Cygnis-Alpha-1.7B-v0.1-Instruct"
| FAVICON_SVG = """<svg width="64" height="64" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg"> | |
| <defs> | |
| <linearGradient id="cygnis-organic-grad" x1="12" y1="3" x2="12" y2="21" gradientUnits="userSpaceOnUse"> | |
| <stop stop-color="#4FACFE"/><stop offset="1" stop-color="#00F2FE"/> | |
| </linearGradient> | |
| <filter id="shadow-pop" x="-30%" y="-30%" width="160%" height="160%"> | |
| <feGaussianBlur in="SourceAlpha" stdDeviation="0.8"/> | |
| <feOffset dx="0" dy="1.2" result="offsetblur"/> | |
| <feComponentTransfer><feFuncA type="linear" slope="0.4"/></feComponentTransfer> | |
| <feMerge><feMergeNode/><feMergeNode in="SourceGraphic"/></feMerge> | |
| </filter> | |
| <filter id="inner-glow"> | |
| <feOffset dx="0" dy="0.5"/><feGaussianBlur stdDeviation="0.4" result="blur"/> | |
| <feComposite operator="out" in="SourceGraphic" in2="blur" result="inverse"/> | |
| <feFlood flood-color="black" flood-opacity="0.2" result="color"/> | |
| <feComposite operator="in" in="color" in2="inverse" result="shadow"/> | |
| <feComposite operator="over" in="shadow" in2="SourceGraphic"/> | |
| </filter> | |
| </defs> | |
| <g filter="url(#shadow-pop)"> | |
| <path d="M12 3C7.03 3 3 7.03 3 12C3 16.97 7.03 21 12 21C16.97 21 21 16.97 21 12C21 7.03 16.97 3 12 3ZM12 19.2C8.02 19.2 4.8 15.98 4.8 12C4.8 8.02 8.02 4.8 12 4.8C15.98 4.8 19.2 8.02 19.2 12C19.2 15.98 15.98 19.2 12 19.2Z" fill="url(#cygnis-organic-grad)" opacity="0.5"/> | |
| <g filter="url(#inner-glow)"> | |
| <path d="M19.2 12C19.2 10.5 17.5 9.5 14.5 9.5V14.5C17.5 14.5 19.2 13.5 19.2 12Z" fill="url(#cygnis-organic-grad)"/> | |
| <path d="M12 19.2C10.5 19.2 9.5 17.5 9.5 14.5L14.5 14.5C14.5 17.5 13.5 19.2 12 19.2Z" fill="url(#cygnis-organic-grad)"/> | |
| <path d="M12 4.8C13.5 4.8 14.5 6.5 14.5 9.5H9.5C9.5 6.5 10.5 4.8 12 4.8Z" fill="url(#cygnis-organic-grad)"/> | |
| </g> | |
| <circle cx="12" cy="12" r="3.2" stroke="url(#cygnis-organic-grad)" stroke-width="2.2" stroke-linecap="round"/> | |
| </g> | |
| </svg>""" | |
| HTML_PAGE = """<!DOCTYPE html> | |
| <html lang="fr"> | |
| <head> | |
| <meta charset="UTF-8"> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
| <title>CygnisAI | Console</title> | |
| <link rel="icon" type="image/svg+xml" href="/favicon.svg"> | |
| <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700;900&display=swap" rel="stylesheet"> | |
| <style> | |
| *, ::before, ::after { box-sizing: border-box; margin: 0; padding: 0; } | |
| body { | |
| font-family: 'Inter', sans-serif; | |
| background-color: #0c0e10; | |
| background-image: radial-gradient(circle at center, #1a1c1e 0%, #0c0e10 100%); | |
| color: #a5e7ff; | |
| min-height: 100vh; | |
| display: flex; flex-direction: column; | |
| align-items: center; justify-content: center; | |
| overflow: hidden; position: relative; | |
| } | |
| .bg-texture { position:absolute; inset:0; background-image:url('https://www.transparenttextures.com/patterns/carbon-fibre.png'); opacity:0.05; pointer-events:none; } | |
| .scanlines { position:fixed; inset:0; background:repeating-linear-gradient(0deg,transparent,transparent 2px,rgba(0,0,0,0.03) 2px,rgba(0,0,0,0.03) 4px); pointer-events:none; z-index:100; } | |
| .content { position:relative; z-index:10; display:flex; flex-direction:column; align-items:center; gap:2.5rem; padding:1.5rem; } | |
| .hero { display:flex; flex-direction:row; align-items:center; justify-content:center; gap:3rem; animation:fadeUp 0.9s cubic-bezier(0.22,1,0.36,1) both; } | |
| .logo-wrap { width:9rem; height:9rem; transition:transform 0.7s; } | |
| .logo-wrap:hover { transform:scale(1.05); } | |
| .logo-wrap svg { width:100%; height:100%; animation:pulse-glow 3s ease-in-out infinite; } | |
| h1 { font-size:clamp(4rem,10vw,8rem); font-weight:900; letter-spacing:-0.06em; line-height:0.85; text-align:left; text-shadow:0 0 40px rgba(165,231,255,0.4); } | |
| h1 span:last-child { opacity:0.8; } | |
| .badge { display:flex; align-items:center; gap:0.5rem; border:1px solid rgba(255,255,255,0.1); border-radius:9999px; padding:0.5rem 1.25rem; background:rgba(255,255,255,0.05); backdrop-filter:blur(8px); font-size:0.75rem; letter-spacing:0.1em; text-transform:uppercase; color:rgba(165,231,255,0.7); animation:fadeUp 0.9s 0.15s cubic-bezier(0.22,1,0.36,1) both; } | |
| .dot { width:0.5rem; height:0.5rem; border-radius:9999px; background:#34d399; animation:blink 1.8s ease-in-out infinite; } | |
| .card { border:1px solid rgba(255,255,255,0.1); border-radius:1rem; background:rgba(255,255,255,0.05); backdrop-filter:blur(8px); padding:1.5rem 2rem; font-size:0.875rem; max-width:30rem; width:100%; text-align:left; animation:fadeUp 0.9s 0.3s cubic-bezier(0.22,1,0.36,1) both; } | |
| .card-label { color:rgba(255,255,255,0.4); text-transform:uppercase; letter-spacing:0.1em; font-size:0.7rem; margin-bottom:1rem; } | |
| .endpoint-row { display:flex; align-items:center; gap:0.75rem; margin-bottom:0.75rem; } | |
| .method { color:#34d399; font-weight:700; } | |
| code { background:rgba(255,255,255,0.1); padding:0.2rem 0.6rem; border-radius:0.4rem; color:#a5e7ff; font-size:0.8rem; } | |
| .hint { color:rgba(255,255,255,0.5); font-size:0.75rem; line-height:1.6; } | |
| .divider { margin-top:0.75rem; padding-top:0.75rem; border-top:1px solid rgba(255,255,255,0.07); } | |
| .quant-badge { display:inline-flex; align-items:center; gap:0.4rem; background:rgba(79,172,254,0.1); border:1px solid rgba(79,172,254,0.25); border-radius:0.4rem; padding:0.2rem 0.6rem; font-size:0.7rem; color:#4FACFE; margin-top:0.5rem; } | |
| @keyframes pulse-glow { 0%,100%{filter:drop-shadow(0 0 20px rgba(34,211,238,0.4))} 50%{filter:drop-shadow(0 0 45px rgba(34,211,238,0.9))} } | |
| @keyframes fadeUp { from{opacity:0;transform:translateY(24px)} to{opacity:1;transform:translateY(0)} } | |
| @keyframes blink { 0%,100%{opacity:1} 50%{opacity:0.3} } | |
| </style> | |
| </head> | |
| <body> | |
| <div class="bg-texture"></div> | |
| <div class="scanlines"></div> | |
| <div class="content"> | |
| <div class="hero"> | |
| <div class="logo-wrap"> | |
| <svg width="64" height="64" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg"> | |
| <defs> | |
| <!-- Dégradé Signature Cygnis - Plus vibrant pour le relief --> | |
| <linearGradient id="cygnis-organic-grad" x1="12" y1="3" x2="12" y2="21" gradientUnits="userSpaceOnUse"> | |
| <stop stop-color="#4FACFE" /> | |
| <stop offset="1" stop-color="#00F2FE" /> | |
| </linearGradient> | |
| <!-- Ombre de flottaison pour l'effet 3D --> | |
| <filter id="shadow-pop" x="-30%" y="-30%" width="160%" height="160%"> | |
| <feGaussianBlur in="SourceAlpha" stdDeviation="0.8" /> | |
| <feOffset dx="0" dy="1.2" result="offsetblur" /> | |
| <feComponentTransfer> | |
| <feFuncA type="linear" slope="0.4" /> | |
| </feComponentTransfer> | |
| <feMerge> | |
| <feMergeNode /> | |
| <feMergeNode in="SourceGraphic" /> | |
| </feMerge> | |
| </filter> | |
| <!-- Profondeur interne pour l'effet bombé --> | |
| <filter id="inner-glow"> | |
| <feOffset dx="0" dy="0.5" /> | |
| <feGaussianBlur stdDeviation="0.4" result="blur" /> | |
| <feComposite operator="out" in="SourceGraphic" in2="blur" result="inverse" /> | |
| <feFlood flood-color="black" flood-opacity="0.2" result="color" /> | |
| <feComposite operator="in" in="color" in2="inverse" result="shadow" /> | |
| <feComposite operator="over" in="shadow" in2="SourceGraphic" /> | |
| </filter> | |
| </defs> | |
| <g filter="url(#shadow-pop)"> | |
| <!-- Anneau extérieur : Plus épais et doux --> | |
| <path d="M12 3C7.03 3 3 7.03 3 12C3 16.97 7.03 21 12 21C16.97 21 21 16.97 21 12C21 7.03 16.97 3 12 3ZM12 19.2C8.02 19.2 4.8 15.98 4.8 12C4.8 8.02 8.02 4.8 12 4.8C15.98 4.8 19.2 8.02 19.2 12C19.2 15.98 15.98 19.2 12 19.2Z" | |
| fill="url(#cygnis-organic-grad)" | |
| opacity="0.5" /> | |
| <!-- Segments : Ils touchent le bord mais avec des courbes fluides --> | |
| <g filter="url(#inner-glow)"> | |
| <!-- Segment Droite : Incurvé vers l'anneau --> | |
| <path d="M19.2 12C19.2 10.5 17.5 9.5 14.5 9.5V14.5C17.5 14.5 19.2 13.5 19.2 12Z" fill="url(#cygnis-organic-grad)" /> | |
| <!-- Segment Bas : Incurvé vers l'anneau --> | |
| <path d="M12 19.2C10.5 19.2 9.5 17.5 9.5 14.5L14.5 14.5C14.5 17.5 13.5 19.2 12 19.2Z" fill="url(#cygnis-organic-grad)" /> | |
| <!-- Segment Haut : Incurvé vers l'anneau --> | |
| <path d="M12 4.8C13.5 4.8 14.5 6.5 14.5 9.5H9.5C9.5 6.5 10.5 4.8 12 4.8Z" fill="url(#cygnis-organic-grad)" /> | |
| </g> | |
| <!-- Anneau central : Parfaitement rond et intégré --> | |
| <circle cx="12" cy="12" r="3.2" stroke="url(#cygnis-organic-grad)" stroke-width="2.2" stroke-linecap="round" /> | |
| </g> | |
| </svg> | |
| </div> | |
| <h1><span>CygnisAI</span><br><span>Console</span></h1> | |
| </div> | |
| <div class="badge"><span class="dot"></span>Cygnis-Alpha · Online</div> | |
| <div class="card"> | |
| <p class="card-label">API Endpoints</p> | |
| <div class="endpoint-row"><span class="method">POST</span><code>/generate</code><span class="hint"> — JSON complet</span></div> | |
| <div class="endpoint-row"><span class="method">POST</span><code>/generate/stream</code><span class="hint"> — SSE token par token</span></div> | |
| <p class="hint">Body: <code>{"prompt": "...", "max_new_tokens": 256, "fast": true}</code></p> | |
| <div class="divider hint">SSE: <code>data: token</code> · <code>data: [STATS]...</code> · <code>data: [DONE]</code></div> | |
| <div><span class="quant-badge">⚡ INT8 quanto</span></div> | |
| </div> | |
| </div> | |
| </body> | |
| </html>""" | |
# ─── Model globals ─────────────────────────────────────────────────────────────
# Shared state written by load_model() (which runs on a background thread) and
# read by the request handlers. Everything starts in the "not loaded" state.
tokenizer_g = None  # tokenizer instance, assigned by load_model()
model_g = None  # model instance, assigned by load_model()
model_ready = False  # set to True by load_model() on success
model_error = None  # error text when loading failed, otherwise None
quant_mode = "none" # "int8-quanto" or "float32" once load_model() has picked a mode
def load_model():
    """Load tokenizer and model into the module globals.

    Intended to run on a background thread. Steps, in order:

    1. Load the tokenizer; reuse the EOS token as pad token when none is set.
    2. Try to load the model with quanto INT8 weights; on any failure fall
       back to a plain float32 load. ``quant_mode`` records which one won.
    3. Wrap the model with ``torch.compile`` (best effort).
    4. Publish ``tokenizer_g`` / ``model_g``, run ``_warmup()``, and only then
       flip ``model_ready`` so that no request is served before the warmup
       pass has finished.

    Any exception is recorded in ``model_error`` instead of being raised.
    """
    global tokenizer_g, model_g, model_ready, model_error, quant_mode
    try:
        print(f"[CygnisAI] Loading {MODEL_ID} ...")
        tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
        if tok.pad_token is None:
            tok.pad_token = tok.eos_token

        # ── INT8 quantization via quanto ──────────────────────────────────────
        # int8 weights take a quarter of the space of float32 weights.
        try:
            qconfig = QuantoConfig(weights="int8")
            mdl = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                quantization_config=qconfig,
                low_cpu_mem_usage=True,
            )
            quant_mode = "int8-quanto"
            print("[CygnisAI] ✅ INT8 quantization loaded.")
        except Exception as qe:
            print(f"[CygnisAI] INT8 failed ({qe}), falling back to float32 …")
            mdl = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                torch_dtype=torch.float32,
                low_cpu_mem_usage=True,
            )
            quant_mode = "float32"
        mdl.eval()

        # ── torch.compile ─────────────────────────────────────────────────────
        # NOTE(review): torch.compile only wraps the module here; compilation
        # itself is deferred to the first call, so this try/except guards the
        # wrapping step only — confirm compile failures are handled elsewhere.
        try:
            mdl = torch.compile(mdl, mode="reduce-overhead", fullgraph=False)
            print("[CygnisAI] torch.compile OK.")
        except Exception as ce:
            print(f"[CygnisAI] torch.compile skipped: {ce}")

        # _warmup() reads the globals, so publish them first ...
        tokenizer_g = tok
        model_g = mdl

        # ── Warmup: trigger compile before first real request ─────────────────
        _warmup()

        # ... but only advertise readiness once the warmup pass is over, so a
        # request cannot compete with it.
        model_ready = True
        print(f"[CygnisAI] Model ready. Mode: {quant_mode}")
    except Exception as e:
        # Some exceptions stringify to ""; an empty model_error would be falsy
        # and the service would report "loading" forever.
        model_error = str(e) or repr(e)
        print(f"[CygnisAI] Load error: {model_error}")
def _warmup():
    """Run one tiny greedy generation so first-call costs are paid up front.

    Reads ``tokenizer_g`` and ``model_g``. Failures are printed and swallowed
    on purpose: a failed warmup must not abort model loading.
    """
    try:
        print("[CygnisAI] Warming up ...")
        encoded = tokenizer_g("Hi", return_tensors="pt")
        gen_options = {
            "max_new_tokens": 4,
            "do_sample": False,
            "use_cache": True,
            "pad_token_id": tokenizer_g.eos_token_id,
        }
        with torch.inference_mode():
            model_g.generate(**encoded, **gen_options)
        print("[CygnisAI] Warmup done — ready to serve.")
    except Exception as exc:
        print(f"[CygnisAI] Warmup error (non-fatal): {exc}")
# Kick off model loading at import time on a daemon thread; until it finishes,
# guard() answers requests with 503 ("Model is loading").
threading.Thread(target=load_model, daemon=True).start()
| # ─── Helpers ────────────────────────────────────────────────────────────────── | |
def build_prompt(user_prompt: str) -> str:
    """Wrap *user_prompt* in the chat template expected by the model.

    The result has a system turn (``SYSTEM_PROMPT``), a user turn, and an open
    assistant turn for the model to complete.

    The user text is untrusted, so the turn delimiters ``<|im_start|>`` and
    ``<|im_end|>`` are removed from it first; otherwise a caller could close
    the user turn early and forge system or assistant turns.
    """
    markers = ("<|im_start|>", "<|im_end|>")
    # Repeat until stable: removing one marker can splice a new one together
    # (e.g. "<|im_<|im_end|>end|>"). Each pass shortens the text, so this ends.
    while any(marker in user_prompt for marker in markers):
        for marker in markers:
            user_prompt = user_prompt.replace(marker, "")
    return (
        f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
        f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )
def guard():
    """Tell a handler whether the model can serve requests.

    Returns ``(True, None)`` when the model is ready, otherwise
    ``(False, (response, status))``: 500 when loading failed, 503 while it is
    still loading.
    """
    if not model_error and model_ready:
        return True, None
    if model_error:
        failure = jsonify({"error": f"Model failed to load: {model_error}"})
        return False, (failure, 500)
    pending = jsonify({"error": "Model is loading, retry in a moment."})
    return False, (pending, 503)
def parse_body():
    """Parse and validate the JSON body of a generation request.

    Returns a 5-tuple ``(prompt, max_tok, temperature, fast, err)``. On
    success ``err`` is ``None``; on invalid input the first four items are
    ``None`` and ``err`` is a ``(response, 400)`` pair.

    Fields read from the body:
      - ``prompt``: required, non-empty after stripping.
      - ``max_new_tokens``: integer, default 256, clamped to 1..512.
      - ``temperature``: finite number, default 0.7.
      - ``fast``: default True, meaning greedy decoding. The strings "", "0",
        "false", "no" and "off" count as False. A temperature <= 0 forces
        ``fast`` to True, because sampling needs a positive temperature.
    """
    import math  # local import: only needed for the finiteness check below

    def _reject(message):
        return None, None, None, None, (jsonify({"error": message}), 400)

    data = request.get_json(silent=True)
    if not data:
        return _reject("Request body must be valid JSON.")
    if not isinstance(data, dict):
        # Valid JSON that is not an object (list, string, number) has no .get().
        return _reject("Request body must be a JSON object.")

    raw_prompt = data.get("prompt")
    # JSON null must not turn into the literal prompt "None".
    prompt = "" if raw_prompt is None else str(raw_prompt).strip()
    if not prompt:
        return _reject("Field 'prompt' is required.")

    try:
        max_tok = int(data.get("max_new_tokens", 256))
        temperature = float(data.get("temperature", 0.7))
    except (TypeError, ValueError, OverflowError):
        return _reject("Fields 'max_new_tokens' and 'temperature' must be numbers.")
    if not math.isfinite(temperature):
        return _reject("Field 'temperature' must be a finite number.")
    max_tok = max(1, min(max_tok, 512))

    raw_fast = data.get("fast", True)  # True = greedy
    if isinstance(raw_fast, str):
        fast = raw_fast.strip().lower() not in ("", "0", "false", "no", "off")
    else:
        fast = bool(raw_fast)
    if temperature <= 0:
        fast = True

    return prompt, max_tok, temperature, fast, None
def make_gen_kwargs(inputs, max_tok, temperature, fast, streamer=None):
    """Assemble the keyword arguments for ``model_g.generate``.

    ``inputs`` is unpacked into the result together with the token budget and
    the EOS/pad ids taken from ``tokenizer_g``. ``fast`` selects greedy
    decoding; otherwise sampling is enabled with the given ``temperature``,
    ``top_p=0.9`` and ``repetition_penalty=1.15``. A truthy ``streamer`` is
    passed through under the ``"streamer"`` key.
    """
    eos_id = tokenizer_g.eos_token_id
    if fast:
        decoding = {"do_sample": False}  # greedy: fastest
    else:
        decoding = {
            "do_sample": True,
            "temperature": temperature,
            "top_p": 0.9,
            "repetition_penalty": 1.15,
        }
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=max_tok,
        use_cache=True,
        pad_token_id=eos_id,
        eos_token_id=eos_id,
    )
    gen_kwargs.update(decoding)
    if streamer:
        gen_kwargs["streamer"] = streamer
    return gen_kwargs
| # ─── Routes ─────────────────────────────────────────────────────────────────── | |
def favicon_svg():
    """Return the embedded SVG icon with the ``image/svg+xml`` MIME type.

    NOTE(review): no route decorator is visible in this view; the HTML page
    links to ``/favicon.svg`` — confirm this handler is registered there.
    """
    svg_response = Response(FAVICON_SVG, mimetype="image/svg+xml")
    return svg_response
def favicon_fallback():
    """Serve an icon file stored next to this module, else the embedded SVG.

    ``favicon.png`` is preferred over ``favicon.ico``; when neither file
    exists the embedded ``FAVICON_SVG`` is returned.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    candidates = ("favicon.png", "favicon.ico")
    found = next(
        (name for name in candidates if os.path.exists(os.path.join(base_dir, name))),
        None,
    )
    if found is not None:
        return send_from_directory(base_dir, found)
    return Response(FAVICON_SVG, mimetype="image/svg+xml")
def home():
    """Return the static console landing page as ``text/html``."""
    page = Response(HTML_PAGE, mimetype="text/html")
    return page
def health():
    """Report model state as JSON.

    Returns status "ok" with the model id and quantization mode when ready,
    ``("error", 500)`` when loading failed, and ``("loading", 503)`` otherwise.
    """
    if not model_error and model_ready:
        return jsonify({"status": "ok", "model": MODEL_ID, "quant": quant_mode})
    if model_error:
        return jsonify({"status": "error", "detail": model_error}), 500
    return jsonify({"status": "loading"}), 503
| # ── /generate ───────────────────────────────────────────────────────────────── | |
def generate():
    """Handle a blocking generation request and return the full answer as JSON.

    Flow: ``guard()`` -> ``parse_body()`` -> tokenize the chat prompt -> run
    ``model_g.generate`` on a worker thread -> wait up to 120 s.

    Responses:
      - 200: ``response``, ``model``, ``quant``, ``elapsed_sec``, ``tokens``, ``tps``
      - 504: the worker did not finish within 120 s
      - 500: generation raised; the message is returned under ``error``
      - whatever ``guard()`` / ``parse_body()`` return on their failure paths

    NOTE(review): no route decorator is visible in this view; the HTML page
    documents ``POST /generate`` — confirm this handler is registered there.
    """
    ok, err = guard()
    if not ok:
        return err
    prompt, max_tok, temperature, fast, err = parse_body()
    if err:
        return err

    timeout_s = 120
    inputs = tokenizer_g(build_prompt(prompt), return_tensors="pt")
    kwargs = make_gen_kwargs(inputs, max_tok, temperature, fast)
    # A thread cannot be cancelled. Without a bound, a request that already
    # timed out would keep generating in the background and keep the CPU busy.
    # max_time makes generate() stop on its own; the small margin keeps the
    # 504 path deterministic (the join below always expires first).
    kwargs.setdefault("max_time", timeout_s + 5)
    n_prompt = inputs["input_ids"].shape[-1]
    result = {}

    def _infer():
        try:
            with torch.inference_mode():
                out = model_g.generate(**kwargs)
            # Keep only the tokens generated after the prompt.
            new_ids = out[0][n_prompt:]
            text = tokenizer_g.decode(new_ids, skip_special_tokens=True)
            result["text"] = text.split("<|im_end|>")[0].strip() or "Je suis Cygnis-Alpha."
            result["n_tokens"] = len(new_ids)
        except Exception as e:
            # Never report an empty error string.
            result["error"] = str(e) or repr(e)

    t0 = time.time()
    # Daemon, like the streaming worker: an abandoned generation must not keep
    # the interpreter alive at shutdown.
    worker = threading.Thread(target=_infer, daemon=True)
    worker.start()
    worker.join(timeout=timeout_s)
    elapsed = round(time.time() - t0, 2)

    if worker.is_alive():
        return jsonify({"error": "Timeout >120s. Reduce max_new_tokens."}), 504
    if "error" in result:
        return jsonify({"error": result["error"]}), 500

    n = result.get("n_tokens", 0)
    tps = round(n / elapsed, 2) if elapsed > 0 else 0
    return jsonify({
        "response": result["text"],
        "model": MODEL_ID, "quant": quant_mode,
        "elapsed_sec": elapsed, "tokens": n, "tps": tps,
    })
| # ── /generate/stream (SSE) ─────────────────────────────────────────────────── | |
def _sse_event(payload):
    """Frame *payload* as one Server-Sent Event.

    Each line of the payload gets its own ``data:`` field, so embedded
    newlines reach the client instead of ending the event early. A payload
    without newlines produces exactly ``data: <payload>`` plus a blank line.
    """
    lines = payload.replace("\r\n", "\n").replace("\r", "\n").split("\n")
    return "".join(f"data: {line}\n" for line in lines) + "\n"


def generate_stream():
    """Handle a streaming generation request using Server-Sent Events.

    Generation runs on a daemon thread that feeds a ``TextIteratorStreamer``;
    the response generator forwards every non-empty chunk as one SSE event,
    then emits ``[STATS]`` and ``[DONE]``. Errors are sent in-band as
    ``[ERROR] ...`` events. The ``tokens`` figure in ``[STATS]`` counts the
    non-empty chunks received from the streamer.

    NOTE(review): no route decorator is visible in this view; the HTML page
    documents ``POST /generate/stream`` — confirm this handler is registered
    there.
    """
    ok, err = guard()
    if not ok:
        return err
    prompt, max_tok, temperature, fast, err = parse_body()
    if err:
        return err

    inputs = tokenizer_g(build_prompt(prompt), return_tensors="pt")
    streamer = TextIteratorStreamer(tokenizer_g, skip_prompt=True, skip_special_tokens=True, timeout=15.0)
    kwargs = make_gen_kwargs(inputs, max_tok, temperature, fast, streamer=streamer)
    gen_err = {}

    def _run():
        try:
            with torch.inference_mode():
                model_g.generate(**kwargs)
        except Exception as e:
            gen_err["msg"] = str(e) or repr(e)

    def event_stream():
        t0, n = time.time(), 0
        worker = threading.Thread(target=_run, daemon=True)
        worker.start()
        try:
            for token in streamer:
                clean = token.replace("<|im_end|>", "")
                if clean:
                    n += 1
                    yield _sse_event(clean)
        except Exception as e:
            # A streamer timeout raises an exception whose text is empty; fall
            # back to the exception type name so the client sees a reason.
            yield _sse_event(f"[ERROR] {str(e) or type(e).__name__}")
        finally:
            # No yield in here: when the client disconnects, GeneratorExit is
            # raised at the active yield, and yielding again during cleanup
            # raises RuntimeError.
            worker.join(timeout=5)

        # Reached only when the stream ended normally or with a reported error.
        elapsed = round(time.time() - t0, 2)
        tps = round(n / elapsed, 2) if elapsed > 0 else 0
        if gen_err:
            yield _sse_event(f"[ERROR] {gen_err['msg']}")
        yield f"data: [STATS] tokens={n} elapsed={elapsed}s tps={tps} quant={quant_mode}\n\n"
        yield "data: [DONE]\n\n"

    return Response(
        stream_with_context(event_stream()),
        mimetype="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no", "Connection": "keep-alive"},
    )
| # ─── Entry ──────────────────────────────────────────────────────────────────── | |
if __name__ == "__main__":
    # Listen on all interfaces; the port comes from $PORT and defaults to 7860.
    app.run(
        host="0.0.0.0",
        port=int(os.environ.get("PORT", 7860)),
        debug=False,
        threaded=True,
    )