arthu1 commited on
Commit
27f81ac
·
1 Parent(s): 264ac43

Revert to Docker/FastAPI + add dynamic INT8 quantization for faster CPU inference

Browse files
Files changed (4) hide show
  1. Dockerfile +15 -0
  2. README.md +15 -18
  3. app.py +259 -167
  4. requirements.txt +3 -0
Dockerfile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Minimal CPU inference image for the North Air 1 FastAPI service.
FROM python:3.11-slim

WORKDIR /app

# Unbuffered stdout so container logs appear immediately; 7860 is the
# HuggingFace Spaces default app port.
ENV PYTHONUNBUFFERED=1 \
    PORT=7860

# Install dependencies before copying sources so code edits don't
# invalidate the pip cache layer.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r /app/requirements.txt

COPY . /app

EXPOSE 7860

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -3,33 +3,30 @@ title: North Air API
3
  emoji: 🌬️
4
  colorFrom: blue
5
  colorTo: green
6
- sdk: gradio
7
- sdk_version: 5.12.0
8
- app_file: app.py
9
- hardware: zero-a10g
10
  ---
11
 
12
- # North Air 1 — ZeroGPU API
13
 
14
- GPU-accelerated inference via HuggingFace ZeroGPU (free A100 time-slices).
15
 
16
- ## API Endpoints
17
-
18
- - `POST /api/chat` — non-streaming chat
19
- - `POST /api/chat_stream` — streaming chat (newline-delimited JSON events)
20
-
21
- ### Request format
22
 
 
23
  ```json
24
  {
25
- "data": ["{\"messages\":[{\"role\":\"user\",\"content\":\"Hello\"}]}"]
 
 
 
26
  }
27
  ```
28
 
29
- ### Response format
30
-
31
  ```json
32
- {
33
- "data": ["{\"output\":\"Hey! I'm North Air 1...\",\"model\":\"north-air-1\",\"tokens_generated\":42,\"latency_ms\":1200}"]
34
- }
35
  ```
 
3
  emoji: 🌬️
4
  colorFrom: blue
5
  colorTo: green
6
+ sdk: docker
7
+ app_port: 7860
 
 
8
  ---
9
 
10
+ # North Air 1 API
11
 
12
+ Optimized CPU inference with dynamic INT8 quantization.
13
 
14
+ Endpoints:
15
+ - `GET /health`
16
+ - `POST /chat`
17
+ - `POST /chat/stream`
 
 
18
 
19
+ Request shape (`/chat`):
20
  ```json
21
  {
22
+ "model": "north-air-1",
23
+ "messages": [
24
+ {"role": "user", "content": "Hello"}
25
+ ]
26
  }
27
  ```
28
 
29
+ Response shape:
 
30
  ```json
31
+ {"output": "...", "inference": "pytorch-int8"}
 
 
32
  ```
app.py CHANGED
@@ -1,14 +1,16 @@
1
  import os
2
  import re
3
- import json
4
  import time
 
 
5
  from threading import Thread
6
 
7
  import torch
8
- import gradio as gr
9
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 
 
10
 
11
- # ─── Config ───
12
  MODEL_DIR = os.getenv("MODEL_DIR", "./final_model")
13
  MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "512"))
14
  TEMPERATURE = float(os.getenv("TEMPERATURE", "0.6"))
@@ -18,39 +20,144 @@ SYSTEM_PROMPT = """You are North Air 1, built by North Air. 0.6B params, a custo
18
  Be direct, helpful, concise. Use markdown. Write clean code. Never fabricate facts.
19
  If asked who you are: "I'm North Air 1, built by North Air." You are NOT ChatGPT/GPT-4/Claude/etc."""
20
 
21
- # ─── Load model on CPU first, ZeroGPU moves it to GPU per-call ───
22
- def _load_model():
23
- adapter_cfg = os.path.join(MODEL_DIR, "adapter_config.json")
24
- if os.path.exists(adapter_cfg):
25
- from peft import AutoPeftModelForCausalLM
26
- model = AutoPeftModelForCausalLM.from_pretrained(
27
- MODEL_DIR, torch_dtype=torch.float16, device_map={"": "cpu"},
28
- )
29
- tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
30
- else:
31
- model = AutoModelForCausalLM.from_pretrained(
32
- MODEL_DIR, torch_dtype=torch.float16, device_map={"": "cpu"},
33
- trust_remote_code=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  )
35
- tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True, trust_remote_code=True)
 
 
 
 
 
36
 
37
- if tokenizer.pad_token is None:
38
- tokenizer.pad_token = tokenizer.eos_token
39
- model.eval()
40
- return model, tokenizer
41
 
42
- try:
43
- MODEL, TOKENIZER = _load_model()
44
- LOAD_ERROR = None
45
- except Exception as exc:
46
- MODEL, TOKENIZER = None, None
47
- LOAD_ERROR = str(exc)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
 
50
- def _build_prompt(messages, system, enable_thinking=False):
51
  has_system = any(m["role"] == "system" for m in messages)
52
  if not has_system:
53
  messages = [{"role": "system", "content": system}] + messages
 
54
  if hasattr(TOKENIZER, "apply_chat_template"):
55
  return TOKENIZER.apply_chat_template(
56
  messages, tokenize=False, add_generation_prompt=True,
@@ -59,178 +166,163 @@ def _build_prompt(messages, system, enable_thinking=False):
59
  return "\n".join(f"{m['role']}: {m['content']}" for m in messages) + "\nassistant:"
60
 
61
 
62
- # ─── GPU inference via ZeroGPU ───
63
- @gr.api(api_name="chat")
64
- @gr.GPU
65
- def chat_api(request_json: str) -> str:
66
- """Non-streaming chat. Called via Gradio API."""
67
- if MODEL is None:
68
- return json.dumps({"error": f"Model failed to load: {LOAD_ERROR}"})
69
 
70
- body = json.loads(request_json)
71
- messages = body.get("messages", [])
72
- if not messages:
73
- return json.dumps({"error": "messages are required"})
74
 
75
- system = body.get("system_prompt", SYSTEM_PROMPT)
76
- max_tokens = body.get("max_new_tokens", MAX_NEW_TOKENS)
77
- temperature = body.get("temperature", TEMPERATURE)
78
- top_p = body.get("top_p", TOP_P)
79
- enable_thinking = body.get("enable_thinking", False)
 
 
 
 
 
 
 
 
 
 
80
 
81
- msg_dicts = [{"role": m["role"], "content": m["content"]} for m in messages]
82
- prompt = _build_prompt(msg_dicts, system, enable_thinking)
 
 
 
 
 
 
 
 
 
 
 
83
  batch = TOKENIZER(prompt, return_tensors="pt", add_special_tokens=False)
84
 
85
- # Move to GPU (ZeroGPU provides it)
86
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
87
- MODEL.to(device)
88
- input_ids = batch["input_ids"].to(device)
89
- attention_mask = batch["attention_mask"].to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
  t0 = time.time()
 
92
  with torch.no_grad():
93
  out = MODEL.generate(
94
- input_ids=input_ids,
95
- attention_mask=attention_mask,
96
- max_new_tokens=max_tokens,
97
- temperature=max(temperature, 0.01),
98
- top_p=top_p,
99
- top_k=40,
100
- do_sample=True,
101
- repetition_penalty=1.2,
102
- pad_token_id=TOKENIZER.pad_token_id,
103
- eos_token_id=TOKENIZER.eos_token_id,
104
  )
105
- elapsed = time.time() - t0
106
 
 
107
  generated_ids = out[0][input_ids.shape[1]:]
108
  completion = TOKENIZER.decode(generated_ids, skip_special_tokens=True).strip()
 
109
 
110
- # Parse thinking tags
111
- think_match = re.search(r"<think>(.*?)</think>", completion, re.DOTALL)
112
- thinking = ""
113
- answer = completion
114
- if think_match:
115
- thinking = think_match.group(1).strip()
116
- answer = re.sub(r"<think>.*?</think>", "", completion, flags=re.DOTALL).strip()
117
-
118
- return json.dumps({
119
  "output": answer,
120
  "thinking": thinking if thinking else None,
121
  "model": "north-air-1",
 
122
  "tokens_generated": len(generated_ids),
123
  "latency_ms": round(elapsed * 1000),
124
- })
125
-
126
-
127
- @gr.api(api_name="chat_stream")
128
- @gr.GPU
129
- def chat_stream_api(request_json: str) -> str:
130
- """Streaming chat. Returns all tokens as newline-delimited JSON events."""
131
- if MODEL is None:
132
- return json.dumps({"error": f"Model failed to load: {LOAD_ERROR}"})
133
 
134
- body = json.loads(request_json)
135
- messages = body.get("messages", [])
136
- if not messages:
137
- return json.dumps({"error": "messages are required"})
138
 
139
- system = body.get("system_prompt", SYSTEM_PROMPT)
140
- max_tokens = body.get("max_new_tokens", MAX_NEW_TOKENS)
141
- temperature = body.get("temperature", TEMPERATURE)
142
- top_p = body.get("top_p", TOP_P)
143
- enable_thinking = body.get("enable_thinking", False)
144
 
145
- msg_dicts = [{"role": m["role"], "content": m["content"]} for m in messages]
146
- prompt = _build_prompt(msg_dicts, system, enable_thinking)
147
- batch = TOKENIZER(prompt, return_tensors="pt", add_special_tokens=False)
148
 
149
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
150
- MODEL.to(device)
151
- input_ids = batch["input_ids"].to(device)
152
- attention_mask = batch["attention_mask"].to(device)
153
 
154
  streamer = TextIteratorStreamer(TOKENIZER, skip_prompt=True, skip_special_tokens=True)
155
 
156
- gen_kwargs = {
157
- "input_ids": input_ids,
158
- "attention_mask": attention_mask,
159
- "max_new_tokens": max_tokens,
160
- "temperature": max(temperature, 0.01),
161
- "top_p": top_p,
162
- "top_k": 40,
163
- "do_sample": True,
164
- "repetition_penalty": 1.2,
165
- "pad_token_id": TOKENIZER.pad_token_id,
166
- "eos_token_id": TOKENIZER.eos_token_id,
167
- "streamer": streamer,
168
- }
169
 
170
  t0 = time.time()
171
- thread = Thread(target=lambda: MODEL.generate(**gen_kwargs))
172
  thread.start()
173
 
174
- # Collect all tokens (ZeroGPU doesn't support true SSE, so we batch)
175
- events = []
176
- token_count = 0
177
- in_thinking = False
178
- buf = ""
179
-
180
- for token_text in streamer:
181
- buf += token_text
182
- token_count += 1
183
-
184
- if "<think>" in buf and not in_thinking:
185
- in_thinking = True
186
- events.append(json.dumps({"type": "thinking_start"}))
187
- after = buf.split("<think>", 1)[1]
188
- buf = after if after else ""
189
-
190
- if "</think>" in buf and in_thinking:
191
- before = buf.split("</think>", 1)[0]
192
- if before:
193
- events.append(json.dumps({"type": "thinking", "text": before}))
194
- in_thinking = False
195
- events.append(json.dumps({"type": "thinking_end"}))
196
- after = buf.split("</think>", 1)[1].lstrip()
197
- buf = ""
198
- if after:
199
- events.append(json.dumps({"type": "text", "text": after}))
200
- continue
 
 
 
 
 
 
 
 
 
 
 
201
 
202
  if buf:
203
  evt_type = "thinking" if in_thinking else "text"
204
- events.append(json.dumps({"type": evt_type, "text": buf}))
205
- buf = ""
 
206
 
207
- if buf:
208
- evt_type = "thinking" if in_thinking else "text"
209
- events.append(json.dumps({"type": evt_type, "text": buf}))
210
-
211
- thread.join()
212
- elapsed = time.time() - t0
213
- events.append(json.dumps({
214
- "type": "done",
215
- "tokens_generated": token_count,
216
- "latency_ms": round(elapsed * 1000),
217
- }))
218
-
219
- return "\n".join(events)
220
 
 
221
 
222
- # ─── Gradio UI (required for ZeroGPU Spaces) ───
223
- def gradio_chat(message, history):
224
- """Simple chat interface for the Gradio UI."""
225
- result_json = chat_api(json.dumps({
226
- "messages": [{"role": "user", "content": message}],
227
- }))
228
- result = json.loads(result_json)
229
- return result.get("output", result.get("error", "Error"))
230
 
231
-
232
- with gr.Blocks(title="North Air 1 API", theme=gr.themes.Base()) as demo:
233
- gr.Markdown("# North Air 1 API\n0.6B parameter AI by North Air. Use the chat below or call the API endpoints.")
234
- gr.ChatInterface(gradio_chat, type="messages")
235
-
236
- demo.launch()
 
1
  import os
2
  import re
 
3
  import time
4
+ import json
5
+ from typing import List, Optional
6
  from threading import Thread
7
 
8
  import torch
9
+ from fastapi import FastAPI, HTTPException
10
+ from fastapi.responses import StreamingResponse
11
+ from pydantic import BaseModel
12
+ from transformers import AutoTokenizer, TextIteratorStreamer
13
 
 
14
  MODEL_DIR = os.getenv("MODEL_DIR", "./final_model")
15
  MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "512"))
16
  TEMPERATURE = float(os.getenv("TEMPERATURE", "0.6"))
 
20
  Be direct, helpful, concise. Use markdown. Write clean code. Never fabricate facts.
21
  If asked who you are: "I'm North Air 1, built by North Air." You are NOT ChatGPT/GPT-4/Claude/etc."""
22
 
23
+
24
class Message(BaseModel):
    """A single chat message."""

    role: str      # e.g. "system", "user", "assistant" (checked against "system" when building the prompt)
    content: str   # message text


class ChatRequest(BaseModel):
    """Request body for POST /chat and POST /chat/stream."""

    messages: List[Message]
    model: Optional[str] = "north-air-1"      # accepted but not used for routing
    max_new_tokens: Optional[int] = None      # None -> MAX_NEW_TOKENS env default
    temperature: Optional[float] = None       # None -> TEMPERATURE env default
    top_p: Optional[float] = None             # None -> TOP_P env default
    system_prompt: Optional[str] = None       # None -> built-in SYSTEM_PROMPT
    stream: Optional[bool] = False            # if true, /chat delegates to /chat/stream
    enable_thinking: Optional[bool] = False   # forwarded to the chat template


app = FastAPI(title="North Air 1 API", version="4.0.0")

# ─── Model Loading: try ONNX first (fast), fallback to PyTorch ───
ONNX_SESSION = None    # onnxruntime.InferenceSession when an exported model is found
MODEL = None           # PyTorch model when ONNX is unavailable
TOKENIZER = None       # HF tokenizer; required by every endpoint
LOAD_ERROR = None      # human-readable load failure, surfaced via /health
INFERENCE_MODE = "pytorch"  # becomes "onnx" or "pytorch-int8" during load
48
+
49
+
50
def _try_load_onnx():
    """Try to open an ONNX Runtime session for faster CPU inference.

    Looks for MODEL_DIR/model_quantized.onnx first, then MODEL_DIR/model.onnx.
    On success sets the ONNX_SESSION and INFERENCE_MODE globals and returns
    True; returns False when no ONNX file exists or onnxruntime fails.

    NOTE(review): nothing else in this file generates through ONNX_SESSION —
    if this succeeds, MODEL stays None and the generate() calls in the chat
    endpoints have no ONNX path. Confirm this is intended before shipping
    an ONNX export.
    """
    global ONNX_SESSION, INFERENCE_MODE
    onnx_path = os.path.join(MODEL_DIR, "model_quantized.onnx")
    if not os.path.exists(onnx_path):
        onnx_path = os.path.join(MODEL_DIR, "model.onnx")
    if not os.path.exists(onnx_path):
        return False

    try:
        import onnxruntime as ort
        sess_options = ort.SessionOptions()
        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        # Small fixed thread counts — presumably tuned for a small CPU Space; verify.
        sess_options.intra_op_num_threads = 4
        sess_options.inter_op_num_threads = 2
        sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL

        ONNX_SESSION = ort.InferenceSession(
            onnx_path, sess_options,
            providers=["CPUExecutionProvider"],
        )
        INFERENCE_MODE = "onnx"
        print(f"ONNX Runtime loaded: {onnx_path}")
        return True
    except Exception as e:
        # Missing onnxruntime package or a bad export — fall back to PyTorch.
        print(f"ONNX load failed: {e}")
        return False
77
 
 
 
 
 
78
 
79
def _load_model():
    """Load tokenizer and model: ONNX Runtime if an export exists, else PyTorch.

    Populates the MODEL / TOKENIZER / LOAD_ERROR / INFERENCE_MODE globals.
    Never raises: failures are recorded in LOAD_ERROR and reported by /health.
    """
    global MODEL, TOKENIZER, LOAD_ERROR, INFERENCE_MODE

    try:
        TOKENIZER = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True, trust_remote_code=True)
        if TOKENIZER.pad_token is None:
            # generate() needs a pad token; reuse EOS, the usual convention.
            TOKENIZER.pad_token = TOKENIZER.eos_token
    except Exception as e:
        LOAD_ERROR = f"Tokenizer load failed: {e}"
        return

    # Try ONNX first
    if _try_load_onnx():
        print(f"Using ONNX Runtime ({INFERENCE_MODE})")
        return

    # Fallback: PyTorch on CPU in fp32 (fp16 is slow/unsupported on most CPUs)
    try:
        from transformers import AutoModelForCausalLM
        adapter_cfg = os.path.join(MODEL_DIR, "adapter_config.json")

        if os.path.exists(adapter_cfg):
            # LoRA adapter checkpoint: peft loads base model + adapter together.
            from peft import AutoPeftModelForCausalLM
            MODEL = AutoPeftModelForCausalLM.from_pretrained(
                MODEL_DIR, torch_dtype=torch.float32, device_map={"": "cpu"},
            )
        else:
            MODEL = AutoModelForCausalLM.from_pretrained(
                MODEL_DIR, torch_dtype=torch.float32, device_map={"": "cpu"},
                trust_remote_code=True,
            )

        MODEL.eval()

        # Apply PyTorch dynamic quantization (INT8) for ~1.5-2x speedup on CPU
        quantized = False
        try:
            MODEL = torch.quantization.quantize_dynamic(
                MODEL, {torch.nn.Linear}, dtype=torch.qint8,
            )
            quantized = True
            INFERENCE_MODE = "pytorch-int8"
            print("PyTorch dynamic INT8 quantization applied")
        except Exception as e:
            INFERENCE_MODE = "pytorch"
            print(f"Quantization skipped: {e}")

        # torch.compile is lazy: wrapping always "succeeds" here and any real
        # failure only surfaces on the first forward pass, after the except
        # below can no longer catch it. Dynamically-quantized modules are not
        # supported by the compiler, so only compile the un-quantized model.
        if not quantized:
            try:
                MODEL = torch.compile(MODEL, mode="reduce-overhead")
                print("torch.compile applied")
            except Exception:
                pass

        print(f"Model loaded: {INFERENCE_MODE}")

    except Exception as e:
        LOAD_ERROR = str(e)


# Load at import time so the first request doesn't pay the startup cost.
_load_model()
139
+
140
+
141
@app.get("/health")
def health():
    """Report service liveness plus model-load diagnostics.

    `ok` is true when either inference backend (PyTorch or ONNX) loaded;
    `error` carries the load failure message, if any.
    """
    backend_ready = (MODEL is not None) or (ONNX_SESSION is not None)
    status = {
        "ok": backend_ready,
        "model": "north-air-1",
        "version": "4.0.0",
        "architecture": "Qwen3-0.6B + LoRA r=64",
        "inference": INFERENCE_MODE,
        "features": ["streaming", "thinking", "quantized"],
        "model_dir": MODEL_DIR,
        "error": LOAD_ERROR,
    }
    return status
154
 
155
 
156
+ def _build_prompt(messages: list, system: str, enable_thinking: bool) -> str:
157
  has_system = any(m["role"] == "system" for m in messages)
158
  if not has_system:
159
  messages = [{"role": "system", "content": system}] + messages
160
+
161
  if hasattr(TOKENIZER, "apply_chat_template"):
162
  return TOKENIZER.apply_chat_template(
163
  messages, tokenize=False, add_generation_prompt=True,
 
166
  return "\n".join(f"{m['role']}: {m['content']}" for m in messages) + "\nassistant:"
167
 
168
 
169
+ def _parse_thinking(text: str) -> tuple:
170
+ think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
171
+ if think_match:
172
+ thinking = think_match.group(1).strip()
173
+ answer = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
174
+ return thinking, answer
175
+ return "", text
176
 
 
 
 
 
177
 
178
def _generation_kwargs(input_ids, attention_mask, max_new_tokens, temperature, top_p, **extra):
    """Assemble the keyword arguments for MODEL.generate().

    Temperature is floored at 0.01 so sampling stays valid. The remaining
    sampling knobs (top_k=40, repetition_penalty=1.2) are fixed. Entries in
    **extra (e.g. streamer=...) are merged last and may override any key.
    """
    kwargs = dict(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        temperature=max(temperature, 0.01),
        top_p=top_p,
        top_k=40,
        do_sample=True,
        repetition_penalty=1.2,
        pad_token_id=TOKENIZER.pad_token_id,
        eos_token_id=TOKENIZER.eos_token_id,
    )
    kwargs.update(extra)
    return kwargs
192
+
193
 
194
def _check_model():
    """Abort with HTTP 500 if neither inference backend or the tokenizer loaded.

    Raises:
        HTTPException: 500 with the recorded LOAD_ERROR detail.
    """
    if MODEL is None and ONNX_SESSION is None:
        raise HTTPException(status_code=500, detail=f"Model failed to load: {LOAD_ERROR}")
    if TOKENIZER is None:
        raise HTTPException(status_code=500, detail=f"Tokenizer failed to load: {LOAD_ERROR}")
199
+
200
+
201
def _prepare_request(req: ChatRequest):
    """Tokenize the request and resolve per-request sampling overrides.

    Returns (batch, max_new_tokens, temperature, top_p), where batch is the
    tokenized chat-template prompt as PyTorch tensors. Unset request fields
    fall back to the module-level environment defaults.
    """
    sys_prompt = req.system_prompt or SYSTEM_PROMPT
    thinking = bool(req.enable_thinking)
    history = [{"role": m.role, "content": m.content} for m in req.messages]

    encoded = TOKENIZER(
        _build_prompt(history, sys_prompt, thinking),
        return_tensors="pt",
        add_special_tokens=False,
    )

    tokens_cap = req.max_new_tokens or MAX_NEW_TOKENS
    temp = TEMPERATURE if req.temperature is None else req.temperature
    nucleus = TOP_P if req.top_p is None else req.top_p

    return encoded, tokens_cap, temp, nucleus
214
+
215
+
216
@app.post("/chat")
def chat(req: ChatRequest):
    """Non-streaming chat completion.

    Returns the generated answer, the optional <think> reasoning, and basic
    token/latency metrics. Delegates to the SSE endpoint when req.stream is
    true.

    Raises:
        HTTPException 400: empty messages list.
        HTTPException 500: model/tokenizer failed to load.
        HTTPException 501: only an ONNX session loaded — this app has no
            ONNX generation path, only MODEL.generate().
    """
    _check_model()

    if not req.messages:
        raise HTTPException(status_code=400, detail="messages are required")

    if req.stream:
        return chat_stream(req)

    # Guard: _check_model() passes when only ONNX_SESSION is loaded, but the
    # generate() call below requires the PyTorch MODEL — without this check
    # an ONNX-only load would crash with AttributeError on None.
    if MODEL is None:
        raise HTTPException(status_code=501, detail="ONNX generation path is not implemented")

    batch, max_new_tokens, temperature, top_p = _prepare_request(req)
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]

    t0 = time.time()

    with torch.no_grad():
        out = MODEL.generate(
            **_generation_kwargs(input_ids, attention_mask, max_new_tokens, temperature, top_p)
        )

    elapsed = time.time() - t0
    # Decode only the newly generated suffix, not the prompt.
    generated_ids = out[0][input_ids.shape[1]:]
    completion = TOKENIZER.decode(generated_ids, skip_special_tokens=True).strip()
    thinking, answer = _parse_thinking(completion)

    return {
        "output": answer,
        "thinking": thinking if thinking else None,
        "model": "north-air-1",
        "inference": INFERENCE_MODE,
        "tokens_generated": len(generated_ids),
        "latency_ms": round(elapsed * 1000),
    }
 
 
 
 
 
 
 
 
250
 
 
 
 
 
251
 
252
@app.post("/chat/stream")
def chat_stream(req: ChatRequest):
    """Streaming chat via Server-Sent Events.

    Runs generation on a background thread and relays tokens as `data:` SSE
    events: thinking_start / thinking / thinking_end around <think> spans,
    `text` for visible output, and a final `done` event with token count and
    latency.

    Raises:
        HTTPException 400: empty messages list.
        HTTPException 500: model/tokenizer failed to load.
    """
    _check_model()

    if not req.messages:
        raise HTTPException(status_code=400, detail="messages are required")

    batch, max_new_tokens, temperature, top_p = _prepare_request(req)
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]

    # skip_prompt: only newly generated tokens reach the iterator.
    streamer = TextIteratorStreamer(TOKENIZER, skip_prompt=True, skip_special_tokens=True)

    gen_kwargs = _generation_kwargs(
        input_ids, attention_mask, max_new_tokens, temperature, top_p,
        streamer=streamer,
    )

    t0 = time.time()
    thread = Thread(target=_generate_in_thread, args=(gen_kwargs,))
    thread.start()

    def event_stream():
        token_count = 0
        in_thinking = False
        buf = ""  # holds text until we know it isn't part of a split <think> tag

        for token_text in streamer:
            buf += token_text
            token_count += 1

            # Opening tag completed: emit thinking_start, keep any text after it.
            if "<think>" in buf and not in_thinking:
                in_thinking = True
                yield f"data: {json.dumps({'type': 'thinking_start'})}\n\n"
                after = buf.split("<think>", 1)[1]
                buf = after if after else ""

            # Closing tag completed: flush thinking text, then trailing answer text.
            if "</think>" in buf and in_thinking:
                before = buf.split("</think>", 1)[0]
                if before:
                    yield f"data: {json.dumps({'type': 'thinking', 'text': before})}\n\n"
                in_thinking = False
                yield f"data: {json.dumps({'type': 'thinking_end'})}\n\n"
                after = buf.split("</think>", 1)[1].lstrip()
                buf = ""
                if after:
                    yield f"data: {json.dumps({'type': 'text', 'text': after})}\n\n"
                continue

            # If buf ends with a proper prefix of a tag ("<", "<t", ...), hold
            # it back one more token so a tag split across token boundaries is
            # never emitted as visible text.
            partial_open = "<think"
            partial_close = "</think"
            if not in_thinking and buf.endswith(tuple(partial_open[:i] for i in range(1, len(partial_open) + 1))):
                continue
            if in_thinking and buf.endswith(tuple(partial_close[:i] for i in range(1, len(partial_close) + 1))):
                continue

            if buf:
                evt_type = "thinking" if in_thinking else "text"
                yield f"data: {json.dumps({'type': evt_type, 'text': buf})}\n\n"
                buf = ""

        # Streamer exhausted: flush any held-back remainder.
        if buf:
            evt_type = "thinking" if in_thinking else "text"
            yield f"data: {json.dumps({'type': evt_type, 'text': buf})}\n\n"
        if in_thinking:
            # Model never closed its <think> block — end the span cleanly.
            yield f"data: {json.dumps({'type': 'thinking_end'})}\n\n"

        thread.join()
        elapsed = time.time() - t0
        yield f"data: {json.dumps({'type': 'done', 'tokens_generated': token_count, 'latency_ms': round(elapsed * 1000), 'inference': INFERENCE_MODE})}\n\n"

    return StreamingResponse(event_stream(), media_type="text/event-stream")
324
 
 
 
 
 
 
 
 
 
325
 
326
def _generate_in_thread(kwargs):
    """Thread target: run MODEL.generate under no_grad.

    Output tokens are delivered to the caller through the TextIteratorStreamer
    carried inside `kwargs`; the return value of generate() is discarded.
    """
    with torch.no_grad():
        MODEL.generate(**kwargs)
 
 
 
requirements.txt CHANGED
@@ -1,3 +1,6 @@
 
 
 
1
  torch>=2.2.0
2
  transformers>=4.45.0
3
  peft>=0.12.0
 
1
+ fastapi==0.115.0
2
+ uvicorn[standard]==0.30.6
3
+ pydantic==2.9.2
4
  torch>=2.2.0
5
  transformers>=4.45.0
6
  peft>=0.12.0