maxime-antoine-dev committed on
Commit
1f23e23
·
1 Parent(s): 712c34b

added light mode

Browse files
Files changed (1) hide show
  1. main.py +239 -99
main.py CHANGED
@@ -2,8 +2,9 @@
2
  import os
3
  import json
4
  import time
 
5
  import asyncio
6
- from typing import Any, Dict, Optional
7
  from functools import lru_cache
8
 
9
  from fastapi import FastAPI
@@ -12,26 +13,51 @@ from huggingface_hub import hf_hub_download
12
  from llama_cpp import Llama
13
 
14
  # ----------------------------
15
- # Config
16
  # ----------------------------
17
  GGUF_REPO_ID = os.getenv("GGUF_REPO_ID", "maxime-antoine-dev/fades-mistral-v02-gguf")
18
  GGUF_FILENAME = os.getenv("GGUF_FILENAME", "mistral_v02_fades.Q4_K_M.gguf")
19
 
20
- # llama.cpp params (CPU Space)
21
- N_CTX = int(os.getenv("N_CTX", "2048"))
22
- N_THREADS = int(os.getenv("N_THREADS", str(max(1, (os.cpu_count() or 2) - 1))))
 
 
23
  N_BATCH = int(os.getenv("N_BATCH", "256"))
24
 
25
- # generation defaults
26
  MAX_NEW_TOKENS_DEFAULT = int(os.getenv("MAX_NEW_TOKENS", "180"))
27
  TEMPERATURE_DEFAULT = float(os.getenv("TEMPERATURE", "0.0"))
28
  TOP_P_DEFAULT = float(os.getenv("TOP_P", "0.95"))
29
 
30
- # One request at a time on CPU (prevents stalls / extreme latency)
 
 
 
 
 
 
 
 
31
  GEN_LOCK = asyncio.Lock()
32
 
 
 
33
  # ----------------------------
34
- # Prompt (aligned with your training target)
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  # ----------------------------
36
  ALLOWED_LABELS = [
37
  "none",
@@ -51,66 +77,52 @@ ALLOWED_LABELS = [
51
  "intentional",
52
  ]
53
 
54
- def labels_block_compact() -> str:
55
- return "\n".join([f'- "{k}"' for k in ALLOWED_LABELS])
56
 
57
- INSTRUCTION = """You are a logical fallacy detection assistant.
58
 
59
  You MUST choose labels ONLY from this list (use the exact string):
60
- {labels_list}
61
 
62
- Return ONLY ONE valid JSON object with this schema:
63
  {{
64
  "has_fallacy": boolean,
65
  "fallacies": [
66
  {{
67
  "type": string,
68
- "confidence": number, // 0.0..1.0
69
- "evidence_quotes": [string], // exact substring(s) copied from the input text
70
- "rationale": string // specific to this fallacy + quote
71
  }}
72
  ],
73
- "overall_explanation": string // short summary across the whole input
74
  }}
75
 
76
- Hard rules:
77
- - Output ONLY the JSON object. No markdown. No extra text.
78
- - Produce exactly ONE JSON object, then STOP.
79
- - evidence_quotes MUST be exact substrings from the input text.
80
- - If has_fallacy=false:
81
- - fallacies MUST be []
82
- - overall_explanation MUST explicitly say there is no fallacy
83
- - overall_explanation MUST NOT mention any fallacy label/category names.
84
- - If has_fallacy=true:
85
- - fallacies MUST contain at least 1 item
86
- - EACH fallacies[i].type MUST be one of the allowed labels (NOT a synonym)
87
- """
88
-
89
- SYSTEM_PROMPT = "You are a careful JSON-only assistant. Output only JSON."
90
 
91
  def build_messages(text: str) -> list[dict]:
92
- instruction = INSTRUCTION.format(labels_list=labels_block_compact())
93
  return [
94
- {"role": "system", "content": SYSTEM_PROMPT},
95
- {"role": "user", "content": f"{instruction}\n\nTEXT:\n{text}\n\nJSON:"},
96
  ]
97
 
98
  # ----------------------------
99
- # Robust JSON extraction
100
  # ----------------------------
101
- def extract_first_json_obj(s: str) -> Optional[Dict[str, Any]]:
102
- start = s.find("{")
103
- if start == -1:
104
- return None
105
- end = s.rfind("}")
106
- if end == -1 or end <= start:
107
- return None
108
- cand = s[start : end + 1].strip()
109
- try:
110
- return json.loads(cand)
111
- except Exception:
112
- return None
113
 
 
 
 
114
  def stop_at_complete_json(text: str) -> Optional[str]:
115
  start = text.find("{")
116
  if start == -1:
@@ -142,73 +154,145 @@ def stop_at_complete_json(text: str) -> Optional[str]:
142
  return text[start : i + 1]
143
  return None
144
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  # ----------------------------
146
- # Load GGUF model (global)
147
  # ----------------------------
148
  llm: Optional[Llama] = None
149
  model_path: Optional[str] = None
 
 
150
 
151
- def load_llama() -> tuple[str, Llama]:
152
- global model_path
153
 
154
- t0 = time.time()
155
- mp = hf_hub_download(
156
- repo_id=GGUF_REPO_ID,
157
- filename=GGUF_FILENAME,
158
- token=os.getenv("HF_TOKEN"), # optional (only if repo is private)
159
- )
160
- t1 = time.time()
161
-
162
- # CPU Space -> n_gpu_layers = 0
163
- llama = Llama(
164
- model_path=mp,
165
- n_ctx=N_CTX,
166
- n_threads=N_THREADS,
167
- n_batch=N_BATCH,
168
- n_gpu_layers=0,
169
- verbose=True,
170
- )
171
- t2 = time.time()
172
 
173
- print(f"✅ GGUF downloaded: {mp} ({t1 - t0:.1f}s)")
174
- print(f"✅ Model loaded: ({t2 - t1:.1f}s) n_ctx={N_CTX} threads={N_THREADS} batch={N_BATCH}")
175
- model_path = mp
176
- return mp, llama
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
 
178
- # ----------------------------
179
- # FastAPI
180
- # ----------------------------
181
- app = FastAPI(title="FADES Fallacy Detector (GGUF / llama.cpp)")
182
 
183
- class AnalyzeRequest(BaseModel):
184
- text: str
185
- max_new_tokens: int = MAX_NEW_TOKENS_DEFAULT
186
- temperature: float = TEMPERATURE_DEFAULT
187
- top_p: float = TOP_P_DEFAULT
188
 
189
  @app.get("/health")
190
  def health():
191
  return {
192
- "ok": True,
193
- "engine": "llama.cpp (llama-cpp-python)",
 
194
  "gguf_repo": GGUF_REPO_ID,
195
  "gguf_filename": GGUF_FILENAME,
196
- "model_loaded": llm is not None,
197
  "model_path": model_path,
198
  "n_ctx": N_CTX,
199
  "n_threads": N_THREADS,
200
  "n_batch": N_BATCH,
 
201
  }
202
 
203
- @app.on_event("startup")
204
- def _startup():
205
- global llm
206
- _, llm_loaded = load_llama()
207
- llm = llm_loaded
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
 
 
 
 
209
  @lru_cache(maxsize=256)
210
- def _cached_generate(text: str, max_new_tokens: int, temperature: float, top_p: float) -> Dict[str, Any]:
211
- assert llm is not None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
 
213
  messages = build_messages(text)
214
 
@@ -221,18 +305,74 @@ def _cached_generate(text: str, max_new_tokens: int, temperature: float, top_p:
221
  )
222
 
223
  raw = out["choices"][0]["message"]["content"]
224
-
225
- cut = stop_at_complete_json(raw)
226
- raw_cut = cut if cut is not None else raw
227
-
228
- obj = extract_first_json_obj(raw_cut)
229
  if obj is None:
230
- return {"ok": False, "raw": raw_cut}
231
 
232
  return {"ok": True, "result": obj}
233
 
234
  @app.post("/analyze")
235
  async def analyze(req: AnalyzeRequest) -> Dict[str, Any]:
236
- # CPU: serialize requests to keep stable latency
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  async with GEN_LOCK:
238
- return _cached_generate(req.text, int(req.max_new_tokens), float(req.temperature), float(req.top_p))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import os
3
  import json
4
  import time
5
+ import uuid
6
  import asyncio
7
+ from typing import Any, Dict, Optional, Tuple
8
  from functools import lru_cache
9
 
10
  from fastapi import FastAPI
 
13
  from llama_cpp import Llama
14
 
15
  # ----------------------------
16
+ # Config (model)
17
  # ----------------------------
18
  GGUF_REPO_ID = os.getenv("GGUF_REPO_ID", "maxime-antoine-dev/fades-mistral-v02-gguf")
19
  GGUF_FILENAME = os.getenv("GGUF_FILENAME", "mistral_v02_fades.Q4_K_M.gguf")
20
 
21
+ # Model load params (fixed once at startup)
22
+ # Keep these conservative for HF CPU
23
+ N_CTX = int(os.getenv("N_CTX", "1536"))
24
+ CPU_COUNT = os.cpu_count() or 4
25
+ N_THREADS = int(os.getenv("N_THREADS", str(min(8, max(1, CPU_COUNT - 1)))))
26
  N_BATCH = int(os.getenv("N_BATCH", "256"))
27
 
28
+ # Default generation params ("normal")
29
  MAX_NEW_TOKENS_DEFAULT = int(os.getenv("MAX_NEW_TOKENS", "180"))
30
  TEMPERATURE_DEFAULT = float(os.getenv("TEMPERATURE", "0.0"))
31
  TOP_P_DEFAULT = float(os.getenv("TOP_P", "0.95"))
32
 
33
+ # "Light" generation params (fastest / most stable)
34
+ LIGHT_MAX_NEW_TOKENS = int(os.getenv("LIGHT_MAX_NEW_TOKENS", "60"))
35
+ LIGHT_TEMPERATURE = float(os.getenv("LIGHT_TEMPERATURE", "0.0"))
36
+ LIGHT_TOP_P = float(os.getenv("LIGHT_TOP_P", "0.9"))
37
+
38
+ # "Light" runtime knobs (do NOT reload model, just reduce work)
39
+ LIGHT_N_BATCH = int(os.getenv("LIGHT_N_BATCH", "64"))
40
+
41
+ # One request at a time on CPU
42
  GEN_LOCK = asyncio.Lock()
43
 
44
+ app = FastAPI(title="FADES Fallacy Detector (GGUF / llama.cpp)")
45
+
46
  # ----------------------------
47
+ # Request model
48
+ # ----------------------------
49
+ class AnalyzeRequest(BaseModel):
50
+ text: str
51
+ # if True => use "light" parameters
52
+ light: bool = False
53
+
54
+ # optional overrides (applied after picking light/normal defaults)
55
+ max_new_tokens: Optional[int] = None
56
+ temperature: Optional[float] = None
57
+ top_p: Optional[float] = None
58
+
59
+ # ----------------------------
60
+ # Prompt
61
  # ----------------------------
62
  ALLOWED_LABELS = [
63
  "none",
 
77
  "intentional",
78
  ]
79
 
80
+ LABELS_STR = ", ".join([f'"{x}"' for x in ALLOWED_LABELS])
 
81
 
82
+ PROMPT_TEMPLATE = f"""You are a logical fallacy detection assistant.
83
 
84
  You MUST choose labels ONLY from this list (use the exact string):
85
+ {LABELS_STR}
86
 
87
+ Return ONLY valid JSON with this schema:
88
  {{
89
  "has_fallacy": boolean,
90
  "fallacies": [
91
  {{
92
  "type": string,
93
+ "confidence": number,
94
+ "evidence_quotes": [string],
95
+ "rationale": string
96
  }}
97
  ],
98
+ "overall_explanation": string
99
  }}
100
 
101
+ Rules:
102
+ Output ONLY JSON. No markdown.
103
+ If no fallacy: has_fallacy=false and fallacies=[].
104
+
105
+ INPUT:
106
+ {{text}}
107
+
108
+ OUTPUT:"""
 
 
 
 
 
 
109
 
110
  def build_messages(text: str) -> list[dict]:
 
111
  return [
112
+ {"role": "system", "content": "Output only JSON. Produce exactly one JSON object and stop."},
113
+ {"role": "user", "content": PROMPT_TEMPLATE.replace("{text}", text)},
114
  ]
115
 
116
  # ----------------------------
117
+ # Logging helpers
118
  # ----------------------------
119
+ def _log(rid: str, msg: str):
120
+ # rid = request id to correlate logs
121
+ print(f"[{rid}] {msg}", flush=True)
 
 
 
 
 
 
 
 
 
122
 
123
+ # ----------------------------
124
+ # JSON extraction helpers
125
+ # ----------------------------
126
  def stop_at_complete_json(text: str) -> Optional[str]:
127
  start = text.find("{")
128
  if start == -1:
 
154
  return text[start : i + 1]
155
  return None
156
 
157
+ def extract_first_json_obj(s: str) -> Optional[Dict[str, Any]]:
158
+ cut = stop_at_complete_json(s) or s
159
+ start = cut.find("{")
160
+ end = cut.rfind("}")
161
+ if start == -1 or end == -1 or end <= start:
162
+ return None
163
+ cand = cut[start : end + 1].strip()
164
+ try:
165
+ return json.loads(cand)
166
+ except Exception:
167
+ return None
168
+
169
  # ----------------------------
170
+ # Model load
171
  # ----------------------------
172
  llm: Optional[Llama] = None
173
  model_path: Optional[str] = None
174
+ load_error: Optional[str] = None
175
+ loaded_at_ts: Optional[float] = None
176
 
177
+ def load_llama() -> None:
178
+ global llm, model_path, load_error, loaded_at_ts
179
 
180
+ print("=== FADES startup ===", flush=True)
181
+ print(f"GGUF_REPO_ID={GGUF_REPO_ID}", flush=True)
182
+ print(f"GGUF_FILENAME={GGUF_FILENAME}", flush=True)
183
+ print(f"N_CTX={N_CTX} N_THREADS={N_THREADS} N_BATCH={N_BATCH}", flush=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
+ try:
186
+ t0 = time.time()
187
+ mp = hf_hub_download(
188
+ repo_id=GGUF_REPO_ID,
189
+ filename=GGUF_FILENAME,
190
+ token=os.getenv("HF_TOKEN"),
191
+ )
192
+ t1 = time.time()
193
+ print(f"✅ GGUF downloaded: {mp} ({t1 - t0:.1f}s)", flush=True)
194
+
195
+ t2 = time.time()
196
+ llm_local = Llama(
197
+ model_path=mp,
198
+ n_ctx=N_CTX,
199
+ n_threads=N_THREADS,
200
+ n_batch=N_BATCH,
201
+ n_gpu_layers=0,
202
+ verbose=False,
203
+ )
204
+ t3 = time.time()
205
+ print(f"✅ Model loaded: ({t3 - t2:.1f}s) n_ctx={N_CTX} threads={N_THREADS} batch={N_BATCH}", flush=True)
206
+
207
+ llm = llm_local
208
+ model_path = mp
209
+ load_error = None
210
+ loaded_at_ts = time.time()
211
+ print("=== Startup OK ===", flush=True)
212
+
213
+ except Exception as e:
214
+ load_error = repr(e)
215
+ print(f"❌ Startup FAILED: {load_error}", flush=True)
216
 
217
+ @app.on_event("startup")
218
+ def _startup():
219
+ load_llama()
 
220
 
221
+ @app.get("/")
222
+ def root():
223
+ return {"ok": True, "hint": "Use GET /health or POST /analyze"}
 
 
224
 
225
  @app.get("/health")
226
  def health():
227
  return {
228
+ "ok": llm is not None and load_error is None,
229
+ "model_loaded": llm is not None,
230
+ "load_error": load_error,
231
  "gguf_repo": GGUF_REPO_ID,
232
  "gguf_filename": GGUF_FILENAME,
 
233
  "model_path": model_path,
234
  "n_ctx": N_CTX,
235
  "n_threads": N_THREADS,
236
  "n_batch": N_BATCH,
237
+ "loaded_at_ts": loaded_at_ts,
238
  }
239
 
240
+ # ----------------------------
241
+ # Param selection (light vs normal)
242
+ # ----------------------------
243
+ def pick_params(req: AnalyzeRequest) -> Dict[str, Any]:
244
+ if req.light:
245
+ params = {
246
+ "max_new_tokens": LIGHT_MAX_NEW_TOKENS,
247
+ "temperature": LIGHT_TEMPERATURE,
248
+ "top_p": LIGHT_TOP_P,
249
+ "n_batch": LIGHT_N_BATCH,
250
+ }
251
+ else:
252
+ params = {
253
+ "max_new_tokens": MAX_NEW_TOKENS_DEFAULT,
254
+ "temperature": TEMPERATURE_DEFAULT,
255
+ "top_p": TOP_P_DEFAULT,
256
+ "n_batch": N_BATCH, # keep default
257
+ }
258
+
259
+ # Apply per-request overrides (if provided)
260
+ if req.max_new_tokens is not None:
261
+ params["max_new_tokens"] = int(req.max_new_tokens)
262
+ if req.temperature is not None:
263
+ params["temperature"] = float(req.temperature)
264
+ if req.top_p is not None:
265
+ params["top_p"] = float(req.top_p)
266
+
267
+ # Hard safety caps on CPU
268
+ params["max_new_tokens"] = max(1, min(int(params["max_new_tokens"]), 300))
269
+ params["temperature"] = max(0.0, min(float(params["temperature"]), 1.5))
270
+ params["top_p"] = max(0.05, min(float(params["top_p"]), 1.0))
271
+ params["n_batch"] = max(16, min(int(params["n_batch"]), 512))
272
+
273
+ return params
274
 
275
+ # ----------------------------
276
+ # Cached generate - separated by mode + params
277
+ # ----------------------------
278
  @lru_cache(maxsize=256)
279
+ def _cached_generate(
280
+ text: str,
281
+ light: bool,
282
+ max_new_tokens: int,
283
+ temperature: float,
284
+ top_p: float,
285
+ n_batch: int,
286
+ ) -> Dict[str, Any]:
287
+ if llm is None:
288
+ return {"ok": False, "error": "model_not_loaded", "detail": load_error}
289
+
290
+ # Change batch for this call (llama-cpp-python supports runtime override)
291
+ # Some versions accept it; if yours doesn't, it will be ignored harmlessly.
292
+ try:
293
+ llm.n_batch = int(n_batch) # type: ignore[attr-defined]
294
+ except Exception:
295
+ pass
296
 
297
  messages = build_messages(text)
298
 
 
305
  )
306
 
307
  raw = out["choices"][0]["message"]["content"]
308
+ obj = extract_first_json_obj(raw)
 
 
 
 
309
  if obj is None:
310
+ return {"ok": False, "error": "json_parse_error", "raw": raw}
311
 
312
  return {"ok": True, "result": obj}
313
 
314
  @app.post("/analyze")
315
  async def analyze(req: AnalyzeRequest) -> Dict[str, Any]:
316
+ rid = uuid.uuid4().hex[:10]
317
+ t0 = time.time()
318
+
319
+ _log(rid, f"📩 Request received (light={req.light}) chars={len(req.text)}")
320
+
321
+ if not req.text or not req.text.strip():
322
+ _log(rid, "⚠️ Empty text")
323
+ return {"ok": False, "error": "empty_text"}
324
+
325
+ params = pick_params(req)
326
+ _log(
327
+ rid,
328
+ f"βš™οΈ Params: max_new_tokens={params['max_new_tokens']} temp={params['temperature']} top_p={params['top_p']} n_batch={params['n_batch']}",
329
+ )
330
+
331
+ # serialize requests on CPU
332
  async with GEN_LOCK:
333
+ _log(rid, "🔒 Acquired GEN_LOCK")
334
+ t_lock = time.time()
335
+
336
+ _log(rid, "🧱 Building prompt/messages")
337
+ t1 = time.time()
338
+
339
+ # Generate
340
+ _log(rid, "🧠 Generating...")
341
+ t2 = time.time()
342
+ res = _cached_generate(
343
+ req.text,
344
+ bool(req.light),
345
+ int(params["max_new_tokens"]),
346
+ float(params["temperature"]),
347
+ float(params["top_p"]),
348
+ int(params["n_batch"]),
349
+ )
350
+ t3 = time.time()
351
+
352
+ if not res.get("ok"):
353
+ _log(rid, f"❌ Generation failed: {res.get('error')}")
354
+ else:
355
+ _log(rid, "✅ JSON parsed OK")
356
+
357
+ elapsed_total = time.time() - t0
358
+ elapsed_lock = time.time() - t_lock
359
+ _log(rid, f"⏱ Done. gen_time={t3 - t2:.2f}s total={elapsed_total:.2f}s (under lock {elapsed_lock:.2f}s)")
360
+
361
+ # return with timings
362
+ return {
363
+ **res,
364
+ "meta": {
365
+ "request_id": rid,
366
+ "light": bool(req.light),
367
+ "params": {
368
+ "max_new_tokens": int(params["max_new_tokens"]),
369
+ "temperature": float(params["temperature"]),
370
+ "top_p": float(params["top_p"]),
371
+ "n_batch": int(params["n_batch"]),
372
+ },
373
+ "timings_s": {
374
+ "total": round(elapsed_total, 3),
375
+ "gen": round(t3 - t2, 3),
376
+ },
377
+ },
378
+ }