maxime-antoine-dev committed on
Commit
66ca5c9
·
1 Parent(s): 8d0988b

Added rewrite route, improved prompts

Browse files
Files changed (1) hide show
  1. main.py +450 -82
main.py CHANGED
@@ -4,23 +4,23 @@ import json
4
  import time
5
  import uuid
6
  import asyncio
7
- from typing import Any, Dict, Optional, Tuple
8
  from functools import lru_cache
9
 
10
  from fastapi import FastAPI
11
  from fastapi.middleware.cors import CORSMiddleware
12
- from pydantic import BaseModel
13
  from huggingface_hub import hf_hub_download
14
  from llama_cpp import Llama
15
 
16
- # ----------------------------
 
17
  # Config (model)
18
- # ----------------------------
19
  GGUF_REPO_ID = os.getenv("GGUF_REPO_ID", "maxime-antoine-dev/fades-mistral-v02-gguf")
20
  GGUF_FILENAME = os.getenv("GGUF_FILENAME", "mistral_v02_fades.Q4_K_M.gguf")
21
 
22
  # Model load params (fixed once at startup)
23
- # Keep these conservative for HF CPU
24
  N_CTX = int(os.getenv("N_CTX", "1536"))
25
  CPU_COUNT = os.cpu_count() or 4
26
  N_THREADS = int(os.getenv("N_THREADS", str(min(8, max(1, CPU_COUNT - 1)))))
@@ -31,12 +31,12 @@ MAX_NEW_TOKENS_DEFAULT = int(os.getenv("MAX_NEW_TOKENS", "180"))
31
  TEMPERATURE_DEFAULT = float(os.getenv("TEMPERATURE", "0.0"))
32
  TOP_P_DEFAULT = float(os.getenv("TOP_P", "0.95"))
33
 
34
- # "Light" generation params (fastest / most stable)
35
  LIGHT_MAX_NEW_TOKENS = int(os.getenv("LIGHT_MAX_NEW_TOKENS", "60"))
36
  LIGHT_TEMPERATURE = float(os.getenv("LIGHT_TEMPERATURE", "0.0"))
37
  LIGHT_TOP_P = float(os.getenv("LIGHT_TOP_P", "0.9"))
38
 
39
- # "Light" runtime knobs (do NOT reload model, just reduce work)
40
  LIGHT_N_BATCH = int(os.getenv("LIGHT_N_BATCH", "64"))
41
 
42
  # One request at a time on CPU
@@ -44,17 +44,16 @@ GEN_LOCK = asyncio.Lock()
44
 
45
  app = FastAPI(title="FADES Fallacy Detector (GGUF / llama.cpp)")
46
 
47
- # ----------------------------
 
48
  # CORS (for browser front-ends)
49
- # ----------------------------
50
- # Comma-separated list of allowed origins, or "*" to allow all.
51
  _CORS_ORIGINS = os.getenv("CORS_ALLOW_ORIGINS", "*").strip()
52
  if _CORS_ORIGINS == "*" or not _CORS_ORIGINS:
53
  allow_origins = ["*"]
54
  else:
55
  allow_origins = [o.strip() for o in _CORS_ORIGINS.split(",") if o.strip()]
56
 
57
- # Note: when allow_origins=["*"], allow_credentials must be False.
58
  app.add_middleware(
59
  CORSMiddleware,
60
  allow_origins=allow_origins,
@@ -63,22 +62,34 @@ app.add_middleware(
63
  allow_headers=["*"],
64
  )
65
 
66
- # ----------------------------
67
- # Request model
68
- # ----------------------------
69
- class AnalyzeRequest(BaseModel):
70
- text: str
71
  # if True => use "light" parameters
72
  light: bool = False
73
-
74
  # optional overrides (applied after picking light/normal defaults)
75
  max_new_tokens: Optional[int] = None
76
  temperature: Optional[float] = None
77
  top_p: Optional[float] = None
78
 
79
- # ----------------------------
80
- # Prompt
81
- # ----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  ALLOWED_LABELS = [
83
  "none",
84
  "faulty generalization",
@@ -99,12 +110,13 @@ ALLOWED_LABELS = [
99
 
100
  LABELS_STR = ", ".join([f'"{x}"' for x in ALLOWED_LABELS])
101
 
102
- PROMPT_TEMPLATE = f"""You are a logical fallacy detection assistant.
 
103
 
104
- You MUST choose labels ONLY from this list (use the exact string):
105
  {LABELS_STR}
106
 
107
- Return ONLY valid JSON with this schema:
108
  {{
109
  "has_fallacy": boolean,
110
  "fallacies": [
@@ -118,31 +130,96 @@ Return ONLY valid JSON with this schema:
118
  "overall_explanation": string
119
  }}
120
 
121
- Rules:
122
- Output ONLY JSON. No markdown.
123
- If no fallacy: has_fallacy=false and fallacies=[].
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
  INPUT:
126
  {{text}}
127
 
128
  OUTPUT:"""
129
 
130
- def build_messages(text: str) -> list[dict]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  return [
132
- {"role": "system", "content": "Output only JSON. Produce exactly one JSON object and stop."},
133
- {"role": "user", "content": PROMPT_TEMPLATE.replace("{text}", text)},
134
  ]
135
 
136
- # ----------------------------
137
- # Logging helpers
138
- # ----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  def _log(rid: str, msg: str):
140
- # rid = request id to correlate logs
141
  print(f"[{rid}] {msg}", flush=True)
142
 
143
- # ----------------------------
144
- # JSON extraction helpers
145
- # ----------------------------
 
146
  def stop_at_complete_json(text: str) -> Optional[str]:
147
  start = text.find("{")
148
  if start == -1:
@@ -174,6 +251,7 @@ def stop_at_complete_json(text: str) -> Optional[str]:
174
  return text[start : i + 1]
175
  return None
176
 
 
177
  def extract_first_json_obj(s: str) -> Optional[Dict[str, Any]]:
178
  cut = stop_at_complete_json(s) or s
179
  start = cut.find("{")
@@ -186,14 +264,16 @@ def extract_first_json_obj(s: str) -> Optional[Dict[str, Any]]:
186
  except Exception:
187
  return None
188
 
189
- # ----------------------------
 
190
  # Model load
191
- # ----------------------------
192
  llm: Optional[Llama] = None
193
  model_path: Optional[str] = None
194
  load_error: Optional[str] = None
195
  loaded_at_ts: Optional[float] = None
196
 
 
197
  def load_llama() -> None:
198
  global llm, model_path, load_error, loaded_at_ts
199
 
@@ -234,13 +314,16 @@ def load_llama() -> None:
234
  load_error = repr(e)
235
  print(f"❌ Startup FAILED: {load_error}", flush=True)
236
 
 
237
  @app.on_event("startup")
238
  def _startup():
239
  load_llama()
240
 
 
241
  @app.get("/")
242
  def root():
243
- return {"ok": True, "hint": "Use GET /health or POST /analyze"}
 
244
 
245
  @app.get("/health")
246
  def health():
@@ -257,10 +340,11 @@ def health():
257
  "loaded_at_ts": loaded_at_ts,
258
  }
259
 
260
- # ----------------------------
261
- # Param selection (light vs normal)
262
- # ----------------------------
263
- def pick_params(req: AnalyzeRequest) -> Dict[str, Any]:
 
264
  if req.light:
265
  params = {
266
  "max_new_tokens": LIGHT_MAX_NEW_TOKENS,
@@ -273,10 +357,9 @@ def pick_params(req: AnalyzeRequest) -> Dict[str, Any]:
273
  "max_new_tokens": MAX_NEW_TOKENS_DEFAULT,
274
  "temperature": TEMPERATURE_DEFAULT,
275
  "top_p": TOP_P_DEFAULT,
276
- "n_batch": N_BATCH, # keep default
277
  }
278
 
279
- # Apply per-request overrides (if provided)
280
  if req.max_new_tokens is not None:
281
  params["max_new_tokens"] = int(req.max_new_tokens)
282
  if req.temperature is not None:
@@ -284,20 +367,115 @@ def pick_params(req: AnalyzeRequest) -> Dict[str, Any]:
284
  if req.top_p is not None:
285
  params["top_p"] = float(req.top_p)
286
 
287
- # Hard safety caps on CPU
288
- params["max_new_tokens"] = max(1, min(int(params["max_new_tokens"]), 300))
289
  params["temperature"] = max(0.0, min(float(params["temperature"]), 1.5))
290
  params["top_p"] = max(0.05, min(float(params["top_p"]), 1.0))
291
  params["n_batch"] = max(16, min(int(params["n_batch"]), 512))
292
-
293
  return params
294
 
295
- # ----------------------------
296
- # Cached generate - separated by mode + params
297
- # ----------------------------
298
- @lru_cache(maxsize=256)
299
- def _cached_generate(
300
- text: str,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  light: bool,
302
  max_new_tokens: int,
303
  temperature: float,
@@ -307,14 +485,27 @@ def _cached_generate(
307
  if llm is None:
308
  return {"ok": False, "error": "model_not_loaded", "detail": load_error}
309
 
310
- # Change batch for this call (llama-cpp-python supports runtime override)
311
- # Some versions accept it; if yours doesn't, it will be ignored harmlessly.
312
  try:
313
  llm.n_batch = int(n_batch) # type: ignore[attr-defined]
314
  except Exception:
315
  pass
316
 
317
- messages = build_messages(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
 
319
  out = llm.create_chat_completion(
320
  messages=messages,
@@ -331,15 +522,44 @@ def _cached_generate(
331
 
332
  return {"ok": True, "result": obj}
333
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
  @app.post("/analyze")
335
  async def analyze(req: AnalyzeRequest) -> Dict[str, Any]:
336
  rid = uuid.uuid4().hex[:10]
337
  t0 = time.time()
338
 
339
- _log(rid, f"📩 Request received (light={req.light}) chars={len(req.text)}")
340
 
341
  if not req.text or not req.text.strip():
342
- _log(rid, "⚠️ Empty text")
343
  return {"ok": False, "error": "empty_text"}
344
 
345
  params = pick_params(req)
@@ -348,37 +568,29 @@ async def analyze(req: AnalyzeRequest) -> Dict[str, Any]:
348
  f"⚙️ Params: max_new_tokens={params['max_new_tokens']} temp={params['temperature']} top_p={params['top_p']} n_batch={params['n_batch']}",
349
  )
350
 
351
- # serialize requests on CPU
 
352
  async with GEN_LOCK:
353
- _log(rid, "🔒 Acquired GEN_LOCK")
354
  t_lock = time.time()
355
 
356
- _log(rid, "🧱 Building prompt/messages")
357
- t1 = time.time()
358
-
359
- # Generate
360
- _log(rid, "🧠 Generating...")
361
- t2 = time.time()
362
- res = _cached_generate(
363
- req.text,
364
  bool(req.light),
365
  int(params["max_new_tokens"]),
366
  float(params["temperature"]),
367
  float(params["top_p"]),
368
  int(params["n_batch"]),
369
  )
370
- t3 = time.time()
371
 
372
- if not res.get("ok"):
373
- _log(rid, f"❌ Generation failed: {res.get('error')}")
374
- else:
375
- _log(rid, "✅ JSON parsed OK")
376
 
377
- elapsed_total = time.time() - t0
378
- elapsed_lock = time.time() - t_lock
379
- _log(rid, f"⏱ Done. gen_time={t3 - t2:.2f}s total={elapsed_total:.2f}s (under lock {elapsed_lock:.2f}s)")
380
-
381
- # return with timings
382
  return {
383
  **res,
384
  "meta": {
@@ -390,9 +602,165 @@ async def analyze(req: AnalyzeRequest) -> Dict[str, Any]:
390
  "top_p": float(params["top_p"]),
391
  "n_batch": int(params["n_batch"]),
392
  },
393
- "timings_s": {
394
- "total": round(elapsed_total, 3),
395
- "gen": round(t3 - t2, 3),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
396
  },
 
397
  },
398
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import time
5
  import uuid
6
  import asyncio
7
+ from typing import Any, Dict, Optional, List
8
  from functools import lru_cache
9
 
10
  from fastapi import FastAPI
11
  from fastapi.middleware.cors import CORSMiddleware
12
+ from pydantic import BaseModel, Field
13
  from huggingface_hub import hf_hub_download
14
  from llama_cpp import Llama
15
 
16
+
17
+ # ============================
18
  # Config (model)
19
+ # ============================
20
  GGUF_REPO_ID = os.getenv("GGUF_REPO_ID", "maxime-antoine-dev/fades-mistral-v02-gguf")
21
  GGUF_FILENAME = os.getenv("GGUF_FILENAME", "mistral_v02_fades.Q4_K_M.gguf")
22
 
23
  # Model load params (fixed once at startup)
 
24
  N_CTX = int(os.getenv("N_CTX", "1536"))
25
  CPU_COUNT = os.cpu_count() or 4
26
  N_THREADS = int(os.getenv("N_THREADS", str(min(8, max(1, CPU_COUNT - 1)))))
 
31
  TEMPERATURE_DEFAULT = float(os.getenv("TEMPERATURE", "0.0"))
32
  TOP_P_DEFAULT = float(os.getenv("TOP_P", "0.95"))
33
 
34
+ # "Light" generation params
35
  LIGHT_MAX_NEW_TOKENS = int(os.getenv("LIGHT_MAX_NEW_TOKENS", "60"))
36
  LIGHT_TEMPERATURE = float(os.getenv("LIGHT_TEMPERATURE", "0.0"))
37
  LIGHT_TOP_P = float(os.getenv("LIGHT_TOP_P", "0.9"))
38
 
39
+ # "Light" runtime knobs
40
  LIGHT_N_BATCH = int(os.getenv("LIGHT_N_BATCH", "64"))
41
 
42
  # One request at a time on CPU
 
44
 
45
  app = FastAPI(title="FADES Fallacy Detector (GGUF / llama.cpp)")
46
 
47
+
48
+ # ============================
49
  # CORS (for browser front-ends)
50
+ # ============================
 
51
  _CORS_ORIGINS = os.getenv("CORS_ALLOW_ORIGINS", "*").strip()
52
  if _CORS_ORIGINS == "*" or not _CORS_ORIGINS:
53
  allow_origins = ["*"]
54
  else:
55
  allow_origins = [o.strip() for o in _CORS_ORIGINS.split(",") if o.strip()]
56
 
 
57
  app.add_middleware(
58
  CORSMiddleware,
59
  allow_origins=allow_origins,
 
62
  allow_headers=["*"],
63
  )
64
 
65
+
66
+ # ============================
67
+ # Schemas
68
+ # ============================
69
class GenParams(BaseModel):
    """Shared generation knobs accepted by both /analyze and /rewrite request bodies."""

    # if True => use "light" parameters
    light: bool = False

    # optional overrides (applied after picking light/normal defaults)
    max_new_tokens: Optional[int] = None  # capped by pick_params' safety limits
    temperature: Optional[float] = None  # clamped to [0.0, 1.5] by pick_params
    top_p: Optional[float] = None  # clamped to [0.05, 1.0] by pick_params
76
 
77
+
78
class AnalyzeRequest(GenParams):
    """Body for POST /analyze: the text to scan for fallacies, plus inherited GenParams knobs."""

    # text to analyze; the route rejects it with "empty_text" when blank after strip()
    text: str
80
+
81
+
82
class RewriteRequest(GenParams):
    """Body for POST /rewrite: replace one fallacious quote inside a larger text."""

    # full text containing the quote; rejected with "empty_text" when blank
    text: str
    quote: str = Field(..., description="Verbatim substring that must be replaced.")
    fallacy_type: str = Field(..., description="Fallacy type of the quote.")
    rationale: str = Field(..., description="Why the quote is fallacious.")
    occurrence: int = Field(0, description="Which occurrence of quote to replace (0-based).")
88
+
89
+
90
+ # ============================
91
+ # Labels & Prompts
92
+ # ============================
93
  ALLOWED_LABELS = [
94
  "none",
95
  "faulty generalization",
 
110
 
111
  LABELS_STR = ", ".join([f'"{x}"' for x in ALLOWED_LABELS])
112
 
113
+ # Stronger /analyze prompt: forces specificity and forbids the "template" sentence
114
+ ANALYZE_PROMPT = f"""You are a fallacy detection assistant.
115
 
116
+ You MUST choose labels ONLY from this list (exact string):
117
  {LABELS_STR}
118
 
119
+ You MUST return ONLY valid JSON with this schema:
120
  {{
121
  "has_fallacy": boolean,
122
  "fallacies": [
 
130
  "overall_explanation": string
131
  }}
132
 
133
+ Hard rules:
134
+ - Output ONLY JSON. No markdown. No extra text.
135
+ - evidence_quotes MUST be verbatim substrings copied from the input text (no paraphrase).
136
+ - Keep each evidence quote short (prefer 1–2 sentences; max 240 chars).
137
+ - confidence MUST be a real probability between 0.0 and 1.0 (use 2 decimals).
138
+ It MUST NOT be always the same across examples. Calibrate it:
139
+ * 0.90–1.00: very explicit, unambiguous match, clear cue words.
140
+ * 0.70–0.89: strong match but some ambiguity or missing premise.
141
+ * 0.40–0.69: plausible but weak/partial evidence.
142
+ * 0.10–0.39: very uncertain.
143
+ - The rationale MUST be specific to the evidence (2–4 sentences):
144
+ Explain (1) what the quote claims, (2) why that matches the fallacy label,
145
+ (3) what logical step is invalid or missing.
146
+ DO NOT use generic filler. Do NOT reuse stock phrases.
147
+ In particular, you MUST NOT output this sentence:
148
+ "The input contains fallacious reasoning consistent with the predicted type(s)."
149
+ - overall_explanation MUST also be specific (2–5 sentences): summarize the reasoning issues and reference the key cue(s).
150
+ - If no fallacy: has_fallacy=false and fallacies=[] and overall_explanation explains briefly why.
151
 
152
  INPUT:
153
  {{text}}
154
 
155
  OUTPUT:"""
156
 
157
+ # /rewrite prompt: returns ONLY a replacement substring for the quote (server does the replacement)
158
# /rewrite prompt: returns ONLY a replacement substring for the quote (server does the replacement)
REWRITE_PROMPT = """You are rewriting a small quoted span inside a larger text.

Goal:
- You MUST propose a replacement for the QUOTE only.
- The replacement should remove the fallacious reasoning described, while keeping the same tone/style/tense/entities.
- The replacement MUST be plausible in the surrounding context and should be similar length (roughly +/- 40%).
- Do NOT change anything outside the quote. Do NOT add new facts not implied by the original.
- Do NOT introduce new fallacies.

Return ONLY valid JSON with this schema:
{
"replacement_quote": string,
"why_this_fix": string
}

Hard rules:
- Output ONLY JSON. No markdown. No extra text.
- replacement_quote should be standalone text (no surrounding quotes).
- why_this_fix: 1–3 sentences, specific.

INPUT_TEXT:
{text}

QUOTE_TO_REWRITE:
{quote}

FALLACY_TYPE:
{fallacy_type}

WHY_FALLACIOUS:
{rationale}

OUTPUT:"""


def build_analyze_messages(text: str) -> List[Dict[str, str]]:
    """Build the chat messages for the /analyze task.

    The system message pins strict-JSON output; the user message is the
    analyze template with the {text} placeholder substituted.
    """
    return [
        {"role": "system", "content": "Return only JSON. Exactly one JSON object. No extra text."},
        {"role": "user", "content": ANALYZE_PROMPT.replace("{text}", text)},
    ]


def build_rewrite_messages(text: str, quote: str, fallacy_type: str, rationale: str) -> List[Dict[str, str]]:
    """Build the chat messages for the /rewrite task.

    BUG FIX: REWRITE_PROMPT contains literal JSON braces ("{" / "}") in its
    schema section, so str.format() raised ValueError/KeyError on every call.
    Substitute the placeholders with str.replace() instead — the same
    mechanism already used for ANALYZE_PROMPT — which leaves the literal
    braces untouched.
    """
    prompt = (
        REWRITE_PROMPT
        .replace("{text}", text)
        .replace("{quote}", quote)
        .replace("{fallacy_type}", fallacy_type)
        .replace("{rationale}", rationale)
    )
    return [
        {"role": "system", "content": "Return only JSON. Exactly one JSON object. No extra text."},
        {"role": "user", "content": prompt},
    ]
211
+
212
+
213
+ # ============================
214
+ # Logging
215
+ # ============================
216
def _log(rid: str, msg: str) -> None:
    # Prefix every log line with the short request id so concurrent requests
    # can be correlated in stdout; flush so logs appear immediately on HF Spaces.
    print(f"[{rid}] {msg}", flush=True)
218
 
219
+
220
+ # ============================
221
+ # Robust JSON extraction
222
+ # ============================
223
  def stop_at_complete_json(text: str) -> Optional[str]:
224
  start = text.find("{")
225
  if start == -1:
 
251
  return text[start : i + 1]
252
  return None
253
 
254
+
255
  def extract_first_json_obj(s: str) -> Optional[Dict[str, Any]]:
256
  cut = stop_at_complete_json(s) or s
257
  start = cut.find("{")
 
264
  except Exception:
265
  return None
266
 
267
+
268
+ # ============================
269
  # Model load
270
+ # ============================
271
  llm: Optional[Llama] = None
272
  model_path: Optional[str] = None
273
  load_error: Optional[str] = None
274
  loaded_at_ts: Optional[float] = None
275
 
276
+
277
  def load_llama() -> None:
278
  global llm, model_path, load_error, loaded_at_ts
279
 
 
314
  load_error = repr(e)
315
  print(f"❌ Startup FAILED: {load_error}", flush=True)
316
 
317
+
318
@app.on_event("startup")
def _startup() -> None:
    # Load the GGUF model once at process start; failures are recorded in
    # load_error by load_llama() rather than crashing the app.
    load_llama()
321
 
322
+
323
@app.get("/")
def root():
    """Liveness landing route that points clients at the real endpoints."""
    endpoints_hint = "Use GET /health, POST /analyze, POST /rewrite"
    return {"ok": True, "hint": endpoints_hint}
326
+
327
 
328
  @app.get("/health")
329
  def health():
 
340
  "loaded_at_ts": loaded_at_ts,
341
  }
342
 
343
+
344
+ # ============================
345
+ # Param selection
346
+ # ============================
347
+ def pick_params(req: GenParams) -> Dict[str, Any]:
348
  if req.light:
349
  params = {
350
  "max_new_tokens": LIGHT_MAX_NEW_TOKENS,
 
357
  "max_new_tokens": MAX_NEW_TOKENS_DEFAULT,
358
  "temperature": TEMPERATURE_DEFAULT,
359
  "top_p": TOP_P_DEFAULT,
360
+ "n_batch": N_BATCH,
361
  }
362
 
 
363
  if req.max_new_tokens is not None:
364
  params["max_new_tokens"] = int(req.max_new_tokens)
365
  if req.temperature is not None:
 
367
  if req.top_p is not None:
368
  params["top_p"] = float(req.top_p)
369
 
370
+ # Safety caps
371
+ params["max_new_tokens"] = max(1, min(int(params["max_new_tokens"]), 400))
372
  params["temperature"] = max(0.0, min(float(params["temperature"]), 1.5))
373
  params["top_p"] = max(0.05, min(float(params["top_p"]), 1.0))
374
  params["n_batch"] = max(16, min(int(params["n_batch"]), 512))
 
375
  return params
376
 
377
+
378
+ # ============================
379
+ # Output sanitation / validation
380
+ # ============================
381
+ def _clamp01(x: Any, default: float = 0.5) -> float:
382
+ try:
383
+ v = float(x)
384
+ except Exception:
385
+ return default
386
+ if v < 0.0:
387
+ return 0.0
388
+ if v > 1.0:
389
+ return 1.0
390
+ return v
391
+
392
+
393
+ def _is_allowed_label(lbl: Any) -> bool:
394
+ return isinstance(lbl, str) and lbl in ALLOWED_LABELS and lbl != "none"
395
+
396
+
397
+ def sanitize_analyze_output(obj: Dict[str, Any], input_text: str) -> Dict[str, Any]:
398
+ """
399
+ Enforce shape, clamp confidence, drop invalid labels,
400
+ enforce evidence_quotes being substrings.
401
+ """
402
+ has_fallacy = bool(obj.get("has_fallacy", False))
403
+ fallacies_in = obj.get("fallacies", [])
404
+ if not isinstance(fallacies_in, list):
405
+ fallacies_in = []
406
+
407
+ fallacies_out = []
408
+ for f in fallacies_in:
409
+ if not isinstance(f, dict):
410
+ continue
411
+ f_type = f.get("type")
412
+ if not _is_allowed_label(f_type):
413
+ continue
414
+
415
+ conf = _clamp01(f.get("confidence", 0.5))
416
+ # keep 2 decimals for nicer UI
417
+ conf = float(f"{conf:.2f}")
418
+
419
+ ev = f.get("evidence_quotes", [])
420
+ if not isinstance(ev, list):
421
+ ev = []
422
+ ev_clean: List[str] = []
423
+ for q in ev:
424
+ if not isinstance(q, str):
425
+ continue
426
+ qq = q.strip()
427
+ if not qq:
428
+ continue
429
+ # evidence MUST be substring
430
+ if qq in input_text:
431
+ # keep short, but don't hard-cut if it breaks substring matching
432
+ if len(qq) <= 240:
433
+ ev_clean.append(qq)
434
+ else:
435
+ # if too long, try to keep first 240 if still substring (rare); else keep as-is
436
+ short = qq[:240]
437
+ if short in input_text:
438
+ ev_clean.append(short)
439
+ else:
440
+ ev_clean.append(qq)
441
+
442
+ rationale = f.get("rationale")
443
+ if not isinstance(rationale, str):
444
+ rationale = ""
445
+ rationale = rationale.strip()
446
+
447
+ fallacies_out.append(
448
+ {
449
+ "type": f_type,
450
+ "confidence": conf,
451
+ "evidence_quotes": ev_clean[:3],
452
+ "rationale": rationale,
453
+ }
454
+ )
455
+
456
+ overall = obj.get("overall_explanation")
457
+ if not isinstance(overall, str):
458
+ overall = ""
459
+ overall = overall.strip()
460
+
461
+ # If no fallacies survived sanitation, force no-fallacy state
462
+ if len(fallacies_out) == 0:
463
+ has_fallacy = False
464
+
465
+ return {
466
+ "has_fallacy": has_fallacy,
467
+ "fallacies": fallacies_out,
468
+ "overall_explanation": overall,
469
+ }
470
+
471
+
472
+ # ============================
473
+ # Cached generation (task-aware)
474
+ # ============================
475
+ @lru_cache(maxsize=512)
476
+ def _cached_chat_completion(
477
+ task: str,
478
+ payload: str,
479
  light: bool,
480
  max_new_tokens: int,
481
  temperature: float,
 
485
  if llm is None:
486
  return {"ok": False, "error": "model_not_loaded", "detail": load_error}
487
 
 
 
488
  try:
489
  llm.n_batch = int(n_batch) # type: ignore[attr-defined]
490
  except Exception:
491
  pass
492
 
493
+ try:
494
+ data = json.loads(payload)
495
+ except Exception:
496
+ return {"ok": False, "error": "bad_payload"}
497
+
498
+ if task == "analyze":
499
+ messages = build_analyze_messages(data["text"])
500
+ elif task == "rewrite":
501
+ messages = build_rewrite_messages(
502
+ data["text"],
503
+ data["quote"],
504
+ data["fallacy_type"],
505
+ data["rationale"],
506
+ )
507
+ else:
508
+ return {"ok": False, "error": "unknown_task"}
509
 
510
  out = llm.create_chat_completion(
511
  messages=messages,
 
522
 
523
  return {"ok": True, "result": obj}
524
 
525
+
526
+ def _occurrence_index(text: str, sub: str, occurrence: int) -> int:
527
+ if occurrence < 0:
528
+ return -1
529
+ start = 0
530
+ for _ in range(occurrence + 1):
531
+ idx = text.find(sub, start)
532
+ if idx == -1:
533
+ return -1
534
+ start = idx + max(1, len(sub))
535
+ return idx
536
+
537
+
538
+ def _replace_nth(text: str, old: str, new: str, occurrence: int) -> Dict[str, Any]:
539
+ idx = _occurrence_index(text, old, occurrence)
540
+ if idx == -1:
541
+ return {"ok": False, "error": "quote_not_found"}
542
+ return {
543
+ "ok": True,
544
+ "rewritten_text": text[:idx] + new + text[idx + len(old) :],
545
+ "start_char": idx,
546
+ "end_char": idx + len(new),
547
+ "old_start_char": idx,
548
+ "old_end_char": idx + len(old),
549
+ }
550
+
551
+
552
+ # ============================
553
+ # Routes
554
+ # ============================
555
@app.post("/analyze")
async def analyze(req: AnalyzeRequest) -> Dict[str, Any]:
    """Detect logical fallacies in req.text.

    Returns {"ok": True, "result": <sanitized analyze object>, "meta": ...}
    or {"ok": False, "error": ...} plus the same meta/timing envelope.
    Defect fixed: the pasted source lost all indentation; this is the
    reconstructed, properly structured handler.
    """
    rid = uuid.uuid4().hex[:10]
    t0 = time.time()

    _log(rid, f"📩 /analyze received (light={req.light}) chars={len(req.text) if req.text else 0}")

    if not req.text or not req.text.strip():
        return {"ok": False, "error": "empty_text"}

    params = pick_params(req)
    _log(
        rid,
        f"⚙️ Params: max_new_tokens={params['max_new_tokens']} temp={params['temperature']} top_p={params['top_p']} n_batch={params['n_batch']}",
    )

    # Hashable payload so lru_cache on _cached_chat_completion can key on it.
    payload = json.dumps({"text": req.text}, ensure_ascii=False)

    # Serialize generation: one llama.cpp call at a time on CPU.
    # NOTE(review): post-generation handling kept under GEN_LOCK as in the
    # pre-diff handler — confirm the intended lock scope.
    async with GEN_LOCK:
        t_lock = time.time()

        _log(rid, "🧠 Generating analyze...")
        t_gen0 = time.time()
        res = _cached_chat_completion(
            "analyze",
            payload,
            bool(req.light),
            int(params["max_new_tokens"]),
            float(params["temperature"]),
            float(params["top_p"]),
            int(params["n_batch"]),
        )
        t_gen1 = time.time()

        elapsed_total = time.time() - t0
        elapsed_lock = time.time() - t_lock

        if not res.get("ok"):
            _log(rid, f"❌ /analyze failed: {res.get('error')}")
            return {
                **res,
                "meta": {
                    "request_id": rid,
                    "light": bool(req.light),
                    "params": {
                        "max_new_tokens": int(params["max_new_tokens"]),
                        "temperature": float(params["temperature"]),
                        "top_p": float(params["top_p"]),
                        "n_batch": int(params["n_batch"]),
                    },
                    "timings_s": {"total": round(elapsed_total, 3), "gen": round(t_gen1 - t_gen0, 3)},
                },
            }

        # sanitize output for stability (substrings, labels, confidence clamp)
        clean = sanitize_analyze_output(res["result"], req.text)

        _log(rid, f"✅ /analyze ok fallacies={len(clean.get('fallacies', []))} total={elapsed_total:.2f}s")
        return {
            "ok": True,
            "result": clean,
            "meta": {
                "request_id": rid,
                "light": bool(req.light),
                "params": {
                    "max_new_tokens": int(params["max_new_tokens"]),
                    "temperature": float(params["temperature"]),
                    "top_p": float(params["top_p"]),
                    "n_batch": int(params["n_batch"]),
                },
                "timings_s": {
                    "total": round(elapsed_total, 3),
                    "gen": round(t_gen1 - t_gen0, 3),
                    "under_lock": round(elapsed_lock, 3),
                },
            },
        }
632
+
633
+
634
@app.post("/rewrite")
async def rewrite(req: RewriteRequest) -> Dict[str, Any]:
    """Rewrite one fallacious quote inside req.text.

    The model proposes only a replacement for the quote; the server performs
    the substitution itself (_replace_nth), guaranteeing nothing outside the
    quoted span changes. Defect fixed: the pasted source lost all
    indentation; this is the reconstructed, properly structured handler.
    """
    rid = uuid.uuid4().hex[:10]
    t0 = time.time()

    _log(
        rid,
        f"📩 /rewrite received (light={req.light}) text_chars={len(req.text) if req.text else 0} quote_chars={len(req.quote) if req.quote else 0}",
    )

    if not req.text or not req.text.strip():
        return {"ok": False, "error": "empty_text"}
    if not req.quote or not req.quote.strip():
        return {"ok": False, "error": "empty_quote"}

    quote = req.quote.strip()
    occurrence = int(req.occurrence or 0)

    # validate quote existence early
    if _occurrence_index(req.text, quote, occurrence) == -1:
        return {"ok": False, "error": "quote_not_found", "detail": {"occurrence": occurrence}}

    params = pick_params(req)
    # rewrite generally needs a bit more room than light analyze if you want
    # fluent replacements (still controllable by request overrides)
    if req.light and req.max_new_tokens is None:
        params["max_new_tokens"] = max(params["max_new_tokens"], 80)

    _log(
        rid,
        f"⚙️ Params: max_new_tokens={params['max_new_tokens']} temp={params['temperature']} top_p={params['top_p']} n_batch={params['n_batch']}",
    )

    # Hashable payload so lru_cache on _cached_chat_completion can key on it.
    payload = json.dumps(
        {
            "text": req.text,
            "quote": quote,
            "fallacy_type": req.fallacy_type,
            "rationale": req.rationale,
        },
        ensure_ascii=False,
    )

    # NOTE(review): post-generation handling kept under GEN_LOCK, mirroring
    # /analyze — confirm the intended lock scope.
    async with GEN_LOCK:
        t_lock = time.time()

        _log(rid, "🧠 Generating rewrite replacement_quote...")
        t_gen0 = time.time()
        res = _cached_chat_completion(
            "rewrite",
            payload,
            bool(req.light),
            int(params["max_new_tokens"]),
            float(params["temperature"]),
            float(params["top_p"]),
            int(params["n_batch"]),
        )
        t_gen1 = time.time()

        elapsed_total = time.time() - t0
        elapsed_lock = time.time() - t_lock

        if not res.get("ok"):
            _log(rid, f"❌ /rewrite failed: {res.get('error')}")
            return {
                **res,
                "meta": {
                    "request_id": rid,
                    "light": bool(req.light),
                    "params": {
                        "max_new_tokens": int(params["max_new_tokens"]),
                        "temperature": float(params["temperature"]),
                        "top_p": float(params["top_p"]),
                        "n_batch": int(params["n_batch"]),
                    },
                    "timings_s": {"total": round(elapsed_total, 3), "gen": round(t_gen1 - t_gen0, 3)},
                },
            }

        obj = res["result"]
        if not isinstance(obj, dict):
            return {"ok": False, "error": "bad_rewrite_output"}

        replacement = obj.get("replacement_quote")
        if not isinstance(replacement, str):
            return {"ok": False, "error": "missing_replacement_quote", "raw": obj}

        replacement = replacement.strip()
        if not replacement:
            return {"ok": False, "error": "empty_replacement_quote", "raw": obj}

        why = obj.get("why_this_fix")
        if not isinstance(why, str):
            why = ""
        why = why.strip()

        # server-side enforced: ONLY the quote is changed
        rep = _replace_nth(req.text, quote, replacement, occurrence)
        if not rep.get("ok"):
            return {"ok": False, "error": rep.get("error", "replace_failed")}

        _log(rid, f"✅ /rewrite ok total={elapsed_total:.2f}s")
        return {
            "ok": True,
            "result": {
                "rewritten_text": rep["rewritten_text"],
                "old_quote": quote,
                "replacement_quote": replacement,
                "why_this_fix": why,
                "occurrence": occurrence,
                "span": {
                    "old_start_char": rep["old_start_char"],
                    "old_end_char": rep["old_end_char"],
                    "new_start_char": rep["start_char"],
                    "new_end_char": rep["end_char"],
                },
            },
            "meta": {
                "request_id": rid,
                "light": bool(req.light),
                "params": {
                    "max_new_tokens": int(params["max_new_tokens"]),
                    "temperature": float(params["temperature"]),
                    "top_p": float(params["top_p"]),
                    "n_batch": int(params["n_batch"]),
                },
                "timings_s": {
                    "total": round(elapsed_total, 3),
                    "gen": round(t_gen1 - t_gen0, 3),
                    "under_lock": round(elapsed_lock, 3),
                },
            },
        }