maxime-antoine-dev committed on
Commit
d48c265
Β·
1 Parent(s): 07cd6a9

fixed infinite gen

Browse files
Files changed (1) hide show
  1. main.py +215 -145
main.py CHANGED
@@ -5,8 +5,7 @@ import time
5
  import uuid
6
  import asyncio
7
  import re
8
- from typing import Any, Dict, Optional, List
9
- from functools import lru_cache
10
 
11
  from fastapi import FastAPI
12
  from fastapi.middleware.cors import CORSMiddleware
@@ -40,6 +39,13 @@ LIGHT_TOP_P = float(os.getenv("LIGHT_TOP_P", "0.9"))
40
  # "Light" runtime knobs
41
  LIGHT_N_BATCH = int(os.getenv("LIGHT_N_BATCH", "64"))
42
 
 
 
 
 
 
 
 
43
  # One request at a time on CPU
44
  GEN_LOCK = asyncio.Lock()
45
 
@@ -68,12 +74,11 @@ app.add_middleware(
68
  # Schemas
69
  # ============================
70
  class GenParams(BaseModel):
71
- # if True => use "light" parameters
72
  light: bool = False
73
- # optional overrides (applied after picking light/normal defaults)
74
  max_new_tokens: Optional[int] = None
75
  temperature: Optional[float] = None
76
  top_p: Optional[float] = None
 
77
 
78
 
79
  class AnalyzeRequest(GenParams):
@@ -108,10 +113,11 @@ ALLOWED_LABELS = [
108
  "miscellaneous",
109
  "intentional",
110
  ]
111
-
112
  LABELS_STR = ", ".join([f'"{x}"' for x in ALLOWED_LABELS])
113
 
114
- # Stronger /analyze prompt: forces specificity and forbids the "template" sentence
 
 
115
  ANALYZE_PROMPT = f"""You are a fallacy detection assistant.
116
 
117
  You MUST choose labels ONLY from this list (exact string):
@@ -135,29 +141,21 @@ Hard rules:
135
  - Output ONLY JSON. No markdown. No extra text.
136
  - evidence_quotes MUST be verbatim substrings copied from the input text (no paraphrase).
137
  - Keep each evidence quote short (prefer 1–2 sentences; max 240 chars).
138
- - confidence MUST be a real probability between 0.0 and 1.0 (use 2 decimals).
139
- It MUST NOT be always the same across examples. Calibrate it:
140
- * 0.90–1.00: very explicit, unambiguous match, clear cue words.
141
- * 0.70–0.89: strong match but some ambiguity or missing premise.
142
- * 0.40–0.69: plausible but weak/partial evidence.
143
- * 0.10–0.39: very uncertain.
144
- - The rationale MUST be specific to the evidence (2–4 sentences):
145
- Explain (1) what the quote claims, (2) why that matches the fallacy label,
146
- (3) what logical step is invalid or missing.
147
- DO NOT use generic filler. Do NOT reuse stock phrases.
148
- In particular, you MUST NOT output this sentence:
149
  "The input contains fallacious reasoning consistent with the predicted type(s)."
150
- - overall_explanation MUST also be specific (2–5 sentences): summarize the reasoning issues and reference the key cue(s).
151
- - If no fallacy: has_fallacy=false and fallacies=[] and overall_explanation explains briefly why.
 
 
152
 
153
  INPUT:
154
  {{text}}
155
 
156
- OUTPUT:"""
157
 
158
- # /rewrite prompt: returns ONLY a replacement substring for the quote (server does the replacement)
159
- # IMPORTANT: braces are escaped so .format() does not treat the JSON schema as placeholders.
160
- REWRITE_PROMPT = """You are rewriting a small quoted span inside a larger text.
161
 
162
  Goal:
163
  - You MUST propose a replacement for the QUOTE only.
@@ -177,19 +175,22 @@ Hard rules:
177
  - replacement_quote should be standalone text (no surrounding quotes).
178
  - why_this_fix: 1–3 sentences, specific.
179
 
 
 
 
180
  INPUT_TEXT:
181
- {text}
182
 
183
  QUOTE_TO_REWRITE:
184
- {quote}
185
 
186
  FALLACY_TYPE:
187
- {fallacy_type}
188
 
189
  WHY_FALLACIOUS:
190
- {rationale}
191
 
192
- OUTPUT:"""
193
 
194
 
195
  def build_analyze_messages(text: str) -> List[Dict[str, str]]:
@@ -200,11 +201,12 @@ def build_analyze_messages(text: str) -> List[Dict[str, str]]:
200
 
201
 
202
  def build_rewrite_messages(text: str, quote: str, fallacy_type: str, rationale: str) -> List[Dict[str, str]]:
203
- prompt = REWRITE_PROMPT.format(
204
- text=text,
205
- quote=quote,
206
- fallacy_type=fallacy_type,
207
- rationale=rationale,
 
208
  )
209
  return [
210
  {"role": "system", "content": "Return only JSON. Exactly one JSON object. No extra text."},
@@ -220,8 +222,17 @@ def _log(rid: str, msg: str):
220
 
221
 
222
  # ============================
223
- # Robust JSON extraction
224
  # ============================
 
 
 
 
 
 
 
 
 
225
  def stop_at_complete_json(text: str) -> Optional[str]:
226
  start = text.find("{")
227
  if start == -1:
@@ -255,6 +266,7 @@ def stop_at_complete_json(text: str) -> Optional[str]:
255
 
256
 
257
  def extract_first_json_obj(s: str) -> Optional[Dict[str, Any]]:
 
258
  cut = stop_at_complete_json(s) or s
259
  start = cut.find("{")
260
  end = cut.rfind("}")
@@ -267,6 +279,90 @@ def extract_first_json_obj(s: str) -> Optional[Dict[str, Any]]:
267
  return None
268
 
269
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  # ============================
271
  # Model load
272
  # ============================
@@ -353,6 +449,7 @@ def pick_params(req: GenParams) -> Dict[str, Any]:
353
  "temperature": LIGHT_TEMPERATURE,
354
  "top_p": LIGHT_TOP_P,
355
  "n_batch": LIGHT_N_BATCH,
 
356
  }
357
  else:
358
  params = {
@@ -360,6 +457,7 @@ def pick_params(req: GenParams) -> Dict[str, Any]:
360
  "temperature": TEMPERATURE_DEFAULT,
361
  "top_p": TOP_P_DEFAULT,
362
  "n_batch": N_BATCH,
 
363
  }
364
 
365
  if req.max_new_tokens is not None:
@@ -368,47 +466,34 @@ def pick_params(req: GenParams) -> Dict[str, Any]:
368
  params["temperature"] = float(req.temperature)
369
  if req.top_p is not None:
370
  params["top_p"] = float(req.top_p)
 
 
371
 
372
  # Safety caps
373
  params["max_new_tokens"] = max(1, min(int(params["max_new_tokens"]), 400))
374
  params["temperature"] = max(0.0, min(float(params["temperature"]), 1.5))
375
  params["top_p"] = max(0.05, min(float(params["top_p"]), 1.0))
376
  params["n_batch"] = max(16, min(int(params["n_batch"]), 512))
 
377
  return params
378
 
379
 
380
  # ============================
381
  # Post-processing helpers
382
  # ============================
383
- # This exact sentence is a known training artefact that can leak into rationales/overall explanations.
384
- # We strip it server-side for stable outputs.
385
  _TEMPLATE_SENTENCE = "The input contains fallacious reasoning consistent with the predicted type(s)."
386
-
387
- # Match the sentence with minor variations (extra spaces / trailing punctuation), case-insensitive.
388
  _TEMPLATE_RE = re.compile(
389
- r"(?is)\bThe input contains fallacious reasoning consistent with the predicted type\(s\)\.\s*",
390
  )
391
 
392
 
393
  def strip_template_sentence(text: Any) -> str:
394
- """
395
- Remove the known stock sentence from model outputs, then clean up whitespace/punctuation.
396
- Safe to call on non-strings.
397
- """
398
  if not isinstance(text, str):
399
  return ""
400
  out = _TEMPLATE_RE.sub("", text)
401
-
402
- # Also strip any leftover exact substring variant (belt & suspenders)
403
  out = out.replace(_TEMPLATE_SENTENCE, "")
404
-
405
- # Normalize whitespace
406
  out = re.sub(r"\s{2,}", " ", out).strip()
407
-
408
- # Remove leading separators left behind
409
  out = re.sub(r"^[\s\-–—:;,\.\u2022]+", "", out).strip()
410
-
411
- # Fix occasional doubled punctuation
412
  out = out.replace("..", ".").replace(" ,", ",").strip()
413
  return out
414
 
@@ -433,11 +518,6 @@ def _is_allowed_label(lbl: Any) -> bool:
433
 
434
 
435
  def sanitize_analyze_output(obj: Dict[str, Any], input_text: str) -> Dict[str, Any]:
436
- """
437
- Enforce shape, clamp confidence, drop invalid labels,
438
- enforce evidence_quotes being substrings.
439
- Also strips known training artefacts from rationales/overall.
440
- """
441
  has_fallacy = bool(obj.get("has_fallacy", False))
442
  fallacies_in = obj.get("fallacies", [])
443
  if not isinstance(fallacies_in, list):
@@ -452,12 +532,12 @@ def sanitize_analyze_output(obj: Dict[str, Any], input_text: str) -> Dict[str, A
452
  continue
453
 
454
  conf = _clamp01(f.get("confidence", 0.5))
455
- # keep 2 decimals for nicer UI
456
  conf = float(f"{conf:.2f}")
457
 
458
  ev = f.get("evidence_quotes", [])
459
  if not isinstance(ev, list):
460
  ev = []
 
461
  ev_clean: List[str] = []
462
  for q in ev:
463
  if not isinstance(q, str):
@@ -465,23 +545,10 @@ def sanitize_analyze_output(obj: Dict[str, Any], input_text: str) -> Dict[str, A
465
  qq = q.strip()
466
  if not qq:
467
  continue
468
- # evidence MUST be substring
469
  if qq in input_text:
470
- # keep short, but don't hard-cut if it breaks substring matching
471
- if len(qq) <= 240:
472
- ev_clean.append(qq)
473
- else:
474
- # if too long, try to keep first 240 if still substring (rare); else keep as-is
475
- short = qq[:240]
476
- if short in input_text:
477
- ev_clean.append(short)
478
- else:
479
- ev_clean.append(qq)
480
-
481
- rationale = f.get("rationale")
482
- if not isinstance(rationale, str):
483
- rationale = ""
484
- rationale = strip_template_sentence(rationale)
485
 
486
  fallacies_out.append(
487
  {
@@ -492,12 +559,8 @@ def sanitize_analyze_output(obj: Dict[str, Any], input_text: str) -> Dict[str, A
492
  }
493
  )
494
 
495
- overall = obj.get("overall_explanation")
496
- if not isinstance(overall, str):
497
- overall = ""
498
- overall = strip_template_sentence(overall)
499
 
500
- # If no fallacies survived sanitation, force no-fallacy state
501
  if len(fallacies_out) == 0:
502
  has_fallacy = False
503
 
@@ -509,30 +572,23 @@ def sanitize_analyze_output(obj: Dict[str, Any], input_text: str) -> Dict[str, A
509
 
510
 
511
  def generate_overall_explanation(clean: Dict[str, Any]) -> str:
512
- """
513
- Build a non-duplicative overall explanation that (a) summarizes what happened and
514
- (b) highlights risks of the detected fallacy(ies).
515
- This intentionally does NOT copy any per-fallacy rationale verbatim.
516
- """
517
  has_fallacy = bool(clean.get("has_fallacy"))
518
  fallacies = clean.get("fallacies") or []
519
- if not isinstance(fallacies, list):
520
- fallacies = []
521
-
522
  if not has_fallacy or not fallacies:
523
  return (
524
  "No clear fallacious reasoning was detected in the text. "
525
- "The argument appears broadly consistent as written, though it may still depend on unstated assumptions."
526
  )
527
 
528
- # Unique types, preserve order
529
  types: List[str] = []
530
  for f in fallacies:
531
- t = f.get("type") if isinstance(f, dict) else None
532
- if isinstance(t, str) and t not in types:
533
- types.append(t)
 
534
 
535
- # Example cue quote (keep very short)
536
  example = ""
537
  for f in fallacies:
538
  if isinstance(f, dict):
@@ -560,7 +616,6 @@ def generate_overall_explanation(clean: Dict[str, Any]) -> str:
560
  "intentional": "It can be persuasive while bypassing careful reasoning, increasing the chance of manipulation.",
561
  }
562
 
563
- # Pick up to 2 risk sentences for the detected types
564
  risks: List[str] = []
565
  for t in types:
566
  rs = risk_map.get(t)
@@ -570,25 +625,45 @@ def generate_overall_explanation(clean: Dict[str, Any]) -> str:
570
  break
571
 
572
  types_str = ", ".join(types) if len(types) <= 3 else ", ".join(types[:3]) + "…"
573
- sentences: List[str] = []
574
- sentences.append(
575
  f"The text contains fallacious reasoning ({types_str}) that can make the conclusion seem stronger than the evidence supports."
576
  )
577
  if example:
578
- sentences.append(f'For example: "{example}".')
579
- if risks:
580
- sentences.append("Risk: " + " ".join(risks))
581
- else:
582
- sentences.append("Risk: it may mislead readers by presenting weak support as if it were decisive.")
583
 
584
- return " ".join(sentences).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
585
 
586
 
587
  # ============================
588
- # Cached generation (task-aware)
589
  # ============================
590
- @lru_cache(maxsize=512)
591
- def _cached_chat_completion(
592
  task: str,
593
  payload: str,
594
  light: bool,
@@ -596,10 +671,16 @@ def _cached_chat_completion(
596
  temperature: float,
597
  top_p: float,
598
  n_batch: int,
 
599
  ) -> Dict[str, Any]:
600
  if llm is None:
601
  return {"ok": False, "error": "model_not_loaded", "detail": load_error}
602
 
 
 
 
 
 
603
  try:
604
  llm.n_batch = int(n_batch) # type: ignore[attr-defined]
605
  except Exception:
@@ -622,20 +703,33 @@ def _cached_chat_completion(
622
  else:
623
  return {"ok": False, "error": "unknown_task"}
624
 
 
625
  out = llm.create_chat_completion(
626
  messages=messages,
627
  max_tokens=int(max_new_tokens),
628
  temperature=float(temperature),
629
  top_p=float(top_p),
 
 
630
  stream=False,
631
  )
 
632
 
633
  raw = out["choices"][0]["message"]["content"]
 
 
634
  obj = extract_first_json_obj(raw)
635
  if obj is None:
636
- return {"ok": False, "error": "json_parse_error", "raw": raw}
 
637
 
638
- return {"ok": True, "result": obj}
 
 
 
 
 
 
639
 
640
 
641
  def _occurrence_index(text: str, sub: str, occurrence: int) -> int:
@@ -680,17 +774,14 @@ async def analyze(req: AnalyzeRequest) -> Dict[str, Any]:
680
  params = pick_params(req)
681
  _log(
682
  rid,
683
- f"βš™οΈ Params: max_new_tokens={params['max_new_tokens']} temp={params['temperature']} top_p={params['top_p']} n_batch={params['n_batch']}",
684
  )
685
 
686
  payload = json.dumps({"text": req.text}, ensure_ascii=False)
687
 
688
  async with GEN_LOCK:
689
- t_lock = time.time()
690
-
691
  _log(rid, "🧠 Generating analyze...")
692
- t_gen0 = time.time()
693
- res = _cached_chat_completion(
694
  "analyze",
695
  payload,
696
  bool(req.light),
@@ -698,11 +789,10 @@ async def analyze(req: AnalyzeRequest) -> Dict[str, Any]:
698
  float(params["temperature"]),
699
  float(params["top_p"]),
700
  int(params["n_batch"]),
 
701
  )
702
- t_gen1 = time.time()
703
 
704
  elapsed_total = time.time() - t0
705
- elapsed_lock = time.time() - t_lock
706
 
707
  if not res.get("ok"):
708
  _log(rid, f"❌ /analyze failed: {res.get('error')}")
@@ -716,14 +806,14 @@ async def analyze(req: AnalyzeRequest) -> Dict[str, Any]:
716
  "temperature": float(params["temperature"]),
717
  "top_p": float(params["top_p"]),
718
  "n_batch": int(params["n_batch"]),
 
719
  },
720
- "timings_s": {"total": round(elapsed_total, 3), "gen": round(t_gen1 - t_gen0, 3)},
721
  },
722
  }
723
 
724
- # sanitize output for stability (substrings, labels, confidence clamp) + strip training artefact
725
  clean = sanitize_analyze_output(res["result"], req.text)
726
- # overwrite overall explanation with a real summary + risk (and never copy rationales)
727
  clean["overall_explanation"] = generate_overall_explanation(clean)
728
 
729
  _log(rid, f"βœ… /analyze ok fallacies={len(clean.get('fallacies', []))} total={elapsed_total:.2f}s")
@@ -738,12 +828,9 @@ async def analyze(req: AnalyzeRequest) -> Dict[str, Any]:
738
  "temperature": float(params["temperature"]),
739
  "top_p": float(params["top_p"]),
740
  "n_batch": int(params["n_batch"]),
 
741
  },
742
- "timings_s": {
743
- "total": round(elapsed_total, 3),
744
- "gen": round(t_gen1 - t_gen0, 3),
745
- "under_lock": round(elapsed_lock, 3),
746
- },
747
  },
748
  }
749
 
@@ -766,21 +853,13 @@ async def rewrite(req: RewriteRequest) -> Dict[str, Any]:
766
  quote = req.quote.strip()
767
  occurrence = int(req.occurrence or 0)
768
 
769
- # validate quote existence early
770
  if _occurrence_index(req.text, quote, occurrence) == -1:
771
  return {"ok": False, "error": "quote_not_found", "detail": {"occurrence": occurrence}}
772
 
773
  params = pick_params(req)
774
- # rewrite generally needs a bit more room than light analyze if you want fluent replacements
775
- # (still controllable by request overrides)
776
  if req.light and req.max_new_tokens is None:
777
  params["max_new_tokens"] = max(params["max_new_tokens"], 80)
778
 
779
- _log(
780
- rid,
781
- f"βš™οΈ Params: max_new_tokens={params['max_new_tokens']} temp={params['temperature']} top_p={params['top_p']} n_batch={params['n_batch']}",
782
- )
783
-
784
  payload = json.dumps(
785
  {
786
  "text": req.text,
@@ -792,11 +871,8 @@ async def rewrite(req: RewriteRequest) -> Dict[str, Any]:
792
  )
793
 
794
  async with GEN_LOCK:
795
- t_lock = time.time()
796
-
797
  _log(rid, "🧠 Generating rewrite replacement_quote...")
798
- t_gen0 = time.time()
799
- res = _cached_chat_completion(
800
  "rewrite",
801
  payload,
802
  bool(req.light),
@@ -804,11 +880,10 @@ async def rewrite(req: RewriteRequest) -> Dict[str, Any]:
804
  float(params["temperature"]),
805
  float(params["top_p"]),
806
  int(params["n_batch"]),
 
807
  )
808
- t_gen1 = time.time()
809
 
810
  elapsed_total = time.time() - t0
811
- elapsed_lock = time.time() - t_lock
812
 
813
  if not res.get("ok"):
814
  _log(rid, f"❌ /rewrite failed: {res.get('error')}")
@@ -822,8 +897,9 @@ async def rewrite(req: RewriteRequest) -> Dict[str, Any]:
822
  "temperature": float(params["temperature"]),
823
  "top_p": float(params["top_p"]),
824
  "n_batch": int(params["n_batch"]),
 
825
  },
826
- "timings_s": {"total": round(elapsed_total, 3), "gen": round(t_gen1 - t_gen0, 3)},
827
  },
828
  }
829
 
@@ -840,11 +916,8 @@ async def rewrite(req: RewriteRequest) -> Dict[str, Any]:
840
  return {"ok": False, "error": "empty_replacement_quote", "raw": obj}
841
 
842
  why = obj.get("why_this_fix")
843
- if not isinstance(why, str):
844
- why = ""
845
- why = why.strip()
846
 
847
- # server-side enforced: ONLY the quote is changed
848
  rep = _replace_nth(req.text, quote, replacement, occurrence)
849
  if not rep.get("ok"):
850
  return {"ok": False, "error": rep.get("error", "replace_failed")}
@@ -873,11 +946,8 @@ async def rewrite(req: RewriteRequest) -> Dict[str, Any]:
873
  "temperature": float(params["temperature"]),
874
  "top_p": float(params["top_p"]),
875
  "n_batch": int(params["n_batch"]),
 
876
  },
877
- "timings_s": {
878
- "total": round(elapsed_total, 3),
879
- "gen": round(t_gen1 - t_gen0, 3),
880
- "under_lock": round(elapsed_lock, 3),
881
- },
882
  },
883
  }
 
5
  import uuid
6
  import asyncio
7
  import re
8
+ from typing import Any, Dict, Optional, List, Tuple
 
9
 
10
  from fastapi import FastAPI
11
  from fastapi.middleware.cors import CORSMiddleware
 
39
  # "Light" runtime knobs
40
  LIGHT_N_BATCH = int(os.getenv("LIGHT_N_BATCH", "64"))
41
 
42
+ # Anti-loop defaults
43
+ REPEAT_PENALTY_DEFAULT = float(os.getenv("REPEAT_PENALTY", "1.15"))
44
+
45
+ # Cache only SUCCESSFUL generations (TTL)
46
+ CACHE_TTL_S = int(os.getenv("CACHE_TTL_S", "300")) # 5 minutes
47
+ CACHE_MAX_ITEMS = int(os.getenv("CACHE_MAX_ITEMS", "512"))
48
+
49
  # One request at a time on CPU
50
  GEN_LOCK = asyncio.Lock()
51
 
 
74
  # Schemas
75
  # ============================
76
  class GenParams(BaseModel):
 
77
  light: bool = False
 
78
  max_new_tokens: Optional[int] = None
79
  temperature: Optional[float] = None
80
  top_p: Optional[float] = None
81
+ repeat_penalty: Optional[float] = None
82
 
83
 
84
  class AnalyzeRequest(GenParams):
 
113
  "miscellaneous",
114
  "intentional",
115
  ]
 
116
  LABELS_STR = ", ".join([f'"{x}"' for x in ALLOWED_LABELS])
117
 
118
+ END_SENTINEL = "<END_JSON>"
119
+ STOP_SEQS = [END_SENTINEL]
120
+
121
  ANALYZE_PROMPT = f"""You are a fallacy detection assistant.
122
 
123
  You MUST choose labels ONLY from this list (exact string):
 
141
  - Output ONLY JSON. No markdown. No extra text.
142
  - evidence_quotes MUST be verbatim substrings copied from the input text (no paraphrase).
143
  - Keep each evidence quote short (prefer 1–2 sentences; max 240 chars).
144
+ - confidence MUST be a real probability between 0.0 and 1.0 (use 2 decimals). It MUST NOT be always the same.
145
+ - The rationale MUST be specific (2–4 sentences). DO NOT use generic filler.
146
+ - You MUST NOT output this sentence anywhere:
 
 
 
 
 
 
 
 
147
  "The input contains fallacious reasoning consistent with the predicted type(s)."
148
+ - overall_explanation MUST be specific (2–5 sentences).
149
+
150
+ IMPORTANT TERMINATION:
151
+ - After the JSON object, output the token {END_SENTINEL} and stop.
152
 
153
  INPUT:
154
  {{text}}
155
 
156
+ OUTPUT (JSON then {END_SENTINEL}):"""
157
 
158
+ REWRITE_PROMPT = f"""You are rewriting a small quoted span inside a larger text.
 
 
159
 
160
  Goal:
161
  - You MUST propose a replacement for the QUOTE only.
 
175
  - replacement_quote should be standalone text (no surrounding quotes).
176
  - why_this_fix: 1–3 sentences, specific.
177
 
178
+ IMPORTANT TERMINATION:
179
+ - After the JSON object, output the token {END_SENTINEL} and stop.
180
+
181
  INPUT_TEXT:
182
+ {{text}}
183
 
184
  QUOTE_TO_REWRITE:
185
+ {{quote}}
186
 
187
  FALLACY_TYPE:
188
+ {{fallacy_type}}
189
 
190
  WHY_FALLACIOUS:
191
+ {{rationale}}
192
 
193
+ OUTPUT (JSON then {END_SENTINEL}):"""
194
 
195
 
196
  def build_analyze_messages(text: str) -> List[Dict[str, str]]:
 
201
 
202
 
203
  def build_rewrite_messages(text: str, quote: str, fallacy_type: str, rationale: str) -> List[Dict[str, str]]:
204
+ prompt = (
205
+ REWRITE_PROMPT
206
+ .replace("{text}", text)
207
+ .replace("{quote}", quote)
208
+ .replace("{fallacy_type}", fallacy_type)
209
+ .replace("{rationale}", rationale)
210
  )
211
  return [
212
  {"role": "system", "content": "Return only JSON. Exactly one JSON object. No extra text."},
 
222
 
223
 
224
  # ============================
225
+ # Robust JSON extraction + repair
226
  # ============================
227
def _strip_sentinel(s: str) -> str:
    """Return *s* truncated just before the first END_SENTINEL marker.

    Non-string input yields "". When the sentinel is absent the string
    is returned unchanged.
    """
    if not isinstance(s, str):
        return ""
    cut_at = s.find(END_SENTINEL)
    return s if cut_at == -1 else s[:cut_at]
234
+
235
+
236
  def stop_at_complete_json(text: str) -> Optional[str]:
237
  start = text.find("{")
238
  if start == -1:
 
266
 
267
 
268
  def extract_first_json_obj(s: str) -> Optional[Dict[str, Any]]:
269
+ s = _strip_sentinel(s)
270
  cut = stop_at_complete_json(s) or s
271
  start = cut.find("{")
272
  end = cut.rfind("}")
 
279
  return None
280
 
281
 
282
+ def _count_unescaped_quotes(s: str) -> int:
283
+ in_str = False
284
+ esc = False
285
+ count = 0
286
+ for ch in s:
287
+ if esc:
288
+ esc = False
289
+ continue
290
+ if ch == "\\":
291
+ esc = True
292
+ continue
293
+ if ch == '"':
294
+ count += 1
295
+ in_str = not in_str
296
+ return count
297
+
298
+
299
+ def _balance_braces_outside_strings(s: str) -> Tuple[int, int]:
300
+ opens = 0
301
+ closes = 0
302
+ in_str = False
303
+ esc = False
304
+ for ch in s:
305
+ if in_str:
306
+ if esc:
307
+ esc = False
308
+ elif ch == "\\":
309
+ esc = True
310
+ elif ch == '"':
311
+ in_str = False
312
+ continue
313
+ else:
314
+ if ch == '"':
315
+ in_str = True
316
+ continue
317
+ if ch == "{":
318
+ opens += 1
319
+ elif ch == "}":
320
+ closes += 1
321
+ return opens, closes
322
+
323
+
324
def try_repair_and_parse_json(raw: str) -> Optional[Dict[str, Any]]:
    """Best-effort parse of model output whose JSON may be truncated.

    Repair strategy:
      1. strip the END sentinel and slice from the first ``{``;
      2. hard-cap the candidate length so pathological/repetitive output
         cannot stall the server;
      3. close a dangling string when the unescaped-quote count is odd;
      4. append any missing closing braces (give up if there are extras);
      5. attempt ``json.loads``.

    Returns the parsed object, or None when the text is unrecoverable.
    """
    if not isinstance(raw, str):
        return None

    text = _strip_sentinel(raw)
    brace_at = text.find("{")
    if brace_at == -1:
        return None
    candidate = text[brace_at:].strip()

    # Keep the server responsive on huge repetitive payloads.
    limit = 50_000
    candidate = candidate[:limit]

    # An odd number of unescaped quotes means a string was left open.
    if _count_unescaped_quotes(candidate) % 2 == 1:
        candidate += '"'

    opened, closed = _balance_braces_outside_strings(candidate)
    if closed > opened:
        # More closers than openers: no safe repair exists.
        return None
    if opened > closed:
        candidate += "}" * (opened - closed)

    candidate = candidate.strip()
    try:
        return json.loads(candidate)
    except Exception:
        return None
364
+
365
+
366
  # ============================
367
  # Model load
368
  # ============================
 
449
  "temperature": LIGHT_TEMPERATURE,
450
  "top_p": LIGHT_TOP_P,
451
  "n_batch": LIGHT_N_BATCH,
452
+ "repeat_penalty": REPEAT_PENALTY_DEFAULT,
453
  }
454
  else:
455
  params = {
 
457
  "temperature": TEMPERATURE_DEFAULT,
458
  "top_p": TOP_P_DEFAULT,
459
  "n_batch": N_BATCH,
460
+ "repeat_penalty": REPEAT_PENALTY_DEFAULT,
461
  }
462
 
463
  if req.max_new_tokens is not None:
 
466
  params["temperature"] = float(req.temperature)
467
  if req.top_p is not None:
468
  params["top_p"] = float(req.top_p)
469
+ if req.repeat_penalty is not None:
470
+ params["repeat_penalty"] = float(req.repeat_penalty)
471
 
472
  # Safety caps
473
  params["max_new_tokens"] = max(1, min(int(params["max_new_tokens"]), 400))
474
  params["temperature"] = max(0.0, min(float(params["temperature"]), 1.5))
475
  params["top_p"] = max(0.05, min(float(params["top_p"]), 1.0))
476
  params["n_batch"] = max(16, min(int(params["n_batch"]), 512))
477
+ params["repeat_penalty"] = max(1.0, min(float(params["repeat_penalty"]), 1.5))
478
  return params
479
 
480
 
481
  # ============================
482
  # Post-processing helpers
483
  # ============================
 
 
484
  _TEMPLATE_SENTENCE = "The input contains fallacious reasoning consistent with the predicted type(s)."
 
 
485
  _TEMPLATE_RE = re.compile(
486
+ r"(?is)\bThe input contains fallacious reasoning consistent with the predicted type\(s\)\.\s*"
487
  )
488
 
489
 
490
  def strip_template_sentence(text: Any) -> str:
 
 
 
 
491
  if not isinstance(text, str):
492
  return ""
493
  out = _TEMPLATE_RE.sub("", text)
 
 
494
  out = out.replace(_TEMPLATE_SENTENCE, "")
 
 
495
  out = re.sub(r"\s{2,}", " ", out).strip()
 
 
496
  out = re.sub(r"^[\s\-–—:;,\.\u2022]+", "", out).strip()
 
 
497
  out = out.replace("..", ".").replace(" ,", ",").strip()
498
  return out
499
 
 
518
 
519
 
520
  def sanitize_analyze_output(obj: Dict[str, Any], input_text: str) -> Dict[str, Any]:
 
 
 
 
 
521
  has_fallacy = bool(obj.get("has_fallacy", False))
522
  fallacies_in = obj.get("fallacies", [])
523
  if not isinstance(fallacies_in, list):
 
532
  continue
533
 
534
  conf = _clamp01(f.get("confidence", 0.5))
 
535
  conf = float(f"{conf:.2f}")
536
 
537
  ev = f.get("evidence_quotes", [])
538
  if not isinstance(ev, list):
539
  ev = []
540
+
541
  ev_clean: List[str] = []
542
  for q in ev:
543
  if not isinstance(q, str):
 
545
  qq = q.strip()
546
  if not qq:
547
  continue
 
548
  if qq in input_text:
549
+ ev_clean.append(qq if len(qq) <= 240 else qq[:240])
550
+
551
+ rationale = strip_template_sentence(f.get("rationale", ""))
 
 
 
 
 
 
 
 
 
 
 
 
552
 
553
  fallacies_out.append(
554
  {
 
559
  }
560
  )
561
 
562
+ overall = strip_template_sentence(obj.get("overall_explanation", ""))
 
 
 
563
 
 
564
  if len(fallacies_out) == 0:
565
  has_fallacy = False
566
 
 
572
 
573
 
574
  def generate_overall_explanation(clean: Dict[str, Any]) -> str:
 
 
 
 
 
575
  has_fallacy = bool(clean.get("has_fallacy"))
576
  fallacies = clean.get("fallacies") or []
 
 
 
577
  if not has_fallacy or not fallacies:
578
  return (
579
  "No clear fallacious reasoning was detected in the text. "
580
+ "The argument appears broadly consistent as written, though it may still rely on unstated assumptions."
581
  )
582
 
583
+ # unique types
584
  types: List[str] = []
585
  for f in fallacies:
586
+ if isinstance(f, dict):
587
+ t = f.get("type")
588
+ if isinstance(t, str) and t not in types:
589
+ types.append(t)
590
 
591
+ # example
592
  example = ""
593
  for f in fallacies:
594
  if isinstance(f, dict):
 
616
  "intentional": "It can be persuasive while bypassing careful reasoning, increasing the chance of manipulation.",
617
  }
618
 
 
619
  risks: List[str] = []
620
  for t in types:
621
  rs = risk_map.get(t)
 
625
  break
626
 
627
  types_str = ", ".join(types) if len(types) <= 3 else ", ".join(types[:3]) + "…"
628
+ out = (
 
629
  f"The text contains fallacious reasoning ({types_str}) that can make the conclusion seem stronger than the evidence supports."
630
  )
631
  if example:
632
+ out += f' For example: "{example}".'
633
+ out += " Risk: " + (" ".join(risks) if risks else "it may mislead readers by presenting weak support as if it were decisive.")
634
+ return out.strip()
 
 
635
 
636
+
637
+ # ============================
638
+ # Success-only cache
639
+ # ============================
640
+ _SUCCESS_CACHE: Dict[Tuple[Any, ...], Tuple[float, Dict[str, Any]]] = {}
641
+
642
+
643
def _cache_get(key: Tuple[Any, ...]) -> Optional[Dict[str, Any]]:
    """Return the cached value for *key*, or None when absent or expired.

    Expired entries are evicted eagerly on lookup so the cache does not
    hand back stale generations.
    """
    entry = _SUCCESS_CACHE.get(key)
    if not entry:
        return None
    stored_at, value = entry
    if (time.time() - stored_at) > CACHE_TTL_S:
        _SUCCESS_CACHE.pop(key, None)
        return None
    return value
652
+
653
+
654
def _cache_put(key: Tuple[Any, ...], val: Dict[str, Any]) -> None:
    """Store *val* under *key* stamped with the current time.

    When the cache is at capacity, the entry with the oldest timestamp
    is evicted first (naive O(n) scan — acceptable at CACHE_MAX_ITEMS scale).
    """
    if len(_SUCCESS_CACHE) >= CACHE_MAX_ITEMS:
        oldest = min(_SUCCESS_CACHE.items(), key=lambda item: item[1][0])
        _SUCCESS_CACHE.pop(oldest[0], None)
    _SUCCESS_CACHE[key] = (time.time(), val)
661
 
662
 
663
  # ============================
664
+ # Completion (task-aware)
665
  # ============================
666
+ def _chat_completion(
 
667
  task: str,
668
  payload: str,
669
  light: bool,
 
671
  temperature: float,
672
  top_p: float,
673
  n_batch: int,
674
+ repeat_penalty: float,
675
  ) -> Dict[str, Any]:
676
  if llm is None:
677
  return {"ok": False, "error": "model_not_loaded", "detail": load_error}
678
 
679
+ key = (task, payload, light, max_new_tokens, temperature, top_p, n_batch, repeat_penalty)
680
+ cached = _cache_get(key)
681
+ if cached is not None:
682
+ return {"ok": True, "result": cached, "cached": True}
683
+
684
  try:
685
  llm.n_batch = int(n_batch) # type: ignore[attr-defined]
686
  except Exception:
 
703
  else:
704
  return {"ok": False, "error": "unknown_task"}
705
 
706
+ t0 = time.time()
707
  out = llm.create_chat_completion(
708
  messages=messages,
709
  max_tokens=int(max_new_tokens),
710
  temperature=float(temperature),
711
  top_p=float(top_p),
712
+ repeat_penalty=float(repeat_penalty),
713
+ stop=STOP_SEQS,
714
  stream=False,
715
  )
716
+ t1 = time.time()
717
 
718
  raw = out["choices"][0]["message"]["content"]
719
+ raw = _strip_sentinel(raw)
720
+
721
  obj = extract_first_json_obj(raw)
722
  if obj is None:
723
+ # attempt repair (close quote/braces) to avoid unusable responses
724
+ obj = try_repair_and_parse_json(raw)
725
 
726
+ if obj is None:
727
+ return {"ok": False, "error": "json_parse_error", "raw": raw, "gen_s": round(t1 - t0, 3)}
728
+
729
+ # success only: store in cache
730
+ _cache_put(key, obj)
731
+
732
+ return {"ok": True, "result": obj, "gen_s": round(t1 - t0, 3)}
733
 
734
 
735
  def _occurrence_index(text: str, sub: str, occurrence: int) -> int:
 
774
  params = pick_params(req)
775
  _log(
776
  rid,
777
+ f"βš™οΈ Params: max_new_tokens={params['max_new_tokens']} temp={params['temperature']} top_p={params['top_p']} n_batch={params['n_batch']} repeat_penalty={params['repeat_penalty']}",
778
  )
779
 
780
  payload = json.dumps({"text": req.text}, ensure_ascii=False)
781
 
782
  async with GEN_LOCK:
 
 
783
  _log(rid, "🧠 Generating analyze...")
784
+ res = _chat_completion(
 
785
  "analyze",
786
  payload,
787
  bool(req.light),
 
789
  float(params["temperature"]),
790
  float(params["top_p"]),
791
  int(params["n_batch"]),
792
+ float(params["repeat_penalty"]),
793
  )
 
794
 
795
  elapsed_total = time.time() - t0
 
796
 
797
  if not res.get("ok"):
798
  _log(rid, f"❌ /analyze failed: {res.get('error')}")
 
806
  "temperature": float(params["temperature"]),
807
  "top_p": float(params["top_p"]),
808
  "n_batch": int(params["n_batch"]),
809
+ "repeat_penalty": float(params["repeat_penalty"]),
810
  },
811
+ "timings_s": {"total": round(elapsed_total, 3), "gen": res.get("gen_s", None)},
812
  },
813
  }
814
 
 
815
  clean = sanitize_analyze_output(res["result"], req.text)
816
+ # ensure overall explanation is always a useful summary + risk
817
  clean["overall_explanation"] = generate_overall_explanation(clean)
818
 
819
  _log(rid, f"βœ… /analyze ok fallacies={len(clean.get('fallacies', []))} total={elapsed_total:.2f}s")
 
828
  "temperature": float(params["temperature"]),
829
  "top_p": float(params["top_p"]),
830
  "n_batch": int(params["n_batch"]),
831
+ "repeat_penalty": float(params["repeat_penalty"]),
832
  },
833
+ "timings_s": {"total": round(elapsed_total, 3), "gen": res.get("gen_s", None)},
 
 
 
 
834
  },
835
  }
836
 
 
853
  quote = req.quote.strip()
854
  occurrence = int(req.occurrence or 0)
855
 
 
856
  if _occurrence_index(req.text, quote, occurrence) == -1:
857
  return {"ok": False, "error": "quote_not_found", "detail": {"occurrence": occurrence}}
858
 
859
  params = pick_params(req)
 
 
860
  if req.light and req.max_new_tokens is None:
861
  params["max_new_tokens"] = max(params["max_new_tokens"], 80)
862
 
 
 
 
 
 
863
  payload = json.dumps(
864
  {
865
  "text": req.text,
 
871
  )
872
 
873
  async with GEN_LOCK:
 
 
874
  _log(rid, "🧠 Generating rewrite replacement_quote...")
875
+ res = _chat_completion(
 
876
  "rewrite",
877
  payload,
878
  bool(req.light),
 
880
  float(params["temperature"]),
881
  float(params["top_p"]),
882
  int(params["n_batch"]),
883
+ float(params["repeat_penalty"]),
884
  )
 
885
 
886
  elapsed_total = time.time() - t0
 
887
 
888
  if not res.get("ok"):
889
  _log(rid, f"❌ /rewrite failed: {res.get('error')}")
 
897
  "temperature": float(params["temperature"]),
898
  "top_p": float(params["top_p"]),
899
  "n_batch": int(params["n_batch"]),
900
+ "repeat_penalty": float(params["repeat_penalty"]),
901
  },
902
+ "timings_s": {"total": round(elapsed_total, 3), "gen": res.get("gen_s", None)},
903
  },
904
  }
905
 
 
916
  return {"ok": False, "error": "empty_replacement_quote", "raw": obj}
917
 
918
  why = obj.get("why_this_fix")
919
+ why = strip_template_sentence(why)
 
 
920
 
 
921
  rep = _replace_nth(req.text, quote, replacement, occurrence)
922
  if not rep.get("ok"):
923
  return {"ok": False, "error": rep.get("error", "replace_failed")}
 
946
  "temperature": float(params["temperature"]),
947
  "top_p": float(params["top_p"]),
948
  "n_batch": int(params["n_batch"]),
949
+ "repeat_penalty": float(params["repeat_penalty"]),
950
  },
951
+ "timings_s": {"total": round(elapsed_total, 3), "gen": res.get("gen_s", None)},
 
 
 
 
952
  },
953
  }