maxime-antoine-dev committed on
Commit
5a8ecdf
·
1 Parent(s): 6e3fd51
Files changed (3) hide show
  1. Dockerfile +10 -0
  2. main.py +229 -0
  3. requirements.txt +9 -0
Dockerfile ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
# Lightweight Python base image; slim variant keeps the CPU-only image small.
FROM python:3.11-slim

WORKDIR /app

# Install dependencies first so Docker layer caching survives source-only changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# 7860 matches the --port below; the app's own comments indicate a Hugging Face
# Space deployment, which serves on this port.
EXPOSE 7860

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # main.py
2
+ import os
3
+ import json
4
+ from typing import Any, Dict, Optional
5
+ from functools import lru_cache
6
+ import asyncio
7
+
8
+ import torch
9
+ from fastapi import FastAPI
10
+ from pydantic import BaseModel
11
+ from transformers import AutoModelForCausalLM, AutoTokenizer
12
+ from peft import PeftModel
13
+
14
# ----------------------------
# Config (CPU-friendly defaults)
# ----------------------------
BASE_MODEL = os.getenv("BASE_MODEL", "mistralai/Mistral-7B-Instruct-v0.2")
ADAPTER_REPO = os.getenv("ADAPTER_REPO", "maxime-antoine-dev/fades")

# Keep smaller on CPU to stay usable
MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "896"))
MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "140"))

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# One request at a time on CPU Spaces (prevents OOM / huge latency spikes)
GEN_LOCK = asyncio.Lock()

# ----------------------------
# Prompt (aligned with training, but compact for CPU)
# ----------------------------
# Closed set of labels the model must answer with, verbatim.
ALLOWED_LABELS = [
    "none",
    "faulty generalization",
    "false causality",
    "circular reasoning",
    "ad populum",
    "ad hominem",
    "fallacy of logic",
    "appeal to emotion",
    "false dilemma",
    "equivocation",
    "fallacy of extension",
    "fallacy of relevance",
    "fallacy of credibility",
    "miscellaneous",
    "intentional",
]

def labels_block_compact() -> str:
    """Render the allowed labels as a compact bullet list for prompt interpolation."""
    # Compact form keeps prompt token count down on CPU.
    bullets = (f'- "{label}"' for label in ALLOWED_LABELS)
    return "\n".join(bullets)
41
+
42
# Prompt template interpolated by build_prompt(); {labels_list} is the only
# str.format placeholder — doubled braces ({{ }}) render as literal braces in
# the JSON schema example.
INSTRUCTION = """You are a logical fallacy detection assistant.

You MUST choose labels ONLY from this list (use the exact string):
{labels_list}

Return ONLY ONE valid JSON object with this schema:
{{
"has_fallacy": boolean,
"fallacies": [
{{
"type": string,
"confidence": number, // 0.0..1.0
"evidence_quotes": [string], // exact substring(s) copied from the input text
"rationale": string // specific to this fallacy + quote
}}
],
"overall_explanation": string // short summary across the whole input
}}

Hard rules:
- Output ONLY the JSON object. No markdown. No extra text.
- Produce exactly ONE JSON object, then STOP. Do NOT repeat the input. Do NOT create new examples.
- evidence_quotes MUST be exact substrings from the input text.
- If has_fallacy=false:
- fallacies MUST be []
- overall_explanation MUST explicitly say there is no fallacy
- overall_explanation MUST NOT mention any fallacy label/category names.
- If has_fallacy=true:
- fallacies MUST contain at least 1 item
- EACH fallacies[i].type MUST be one of the allowed labels (NOT a synonym)
- overall_explanation may summarize the detected fallacy(ies).
"""
74
+
75
def build_prompt(tokenizer: AutoTokenizer, text: str) -> str:
    """Assemble the chat-templated prompt asking for a JSON verdict on *text*."""
    filled = INSTRUCTION.format(labels_list=labels_block_compact())
    user_turn = f"{filled}\n\nTEXT:\n{text}\n\nJSON:"
    conversation = [
        {"role": "system", "content": "You are a careful JSON-only assistant."},
        {"role": "user", "content": user_turn},
    ]
    return tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
82
+
83
+ # ----------------------------
84
+ # JSON extraction + stop guard
85
+ # ----------------------------
86
def extract_first_json_obj(s: str) -> Optional[Dict[str, Any]]:
    """Return the first JSON object found anywhere in *s*, or None.

    Tries a ``json.JSONDecoder.raw_decode`` at every ``{`` position, so both
    leading chatter and trailing text after the object are tolerated.  The
    previous first-``{``/last-``}`` slice failed whenever any text (containing
    a ``}`` or not) followed the object, e.g. ``'{"a": 1} trailing }'``.

    Args:
        s: Arbitrary model output text.

    Returns:
        The first decodable JSON *object* (dict), or None if none exists.
    """
    decoder = json.JSONDecoder()
    idx = s.find("{")
    while idx != -1:
        try:
            obj, _ = decoder.raw_decode(s, idx)
        except ValueError:
            # Not valid JSON starting here; try the next opening brace.
            idx = s.find("{", idx + 1)
            continue
        if isinstance(obj, dict):
            return obj
        idx = s.find("{", idx + 1)
    return None
98
+
99
def stop_at_complete_json(text: str) -> Optional[str]:
    """Return the first brace-balanced JSON-like span in *text*, or None.

    Walks the text from the first ``{``, counting brace depth while tracking
    string literals (including backslash escapes) so that braces inside quoted
    strings never affect the count.  Returns the substring up to and including
    the brace that closes the first object; None if no ``{`` exists or the
    object is never closed.
    """
    first = text.find("{")
    if first < 0:
        return None

    depth = 0
    inside_string = False
    escaped = False

    for pos, ch in enumerate(text[first:], start=first):
        if inside_string:
            if escaped:
                escaped = False          # this char was escaped; consume it
            elif ch == "\\":
                escaped = True           # next char is escaped
            elif ch == '"':
                inside_string = False    # closing quote
        elif ch == '"':
            inside_string = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[first:pos + 1]
    return None
129
+
130
# ----------------------------
# Load model (tries CPU 8-bit if available; falls back to FP32)
# ----------------------------
def load_model() -> tuple[AutoTokenizer, Any]:
    """Load the tokenizer and base model, then attach the LoRA adapter.

    Returns:
        (tokenizer, model) with the PEFT adapter applied and eval mode set.
    """
    # Tokenizer comes from the adapter repo so any added/special tokens match training.
    tokenizer = AutoTokenizer.from_pretrained(ADAPTER_REPO, use_fast=True)
    if tokenizer.pad_token is None:
        # Models like Mistral ship without a pad token; reuse EOS for padding.
        tokenizer.pad_token = tokenizer.eos_token

    # Try CPU quantization to reduce RAM; fallback if not supported
    try:
        import bitsandbytes  # noqa: F401

        # NOTE(review): bitsandbytes 8-bit has historically required CUDA, so on a
        # CPU-only host this load likely raises and we take the FP32 path — confirm
        # against the installed bitsandbytes/transformers versions.
        base = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            device_map="auto",  # CPU on Spaces free
            load_in_8bit=True,
        )
    except Exception:
        # Plain FP32 load; presumably very large in RAM for a 7B base — TODO confirm fit.
        base = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            device_map=None,
            torch_dtype=torch.float32,
        )

    # Wrap the base model with the fine-tuned LoRA adapter weights.
    model = PeftModel.from_pretrained(base, ADAPTER_REPO)
    model.eval()
    return tokenizer, model
157
+
158
# Keep them global, but load on startup for clearer errors/logs
# Both remain None until the FastAPI startup hook calls load_model().
tokenizer: Optional[AutoTokenizer] = None
model: Optional[Any] = None

# ----------------------------
# FastAPI
# ----------------------------
app = FastAPI(title="FADES Fallacy Detector")
166
+
167
class AnalyzeRequest(BaseModel):
    """Request body for POST /analyze."""

    # Raw input text to scan for logical fallacies.
    text: str
    # Per-request generation budget; defaults to the env-configured MAX_NEW_TOKENS.
    max_new_tokens: int = MAX_NEW_TOKENS
170
+
171
+ @app.get("/health")
172
+ def health():
173
+ return {
174
+ "ok": True,
175
+ "device": DEVICE,
176
+ "base_model": BASE_MODEL,
177
+ "adapter_repo": ADAPTER_REPO,
178
+ "model_loaded": model is not None and tokenizer is not None,
179
+ }
180
+
181
+ @app.on_event("startup")
182
+ def _startup():
183
+ global tokenizer, model
184
+ tokenizer, model = load_model()
185
+
186
@lru_cache(maxsize=256)
def _cached_generate(text: str, max_new_tokens: int) -> Dict[str, Any]:
    """Run one deterministic generation and parse the model's JSON answer.

    Cached on (text, max_new_tokens) so repeated identical requests skip
    generation entirely.

    Args:
        text: Input text to analyze.
        max_new_tokens: Generation budget for this call.

    Returns:
        {"ok": True, "result": <dict>} when a valid JSON object was produced,
        otherwise {"ok": False, "raw": <decoded text>} for debugging.
    """
    assert tokenizer is not None and model is not None

    prompt = build_prompt(tokenizer, text)

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_INPUT_TOKENS,
    )
    # Remember the prompt length BEFORE inputs may be rewrapped below.
    prompt_len = inputs["input_ids"].shape[-1]

    # move to device if CUDA exists
    if DEVICE == "cuda":
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),
            # Greedy decoding; `temperature` was dropped because it is ignored
            # when do_sample=False and newer transformers warns/errors on it.
            do_sample=False,
            use_cache=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    # BUG FIX: decode ONLY the newly generated tokens.  The previous code
    # decoded the full sequence (prompt included); the prompt's JSON schema
    # example contains the first "{" in the text, so stop_at_complete_json /
    # extract_first_json_obj latched onto the (invalid) schema block instead
    # of the model's answer and every request came back {"ok": False}.
    decoded = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)

    # Trim any runaway continuation after the first complete JSON object.
    cut = stop_at_complete_json(decoded)
    decoded_cut = cut if cut is not None else decoded

    obj = extract_first_json_obj(decoded_cut)
    if obj is None:
        return {"ok": False, "raw": decoded_cut}

    return {"ok": True, "result": obj}
224
+
225
+ @app.post("/analyze")
226
+ async def analyze(req: AnalyzeRequest) -> Dict[str, Any]:
227
+ # One-at-a-time generation on CPU (prevents stalls/OOM)
228
+ async with GEN_LOCK:
229
+ return _cached_generate(req.text, int(req.max_new_tokens))
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ torch
4
+ transformers
5
+ peft
6
+ accelerate
7
+ sentencepiece
8
+ safetensors
9
+ huggingface_hub