maxime-antoine-dev committed on
Commit
e19317d
·
1 Parent(s): 5a8ecdf

Updated compatibility with quantized model

Browse files
Files changed (3) hide show
  1. Dockerfile +15 -4
  2. main.py +99 -90
  3. requirements.txt +4 -9
Dockerfile CHANGED
@@ -1,10 +1,21 @@
1
  FROM python:3.11-slim
2
 
 
 
 
 
 
3
  WORKDIR /app
4
- COPY requirements.txt .
5
- RUN pip install --no-cache-dir -r requirements.txt
6
 
7
- COPY . .
8
- EXPOSE 7860
 
 
 
 
 
9
 
 
 
 
10
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
  FROM python:3.11-slim
2
 
3
+ ENV PYTHONUNBUFFERED=1 \
4
+ PIP_NO_CACHE_DIR=1 \
5
+ HF_HOME=/data/.huggingface \
6
+ HUGGINGFACE_HUB_CACHE=/data/.cache/huggingface/hub
7
+
8
  WORKDIR /app
 
 
9
 
10
+ # Optional but safer (some environments may need build tools)
11
+ RUN apt-get update && apt-get install -y --no-install-recommends \
12
+ git \
13
+ && rm -rf /var/lib/apt/lists/*
14
+
15
+ COPY requirements.txt /app/requirements.txt
16
+ RUN pip install -r /app/requirements.txt
17
 
18
+ COPY main.py /app/main.py
19
+
20
+ EXPOSE 7860
21
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py CHANGED
@@ -1,42 +1,57 @@
1
  # main.py
2
  import os
3
  import json
 
 
4
  from typing import Any, Dict, Optional
5
  from functools import lru_cache
6
- import asyncio
7
 
8
- import torch
9
  from fastapi import FastAPI
10
  from pydantic import BaseModel
11
- from transformers import AutoModelForCausalLM, AutoTokenizer
12
- from peft import PeftModel
13
 
14
  # ----------------------------
15
- # Config (CPU-friendly defaults)
16
  # ----------------------------
17
- BASE_MODEL = os.getenv("BASE_MODEL", "mistralai/Mistral-7B-Instruct-v0.2")
18
- ADAPTER_REPO = os.getenv("ADAPTER_REPO", "maxime-antoine-dev/fades")
19
 
20
- # Keep smaller on CPU to stay usable
21
- MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "896"))
22
- MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "140"))
 
23
 
24
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 
 
25
 
26
- # One request at a time on CPU Spaces (prevents OOM / huge latency spikes)
27
  GEN_LOCK = asyncio.Lock()
28
 
29
  # ----------------------------
30
- # Prompt (aligned with training, but compact for CPU)
31
  # ----------------------------
32
  ALLOWED_LABELS = [
33
- "none","faulty generalization","false causality","circular reasoning","ad populum","ad hominem",
34
- "fallacy of logic","appeal to emotion","false dilemma","equivocation","fallacy of extension",
35
- "fallacy of relevance","fallacy of credibility","miscellaneous","intentional",
 
 
 
 
 
 
 
 
 
 
 
 
36
  ]
37
 
38
  def labels_block_compact() -> str:
39
- # Compact list (removes long hints to reduce prompt tokens on CPU)
40
  return "\n".join([f'- "{k}"' for k in ALLOWED_LABELS])
41
 
42
  INSTRUCTION = """You are a logical fallacy detection assistant.
@@ -60,7 +75,7 @@ Return ONLY ONE valid JSON object with this schema:
60
 
61
  Hard rules:
62
  - Output ONLY the JSON object. No markdown. No extra text.
63
- - Produce exactly ONE JSON object, then STOP. Do NOT repeat the input. Do NOT create new examples.
64
  - evidence_quotes MUST be exact substrings from the input text.
65
  - If has_fallacy=false:
66
  - fallacies MUST be []
@@ -69,19 +84,19 @@ Hard rules:
69
  - If has_fallacy=true:
70
  - fallacies MUST contain at least 1 item
71
  - EACH fallacies[i].type MUST be one of the allowed labels (NOT a synonym)
72
- - overall_explanation may summarize the detected fallacy(ies).
73
  """
74
 
75
- def build_prompt(tokenizer: AutoTokenizer, text: str) -> str:
 
 
76
  instruction = INSTRUCTION.format(labels_list=labels_block_compact())
77
- messages = [
78
- {"role": "system", "content": "You are a careful JSON-only assistant."},
79
  {"role": "user", "content": f"{instruction}\n\nTEXT:\n{text}\n\nJSON:"},
80
  ]
81
- return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
82
 
83
  # ----------------------------
84
- # JSON extraction + stop guard
85
  # ----------------------------
86
  def extract_first_json_obj(s: str) -> Optional[Dict[str, Any]]:
87
  start = s.find("{")
@@ -90,7 +105,7 @@ def extract_first_json_obj(s: str) -> Optional[Dict[str, Any]]:
90
  end = s.rfind("}")
91
  if end == -1 or end <= start:
92
  return None
93
- cand = s[start:end + 1].strip()
94
  try:
95
  return json.loads(cand)
96
  except Exception:
@@ -124,106 +139,100 @@ def stop_at_complete_json(text: str) -> Optional[str]:
124
  elif ch == "}":
125
  depth -= 1
126
  if depth == 0:
127
- return text[start:i + 1]
128
  return None
129
 
130
  # ----------------------------
131
- # Load model (tries CPU 8-bit if available; falls back to FP32)
132
  # ----------------------------
133
- def load_model() -> tuple[AutoTokenizer, Any]:
134
- tokenizer = AutoTokenizer.from_pretrained(ADAPTER_REPO, use_fast=True)
135
- if tokenizer.pad_token is None:
136
- tokenizer.pad_token = tokenizer.eos_token
137
-
138
- # Try CPU quantization to reduce RAM; fallback if not supported
139
- try:
140
- import bitsandbytes # noqa: F401
141
 
142
- base = AutoModelForCausalLM.from_pretrained(
143
- BASE_MODEL,
144
- device_map="auto", # CPU on Spaces free
145
- load_in_8bit=True,
146
- )
147
- except Exception:
148
- base = AutoModelForCausalLM.from_pretrained(
149
- BASE_MODEL,
150
- device_map=None,
151
- torch_dtype=torch.float32,
152
- )
153
 
154
- model = PeftModel.from_pretrained(base, ADAPTER_REPO)
155
- model.eval()
156
- return tokenizer, model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
- # Keep them global, but load on startup for clearer errors/logs
159
- tokenizer: Optional[AutoTokenizer] = None
160
- model: Optional[Any] = None
 
161
 
162
  # ----------------------------
163
  # FastAPI
164
  # ----------------------------
165
- app = FastAPI(title="FADES Fallacy Detector")
166
 
167
  class AnalyzeRequest(BaseModel):
168
  text: str
169
- max_new_tokens: int = MAX_NEW_TOKENS
 
 
170
 
171
  @app.get("/health")
172
  def health():
173
  return {
174
  "ok": True,
175
- "device": DEVICE,
176
- "base_model": BASE_MODEL,
177
- "adapter_repo": ADAPTER_REPO,
178
- "model_loaded": model is not None and tokenizer is not None,
 
 
 
 
179
  }
180
 
181
  @app.on_event("startup")
182
  def _startup():
183
- global tokenizer, model
184
- tokenizer, model = load_model()
 
185
 
186
  @lru_cache(maxsize=256)
187
- def _cached_generate(text: str, max_new_tokens: int) -> Dict[str, Any]:
188
- assert tokenizer is not None and model is not None
189
 
190
- prompt = build_prompt(tokenizer, text)
191
 
192
- inputs = tokenizer(
193
- prompt,
194
- return_tensors="pt",
195
- truncation=True,
196
- max_length=MAX_INPUT_TOKENS,
 
197
  )
198
 
199
- # move to device if CUDA exists
200
- if DEVICE == "cuda":
201
- inputs = {k: v.to(model.device) for k, v in inputs.items()}
202
-
203
- with torch.no_grad():
204
- out = model.generate(
205
- **inputs,
206
- max_new_tokens=int(max_new_tokens),
207
- do_sample=False,
208
- temperature=0.0,
209
- use_cache=True,
210
- eos_token_id=tokenizer.eos_token_id,
211
- pad_token_id=tokenizer.pad_token_id,
212
- )
213
-
214
- decoded = tokenizer.decode(out[0], skip_special_tokens=True)
215
 
216
- cut = stop_at_complete_json(decoded)
217
- decoded_cut = cut if cut is not None else decoded
218
 
219
- obj = extract_first_json_obj(decoded_cut)
220
  if obj is None:
221
- return {"ok": False, "raw": decoded_cut}
222
 
223
  return {"ok": True, "result": obj}
224
 
225
  @app.post("/analyze")
226
  async def analyze(req: AnalyzeRequest) -> Dict[str, Any]:
227
- # One-at-a-time generation on CPU (prevents stalls/OOM)
228
  async with GEN_LOCK:
229
- return _cached_generate(req.text, int(req.max_new_tokens))
 
1
  # main.py
2
  import os
3
  import json
4
+ import time
5
+ import asyncio
6
  from typing import Any, Dict, Optional
7
  from functools import lru_cache
 
8
 
 
9
  from fastapi import FastAPI
10
  from pydantic import BaseModel
11
+ from huggingface_hub import hf_hub_download
12
+ from llama_cpp import Llama
13
 
14
  # ----------------------------
15
+ # Config
16
  # ----------------------------
17
+ GGUF_REPO_ID = os.getenv("GGUF_REPO_ID", "maxime-antoine-dev/fades-mistral-v02-gguf")
18
+ GGUF_FILENAME = os.getenv("GGUF_FILENAME", "mistral_v02_fades.Q4_K_M.gguf")
19
 
20
+ # llama.cpp params (CPU Space)
21
+ N_CTX = int(os.getenv("N_CTX", "2048"))
22
+ N_THREADS = int(os.getenv("N_THREADS", str(max(1, (os.cpu_count() or 2) - 1))))
23
+ N_BATCH = int(os.getenv("N_BATCH", "256"))
24
 
25
+ # generation defaults
26
+ MAX_NEW_TOKENS_DEFAULT = int(os.getenv("MAX_NEW_TOKENS", "180"))
27
+ TEMPERATURE_DEFAULT = float(os.getenv("TEMPERATURE", "0.0"))
28
+ TOP_P_DEFAULT = float(os.getenv("TOP_P", "0.95"))
29
 
30
+ # One request at a time on CPU (prevents stalls / extreme latency)
31
  GEN_LOCK = asyncio.Lock()
32
 
33
  # ----------------------------
34
+ # Prompt (aligned with your training target)
35
  # ----------------------------
36
  ALLOWED_LABELS = [
37
+ "none",
38
+ "faulty generalization",
39
+ "false causality",
40
+ "circular reasoning",
41
+ "ad populum",
42
+ "ad hominem",
43
+ "fallacy of logic",
44
+ "appeal to emotion",
45
+ "false dilemma",
46
+ "equivocation",
47
+ "fallacy of extension",
48
+ "fallacy of relevance",
49
+ "fallacy of credibility",
50
+ "miscellaneous",
51
+ "intentional",
52
  ]
53
 
54
  def labels_block_compact() -> str:
 
55
  return "\n".join([f'- "{k}"' for k in ALLOWED_LABELS])
56
 
57
  INSTRUCTION = """You are a logical fallacy detection assistant.
 
75
 
76
  Hard rules:
77
  - Output ONLY the JSON object. No markdown. No extra text.
78
+ - Produce exactly ONE JSON object, then STOP.
79
  - evidence_quotes MUST be exact substrings from the input text.
80
  - If has_fallacy=false:
81
  - fallacies MUST be []
 
84
  - If has_fallacy=true:
85
  - fallacies MUST contain at least 1 item
86
  - EACH fallacies[i].type MUST be one of the allowed labels (NOT a synonym)
 
87
  """
88
 
89
+ SYSTEM_PROMPT = "You are a careful JSON-only assistant. Output only JSON."
90
+
91
+ def build_messages(text: str) -> list[dict]:
92
  instruction = INSTRUCTION.format(labels_list=labels_block_compact())
93
+ return [
94
+ {"role": "system", "content": SYSTEM_PROMPT},
95
  {"role": "user", "content": f"{instruction}\n\nTEXT:\n{text}\n\nJSON:"},
96
  ]
 
97
 
98
  # ----------------------------
99
+ # Robust JSON extraction
100
  # ----------------------------
101
  def extract_first_json_obj(s: str) -> Optional[Dict[str, Any]]:
102
  start = s.find("{")
 
105
  end = s.rfind("}")
106
  if end == -1 or end <= start:
107
  return None
108
+ cand = s[start : end + 1].strip()
109
  try:
110
  return json.loads(cand)
111
  except Exception:
 
139
  elif ch == "}":
140
  depth -= 1
141
  if depth == 0:
142
+ return text[start : i + 1]
143
  return None
144
 
145
  # ----------------------------
146
+ # Load GGUF model (global)
147
  # ----------------------------
148
+ llm: Optional[Llama] = None
149
+ model_path: Optional[str] = None
 
 
 
 
 
 
150
 
151
+ def load_llama() -> tuple[str, Llama]:
152
+ global model_path
 
 
 
 
 
 
 
 
 
153
 
154
+ t0 = time.time()
155
+ mp = hf_hub_download(
156
+ repo_id=GGUF_REPO_ID,
157
+ filename=GGUF_FILENAME,
158
+ token=os.getenv("HF_TOKEN"), # optional (only if repo is private)
159
+ )
160
+ t1 = time.time()
161
+
162
+ # CPU Space -> n_gpu_layers = 0
163
+ llama = Llama(
164
+ model_path=mp,
165
+ n_ctx=N_CTX,
166
+ n_threads=N_THREADS,
167
+ n_batch=N_BATCH,
168
+ n_gpu_layers=0,
169
+ verbose=True,
170
+ )
171
+ t2 = time.time()
172
 
173
+ print(f"✅ GGUF downloaded: {mp} ({t1 - t0:.1f}s)")
174
+ print(f"✅ Model loaded: ({t2 - t1:.1f}s) n_ctx={N_CTX} threads={N_THREADS} batch={N_BATCH}")
175
+ model_path = mp
176
+ return mp, llama
177
 
178
  # ----------------------------
179
  # FastAPI
180
  # ----------------------------
181
+ app = FastAPI(title="FADES Fallacy Detector (GGUF / llama.cpp)")
182
 
183
  class AnalyzeRequest(BaseModel):
184
  text: str
185
+ max_new_tokens: int = MAX_NEW_TOKENS_DEFAULT
186
+ temperature: float = TEMPERATURE_DEFAULT
187
+ top_p: float = TOP_P_DEFAULT
188
 
189
  @app.get("/health")
190
  def health():
191
  return {
192
  "ok": True,
193
+ "engine": "llama.cpp (llama-cpp-python)",
194
+ "gguf_repo": GGUF_REPO_ID,
195
+ "gguf_filename": GGUF_FILENAME,
196
+ "model_loaded": llm is not None,
197
+ "model_path": model_path,
198
+ "n_ctx": N_CTX,
199
+ "n_threads": N_THREADS,
200
+ "n_batch": N_BATCH,
201
  }
202
 
203
  @app.on_event("startup")
204
  def _startup():
205
+ global llm
206
+ _, llm_loaded = load_llama()
207
+ llm = llm_loaded
208
 
209
  @lru_cache(maxsize=256)
210
+ def _cached_generate(text: str, max_new_tokens: int, temperature: float, top_p: float) -> Dict[str, Any]:
211
+ assert llm is not None
212
 
213
+ messages = build_messages(text)
214
 
215
+ out = llm.create_chat_completion(
216
+ messages=messages,
217
+ max_tokens=int(max_new_tokens),
218
+ temperature=float(temperature),
219
+ top_p=float(top_p),
220
+ stream=False,
221
  )
222
 
223
+ raw = out["choices"][0]["message"]["content"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
 
225
+ cut = stop_at_complete_json(raw)
226
+ raw_cut = cut if cut is not None else raw
227
 
228
+ obj = extract_first_json_obj(raw_cut)
229
  if obj is None:
230
+ return {"ok": False, "raw": raw_cut}
231
 
232
  return {"ok": True, "result": obj}
233
 
234
  @app.post("/analyze")
235
  async def analyze(req: AnalyzeRequest) -> Dict[str, Any]:
236
+ # CPU: serialize requests to keep stable latency
237
  async with GEN_LOCK:
238
+ return _cached_generate(req.text, int(req.max_new_tokens), float(req.temperature), float(req.top_p))
requirements.txt CHANGED
@@ -1,9 +1,4 @@
1
- fastapi
2
- uvicorn
3
- torch
4
- transformers
5
- peft
6
- accelerate
7
- sentencepiece
8
- safetensors
9
- huggingface_hub
 
1
+ fastapi>=0.110
2
+ uvicorn[standard]>=0.27
3
+ huggingface_hub>=0.23
4
+ llama-cpp-python>=0.2.90