File size: 8,831 Bytes
df0ce09
 
 
d0d7bc6
 
 
 
 
 
 
 
 
 
 
df0ce09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e09b7e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df0ce09
 
 
 
 
 
 
 
 
 
 
 
 
 
d0d7bc6
df0ce09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
import json
import re
from typing import Any, Dict, Optional, List

# Canonical set of fallacy labels. Prefer the project-level definition in
# prompts.py; if it cannot be imported (missing module or import-time
# error), fall back to this hard-coded copy so the module still loads.
try:
    from prompts import ALLOWED_LABELS  # type: ignore
except Exception:
    # NOTE(review): keep this list in sync with prompts.ALLOWED_LABELS.
    ALLOWED_LABELS = [
        "none", "faulty generalization", "false causality", "circular reasoning",
        "ad populum", "ad hominem", "fallacy of logic", "appeal to emotion",
        "false dilemma", "equivocation", "fallacy of extension",
        "fallacy of relevance", "fallacy of credibility", "miscellaneous", "intentional"
    ]

# ----------------------------
# Robust JSON extraction
# ----------------------------
def stop_at_complete_json(text: str) -> Optional[str]:
    """Return the first brace-balanced ``{...}`` substring of *text*, or None.

    Scans forward from the first ``{``, counting brace depth while ignoring
    braces that occur inside double-quoted JSON strings (backslash escapes
    are honored). Returns None when no opening brace exists or the object
    never closes.
    """
    first = text.find("{")
    if first < 0:
        return None

    depth = 0
    inside_string = False
    escaped = False

    for pos in range(first, len(text)):
        c = text[pos]
        if inside_string:
            if escaped:
                escaped = False
            elif c == "\\":
                escaped = True
            elif c == '"':
                inside_string = False
        elif c == '"':
            inside_string = True
        elif c == "{":
            depth += 1
        elif c == "}":
            depth -= 1
            if depth == 0:
                return text[first : pos + 1]
    return None


def extract_first_json_obj(s: str) -> Optional[Dict[str, Any]]:
    """Parse the first JSON object embedded in *s*; return None on failure.

    First clips *s* to the first brace-balanced object (falling back to the
    whole string), then slices from the first ``{`` to the last ``}`` and
    attempts ``json.loads`` on the result.
    """
    clipped = stop_at_complete_json(s) or s
    lo = clipped.find("{")
    hi = clipped.rfind("}")
    if lo == -1 or hi == -1 or hi <= lo:
        return None
    candidate = clipped[lo : hi + 1].strip()
    try:
        return json.loads(candidate)
    except Exception:
        return None


# ----------------------------
# Extra robustness: remove stray unquoted fields (e.g., `confidence: 0.75`)
# that sometimes appear outside JSON strings due to generation glitches.
# ----------------------------
def _remove_unquoted_confidence_field(json_text: str) -> str:
    """
    Remove an unquoted field like ``confidence: 0.75`` occurring outside
    JSON strings.

    Single left-to-right pass that tracks quote/escape state so the literal
    text "confidence" inside a quoted value is never touched. A match is
    deleted only when it is followed by ``:``, a simple decimal number, and
    then a ``,`` or ``}`` delimiter (the delimiter itself is kept). Any
    comma and whitespace already emitted before the deleted field are
    dropped too, so the result stays valid JSON. Targeted fix for a common
    LLM output glitch; intentionally conservative.
    """
    out_chars: List[str] = []
    i = 0
    in_str = False  # currently inside a double-quoted JSON string
    esc = False     # previous char was a backslash inside that string

    def _pop_trailing_ws_and_optional_comma():
        # Backtrack over already-emitted output so that e.g.
        # `{"a": 1, confidence: 0.5}` collapses to `{"a": 1}` and not
        # `{"a": 1, }`.
        # remove trailing whitespace
        while out_chars and out_chars[-1].isspace():
            out_chars.pop()
        # remove trailing comma (and whitespace before it)
        if out_chars and out_chars[-1] == ",":
            out_chars.pop()
            while out_chars and out_chars[-1].isspace():
                out_chars.pop()

    while i < len(json_text):
        ch = json_text[i]

        if in_str:
            # Inside a string: copy verbatim, only tracking escape/close.
            out_chars.append(ch)
            if esc:
                esc = False
            elif ch == "\\":  # escape
                esc = True
            elif ch == '"':
                in_str = False
            i += 1
            continue

        if ch == '"':
            in_str = True
            out_chars.append(ch)
            i += 1
            continue

        # Detect an unquoted `confidence: <number>` outside strings.
        # Only remove if followed by a number and then a delimiter (`,` or `}`).
        # NOTE(review): startswith does not check the preceding char, so a
        # token ending in "...confidence" could also match — confirm this
        # cannot occur in practice.
        if json_text.startswith("confidence", i):
            j = i + len("confidence")
            while j < len(json_text) and json_text[j].isspace():
                j += 1
            if j < len(json_text) and json_text[j] == ":":
                j += 1
                while j < len(json_text) and json_text[j].isspace():
                    j += 1

                # parse a simple number: optional sign, digits, optional
                # fraction. Exponent forms (1e-3) are deliberately not
                # recognized, keeping the repair conservative.
                if j < len(json_text) and json_text[j] in "+-":
                    j += 1
                has_digit = False
                while j < len(json_text) and json_text[j].isdigit():
                    has_digit = True
                    j += 1
                if j < len(json_text) and json_text[j] == ".":
                    j += 1
                    while j < len(json_text) and json_text[j].isdigit():
                        has_digit = True
                        j += 1

                if has_digit:
                    # Require a JSON delimiter after the number before
                    # committing to the deletion.
                    k = j
                    while k < len(json_text) and json_text[k].isspace():
                        k += 1
                    if k < len(json_text) and json_text[k] in {",", "}"}:
                        _pop_trailing_ws_and_optional_comma()
                        i = k  # keep delimiter
                        continue

        out_chars.append(ch)
        i += 1

    return "".join(out_chars)


def extract_json_obj_robust(s: str) -> Optional[Dict[str, Any]]:
    """
    Best-effort extraction of the first JSON object from a model output.

    Pipeline:
      1. Clip to the first brace-balanced ``{...}`` (string-aware scan).
      2. Slice from the first ``{`` to the last ``}``.
      3. Strip markdown code fences (```json / ```).
      4. Repair a common glitch: an unquoted ``confidence: <num>`` field.
      5. ``json.loads``; return None when anything fails.
    """
    clipped = stop_at_complete_json(s) or s
    lo = clipped.find("{")
    hi = clipped.rfind("}")
    if lo == -1 or hi == -1 or hi <= lo:
        return None

    candidate = clipped[lo : hi + 1].strip()
    for fence in ("```json", "```"):
        candidate = candidate.replace(fence, "")
    candidate = _remove_unquoted_confidence_field(candidate.strip())

    try:
        return json.loads(candidate)
    except Exception:
        return None


# ----------------------------
# Post-processing: remove template sentence
# ----------------------------
_TEMPLATE_RE = re.compile(
    r"\bthe input contains fallacious reasoning consistent with the predicted type\(s\)\b\.?",
    flags=re.IGNORECASE,
)

def strip_template_sentence(text: str) -> str:
    if not isinstance(text, str):
        return ""
    out = _TEMPLATE_RE.sub("", text)
    out = out.replace("..", ".").strip()
    out = re.sub(r"\s{2,}", " ", out)
    out = re.sub(r"^\s*[\-–—:;\.\s]+", "", out).strip()
    return out


# ----------------------------
# Output sanitation / validation
# ----------------------------
def _clamp01(x: Any, default: float = 0.5) -> float:
    try:
        v = float(x)
    except Exception:
        return default
    return 0.0 if v < 0.0 else (1.0 if v > 1.0 else v)


def _is_allowed_label(lbl: Any) -> bool:
    """True iff *lbl* is a string in ALLOWED_LABELS other than "none"."""
    if not isinstance(lbl, str):
        return False
    return lbl != "none" and lbl in ALLOWED_LABELS


def sanitize_analyze_output(obj: Dict[str, Any], input_text: str) -> Dict[str, Any]:
    """
    Validate and normalize a parsed model analysis object.

    Keeps only fallacy entries whose "type" is an allowed non-"none" label,
    clamps each confidence to [0, 1] rounded to 2 decimals, retains at most
    3 evidence quotes that literally occur in *input_text*, strips the
    boilerplate template sentence from every rationale and the overall
    explanation, and forces ``has_fallacy`` to False when no valid entries
    survive.
    """
    raw_entries = obj.get("fallacies", [])
    if not isinstance(raw_entries, list):
        raw_entries = []

    cleaned_entries: List[Dict[str, Any]] = []
    for entry in raw_entries:
        if not isinstance(entry, dict):
            continue
        label = entry.get("type")
        if not _is_allowed_label(label):
            continue

        confidence = float(f"{_clamp01(entry.get('confidence', 0.5)):.2f}")

        quotes = entry.get("evidence_quotes", [])
        if not isinstance(quotes, list):
            quotes = []

        kept_quotes: List[str] = []
        for raw_quote in quotes:
            if not isinstance(raw_quote, str):
                continue
            quote = raw_quote.strip()
            # Only keep quotes that actually appear verbatim in the input.
            if not quote or quote not in input_text:
                continue
            if len(quote) <= 240:
                kept_quotes.append(quote)
            else:
                # Over-long quote: prefer a 240-char prefix when that prefix
                # still matches the input; otherwise keep the full quote.
                truncated = quote[:240]
                kept_quotes.append(truncated if truncated in input_text else quote)

        cleaned_entries.append(
            {
                "type": label,
                "confidence": confidence,
                "evidence_quotes": kept_quotes[:3],
                "rationale": strip_template_sentence(str(entry.get("rationale", "")).strip()),
            }
        )

    return {
        # has_fallacy cannot be True when every entry was filtered out.
        "has_fallacy": bool(obj.get("has_fallacy", False)) and bool(cleaned_entries),
        "fallacies": cleaned_entries,
        "overall_explanation": strip_template_sentence(
            str(obj.get("overall_explanation", "")).strip()
        ),
    }


# ----------------------------
# Replace helpers
# ----------------------------
def occurrence_index(text: str, sub: str, occurrence: int) -> int:
    """Return the start index of the zero-based *occurrence*-th match of
    *sub* in *text*, or -1 if it does not exist (or *occurrence* < 0).

    Matches are counted non-overlapping, left to right.
    """
    if occurrence < 0:
        return -1
    pos = -1
    search_from = 0
    for _ in range(occurrence + 1):
        pos = text.find(sub, search_from)
        if pos == -1:
            return -1
        # Advance by at least one char so an empty `sub` cannot stall.
        search_from = pos + max(1, len(sub))
    return pos


def replace_nth(text: str, old: str, new: str, occurrence: int) -> Dict[str, Any]:
    """
    Replace the zero-based *occurrence*-th match of *old* in *text* with *new*.

    Returns ``{"ok": False, "error": "quote_not_found"}`` when that
    occurrence does not exist. On success returns ``{"ok": True, ...}``
    carrying the rewritten text plus character offsets for both the
    replacement span and the original span.
    """
    at = occurrence_index(text, old, occurrence)
    if at < 0:
        return {"ok": False, "error": "quote_not_found"}
    rewritten = text[:at] + new + text[at + len(old):]
    return {
        "ok": True,
        "rewritten_text": rewritten,
        "start_char": at,
        "end_char": at + len(new),
        "old_start_char": at,
        "old_end_char": at + len(old),
    }