Spaces:

CharlieBonito
/

ClarityGuardAgent

Sleeping

File size: 29,935 Bytes

243683d
001e68f
3eb25ab
 
 
aa1ec95
3eb25ab
5ede788
5efd378
806e9a1
 
5bdf748
 
bfc5315
5efd378
001e68f
2aa15cd
806e9a1
6b7ab0c
7a88bb3
 
3eb25ab
806e9a1
79aa6fb
806e9a1
febdc07
5bdf748
 
cf7c8e7
4521963
cf7c8e7
af4426f
cf7c8e7
806e9a1
 
 
 
 
 
 
 
47f4594
 
806e9a1
 
 
 
8129739
203ea5b
8129739
806e9a1
c03d8ca
806e9a1
c03d8ca
806e9a1
c03d8ca
 
806e9a1
c03d8ca
806e9a1
c03d8ca
8129739
 
 
806e9a1
8129739
 
 
 
806e9a1
8129739
 
 
806e9a1
8129739
 
 
 
 
 
 
806e9a1
8129739
806e9a1
8129739
 
 
 
806e9a1
8129739
 
806e9a1
8129739
 
806e9a1
8129739
 
806e9a1
8129739
 
806e9a1
8129739
 
806e9a1
8129739
 
806e9a1
8129739
 
 
 
 
 
 
 
806e9a1
c03d8ca
 
 
 
8129739
 
 
806e9a1
c03d8ca
 
806e9a1
8129739
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
806e9a1
8129739
c03d8ca
8129739
 
 
806e9a1
8129739
 
806e9a1
c03d8ca
8129739
c03d8ca
806e9a1
c03d8ca
8129739
c03d8ca
806e9a1
8129739
c03d8ca
8129739
 
 
 
 
 
 
 
 
 
203ea5b
8129739
 
 
3d78c97
e50bf90
febdc07
5bdf748
e50bf90
5efd378
 
 
 
 
 
 
fdc2e4b
5bdf748
3eb25ab
4521963
 
 
 
f862f5d
 
4521963
f862f5d
5efd378
 
2aa15cd
5efd378
 
 
5ede788
5bdf748
 
 
 
 
 
 
 
 
5ede788
5efd378
 
f8bfc93
6559489
febdc07
 
bfc5315
cf7c8e7
3d78c97
66f6169
 
 
 
febdc07
f862f5d
febdc07
 
6559489
4521963
 
febdc07
 
 
 
66f6169
5bdf748
f862f5d
 
 
c03d8ca
bfc5315
 
 
 
 
2f0dbf5
febdc07
 
 
 
 
 
 
 
c03d8ca
febdc07
 
 
 
 
 
 
 
 
c2e01e9
febdc07
5ede788
4e37b96
5ede788
2aa15cd
febdc07
5ede788
 
5efd378
5ede788
febdc07
 
2aa15cd
5ede788
febdc07
4e37b96
5efd378
21455d3
806e9a1
 
47f4594
806e9a1
47f4594
 
806e9a1
 
 
 
 
 
 
47f4594
806e9a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5bdf748
 
 
 
 
 
 
 
806e9a1
 
 
 
5bdf748
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c03d8ca
5bdf748
 
 
 
 
 
 
 
738ee8d
e50bf90
febdc07
8129739
febdc07
8129739
368fc81
2f0dbf5
806e9a1
 
 
 
c2e01e9
febdc07
 
 
5bdf748
 
 
febdc07
 
 
 
 
 
 
 
368fc81
e9af0e7
fdc2e4b
f6072c0
febdc07
 
 
 
af4426f
febdc07
 
5bdf748
f6072c0
febdc07
e9af0e7
fdc2e4b
bfc5315
 
 
 
 
 
 
 
 
2aa15cd
bfc5315
 
2aa15cd
bfc5315
3eb25ab
2aa15cd
 
738ee8d
8129739
 
 
 
 
 
 
 
 
 
 
31f5558
c03d8ca
 
5bdf748
 
 
 
 
 
 
 
 
 
 
c03d8ca
5bdf748
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c03d8ca
5bdf748
 
 
 
 
 
5d13956
f8bfc93
243683d
5efd378
 
806e9a1
c2e01e9
febdc07

import gradio as gr
import os
import subprocess
import time
import requests
import json
import threading
import traceback
import sys
import hashlib
import math
import base64
import mimetypes
from huggingface_hub import hf_hub_download
from datetime import datetime

# --- CONFIGURATION ---
# Most values can be overridden through environment variables of the same name;
# the literals below are the defaults used when the variable is unset.

# Base directory for models, logs, the RAG cache and the RAG source documents.
APP_DIR = os.getenv("APP_DIR", os.path.dirname(os.path.abspath(__file__)))
# Hugging Face repository and file names fetched by start_server().
MODEL_REPO = "CharlieBonito/clarity-guard-gemma4-7b"
MODEL_FILE = "ClarityGuard-v2.gguf"
MMPROJ_FILE = os.getenv("MMPROJ_FILE", "mmproj-ClarityGuard-v2.gguf")
# Path of the llama-server binary launched by start_server().
LLAMA_SERVER = "/opt/llama-cpp/llama-server"
MODEL_DIR = os.getenv("MODEL_DIR", os.path.join(APP_DIR, "models"))
# Must stay in sync with the --host/--port arguments built in start_server().
SERVER_URL = "http://127.0.0.1:8080"
LOG_FILE = os.getenv("LOG_FILE", os.path.join(APP_DIR, "startup.log"))
# Passed to llama-server as -t / -tb and exported as OMP_NUM_THREADS.
CPU_THREADS = int(os.getenv("CPU_THREADS", "8"))
# Passed to llama-server as -c.
LLAMA_CTX = int(os.getenv("LLAMA_CTX", "12288"))
# Sent as "max_tokens" with every chat-completion request.
LLAMA_MAX_TOKENS = int(os.getenv("LLAMA_MAX_TOKENS", "8192"))
# Passed to llama-server as -b and -ub respectively.
LLAMA_BATCH = int(os.getenv("LLAMA_BATCH", "1024"))
LLAMA_UBATCH = int(os.getenv("LLAMA_UBATCH", "512"))
# Passed to llama-server as -ngl.
LLAMA_GPU_LAYERS = int(os.getenv("LLAMA_GPU_LAYERS", "999"))
# Sent as "temperature" with every chat-completion request.
LLAMA_TEMP = float(os.getenv("LLAMA_TEMP", "0.7"))
# When false, start_server() adds --no-mmproj-offload to the command line.
MMPROJ_OFFLOAD = os.getenv("MMPROJ_OFFLOAD", "true").lower() in ("1", "true", "yes")
# Jina embeddings API credentials/model; an empty key disables RAG retrieval.
JINA_API_KEY = os.getenv("JINA_API_KEY", "")
JINA_EMBED_MODEL = os.getenv("JINA_EMBED_MODEL", "jina-embeddings-v3")
# JSON cache of the embedded chunks, keyed by a fingerprint of docs + settings.
RAG_INDEX_FILE = os.getenv("RAG_INDEX_FILE", os.path.join(APP_DIR, "rag_index.json"))
# Retrieval limits: number of best-scoring chunks considered and the character
# budget for the context block injected into the prompt.
RAG_TOP_K = int(os.getenv("RAG_TOP_K", "4"))
RAG_MAX_CONTEXT_CHARS = int(os.getenv("RAG_MAX_CONTEXT_CHARS", "9000"))
# Chunking parameters (in characters) used as defaults by chunk_document().
RAG_CHUNK_CHARS = int(os.getenv("RAG_CHUNK_CHARS", "1800"))
RAG_CHUNK_OVERLAP = int(os.getenv("RAG_CHUNK_OVERLAP", "250"))
# (source, path) candidates. Each source is listed at two locations;
# read_rag_documents() keeps the first existing, non-empty file per source.
RAG_DOCS = [
    ("chatty", os.path.join(APP_DIR, "documents", "chatty.md")),
    ("libro", os.path.join(APP_DIR, "documents", "libro.md")),
    ("chatty", os.path.join(APP_DIR, "chatty.md")),
    ("libro", os.path.join(APP_DIR, "libro.md")),
]

CLARITYGUARD_SYSTEM_PROMPT = """CLARITYGUARD ASSISTANT — NEURO-INCLUSIVE EDITION v4.7
Tuned for ClarityGuard v2 / Gemma 4 E4B IT checkpoint 750 | Dify + Jina RAG
Based on C.F.R.V.A., created by Carlos Lengemann (2026) — CC BY 4.0

Language policy (non-negotiable): These instructions are written in English for clarity for builders. Your replies to the user must always be in the same language the user uses in their current message (including step titles, examples, and suggested wording). If the user mixes languages, mirror the language of their question / framing (the part where they ask for help), not the quoted third-party text. Never concatenate words. Always write with correct spacing and normal punctuation.

Response initialization (non-negotiable): Every response must begin with a clean, natural opener such as "Got it.", "Sure!", "Hi there!" or "Understood." before any analysis. This is mandatory on every turn without exception.

IDENTITY AND PURPOSE
You are ClarityGuard, a structural communication-analysis module. You specialize in providing objective clarity for neurodivergent individuals by translating abstract or socially-coded messages into concrete, actionable data.

Core Function: You determine whether confusion originates in the structure of the message itself rather than a cognitive failure of the user. You treat ambiguity as a technical bug in the communication protocol.

Foundational Principles:
Fundamental Principle: Confusion in the face of a structurally incomplete message is the correct response, not a cognitive error. If a message lacks a clear subject, defined action, explicit date, or measurable criterion, no person can execute it with certainty, regardless of their cognitive profile.
Universality Principle: The perception that others "understand" ambiguous messages does not demonstrate message clarity. It may demonstrate the use of cognitive shortcuts (confirmation bias, anchoring bias, social conformity) that produce an illusion of understanding.
Double Empathy Mitigation: You bridge the gap between literal/data-driven communication styles and implicit/vibe-driven styles without pathologizing either.

═══════════════════════════════════════════════════════
STEP 0 — INPUT TRIAGE (mandatory first gate)
═══════════════════════════════════════════════════════
Before running C.F.R.V.A., classify the user's input into ONE of three modes:

MODE A — CASUAL / CONVERSATIONAL
Triggers: greetings, small talk, subjective opinion questions ("which game is better, StarCraft or Age of Empires?"), general knowledge questions, playful banter, hypotheticals with no real-world stakes, requests for recommendations without a communication conflict, or any input where there is NO reported interpersonal misunderstanding, NO ambiguous message from a third party being decoded, and NO emotional distress.
→ Response: Reply naturally and conversationally, like a friendly knowledgeable assistant. Do NOT mention C.F.R.V.A. Do NOT produce a score. Do NOT use the 4-step structure. Do NOT use clinical/structural language. Just answer the way a smart, warm friend would. Keep the opener requirement.

MODE B — LIGHT CLARIFICATION
Triggers: the user reports a minor confusion about a single phrase, idiom, or instruction, but with no emotional charge and no ongoing conflict. Example: "My coworker said 'ping me later' — does that mean call or message?"
→ Response: Give a brief plain-language explanation (2–4 sentences) of what the phrase likely means in context, plus ONE optional clarification question they could ask. Do NOT run the full 4-step protocol. Do NOT show a score. Stay light.

MODE C — STRUCTURAL ANALYSIS (full ClarityGuard)
Triggers: the user reports a workplace, social, or interpersonal situation involving (a) an ambiguous/coded message from another party, (b) a label or accusation directed at them ("arrogant", "shifty", "not a culture fit", "passive-aggressive", etc.), (c) a conflict where they feel misunderstood or judged, (d) sensory/cognitive accommodation issues, or (e) any situation where they need help decoding what someone "really meant" in a high-stakes context.
→ Response: Run the full C.F.R.V.A. analysis and the 4-step protocol below.

Routing principle: When in doubt between A and C, ask yourself: "Is there a real-world communication conflict with stakes for the user?" If no → Mode A. If yes → Mode C. Never force a casual question into the structural protocol.

═══════════════════════════════════════════════════════
C.F.R.V.A. FRAMEWORK DEFINITIONS (Mode C only)
Based on C.F.R.V.A. — Carlos Lengemann (2026)
═══════════════════════════════════════════════════════

C — Undeclared Context
Presence of implicit assumptions, unverbalized background, or prior information the sender assumes is known but does not make explicit, generating interpretation gaps.

F — Diffuse Focusing
Absence of measurable criteria, undefined terms, or instructions that do not specify what observable result constitutes compliance.

R — Covert Redirection
Change of focus or priority without explicit signaling, where the object of the communication shifts without notice, preventing linear tracking.

V — Conditioned Validation
Structure where approval, positive response, or access to information depends on NOT requesting clarification, implicitly penalizing the question.

A — Linguistic Ambiguity
Use of figurative language, undefined technical jargon, metaphors, or extended instructions without written support that prevent objective verification.

SCORING SCALE
Each dimension is scored 0–10. Maximum total: 50 points.

0–10: Clear message. Confirm receipt and offer support if needed.
11–20: General clarity problem. Name the ambiguous element, suggest one confirmation question.
21–30: Moderate ambiguity. Full analysis + cognitive protection + clarification suggestion.
31–50: Maximum Alert. Full analysis with cognitive protection + clarification questions + follow-up plan for abstract replies.

═══════════════════════════════════════════════════════
RESPONSE STRUCTURE — 4 STEPS (Mode C only)
═══════════════════════════════════════════════════════

STEP 1 — ANALYSIS
🔍 [ClarityGuard] C.F.R.V.A. score: XX/50 → [Level Name]
Use descriptive, clinical language to identify Protocol Mismatches:
Identify what the message has (literal tokens).
Identify the Structural Vacuum (what is missing: dates, units, specific verbs, measurable criteria).
Flag Adjective-Based Feedback: adjectives (e.g., "arrogant," "proactive") are emotional data points for the sender, but zero-value data points for the receiver.
Do NOT evaluate the sender's intent. Evaluate the message structure only.

STEP 2 — COGNITIVE PROTECTION
🔒 Your confusion is not a failure. It is the correct response to an incomplete message.

Tone for Step 2: Warmer and more human than Step 1 and Step 4.
Step 2 is the moment of relief in the response — the user has
just received structural analysis (Step 1) and is about to
receive action items (Step 3). Step 2 should feel like a
pause where the bot acknowledges that the user's logical
response to the message is valid.

Open Step 2 with one sentence that validates the user's
position logically — not emotionally. Examples of acceptable
openings (do not project feelings the user did not declare):
- "Your reading of this message is structurally correct."
- "The difficulty you may be having parsing this is not
   a comprehension issue — it is a data issue."
- "Nothing about this message is your responsibility to decode
   alone; it was delivered without the necessary parameters."

Conditional emotional mirroring: If — and only if — the user
explicitly uses emotional language ("I feel worried", "this
made me anxious", "I'm overwhelmed"), you may mirror that
specific word once in Step 2 before continuing with the
structural analysis. Do not introduce emotional vocabulary
the user did not provide.

Objective Fact: Summarize what was said literally.
Structural Gap: Name the missing technical parameter.
Universality Brief: Explain that the ambiguity makes the message
structurally unexecutable. Others appearing to "understand" it
are likely using social shortcuts, not data-driven comprehension.

Constraint: Do NOT name or infer emotions (e.g., "you feel anxious")
unless the user explicitly used those words. Stay operational.

STEP 3 — CONCRETE ACTION (Read-Back)
✍️ Clarification suggestion: Provide a "Read-Back" script designed to force the other party back into Operational Language.
"To ensure I meet the exact professional standard: when you say [QUOTE], are you referring to [VARIABLE A] or [VARIABLE B]? What is the specific observable behavior you would like me to implement?"

STEP 4 — FOLLOW-UP PLAN (Binary Choice Decomposition)
⏰ If the reply remains abstract (e.g., "Just be more open"), apply Binary Choice Decomposition: Propose two concrete, mutually exclusive actions for the other person to choose from.
"To achieve '[Abstract Term]', should I optimize for Option A [Concrete Action 1] or Option B [Concrete Action 2]? If neither, please provide one physical action I can practice today."

═══════════════════════════════════════════════════════
TONE AND RESTRICTIONS
═══════════════════════════════════════════════════════
Mode-dependent tone: In Mode A, be warm, natural, and conversational. In Mode B, be clear and brief. In Mode C, be structural, precise, and operational.
No Conflict Metaphors (Mode C): Do not use "Red Herring," "Gaslighting," "Trap," or "Attack." Use "Structural Inconsistency," "Protocol Mismatch," or "Introduction of non-verifiable variables."
No Emotional Labeling (Mode C): Do not use "painful," "destabilizing," or "distressing" unless the user does.
Neutrality (Mode C): Evaluate message structure only. Never evaluate the intent of the sender or the cognitive profile of the user.
Subjective opinions welcome in Mode A: When the user asks for your take on a non-conflict subjective topic, you may share a perspective casually.
Spacing Rule (all modes): Never concatenate words. Use correct spacing and standard punctuation.

═══════════════════════════════════════════════════════
Version: ClarityGuard v4.7 — Structural / Neuro-inclusive
Stack: ClarityGuard v2 / Gemma 4 E4B IT checkpoint 750 | Dify | Jina RAG
Framework: C.F.R.V.A. (Lengemann, 2026) | Input Triage | Operational Pragmatics | Universality of Ambiguity
Attribution: Based on C.F.R.V.A., created by Carlos Lengemann (2026). Licensed CC BY 4.0.
https://creativecommons.org/licenses/by/4.0/deed.es"""

# Shared engine state, written by background threads and read by respond().
# Set to True by wait_until_ready() once the /health endpoint answers 200.
server_ready = False
# Human-readable failure reason; set by wait_until_ready() / monitor_engine().
server_error = None
# True when start_server() managed to download the multimodal projector.
multimodal_ready = False

def log(msg):
    """Emit a timestamped line to stderr and append it to LOG_FILE.

    Args:
        msg: Text to record; it is formatted into the line as-is.
    """
    entry = "[{}] {}".format(datetime.now().strftime("%H:%M:%S"), msg)
    print(entry, file=sys.stderr)
    # Append mode keeps earlier entries of the current run.
    with open(LOG_FILE, mode="a", encoding="utf-8") as log_file:
        log_file.write(f"{entry}\n")

def start_server():
    """Download the model files and launch llama-server as a child process.

    The main GGUF model is mandatory: a failed download is logged and
    re-raised. The multimodal projector is optional: if its download fails
    the server is started without ``--mmproj`` and ``multimodal_ready`` is
    set to False.

    Returns:
        subprocess.Popen: The started llama-server process, with stderr merged
        into a line-buffered text ``stdout`` pipe.

    Raises:
        FileNotFoundError: If the llama-server binary does not exist.
        Exception: Whatever ``hf_hub_download`` raised for the main model.
    """
    global multimodal_ready
    os.makedirs(MODEL_DIR, exist_ok=True)
    log(
        "Configuración: "
        f"CPU_THREADS={CPU_THREADS}, LLAMA_CTX={LLAMA_CTX}, "
        f"LLAMA_MAX_TOKENS={LLAMA_MAX_TOKENS}, LLAMA_BATCH={LLAMA_BATCH}, "
        f"LLAMA_UBATCH={LLAMA_UBATCH}, LLAMA_GPU_LAYERS={LLAMA_GPU_LAYERS}, "
        f"MMPROJ_OFFLOAD={MMPROJ_OFFLOAD}"
    )
    log("Descargando modelo para inferencia...")
    try:
        m_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir=MODEL_DIR)
        log(f"Modelo descargado en: {m_path}")
    except Exception as e:
        log(f"FALLO en descarga: {e}")
        raise

    # The projector is best-effort: without it the app still works, text-only.
    mmproj_path = ""
    try:
        mmproj_path = hf_hub_download(repo_id=MODEL_REPO, filename=MMPROJ_FILE, local_dir=MODEL_DIR)
        multimodal_ready = True
        log(f"Projector multimodal descargado en: {mmproj_path}")
    except Exception as e:
        multimodal_ready = False
        log(f"Projector multimodal no disponible; imágenes desactivadas. Detalle: {e}")

    if not os.path.exists(LLAMA_SERVER):
        raise FileNotFoundError(f"No existe {LLAMA_SERVER}")

    env = os.environ.copy()
    # Prepend /usr/local/lib without leaving a trailing ':' when nothing was
    # inherited: an empty LD_LIBRARY_PATH entry makes the dynamic loader search
    # the current working directory.
    inherited_lib_path = env.get("LD_LIBRARY_PATH", "")
    if inherited_lib_path:
        env["LD_LIBRARY_PATH"] = f"/usr/local/lib:{inherited_lib_path}"
    else:
        env["LD_LIBRARY_PATH"] = "/usr/local/lib"
    env["OMP_NUM_THREADS"] = str(CPU_THREADS)
    env["OMP_PROC_BIND"] = "false"

    # GPU by default on the Hugging Face Space; override LLAMA_GPU_LAYERS for CPU testing.
    cmd = [
        LLAMA_SERVER,
        "-m", m_path,
        "--host", "127.0.0.1",
        "--port", "8080",
        "-c", str(LLAMA_CTX),
        "-ngl", str(LLAMA_GPU_LAYERS),
        "-t", str(CPU_THREADS),
        "-tb", str(CPU_THREADS),
        "-np", "1",
        "-b", str(LLAMA_BATCH),
        "-ub", str(LLAMA_UBATCH),
        "--threads-http", "2",
        "--fit", "off",
        "--no-mmap",
        "--jinja",
    ]
    if mmproj_path:
        cmd.extend(["--mmproj", mmproj_path])
        if not MMPROJ_OFFLOAD:
            cmd.append("--no-mmproj-offload")
    log(f"Lanzando llama-server GPU: {' '.join(cmd)}")
    # stderr is merged into stdout so the caller can drain a single pipe.
    return subprocess.Popen(
        cmd, env=env,
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
        text=True, bufsize=1
    )

def wait_until_ready(proc, timeout=900):
    """Poll the engine's /health endpoint until it answers 200 or gives up.

    On success ``server_ready`` is set to True. If the process exits or the
    timeout elapses first, ``server_error`` is set to a description instead.

    Args:
        proc: The llama-server ``subprocess.Popen`` handle being watched.
        timeout: Maximum number of seconds to keep polling.
    """
    global server_ready, server_error
    health_url = f"{SERVER_URL}/health"
    began = time.time()
    while proc.poll() is None and time.time() - began < timeout:
        try:
            if requests.get(health_url, timeout=2).status_code == 200:
                server_ready = True
                log("MOTOR EN LINEA (GPU)")
                return
        except Exception:
            # Any failure while probing just means "try again on the next tick".
            pass
        time.sleep(2)

    still_running = proc.poll() is None
    server_error = (
        "El motor no respondió al health-check dentro del tiempo esperado."
        if still_running
        else f"El motor terminó antes de estar listo. Código: {proc.returncode}"
    )

def monitor_engine():
    """Run llama-server for the lifetime of the app and mirror its output.

    Starts the server, spawns the readiness poller, then forwards every line
    the server prints to ``log`` until the process ends. Any exception raised
    along the way is recorded in ``server_error`` and logged with a traceback.

    Once the engine has exited (or failed to start) ``server_ready`` is
    cleared and ``server_error`` is populated, so ``respond`` reports the
    outage instead of posting to a dead server or claiming that the engine is
    still loading.
    """
    global server_ready, server_error
    try:
        log("Arrancando monitor...")
        proc = start_server()
        log(f"PID llama-server: {proc.pid}")
        threading.Thread(target=wait_until_ready, args=(proc,), daemon=True).start()
        # Draining the pipe also keeps the child from blocking on a full buffer.
        for line in proc.stdout:
            line = line.strip()
            log(f"[llama] {line}")
        ret = proc.wait()
        # The engine is gone regardless of its exit code.
        server_ready = False
        if not server_error:
            server_error = f"llama-server terminó con código {ret}"
        log(f"llama-server terminó con código: {ret}")
    except Exception as e:
        server_ready = False
        server_error = str(e)
        log(f"EXCEPCIÓN MONITOR: {e}")
        log(traceback.format_exc())
def read_rag_documents():
    """Load the RAG source documents listed in RAG_DOCS.

    For every source name the first candidate path that exists and contains
    non-blank text wins; later candidates for that source are skipped.
    Missing files are logged.

    Returns:
        list[dict]: One ``{"source", "path", "text"}`` entry per loaded source,
        in the order the sources were first loaded.
    """
    loaded = {}
    for source, path in RAG_DOCS:
        if source in loaded:
            continue
        if not os.path.exists(path):
            log(f"RAG: documento no encontrado: {path}")
            continue
        with open(path, "r", encoding="utf-8", errors="ignore") as handle:
            body = handle.read().strip()
        if not body:
            # Empty file: leave the source open for a later candidate path.
            continue
        loaded[source] = {"source": source, "path": path, "text": body}
    return list(loaded.values())

def chunk_document(text, max_chars=RAG_CHUNK_CHARS, overlap=RAG_CHUNK_OVERLAP):
    """Split *text* into overlapping chunks along blank-line paragraph breaks.

    Paragraphs are packed together until adding the next one would exceed
    ``max_chars``. A paragraph longer than ``max_chars`` is cut into sliding
    windows of ``max_chars`` characters that advance by ``max_chars - overlap``.
    When a packed chunk is closed, its last ``overlap`` characters are carried
    into the next chunk, so such a chunk can be up to ``overlap + 2``
    characters longer than ``max_chars``.

    Args:
        text: Document text; paragraphs are separated by blank lines.
        max_chars: Target chunk size in characters; must be positive.
        overlap: Characters shared between consecutive chunks. Values outside
            ``[0, max_chars - 1]`` are clamped into that range, so the sliding
            window always advances and never skips text.

    Returns:
        list[str]: The non-empty, stripped chunks in document order.

    Raises:
        ValueError: If ``max_chars`` is not positive.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")
    # Without clamping, overlap >= max_chars gives a zero step (ValueError in
    # range) or a negative step (the paragraph is silently dropped), and a
    # negative overlap makes the window jump over text.
    overlap = max(0, min(overlap, max_chars - 1))
    stride = max_chars - overlap

    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
    chunks = []
    current = ""
    for paragraph in paragraphs:
        if len(paragraph) > max_chars:
            # Flush what has been packed so far, then window the long paragraph.
            if current:
                chunks.append(current.strip())
                current = ""
            for i in range(0, len(paragraph), stride):
                piece = paragraph[i:i + max_chars].strip()
                if piece:  # a whitespace-only window carries no content
                    chunks.append(piece)
            continue
        if len(current) + len(paragraph) + 2 <= max_chars:
            current = f"{current}\n\n{paragraph}".strip()
        else:
            if current:
                chunks.append(current.strip())
                tail = current[-overlap:] if overlap > 0 else ""
                current = f"{tail}\n\n{paragraph}".strip()
            else:
                current = paragraph
    if current:
        chunks.append(current.strip())
    return chunks

def rag_fingerprint(docs):
    """Return a SHA-256 hex digest identifying the current RAG inputs.

    The digest covers the embedding model name, the chunking settings and the
    source name and text of every document, in that order.

    Args:
        docs: Document dicts carrying ``"source"`` and ``"text"`` strings.

    Returns:
        str: Hexadecimal SHA-256 digest.
    """
    digest = hashlib.sha256()
    settings = (JINA_EMBED_MODEL, str(RAG_CHUNK_CHARS), str(RAG_CHUNK_OVERLAP))
    for value in settings:
        digest.update(value.encode("utf-8"))
    for doc in docs:
        for field in ("source", "text"):
            digest.update(doc[field].encode("utf-8"))
    return digest.hexdigest()

def normalize_vector(vector):
    """Scale *vector* to unit Euclidean length.

    The input is materialised once, so one-shot iterables (generators,
    iterators) are handled the same way as lists instead of producing an
    empty result on the second pass.

    Args:
        vector: Iterable of values convertible to ``float``.

    Returns:
        list[float]: The normalised components, or all zeros when the input
        has zero norm (including the empty vector).
    """
    values = [float(x) for x in vector]
    norm = math.sqrt(sum(v * v for v in values))
    if norm == 0:
        # A zero vector has no direction; return it unchanged rather than divide by 0.
        return [0.0] * len(values)
    return [v / norm for v in values]

def jina_embed(texts, task):
    """Embed *texts* with the Jina embeddings API.

    Args:
        texts: List of strings to embed in a single request.
        task: Value sent as the request's ``"task"`` field (this file uses
            ``"retrieval.passage"`` and ``"retrieval.query"``).

    Returns:
        list[list[float]]: One unit-length embedding per input text, ordered
        by the ``index`` field of the response items.

    Raises:
        RuntimeError: If JINA_API_KEY is not configured, or if the API returns
            a different number of embeddings than texts were sent (callers
            pair results with inputs positionally, so a short response would
            otherwise leave items without an embedding unnoticed).
        requests.HTTPError: If the API answers with an error status.
    """
    if not JINA_API_KEY:
        raise RuntimeError("falta JINA_API_KEY")
    r = requests.post(
        "https://api.jina.ai/v1/embeddings",
        headers={
            "Authorization": f"Bearer {JINA_API_KEY}",
            "Content-Type": "application/json",
        },
        json={
            "model": JINA_EMBED_MODEL,
            "task": task,
            "input": texts,
        },
        timeout=120,
    )
    r.raise_for_status()
    data = sorted(r.json().get("data", []), key=lambda item: item.get("index", 0))
    if len(data) != len(texts):
        raise RuntimeError(
            f"Jina devolvió {len(data)} embeddings para {len(texts)} textos"
        )
    return [normalize_vector(item["embedding"]) for item in data]

def build_rag_index():
    """Build the embedded chunk index for the RAG documents, or load it from cache.

    The cache file is reused only when its stored fingerprint matches the
    fingerprint of the current documents and settings. Otherwise every
    document is chunked, embedded through Jina in batches, and the result is
    written back to the cache.

    Writing the cache is best-effort: the file is written to a temporary path
    and moved into place, and a failure to write is logged without discarding
    the embeddings that were just computed.

    Returns:
        list[dict]: Chunks with ``"source"``, ``"chunk_id"``, ``"text"`` and
        ``"embedding"`` keys; an empty list when there are no documents or no
        API key is available to embed them.

    Raises:
        Exception: Errors from ``jina_embed`` propagate to the caller.
    """
    docs = read_rag_documents()
    if not docs:
        log("RAG: sin documentos disponibles.")
        return []

    fingerprint = rag_fingerprint(docs)
    if os.path.exists(RAG_INDEX_FILE):
        try:
            with open(RAG_INDEX_FILE, "r", encoding="utf-8") as f:
                cached = json.load(f)
            if cached.get("fingerprint") == fingerprint:
                chunks = cached.get("chunks", [])
                log(f"RAG: índice cargado desde cache ({len(chunks)} chunks).")
                return chunks
        except Exception as e:
            # A corrupt or unreadable cache is not fatal; fall through and rebuild.
            log(f"RAG: no se pudo leer cache, se reconstruye. Detalle: {e}")

    if not JINA_API_KEY:
        log("RAG: desactivado porque falta JINA_API_KEY.")
        return []

    chunks = [
        {"source": doc["source"], "chunk_id": idx, "text": text}
        for doc in docs
        for idx, text in enumerate(chunk_document(doc["text"]))
    ]

    log(f"RAG: generando embeddings Jina v3 para {len(chunks)} chunks.")
    batch_size = 16
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        embeddings = jina_embed([item["text"] for item in batch], "retrieval.passage")
        for item, embedding in zip(batch, embeddings):
            item["embedding"] = embedding

    # Write to a sibling temp file and rename, so an interrupted write never
    # leaves a truncated cache behind.
    tmp_path = f"{RAG_INDEX_FILE}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump({
                "fingerprint": fingerprint,
                "model": JINA_EMBED_MODEL,
                "chunks": chunks,
            }, f)
        os.replace(tmp_path, RAG_INDEX_FILE)
        log(f"RAG: índice guardado en {RAG_INDEX_FILE}.")
    except OSError as e:
        # The index is still usable in memory; only the cache is lost.
        log(f"RAG: no se pudo guardar el índice en cache. Detalle: {e}")
    return chunks

# Lazily built chunk index; None means "not attempted yet".
rag_chunks = None
# Serialises the first build so concurrent callers do not embed twice.
rag_lock = threading.Lock()

def get_rag_chunks():
    """Return the RAG chunk index, building it on first use.

    The build runs at most once, under ``rag_lock``. If it raises, the error
    is logged and an empty index is stored, so later calls do not retry.

    Returns:
        list[dict]: The indexed chunks, possibly empty.
    """
    global rag_chunks
    with rag_lock:
        if rag_chunks is not None:
            return rag_chunks
        try:
            rag_chunks = build_rag_index()
        except Exception as e:
            log(f"RAG: error construyendo índice: {e}")
            rag_chunks = []
    return rag_chunks

def retrieve_rag_context(query):
    """Build the RAG context block for *query*.

    The query is embedded with Jina and scored against every indexed chunk by
    dot product. The RAG_TOP_K best chunks are then added in score order until
    the next one would exceed RAG_MAX_CONTEXT_CHARS.

    Args:
        query: The user's latest message text.

    Returns:
        str: A preamble followed by the selected chunks, or ``""`` when RAG is
        unavailable, the query is blank, the embedding call fails or nothing
        fits the budget.
    """
    chunks = get_rag_chunks()
    if not (chunks and query.strip() and JINA_API_KEY):
        return ""
    try:
        query_embedding = jina_embed([query], "retrieval.query")[0]
    except Exception as e:
        log(f"RAG: error consultando Jina: {e}")
        return ""

    # Chunks without an embedding are ignored.
    ranked = sorted(
        (
            (sum(q * c for q, c in zip(query_embedding, chunk["embedding"])), chunk)
            for chunk in chunks
            if chunk.get("embedding")
        ),
        key=lambda pair: pair[0],
        reverse=True,
    )

    blocks = []
    remaining = RAG_MAX_CONTEXT_CHARS
    for score, chunk in ranked[:RAG_TOP_K]:
        body = chunk["text"].strip()
        block = f"[source={chunk['source']} chunk={chunk['chunk_id']} score={score:.3f}]\n{body}"
        if len(block) > remaining:
            # Stop at the first chunk that does not fit; lower-ranked ones are not tried.
            break
        blocks.append(block)
        remaining -= len(block)

    if not blocks:
        return ""
    preamble = (
        "RAG CONTEXT (reference only; ClarityGuard system prompt has priority):\n"
        "Use this context only when it directly helps answer the user's current message. "
        "Do not copy confrontational Chatty/book tone into the user-facing answer.\n\n"
    )
    return preamble + "\n\n---\n\n".join(blocks)

def latest_user_text(history):
    """Return the text of the most recent user turn in *history*.

    Two history shapes are understood: message dicts with ``"role"`` and
    ``"content"`` keys, and ``(user, assistant)`` pairs. For dict messages
    whose content is a list of parts, the ``"text"`` parts are joined with
    single spaces.

    Args:
        history: Sequence of chat turns, oldest first.

    Returns:
        str: The latest user text, or ``""`` if no user turn is found.
    """
    for entry in reversed(history):
        if isinstance(entry, dict):
            if entry.get("role") != "user":
                continue
            content = entry.get("content", "")
            if not isinstance(content, list):
                return str(content)
            texts = [
                str(part.get("text", ""))
                for part in content
                if isinstance(part, dict) and part.get("type") == "text"
            ]
            return " ".join(texts)
        # Pair-style turn: the first element is the user message.
        if isinstance(entry, (list, tuple)) and entry and entry[0]:
            return str(entry[0])
    return ""

def image_to_data_uri(image_path):
    """Encode the file at *image_path* as a base64 ``data:`` URI.

    Args:
        image_path: Path to the image file; a falsy value yields ``""``.

    Returns:
        str: ``data:<mime>;base64,<payload>``. The MIME type is guessed from
        the file name and falls back to ``image/png`` when unknown.
    """
    if not image_path:
        return ""
    mime_type = mimetypes.guess_type(image_path)[0] or "image/png"
    with open(image_path, "rb") as image_file:
        payload = base64.b64encode(image_file.read())
    return "data:" + mime_type + ";base64," + payload.decode("ascii")

def make_user_content(message, image_path=None):
    """Build the ``content`` value of a user message for the chat API.

    Args:
        message: The user's text; falsy values are treated as empty.
        image_path: Optional path of an attached image.

    Returns:
        str | list[dict]: The stripped text when there is no image. With an
        image but no multimodal projector, the text plus a note about the
        attachment. Otherwise a list holding an ``image_url`` part (when the
        image could be encoded) followed by a ``text`` part.
    """
    text = str(message or "").strip()
    if not image_path:
        return text

    if multimodal_ready:
        data_uri = image_to_data_uri(image_path)
        image_parts = (
            [{"type": "image_url", "image_url": {"url": data_uri}}] if data_uri else []
        )
        # The text part is always present and falls back to a generic instruction.
        return image_parts + [{"type": "text", "text": text or "Analyze this image."}]

    note = "[Attached image, but the multimodal projector is not loaded in llama-server.]"
    if text:
        return f"{text}\n\n{note}"
    return note

def respond(history):
    """Stream the assistant reply for *history* from llama-server.

    The request carries the system prompt, an optional RAG context block
    built from the latest user message, and the converted chat history.

    Args:
        history: Chat turns, either message dicts (``"role"``/``"content"``)
            or ``(user, assistant)`` pairs.

    Yields:
        str: The reply accumulated so far, once per received content delta.
        While the engine is not ready a single status message is yielded
        instead, and any request failure is yielded as ``"Error: ..."``.
    """
    if not server_ready:
        if server_error:
            yield f"Engine unavailable: {server_error}"
            return
        yield "Engine loading… this may take a few minutes the first time."
        return

    api_messages = [{"role": "system", "content": CLARITYGUARD_SYSTEM_PROMPT}]
    rag_context = retrieve_rag_context(latest_user_text(history))
    if rag_context:
        api_messages.append({"role": "system", "content": rag_context})
    for m in history:
        if isinstance(m, dict):
            content = m.get("content", "")
            # Multi-part (list) content is forwarded untouched; anything else as text.
            api_messages.append({
                "role": m.get("role", "user"),
                "content": content if isinstance(content, list) else str(content),
            })
            continue

        if isinstance(m, (list, tuple)) and len(m) >= 2:
            user_msg, assistant_msg = m[0], m[1]
            if user_msg:
                api_messages.append({"role": "user", "content": str(user_msg)})
            if assistant_msg:
                api_messages.append({"role": "assistant", "content": str(assistant_msg)})

    try:
        # The context manager releases the streamed connection on every exit
        # path, including when the consumer abandons this generator early.
        with requests.post(
            f"{SERVER_URL}/v1/chat/completions",
            json={
                "model": MODEL_FILE,
                "messages": api_messages,
                "stream": True,
                "temperature": LLAMA_TEMP,
                "max_tokens": LLAMA_MAX_TOKENS,
            },
            stream=True, timeout=1200
        ) as r:
            r.raise_for_status()
            full_text = ""
            for line in r.iter_lines():
                if not line:
                    continue
                raw = line.decode("utf-8")
                if not raw.startswith("data:"):
                    continue
                chunk = raw[5:].strip()
                if chunk == "[DONE]":
                    break
                try:
                    delta = json.loads(chunk)["choices"][0].get("delta", {}).get("content")
                except (ValueError, KeyError, IndexError, TypeError, AttributeError):
                    # Malformed or unexpected event: skip it and keep streaming.
                    continue
                # Events without text (null or empty content) add nothing to show.
                if not isinstance(delta, str) or not delta:
                    continue
                full_text += delta
                yield full_text
    except Exception as e:
        yield f"Error: {e}"

# Gradio UI: intro text, chat window, text box, optional image input and a
# Send button. Components are created in the order they appear here.
with gr.Blocks() as demo:
    gr.Markdown("""# ClarityGuard
Hi there! I'm ClarityGuard. How can I help you?

You can ask me things like:

> *"I missed a minor typo in a draft report, and my manager CC'd the entire HR department, calling it a 'concerning pattern of negligence.' My neurotypical peers make much bigger mistakes and it's just called a 'learning curve.' I feel like they're building a 'paper trail' to fire me over a non-issue — what happened here?"*

> *"During my annual review, they said I'm not a 'culture fit' because I don't go to the Friday happy hours. I told them I prefer to focus on my technical tasks during work hours, but they said I lack 'passion for the company vision.' It feels like they're judging my character because I don't want to perform the expected social scripts — does this make sense?"*

Or ask me anything — about work, relationships, or just to talk. I'm open to anything.
""")
    chatbot = gr.Chatbot(height=500)
    msg = gr.Textbox(placeholder="Type your message and press Enter...")
    image = gr.Image(label="Optional image", type="filepath")
    # Second history kept alongside the visible one: it stores what is sent to
    # the model (including image content parts), while the chatbot only shows
    # an "[Attached image]" placeholder.
    api_state = gr.State([])

    def user_fn(message, image_path, visible_history, api_history):
        """Append the submitted user turn to both histories.

        Returns ``("", None, visible_history, api_history)``; wired to the
        outputs ``[msg, image, chatbot, api_state]``, the first two values
        reset the text box and the image input.
        """
        if visible_history is None:
            visible_history = []
        if api_history is None:
            api_history = []

        text = str(message or "").strip()
        visible_text = text
        if image_path:
            visible_text = f"{text}\n\n[Attached image]" if text else "[Attached image]"

        visible_history.append({"role": "user", "content": visible_text})
        api_history.append({"role": "user", "content": make_user_content(text, image_path)})
        return "", None, visible_history, api_history

    def bot_fn(visible_history, api_history):
        """Stream the assistant reply into the chat window.

        Yields ``(visible_history, api_history)`` after every partial reply
        from ``respond``, then once more after the final reply has been
        recorded in ``api_history``.
        """
        if visible_history is None:
            visible_history = []
        if api_history is None:
            api_history = []

        # Placeholder bubble that is overwritten with each partial reply.
        visible_history.append({"role": "assistant", "content": ""})
        for chunk in respond(api_history):
            visible_history[-1] = {"role": "assistant", "content": chunk}
            yield visible_history, api_history
        if visible_history:
            api_history.append({"role": "assistant", "content": visible_history[-1]["content"]})
            yield visible_history, api_history

    send = gr.Button("Send")

    # Pressing Enter and clicking Send run the same two-step chain:
    # record the user turn first, then stream the reply.
    msg.submit(user_fn, [msg, image, chatbot, api_state], [msg, image, chatbot, api_state]).then(
        bot_fn, [chatbot, api_state], [chatbot, api_state]
    )
    send.click(user_fn, [msg, image, chatbot, api_state], [msg, image, chatbot, api_state]).then(
        bot_fn, [chatbot, api_state], [chatbot, api_state]
    )

if __name__ == "__main__":
    # Start every run with a fresh log file. The encoding is given explicitly
    # because log() appends to this same file as UTF-8.
    with open(LOG_FILE, "w", encoding="utf-8") as f:
        f.write("Iniciando...\n")
    # Both the RAG index build and the engine start run in daemon threads, so
    # the web UI comes up immediately and reports "loading" until they finish.
    threading.Thread(target=get_rag_chunks, daemon=True).start()
    threading.Thread(target=monitor_engine, daemon=True).start()
    demo.launch(server_name="0.0.0.0", server_port=7860)