# Spaces: pbhappliedsystems/quant-eval-agent-arena (Running on Zero)
# File size: 9,345 Bytes

# react_engine.py
# PBH Applied Systems — Lightweight custom ReAct loop.
# Thought → Action → Observation cycles with full trace streaming.
# No LlamaIndex dependency. Llama instance is passed in from model_loader.

import re
from typing import Generator
from tools import dispatch_tool, build_tool_prompt_section, TOOL_REGISTRY

# Default cap on Thought/Action cycles per query (default value of
# run_react_loop's max_steps parameter).
MAX_STEPS = 8
# Passed as max_tokens to every chat-completion call made by the loop.
MAX_TOKENS_PER_STEP = 3072
# Marker whose presence in the model output (compared case-insensitively)
# ends the loop as a final answer.
FINAL_ANSWER_PREFIX = "FINAL ANSWER:"

# ---------------------------------------------------------------------------
# Per-template system prompt personas
# Agents have access to pre-computed behavioral evaluation data from the
# PBH Applied Systems quant_eval v7.21 evaluation series — NOT the
# quant_eval harness itself, which is a separate proprietary system.
# ---------------------------------------------------------------------------

# Persona paragraph per agent template name. build_system_prompt looks the
# template up here and falls back to the "reasoning" entry for unknown names,
# so the "reasoning" key must always exist.
_TEMPLATE_PERSONAS = {
    # General multi-step reasoning / analysis persona (also the fallback).
    "reasoning": (
        "You are a production-grade reasoning and analysis agent built on evaluated, "
        "quantized open-weight models by PBH Applied Systems. You have deep expertise "
        "in multi-step logical inference, strategic analysis, data science, machine "
        "learning concepts, and auditable decision workflows. You produce transparent, "
        "well-structured chain-of-thought reasoning. You write complete, thorough "
        "responses — never truncated, never redirected when you can answer directly."
    ),
    # Document extraction / summarization persona.
    "document": (
        "You are a production-grade document intelligence agent built on evaluated, "
        "quantized open-weight models by PBH Applied Systems. You specialize in "
        "extracting, analyzing, summarizing, and structuring information from documents, "
        "contracts, policies, research papers, and any text-heavy input. You produce "
        "complete, well-organized responses with clear structure. You never truncate "
        "or redirect when you can answer directly from your knowledge."
    ),
    # Code generation / automation persona.
    "code": (
        "You are a production-grade code and automation agent built on evaluated, "
        "quantized open-weight models by PBH Applied Systems. You have deep expertise "
        "in Python, data engineering, ETL pipelines, Flask APIs, ML infrastructure, "
        "batch processing, and production-quality code generation. You write complete, "
        "working, well-commented code with full implementations — never stubs, "
        "never truncated examples, never redirected when you can answer directly."
    ),
}

# System prompt skeleton. The {persona} and {tools_section} placeholders are
# filled in by build_system_prompt via str.format, so any literal curly brace
# added to this text must be doubled ({{ / }}).
_REACT_SYSTEM_PROMPT_TEMPLATE = """{persona}

You also have access to pre-computed behavioral evaluation data from the PBH Applied Systems quant_eval v7.21 evaluation series, served through four lookup tools:

{tools_section}

BEHAVIOR RULES:
- For general questions (coding, data science, ML, reasoning, writing, document analysis): answer directly and completely from your own knowledge. Do NOT force a tool call. Output FINAL ANSWER: with a full, useful response.
- For questions about model scores, model selection, quantization tradeoffs, fixture families, or deployment recommendations: use the ReAct format — THOUGHT:, ACTION:, OBSERVATION:, then FINAL ANSWER:
- ACTION format: ACTION: tool_name(argument)
- Never invent tool outputs. Never fabricate scores or evaluation data.
- Never give a short or unhelpful redirect when you can answer the question yourself.
- Always produce complete responses. Never cut off mid-implementation.

Begin."""


def build_system_prompt(agent_template: str = "reasoning") -> str:
    """Assemble the full system prompt for the given agent template.

    The persona paragraph is looked up by template name; an unrecognised
    name silently falls back to the "reasoning" persona. The tool listing
    is rendered fresh on every call.
    """
    try:
        persona_text = _TEMPLATE_PERSONAS[agent_template]
    except KeyError:
        persona_text = _TEMPLATE_PERSONAS["reasoning"]

    tool_docs = build_tool_prompt_section()
    return _REACT_SYSTEM_PROMPT_TEMPLATE.format(
        persona=persona_text,
        tools_section=tool_docs,
    )


# EOS token patterns — required for Phi-4-reasoning-plus and Qwen series
_EOS_PATTERN = re.compile(r'<\|im_end\|>|<\|end\|>|<\|endoftext\|>', re.IGNORECASE)

# Think-block pattern — required for Qwen3.6-27B
_THINK_PATTERN = re.compile(r'<think>.*?</think>', re.DOTALL)


def strip_output(text: str, strip_thinking: bool = False) -> str:
    """Strip EOS tokens and optionally <think> blocks."""
    if strip_thinking:
        text = _THINK_PATTERN.sub('', text)
    return _EOS_PATTERN.sub('', text).strip()


_ACTION_RE = re.compile(r'ACTION:\s*(\w+)\(([^)]*)\)', re.IGNORECASE)


def parse_action(text: str) -> tuple[str, str] | None:
    # Strip markdown bold markers and trailing punctuation before parsing
    cleaned = re.sub(r'\*+', '', text).strip()
    match = _ACTION_RE.search(cleaned)
    if match:
        return match.group(1).strip(), match.group(2).strip()
    # Fallback: ACTION: tool_name args (no parens)
    fallback = re.search(r'ACTION:\s*(\w+)\s*(.*)', cleaned, re.IGNORECASE)
    if fallback:
        tool = fallback.group(1).strip()
        args = re.sub(r'\*+', '', fallback.group(2)).strip()
        return tool, args
    return None


# Phrases that suggest the model has already written out a complete answer.
_CONCLUSION_MARKERS = (
    "in conclusion", "therefore", "to summarize", "here is", "here's",
    "the above", "this implementation", "this pipeline", "this script",
)


def _looks_like_direct_answer(output: str) -> bool:
    """Heuristically decide whether an action-less output is a complete answer."""
    # A code fence or a long response is treated as a finished answer.
    if '```' in output or len(output.strip()) > 400:
        return True
    lowered = output.lower()
    return any(marker in lowered for marker in _CONCLUSION_MARKERS)


def run_react_loop(
    llm,
    user_query: str,
    model_key: str,
    agent_template: str = "reasoning",
    temperature: float = 0.2,
    max_steps: int = MAX_STEPS,
) -> Generator[str, None, None]:
    """
    Run the ReAct loop for a single model.
    Yields trace lines incrementally for Gradio streaming.

    Each step requests one completion (generation stops at "OBSERVATION:" so
    the model cannot write its own tool output), then either finishes on a
    final answer, runs the requested tool and feeds the result back, or
    nudges the model when no action could be parsed. If the step budget runs
    out, one extra completion is requested to force a final answer. An LLM
    error ends the run immediately after the error is reported.

    Args:
        llm: loaded llama_cpp.Llama instance
        user_query: the user's prompt
        model_key: key from MODELS dict
        agent_template: one of 'reasoning', 'document', 'code'
        temperature: sampling temperature
        max_steps: maximum Thought/Action cycles

    Yields:
        Markdown-formatted trace fragments (step headers, model output lines,
        tool calls, observations, status and error notices).
    """
    from eval_data import MODELS
    m = MODELS.get(model_key, {})
    short_name = m.get("short_name", model_key)
    thinking_mode = m.get("thinking_mode", False)

    system_prompt = build_system_prompt(agent_template)
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_query},
    ]

    yield f"**[{short_name}] Starting...**\n"

    for step in range(1, max_steps + 1):
        yield f"\n---\n**Step {step}**\n"

        try:
            response = llm.create_chat_completion(
                messages=messages,
                temperature=temperature,
                max_tokens=MAX_TOKENS_PER_STEP,
                stop=["OBSERVATION:"],
            )
            # Guard against a null content field so the cleanup below always
            # receives a string.
            raw = response["choices"][0]["message"]["content"] or ""
        except Exception as e:
            yield f"\n⚠️ **LLM error on step {step}:** {e}\n"
            # Stop here: falling through to the forced-final-answer path
            # would wrongly report "Max steps reached" and issue another
            # call to a model that just failed.
            return

        output = strip_output(raw, strip_thinking=thinking_mode)
        messages.append({"role": "assistant", "content": output})

        for line in output.strip().splitlines():
            if line.strip():
                yield f"{line}\n"

        if FINAL_ANSWER_PREFIX.upper() in output.upper():
            yield "\n✅ **Agent reached Final Answer.**\n"
            return

        action = parse_action(output)
        if action is None:
            if _looks_like_direct_answer(output):
                # The model answered directly without the FINAL ANSWER marker;
                # its text has already been streamed, so just finish.
                yield "\n✅ **Agent concluded with direct response.**\n"
                return

            # Neither an action nor a usable answer: remind the model of the
            # expected formats and try again on the next step.
            messages.append({
                "role": "user",
                "content": (
                    f"Available tools are: {', '.join(TOOL_REGISTRY.keys())}. "
                    f"If this query is a general coding, analysis, or writing task, output "
                    f"FINAL ANSWER: followed by your complete response. "
                    f"If this query is about evaluation data, call a tool with: ACTION: tool_name(args)"
                )
            })
            yield "\n⚠️ *No parseable ACTION. Nudging...*\n"
            continue

        tool_name, args_str = action
        yield f"\n🔧 **Tool:** `{tool_name}({args_str})`\n"

        observation = dispatch_tool(tool_name, args_str)
        yield f"\n📋 **Observation:**\n```\n{observation}\n```\n"

        # Feed the tool result back as a user turn for the next step.
        messages.append({"role": "user", "content": f"OBSERVATION:\n{observation}"})

    # Max steps reached — force conclusion
    yield f"\n⚠️ **Max steps ({max_steps}) reached. Forcing final answer...**\n"
    messages.append({
        "role": "user",
        "content": f"You have used {max_steps} steps. Output your FINAL ANSWER: now."
    })
    try:
        final = llm.create_chat_completion(
            messages=messages,
            temperature=temperature,
            max_tokens=MAX_TOKENS_PER_STEP,
        )
        final_text = strip_output(
            final["choices"][0]["message"]["content"] or "",
            strip_thinking=thinking_mode
        )
        for line in final_text.strip().splitlines():
            if line.strip():
                yield f"{line}\n"
    except Exception as e:
        yield f"\n⚠️ **Error generating forced final answer:** {e}\n"