# react_engine.py
# PBH Applied Systems — Lightweight custom ReAct loop.
# Thought → Action → Observation cycles with full trace streaming.
# No LlamaIndex dependency. Llama instance is passed in from model_loader.
import re
from typing import Generator
from tools import dispatch_tool, build_tool_prompt_section, TOOL_REGISTRY
# Default cap on Thought/Action cycles; used as the default for
# run_react_loop's ``max_steps`` parameter.
MAX_STEPS: int = 8
# Passed as ``max_tokens`` to every chat-completion call made by the loop.
MAX_TOKENS_PER_STEP: int = 3072
# Marker looked for (case-insensitively) in model output to detect that the
# agent has produced its final answer.
FINAL_ANSWER_PREFIX: str = "FINAL ANSWER:"
# ---------------------------------------------------------------------------
# Per-template system prompt personas
# Agents have access to pre-computed behavioral evaluation data from the
# PBH Applied Systems quant_eval v7.21 evaluation series — NOT the
# quant_eval harness itself, which is a separate proprietary system.
# ---------------------------------------------------------------------------
# Maps an agent template name to the persona paragraph that opens the system
# prompt. build_system_prompt() falls back to the "reasoning" entry when the
# requested template name is not a key of this dict.
_TEMPLATE_PERSONAS: dict[str, str] = {
# Persona for the 'reasoning' template (also the fallback persona).
"reasoning": (
"You are a production-grade reasoning and analysis agent built on evaluated, "
"quantized open-weight models by PBH Applied Systems. You have deep expertise "
"in multi-step logical inference, strategic analysis, data science, machine "
"learning concepts, and auditable decision workflows. You produce transparent, "
"well-structured chain-of-thought reasoning. You write complete, thorough "
"responses — never truncated, never redirected when you can answer directly."
),
# Persona for the 'document' template.
"document": (
"You are a production-grade document intelligence agent built on evaluated, "
"quantized open-weight models by PBH Applied Systems. You specialize in "
"extracting, analyzing, summarizing, and structuring information from documents, "
"contracts, policies, research papers, and any text-heavy input. You produce "
"complete, well-organized responses with clear structure. You never truncate "
"or redirect when you can answer directly from your knowledge."
),
# Persona for the 'code' template.
"code": (
"You are a production-grade code and automation agent built on evaluated, "
"quantized open-weight models by PBH Applied Systems. You have deep expertise "
"in Python, data engineering, ETL pipelines, Flask APIs, ML infrastructure, "
"batch processing, and production-quality code generation. You write complete, "
"working, well-commented code with full implementations — never stubs, "
"never truncated examples, never redirected when you can answer directly."
),
}
# System prompt skeleton rendered with str.format() in build_system_prompt().
# It has exactly two placeholders:
#   {persona}       - one of the _TEMPLATE_PERSONAS values
#   {tools_section} - the text returned by build_tool_prompt_section()
# The text must not contain any other literal braces, or str.format() would
# treat them as placeholders.
_REACT_SYSTEM_PROMPT_TEMPLATE: str = """{persona}
You also have access to pre-computed behavioral evaluation data from the PBH Applied Systems quant_eval v7.21 evaluation series, served through four lookup tools:
{tools_section}
BEHAVIOR RULES:
- For general questions (coding, data science, ML, reasoning, writing, document analysis): answer directly and completely from your own knowledge. Do NOT force a tool call. Output FINAL ANSWER: with a full, useful response.
- For questions about model scores, model selection, quantization tradeoffs, fixture families, or deployment recommendations: use the ReAct format — THOUGHT:, ACTION:, OBSERVATION:, then FINAL ANSWER:
- ACTION format: ACTION: tool_name(argument)
- Never invent tool outputs. Never fabricate scores or evaluation data.
- Never give a short or unhelpful redirect when you can answer the question yourself.
- Always produce complete responses. Never cut off mid-implementation.
Begin."""
def build_system_prompt(agent_template: str = "reasoning") -> str:
    """Render the full system prompt for the given agent template.

    Args:
        agent_template: key into ``_TEMPLATE_PERSONAS``. An unknown key
            silently selects the ``"reasoning"`` persona.

    Returns:
        ``_REACT_SYSTEM_PROMPT_TEMPLATE`` with the persona and the tool
        listing from ``build_tool_prompt_section()`` substituted in.
    """
    default_persona = _TEMPLATE_PERSONAS["reasoning"]
    chosen_persona = _TEMPLATE_PERSONAS.get(agent_template, default_persona)
    tools_text = build_tool_prompt_section()
    return _REACT_SYSTEM_PROMPT_TEMPLATE.format(
        persona=chosen_persona,
        tools_section=tools_text,
    )
# EOS token patterns — required for Phi-4-reasoning-plus and Qwen series
_EOS_PATTERN = re.compile(r'<\|im_end\|>|<\|end\|>|<\|endoftext\|>', re.IGNORECASE)

# Think-block pattern — required for Qwen3.6-27B.
# The explicit <think>…</think> delimiters are essential: a bare `.*?` matches
# arbitrary text, so substituting it would not isolate the reasoning block.
# DOTALL lets a block span several lines; the lazy quantifier keeps separate
# blocks from being merged into one match.
_THINK_PATTERN = re.compile(r'<think>.*?</think>', re.DOTALL)


def strip_output(text: str, strip_thinking: bool = False) -> str:
    """Strip EOS tokens and optionally <think>…</think> blocks.

    Args:
        text: raw model output. ``None`` or an empty string is treated as
            empty output.
        strip_thinking: when True, every ``<think>…</think>`` block is removed
            before the EOS markers are stripped.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    if not text:
        # Nothing to clean; avoids a TypeError when no content was produced.
        return ""
    if strip_thinking:
        text = _THINK_PATTERN.sub('', text)
    return _EOS_PATTERN.sub('', text).strip()
_ACTION_RE = re.compile(r'ACTION:\s*(\w+)\(([^)]*)\)', re.IGNORECASE)
def parse_action(text: str) -> tuple[str, str] | None:
# Strip markdown bold markers and trailing punctuation before parsing
cleaned = re.sub(r'\*+', '', text).strip()
match = _ACTION_RE.search(cleaned)
if match:
return match.group(1).strip(), match.group(2).strip()
# Fallback: ACTION: tool_name args (no parens)
fallback = re.search(r'ACTION:\s*(\w+)\s*(.*)', cleaned, re.IGNORECASE)
if fallback:
tool = fallback.group(1).strip()
args = re.sub(r'\*+', '', fallback.group(2)).strip()
return tool, args
return None
# Phrases that suggest a reply without an ACTION is already a finished answer.
_CONCLUSION_MARKERS = (
    "in conclusion", "therefore", "to summarize", "here is", "here's",
    "the above", "this implementation", "this pipeline", "this script",
)


def _looks_like_direct_answer(output: str) -> bool:
    """Return True when a reply without an ACTION reads like a complete answer."""
    if '```' in output:
        return True
    if len(output.strip()) > 400:
        return True
    lowered = output.lower()
    return any(marker in lowered for marker in _CONCLUSION_MARKERS)


def _iter_trace_lines(text: str) -> Generator[str, None, None]:
    """Yield every non-blank line of *text*, each terminated by a newline."""
    for line in text.strip().splitlines():
        if line.strip():
            yield f"{line}\n"


def run_react_loop(
    llm,
    user_query: str,
    model_key: str,
    agent_template: str = "reasoning",
    temperature: float = 0.2,
    max_steps: int = MAX_STEPS,
) -> Generator[str, None, None]:
    """
    Run the ReAct loop for a single model.

    Yields trace lines incrementally for Gradio streaming.

    Each step asks the model for a completion (generation stops at
    "OBSERVATION:"), echoes the cleaned output, and then does one of:

    * ends the run when the output contains the final-answer marker;
    * ends the run when there is no ACTION but the output already looks like
      a complete answer;
    * appends a nudge message and retries when there is no ACTION and the
      output is not a complete answer;
    * dispatches the parsed tool call and feeds its result back to the model
      as an "OBSERVATION:" user message.

    If ``max_steps`` cycles pass without a final answer, one extra completion
    is requested with an explicit instruction to answer now. If a completion
    call raises, the error is reported in the trace and the run ends.

    Args:
        llm: loaded llama_cpp.Llama instance
        user_query: the user's prompt
        model_key: key from MODELS dict
        agent_template: one of 'reasoning', 'document', 'code'
        temperature: sampling temperature
        max_steps: maximum Thought/Action cycles

    Yields:
        Markdown-formatted trace fragments, each ending in a newline.
    """
    from eval_data import MODELS

    m = MODELS.get(model_key, {})
    short_name = m.get("short_name", model_key)
    thinking_mode = m.get("thinking_mode", False)
    system_prompt = build_system_prompt(agent_template)
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_query},
    ]
    yield f"**[{short_name}] Starting...**\n"

    for step in range(1, max_steps + 1):
        yield f"\n---\n**Step {step}**\n"
        try:
            response = llm.create_chat_completion(
                messages=messages,
                temperature=temperature,
                max_tokens=MAX_TOKENS_PER_STEP,
                stop=["OBSERVATION:"],
            )
            # Content can be missing; normalise to "" so the cleanup below
            # never receives None.
            raw = response["choices"][0]["message"]["content"] or ""
        except Exception as e:
            yield f"\n⚠️ **LLM error on step {step}:** {e}\n"
            # End the run here. Falling through to the code after the loop
            # would announce "Max steps reached" although the step budget was
            # not exhausted, and would immediately call the failed LLM again.
            return

        output = strip_output(raw, strip_thinking=thinking_mode)
        messages.append({"role": "assistant", "content": output})
        yield from _iter_trace_lines(output)

        if FINAL_ANSWER_PREFIX.upper() in output.upper():
            yield "\n✅ **Agent reached Final Answer.**\n"
            return

        action = parse_action(output)
        if action is None:
            # The model may have answered directly without the FINAL ANSWER
            # marker; accept that instead of nudging it into a tool call.
            if _looks_like_direct_answer(output):
                yield "\n✅ **Agent concluded with direct response.**\n"
                return
            messages.append({
                "role": "user",
                "content": (
                    f"Available tools are: {', '.join(TOOL_REGISTRY.keys())}. "
                    f"If this query is a general coding, analysis, or writing task, output "
                    f"FINAL ANSWER: followed by your complete response. "
                    f"If this query is about evaluation data, call a tool with: ACTION: tool_name(args)"
                )
            })
            yield "\n⚠️ *No parseable ACTION. Nudging...*\n"
            continue

        tool_name, args_str = action
        yield f"\n🔧 **Tool:** `{tool_name}({args_str})`\n"
        observation = dispatch_tool(tool_name, args_str)
        yield f"\n📋 **Observation:**\n```\n{observation}\n```\n"
        # The observation is supplied as a user turn so the next completion
        # continues from the tool result.
        messages.append({"role": "user", "content": f"OBSERVATION:\n{observation}"})

    # Max steps reached — force conclusion
    yield f"\n⚠️ **Max steps ({max_steps}) reached. Forcing final answer...**\n"
    messages.append({
        "role": "user",
        "content": f"You have used {max_steps} steps. Output your FINAL ANSWER: now."
    })
    try:
        final = llm.create_chat_completion(
            messages=messages,
            temperature=temperature,
            max_tokens=MAX_TOKENS_PER_STEP,
        )
        final_text = strip_output(
            final["choices"][0]["message"]["content"] or "",
            strip_thinking=thinking_mode
        )
        yield from _iter_trace_lines(final_text)
    except Exception as e:
        yield f"\n⚠️ **Error generating forced final answer:** {e}\n"