# react_engine.py
# PBH Applied Systems — Lightweight custom ReAct loop.
# Thought → Action → Observation cycles with full trace streaming.
# No LlamaIndex dependency. Llama instance is passed in from model_loader.

import re
from typing import Generator

from tools import dispatch_tool, build_tool_prompt_section, TOOL_REGISTRY

MAX_STEPS = 8
MAX_TOKENS_PER_STEP = 3072
FINAL_ANSWER_PREFIX = "FINAL ANSWER:"

# ---------------------------------------------------------------------------
# Per-template system prompt personas
# Agents have access to pre-computed behavioral evaluation data from the
# PBH Applied Systems quant_eval v7.21 evaluation series — NOT the
# quant_eval harness itself, which is a separate proprietary system.
# ---------------------------------------------------------------------------
_TEMPLATE_PERSONAS = {
    "reasoning": (
        "You are a production-grade reasoning and analysis agent built on evaluated, "
        "quantized open-weight models by PBH Applied Systems. You have deep expertise "
        "in multi-step logical inference, strategic analysis, data science, machine "
        "learning concepts, and auditable decision workflows. You produce transparent, "
        "well-structured chain-of-thought reasoning. You write complete, thorough "
        "responses — never truncated, never redirected when you can answer directly."
    ),
    "document": (
        "You are a production-grade document intelligence agent built on evaluated, "
        "quantized open-weight models by PBH Applied Systems. You specialize in "
        "extracting, analyzing, summarizing, and structuring information from documents, "
        "contracts, policies, research papers, and any text-heavy input. You produce "
        "complete, well-organized responses with clear structure. You never truncate "
        "or redirect when you can answer directly from your knowledge."
    ),
    "code": (
        "You are a production-grade code and automation agent built on evaluated, "
        "quantized open-weight models by PBH Applied Systems. You have deep expertise "
        "in Python, data engineering, ETL pipelines, Flask APIs, ML infrastructure, "
        "batch processing, and production-quality code generation. You write complete, "
        "working, well-commented code with full implementations — never stubs, "
        "never truncated examples, never redirected when you can answer directly."
    ),
}

# Filled by build_system_prompt() with the persona text and the tool listing.
_REACT_SYSTEM_PROMPT_TEMPLATE = """{persona}

You also have access to pre-computed behavioral evaluation data from the PBH Applied Systems quant_eval v7.21 evaluation series, served through four lookup tools:

{tools_section}

BEHAVIOR RULES:
- For general questions (coding, data science, ML, reasoning, writing, document analysis): answer directly and completely from your own knowledge. Do NOT force a tool call. Output FINAL ANSWER: with a full, useful response.
- For questions about model scores, model selection, quantization tradeoffs, fixture families, or deployment recommendations: use the ReAct format — THOUGHT:, ACTION:, OBSERVATION:, then FINAL ANSWER:
- ACTION format: ACTION: tool_name(argument)
- Never invent tool outputs. Never fabricate scores or evaluation data.
- Never give a short or unhelpful redirect when you can answer the question yourself.
- Always produce complete responses. Never cut off mid-implementation.

Begin."""


def build_system_prompt(agent_template: str = "reasoning") -> str:
    """Return the system prompt for the given agent template.

    Args:
        agent_template: key into ``_TEMPLATE_PERSONAS``; an unknown key falls
            back to the ``"reasoning"`` persona.

    Returns:
        The prompt template with the persona and the output of
        ``build_tool_prompt_section()`` substituted in.
    """
    persona = _TEMPLATE_PERSONAS.get(agent_template, _TEMPLATE_PERSONAS["reasoning"])
    return _REACT_SYSTEM_PROMPT_TEMPLATE.format(
        persona=persona,
        tools_section=build_tool_prompt_section(),
    )


# EOS token patterns — required for Phi-4-reasoning-plus and Qwen series
_EOS_PATTERN = re.compile(r'<\|im_end\|>|<\|end\|>|<\|endoftext\|>', re.IGNORECASE)

# Think-block pattern — required for Qwen3.6-27B.
# The delimiters are essential: a bare lazy `.*?` matches the empty string and
# then every single character in turn, so substituting it erases the whole text.
_THINK_PATTERN = re.compile(r'<think>.*?</think>', re.DOTALL)


def strip_output(text: str, strip_thinking: bool = False) -> str:
    """Strip EOS tokens and optionally <think>...</think> blocks.

    Args:
        text: raw model output.
        strip_thinking: when True, remove every complete
            ``<think>...</think>`` span (may cover multiple lines) before
            the EOS tokens are removed.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    if strip_thinking:
        text = _THINK_PATTERN.sub('', text)
    return _EOS_PATTERN.sub('', text).strip()


# Primary form: ACTION: tool_name(args) — args may not contain ')'.
_ACTION_RE = re.compile(r'ACTION:\s*(\w+)\(([^)]*)\)', re.IGNORECASE)
# Fallback form: ACTION: tool_name args (no parentheses).
_ACTION_FALLBACK_RE = re.compile(r'ACTION:\s*(\w+)\s*(.*)', re.IGNORECASE)
# Runs of '*' are markdown emphasis markers that models wrap around keywords.
_BOLD_MARKER_RE = re.compile(r'\*+')


def parse_action(text: str) -> tuple[str, str] | None:
    """Extract the first tool call from a model turn.

    Args:
        text: model output that may contain an ``ACTION:`` line.

    Returns:
        ``(tool_name, argument_string)`` with surrounding whitespace removed
        from both parts, or ``None`` when no ``ACTION:`` marker followed by a
        tool name is found.
    """
    # Drop markdown emphasis markers so "**ACTION:** tool(x)" still parses.
    cleaned = _BOLD_MARKER_RE.sub('', text).strip()

    match = _ACTION_RE.search(cleaned)
    if match is None:
        # No parenthesised call found; accept "ACTION: tool_name args".
        match = _ACTION_FALLBACK_RE.search(cleaned)
    if match is None:
        return None
    return match.group(1).strip(), match.group(2).strip()


# Phrases that suggest the model has already written a complete answer.
_CONCLUSION_MARKERS = (
    "in conclusion", "therefore", "to summarize", "here is", "here's",
    "the above", "this implementation", "this pipeline", "this script",
)
# Outputs longer than this many characters are treated as a direct answer.
_SUBSTANTIAL_LENGTH = 400


def _trace_lines(text: str) -> Generator[str, None, None]:
    """Yield each non-blank line of *text*, terminated with a newline."""
    for line in text.strip().splitlines():
        if line.strip():
            yield f"{line}\n"


def _looks_like_direct_answer(output: str) -> bool:
    """Heuristically decide whether *output* is already a complete answer.

    True when the text contains a code fence, one of the conclusion marker
    phrases, or is longer than ``_SUBSTANTIAL_LENGTH`` characters.
    """
    if '```' in output:
        return True
    lowered = output.lower()
    if any(marker in lowered for marker in _CONCLUSION_MARKERS):
        return True
    return len(output.strip()) > _SUBSTANTIAL_LENGTH


def run_react_loop(
    llm,
    user_query: str,
    model_key: str,
    agent_template: str = "reasoning",
    temperature: float = 0.2,
    max_steps: int = MAX_STEPS,
) -> Generator[str, None, None]:
    """
    Run the ReAct loop for a single model.
    Yields trace lines incrementally for Gradio streaming.

    Each step asks the model for one turn, echoes it to the trace, and then
    either stops (final answer or direct answer), runs the requested tool and
    feeds the observation back, or nudges the model when no action could be
    parsed. If the step budget runs out, one extra completion is requested to
    force a final answer. An LLM error ends the run after reporting it.

    Args:
        llm: loaded llama_cpp.Llama instance
        user_query: the user's prompt
        model_key: key from MODELS dict
        agent_template: one of 'reasoning', 'document', 'code'
        temperature: sampling temperature
        max_steps: maximum Thought/Action cycles

    Yields:
        Markdown-formatted trace fragments.
    """
    # Imported at call time, as in the original (NOTE(review): presumably to
    # avoid an import cycle or load cost — confirm before moving to the top).
    from eval_data import MODELS

    model_info = MODELS.get(model_key, {})
    short_name = model_info.get("short_name", model_key)
    thinking_mode = model_info.get("thinking_mode", False)

    messages = [
        {"role": "system", "content": build_system_prompt(agent_template)},
        {"role": "user", "content": user_query},
    ]

    yield f"**[{short_name}] Starting...**\n"

    for step in range(1, max_steps + 1):
        yield f"\n---\n**Step {step}**\n"

        try:
            response = llm.create_chat_completion(
                messages=messages,
                temperature=temperature,
                max_tokens=MAX_TOKENS_PER_STEP,
                # Stop before the model can invent its own observation.
                stop=["OBSERVATION:"],
            )
            # content may be None; treat that as an empty turn.
            raw = response["choices"][0]["message"]["content"] or ""
        except Exception as e:
            yield f"\n⚠️ **LLM error on step {step}:** {e}\n"
            # End the run here. Falling through to the forced-final-answer
            # path would report "Max steps reached" falsely and call the
            # failing model again.
            return

        output = strip_output(raw, strip_thinking=thinking_mode)
        messages.append({"role": "assistant", "content": output})
        yield from _trace_lines(output)

        if FINAL_ANSWER_PREFIX.upper() in output.upper():
            yield "\n✅ **Agent reached Final Answer.**\n"
            return

        action = parse_action(output)
        if action is None:
            # The model may have answered directly without the FINAL ANSWER
            # prefix; if so the turn is already in the trace, so stop.
            if _looks_like_direct_answer(output):
                yield "\n✅ **Agent concluded with direct response.**\n"
                return

            messages.append({
                "role": "user",
                "content": (
                    f"Available tools are: {', '.join(TOOL_REGISTRY)}. "
                    "If this query is a general coding, analysis, or writing task, output "
                    "FINAL ANSWER: followed by your complete response. "
                    "If this query is about evaluation data, call a tool with: ACTION: tool_name(args)"
                ),
            })
            yield "\n⚠️ *No parseable ACTION. Nudging...*\n"
            continue

        tool_name, args_str = action
        yield f"\n🔧 **Tool:** `{tool_name}({args_str})`\n"
        observation = dispatch_tool(tool_name, args_str)
        yield f"\n📋 **Observation:**\n```\n{observation}\n```\n"
        messages.append({"role": "user", "content": f"OBSERVATION:\n{observation}"})

    # Max steps reached — force conclusion
    yield f"\n⚠️ **Max steps ({max_steps}) reached. Forcing final answer...**\n"
    messages.append({
        "role": "user",
        "content": f"You have used {max_steps} steps. Output your FINAL ANSWER: now.",
    })
    try:
        final = llm.create_chat_completion(
            messages=messages,
            temperature=temperature,
            max_tokens=MAX_TOKENS_PER_STEP,
        )
        final_text = strip_output(
            final["choices"][0]["message"]["content"] or "",
            strip_thinking=thinking_mode,
        )
        yield from _trace_lines(final_text)
    except Exception as e:
        yield f"\n⚠️ **Error generating forced final answer:** {e}\n"