Spaces:
Running on Zero
Running on Zero
| # react_engine.py | |
| # PBH Applied Systems — Lightweight custom ReAct loop. | |
| # Thought → Action → Observation cycles with full trace streaming. | |
| # No LlamaIndex dependency. Llama instance is passed in from model_loader. | |
| import re | |
| from typing import Generator | |
| from tools import dispatch_tool, build_tool_prompt_section, TOOL_REGISTRY | |
# Default cap on Thought/Action cycles (default for run_react_loop's max_steps).
MAX_STEPS: int = 8

# Token budget passed as max_tokens to every chat-completion call.
MAX_TOKENS_PER_STEP: int = 3072

# Marker searched for (case-insensitively) in model output to detect completion.
FINAL_ANSWER_PREFIX: str = "FINAL ANSWER:"
| # --------------------------------------------------------------------------- | |
| # Per-template system prompt personas | |
| # Agents have access to pre-computed behavioral evaluation data from the | |
| # PBH Applied Systems quant_eval v7.21 evaluation series — NOT the | |
| # quant_eval harness itself, which is a separate proprietary system. | |
| # --------------------------------------------------------------------------- | |
| _TEMPLATE_PERSONAS = { | |
| "reasoning": ( | |
| "You are a production-grade reasoning and analysis agent built on evaluated, " | |
| "quantized open-weight models by PBH Applied Systems. You have deep expertise " | |
| "in multi-step logical inference, strategic analysis, data science, machine " | |
| "learning concepts, and auditable decision workflows. You produce transparent, " | |
| "well-structured chain-of-thought reasoning. You write complete, thorough " | |
| "responses — never truncated, never redirected when you can answer directly." | |
| ), | |
| "document": ( | |
| "You are a production-grade document intelligence agent built on evaluated, " | |
| "quantized open-weight models by PBH Applied Systems. You specialize in " | |
| "extracting, analyzing, summarizing, and structuring information from documents, " | |
| "contracts, policies, research papers, and any text-heavy input. You produce " | |
| "complete, well-organized responses with clear structure. You never truncate " | |
| "or redirect when you can answer directly from your knowledge." | |
| ), | |
| "code": ( | |
| "You are a production-grade code and automation agent built on evaluated, " | |
| "quantized open-weight models by PBH Applied Systems. You have deep expertise " | |
| "in Python, data engineering, ETL pipelines, Flask APIs, ML infrastructure, " | |
| "batch processing, and production-quality code generation. You write complete, " | |
| "working, well-commented code with full implementations — never stubs, " | |
| "never truncated examples, never redirected when you can answer directly." | |
| ), | |
| } | |
| _REACT_SYSTEM_PROMPT_TEMPLATE = """{persona} | |
| You also have access to pre-computed behavioral evaluation data from the PBH Applied Systems quant_eval v7.21 evaluation series, served through four lookup tools: | |
| {tools_section} | |
| BEHAVIOR RULES: | |
| - For general questions (coding, data science, ML, reasoning, writing, document analysis): answer directly and completely from your own knowledge. Do NOT force a tool call. Output FINAL ANSWER: with a full, useful response. | |
| - For questions about model scores, model selection, quantization tradeoffs, fixture families, or deployment recommendations: use the ReAct format — THOUGHT:, ACTION:, OBSERVATION:, then FINAL ANSWER: | |
| - ACTION format: ACTION: tool_name(argument) | |
| - Never invent tool outputs. Never fabricate scores or evaluation data. | |
| - Never give a short or unhelpful redirect when you can answer the question yourself. | |
| - Always produce complete responses. Never cut off mid-implementation. | |
| Begin.""" | |
def build_system_prompt(agent_template: str = "reasoning") -> str:
    """Render the system prompt for the given agent template.

    Args:
        agent_template: key into _TEMPLATE_PERSONAS; any unrecognised value
            falls back to the "reasoning" persona.

    Returns:
        The prompt template filled in with the persona text and the tool
        description produced by build_tool_prompt_section().
    """
    persona_key = agent_template if agent_template in _TEMPLATE_PERSONAS else "reasoning"
    persona_text = _TEMPLATE_PERSONAS[persona_key]
    tools_text = build_tool_prompt_section()
    return _REACT_SYSTEM_PROMPT_TEMPLATE.format(
        persona=persona_text,
        tools_section=tools_text,
    )
# EOS token patterns — required for Phi-4-reasoning-plus and Qwen series
_EOS_PATTERN = re.compile(r'<\|im_end\|>|<\|end\|>|<\|endoftext\|>', re.IGNORECASE)

# Think-block pattern — required for Qwen3.6-27B
_THINK_PATTERN = re.compile(r'<think>.*?</think>', re.DOTALL)
_THINK_OPEN_TAG = '<think>'
_THINK_CLOSE_TAG = '</think>'


def strip_output(text: str, strip_thinking: bool = False) -> str:
    """Strip EOS tokens and optionally <think> blocks.

    Args:
        text: raw model output. ``None`` or an empty string yields ``""``.
        strip_thinking: when True, remove reasoning wrapped in think tags.

    Returns:
        The cleaned text with surrounding whitespace removed.
    """
    if not text:
        # A completion may carry no content at all; treat it as empty output
        # instead of raising inside the regex call.
        return ""

    if strip_thinking:
        # 1. Well-formed <think>...</think> pairs.
        text = _THINK_PATTERN.sub('', text)

        # 2. A closing tag with no opening tag left: the opening tag was not
        #    part of the generated text, so everything before the closing tag
        #    is still reasoning.
        _, close_tag, after_close = text.rpartition(_THINK_CLOSE_TAG)
        if close_tag:
            text = after_close

        # 3. An opening tag that is never closed (generation stopped
        #    mid-thought): drop the unfinished reasoning.
        before_open, open_tag, _ = text.partition(_THINK_OPEN_TAG)
        if open_tag:
            text = before_open

    return _EOS_PATTERN.sub('', text).strip()
| _ACTION_RE = re.compile(r'ACTION:\s*(\w+)\(([^)]*)\)', re.IGNORECASE) | |
| def parse_action(text: str) -> tuple[str, str] | None: | |
| # Strip markdown bold markers and trailing punctuation before parsing | |
| cleaned = re.sub(r'\*+', '', text).strip() | |
| match = _ACTION_RE.search(cleaned) | |
| if match: | |
| return match.group(1).strip(), match.group(2).strip() | |
| # Fallback: ACTION: tool_name args (no parens) | |
| fallback = re.search(r'ACTION:\s*(\w+)\s*(.*)', cleaned, re.IGNORECASE) | |
| if fallback: | |
| tool = fallback.group(1).strip() | |
| args = re.sub(r'\*+', '', fallback.group(2)).strip() | |
| return tool, args | |
| return None | |
# Phrases that suggest the model wrote a complete answer without using the
# FINAL ANSWER: prefix. Matched against the lower-cased output.
_DIRECT_ANSWER_MARKERS = (
    "in conclusion", "therefore", "to summarize", "here is", "here's",
    "the above", "this implementation", "this pipeline", "this script",
)

# Outputs longer than this many characters are treated as a complete answer.
_DIRECT_ANSWER_MIN_CHARS = 400


def _nonblank_trace_lines(text: str) -> Generator[str, None, None]:
    """Yield every non-blank line of *text*, each terminated by a newline."""
    for line in text.strip().splitlines():
        if line.strip():
            yield f"{line}\n"


def _looks_like_direct_answer(output: str) -> bool:
    """Return True if *output* has a code fence, a conclusion phrase, or is long."""
    if '```' in output:
        return True
    lowered = output.lower()
    if any(marker in lowered for marker in _DIRECT_ANSWER_MARKERS):
        return True
    return len(output.strip()) > _DIRECT_ANSWER_MIN_CHARS


def run_react_loop(
    llm,
    user_query: str,
    model_key: str,
    agent_template: str = "reasoning",
    temperature: float = 0.2,
    max_steps: int = MAX_STEPS,
) -> Generator[str, None, None]:
    """
    Run the ReAct loop for a single model.

    Yields trace lines incrementally for Gradio streaming.

    Args:
        llm: loaded llama_cpp.Llama instance
        user_query: the user's prompt
        model_key: key from MODELS dict
        agent_template: one of 'reasoning', 'document', 'code'
        temperature: sampling temperature
        max_steps: maximum Thought/Action cycles

    Yields:
        Markdown-formatted trace fragments: step headers, the model's
        non-blank output lines, tool calls, observations and status notices.
        An LLM failure is reported as a trace line, not raised.
    """
    from eval_data import MODELS

    model_info = MODELS.get(model_key, {})
    short_name = model_info.get("short_name", model_key)
    thinking_mode = model_info.get("thinking_mode", False)

    messages = [
        {"role": "system", "content": build_system_prompt(agent_template)},
        {"role": "user", "content": user_query},
    ]

    yield f"**[{short_name}] Starting...**\n"

    for step in range(1, max_steps + 1):
        yield f"\n---\n**Step {step}**\n"

        try:
            response = llm.create_chat_completion(
                messages=messages,
                temperature=temperature,
                max_tokens=MAX_TOKENS_PER_STEP,
                stop=["OBSERVATION:"],
            )
            # Content may be None; normalise so the cleanup below cannot fail.
            raw = response["choices"][0]["message"]["content"] or ""
        except Exception as e:
            # Broad catch is deliberate: this is the streaming boundary and the
            # failure is surfaced to the user in the trace.
            yield f"\n⚠️ **LLM error on step {step}:** {e}\n"
            # Stop here. Falling through to the max-steps path would report a
            # step limit that was never hit and call the failing model again.
            return

        output = strip_output(raw, strip_thinking=thinking_mode)
        messages.append({"role": "assistant", "content": output})

        yield from _nonblank_trace_lines(output)

        if FINAL_ANSWER_PREFIX.upper() in output.upper():
            yield "\n✅ **Agent reached Final Answer.**\n"
            return

        action = parse_action(output)
        if action is None:
            # No tool call: either the model already answered directly, or it
            # needs a reminder of the expected output format.
            if _looks_like_direct_answer(output):
                yield "\n✅ **Agent concluded with direct response.**\n"
                return

            messages.append({
                "role": "user",
                "content": (
                    f"Available tools are: {', '.join(TOOL_REGISTRY.keys())}. "
                    f"If this query is a general coding, analysis, or writing task, output "
                    f"FINAL ANSWER: followed by your complete response. "
                    f"If this query is about evaluation data, call a tool with: ACTION: tool_name(args)"
                )
            })
            yield "\n⚠️ *No parseable ACTION. Nudging...*\n"
            continue

        tool_name, args_str = action
        yield f"\n🔧 **Tool:** `{tool_name}({args_str})`\n"

        observation = dispatch_tool(tool_name, args_str)
        yield f"\n📋 **Observation:**\n```\n{observation}\n```\n"

        # Feed the tool result back as a user turn so the next completion can
        # reason over it.
        messages.append({"role": "user", "content": f"OBSERVATION:\n{observation}"})

    # Max steps reached — force conclusion
    yield f"\n⚠️ **Max steps ({max_steps}) reached. Forcing final answer...**\n"
    messages.append({
        "role": "user",
        "content": f"You have used {max_steps} steps. Output your FINAL ANSWER: now."
    })

    try:
        final = llm.create_chat_completion(
            messages=messages,
            temperature=temperature,
            max_tokens=MAX_TOKENS_PER_STEP,
        )
        final_text = strip_output(
            final["choices"][0]["message"]["content"] or "",
            strip_thinking=thinking_mode,
        )
    except Exception as e:
        yield f"\n⚠️ **Error generating forced final answer:** {e}\n"
        return

    yield from _nonblank_trace_lines(final_text)