Spaces:

pbhappliedsystems
/

quant-eval-agent-arena

Running on Zero

App Files Files Community

quant-eval-agent-arena / react_engine.py

pbhappliedsystems

Update react_engine.py

cd12454 verified 26 days ago

raw

history blame contribute delete

9.35 kB

	# react_engine.py
	# PBH Applied Systems — Lightweight custom ReAct loop.
	# Thought → Action → Observation cycles with full trace streaming.
	# No LlamaIndex dependency. Llama instance is passed in from model_loader.

	import re
	from typing import Generator
	from tools import dispatch_tool, build_tool_prompt_section, TOOL_REGISTRY

	# Default cap on Thought/Action cycles; used as the default for
	# run_react_loop's max_steps parameter.
	MAX_STEPS = 8
	# Passed as max_tokens to every llm.create_chat_completion call in this module.
	MAX_TOKENS_PER_STEP = 3072
	# Marker searched for (case-insensitively) in model output; when it is present
	# run_react_loop stops iterating.
	FINAL_ANSWER_PREFIX = "FINAL ANSWER:"

	# ---------------------------------------------------------------------------
	# Per-template system prompt personas
	# Agents have access to pre-computed behavioral evaluation data from the
	# PBH Applied Systems quant_eval v7.21 evaluation series — NOT the
	# quant_eval harness itself, which is a separate proprietary system.
	# ---------------------------------------------------------------------------

	# Persona paragraph for each agent template, keyed by template name.
	# build_system_prompt looks the key up here and falls back to the
	# "reasoning" entry when the requested template is unknown.
	_TEMPLATE_PERSONAS = {
	"reasoning": (
	"You are a production-grade reasoning and analysis agent built on evaluated, "
	"quantized open-weight models by PBH Applied Systems. You have deep expertise "
	"in multi-step logical inference, strategic analysis, data science, machine "
	"learning concepts, and auditable decision workflows. You produce transparent, "
	"well-structured chain-of-thought reasoning. You write complete, thorough "
	"responses — never truncated, never redirected when you can answer directly."
	),
	"document": (
	"You are a production-grade document intelligence agent built on evaluated, "
	"quantized open-weight models by PBH Applied Systems. You specialize in "
	"extracting, analyzing, summarizing, and structuring information from documents, "
	"contracts, policies, research papers, and any text-heavy input. You produce "
	"complete, well-organized responses with clear structure. You never truncate "
	"or redirect when you can answer directly from your knowledge."
	),
	"code": (
	"You are a production-grade code and automation agent built on evaluated, "
	"quantized open-weight models by PBH Applied Systems. You have deep expertise "
	"in Python, data engineering, ETL pipelines, Flask APIs, ML infrastructure, "
	"batch processing, and production-quality code generation. You write complete, "
	"working, well-commented code with full implementations — never stubs, "
	"never truncated examples, never redirected when you can answer directly."
	),
	}

	# System prompt skeleton. build_system_prompt fills the two str.format
	# placeholders: {persona} (an entry of _TEMPLATE_PERSONAS) and
	# {tools_section} (the text returned by build_tool_prompt_section()).
	_REACT_SYSTEM_PROMPT_TEMPLATE = """{persona}

	You also have access to pre-computed behavioral evaluation data from the PBH Applied Systems quant_eval v7.21 evaluation series, served through four lookup tools:

	{tools_section}

	BEHAVIOR RULES:
	- For general questions (coding, data science, ML, reasoning, writing, document analysis): answer directly and completely from your own knowledge. Do NOT force a tool call. Output FINAL ANSWER: with a full, useful response.
	- For questions about model scores, model selection, quantization tradeoffs, fixture families, or deployment recommendations: use the ReAct format — THOUGHT:, ACTION:, OBSERVATION:, then FINAL ANSWER:
	- ACTION format: ACTION: tool_name(argument)
	- Never invent tool outputs. Never fabricate scores or evaluation data.
	- Never give a short or unhelpful redirect when you can answer the question yourself.
	- Always produce complete responses. Never cut off mid-implementation.

	Begin."""


	def build_system_prompt(agent_template: str = "reasoning") -> str:
	    """Assemble the system prompt for one agent template.

	    Args:
	        agent_template: key into _TEMPLATE_PERSONAS; any unknown key
	            falls back to the "reasoning" persona.

	    Returns:
	        _REACT_SYSTEM_PROMPT_TEMPLATE with the persona text and the tool
	        section from build_tool_prompt_section() substituted in.
	    """
	    fallback_persona = _TEMPLATE_PERSONAS["reasoning"]
	    chosen_persona = _TEMPLATE_PERSONAS.get(agent_template, fallback_persona)
	    tools_text = build_tool_prompt_section()
	    return _REACT_SYSTEM_PROMPT_TEMPLATE.format(
	        persona=chosen_persona,
	        tools_section=tools_text,
	    )


	# EOS token patterns — required for Phi-4-reasoning-plus and Qwen series.
	# Each alternative is one complete end-of-sequence marker. The pipes inside a
	# marker are escaped so they match literally; only the unescaped pipes between
	# markers act as regex alternation. (Getting this wrong makes the pattern
	# match fragments such as a lone ">" and corrupts ordinary output.)
	_EOS_PATTERN = re.compile(
	    r'<\|im_end\|>|<\|end\|>|<\|endoftext\|>',
	    re.IGNORECASE,
	)

	# Think-block pattern — required for Qwen3.6-27B.
	# Non-greedy with DOTALL so each <think>...</think> pair, including
	# multi-line ones, is matched separately.
	_THINK_PATTERN = re.compile(r'<think>.*?</think>', re.DOTALL)


	def strip_output(text: str, strip_thinking: bool = False) -> str:
	    """Strip EOS tokens and optionally <think> blocks.

	    Args:
	        text: raw model output.
	        strip_thinking: when True, remove every closed <think>...</think>
	            block before removing EOS markers.

	    Returns:
	        The cleaned text with leading/trailing whitespace removed.
	    """
	    if strip_thinking:
	        text = _THINK_PATTERN.sub('', text)
	    return _EOS_PATTERN.sub('', text).strip()


	_ACTION_RE = re.compile(r'ACTION:\s(\w+)\(([^)])\)', re.IGNORECASE)


	def parse_action(text: str) -> tuple[str, str] \| None:
	# Strip markdown bold markers and trailing punctuation before parsing
	cleaned = re.sub(r'\*+', '', text).strip()
	match = _ACTION_RE.search(cleaned)
	if match:
	return match.group(1).strip(), match.group(2).strip()
	# Fallback: ACTION: tool_name args (no parens)
	fallback = re.search(r'ACTION:\s(\w+)\s(.*)', cleaned, re.IGNORECASE)
	if fallback:
	tool = fallback.group(1).strip()
	args = re.sub(r'\*+', '', fallback.group(2)).strip()
	return tool, args
	return None


	# Phrases whose presence suggests the model already wrote a complete answer.
	_DIRECT_ANSWER_MARKERS = (
	    "in conclusion", "therefore", "to summarize", "here is", "here's",
	    "the above", "this implementation", "this pipeline", "this script",
	)
	# Outputs longer than this many characters are treated as a direct answer.
	_DIRECT_ANSWER_MIN_CHARS = 400


	def _looks_like_direct_answer(output: str) -> bool:
	    """Return True if output has a code fence, a conclusion phrase, or is long."""
	    if '```' in output:
	        return True
	    lowered = output.lower()
	    if any(marker in lowered for marker in _DIRECT_ANSWER_MARKERS):
	        return True
	    return len(output.strip()) > _DIRECT_ANSWER_MIN_CHARS


	def _iter_trace_lines(text: str) -> Generator[str, None, None]:
	    """Yield every non-blank line of text, each terminated with a newline."""
	    for line in text.strip().splitlines():
	        if line.strip():
	            yield f"{line}\n"


	def run_react_loop(
	    llm,
	    user_query: str,
	    model_key: str,
	    agent_template: str = "reasoning",
	    temperature: float = 0.2,
	    max_steps: int = MAX_STEPS,
	) -> Generator[str, None, None]:
	    """
	    Run the ReAct loop for a single model.
	    Yields trace lines incrementally for Gradio streaming.

	    Each step asks the model for output, echoes it to the trace, and then:
	    stops if the output contains FINAL ANSWER:, stops if no ACTION was found
	    but the output looks like a complete direct answer, nudges the model if
	    no ACTION was found otherwise, or dispatches the tool and feeds the
	    observation back. If max_steps elapse without a conclusion, one extra
	    completion is requested to force a final answer. An LLM error ends the
	    trace immediately after the error line.

	    Args:
	        llm: loaded llama_cpp.Llama instance
	        user_query: the user's prompt
	        model_key: key from MODELS dict
	        agent_template: one of 'reasoning', 'document', 'code'
	        temperature: sampling temperature
	        max_steps: maximum Thought/Action cycles

	    Yields:
	        Trace text fragments (status lines, model output lines, tool calls
	        and observations), each ending in a newline.
	    """
	    from eval_data import MODELS

	    model_info = MODELS.get(model_key, {})
	    short_name = model_info.get("short_name", model_key)
	    thinking_mode = model_info.get("thinking_mode", False)

	    messages = [
	        {"role": "system", "content": build_system_prompt(agent_template)},
	        {"role": "user", "content": user_query},
	    ]

	    yield f"[{short_name}] Starting...\n"

	    for step in range(1, max_steps + 1):
	        yield f"\n---\nStep {step}\n"

	        try:
	            response = llm.create_chat_completion(
	                messages=messages,
	                temperature=temperature,
	                max_tokens=MAX_TOKENS_PER_STEP,
	                # Stop before the model writes its own OBSERVATION; real
	                # observations are appended below from dispatch_tool.
	                stop=["OBSERVATION:"],
	            )
	            raw = response["choices"][0]["message"]["content"]
	        except Exception as e:
	            yield f"\n⚠️ LLM error on step {step}: {e}\n"
	            # Return rather than break: falling through to the code after
	            # the loop would wrongly report "Max steps reached" and call
	            # the failing LLM again.
	            return

	        # The message content may be None; treat that as empty output.
	        output = strip_output(raw or "", strip_thinking=thinking_mode)
	        messages.append({"role": "assistant", "content": output})

	        yield from _iter_trace_lines(output)

	        if FINAL_ANSWER_PREFIX.upper() in output.upper():
	            yield "\n✅ Agent reached Final Answer.\n"
	            return

	        action = parse_action(output)
	        if action is None:
	            # The model may have answered directly without the FINAL ANSWER:
	            # marker; accept a substantive-looking response as the answer.
	            if _looks_like_direct_answer(output):
	                yield "\n✅ Agent concluded with direct response.\n"
	                return

	            messages.append({
	                "role": "user",
	                "content": (
	                    f"Available tools are: {', '.join(TOOL_REGISTRY.keys())}. "
	                    f"If this query is a general coding, analysis, or writing task, output "
	                    f"FINAL ANSWER: followed by your complete response. "
	                    f"If this query is about evaluation data, call a tool with: ACTION: tool_name(args)"
	                ),
	            })
	            yield "\n⚠️ No parseable ACTION. Nudging...\n"
	            continue

	        tool_name, args_str = action
	        yield f"\n🔧 Tool: `{tool_name}({args_str})`\n"

	        observation = dispatch_tool(tool_name, args_str)
	        yield f"\n📋 Observation:\n```\n{observation}\n```\n"

	        messages.append({"role": "user", "content": f"OBSERVATION:\n{observation}"})

	    # Max steps reached — force conclusion
	    yield f"\n⚠️ Max steps ({max_steps}) reached. Forcing final answer...\n"
	    messages.append({
	        "role": "user",
	        "content": f"You have used {max_steps} steps. Output your FINAL ANSWER: now.",
	    })
	    try:
	        final = llm.create_chat_completion(
	            messages=messages,
	            temperature=temperature,
	            max_tokens=MAX_TOKENS_PER_STEP,
	        )
	        final_text = strip_output(
	            final["choices"][0]["message"]["content"] or "",
	            strip_thinking=thinking_mode,
	        )
	        yield from _iter_trace_lines(final_text)
	    except Exception as e:
	        yield f"\n⚠️ Error generating forced final answer: {e}\n"