# clue-agent / app.py
# Update frontend description to reflect sequential agent flow (commit 2b36c7c)
import os
import re
import ast
import threading
import time
from typing import Annotated
import pandas as pd
import gradio as gr
from autogen import AssistantAgent, UserProxyAgent, GroupChat, GroupChatManager
# Puzzle dataset: each row carries a "question" column (full puzzle text) and a
# "ground_truth" column (stringified answer dict), read by solve_puzzle below.
DATA_FILE = "train_with_baselines.csv"
df = pd.read_csv(DATA_FILE)
# Dropdown labels, one per dataset row: "Puzzle 1", "Puzzle 2", ...
puzzle_choices = [f"Puzzle {i + 1}" for i in range(len(df))]
# System prompt for the first agent in the pipeline (Qwen 7B): pure extraction,
# no reasoning. Its rigid output format is what the Detective prompt refers to.
EVIDENCE_ANALYST_PROMPT = (
    "You are a data extractor. Read the puzzle and output EXACTLY this format:\n\n"
    "SUSPECTS: [comma-separated, exact spelling from puzzle]\n"
    "WEAPONS: [comma-separated, exact spelling]\n"
    "ROOMS: [numbered, e.g. 1=Hall;2=Study;3=Lounge;4=Kitchen;5=Ballroom;6=Dining Room]\n"
    "GRID: [e.g. 1|2|3;4|5|6]\n"
    "TIMES: [comma-separated, chronological]\n"
    "MOTIVES: [comma-separated]\n\n"
    "CLUES (copy each clue exactly, numbered):\n"
    "1. ...\n"
    "2. ...\n\n"
    "QUESTIONS: [copy the \"Fill out your final answers\" section word for word]\n\n"
    "Do NOT solve. Do NOT interpret. Just extract."
)
# System prompt for the reasoning agent (Llama 70B). Instructs it to call the
# registered get_room_adjacencies tool once, then work through a fixed
# deduction checklist and emit lettered FINAL ANSWERS that the Format_Agent
# (and the select_speaker routing in solve_puzzle) relies on.
DETECTIVE_PROMPT = (
    "You are a detective solving a murder mystery logic puzzle.\n\n"
    "FIRST: Call get_room_adjacencies with the GRID and ROOMS from above. "
    "Do this ONCE before reasoning.\n\n"
    "THEN solve:\n\n"
    "1. ANSWER FORMAT - Read the QUESTIONS section. Write what each letter "
    "(A, B, C...) asks for. This varies per puzzle!\n\n"
    "2. ABSOLUTE FACTS - These are CERTAIN, find them first:\n"
    ' - "killed with [X]" or "murdered with [X]" \u2192 weapon is DEFINITELY X\n'
    ' - "murdered at [time]" \u2192 time is DEFINITELY that time\n'
    ' - "[Person] is motivated by [motive]" \u2192 DEFINITE motive for that person\n'
    " - Motives are usually assigned directly in clues. Search for EVERY "
    '"motivated by" statement and list them ALL.\n'
    " - If a motive is not directly stated for the murderer, use elimination: "
    "assign all stated motives to their people, the remaining motive belongs "
    "to the remaining person.\n\n"
    "3. SPATIAL CLUES - Use adjacency results to resolve EVERY directional clue:\n"
    ' - "just north of X" \u2192 look up north neighbor of X\n'
    ' - "just east of X" \u2192 look up east neighbor of X\n'
    ' - Write: "[Person] was in [RESOLVED ROOM NAME] at [Time]"\n\n'
    "4. TIMELINE - Build a table tracking EVERY person's position at EVERY "
    "time step.\n"
    " - Start with directly stated positions\n"
    " - Movement rule: each person can only STAY or move to ONE ADJACENT "
    "room per time step\n"
    " - Fill gaps using movement constraints\n"
    " - Track weapons too - they move when carried by someone\n\n"
    "5. MURDER CONDITIONS:\n"
    " - Murderer + Mr. Boddy + weapon must be in the SAME room\n"
    " - They must be ALONE (no other suspects in that room)\n"
    ' - Use "the murderer was in [room] at [time]" clues to ELIMINATE '
    "suspects who cannot be there\n"
    " - Check: can each remaining suspect physically reach the required "
    "rooms via adjacency?\n\n"
    "6. FINAL CHECK:\n"
    " - Every answer MUST come from the puzzle's exact lists\n"
    " - Suspect names must match exactly (Mrs. White, not Ms. White)\n"
    " - Room names must match exactly (Billiard Room, not Billard Room)\n"
    " - Times must match format exactly (11:00 PM, not 11pm)\n"
    " - You MUST provide an answer for EVERY letter. Never leave one blank.\n\n"
    "FINAL ANSWERS:\n"
    "A. [answer]\n"
    "B. [answer]\n"
    "C. [answer]\n"
    "D. [answer]\n"
    "E. [answer]\n"
    "F. [answer]\n"
    "G. [answer if question exists]\n"
    "H. [answer if question exists]"
)
# System prompt for the final agent (Qwen 7B): distill the conversation into a
# single Python dict literal. That dict is detected by _is_answer_dict (chat
# termination) and parsed by extract_answer for scoring.
FORMAT_AGENT_PROMPT = (
    "Find the final answers from the conversation. Look for lines starting "
    "with A., B., C., etc.\n\n"
    "If the LAST message has final answers, use those.\n"
    "If the LAST message has no answers, search ALL previous messages for "
    "the most recent set of A., B., C. answers.\n\n"
    "Output ONLY a Python dictionary:\n"
    "{'A': 'Mrs. White', 'B': 'Knife', 'C': 'Ballroom', 'D': '11:00 PM', "
    "'E': 'Revenge', 'F': 'Ballroom', 'G': 'Study'}\n\n"
    "Rules:\n"
    "- Copy values EXACTLY as written (spelling, capitalization)\n"
    "- Include ALL answer keys (A through F minimum, G and H if they exist)\n"
    '- NEVER output "?", "Unknown", "Not specified", "NaN", or placeholder words\n'
    '- If an answer says "?" or is missing, use your best guess from the '
    "conversation context\n"
    "- Output ONLY the dictionary, nothing else"
)
def get_room_adjacencies(
    grid_rows: Annotated[
        str,
        "Grid rows separated by semicolons. Each row has room numbers "
        "separated by pipes. Use - for empty cells. "
        "Example: '1|2|3;4|5|6' or '01|02|03|04;05|06|07|08'",
    ],
    room_names: Annotated[
        str,
        "Room number to name mapping, semicolon-separated. "
        "Example: '1=Carriage House;2=Billiard Room;3=Lounge'",
    ],
) -> str:
    """Compute the cardinal neighbours of every room in a rectangular grid.

    Parses ``room_names`` into a number->name map (each number is indexed
    both as written and with leading zeros stripped, so '01' and '1' both
    resolve), then walks the grid and reports, for every non-empty cell,
    which named room lies to its north, south, west and east.

    Returns a human-readable report, e.g.::

        ROOM ADJACENCIES:
        Hall: south=Lounge, east=Study
        ...
    """
    name_map: dict[str, str] = {}
    for pair in room_names.split(";"):
        pair = pair.strip()
        if "=" in pair:
            num, name = pair.split("=", 1)
            name_map[num.strip()] = name.strip()
            # Also index with leading zeros stripped so '01' matches '1=...'.
            name_map[num.strip().lstrip("0") or "0"] = name.strip()

    grid: list[list[str]] = []
    for row_str in grid_rows.split(";"):
        row_str = row_str.strip()
        if row_str:
            grid.append([c.strip() for c in row_str.split("|")])

    def resolve(cell: str, fallback: str) -> str:
        # Look up as written first, then with leading zeros stripped.
        key = cell.lstrip("0") or "0"
        return name_map.get(cell, name_map.get(key, fallback))

    # (label, row offset, col offset); tuple order fixes the output order.
    directions = (("north", -1, 0), ("south", 1, 0), ("west", 0, -1), ("east", 0, 1))

    result_lines = []
    for r, row in enumerate(grid):
        for c, cell in enumerate(row):
            if cell == "-" or cell == "":
                continue  # empty grid cell, not a room
            adjacents = []
            for label, dr, dc in directions:
                nr, nc = r + dr, c + dc
                # Rows may be ragged, so bound-check the column per row.
                if 0 <= nr < len(grid) and 0 <= nc < len(grid[nr]):
                    neighbour = grid[nr][nc]
                    if neighbour != "-" and neighbour != "":
                        adjacents.append(f"{label}={resolve(neighbour, neighbour)}")
            adj_str = ", ".join(adjacents) if adjacents else "no adjacent rooms"
            result_lines.append(f"{resolve(cell, f'Room {cell}')}: {adj_str}")
    return "ROOM ADJACENCIES:\n" + "\n".join(result_lines)
def extract_answer(text: str) -> dict | None:
"""Extract answer dictionary from agent output text."""
pattern = r"\{[^{}]*['\"]A['\"]\s*:"
for match in re.finditer(pattern, text):
start = match.start()
depth = 0
for i in range(start, len(text)):
if text[i] == "{":
depth += 1
elif text[i] == "}":
depth -= 1
if depth == 0:
try:
return ast.literal_eval(text[start : i + 1])
except (ValueError, SyntaxError):
break
return None
def build_chat_messages(groupchat: GroupChat) -> list[dict]:
    """Convert AG2 GroupChat messages to Gradio chatbot format."""
    rendered = []
    for entry in groupchat.messages:
        speaker = entry.get("name", entry.get("role", "System"))
        body = entry.get("content", "") or ""
        stripped = body.strip()
        # Drop empty / None / tool-call-only messages.
        if not stripped or stripped == "None":
            continue
        rendered.append({"role": "assistant", "content": f"**{speaker}:**\n\n{body}"})
    return rendered
def solve_puzzle(puzzle_idx: str):
    """Run the multi-agent pipeline on the selected puzzle (streams live).

    Generator used as a Gradio streaming callback. Yields tuples of
    (chat messages, results DataFrame, accuracy text): intermediate yields
    stream the agent conversation as it grows; the final yield adds the
    per-letter prediction-vs-ground-truth table and an accuracy summary.

    Pipeline: Evidence_Analyst extracts puzzle data, Detective reasons
    (calling the get_room_adjacencies tool), Format_Agent emits the answer
    dict that terminates the group chat.
    """
    api_key = os.environ.get("NIM_API_KEY")
    if not api_key:
        yield (
            [
                {
                    "role": "assistant",
                    "content": (
                        "**Error:** `NIM_API_KEY` not set. "
                        "Add it as a Space secret (Settings > Secrets)."
                    ),
                }
            ],
            pd.DataFrame(),
            "No API key",
        )
        return
    # "Puzzle N" -> zero-based row index into the dataset.
    idx = int(puzzle_idx.split(" ")[1]) - 1
    question = df.iloc[idx]["question"]
    # ground_truth is stored as a stringified dict, e.g. "{'A': 'Knife', ...}".
    ground_truth = ast.literal_eval(df.iloc[idx]["ground_truth"])
    yield [{"role": "assistant", "content": "Setting up agents..."}], pd.DataFrame(), "Starting..."
    # Two model endpoints on NVIDIA NIM: a small model for extraction and
    # formatting, a large one for the reasoning-heavy Detective.
    qwen_config = {
        "config_list": [
            {
                "model": "qwen/qwen2.5-7b-instruct",
                "api_key": api_key,
                "base_url": "https://integrate.api.nvidia.com/v1",
                "temperature": 0,
                "max_tokens": 1024,
            }
        ],
    }
    llama_config = {
        "config_list": [
            {
                "model": "meta/llama-3.3-70b-instruct",
                "api_key": api_key,
                "base_url": "https://integrate.api.nvidia.com/v1",
                "temperature": 0,
            }
        ],
    }
    evidence_analyst = AssistantAgent(
        name="Evidence_Analyst",
        system_message=EVIDENCE_ANALYST_PROMPT,
        llm_config=qwen_config,
    )
    detective = AssistantAgent(
        name="Detective",
        system_message=DETECTIVE_PROMPT,
        llm_config=llama_config,
        max_consecutive_auto_reply=3,
    )
    format_agent = AssistantAgent(
        name="Format_Agent",
        system_message=FORMAT_AGENT_PROMPT,
        llm_config=qwen_config,
    )
    user_proxy = UserProxyAgent(
        name="User",
        human_input_mode="NEVER",
        max_consecutive_auto_reply=0,
        default_auto_reply="exit",
        code_execution_config=False,
    )
    # Register tool: Detective calls and executes it (must be in GroupChat)
    # Use the exact name get_room_adjacencies (no underscore) so LLM + executor match
    _tool_impl = get_room_adjacencies
    detective.register_for_llm(
        description=(
            "Computes room adjacencies from a grid layout. Takes grid_rows "
            "(room numbers in grid format, rows separated by semicolons, "
            "cells by pipes, e.g. '1|2|3;4|5|6') and room_names "
            "(number-to-name mapping, semicolon-separated, e.g. "
            "'1=Hall;2=Study;3=Lounge'). Returns which rooms are north, "
            "south, east, west of each other."
        )
    )(_tool_impl)
    detective.register_for_execution()(_tool_impl)
    def select_speaker(last_speaker, groupchat):
        """EA → Detective (stays until reasoning done) → Format Agent."""
        if not groupchat.messages:
            return evidence_analyst
        last_content = (groupchat.messages[-1].get("content", "") or "").strip()
        # After EA → go to Detective
        if last_speaker == evidence_analyst:
            return detective
        # After Detective → check if it finished reasoning
        if last_speaker == detective:
            # Tool call (None) or tool result → keep Detective
            if (
                not last_content
                or last_content == "None"
                or "ROOM ADJACENCIES" in last_content
            ):
                return detective
            # Detective has reasoned (has lettered answers) → Format Agent
            return format_agent
        # After Format Agent → done (termination catches the dict)
        return evidence_analyst
    groupchat = GroupChat(
        agents=[evidence_analyst, detective, format_agent],
        messages=[],
        max_round=12,
        speaker_selection_method=select_speaker,
        send_introductions=False,
        enable_clear_history=False,
    )
    def _is_answer_dict(msg):
        """Stop the chat once an answer dict like {'A': '...'} appears."""
        content = msg.get("content", "") or ""
        return bool(re.search(r"\{['\"]A['\"]\s*:\s*['\"]", content))
    manager = GroupChatManager(
        groupchat=groupchat,
        llm_config=qwen_config,
        is_termination_msg=_is_answer_dict,
    )
    # Run chat in a background thread so we can stream messages live
    error_holder: dict = {}
    def _run_chat():
        try:
            user_proxy.initiate_chat(manager, message=question)
        except Exception as e:
            error_holder["error"] = str(e)
    thread = threading.Thread(target=_run_chat)
    thread.start()
    # Stream messages as they appear (poll every 0.5 s, yield only on growth).
    seen = 0
    while thread.is_alive():
        msgs = list(groupchat.messages)
        if len(msgs) > seen:
            seen = len(msgs)
            yield build_chat_messages(groupchat), pd.DataFrame(), "Agents working..."
        time.sleep(0.5)
    thread.join()
    # Final messages
    chat_messages = build_chat_messages(groupchat)
    if "error" in error_holder:
        chat_messages.append(
            {"role": "assistant", "content": f"**Pipeline error:** {error_holder['error']}"}
        )
        yield chat_messages, pd.DataFrame(), "Error"
        return
    # Extract prediction from the last message (Format Agent's output)
    prediction = None
    for msg in reversed(groupchat.messages):
        # Tool-call messages can carry content=None; normalize before parsing
        # (extract_answer would raise on None).
        content = msg.get("content", "") or ""
        prediction = extract_answer(content)
        if prediction:
            break
    # Score each lettered answer against the ground truth.
    rows = []
    correct = 0
    total = 0
    for key in sorted(ground_truth.keys()):
        gt_val = ground_truth[key]
        pred_val = prediction.get(key, "\u2014") if prediction else "\u2014"
        match = (
            pred_val.strip() == gt_val.strip()
            if prediction and pred_val != "\u2014"
            else False
        )
        if match:
            correct += 1
        total += 1
        rows.append(
            {
                "Letter": key,
                "Predicted": pred_val,
                "Ground Truth": gt_val,
                "Match": "\u2713" if match else "\u2717",
            }
        )
    results_df = pd.DataFrame(rows)
    # Guard against an empty ground-truth dict (avoids ZeroDivisionError).
    accuracy = (
        f"{correct}/{total} ({100 * correct / total:.1f}%)" if total else "0/0 (n/a)"
    )
    yield chat_messages, results_df, accuracy
# Gradio UI: puzzle picker + Solve button on top; streamed agent conversation
# on the left, accuracy summary and per-letter results table on the right.
with gr.Blocks(title="Temporal Clue Multi-Agent Solver") as demo:
    gr.Markdown(
        "# Temporal Clue Multi-Agent Solver\n\n"
        "A multi-agent system that solves murder mystery logic puzzles from the "
        "[Temporal Clue](https://github.com/bradhilton/temporal-clue) benchmark. "
        "Three agents collaborate in sequence:\n\n"
        "1. **Evidence Analyst** (Qwen 7B) &mdash; extracts structured data from the puzzle\n"
        "2. **Detective** (Llama 70B + spatial tool) &mdash; reasons through clues and deduces answers\n"
        "3. **Format Agent** (Qwen 7B) &mdash; extracts the final answer dictionary\n\n"
        "Select a puzzle and click **Solve** to watch the agents work in real time."
    )
    with gr.Row():
        puzzle_dropdown = gr.Dropdown(
            choices=puzzle_choices,
            value=puzzle_choices[0],
            label="Select Puzzle",
        )
        solve_btn = gr.Button("Solve", variant="primary", scale=0)
    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(
                label="Agent Conversation",
                height=500,
            )
        with gr.Column(scale=1):
            accuracy_display = gr.Textbox(label="Accuracy", interactive=False)
            results_table = gr.Dataframe(label="Results")
    # solve_puzzle is a generator, so Gradio streams each yielded tuple into
    # the three outputs; concurrency_limit=1 serializes pipeline runs.
    solve_btn.click(
        fn=solve_puzzle,
        inputs=[puzzle_dropdown],
        outputs=[chatbot, results_table, accuracy_display],
        concurrency_limit=1,
    )
if __name__ == "__main__":
    demo.launch()