# Hugging Face Spaces page header artifact ("Spaces: Sleeping") — status
# chrome from the hosting page, not part of the application code.
| import os | |
| import re | |
| import ast | |
| import threading | |
| import time | |
| from typing import Annotated | |
| import pandas as pd | |
| import gradio as gr | |
| from autogen import AssistantAgent, UserProxyAgent, GroupChat, GroupChatManager | |
# Benchmark data: one row per puzzle, with a "question" column (puzzle text)
# and a "ground_truth" column (a stringified dict of lettered answers,
# parsed later with ast.literal_eval).
DATA_FILE = "train_with_baselines.csv"
df = pd.read_csv(DATA_FILE)  # loaded once at import time; solve_puzzle indexes rows
# 1-based dropdown labels, one per dataframe row ("Puzzle 1", "Puzzle 2", ...).
puzzle_choices = [f"Puzzle {i + 1}" for i in range(len(df))]
# System prompt for the first agent (Qwen 7B): pure extraction, no solving.
# Forces a rigid, machine-readable layout that the Detective's reasoning
# steps (and the get_room_adjacencies tool arguments) can rely on.
EVIDENCE_ANALYST_PROMPT = (
    "You are a data extractor. Read the puzzle and output EXACTLY this format:\n\n"
    "SUSPECTS: [comma-separated, exact spelling from puzzle]\n"
    "WEAPONS: [comma-separated, exact spelling]\n"
    "ROOMS: [numbered, e.g. 1=Hall;2=Study;3=Lounge;4=Kitchen;5=Ballroom;6=Dining Room]\n"
    "GRID: [e.g. 1|2|3;4|5|6]\n"
    "TIMES: [comma-separated, chronological]\n"
    "MOTIVES: [comma-separated]\n\n"
    "CLUES (copy each clue exactly, numbered):\n"
    "1. ...\n"
    "2. ...\n\n"
    "QUESTIONS: [copy the \"Fill out your final answers\" section word for word]\n\n"
    "Do NOT solve. Do NOT interpret. Just extract."
)
# System prompt for the reasoning agent (Llama 70B). Walks the model through
# a fixed procedure: call the adjacency tool once, then absolute facts ->
# spatial resolution -> timeline -> murder conditions -> lettered answers.
# The final "A. ... / B. ..." block is what the Format Agent later parses.
DETECTIVE_PROMPT = (
    "You are a detective solving a murder mystery logic puzzle.\n\n"
    "FIRST: Call get_room_adjacencies with the GRID and ROOMS from above. "
    "Do this ONCE before reasoning.\n\n"
    "THEN solve:\n\n"
    "1. ANSWER FORMAT - Read the QUESTIONS section. Write what each letter "
    "(A, B, C...) asks for. This varies per puzzle!\n\n"
    "2. ABSOLUTE FACTS - These are CERTAIN, find them first:\n"
    ' - "killed with [X]" or "murdered with [X]" \u2192 weapon is DEFINITELY X\n'
    ' - "murdered at [time]" \u2192 time is DEFINITELY that time\n'
    ' - "[Person] is motivated by [motive]" \u2192 DEFINITE motive for that person\n'
    " - Motives are usually assigned directly in clues. Search for EVERY "
    '"motivated by" statement and list them ALL.\n'
    " - If a motive is not directly stated for the murderer, use elimination: "
    "assign all stated motives to their people, the remaining motive belongs "
    "to the remaining person.\n\n"
    "3. SPATIAL CLUES - Use adjacency results to resolve EVERY directional clue:\n"
    ' - "just north of X" \u2192 look up north neighbor of X\n'
    ' - "just east of X" \u2192 look up east neighbor of X\n'
    ' - Write: "[Person] was in [RESOLVED ROOM NAME] at [Time]"\n\n'
    "4. TIMELINE - Build a table tracking EVERY person's position at EVERY "
    "time step.\n"
    " - Start with directly stated positions\n"
    " - Movement rule: each person can only STAY or move to ONE ADJACENT "
    "room per time step\n"
    " - Fill gaps using movement constraints\n"
    " - Track weapons too - they move when carried by someone\n\n"
    "5. MURDER CONDITIONS:\n"
    " - Murderer + Mr. Boddy + weapon must be in the SAME room\n"
    " - They must be ALONE (no other suspects in that room)\n"
    ' - Use "the murderer was in [room] at [time]" clues to ELIMINATE '
    "suspects who cannot be there\n"
    " - Check: can each remaining suspect physically reach the required "
    "rooms via adjacency?\n\n"
    "6. FINAL CHECK:\n"
    " - Every answer MUST come from the puzzle's exact lists\n"
    " - Suspect names must match exactly (Mrs. White, not Ms. White)\n"
    " - Room names must match exactly (Billiard Room, not Billard Room)\n"
    " - Times must match format exactly (11:00 PM, not 11pm)\n"
    " - You MUST provide an answer for EVERY letter. Never leave one blank.\n\n"
    "FINAL ANSWERS:\n"
    "A. [answer]\n"
    "B. [answer]\n"
    "C. [answer]\n"
    "D. [answer]\n"
    "E. [answer]\n"
    "F. [answer]\n"
    "G. [answer if question exists]\n"
    "H. [answer if question exists]"
)
# System prompt for the final agent (Qwen 7B): scrape the lettered answers
# out of the transcript and emit them as a single Python dict literal —
# the exact shape that extract_answer parses with ast.literal_eval.
FORMAT_AGENT_PROMPT = (
    "Find the final answers from the conversation. Look for lines starting "
    "with A., B., C., etc.\n\n"
    "If the LAST message has final answers, use those.\n"
    "If the LAST message has no answers, search ALL previous messages for "
    "the most recent set of A., B., C. answers.\n\n"
    "Output ONLY a Python dictionary:\n"
    "{'A': 'Mrs. White', 'B': 'Knife', 'C': 'Ballroom', 'D': '11:00 PM', "
    "'E': 'Revenge', 'F': 'Ballroom', 'G': 'Study'}\n\n"
    "Rules:\n"
    "- Copy values EXACTLY as written (spelling, capitalization)\n"
    "- Include ALL answer keys (A through F minimum, G and H if they exist)\n"
    '- NEVER output "?", "Unknown", "Not specified", "NaN", or placeholder words\n'
    '- If an answer says "?" or is missing, use your best guess from the '
    "conversation context\n"
    "- Output ONLY the dictionary, nothing else"
)
| def get_room_adjacencies( | |
| grid_rows: Annotated[ | |
| str, | |
| "Grid rows separated by semicolons. Each row has room numbers " | |
| "separated by pipes. Use - for empty cells. " | |
| "Example: '1|2|3;4|5|6' or '01|02|03|04;05|06|07|08'", | |
| ], | |
| room_names: Annotated[ | |
| str, | |
| "Room number to name mapping, semicolon-separated. " | |
| "Example: '1=Carriage House;2=Billiard Room;3=Lounge'", | |
| ], | |
| ) -> str: | |
| name_map = {} | |
| for pair in room_names.split(";"): | |
| pair = pair.strip() | |
| if "=" in pair: | |
| num, name = pair.split("=", 1) | |
| name_map[num.strip()] = name.strip() | |
| name_map[num.strip().lstrip("0") or "0"] = name.strip() | |
| grid = [] | |
| for row_str in grid_rows.split(";"): | |
| row_str = row_str.strip() | |
| if row_str: | |
| cells = [c.strip() for c in row_str.split("|")] | |
| grid.append(cells) | |
| result_lines = [] | |
| for r in range(len(grid)): | |
| for c in range(len(grid[r])): | |
| cell = grid[r][c] | |
| if cell == "-" or cell == "": | |
| continue | |
| cell_key = cell.lstrip("0") or "0" | |
| room_name = name_map.get(cell, name_map.get(cell_key, f"Room {cell}")) | |
| adjacents = [] | |
| if r > 0 and c < len(grid[r - 1]): | |
| nc = grid[r - 1][c] | |
| if nc != "-" and nc != "": | |
| nc_key = nc.lstrip("0") or "0" | |
| adjacents.append( | |
| f"north={name_map.get(nc, name_map.get(nc_key, nc))}" | |
| ) | |
| if r + 1 < len(grid) and c < len(grid[r + 1]): | |
| sc = grid[r + 1][c] | |
| if sc != "-" and sc != "": | |
| sc_key = sc.lstrip("0") or "0" | |
| adjacents.append( | |
| f"south={name_map.get(sc, name_map.get(sc_key, sc))}" | |
| ) | |
| if c > 0: | |
| wc = grid[r][c - 1] | |
| if wc != "-" and wc != "": | |
| wc_key = wc.lstrip("0") or "0" | |
| adjacents.append( | |
| f"west={name_map.get(wc, name_map.get(wc_key, wc))}" | |
| ) | |
| if c + 1 < len(grid[r]): | |
| ec = grid[r][c + 1] | |
| if ec != "-" and ec != "": | |
| ec_key = ec.lstrip("0") or "0" | |
| adjacents.append( | |
| f"east={name_map.get(ec, name_map.get(ec_key, ec))}" | |
| ) | |
| adj_str = ", ".join(adjacents) if adjacents else "no adjacent rooms" | |
| result_lines.append(f"{room_name}: {adj_str}") | |
| return "ROOM ADJACENCIES:\n" + "\n".join(result_lines) | |
| def extract_answer(text: str) -> dict | None: | |
| """Extract answer dictionary from agent output text.""" | |
| pattern = r"\{[^{}]*['\"]A['\"]\s*:" | |
| for match in re.finditer(pattern, text): | |
| start = match.start() | |
| depth = 0 | |
| for i in range(start, len(text)): | |
| if text[i] == "{": | |
| depth += 1 | |
| elif text[i] == "}": | |
| depth -= 1 | |
| if depth == 0: | |
| try: | |
| return ast.literal_eval(text[start : i + 1]) | |
| except (ValueError, SyntaxError): | |
| break | |
| return None | |
def build_chat_messages(groupchat: GroupChat) -> list[dict]:
    """Convert AG2 GroupChat messages to Gradio chatbot format.

    Each kept message becomes {"role": "assistant", "content": "**name:** ..."}.
    Messages whose content is empty, whitespace-only, or the literal string
    "None" (tool-call placeholders) are dropped.
    """
    rendered = []
    for entry in groupchat.messages:
        speaker = entry.get("name", entry.get("role", "System"))
        body = entry.get("content", "") or ""
        stripped = body.strip()
        # Skip empty / None / tool-call-only messages.
        if not stripped or stripped == "None":
            continue
        rendered.append({"role": "assistant", "content": f"**{speaker}:**\n\n{body}"})
    return rendered
def solve_puzzle(puzzle_idx: str):
    """Run the multi-agent pipeline on the selected puzzle (streams live).

    Generator used as a streaming Gradio callback. Every yield is a
    ``(chat_messages, results_table, accuracy)`` tuple: intermediate yields
    stream the agent transcript while the chat runs in a background thread;
    the final yield adds the per-letter results DataFrame and accuracy string.

    Args:
        puzzle_idx: Dropdown label of the form "Puzzle N" (1-based).
    """
    api_key = os.environ.get("NIM_API_KEY")
    if not api_key:
        # No NVIDIA NIM key means no model call can succeed; report and stop.
        yield (
            [
                {
                    "role": "assistant",
                    "content": (
                        "**Error:** `NIM_API_KEY` not set. "
                        "Add it as a Space secret (Settings > Secrets)."
                    ),
                }
            ],
            pd.DataFrame(),
            "No API key",
        )
        return
    # "Puzzle N" label -> zero-based row in the benchmark dataframe.
    idx = int(puzzle_idx.split(" ")[1]) - 1
    question = df.iloc[idx]["question"]
    # ground_truth is stored as a stringified dict, e.g. "{'A': 'Knife', ...}".
    ground_truth = ast.literal_eval(df.iloc[idx]["ground_truth"])
    yield [{"role": "assistant", "content": "Setting up agents..."}], pd.DataFrame(), "Starting..."
    # Small/cheap model for extraction and answer formatting.
    qwen_config = {
        "config_list": [
            {
                "model": "qwen/qwen2.5-7b-instruct",
                "api_key": api_key,
                "base_url": "https://integrate.api.nvidia.com/v1",
                "temperature": 0,
                "max_tokens": 1024,
            }
        ],
    }
    # Larger model for the actual logical reasoning.
    llama_config = {
        "config_list": [
            {
                "model": "meta/llama-3.3-70b-instruct",
                "api_key": api_key,
                "base_url": "https://integrate.api.nvidia.com/v1",
                "temperature": 0,
            }
        ],
    }
    evidence_analyst = AssistantAgent(
        name="Evidence_Analyst",
        system_message=EVIDENCE_ANALYST_PROMPT,
        llm_config=qwen_config,
    )
    detective = AssistantAgent(
        name="Detective",
        system_message=DETECTIVE_PROMPT,
        llm_config=llama_config,
        max_consecutive_auto_reply=3,
    )
    format_agent = AssistantAgent(
        name="Format_Agent",
        system_message=FORMAT_AGENT_PROMPT,
        llm_config=qwen_config,
    )
    user_proxy = UserProxyAgent(
        name="User",
        human_input_mode="NEVER",
        max_consecutive_auto_reply=0,
        default_auto_reply="exit",
        code_execution_config=False,
    )
    # Register tool: Detective calls and executes it (must be in GroupChat)
    # Use the exact name get_room_adjacencies (no underscore) so LLM + executor match
    _tool_impl = get_room_adjacencies
    detective.register_for_llm(
        description=(
            "Computes room adjacencies from a grid layout. Takes grid_rows "
            "(room numbers in grid format, rows separated by semicolons, "
            "cells by pipes, e.g. '1|2|3;4|5|6') and room_names "
            "(number-to-name mapping, semicolon-separated, e.g. "
            "'1=Hall;2=Study;3=Lounge'). Returns which rooms are north, "
            "south, east, west of each other."
        )
    )(_tool_impl)
    detective.register_for_execution()(_tool_impl)

    def select_speaker(last_speaker, groupchat):
        """EA → Detective (stays until reasoning done) → Format Agent."""
        if not groupchat.messages:
            return evidence_analyst
        last_content = (groupchat.messages[-1].get("content", "") or "").strip()
        # After EA → go to Detective
        if last_speaker == evidence_analyst:
            return detective
        # After Detective → check if it finished reasoning
        if last_speaker == detective:
            # Tool call (None) or tool result → keep Detective
            if (
                not last_content
                or last_content == "None"
                or "ROOM ADJACENCIES" in last_content
            ):
                return detective
            # Detective has reasoned (has lettered answers) → Format Agent
            return format_agent
        # After Format Agent → done (termination catches the dict)
        return evidence_analyst

    groupchat = GroupChat(
        agents=[evidence_analyst, detective, format_agent],
        messages=[],
        max_round=12,
        speaker_selection_method=select_speaker,
        send_introductions=False,
        enable_clear_history=False,
    )

    def _is_answer_dict(msg):
        """Stop the chat once an answer dict like {'A': '...'} appears."""
        content = msg.get("content", "") or ""
        return bool(re.search(r"\{['\"]A['\"]\s*:\s*['\"]", content))

    manager = GroupChatManager(
        groupchat=groupchat,
        llm_config=qwen_config,
        is_termination_msg=_is_answer_dict,
    )
    # Run chat in a background thread so we can stream messages live
    error_holder: dict = {}

    def _run_chat():
        try:
            user_proxy.initiate_chat(manager, message=question)
        except Exception as e:
            # Surface the failure to the UI instead of dying silently in the thread.
            error_holder["error"] = str(e)

    thread = threading.Thread(target=_run_chat)
    thread.start()
    # Stream messages as they appear
    seen = 0
    while thread.is_alive():
        msgs = list(groupchat.messages)
        if len(msgs) > seen:
            seen = len(msgs)
            yield build_chat_messages(groupchat), pd.DataFrame(), "Agents working..."
        time.sleep(0.5)
    thread.join()
    # Final messages
    chat_messages = build_chat_messages(groupchat)
    if "error" in error_holder:
        chat_messages.append(
            {"role": "assistant", "content": f"**Pipeline error:** {error_holder['error']}"}
        )
        yield chat_messages, pd.DataFrame(), "Error"
        return
    # Extract prediction from the last message (Format Agent's output)
    prediction = None
    for msg in reversed(groupchat.messages):
        # BUG FIX: tool-call messages can carry content=None; .get's default
        # does not apply then, so coalesce with `or ""` (as done elsewhere)
        # to keep extract_answer from receiving None.
        content = msg.get("content", "") or ""
        prediction = extract_answer(content)
        if prediction:
            break
    # Score each lettered answer against the ground truth dict.
    rows = []
    correct = 0
    total = 0
    for key in sorted(ground_truth.keys()):
        gt_val = ground_truth[key]
        pred_val = prediction.get(key, "\u2014") if prediction else "\u2014"
        match = (
            pred_val.strip() == gt_val.strip()
            if prediction and pred_val != "\u2014"
            else False
        )
        if match:
            correct += 1
        total += 1
        rows.append(
            {
                "Letter": key,
                "Predicted": pred_val,
                "Ground Truth": gt_val,
                "Match": "\u2713" if match else "\u2717",
            }
        )
    results_df = pd.DataFrame(rows)
    accuracy = f"{correct}/{total} ({100 * correct / total:.1f}%)"
    yield chat_messages, results_df, accuracy
# --- Gradio UI: dropdown + solve button, live transcript, scoreboard -----
with gr.Blocks(title="Temporal Clue Multi-Agent Solver") as demo:
    # Header / explanation shown above the controls.
    gr.Markdown(
        "# Temporal Clue Multi-Agent Solver\n\n"
        "A multi-agent system that solves murder mystery logic puzzles from the "
        "[Temporal Clue](https://github.com/bradhilton/temporal-clue) benchmark. "
        "Three agents collaborate in sequence:\n\n"
        "1. **Evidence Analyst** (Qwen 7B) — extracts structured data from the puzzle\n"
        "2. **Detective** (Llama 70B + spatial tool) — reasons through clues and deduces answers\n"
        "3. **Format Agent** (Qwen 7B) — extracts the final answer dictionary\n\n"
        "Select a puzzle and click **Solve** to watch the agents work in real time."
    )
    with gr.Row():
        # Puzzle picker plus the button that starts the pipeline.
        puzzle_dropdown = gr.Dropdown(
            choices=puzzle_choices,
            value=puzzle_choices[0],
            label="Select Puzzle",
        )
        solve_btn = gr.Button("Solve", variant="primary", scale=0)
    with gr.Row():
        with gr.Column(scale=2):
            # Live agent transcript, updated by solve_puzzle's yields.
            chatbot = gr.Chatbot(
                label="Agent Conversation",
                height=500,
            )
        with gr.Column(scale=1):
            # Scoreboard: accuracy summary and per-letter comparison table.
            accuracy_display = gr.Textbox(label="Accuracy", interactive=False)
            results_table = gr.Dataframe(label="Results")
    # solve_puzzle is a generator, so the three outputs update on each yield;
    # concurrency_limit=1 serializes runs (one model pipeline at a time).
    solve_btn.click(
        fn=solve_puzzle,
        inputs=[puzzle_dropdown],
        outputs=[chatbot, results_table, accuracy_display],
        concurrency_limit=1,
    )

if __name__ == "__main__":
    demo.launch()