# ============================================================================
# ringmaster_tools.py — Tools the LangGraph Ringmaster supervisor can call
# ============================================================================
#
# These tools exist ONLY for the LangGraph Ringmaster backend. They are NOT
# registered in the standard tools.py because the other 6 backends do not
# know about workbench state, data loading, or research-corpus inspection.
#
# COMPLIANCE
# ----------
# Every tool here is a thin wrapper. It:
# - reads structured input
# - calls a real domain function (workbench_grounded_theory.run,
# workbench_thematic_analysis.run, or a simple string inspection)
# - returns a plain-string summary the LLM can include in its reply
#
# Tools NEVER do control flow. They NEVER route. They NEVER decide what
# runs next. The supervisor decides, the tool executes, the supervisor
# sees the result string and decides again.
#
# DATA CONTRACT
# -------------
# Every tool receives `context` — a dict the ringmaster backend builds
# from the Gradio session state before invoking the supervisor. Fields:
# context["loaded_context"] -> str, newline-separated sentences (may be empty)
# context["llm_provider"] -> str, the LLM provider name
# context["llm_key"] -> str, the API key (may be empty)
# context["cgt_result"] -> dict or None, last CGT run result
# context["cta_result"] -> dict or None, last CTA run result
#
# Tools that produce new results MUTATE context["cgt_result"] or
# context["cta_result"] so subsequent tool calls in the same chat turn
# can see them (and so the chat handler can extract them afterward to
# update the workbench tabs).
# ============================================================================
from typing import Dict, Any, List
# ----------------------------------------------------------------
# TOOL 1 — check_data_status
# ----------------------------------------------------------------
def check_data_status(context: Dict[str, Any]) -> str:
    """Report whether research data is currently loaded, and if so how much.

    Args:
        context: Session dict. Only ``context["loaded_context"]`` is read; it
            is treated as newline-separated sentences and may be missing,
            ``None`` or empty.

    Returns:
        A "NO DATA LOADED" message when there is no non-blank line, otherwise
        a "DATA LOADED" summary giving the sentence count followed by up to
        three numbered preview sentences.
    """
    loaded = context.get("loaded_context") or ""
    # Blank lines are not sentences, so a missing, empty or whitespace-only
    # payload all collapse to an empty list and share the single guard below.
    sentences = [line.strip() for line in loaded.split("\n") if line.strip()]
    if not sentences:
        return (
            "NO DATA LOADED. The user has not uploaded a file, pasted text, "
            "or scraped a URL yet. Ask the user to go to the Inputs tab and "
            "load data before running any research workbench."
        )

    # Slicing is safe for short inputs: fewer than three sentences simply
    # produce a shorter preview.
    preview = sentences[:3]
    return (
        f"DATA LOADED: {len(sentences)} sentences available for analysis.\n"
        f"First 3 sentences for preview:\n"
        + "\n".join(f" {i + 1}. {s}" for i, s in enumerate(preview))
    )
# ----------------------------------------------------------------
# TOOL 2 — run_grounded_theory
# ----------------------------------------------------------------
def run_grounded_theory(
    context: Dict[str, Any],
    similarity_threshold: float = 0.60,
    min_cluster_size: int = 3,
    n_nearest: int = 3,
) -> str:
    """Run the grounded-theory workbench on the loaded data and summarize it.

    The non-blank lines of ``context["loaded_context"]`` are handed to
    ``workbench_grounded_theory.run``. The dict it returns is stored in
    ``context["cgt_result"]`` before the summary is built.

    Args:
        context: Session dict; reads ``loaded_context``, ``llm_provider`` and
            ``llm_key``, and writes ``cgt_result``.
        similarity_threshold: Forwarded to the workbench after ``float()``.
        min_cluster_size: Forwarded to the workbench after ``int()``.
        n_nearest: Forwarded to the workbench after ``int()``.

    Returns:
        An "ERROR" string when no data is loaded, a "no clusters" hint when
        the run yields no clusters, otherwise a "COMPLETED" summary with one
        line per cluster.
    """
    raw_text = (context.get("loaded_context") or "").strip()
    if not raw_text:
        return (
            "ERROR: cannot run grounded theory β no data loaded. "
            "Ask the user to load data via the Inputs tab first."
        )

    sentences = []
    for line in raw_text.split("\n"):
        cleaned = line.strip()
        if cleaned:
            sentences.append(cleaned)
    total = len(sentences)
    # One placeholder label per sentence; presumably the loaded data carries
    # no reference labels — confirm against the workbench's expectations.
    placeholder_labels = ["(unknown)"] * total

    # Imported lazily so this module stays cheap to import and does not
    # create a circular import when app.py boots.
    import workbench_grounded_theory as wb_cgt

    outcome = wb_cgt.run(
        user_message="Run computational grounded theory.",
        sentences=sentences,
        true_labels=placeholder_labels,
        data_source="uploaded",
        similarity_threshold=float(similarity_threshold),
        min_cluster_size=int(min_cluster_size),
        n_nearest=int(n_nearest),
        llm_provider=context.get("llm_provider", "Mistral"),
        llm_key=context.get("llm_key", ""),
    )
    # Stored so later tool calls and the chat handler can read the full result.
    context["cgt_result"] = outcome

    found = (outcome.get("detection_result") or {}).get("clusters") or []
    bullets = [
        f" - Cluster {entry.get('cluster_id')}: "
        f"{entry.get('llm_label') or entry.get('cluster_id') or 'unknown'} "
        f"({entry.get('size') or 0} sentences)"
        for entry in found
    ]

    if bullets:
        return (
            f"COMPLETED: grounded theory on {total} sentences. "
            f"Found {len(found)} cluster(s):\n"
            + "\n".join(bullets)
            + "\nThe full trace and per-sentence cluster table are now in the "
            "Researcher Workbench β Computational Grounded Theory tab."
        )
    return (
        f"Ran grounded theory on {total} sentences but no clusters were "
        f"found at similarity {similarity_threshold} / min size {min_cluster_size}. "
        f"Suggest the user lower similarity_threshold or min_cluster_size."
    )
# ----------------------------------------------------------------
# TOOL 3 — run_thematic_analysis
# ----------------------------------------------------------------
def run_thematic_analysis(
    context: Dict[str, Any],
    max_sentences: int = 20,
) -> str:
    """Run the thematic-analysis workbench on the loaded data and summarize it.

    The non-blank lines of ``context["loaded_context"]`` are handed to
    ``workbench_thematic_analysis.run``. The dict it returns is stored in
    ``context["cta_result"]`` before the summary is built.

    Args:
        context: Session dict; reads ``loaded_context``, ``llm_provider`` and
            ``llm_key``, and writes ``cta_result``.
        max_sentences: Forwarded to the workbench as
            ``max_sentences_to_code`` after ``int()``.

    Returns:
        An "ERROR" string when no data is loaded, otherwise a "COMPLETED"
        summary with the number of coded rows and the five most frequent
        codes.
    """
    raw_text = (context.get("loaded_context") or "").strip()
    if not raw_text:
        return (
            "ERROR: cannot run thematic analysis β no data loaded. "
            "Ask the user to load data via the Inputs tab first."
        )

    sentences = []
    for line in raw_text.split("\n"):
        cleaned = line.strip()
        if cleaned:
            sentences.append(cleaned)
    # One placeholder label per sentence; presumably the loaded data carries
    # no reference labels — confirm against the workbench's expectations.
    placeholder_labels = ["(unknown)"] * len(sentences)

    # Imported inside the function, like the grounded-theory tool's workbench.
    import workbench_thematic_analysis as wb_cta

    outcome = wb_cta.run(
        user_message="Run reflexive thematic analysis.",
        sentences=sentences,
        true_labels=placeholder_labels,
        data_source="uploaded",
        max_sentences_to_code=int(max_sentences),
        llm_provider=context.get("llm_provider", "Mistral"),
        llm_key=context.get("llm_key", ""),
    )
    # Stored so later tool calls and the chat handler can read the full result.
    context["cta_result"] = outcome

    initial_codes = outcome.get("phase2_initial_codes") or {}
    n_coded = len(initial_codes.get("coded_rows") or [])
    frequencies = initial_codes.get("code_frequency") or {}

    # Highest count first; the sort is stable, so ties keep dict order.
    ranked = sorted(frequencies.items(), key=lambda pair: -pair[1])
    rendered = [f"{code} ({count})" for code, count in ranked[:5]]
    top_codes_str = ", ".join(rendered) if rendered else "(none)"

    return (
        f"COMPLETED: thematic analysis on {n_coded} sentences "
        f"(out of {len(sentences)} loaded, capped at {max_sentences}). "
        f"Top 5 codes: {top_codes_str}. "
        f"The full trace and per-sentence code table are now in the "
        f"Researcher Workbench β Computational Thematic Analysis tab."
    )
# ----------------------------------------------------------------
# TOOL 4 — summarize_cgt_result
# ----------------------------------------------------------------
def summarize_cgt_result(context: Dict[str, Any]) -> str:
    """Return a text summary of the most recent grounded theory run.

    Args:
        context: Session dict. Only ``context["cgt_result"]`` is read.

    Returns:
        A "NO PRIOR GROUNDED THEORY RUN" message when no result is stored,
        otherwise a header line, one line per cluster (or a marker line when
        there are none) and the stored supervisor reply.
    """
    result = context.get("cgt_result")
    if not result:
        return (
            "NO PRIOR GROUNDED THEORY RUN. The user has not yet run grounded "
            "theory in this session. Use run_grounded_theory first."
        )

    clusters = (result.get("detection_result") or {}).get("clusters") or []
    lines = ["Most recent Grounded Theory run:"]
    if not clusters:
        lines.append(" (no clusters found)")
    for cluster in clusters:
        # `or` rather than a .get() default: a key that is present but None
        # (or empty) would otherwise be rendered as the literal "None".
        label = cluster.get("llm_label") or "unlabeled"
        size = cluster.get("size") or 0
        lines.append(
            f" - Cluster {cluster.get('cluster_id')}: {label} ({size} sentences)"
        )
    lines.append(f"Supervisor reply: {result.get('reply') or '(empty)'}")
    return "\n".join(lines)
# ----------------------------------------------------------------
# TOOL 5 — summarize_cta_result
# ----------------------------------------------------------------
def summarize_cta_result(context: Dict[str, Any]) -> str:
    """Return a text summary of the most recent thematic analysis run.

    Args:
        context: Session dict. Only ``context["cta_result"]`` is read.

    Returns:
        A "NO PRIOR THEMATIC ANALYSIS RUN" message when no result is stored,
        otherwise the number of coded rows, up to five codes ordered by
        descending frequency (or a "(none)" marker) and the stored supervisor
        reply.
    """
    result = context.get("cta_result")
    if not result:
        return (
            "NO PRIOR THEMATIC ANALYSIS RUN. The user has not yet run "
            "thematic analysis in this session. Use run_thematic_analysis first."
        )

    phase2 = result.get("phase2_initial_codes") or {}
    coded_rows = phase2.get("coded_rows") or []
    code_freq = phase2.get("code_frequency") or {}
    # Stable sort: codes with equal counts keep their dict order.
    top_codes = sorted(code_freq.items(), key=lambda kv: kv[1], reverse=True)[:5]

    lines = [
        f"Most recent Thematic Analysis run: {len(coded_rows)} sentences coded.",
        "Top 5 codes:",
    ]
    if top_codes:
        lines.extend(f" - {code}: {count}" for code, count in top_codes)
    else:
        # Same marker run_thematic_analysis uses when no codes were produced,
        # so the heading is never left dangling.
        lines.append(" (none)")
    # `or` rather than a .get() default so a None/empty reply is not rendered
    # as the literal "None".
    lines.append(f"Supervisor reply: {result.get('reply') or '(empty)'}")
    return "\n".join(lines)
# ============================================================================
# Tool registration — shape matches tools.py for consistency
# ============================================================================
# Lookup from tool name to the callable implementing it. Each key is the
# function's own __name__, so the registry cannot drift from the definitions.
RINGMASTER_TOOL_FUNCTIONS = {
    tool.__name__: tool
    for tool in (
        check_data_status,
        run_grounded_theory,
        run_thematic_analysis,
        summarize_cgt_result,
        summarize_cta_result,
    )
}
def _tool_param(json_type, description):
    """Build the schema entry for a single tool parameter."""
    return {"type": json_type, "description": description}


def _tool_schema(name, description, properties=None):
    """Wrap a tool's name, description and parameters in the schema envelope.

    A fresh empty ``properties`` dict is created when none is given, so
    parameterless tools never share one mutable dict.
    """
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": {} if properties is None else properties,
            },
        },
    }


# One schema per tool, in the same order as the tool definitions in this
# file. `context` is not listed as a parameter in any schema.
RINGMASTER_TOOL_SCHEMAS = [
    _tool_schema(
        "check_data_status",
        "Check whether research data is currently loaded in the session. "
        "Returns the number of sentences and a short preview, or reports "
        "that no data is loaded. ALWAYS call this before run_grounded_theory "
        "or run_thematic_analysis so you know whether to ask the user to "
        "load data first.",
    ),
    _tool_schema(
        "run_grounded_theory",
        "Run Computational Grounded Theory (Nelson 2020) on the currently "
        "loaded research data. Only call this AFTER check_data_status "
        "confirmed data is loaded. The result is a short text summary of "
        "the clusters found; the full trace and sentence-level table will "
        "appear in the Researcher Workbench tab automatically.",
        {
            "similarity_threshold": _tool_param(
                "number",
                "Cosine similarity threshold (0.4-0.9, default 0.60)",
            ),
            "min_cluster_size": _tool_param(
                "integer",
                "Minimum sentences per cluster (2-10, default 3)",
            ),
            "n_nearest": _tool_param(
                "integer",
                "Representatives per cluster for LLM labeling (1-10, default 3)",
            ),
        },
    ),
    _tool_schema(
        "run_thematic_analysis",
        "Run Computational Thematic Analysis (Braun & Clarke 2006) on the "
        "currently loaded research data. Only call this AFTER "
        "check_data_status confirmed data is loaded. Phase 2 (generating "
        "initial codes) is the only real phase; the rest are placeholders. "
        "The result is a short text summary; the full per-sentence code "
        "table will appear in the Researcher Workbench tab automatically.",
        {
            "max_sentences": _tool_param(
                "integer",
                "Cap on sentences to code (expensive β each is one LLM call, default 20)",
            ),
        },
    ),
    _tool_schema(
        "summarize_cgt_result",
        "Return a text summary of the most recent Grounded Theory run so "
        "you can answer follow-up questions about it. Does not re-run the "
        "analysis.",
    ),
    _tool_schema(
        "summarize_cta_result",
        "Return a text summary of the most recent Thematic Analysis run "
        "so you can answer follow-up questions. Does not re-run.",
    ),
]
|