File size: 14,044 Bytes
a52bae4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
# ============================================================================
# ringmaster_tools.py — Tools the LangGraph Ringmaster supervisor can call
# ============================================================================
#
# These tools exist ONLY for the LangGraph Ringmaster backend. They are NOT
# registered in the standard tools.py because the other 6 backends do not
# know about workbench state, data loading, or research-corpus inspection.
#
# COMPLIANCE
# ----------
# Every tool here is a thin wrapper. It:
#   - reads structured input
#   - calls a real domain function (workbench_grounded_theory.run,
#     workbench_thematic_analysis.run, or a simple string inspection)
#   - returns a plain-string summary the LLM can include in its reply
#
# Tools NEVER do control flow. They NEVER route. They NEVER decide what
# runs next. The supervisor decides, the tool executes, the supervisor
# sees the result string and decides again.
#
# DATA CONTRACT
# -------------
# Every tool receives `context` — a dict the ringmaster backend builds
# from the Gradio session state before invoking the supervisor. Fields:
#   context["loaded_context"]  -> str, newline-separated sentences (may be empty)
#   context["llm_provider"]    -> str, the LLM provider name
#   context["llm_key"]         -> str, the API key (may be empty)
#   context["cgt_result"]      -> dict or None, last CGT run result
#   context["cta_result"]      -> dict or None, last CTA run result
#
# Tools that produce new results MUTATE context["cgt_result"] or
# context["cta_result"] so subsequent tool calls in the same chat turn
# can see them (and so the chat handler can extract them afterward to
# update the workbench tabs).
# ============================================================================

from typing import Dict, Any, List


# ----------------------------------------------------------------
# TOOL 1 — check_data_status
# ----------------------------------------------------------------
def check_data_status(context: Dict[str, Any]) -> str:
    """Report whether research data is currently loaded, and if so how much.

    Args:
        context: Session dict built by the ringmaster backend. Only
            ``context["loaded_context"]`` is read; it is treated as
            newline-separated sentences and may be missing, ``None`` or empty.

    Returns:
        A plain-string status for the supervisor: either a "NO DATA LOADED"
        notice, or the sentence count followed by a numbered preview of up
        to the first three sentences.
    """
    loaded = (context.get("loaded_context") or "").strip()
    if not loaded:
        return (
            "NO DATA LOADED. The user has not uploaded a file, pasted text, "
            "or scraped a URL yet. Ask the user to go to the Inputs tab and "
            "load data before running any research workbench."
        )

    # `loaded` is non-empty after strip(), so at least one line survives the
    # blank-line filter; no separate "whitespace only" check is needed.
    sentences = [line.strip() for line in loaded.split("\n") if line.strip()]
    preview = sentences[:3]

    # State how many sentences are actually shown so the header never
    # promises three when fewer were loaded.
    shown = len(preview)
    noun = "sentence" if shown == 1 else "sentences"

    return (
        f"DATA LOADED: {len(sentences)} sentences available for analysis.\n"
        f"First {shown} {noun} for preview:\n"
        + "\n".join(f"  {i}. {s}" for i, s in enumerate(preview, start=1))
    )


# ----------------------------------------------------------------
# TOOL 2 — run_grounded_theory
# ----------------------------------------------------------------
def run_grounded_theory(
    context: Dict[str, Any],
    similarity_threshold: float = 0.60,
    min_cluster_size: int = 3,
    n_nearest: int = 3,
) -> str:
    """Run the Computational Grounded Theory supervisor on loaded data.

    Args:
        context: Session dict built by the ringmaster backend. Reads
            ``loaded_context``, ``llm_provider`` and ``llm_key``. After the
            domain call, ``context["cgt_result"]`` is overwritten with the
            full result dict so the chat handler can update the CGT tab.
        similarity_threshold: Passed to ``workbench_grounded_theory.run``
            after ``float()`` coercion.
        min_cluster_size: Passed to the domain call after ``int()`` coercion.
        n_nearest: Passed to the domain call after ``int()`` coercion.

    Returns:
        A short text summary: an ERROR notice when no data is loaded, a
        "no clusters" hint, or one line per cluster found.
    """
    loaded = (context.get("loaded_context") or "").strip()
    if not loaded:
        return (
            "ERROR: cannot run grounded theory — no data loaded. "
            "Ask the user to load data via the Inputs tab first."
        )

    sentences = [line.strip() for line in loaded.split("\n") if line.strip()]
    # The domain function takes a true_labels list; loaded data carries no
    # labels here, so one placeholder is supplied per sentence.
    true_labels = ["(unknown)"] * len(sentences)

    # Coerce once so the values handed to the domain call are the same
    # values reported back in the summary text.
    threshold = float(similarity_threshold)
    min_size = int(min_cluster_size)
    nearest = int(n_nearest)

    # Import here to keep the ringmaster_tools module import-light and to
    # avoid a circular import at app.py boot.
    import workbench_grounded_theory as wb_cgt

    result = wb_cgt.run(
        user_message="Run computational grounded theory.",
        sentences=sentences,
        true_labels=true_labels,
        data_source="uploaded",
        similarity_threshold=threshold,
        min_cluster_size=min_size,
        n_nearest=nearest,
        llm_provider=context.get("llm_provider", "Mistral"),
        llm_key=context.get("llm_key", ""),
    )

    # Publish the full result for later tool calls and the chat handler.
    context["cgt_result"] = result

    det = result.get("detection_result") or {}
    clusters = det.get("clusters") or []
    cluster_summary_lines = []
    for c in clusters:
        # Prefer the LLM-assigned label; fall back to the id, then "unknown".
        label = c.get("llm_label") or c.get("cluster_id") or "unknown"
        size = c.get("size") or 0
        cluster_summary_lines.append(
            f"  - Cluster {c.get('cluster_id')}: {label} ({size} sentences)"
        )

    if not cluster_summary_lines:
        return (
            f"Ran grounded theory on {len(sentences)} sentences but no clusters were "
            f"found at similarity {threshold} / min size {min_size}. "
            f"Suggest the user lower similarity_threshold or min_cluster_size."
        )

    return (
        f"COMPLETED: grounded theory on {len(sentences)} sentences. "
        f"Found {len(clusters)} cluster(s):\n"
        + "\n".join(cluster_summary_lines)
        + "\nThe full trace and per-sentence cluster table are now in the "
        "Researcher Workbench → Computational Grounded Theory tab."
    )


# ----------------------------------------------------------------
# TOOL 3 — run_thematic_analysis
# ----------------------------------------------------------------
def run_thematic_analysis(
    context: Dict[str, Any],
    max_sentences: int = 20,
) -> str:
    """Run the Computational Thematic Analysis supervisor on loaded data.

    Args:
        context: Session dict built by the ringmaster backend. Reads
            ``loaded_context``, ``llm_provider`` and ``llm_key``. After the
            domain call, ``context["cta_result"]`` is overwritten with the
            full result dict.
        max_sentences: Passed to ``workbench_thematic_analysis.run`` as
            ``max_sentences_to_code`` after ``int()`` coercion.

    Returns:
        A short text summary: an ERROR notice when no data is loaded, or the
        number of coded sentences plus the five most frequent codes.
    """
    loaded = (context.get("loaded_context") or "").strip()
    if not loaded:
        return (
            "ERROR: cannot run thematic analysis — no data loaded. "
            "Ask the user to load data via the Inputs tab first."
        )

    sentences = [line.strip() for line in loaded.split("\n") if line.strip()]
    # The domain function takes a true_labels list; loaded data carries no
    # labels here, so one placeholder is supplied per sentence.
    true_labels = ["(unknown)"] * len(sentences)

    # Coerce once so the cap handed to the domain call is the same value
    # reported back in the summary text.
    cap = int(max_sentences)

    # Imported lazily, like the grounded-theory workbench, so importing this
    # module does not pull in the workbench.
    import workbench_thematic_analysis as wb_cta

    result = wb_cta.run(
        user_message="Run reflexive thematic analysis.",
        sentences=sentences,
        true_labels=true_labels,
        data_source="uploaded",
        max_sentences_to_code=cap,
        llm_provider=context.get("llm_provider", "Mistral"),
        llm_key=context.get("llm_key", ""),
    )

    # Publish the full result for later tool calls and the chat handler.
    context["cta_result"] = result

    phase2 = result.get("phase2_initial_codes") or {}
    coded_rows = phase2.get("coded_rows") or []
    code_counts = phase2.get("code_frequency") or {}

    # Highest count first; sorted() is stable, so ties keep dict order.
    top_codes = sorted(code_counts.items(), key=lambda kv: kv[1], reverse=True)[:5]
    top_codes_str = ", ".join(f"{code} ({count})" for code, count in top_codes) or "(none)"

    return (
        f"COMPLETED: thematic analysis on {len(coded_rows)} sentences "
        f"(out of {len(sentences)} loaded, capped at {cap}). "
        f"Top 5 codes: {top_codes_str}. "
        f"The full trace and per-sentence code table are now in the "
        f"Researcher Workbench → Computational Thematic Analysis tab."
    )


# ----------------------------------------------------------------
# TOOL 4 — summarize_cgt_result
# ----------------------------------------------------------------
def summarize_cgt_result(context: Dict[str, Any]) -> str:
    """Return a text summary of the most recent grounded theory run.

    Args:
        context: Session dict built by the ringmaster backend. Only
            ``context["cgt_result"]`` is read; nothing is re-run or mutated.

    Returns:
        A "NO PRIOR GROUNDED THEORY RUN" notice when no result is stored,
        otherwise a header, one line per cluster, and the stored reply.
    """
    result = context.get("cgt_result")
    if not result:
        return (
            "NO PRIOR GROUNDED THEORY RUN. The user has not yet run grounded "
            "theory in this session. Use run_grounded_theory first."
        )

    det = result.get("detection_result") or {}
    clusters = det.get("clusters") or []

    lines = ["Most recent Grounded Theory run:"]
    # `or` fallbacks (rather than dict.get defaults) so a key that is present
    # but None/empty is shown as the placeholder instead of the text "None".
    lines.extend(
        f"  - Cluster {c.get('cluster_id')}: {c.get('llm_label') or 'unlabeled'} "
        f"({c.get('size') or 0} sentences)"
        for c in clusters
    )
    lines.append(f"Supervisor reply: {result.get('reply') or '(empty)'}")
    return "\n".join(lines)


# ----------------------------------------------------------------
# TOOL 5 — summarize_cta_result
# ----------------------------------------------------------------
def summarize_cta_result(context: Dict[str, Any]) -> str:
    """Return a text summary of the most recent thematic analysis run.

    Args:
        context: Session dict built by the ringmaster backend. Only
            ``context["cta_result"]`` is read; nothing is re-run or mutated.

    Returns:
        A "NO PRIOR THEMATIC ANALYSIS RUN" notice when no result is stored,
        otherwise the coded-sentence count, the five most frequent codes
        (or "(none)"), and the stored reply.
    """
    result = context.get("cta_result")
    if not result:
        return (
            "NO PRIOR THEMATIC ANALYSIS RUN. The user has not yet run "
            "thematic analysis in this session. Use run_thematic_analysis first."
        )

    phase2 = result.get("phase2_initial_codes") or {}
    coded_rows = phase2.get("coded_rows") or []
    code_freq = phase2.get("code_frequency") or {}
    # Highest count first; sorted() is stable, so ties keep dict order.
    top_codes = sorted(code_freq.items(), key=lambda kv: kv[1], reverse=True)[:5]

    lines = [
        f"Most recent Thematic Analysis run: {len(coded_rows)} sentences coded.",
        "Top 5 codes:",
    ]
    if top_codes:
        lines.extend(f"  - {code}: {count}" for code, count in top_codes)
    else:
        # Same "(none)" marker run_thematic_analysis uses, so the header is
        # never left dangling with nothing under it.
        lines.append("  (none)")
    # `or` fallback so a reply stored as None/"" is shown as "(empty)".
    lines.append(f"Supervisor reply: {result.get('reply') or '(empty)'}")
    return "\n".join(lines)


# ============================================================================
# Tool registration — shape matches tools.py for consistency
# ============================================================================
# Dispatch table: tool name -> the callable that implements it. The keys
# mirror the "name" fields declared in RINGMASTER_TOOL_SCHEMAS.
RINGMASTER_TOOL_FUNCTIONS = dict(
    check_data_status=check_data_status,
    run_grounded_theory=run_grounded_theory,
    run_thematic_analysis=run_thematic_analysis,
    summarize_cgt_result=summarize_cgt_result,
    summarize_cta_result=summarize_cta_result,
)


# Function-calling schemas, one entry per tool. Each entry has the shape
# {"type": "function", "function": {name, description, parameters}} where
# `parameters` is a JSON-Schema object. `context` is deliberately absent from
# every schema: only the tunable arguments are declared here.
RINGMASTER_TOOL_SCHEMAS = [
    # --- check_data_status: no arguments ---
    {
        "type": "function",
        "function": {
            "name": "check_data_status",
            "description": (
                "Check whether research data is currently loaded in the session. "
                "Returns the number of sentences and a short preview, or reports "
                "that no data is loaded. ALWAYS call this before run_grounded_theory "
                "or run_thematic_analysis so you know whether to ask the user to "
                "load data first."
            ),
            "parameters": {
                "type": "object",
                "properties": {},
            },
        },
    },
    # --- run_grounded_theory: three optional tuning arguments ---
    {
        "type": "function",
        "function": {
            "name": "run_grounded_theory",
            "description": (
                "Run Computational Grounded Theory (Nelson 2020) on the currently "
                "loaded research data. Only call this AFTER check_data_status "
                "confirmed data is loaded. The result is a short text summary of "
                "the clusters found; the full trace and sentence-level table will "
                "appear in the Researcher Workbench tab automatically."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "similarity_threshold": {
                        "type": "number",
                        "description": "Cosine similarity threshold (0.4-0.9, default 0.60)",
                    },
                    "min_cluster_size": {
                        "type": "integer",
                        "description": "Minimum sentences per cluster (2-10, default 3)",
                    },
                    "n_nearest": {
                        "type": "integer",
                        "description": "Representatives per cluster for LLM labeling (1-10, default 3)",
                    },
                },
            },
        },
    },
    # --- run_thematic_analysis: one optional cap argument ---
    {
        "type": "function",
        "function": {
            "name": "run_thematic_analysis",
            "description": (
                "Run Computational Thematic Analysis (Braun & Clarke 2006) on the "
                "currently loaded research data. Only call this AFTER "
                "check_data_status confirmed data is loaded. Phase 2 (generating "
                "initial codes) is the only real phase; the rest are placeholders. "
                "The result is a short text summary; the full per-sentence code "
                "table will appear in the Researcher Workbench tab automatically."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "max_sentences": {
                        "type": "integer",
                        "description": "Cap on sentences to code (expensive — each is one LLM call, default 20)",
                    },
                },
            },
        },
    },
    # --- summarize_cgt_result: no arguments ---
    {
        "type": "function",
        "function": {
            "name": "summarize_cgt_result",
            "description": (
                "Return a text summary of the most recent Grounded Theory run so "
                "you can answer follow-up questions about it. Does not re-run the "
                "analysis."
            ),
            "parameters": {
                "type": "object",
                "properties": {},
            },
        },
    },
    # --- summarize_cta_result: no arguments ---
    {
        "type": "function",
        "function": {
            "name": "summarize_cta_result",
            "description": (
                "Return a text summary of the most recent Thematic Analysis run "
                "so you can answer follow-up questions. Does not re-run."
            ),
            "parameters": {
                "type": "object",
                "properties": {},
            },
        },
    },
]