# ============================================================================ # app.py — Agentic AI Systems for Large Scale Content Analysis # ============================================================================ # # PURPOSE # ------- # A chat-driven Gradio app that demonstrates FOUR different backend # implementations of the same agent task, side by side. This file is the # UI SHELL ONLY — it owns the chat, the tabs, the data source loaders, # the training panels, and the download list. It knows nothing about how # any individual backend works; it dispatches through a 4-symbol contract. # # THE FOUR BACKENDS # ----------------- # agent_workflow.py — Workflow: 2-step prompt chain, no tools (raw SDK) # agent_py.py — Simple Python Agent: tool-calling loop (raw SDK) # agent_langchain.py — LangChain AgentExecutor with tool calling # agent_langgraph.py — LangGraph state graph with supervisor + task nodes # # THE CONTRACT (every backend file exports these four symbols) # ------------------------------------------------------------ # BACKEND_NAME — string shown in the UI radio # get_client(api_key) — returns whatever 'client' the runner needs # run(client, user_message) — returns {"reply", "steps", "extracted"} # build_code_snippets(user_message, steps) -> str — for the Code tab # # Adding a new backend = new file with these four symbols, then one # import line in ZONE 2 and a registration into BACKENDS dict. No # handler, UI, or wiring changes. # # GRACEFUL DEGRADATION # -------------------- # agent_langchain and agent_langgraph are imported inside try/except. # If langchain / langchain-mistralai / langgraph are not installed, those # modes are silently hidden from the radio at startup and a warning prints # to the console. The app keeps running with Workflow + Simple Python Agent. # # CODE ORGANIZATION # ----------------- # ZONE 1: Imports & constants # ZONE 2: Backend imports + helpers (save_json_artifact, build_outputs, ...) # ZONE 3: Action handlers (wired to UI buttons) # ZONE 4: UI definition (gr.Blocks) # ZONE 5: Event wiring (.click handlers — the glue) # # LOGICAL FLOW OF ONE CHAT TURN # ----------------------------- # User types in chat, clicks Send. # -> send_btn.click fires process_message(...) # -> if loaded_context is set, prepend it to user_message # -> backend = BACKENDS[mode] # -> client = backend.get_client(api_key) # -> result = backend.run(client, effective_message) # -> returns {reply, steps, extracted} # -> build_outputs() produces table / chart / code / extracted JSON # -> calls backend.build_code_snippets(...) for the Code tab # -> save_json_artifact() writes a timestamped run_*.json # -> returns 8 values matching the chat_outputs list in ZONE 5 # 1. new chat history -> chatbot # 2. steps dataframe -> Results > Table # 3. extracted JSON -> Results > Extracted # 4. chart dataframe -> Visuals # 5. code snippet -> Results > Code # 6. downloads list -> downloads_state # 7. downloads list (same) -> Downloads tab file list # 8. empty string -> chat_input (clears it) # # DATA SOURCE LOADERS follow a shorter pattern: # User loads a URL / PDF / spreadsheet / ML examples -> saves JSON artifact, # appends to downloads, updates loaded_context_state for next chat turn. # Returns 5 values: preview, status, context, downloads_state, downloads_files. # # THE TWO RULES THAT WILL SAVE YOU PAIN # ------------------------------------- # 1. Handler return order MUST match its wiring outputs list. # Function returns N values -> outputs=[c1, c2, ..., cN] must have N items # in the same order. Mismatch is the #1 source of silent breakage. # # 2. All chat handlers (process_message, submit_form, new_chat) share # the same chat_outputs list. If you change the shape of one, change # all three at once. # # WHERE TO ADD NEW THINGS # ----------------------- # New backend -> Create agent_.py with the 4 contract symbols, # add one import line in ZONE 2, add it to BACKENDS. # Nothing else changes. # # New top-level tab -> ZONE 4 inside outer gr.Tabs() # + handler in ZONE 3 # + wiring in ZONE 5 # # New sub-tab -> ZONE 4 inside the parent tab's inner gr.Tabs() # + handler in ZONE 3 following scrape_url pattern # + wiring in ZONE 5 following scrape_btn pattern # # New output display -> ZONE 4 component + expand build_outputs in ZONE 2 # + add to chat_outputs list # + update process_message, submit_form, new_chat # to return one more value in the matching position # # New data source -> Same as sub-tab. Always call save_json_artifact() # and always return the 5-tuple shape. # # New agent tool -> Edit tools.py only. Add function to TOOL_FUNCTIONS # dict and schema to TOOL_SCHEMAS list. The raw-SDK # backends pick it up automatically. For LangChain # and LangGraph, also wrap it with @lc_tool in # agent_langchain.py and (if math/info scoped) add # to MATH_TOOLS or INFO_TOOLS in agent_langgraph.py. # # New field in an -> Find the `artifact = {...}` dict in the relevant # existing JSON handler in ZONE 3 and add your key. # # ============================================================================ # ============================================================================ # ZONE 1 — Imports & constants # ============================================================================ import os import json import hashlib from datetime import datetime from dotenv import load_dotenv load_dotenv() # Load environment variables from .env file import gradio as gr import pandas as pd import requests from bs4 import BeautifulSoup from pypdf import PdfReader MAX_CONTEXT_CHARS = 5000 # ============================================================================ # ZONE 2 — Helpers (pure functions, no UI knowledge) # ============================================================================ # These functions take plain Python inputs and return plain Python outputs. # They know nothing about Gradio. Reusable and testable on their own. # # NOTE: the actual LLM orchestration (Workflow and Agent runners, the # MODES dict, the client, and the code snippet builder) lives in agent.py # so that it can be swapped for alternative implementations (LangChain, # LangGraph, etc.) without touching this file. We just import what we need. # ---------------------------------------------------------------- # Agent backend — swappable module # ---------------------------------------------------------------- # ---------------------------------------------------------------- # Agent backends — each file is an independent import. # ALL backend imports are wrapped in try/except so the app boots even # if one file is broken (missing dep, version conflict, import error). # Broken backends are silently hidden from the mode radio at startup and # a warning is printed to the console. At least one backend must load # or the app will show an empty mode list, but the app itself will run. # ---------------------------------------------------------------- BACKENDS = {} # Ringmaster is listed FIRST so it becomes the default selection try: import agent_langgraph_ringmaster BACKENDS[agent_langgraph_ringmaster.BACKEND_NAME] = agent_langgraph_ringmaster except Exception as _rm_err: print(f"[app.py] LangGraph Ringmaster backend unavailable: {_rm_err}") try: import agent_workflow BACKENDS[agent_workflow.BACKEND_NAME] = agent_workflow except Exception as _wf_err: print(f"[app.py] Workflow backend unavailable: {_wf_err}") try: import agent_py BACKENDS[agent_py.BACKEND_NAME] = agent_py except Exception as _py_err: print(f"[app.py] Simple Python Agent backend unavailable: {_py_err}") try: import agent_langchain BACKENDS[agent_langchain.BACKEND_NAME] = agent_langchain except Exception as _lc_err: print(f"[app.py] LangChain backend unavailable: {_lc_err}") try: import agent_langgraph BACKENDS[agent_langgraph.BACKEND_NAME] = agent_langgraph except Exception as _lg_err: print(f"[app.py] LangGraph backend unavailable: {_lg_err}") try: import agent_smolagents BACKENDS[agent_smolagents.BACKEND_NAME] = agent_smolagents except Exception as _sa_err: print(f"[app.py] smolagents backend unavailable: {_sa_err}") try: import agent_crewai BACKENDS[agent_crewai.BACKEND_NAME] = agent_crewai except Exception as _crew_err: print(f"[app.py] CrewAI backend unavailable: {_crew_err}") try: import agent_llama_index BACKENDS[agent_llama_index.BACKEND_NAME] = agent_llama_index except Exception as _li_err: print(f"[app.py] LlamaIndex backend unavailable: {_li_err}") # Fallback so the UI never crashes on an empty BACKENDS dict if not BACKENDS: print("[app.py] WARNING: no backends loaded. Check build logs.") from examples import ML_EXAMPLES from training_data import TRAINING_EXAMPLES from training import ( train_classifier, predict as classifier_predict, cluster_hierarchical, cluster_report, ) try: import vectorstore VECTORSTORE_OK = True except Exception as _vs_err: print(f"[app.py] vectorstore unavailable: {_vs_err}") VECTORSTORE_OK = False import providers # Workbench packages — each is a self-contained LangGraph supervisor workflow. # Wrapped so a broken workbench does not kill the whole app on cold boot. # ============================================================================ # !!! RULE_VIOLATION_6 — DELIBERATE — see COMPLIANCE.md !!! # ---------------------------------------------------------------------------- # Pattern: try/except around module imports + WB_*_OK flags + print fallback. # Reason: A broken workbench folder (wrong upload, missing __init__, syntax # slip after an edit) must NOT bring down the entire Space on cold # boot. Defensive import lets the seven-backend chat, Supervised ML, # Unsupervised ML, and Vector Processing tabs keep working even if # one workbench is broken. # Fix-when: Never. This is the one boundary where graceful degradation is # worth more than strict compliance. Alternative would be pinning # every workbench dependency exhaustively — brittle on HF Spaces. # ============================================================================ try: import workbench_grounded_theory as wb_cgt WB_CGT_OK = True _wb_cgt_err = None except Exception as _e: WB_CGT_OK = False _wb_cgt_err = str(_e) print(f"[app.py] workbench_grounded_theory unavailable: {_wb_cgt_err}") try: import workbench_thematic_analysis as wb_cta WB_CTA_OK = True _wb_cta_err = None except Exception as _e: WB_CTA_OK = False _wb_cta_err = str(_e) print(f"[app.py] workbench_thematic_analysis unavailable: {_wb_cta_err}") try: from workbench_thematic_analysis import phase2_agent PHASE2_AGENT_OK = True _phase2_agent_err = None except Exception as _e: PHASE2_AGENT_OK = False _phase2_agent_err = str(_e) print(f"[app.py] phase2_agent unavailable: {_phase2_agent_err}") try: from phase3_themes import run_phase3_searching_themes PHASE3_OK = True _phase3_err = None except Exception as _e: PHASE3_OK = False _phase3_err = str(_e) print(f"[app.py] phase3_themes unavailable: {_phase3_err}") try: from phase4_review import run_phase4_reviewing_themes PHASE4_OK = True _phase4_err = None except Exception as _e: PHASE4_OK = False _phase4_err = str(_e) print(f"[app.py] phase4_review unavailable: {_phase4_err}") try: from phase5_defining_naming import run_phase5_defining_naming PHASE5_OK = True _phase5_err = None except Exception as _e: PHASE5_OK = False _phase5_err = str(_e) print(f"[app.py] phase5_defining_naming unavailable: {_phase5_err}") try: from phase6_report import run_phase6_producing_report PHASE6_OK = True _phase6_err = None except Exception as _e: PHASE6_OK = False _phase6_err = str(_e) print(f"[app.py] phase6_report unavailable: {_phase6_err}") try: from corpus_compression import run_corpus_compression COMPRESSION_OK = True _compression_err = None except Exception as _e: COMPRESSION_OK = False _compression_err = str(_e) print(f"[app.py] corpus_compression unavailable: {_compression_err}") try: from cluster_labeling import ( build_cluster_table_from_compression, run_iter1, run_iter2, commit_final_labels, LABEL_PROMPT_ITER1, LABEL_PROMPT_ITER2, ) CLUSTER_LABELING_OK = True _cluster_labeling_err = None except Exception as _e: CLUSTER_LABELING_OK = False _cluster_labeling_err = str(_e) print(f"[app.py] cluster_labeling unavailable: {_cluster_labeling_err}") try: import database as db DB_OK = True _db_err = None if DB_OK: DB_OK = db.create_tables() except Exception as _e: DB_OK = False _db_err = str(_e) print(f"[app.py] database unavailable: {_db_err}") try: from phase0_preparation import ( apply_length_filter, apply_noise_strip, apply_hash_dedup, apply_semantic_dedup, run_full_preparation_pipeline, SEMANTIC_DEDUP_AVAILABLE, ) PHASE0_PREP_OK = True _phase0_prep_err = None except Exception as _e: PHASE0_PREP_OK = False _phase0_prep_err = str(_e) print(f"[app.py] phase0_preparation unavailable: {_phase0_prep_err}") # ---------------------------------------------------------------- # FT50 method contracts — paper-cited preconditions per phase. # See method_contracts.py for the full registry. Reviewers can grep # that file for paper citations (e.g. "B&C 2006 p. 84") to see every # place the corresponding constraint is enforced. # ---------------------------------------------------------------- from method_contracts import ( MethodContractError, contracts_as_dicts, check_phase1_familiarization, check_phase0_compression, check_phase2_initial_coding, check_phase3_searching_themes, check_phase4_reviewing_themes, check_phase5_defining_naming, check_phase6_producing_report, check_cgt_phase2_refinement, ) # ---------------------------------------------------------------- # CGT Phase 2 Pattern Refinement — Nelson 2020 Step 2 # ---------------------------------------------------------------- from cgt_phase2_refinement import ( run_pattern_refinement, validate_refinement_table, ) # ---------------------------------------------------------------- # Methodology comparison — reference paper technique vs our 2026 # best-in-class technique, per workbench. Paper-ready Markdown, # downloadable as .md for injection into papers' methods sections. # ---------------------------------------------------------------- from methodology_comparison import COMPARISONS as METHOD_COMPARISONS # ---------------------------------------------------------------- # Artifact writer — every input/run becomes a timestamped JSON file # ---------------------------------------------------------------- def save_json_artifact(data, prefix): ts = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3] path = f"{prefix}_{ts}.json" with open(path, "w") as f: json.dump(data, f, indent=2, default=str, ensure_ascii=False) return path # ---------------------------------------------------------------- # Methodology comparison download handler # ---------------------------------------------------------------- def handle_methodology_comparison_download(workbench_key, downloads_list): """Save the methodology comparison for a workbench as a .md file. Paper-ready Markdown — researcher pastes into the methods section. Args: workbench_key: 'bc', 'gw', or 'cgt' downloads_list: current downloads list (Gradio state) """ dl = list(downloads_list or []) comp = METHOD_COMPARISONS.get(workbench_key) if comp is None: return f"**Unknown workbench key: {workbench_key!r}**", dl, dl ts = datetime.now().strftime("%Y%m%d_%H%M%S") path = f"methodology_comparison_{workbench_key}_{ts}.md" with open(path, "w", encoding="utf-8") as f: f.write(comp.as_markdown()) dl.append(path) return f"**Saved:** `{path}` (ready to paste into paper's methods section)", dl, dl # ---------------------------------------------------------------- # Build outputs for the Results/Visuals tabs from a run result # ---------------------------------------------------------------- def build_outputs(user_message, mode, result): steps_df = pd.DataFrame(result["steps"]) extracted_json = json.dumps(result["extracted"], indent=2) tool_counts = {} for s in result["steps"]: tool_counts[s["tool"]] = tool_counts.get(s["tool"], 0) + 1 if tool_counts: chart_df = pd.DataFrame( [{"tool": k, "count": v} for k, v in tool_counts.items()] ) else: chart_df = pd.DataFrame([{"tool": "(none)", "count": 0}]) # Each backend has its own build_code_snippets — pick the right one. backend = BACKENDS.get(mode) if backend is not None: code_snippet = backend.build_code_snippets(user_message, result["steps"]) else: code_snippet = f"# Unknown backend: {mode}" return steps_df, extracted_json, chart_df, code_snippet # ============================================================================ # ZONE 3 — Action handlers (wired to UI buttons in Zone 5) # ============================================================================ # These are the functions Gradio calls when a button is clicked or a form # is submitted. They read state, call Zone 2 helpers, and return values # that go directly into UI components. # # CONVENTIONS: # - Data source loaders return 5 values: # (preview, status, loaded_context, downloads_state, downloads_files) # - Chat handlers (process_message, submit_form, new_chat) return 8 values: # (chat_history, table_df, extracted_json, chart_df, code_snippet, # downloads_state, downloads_files, empty_string_to_clear_input) # - Clear handlers return only the fields they reset. Never touch downloads. # # ---------------------------------------------------------------- # Data source loaders # Each returns: preview, status, loaded_context, downloads_state, downloads_files # Each saves a timestamped JSON artifact and appends to the downloads list. # ---------------------------------------------------------------- def scrape_url(url, downloads_list): dl = list(downloads_list or []) if not url or not url.strip(): return "", "Nothing loaded.", "", dl, dl resp = requests.get(url.strip(), timeout=15) soup = BeautifulSoup(resp.text, "html.parser") for tag in soup(["script", "style", "noscript"]): tag.decompose() text = soup.get_text(separator=" ", strip=True)[:MAX_CONTEXT_CHARS] status = f"**Loaded:** {url.strip()} — {len(text)} chars" artifact = { "timestamp": datetime.now().isoformat(), "source_type": "web_scrape", "url": url.strip(), "char_count": len(text), "content": text, } path = save_json_artifact(artifact, "scrape") dl.append(path) return text, status, text, dl, dl def extract_pdf(file_obj, downloads_list): dl = list(downloads_list or []) if file_obj is None: return "", "Nothing loaded.", "", dl, dl reader = PdfReader(file_obj.name) text = "\n".join((page.extract_text() or "") for page in reader.pages) text = text[:MAX_CONTEXT_CHARS] status = f"**Loaded:** PDF with {len(reader.pages)} pages — {len(text)} chars" artifact = { "timestamp": datetime.now().isoformat(), "source_type": "pdf_upload", "filename": os.path.basename(file_obj.name), "page_count": len(reader.pages), "char_count": len(text), "content": text, } path = save_json_artifact(artifact, "pdf") dl.append(path) return text, status, text, dl, dl def load_spreadsheet(file_obj, downloads_list): dl = list(downloads_list or []) if file_obj is None: return pd.DataFrame(), "Nothing loaded.", "", dl, dl path_in = file_obj.name if path_in.lower().endswith(".csv"): df = pd.read_csv(path_in) else: df = pd.read_excel(path_in) preview_df = df.head(20) text = df.head(50).to_string()[:MAX_CONTEXT_CHARS] status = f"**Loaded:** {len(df)} rows x {len(df.columns)} columns" artifact = { "timestamp": datetime.now().isoformat(), "source_type": "spreadsheet_upload", "filename": os.path.basename(path_in), "row_count": int(len(df)), "column_count": int(len(df.columns)), "columns": list(df.columns), "rows": df.head(100).to_dict(orient="records"), } path_out = save_json_artifact(artifact, "spreadsheet") dl.append(path_out) return preview_df, status, text, dl, dl def load_ml_examples(downloads_list): """Load the built-in ML paper catalog as context. No upload needed.""" dl = list(downloads_list or []) paper_ids = {e["paper_id"] for e in ML_EXAMPLES} preview_lines = [ f"[{e['label']}] {e['sentence'][:90]}{'...' if len(e['sentence']) > 90 else ''}" f" — {e['paper_title']}, {e['year']}" for e in ML_EXAMPLES[:8] ] preview_lines.append(f"\n... and {max(0, len(ML_EXAMPLES) - 8)} more sentences") preview = "\n".join(preview_lines) status = f"**Loaded:** {len(ML_EXAMPLES)} labeled sentences from {len(paper_ids)} ML papers" context_text = json.dumps(ML_EXAMPLES, indent=2, ensure_ascii=False)[:MAX_CONTEXT_CHARS] artifact = { "timestamp": datetime.now().isoformat(), "source_type": "ml_examples_catalog", "sentence_count": len(ML_EXAMPLES), "paper_count": len(paper_ids), "examples": ML_EXAMPLES, } path = save_json_artifact(artifact, "ml_examples") dl.append(path) return preview, status, context_text, dl, dl # ---------------------------------------------------------------- # Clear handlers — reset only the source-specific fields # ---------------------------------------------------------------- def clear_scrape(): return "", "", "Nothing loaded.", "" def clear_pdf(): return None, "", "Nothing loaded.", "" def clear_spreadsheet(): return None, pd.DataFrame(), "Nothing loaded.", "" def clear_ml_examples(): return "", "Nothing loaded.", "" # ---------------------------------------------------------------- # Training handlers — supervised and unsupervised ML on TRAINING_EXAMPLES # ---------------------------------------------------------------- def handle_train(downloads_list): """Fit a TF-IDF + logistic regression classifier and save the result.""" dl = list(downloads_list or []) trained = train_classifier() # Build a display-friendly confusion matrix dataframe cm_df = pd.DataFrame( trained.confusion, columns=[f"pred:{l}" for l in trained.labels], ) cm_df.insert(0, "actual", trained.labels) status = ( f"**Accuracy:** {trained.accuracy:.1%} \n" f"**Train size:** {trained.train_size}, " f"**Test size:** {trained.test_size}" ) artifact = { "timestamp": datetime.now().isoformat(), "source_type": "supervised_training", "accuracy": trained.accuracy, "train_size": trained.train_size, "test_size": trained.test_size, "labels": trained.labels, "confusion_matrix": trained.confusion, } path = save_json_artifact(artifact, "training") dl.append(path) return trained, status, cm_df, dl, dl def handle_predict(trained, sentence, downloads_list): """Predict the label of a new sentence using a previously trained model.""" dl = list(downloads_list or []) if trained is None: return "Train the classifier first.", dl, dl if not sentence or not sentence.strip(): return "Enter a sentence to predict.", dl, dl result = classifier_predict(trained, sentence.strip()) lines = [ f"**Predicted label:** `{result['predicted_label']}`", f"**Confidence:** {result['confidence']:.1%}", "", "**Class probabilities:**", ] for label, prob in sorted(result["probabilities"].items(), key=lambda x: -x[1]): lines.append(f"- `{label}`: {prob:.1%}") artifact = { "timestamp": datetime.now().isoformat(), "source_type": "supervised_prediction", **result, } path = save_json_artifact(artifact, "prediction") dl.append(path) return "\n".join(lines), dl, dl def handle_cluster(similarity_threshold, min_cluster_size, n_nearest, enable_llm_labels, llm_provider, llm_key, downloads_list): """Parameterized clustering with optional LLM labeling of each cluster. Uses training.cluster_with_params which returns: - cluster_ids per sentence (-1 = noise) - centroids per surviving cluster - n_nearest representative sentences per cluster Then (optionally) sends those representatives to an LLM with a constrained prompt that asks for a short cluster label. """ from training import cluster_with_params as _cwp dl = list(downloads_list or []) sentences = [e["sentence"] for e in TRAINING_EXAMPLES] true_labels = [e["label"] for e in TRAINING_EXAMPLES] result = _cwp( sentences, similarity_threshold=float(similarity_threshold), min_cluster_size=int(min_cluster_size), n_nearest=int(n_nearest), ) cluster_ids = result["cluster_ids"] representatives = result["representatives"] distances = result["distances_to_centroid"] # Build LLM labels if enabled llm_labels = {} llm_error = None if enable_llm_labels and result["n_clusters_found"] > 0: try: client = providers.get_llm_client(llm_provider, llm_key) model_name = providers.get_llm_model(llm_provider) for cid, reps in representatives.items(): rep_sentences = [sentences[i] for i, _d in reps] numbered = "\n".join( f"{k+1}. {s}" for k, s in enumerate(rep_sentences) ) prompt = ( f"The following {len(rep_sentences)} sentences were grouped " f"together by a clustering algorithm. Based ONLY on these " f"sentences, produce a short label (2-5 words) that describes " f"what they have in common. Output ONLY the label, nothing else.\n\n" f"{numbered}\n\nLabel:" ) resp = client.chat.complete( model=model_name, messages=[{"role": "user", "content": prompt}], temperature=0.2, max_tokens=40, ) label = (resp.choices[0].message.content or "").strip() # Trim to first line, cap length label = label.split("\n")[0][:60] llm_labels[cid] = label except Exception as e: llm_error = str(e) # Build sentence-level dataframe sent_rows = [] for idx, sent in enumerate(sentences): cid = cluster_ids[idx] rep_idxs = {i for i, _d in representatives.get(cid, [])} sent_rows.append({ "idx": idx, "sentence": sent, "true_label": true_labels[idx], "cluster_id": "noise" if cid == -1 else str(cid), "cluster_label": llm_labels.get(cid, "") if cid != -1 else "", "is_representative": idx in rep_idxs, "dist_to_centroid": ( round(distances[idx], 4) if distances[idx] is not None else None ), }) sent_df = pd.DataFrame(sent_rows) n_found = result["n_clusters_found"] n_noise = result["n_noise_points"] status_parts = [ f"**Similarity >= {float(similarity_threshold):.2f}**, " f"**min size = {int(min_cluster_size)}**, " f"**N nearest = {int(n_nearest)}**", f"**Found:** {n_found} cluster(s), **Noise:** {n_noise} sentence(s)", ] if enable_llm_labels: if llm_error: status_parts.append(f"**LLM labeling failed:** {llm_error}") else: status_parts.append(f"**LLM labels generated** via {llm_provider}") status = " \n".join(status_parts) artifact = { "timestamp": datetime.now().isoformat(), "source_type": "unsupervised_clustering_parameterized", "algorithm": "Hierarchical Agglomerative", "similarity_threshold": float(similarity_threshold), "min_cluster_size": int(min_cluster_size), "n_nearest": int(n_nearest), "n_clusters_found": n_found, "n_noise_points": n_noise, "llm_provider": llm_provider if enable_llm_labels else None, "llm_labels": {str(k): v for k, v in llm_labels.items()}, "sentences": sent_rows, } path = save_json_artifact(artifact, "clusters_params") dl.append(path) return sent_df, status, dl, dl # ---------------------------------------------------------------- # Workbench handlers — Grounded Theory (Nelson 2020) + Thematic Analysis # ---------------------------------------------------------------- def handle_wb_cgt(user_message, similarity_threshold, min_cluster_size, n_nearest, llm_provider, llm_key, loaded_context, downloads_list): """Run the Computational Grounded Theory supervisor graph. Three-step framework from Nelson 2020. Round 1: Pattern Detection is a real LangGraph node, Pattern Refinement and Pattern Confirmation are placeholders that return 'not yet implemented'. Sentence source resolution: 1. If loaded_context (from the Inputs tab) is non-empty, split it on newlines and use those sentences with true_labels="(unknown)". 2. Otherwise fall back to the built-in TRAINING_EXAMPLES demo corpus with its real ground-truth labels. """ dl = list(downloads_list or []) # !!! RULE_VIOLATION_7 — DELIBERATE — see COMPLIANCE.md !!! if not WB_CGT_OK: return ( pd.DataFrame(), "# Workbench unavailable\n\n" + (_wb_cgt_err or "unknown error"), pd.DataFrame(), dl, dl, ) # ---- Resolve sentence source ---- if loaded_context and loaded_context.strip(): sentences = [s.strip() for s in loaded_context.split("\n") if s.strip()] true_labels = ["(unknown)"] * len(sentences) data_source = "uploaded" else: from training_data import TRAINING_EXAMPLES sentences = [e["sentence"] for e in TRAINING_EXAMPLES] true_labels = [e["label"] for e in TRAINING_EXAMPLES] data_source = "demo" result = wb_cgt.run( user_message=user_message or "Run computational grounded theory on the training data.", similarity_threshold=float(similarity_threshold), min_cluster_size=int(min_cluster_size), n_nearest=int(n_nearest), llm_provider=llm_provider, llm_key=llm_key, ) trace_df = pd.DataFrame(result.get("steps") or []) reply_md = "## Supervisor reply\n\n" + (result.get("reply") or "(empty)") reply_md += f"\n\n*Data source: **{data_source}** ({len(sentences)} sentences)*" det = result.get("detection_result") or {} sentence_rows = det.get("sentence_rows") or [] sentences_df = pd.DataFrame(sentence_rows) if sentence_rows else pd.DataFrame() artifact = { "timestamp": datetime.now().isoformat(), "source_type": "workbench_cgt", "paper": "Nelson 2020 - Computational Grounded Theory", "data_source": data_source, "n_sentences": len(sentences), "parameters": { "similarity_threshold": float(similarity_threshold), "min_cluster_size": int(min_cluster_size), "n_nearest": int(n_nearest), "llm_provider": llm_provider, }, "reply": result.get("reply"), "steps": result.get("steps"), "detection_result": result.get("detection_result"), "refinement_result": result.get("refinement_result"), "confirmation_result": result.get("confirmation_result"), } path = save_json_artifact(artifact, "workbench_cgt") dl.append(path) return trace_df, reply_md, sentences_df, dl, dl # ---------------------------------------------------------------- # CGT Phase 2 Pattern Refinement handlers (Nelson 2020 Step 2) # ---------------------------------------------------------------- def handle_cgt_p2_surface( sentences_df, n_exemplars, reflexive_positioning, llm_provider, llm_key, downloads_list, ): """Surface exemplars per Phase 1 pattern and draft LLM interpretive memos. Contracts (Nelson 2020 + C&R 2022): - Phase 1 output must exist with cluster_id column - at least 1 non-noise cluster - n_exemplars in [1, 20] - reflexive positioning >=20 chars - LLM key present Returns: (refinement_df, status_markdown, downloads_list, downloads_files) """ dl = list(downloads_list or []) empty = pd.DataFrame(columns=[ "pattern_id", "pattern_label", "n_sentences", "exemplars", "llm_memo_draft", "researcher_memo", "verdict", "new_label", ]) # Contract check try: contracts = check_cgt_phase2_refinement( sentences_df=sentences_df, n_exemplars=int(n_exemplars), reflexive_positioning=reflexive_positioning, llm_key=llm_key, ) except MethodContractError as e: return empty, f"**Method contract violation (CGT Phase 2):**\n\n{e}", dl, dl # Run refinement try: result = run_pattern_refinement( sentences_df=sentences_df, n_exemplars=int(n_exemplars), llm_provider=llm_provider or "Mistral", llm_key=llm_key, reflexive_pos=reflexive_positioning or "", ) except Exception as e: return empty, f"**CGT Phase 2 error:** {e}", dl, dl refinement_rows = result["refinement_rows"] if not refinement_rows: return empty, "**No patterns to refine** — Phase 1 produced no non-noise clusters.", dl, dl refinement_df = pd.DataFrame(refinement_rows) # Save "surface" artifact (pre-researcher-edit snapshot) artifact = { "timestamp": datetime.now().isoformat(), "source_type": "cgt_phase2_surface", "methodology": "Nelson 2020 Step 2 — Pattern Refinement (exemplar surfacing + LLM memo draft)", "method_contracts_verified": contracts_as_dicts(contracts), "n_patterns": result["n_patterns"], "n_noise": result["n_noise"], "n_exemplars_per_pattern": int(n_exemplars), "llm_errors": result["llm_errors"], "refinement_rows": refinement_rows, } path = save_json_artifact(artifact, "cgt_phase2_surface") dl.append(path) status = ( f"**Phase 2 exemplars surfaced.** {result['n_patterns']} patterns, " f"{result['n_noise']} noise sentences skipped. " f"LLM memo drafts generated. " f"**Edit `researcher_memo`, `verdict`, and `new_label` columns below**, " f"then click Save." ) if result["llm_errors"]: status += f"\n\n*(LLM errors on {len(result['llm_errors'])} clusters — see artifact)*" return refinement_df, status, dl, dl def handle_cgt_p2_save(refinement_table, reflexive_positioning, downloads_list): """Save the researcher-edited Phase 2 refinement table as artifact. Validates the researcher's edits: every row must have a valid verdict (keep/merge/split/drop/rename), researcher_memo, and new_label for rename/split. """ dl = list(downloads_list or []) if not isinstance(refinement_table, pd.DataFrame): refinement_df = pd.DataFrame(refinement_table) if refinement_table else pd.DataFrame() else: refinement_df = refinement_table.copy() # Validate researcher edits validation = validate_refinement_table(refinement_df) if not validation["ok"]: msg = "**Phase 2 save blocked — fix these before saving:**\n\n" for err in validation["errors"][:10]: msg += f"- {err}\n" if len(validation["errors"]) > 10: msg += f"- ...and {len(validation['errors']) - 10} more\n" return msg, dl, dl refinement_rows = refinement_df.fillna("").to_dict("records") # Verdict tally verdict_counts = {} for r in refinement_rows: v = str(r.get("verdict", "")).strip().lower() verdict_counts[v] = verdict_counts.get(v, 0) + 1 artifact = { "timestamp": datetime.now().isoformat(), "source_type": "cgt_phase2_refinement_saved", "methodology": "Nelson 2020 Step 2 — Pattern Refinement (researcher-approved)", "method_contracts_enforced": ( "See method_contracts.check_cgt_phase2_refinement — enforced at surface time. " "Contracts: Nelson 2020 (Phase 1 output, cluster count, exemplar range), " "C&R 2022 (reflexive positioning), reproducibility." ), "reflexive_positioning": reflexive_positioning or "", "n_patterns_refined": len(refinement_rows), "verdict_tally": verdict_counts, "refinement_rows": refinement_rows, } path = save_json_artifact(artifact, "cgt_phase2_refinement") dl.append(path) tally_str = ", ".join(f"{k}={v}" for k, v in sorted(verdict_counts.items())) return ( f"**Phase 2 refinement saved** ({len(refinement_rows)} patterns). " f"Verdicts: {tally_str}. " f"Artifact: `{path}`.", dl, dl, ) def handle_wb_cta(user_message, max_sentences, llm_provider, llm_key, loaded_context, downloads_list): """Run the Computational Thematic Analysis supervisor graph. Six-phase framework from Braun & Clarke 2006. Round 1: Phase 2 (Generating Initial Codes) is a real LangGraph node, Phases 1, 3, 4, 5, 6 are placeholders that return 'not yet implemented'. Sentence source resolution: same as CGT — loaded_context from Inputs tab first, fall back to TRAINING_EXAMPLES demo corpus. """ dl = list(downloads_list or []) # !!! RULE_VIOLATION_7 — DELIBERATE — see COMPLIANCE.md !!! # Same pattern as above: pairs with RULE_VIOLATION_6 on cold-boot # import failure. if not WB_CTA_OK: return ( pd.DataFrame(), "# Workbench unavailable\n\n" + (_wb_cta_err or "unknown error"), pd.DataFrame(), dl, dl, ) # ---- Resolve sentence source ---- if loaded_context and loaded_context.strip(): sentences = [s.strip() for s in loaded_context.split("\n") if s.strip()] true_labels = ["(unknown)"] * len(sentences) data_source = "uploaded" else: from training_data import TRAINING_EXAMPLES sentences = [e["sentence"] for e in TRAINING_EXAMPLES] true_labels = [e["label"] for e in TRAINING_EXAMPLES] data_source = "demo" result = wb_cta.run( user_message=user_message or "Run reflexive thematic analysis on the training data.", max_sentences_to_code=int(max_sentences), llm_provider=llm_provider, llm_key=llm_key, ) trace_df = pd.DataFrame(result.get("steps") or []) reply_md = "## Supervisor reply\n\n" + (result.get("reply") or "(empty)") reply_md += f"\n\n*Data source: **{data_source}** ({len(sentences)} sentences)*" phase2 = result.get("phase2_initial_codes") or {} coded_rows = phase2.get("coded_rows") or [] codes_df = pd.DataFrame(coded_rows) if coded_rows else pd.DataFrame() artifact = { "timestamp": datetime.now().isoformat(), "source_type": "workbench_cta", "paper": "Braun & Clarke 2006 - Reflexive Thematic Analysis", "data_source": data_source, "n_sentences": len(sentences), "parameters": { "max_sentences_to_code": int(max_sentences), "llm_provider": llm_provider, }, "reply": result.get("reply"), "steps": result.get("steps"), "phase1_familiarization": result.get("phase1_familiarization"), "phase2_initial_codes": result.get("phase2_initial_codes"), "phase3_searching_themes": result.get("phase3_searching_themes"), "phase4_reviewing_themes": result.get("phase4_reviewing_themes"), "phase5_defining_naming": result.get("phase5_defining_naming"), "phase6_producing_report": result.get("phase6_producing_report"), } path = save_json_artifact(artifact, "workbench_cta") dl.append(path) return trace_df, reply_md, codes_df, dl, dl def clear_training(): return None, "Not trained yet.", pd.DataFrame(), "" def clear_clustering(): return pd.DataFrame(), "Not clustered yet." def filter_training_dataset(label): """Filter the training-data dataframe shown in the Supervised Dataset sub-tab.""" if label == "(all)" or not label: return pd.DataFrame(TRAINING_EXAMPLES) return pd.DataFrame([e for e in TRAINING_EXAMPLES if e["label"] == label]) # ============================================================================ # Phase 1 Familiarization handlers — Braun & Clarke 2006, Phase 1 # ============================================================================ # These handlers drive the Phase 1 — Familiarization sub-tab inside CTA. # The flow follows Braun & Clarke's active-reading protocol, implemented # through grounded dialogue partners (Gemini Gems + NotebookLM) plus # researcher confirmation: # 1. Load canonical corpus CSV (L1, L2, L3, L4, sentence_id, sentence) # 2. Researcher runs Familiarization Facilitator dialogue in Gemini, # pastes familiarization notes + transcript + source evidence back # 3. Researcher runs Reflexive Companion dialogue, pastes reflexive # challenges + reflexive positioning + immersion coverage back # 4. Build researcher confirmation table joining corpus with noticings # 5. Researcher edits the table (confirm/refine/reject each noticing) # 6. Save to JSON artifact for Downloads tab # ---------------------------------------------------------------- P1_REQUIRED_COLUMNS = ["L1", "L2", "L3", "L4", "sentence_id", "sentence"] def handle_p1_load_test_csv(downloads_list): """Load the built-in test_phase1.csv for pipeline verification.""" dl = list(downloads_list or []) try: df = pd.read_csv("test_phase1.csv") except Exception as e: return ( [], f"Failed to load test_phase1.csv: {e}", pd.DataFrame(), dl, dl, ) missing = [c for c in P1_REQUIRED_COLUMNS if c not in df.columns] if missing: return ( [], f"test_phase1.csv is missing required columns: {missing}", pd.DataFrame(), dl, dl, ) corpus = df[P1_REQUIRED_COLUMNS].to_dict("records") status = ( f"**Loaded test_phase1.csv** — {len(corpus)} sentences across " f"{df['L1'].nunique()} documents, " f"{df['L2'].nunique()} unique sections." ) return corpus, status, df[P1_REQUIRED_COLUMNS], dl, dl def handle_p1_upload_csv(file_obj, downloads_list): """Load a user-uploaded canonical CSV and write upload-provenance artifact. The provenance artifact is the first link in the reproducibility chain. It contains: SHA-256 hash (for integrity verification), filename, row count, schema, per-hierarchy distribution stats, and sentence previews. A reviewer presented with the artifact can verify that the corpus they receive matches the one that produced downstream results, by recomputing the SHA-256 over the file bytes. """ dl = list(downloads_list or []) if file_obj is None: return [], "No file uploaded.", pd.DataFrame(), dl, dl # Step 1 — read file bytes for hashing (before pandas touches it) try: with open(file_obj.name, "rb") as f: file_bytes = f.read() file_sha256 = hashlib.sha256(file_bytes).hexdigest() file_size_bytes = len(file_bytes) except Exception as e: return [], f"Failed to read file bytes: {e}", pd.DataFrame(), dl, dl # Step 2 — parse CSV try: df = pd.read_csv(file_obj.name) except Exception as e: return [], f"Failed to read CSV: {e}", pd.DataFrame(), dl, dl # Step 3 — validate schema missing = [c for c in P1_REQUIRED_COLUMNS if c not in df.columns] if missing: # Still write a provenance artifact for the FAILED upload, so the # reviewer can see what was attempted and why it was rejected. fail_artifact = { "timestamp": datetime.now().isoformat(), "source_type": "corpus_upload_rejected", "filename": os.path.basename(file_obj.name), "file_sha256": file_sha256, "file_size_bytes": file_size_bytes, "n_rows_attempted": int(len(df)), "detected_columns": list(df.columns), "required_columns": list(P1_REQUIRED_COLUMNS), "missing_columns": missing, "rejection_reason": f"Missing required columns: {missing}", } fail_path = save_json_artifact(fail_artifact, "corpus_upload_rejected") dl.append(fail_path) return ( [], ( f"Uploaded CSV is missing required columns: {missing}. " f"Canonical schema is: {P1_REQUIRED_COLUMNS}. \n" f"Rejection artifact: `{os.path.basename(fail_path)}`" ), pd.DataFrame(), dl, dl, ) # Step 4 — build corpus (only required columns flow downstream) corpus = df[P1_REQUIRED_COLUMNS].to_dict("records") # Step 5 — compute provenance stats (per-hierarchy uniqueness) def _safe_nunique(col_name): if col_name not in df.columns: return 0 return int(df[col_name].fillna("").astype(str).nunique()) # Step 6 — build upload provenance artifact upload_artifact = { "timestamp": datetime.now().isoformat(), "source_type": "corpus_upload", "pipeline_stage": "pre-sampling", "filename": os.path.basename(file_obj.name), "file_sha256": file_sha256, "file_size_bytes": file_size_bytes, "n_rows": int(len(corpus)), "detected_columns": list(df.columns), "required_columns_present": list(P1_REQUIRED_COLUMNS), "hierarchy_distribution": { "n_unique_L1": _safe_nunique("L1"), "n_unique_L2": _safe_nunique("L2"), "n_unique_L3": _safe_nunique("L3"), "n_unique_L4": _safe_nunique("L4"), "n_unique_sentence_id": _safe_nunique("sentence_id"), }, "preview_first_3": [ { "L1": str(r.get("L1", "")), "sentence_id": str(r.get("sentence_id", "")), "sentence_first_120_chars": str(r.get("sentence", ""))[:120], } for r in corpus[:3] ], "integrity_verification_instructions": ( "To verify this corpus matches downstream artifacts, compute " "SHA-256 of the source CSV file and compare to file_sha256 " "above. On Linux: `sha256sum `. On Windows PowerShell: " "`Get-FileHash -Algorithm SHA256`." ), } path = save_json_artifact(upload_artifact, "corpus_upload") dl.append(path) status = ( f"**Loaded uploaded CSV** — {len(corpus)} sentences across " f"{_safe_nunique('L1')} L1 values. \n" f"- File SHA-256: `{file_sha256[:16]}...` (full hash in artifact) \n" f"- Upload provenance: `{os.path.basename(path)}`" ) return corpus, status, df[P1_REQUIRED_COLUMNS], dl, dl def handle_p1_build_validation_table( corpus, facilitator_memo, facilitator_transcript, facilitator_citations, companion_challenges, companion_reflexivity, companion_breadth, ): """Build the researcher confirmation table from corpus + pasted Phase 1 outputs. Strategy: start with every corpus row (L1, L2, L3, L4, sentence_id, sentence), then append empty initial_noticing / researcher_confirmation columns. The researcher edits the table inline to attach initial noticings to specific sentences and mark each one confirm/refine/reject. This is the minimum viable version. A future round will parse the pasted source evidence and auto-populate the initial_noticing column for sentences that were explicitly quoted during the dialogue. """ if not corpus: empty = pd.DataFrame(columns=[ "L1", "L2", "L3", "L4", "sentence_id", "sentence", "initial_noticing", "reflexive_challenge", "researcher_confirmation", "refined_noticing", ]) return empty rows = [] for r in corpus: rows.append({ "L1": r.get("L1", ""), "L2": r.get("L2", ""), "L3": r.get("L3", ""), "L4": r.get("L4", ""), "sentence_id": r.get("sentence_id", ""), "sentence": r.get("sentence", ""), "initial_noticing": "", "reflexive_challenge": "", "researcher_confirmation": "", "refined_noticing": "", }) return pd.DataFrame(rows) def handle_p1_save( corpus, facilitator_memo, facilitator_transcript, facilitator_citations, companion_challenges, companion_reflexivity, companion_breadth, validation_table, downloads_list, ): """Save all Phase 1 outputs as a timestamped JSON artifact.""" dl = list(downloads_list or []) # --- FT50 method contract check (B&C 2006 Phase 1) --- try: contracts = check_phase1_familiarization( corpus=corpus, reflexive_positioning=companion_reflexivity, ) except MethodContractError as e: violation = { "timestamp": datetime.now().isoformat(), "source_type": "method_contract_violation", "phase": "Phase 1 — Familiarization", "error": str(e), "contracts": contracts_as_dicts(e.contracts), } path = save_json_artifact(violation, "contract_violation_phase1") dl.append(path) return f"**Method contract violation (Phase 1):**\n\n{e}", dl, dl # Convert confirmation dataframe to list-of-dicts for JSON if isinstance(validation_table, pd.DataFrame): confirmation_rows = validation_table.fillna("").to_dict("records") else: confirmation_rows = [] artifact = { "timestamp": datetime.now().isoformat(), "source_type": "phase1_familiarization", "methodology": "Braun & Clarke 2006 Phase 1 — Familiarizing Yourself With Your Data", "method_contracts_verified": contracts_as_dicts(contracts), "corpus_size": len(corpus or []), "step1_familiarization_facilitator": { "familiarization_notes": facilitator_memo or "", "active_reading_transcript": facilitator_transcript or "", "source_evidence": facilitator_citations or "", }, "step2_reflexive_companion": { "reflexive_challenges": companion_challenges or "", "reflexive_positioning": companion_reflexivity or "", "dataset_immersion_coverage": companion_breadth or "", }, "step3_researcher_confirmation_table": confirmation_rows, } path = save_json_artifact(artifact, "phase1_familiarization") dl.append(path) status = ( f"**Saved Phase 1 familiarization output** — {len(corpus or [])} corpus sentences, " f"{len(confirmation_rows)} confirmation rows. " f"Artifact: `{path.split('/')[-1]}`" ) return status, dl, dl # ============================================================================ # Phase 2 Initial Coding handlers — Braun & Clarke 2006, Phase 2 # ============================================================================ # Round 1: scaffolding + data flow. Round 2 replaces placeholder agent with # real LangGraph supervisor. Round 3 adds iteration 2/3 + convergence. # # The agent architecture (Round 2) will have 7 tools: # - read_corpus(filter) # - read_phase1_context() # - propose_code(sentence, semantic, latent) # - check_codebook(code_name) # - add_to_codebook(code_name, definition, example) # - flag_for_review(sentence, reason) # - save_iteration(n) # ---------------------------------------------------------------- def handle_p2_refresh_corpus( corpus, facilitator_memo, companion_reflexivity, validation_table, ): """Refresh Phase 2 corpus status + Phase 1 context summary. Phase 2 reads the corpus loaded in Phase 1 (shared state). It also surfaces Phase 1's reflexive positioning and confirmed noticings as context for the agent. """ if not corpus: return ( "**No corpus loaded.** Go to Phase 1 — Familiarization and load " "test_phase1.csv (or your own canonical CSV) first.", "*Phase 1 output will appear here after Save Phase 1.*", ) # Count confirmed noticings from Phase 1 validation table confirmed_count = 0 if isinstance(validation_table, pd.DataFrame) and not validation_table.empty: noticings_col = validation_table.get("initial_noticing") if noticings_col is not None: confirmed_count = sum( 1 for v in noticings_col.fillna("").tolist() if str(v).strip() ) n_docs = len({r.get("L1", "") for r in corpus}) corpus_status = ( f"**Corpus ready** — {len(corpus)} sentences across {n_docs} documents. " f"Inherited from Phase 1 state." ) p1_summary_parts = [] if facilitator_memo and facilitator_memo.strip(): preview = facilitator_memo.strip()[:300] p1_summary_parts.append(f"**Familiarization notes:** {preview}...") if companion_reflexivity and companion_reflexivity.strip(): preview = companion_reflexivity.strip()[:300] p1_summary_parts.append(f"**Reflexive positioning:** {preview}...") p1_summary_parts.append( f"**Confirmed initial noticings:** {confirmed_count} rows with non-empty `initial_noticing`." ) p1_summary = "\n\n".join(p1_summary_parts) if p1_summary_parts else ( "*Phase 1 output will appear here after Save Phase 1.*" ) return corpus_status, p1_summary def handle_p2_run_iteration( iteration_n, corpus, existing_codes_table, existing_codebook_table, facilitator_memo, companion_reflexivity, validation_table, llm_provider, llm_key, orientation, ): """Run one Phase 2 coding iteration via the real LangGraph agent. Strict B&C 2006 Phase 2: - Multiple codes per segment (1-5) - Context window (2 before + 2 after) - Researcher-chosen orientation (semantic OR latent, not both) - Reflexive positioning injected into every code prompt - Researcher override is final """ # Empty corpus guard if not corpus: empty_codes = pd.DataFrame(columns=[ "L1", "L2", "L3", "L4", "sentence_id", "sentence", "ai_code_iter1", "human_code_iter1", "ai_code_iter2", "human_code_iter2", "ai_code_iter3", "human_code_iter3", "final_code", "flagged", ]) empty_codebook = pd.DataFrame(columns=[ "code_name", "definition", "created_by", "provenance", "sentence_count", ]) return ( empty_codes, empty_codebook, "**Cannot run — no corpus loaded.** Load corpus in Phase 1 first.", ) # Agent availability guard if not PHASE2_AGENT_OK: empty_codes = pd.DataFrame(columns=[ "L1", "L2", "L3", "L4", "sentence_id", "sentence", "ai_code_iter1", "human_code_iter1", "ai_code_iter2", "human_code_iter2", "ai_code_iter3", "human_code_iter3", "final_code", "flagged", ]) empty_codebook = pd.DataFrame(columns=[ "code_name", "definition", "created_by", "provenance", "sentence_count", ]) return ( empty_codes, empty_codebook, f"**Phase 2 agent unavailable** — `{_phase2_agent_err}`", ) # API key guard if not llm_key or not str(llm_key).strip(): empty_codes = pd.DataFrame(columns=[ "L1", "L2", "L3", "L4", "sentence_id", "sentence", "ai_code_iter1", "human_code_iter1", "ai_code_iter2", "human_code_iter2", "ai_code_iter3", "human_code_iter3", "final_code", "flagged", ]) empty_codebook = pd.DataFrame(columns=[ "code_name", "definition", "created_by", "provenance", "sentence_count", ]) return ( empty_codes, empty_codebook, "**Cannot run — Mistral API key is missing.** Paste it in the sidebar first.", ) # --- FT50 method contract check (B&C 2006 Phase 2) --- try: contracts = check_phase2_initial_coding( orientation=orientation, corpus=corpus, reflexive_positioning=companion_reflexivity, llm_key=llm_key, iteration_n=int(iteration_n), ) except MethodContractError as e: empty_codes = pd.DataFrame(columns=[ "L1", "L2", "L3", "L4", "sentence_id", "sentence", "ai_code_iter1", "human_code_iter1", "ai_code_iter2", "human_code_iter2", "ai_code_iter3", "human_code_iter3", "final_code", "flagged", ]) empty_codebook = pd.DataFrame(columns=[ "code_name", "definition", "created_by", "provenance", "sentence_count", ]) return ( empty_codes, empty_codebook, f"**Method contract violation (Phase 2):**\n\n{e}", ) # Initialize the codes table (carry forward if it exists) if isinstance(existing_codes_table, pd.DataFrame) and not existing_codes_table.empty: codes_df = existing_codes_table.copy() else: rows = [] for r in corpus: rows.append({ "L1": r.get("L1", ""), "L2": r.get("L2", ""), "L3": r.get("L3", ""), "L4": r.get("L4", ""), "sentence_id": r.get("sentence_id", ""), "sentence": r.get("sentence", ""), "ai_code_iter1": "", "human_code_iter1": "", "ai_code_iter2": "", "human_code_iter2": "", "ai_code_iter3": "", "human_code_iter3": "", "final_code": "", "flagged": "", }) codes_df = pd.DataFrame(rows) # Initialize codebook if isinstance(existing_codebook_table, pd.DataFrame) and not existing_codebook_table.empty: codebook_list = existing_codebook_table.fillna("").to_dict("records") else: codebook_list = [] # Build confirmed_noticings list from Phase 1 validation table confirmed_noticings = [] if isinstance(validation_table, pd.DataFrame) and not validation_table.empty: noticing_col = validation_table.get("initial_noticing") if noticing_col is not None: confirmed_noticings = [ str(v).strip() for v in noticing_col.fillna("").tolist() if str(v).strip() ] # Build agent context agent_context = { "corpus": corpus, "phase1": { "reflexive_positioning": companion_reflexivity or "", "familiarization_notes": facilitator_memo or "", "confirmed_noticings": confirmed_noticings, }, "orientation": orientation or "semantic", "existing_codes_df": codes_df if iteration_n >= 2 else None, "codebook": codebook_list, "proposed_codes": {}, } # Run the agent try: steps, reply, result_context = phase2_agent.run_phase2_iteration( llm_provider=llm_provider, llm_key=llm_key, iteration_n=int(iteration_n), context=agent_context, ) except Exception as e: return ( codes_df, pd.DataFrame(codebook_list) if codebook_list else pd.DataFrame(columns=[ "code_name", "definition", "created_by", "provenance", "sentence_count", ]), f"**Phase 2 agent error:** {e}", ) # Merge agent results into codes_df # New shape: each proposed entry has "codes": [list of 1-5 strings] proposed = result_context.get("proposed_codes", {}) ai_col = f"ai_code_iter{int(iteration_n)}" for idx, code_dict in proposed.items(): if 0 <= int(idx) < len(codes_df): codes_list = code_dict.get("codes", []) or [] if isinstance(codes_list, str): codes_list = [codes_list] combined = ", ".join(c for c in codes_list if c) codes_df.at[int(idx), ai_col] = combined # Update final_code column — latest human edit wins, else latest AI code for i in range(len(codes_df)): final = "" for it in (3, 2, 1): h = codes_df.at[i, f"human_code_iter{it}"] if h and str(h).strip(): final = str(h).strip() break if not final: for it in (3, 2, 1): a = codes_df.at[i, f"ai_code_iter{it}"] if a and str(a).strip(): final = str(a).strip() break codes_df.at[i, "final_code"] = final # Build codebook DataFrame updated_codebook = result_context.get("codebook", []) codebook_df = pd.DataFrame(updated_codebook) if updated_codebook else pd.DataFrame( columns=["code_name", "definition", "created_by", "provenance", "sentence_count"] ) total_codes = sum(len(v.get("codes", [])) for v in proposed.values()) status = ( f"**Iteration {iteration_n} complete** ({orientation} orientation). " f"Coded {len(proposed)} sentences with {total_codes} total codes " f"(avg {total_codes/len(proposed) if proposed else 0:.1f} codes/sentence). " f"Codebook has {len(updated_codebook)} entries. " f"Agent took {len(steps)} steps. " f"Reply: {reply[:200]}" ) return codes_df, codebook_df, status def handle_p2_save( corpus, codes_table, codebook_table, downloads_list, ): """Save Phase 2 outputs as a timestamped JSON artifact.""" dl = list(downloads_list or []) if isinstance(codes_table, pd.DataFrame): codes_rows = codes_table.fillna("").to_dict("records") else: codes_rows = [] if isinstance(codebook_table, pd.DataFrame): codebook_rows = codebook_table.fillna("").to_dict("records") else: codebook_rows = [] artifact = { "timestamp": datetime.now().isoformat(), "source_type": "phase2_initial_coding", "methodology": "Braun & Clarke 2006 Phase 2 — Generating Initial Codes (agentic)", "method_contracts_enforced": ( "See method_contracts.check_phase2_initial_coding — enforced at run time. " "Contracts: B&C 2006 p. 84 (orientation), p. 88 (systematic coverage), " "reflexivity principle (positioning injected), iterative refinement (iter 1-3)." ), "corpus_size": len(corpus or []), "codes_table": codes_rows, "codebook": codebook_rows, } path = save_json_artifact(artifact, "phase2_initial_coding") dl.append(path) # -- Supabase persistence -- db_note = "" if DB_OK: try: # Re-read directly from artifact to avoid empty DataFrame issue codes_to_save = artifact.get("codes_table", []) cb_to_save = artifact.get("codebook", []) n_codes = db.save_coded_sentences(codes_to_save) n_cb = db.save_codebook(cb_to_save) db_note = f" Saved to Supabase: {n_codes} coded rows, {n_cb} codebook entries." except Exception as _dbe: db_note = f" Supabase save failed: {_dbe}" status = ( f"**Saved Final Codes** — {len(codes_rows)} coded sentences, " f"{len(codebook_rows)} codebook entries. Artifact: `{path.split('/')[-1]}`{db_note}" ) return status, dl, dl # ---------------------------------------------------------------- # Phase 3 -- Searching for Themes handlers (Braun & Clarke 2006) # ---------------------------------------------------------------- def handle_p3_run( codebook_table, similarity_threshold, min_cluster_size, orientation, companion_reflexivity, llm_provider, llm_key, downloads_list, ): dl = list(downloads_list or []) empty_themes = pd.DataFrame(columns=[ "theme_id", "candidate_theme_name", "description", "rationale", "member_codes", "code_count", "researcher_theme_name", "researcher_notes", ]) empty_noise = pd.DataFrame(columns=["code_name", "definition"]) if not PHASE3_OK: return (empty_themes, empty_noise, f"**Phase 3 unavailable** -- {_phase3_err}", dl, dl) if codebook_table is None or (isinstance(codebook_table, pd.DataFrame) and codebook_table.empty): return (empty_themes, empty_noise, "**Cannot run Phase 3** -- no codebook. Run Phase 2 first.", dl, dl) key = (llm_key or "").strip() or os.environ.get("MISTRAL_API_KEY", "") if not key: return (empty_themes, empty_noise, "**Cannot run Phase 3** -- Mistral API key missing.", dl, dl) codebook_df = codebook_table.copy() if isinstance(codebook_table, pd.DataFrame) else pd.DataFrame(codebook_table) # --- FT50 method contract check (B&C 2006 Phase 3) --- try: contracts = check_phase3_searching_themes( codebook_table=codebook_df, similarity_threshold=float(similarity_threshold), min_cluster_size=int(min_cluster_size), llm_key=key, ) except MethodContractError as e: return (empty_themes, empty_noise, f"**Method contract violation (Phase 3):**\n\n{e}", dl, dl) try: result = run_phase3_searching_themes( codebook_df=codebook_df, llm_provider=llm_provider or "Mistral", llm_key=key, similarity_threshold=float(similarity_threshold), min_cluster_size=int(min_cluster_size), orientation=orientation or "semantic", reflexive_pos=companion_reflexivity or "", ) except Exception as e: return (empty_themes, empty_noise, f"**Phase 3 error:** {e}", dl, dl) themes_df = pd.DataFrame(result["themes_rows"]) if result["themes_rows"] else empty_themes noise_df = pd.DataFrame(result["noise_codes"]) if result["noise_codes"] else empty_noise artifact = { "timestamp": datetime.now().isoformat(), "source_type": "phase3_searching_themes", "methodology": "Braun & Clarke 2006 Phase 3 -- Searching for Themes", "method_contracts_verified": contracts_as_dicts(contracts), "similarity_threshold": float(similarity_threshold), "min_cluster_size": int(min_cluster_size), "orientation": orientation, "n_themes": result["n_themes"], "n_noise": result["n_noise"], "themes": result["themes_rows"], "noise_codes": result["noise_codes"], } path = save_json_artifact(artifact, "phase3_searching_themes") dl.append(path) status = ( "**Phase 3 complete.** " + str(result["n_themes"]) + " candidate themes from " + str(len(codebook_df)) + " codes. " + str(result["n_noise"]) + " codes in noise bucket. " + "Artifact: `" + path.split("/")[-1] + "`" ) return themes_df, noise_df, status, dl, dl def handle_p3_save(themes_table, noise_table, downloads_list): dl = list(downloads_list or []) themes_rows = themes_table.fillna("").to_dict("records") if isinstance(themes_table, pd.DataFrame) else [] noise_rows = noise_table.fillna("").to_dict("records") if isinstance(noise_table, pd.DataFrame) else [] artifact = { "timestamp": datetime.now().isoformat(), "source_type": "phase3_researcher_confirmed_themes", "methodology": "Braun & Clarke 2006 Phase 3 -- Researcher-confirmed candidate themes", "themes": themes_rows, "noise_codes": noise_rows, } path = save_json_artifact(artifact, "phase3_themes") dl.append(path) # -- Supabase persistence -- db_note = "" if DB_OK: try: n_themes = db.save_themes(themes_rows) db_note = f" Saved to Supabase: {n_themes} themes." except Exception as _dbe: db_note = f" Supabase save failed: {_dbe}" status = ( "**Saved Phase 3 themes** -- " + str(len(themes_rows)) + " themes, " + str(len(noise_rows)) + " noise codes. Artifact: `" + path.split("/")[-1] + "`" + db_note ) return status, dl, dl # ---------------------------------------------------------------- # Phase 4 -- Reviewing Themes handlers (Braun & Clarke 2006) # ---------------------------------------------------------------- def handle_p4_run( themes_table, codes_table, companion_reflexivity, llm_provider, llm_key, downloads_list, ): dl = list(downloads_list or []) empty = pd.DataFrame(columns=[ "theme_id", "theme_name", "member_codes", "code_count", "member_sentence_count", "within_cohesion", "llm_verdict", "llm_reasoning", "llm_action_suggestion", "researcher_verdict", "researcher_action_notes", ]) if not PHASE4_OK: return empty, f"**Phase 4 unavailable** -- {_phase4_err}", dl, dl if themes_table is None or (isinstance(themes_table, pd.DataFrame) and themes_table.empty): return empty, "**Cannot run Phase 4** -- no themes. Run Phase 3 first.", dl, dl key = (llm_key or "").strip() or os.environ.get("MISTRAL_API_KEY", "") if not key: return empty, "**Cannot run Phase 4** -- Mistral API key missing.", dl, dl themes_df = themes_table.copy() if isinstance(themes_table, pd.DataFrame) else pd.DataFrame(themes_table) codes_df = codes_table.copy() if isinstance(codes_table, pd.DataFrame) else pd.DataFrame() # --- FT50 method contract check (B&C 2006 Phase 4) --- try: contracts = check_phase4_reviewing_themes( themes_table=themes_df, codes_table=codes_df, llm_key=key, ) except MethodContractError as e: return empty, f"**Method contract violation (Phase 4):**\n\n{e}", dl, dl try: result = run_phase4_reviewing_themes( themes_df=themes_df, codes_df=codes_df, corpus=[], llm_key=key, llm_provider=llm_provider or "Mistral", reflexive_pos=companion_reflexivity or "", ) except Exception as e: return empty, f"**Phase 4 error:** {e}", dl, dl review_df = pd.DataFrame(result["review_rows"]) if result["review_rows"] else empty artifact = { "timestamp": datetime.now().isoformat(), "source_type": "phase4_reviewing_themes", "methodology": "Braun & Clarke 2006 Phase 4 -- Reviewing Themes", "method_contracts_verified": contracts_as_dicts(contracts), "review_rows": result["review_rows"], "errors": result["errors"], } path = save_json_artifact(artifact, "phase4_reviewing_themes") dl.append(path) warns = result.get("errors", []) warn_note = " " + str(len(warns)) + " errors." if warns else "" status = ( "**Phase 4 complete.** " + str(len(result["review_rows"])) + " themes reviewed." + warn_note + " Artifact: `" + path.split("/")[-1] + "`" ) return review_df, status, dl, dl def handle_p4_save(review_table, downloads_list): dl = list(downloads_list or []) rows = review_table.fillna("").to_dict("records") if isinstance(review_table, pd.DataFrame) else [] artifact = { "timestamp": datetime.now().isoformat(), "source_type": "phase4_researcher_verdicts", "methodology": "Braun & Clarke 2006 Phase 4 -- Researcher-confirmed theme verdicts", "review_rows": rows, } path = save_json_artifact(artifact, "phase4_verdicts") dl.append(path) # -- Supabase persistence -- db_note = "" if DB_OK: try: n_reviews = db.save_theme_reviews(rows) db_note = f" Saved to Supabase: {n_reviews} verdicts." except Exception as _dbe: db_note = f" Supabase save failed: {_dbe}" status = "**Saved Phase 4 verdicts** -- " + str(len(rows)) + " rows. Artifact: `" + path.split("/")[-1] + "`" + db_note return status, dl, dl # ---------------------------------------------------------------- # Phase 5 -- Defining and Naming Themes handlers # ---------------------------------------------------------------- def handle_p5_run( review_table, companion_reflexivity, llm_provider, llm_key, downloads_list, ): dl = list(downloads_list or []) empty = pd.DataFrame(columns=[ "theme_id", "original_name", "final_name", "definition", "scope_note", "narrative_contribution", "member_codes", "code_count", "researcher_final_name", "researcher_definition", ]) if not PHASE5_OK: return empty, f"**Phase 5 unavailable** -- {_phase5_err}", dl, dl if review_table is None or (isinstance(review_table, pd.DataFrame) and review_table.empty): return empty, "**Cannot run Phase 5** -- no theme reviews. Run Phase 4 first.", dl, dl key = (llm_key or "").strip() or os.environ.get("MISTRAL_API_KEY", "") if not key: return empty, "**Cannot run Phase 5** -- Mistral API key missing.", dl, dl review_df = review_table.copy() if isinstance(review_table, pd.DataFrame) else pd.DataFrame(review_table) # --- FT50 method contract check (B&C 2006 Phase 5) --- try: contracts = check_phase5_defining_naming( review_table=review_df, llm_key=key, ) except MethodContractError as e: return empty, f"**Method contract violation (Phase 5):**\n\n{e}", dl, dl try: result = run_phase5_defining_naming( review_df=review_df, llm_key=key, llm_provider=llm_provider or "Mistral", reflexive_pos=companion_reflexivity or "", ) except Exception as e: return empty, f"**Phase 5 error:** {e}", dl, dl def_df = pd.DataFrame(result["definition_rows"]) if result["definition_rows"] else empty artifact = { "timestamp": datetime.now().isoformat(), "source_type": "phase5_defining_naming", "methodology": "Braun & Clarke 2006 Phase 5 -- Defining and Naming Themes", "method_contracts_verified": contracts_as_dicts(contracts), "definition_rows": result["definition_rows"], "skipped": result["skipped"], "errors": result["errors"], } path = save_json_artifact(artifact, "phase5_defining_naming") dl.append(path) skip_note = f" {len(result['skipped'])} themes dropped (verdict=drop)." if result["skipped"] else "" status = ( "**Phase 5 complete.** " + str(len(result["definition_rows"])) + " themes defined." + skip_note + " Artifact: `" + path.split("/")[-1] + "`" ) return def_df, status, dl, dl def handle_p5_save(def_table, downloads_list): dl = list(downloads_list or []) rows = def_table.fillna("").to_dict("records") if isinstance(def_table, pd.DataFrame) else [] artifact = { "timestamp": datetime.now().isoformat(), "source_type": "phase5_researcher_definitions", "methodology": "Braun & Clarke 2006 Phase 5 -- Researcher-confirmed theme definitions", "definition_rows": rows, } path = save_json_artifact(artifact, "phase5_definitions") dl.append(path) status = "**Saved Phase 5 definitions** -- " + str(len(rows)) + " themes. Artifact: `" + path.split("/")[-1] + "`" return status, dl, dl # ---------------------------------------------------------------- # Phase 6 -- Producing the Report handlers # ---------------------------------------------------------------- def handle_p6_run( def_table, codes_table, research_question, companion_reflexivity, corpus, llm_provider, llm_key, downloads_list, ): dl = list(downloads_list or []) if not PHASE6_OK: return "", f"**Phase 6 unavailable** -- {_phase6_err}", dl, dl if def_table is None or (isinstance(def_table, pd.DataFrame) and def_table.empty): return "", "**Cannot run Phase 6** -- no theme definitions. Run Phase 5 first.", dl, dl key = (llm_key or "").strip() or os.environ.get("MISTRAL_API_KEY", "") if not key: return "", "**Cannot run Phase 6** -- Mistral API key missing.", dl, dl def_df = def_table.copy() if isinstance(def_table, pd.DataFrame) else pd.DataFrame(def_table) codes_df = codes_table.copy() if isinstance(codes_table, pd.DataFrame) else pd.DataFrame() corpus_desc = f"{len(corpus or [])} sentences" if corpus else "qualitative corpus" # --- FT50 method contract check (B&C 2006 Phase 6) --- try: contracts = check_phase6_producing_report( def_table=def_df, llm_key=key, ) except MethodContractError as e: return "", f"**Method contract violation (Phase 6):**\n\n{e}", dl, dl try: result = run_phase6_producing_report( definition_df=def_df, codes_df=codes_df, llm_key=key, llm_provider=llm_provider or "Mistral", research_question=research_question or "", reflexive_pos=companion_reflexivity or "", corpus_description=corpus_desc, ) except Exception as e: return "", f"**Phase 6 error:** {e}", dl, dl if result["error"]: return "", f"**Phase 6 error:** {result['error']}", dl, dl artifact = { "timestamp": datetime.now().isoformat(), "source_type": "phase6_producing_report", "methodology": "Braun & Clarke 2006 Phase 6 -- Producing the Report", "method_contracts_verified": contracts_as_dicts(contracts), "theme_count": result["theme_count"], "report_markdown": result["report_markdown"], } path = save_json_artifact(artifact, "phase6_report") dl.append(path) status = ( "**Phase 6 complete.** Report generated for " + str(result["theme_count"]) + " themes. " + "Artifact: `" + path.split("/")[-1] + "`" ) return result["report_markdown"], status, dl, dl def handle_p6_save(report_text, downloads_list): dl = list(downloads_list or []) artifact = { "timestamp": datetime.now().isoformat(), "source_type": "phase6_researcher_report", "methodology": "Braun & Clarke 2006 Phase 6 -- Researcher-edited final report", "report_markdown": report_text or "", } path = save_json_artifact(artifact, "phase6_final_report") # Also save as .md file md_path = path.replace(".json", ".md") with open(md_path, "w") as f: f.write(report_text or "") dl.extend([path, md_path]) status = "**Saved Phase 6 report** -- JSON + Markdown. Artifact: `" + md_path.split("/")[-1] + "`" return status, dl, dl # ---------------------------------------------------------------- # Phase 0 Preparation handlers (Moreno-Ortiz 2023; BERTopic_Teen 2025) # ---------------------------------------------------------------- # Four pre-sampling hygiene steps. Each emits an artifact JSON with # full reproducibility audit + literature citation. # # Data flow: corpus (list-of-dicts from upload) → DataFrame → # noise_strip → length_filter → hash_dedup → semantic_dedup → # DataFrame with frequency_weight col → back to list-of-dicts # (this becomes the input to Phase 0 Sampling). # # All 4 preserve L1/L2/L3/L4/sentence_id/sentence schema. # All 4 add/update frequency_weight (dedup steps merge; other steps # pass through). # ---------------------------------------------------------------- def _corpus_to_df(corpus): """Convert corpus (list-of-dicts) to DataFrame with schema ready.""" if not corpus: return pd.DataFrame(columns=["L1", "L2", "L3", "L4", "sentence_id", "sentence"]) df = pd.DataFrame(corpus) # Ensure required columns exist for col in ["L1", "L2", "L3", "L4", "sentence_id", "sentence"]: if col not in df.columns: df[col] = "" return df def _df_to_corpus(df): """Convert DataFrame back to list-of-dicts for downstream state.""" if df is None or len(df) == 0: return [] return df.fillna("").to_dict("records") def handle_p0prep_length_filter(corpus, min_words, downloads_list): """Drop sentences shorter than min_words. Emit audit artifact.""" dl = list(downloads_list or []) if not PHASE0_PREP_OK: return corpus or [], pd.DataFrame(), f"**Phase 0 Prep unavailable** — {_phase0_prep_err}", dl, dl if not corpus: return [], pd.DataFrame(), "**No corpus loaded.** Upload a CSV first.", dl, dl df_in = _corpus_to_df(corpus) result = apply_length_filter(df_in, min_words=int(min_words)) if "error" in result: return corpus, pd.DataFrame(), f"**Length filter error:** {result['error']}", dl, dl df_out = result["filtered_df"] artifact = { "timestamp": datetime.now().isoformat(), "source_type": "phase0_prep_length_filter", "sub_step": "0.0.1", "methodology": "Moreno-Ortiz & García-Gámez 2023 — length-based filtering", **{k: v for k, v in result.items() if k != "filtered_df"}, } path = save_json_artifact(artifact, "phase0_prep_length_filter") dl.append(path) status = ( f"**Length filter complete** (min_words={min_words}). \n" f"- Input: {result['n_input']} rows \n" f"- Dropped: {result['n_dropped']} (too short) \n" f"- Kept: {result['n_kept']} \n" f"- Word count distribution: min={result['n_words_distribution']['min']}, " f"median={result['n_words_distribution']['median']}, max={result['n_words_distribution']['max']} \n" f"- Artifact: `{path.split('/')[-1]}`" ) return _df_to_corpus(df_out), df_out, status, dl, dl def handle_p0prep_noise_strip(corpus, downloads_list): """Strip URLs, emoji, problematic Unicode. Emit audit artifact.""" dl = list(downloads_list or []) if not PHASE0_PREP_OK: return corpus or [], pd.DataFrame(), f"**Phase 0 Prep unavailable** — {_phase0_prep_err}", dl, dl if not corpus: return [], pd.DataFrame(), "**No corpus loaded.** Upload a CSV first.", dl, dl df_in = _corpus_to_df(corpus) result = apply_noise_strip(df_in) if "error" in result: return corpus, pd.DataFrame(), f"**Noise strip error:** {result['error']}", dl, dl df_out = result["filtered_df"] artifact = { "timestamp": datetime.now().isoformat(), "source_type": "phase0_prep_noise_strip", "sub_step": "0.0.2", "methodology": "Moreno-Ortiz & García-Gámez 2023; BERTopic_Teen 2025 — regex-based hygiene", **{k: v for k, v in result.items() if k != "filtered_df"}, } path = save_json_artifact(artifact, "phase0_prep_noise_strip") dl.append(path) status = ( f"**Noise strip complete.** \n" f"- URLs removed: {result['n_urls_removed']} \n" f"- Emoji removed: {result['n_emoji_removed']} \n" f"- Sentences modified: {result['n_sentences_modified']} \n" f"- Sentences emptied by stripping: {result['n_sentences_emptied']} " f"(run length filter next to drop them) \n" f"- Artifact: `{path.split('/')[-1]}`" ) return _df_to_corpus(df_out), df_out, status, dl, dl def handle_p0prep_hash_dedup(corpus, case_sensitive, downloads_list): """Exact-match dedup with frequency_weight counter. Emit audit artifact.""" dl = list(downloads_list or []) if not PHASE0_PREP_OK: return corpus or [], pd.DataFrame(), f"**Phase 0 Prep unavailable** — {_phase0_prep_err}", dl, dl if not corpus: return [], pd.DataFrame(), "**No corpus loaded.** Upload a CSV first.", dl, dl df_in = _corpus_to_df(corpus) result = apply_hash_dedup(df_in, case_sensitive=bool(case_sensitive)) if "error" in result: return corpus, pd.DataFrame(), f"**Hash dedup error:** {result['error']}", dl, dl df_out = result["filtered_df"] artifact = { "timestamp": datetime.now().isoformat(), "source_type": "phase0_prep_hash_dedup", "sub_step": "0.0.3", "methodology": "Moreno-Ortiz & García-Gámez 2023 — frequency-preserving exact dedup", **{k: v for k, v in result.items() if k != "filtered_df"}, } path = save_json_artifact(artifact, "phase0_prep_hash_dedup") dl.append(path) status = ( f"**Hash deduplication complete.** \n" f"- Input sentences (weighted): {result['n_input']} \n" f"- Unique after dedup: {result['n_unique']} \n" f"- Duplicates merged: {result['n_duplicates_merged']} " f"({result['duplication_rate_pct']}%) \n" f"- Max frequency_weight: {result['max_frequency_weight']} \n" f"- Invariant preserved: {result['invariant_preserved']} " f"(sum of frequency_weight == input count) \n" f"- Artifact: `{path.split('/')[-1]}`" ) return _df_to_corpus(df_out), df_out, status, dl, dl def handle_p0prep_semantic_dedup(corpus, threshold, downloads_list): """MiniLM semantic near-dup merge. Emit audit artifact.""" dl = list(downloads_list or []) if not PHASE0_PREP_OK: return corpus or [], pd.DataFrame(), f"**Phase 0 Prep unavailable** — {_phase0_prep_err}", dl, dl if not corpus: return [], pd.DataFrame(), "**No corpus loaded.** Upload a CSV first.", dl, dl df_in = _corpus_to_df(corpus) result = apply_semantic_dedup(df_in, threshold=float(threshold)) if "error" in result: return corpus, pd.DataFrame(), f"**Semantic dedup error:** {result['error']}", dl, dl df_out = result["filtered_df"] artifact = { "timestamp": datetime.now().isoformat(), "source_type": "phase0_prep_semantic_dedup", "sub_step": "0.0.4", "methodology": "BERTopic_Teen 2025; SemDeDup Abbas 2023 — MiniLM cosine near-duplicate merge", **{k: v for k, v in result.items() if k != "filtered_df"}, } path = save_json_artifact(artifact, "phase0_prep_semantic_dedup") dl.append(path) status = ( f"**Semantic dedup complete** (threshold={threshold}). \n" f"- Input rows: {result.get('n_input_rows', result['n_input'])} \n" f"- Unique after dedup: {result['n_unique']} \n" f"- Near-duplicates merged: {result['n_near_duplicates_merged']} \n" f"- Model: `{result['model']}` \n" f"- Invariant preserved: {result['invariant_preserved']} \n" f"- Artifact: `{path.split('/')[-1]}`" ) return _df_to_corpus(df_out), df_out, status, dl, dl # ---------------------------------------------------------------- # Phase 0 -- Sampling handler (Gauthier & Wallace 2022) # ---------------------------------------------------------------- _PHASE0_EMPTY_COLS = [ "idx", "L1", "L2", "L3", "L4", "sentence_id", "sentence", "cluster_id_original", "cluster_id_refined", "cluster_id", "cluster_fit", "cluster_mean_fit", "cluster_std_fit", "cluster_quality_tier", "split_decision", "cluster_size", "selected", "reason", ] def _build_split_proposal_df(split_proposals: dict, cluster_stats_by_orig: dict) -> pd.DataFrame: """Render LOOSE-cluster split proposals as an editable researcher-review table. Columns: cluster_id_original, cluster_size, std_before, n_sub_proposed, max_std_after, improvement, target_reached, decision (editable). """ if not split_proposals: return pd.DataFrame(columns=[ "cluster_id_original", "cluster_size", "std_before", "n_sub_proposed", "max_std_after", "improvement", "target_reached", "decision", ]) rows = [] for cid, prop in sorted(split_proposals.items()): st = cluster_stats_by_orig.get(cid, {}) rows.append({ "cluster_id_original": int(cid), "cluster_size": int(st.get("size", 0)), "std_before": round(float(st.get("std_fit", 0.0)), 4), "n_sub_proposed": int(prop.get("n_sub", 1)), "max_std_after": round( float(max(prop.get("sub_stds", [0.0]))) if prop.get("sub_stds") else 0.0, 4, ), "improvement": round(float(prop.get("improvement", 0.0)), 4), "target_reached": bool(prop.get("target_reached", False)), "decision": "PENDING", # researcher edits to ACCEPTED / REJECTED }) return pd.DataFrame(rows) def handle_compression_run( corpus, sentences_per_cluster, min_cluster_size, outlier_sample_size, min_cluster_fit, downloads_list, ): """ Phase 0 Sampling — FT50 two-stage design. Stage 1: HDBSCAN initial clustering. Stage 2: Spread diagnostic per cluster (TIGHT/MEDIUM/LOOSE). Stage 3: Agglomerative split PROPOSALS for LOOSE clusters. Researcher reviews in a separate table and accepts/rejects. Stage 4: Stratified sampling at 10% of cluster size (floor = min_cluster_size). First call produces proposals with `decision=PENDING`. Researcher edits the proposal table then clicks "Apply Split Decisions" to re-run Phase 0 with decisions applied (see handle_apply_split_decisions). """ dl = list(downloads_list or []) empty = pd.DataFrame(columns=_PHASE0_EMPTY_COLS) empty_proposals = pd.DataFrame(columns=[ "cluster_id_original", "cluster_size", "std_before", "n_sub_proposed", "max_std_after", "improvement", "target_reached", "decision", ]) if not COMPRESSION_OK: return (empty, empty_proposals, corpus or [], f"**Sampling unavailable** -- {_compression_err}", dl, dl) if not corpus: return (empty, empty_proposals, [], "**No corpus loaded.** Run Phase 0 Preparation first.", dl, dl) # --- FT50 method contract check (G&W 2022 Phase 0) --- try: contracts = check_phase0_compression( corpus=corpus, sentences_per_cluster=int(sentences_per_cluster), min_cluster_size=int(min_cluster_size), outlier_sample_size=int(outlier_sample_size), ) except MethodContractError as e: return (empty, empty_proposals, corpus or [], f"**Method contract violation (Phase 0):**\n\n{e}", dl, dl) try: result = run_corpus_compression( corpus=corpus, sentences_per_cluster=int(sentences_per_cluster), min_cluster_size=int(min_cluster_size), outlier_sample_size=int(outlier_sample_size), min_cluster_fit=float(min_cluster_fit), auto_split_loose=True, split_decisions=None, # first pass: no decisions applied ) except Exception as e: return (empty, empty_proposals, corpus, f"**Sampling error:** {type(e).__name__}: {e}", dl, dl) # Phase 0 Sampling output ends here. No `final_label` column — labels are # produced by the DOWNSTREAM Cluster Labeling stage as its own frozen # artifact. Phase 1 and later stages join the two artifacts at read-time # on cluster_id. This enforces the one-way pipeline: no later stage # mutates this Phase 0 output. comp_df = pd.DataFrame(result["compression_rows"]) if result["compression_rows"] else empty # Build stats-by-original-cluster mapping for proposal table rendering cluster_stats_by_orig: dict[int, dict] = {} for row in result["compression_rows"]: cid = int(row["cluster_id_original"]) if cid == -1: continue if cid not in cluster_stats_by_orig: cluster_stats_by_orig[cid] = { "size": int(row["cluster_size"]), "std_fit": float(row["cluster_std_fit"]), } proposals_df = _build_split_proposal_df( result.get("split_proposals", {}), cluster_stats_by_orig ) quality = result.get("quality_summary", {}) artifact = { "timestamp": datetime.now().isoformat(), "source_type": "phase0_sampling", "methodology": ( "Gauthier & Wallace (2022) computational thematic analysis at scale, " "extended with two-stage clustering (HDBSCAN → Agglomerative refinement " "on LOOSE clusters) and spread-aware stratified sampling for FT50 " "submission. Researcher-in-the-loop review preserves B&C 2021 reflexivity." ), "method_contracts_verified": contracts_as_dicts(contracts), "design": { "stage_1_clustering": "HDBSCAN (Campello, Moulavi, Zimek & Sander 2015, ACM TKDD 10(1):1-51)", "stage_2_spread_diagnostic": "std(cluster_fit) classified TIGHT (<0.15) / MEDIUM (0.15-0.20) / LOOSE (>=0.20)", "stage_3_agglomerative_refinement": "Ward (1963) JASA 58(301):236-244; applied to LOOSE clusters; researcher ACCEPT/REJECT/PENDING", "stage_4_sampling": "Stratified by cluster_fit (50% top / 30% middle / 20% edge), n = max(min_cluster_size, ceil(0.10 × N))", }, "references": { "embedding": "Reimers & Gurevych (2019). Sentence-BERT. EMNLP 2019.", "initial_clustering": "Campello, Moulavi, Zimek & Sander (2015). ACM TKDD 10(1):1-51.", "agglomerative_split": "Ward (1963). JASA 58(301):236-244.", "computational_ta_at_scale": "Gauthier & Wallace (2022). Proc. ACM HCI 6(GROUP) Article 25.", "b_and_c_reflexivity": "Braun & Clarke (2021). Qualitative Research in Psychology.", "researcher_validation": "Carlsen & Ralund (2022). Big Data & Society 9(1).", }, "n_original": result["n_original"], "n_compressed": result["n_compressed"], "n_clusters": result["n_clusters"], "n_outliers": result["n_outliers"], "parameters": { "sentences_per_cluster_legacy": int(sentences_per_cluster), "min_cluster_size": int(min_cluster_size), "outlier_sample_size": int(outlier_sample_size), "min_cluster_fit_threshold": float(min_cluster_fit), "spread_tight_max": 0.15, "spread_medium_max": 0.20, "sample_percentage": 0.10, "stratify_top_middle_edge": [0.50, 0.30, 0.20], }, "quality_summary": quality, "split_proposals_pending_review": [ { "cluster_id_original": int(cid), "n_sub_proposed": int(prop["n_sub"]), "sub_stds": [round(float(s), 4) for s in prop.get("sub_stds", [])], "improvement": round(float(prop.get("improvement", 0.0)), 4), "target_reached": bool(prop.get("target_reached", False)), } for cid, prop in result.get("split_proposals", {}).items() ], "compression_rows": result["compression_rows"], } path = save_json_artifact(artifact, "corpus_compression") dl.append(path) errors_note = " " + "; ".join(result["errors"]) if result["errors"] else "" # Build diagnostic status tight = quality.get("TIGHT", 0) medium = quality.get("MEDIUM", 0) loose = quality.get("LOOSE", 0) flagged = quality.get("n_flagged_for_split", 0) quality_note = ( f" \n- **Cluster quality:** " f"{tight} TIGHT (std<0.15), {medium} MEDIUM (0.15-0.20), " f"**{loose} LOOSE (≥0.20)**" ) if flagged > 0: quality_note += ( f" \n- **{flagged} LOOSE cluster(s) flagged for Agglomerative split review.** " f"See the **Split Proposals** table below and set `decision` to " f"`ACCEPTED` or `REJECTED`, then click **Apply Split Decisions** to re-sample. " f"Committing now will proceed with PENDING decisions (soft-warn, logged in audit)." ) status = ( "**Phase 0 Sampling complete.** " + str(result["n_original"]) + " sentences → " + str(result["n_compressed"]) + " selected across " + str(result["n_clusters"]) + " clusters (" + str(result["n_outliers"]) + " outliers)." + quality_note + errors_note + " \nArtifact: `" + path.split("/")[-1] + "`" ) return comp_df, proposals_df, result["compressed_corpus"], status, dl, dl # ---------------------------------------------------------------- # Apply researcher split decisions (re-runs Phase 0 with decisions) # ---------------------------------------------------------------- def handle_apply_split_decisions( corpus, proposals_df, sentences_per_cluster, min_cluster_size, outlier_sample_size, min_cluster_fit, downloads_list, ): """ Re-run Phase 0 with researcher ACCEPT/REJECT decisions from the proposals table. Each ACCEPTED cluster gets its Agglomerative sub-cluster split applied, producing refined cluster IDs (original*1000 + sub_id). """ dl = list(downloads_list or []) empty = pd.DataFrame(columns=_PHASE0_EMPTY_COLS) empty_proposals = pd.DataFrame(columns=[ "cluster_id_original", "cluster_size", "std_before", "n_sub_proposed", "max_std_after", "improvement", "target_reached", "decision", ]) if not COMPRESSION_OK: return (empty, empty_proposals, corpus or [], f"**Sampling unavailable** -- {_compression_err}", dl, dl) if not corpus: return (empty, empty_proposals, [], "**No corpus loaded.** Run Phase 0 Preparation first.", dl, dl) # Parse decisions out of the proposals dataframe decisions: dict[int, str] = {} if isinstance(proposals_df, pd.DataFrame) and not proposals_df.empty: for _, row in proposals_df.iterrows(): try: cid = int(row["cluster_id_original"]) dec = str(row.get("decision", "PENDING")).upper().strip() if dec in ("ACCEPTED", "REJECTED", "PENDING"): decisions[cid] = dec except Exception: continue try: result = run_corpus_compression( corpus=corpus, sentences_per_cluster=int(sentences_per_cluster), min_cluster_size=int(min_cluster_size), outlier_sample_size=int(outlier_sample_size), min_cluster_fit=float(min_cluster_fit), auto_split_loose=True, split_decisions=decisions, ) except Exception as e: return (empty, empty_proposals, corpus, f"**Re-sampling error:** {type(e).__name__}: {e}", dl, dl) comp_df = pd.DataFrame(result["compression_rows"]) if result["compression_rows"] else empty # Preserve researcher decisions in the proposal table (don't reset) cluster_stats_by_orig: dict[int, dict] = {} for row in result["compression_rows"]: cid = int(row["cluster_id_original"]) if cid == -1: continue if cid not in cluster_stats_by_orig: cluster_stats_by_orig[cid] = { "size": int(row["cluster_size"]), "std_fit": float(row["cluster_std_fit"]), } proposals_out_df = _build_split_proposal_df( result.get("split_proposals", {}), cluster_stats_by_orig ) # Override decision column with researcher's prior decisions if not proposals_out_df.empty: proposals_out_df["decision"] = proposals_out_df["cluster_id_original"].map( lambda c: decisions.get(int(c), "PENDING") ) quality = result.get("quality_summary", {}) n_accepted = quality.get("n_splits_accepted", 0) n_rejected = quality.get("n_splits_rejected", 0) n_pending = quality.get("n_splits_pending", 0) errors_note = " " + "; ".join(result["errors"]) if result["errors"] else "" warn = "" if n_pending > 0: warn = ( f" \n⚠ **{n_pending} split decision(s) still PENDING.** " f"Soft-warn: Phase 0 artifact accepted with pending decisions. " f"Refine before commit if desired." ) status = ( "**Split decisions applied.** " + str(result["n_original"]) + " sentences → " + str(result["n_compressed"]) + " selected across " + str(result["n_clusters"]) + " refined clusters." + f" \n- Splits: {n_accepted} ACCEPTED, {n_rejected} REJECTED, {n_pending} PENDING" + warn + errors_note ) return comp_df, proposals_out_df, result["compressed_corpus"], status, dl, dl # ---------------------------------------------------------------- # LLM cluster labeling handlers — Phase 2 pattern (DataFrame in/out, no state) # ---------------------------------------------------------------- # Matches the B&C Phase 2 handle_p2_run_iteration pattern that works without # flicker: handler takes DataFrames as inputs, returns DataFrames as outputs, # no separate gr.State machinery. Gradio handles the DataFrame round-trip. def handle_label_init_cluster_table(compression_rows_df): """Build cluster-level editing table from compression DataFrame. Returns (cluster_df, status_markdown).""" empty_cluster_df = pd.DataFrame(columns=[ "cluster_id", "cluster_size", "mean_cluster_fit", "top3_sentences_preview", "llm_label_iter1", "researcher_edit_iter1", "llm_label_iter2", "researcher_edit_iter2", "final_label", ]) if not CLUSTER_LABELING_OK: return empty_cluster_df, f"**Cluster labeling unavailable** — {_cluster_labeling_err}" # compression_rows_df comes from live gw_compress_table if isinstance(compression_rows_df, pd.DataFrame): rows = compression_rows_df.to_dict(orient="records") if not compression_rows_df.empty else [] else: rows = list(compression_rows_df or []) if not rows: return empty_cluster_df, "**No sampling rows.** Run Phase 0 first." cluster_rows = build_cluster_table_from_compression(rows) if not cluster_rows: return empty_cluster_df, "**No non-noise clusters to label.**" # Preview column already includes [L1 > sentence_id] provenance # (built by cluster_labeling.build_cluster_table_from_compression) df = pd.DataFrame(cluster_rows) status = ( f"**Cluster Label table initialized.** {len(cluster_rows)} non-noise " f"clusters ready for labeling. \n" f"Next: click *Run Iter 1* to have the LLM draft 2-word labels for " f"every cluster." ) return df, status def handle_label_iter1(cluster_labels_df, compression_rows_df, llm_provider, llm_key, downloads_list): """Run LLM iter1 — labels every cluster. Returns (cluster_df, status_markdown, downloads, downloads).""" dl = list(downloads_list or []) empty_cluster_df = pd.DataFrame(columns=[ "cluster_id", "cluster_size", "mean_cluster_fit", "top3_sentences_preview", "llm_label_iter1", "researcher_edit_iter1", "llm_label_iter2", "researcher_edit_iter2", "final_label", ]) if not CLUSTER_LABELING_OK: return empty_cluster_df, f"**Cluster labeling unavailable** — {_cluster_labeling_err}", dl, dl # Read DataFrames directly if isinstance(cluster_labels_df, pd.DataFrame): cluster_rows = cluster_labels_df.to_dict(orient="records") if not cluster_labels_df.empty else [] else: cluster_rows = list(cluster_labels_df or []) if isinstance(compression_rows_df, pd.DataFrame): comp_rows = compression_rows_df.to_dict(orient="records") if not compression_rows_df.empty else [] else: comp_rows = list(compression_rows_df or []) # Auto-build cluster table if researcher didn't click Init first if not cluster_rows: cluster_rows = build_cluster_table_from_compression(comp_rows) if not cluster_rows: return empty_cluster_df, "**No cluster rows.** Run Phase 0 + Init cluster table first.", dl, dl # Preview column already includes [L1 > sentence_id] provenance # (built by cluster_labeling.build_cluster_table_from_compression) # Validate the UI key field is populated — the LLM API key field on the # landing page is the ONLY source. If it's empty, tell the user directly. key = (llm_key or "").strip() if not key: return pd.DataFrame(cluster_rows), ( "**Iter 1 failed: LLM API key missing.** " "Paste your Mistral key in the **LLM API key** field at the top of the page, " "then click ② Run Iter 1 again." ), dl, dl try: result = run_iter1( cluster_rows=cluster_rows, compression_rows=comp_rows, llm_provider=llm_provider, llm_key=key, ) except Exception as e: return pd.DataFrame(cluster_rows), f"**Iter 1 error:** {e}", dl, dl if result.get("errors") and result.get("n_labeled", 0) == 0: err = "; ".join(result["errors"]) return pd.DataFrame(cluster_rows), f"**Iter 1 failed:** {err}", dl, dl updated = result["updated_cluster_rows"] df = pd.DataFrame(updated) if updated else empty_cluster_df artifact = { "timestamp": datetime.now().isoformat(), "source_type": "phase0_cluster_labels_iter1", "iteration": 1, "llm_model": result.get("model_name"), "temperature": 0.0, "prompt_template": result.get("prompt_template"), "label_length_constraint": "exactly 2 words", "scope": "every non-noise cluster", "n_labeled": result.get("n_labeled", 0), "n_errors": result.get("n_errors", 0), "errors": result.get("errors", []), "per_cluster_audit": result.get("audit", []), } path = save_json_artifact(artifact, "cluster_labels_iter1") dl.append(path) n_labeled = result.get("n_labeled", 0) n_errors = result.get("n_errors", 0) model = result.get("model_name", "unknown") err_line = f" \n- LLM errors: {n_errors}" if n_errors else "" status = ( f"**Iter 1 complete.** {n_labeled} clusters labeled (strict 2-word) via {model} " f"(temperature 0.0). \n" f"- Review `llm_label_iter1` — type into `researcher_edit_iter1` where you want to refine \n" f"- Then click ③ Run Iter 2 for an interpretive second pass on all clusters" f"{err_line} \n" f"Artifact: `{path.split('/')[-1]}`" ) return df, status, dl, dl def handle_label_iter2(cluster_labels_df, compression_rows_df, llm_provider, llm_key, downloads_list): """Run LLM iter2 on flagged clusters only.""" dl = list(downloads_list or []) empty_cluster_df = pd.DataFrame(columns=[ "cluster_id", "cluster_size", "mean_cluster_fit", "top3_sentences_preview", "llm_label_iter1", "researcher_edit_iter1", "llm_label_iter2", "researcher_edit_iter2", "final_label", ]) if not CLUSTER_LABELING_OK: return empty_cluster_df, f"**Cluster labeling unavailable** — {_cluster_labeling_err}", dl, dl if isinstance(cluster_labels_df, pd.DataFrame): cluster_rows = cluster_labels_df.to_dict(orient="records") if not cluster_labels_df.empty else [] else: cluster_rows = list(cluster_labels_df or []) if isinstance(compression_rows_df, pd.DataFrame): comp_rows = compression_rows_df.to_dict(orient="records") if not compression_rows_df.empty else [] else: comp_rows = list(compression_rows_df or []) if not cluster_rows: return empty_cluster_df, "**No cluster rows.** Run iter 1 first.", dl, dl # Validate the UI key field is populated — same as iter1 key = (llm_key or "").strip() if not key: return pd.DataFrame(cluster_rows), ( "**Iter 2 failed: LLM API key missing.** " "Paste your Mistral key in the **LLM API key** field at the top of the page, " "then click ③ Run Iter 2 again." ), dl, dl try: result = run_iter2( cluster_rows=cluster_rows, compression_rows=comp_rows, llm_provider=llm_provider, llm_key=key, ) except Exception as e: return pd.DataFrame(cluster_rows), f"**Iter 2 error:** {e}", dl, dl if result.get("errors") and result.get("n_refined", 0) == 0: err = "; ".join(result["errors"]) return pd.DataFrame(cluster_rows), f"**Iter 2 skipped/failed:** {err}", dl, dl updated = result["updated_cluster_rows"] df = pd.DataFrame(updated) if updated else empty_cluster_df artifact = { "timestamp": datetime.now().isoformat(), "source_type": "phase0_cluster_labels_iter2", "iteration": 2, "llm_model": result.get("model_name"), "temperature": 0.0, "prompt_template": result.get("prompt_template"), "label_length_constraint": "2-4 words max", "scope": "all clusters (interpretive re-label)", "n_refined": result.get("n_refined", 0), "n_errors": result.get("n_errors", 0), "errors": result.get("errors", []), "per_cluster_audit": result.get("audit", []), } path = save_json_artifact(artifact, "cluster_labels_iter2") dl.append(path) n_refined = result.get("n_refined", 0) n_errors = result.get("n_errors", 0) model = result.get("model_name", "unknown") err_line = f" \n- LLM errors: {n_errors}" if n_errors else "" status = ( f"**Iter 2 complete.** {n_refined} clusters re-labeled with interpretive " f"prompt via {model} (temp 0.0). \n" f"- Review `llm_label_iter2` against `llm_label_iter1` — do they differ? Which is stronger? \n" f"- Optionally type into `researcher_edit_iter2` to refine further \n" f"- **Then type the winning label into `final_label` for every cluster** \n" f"- Click *Commit Final Labels* when ALL final_labels are filled{err_line} \n" f"Artifact: `{path.split('/')[-1]}`" ) return df, status, dl, dl def handle_label_commit_final(cluster_labels_df, compression_rows_df, downloads_list): """Commit researcher's final labels. ONE-WAY PIPELINE. Produces a frozen cluster-level artifact: {cluster_id → final_label, choice_source, candidates} Does NOT mutate the Phase 0 Sampling Table (compression_rows). The Sampling Table is Phase 0's frozen output; this handler only writes its own artifact. Phase 1 and downstream stages join the two frozen artifacts at read-time on cluster_id. Returns: (cluster_df, status_markdown, downloads_state, downloads_files_out) — 4 outputs. Sampling Table is NOT in outputs. """ dl = list(downloads_list or []) empty_cluster_df = pd.DataFrame(columns=[ "cluster_id", "cluster_size", "mean_cluster_fit", "top3_sentences_preview", "llm_label_iter1", "researcher_edit_iter1", "llm_label_iter2", "researcher_edit_iter2", "final_label", ]) if not CLUSTER_LABELING_OK: return (empty_cluster_df, f"**Cluster labeling unavailable** — {_cluster_labeling_err}", dl, dl) if isinstance(cluster_labels_df, pd.DataFrame): cluster_rows = cluster_labels_df.to_dict(orient="records") if not cluster_labels_df.empty else [] else: cluster_rows = list(cluster_labels_df or []) if isinstance(compression_rows_df, pd.DataFrame): comp_rows = compression_rows_df.to_dict(orient="records") if not compression_rows_df.empty else [] else: comp_rows = list(compression_rows_df or []) if not cluster_rows: return empty_cluster_df, "**No cluster rows.** Run iter 1 first.", dl, dl try: result = commit_final_labels(cluster_rows, comp_rows) except Exception as e: return (pd.DataFrame(cluster_rows), f"**Commit error:** {e}", dl, dl) # Validation failure: some final_labels blank — no artifact written, no propagation validation_error = result.get("validation_error") if validation_error: cluster_df = pd.DataFrame(cluster_rows) if cluster_rows else empty_cluster_df return ( cluster_df, f"**Commit blocked.** {validation_error}", dl, dl, ) updated_cluster = result["updated_cluster_rows"] cluster_df = pd.DataFrame(updated_cluster) if updated_cluster else empty_cluster_df artifact = { "timestamp": datetime.now().isoformat(), "source_type": "cluster_labels_final", "pipeline_stage": "cluster_labeling (downstream of phase0_sampling)", "methodology": ( "For each cluster, researcher reviewed 4 candidate labels " "(llm_label_iter1 strict + researcher_edit_iter1 + llm_label_iter2 interpretive + " "researcher_edit_iter2) and typed authoritative final_label. " "Commit rejects blanks — every final_label is researcher-authored. " "Per Braun & Clarke (2006) 'themes are actively developed by the researcher.'" ), "pipeline_contract": ( "This artifact is a frozen cluster-level mapping. " "Phase 0 Sampling's output (sentences with cluster_id) is NOT mutated. " "Downstream stages join on cluster_id at read-time." ), "n_committed": result.get("n_committed", 0), "n_blank": result.get("n_blank", 0), "source_distribution": result.get("source_distribution", {}), "cluster_id_to_final_label": { str(a["cluster_id"]): a["final_label"] for a in result.get("audit", []) }, "per_cluster_resolution": result.get("audit", []), } path = save_json_artifact(artifact, "cluster_labels_final") dl.append(path) n_committed = result.get("n_committed", 0) source_dist = result.get("source_distribution", {}) dist_lines = [] label_map = { "llm_label_iter1": "LLM iter1 (strict)", "researcher_edit_iter1": "your iter1 edit", "llm_label_iter2": "LLM iter2 (interpretive)", "researcher_edit_iter2": "your iter2 edit", "custom_5th_option": "custom (none of 4 candidates)", } for src_key, friendly in label_map.items(): n = source_dist.get(src_key, 0) if n: dist_lines.append(f" - From **{friendly}**: {n}") dist_text = "\n".join(dist_lines) if dist_lines else " - (no breakdown available)" status = ( f"**Final labels committed.** {n_committed} clusters labeled. \n" f"- Frozen artifact: cluster_id → final_label mapping \n" f"- Phase 0 Sampling Table above is **unchanged** (one-way pipeline) \n" f"- Phase 1 and downstream stages will join on `cluster_id` at read-time \n\n" f"**Source distribution of researcher's choices:** \n" f"{dist_text} \n\n" f"Artifact: `{path.split('/')[-1]}`" ) return cluster_df, status, dl, dl def handle_vectorize_preview(embedding_provider, embedding_key, downloads_list): """Compute embeddings for the first 10 training sentences and show them.""" dl = list(downloads_list or []) if not VECTORSTORE_OK: return pd.DataFrame(), "vectorstore unavailable — check build logs", dl, dl try: rows = vectorstore.preview_vectors( n=10, embedding_provider=embedding_provider, embedding_api_key=embedding_key, ) except Exception as e: return ( pd.DataFrame(), f"Embedding failed on provider `{embedding_provider}`: {e}", dl, dl, ) df = pd.DataFrame(rows) status = ( f"**Embedding provider:** `{embedding_provider}` \n" f"**Vector dim:** {rows[0]['vector_dim'] if rows else '?'} \n" f"Showing first 10 sentences with the first 8 of the vector dimensions." ) artifact = { "timestamp": datetime.now().isoformat(), "source_type": "vectorize_preview", "embedding_provider": embedding_provider, "preview_rows": rows, } path = save_json_artifact(artifact, "vectors_preview") dl.append(path) return df, status, dl, dl def handle_vector_index(embedding_provider, embedding_key, downloads_list): """Embed all 100 sentences and write them to ChromaDB.""" dl = list(downloads_list or []) if not VECTORSTORE_OK: return "vectorstore unavailable — check build logs", dl, dl try: result = vectorstore.index_training_data( embedding_provider=embedding_provider, embedding_api_key=embedding_key, ) except Exception as e: return ( f"Indexing failed on provider `{embedding_provider}`: {e}", dl, dl, ) status = ( f"**Indexed {result['indexed']} sentences** into ChromaDB collection " f"`{result['collection_name']}`. \n" f"**Vector dim:** {result['vector_dim']} \n" f"**Embedding provider:** `{result['embedding_provider']}` \n" f"**Embedding model:** `{result['embedding_model']}` \n" f"**Persist dir:** `{result['persist_dir']}`" ) artifact = { "timestamp": datetime.now().isoformat(), "source_type": "vector_index", **result, } path = save_json_artifact(artifact, "vector_index") dl.append(path) return status, dl, dl def handle_vector_search(query, n_results, embedding_provider, embedding_key, downloads_list): """Semantic search — embed query and retrieve top-N nearest sentences.""" dl = list(downloads_list or []) if not VECTORSTORE_OK: return pd.DataFrame(), "vectorstore unavailable — check build logs", dl, dl if not query or not query.strip(): return pd.DataFrame(), "Enter a query to search.", dl, dl try: hits = vectorstore.search( query.strip(), n_results=int(n_results), embedding_provider=embedding_provider, embedding_api_key=embedding_key, ) except Exception as e: return ( pd.DataFrame(), f"Search failed on provider `{embedding_provider}`: {e}", dl, dl, ) if not hits: return ( pd.DataFrame(), "No results. Have you indexed the collection yet? " "Click 'Index all 100 sentences' in the Vector DB tab first. " "Note: indexing and searching must use the SAME embedding provider " "because vector dimensions differ between providers.", dl, dl, ) df = pd.DataFrame([ { "rank": i + 1, "similarity": round(h["similarity"], 4), "label": h["label"], "sentence": h["sentence"], } for i, h in enumerate(hits) ]) status = f"**Query:** `{query}` — found {len(hits)} nearest neighbors" artifact = { "timestamp": datetime.now().isoformat(), "source_type": "vector_search", "query": query, "n_results": int(n_results), "embedding_provider": embedding_provider, "hits": hits, } path = save_json_artifact(artifact, "vector_search") dl.append(path) return df, status, dl, dl def handle_vector_clear(downloads_list): """Drop all rows from the Chroma collection.""" dl = list(downloads_list or []) if not VECTORSTORE_OK: return "vectorstore unavailable", dl, dl result = vectorstore.clear_collection() stats = vectorstore.collection_stats() status = f"**Cleared {result['cleared']} vectors.** Collection now has {stats['count']} rows." return status, dl, dl def clear_vectorize_preview(): return pd.DataFrame(), "Click 'Preview embeddings' to see sentence vectors." # ---------------------------------------------------------------- # Main chat handler # ---------------------------------------------------------------- # Only the two raw-SDK backends (Workflow, Simple Python Agent) respect # the chosen LLM provider. Framework backends are pinned to Mistral # because each framework wires its LLM differently and swapping them # per-provider is a larger rewrite. PROVIDER_AWARE_BACKENDS = {"Workflow", "Simple Python Agent"} def process_message(user_message, mode, llm_provider, llm_key, chat_history, loaded_context, downloads_list): dl = list(downloads_list or []) if not user_message or not user_message.strip(): return chat_history, pd.DataFrame(), "", pd.DataFrame(), "", dl, dl, "" backend = BACKENDS.get(mode) if backend is None: return chat_history, pd.DataFrame(), "", pd.DataFrame(), \ f"# Unknown backend: {mode}", dl, dl, "" # Framework backends always use Mistral; raw-SDK backends use chosen provider effective_provider = llm_provider if mode in PROVIDER_AWARE_BACKENDS else "Mistral" try: if mode in PROVIDER_AWARE_BACKENDS: client = backend.get_client(llm_key, provider=effective_provider) else: client = backend.get_client(llm_key) except Exception as e: err = f"# Could not create client for {effective_provider}: {e}" return chat_history, pd.DataFrame(), "", pd.DataFrame(), err, dl, dl, "" # ---------------------------------------------------------------- # Dispatch: ringmaster-aware backend vs legacy backend # ---------------------------------------------------------------- is_ringmaster = hasattr(backend, "run_ringmaster") if is_ringmaster: # Ringmaster receives the raw user message plus a context dict # holding session state. The supervisor calls check_data_status # as its first tool, so we must NOT prefix the message with the # loaded data the way legacy backends do. ringmaster_context = { "loaded_context": loaded_context or "", "llm_provider": effective_provider, "llm_key": llm_key or "", "cgt_result": None, "cta_result": None, } try: result = backend.run_ringmaster(client, user_message, ringmaster_context) except Exception as e: err_reply = f"(error from {mode} / {effective_provider}: {e})" new_history = (chat_history or []) + [ {"role": "user", "content": user_message}, {"role": "assistant", "content": err_reply}, ] return new_history, pd.DataFrame(), "", pd.DataFrame(), "", dl, dl, "" else: # Legacy path: prefix loaded_context into the message text, call # backend.run(client, message) or backend.run(client, message, provider=...) if loaded_context: effective_message = ( f"Available data:\n{loaded_context[:MAX_CONTEXT_CHARS]}\n\n" f"User question: {user_message}" ) else: effective_message = user_message try: if mode in PROVIDER_AWARE_BACKENDS: result = backend.run(client, effective_message, provider=effective_provider) else: result = backend.run(client, effective_message) except Exception as e: err_reply = f"(error from {mode} / {effective_provider}: {e})" new_history = (chat_history or []) + [ {"role": "user", "content": user_message}, {"role": "assistant", "content": err_reply}, ] return new_history, pd.DataFrame(), "", pd.DataFrame(), "", dl, dl, "" new_history = (chat_history or []) + [ {"role": "user", "content": user_message}, {"role": "assistant", "content": result["reply"]}, ] steps_df, extracted_json, chart_df, code_snippet = build_outputs( user_message, mode, result ) # For the artifact log, record what was actually sent to the backend. # Ringmaster receives the raw user_message; legacy backends may receive # the prefixed effective_message. logged_effective = effective_message if not is_ringmaster else user_message run_artifact = { "timestamp": datetime.now().isoformat(), "source_type": f"chat_run_{mode.lower()}", "mode": mode, "llm_provider": effective_provider, "user_message": user_message, "effective_message": logged_effective, "reply": result["reply"], "steps": result["steps"], "extracted": result["extracted"], } run_path = save_json_artifact(run_artifact, f"run_{mode.lower()}") dl.append(run_path) return ( new_history, steps_df, extracted_json, chart_df, code_snippet, dl, dl, "", ) # ---------------------------------------------------------------- # Form submission — saves a form JSON, then routes through process_message # ---------------------------------------------------------------- def submit_form(task_type, operation, num_a, num_b, city, notes, mode, llm_provider, llm_key, chat_history, loaded_context, downloads_list): dl = list(downloads_list or []) form_artifact = { "timestamp": datetime.now().isoformat(), "source_type": "form_submission", "task_type": task_type, "operation": operation, "number_a": num_a, "number_b": num_b, "city": city, "notes": notes, } form_path = save_json_artifact(form_artifact, "form") dl.append(form_path) builders = { "Math": lambda: f"Calculate {num_a} {operation.lower()} {num_b}", "Weather": lambda: f"What is the weather in {city}?", "General": lambda: notes or "Hello", } user_message = builders[task_type]() return process_message(user_message, mode, llm_provider, llm_key, chat_history, loaded_context, dl) def clear_form(): return "Math", "Add", 0, 0, "", "" def new_chat(downloads_list): dl = list(downloads_list or []) return [], pd.DataFrame(), "", pd.DataFrame(), "", dl, dl, "" # ============================================================================ # ZONE 4 — UI definition (gr.Blocks) # ============================================================================ # Layout tree: # Row # +-- Column (sidebar): settings, mode, new chat, tab guide # +-- Column (main): # +-- Chatbot (display) # +-- Row: chat_input + send_btn # +-- Tabs (top-level) # +-- Data sources (Tab) # | +-- Tabs (inner) # | +-- Web scraping # | +-- PDF upload # | +-- CSV / Excel upload # +-- Form (Tab) # +-- Results (Tab) # | +-- Tabs (inner) # | +-- Table # | +-- Code # | +-- Extracted # +-- Visuals (Tab) # +-- Downloads (Tab) # # TWO gr.State OBJECTS persist values across clicks: # loaded_context_state -> text from the last loaded data source # downloads_state -> list of file paths, grows as artifacts are created # ---------------------------------------------------------------- # UI # ---------------------------------------------------------------- with gr.Blocks( theme=gr.themes.Soft(primary_hue="orange"), title="Agentic AI Systems for Large Scale Content Analysis", css=""" #main_chatbot { border: 1px solid #e0e0e0; border-radius: 8px; padding: 4px; } #send_btn { min-height: 42px !important; max-height: 42px !important; height: 42px !important; } #chat_input textarea { min-height: 42px !important; max-height: 42px !important; } #desc_block p, #desc_block small { font-size: 0.78rem; line-height: 1.3; } #desc_block { margin-bottom: 2px; } #sidebar_block label { font-size: 0.78rem !important; } #sidebar_block .wrap { padding: 4px 6px !important; } #sidebar_block input { font-size: 0.78rem !important; padding: 4px !important; } .sidebar-label { font-size: 0.72rem; color: #666; margin: 2px 0 0 0; line-height: 1.2; } /* Force all tab navs to wrap to multiple rows */ .tab-nav, .tab-nav-container, [class*="tab-nav"], [class*="tab_nav"], div[role="tablist"], .tabs > div:first-child, .tabs > div.tab-nav { display: flex !important; flex-wrap: wrap !important; overflow: visible !important; overflow-x: visible !important; overflow-y: visible !important; max-width: 100% !important; width: 100% !important; white-space: normal !important; gap: 3px !important; height: auto !important; min-height: auto !important; position: relative !important; } /* Hide overflow menu buttons aggressively */ button[aria-label*="verflow"], button[aria-label*="More"], button[aria-label*="more"], button[title*="More"], button[title*="verflow"], .tab-nav-overflow, [class*="overflow-button"], [class*="overflow"] > button:last-child, .tabs button:has(svg):last-child { display: none !important; visibility: hidden !important; width: 0 !important; } /* All tab buttons - FORCE visible, compact */ .tab-nav button, [role="tab"], div[role="tablist"] > button { background: #f5f5f5 !important; border: 1px solid #d0d0d0 !important; border-radius: 6px !important; margin: 1px !important; padding: 3px 7px !important; font-size: 0.72rem !important; font-weight: 500 !important; line-height: 1.15 !important; min-width: auto !important; max-width: none !important; white-space: pre-line !important; flex: 0 0 auto !important; height: auto !important; min-height: auto !important; display: inline-block !important; visibility: visible !important; opacity: 1 !important; color: #111 !important; } .tab-nav button.selected, [role="tab"][aria-selected="true"] { background: #e8621a !important; color: white !important; border: 1px solid #e8621a !important; } """ ) as demo: gr.Markdown("

Agentic AI Systems for Large Scale Content Analysis

") gr.Markdown("

Where Agentic AI Meets Qualitative Research — Thematic Analysis, Grounded Theory and Machine Learning

") gr.Markdown("

AI-First User Interface in the Age of Agents and Chatbots

") with gr.Row(elem_id="desc_block"): with gr.Column(scale=1): gr.Markdown( "" "**Agent Progression** — Raw Python → LangChain → LangGraph Supervisor → smolagents → CrewAI Multi-Agent → LlamaIndex \n" "**Web Scraping at Scale** — Agentic URL scraper, PDF loader, spreadsheet loader, real-time web search \n" "**Embedding-Based Supervised ML** — sentence embeddings → text classifier → accuracy evaluation → prediction \n" "**Embedding-Based Unsupervised ML** — sentence embeddings → hierarchical clustering → silhouette scoring → LLM cluster labelling" "" ) with gr.Column(scale=1): gr.Markdown( "**🔬 Researcher Workbench** \n" "**⚡ Agentic Computational Thematic Analysis** — 6-phase: familiarize → code → themes → review → define → report \n" "**⚡ Agentic Computational Grounded Theory** — pattern detection → refinement → confirmation \n" "" "Refs: Braun & Clarke (2006) QRP 3(2); Gauthier & Wallace (2022) PACMHCI 6(GROUP); " "Nelson (2020) SMR 49(1); Carlsen & Ralund (2022) BDS 9(1); Glaser & Strauss (1967)." "" ) loaded_context_state = gr.State("") downloads_state = gr.State([]) trained_state = gr.State(None) # ------------------------------------------------------------------------ # Per-workbench corpus states — methodological isolation (FT50 Priority 2a). # Each of the three workbenches owns its own corpus so a CSV loaded in one # does NOT appear in another. Replaces the former shared corpus state. # ------------------------------------------------------------------------ bc_corpus_state = gr.State([]) # Braun & Clarke (reflexive TA) workbench corpus gw_corpus_state = gr.State([]) # Gauthier & Wallace (TA at scale) workbench corpus gw_approved_corpus_state = gr.State([]) # G&W Phase 1 output — feeds Phase 2-6 cgt_corpus_state = gr.State([]) # Nelson + Carlsen & Ralund (grounded theory) corpus with gr.Row(): # ---------------- Sidebar ---------------- with gr.Column(scale=1, min_width=220): new_chat_btn = gr.Button("+ New chat", variant="primary") gr.Markdown("LLM — Mistral (locked)") llm_provider_select = gr.Dropdown( choices=list(providers.LLM_PROVIDERS.keys()), value="Mistral", label="LLM provider", interactive=False, info="Locked to Mistral for this release.", ) llm_key_input = gr.Textbox( label="LLM API key", type="password", placeholder="paste your Mistral API key", ) gr.Markdown("Embedding — MiniLM 384-dim (locked)") embedding_provider_select = gr.Dropdown( choices=list(providers.EMBEDDING_PROVIDERS.keys()), value="MiniLM (local)", label="Embedding provider", interactive=False, info="Locked to MiniLM (local) for this release.", ) embedding_key_input = gr.Textbox( label="Embedding API key", type="password", placeholder="not needed for MiniLM (local)", interactive=False, ) gr.Markdown("Agent Backend — Research Assistant + Vector Embeddings (locked)") _mode_choices = list(BACKENDS.keys()) or ["(no backends loaded)"] # Prefer Research Assistant as the default if present if "Research Assistant enabled by Vector Embeddings" in _mode_choices: _mode_default = "Research Assistant enabled by Vector Embeddings" else: _mode_default = _mode_choices[0] mode_select = gr.Radio( choices=_mode_choices, value=_mode_default, label="Backend", interactive=False, info="Locked to Research Assistant for this release.", ) gr.Markdown( "" "**Tabs:** Inputs (data, form) · Processing (Supervised ML, Unsupervised ML, Vector) · Outputs (Results, Visuals, Downloads) · Researcher Workbench" "" ) # ---------------- Main area ---------------- with gr.Column(scale=3): chatbot = gr.Chatbot(height=220, label="Conversation", elem_id="main_chatbot") with gr.Row(): chat_input = gr.Textbox( placeholder="Message the agent...", show_label=False, scale=5, elem_id="chat_input", ) send_btn = gr.Button("Send", scale=1, variant="primary", elem_id="send_btn") with gr.Tabs(): # =================== INPUTS =================== # =================== INPUTS =================== with gr.Tab("Inputs"): with gr.Tabs(): with gr.Tab("Data sources"): gr.Markdown( "Load external data as context. Each load is saved " "as a timestamped JSON file in the Downloads tab." ) with gr.Tabs(): with gr.Tab("Web scraping"): url_input = gr.Textbox( label="URL", placeholder="https://example.com", ) with gr.Row(): scrape_btn = gr.Button("Scrape", variant="primary") scrape_clear_btn = gr.Button("Clear") scrape_preview = gr.Textbox( label="Extracted text", lines=8, interactive=False, ) scrape_status = gr.Markdown("Nothing loaded.") with gr.Tab("PDF upload"): pdf_input = gr.File( label="Upload PDF", file_types=[".pdf"], ) with gr.Row(): pdf_extract_btn = gr.Button("Extract text", variant="primary") pdf_clear_btn = gr.Button("Clear") pdf_preview = gr.Textbox( label="Extracted text", lines=8, interactive=False, ) pdf_status = gr.Markdown("Nothing loaded.") with gr.Tab("CSV / Excel upload"): csv_input = gr.File( label="Upload CSV or Excel", file_types=[".csv", ".xlsx", ".xls"], ) with gr.Row(): csv_load_btn = gr.Button("Load", variant="primary") csv_clear_btn = gr.Button("Clear") csv_preview = gr.Dataframe( label="Preview (first 20 rows)", interactive=False, ) csv_status = gr.Markdown("Nothing loaded.") with gr.Tab("ML examples"): gr.Markdown( "Load the built-in catalog of labeled ML paper " "sentences as context. No upload needed — the " "dataset lives in examples.py." ) with gr.Row(): ml_load_btn = gr.Button("Load catalog", variant="primary") ml_clear_btn = gr.Button("Clear") ml_preview = gr.Textbox( label="Catalog preview", lines=10, interactive=False, ) ml_status = gr.Markdown("Nothing loaded.") with gr.Tab("Form"): gr.Markdown( "Fill structured fields and hit Submit. Generates a chat " "message and saves the form fields as their own JSON file." ) form_task = gr.Dropdown( ["Math", "Weather", "General"], value="Math", label="Task type", ) form_op = gr.Dropdown( ["Add", "Multiply"], value="Add", label="Operation (Math only)", ) with gr.Row(): form_a = gr.Number(label="Number A", value=0) form_b = gr.Number(label="Number B", value=0) form_city = gr.Textbox( label="City (Weather only)", placeholder="e.g. Tokyo", ) form_notes = gr.Textbox( label="Notes (General only)", lines=2, ) with gr.Row(): form_submit = gr.Button("Submit", variant="primary") form_clear = gr.Button("Clear") # =================== SUPERVISED MACHINE LEARNING =================== # =================== PROCESSING / ANALYSIS =================== with gr.Tab("Processing / Analysis"): with gr.Tabs(): with gr.Tab("Supervised Machine Learning"): gr.Markdown( "**Supervised ML** on the built-in 100-sentence customer-feedback " "dataset (6 labels). Uses semantic embeddings from " "`sentence-transformers/all-MiniLM-L6-v2` + logistic regression. " "No LLM involved." ) with gr.Tabs(): with gr.Tab("Dataset"): gr.Markdown( "The 100 labeled sentences the classifier learns from. " "Six labels, roughly balanced: positive_review, " "negative_review, question, complaint, compliment, " "feature_request." ) sup_label_filter = gr.Dropdown( choices=["(all)"] + list(sorted( {e["label"] for e in TRAINING_EXAMPLES} )), value="(all)", label="Filter by label", ) sup_dataset_view = gr.Dataframe( value=pd.DataFrame(TRAINING_EXAMPLES), label=f"Training dataset ({len(TRAINING_EXAMPLES)} sentences)", interactive=False, wrap=True, ) with gr.Tab("Train"): gr.Markdown( "Click Train to fit a logistic regression classifier on " "semantic embeddings of 80 sentences (stratified split), " "then evaluate on the remaining 20." ) with gr.Row(): train_btn = gr.Button("Train classifier", variant="primary") train_clear_btn = gr.Button("Clear") train_status = gr.Markdown("Not trained yet.") confusion_out = gr.Dataframe( label="Confusion matrix (rows=actual, cols=predicted)", interactive=False, wrap=True, ) with gr.Tab("Predict"): gr.Markdown( "Type a new sentence to classify. The classifier must " "be trained first — go to the Train sub-tab and click " "Train classifier before using this panel." ) predict_input = gr.Textbox( label="Sentence", placeholder="e.g. this product is amazing", lines=2, ) predict_btn = gr.Button("Predict", variant="primary") predict_out = gr.Markdown("No prediction yet.") # =================== UNSUPERVISED MACHINE LEARNING =================== with gr.Tab("Unsupervised Machine Learning"): gr.Markdown( "**Unsupervised ML** on the same 100-sentence dataset with the " "labels hidden from the algorithm. Uses semantic embeddings from " "`sentence-transformers/all-MiniLM-L6-v2` + **Hierarchical " "Agglomerative Clustering** with cosine distance." ) with gr.Tabs(): with gr.Tab("Dataset"): gr.Markdown( "The 100 sentences the clustering algorithm sees. " "Labels are hidden here on purpose — unsupervised " "learning works without them. After clustering runs, " "the Cluster sub-tab compares discovered clusters to " "the true labels so you can see what the algorithm " "figured out on its own." ) unsup_dataset_view = gr.Dataframe( value=pd.DataFrame( [{"sentence": e["sentence"]} for e in TRAINING_EXAMPLES] ), label=f"Sentences only ({len(TRAINING_EXAMPLES)} rows, no labels)", interactive=False, wrap=True, ) with gr.Tab("Cluster"): gr.Markdown( "**Hierarchical Agglomerative Clustering** on " "semantic embeddings. Clusters emerge from a " "similarity threshold instead of a fixed count. " "Small clusters become **noise**. Each surviving " "cluster exposes its **centroid** and the " "**N nearest-to-centroid** sentences as " "representatives — optionally sent to an LLM " "for an automatic cluster label." ) cluster_sim = gr.Slider( 0.40, 0.90, value=0.60, step=0.05, label="Similarity threshold", info="Minimum cosine similarity between vectors to merge.", ) cluster_min = gr.Slider( 2, 10, value=3, step=1, label="Minimum cluster size", info="Clusters smaller than this are reassigned to noise.", ) cluster_nnear = gr.Slider( 1, 10, value=3, step=1, label="N nearest-to-centroid", info="How many representative sentences to pick per cluster.", ) cluster_llm_toggle = gr.Checkbox( label="Label clusters with LLM", value=False, info="Sends the N nearest sentences per cluster to the sidebar LLM provider for a short label. Adds ~2s per cluster.", ) with gr.Row(): cluster_btn = gr.Button("Cluster", variant="primary") cluster_clear_btn = gr.Button("Clear") cluster_status = gr.Markdown("Not clustered yet.") cluster_out = gr.Dataframe( label="Sentence-level cluster table", interactive=False, wrap=True, ) # =================== VECTOR PROCESSING =================== with gr.Tab("Vector Processing"): gr.Markdown( "**Semantic vector storage and retrieval** using ChromaDB " "as a persistent on-disk vector database. \n" "Same embedding model as Supervised / Unsupervised ML " "(`sentence-transformers/all-MiniLM-L6-v2`), 384 dimensions, " "cosine similarity. Every sentence is stored with its label " "as metadata so retrieval results include ground-truth labels." ) with gr.Tabs(): with gr.Tab("Vectorize"): gr.Markdown( "See what a sentence embedding actually looks like. " "Click Preview to compute embeddings for the first " "10 training sentences and show the first 8 dimensions " "of each 384-dim vector." ) with gr.Row(): vectorize_btn = gr.Button( "Preview embeddings", variant="primary", ) vectorize_clear_btn = gr.Button("Clear") vectorize_status = gr.Markdown( "Click 'Preview embeddings' to see sentence vectors." ) vectorize_out = gr.Dataframe( label="Sentences with embedding preview", interactive=False, wrap=True, ) with gr.Tab("Vector DB"): gr.Markdown( "**ChromaDB-backed persistent vector store.** \n" "Step 1: Click 'Index all 100 sentences' once per " "session to embed the training data and write it to " "the local Chroma collection. \n" "Step 2: Type a query and click 'Semantic search' to " "retrieve the nearest training sentences. The results " "show cosine similarity and the ground-truth label " "from the metadata." ) gr.Markdown("### Index") with gr.Row(): vector_index_btn = gr.Button( "Index all 100 sentences", variant="primary", ) vector_clear_btn = gr.Button("Clear index") vector_index_status = gr.Markdown("Not indexed yet.") gr.Markdown("### Semantic search") vector_query = gr.Textbox( label="Query", placeholder="e.g. the app keeps crashing", lines=2, ) vector_n = gr.Slider( 1, 10, value=5, step=1, label="Number of results", ) vector_search_btn = gr.Button( "Semantic search", variant="primary", ) vector_search_status = gr.Markdown( "Enter a query and click 'Semantic search'." ) vector_search_out = gr.Dataframe( label="Nearest neighbors (cosine similarity)", interactive=False, wrap=True, ) # =================== OUTPUTS =================== # =================== OUTPUTS =================== with gr.Tab("Outputs"): with gr.Tabs(): with gr.Tab("Results"): with gr.Tabs(): with gr.Tab("Table"): gr.Markdown("Step log for the most recent run.") table_out = gr.Dataframe( headers=["step", "type", "tool", "args", "result"], label="", wrap=True, ) with gr.Tab("Code"): gr.Markdown("Python snippets for the most recent run.") code_out = gr.Code(language="python", label="") with gr.Tab("Extracted"): gr.Markdown("What the agent parsed from the most recent run.") extracted_out = gr.Code(language="json", label="") with gr.Tab("Visuals"): gr.Markdown("Tool-call counts for the most recent run.") chart_out = gr.BarPlot( x="tool", y="count", title="", tooltip=["tool", "count"], height=280, ) with gr.Tab("Downloads"): gr.Markdown( "Every input and every run is saved here as a " "timestamped JSON file. Files accumulate across the session." ) downloads_files_out = gr.File( label="All artifacts (timestamped JSON)", file_count="multiple", interactive=False, ) # ======================= RESEARCHER WORKBENCH (parent tab) ======================= with gr.Tab("Researcher Workbench"): gr.Markdown( "**Researcher Workbench.** Each published research methodology is a " "self-contained workbench — its own corpus state, its own phases, its " "own prompts, its own contracts. Nothing is shared between workbenches. " "Pick a methodology, upload your corpus, run the phases. New " "methodologies are added as sibling workbenches as the research " "programme expands." ) with gr.Tabs(): # ==================== COMPUTATIONAL THEMATIC ANALYSIS ==================== with gr.Tab("Computational Grounded Theory"): gr.Markdown( "**Computational Grounded Theory** — methodology family based on " "Glaser & Strauss (1967), operationalized computationally by Nelson (2020) " "with Carlsen & Ralund (2022) insisting the researcher remains central." ) with gr.Tabs(): with gr.Tab("Nelson + Carlsen & Ralund Workbench"): gr.Markdown( "## Computational Grounded Theory (3-step framework)\n" "*Nelson (2020). Computational grounded theory: A methodological framework. " "Sociological Methods & Research, 49(1), 3-42.* \n" "*Carlsen & Ralund (2022). Computational grounded theory revisited: From computer-led to computer-assisted. " "Big Data & Society, 9(1) — critique implemented: researcher approves every step.*\n\n" "**Pipeline (co-pilot, researcher approves every step):** " "**Load corpus** → Pattern Detection (unsupervised ML) → Pattern Refinement (close reading) → Pattern Confirmation (supervised ML) \n" "Maps to traditional GT: open → axial → selective coding." ) with gr.Accordion( "📊 Methodology reference: paper technique vs ours (click to expand — copy-ready for paper)", open=False, ): gr.Markdown( METHOD_COMPARISONS["cgt"].as_markdown(), elem_classes=["comparison-window"], ) cgt_comparison_dl_btn = gr.Button( "⬇ Download methods comparison as .md (paste into paper)", variant="secondary", ) cgt_comparison_dl_status = gr.Markdown("") gr.Markdown("---") gr.Markdown("### Step 1 — Load your corpus") cgt_load_test_btn = gr.Button( "Load built-in test_phase1.csv (30 sentences)", variant="secondary", ) gr.Markdown("**— or —**") cgt_upload_csv = gr.UploadButton( label="📁 Upload corpus CSV", file_types=[".csv"], file_count="single", variant="primary", ) gr.Markdown( "*Required columns: `L1`, `L2`, `L3`, `L4`, `sentence_id`, `sentence`*" ) cgt_load_status = gr.Markdown("**No corpus loaded.**") cgt_corpus_preview = gr.Dataframe( label="Loaded corpus preview", interactive=False, wrap=True, ) gr.Markdown("---") gr.Markdown( "### Step 2 — LangGraph Supervisor (Pattern Detection)\n" "*Architectural demo of the 3-step framework. " "Pattern Detection is implemented; Refinement and Confirmation are placeholders pending full integration.*" ) wb_cgt_msg = gr.Textbox( label="Request to the supervisor", value="Run computational grounded theory on the training data.", lines=2, ) with gr.Row(): wb_cgt_sim = gr.Slider( 0.40, 0.90, value=0.60, step=0.05, label="Similarity threshold", ) wb_cgt_min = gr.Slider( 2, 10, value=3, step=1, label="Minimum cluster size", ) wb_cgt_nnear = gr.Slider( 1, 10, value=3, step=1, label="N nearest to centroid", ) with gr.Row(): wb_cgt_run = gr.Button("Run Workbench", variant="primary") wb_cgt_reply = gr.Markdown("Not run yet.") gr.Markdown("### Graph execution trace") wb_cgt_trace = gr.Dataframe( headers=["step", "node", "action", "detail"], label="Supervisor routing + node invocations", interactive=False, wrap=True, ) gr.Markdown("### Pattern Detection output (Step 1)") wb_cgt_sentences = gr.Dataframe( label="Sentences with cluster id + LLM cluster label", interactive=False, wrap=True, ) # ============================================ # CGT Phase 2 — Pattern Refinement (Nelson 2020 Step 2) # ============================================ gr.Markdown("---") with gr.Accordion( "Phase 2 — Pattern Refinement (Nelson 2020 Step 2) — close reading + verdict", open=False, ): gr.Markdown( "### Phase 2 — Pattern Refinement\n" "*Nelson (2020) Step 2: for each pattern from Phase 1, the tool " "surfaces exemplar sentences (top-N by centroid proximity) and drafts " "an interpretive memo. The researcher reads the exemplars, writes the " "final memo, and assigns a verdict: **keep / merge / split / drop / rename**. " "Per Carlsen & Ralund 2022, the researcher decides; the LLM drafts.*\n\n" "**Prerequisites:** Phase 1 Pattern Detection must have run (populating " "the table above). Reflexive positioning required (>=20 chars, contract-enforced)." ) with gr.Row(): cgt_p2_n_exemplars = gr.Slider( minimum=1, maximum=20, value=5, step=1, label="Exemplar sentences per pattern", info="Top-N closest to cluster centroid (higher = more context, slower LLM drafting)", ) cgt_p2_reflexivity = gr.Textbox( label="Reflexive positioning (required, >=20 chars — C&R 2022 contract)", placeholder="Your position as analyst for CGT: who are you reading the patterns as? What stake?", lines=3, ) cgt_p2_surface_btn = gr.Button( "Surface exemplars + draft LLM memos (Phase 2)", variant="primary", ) cgt_p2_status = gr.Markdown( "*Click the button above after Phase 1 has run.*" ) cgt_p2_refinement_table = gr.Dataframe( headers=[ "pattern_id", "pattern_label", "n_sentences", "exemplars", "llm_memo_draft", "researcher_memo", "verdict", "new_label", ], label="Phase 2 Refinement Table — EDIT researcher_memo + verdict + new_label for each row", interactive=True, wrap=True, ) gr.Markdown( "**Valid verdicts:** `keep`, `merge`, `split`, `drop`, `rename`. " "For `rename` or `split`, fill in the `new_label` column. " "**Every row must have a researcher_memo and a valid verdict before saving.**" ) cgt_p2_save_btn = gr.Button( "Save Phase 2 refinement -> JSON artifact", variant="secondary", ) cgt_p2_save_status = gr.Markdown("") with gr.Tab("Computational Thematic Analysis"): gr.Markdown( "**Braun & Clarke 2006** — six-phase reflexive thematic analysis. " "This workbench groups two complementary paths: \n" "- **Workbench** — the LangGraph supervisor approach (Phase 2 real, rest placeholders) \n" "- **Phase 1 — Familiarization** — active-reading dialogue via grounded " "dialogue partners, followed by researcher confirmation of each initial noticing" ) with gr.Tabs(): # ------------ Gauthier & Wallace at-scale path ------------ with gr.Tab("G&W at Scale"): gr.Markdown( "## Computational Thematic Analysis at Scale\n\n" "*Gauthier & Wallace (2022). The Computational Thematic Analysis Toolkit. " "Proc. ACM Hum.-Comput. Interact., 6(GROUP), Art. 25.*\n\n" "**Same 6 phases as Braun & Clarke — with Phase 0 Sampling (G&W 2022) prepended.**\n\n" "Designed for large-scale corpora (Apify scrapes, forums, 1000s of documents). " "MiniLM embeds all sentences, HDBSCAN clusters them, and representative " "sentences are selected for Phase 2 coding.\n\n" "**Pipeline:** **Load corpus** → **P0 Compress** → **P1 Familiarize** (on compressed corpus) → P2 Code → P3 Themes → P4 Review → P5 Define → P6 Report" ) with gr.Accordion( "📊 Methodology reference: paper technique vs ours (click to expand — copy-ready for paper)", open=False, ): gr.Markdown( METHOD_COMPARISONS["gw"].as_markdown(), elem_classes=["comparison-window"], ) gw_comparison_dl_btn = gr.Button( "⬇ Download methods comparison as .md (paste into paper)", variant="secondary", ) gw_comparison_dl_status = gr.Markdown("") gr.Markdown("---") gr.Markdown("### Step 1 — Load your corpus") gw_load_test_btn = gr.Button( "Load built-in test_phase1.csv (30 sentences)", variant="secondary", ) gr.Markdown("**— or —**") gw_upload_csv = gr.UploadButton( label="📁 Upload large corpus CSV (e.g. 1000 sentences)", file_types=[".csv"], file_count="single", variant="primary", ) gr.Markdown( "*Required columns: `L1`, `L2`, `L3`, `L4`, `sentence_id`, `sentence`*" ) gw_load_status = gr.Markdown("**No corpus loaded.**") gw_corpus_preview = gr.Dataframe( label="Loaded corpus preview", interactive=False, wrap=True, ) gr.Markdown("---") gr.Markdown( "### Step 1.5 — Phase 0 Preparation (pre-sampling hygiene)\n" "*Four optional sub-steps for 1M-scale corpora. Each preserves the " "L1/L2/L3/L4/sentence_id/sentence schema and adds a `frequency_weight` " "column. Recommended order: noise → length → hash dedup → semantic dedup. " "Each emits a reproducibility artifact. \n\n" "**Literature:** Moreno-Ortiz & García-Gámez (2023) *Corpus Pragmatics* 7:241–265 " "(31B-word Twitter corpus methodology); BERTopic_Teen (2025) PMC12378273 " "(hash + semantic dedup); Abbas et al. (2023) *SemDeDup* ICLR Workshop; " "Reimers & Gurevych (2019) EMNLP (MiniLM).*" ) with gr.Row(): p0prep_min_words = gr.Slider( minimum=1, maximum=10, value=3, step=1, label="Length filter — min words", info="Moreno-Ortiz 2023 p.7: default=3.", ) p0prep_case_sensitive = gr.Checkbox( label="Hash dedup case-sensitive", value=False, info="Unchecked: 'Great!' and 'great!' merge. Default for reviews.", ) p0prep_semantic_threshold = gr.Slider( minimum=0.90, maximum=0.99, value=0.97, step=0.01, label="Semantic dedup threshold (cosine)", info="0.97 default for reviews (SemDeDup). 0.95 tighter (more merging). 0.99 stricter.", ) with gr.Row(): p0prep_noise_btn = gr.Button( "① Strip noise (URLs, emoji, Unicode)", variant="secondary", ) p0prep_length_btn = gr.Button( "② Apply length filter", variant="secondary", ) p0prep_hash_btn = gr.Button( "③ Hash deduplicate (exact)", variant="secondary", ) p0prep_semantic_btn = gr.Button( "④ Semantic deduplicate (MiniLM)", variant="secondary", ) p0prep_status = gr.Markdown( "*No preparation step run yet. Click any button above to run that step " "on the currently loaded corpus. Each step updates the corpus state " "and writes an audit JSON artifact.*" ) p0prep_table = gr.Dataframe( label="Preparation output — last step result", interactive=False, wrap=True, ) gr.Markdown( "*`frequency_weight` counts how many original-corpus sentences this row " "represents after deduplication. Downstream prevalence (in Phase 6) is " "computed as weighted sum, so compression is honest to the full corpus. " "The final output of whichever preparation steps you run becomes the " "input to Phase 0 Sampling below.*" ) gr.Markdown("---") gr.Markdown( "### Step 2 — Phase 0 Sampling (G&W 2022)\n" "*Reduce large corpus to representative sentences. " "MiniLM (Reimers & Gurevych 2019) embeds all sentences, " "HDBSCAN (Campello et al. 2013, 2015) clusters them, " "and representatives are selected by HDBSCAN cluster membership " "probability (density-tree score — the points most central to each " "cluster's density region, ranked descending). " "Phase 1 familiarization and Phase 2 coding run on the compressed corpus.*" ) with gr.Row(): gw_sentences_per_cluster = gr.Slider( minimum=1, maximum=5, value=2, step=1, label="Representatives per cluster", info="Top N sentences by HDBSCAN membership probability (descending). 1 = single heart-of-cluster sentence; 5 = more within-cluster variance.", ) gw_min_cluster_size = gr.Slider( minimum=2, maximum=20, value=3, step=1, label="Minimum cluster size (HDBSCAN mclSize)", info="Campello et al. 2013: components with fewer sentences than this are disregarded as noise. Raise if cluster_fit values come out mostly low (forces tighter clusters).", ) gw_outlier_sample = gr.Slider( minimum=0, maximum=50, value=10, step=5, label="Outlier sample size", info="How many noise-labeled sentences to retain (rare but potentially important views).", ) with gr.Row(): gw_min_cluster_fit = gr.Slider( minimum=0.0, maximum=0.5, value=0.1, step=0.05, label="Minimum cluster_fit threshold", info="Representatives with HDBSCAN membership probability below this are de-selected (reason='below_cluster_fit_threshold') but stay visible in the table so you can override. 0.0 = accept all picks; 0.1 = default; 0.3 = only confident reps; 0.5 = only deepest reps.", ) gw_compress_btn = gr.Button( "Run Phase 0 — Sample corpus", variant="primary", ) gw_compress_status = gr.Markdown("*No compression run yet.*") gr.Markdown( "### Sampling Table\n" "*Edit the `selected` column to manually include or exclude sentences. " "**The selected sentences flow into Phase 1 Familiarization below — " "you cannot skip Phase 1.** Phase 2 coding operates on Phase-1-approved " "sentences only.*" ) gw_compress_table = gr.Dataframe( headers=[ "idx", "L1", "L2", "L3", "L4", "sentence_id", "sentence", "cluster_id_original", "cluster_id_refined", "cluster_id", "cluster_fit", "cluster_mean_fit", "cluster_std_fit", "cluster_quality_tier", "split_decision", "cluster_size", "selected", "reason", ], label="Corpus compression — edit selected column", interactive=True, wrap=True, ) gr.Markdown( "*`cluster_id_original`: HDBSCAN cluster assignment. " "`cluster_id_refined`: final cluster after any accepted Agglomerative split " "(original × 1000 + sub_id for split clusters; same as original otherwise). " "`cluster_fit`: **1.0** = heart of cluster's density region, **0** = edge / near noise. " "`cluster_std_fit`: standard deviation of cluster_fit values within the refined cluster. " "`cluster_quality_tier`: **TIGHT** (std<0.15) / **MEDIUM** (0.15–0.20) / **LOOSE** (≥0.20). " "LOOSE clusters are flagged for researcher review below. \n\n" "**This table is Phase 0 Sampling's output — frozen.** Cluster labels live " "in the Cluster Label Review table further down (one row per refined cluster). " "Phase 1 and later stages join both artifacts on `cluster_id_refined`.*" ) # --- Split Proposals review (LOOSE clusters only) --- gr.Markdown("---") gr.Markdown( "### Split Proposals — Researcher Review (LOOSE clusters)\n" "*When a HDBSCAN cluster has internal std(cluster_fit) ≥ 0.20, " "the pipeline proposes an Agglomerative split (Ward 1963; cosine distance) " "to separate mixed-density sub-patterns. Review each proposal below and " "set `decision` to **ACCEPTED** or **REJECTED**, then click " "**Apply Split Decisions** to re-run Phase 0 with your choices. " "Leaving a row as `PENDING` is allowed but will trigger a soft-warn in the audit.*" ) gw_split_proposals_table = gr.Dataframe( headers=[ "cluster_id_original", "cluster_size", "std_before", "n_sub_proposed", "max_std_after", "improvement", "target_reached", "decision", ], label="Split proposals — edit the `decision` column", interactive=True, wrap=True, ) gw_apply_splits_btn = gr.Button( "Apply Split Decisions & Re-sample", variant="secondary", ) gr.Markdown( "*`std_before`: cluster's std(cluster_fit) before splitting. " "`max_std_after`: largest sub-cluster std if split accepted. " "`improvement`: std_before − max_std_after (higher = cleaner split). " "`target_reached`: True if every proposed sub-cluster has std ≤ 0.15. " "`decision`: **ACCEPTED** = apply the split; **REJECTED** = keep the original cluster intact; " "**PENDING** = undecided (soft-warn allowed).*" ) gw_compressed_corpus_state = gr.State([]) # ============================================================ # Cluster-level labeling workflow (2 iterations + commit) # Researcher-centric design: LLM proposes, researcher edits, # LLM refines (only flagged), researcher commits final labels. # Parallels the Phase 2 iter1/iter2/final pattern. # ============================================================ gr.Markdown("---") gr.Markdown( "### Cluster Label Review \n" "*Four candidate labels per cluster (2 LLM + 2 researcher), followed by researcher's " "mandatory final choice. Iter 1 LLM is strict 2-word descriptive; Iter 2 LLM is " "interpretive 2-4 words. Researcher types into `researcher_edit_iter1` / " "`researcher_edit_iter2` wherever they want to refine. Finally, researcher types the " "authoritative label in `final_label` for every cluster — commit is blocked until all " "are filled. Temperature 0.0 pinned; every iteration's prompt + per-cluster audit " "logged. Final labels propagate to the Sampling Table above.*" ) with gr.Row(): gw_label_init_btn = gr.Button( "① Initialize cluster table", variant="secondary", ) gw_label_iter1_btn = gr.Button( "② Run Iter 1 — LLM strict 2-word labels (all clusters)", variant="primary", ) gw_label_iter2_btn = gr.Button( "③ Run Iter 2 — LLM interpretive re-label (all clusters)", variant="primary", ) gw_label_commit_btn = gr.Button( "④ Commit final labels", variant="primary", ) gw_label_status = gr.Markdown( "*Workflow: ① build cluster table → ② iter 1 (strict 2-word on all) → " "optionally edit `researcher_edit_iter1` → ③ iter 2 (interpretive on all) → " "optionally edit `researcher_edit_iter2` → TYPE `final_label` for every cluster → " "④ commit (blocked if any final_label blank).*" ) gw_cluster_labels_table = gr.Dataframe( headers=[ "cluster_id", "cluster_size", "mean_cluster_fit", "top3_sentences_preview", "llm_label_iter1", "researcher_edit_iter1", "llm_label_iter2", "researcher_edit_iter2", "final_label", ], label="Cluster Label Review — one row per cluster", interactive=True, wrap=True, ) gr.Markdown( "**4 candidate label columns + 1 mandatory choice.** \n" "- `llm_label_iter1` — LLM strict 2-word draft (Button ②) \n" "- `researcher_edit_iter1` — your response to iter 1 (type here; blank = you accept iter 1) \n" "- `llm_label_iter2` — LLM interpretive re-label, 2-4 words (Button ③, all clusters) \n" "- `researcher_edit_iter2` — your response after seeing iter 2 (type here; blank = OK) \n" "- `final_label` — **MANDATORY**. Type (or copy-paste) the winning label for every cluster. \n\n" "Commit is **blocked** until every cluster has a non-blank `final_label`. " "This enforces active researcher choice per Braun & Clarke (2006) — themes are " "*actively developed* by the researcher, not auto-filled. " "Every iteration's artifact JSON contains the exact prompt + top-3 sentences + " "model name for reproducibility." ) # ============================================================ # Phase 1 — Familiarization (G&W path, on compressed corpus) # ============================================================ gr.Markdown("---") with gr.Accordion( "Phase 1 — Familiarization (on compressed corpus)", open=False, ): gr.Markdown( "### Phase 1 — Familiarization\n" "*Braun & Clarke (2006) Phase 1 applied to the Phase 0 sampled corpus. " "Read the representative sentences, write reflexive positioning, " "and confirm initial noticings. This feeds Phase 2 coding.*" ) gr.Markdown("#### Step 1 — Familiarization (facilitator)") gw_p1_facilitator_memo = gr.Textbox( label="Familiarization notes", placeholder="What do you notice in the representative sentences? Patterns, tensions, surprises?", lines=5, ) gw_p1_facilitator_transcript = gr.Textbox( label="Active reading transcript", placeholder="Your dialogue with the facilitator AI, or your own notes.", lines=4, ) gw_p1_facilitator_citations = gr.Textbox( label="Source evidence (quotes + L1/sentence_id)", placeholder="e.g., DOC_0002 Extract 7: 'I kept waiting for the other shoe to drop...'", lines=4, ) gr.Markdown("#### Step 2 — Reflexive companion") gw_p1_companion_challenges = gr.Textbox( label="Reflexive challenges", placeholder="What assumptions are you bringing? What perspectives might you miss?", lines=3, ) gw_p1_companion_reflexivity = gr.Textbox( label="Reflexive positioning (required, min 20 chars — contract enforced)", placeholder="Your position as analyst: who are you reading this as? What's your stake?", lines=3, ) gw_p1_companion_breadth = gr.Textbox( label="Dataset immersion coverage", placeholder="How did you ensure breadth of engagement across the compressed corpus?", lines=2, ) gr.Markdown("#### Step 3 — Researcher confirmation") gw_p1_validation_table = gr.Dataframe( headers=["noticing", "source_evidence", "researcher_confirmed"], label="Confirm each initial noticing before proceeding to Phase 2", interactive=True, wrap=True, ) gw_p1_save_btn = gr.Button( "Save Phase 1 output -> JSON artifact", variant="primary", ) gw_p1_save_status = gr.Markdown("") gr.Markdown("---") # ============================================================ # G&W Phase 2 — Generating Initial Codes (Braun & Clarke p.88) # ============================================================ with gr.Accordion("Phase 2 — Generating Initial Codes", open=False): gr.Markdown( "## Phase 2 — Generating Initial Codes\n\n" "*Braun & Clarke 2006, Phase 2: \"Coding interesting features of " "the data in a systematic fashion across the entire data set\" (p. 88).*\n\n" "Operates on the **Phase-1-approved sentences** from the G&W sampled " "+ familiarized corpus. Iterative refinement: iteration 1 → researcher " "edits → iteration 2 (reads edits) → iteration 3 (convergence). " "Runtime depends on sentence count." ) gr.Markdown("### Corpus — inherited from G&W Phase 1") gr.Markdown( "*Phase 2 reads the Phase-1-approved corpus (selected=true sentences " "from the Sampling Table, filtered by your edits in Phase 1). " "If Phase 1 hasn't been saved, this phase falls back to the raw corpus.*" ) gw_p2_corpus_status = gr.Markdown( "*Save Phase 1 first to populate the approved corpus for Phase 2.*" ) gw_p2_refresh_btn = gr.Button( "Refresh corpus status from G&W Phase 1", variant="secondary", ) gr.Markdown("---") gr.Markdown("### Phase 1 context (consumed by the agent)") gr.Markdown( "*The Phase 2 agent reads the researcher's reflexive " "positioning and confirmed initial noticings from G&W Phase 1 " "as context. This ensures Phase 2 coding is grounded in the " "researcher's familiarization of the compressed corpus.*" ) gw_p2_phase1_summary = gr.Markdown( "*Phase 1 output will appear here after Save Phase 1.*" ) gr.Markdown("---") gr.Markdown("### Coding orientation (Braun & Clarke p. 84)") gr.Markdown( "*SEMANTIC vs LATENT is an analysis-wide choice. " "Choose ONE orientation for this whole G&W analysis.* \n\n" "**Semantic** — surface content, what the text explicitly says \n" "**Latent** — underlying assumptions, what the text implies" ) gw_p2_orientation = gr.Radio( choices=["semantic", "latent"], value="semantic", label="Coding orientation for this G&W analysis", interactive=True, ) gr.Markdown("---") gr.Markdown("### Agentic coding iterations") gr.Markdown( "Iteration 1 → review AI codes → edit `human_code_iterN` → " "iteration 2 (agent reads your edits) → review → iteration 3 → converge." ) with gr.Row(): gw_p2_run_iter1_btn = gr.Button( "Run iteration 1", variant="primary", ) gw_p2_run_iter2_btn = gr.Button( "Run iteration 2 (reads your edits)", variant="primary", ) gw_p2_run_iter3_btn = gr.Button( "Run iteration 3 (final)", variant="primary", ) gw_p2_iter_status = gr.Markdown("*No iterations run yet.*") gr.Markdown("---") gr.Markdown("### Initial Codes Table (G&W)") gw_p2_codes_table = gr.Dataframe( headers=[ "L1", "L2", "L3", "L4", "sentence_id", "sentence", "ai_code_iter1", "human_code_iter1", "ai_code_iter2", "human_code_iter2", "ai_code_iter3", "human_code_iter3", "final_code", "flagged", ], label="G&W Phase 2 Initial Codes — edit human_code_iterN columns", interactive=True, wrap=True, ) gr.Markdown("---") gr.Markdown("### Codebook (G&W)") gw_p2_codebook_table = gr.Dataframe( headers=[ "code_name", "definition", "created_by", "provenance", "sentence_count", ], label="G&W Phase 2 Codebook — edit definitions", interactive=True, wrap=True, ) gr.Markdown("---") gw_p2_save_btn = gr.Button( "Save G&W Phase 2 Final Codes + Codebook → JSON artifact", variant="primary", ) gw_p2_save_status = gr.Markdown("") # ============================================================ # G&W Phase 3 — Searching for Themes # ============================================================ with gr.Accordion("Phase 3 — Searching for Themes", open=False): gr.Markdown( "## Phase 3 — Searching for Themes\n\n" "*Braun & Clarke 2006, Phase 3: \"Collating codes into potential " "themes, gathering all data relevant to each potential theme\" (p. 89).*\n\n" "Clusters the G&W Phase 2 codebook codes by semantic similarity " "(sentence-transformers embeddings + agglomerative clustering), " "then proposes a candidate theme name and description for each " "cluster via one Mistral call per cluster." ) gr.Markdown("### Clustering parameters (researcher-controlled)") with gr.Row(): gw_p3_similarity = gr.Slider( minimum=0.3, maximum=0.95, value=0.60, step=0.05, label="Similarity threshold", info="Codes more similar than this cluster together. Default 0.60.", ) gw_p3_min_size = gr.Slider( minimum=2, maximum=10, value=2, step=1, label="Minimum cluster size", info="Clusters smaller than this go into noise bucket. Default 2.", ) gw_p3_run_btn = gr.Button( "Run G&W Phase 3 — Cluster codes into candidate themes", variant="primary", ) gw_p3_status = gr.Markdown("*No themes generated yet. Run G&W Phase 2 first.*") gr.Markdown("---") gr.Markdown("### Candidate Themes Table (G&W)") gw_p3_themes_table = gr.Dataframe( headers=[ "theme_id", "candidate_theme_name", "description", "rationale", "member_codes", "code_count", "researcher_theme_name", "researcher_notes", ], label="G&W Phase 3 Candidate Themes — edit researcher_theme_name / researcher_notes", interactive=True, wrap=True, ) gr.Markdown("---") gr.Markdown("### Noise Codes (G&W)") gw_p3_noise_table = gr.Dataframe( headers=["code_name", "definition"], label="G&W noise codes (did not cluster)", interactive=False, wrap=True, ) gr.Markdown("---") gw_p3_save_btn = gr.Button( "Save G&W Phase 3 output (themes + noise → JSON artifact)", variant="secondary", ) gw_p3_save_status = gr.Markdown("") # ============================================================ # G&W Phase 4 — Reviewing Themes # ============================================================ with gr.Accordion("Phase 4 — Reviewing Themes", open=False): gr.Markdown( "## Phase 4 — Reviewing Themes\n\n" "*Braun & Clarke 2006 p. 91: \"Reviewing, refining and sometimes " "reducing your themes.\"*\n\n" "**Level 1** — within-theme coherence. **Level 2** — between-theme " "distinctness. LLM suggests verdict; researcher edits " "`researcher_verdict` and `researcher_action_notes`." ) gw_p4_run_btn = gr.Button( "Run G&W Phase 4 — Review all themes (cohesion + LLM verdict)", variant="primary", ) gw_p4_status = gr.Markdown("*No review run yet. Run G&W Phase 3 first.*") gr.Markdown("---") gr.Markdown("### Theme Review Table (G&W)") gw_p4_review_table = gr.Dataframe( headers=[ "theme_id", "theme_name", "member_codes", "code_count", "member_sentence_count", "within_cohesion", "llm_verdict", "llm_reasoning", "llm_action_suggestion", "researcher_verdict", "researcher_action_notes", ], label="G&W Phase 4 Theme Review — edit researcher_verdict / researcher_action_notes", interactive=True, wrap=True, ) gr.Markdown("---") gw_p4_save_btn = gr.Button( "Save G&W Phase 4 verdicts → JSON artifact", variant="secondary", ) gw_p4_save_status = gr.Markdown("") # ============================================================ # G&W Phase 5 — Defining and Naming Themes # ============================================================ with gr.Accordion("Phase 5 — Defining and Naming Themes", open=False): gr.Markdown( "## Phase 5 — Defining and Naming Themes\n\n" "*Braun & Clarke 2006 p. 92: refine specifics of each theme, " "produce definitions and names.*\n\n" "Takes surviving themes from G&W Phase 4 (verdict = keep or merge) " "and produces final name, definition, scope, narrative contribution. " "Edit `researcher_final_name` / `researcher_definition` to override." ) gw_p5_run_btn = gr.Button( "Run G&W Phase 5 — Define and name surviving themes", variant="primary", ) gw_p5_status = gr.Markdown("*No definitions yet. Run G&W Phase 4 first.*") gr.Markdown("---") gr.Markdown("### Theme Definitions Table (G&W)") gw_p5_def_table = gr.Dataframe( headers=[ "theme_id", "original_name", "final_name", "definition", "scope_note", "narrative_contribution", "member_codes", "code_count", "researcher_final_name", "researcher_definition", ], label="G&W Phase 5 Definitions — edit researcher_final_name / researcher_definition", interactive=True, wrap=True, ) gr.Markdown("---") gw_p5_save_btn = gr.Button( "Save G&W Phase 5 definitions → JSON artifact", variant="secondary", ) gw_p5_save_status = gr.Markdown("") # ============================================================ # G&W Phase 6 — Producing the Report # ============================================================ with gr.Accordion("Phase 6 — Producing the Report", open=False): gr.Markdown( "## Phase 6 — Producing the Report\n\n" "*Braun & Clarke 2006 p. 93: \"tell the complicated story of your data.\"*\n\n" "Generates a complete analytic report from G&W Phase 5 theme definitions. " "AI drafts, researcher refines." ) gw_p6_research_question = gr.Textbox( label="Research question / focus (optional)", placeholder="e.g. How do employees experience organisational change?", lines=2, ) gw_p6_run_btn = gr.Button( "Run G&W Phase 6 — Generate analytic report", variant="primary", ) gw_p6_status = gr.Markdown("*No report yet. Run G&W Phase 5 first.*") gr.Markdown("---") gr.Markdown("### Analytic Report (G&W)") gw_p6_report_text = gr.Textbox( label="G&W Phase 6 Analytic Report (editable)", lines=30, placeholder="Report will appear here after running G&W Phase 6...", interactive=True, ) gr.Markdown("---") gw_p6_save_btn = gr.Button( "Save G&W report → JSON + Markdown artifacts", variant="secondary", ) gw_p6_save_status = gr.Markdown("") # ------------ Existing Workbench path ------------ with gr.Tab("B&C Workbench"): gr.Markdown( "## Reflexive Thematic Analysis (6-phase)\n" "*Braun & Clarke (2006). Using thematic analysis in psychology. " "Qualitative Research in Psychology, 3(2), 77-101.*\n\n" "**Pipeline (co-pilot, researcher approves every step):** " "**Load corpus** → P1 Familiarize → P2 Code → P3 Themes → P4 Review → P5 Define → P6 Report" ) with gr.Accordion( "📊 Methodology reference: paper technique vs ours (click to expand — copy-ready for paper)", open=False, ): gr.Markdown( METHOD_COMPARISONS["bc"].as_markdown(), elem_classes=["comparison-window"], ) bc_comparison_dl_btn = gr.Button( "⬇ Download methods comparison as .md (paste into paper)", variant="secondary", ) bc_comparison_dl_status = gr.Markdown("") gr.Markdown("---") gr.Markdown("### Step 1 — Load your corpus") bc_load_test_btn = gr.Button( "Load built-in test_phase1.csv (30 sentences)", variant="secondary", ) gr.Markdown("**— or —**") bc_upload_csv = gr.UploadButton( label="📁 Upload corpus CSV", file_types=[".csv"], file_count="single", variant="primary", ) gr.Markdown( "*Required columns: `L1`, `L2`, `L3`, `L4`, `sentence_id`, `sentence`*" ) bc_load_status = gr.Markdown("**No corpus loaded.**") bc_corpus_preview = gr.Dataframe( label="Loaded corpus preview", interactive=False, wrap=True, ) gr.Markdown("---") gr.Markdown( "### Step 2 — Open each Phase accordion below (in order)\n" "*Scroll down. Six accordions: Phase 1 → Phase 2 → Phase 3 → Phase 4 → Phase 5 → Phase 6. " "Click any Phase header to expand it. Each phase has its own Run and Save buttons. " "Nothing auto-runs — you approve every step.*" ) gr.Markdown("---") gr.Markdown( "### LangGraph Supervisor Demo (optional)\n" "*Architectural demo showing how a LangGraph supervisor routes between " "six phase-nodes automatically. This is NOT the co-pilot research pipeline " "— use the Phase accordions below for actual analysis.*" ) wb_cta_msg = gr.Textbox( label="Request to the supervisor", value="Run reflexive thematic analysis on the training data.", lines=2, ) wb_cta_max = gr.Slider( 5, 100, value=20, step=5, label="Max sentences to code", info="One LLM call per sentence in Phase 2. " "Default 20 keeps runtime under ~40 seconds.", ) wb_cta_run = gr.Button("Run Workbench", variant="primary") wb_cta_reply = gr.Markdown("Not run yet.") gr.Markdown("### Graph execution trace") wb_cta_trace = gr.Dataframe( headers=["step", "node", "action", "detail"], label="Supervisor routing + node invocations", interactive=False, wrap=True, ) gr.Markdown("### Phase 2 output — Initial Codes") wb_cta_codes = gr.Dataframe( label="Sentences with LLM-generated codes", interactive=False, wrap=True, ) # ------------ NEW: Phase 1 — Familiarization path ------------ with gr.Accordion("Phase 1 — Familiarizing Yourself With Your Data", open=False): gr.Markdown( "## Phase 1 — Familiarizing Yourself With Your Data\n\n" "*Braun & Clarke 2006, Phase 1: \"immerse yourself in the data " "to the extent that you are familiar with the depth and breadth " "of the content\"* (p. 87).\n\n" "This workbench implements Phase 1 through a three-step " "active-reading protocol. Two complementary dialogue partners " "(implemented as Gemini Gems backed by NotebookLM) guide the " "researcher through immersion and reflexive engagement, " "followed by researcher confirmation of every initial noticing " "against its source evidence.\n\n" "**Step 1 — Familiarization Facilitator** — an active-reading " "dialogue partner that asks grounded questions, surfaces " "patterns, and prompts the researcher to articulate initial " "noticings. Every response is anchored in direct quotation " "from the source corpus. \n" "**Step 2 — Reflexive Companion** — a critical dialogue partner " "that challenges the researcher's initial noticings, probes " "reflexive positioning, and verifies dataset immersion " "coverage across all sources. \n" "**Step 3 — Researcher Confirmation** — the researcher reviews " "each initial noticing against its source sentence and " "confirms, refines, or rejects it. This forces active " "engagement with the evidence and is the researcher's own " "analytic act — not the dialogue partner's.\n\n" "**Braun & Clarke 2006 compliance target:** ≥90% when both " "dialogue partners are engaged with iteration. Unclosable " "gaps documented in COMPLIANCE.md: felt sense of the data " "(phenomenological, unautomatable), and time-on-task " "verification (researcher's own responsibility)." ) # ---- Corpus loader ---- gr.Markdown("### Corpus — Canonical CSV") gr.Markdown( "*Phase 1 consumes a canonical CSV with five columns: " "`L1`, `L2`, `L3`, `L4`, `sentence_id`, `sentence`. " "Inputs tab transformers (PDF→CSV, web scrape→CSV) will " "produce this schema in a future round. For pipeline testing, " "load the built-in test corpus.*" ) with gr.Row(): bc_p1_load_test_btn = gr.Button( "Load built-in test_phase1.csv (30 sentences)", variant="secondary", scale=1, ) gr.Markdown("### Upload your own CSV (canonical schema)") bc_p1_upload_csv = gr.UploadButton( label="📁 Click to upload canonical CSV", file_types=[".csv"], file_count="single", variant="primary", ) gr.Markdown( "*Required columns: `L1`, `L2`, `L3`, `L4`, `sentence_id`, `sentence`*" ) bc_p1_corpus_status = gr.Markdown("**No corpus loaded.**") bc_p1_corpus_preview = gr.Dataframe( label="Corpus preview", interactive=False, wrap=True, ) # ---- Step 1 — Familiarization Facilitator ---- gr.Markdown("---") gr.Markdown("### Step 1 — Familiarization Facilitator") gr.Markdown( "An active-reading dialogue partner grounded in your " "corpus via NotebookLM. Copy the instructions below, " "create a Gem in Gemini with your NotebookLM notebook " "attached under Knowledge, engage in the active-reading " "dialogue, then paste your outputs here." ) bc_p1_facilitator_instructions = gr.Textbox( label="Familiarization Facilitator instructions (paste into Gemini Gem)", value="(instructions will be drafted in next round)", lines=8, max_lines=20, ) bc_p1_facilitator_memo = gr.Textbox( label="Paste: Familiarization notes (Braun & Clarke 2006, Phase 1 output)", lines=4, ) bc_p1_facilitator_transcript = gr.Textbox( label="Paste: Full active-reading dialogue transcript", lines=6, ) bc_p1_facilitator_citations = gr.Textbox( label="Paste: Source evidence — quoted sentences anchoring each initial noticing", lines=4, info="One citation per line. Format: L1 | L2 | sentence", ) # ---- Step 2 — Reflexive Companion ---- gr.Markdown("---") gr.Markdown("### Step 2 — Reflexive Companion") gr.Markdown( "A critical dialogue partner that challenges your initial " "noticings, probes your reflexive positioning, and verifies " "immersion coverage across all sources. Run this after the " "Facilitator dialogue is complete." ) bc_p1_companion_instructions = gr.Textbox( label="Reflexive Companion instructions (paste into Gemini Gem)", value="(instructions will be drafted in next round)", lines=8, max_lines=20, ) bc_p1_companion_challenges = gr.Textbox( label="Paste: Reflexive challenges raised by Companion", lines=4, ) bc_p1_companion_reflexivity = gr.Textbox( label="Paste: Reflexive positioning statement", lines=4, info="Your position as researcher — assumptions, theoretical lens, relationship to the data.", ) bc_p1_companion_breadth = gr.Textbox( label="Paste: Dataset immersion coverage notes", lines=3, info="Which sources and sections were engaged with, which remain unread.", ) # ---- Step 3 — Researcher Confirmation ---- gr.Markdown("---") gr.Markdown("### Step 3 — Researcher Confirmation") gr.Markdown( "Review each initial noticing against its source sentence. " "Confirm, refine, or reject each one. This is the researcher's " "own analytic act — not the dialogue partner's. Braun & Clarke " "2019/2021 insist that reflexive thematic analysis is *constructed* " "by the researcher's engagement with the data, not *extracted* by a tool." ) bc_p1_build_table_btn = gr.Button( "Build confirmation table from Steps 1 + 2", variant="secondary", ) bc_p1_validation_table = gr.Dataframe( headers=[ "L1", "L2", "L3", "L4", "sentence_id", "sentence", "initial_noticing", "reflexive_challenge", "researcher_confirmation", "refined_noticing", ], label="Phase 1 Researcher Confirmation Table — edit the last 4 columns", interactive=True, wrap=True, ) # ---- Save ---- gr.Markdown("---") bc_p1_save_btn = gr.Button( "Save Phase 1 output (all 3 steps → JSON artifact)", variant="primary", ) bc_p1_save_status = gr.Markdown("") # ------------ Phase 2 — Initial Coding ------------ with gr.Accordion("Phase 2 — Generating Initial Codes", open=False): gr.Markdown( "## Phase 2 — Generating Initial Codes\n\n" "*Braun & Clarke 2006, Phase 2: \"Coding interesting features " "of the data in a systematic fashion across the entire data " "set, collating data relevant to each code\"* (p. 87).\n\n" "This workbench implements Phase 2 through a **fully agentic " "LangGraph architecture**. The agent loops systematically " "across every sentence, generates both semantic and latent " "codes, maintains a growing codebook with definitions, and " "iterates with researcher-edited context. The researcher is " "the final authority — human code columns always override AI.\n\n" "**Architecture:** LangGraph supervisor + 7 agent tools " "(read_corpus, read_phase1_context, propose_code, " "check_codebook, add_to_codebook, flag_for_review, " "save_iteration). Agent decides ordering, flags ambiguous " "sentences, and avoids codebook duplication.\n\n" "**Braun & Clarke 2006 compliance target:** ~88% with full " "agent + 3 iterations + researcher review. Unclosable gaps: " "reflexive engagement depth, time-on-task verification, felt " "sense of codes (documented in COMPLIANCE.md).\n\n" "**Round 2 status (this release):** Real LangGraph agent wired. " "Click Run iteration 1 to invoke Mistral through the 7-tool " "supervisor loop. Runtime: ~60-120 seconds for 30 sentences. " "Iteration 2 reads researcher edits from iteration 1. " "Iteration 3 is the final convergence pass." ) # ---- Corpus source ---- gr.Markdown("### Corpus — inherited from Phase 1") gr.Markdown( "*Phase 2 reads the canonical corpus loaded in Phase 1. " "If no corpus is loaded, go to Phase 1 → Familiarization " "and load test_phase1.csv or your own canonical CSV first.*" ) bc_p2_corpus_status = gr.Markdown("No corpus loaded. Load in Phase 1 first.") bc_p2_refresh_btn = gr.Button( "Refresh corpus status from Phase 1", variant="secondary", ) # ---- Phase 1 context consumption ---- gr.Markdown("---") gr.Markdown("### Phase 1 context (consumed by the agent)") gr.Markdown( "*The Phase 2 agent reads the researcher's reflexive " "positioning and confirmed initial noticings from Phase 1 " "as context. This ensures Phase 2 coding is grounded in " "the researcher's familiarization, not starting from scratch.*" ) bc_p2_phase1_summary = gr.Markdown( "*Phase 1 output will appear here after Save Phase 1.*" ) # ---- Orientation — Braun & Clarke p. 84 ---- gr.Markdown("---") gr.Markdown("### Coding orientation (Braun & Clarke p. 84)") gr.Markdown( "*Braun & Clarke 2006 (p. 84) treat SEMANTIC vs LATENT as " "an analysis-wide choice, not a per-sentence distinction. " "Choose ONE orientation for this whole analysis. The agent " "will code every sentence at the level you pick.* \n\n" "**Semantic** — surface content, what the text explicitly says \n" "**Latent** — underlying assumptions, what the text implies" ) bc_p2_orientation = gr.Radio( choices=["semantic", "latent"], value="semantic", label="Coding orientation for this analysis", interactive=True, ) # ---- Iteration controls ---- gr.Markdown("---") gr.Markdown("### Agentic coding iterations") gr.Markdown( "Braun & Clarke insist on iterative refinement. Run " "iteration 1 → review AI codes in the table → edit human " "columns → run iteration 2 (agent reads your edits as " "context) → review → iteration 3 → converge." ) with gr.Row(): bc_p2_run_iter1_btn = gr.Button( "Run iteration 1", variant="primary", ) bc_p2_run_iter2_btn = gr.Button( "Run iteration 2 (reads your edits)", variant="primary", ) bc_p2_run_iter3_btn = gr.Button( "Run iteration 3 (final)", variant="primary", ) bc_p2_iter_status = gr.Markdown("*No iterations run yet.*") # ---- Coding table ---- gr.Markdown("---") gr.Markdown("### Initial Codes Table") gr.Markdown( "*Every sentence gets two code levels (semantic + latent) " "per iteration. Edit the `human_code_iterN` columns to " "override the agent. The `final_code` column is populated " "from the latest human edit or the latest AI code if no " "human edit exists.*" ) bc_p2_codes_table = gr.Dataframe( headers=[ "L1", "L2", "L3", "L4", "sentence_id", "sentence", "ai_code_iter1", "human_code_iter1", "ai_code_iter2", "human_code_iter2", "ai_code_iter3", "human_code_iter3", "final_code", "flagged", ], label="Phase 2 Initial Codes — edit human_code_iterN columns", interactive=True, wrap=True, ) # ---- Codebook ---- gr.Markdown("---") gr.Markdown("### Codebook") gr.Markdown( "*Braun & Clarke 2006 require a codebook: the dictionary " "of codes with definitions, provenance, and usage counts. " "The agent maintains this as it codes; the researcher can " "edit definitions directly.*" ) bc_p2_codebook_table = gr.Dataframe( headers=[ "code_name", "definition", "created_by", "provenance", "sentence_count", ], label="Phase 2 Codebook — edit definitions", interactive=True, wrap=True, ) # ---- Save ---- gr.Markdown("---") bc_p2_save_btn = gr.Button( "Save Final Codes + Codebook → Supabase + JSON artifact", variant="primary", ) bc_p2_save_status = gr.Markdown("") # ------------ Phase 3 -- Searching for Themes ------------ with gr.Accordion("Phase 3 — Searching for Themes", open=False): gr.Markdown( "## Phase 3 -- Searching for Themes\n\n" "*Braun & Clarke 2006, Phase 3: \"Collating codes into potential " "themes, gathering all data relevant to each potential theme\" (p. 89).*\n\n" "This phase clusters the Phase 2 codebook codes by semantic similarity " "(sentence-transformers embeddings + agglomerative clustering), then " "proposes a candidate theme name and description for each cluster " "via one Mistral call per cluster.\n\n" "**Researcher action:** review the candidate themes, edit " "`researcher_theme_name` and `researcher_notes` columns, then " "re-run with different thresholds if needed. B&C 2006 explicitly " "say Phase 3 is tentative and iterative." ) gr.Markdown("### Clustering parameters (researcher-controlled)") gr.Markdown( "*B&C 2006 do not prescribe a fixed number of themes. " "Themes emerge from the clustering threshold you set. " "Lower similarity = fewer, broader themes. " "Higher similarity = more, tighter themes.*" ) with gr.Row(): bc_p3_similarity = gr.Slider( minimum=0.3, maximum=0.95, value=0.60, step=0.05, label="Similarity threshold", info="Codes more similar than this cluster together. Default 0.60.", ) bc_p3_min_size = gr.Slider( minimum=2, maximum=10, value=2, step=1, label="Minimum cluster size", info="Clusters smaller than this go into noise bucket. Default 2.", ) bc_p3_run_btn = gr.Button( "Run Phase 3 -- Cluster codes into candidate themes", variant="primary", ) bc_p3_status = gr.Markdown("*No themes generated yet. Run Phase 2 first.*") gr.Markdown("---") gr.Markdown( "### Candidate Themes Table\n" "*Edit `researcher_theme_name` and `researcher_notes` to override " "or refine the AI-generated theme names. Researcher is the final " "authority (Braun & Clarke 2006, reflexive TA principle).*" ) bc_p3_themes_table = gr.Dataframe( headers=[ "theme_id", "candidate_theme_name", "description", "rationale", "member_codes", "code_count", "researcher_theme_name", "researcher_notes", ], label="Phase 3 Candidate Themes -- edit researcher_theme_name and researcher_notes", interactive=True, wrap=True, ) gr.Markdown("---") gr.Markdown( "### Noise Codes\n" "*Codes that did not fit any cluster (below minimum cluster size). " "Review these -- they may represent important edge cases or require " "lower similarity threshold to be absorbed.*" ) bc_p3_noise_table = gr.Dataframe( headers=["code_name", "definition"], label="Noise codes (did not cluster)", interactive=False, wrap=True, ) gr.Markdown("---") bc_p3_save_btn = gr.Button( "Save Phase 3 output (themes + noise -> JSON artifact)", variant="secondary", ) bc_p3_save_status = gr.Markdown("") # ------------ Phase 4 -- Reviewing Themes ------------ with gr.Accordion("Phase 4 — Reviewing Themes", open=False): gr.Markdown( "## Phase 4 -- Reviewing Themes\n\n" "*Braun & Clarke 2006 p. 91: \"Reviewing, refining and sometimes " "reducing your themes.\"*\n\n" "**Level 1** -- coded extracts check: are the member codes and " "sentences within each theme coherent? (within-theme cohesion score)\n\n" "**Level 2** -- full dataset check: is each theme distinct from " "others? Is it appropriately scoped?\n\n" "The LLM suggests a verdict for each theme. " "**Researcher makes the final call** by editing " "`researcher_verdict` and `researcher_action_notes`." ) bc_p4_run_btn = gr.Button( "Run Phase 4 -- Review all themes (cohesion + LLM verdict)", variant="primary", ) bc_p4_status = gr.Markdown("*No review run yet. Run Phase 3 first.*") gr.Markdown("---") gr.Markdown( "### Theme Review Table\n" "*`within_cohesion`: 0.0 = incoherent, 1.0 = perfectly tight. " "B&C guidance: cohesion < 0.4 = consider split/drop, > 0.7 = healthy.*\n\n" "*`llm_verdict`: AI suggestion (keep/merge/split/drop). " "Edit `researcher_verdict` with your own decision.*" ) bc_p4_review_table = gr.Dataframe( headers=[ "theme_id", "theme_name", "member_codes", "code_count", "member_sentence_count", "within_cohesion", "llm_verdict", "llm_reasoning", "llm_action_suggestion", "researcher_verdict", "researcher_action_notes", ], label="Phase 4 Theme Review -- edit researcher_verdict and researcher_action_notes", interactive=True, wrap=True, ) gr.Markdown("---") bc_p4_save_btn = gr.Button( "Save Phase 4 verdicts -> JSON artifact", variant="secondary", ) bc_p4_save_status = gr.Markdown("") # ------------ Phase 5 -- Defining and Naming ------------ with gr.Accordion("Phase 5 — Defining and Naming Themes", open=False): gr.Markdown( "## Phase 5 -- Defining and Naming Themes\n\n" "*Braun & Clarke 2006 p. 92: \"Ongoing analysis to refine " "the specifics of each theme, and the overall story the " "analysis tells, generating clear definitions and names.\"*\n\n" "This phase takes the **surviving themes from Phase 4** " "(verdict = keep or merge) and produces:\n" "- A **final theme name** (concise, punchy, analytically clear)\n" "- A **definition** (what the theme includes and excludes)\n" "- A **scope note** (what it does NOT cover)\n" "- A **narrative contribution** (role in the overall analysis story)\n\n" "**Researcher action:** Edit `researcher_final_name` and " "`researcher_definition` to override the AI suggestions. " "Researcher is the final authority." ) bc_p5_run_btn = gr.Button( "Run Phase 5 -- Define and name all surviving themes", variant="primary", ) bc_p5_status = gr.Markdown("*No definitions yet. Run Phase 4 first.*") gr.Markdown("---") gr.Markdown( "### Theme Definitions Table\n" "*Edit `researcher_final_name` and `researcher_definition` " "to set your own final names and definitions. " "These will carry forward to Phase 6 (the report).*" ) bc_p5_def_table = gr.Dataframe( headers=[ "theme_id", "original_name", "final_name", "definition", "scope_note", "narrative_contribution", "member_codes", "code_count", "researcher_final_name", "researcher_definition", ], label="Phase 5 Theme Definitions -- edit researcher_final_name and researcher_definition", interactive=True, wrap=True, ) gr.Markdown("---") bc_p5_save_btn = gr.Button( "Save Phase 5 definitions -> JSON artifact", variant="secondary", ) bc_p5_save_status = gr.Markdown("") # ------------ Phase 6 -- Producing the Report ------------ with gr.Accordion("Phase 6 — Producing the Report", open=False): gr.Markdown( "## Phase 6 -- Producing the Report\n\n" "*Braun & Clarke 2006 p. 93: \"The final phase is writing the report. " "The task here is to tell the complicated story of your data in a way " "that convinces the reader of the merit and validity of your analysis.\"*\n\n" "This phase generates a complete analytic report from your Phase 5 " "theme definitions, weaving together:\n" "- Theme definitions and analytic narratives\n" "- Data extracts (quotes) evidencing each theme\n" "- Cross-theme analysis\n" "- Conclusion\n\n" "**Researcher action:** Edit the report directly in the text area below. " "The report is yours — the AI drafts, you refine." ) bc_p6_research_question = gr.Textbox( label="Research question / focus (optional)", placeholder="e.g. How do employees experience organisational change?", lines=2, ) bc_p6_run_btn = gr.Button( "Run Phase 6 -- Generate analytic report", variant="primary", ) bc_p6_status = gr.Markdown("*No report yet. Run Phase 5 first.*") gr.Markdown("---") gr.Markdown( "### Analytic Report\n" "*Edit directly below. The report is in Markdown format — " "headers, bold, and block quotes render automatically.*" ) bc_p6_report_text = gr.Textbox( label="Phase 6 Analytic Report (editable)", lines=30, placeholder="Report will appear here after running Phase 6...", interactive=True, ) gr.Markdown("---") bc_p6_save_btn = gr.Button( "Save report -> JSON + Markdown artifacts", variant="secondary", ) bc_p6_save_status = gr.Markdown("") # ==================== COMPUTATIONAL GROUNDED THEORY (family) ==================== # ==================== SPJIMR CORPUS ANALYSIS ==================== with gr.Tab("SPJIMR Corpus Analysis"): import spjimr_ui spjimr_ui.render_spjimr_ui() # ZONE 5 — Event wiring (.click handlers — the glue) # ======================================================================== # Each .click() connects a button to a handler function. The function's # return values go into the components listed in outputs=[...]. # # GOLDEN RULE: the number of return values from the handler must match # the length of the outputs list, in the same order. # # chat_outputs is the shared list used by process_message, submit_form, # and new_chat. All three must return 8 values in the same order. # ---------------- # Event wiring # ---------------- chat_outputs = [ chatbot, table_out, extracted_out, chart_out, code_out, downloads_state, downloads_files_out, chat_input, ] send_btn.click( process_message, inputs=[chat_input, mode_select, llm_provider_select, llm_key_input, chatbot, loaded_context_state, downloads_state], outputs=chat_outputs, ) chat_input.submit( process_message, inputs=[chat_input, mode_select, llm_provider_select, llm_key_input, chatbot, loaded_context_state, downloads_state], outputs=chat_outputs, ) form_submit.click( submit_form, inputs=[ form_task, form_op, form_a, form_b, form_city, form_notes, mode_select, llm_provider_select, llm_key_input, chatbot, loaded_context_state, downloads_state, ], outputs=chat_outputs, ) form_clear.click( clear_form, outputs=[form_task, form_op, form_a, form_b, form_city, form_notes], ) new_chat_btn.click( new_chat, inputs=[downloads_state], outputs=chat_outputs, ) # Data source handlers scrape_btn.click( scrape_url, inputs=[url_input, downloads_state], outputs=[scrape_preview, scrape_status, loaded_context_state, downloads_state, downloads_files_out], ) scrape_clear_btn.click( clear_scrape, outputs=[url_input, scrape_preview, scrape_status, loaded_context_state], ) pdf_extract_btn.click( extract_pdf, inputs=[pdf_input, downloads_state], outputs=[pdf_preview, pdf_status, loaded_context_state, downloads_state, downloads_files_out], ) pdf_clear_btn.click( clear_pdf, outputs=[pdf_input, pdf_preview, pdf_status, loaded_context_state], ) csv_load_btn.click( load_spreadsheet, inputs=[csv_input, downloads_state], outputs=[csv_preview, csv_status, loaded_context_state, downloads_state, downloads_files_out], ) csv_clear_btn.click( clear_spreadsheet, outputs=[csv_input, csv_preview, csv_status, loaded_context_state], ) ml_load_btn.click( load_ml_examples, inputs=[downloads_state], outputs=[ml_preview, ml_status, loaded_context_state, downloads_state, downloads_files_out], ) ml_clear_btn.click( clear_ml_examples, outputs=[ml_preview, ml_status, loaded_context_state], ) # Training handlers (supervised) train_btn.click( handle_train, inputs=[downloads_state], outputs=[trained_state, train_status, confusion_out, downloads_state, downloads_files_out], ) train_clear_btn.click( clear_training, outputs=[trained_state, train_status, confusion_out, predict_out], ) predict_btn.click( handle_predict, inputs=[trained_state, predict_input, downloads_state], outputs=[predict_out, downloads_state, downloads_files_out], ) sup_label_filter.change( filter_training_dataset, inputs=[sup_label_filter], outputs=[sup_dataset_view], ) # Training handlers (unsupervised) cluster_btn.click( handle_cluster, inputs=[cluster_sim, cluster_min, cluster_nnear, cluster_llm_toggle, llm_provider_select, llm_key_input, downloads_state], outputs=[cluster_out, cluster_status, downloads_state, downloads_files_out], ) cluster_clear_btn.click( clear_clustering, outputs=[cluster_out, cluster_status], ) # ---- Vector Processing wiring ---- vectorize_btn.click( handle_vectorize_preview, inputs=[embedding_provider_select, embedding_key_input, downloads_state], outputs=[vectorize_out, vectorize_status, downloads_state, downloads_files_out], ) vectorize_clear_btn.click( clear_vectorize_preview, outputs=[vectorize_out, vectorize_status], ) vector_index_btn.click( handle_vector_index, inputs=[embedding_provider_select, embedding_key_input, downloads_state], outputs=[vector_index_status, downloads_state, downloads_files_out], ) vector_clear_btn.click( handle_vector_clear, inputs=[downloads_state], outputs=[vector_index_status, downloads_state, downloads_files_out], ) vector_search_btn.click( handle_vector_search, inputs=[vector_query, vector_n, embedding_provider_select, embedding_key_input, downloads_state], outputs=[vector_search_out, vector_search_status, downloads_state, downloads_files_out], ) # ---- Workbench wiring ---- wb_cgt_run.click( handle_wb_cgt, inputs=[wb_cgt_msg, wb_cgt_sim, wb_cgt_min, wb_cgt_nnear, llm_provider_select, llm_key_input, loaded_context_state, downloads_state], outputs=[wb_cgt_trace, wb_cgt_reply, wb_cgt_sentences, downloads_state, downloads_files_out], ) # ---- CGT Phase 2 Pattern Refinement wiring (Nelson 2020 Step 2) ---- # Phase 2 consumes wb_cgt_sentences (Phase 1 output) per Option α: # detection is a discrete step whose output feeds refinement. cgt_p2_surface_btn.click( handle_cgt_p2_surface, inputs=[wb_cgt_sentences, cgt_p2_n_exemplars, cgt_p2_reflexivity, llm_provider_select, llm_key_input, downloads_state], outputs=[cgt_p2_refinement_table, cgt_p2_status, downloads_state, downloads_files_out], ) cgt_p2_save_btn.click( handle_cgt_p2_save, inputs=[cgt_p2_refinement_table, cgt_p2_reflexivity, downloads_state], outputs=[cgt_p2_save_status, downloads_state, downloads_files_out], ) wb_cta_run.click( handle_wb_cta, inputs=[wb_cta_msg, wb_cta_max, llm_provider_select, llm_key_input, loaded_context_state, downloads_state], outputs=[wb_cta_trace, wb_cta_reply, wb_cta_codes, downloads_state, downloads_files_out], ) # ---- Phase 1 Familiarization wiring ---- bc_p1_load_test_btn.click( handle_p1_load_test_csv, inputs=[downloads_state], outputs=[bc_corpus_state, bc_p1_corpus_status, bc_p1_corpus_preview, downloads_state, downloads_files_out], ) bc_p1_upload_csv.upload( handle_p1_upload_csv, inputs=[bc_p1_upload_csv, downloads_state], outputs=[bc_corpus_state, bc_p1_corpus_status, bc_p1_corpus_preview, downloads_state, downloads_files_out], ) # ---- G&W at Scale load wiring (writes to gw_corpus_state) ---- gw_load_test_btn.click( handle_p1_load_test_csv, inputs=[downloads_state], outputs=[gw_corpus_state, gw_load_status, gw_corpus_preview, downloads_state, downloads_files_out], ) gw_upload_csv.upload( handle_p1_upload_csv, inputs=[gw_upload_csv, downloads_state], outputs=[gw_corpus_state, gw_load_status, gw_corpus_preview, downloads_state, downloads_files_out], ) # ---- B&C Workbench load wiring (writes to bc_corpus_state) ---- bc_load_test_btn.click( handle_p1_load_test_csv, inputs=[downloads_state], outputs=[bc_corpus_state, bc_load_status, bc_corpus_preview, downloads_state, downloads_files_out], ) bc_upload_csv.upload( handle_p1_upload_csv, inputs=[bc_upload_csv, downloads_state], outputs=[bc_corpus_state, bc_load_status, bc_corpus_preview, downloads_state, downloads_files_out], ) # ---- Nelson + Carlsen & Ralund load wiring ---- cgt_load_test_btn.click( handle_p1_load_test_csv, inputs=[downloads_state], outputs=[cgt_corpus_state, cgt_load_status, cgt_corpus_preview, downloads_state, downloads_files_out], ) cgt_upload_csv.upload( handle_p1_upload_csv, inputs=[cgt_upload_csv, downloads_state], outputs=[cgt_corpus_state, cgt_load_status, cgt_corpus_preview, downloads_state, downloads_files_out], ) bc_p1_build_table_btn.click( handle_p1_build_validation_table, inputs=[bc_corpus_state, bc_p1_facilitator_memo, bc_p1_facilitator_transcript, bc_p1_facilitator_citations, bc_p1_companion_challenges, bc_p1_companion_reflexivity, bc_p1_companion_breadth], outputs=[bc_p1_validation_table], ) bc_p1_save_btn.click( handle_p1_save, inputs=[bc_corpus_state, bc_p1_facilitator_memo, bc_p1_facilitator_transcript, bc_p1_facilitator_citations, bc_p1_companion_challenges, bc_p1_companion_reflexivity, bc_p1_companion_breadth, bc_p1_validation_table, downloads_state], outputs=[bc_p1_save_status, downloads_state, downloads_files_out], ) # ---- Phase 2 Initial Coding wiring ---- bc_p2_refresh_btn.click( handle_p2_refresh_corpus, inputs=[bc_corpus_state, bc_p1_facilitator_memo, bc_p1_companion_reflexivity, bc_p1_validation_table], outputs=[bc_p2_corpus_status, bc_p2_phase1_summary], ) bc_p2_run_iter1_btn.click( lambda corpus, codes, codebook, memo, reflex, vtable, prov, key, orient: handle_p2_run_iteration(1, corpus, codes, codebook, memo, reflex, vtable, prov, key, orient), inputs=[bc_corpus_state, bc_p2_codes_table, bc_p2_codebook_table, bc_p1_facilitator_memo, bc_p1_companion_reflexivity, bc_p1_validation_table, llm_provider_select, llm_key_input, bc_p2_orientation], outputs=[bc_p2_codes_table, bc_p2_codebook_table, bc_p2_iter_status], ) bc_p2_run_iter2_btn.click( lambda corpus, codes, codebook, memo, reflex, vtable, prov, key, orient: handle_p2_run_iteration(2, corpus, codes, codebook, memo, reflex, vtable, prov, key, orient), inputs=[bc_corpus_state, bc_p2_codes_table, bc_p2_codebook_table, bc_p1_facilitator_memo, bc_p1_companion_reflexivity, bc_p1_validation_table, llm_provider_select, llm_key_input, bc_p2_orientation], outputs=[bc_p2_codes_table, bc_p2_codebook_table, bc_p2_iter_status], ) bc_p2_run_iter3_btn.click( lambda corpus, codes, codebook, memo, reflex, vtable, prov, key, orient: handle_p2_run_iteration(3, corpus, codes, codebook, memo, reflex, vtable, prov, key, orient), inputs=[bc_corpus_state, bc_p2_codes_table, bc_p2_codebook_table, bc_p1_facilitator_memo, bc_p1_companion_reflexivity, bc_p1_validation_table, llm_provider_select, llm_key_input, bc_p2_orientation], outputs=[bc_p2_codes_table, bc_p2_codebook_table, bc_p2_iter_status], ) bc_p2_save_btn.click( handle_p2_save, inputs=[bc_corpus_state, bc_p2_codes_table, bc_p2_codebook_table, downloads_state], outputs=[bc_p2_save_status, downloads_state, downloads_files_out], ) # ---- Phase 3 Searching for Themes wiring ---- bc_p3_run_btn.click( handle_p3_run, inputs=[ bc_p2_codebook_table, bc_p3_similarity, bc_p3_min_size, bc_p2_orientation, bc_p1_companion_reflexivity, llm_provider_select, llm_key_input, downloads_state, ], outputs=[bc_p3_themes_table, bc_p3_noise_table, bc_p3_status, downloads_state, downloads_files_out], ) bc_p3_save_btn.click( handle_p3_save, inputs=[bc_p3_themes_table, bc_p3_noise_table, downloads_state], outputs=[bc_p3_save_status, downloads_state, downloads_files_out], ) # ---- Phase 4 Reviewing Themes wiring ---- bc_p4_run_btn.click( handle_p4_run, inputs=[ bc_p3_themes_table, bc_p2_codes_table, bc_p1_companion_reflexivity, llm_provider_select, llm_key_input, downloads_state, ], outputs=[bc_p4_review_table, bc_p4_status, downloads_state, downloads_files_out], ) bc_p4_save_btn.click( handle_p4_save, inputs=[bc_p4_review_table, downloads_state], outputs=[bc_p4_save_status, downloads_state, downloads_files_out], ) # ---- Phase 5 Defining and Naming wiring ---- bc_p5_run_btn.click( handle_p5_run, inputs=[ bc_p4_review_table, bc_p1_companion_reflexivity, llm_provider_select, llm_key_input, downloads_state, ], outputs=[bc_p5_def_table, bc_p5_status, downloads_state, downloads_files_out], ) bc_p5_save_btn.click( handle_p5_save, inputs=[bc_p5_def_table, downloads_state], outputs=[bc_p5_save_status, downloads_state, downloads_files_out], ) # ---- Phase 6 Producing the Report wiring ---- bc_p6_run_btn.click( handle_p6_run, inputs=[ bc_p5_def_table, bc_p2_codes_table, bc_p6_research_question, bc_p1_companion_reflexivity, bc_corpus_state, llm_provider_select, llm_key_input, downloads_state, ], outputs=[bc_p6_report_text, bc_p6_status, downloads_state, downloads_files_out], ) bc_p6_save_btn.click( handle_p6_save, inputs=[bc_p6_report_text, downloads_state], outputs=[bc_p6_save_status, downloads_state, downloads_files_out], ) # ---- Phase 0 Preparation wiring (Moreno-Ortiz 2023; BERTopic_Teen 2025) ---- # Each button updates gw_corpus_state in-place (so next prep step or # Phase 0 Sampling sees the prepared corpus) and refreshes the output # table + status. Downloads list accumulates artifact JSONs. p0prep_noise_btn.click( handle_p0prep_noise_strip, inputs=[gw_corpus_state, downloads_state], outputs=[gw_corpus_state, p0prep_table, p0prep_status, downloads_state, downloads_files_out], ) p0prep_length_btn.click( handle_p0prep_length_filter, inputs=[gw_corpus_state, p0prep_min_words, downloads_state], outputs=[gw_corpus_state, p0prep_table, p0prep_status, downloads_state, downloads_files_out], ) p0prep_hash_btn.click( handle_p0prep_hash_dedup, inputs=[gw_corpus_state, p0prep_case_sensitive, downloads_state], outputs=[gw_corpus_state, p0prep_table, p0prep_status, downloads_state, downloads_files_out], ) p0prep_semantic_btn.click( handle_p0prep_semantic_dedup, inputs=[gw_corpus_state, p0prep_semantic_threshold, downloads_state], outputs=[gw_corpus_state, p0prep_table, p0prep_status, downloads_state, downloads_files_out], ) # ---- Phase 0 Sampling wiring (Gauthier & Wallace 2022) ---- # Outputs 6: compression_table, split_proposals_table, compressed_corpus_state, # status, downloads_state, downloads_files_out gw_compress_btn.click( handle_compression_run, inputs=[ gw_corpus_state, gw_sentences_per_cluster, gw_min_cluster_size, gw_outlier_sample, gw_min_cluster_fit, downloads_state, ], outputs=[ gw_compress_table, gw_split_proposals_table, gw_compressed_corpus_state, gw_compress_status, downloads_state, downloads_files_out, ], ) # Apply researcher split decisions and re-sample gw_apply_splits_btn.click( handle_apply_split_decisions, inputs=[ gw_corpus_state, gw_split_proposals_table, gw_sentences_per_cluster, gw_min_cluster_size, gw_outlier_sample, gw_min_cluster_fit, downloads_state, ], outputs=[ gw_compress_table, gw_split_proposals_table, gw_compressed_corpus_state, gw_compress_status, downloads_state, downloads_files_out, ], ) # ---- Cluster labeling workflow (2 iterations + final commit) ---- # Phase 2 pattern: DataFrame in, DataFrame out. NO gr.State plumbing — # that was the cause of the stale-table + flicker bugs. gw_label_init_btn.click( handle_label_init_cluster_table, inputs=[gw_compress_table], outputs=[gw_cluster_labels_table, gw_label_status], ) gw_label_iter1_btn.click( handle_label_iter1, inputs=[ gw_cluster_labels_table, gw_compress_table, llm_provider_select, llm_key_input, downloads_state, ], outputs=[ gw_cluster_labels_table, gw_label_status, downloads_state, downloads_files_out, ], ) gw_label_iter2_btn.click( handle_label_iter2, inputs=[ gw_cluster_labels_table, gw_compress_table, llm_provider_select, llm_key_input, downloads_state, ], outputs=[ gw_cluster_labels_table, gw_label_status, downloads_state, downloads_files_out, ], ) # Commit handler returns 4 outputs (cluster_df, status, dl, dl). # gw_compress_table (Phase 0 Sampling Table) is NOT in outputs — one-way # pipeline, Phase 0 output stays frozen. Cluster Labeling produces its # own artifact; downstream stages join on cluster_id at read-time. gw_label_commit_btn.click( handle_label_commit_final, inputs=[ gw_cluster_labels_table, gw_compress_table, downloads_state, ], outputs=[ gw_cluster_labels_table, gw_label_status, downloads_state, downloads_files_out, ], ) # ---- Methodology comparison download buttons (one per workbench) ---- bc_comparison_dl_btn.click( lambda dl: handle_methodology_comparison_download("bc", dl), inputs=[downloads_state], outputs=[bc_comparison_dl_status, downloads_state, downloads_files_out], ) gw_comparison_dl_btn.click( lambda dl: handle_methodology_comparison_download("gw", dl), inputs=[downloads_state], outputs=[gw_comparison_dl_status, downloads_state, downloads_files_out], ) cgt_comparison_dl_btn.click( lambda dl: handle_methodology_comparison_download("cgt", dl), inputs=[downloads_state], outputs=[cgt_comparison_dl_status, downloads_state, downloads_files_out], ) # ---- Phase 1 Familiarization wiring (G&W path, reuses handle_p1_save) ---- # Methodological sequence: Phase 0 → researcher edits `selected` column → # Phase 1 Familiarization reads the researcher-APPROVED sentences (selected=true # in the edited gw_compress_table). Phase 1 save ALSO populates # gw_approved_corpus_state which Phase 2-6 read from, enforcing the sequence # at the wiring level. def _gw_p1_save_with_selected_filter( compress_table, raw_corpus, memo, transcript, citations, challenges, reflexivity, breadth, valtable, dl, ): # Determine source — edited compression table (preferred) or raw corpus source = None if isinstance(compress_table, pd.DataFrame) and len(compress_table) > 0: if "selected" in compress_table.columns: # Normalize selected column: handle both bool and str 'true'/'false' sel = compress_table["selected"] mask = sel.apply( lambda v: ( True if v is True else False if v is False else str(v).strip().lower() in ("true", "1", "yes", "t") ) ) filtered = compress_table[mask] source = filtered.to_dict("records") else: source = compress_table.to_dict("records") if not source: source = raw_corpus or [] # Call the existing handler status, dl_out, dl_files = handle_p1_save( source, memo, transcript, citations, challenges, reflexivity, breadth, valtable, dl, ) # Also publish the approved corpus to gw_approved_corpus_state for Phase 2-6 return status, dl_out, dl_files, source gw_p1_save_btn.click( _gw_p1_save_with_selected_filter, inputs=[gw_compress_table, gw_corpus_state, gw_p1_facilitator_memo, gw_p1_facilitator_transcript, gw_p1_facilitator_citations, gw_p1_companion_challenges, gw_p1_companion_reflexivity, gw_p1_companion_breadth, gw_p1_validation_table, downloads_state], outputs=[gw_p1_save_status, downloads_state, downloads_files_out, gw_approved_corpus_state], ) # ==================================================================== # G&W Phase 2-6 wiring — reuses B&C handler functions with G&W state # objects. Data isolation: every input/output references gw_* widgets. # Phase 2 reads gw_approved_corpus_state (populated by G&W Phase 1 save). # ==================================================================== # ---- G&W Phase 2 Initial Coding wiring ---- gw_p2_refresh_btn.click( handle_p2_refresh_corpus, inputs=[gw_approved_corpus_state, gw_p1_facilitator_memo, gw_p1_companion_reflexivity, gw_p1_validation_table], outputs=[gw_p2_corpus_status, gw_p2_phase1_summary], ) gw_p2_run_iter1_btn.click( lambda corpus, codes, codebook, memo, reflex, vtable, prov, key, orient: handle_p2_run_iteration(1, corpus, codes, codebook, memo, reflex, vtable, prov, key, orient), inputs=[gw_approved_corpus_state, gw_p2_codes_table, gw_p2_codebook_table, gw_p1_facilitator_memo, gw_p1_companion_reflexivity, gw_p1_validation_table, llm_provider_select, llm_key_input, gw_p2_orientation], outputs=[gw_p2_codes_table, gw_p2_codebook_table, gw_p2_iter_status], ) gw_p2_run_iter2_btn.click( lambda corpus, codes, codebook, memo, reflex, vtable, prov, key, orient: handle_p2_run_iteration(2, corpus, codes, codebook, memo, reflex, vtable, prov, key, orient), inputs=[gw_approved_corpus_state, gw_p2_codes_table, gw_p2_codebook_table, gw_p1_facilitator_memo, gw_p1_companion_reflexivity, gw_p1_validation_table, llm_provider_select, llm_key_input, gw_p2_orientation], outputs=[gw_p2_codes_table, gw_p2_codebook_table, gw_p2_iter_status], ) gw_p2_run_iter3_btn.click( lambda corpus, codes, codebook, memo, reflex, vtable, prov, key, orient: handle_p2_run_iteration(3, corpus, codes, codebook, memo, reflex, vtable, prov, key, orient), inputs=[gw_approved_corpus_state, gw_p2_codes_table, gw_p2_codebook_table, gw_p1_facilitator_memo, gw_p1_companion_reflexivity, gw_p1_validation_table, llm_provider_select, llm_key_input, gw_p2_orientation], outputs=[gw_p2_codes_table, gw_p2_codebook_table, gw_p2_iter_status], ) gw_p2_save_btn.click( handle_p2_save, inputs=[gw_approved_corpus_state, gw_p2_codes_table, gw_p2_codebook_table, downloads_state], outputs=[gw_p2_save_status, downloads_state, downloads_files_out], ) # ---- G&W Phase 3 Searching for Themes wiring ---- gw_p3_run_btn.click( handle_p3_run, inputs=[ gw_p2_codebook_table, gw_p3_similarity, gw_p3_min_size, gw_p2_orientation, gw_p1_companion_reflexivity, llm_provider_select, llm_key_input, downloads_state, ], outputs=[gw_p3_themes_table, gw_p3_noise_table, gw_p3_status, downloads_state, downloads_files_out], ) gw_p3_save_btn.click( handle_p3_save, inputs=[gw_p3_themes_table, gw_p3_noise_table, downloads_state], outputs=[gw_p3_save_status, downloads_state, downloads_files_out], ) # ---- G&W Phase 4 Reviewing Themes wiring ---- gw_p4_run_btn.click( handle_p4_run, inputs=[ gw_p3_themes_table, gw_p2_codes_table, gw_p1_companion_reflexivity, llm_provider_select, llm_key_input, downloads_state, ], outputs=[gw_p4_review_table, gw_p4_status, downloads_state, downloads_files_out], ) gw_p4_save_btn.click( handle_p4_save, inputs=[gw_p4_review_table, downloads_state], outputs=[gw_p4_save_status, downloads_state, downloads_files_out], ) # ---- G&W Phase 5 Defining and Naming wiring ---- gw_p5_run_btn.click( handle_p5_run, inputs=[ gw_p4_review_table, gw_p1_companion_reflexivity, llm_provider_select, llm_key_input, downloads_state, ], outputs=[gw_p5_def_table, gw_p5_status, downloads_state, downloads_files_out], ) gw_p5_save_btn.click( handle_p5_save, inputs=[gw_p5_def_table, downloads_state], outputs=[gw_p5_save_status, downloads_state, downloads_files_out], ) # ---- G&W Phase 6 Producing the Report wiring ---- gw_p6_run_btn.click( handle_p6_run, inputs=[ gw_p5_def_table, gw_p2_codes_table, gw_p6_research_question, gw_p1_companion_reflexivity, gw_approved_corpus_state, llm_provider_select, llm_key_input, downloads_state, ], outputs=[gw_p6_report_text, gw_p6_status, downloads_state, downloads_files_out], ) gw_p6_save_btn.click( handle_p6_save, inputs=[gw_p6_report_text, downloads_state], outputs=[gw_p6_save_status, downloads_state, downloads_files_out], ) if __name__ == "__main__": # Supabase startup check -- create tables if they don't exist if DB_OK: _db_status = db.startup_check() if _db_status["db_available"]: print(f"[app.py] Supabase connected. Tables ready: {_db_status['tables_created']}") else: print(f"[app.py] Supabase not available: {_db_status.get('error')}") else: print(f"[app.py] database.py not loaded: {_db_err}") # ssr_mode=False: Gradio 5/6's Server-Side Rendering breaks demo.launch() # on HuggingFace Spaces with the "localhost not accessible" error. # Confirmed workaround from HF forums + Gradio Discord. demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)