# ============================================================================
# cgt_phase2_refinement.py — CGT Phase 2 Pattern Refinement (Nelson 2020 Step 2)
# ============================================================================
#
# Nelson 2020 Pattern Refinement = deep reading of exemplars → researcher
# refines pattern definitions → keep / merge / split / drop / rename verdict
# per pattern. This is axial coding in traditional grounded theory terms.
#
# Carlsen & Ralund 2022 researcher-centrality: the tool surfaces exemplars
# and drafts interpretive memos; the researcher writes the final memo and
# decides the verdict. The LLM never decides pattern fate.
#
# Flow:
#   1. Consume Phase 1 sentence→cluster assignments (sentences_df)
#   2. For each non-noise cluster, surface top-N exemplar sentences
#   3. LLM drafts interpretive memo per cluster (temp=0.0 for reproducibility)
#   4. Package as RefinementRow list → DataFrame for researcher UI
#   5. Researcher edits researcher_memo + verdict + new_label
#   6. Save artifact with method_contracts_verified
# ============================================================================

from dataclasses import dataclass, asdict, field
from typing import List, Dict, Optional

import pandas as pd

try:
    import providers
    PROVIDERS_OK = True
except Exception:
    # The module still loads without the providers package; memo drafting is
    # simply skipped in that case (see run_pattern_refinement).
    PROVIDERS_OK = False


@dataclass
class RefinementRow:
    """One pattern's refinement record — researcher edits fields marked [EDIT]."""

    pattern_id: str            # cluster_id from Phase 1 (string, e.g. "0", "1", ...)
    pattern_label: str         # cluster_label from Phase 1 (LLM-drafted)
    n_sentences: int           # count of sentences in this cluster
    exemplars: str             # top-N exemplar sentences joined with " | "
    llm_memo_draft: str        # LLM-drafted interpretive memo (read-only)
    researcher_memo: str = ""  # [EDIT] — researcher's final memo
    verdict: str = ""          # [EDIT] — keep / merge / split / drop / rename
    new_label: str = ""        # [EDIT] — required if verdict in {rename, split}


# ----------------------------------------------------------------
# Prompt template — Nelson 2020 Phase 2 interpretive memo
# ----------------------------------------------------------------
MEMO_PROMPT_TEMPLATE = """You are an analyst applying Nelson (2020) computational \
grounded theory Phase 2 — Pattern Refinement.

Researcher's reflexive positioning (Carlsen & Ralund 2022):
{reflexive_pos}

Pattern label (from Phase 1 clustering): {pattern_label}

Exemplar sentences in this pattern (researcher reads these for deep interpretation):
{numbered_exemplars}

Draft a brief interpretive memo (3-5 sentences, max 150 words) covering:
1. What this pattern seems to capture
2. Any key dimensions or tensions across the exemplars
3. Whether the Phase 1 pattern label seems apt

Be specific to the sentences. Do not fabricate content not present in the exemplars.
This is a draft for the researcher to refine — you do not decide the pattern's fate.

Memo:"""


# ----------------------------------------------------------------
# Shared helpers
# ----------------------------------------------------------------
def _cell_text(value) -> str:
    """Return a table cell as stripped text, mapping None/NaN/NA to ""."""
    if value is None:
        return ""
    try:
        if pd.isna(value):
            return ""
    except (TypeError, ValueError):
        # Non-scalar cell (e.g. a list): pd.isna yields an array whose truth
        # value may be ambiguous — treat the cell as present and stringify it.
        pass
    return str(value).strip()


def _refinement_result(
    rows: List[Dict],
    n_noise: int,
    llm_errors: List[str],
) -> Dict:
    """Assemble the result dict returned by run_pattern_refinement."""
    return {
        "refinement_rows": rows,
        "n_patterns": len(rows),
        "n_noise": n_noise,
        "llm_errors": llm_errors,
    }


# ----------------------------------------------------------------
# Core function — run Phase 2 refinement
# ----------------------------------------------------------------
def run_pattern_refinement(
    sentences_df: pd.DataFrame,
    n_exemplars: int,
    llm_provider: str,
    llm_key: str,
    reflexive_pos: str,
) -> Dict:
    """Generate RefinementRow list with LLM-drafted memos.

    Args:
        sentences_df: Phase 1 output with columns {sentence, cluster_id,
            cluster_label, ...optional dist_to_centroid}
        n_exemplars: top-N exemplars per cluster (negative values are
            treated as 0)
        llm_provider: e.g. "Mistral"
        llm_key: LLM API key; when empty, or when the providers module could
            not be imported, no memo is drafted and llm_memo_draft stays ""
        reflexive_pos: researcher's reflexive positioning statement

    Returns:
        dict with:
            refinement_rows: list[dict] — ready for DataFrame display
            n_patterns: int — number of non-noise clusters processed
            n_noise: int — number of noise-assigned sentences skipped
            llm_errors: list[str] — per-cluster errors if any; also carries
                the message for a missing cluster_id / sentence column
    """
    if sentences_df is None or len(sentences_df) == 0:
        return _refinement_result([], 0, [])

    df = sentences_df.copy()

    # Normalize: cluster_id can be "noise" or int-as-string
    if "cluster_id" not in df.columns:
        return _refinement_result(
            [], 0, ["no cluster_id column in Phase 1 output"]
        )

    # Separate noise from clusters
    noise_mask = df["cluster_id"].astype(str).str.lower() == "noise"
    n_noise = int(noise_mask.sum())
    clusters_df = df[~noise_mask]

    # Report a missing sentence column the same way as a missing cluster_id
    # instead of failing with a KeyError inside the loop below.
    if "sentence" not in clusters_df.columns:
        return _refinement_result(
            [], n_noise, ["no sentence column in Phase 1 output"]
        )

    # A negative count would make DataFrame.head() return "all but the last
    # |n| rows", which is never what a top-N request means.
    top_n_count = max(int(n_exemplars), 0)

    has_label = "cluster_label" in clusters_df.columns
    has_dist = "dist_to_centroid" in clusters_df.columns

    # LLM client
    client = None
    model_name = None
    llm_errors: List[str] = []
    if PROVIDERS_OK and llm_key:
        try:
            client = providers.get_llm_client(llm_provider, llm_key)
            model_name = providers.get_llm_model(llm_provider)
        except Exception as e:
            llm_errors.append(f"llm_client_init: {e}")
            client = None

    refinement_rows: List[Dict] = []

    # Group by cluster_id
    for cluster_id, cluster_df in clusters_df.groupby("cluster_id", sort=True):
        # Sort exemplars by dist_to_centroid if available (closest first)
        if has_dist:
            ranked = cluster_df.sort_values(
                "dist_to_centroid", ascending=True, na_position="last"
            )
        else:
            ranked = cluster_df

        # Top-N exemplars
        exemplar_sentences = (
            ranked.head(top_n_count)["sentence"].astype(str).tolist()
        )

        # Fall back to a synthetic label when Phase 1 supplied none; a NaN
        # label would otherwise be rendered as the literal text "nan".
        raw_label = cluster_df["cluster_label"].iloc[0] if has_label else None
        if _cell_text(raw_label):
            pattern_label = str(raw_label)
        else:
            pattern_label = f"cluster_{cluster_id}"

        # LLM memo
        memo = ""
        if client is not None:
            numbered = "\n".join(
                f" {i}. {s}" for i, s in enumerate(exemplar_sentences, start=1)
            )
            prompt = MEMO_PROMPT_TEMPLATE.format(
                reflexive_pos=(reflexive_pos or "(none provided)").strip(),
                pattern_label=pattern_label,
                numbered_exemplars=numbered,
            )
            try:
                resp = client.chat.complete(
                    model=model_name,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.0,  # reproducibility — determinism contract
                    max_tokens=300,
                )
                memo = (resp.choices[0].message.content or "").strip()
                # Trim if runaway
                memo = memo[:1200]
            except Exception as e:
                # One failed cluster must not abort the rest: record the
                # error and surface it in the draft so the researcher sees it.
                memo = f"(LLM error: {e})"
                llm_errors.append(f"cluster_{cluster_id}: {e}")

        # Build the row through the dataclass so the dict schema cannot drift
        # from RefinementRow; the [EDIT] fields keep their "" defaults.
        refinement_rows.append(
            asdict(
                RefinementRow(
                    pattern_id=str(cluster_id),
                    pattern_label=pattern_label,
                    n_sentences=int(len(cluster_df)),
                    exemplars=" | ".join(exemplar_sentences),
                    llm_memo_draft=memo,
                )
            )
        )

    return _refinement_result(refinement_rows, n_noise, llm_errors)


# ----------------------------------------------------------------
# Validation helper — researcher's completed refinement table
# ----------------------------------------------------------------
VALID_VERDICTS = {"keep", "merge", "split", "drop", "rename"}


def validate_refinement_table(refinement_df: pd.DataFrame) -> Dict:
    """Validate researcher's completed refinement table.

    Enforces:
      - every row has a verdict in VALID_VERDICTS
      - rows with verdict in {rename, split} must have new_label non-empty
      - every row has a researcher_memo (at least 1 char)

    Cells holding None / NaN / NA count as empty, so a blank cell cannot
    pass as the text "nan" or "None".

    Returns:
        dict with:
            ok: bool — True when no rule was violated
            errors: list[str] — one message per violation, in row order
    """
    if refinement_df is None or len(refinement_df) == 0:
        return {"ok": False, "errors": ["refinement_table is empty"]}

    errors: List[str] = []
    for i, row in refinement_df.iterrows():
        pid = row.get("pattern_id", f"row_{i}")
        verdict = _cell_text(row.get("verdict", "")).lower()
        memo = _cell_text(row.get("researcher_memo", ""))
        new_label = _cell_text(row.get("new_label", ""))

        if verdict not in VALID_VERDICTS:
            errors.append(
                f"pattern {pid}: verdict must be one of {sorted(VALID_VERDICTS)}, got {verdict!r}"
            )
        if not memo:
            errors.append(f"pattern {pid}: researcher_memo is empty")
        if verdict in ("rename", "split") and not new_label:
            errors.append(
                f"pattern {pid}: verdict={verdict} requires new_label (not empty)"
            )

    return {"ok": len(errors) == 0, "errors": errors}