Spjimr / cgt_phase2_refinement.py
shahidshaikh's picture
Upload 40 files
a52bae4 verified
# ============================================================================
# cgt_phase2_refinement.py — CGT Phase 2 Pattern Refinement (Nelson 2020 Step 2)
# ============================================================================
#
# Nelson 2020 Pattern Refinement = deep reading of exemplars → researcher
# refines pattern definitions → keep / merge / split / drop / rename verdict
# per pattern. This is axial coding in traditional grounded theory terms.
#
# Carlsen & Ralund 2022 researcher-centrality: the tool surfaces exemplars
# and drafts interpretive memos; the researcher writes the final memo and
# decides the verdict. The LLM never decides pattern fate.
#
# Flow:
# 1. Consume Phase 1 sentence→cluster assignments (sentences_df)
# 2. For each non-noise cluster, surface top-N exemplar sentences
# 3. LLM drafts interpretive memo per cluster (temp=0.0 for reproducibility)
# 4. Package as a list of RefinementRow-shaped dicts → DataFrame for researcher UI
# 5. Researcher edits researcher_memo + verdict + new_label
# 6. Save artifact with method_contracts_verified
# ============================================================================
from dataclasses import dataclass, asdict, field
from typing import List, Dict, Optional
import pandas as pd
try:
import providers
PROVIDERS_OK = True
except Exception:
PROVIDERS_OK = False
@dataclass
class RefinementRow:
    """Refinement record for a single Phase 1 pattern.

    The last three fields default to empty strings and are the ones the
    researcher fills in ([EDIT]); the rest are populated by the tool.

    Attributes:
        pattern_id: cluster_id from Phase 1, as a string (e.g. "0", "1", ...).
        pattern_label: cluster_label from Phase 1 (LLM-drafted).
        n_sentences: count of sentences in this cluster.
        exemplars: top-N exemplar sentences joined with " | ".
        llm_memo_draft: LLM-drafted interpretive memo (read-only).
        researcher_memo: [EDIT] the researcher's final memo.
        verdict: [EDIT] one of keep / merge / split / drop / rename.
        new_label: [EDIT] required when verdict is rename or split.
    """

    # Tool-populated fields
    pattern_id: str
    pattern_label: str
    n_sentences: int
    exemplars: str
    llm_memo_draft: str

    # Researcher-edited fields
    researcher_memo: str = ""
    verdict: str = ""
    new_label: str = ""
# ----------------------------------------------------------------
# Prompt template — Nelson 2020 Phase 2 interpretive memo
# ----------------------------------------------------------------
# Filled with str.format() by run_pattern_refinement, once per cluster.
# Placeholders:
#   {reflexive_pos}      — researcher's reflexive positioning statement
#   {pattern_label}      — Phase 1 label of the cluster being memoed
#   {numbered_exemplars} — exemplar sentences, one numbered line each
# The trailing backslash on the first line joins it to the second, so the
# opening sentence reaches the model as a single line.
# The text below is sent to the LLM verbatim; any edit changes the drafts.
MEMO_PROMPT_TEMPLATE = """You are an analyst applying Nelson (2020) computational \
grounded theory Phase 2 — Pattern Refinement.
Researcher's reflexive positioning (Carlsen & Ralund 2022):
{reflexive_pos}
Pattern label (from Phase 1 clustering): {pattern_label}
Exemplar sentences in this pattern (researcher reads these for deep interpretation):
{numbered_exemplars}
Draft a brief interpretive memo (3-5 sentences, max 150 words) covering:
1. What this pattern seems to capture
2. Any key dimensions or tensions across the exemplars
3. Whether the Phase 1 pattern label seems apt
Be specific to the sentences. Do not fabricate content not present in the exemplars.
This is a draft for the researcher to refine — you do not decide the pattern's fate.
Memo:"""
# ----------------------------------------------------------------
# Core function — run Phase 2 refinement
# ----------------------------------------------------------------
def run_pattern_refinement(
    sentences_df: pd.DataFrame,
    n_exemplars: int,
    llm_provider: str,
    llm_key: str,
    reflexive_pos: str,
) -> Dict:
    """Generate one refinement record per non-noise cluster, with LLM-drafted memos.

    Args:
        sentences_df: Phase 1 output with columns
            {sentence, cluster_id, cluster_label, ...optional dist_to_centroid}.
            `sentence` and `cluster_id` are required; the other two are optional.
        n_exemplars: top-N exemplars per cluster. Values below 1 are raised
            to 1, because DataFrame.head() with a negative count returns
            "all but the last N" rows rather than a top-N slice.
        llm_provider: e.g. "Mistral"
        llm_key: LLM API key. When empty (or when the `providers` module
            could not be imported) no LLM call is made and every
            llm_memo_draft is left as "".
        reflexive_pos: researcher's reflexive positioning statement

    Returns:
        dict with:
            refinement_rows: list[dict] — ready for DataFrame display
            n_patterns: int — number of non-noise clusters processed
            n_noise: int — number of sentences skipped because their
                cluster_id is "noise" (case-insensitive) or missing
            llm_errors: list[str] — per-cluster errors if any; also carries
                the message when a required column is absent
    """
    if sentences_df is None or len(sentences_df) == 0:
        return {"refinement_rows": [], "n_patterns": 0, "n_noise": 0, "llm_errors": []}

    df = sentences_df.copy()

    # Report a missing required column the same way for both columns instead
    # of letting a later df["sentence"] lookup raise KeyError mid-run.
    for required in ("cluster_id", "sentence"):
        if required not in df.columns:
            return {"refinement_rows": [], "n_patterns": 0, "n_noise": 0,
                    "llm_errors": [f"no {required} column in Phase 1 output"]}

    # Separate noise from clusters. cluster_id can be "noise" or an
    # int-as-string. Rows with a missing cluster_id are counted as noise:
    # groupby() would drop them anyway, and counting them keeps the
    # skipped-sentence total honest.
    cluster_ids = df["cluster_id"]
    noise_mask = cluster_ids.isna() | (cluster_ids.astype(str).str.lower() == "noise")
    n_noise = int(noise_mask.sum())
    clusters_df = df[~noise_mask]

    top_k = max(1, int(n_exemplars))

    # LLM client (optional). llm_key is tested first so that an empty key
    # short-circuits before anything provider-related is touched.
    client = None
    model_name = None
    llm_errors: List[str] = []
    if llm_key and PROVIDERS_OK:
        try:
            client = providers.get_llm_client(llm_provider, llm_key)
            model_name = providers.get_llm_model(llm_provider)
        except Exception as e:
            llm_errors.append(f"llm_client_init: {e}")
            client = None

    # Whitespace-only positioning is treated the same as none at all.
    positioning = (reflexive_pos or "").strip() or "(none provided)"

    has_distance = "dist_to_centroid" in clusters_df.columns
    has_label = "cluster_label" in clusters_df.columns

    refinement_rows: List[Dict] = []
    for cluster_id, cluster_df in clusters_df.groupby("cluster_id", sort=True):
        # Exemplars: closest to the centroid first when distances exist,
        # otherwise the first rows in their existing order.
        if has_distance:
            ranked = cluster_df.sort_values(
                "dist_to_centroid", ascending=True, na_position="last"
            )
        else:
            ranked = cluster_df
        exemplar_sentences = ranked.head(top_k)["sentence"].astype(str).tolist()

        # Label: first non-missing cluster_label in the cluster, so a NaN
        # cell never surfaces as the literal label "nan".
        pattern_label = f"cluster_{cluster_id}"
        if has_label:
            known_labels = cluster_df["cluster_label"].dropna()
            if len(known_labels) > 0:
                pattern_label = str(known_labels.iloc[0])

        # LLM memo (draft only — the researcher writes the final memo)
        memo = ""
        if client is not None:
            numbered = "\n".join(
                f" {i+1}. {s}" for i, s in enumerate(exemplar_sentences)
            )
            prompt = MEMO_PROMPT_TEMPLATE.format(
                reflexive_pos=positioning,
                pattern_label=pattern_label,
                numbered_exemplars=numbered,
            )
            try:
                resp = client.chat.complete(
                    model=model_name,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.0,  # reproducibility — determinism contract
                    max_tokens=300,
                )
                memo = (resp.choices[0].message.content or "").strip()
                # Trim if runaway
                memo = memo[:1200]
            except Exception as e:
                # Best effort: one failed cluster must not abort the others.
                # The error is shown in the draft cell and also collected.
                memo = f"(LLM error: {e})"
                llm_errors.append(f"cluster_{cluster_id}: {e}")

        refinement_rows.append({
            "pattern_id": str(cluster_id),
            "pattern_label": pattern_label,
            "n_sentences": int(len(cluster_df)),
            "exemplars": " | ".join(exemplar_sentences),
            "llm_memo_draft": memo,
            "researcher_memo": "",
            "verdict": "",
            "new_label": "",
        })

    return {
        "refinement_rows": refinement_rows,
        "n_patterns": len(refinement_rows),
        "n_noise": n_noise,
        "llm_errors": llm_errors,
    }
# ----------------------------------------------------------------
# Validation helper — researcher's completed refinement table
# ----------------------------------------------------------------
VALID_VERDICTS = {"keep", "merge", "split", "drop", "rename"}


def _cell_text(value) -> str:
    """Return a table cell as stripped text, mapping missing values to ""."""
    # str() would turn None / NaN / pd.NA into "None" / "nan" / "<NA>",
    # which are non-empty and would slip past the emptiness checks below.
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return ""
    return str(value).strip()


def validate_refinement_table(refinement_df: pd.DataFrame) -> Dict:
    """Validate researcher's completed refinement table.

    Enforces:
    - every row has a verdict in VALID_VERDICTS (case-insensitive, trimmed)
    - rows with verdict in {rename, split} must have new_label non-empty
    - every row has a researcher_memo (at least 1 non-whitespace char)

    Missing cells (None, NaN, pd.NA) and absent columns count as empty.

    Args:
        refinement_df: table with columns pattern_id, verdict,
            researcher_memo and new_label, one row per pattern.

    Returns:
        dict with:
            ok: bool — True only when no rule is violated
            errors: list[str] — one message per violation, in row order;
                ["refinement_table is empty"] for a None or zero-row table
    """
    if refinement_df is None or len(refinement_df) == 0:
        return {"ok": False, "errors": ["refinement_table is empty"]}

    errors: List[str] = []
    for i, row in refinement_df.iterrows():
        # Fall back to the row index when pattern_id is absent or blank so
        # every message still points at a findable row.
        pid = _cell_text(row.get("pattern_id")) or f"row_{i}"
        verdict = _cell_text(row.get("verdict", "")).lower()
        memo = _cell_text(row.get("researcher_memo", ""))
        new_label = _cell_text(row.get("new_label", ""))

        if verdict not in VALID_VERDICTS:
            errors.append(
                f"pattern {pid}: verdict must be one of {sorted(VALID_VERDICTS)}, got {verdict!r}"
            )
        if not memo:
            errors.append(f"pattern {pid}: researcher_memo is empty")
        if verdict in ("rename", "split") and not new_label:
            errors.append(
                f"pattern {pid}: verdict={verdict} requires new_label (not empty)"
            )

    return {"ok": len(errors) == 0, "errors": errors}