|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| from dataclasses import dataclass, asdict, field
|
| from typing import List, Dict, Optional
|
| import pandas as pd
|
|
|
| try:
|
| import providers
|
| PROVIDERS_OK = True
|
| except Exception:
|
| PROVIDERS_OK = False
|
|
|
|
|
@dataclass
class RefinementRow:
    """Refinement record for a single Phase 1 pattern.

    The first five fields have no defaults and describe the pattern itself.
    The last three default to the empty string; they are the ones the
    researcher is expected to fill in ([EDIT] fields) and are the fields
    checked by ``validate_refinement_table``.
    """

    # --- pattern description (required, no defaults) ---
    pattern_id: str
    pattern_label: str
    n_sentences: int
    exemplars: str
    llm_memo_draft: str

    # --- [EDIT] researcher-completed fields (blank until edited) ---
    researcher_memo: str = field(default="")
    verdict: str = field(default="")
    new_label: str = field(default="")
|
|
|
|
|
|
|
|
|
|
|
# Prompt sent to the LLM to draft one interpretive memo per pattern.
# It is filled with ``str.format`` and expects exactly three placeholders:
#   {reflexive_pos}      - the researcher's reflexive positioning statement
#   {pattern_label}      - the Phase 1 label of the pattern
#   {numbered_exemplars} - the exemplar sentences, already numbered one per line
# The trailing backslash on the first line joins it with the second line, so
# the opening sentence reaches the model as a single line.
MEMO_PROMPT_TEMPLATE = """You are an analyst applying Nelson (2020) computational \
grounded theory Phase 2 — Pattern Refinement.

Researcher's reflexive positioning (Carlsen & Ralund 2022):
{reflexive_pos}

Pattern label (from Phase 1 clustering): {pattern_label}

Exemplar sentences in this pattern (researcher reads these for deep interpretation):
{numbered_exemplars}

Draft a brief interpretive memo (3-5 sentences, max 150 words) covering:
1. What this pattern seems to capture
2. Any key dimensions or tensions across the exemplars
3. Whether the Phase 1 pattern label seems apt

Be specific to the sentences. Do not fabricate content not present in the exemplars.
This is a draft for the researcher to refine — you do not decide the pattern's fate.

Memo:"""
|
|
|
|
|
|
|
|
|
|
|
def run_pattern_refinement(
    sentences_df: pd.DataFrame,
    n_exemplars: int,
    llm_provider: str,
    llm_key: str,
    reflexive_pos: str,
) -> Dict:
    """Build one refinement row per non-noise cluster, with an LLM-drafted memo.

    Args:
        sentences_df: Phase 1 output with columns
            {sentence, cluster_id, cluster_label, ...optional dist_to_centroid}
        n_exemplars: top-N exemplars per cluster; negative values are treated
            as 0 (no exemplars) rather than "all but the last N"
        llm_provider: e.g. "Mistral"
        llm_key: LLM API key; when empty (or when the ``providers`` module is
            unavailable) no LLM call is made and every memo draft is ""
        reflexive_pos: researcher's reflexive positioning statement

    Returns:
        dict with:
            refinement_rows: list[dict] — ready for DataFrame display
            n_patterns: int — number of non-noise clusters processed
            n_noise: int — number of noise-assigned sentences skipped
            llm_errors: list[str] — client-init / per-cluster errors, or a
                single message when a required column is missing
    """
    if sentences_df is None or len(sentences_df) == 0:
        return {"refinement_rows": [], "n_patterns": 0, "n_noise": 0, "llm_errors": []}

    # Fail soft on a malformed Phase 1 table: report the missing column through
    # llm_errors instead of raising a KeyError halfway through the loop.
    for required in ("cluster_id", "sentence"):
        if required not in sentences_df.columns:
            return {"refinement_rows": [], "n_patterns": 0, "n_noise": 0,
                    "llm_errors": [f"no {required} column in Phase 1 output"]}

    # Rows whose cluster_id spells "noise" (any case) are counted and skipped.
    noise_mask = sentences_df["cluster_id"].astype(str).str.lower() == "noise"
    n_noise = int(noise_mask.sum())
    clusters_df = sentences_df[~noise_mask]

    # NOTE: groupby drops rows whose cluster_id is missing (NaN/None); such rows
    # are counted neither as noise nor as part of a pattern.
    groups = clusters_df.groupby("cluster_id", sort=True)

    client = None
    model_name = None
    llm_errors: List[str] = []
    # The key is tested first so that a key-less run never touches the
    # optional providers module.
    if llm_key and PROVIDERS_OK:
        try:
            client = providers.get_llm_client(llm_provider, llm_key)
            model_name = providers.get_llm_model(llm_provider)
        except Exception as e:
            llm_errors.append(f"llm_client_init: {e}")
            client = None

    # DataFrame.head(-k) returns "all but the last k rows", so clamp at zero.
    top_k = max(int(n_exemplars), 0)
    has_distance = "dist_to_centroid" in clusters_df.columns
    has_label = "cluster_label" in clusters_df.columns

    refinement_rows: List[Dict] = []
    for cluster_id, cluster_df in groups:
        # Exemplars are the sentences closest to the centroid when distances
        # are available; otherwise the first rows in their existing order.
        if has_distance:
            ordered = cluster_df.sort_values(
                "dist_to_centroid", ascending=True, na_position="last"
            )
        else:
            ordered = cluster_df
        exemplar_sentences = ordered.head(top_k)["sentence"].astype(str).tolist()

        # A missing label (NaN/None) would otherwise be rendered as the literal
        # text "nan"/"None"; fall back to the synthetic cluster name instead.
        pattern_label = f"cluster_{cluster_id}"
        if has_label:
            raw_label = cluster_df["cluster_label"].iloc[0]
            label_missing = pd.api.types.is_scalar(raw_label) and pd.isna(raw_label)
            if not label_missing:
                pattern_label = str(raw_label)

        memo = ""
        if client is not None:
            numbered = "\n".join(
                f"  {i+1}. {s}" for i, s in enumerate(exemplar_sentences)
            )
            prompt = MEMO_PROMPT_TEMPLATE.format(
                reflexive_pos=(reflexive_pos or "(none provided)").strip(),
                pattern_label=pattern_label,
                numbered_exemplars=numbered,
            )
            try:
                resp = client.chat.complete(
                    model=model_name,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.0,
                    max_tokens=300,
                )
                memo = (resp.choices[0].message.content or "").strip()
                # Hard cap on the stored draft length.
                memo = memo[:1200]
            except Exception as e:
                # Best effort: one failed cluster must not abort the others.
                memo = f"(LLM error: {e})"
                llm_errors.append(f"cluster_{cluster_id}: {e}")

        refinement_rows.append({
            "pattern_id": str(cluster_id),
            "pattern_label": pattern_label,
            "n_sentences": int(len(cluster_df)),
            "exemplars": " | ".join(exemplar_sentences),
            "llm_memo_draft": memo,
            "researcher_memo": "",
            "verdict": "",
            "new_label": "",
        })

    return {
        "refinement_rows": refinement_rows,
        "n_patterns": len(refinement_rows),
        "n_noise": n_noise,
        "llm_errors": llm_errors,
    }
|
|
|
|
|
|
|
|
|
|
|
# Verdicts a researcher may assign to a pattern (compared lower-cased).
VALID_VERDICTS = {"keep", "merge", "split", "drop", "rename"}


def _cell_text(value) -> str:
    """Return a table cell as stripped text, mapping missing values to ""."""
    # str(None) / str(float("nan")) would yield "None" / "nan", which look
    # like real content to an emptiness check; treat them as blank instead.
    if value is None:
        return ""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value).strip()


def validate_refinement_table(refinement_df: pd.DataFrame) -> Dict:
    """Validate researcher's completed refinement table.

    Enforces:
        - every row has a verdict in VALID_VERDICTS (case-insensitive)
        - rows with verdict in {rename, split} must have new_label non-empty
        - every row has a researcher_memo (at least 1 non-blank char)

    Missing cells (None / NaN / pd.NA) and absent columns count as empty.

    Args:
        refinement_df: table with columns pattern_id, verdict,
            researcher_memo and new_label.

    Returns:
        dict with:
            ok: bool — True when no rule is violated
            errors: list[str] — one message per violation, in row order
    """
    if refinement_df is None or len(refinement_df) == 0:
        return {"ok": False, "errors": ["refinement_table is empty"]}

    errors: List[str] = []
    for i, row in refinement_df.iterrows():
        # Fall back to the row index when the pattern id is absent or blank.
        pid = _cell_text(row.get("pattern_id")) or f"row_{i}"
        verdict = _cell_text(row.get("verdict")).lower()
        memo = _cell_text(row.get("researcher_memo"))
        new_label = _cell_text(row.get("new_label"))

        if verdict not in VALID_VERDICTS:
            errors.append(
                f"pattern {pid}: verdict must be one of {sorted(VALID_VERDICTS)}, got {verdict!r}"
            )
        if not memo:
            errors.append(f"pattern {pid}: researcher_memo is empty")
        if verdict in ("rename", "split") and not new_label:
            errors.append(
                f"pattern {pid}: verdict={verdict} requires new_label (not empty)"
            )

    return {"ok": not errors, "errors": errors}
|
|
|