# ============================================================================
# methodology_comparison.py — reference paper vs our technique, per workbench
# ============================================================================
#
# Principle: Same methodological rigor as the reference paper. Latest
# best-in-class computational technique. Every step upgraded technically;
# every methodological commitment preserved.
#
# One MethodologyComparison per workbench. Each has:
# - principle: header paragraph for the paper's methods section
# - reference_papers: list of full citations
# - rows: per-step 4-column comparison
#
# Serialized to Markdown for download + injection into papers.
# ============================================================================
from dataclasses import dataclass, field
from typing import List
from datetime import datetime
@dataclass
class ComparisonRow:
    """One step (one table row) in a methodology comparison.

    Attributes:
        step: Label of the step or phase; rendered in bold in the first
            table column.
        commitment: The methodological commitment, which is the same for
            the reference paper and for our pipeline.
        reference_technique: What the reference paper used (2020-2022 tech).
        our_technique: What we use (2026 best-in-class) and why it is better.
    """

    step: str
    commitment: str
    reference_technique: str
    our_technique: str
@dataclass
class MethodologyComparison:
    """Full methodology comparison for one workbench, paper-ready.

    Attributes:
        workbench_name: Name shown in the Markdown title.
        reference_papers: Full citations, rendered as one bullet each.
        principle: Paragraph rendered under the "Principle" heading.
        rows: Per-step comparison rows, rendered as a four-column table.
            Defaults to an empty list.
    """

    workbench_name: str
    reference_papers: List[str]
    principle: str
    # Quoted forward reference: the name is not resolved when the class is
    # created, so this class does not depend on definition order.
    rows: List["ComparisonRow"] = field(default_factory=list)

    @staticmethod
    def _table_cell(text: str) -> str:
        """Return *text* made safe for a single Markdown table cell.

        Pipes are backslash-escaped so they are not read as column
        separators, and newlines become ``<br>`` so multi-line text stays
        inside one table row.
        """
        return text.replace("|", "\\|").replace("\n", "<br>")

    def as_markdown(self) -> str:
        """Render the comparison as Markdown for a paper's methods section.

        The output contains, in order: a title, a generation timestamp
        (local time at the moment of the call), the principle paragraph,
        the reference papers as a bullet list, the step-by-step table and
        a fixed footer note.

        Returns:
            The complete document as one newline-joined string.
        """
        generated = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        cell = self._table_cell

        lines = [
            f"# Methodology Comparison — {self.workbench_name}",
            "",
            f"*Generated: {generated}*",
            "",
            "## Principle",
            "",
            self.principle,
            "",
            "## Reference Papers",
            "",
        ]
        lines.extend(f"- {paper}" for paper in self.reference_papers)
        lines += [
            "",
            "## Step-by-Step Comparison",
            "",
            "| Step | Methodological commitment | Reference technique (2020-2022) | Our technique (2026) + why better |",
            "|---|---|---|---|",
        ]
        # Every cell, including the step label, goes through the same
        # escaping so no cell content can break the table layout.
        lines.extend(
            f"| **{cell(row.step)}** | {cell(row.commitment)} | "
            f"{cell(row.reference_technique)} | {cell(row.our_technique)} |"
            for row in self.rows
        )
        lines += [
            "",
            "---",
            "",
            "*This comparison was auto-generated by the Researcher Workbench. "
            "Paste directly into the methods section of your paper. "
            "All method contracts referenced above are enforced in code — see `method_contracts.py` "
            "for the grep-able registry.*",
        ]
        return "\n".join(lines)
# ============================================================================
# B&C Workbench — Braun & Clarke 2006 reflexive thematic analysis
# ============================================================================
# Comparison for the B&C Workbench: one row per Braun & Clarke (2006) phase.
# The Carlsen & Ralund (2022) citation carries the full article title so it
# matches the citation of the same paper used by the CGT comparison.
BC_COMPARISON = MethodologyComparison(
    workbench_name="B&C Workbench (Reflexive Thematic Analysis)",
    reference_papers=[
        "Braun, V. & Clarke, V. (2006). Using thematic analysis in psychology. "
        "Qualitative Research in Psychology, 3(2), 77-101.",
        "Carlsen, H.B. & Ralund, S. (2022). Computational grounded theory revisited: "
        "From computer-led to computer-assisted text analysis. Big Data & Society, 9(1).",
    ],
    principle=(
        "We preserve the full methodological rigor of Braun & Clarke's (2006) six-phase "
        "reflexive thematic analysis — reflexivity, systematic coverage, "
        "semantic-or-latent analysis-wide choice, iterative refinement, researcher authority. "
        "Every phase is implemented with the best computational technique available in 2026: "
        "LLM-assisted code generation at pinned temperature 0.0, transformer-based embeddings "
        "for theme clustering, embedding cohesion checks for theme review, and paper-cited "
        "method contracts enforced in Python. The researcher validates every AI output via "
        "named override widgets. Carlsen & Ralund's (2022) researcher-centrality principle "
        "is preserved: AI assists, researcher approves."
    ),
    rows=[
        ComparisonRow(
            step="Phase 1 — Familiarization",
            commitment="B&C 2006 p. 87: researcher immerses in data, articulates reflexive positioning, confirms initial noticings before coding",
            reference_technique="Manual reading of full corpus; notes in research journal; no computational assistance",
            our_technique="LLM-facilitated dialogue (Mistral temp=0.0) + reflexive positioning as contract-enforced field (≥20 chars) + three-step validation table. Better: scales to 1000+ sentence corpora without abandoning reflexivity; positioning statement is auditable.",
        ),
        ComparisonRow(
            step="Phase 2 — Initial Coding",
            commitment="B&C 2006 p. 84: semantic XOR latent orientation (analysis-wide). p. 88: systematic coverage (every sentence coded). Reflexivity: researcher's positioning shapes every code.",
            reference_technique="Researcher manually codes each sentence in a spreadsheet over weeks. No validation other than researcher re-reading.",
            our_technique="Mistral temp=0.0 proposes codes across 3 iterations; reflexive positioning injected per prompt; researcher overrides via `human_code_iter1/2/3` + `flagged` + `final_code` columns. Hallucination bounded by exact-sentence-quote requirement. Reproducibility: identical corpus → identical codes. Contract: B&C 2006 p. 84, p. 88, reflexivity × 5.",
        ),
        ComparisonRow(
            step="Phase 3 — Searching for Themes",
            commitment="B&C 2006 p. 89: themes emerge from codes; patterns meaningful to research question; themes are tentative, iterative",
            reference_technique="Researcher manually groups codes into themes on paper, sticky notes, or mind-map software. No computational clustering.",
            our_technique="MiniLM 384-dim embeddings of codes + agglomerative clustering (cosine similarity, threshold ∈ [0.3, 0.95]) + Mistral names each cluster + researcher renames in theme table. Deterministic given fixed seed. Better: reveals semantic theme coherence invisible to manual grouping; researcher still decides final names.",
        ),
        ComparisonRow(
            step="Phase 4 — Reviewing Themes",
            commitment="B&C 2006 p. 91: Level 1 check (coded extracts cohere within theme) + Level 2 check (themes work across corpus)",
            reference_technique="Researcher manually re-reads coded extracts against themes; refines or drops themes through discussion or introspection",
            our_technique="Embedding-based cohesion score per theme (cluster tightness) + Mistral drafts keep/merge/split/drop/rename verdict + researcher enters `researcher_verdict`. Contract: B&C 2006 p. 91 × 3. Better: cohesion scores surface weak themes the researcher might miss; researcher still decides fate.",
        ),
        ComparisonRow(
            step="Phase 5 — Defining and Naming",
            commitment="B&C 2006 p. 92: each theme has a clear definition and a catchy name capturing its essence",
            reference_technique="Researcher drafts theme definitions by hand based on coded extracts",
            our_technique="Mistral drafts definition + catchy name per kept theme; researcher overrides via `researcher_definition` + `researcher_name` columns. Contract: B&C 2006 p. 92 × 3. Better: draft saves hours; researcher still authors final definitions.",
        ),
        ComparisonRow(
            step="Phase 6 — Producing the Report",
            commitment="B&C 2006 p. 93: weave theme definitions + data extracts + narrative answering research question",
            reference_technique="Researcher writes full report manually, pulling extracts from coded dataset",
            our_technique="Mistral drafts markdown report from definitions + codes + research question + reflexive positioning; researcher edits before save. Report methods section auto-includes this comparison table. Contract: B&C 2006 p. 93 × 2.",
        ),
    ],
)
# ============================================================================
# G&W at Scale — Gauthier & Wallace 2022 computational thematic analysis
# ============================================================================
# Comparison for the G&W at Scale workbench: a Phase 0 compression row,
# followed by the Braun & Clarke phases applied to the compressed corpus.
# The Carlsen & Ralund (2022) citation carries the full article title so it
# matches the citation of the same paper used by the CGT comparison.
GW_COMPARISON = MethodologyComparison(
    workbench_name="G&W at Scale (Computational Thematic Analysis)",
    reference_papers=[
        "Gauthier, R.P. & Wallace, J.R. (2022). The Computational Thematic Analysis Toolkit. "
        "Proc. ACM Hum.-Comput. Interact., 6(GROUP), Article 25.",
        "Braun, V. & Clarke, V. (2006). Using thematic analysis in psychology. "
        "Qualitative Research in Psychology, 3(2), 77-101.",
        "Carlsen, H.B. & Ralund, S. (2022). Computational grounded theory revisited: "
        "From computer-led to computer-assisted text analysis. Big Data & Society, 9(1).",
    ],
    principle=(
        "We preserve the full methodological rigor of Gauthier & Wallace's (2022) "
        "Computational Thematic Analysis Toolkit — corpus compression before coding, "
        "researcher validation of representative selection, reflexive engagement with "
        "computationally-surfaced patterns. The core upgrade is architectural: we operate "
        "at the sentence level using MiniLM contextual embeddings (384-dim transformer), "
        "whereas G&W 2022 operated at the word level using bag-of-words LDA. G&W's Data "
        "Cleaning (module 2) and Data Filtering (module 3) modules are therefore not "
        "applicable to our pipeline — their purpose was to make word-frequency topic "
        "modelling tractable, a problem that does not arise when semantic similarity is "
        "computed directly over sentence embeddings. All downstream Braun & Clarke (2006) "
        "Phase 1-6 commitments are preserved; Carlsen & Ralund's (2022) researcher-"
        "centrality is enforced throughout. Phase 0 compression runs before Phase 1 "
        "familiarization, following G&W's own framing of computational operations as "
        "familiarization aids for large corpora."
    ),
    rows=[
        ComparisonRow(
            step="Phase 0 — Corpus Compression",
            commitment="G&W 2022 Art. 25: reduce large corpus to representative subset preserving semantic diversity; researcher validates selection before downstream phases consume it",
            reference_technique="Word-level pipeline across four G&W modules: spaCy tokenization + stopword removal + lemmatization (module 2 Data Cleaning) + word include/exclude + frequency thresholds (module 3 Data Filtering) + LDA bag-of-words topic modelling with researcher-chosen k (module 4 Modelling) + purposive sampling near topic centroids (module 5 Sampling). Cleaning and filtering were required because LDA operates on word frequencies and collapses under raw text (stopwords dominate; morphology fragments signal).",
            our_technique=(
                "Sentence-level pipeline with peer-reviewed citation chain: "
                "(1) MiniLM all-MiniLM-L6-v2 sentence embeddings, 384-dim contextual transformer (Reimers & Gurevych 2019, EMNLP) — captures syntax, semantics, word order in one pass, obviates word-level cleaning. "
                "(2) UMAP dimensionality reduction to 10-dim for clustering stability (McInnes, Healy & Melville 2018). "
                "(3) HDBSCAN hierarchical density-based clustering (Campello, Moulavi & Sander 2013, PAKDD, LNCS 7819:160–172; extended in Campello, Moulavi, Zimek & Sander 2015, ACM TKDD 10(1)). Cluster count discovered from data; min_cluster_size parameter is Campello et al.'s explicit mclSize. "
                "(4) Representative selection by HDBSCAN density-tree cluster membership probability, ranked descending, top R per cluster (Campello et al. 2015 §4). NOT centroid-proximity — HDBSCAN produces non-spherical clusters where centroid-based selection is known to misrepresent (Grootendorst 2022, BERTopic). The probability score is 1.0 at the heart of a cluster's density region and 0.0 at the noise edge; ranking by this score is the methodologically native selection for density-based clustering. "
                "(5) Software: McInnes, Healy & Astels 2017, JOSS 2(11):205 — hdbscan library. "
                "(6) Researcher validation via editable `selected` column (Carlsen & Ralund 2022, BDS 9(1) researcher-centrality). "
                "Cleaning and filtering modules are NOT APPLICABLE — our pipeline operates on sentence meaning not word frequency; stopwords carry semantic signal and must not be removed; morphology is handled inside MiniLM's subword tokenizer. Temp=0.0 throughout. Deterministic given fixed corpus (UMAP random_state=42; HDBSCAN deterministic given fixed input; outlier sampling np.random.seed(42)). Contract: G&W 2022 Art. 25 × 5. "
                "Better than LDA: eliminates methodological drift from cleaning rules (different stopword lists → different LDA topics), eliminates researcher guesswork on k, produces reproducible output aligned to density rather than to spherical-cluster assumption."
            ),
        ),
        ComparisonRow(
            step="Phase 1 — Familiarization (on compressed corpus)",
            commitment="B&C 2006 p. 87: researcher immerses in data, articulates reflexive positioning, confirms noticings. G&W 2022: on compressed corpus so familiarization is tractable at scale.",
            reference_technique="G&W 2022 treated computational exploration itself as familiarization — no distinct Phase 1. Researcher browsed LDA topic keyword lists, adjusted filtering rules, manually reviewed samples.",
            our_technique="Explicit Phase 1 accordion after Phase 0 compression. LLM-facilitated familiarization dialogue on compressed corpus (643 representatives from 1000 sentences). Reflexive positioning injected into every downstream prompt (contract-enforced ≥20 chars). Contract: B&C 2006 p. 87 × 3. Better: makes familiarization auditable and separable from compression; preserves B&C reflexivity commitment explicitly.",
        ),
        ComparisonRow(
            step="Phase 2 — Initial Coding",
            commitment="B&C 2006 p. 84, p. 88: semantic-XOR-latent orientation; systematic coverage; reflexivity",
            reference_technique="G&W 2022: researcher manually codes selected representatives in spreadsheet-like UI (Tkinter). No AI assistance.",
            our_technique="Mistral temp=0.0 proposes codes across 3 iterations on compressed corpus; reflexive positioning per prompt; researcher overrides via `human_code_iter1/2/3` + `flagged` + `final_code`. Contract: B&C 2006 p. 84, p. 88, reflexivity × 5. Better: scales across representatives while preserving researcher authority; hallucination bounded by exact-sentence-quote requirement.",
        ),
        ComparisonRow(
            step="Phase 3-6 — Themes → Review → Define → Report",
            commitment="B&C 2006 Phases 3-6 as specified; applied to codes from compressed corpus",
            reference_technique="G&W 2022: researcher manually creates theme visualizations (chord diagrams), manually reviews quotes, manually writes report",
            our_technique="Same as B&C Workbench Phases 3-6 — embedding-based theme clustering, cohesion-scored review, LLM-drafted definitions and report with researcher override at every step. See B&C comparison for per-phase detail.",
        ),
    ],
)
# ============================================================================
# CGT Workbench — Nelson 2020 computational grounded theory + C&R 2022
# ============================================================================
# Comparison for the CGT Workbench: one row per step of Nelson's (2020)
# three-step framework (Pattern Detection, Refinement, Confirmation).
# NOTE(review): the `our_technique` text for Steps 2 and 3 is still marked
# "[Pending Turn 3 build]" / "[Pending Turn 4 build]" with "Contract: ... × TBD",
# while the footer emitted by as_markdown() states that all referenced
# contracts are enforced in code — confirm before pasting this into a paper.
CGT_COMPARISON = MethodologyComparison(
workbench_name="CGT Workbench (Computational Grounded Theory — Nelson + C&R)",
reference_papers=[
"Nelson, L.K. (2020). Computational grounded theory: A methodological framework. "
"Sociological Methods & Research, 49(1), 3-42.",
"Carlsen, H.B. & Ralund, S. (2022). Computational grounded theory revisited: "
"From computer-led to computer-assisted text analysis. Big Data & Society, 9(1).",
],
principle=(
"We preserve the full methodological rigor of Nelson's (2020) three-step "
"computational grounded theory framework — Pattern Detection (unsupervised ML), "
"Pattern Refinement (researcher close-reading), Pattern Confirmation (supervised ML) — "
"with Carlsen & Ralund's (2022) researcher-centrality principle enforced at every "
"step. The 2020 framework used word2vec-era embeddings and k-means clustering for "
"detection, and bag-of-words + logistic regression for confirmation; we upgrade "
"both to sentence-transformer-based techniques while preserving the three-step "
"structure and researcher authority. Maps to traditional GT: Pattern Detection ≈ "
"open coding, Refinement ≈ axial coding, Confirmation ≈ selective coding."
),
rows=[
ComparisonRow(
step="Step 1 — Pattern Detection",
commitment="Nelson 2020: surface structural patterns via unsupervised ML; researcher interprets labels. C&R 2022: researcher approves labels, not algorithm.",
reference_technique="word2vec (2013-era word embeddings, context-blind) OR LDA bag-of-words; k-means clustering with k specified upfront; researcher manually reads cluster exemplars and names them",
our_technique="MiniLM all-MiniLM-L6-v2 sentence embeddings (384-dim, transformer-based, context-aware) + agglomerative clustering (cosine similarity, researcher-set threshold; cluster count discovered from data) + LLM drafts cluster labels + researcher validates and renames. Contract: Nelson 2020 × 4. Better: sentence-level semantics (word2vec was word-level, couldn't handle unseen vocabulary or multi-word context); agglomerative discovers cluster count (k-means required guessing k); LLM labeling + researcher override is faster and more auditable than manual cluster-by-cluster interpretation.",
),
ComparisonRow(
step="Step 2 — Pattern Refinement",
commitment="Nelson 2020: deep reading of pattern exemplars; researcher refines pattern definitions; keep/merge/split/drop decisions",
reference_technique="Researcher manually reads clusters, writes memos in a notebook, decides fate of each pattern through introspection. No tool assistance beyond the clustering from Step 1.",
our_technique="[Pending Turn 3 build] Tool surfaces top-N exemplars per pattern sorted by centroid proximity; LLM drafts interpretive memo per pattern; researcher writes final memo + enters keep/merge/split/drop/rename verdict. Contract: Nelson 2020 × TBD. Better: exemplar surfacing is reproducible; memo drafts save hours while preserving researcher's final interpretation.",
),
ComparisonRow(
step="Step 3 — Pattern Confirmation",
commitment="Nelson 2020: test pattern generalizability via supervised ML on held-out sample; researcher inspects classifier failures",
reference_technique="Bag-of-words TF-IDF features + logistic regression classifier; k-fold cross-validation; researcher labels held-out sentences manually; researcher reads confusion matrix",
our_technique="[Pending Turn 4 build] MiniLM sentence embeddings as features (semantic similarity, not just word overlap) + logistic regression classifier + researcher-labeled held-out split (A2 default = document-level split; A1 toggle = random 20/80 at sentence level) + confusion matrix + per-pattern precision/recall + researcher inspects classifier disagreements. Contract: Nelson 2020 × TBD. Better: sentence embeddings encode contextual meaning (bag-of-words couldn't distinguish 'I agree with management' from 'I agree management is bad' beyond word frequency); document-level split tests generalization across contexts, not just within one context, yielding stronger validity claim.",
),
],
)
# ============================================================================
# Registry — for lookup from app.py
# ============================================================================
# Short workbench key -> the comparison object for that workbench.
# Insertion order (bc, gw, cgt) is the order the __main__ dump prints them in.
COMPARISONS = dict(
    bc=BC_COMPARISON,
    gw=GW_COMPARISON,
    cgt=CGT_COMPARISON,
)
# ============================================================================
# Self-documentation
# ============================================================================
if __name__ == "__main__":
    # When run as a script, print every registered comparison to stdout,
    # each preceded by a three-line banner naming the workbench.
    rule = "=" * 78
    for slug, comparison in COMPARISONS.items():
        print(f"\n{rule}")
        print(f" {slug.upper()} — {comparison.workbench_name}")
        print(f"{rule}\n")
        print(comparison.as_markdown())