# ============================================================================
# methodology_comparison.py — reference paper vs our technique, per workbench
# ============================================================================
#
# Principle: keep the same methodological rigor as the reference paper while
# using the latest best-in-class computational technique. Every step is
# upgraded technically; every methodological commitment is preserved.
#
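# One MethodologyComparison per workbench. Each has:
#   - workbench_name: display name used in the Markdown title
#   - principle: header paragraph for the paper's methods section
#   - reference_papers: list of full citations
#   - rows: per-step 4-column comparison (step, methodological commitment,
#     reference technique, our technique)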
#
# Serialized to Markdown for download + injection into papers.
# ============================================================================

from dataclasses import dataclass, field
from typing import List
from datetime import datetime


@dataclass
class ComparisonRow:
    """A single row (one step) of the methodology comparison table.

    Attributes:
        step: Label of the step; rendered in bold in the first column.
        commitment: Methodological commitment, unchanged across the
            reference paper and our pipeline.
        reference_technique: What the reference paper used (2020-2022 tech).
        our_technique: What we use (2026 best-in-class) and why it is better.
    """

    step: str
    commitment: str
    reference_technique: str
    our_technique: str


@dataclass
class MethodologyComparison:
    """Paper-ready methodology comparison for a single workbench.

    Attributes:
        workbench_name: Display name; rendered in the Markdown title.
        reference_papers: Full citations; rendered as one bullet each.
        principle: Paragraph rendered under the "Principle" heading.
        rows: Per-step comparison rows; rendered as a four-column table.
    """
    workbench_name: str
    reference_papers: List[str]
    principle: str
    # Quoted forward reference: the row type is declared elsewhere in this
    # module, and only its four string attributes are read when rendering.
    rows: List["ComparisonRow"] = field(default_factory=list)

    @staticmethod
    def _table_cell(text: str) -> str:
        """Return *text* made safe for one Markdown table cell.

        Pipes are backslash-escaped so they are not read as column
        separators, and every line break (``\\r\\n``, ``\\r`` or ``\\n``)
        becomes ``<br>`` so the cell cannot split its table row.
        """
        single_style = text.replace("\r\n", "\n").replace("\r", "\n")
        return single_style.replace("|", "\\|").replace("\n", "<br>")

    def as_markdown(self) -> str:
        """Render as paper-ready Markdown — copy-paste into methods section.

        Returns:
            The document as one string: title, generation timestamp (local
            time at call), principle, reference list, the step-by-step
            comparison table, and a closing note.
        """
        cell = self._table_cell
        lines = [
            f"# Methodology Comparison — {self.workbench_name}",
            "",
            f"*Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*",
            "",
            "## Principle",
            "",
            self.principle,
            "",
            "## Reference Papers",
            "",
        ]
        lines.extend(f"- {paper}" for paper in self.reference_papers)
        lines.extend([
            "",
            "## Step-by-Step Comparison",
            "",
            "| Step | Methodological commitment | Reference technique (2020-2022) | Our technique (2026) + why better |",
            "|---|---|---|---|",
        ])
        for row in self.rows:
            # Every cell goes through the same escaping, the step label
            # included: a raw newline in any cell would break the table row.
            lines.append(
                f"| **{cell(row.step)}** | {cell(row.commitment)} "
                f"| {cell(row.reference_technique)} | {cell(row.our_technique)} |"
            )
        lines.extend([
            "",
            "---",
            "",
            "*This comparison was auto-generated by the Researcher Workbench. "
            "Paste directly into the methods section of your paper. "
            "All method contracts referenced above are enforced in code — see `method_contracts.py` "
            "for the grep-able registry.*",
        ])
        return "\n".join(lines)


# ============================================================================
# B&C Workbench — Braun & Clarke 2006 reflexive thematic analysis
# ============================================================================
# Comparison data for the B&C workbench: one row per Braun & Clarke phase (1-6).
BC_COMPARISON = MethodologyComparison(
    workbench_name="B&C Workbench (Reflexive Thematic Analysis)",
    reference_papers=[
        "Braun, V. & Clarke, V. (2006). Using thematic analysis in psychology. "
        "Qualitative Research in Psychology, 3(2), 77-101.",
        # Title given in full so this citation matches the CGT workbench's
        # reference list for the same paper.
        "Carlsen, H.B. & Ralund, S. (2022). Computational grounded theory revisited: "
        "From computer-led to computer-assisted text analysis. Big Data & Society, 9(1).",
    ],
    principle=(
        "We preserve the full methodological rigor of Braun & Clarke's (2006) six-phase "
        "reflexive thematic analysis — reflexivity, systematic coverage, "
        "semantic-or-latent analysis-wide choice, iterative refinement, researcher authority. "
        "Every phase is implemented with the best computational technique available in 2026: "
        "LLM-assisted code generation at pinned temperature 0.0, transformer-based embeddings "
        "for theme clustering, embedding cohesion checks for theme review, and paper-cited "
        "method contracts enforced in Python. The researcher validates every AI output via "
        "named override widgets. Carlsen & Ralund's (2022) researcher-centrality principle "
        "is preserved: AI assists, researcher approves."
    ),
    # Each row: step label, preserved commitment, reference technique, our technique.
    rows=[
        ComparisonRow(
            step="Phase 1 — Familiarization",
            commitment="B&C 2006 p. 87: researcher immerses in data, articulates reflexive positioning, confirms initial noticings before coding",
            reference_technique="Manual reading of full corpus; notes in research journal; no computational assistance",
            our_technique="LLM-facilitated dialogue (Mistral temp=0.0) + reflexive positioning as contract-enforced field (≥20 chars) + three-step validation table. Better: scales to 1000+ sentence corpora without abandoning reflexivity; positioning statement is auditable.",
        ),
        ComparisonRow(
            step="Phase 2 — Initial Coding",
            commitment="B&C 2006 p. 84: semantic XOR latent orientation (analysis-wide). p. 88: systematic coverage (every sentence coded). Reflexivity: researcher's positioning shapes every code.",
            reference_technique="Researcher manually codes each sentence in a spreadsheet over weeks. No validation other than researcher re-reading.",
            our_technique="Mistral temp=0.0 proposes codes across 3 iterations; reflexive positioning injected per prompt; researcher overrides via `human_code_iter1/2/3` + `flagged` + `final_code` columns. Hallucination bounded by exact-sentence-quote requirement. Reproducibility: identical corpus → identical codes. Contract: B&C 2006 p. 84, p. 88, reflexivity × 5.",
        ),
        ComparisonRow(
            step="Phase 3 — Searching for Themes",
            commitment="B&C 2006 p. 89: themes emerge from codes; patterns meaningful to research question; themes are tentative, iterative",
            reference_technique="Researcher manually groups codes into themes on paper, sticky notes, or mind-map software. No computational clustering.",
            our_technique="MiniLM 384-dim embeddings of codes + agglomerative clustering (cosine similarity, threshold ∈ [0.3, 0.95]) + Mistral names each cluster + researcher renames in theme table. Deterministic given fixed seed. Better: reveals semantic theme coherence invisible to manual grouping; researcher still decides final names.",
        ),
        ComparisonRow(
            step="Phase 4 — Reviewing Themes",
            commitment="B&C 2006 p. 91: Level 1 check (coded extracts cohere within theme) + Level 2 check (themes work across corpus)",
            reference_technique="Researcher manually re-reads coded extracts against themes; refines or drops themes through discussion or introspection",
            our_technique="Embedding-based cohesion score per theme (cluster tightness) + Mistral drafts keep/merge/split/drop/rename verdict + researcher enters `researcher_verdict`. Contract: B&C 2006 p. 91 × 3. Better: cohesion scores surface weak themes the researcher might miss; researcher still decides fate.",
        ),
        ComparisonRow(
            step="Phase 5 — Defining and Naming",
            commitment="B&C 2006 p. 92: each theme has a clear definition and a catchy name capturing its essence",
            reference_technique="Researcher drafts theme definitions by hand based on coded extracts",
            our_technique="Mistral drafts definition + catchy name per kept theme; researcher overrides via `researcher_definition` + `researcher_name` columns. Contract: B&C 2006 p. 92 × 3. Better: draft saves hours; researcher still authors final definitions.",
        ),
        ComparisonRow(
            step="Phase 6 — Producing the Report",
            commitment="B&C 2006 p. 93: weave theme definitions + data extracts + narrative answering research question",
            reference_technique="Researcher writes full report manually, pulling extracts from coded dataset",
            our_technique="Mistral drafts markdown report from definitions + codes + research question + reflexive positioning; researcher edits before save. Report methods section auto-includes this comparison table. Contract: B&C 2006 p. 93 × 2.",
        ),
    ],
)


# ============================================================================
# G&W at Scale — Gauthier & Wallace 2022 computational thematic analysis
# ============================================================================
# Comparison data for the G&W workbench: Phase 0 compression, then B&C Phases 1-6.
GW_COMPARISON = MethodologyComparison(
    workbench_name="G&W at Scale (Computational Thematic Analysis)",
    reference_papers=[
        "Gauthier, R.P. & Wallace, J.R. (2022). The Computational Thematic Analysis Toolkit. "
        "Proc. ACM Hum.-Comput. Interact., 6(GROUP), Article 25.",
        "Braun, V. & Clarke, V. (2006). Using thematic analysis in psychology. "
        "Qualitative Research in Psychology, 3(2), 77-101.",
        # Title given in full so this citation matches the CGT workbench's
        # reference list for the same paper.
        "Carlsen, H.B. & Ralund, S. (2022). Computational grounded theory revisited: "
        "From computer-led to computer-assisted text analysis. Big Data & Society, 9(1).",
    ],
    principle=(
        "We preserve the full methodological rigor of Gauthier & Wallace's (2022) "
        "Computational Thematic Analysis Toolkit — corpus compression before coding, "
        "researcher validation of representative selection, reflexive engagement with "
        "computationally-surfaced patterns. The core upgrade is architectural: we operate "
        "at the sentence level using MiniLM contextual embeddings (384-dim transformer), "
        "whereas G&W 2022 operated at the word level using bag-of-words LDA. G&W's Data "
        "Cleaning (module 2) and Data Filtering (module 3) modules are therefore not "
        "applicable to our pipeline — their purpose was to make word-frequency topic "
        "modelling tractable, a problem that does not arise when semantic similarity is "
        "computed directly over sentence embeddings. All downstream Braun & Clarke (2006) "
        "Phase 1-6 commitments are preserved; Carlsen & Ralund's (2022) researcher-"
        "centrality is enforced throughout. Phase 0 compression runs before Phase 1 "
        "familiarization, following G&W's own framing of computational operations as "
        "familiarization aids for large corpora."
    ),
    # Each row: step label, preserved commitment, reference technique, our technique.
    rows=[
        ComparisonRow(
            step="Phase 0 — Corpus Compression",
            commitment="G&W 2022 Art. 25: reduce large corpus to representative subset preserving semantic diversity; researcher validates selection before downstream phases consume it",
            reference_technique="Word-level pipeline across four G&W modules: spaCy tokenization + stopword removal + lemmatization (module 2 Data Cleaning) + word include/exclude + frequency thresholds (module 3 Data Filtering) + LDA bag-of-words topic modelling with researcher-chosen k (module 4 Modelling) + purposive sampling near topic centroids (module 5 Sampling). Cleaning and filtering were required because LDA operates on word frequencies and collapses under raw text (stopwords dominate; morphology fragments signal).",
            our_technique=(
                "Sentence-level pipeline with peer-reviewed citation chain: "
                "(1) MiniLM all-MiniLM-L6-v2 sentence embeddings, 384-dim contextual transformer (Reimers & Gurevych 2019, EMNLP) — captures syntax, semantics, word order in one pass, obviates word-level cleaning. "
                "(2) UMAP dimensionality reduction to 10-dim for clustering stability (McInnes, Healy & Melville 2018). "
                "(3) HDBSCAN hierarchical density-based clustering (Campello, Moulavi & Sander 2013, PAKDD, LNCS 7819:160–172; extended in Campello, Moulavi, Zimek & Sander 2015, ACM TKDD 10(1)). Cluster count discovered from data; min_cluster_size parameter is Campello et al.'s explicit mclSize. "
                "(4) Representative selection by HDBSCAN density-tree cluster membership probability, ranked descending, top R per cluster (Campello et al. 2015 §4). NOT centroid-proximity — HDBSCAN produces non-spherical clusters where centroid-based selection is known to misrepresent (Grootendorst 2022, BERTopic). The probability score is 1.0 at the heart of a cluster's density region and 0.0 at the noise edge; ranking by this score is the methodologically native selection for density-based clustering. "
                "(5) Software: McInnes, Healy & Astels 2017, JOSS 2(11):205 — hdbscan library. "
                "(6) Researcher validation via editable `selected` column (Carlsen & Ralund 2022, BDS 9(1) researcher-centrality). "
                "Cleaning and filtering modules are NOT APPLICABLE — our pipeline operates on sentence meaning not word frequency; stopwords carry semantic signal and must not be removed; morphology is handled inside MiniLM's subword tokenizer. Temp=0.0 throughout. Deterministic given fixed corpus (UMAP random_state=42; HDBSCAN deterministic given fixed input; outlier sampling np.random.seed(42)). Contract: G&W 2022 Art. 25 × 5. "
                "Better than LDA: eliminates methodological drift from cleaning rules (different stopword lists → different LDA topics), eliminates researcher guesswork on k, produces reproducible output aligned to density rather than to spherical-cluster assumption."
            ),
        ),
        ComparisonRow(
            step="Phase 1 — Familiarization (on compressed corpus)",
            commitment="B&C 2006 p. 87: researcher immerses in data, articulates reflexive positioning, confirms noticings. G&W 2022: on compressed corpus so familiarization is tractable at scale.",
            reference_technique="G&W 2022 treated computational exploration itself as familiarization — no distinct Phase 1. Researcher browsed LDA topic keyword lists, adjusted filtering rules, manually reviewed samples.",
            # NOTE(review): the "643 representatives from 1000 sentences" figures
            # are hard-coded in this text — confirm they match the corpus being
            # reported before the table is pasted into a paper.
            our_technique="Explicit Phase 1 accordion after Phase 0 compression. LLM-facilitated familiarization dialogue on compressed corpus (643 representatives from 1000 sentences). Reflexive positioning injected into every downstream prompt (contract-enforced ≥20 chars). Contract: B&C 2006 p. 87 × 3. Better: makes familiarization auditable and separable from compression; preserves B&C reflexivity commitment explicitly.",
        ),
        ComparisonRow(
            step="Phase 2 — Initial Coding",
            commitment="B&C 2006 p. 84, p. 88: semantic-XOR-latent orientation; systematic coverage; reflexivity",
            reference_technique="G&W 2022: researcher manually codes selected representatives in spreadsheet-like UI (Tkinter). No AI assistance.",
            our_technique="Mistral temp=0.0 proposes codes across 3 iterations on compressed corpus; reflexive positioning per prompt; researcher overrides via `human_code_iter1/2/3` + `flagged` + `final_code`. Contract: B&C 2006 p. 84, p. 88, reflexivity × 5. Better: scales across representatives while preserving researcher authority; hallucination bounded by exact-sentence-quote requirement.",
        ),
        ComparisonRow(
            step="Phase 3-6 — Themes → Review → Define → Report",
            commitment="B&C 2006 Phases 3-6 as specified; applied to codes from compressed corpus",
            reference_technique="G&W 2022: researcher manually creates theme visualizations (chord diagrams), manually reviews quotes, manually writes report",
            our_technique="Same as B&C Workbench Phases 3-6 — embedding-based theme clustering, cohesion-scored review, LLM-drafted definitions and report with researcher override at every step. See B&C comparison for per-phase detail.",
        ),
    ],
)


# ============================================================================
# CGT Workbench — Nelson 2020 computational grounded theory + C&R 2022
# ============================================================================
# Comparison data for the CGT workbench: one row per step of the three-step
# framework (detection, refinement, confirmation).
CGT_COMPARISON = MethodologyComparison(
    workbench_name="CGT Workbench (Computational Grounded Theory — Nelson + C&R)",
    reference_papers=[
        (
            "Nelson, L.K. (2020). Computational grounded theory: A methodological framework. "
            "Sociological Methods & Research, 49(1), 3-42."
        ),
        (
            "Carlsen, H.B. & Ralund, S. (2022). Computational grounded theory revisited: "
            "From computer-led to computer-assisted text analysis. Big Data & Society, 9(1)."
        ),
    ],
    principle=(
        "We preserve the full methodological rigor of Nelson's (2020) three-step "
        "computational grounded theory framework — Pattern Detection (unsupervised ML), "
        "Pattern Refinement (researcher close-reading), Pattern Confirmation (supervised ML) — "
        "with Carlsen & Ralund's (2022) researcher-centrality principle enforced at every "
        "step. The 2020 framework used word2vec-era embeddings and k-means clustering for "
        "detection, and bag-of-words + logistic regression for confirmation; we upgrade "
        "both to sentence-transformer-based techniques while preserving the three-step "
        "structure and researcher authority. Maps to traditional GT: Pattern Detection ≈ "
        "open coding, Refinement ≈ axial coding, Confirmation ≈ selective coding."
    ),
    rows=[
        ComparisonRow(
            step="Step 1 — Pattern Detection",
            commitment=(
                "Nelson 2020: surface structural patterns via unsupervised ML; "
                "researcher interprets labels. "
                "C&R 2022: researcher approves labels, not algorithm."
            ),
            reference_technique=(
                "word2vec (2013-era word embeddings, context-blind) OR LDA bag-of-words; "
                "k-means clustering with k specified upfront; "
                "researcher manually reads cluster exemplars and names them"
            ),
            our_technique=(
                "MiniLM all-MiniLM-L6-v2 sentence embeddings (384-dim, transformer-based, context-aware) "
                "+ agglomerative clustering (cosine similarity, researcher-set threshold; cluster count discovered from data) "
                "+ LLM drafts cluster labels + researcher validates and renames. "
                "Contract: Nelson 2020 × 4. "
                "Better: sentence-level semantics (word2vec was word-level, couldn't handle unseen vocabulary or multi-word context); "
                "agglomerative discovers cluster count (k-means required guessing k); "
                "LLM labeling + researcher override is faster and more auditable than manual cluster-by-cluster interpretation."
            ),
        ),
        # NOTE(review): the next two rows carry "[Pending Turn N build]" and
        # "× TBD" placeholders inside their text; as written they are part of
        # the cell content of the comparison table.
        ComparisonRow(
            step="Step 2 — Pattern Refinement",
            commitment=(
                "Nelson 2020: deep reading of pattern exemplars; "
                "researcher refines pattern definitions; "
                "keep/merge/split/drop decisions"
            ),
            reference_technique=(
                "Researcher manually reads clusters, writes memos in a notebook, "
                "decides fate of each pattern through introspection. "
                "No tool assistance beyond the clustering from Step 1."
            ),
            our_technique=(
                "[Pending Turn 3 build] "
                "Tool surfaces top-N exemplars per pattern sorted by centroid proximity; "
                "LLM drafts interpretive memo per pattern; "
                "researcher writes final memo + enters keep/merge/split/drop/rename verdict. "
                "Contract: Nelson 2020 × TBD. "
                "Better: exemplar surfacing is reproducible; "
                "memo drafts save hours while preserving researcher's final interpretation."
            ),
        ),
        ComparisonRow(
            step="Step 3 — Pattern Confirmation",
            commitment=(
                "Nelson 2020: test pattern generalizability via supervised ML on held-out sample; "
                "researcher inspects classifier failures"
            ),
            reference_technique=(
                "Bag-of-words TF-IDF features + logistic regression classifier; "
                "k-fold cross-validation; "
                "researcher labels held-out sentences manually; "
                "researcher reads confusion matrix"
            ),
            our_technique=(
                "[Pending Turn 4 build] "
                "MiniLM sentence embeddings as features (semantic similarity, not just word overlap) "
                "+ logistic regression classifier "
                "+ researcher-labeled held-out split (A2 default = document-level split; A1 toggle = random 20/80 at sentence level) "
                "+ confusion matrix + per-pattern precision/recall "
                "+ researcher inspects classifier disagreements. "
                "Contract: Nelson 2020 × TBD. "
                "Better: sentence embeddings encode contextual meaning "
                "(bag-of-words couldn't distinguish 'I agree with management' from 'I agree management is bad' beyond word frequency); "
                "document-level split tests generalization across contexts, not just within one context, "
                "yielding stronger validity claim."
            ),
        ),
    ],
)


# ============================================================================
# Registry — for lookup from app.py
# ============================================================================
# Maps each short workbench key to its comparison object, in the order
# B&C, G&W, CGT.
COMPARISONS = dict(
    bc=BC_COMPARISON,
    gw=GW_COMPARISON,
    cgt=CGT_COMPARISON,
)


# ============================================================================
# Self-documentation
# ============================================================================
if __name__ == "__main__":
    # Run as a script: print each registered comparison as Markdown, with a
    # banner naming its registry key and workbench.
    banner = "=" * 78
    for key, comp in COMPARISONS.items():
        print("\n" + banner)
        print(f"  {key.upper()}  —  {comp.workbench_name}")
        print(banner + "\n")
        print(comp.as_markdown())