Spaces:

Viske
/

Spjimr

Running

App Files Files Community

Spjimr / methodology_comparison.py

shahidshaikh

Upload 40 files

a52bae4 verified 6 days ago

raw

history blame contribute delete

21.6 kB

	# ============================================================================
	# methodology_comparison.py — reference paper vs our technique, per workbench
	# ============================================================================
	#
	# Principle: Same methodological rigor as the reference paper. Latest
	# best-in-class computational technique. Every step upgraded technically;
	# every methodological commitment preserved.
	#
	# One MethodologyComparison per workbench. Each has:
	# - principle: header paragraph for the paper's methods section
	# - reference_papers: list of full citations
	# - rows: per-step 4-column comparison
	#
	# Serialized to Markdown for download + injection into papers.
	# ============================================================================

	from dataclasses import dataclass, field
	from typing import List
	from datetime import datetime


	@dataclass
	class ComparisonRow:
	    """A single step (one table row) of a methodology comparison.

	    Attributes:
	        step: Label of the step or phase this row describes.
	        commitment: Methodological commitment, unchanged across the
	            reference paper and our implementation.
	        reference_technique: What the reference paper used (2020-2022 tech).
	        our_technique: What we use (2026 best-in-class) and why it is better.
	    """

	    step: str
	    commitment: str
	    reference_technique: str
	    our_technique: str


	@dataclass
	class MethodologyComparison:
	    """Paper-ready methodology comparison for a single workbench.

	    Attributes:
	        workbench_name: Display name used in the Markdown title.
	        reference_papers: Citations, rendered as a bullet list.
	        principle: Paragraph rendered under the "Principle" heading.
	        rows: Per-step comparison rows, rendered as a four-column table.
	    """

	    workbench_name: str
	    reference_papers: List[str]
	    principle: str
	    rows: List[ComparisonRow] = field(default_factory=list)

	    @staticmethod
	    def _escape_cell(text: str) -> str:
	        """Return *text* made safe for a single Markdown table cell.

	        A bare ``|`` would close the cell early and a newline would end the
	        table row, so pipes are backslash-escaped and newlines become
	        ``<br>``.
	        """
	        return text.replace("|", "\\|").replace("\n", "<br>")

	    def as_markdown(self) -> str:
	        """Render the comparison as a Markdown document.

	        The output contains, in order: the title, a "Generated" timestamp
	        taken from ``datetime.now()`` at call time, the principle, the
	        reference papers as bullets, the step-by-step table, and a fixed
	        footer.

	        Returns:
	            The document as one string with lines joined by ``"\\n"``.
	        """
	        esc = self._escape_cell
	        lines = [
	            f"# Methodology Comparison — {self.workbench_name}",
	            "",
	            f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
	            "",
	            "## Principle",
	            "",
	            self.principle,
	            "",
	            "## Reference Papers",
	            "",
	        ]
	        lines.extend(f"- {paper}" for paper in self.reference_papers)
	        lines.extend(
	            [
	                "",
	                "## Step-by-Step Comparison",
	                "",
	                # Column delimiters must be real, unescaped pipes or the
	                # table does not render as a table.
	                "| Step | Methodological commitment | Reference technique (2020-2022) | Our technique (2026) + why better |",
	                "|---|---|---|---|",
	            ]
	        )
	        # Every cell, including the step label, goes through the same
	        # escaping so no cell content can break the row.
	        lines.extend(
	            f"| {esc(row.step)} | {esc(row.commitment)} | "
	            f"{esc(row.reference_technique)} | {esc(row.our_technique)} |"
	            for row in self.rows
	        )
	        lines.extend(
	            [
	                "",
	                "---",
	                "",
	                "*This comparison was auto-generated by the Researcher Workbench. "
	                "Paste directly into the methods section of your paper. "
	                "All method contracts referenced above are enforced in code — see `method_contracts.py` "
	                "for the grep-able registry.*",
	            ]
	        )
	        return "\n".join(lines)


	# ============================================================================
	# B&C Workbench — Braun & Clarke 2006 reflexive thematic analysis
	# ============================================================================
	# Comparison data for the B&C workbench: two citations, one principle
	# paragraph, and one ComparisonRow per Braun & Clarke phase (Phases 1-6).
	BC_COMPARISON = MethodologyComparison(
	    workbench_name="B&C Workbench (Reflexive Thematic Analysis)",
	    reference_papers=[
	        # Adjacent string literals concatenate, so each pair of lines below
	        # is ONE citation; the list holds two entries.
	        "Braun, V. & Clarke, V. (2006). Using thematic analysis in psychology. "
	        "Qualitative Research in Psychology, 3(2), 77-101.",
	        # NOTE(review): CGT_COMPARISON cites this paper's subtitle as
	        # "...computer-assisted text analysis"; here it stops at
	        # "computer-assisted." — confirm which wording is correct.
	        "Carlsen, H.B. & Ralund, S. (2022). Computational grounded theory revisited: "
	        "From computer-led to computer-assisted. Big Data & Society, 9(1).",
	    ],
	    # Single paragraph assembled from adjacent literals; the trailing space
	    # inside each literal is what separates the sentences.
	    principle=(
	        "We preserve the full methodological rigor of Braun & Clarke's (2006) six-phase "
	        "reflexive thematic analysis — reflexivity, systematic coverage, "
	        "semantic-or-latent analysis-wide choice, iterative refinement, researcher authority. "
	        "Every phase is implemented with the best computational technique available in 2026: "
	        "LLM-assisted code generation at pinned temperature 0.0, transformer-based embeddings "
	        "for theme clustering, embedding cohesion checks for theme review, and paper-cited "
	        "method contracts enforced in Python. The researcher validates every AI output via "
	        "named override widgets. Carlsen & Ralund's (2022) researcher-centrality principle "
	        "is preserved: AI assists, researcher approves."
	    ),
	    # Each row supplies the four cells of one table line in as_markdown().
	    rows=[
	        ComparisonRow(
	            step="Phase 1 — Familiarization",
	            commitment="B&C 2006 p. 87: researcher immerses in data, articulates reflexive positioning, confirms initial noticings before coding",
	            reference_technique="Manual reading of full corpus; notes in research journal; no computational assistance",
	            our_technique="LLM-facilitated dialogue (Mistral temp=0.0) + reflexive positioning as contract-enforced field (≥20 chars) + three-step validation table. Better: scales to 1000+ sentence corpora without abandoning reflexivity; positioning statement is auditable.",
	        ),
	        ComparisonRow(
	            step="Phase 2 — Initial Coding",
	            commitment="B&C 2006 p. 84: semantic XOR latent orientation (analysis-wide). p. 88: systematic coverage (every sentence coded). Reflexivity: researcher's positioning shapes every code.",
	            reference_technique="Researcher manually codes each sentence in a spreadsheet over weeks. No validation other than researcher re-reading.",
	            our_technique="Mistral temp=0.0 proposes codes across 3 iterations; reflexive positioning injected per prompt; researcher overrides via `human_code_iter1/2/3` + `flagged` + `final_code` columns. Hallucination bounded by exact-sentence-quote requirement. Reproducibility: identical corpus → identical codes. Contract: B&C 2006 p. 84, p. 88, reflexivity × 5.",
	        ),
	        ComparisonRow(
	            step="Phase 3 — Searching for Themes",
	            commitment="B&C 2006 p. 89: themes emerge from codes; patterns meaningful to research question; themes are tentative, iterative",
	            reference_technique="Researcher manually groups codes into themes on paper, sticky notes, or mind-map software. No computational clustering.",
	            our_technique="MiniLM 384-dim embeddings of codes + agglomerative clustering (cosine similarity, threshold ∈ [0.3, 0.95]) + Mistral names each cluster + researcher renames in theme table. Deterministic given fixed seed. Better: reveals semantic theme coherence invisible to manual grouping; researcher still decides final names.",
	        ),
	        ComparisonRow(
	            step="Phase 4 — Reviewing Themes",
	            commitment="B&C 2006 p. 91: Level 1 check (coded extracts cohere within theme) + Level 2 check (themes work across corpus)",
	            reference_technique="Researcher manually re-reads coded extracts against themes; refines or drops themes through discussion or introspection",
	            our_technique="Embedding-based cohesion score per theme (cluster tightness) + Mistral drafts keep/merge/split/drop/rename verdict + researcher enters `researcher_verdict`. Contract: B&C 2006 p. 91 × 3. Better: cohesion scores surface weak themes the researcher might miss; researcher still decides fate.",
	        ),
	        ComparisonRow(
	            step="Phase 5 — Defining and Naming",
	            commitment="B&C 2006 p. 92: each theme has a clear definition and a catchy name capturing its essence",
	            reference_technique="Researcher drafts theme definitions by hand based on coded extracts",
	            our_technique="Mistral drafts definition + catchy name per kept theme; researcher overrides via `researcher_definition` + `researcher_name` columns. Contract: B&C 2006 p. 92 × 3. Better: draft saves hours; researcher still authors final definitions.",
	        ),
	        ComparisonRow(
	            step="Phase 6 — Producing the Report",
	            commitment="B&C 2006 p. 93: weave theme definitions + data extracts + narrative answering research question",
	            reference_technique="Researcher writes full report manually, pulling extracts from coded dataset",
	            our_technique="Mistral drafts markdown report from definitions + codes + research question + reflexive positioning; researcher edits before save. Report methods section auto-includes this comparison table. Contract: B&C 2006 p. 93 × 2.",
	        ),
	    ],
	)


	# ============================================================================
	# G&W at Scale — Gauthier & Wallace 2022 computational thematic analysis
	# ============================================================================
	# Comparison data for the G&W-at-Scale workbench: three citations, one
	# principle paragraph, and four rows (Phase 0, Phase 1, Phase 2, Phases 3-6).
	GW_COMPARISON = MethodologyComparison(
	    workbench_name="G&W at Scale (Computational Thematic Analysis)",
	    reference_papers=[
	        # Adjacent string literals concatenate, so each pair of lines below
	        # is ONE citation; the list holds three entries.
	        "Gauthier, R.P. & Wallace, J.R. (2022). The Computational Thematic Analysis Toolkit. "
	        "Proc. ACM Hum.-Comput. Interact., 6(GROUP), Article 25.",
	        "Braun, V. & Clarke, V. (2006). Using thematic analysis in psychology. "
	        "Qualitative Research in Psychology, 3(2), 77-101.",
	        # NOTE(review): the other two registries in this file cite this paper
	        # with a subtitle after "revisited"; here the title is shortened —
	        # confirm the intended wording.
	        "Carlsen, H.B. & Ralund, S. (2022). Computational grounded theory revisited. "
	        "Big Data & Society, 9(1).",
	    ],
	    # Single paragraph assembled from adjacent literals; the trailing space
	    # inside each literal is what separates the sentences.
	    principle=(
	        "We preserve the full methodological rigor of Gauthier & Wallace's (2022) "
	        "Computational Thematic Analysis Toolkit — corpus compression before coding, "
	        "researcher validation of representative selection, reflexive engagement with "
	        "computationally-surfaced patterns. The core upgrade is architectural: we operate "
	        "at the sentence level using MiniLM contextual embeddings (384-dim transformer), "
	        "whereas G&W 2022 operated at the word level using bag-of-words LDA. G&W's Data "
	        "Cleaning (module 2) and Data Filtering (module 3) modules are therefore not "
	        "applicable to our pipeline — their purpose was to make word-frequency topic "
	        "modelling tractable, a problem that does not arise when semantic similarity is "
	        "computed directly over sentence embeddings. All downstream Braun & Clarke (2006) "
	        "Phase 1-6 commitments are preserved; Carlsen & Ralund's (2022) researcher-"
	        "centrality is enforced throughout. Phase 0 compression runs before Phase 1 "
	        "familiarization, following G&W's own framing of computational operations as "
	        "familiarization aids for large corpora."
	    ),
	    # Each row supplies the four cells of one table line in as_markdown().
	    rows=[
	        ComparisonRow(
	            step="Phase 0 — Corpus Compression",
	            commitment="G&W 2022 Art. 25: reduce large corpus to representative subset preserving semantic diversity; researcher validates selection before downstream phases consume it",
	            reference_technique="Word-level pipeline across four G&W modules: spaCy tokenization + stopword removal + lemmatization (module 2 Data Cleaning) + word include/exclude + frequency thresholds (module 3 Data Filtering) + LDA bag-of-words topic modelling with researcher-chosen k (module 4 Modelling) + purposive sampling near topic centroids (module 5 Sampling). Cleaning and filtering were required because LDA operates on word frequencies and collapses under raw text (stopwords dominate; morphology fragments signal).",
	            # The parenthesized literals below form ONE string with no
	            # newlines; items (1)-(6) are separated only by the trailing
	            # space inside each literal.
	            our_technique=(
	                "Sentence-level pipeline with peer-reviewed citation chain: "
	                "(1) MiniLM all-MiniLM-L6-v2 sentence embeddings, 384-dim contextual transformer (Reimers & Gurevych 2019, EMNLP) — captures syntax, semantics, word order in one pass, obviates word-level cleaning. "
	                "(2) UMAP dimensionality reduction to 10-dim for clustering stability (McInnes, Healy & Melville 2018). "
	                "(3) HDBSCAN hierarchical density-based clustering (Campello, Moulavi & Sander 2013, PAKDD, LNCS 7819:160–172; extended in Campello, Moulavi, Zimek & Sander 2015, ACM TKDD 10(1)). Cluster count discovered from data; min_cluster_size parameter is Campello et al.'s explicit mclSize. "
	                "(4) Representative selection by HDBSCAN density-tree cluster membership probability, ranked descending, top R per cluster (Campello et al. 2015 §4). NOT centroid-proximity — HDBSCAN produces non-spherical clusters where centroid-based selection is known to misrepresent (Grootendorst 2022, BERTopic). The probability score is 1.0 at the heart of a cluster's density region and 0.0 at the noise edge; ranking by this score is the methodologically native selection for density-based clustering. "
	                "(5) Software: McInnes, Healy & Astels 2017, JOSS 2(11):205 — hdbscan library. "
	                "(6) Researcher validation via editable `selected` column (Carlsen & Ralund 2022, BDS 9(1) researcher-centrality). "
	                "Cleaning and filtering modules are NOT APPLICABLE — our pipeline operates on sentence meaning not word frequency; stopwords carry semantic signal and must not be removed; morphology is handled inside MiniLM's subword tokenizer. Temp=0.0 throughout. Deterministic given fixed corpus (UMAP random_state=42; HDBSCAN deterministic given fixed input; outlier sampling np.random.seed(42)). Contract: G&W 2022 Art. 25 × 5. "
	                "Better than LDA: eliminates methodological drift from cleaning rules (different stopword lists → different LDA topics), eliminates researcher guesswork on k, produces reproducible output aligned to density rather than to spherical-cluster assumption."
	            ),
	        ),
	        ComparisonRow(
	            step="Phase 1 — Familiarization (on compressed corpus)",
	            commitment="B&C 2006 p. 87: researcher immerses in data, articulates reflexive positioning, confirms noticings. G&W 2022: on compressed corpus so familiarization is tractable at scale.",
	            reference_technique="G&W 2022 treated computational exploration itself as familiarization — no distinct Phase 1. Researcher browsed LDA topic keyword lists, adjusted filtering rules, manually reviewed samples.",
	            # NOTE(review): "643 representatives from 1000 sentences" is a
	            # fixed figure baked into the text; presumably from one specific
	            # run — confirm it should appear in every generated comparison.
	            our_technique="Explicit Phase 1 accordion after Phase 0 compression. LLM-facilitated familiarization dialogue on compressed corpus (643 representatives from 1000 sentences). Reflexive positioning injected into every downstream prompt (contract-enforced ≥20 chars). Contract: B&C 2006 p. 87 × 3. Better: makes familiarization auditable and separable from compression; preserves B&C reflexivity commitment explicitly.",
	        ),
	        ComparisonRow(
	            step="Phase 2 — Initial Coding",
	            commitment="B&C 2006 p. 84, p. 88: semantic-XOR-latent orientation; systematic coverage; reflexivity",
	            reference_technique="G&W 2022: researcher manually codes selected representatives in spreadsheet-like UI (Tkinter). No AI assistance.",
	            our_technique="Mistral temp=0.0 proposes codes across 3 iterations on compressed corpus; reflexive positioning per prompt; researcher overrides via `human_code_iter1/2/3` + `flagged` + `final_code`. Contract: B&C 2006 p. 84, p. 88, reflexivity × 5. Better: scales across representatives while preserving researcher authority; hallucination bounded by exact-sentence-quote requirement.",
	        ),
	        ComparisonRow(
	            step="Phase 3-6 — Themes → Review → Define → Report",
	            commitment="B&C 2006 Phases 3-6 as specified; applied to codes from compressed corpus",
	            reference_technique="G&W 2022: researcher manually creates theme visualizations (chord diagrams), manually reviews quotes, manually writes report",
	            our_technique="Same as B&C Workbench Phases 3-6 — embedding-based theme clustering, cohesion-scored review, LLM-drafted definitions and report with researcher override at every step. See B&C comparison for per-phase detail.",
	        ),
	    ],
	)


	# ============================================================================
	# CGT Workbench — Nelson 2020 computational grounded theory + C&R 2022
	# ============================================================================
	# Comparison data for the CGT workbench: two citations, one principle
	# paragraph, and one ComparisonRow per step of the three-step framework.
	CGT_COMPARISON = MethodologyComparison(
	    workbench_name="CGT Workbench (Computational Grounded Theory — Nelson + C&R)",
	    reference_papers=[
	        # Adjacent string literals concatenate, so each pair of lines below
	        # is ONE citation; the list holds two entries.
	        "Nelson, L.K. (2020). Computational grounded theory: A methodological framework. "
	        "Sociological Methods & Research, 49(1), 3-42.",
	        "Carlsen, H.B. & Ralund, S. (2022). Computational grounded theory revisited: "
	        "From computer-led to computer-assisted text analysis. Big Data & Society, 9(1).",
	    ],
	    # Single paragraph assembled from adjacent literals; the trailing space
	    # inside each literal is what separates the sentences.
	    principle=(
	        "We preserve the full methodological rigor of Nelson's (2020) three-step "
	        "computational grounded theory framework — Pattern Detection (unsupervised ML), "
	        "Pattern Refinement (researcher close-reading), Pattern Confirmation (supervised ML) — "
	        "with Carlsen & Ralund's (2022) researcher-centrality principle enforced at every "
	        "step. The 2020 framework used word2vec-era embeddings and k-means clustering for "
	        "detection, and bag-of-words + logistic regression for confirmation; we upgrade "
	        "both to sentence-transformer-based techniques while preserving the three-step "
	        "structure and researcher authority. Maps to traditional GT: Pattern Detection ≈ "
	        "open coding, Refinement ≈ axial coding, Confirmation ≈ selective coding."
	    ),
	    # Each row supplies the four cells of one table line in as_markdown().
	    rows=[
	        ComparisonRow(
	            step="Step 1 — Pattern Detection",
	            commitment="Nelson 2020: surface structural patterns via unsupervised ML; researcher interprets labels. C&R 2022: researcher approves labels, not algorithm.",
	            reference_technique="word2vec (2013-era word embeddings, context-blind) OR LDA bag-of-words; k-means clustering with k specified upfront; researcher manually reads cluster exemplars and names them",
	            our_technique="MiniLM all-MiniLM-L6-v2 sentence embeddings (384-dim, transformer-based, context-aware) + agglomerative clustering (cosine similarity, researcher-set threshold; cluster count discovered from data) + LLM drafts cluster labels + researcher validates and renames. Contract: Nelson 2020 × 4. Better: sentence-level semantics (word2vec was word-level, couldn't handle unseen vocabulary or multi-word context); agglomerative discovers cluster count (k-means required guessing k); LLM labeling + researcher override is faster and more auditable than manual cluster-by-cluster interpretation.",
	        ),
	        ComparisonRow(
	            step="Step 2 — Pattern Refinement",
	            commitment="Nelson 2020: deep reading of pattern exemplars; researcher refines pattern definitions; keep/merge/split/drop decisions",
	            reference_technique="Researcher manually reads clusters, writes memos in a notebook, decides fate of each pattern through introspection. No tool assistance beyond the clustering from Step 1.",
	            # NOTE(review): the "[Pending Turn 3 build]" marker and the
	            # "× TBD" contract count are part of the string, so they appear
	            # verbatim in the rendered table — update once the step is built.
	            our_technique="[Pending Turn 3 build] Tool surfaces top-N exemplars per pattern sorted by centroid proximity; LLM drafts interpretive memo per pattern; researcher writes final memo + enters keep/merge/split/drop/rename verdict. Contract: Nelson 2020 × TBD. Better: exemplar surfacing is reproducible; memo drafts save hours while preserving researcher's final interpretation.",
	        ),
	        ComparisonRow(
	            step="Step 3 — Pattern Confirmation",
	            commitment="Nelson 2020: test pattern generalizability via supervised ML on held-out sample; researcher inspects classifier failures",
	            reference_technique="Bag-of-words TF-IDF features + logistic regression classifier; k-fold cross-validation; researcher labels held-out sentences manually; researcher reads confusion matrix",
	            # NOTE(review): the "[Pending Turn 4 build]" marker and the
	            # "× TBD" contract count are part of the string, so they appear
	            # verbatim in the rendered table — update once the step is built.
	            our_technique="[Pending Turn 4 build] MiniLM sentence embeddings as features (semantic similarity, not just word overlap) + logistic regression classifier + researcher-labeled held-out split (A2 default = document-level split; A1 toggle = random 20/80 at sentence level) + confusion matrix + per-pattern precision/recall + researcher inspects classifier disagreements. Contract: Nelson 2020 × TBD. Better: sentence embeddings encode contextual meaning (bag-of-words couldn't distinguish 'I agree with management' from 'I agree management is bad' beyond word frequency); document-level split tests generalization across contexts, not just within one context, yielding stronger validity claim.",
	        ),
	    ],
	)


	# ============================================================================
	# Registry — for lookup from app.py
	# ============================================================================
	# Short workbench key -> its MethodologyComparison. Keys are plain
	# identifiers, so the keyword form of dict() builds the same mapping in the
	# same insertion order.
	COMPARISONS = dict(
	    bc=BC_COMPARISON,
	    gw=GW_COMPARISON,
	    cgt=CGT_COMPARISON,
	)


	# ============================================================================
	# Self-documentation
	# ============================================================================
	if __name__ == "__main__":
	    # When run as a script, dump every registered comparison to stdout,
	    # each under a banner framed by 78-character rules.
	    banner_rule = "=" * 78
	    for registry_key, comparison in COMPARISONS.items():
	        heading = f" {registry_key.upper()} — {comparison.workbench_name}"
	        print("\n" + banner_rule)
	        print(heading)
	        print(banner_rule + "\n")
	        print(comparison.as_markdown())