# Methodology Logic Checker
# -------------------------
# The most subtle form of scientific fraud is not
# fabricating data — it is making claims that your
# method cannot actually support.
#
# "Correlation does not imply causation" is the
# famous example. But there are dozens of variations:
# underpowered claims, wrong timeframes, missing
# controls, self-report data for clinical conclusions.
#
# This module catches them — using both rule-based
# pattern matching and LLM-powered reasoning.

import json
import os
import re
import urllib.error
import urllib.request
from dataclasses import dataclass
from typing import Iterable, List, Tuple

try:
    from dotenv import load_dotenv
except ImportError:
    # python-dotenv is only used to populate os.environ from a local .env
    # file; without it, HF_API_TOKEN can still come from the real environment.
    load_dotenv = None

if load_dotenv is not None:
    load_dotenv()


# ── data structures ───────────────────────────────────────────

@dataclass
class MethodologyFlag:
    """One rule-based finding about a mismatch between claims and methods."""

    flag_type: str    # machine-readable id, e.g. "causation_without_rct"
    severity: str     # "high" / "medium" / "low" (keys of the risk weights)
    claim: str        # short label of the kind of claim being questioned
    issue: str        # why the method cannot support that claim
    evidence: str     # sentence or description that triggered the flag
    suggestion: str   # how the author could resolve the issue


@dataclass
class MethodologyResult:
    """Everything `MethodologyChecker.analyze` returns for one paper."""

    claims_found: List[str]
    methods_found: List[str]
    flags: List[MethodologyFlag]
    llm_assessment: str   # LLM text, or a message explaining why there is none
    llm_available: bool   # True only when the LLM returned a non-empty answer
    risk_score: float     # 0.0–1.0, rounded to 3 decimals
    risk_level: str       # "low" / "medium" / "high" / "critical"
    summary: str


# ── main class ────────────────────────────────────────────────

class MethodologyChecker:
    """
    Two-layer methodology analysis:

    Layer 1 — Rule-based: fast, deterministic, catches known
    patterns (causation language, tiny samples, missing controls,
    short timeframes for long-term claims)

    Layer 2 — LLM reasoning: slower, probabilistic, catches subtle
    logical gaps that rules miss. Falls back gracefully if API
    unavailable.

    Keyword matching is anchored at a leading word boundary, so
    "proves" does not fire on "improves", "rct" does not fire on
    "infarct", and "2 weeks" does not fire on "12 weeks". Plural and
    inflected forms ("surveys", "rcts") still match.
    """

    # words that claim causation — need RCT to justify
    CAUSATION_WORDS = [
        "causes", "caused by", "leads to", "results in",
        "produces", "induces", "drives", "responsible for",
        "due to", "because of", "proves that", "demonstrates that"
    ]

    # words that only justify correlation
    CORRELATION_WORDS = [
        "associated with", "correlated", "linked to",
        "related to", "predicts", "suggests"
    ]

    # study designs that cannot prove causation
    WEAK_DESIGNS = [
        "survey", "questionnaire", "self-report", "cross-sectional",
        "retrospective", "observational", "case study", "anecdotal"
    ]

    # long-term claims need long-term studies
    LONGTERM_CLAIMS = [
        "long-term", "chronic", "sustained", "permanent",
        "lasting", "durable", "years", "lifetime"
    ]

    # contribution of each flag severity to the overall risk score
    _SEVERITY_WEIGHTS = {"high": 0.35, "medium": 0.20, "low": 0.08}

    # A sentence ends at ./!/? only when followed by whitespace or the end
    # of the text, so decimals such as "p = 0.03" stay inside one sentence.
    _SENTENCE_END = re.compile(r"[.!?]+(?=\s|$)")

    def __init__(self):
        self._hf_token = os.getenv("HF_API_TOKEN", "")
        # free model on HuggingFace — good at reasoning
        self._hf_model = "HuggingFaceH4/zephyr-7b-beta"
        self._hf_api_url = (
            f"https://api-inference.huggingface.co/models/{self._hf_model}"
        )

    # ── public method ─────────────────────────────────────────

    def analyze(self, text: str, abstract: str = "") -> MethodologyResult:
        """
        Full methodology analysis. Pass full paper text.
        Optionally pass abstract separately for cleaner
        claim extraction.

        Claims are extracted from `abstract` when it is non-empty,
        otherwise from `text`. Methods and all rule-based checks always
        use `text`. The risk score is derived from the rule-based flags
        only; the LLM assessment is reported alongside it.
        """
        claims = self._extract_claims(abstract or text)
        methods = self._extract_methods(text)

        flags: List[MethodologyFlag] = []
        flags.extend(self._check_causation_without_rct(text, claims))
        flags.extend(self._check_weak_design_strong_claim(text, claims))
        flags.extend(self._check_longterm_claim_shortterm_study(text, claims))
        flags.extend(self._check_missing_control_group(text))
        flags.extend(self._check_generalization(text, claims))

        # try LLM reasoning — graceful fallback if unavailable
        llm_text, llm_ok = self._llm_assess(
            claims=claims,
            methods=methods,
            paper_snippet=text[:1500],
        )

        risk_score = self._calculate_risk(flags)
        risk_level = self._get_risk_level(risk_score)

        return MethodologyResult(
            claims_found=claims,
            methods_found=methods,
            flags=flags,
            llm_assessment=llm_text,
            llm_available=llm_ok,
            risk_score=round(risk_score, 3),
            risk_level=risk_level,
            summary=self._write_summary(flags, risk_level, llm_ok),
        )

    # ── text helpers ──────────────────────────────────────────

    @staticmethod
    def _term_pattern(term: str) -> str:
        """
        Build a regex for `term` anchored at a leading word boundary.

        Whitespace inside a multi-word term matches any whitespace run,
        so phrases broken by a line wrap ("caused\\nby") are still found.
        """
        return r"\b" + r"\s+".join(re.escape(word) for word in term.split())

    @classmethod
    def _matched_terms(cls, text_lower: str, terms: Iterable[str]) -> List[str]:
        """Return the terms (in their given order) found in `text_lower`."""
        return [
            term for term in terms
            if re.search(cls._term_pattern(term), text_lower)
        ]

    @classmethod
    def _mentions_any(cls, text_lower: str, terms: Iterable[str]) -> bool:
        """Return True if at least one of `terms` occurs in `text_lower`."""
        return any(
            re.search(cls._term_pattern(term), text_lower) for term in terms
        )

    @classmethod
    def _split_sentences(cls, text: str) -> List[str]:
        """Split `text` into stripped, non-empty sentences (no terminators)."""
        pieces = (piece.strip() for piece in cls._SENTENCE_END.split(text))
        return [piece for piece in pieces if piece]

    def _sentences_matching(
        self, text: str, markers: Iterable[str], limit: int = 8
    ) -> List[str]:
        """
        Return up to `limit` sentences of `text` that contain a marker.

        Sentences shorter than 20 characters are skipped.
        """
        markers = list(markers)
        hits = [
            sentence for sentence in self._split_sentences(text)
            if len(sentence) >= 20
            and self._mentions_any(sentence.lower(), markers)
        ]
        return hits[:limit]

    # ── claim / method extraction ─────────────────────────────

    def _extract_claims(self, text: str) -> list:
        """
        Pull claim-like sentences — those that assert
        findings, conclusions, or implications.
        Looks for language that signals a conclusion.

        Returns at most 8 sentences, in document order.
        """
        claim_markers = [
            "we found", "we show", "we demonstrate", "we conclude",
            "our results", "our findings", "this study shows",
            "this study demonstrates", "results indicate",
            "results suggest", "data show", "analysis reveals",
            "we report", "evidence suggests", "we establish"
        ]
        return self._sentences_matching(text, claim_markers)  # cap at 8 — enough signal

    def _extract_methods(self, text: str) -> list:
        """
        Pull sentences that describe how the study was actually
        conducted (recruitment, measurement, design keywords).

        Returns at most 8 sentences, in document order.
        """
        method_markers = [
            "we used", "we conducted", "we recruited", "we collected",
            "we measured", "we analyzed", "participants were",
            "subjects were", "samples were", "data were collected",
            "randomized", "controlled", "double-blind", "survey",
            "questionnaire", "interview", "experiment"
        ]
        return self._sentences_matching(text, method_markers)

    # ── rule-based checks ─────────────────────────────────────
    # NOTE: the `claims` parameter of the checks below is accepted for a
    # uniform signature but is not consulted; every check scans `text`.

    def _check_causation_without_rct(self, text: str, claims: list) -> list:
        """
        Detects causal language combined with study designs that
        cannot establish causation.
        Classic example: observational study claiming X causes Y.

        Returns a one-element list with a high-severity flag, or [].
        """
        text_lower = text.lower()
        causal_terms = self._matched_terms(text_lower, self.CAUSATION_WORDS)
        weak_designs = self._matched_terms(text_lower, self.WEAK_DESIGNS)
        has_rct = self._mentions_any(
            text_lower,
            ["randomized", "randomised", "rct", "control group",
             "placebo", "double-blind"],
        )
        if not causal_terms or not weak_designs or has_rct:
            return []

        # find the actual causal sentence as evidence
        evidence_sentence = next(
            (
                sentence for sentence in self._split_sentences(text)
                if self._mentions_any(sentence.lower(), self.CAUSATION_WORDS)
            ),
            "",
        )

        return [MethodologyFlag(
            flag_type="causation_without_rct",
            severity="high",
            claim="Causal language detected in conclusions",
            issue=(
                "The study uses causal language "
                f"({', '.join(causal_terms)}) "
                "but the study design "
                f"({', '.join(weak_designs)}) "
                "cannot establish causation."
            ),
            evidence=evidence_sentence or "See causal language in conclusions",
            suggestion=(
                "Causal claims require randomized controlled trials. "
                "Replace causal language with correlation language, "
                "or acknowledge the design limitation explicitly."
            ),
        )]

    def _check_weak_design_strong_claim(
        self, text: str, claims: list
    ) -> list:
        """
        Self-report surveys and questionnaires cannot support
        strong clinical or behavioral conclusions.

        Returns a one-element list with a medium-severity flag, or [].
        """
        text_lower = text.lower()
        has_self_report = self._mentions_any(
            text_lower,
            ["self-report", "self report", "questionnaire",
             "survey", "interview", "participants reported"],
        )
        has_strong_claim = self._mentions_any(
            text_lower,
            ["proves", "demonstrates", "establishes",
             "confirms", "validates", "clinical evidence"],
        )
        if not (has_self_report and has_strong_claim):
            return []

        return [MethodologyFlag(
            flag_type="weak_design_strong_claim",
            severity="medium",
            claim="Strong claim based on self-report data",
            issue=(
                "Self-report or questionnaire data has known limitations "
                "(social desirability bias, recall bias) that undermine "
                "strong conclusive claims."
            ),
            evidence="Self-report instrument combined with conclusive language",
            suggestion=(
                "Acknowledge self-report limitations explicitly. "
                "Soften conclusions to match data quality."
            ),
        )]

    def _check_longterm_claim_shortterm_study(
        self, text: str, claims: list
    ) -> list:
        """
        Studies lasting days or weeks cannot make
        long-term or chronic effect claims.

        Returns a one-element list with a medium-severity flag, or [].
        """
        text_lower = text.lower()
        has_longterm_claim = self._mentions_any(
            text_lower, self.LONGTERM_CLAIMS
        )
        has_shortterm_study = self._mentions_any(
            text_lower,
            ["two weeks", "2 weeks", "one week", "1 week",
             "3 days", "7 days", "short-term pilot", "preliminary study"],
        )
        if not (has_longterm_claim and has_shortterm_study):
            return []

        return [MethodologyFlag(
            flag_type="timeframe_mismatch",
            severity="medium",
            claim="Long-term claim from short-term study",
            issue=(
                "The study duration appears insufficient to support "
                "long-term or chronic effect claims."
            ),
            evidence="Long-term language with short study duration",
            suggestion=(
                "Either extend the study duration or explicitly "
                "limit claims to short-term effects only."
            ),
        )]

    def _check_missing_control_group(self, text: str) -> list:
        """
        Studies measuring treatment effects without a control
        group cannot isolate the treatment's impact.

        Returns a one-element list with a high-severity flag, or [].
        """
        text_lower = text.lower()
        has_treatment = self._mentions_any(
            text_lower,
            ["treatment", "intervention", "drug", "therapy",
             "program", "training"],
        )
        has_effect_claim = self._mentions_any(
            text_lower,
            ["improved", "reduced", "increased", "effective",
             "significant effect"],
        )
        has_control = self._mentions_any(
            text_lower,
            ["control group", "control condition", "placebo",
             "comparison group", "waitlist"],
        )
        if not (has_treatment and has_effect_claim) or has_control:
            return []

        return [MethodologyFlag(
            flag_type="missing_control_group",
            severity="high",
            claim="Treatment effect claimed without control group",
            issue=(
                "Effect claims for a treatment or intervention "
                "require a control group to rule out confounds, "
                "placebo effects, and natural recovery."
            ),
            evidence="Treatment + effect language with no control group mention",
            suggestion=(
                "Add a control/comparison condition, or acknowledge "
                "that without a control group, the effect cannot be "
                "attributed to the intervention specifically."
            ),
        )]

    def _check_generalization(self, text: str, claims: list) -> list:
        """
        Small, homogeneous samples cannot support
        broad population-level generalizations.

        Returns a one-element list with a medium-severity flag, or [].
        """
        text_lower = text.lower()
        has_broad_claim = self._mentions_any(
            text_lower,
            ["all patients", "general population", "universally",
             "across all", "globally applicable", "all humans"],
        )
        has_limited_sample = self._mentions_any(
            text_lower,
            ["undergraduate students", "college students",
             "single institution", "convenience sample",
             "homogeneous sample"],
        )
        if not (has_broad_claim and has_limited_sample):
            return []

        return [MethodologyFlag(
            flag_type="overgeneralization",
            severity="medium",
            claim="Broad generalization from limited sample",
            issue=(
                "The sample characteristics (e.g., undergraduate students, "
                "single institution) limit generalizability beyond "
                "the studied population."
            ),
            evidence="Broad claim language with limited sample description",
            suggestion=(
                "Explicitly acknowledge sampling limitations "
                "and restrict claims to the studied population."
            ),
        )]

    # ── llm reasoning ─────────────────────────────────────────

    def _llm_assess(
        self, claims: list, methods: list, paper_snippet: str
    ) -> Tuple[str, bool]:
        """
        Ask an LLM to reason about whether the methods
        logically support the claims.

        Returns (assessment_text, success_bool).
        Falls back gracefully if token missing or API down: every
        failure is reported as (message, False) and never raised.
        Only the first 3 claims, first 3 methods and first 800
        characters of `paper_snippet` are sent.
        """
        if not self._hf_token or self._hf_token == "hf_xxxxxxxxxxxxxxxx":
            return (
                "LLM assessment unavailable — HF_API_TOKEN not configured.",
                False
            )

        claims_text = " | ".join(claims[:3]) if claims else "Not extracted"
        methods_text = " | ".join(methods[:3]) if methods else "Not extracted"

        # NOTE(review): the prompt's original line breaks were lost in the
        # whitespace-collapsed source; this layout is a reconstruction —
        # confirm against the deployed prompt if exact formatting matters.
        prompt = (
            "[INST] You are a scientific peer reviewer.\n\n"
            f"Paper excerpt: {paper_snippet[:800]}\n\n"
            f"Claims made: {claims_text}\n\n"
            f"Methods used: {methods_text}\n\n"
            "In 2-3 sentences, identify the most critical logical gap "
            "between the methods and claims. Be specific and direct.\n"
            "[/INST]"
        )

        try:
            payload = json.dumps({
                "inputs": prompt,
                "parameters": {
                    "max_new_tokens": 150,
                    "temperature": 0.3,
                    "return_full_text": False,
                }
            }).encode("utf-8")

            req = urllib.request.Request(
                self._hf_api_url,
                data=payload,
                headers={
                    "Authorization": f"Bearer {self._hf_token}",
                    "Content-Type": "application/json",
                },
                method="POST"
            )

            with urllib.request.urlopen(req, timeout=30) as response:
                result = json.loads(response.read().decode("utf-8"))

            # expected shape: a non-empty list whose first item is a dict
            if isinstance(result, list) and result and isinstance(result[0], dict):
                text = result[0].get("generated_text", "").strip()
                return (text, True) if text else ("No assessment generated.", False)

            return ("Unexpected API response format.", False)

        except urllib.error.HTTPError as e:
            if e.code == 503:
                return ("LLM model loading — try again in 20 seconds.", False)
            return (f"API error {e.code}: {str(e)}", False)
        except Exception as e:
            # Deliberate catch-all: the LLM layer is best-effort and must
            # never break the rule-based analysis.
            return (f"LLM unavailable: {str(e)}", False)

    # ── scoring ───────────────────────────────────────────────

    def _calculate_risk(self, flags: list) -> float:
        """Sum the severity weights of `flags`, capped at 1.0."""
        score = sum(
            (self._SEVERITY_WEIGHTS.get(f.severity, 0) for f in flags), 0.0
        )
        return min(score, 1.0)

    def _get_risk_level(self, score: float) -> str:
        """Map a 0–1 risk score to "critical" / "high" / "medium" / "low"."""
        if score >= 0.7:
            return "critical"
        if score >= 0.4:
            return "high"
        if score >= 0.2:
            return "medium"
        return "low"

    def _write_summary(
        self, flags: list, risk_level: str, llm_ok: bool
    ) -> str:
        """
        Build the one-line human-readable summary.

        Counts flags per severity; flags with an unrecognised severity are
        reported as "unclassified" so the sentence is never left empty.
        """
        if not flags:
            base = "No methodology logic issues detected."
        else:
            labels = [
                ("high", "high-severity issue"),
                ("medium", "medium-severity concern"),
                ("low", "low-severity note"),
            ]
            known = {severity for severity, _ in labels}
            counts = [
                (sum(1 for f in flags if f.severity == severity), label)
                for severity, label in labels
            ]
            counts.append(
                (sum(1 for f in flags if f.severity not in known),
                 "unclassified flag")
            )
            parts = [
                f"{count} {label}{'s' if count > 1 else ''}"
                for count, label in counts if count
            ]
            base = (
                f"Methodology checker flagged {', '.join(parts)}. "
                f"Risk level: {risk_level.upper()}."
            )

        llm_note = " LLM reasoning included." if llm_ok else " LLM reasoning unavailable."
        return base + llm_note