SciPeerAI-API / src /scipeerai /modules /methodology_checker.py
Abu-Sameer-66
feat: citation integrity analyzer — self-citation, unsupported claims, density — 27 total tests
c4139c8
# Methodology Logic Checker
# -------------------------
# The most subtle form of scientific fraud is not
# fabricating data — it is making claims that your
# method cannot actually support.
#
# "Correlation does not imply causation" is the
# famous example. But there are dozens of variations:
# underpowered claims, wrong timeframes, missing
# controls, self-report data for clinical conclusions.
#
# This module catches them — using both rule-based
# pattern matching and LLM-powered reasoning.
import re
import os
import json
import urllib.request
import urllib.error
from dataclasses import dataclass
from dotenv import load_dotenv
load_dotenv()
# ── data structures ───────────────────────────────────────────
@dataclass
class MethodologyFlag:
    """A single methodology concern raised by one of the rule-based checks."""

    # Identifier of the check that fired, e.g. "causation_without_rct",
    # "timeframe_mismatch", "missing_control_group".
    flag_type: str
    # Severity label; the risk scorer weights "high", "medium" and "low".
    severity: str
    # Short description of the kind of claim being questioned.
    claim: str
    # Explanation of why the claim is not supported by the method.
    issue: str
    # Sentence or short note pointing at what triggered the flag.
    evidence: str
    # Recommended way for the authors to address the issue.
    suggestion: str
@dataclass
class MethodologyResult:
    """Aggregated outcome of one methodology analysis run."""

    # Claim-like sentences extracted from the abstract (or full text).
    claims_found: list
    # Method-describing sentences extracted from the full text.
    methods_found: list
    # MethodologyFlag instances produced by the rule-based checks.
    flags: list
    # Free-text LLM assessment, or an explanatory fallback message.
    llm_assessment: str
    # True only when the LLM call produced an assessment.
    llm_available: bool
    # Weighted flag score, capped at 1.0.
    risk_score: float
    # "low", "medium", "high" or "critical", derived from risk_score.
    risk_level: str
    # One-line human-readable summary of the findings.
    summary: str
# ── main class ────────────────────────────────────────────────
class MethodologyChecker:
    """
    Two-layer methodology analysis.

    Layer 1 — Rule-based: fast, deterministic keyword checks for known
    patterns (causal language on weak designs, strong claims from
    self-report data, long-term claims from short studies, treatment
    effects without a control group, broad claims from limited samples).

    Layer 2 — LLM reasoning: slower, probabilistic; asks a hosted model
    to name the main logical gap between methods and claims. When the
    API token is missing or the call fails, an explanatory message is
    returned instead and the rule-based result is still produced.
    """

    # words that claim causation — need RCT to justify
    CAUSATION_WORDS = [
        "causes", "caused by", "leads to", "results in",
        "produces", "induces", "drives", "responsible for",
        "due to", "because of", "proves that", "demonstrates that"
    ]
    # words that only justify correlation
    CORRELATION_WORDS = [
        "associated with", "correlated", "linked to",
        "related to", "predicts", "suggests"
    ]
    # study designs that cannot prove causation
    WEAK_DESIGNS = [
        "survey", "questionnaire", "self-report", "cross-sectional",
        "retrospective", "observational", "case study", "anecdotal"
    ]
    # long-term claims need long-term studies
    LONGTERM_CLAIMS = [
        "long-term", "chronic", "sustained", "permanent",
        "lasting", "durable", "years", "lifetime"
    ]

    # Sentence boundary: terminal punctuation followed by whitespace or
    # end of text. A bare "[.!?]" split would also cut at decimal points
    # ("0.05"), truncating exactly the sentences that report results.
    _SENTENCE_END = re.compile(r"[.!?]+(?=\s|$)")
    # "rct" must be a whole word; as a plain substring it also matches
    # "infarct" / "infarction" and would hide the causation flag.
    _RCT_ACRONYM = re.compile(r"\brcts?\b")
    _RCT_PHRASES = (
        "randomized", "randomised", "control group",
        "placebo", "double-blind",
    )

    def __init__(self):
        self._hf_token = os.getenv("HF_API_TOKEN", "")
        # free model on HuggingFace — good at reasoning
        self._hf_model = "HuggingFaceH4/zephyr-7b-beta"
        self._hf_api_url = (
            f"https://api-inference.huggingface.co/models/{self._hf_model}"
        )

    # ── public method ─────────────────────────────────────────
    def analyze(self, text: str, abstract: str = "") -> "MethodologyResult":
        """
        Run the full methodology analysis.

        Args:
            text: Full paper text; used for method extraction and for
                every rule-based check.
            abstract: Optional abstract. When non-empty, claims are
                extracted from it instead of from ``text``.

        Returns:
            A MethodologyResult with extracted claims and methods, the
            rule-based flags, the LLM assessment (or fallback message),
            the risk score rounded to 3 decimals, the risk level and a
            one-line summary.
        """
        text = text or ""
        working_text = abstract if abstract else text
        claims = self._extract_claims(working_text)
        methods = self._extract_methods(text)

        flags = []
        flags.extend(self._check_causation_without_rct(text, claims))
        flags.extend(self._check_weak_design_strong_claim(text, claims))
        flags.extend(self._check_longterm_claim_shortterm_study(text, claims))
        flags.extend(self._check_missing_control_group(text))
        flags.extend(self._check_generalization(text, claims))

        # try LLM reasoning — graceful fallback if unavailable
        llm_text, llm_ok = self._llm_assess(
            claims=claims,
            methods=methods,
            paper_snippet=text[:1500],
        )

        risk_score = self._calculate_risk(flags)
        risk_level = self._get_risk_level(risk_score)
        return MethodologyResult(
            claims_found=claims,
            methods_found=methods,
            flags=flags,
            llm_assessment=llm_text,
            llm_available=llm_ok,
            risk_score=round(risk_score, 3),
            risk_level=risk_level,
            summary=self._write_summary(flags, risk_level, llm_ok),
        )

    # ── shared helpers ────────────────────────────────────────
    @classmethod
    def _split_sentences(cls, text: str) -> list:
        """Split text into stripped sentences without breaking decimals."""
        if not text:
            return []
        return [part.strip() for part in cls._SENTENCE_END.split(text)]

    @staticmethod
    def _contains_any(text_lower: str, terms) -> bool:
        """Return True if any of ``terms`` occurs as a substring."""
        return any(term in text_lower for term in terms)

    @classmethod
    def _mentions_rct(cls, text_lower: str) -> bool:
        """Return True if the lower-cased text mentions an RCT-style design."""
        if cls._contains_any(text_lower, cls._RCT_PHRASES):
            return True
        return cls._RCT_ACRONYM.search(text_lower) is not None

    def _sentences_with_markers(self, text: str, markers, limit: int = 8) -> list:
        """Return up to ``limit`` sentences (20+ chars) containing a marker."""
        matched = []
        for sentence in self._split_sentences(text):
            # very short fragments are headings or noise, not statements
            if len(sentence) < 20:
                continue
            if self._contains_any(sentence.lower(), markers):
                matched.append(sentence)
                if len(matched) == limit:
                    break
        return matched

    # ── claim / method extraction ─────────────────────────────
    def _extract_claims(self, text: str) -> list:
        """
        Pull claim-like sentences: those containing language that
        signals a finding or conclusion ("we found", "our results", ...).

        Returns at most 8 sentences, in document order, without their
        terminal punctuation.
        """
        claim_markers = (
            "we found", "we show", "we demonstrate", "we conclude",
            "our results", "our findings", "this study shows",
            "this study demonstrates", "results indicate",
            "results suggest", "data show", "analysis reveals",
            "we report", "evidence suggests", "we establish",
        )
        # cap at 8 — enough signal
        return self._sentences_with_markers(text, claim_markers, limit=8)

    def _extract_methods(self, text: str) -> list:
        """
        Pull sentences that describe how the study was conducted
        ("we recruited", "participants were", "randomized", ...).

        Returns at most 8 sentences, in document order, without their
        terminal punctuation.
        """
        method_markers = (
            "we used", "we conducted", "we recruited", "we collected",
            "we measured", "we analyzed", "participants were",
            "subjects were", "samples were", "data were collected",
            "randomized", "controlled", "double-blind", "survey",
            "questionnaire", "interview", "experiment",
        )
        return self._sentences_with_markers(text, method_markers, limit=8)

    # ── rule-based checks ─────────────────────────────────────
    def _check_causation_without_rct(self, text: str, claims: list) -> list:
        """
        Flag causal language combined with a study design that cannot
        establish causation and no mention of an RCT-style design.

        ``claims`` is accepted for signature parity with the other
        checks; the scan runs over the whole ``text``.
        """
        text_lower = (text or "").lower()
        # Collect matches over the WHOLE text so the issue message always
        # names the terms that actually triggered the flag.
        causal_terms = [w for w in self.CAUSATION_WORDS if w in text_lower]
        weak_designs = [d for d in self.WEAK_DESIGNS if d in text_lower]
        if not causal_terms or not weak_designs or self._mentions_rct(text_lower):
            return []

        # first sentence containing causal language serves as evidence
        evidence_sentence = next(
            (
                sentence
                for sentence in self._split_sentences(text)
                if self._contains_any(sentence.lower(), self.CAUSATION_WORDS)
            ),
            "",
        )
        return [MethodologyFlag(
            flag_type="causation_without_rct",
            severity="high",
            claim="Causal language detected in conclusions",
            issue=(
                "The study uses causal language "
                f"({', '.join(causal_terms)}) "
                "but the study design "
                f"({', '.join(weak_designs)}) "
                "cannot establish causation."
            ),
            evidence=evidence_sentence or "See causal language in conclusions",
            suggestion=(
                "Causal claims require randomized controlled trials. "
                "Replace causal language with correlation language, "
                "or acknowledge the design limitation explicitly."
            ),
        )]

    def _check_weak_design_strong_claim(
        self, text: str, claims: list
    ) -> list:
        """
        Flag conclusive wording ("proves", "confirms", ...) in a text
        that also mentions self-report instruments.
        """
        text_lower = (text or "").lower()
        has_self_report = self._contains_any(
            text_lower,
            ("self-report", "self report", "questionnaire",
             "survey", "interview", "participants reported"),
        )
        has_strong_claim = self._contains_any(
            text_lower,
            ("proves", "demonstrates", "establishes",
             "confirms", "validates", "clinical evidence"),
        )
        if not (has_self_report and has_strong_claim):
            return []
        return [MethodologyFlag(
            flag_type="weak_design_strong_claim",
            severity="medium",
            claim="Strong claim based on self-report data",
            issue=(
                "Self-report or questionnaire data has known limitations "
                "(social desirability bias, recall bias) that undermine "
                "strong conclusive claims."
            ),
            evidence="Self-report instrument combined with conclusive language",
            suggestion=(
                "Acknowledge self-report limitations explicitly. "
                "Soften conclusions to match data quality."
            ),
        )]

    def _check_longterm_claim_shortterm_study(
        self, text: str, claims: list
    ) -> list:
        """
        Flag long-term wording ("chronic", "lasting", ...) in a text
        that also describes a study lasting days or weeks.
        """
        text_lower = (text or "").lower()
        has_longterm_claim = self._contains_any(text_lower, self.LONGTERM_CLAIMS)
        has_shortterm_study = self._contains_any(
            text_lower,
            ("two weeks", "2 weeks", "one week", "1 week",
             "3 days", "7 days", "short-term pilot",
             "preliminary study"),
        )
        if not (has_longterm_claim and has_shortterm_study):
            return []
        return [MethodologyFlag(
            flag_type="timeframe_mismatch",
            severity="medium",
            claim="Long-term claim from short-term study",
            issue=(
                "The study duration appears insufficient to support "
                "long-term or chronic effect claims."
            ),
            evidence="Long-term language with short study duration",
            suggestion=(
                "Either extend the study duration or explicitly "
                "limit claims to short-term effects only."
            ),
        )]

    def _check_missing_control_group(self, text: str) -> list:
        """
        Flag treatment/intervention effect claims in a text that never
        mentions a control or comparison condition.
        """
        text_lower = (text or "").lower()
        has_treatment = self._contains_any(
            text_lower,
            ("treatment", "intervention", "drug", "therapy",
             "program", "training"),
        )
        has_effect_claim = self._contains_any(
            text_lower,
            ("improved", "reduced", "increased", "effective",
             "significant effect"),
        )
        has_control = self._contains_any(
            text_lower,
            ("control group", "control condition", "placebo",
             "comparison group", "waitlist"),
        )
        if not (has_treatment and has_effect_claim) or has_control:
            return []
        return [MethodologyFlag(
            flag_type="missing_control_group",
            severity="high",
            claim="Treatment effect claimed without control group",
            issue=(
                "Effect claims for a treatment or intervention "
                "require a control group to rule out confounds, "
                "placebo effects, and natural recovery."
            ),
            evidence="Treatment + effect language with no control group mention",
            suggestion=(
                "Add a control/comparison condition, or acknowledge "
                "that without a control group, the effect cannot be "
                "attributed to the intervention specifically."
            ),
        )]

    def _check_generalization(self, text: str, claims: list) -> list:
        """
        Flag population-level wording ("all patients", "universally")
        in a text that also describes a limited or convenience sample.
        """
        text_lower = (text or "").lower()
        has_broad_claim = self._contains_any(
            text_lower,
            ("all patients", "general population", "universally",
             "across all", "globally applicable", "all humans"),
        )
        has_limited_sample = self._contains_any(
            text_lower,
            ("undergraduate students", "college students",
             "single institution", "convenience sample",
             "homogeneous sample"),
        )
        if not (has_broad_claim and has_limited_sample):
            return []
        return [MethodologyFlag(
            flag_type="overgeneralization",
            severity="medium",
            claim="Broad generalization from limited sample",
            issue=(
                "The sample characteristics (e.g., undergraduate students, "
                "single institution) limit generalizability beyond "
                "the studied population."
            ),
            evidence="Broad claim language with limited sample description",
            suggestion=(
                "Explicitly acknowledge sampling limitations "
                "and restrict claims to the studied population."
            ),
        )]

    # ── llm reasoning ─────────────────────────────────────────
    def _llm_assess(
        self, claims: list, methods: list, paper_snippet: str
    ) -> tuple:
        """
        Ask the hosted LLM whether the methods logically support the claims.

        Args:
            claims: Extracted claim sentences; the first 3 are sent.
            methods: Extracted method sentences; the first 3 are sent.
            paper_snippet: Paper text; the first 800 characters are sent.

        Returns:
            ``(assessment_text, success)``. ``success`` is False, and the
            text is an explanatory message, when the token is missing or
            is the placeholder value, the request fails, or the response
            has an unexpected shape. This method does not raise.
        """
        if not self._hf_token or self._hf_token == "hf_xxxxxxxxxxxxxxxx":
            return (
                "LLM assessment unavailable — HF_API_TOKEN not configured.",
                False,
            )

        claims_text = " | ".join(claims[:3]) if claims else "Not extracted"
        methods_text = " | ".join(methods[:3]) if methods else "Not extracted"
        prompt = (
            "[INST] You are a scientific peer reviewer.\n"
            "Paper excerpt:\n"
            f"{paper_snippet[:800]}\n"
            f"Claims made: {claims_text}\n"
            f"Methods used: {methods_text}\n"
            "In 2-3 sentences, identify the most critical logical gap "
            "between the methods and claims. Be specific and direct. [/INST]"
        )

        try:
            payload = json.dumps({
                "inputs": prompt,
                "parameters": {
                    "max_new_tokens": 150,
                    "temperature": 0.3,
                    "return_full_text": False,
                },
            }).encode("utf-8")
            req = urllib.request.Request(
                self._hf_api_url,
                data=payload,
                headers={
                    "Authorization": f"Bearer {self._hf_token}",
                    "Content-Type": "application/json",
                },
                method="POST",
            )
            with urllib.request.urlopen(req, timeout=30) as response:
                result = json.loads(response.read().decode("utf-8"))

            if isinstance(result, list) and result:
                generated = result[0].get("generated_text", "").strip()
                if generated:
                    return (generated, True)
                return ("No assessment generated.", False)
            return ("Unexpected API response format.", False)
        except urllib.error.HTTPError as e:
            # handled separately so a 503 gets its own retry hint
            if e.code == 503:
                return ("LLM model loading — try again in 20 seconds.", False)
            return (f"API error {e.code}: {str(e)}", False)
        except Exception as e:
            # Deliberate best-effort boundary: any network or parsing
            # failure degrades to "unavailable" instead of failing the
            # whole analysis.
            return (f"LLM unavailable: {str(e)}", False)

    # ── scoring ───────────────────────────────────────────────
    def _calculate_risk(self, flags: list) -> float:
        """Sum per-severity weights over all flags, capped at 1.0."""
        weights = {"high": 0.35, "medium": 0.20, "low": 0.08}
        score = sum(weights.get(flag.severity, 0) for flag in flags)
        return min(score, 1.0)

    def _get_risk_level(self, score: float) -> str:
        """Map a risk score to "critical", "high", "medium" or "low"."""
        if score >= 0.7:
            return "critical"
        if score >= 0.4:
            return "high"
        if score >= 0.2:
            return "medium"
        return "low"

    def _write_summary(
        self, flags: list, risk_level: str, llm_ok: bool
    ) -> str:
        """
        Build the one-line summary: flag counts per severity, the risk
        level in upper case, and whether LLM reasoning was included.
        """
        if not flags:
            base = "No methodology logic issues detected."
        else:
            high = sum(1 for f in flags if f.severity == "high")
            med = sum(1 for f in flags if f.severity == "medium")
            low = sum(1 for f in flags if f.severity == "low")
            parts = []
            if high:
                parts.append(f"{high} high-severity issue{'s' if high > 1 else ''}")
            if med:
                parts.append(f"{med} medium-severity concern{'s' if med > 1 else ''}")
            if low:
                parts.append(f"{low} low-severity note{'s' if low > 1 else ''}")
            if not parts:
                # only unrecognized severities: still report a count
                # rather than emitting "flagged ."
                total = len(flags)
                parts.append(f"{total} issue{'s' if total > 1 else ''}")
            base = (
                f"Methodology checker flagged {', '.join(parts)}. "
                f"Risk level: {risk_level.upper()}."
            )
        llm_note = " LLM reasoning included." if llm_ok else " LLM reasoning unavailable."
        return base + llm_note