CRAAP-bot / app.py
nadyaw's picture
Update app.py
3f7bec5 verified
import io, re, json, datetime,os
from typing import Dict, Any, List, Tuple, Optional
from flask import Flask, request, jsonify, render_template_string, redirect, url_for
from flask_cors import CORS
import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
# Flask application object; the route decorators further down register on it.
app = Flask(__name__)
# Allow cross-origin requests from any origin, but only for paths under /api/.
CORS(app, resources={r"/api/*": {"origins": "*"}})
app.config["MAX_CONTENT_LENGTH"] = 16 * 1024 * 1024 # 16 MB upload cap
# Calendar year captured once at import time (not refreshed while the process runs).
THIS_YEAR = datetime.date.today().year
# Case-insensitive DOI matcher: "10.", 4-9 digits, "/", then a suffix; group 1 is the whole DOI.
DOI_RX = re.compile(r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)", re.I)
# Optional Semantic Scholar API key from the environment; None when the variable is unset.
S2_API_KEY = os.getenv("SEMANTIC_SCHOLAR_API_KEY")
def _clean(s: Optional[str]) -> str:
    """Return ``s`` without surrounding whitespace; any falsy input becomes ""."""
    if not s:
        return ""
    return s.strip()
def year_from_any(x: str) -> Optional[int]:
    """Return the first plausible four-digit year (1900-2099) found in ``x``.

    A standalone year (one that is not part of a longer digit run) is
    preferred, so that numbers such as "120199" do not shadow a real year
    appearing later in the text. If no standalone year exists, the first
    "19xx"/"20xx" occurrence anywhere is used, which keeps compact dates such
    as "20230115" working.

    Returns:
        The year as an int, or None for empty input or when nothing
        year-like is present.
    """
    if not x:
        return None
    m = (
        re.search(r"(?<!\d)(?:19|20)\d{2}(?!\d)", x)
        or re.search(r"(?:19|20)\d{2}", x)
    )
    # The pattern itself limits matches to 1900-2099, so no range check is needed.
    return int(m.group(0)) if m else None
def fetch_url_metadata(url_or_doi: str):
    """Fetch the page behind a URL or DOI and scrape bibliographic metadata.

    Args:
        url_or_doi: An http(s) URL, or text containing a DOI. Input that does
            not start with "http" but contains a DOI is resolved through
            https://doi.org/.

    Returns:
        A ``(meta, text_excerpt, warnings)`` tuple. ``meta`` has the keys
        ``title``, ``authors``, ``venue``, ``year`` and ``identifier``
        (``{"doi": ..., "url": ...}``). ``text_excerpt`` is the abstract text
        capped at 4000 characters ("" when none is found). If the request
        fails, the result is ``({}, "", [message])``.
    """
    warnings = []
    url = url_or_doi
    input_doi = None
    m = DOI_RX.search(url_or_doi)
    if m and not url_or_doi.lower().startswith("http"):
        # Bare DOI (or "doi:..." text): go through the doi.org resolver.
        input_doi = m.group(1)
        url = f"https://doi.org/{input_doi}"
    # NOTE(security): `url` is caller-supplied and fetched server-side without
    # host validation; restrict targets if this runs next to internal services.
    try:
        r = requests.get(url, timeout=20, headers={"User-Agent":"CRAAPBot"})
        r.raise_for_status()
    except Exception as e:
        # Best-effort boundary: every failure is reported as a warning, never raised.
        return {}, "", [f"Failed to fetch URL/DOI: {e}"]
    html = r.text
    soup = BeautifulSoup(html, "html.parser")
    meta = {}

    def mget(*names):
        """Return the content of the first listed <meta> key (name= or property=) that is non-empty."""
        for n in names:
            tag = soup.find("meta", attrs={"name": n}) or soup.find("meta", attrs={"property": n})
            if tag and tag.get("content"):
                return tag["content"]
        return None

    meta["title"] = _clean(mget("citation_title") or (soup.title.string if soup.title else ""))

    author_tags = soup.find_all("meta", attrs={"name":"citation_author"})
    if author_tags:
        candidates = [_clean(a.get("content","")) for a in author_tags]
    else:
        candidates = [_clean(mget("author") or "")]
    meta["authors"] = [a for a in candidates if a]

    meta["venue"] = _clean(mget("citation_journal_title") or mget("og:site_name") or "")

    y = year_from_any(_clean(mget("citation_publication_date") or mget("date") or mget("article:published_time") or ""))
    # Last resort: the first year-like number anywhere in the raw HTML.
    meta["year"] = y if y else year_from_any(html)

    # DOI precedence: explicit citation_doi meta tag, then the DOI the caller
    # typed in, then the first DOI-looking string on the page (which may
    # belong to a cited reference, so it is the least trustworthy).
    doi = _clean(mget("citation_doi"))
    if not doi and input_doi:
        doi = input_doi
    if not doi:
        page_doi = DOI_RX.search(html)
        if page_doi:
            # Drop sentence punctuation that the permissive DOI pattern swallows.
            doi = page_doi.group(1).rstrip(".;:")
    meta["identifier"] = {"doi": doi if doi else None, "url": url}

    abst = mget("citation_abstract")
    if not abst:
        # Fallback: first section/div/p whose visible text starts with "abstract".
        absnode = soup.find(lambda tag: tag.name in ["section","div","p"] and tag.get_text(strip=True).lower().startswith("abstract"))
        if absnode:
            abst = absnode.get_text(" ", strip=True)
    text_excerpt = (abst or "")[:4000]
    return meta, text_excerpt, warnings
def extract_pdf_text_and_guess_meta(file_storage):
    """Extract text from an uploaded PDF and heuristically guess its metadata.

    Args:
        file_storage: Object exposing ``read()`` that returns the PDF bytes.

    Returns:
        A ``(meta, text, warnings)`` tuple. ``meta`` has the keys ``title``
        (first non-blank line), ``authors``, ``venue`` (always ""), ``year``
        and ``identifier``. ``text`` is the text of up to the first 10 pages,
        capped at 20000 characters. Any parsing failure yields
        ``({}, "", [message])`` instead of raising.
    """
    warnings = []
    try:
        data = file_storage.read()
        reader = PdfReader(io.BytesIO(data))
        n = len(reader.pages)
        if n == 0:
            return {}, "", ["PDF appears empty."]
        body_pages = min(10, n)
        # Extract every page once; the first two pages double as the "head"
        # used for title/author/year guesses.
        page_texts = [reader.pages[i].extract_text() or "" for i in range(body_pages)]
        head_txt = "\n".join(page_texts[:2])
        body_txt = "\n".join(page_texts)

        lines = [ln.strip() for ln in head_txt.splitlines() if ln.strip()]
        title = lines[0] if lines else ""

        authors_line = ""
        # Start at the second line: lines[0] is already used as the title, and
        # a title containing a comma or " and " would otherwise be split into
        # bogus author names.
        for line in lines[1:10]:
            if re.search(r"[A-Z][a-z]+(?:\s[A-Z]\.){0,3}", line) and ("," in line or " and " in line.lower()):
                authors_line = line
                break
        authors = [a.strip() for a in re.split(r",|;| and ", authors_line) if a.strip()] if authors_line else []

        venue = ""
        y = year_from_any(head_txt)
        m = DOI_RX.search(head_txt) or DOI_RX.search(body_txt)
        # Extracted prose often ends a DOI with sentence punctuation that the
        # permissive pattern swallows; strip it so lookups by DOI can succeed.
        doi = m.group(1).rstrip(".;:") if m else None
        meta = {
            "title": _clean(title),
            "authors": authors,
            "venue": _clean(venue),
            "year": y,
            "identifier": {"doi": doi, "url": None}
        }
        if body_pages < 5:
            warnings.append("Only a small portion of the PDF text was extracted; Accuracy/Purpose may be provisional.")
        return meta, body_txt[:20000], warnings
    except Exception as e:
        # Best-effort boundary: malformed or encrypted PDFs become a warning.
        return {}, "", [f"Failed to parse PDF: {e}"]
def fetch_semantic_scholar(doi: str):
    """Look up a paper on the Semantic Scholar Graph API by its DOI.

    Returns an ``(enrichment, warnings)`` pair. ``enrichment`` is
    ``{"s2": {...}}`` on success and ``{}`` otherwise; a 404 response gives
    ``({}, [])`` and any other failure gives ``({}, [message])``.
    """
    if not doi:
        return {}, ["No DOI provided"]

    endpoint = "https://api.semanticscholar.org/graph/v1/paper/DOI:" + requests.utils.quote(doi)
    wanted_fields = (
        "title", "year", "publicationDate", "journal", "url",
        "isOpenAccess", "openAccessPdf", "citationCount", "influentialCitationCount",
        "authors.name", "fieldsOfStudy", "publicationTypes",
    )
    request_headers = {"User-Agent":"CRAAPBot"}
    if S2_API_KEY:
        request_headers["x-api-key"] = S2_API_KEY

    try:
        resp = requests.get(
            endpoint,
            params={"fields": ",".join(wanted_fields)},
            headers=request_headers,
            timeout=12,
        )
        if resp.status_code == 404:
            # Unknown DOI is not an error worth surfacing.
            return {}, []
        resp.raise_for_status()
        paper = resp.json()

        # Nested objects may be null in the response; treat them as empty.
        journal_info = paper.get("journal") or {}
        oa_pdf = paper.get("openAccessPdf") or {}
        author_names = []
        for entry in paper.get("authors") or []:
            name = entry.get("name")
            if name:
                author_names.append(name)

        details = {
            "title": paper.get("title"),
            "year": paper.get("year"),
            "publicationDate": paper.get("publicationDate"),
            "journal": journal_info.get("name"),
            "url": paper.get("url"),
            "isOpenAccess": paper.get("isOpenAccess"),
            "openAccessPdf": oa_pdf.get("url"),
            "citationCount": paper.get("citationCount"),
            "influentialCitationCount": paper.get("influentialCitationCount"),
            "authors": author_names,
            "fieldsOfStudy": paper.get("fieldsOfStudy"),
            "publicationTypes": paper.get("publicationTypes"),
        }
        return {"s2": details}, []
    except Exception as exc:
        return {}, [f"Semantic Scholar lookup failed: {exc}"]
def score_currency(year: Optional[int]):
    """Score how recent a publication is.

    Args:
        year: Publication year, or None/0 when unknown.

    Returns:
        A ``(score, evidence, checks)`` tuple: an int score (2-5), a
        human-readable evidence sentence, and a list of follow-up notes.
    """
    if not year:
        return 2, "Publication year unknown.", ["Could not find a clear date; treat with caution."]
    # Read the clock per call so a long-running server stays correct across New Year.
    this_year = datetime.date.today().year
    if year > this_year + 1:
        # Years are scraped heuristically; one far in the future is a
        # mis-detection and must not earn the top score. (One year ahead is
        # tolerated for post-dated journal issues.)
        return 2, f"Detected year {year} is in the future.", ["Date was probably mis-detected; verify the publication year."]
    age = max(0, this_year - year)
    if age <= 2:
        return 5, f"Published in {year} (≤2 years old).", ["Recent for fast-moving fields."]
    if age <= 5:
        return 4, f"Published in {year} (~{age} years old).", []
    if age <= 10:
        return 3, f"Published in {year} (~{age} years old).", []
    return 2, f"Published in {year} (>10 years old).", ["Potentially outdated."]
def score_authority(meta: Dict[str,Any]):
    """Score source authority from venue, DOI and author-count signals.

    Starts at 1 and adds one point each for a known venue, a DOI, and three
    or more authors. Returns ``(score, evidence)``.
    """
    points = 1
    evidence = []

    venue = meta.get("venue")
    if venue:
        points += 1
        evidence.append(f"Venue: {venue}.")

    if meta.get("identifier",{}).get("doi"):
        points += 1
        evidence.append("Has DOI.")

    author_list = meta.get("authors")
    if author_list:
        n_authors = len(author_list)
        if n_authors >= 3:
            points += 1
        evidence.append(f"Authors: {n_authors}.")

    summary = "; ".join(evidence) if evidence else "Insufficient venue/author info."
    return min(points, 5), summary
def score_accuracy(text_excerpt: str):
    """Score methodological rigor by counting cue phrases in the text.

    Args:
        text_excerpt: Body or abstract text; empty or None means no text.

    Returns:
        ``(score, evidence)`` where the score rises with the number of
        distinct cue phrases found (case-insensitive substring match).
    """
    # Guard first so empty/None input never reaches .lower().
    if not text_excerpt:
        return 2, "No body text available; cannot inspect methods."
    lowered = text_excerpt.lower()  # lower-case once, not once per cue
    cues = ("methods", "materials", "results", "limitations",
            "confidence interval", "validation", "dataset", "sample size")
    keys_present = sum(1 for k in cues if k in lowered)
    if keys_present >= 5:
        return 5, "Detailed methodological cues detected (methods/results/validation/etc.)."
    if keys_present >= 3:
        return 4, "Some methodological cues present."
    if keys_present >= 1:
        return 3, "Limited methodological signals."
    return 2, "Minimal methodological detail detected (likely a commentary/overview)."
def score_purpose(text_excerpt: str):
    """Score the apparent purpose of a text from promotional and disclosure cues.

    Returns ``(score, evidence)``. Promotional wording wins over everything
    else; a conflict statement without any funding mention scores lower than
    text that mentions funding.
    """
    haystack = text_excerpt.lower()

    for promo_word in ("sponsored", "advertisement", "marketing"):
        if promo_word in haystack:
            return 2, "Potential promotional language detected."

    mentions_funding = any(cue in haystack for cue in ("funding", "grant"))
    mentions_conflicts = any(cue in haystack for cue in ("conflict of interest", "competing interest"))

    if mentions_funding:
        return 4, "Academic tone with disclosures/funding statements."
    if mentions_conflicts:
        return 3, "Conflicts noted, funding unclear."
    return 4, "Academic/educational purpose inferred."
def score_relevance(assignment_context: str, meta: Dict[str,Any], text_excerpt: str):
    """Score topical overlap between the assignment context and the source.

    Counts distinct context words of four or more letters that occur
    (case-insensitively, as substrings) in the title plus text excerpt.
    Returns ``(score, evidence)``; with no context a default of 4 is returned.
    """
    if not assignment_context:
        return 4, "General relevance assumed (no assignment context provided)."

    context_lower = assignment_context.lower()
    searchable = (meta.get("title","") + " " + text_excerpt).lower()
    context_words = set(re.findall(r"[a-zA-Z]{4,}", context_lower))

    overlap = 0
    for word in context_words:
        if word in searchable:
            overlap += 1

    # (minimum overlap, score, evidence), checked from strongest to weakest.
    tiers = (
        (6, 5, "Strong topical overlap with assignment context."),
        (3, 4, "Good topical overlap."),
        (1, 3, "Partial topical overlap."),
    )
    for minimum, score, note in tiers:
        if overlap >= minimum:
            return score, note
    return 2, "Low topical overlap; may be tangential."
def aggregate_scores(meta: Dict[str,Any], text: str, assignment_context: str, provisional: bool):
    """Run all five CRAAP scorers and combine them into one report.

    When ``provisional`` is true, Accuracy is capped at 3 and Purpose at 4.
    Returns a dict with ``metadata``, the per-criterion ``craap`` breakdown,
    and ``overall`` (average rounded to 2 decimals plus a verdict string).
    """
    cur_score, cur_note, cur_checks = score_currency(meta.get("year"))
    auth_score, auth_note = score_authority(meta)
    acc_score, acc_note = score_accuracy(text)
    pur_score, pur_note = score_purpose(text)
    rel_score, rel_note = score_relevance(assignment_context, meta, text)

    if provisional:
        # Limited evidence: do not let text-based criteria reach their maximum.
        if acc_score > 3:
            acc_score = 3
        if pur_score > 4:
            pur_score = 4

    craap = {
        "Currency": {"score": cur_score, "evidence": cur_note, "checks": cur_checks},
        "Relevance": {"score": rel_score, "evidence": rel_note},
        "Authority": {"score": auth_score, "evidence": auth_note},
        "Accuracy": {"score": acc_score, "evidence": acc_note},
        "Purpose": {"score": pur_score, "evidence": pur_note}
    }

    total = 0
    for entry in craap.values():
        total += entry["score"]
    avg = round(total / 5, 2)

    if avg >= 4.0:
        verdict = "use"
    elif avg >= 2.5:
        verdict = "use with caution"
    else:
        verdict = "avoid"

    return {"metadata": meta, "craap": craap, "overall": {"average": avg, "verdict": verdict}}
# Single-page Jinja2 template (rendered with render_template_string): the
# submission form plus, when `result` is set, the evaluation summary.
# Separators, arrows, dashes and emoji are stored as real UTF-8 characters;
# earlier mis-encoded sequences (e.g. "Β·", "β€”") were repaired.
INDEX_HTML = """
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<title>CRAAP Bot · Flask</title>
<meta name="viewport" content="width=device-width, initial-scale=1">
<style>
:root{
--bg:#f8fafc;
--card:#ffffff;
--ink:#0f172a;
--muted:#64748b;
--line:#e2e8f0;
--brand:#111827;
--accent:#2563eb;
--warn-bg:#fff7ed;
--warn-line:#fed7aa;
--code-bg:#0b1020;
--code-ink:#d7e7ff;
--ring:#93c5fd;
--shadow:0 1px 2px rgba(0,0,0,.05), 0 10px 16px rgba(2,6,23,.04);
}
@media (prefers-color-scheme: dark){
:root{
--bg:#0b1220;
--card:#0f172a;
--ink:#e5e7eb;
--muted:#94a3b8;
--line:#1f2a44;
--brand:#e5e7eb;
--accent:#60a5fa;
--warn-bg:#2b1f12;
--warn-line:#9a5a25;
--code-bg:#030712;
--code-ink:#d7e7ff;
--ring:#2563eb;
--shadow:0 1px 2px rgba(0,0,0,.4), 0 12px 20px rgba(0,0,0,.35);
}
}
*{box-sizing:border-box}
html,body{height:100%}
body{
margin:0;
background:var(--bg);
color:var(--ink);
font:16px/1.55 system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, Apple Color Emoji, Segoe UI Emoji, Noto Color Emoji, sans-serif;
}
.wrap{max-width:980px;margin:2.2rem auto;padding:0 1rem}
header{
padding:1.25rem 1rem 1rem;
border-radius:16px;
background:linear-gradient(135deg, rgba(37,99,235,.10), rgba(2,6,23,.03));
border:1px solid var(--line);
box-shadow:var(--shadow);
}
header h1{margin:0 0 .35rem;font-weight:800;letter-spacing:.2px}
header p{margin:.25rem 0 0;color:var(--muted)}
.tag{
display:inline-flex;align-items:center;gap:.4rem;
padding:.2rem .6rem;margin-top:.5rem;margin-right:.5rem;
border:1px solid var(--line);border-radius:999px;color:var(--muted);font-size:.85rem
}
.card{
background:var(--card);border:1px solid var(--line);border-radius:16px;
padding:1.1rem 1.2rem;margin:1rem 0;box-shadow:var(--shadow)
}
label{display:block;font-weight:650;margin:.65rem 0 .35rem}
input[type="text"], input[type="file"]{
width:100%;padding:.7rem .8rem;border:1px solid var(--line);border-radius:12px;background:transparent;color:var(--ink);
outline:none;transition:border .15s, box-shadow .15s
}
input[type="text"]:focus, input[type="file"]:focus{
border-color:var(--accent); box-shadow:0 0 0 3px color-mix(in srgb, var(--ring) 35%, transparent);
}
.btn{
display:inline-block; background:var(--brand); color:#fff; text-decoration:none;
border:0; padding:.6rem .9rem; border-radius:10px; cursor:pointer;
transition:transform .06s ease, opacity .15s ease;
margin:.25rem .35rem .25rem 0; font-weight:600; font-size:.95rem
}
.btn:hover{opacity:.92; transform:translateY(-1px)}
.btn:focus{outline:3px solid color-mix(in srgb, var(--ring) 45%, transparent); outline-offset:2px}
.btn--ghost{
background:transparent;color:var(--ink);border:1px solid var(--line)
}
.muted{color:var(--muted)}
.warn{padding:.7rem .9rem;background:var(--warn-bg);border:1px solid var(--warn-line);border-radius:12px;margin:.8rem 0}
ul{padding-left:1.2rem;margin:.6rem 0}
li{margin:.25rem 0}
pre{
background:var(--code-bg);color:var(--code-ink);
padding:1rem;border-radius:12px;overflow:auto;border:1px solid #0b1220;
}
details summary{cursor:pointer; list-style:none}
details summary::marker, details summary::-webkit-details-marker{display:none}
details summary{display:flex; align-items:center; gap:.5rem; font-weight:700}
details[open] summary{opacity:.85}
.grid{
display:grid; gap:1rem;
grid-template-columns:1fr;
}
@media (min-width:860px){
.grid{grid-template-columns:1fr 1fr}
}
.meta{display:flex; flex-wrap:wrap; gap:.4rem .6rem; align-items:center}
.pill{
display:inline-flex; align-items:center; gap:.4rem;
border:1px solid var(--line); border-radius:999px; padding:.15rem .55rem; color:var(--muted); font-size:.85rem
}
</style>
</head>
<body>
<div class="wrap">
<header>
<h1>CRAAP Bot</h1>
<p class="muted">URL/DOI or PDF → quick quality check for scholarly sources</p>
<span class="tag">By: NADYA W</span>
</header>
<div class="card">
<form method="POST" action="{{ url_for('analyze') }}" enctype="multipart/form-data">
<label for="paper_source">URL or DOI</label>
<input id="paper_source" type="text" name="paper_source" placeholder="https://doi.org/10.xxxx/..."/>
<label for="pdf">Or upload PDF</label>
<input id="pdf" type="file" name="pdf" accept="application/pdf"/>
<label for="assignment_context">Assignment context (optional)</label>
<input id="assignment_context" type="text" name="assignment_context" placeholder="e.g., AI for zoonotic disease 2023–2025"/>
<div style="margin-top:.9rem">
<button class="btn" type="submit">Analyze</button>
<a class="btn btn--ghost" href="{{ url_for('index') }}">Reset</a>
</div>
<p class="muted" style="margin:.6rem 0 0">Tip: DOI or full PDF gives best results. Partial PDFs limit Accuracy/Purpose.</p>
</form>
</div>
{% if result %}
{% if warnings %}
<div class="warn">⚠️ {{ warnings|join(' · ') }}</div>
{% endif %}
<div class="card">
<h2 style="margin-top:0">CRAAP Evaluation Summary</h2>
<p style="margin:.25rem 0 0"><strong>{{ result.metadata.title or '[unknown title]' }}</strong></p>
<p class="muted" style="margin:.25rem 0 .75rem">
{{ (result.metadata.authors or [])|join(', ') }} · {{ result.metadata.venue or 'unknown venue' }}{% if result.metadata.year %} · {{ result.metadata.year }}{% endif %}
</p>
{% set s2 = result.enrichment.s2 if result.enrichment else None %}
{% set doi = result.metadata.identifier.doi if result.metadata and result.metadata.identifier else None %}
{% set src_url = result.metadata.identifier.url if result.metadata and result.metadata.identifier else None %}
<p>
{% if doi %}
<a class="btn" href="https://doi.org/{{ doi }}" target="_blank" rel="noopener">Open DOI</a>
{% elif src_url %}
<a class="btn" href="{{ src_url }}" target="_blank" rel="noopener">Open Source</a>
{% endif %}
{% if s2 and s2.url %}
<a class="btn" href="{{ s2.url }}" target="_blank" rel="noopener">Semantic Scholar</a>
{% endif %}
{% if s2 and s2.openAccessPdf %}
<a class="btn" href="{{ s2.openAccessPdf }}" target="_blank" rel="noopener">Open Access PDF</a>
{% endif %}
<a class="btn btn--ghost" href="https://scholar.google.com/scholar?q={{ (result.metadata.title or doi or '')|urlencode }}" target="_blank" rel="noopener">Google Scholar</a>
</p>
{% if s2 %}
<div class="meta" style="margin:.25rem 0 .75rem">
{% if s2.journal %}<span class="pill">📘 {{ s2.journal }}</span>{% endif %}
{% if s2.publicationDate %}<span class="pill">🗓 {{ s2.publicationDate }}</span>{% endif %}
<span class="pill">🔗 Citations: {{ s2.citationCount if s2.citationCount is not none else "?" }}</span>
{% if s2.influentialCitationCount is not none %}<span class="pill">⭐ Influential: {{ s2.influentialCitationCount }}</span>{% endif %}
{% if s2.isOpenAccess %}<span class="pill">🟢 Open Access</span>{% endif %}
{% if s2.publicationTypes %}<span class="pill">🧭 {{ s2.publicationTypes|join(', ') }}</span>{% endif %}
</div>
{% endif %}
<div class="grid">
<div class="card" style="margin:0">
<h3 style="margin-top:0">Scores</h3>
<ul>
<li><strong>Currency</strong>: {{ result.craap.Currency.score }}/5 — {{ result.craap.Currency.evidence }}</li>
<li><strong>Relevance</strong>: {{ result.craap.Relevance.score }}/5 — {{ result.craap.Relevance.evidence }}</li>
<li><strong>Authority</strong>: {{ result.craap.Authority.score }}/5 — {{ result.craap.Authority.evidence }}</li>
<li><strong>Accuracy</strong>: {{ result.craap.Accuracy.score }}/5 — {{ result.craap.Accuracy.evidence }}</li>
<li><strong>Purpose</strong>: {{ result.craap.Purpose.score }}/5 — {{ result.craap.Purpose.evidence }}</li>
</ul>
<p><strong>Overall:</strong> {{ result.overall.average }} — <em>{{ result.overall.verdict }}</em></p>
</div>
<div class="card" style="margin:0">
<h3 style="margin-top:0">What to verify next</h3>
<ol>
<li>Confirm publication date &amp; peer-review at the DOI/URL.</li>
<li>Skim methods/results for sample size, validation, limitations.</li>
<li>Check author affiliations and profiles (Semantic Scholar/ORCID).</li>
<li>Look for funding/conflict-of-interest statements.</li>
<li>Search for newer papers (last 1–2 years) that cite or challenge it.</li>
</ol>
</div>
</div>
</div>
<div class="card">
<details>
<summary>View raw JSON</summary>
<pre>{{ result | tojson(indent=2) }}</pre>
</details>
</div>
{% endif %}
</div>
</body>
</html>
"""
@app.route("/", methods=["GET"])
def index():
    """Serve the landing page: the submission form with no results section."""
    blank_page = render_template_string(INDEX_HTML, result=None, warnings=None)
    return blank_page
@app.route("/analyze", methods=["POST"])
def analyze():
    """Handle the HTML form: evaluate a URL/DOI or an uploaded PDF.

    A non-empty ``paper_source`` field takes precedence over an uploaded
    ``pdf`` file. With neither present the user is redirected to the form.
    PDF uploads are always scored as provisional; URL/DOI input is
    provisional only when fetching produced warnings.

    Returns:
        The rendered page with the evaluation result and any warnings.
    """
    paper_source = _clean(request.form.get("paper_source", ""))
    assignment_context = _clean(request.form.get("assignment_context", ""))
    provisional = False
    warnings: List[str] = []
    meta, text = {}, ""

    if paper_source:
        meta, text, w = fetch_url_metadata(paper_source)
        warnings.extend(w)
    elif "pdf" in request.files and request.files["pdf"].filename:
        meta, text, w = extract_pdf_text_and_guess_meta(request.files["pdf"])
        warnings.extend(w)
        provisional = True
    else:
        return redirect(url_for("index"))

    result = aggregate_scores(meta, text, assignment_context, provisional or bool(warnings))

    # Enrichment is best-effort; lookup problems are surfaced as warnings.
    doi = (meta.get("identifier") or {}).get("doi")
    enrichment, ewarns = fetch_semantic_scholar(doi)
    result["enrichment"] = enrichment
    warnings.extend(ewarns)

    if not text:
        # The em dash was stored mis-encoded ("β€”"); use the real character.
        warnings.append("Full text not available — Accuracy/Purpose are provisional. Provide a DOI/URL or full PDF for deeper evaluation.")
    return render_template_string(INDEX_HTML, result=result, warnings=warnings)
@app.route("/api/analyze", methods=["POST"])
def api_analyze():
    """JSON API: evaluate a URL/DOI given as ``paper_source``.

    Expects a JSON object with ``paper_source`` (required) and
    ``assignment_context`` (optional). PDF upload is not supported here.

    Returns:
        ``{"result": ..., "warnings": [...]}`` on success, or a JSON error
        with HTTP 400 when ``paper_source`` is missing or the body is not a
        usable JSON object.
    """
    # silent=True: a malformed body or wrong content type yields None instead
    # of an HTML error page, so API clients always get JSON back.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}

    raw_source = data.get("paper_source")
    raw_context = data.get("assignment_context")
    # Non-string values are treated as absent instead of crashing on .strip().
    paper_source = _clean(raw_source if isinstance(raw_source, str) else "")
    assignment_context = _clean(raw_context if isinstance(raw_context, str) else "")

    if not paper_source:
        return jsonify({"error":"Provide paper_source (URL/DOI) or use /analyze form for PDF upload"}), 400

    meta, text, warnings = fetch_url_metadata(paper_source)
    # Any fetch warning makes the evaluation provisional.
    result = aggregate_scores(meta, text, assignment_context, bool(warnings))

    doi = (meta.get("identifier") or {}).get("doi")
    enrichment, ewarns = fetch_semantic_scholar(doi)
    result["enrichment"] = enrichment
    warnings.extend(ewarns)
    return jsonify({"result": result, "warnings": warnings})
if __name__ == "__main__":
    # Debug mode turns on the interactive Werkzeug debugger, which lets anyone
    # who can reach the server run code. The app binds to all interfaces, so
    # debug is off unless explicitly requested with FLASK_DEBUG=1/true/yes.
    debug_enabled = os.getenv("FLASK_DEBUG", "").lower() in ("1", "true", "yes")
    app.run(host="0.0.0.0", port=8000, debug=debug_enabled)