import io
import os
import re
import datetime
from typing import Dict, Any, List, Optional

from flask import Flask, request, jsonify, render_template_string, redirect, url_for
from flask_cors import CORS
import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader

app = Flask(__name__)
CORS(app, resources={r"/api/*": {"origins": "*"}})
app.config["MAX_CONTENT_LENGTH"] = 16 * 1024 * 1024  # 16 MB upload cap

THIS_YEAR = datetime.date.today().year
DOI_RX = re.compile(r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)", re.I)
S2_API_KEY = os.getenv("SEMANTIC_SCHOLAR_API_KEY")


def _clean(s: Optional[str]) -> str:
    return (s or "").strip()


def year_from_any(x: str) -> Optional[int]:
    """Pull the first plausible four-digit year (1900-2100) out of a string."""
    if not x:
        return None
    m = re.search(r"(19|20)\d{2}", x)
    if m:
        y = int(m.group(0))
        if 1900 <= y <= 2100:
            return y
    return None


def fetch_url_metadata(url_or_doi: str):
    """Resolve a URL or bare DOI and scrape citation metadata from the page."""
    warnings = []
    url = url_or_doi
    m = DOI_RX.search(url_or_doi)
    if m and not url_or_doi.lower().startswith("http"):
        url = f"https://doi.org/{m.group(1)}"
    try:
        r = requests.get(url, timeout=20, headers={"User-Agent": "CRAAPBot"})
        r.raise_for_status()
    except Exception as e:
        return {}, "", [f"Failed to fetch URL/DOI: {e}"]
    html = r.text
    soup = BeautifulSoup(html, "html.parser")
    meta = {}

    def mget(*names):
        # Check both name= and property= meta tags for each candidate key.
        for n in names:
            tag = soup.find("meta", attrs={"name": n}) or soup.find("meta", attrs={"property": n})
            if tag and tag.get("content"):
                return tag["content"]
        return None

    meta["title"] = _clean(mget("citation_title") or (soup.title.string if soup.title else ""))
    authors = soup.find_all("meta", attrs={"name": "citation_author"})
    if authors:
        meta["authors"] = [_clean(a.get("content", "")) for a in authors if _clean(a.get("content", ""))]
    else:
        meta["authors"] = [_clean(mget("author") or "")]
    meta["authors"] = [a for a in meta["authors"] if a]
    meta["venue"] = _clean(mget("citation_journal_title") or mget("og:site_name") or "")
    y = year_from_any(_clean(mget("citation_publication_date") or mget("date")
                             or mget("article:published_time") or ""))
    meta["year"] = y if y else year_from_any(html)
    doi = _clean(mget("citation_doi") or (DOI_RX.search(html).group(1) if DOI_RX.search(html) else ""))
    meta["identifier"] = {"doi": doi if doi else None, "url": url}
    abst = mget("citation_abstract")
    if not abst:
        absnode = soup.find(lambda tag: tag.name in ["section", "div", "p"]
                            and tag.get_text(strip=True).lower().startswith("abstract"))
        if absnode:
            abst = absnode.get_text(" ", strip=True)
    text_excerpt = (abst or "")[:4000]
    return meta, text_excerpt, warnings


def extract_pdf_text_and_guess_meta(file_storage):
    """Extract text from an uploaded PDF and heuristically guess title/authors/DOI."""
    warnings = []
    try:
        data = file_storage.read()
        reader = PdfReader(io.BytesIO(data))
        n = len(reader.pages)
        if n == 0:
            return {}, "", ["PDF appears empty."]
        head_pages = min(2, n)
        body_pages = min(10, n)
        head = []
        body = []
        for i in range(head_pages):
            head.append(reader.pages[i].extract_text() or "")
        for i in range(body_pages):
            body.append(reader.pages[i].extract_text() or "")
        head_txt = "\n".join(head)
        body_txt = "\n".join(body)
        lines = [l.strip() for l in head_txt.splitlines() if l.strip()]
        title = lines[0] if lines else ""
        authors_line = ""
        for l in lines[0:10]:
            # Crude author-line heuristic: capitalized names plus a comma or "and".
            if re.search(r"[A-Z][a-z]+(?:\s[A-Z]\.){0,3}", l) and ("," in l or " and " in l.lower()):
                authors_line = l
                break
        authors = [a.strip() for a in re.split(r",|;| and ", authors_line) if a.strip()] if authors_line else []
        venue = ""  # venue is rarely recoverable from raw PDF text
        y = year_from_any(head_txt)
        m = DOI_RX.search(head_txt) or DOI_RX.search(body_txt)
        doi = m.group(1) if m else None
        meta = {
            "title": _clean(title),
            "authors": authors,
            "venue": _clean(venue),
            "year": y,
            "identifier": {"doi": doi, "url": None},
        }
        if body_pages < 5:
            warnings.append("Only a small portion of the PDF text was extracted; "
                            "Accuracy/Purpose may be provisional.")
        return meta, body_txt[:20000], warnings
    except Exception as e:
        return {}, "", [f"Failed to parse PDF: {e}"]
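
# A quick REPL sketch of the helpers above. The first line is a verifiable
# doctest; the DOI in the second is a placeholder, not a real record, and the
# call hits the network:
#
#   >>> year_from_any("Received 12 March 2019; accepted 2020")
#   2019
#   >>> meta, excerpt, warns = fetch_url_metadata("10.1000/xyz123")  # placeholder DOI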
"identifier": {"doi": doi, "url": None} } if body_pages < 5: warnings.append("Only a small portion of the PDF text was extracted; Accuracy/Purpose may be provisional.") return meta, body_txt[:20000], warnings except Exception as e: return {}, "", [f"Failed to parse PDF: {e}"] def fetch_semantic_scholar(doi: str): """Fetch enrichment from Semantic Scholar Graph API by DOI.""" if not doi: return {}, ["No DOI provided"] base = f"https://api.semanticscholar.org/graph/v1/paper/DOI:{requests.utils.quote(doi)}" fields = ",".join([ "title","year","publicationDate","journal","url", "isOpenAccess","openAccessPdf","citationCount","influentialCitationCount", "authors.name","fieldsOfStudy","publicationTypes" ]) headers = {"User-Agent":"CRAAPBot"} if S2_API_KEY: headers["x-api-key"] = S2_API_KEY try: r = requests.get(base, params={"fields":fields}, headers=headers, timeout=12) if r.status_code == 404: return {}, [] r.raise_for_status() p = r.json() enrich = { "s2": { "title": p.get("title"), "year": p.get("year"), "publicationDate": p.get("publicationDate"), "journal": (p.get("journal") or {}).get("name"), "url": p.get("url"), "isOpenAccess": p.get("isOpenAccess"), "openAccessPdf": (p.get("openAccessPdf") or {}).get("url"), "citationCount": p.get("citationCount"), "influentialCitationCount": p.get("influentialCitationCount"), "authors": [a.get("name") for a in (p.get("authors") or []) if a.get("name")], "fieldsOfStudy": p.get("fieldsOfStudy"), "publicationTypes": p.get("publicationTypes"), } } return enrich, [] except Exception as e: return {}, [f"Semantic Scholar lookup failed: {e}"] def score_currency(year: Optional[int]): if not year: return 2, "Publication year unknown.", ["Could not find a clear date; treat with caution."] age = max(0, THIS_YEAR - year) if age <= 2: return 5, f"Published in {year} (≤2 years old).", ["Recent for fast-moving fields."] if age <= 5: return 4, f"Published in {year} (~{age} years old).", [] if age <= 10: return 3, f"Published in {year} (~{age} years old).", [] return 2, f"Published in {year} (>10 years old).", ["Potentially outdated."] def score_authority(meta: Dict[str,Any]): score = 1 notes = [] if meta.get("venue"): score += 1; notes.append(f"Venue: {meta['venue']}.") if meta.get("identifier",{}).get("doi"): score += 1; notes.append("Has DOI.") if meta.get("authors"): a_count = len(meta["authors"]) if a_count >= 3: score += 1 notes.append(f"Authors: {a_count}.") return min(score,5), "; ".join(notes) if notes else "Insufficient venue/author info." def score_accuracy(text_excerpt: str): keys_present = sum(1 for k in ["methods","materials","results","limitations","confidence interval","validation","dataset","sample size"] if k in text_excerpt.lower()) if not text_excerpt: return 2, "No body text available; cannot inspect methods." if keys_present >= 5: return 5, "Detailed methodological cues detected (methods/results/validation/etc.)." if keys_present >= 3: return 4, "Some methodological cues present." if keys_present >= 1: return 3, "Limited methodological signals." return 2, "Minimal methodological detail detected (likely a commentary/overview)." def score_purpose(text_excerpt: str): lower = text_excerpt.lower() bias_hits = any(w in lower for w in ["sponsored", "advertisement", "marketing"]) conflicts = "conflict of interest" in lower or "competing interest" in lower funding = "funding" in lower or "grant" in lower if bias_hits: return 2, "Potential promotional language detected." if conflicts and not funding: return 3, "Conflicts noted, funding unclear." 
    if funding or conflicts:
        return 4, "Academic tone with disclosures/funding statements."
    return 4, "Academic/educational purpose inferred."


def score_relevance(assignment_context: str, meta: Dict[str, Any], text_excerpt: str):
    if not assignment_context:
        return 4, "General relevance assumed (no assignment context provided)."
    ctx = assignment_context.lower()
    hay = (meta.get("title", "") + " " + text_excerpt).lower()
    hits = sum(1 for tok in set(re.findall(r"[a-zA-Z]{4,}", ctx)) if tok in hay)
    if hits >= 6:
        return 5, "Strong topical overlap with assignment context."
    if hits >= 3:
        return 4, "Good topical overlap."
    if hits >= 1:
        return 3, "Partial topical overlap."
    return 2, "Low topical overlap; may be tangential."


def aggregate_scores(meta: Dict[str, Any], text: str, assignment_context: str, provisional: bool):
    currency_score, currency_evd, currency_checks = score_currency(meta.get("year"))
    authority_score, authority_evd = score_authority(meta)
    accuracy_score, accuracy_evd = score_accuracy(text)
    purpose_score, purpose_evd = score_purpose(text)
    relevance_score, relevance_evd = score_relevance(assignment_context, meta, text)
    if provisional:
        # Cap Accuracy/Purpose when only partial text was available.
        accuracy_score = min(accuracy_score, 3)
        purpose_score = min(purpose_score, 4)
    craap = {
        "Currency": {"score": currency_score, "evidence": currency_evd, "checks": currency_checks},
        "Relevance": {"score": relevance_score, "evidence": relevance_evd},
        "Authority": {"score": authority_score, "evidence": authority_evd},
        "Accuracy": {"score": accuracy_score, "evidence": accuracy_evd},
        "Purpose": {"score": purpose_score, "evidence": purpose_evd},
    }
    avg = round(sum(v["score"] for v in craap.values()) / 5, 2)
    verdict = "use" if avg >= 4.0 else ("use with caution" if avg >= 2.5 else "avoid")
    return {"metadata": meta, "craap": craap, "overall": {"average": avg, "verdict": verdict}}
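
# End-to-end sketch of the scoring pipeline with a hand-built (hypothetical)
# metadata dict -- no network or PDF needed. Currency 5, Authority 4,
# Accuracy 4, Purpose 4, Relevance 4 -> average 4.2:
#
#   >>> demo_meta = {"title": "A study", "authors": ["A. One", "B. Two", "C. Three"],
#   ...              "venue": "Journal of Examples", "year": THIS_YEAR - 1,
#   ...              "identifier": {"doi": "10.1000/xyz123", "url": None}}
#   >>> aggregate_scores(demo_meta, "methods results validation dataset", "", False)["overall"]["verdict"]
#   'use'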

INDEX_HTML = """<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <title>CRAAP Bot · Flask</title>
</head>
<body>
  <h1>CRAAP Bot</h1>
  <p>URL/DOI or PDF → quick quality check for scholarly sources</p>
  <p>By: NADYA W</p>

  <!-- Minimal form markup; the field names match what the /analyze route reads. -->
  <form method="post" action="{{ url_for('analyze') }}" enctype="multipart/form-data">
    <input type="text" name="paper_source" placeholder="URL or DOI">
    <textarea name="assignment_context" placeholder="Assignment context (optional)"></textarea>
    <input type="file" name="pdf" accept="application/pdf">
    <button type="submit">Analyze</button>
    <button type="reset">Reset</button>
  </form>

  <p>Tip: DOI or full PDF gives best results. Partial PDFs limit Accuracy/Purpose.</p>

  {% if result %}
    {% if warnings %}
      <p>⚠️ {{ warnings|join(' · ') }}</p>
    {% endif %}

    <h2>CRAAP Evaluation Summary</h2>
    <h3>{{ result.metadata.title or '[unknown title]' }}</h3>
    <p>{{ (result.metadata.authors or [])|join(', ') }} · {{ result.metadata.venue or 'unknown venue' }}{% if result.metadata.year %} · {{ result.metadata.year }}{% endif %}</p>

    {% set s2 = result.enrichment.s2 if result.enrichment else None %}
    {% set doi = result.metadata.identifier.doi if result.metadata and result.metadata.identifier else None %}
    {% set src_url = result.metadata.identifier.url if result.metadata and result.metadata.identifier else None %}

    <p>
      {% if doi %}<a href="https://doi.org/{{ doi }}">Open DOI</a>{% elif src_url %}<a href="{{ src_url }}">Open Source</a>{% endif %}
      {% if s2 and s2.url %}<a href="{{ s2.url }}">Semantic Scholar</a>{% endif %}
      {% if s2 and s2.openAccessPdf %}<a href="{{ s2.openAccessPdf }}">Open Access PDF</a>{% endif %}
      <a href="https://scholar.google.com/scholar?q={{ (result.metadata.title or '')|urlencode }}">Google Scholar</a>
    </p>

    {% if s2 %}
      <p>
        {% if s2.journal %}📘 {{ s2.journal }}{% endif %}
        {% if s2.publicationDate %}🗓 {{ s2.publicationDate }}{% endif %}
        🔗 Citations: {{ s2.citationCount if s2.citationCount is not none else "?" }}
        {% if s2.influentialCitationCount is not none %}⭐ Influential: {{ s2.influentialCitationCount }}{% endif %}
        {% if s2.isOpenAccess %}🟢 Open Access{% endif %}
        {% if s2.publicationTypes %}🧭 {{ s2.publicationTypes|join(', ') }}{% endif %}
      </p>
    {% endif %}

    <h3>Scores</h3>
    <ul>
      <li>Currency: {{ result.craap.Currency.score }}/5 — {{ result.craap.Currency.evidence }}</li>
      <li>Relevance: {{ result.craap.Relevance.score }}/5 — {{ result.craap.Relevance.evidence }}</li>
      <li>Authority: {{ result.craap.Authority.score }}/5 — {{ result.craap.Authority.evidence }}</li>
      <li>Accuracy: {{ result.craap.Accuracy.score }}/5 — {{ result.craap.Accuracy.evidence }}</li>
      <li>Purpose: {{ result.craap.Purpose.score }}/5 — {{ result.craap.Purpose.evidence }}</li>
    </ul>

    <p><strong>Overall:</strong> {{ result.overall.average }} — {{ result.overall.verdict }}</p>

    <h3>What to verify next</h3>
    <ol>
      <li>Confirm publication date &amp; peer review at the DOI/URL.</li>
      <li>Skim methods/results for sample size, validation, limitations.</li>
      <li>Check author affiliations and profiles (Semantic Scholar/ORCID).</li>
      <li>Look for funding/conflict-of-interest statements.</li>
      <li>Search for newer papers (last 1–2 years) that cite or challenge it.</li>
    </ol>

    <details>
      <summary>View raw JSON</summary>
      <pre>{{ result | tojson(indent=2) }}</pre>
    </details>
  {% endif %}
</body>
</html>
""" @app.route("/", methods=["GET"]) def index(): return render_template_string(INDEX_HTML, result=None, warnings=None) @app.route("/analyze", methods=["POST"]) def analyze(): paper_source = _clean(request.form.get("paper_source", "")) assignment_context = _clean(request.form.get("assignment_context", "")) provisional = False warnings: List[str] = [] meta, text = {}, "" if paper_source: meta, text, w = fetch_url_metadata(paper_source) warnings.extend(w) elif "pdf" in request.files and request.files["pdf"].filename: meta, text, w = extract_pdf_text_and_guess_meta(request.files["pdf"]) warnings.extend(w); provisional = True else: return redirect(url_for("index")) result = aggregate_scores(meta, text, assignment_context, provisional or bool(warnings)) doi = (meta.get("identifier") or {}).get("doi") enrichment, ewarns = fetch_semantic_scholar(doi) result["enrichment"] = enrichment warnings.extend(ewarns) if not text: warnings.append("Full text not available — Accuracy/Purpose are provisional. Provide a DOI/URL or full PDF for deeper evaluation.") return render_template_string(INDEX_HTML, result=result, warnings=warnings) @app.route("/api/analyze", methods=["POST"]) def api_analyze(): data = request.json or {} paper_source = _clean(data.get("paper_source","")) assignment_context = _clean(data.get("assignment_context","")) meta, text, warnings = ({}, "", []) provisional = False if paper_source: meta, text, warnings = fetch_url_metadata(paper_source) else: return jsonify({"error":"Provide paper_source (URL/DOI) or use /analyze form for PDF upload"}), 400 result = aggregate_scores(meta, text, assignment_context, provisional or bool(warnings)) doi = (meta.get("identifier") or {}).get("doi") enrichment, ewarns = fetch_semantic_scholar(doi) result["enrichment"] = enrichment warnings.extend(ewarns) return jsonify({"result": result, "warnings": warnings}) if __name__ == "__main__": app.run(host="0.0.0.0", port=8000, debug=True)