import datetime
import io
import json
import os
import re
from typing import Any, Dict, List, Optional, Tuple

import requests
from bs4 import BeautifulSoup
from flask import Flask, jsonify, redirect, render_template_string, request, url_for
from flask_cors import CORS
from PyPDF2 import PdfReader

app = Flask(__name__)
CORS(app, resources={r"/api/*": {"origins": "*"}})
app.config["MAX_CONTENT_LENGTH"] = 16 * 1024 * 1024  # 16 MB upload cap

THIS_YEAR = datetime.date.today().year
# Crossref-style DOI pattern: "10.<registrant>/<suffix>".
DOI_RX = re.compile(r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)", re.I)
S2_API_KEY = os.getenv("SEMANTIC_SCHOLAR_API_KEY")


def _clean(s: Optional[str]) -> str:
    """Return *s* stripped of surrounding whitespace, or "" for None."""
    return (s or "").strip()


def year_from_any(x: str) -> Optional[int]:
    """Extract the first plausible 4-digit year (1900-2100) from *x*, else None."""
    if not x:
        return None
    m = re.search(r"(19|20)\d{2}", x)
    if m:
        y = int(m.group(0))
        if 1900 <= y <= 2100:
            return y
    return None


def fetch_url_metadata(url_or_doi: str):
    """Fetch a URL (or bare DOI, resolved via doi.org) and scrape citation metadata.

    Returns (meta, text_excerpt, warnings):
      meta -- dict with title/authors/venue/year/identifier guessed from
              Highwire-style <meta> tags, falling back to page content;
      text_excerpt -- up to 4000 chars of abstract text (may be empty);
      warnings -- list of human-readable problems (fetch failure, etc.).
    On fetch failure returns ({}, "", [error message]).
    """
    warnings = []
    url = url_or_doi
    m = DOI_RX.search(url_or_doi)
    # A bare DOI (not already an http(s) URL) is resolved through doi.org.
    if m and not url_or_doi.lower().startswith("http"):
        url = f"https://doi.org/{m.group(1)}"
    try:
        r = requests.get(url, timeout=20, headers={"User-Agent": "CRAAPBot"})
        r.raise_for_status()
    except Exception as e:
        return {}, "", [f"Failed to fetch URL/DOI: {e}"]
    html = r.text
    soup = BeautifulSoup(html, "html.parser")
    meta = {}

    def mget(*names):
        # First match among <meta name=...> / <meta property=...> with content.
        for n in names:
            tag = soup.find("meta", attrs={"name": n}) or soup.find("meta", attrs={"property": n})
            if tag and tag.get("content"):
                return tag["content"]
        return None

    meta["title"] = _clean(mget("citation_title") or (soup.title.string if soup.title else ""))
    authors = soup.find_all("meta", attrs={"name": "citation_author"})
    if authors:
        meta["authors"] = [_clean(a.get("content", "")) for a in authors if _clean(a.get("content", ""))]
    else:
        meta["authors"] = [_clean(mget("author") or "")]
    meta["authors"] = [a for a in meta["authors"] if a]
    meta["venue"] = _clean(mget("citation_journal_title") or mget("og:site_name") or "")
    y = year_from_any(_clean(mget("citation_publication_date") or mget("date") or mget("article:published_time") or ""))
    meta["year"] = y if y else year_from_any(html)
    # Search the raw HTML for a DOI once (the original scanned it twice).
    doi_match = DOI_RX.search(html)
    doi = _clean(mget("citation_doi") or (doi_match.group(1) if doi_match else ""))
    meta["identifier"] = {"doi": doi if doi else None, "url": url}
    abst = mget("citation_abstract")
    if not abst:
        # Fallback: any section/div/p whose visible text starts with "abstract".
        absnode = soup.find(lambda tag: tag.name in ["section", "div", "p"] and tag.get_text(strip=True).lower().startswith("abstract"))
        if absnode:
            abst = absnode.get_text(" ", strip=True)
    text_excerpt = (abst or "")[:4000]
    return meta, text_excerpt, warnings


def extract_pdf_text_and_guess_meta(file_storage):
    """Read an uploaded PDF and heuristically guess citation metadata.

    Extracts text from at most the first 10 pages; the first 1-2 pages are
    additionally mined for a title line, an author line, a year and a DOI.
    Returns (meta, body_text[:20000], warnings); on parse failure returns
    ({}, "", [error message]).
    """
    warnings = []
    try:
        data = file_storage.read()
        reader = PdfReader(io.BytesIO(data))
        n = len(reader.pages)
        if n == 0:
            return {}, "", ["PDF appears empty."]
        head_pages = min(2, n)
        body_pages = min(10, n)
        # Extract each page once; the head pages are a prefix of the body
        # pages (the original extracted pages 0-1 twice).
        body = [reader.pages[i].extract_text() or "" for i in range(body_pages)]
        head_txt = "\n".join(body[:head_pages])
        body_txt = "\n".join(body)
        lines = [l.strip() for l in head_txt.splitlines() if l.strip()]
        # Heuristic: first non-empty line of page 1 is the title.
        title = lines[0] if lines else ""
        authors_line = ""
        for l in lines[0:10]:
            # A line with capitalized names and a separator looks like authors.
            if re.search(r"[A-Z][a-z]+(?:\s[A-Z]\.){0,3}", l) and ("," in l or " and " in l.lower()):
                authors_line = l
                break
        authors = [a.strip() for a in re.split(r",|;| and ", authors_line) if a.strip()] if authors_line else []
        venue = ""  # not derivable from PDF text with these heuristics
        y = year_from_any(head_txt)
        m = DOI_RX.search(head_txt) or DOI_RX.search(body_txt)
        doi = m.group(1) if m else None
        meta = {
            "title": _clean(title),
            "authors": authors,
            "venue": _clean(venue),
            "year": y,
            "identifier": {"doi": doi, "url": None},
        }
        # BUGFIX: warn when the PDF is LONGER than what we extracted
        # (original tested `body_pages < 5`, i.e. warned only for short,
        # fully-extracted PDFs and never for truncated long ones).
        if n > body_pages:
            warnings.append("Only a small portion of the PDF text was extracted; Accuracy/Purpose may be provisional.")
        return meta, body_txt[:20000], warnings
    except Exception as e:
        return {}, "", [f"Failed to parse PDF: {e}"]


def fetch_semantic_scholar(doi: str):
    """Fetch enrichment from Semantic Scholar Graph API by DOI.

    Returns (enrichment, warnings). A 404 (unknown DOI) yields ({}, []);
    other failures yield ({}, [error message]).
    """
    if not doi:
        return {}, ["No DOI provided"]
    base = f"https://api.semanticscholar.org/graph/v1/paper/DOI:{requests.utils.quote(doi)}"
    fields = ",".join([
        "title", "year", "publicationDate", "journal", "url",
        "isOpenAccess", "openAccessPdf", "citationCount", "influentialCitationCount",
        "authors.name", "fieldsOfStudy", "publicationTypes",
    ])
    headers = {"User-Agent": "CRAAPBot"}
    if S2_API_KEY:
        headers["x-api-key"] = S2_API_KEY
    try:
        r = requests.get(base, params={"fields": fields}, headers=headers, timeout=12)
        if r.status_code == 404:
            return {}, []  # DOI simply not indexed; not an error
        r.raise_for_status()
        p = r.json()
        enrich = {
            "s2": {
                "title": p.get("title"),
                "year": p.get("year"),
                "publicationDate": p.get("publicationDate"),
                "journal": (p.get("journal") or {}).get("name"),
                "url": p.get("url"),
                "isOpenAccess": p.get("isOpenAccess"),
                "openAccessPdf": (p.get("openAccessPdf") or {}).get("url"),
                "citationCount": p.get("citationCount"),
                "influentialCitationCount": p.get("influentialCitationCount"),
                "authors": [a.get("name") for a in (p.get("authors") or []) if a.get("name")],
                "fieldsOfStudy": p.get("fieldsOfStudy"),
                "publicationTypes": p.get("publicationTypes"),
            }
        }
        return enrich, []
    except Exception as e:
        return {}, [f"Semantic Scholar lookup failed: {e}"]


def score_currency(year: Optional[int]):
    """Score Currency 1-5 by publication age; returns (score, evidence, checks)."""
    if not year:
        return 2, "Publication year unknown.", ["Could not find a clear date; treat with caution."]
    age = max(0, THIS_YEAR - year)
    if age <= 2:
        return 5, f"Published in {year} (≤2 years old).", ["Recent for fast-moving fields."]
    if age <= 5:
        return 4, f"Published in {year} (~{age} years old).", []
    if age <= 10:
        return 3, f"Published in {year} (~{age} years old).", []
    return 2, f"Published in {year} (>10 years old).", ["Potentially outdated."]


def score_authority(meta: Dict[str, Any]):
    """Score Authority 1-5 from venue, DOI and author count; returns (score, evidence)."""
    score = 1
    notes = []
    if meta.get("venue"):
        score += 1
        notes.append(f"Venue: {meta['venue']}.")
    if meta.get("identifier", {}).get("doi"):
        score += 1
        notes.append("Has DOI.")
    if meta.get("authors"):
        a_count = len(meta["authors"])
        if a_count >= 3:  # multi-author work earns an extra point
            score += 1
        notes.append(f"Authors: {a_count}.")
    return min(score, 5), "; ".join(notes) if notes else "Insufficient venue/author info."


def score_accuracy(text_excerpt: str):
    """Score Accuracy 2-5 by counting methodological keywords in the text."""
    keys_present = sum(1 for k in ["methods", "materials", "results", "limitations", "confidence interval", "validation", "dataset", "sample size"] if k in text_excerpt.lower())
    if not text_excerpt:
        return 2, "No body text available; cannot inspect methods."
    if keys_present >= 5:
        return 5, "Detailed methodological cues detected (methods/results/validation/etc.)."
    if keys_present >= 3:
        return 4, "Some methodological cues present."
    if keys_present >= 1:
        return 3, "Limited methodological signals."
    return 2, "Minimal methodological detail detected (likely a commentary/overview)."


def score_purpose(text_excerpt: str):
    """Score Purpose 2-4 from promotional language vs. disclosure statements."""
    lower = text_excerpt.lower()
    bias_hits = any(w in lower for w in ["sponsored", "advertisement", "marketing"])
    conflicts = "conflict of interest" in lower or "competing interest" in lower
    funding = "funding" in lower or "grant" in lower
    if bias_hits:
        return 2, "Potential promotional language detected."
    if conflicts and not funding:
        return 3, "Conflicts noted, funding unclear."
    if funding or conflicts:
        return 4, "Academic tone with disclosures/funding statements."
    return 4, "Academic/educational purpose inferred."


def score_relevance(assignment_context: str, meta: Dict[str, Any], text_excerpt: str):
    """Score Relevance 2-5 by word overlap between assignment context and source text."""
    if not assignment_context:
        return 4, "General relevance assumed (no assignment context provided)."
    ctx = assignment_context.lower()
    hay = (meta.get("title", "") + " " + text_excerpt).lower()
    # Count distinct context words (>=4 letters) that appear in title+excerpt.
    hits = sum(1 for tok in set(re.findall(r"[a-zA-Z]{4,}", ctx)) if tok in hay)
    if hits >= 6:
        return 5, "Strong topical overlap with assignment context."
    if hits >= 3:
        return 4, "Good topical overlap."
    if hits >= 1:
        return 3, "Partial topical overlap."
    return 2, "Low topical overlap; may be tangential."
def aggregate_scores(meta: Dict[str,Any], text: str, assignment_context: str, provisional: bool): currency_score, currency_evd, currency_checks = score_currency(meta.get("year")) authority_score, authority_evd = score_authority(meta) accuracy_score, accuracy_evd = score_accuracy(text) purpose_score, purpose_evd = score_purpose(text) relevance_score, relevance_evd = score_relevance(assignment_context, meta, text) if provisional: accuracy_score = min(accuracy_score, 3) purpose_score = min(purpose_score, 4) craap = { "Currency": {"score": currency_score, "evidence": currency_evd, "checks": currency_checks}, "Relevance": {"score": relevance_score, "evidence": relevance_evd}, "Authority": {"score": authority_score, "evidence": authority_evd}, "Accuracy": {"score": accuracy_score, "evidence": accuracy_evd}, "Purpose": {"score": purpose_score, "evidence": purpose_evd} } avg = round(sum(v["score"] for v in craap.values())/5, 2) verdict = "use" if avg >= 4.0 else ("use with caution" if avg >= 2.5 else "avoid") return {"metadata": meta, "craap": craap, "overall": {"average": avg, "verdict": verdict}} INDEX_HTML = """
URL/DOI or PDF → quick quality check for scholarly sources
By: NADYA W{{ result.metadata.title or '[unknown title]' }}
{{ (result.metadata.authors or [])|join(', ') }} · {{ result.metadata.venue or 'unknown venue' }}{% if result.metadata.year %} · {{ result.metadata.year }}{% endif %}
{% set s2 = result.enrichment.s2 if result.enrichment else None %} {% set doi = result.metadata.identifier.doi if result.metadata and result.metadata.identifier else None %} {% set src_url = result.metadata.identifier.url if result.metadata and result.metadata.identifier else None %}{% if doi %} Open DOI {% elif src_url %} Open Source {% endif %} {% if s2 and s2.url %} Semantic Scholar {% endif %} {% if s2 and s2.openAccessPdf %} Open Access PDF {% endif %} Google Scholar
{% if s2 %} {% endif %}Overall: {{ result.overall.average }} — {{ result.overall.verdict }}
{{ result | tojson(indent=2) }}