nadyaw committed on
Commit
871d6a8
·
verified ·
1 Parent(s): c82c1cc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -103
app.py CHANGED
@@ -1,4 +1,4 @@
1
- import io, re, json, datetime, os
2
  from typing import Dict, Any, List, Tuple, Optional
3
 
4
  from flask import Flask, request, jsonify, render_template_string, redirect, url_for
@@ -11,10 +11,10 @@ app = Flask(__name__)
11
  CORS(app, resources={r"/api/*": {"origins": "*"}})
12
 
13
  app.config["MAX_CONTENT_LENGTH"] = 16 * 1024 * 1024 # 16 MB upload cap
 
14
  THIS_YEAR = datetime.date.today().year
15
  DOI_RX = re.compile(r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)", re.I)
16
 
17
- # ----------------- Helpers
18
  def _clean(s: Optional[str]) -> str:
19
  return (s or "").strip()
20
 
@@ -27,8 +27,7 @@ def year_from_any(x: str) -> Optional[int]:
27
  return y
28
  return None
29
 
30
- def fetch_url_metadata(url_or_doi: str) -> Tuple[Dict[str, Any], str, List[str]]:
31
- """Return (metadata, fulltext_excerpt, warnings) for a URL or DOI."""
32
  warnings = []
33
  url = url_or_doi
34
  m = DOI_RX.search(url_or_doi)
@@ -68,8 +67,7 @@ def fetch_url_metadata(url_or_doi: str) -> Tuple[Dict[str, Any], str, List[str]]
68
  text_excerpt = (abst or "")[:4000]
69
  return meta, text_excerpt, warnings
70
 
71
- def extract_pdf_text_and_guess_meta(file_storage) -> Tuple[Dict[str, Any], str, List[str]]:
72
- """Return (metadata, body_text, warnings)."""
73
  warnings = []
74
  try:
75
  data = file_storage.read()
@@ -111,48 +109,7 @@ def extract_pdf_text_and_guess_meta(file_storage) -> Tuple[Dict[str, Any], str,
111
  except Exception as e:
112
  return {}, "", [f"Failed to parse PDF: {e}"]
113
 
114
- # ----------------- Semantic Scholar integration
115
- SEM_SCH_FIELDS = (
116
- "title,year,venue,url,isOpenAccess,openAccessPdf,"
117
- "citationCount,referenceCount,publicationTypes,externalIds,"
118
- "authors.name,authors.hIndex"
119
- )
120
-
121
- def fetch_semantic_scholar(doi: Optional[str]) -> Tuple[Dict[str, Any], List[str]]:
122
- if not doi:
123
- return {}, ["Semantic Scholar: DOI missing; lookup skipped."]
124
- url = f"https://api.semanticscholar.org/graph/v1/paper/DOI:{doi}"
125
- try:
126
- r = requests.get(url, params={"fields": SEM_SCH_FIELDS}, timeout=15)
127
- if r.status_code == 404:
128
- return {}, [f"Semantic Scholar: no record for DOI {doi}."]
129
- r.raise_for_status()
130
- return r.json(), []
131
- except Exception as e:
132
- return {}, [f"Semantic Scholar error: {e}"]
133
-
134
- def authority_boost_with_semantic(meta: Dict[str,Any], sem: Dict[str,Any]) -> Tuple[int, str]:
135
- if not sem:
136
- return 0, ""
137
- bonus = 0
138
- notes = []
139
- cit = sem.get("citationCount")
140
- if isinstance(cit, int):
141
- if cit >= 50: bonus += 2
142
- elif cit >= 10: bonus += 1
143
- notes.append(f"S2 citations: {cit}.")
144
- # Author h-index
145
- try:
146
- auths = sem.get("authors") or []
147
- max_h = max([a.get("hIndex", 0) or 0 for a in auths] or [0])
148
- if max_h >= 30: bonus += 1
149
- notes.append(f"Top author h-index: {max_h}.")
150
- except Exception:
151
- pass
152
- return min(bonus, 2), "; ".join(notes)
153
-
154
- # ----------------- Scoring
155
- def score_currency(year: Optional[int]) -> Tuple[int, str, List[str]]:
156
  if not year:
157
  return 2, "Publication year unknown.", ["Could not find a clear date; treat with caution."]
158
  age = max(0, THIS_YEAR - year)
@@ -161,7 +118,7 @@ def score_currency(year: Optional[int]) -> Tuple[int, str, List[str]]:
161
  if age <= 10: return 3, f"Published in {year} (~{age} years old).", []
162
  return 2, f"Published in {year} (>10 years old).", ["Potentially outdated."]
163
 
164
- def score_authority(meta: Dict[str,Any], sem: Optional[Dict[str,Any]] = None) -> Tuple[int, str]:
165
  score = 1
166
  notes = []
167
  if meta.get("venue"):
@@ -172,13 +129,9 @@ def score_authority(meta: Dict[str,Any], sem: Optional[Dict[str,Any]] = None) ->
172
  a_count = len(meta["authors"])
173
  if a_count >= 3: score += 1
174
  notes.append(f"Authors: {a_count}.")
175
- if sem:
176
- b, bnotes = authority_boost_with_semantic(meta, sem)
177
- score += b
178
- if bnotes: notes.append(bnotes)
179
  return min(score,5), "; ".join(notes) if notes else "Insufficient venue/author info."
180
 
181
- def score_accuracy(text_excerpt: str) -> Tuple[int, str]:
182
  keys_present = sum(1 for k in ["methods","materials","results","limitations","confidence interval","validation","dataset","sample size"] if k in text_excerpt.lower())
183
  if not text_excerpt:
184
  return 2, "No body text available; cannot inspect methods."
@@ -187,7 +140,7 @@ def score_accuracy(text_excerpt: str) -> Tuple[int, str]:
187
  if keys_present >= 1: return 3, "Limited methodological signals."
188
  return 2, "Minimal methodological detail detected (likely a commentary/overview)."
189
 
190
- def score_purpose(text_excerpt: str) -> Tuple[int, str]:
191
  lower = text_excerpt.lower()
192
  bias_hits = any(w in lower for w in ["sponsored", "advertisement", "marketing"])
193
  conflicts = "conflict of interest" in lower or "competing interest" in lower
@@ -200,7 +153,7 @@ def score_purpose(text_excerpt: str) -> Tuple[int, str]:
200
  return 4, "Academic tone with disclosures/funding statements."
201
  return 4, "Academic/educational purpose inferred."
202
 
203
- def score_relevance(assignment_context: str, meta: Dict[str,Any], text_excerpt: str) -> Tuple[int, str]:
204
  if not assignment_context:
205
  return 4, "General relevance assumed (no assignment context provided)."
206
  ctx = assignment_context.lower()
@@ -211,9 +164,9 @@ def score_relevance(assignment_context: str, meta: Dict[str,Any], text_excerpt:
211
  if hits >= 1: return 3, "Partial topical overlap."
212
  return 2, "Low topical overlap; may be tangential."
213
 
214
- def aggregate_scores_with_sem(meta: Dict[str,Any], text: str, assignment_context: str, provisional: bool, sem: Dict[str,Any]) -> Dict[str,Any]:
215
  currency_score, currency_evd, currency_checks = score_currency(meta.get("year"))
216
- authority_score, authority_evd = score_authority(meta, sem)
217
  accuracy_score, accuracy_evd = score_accuracy(text)
218
  purpose_score, purpose_evd = score_purpose(text)
219
  relevance_score, relevance_evd = score_relevance(assignment_context, meta, text)
@@ -229,25 +182,10 @@ def aggregate_scores_with_sem(meta: Dict[str,Any], text: str, assignment_context
229
  }
230
  avg = round(sum(v["score"] for v in craap.values())/5, 2)
231
  verdict = "use" if avg >= 4.0 else ("use with caution" if avg >= 2.5 else "avoid")
232
- return {
233
- "metadata": meta,
234
- "craap": craap,
235
- "overall": {"average": avg, "verdict": verdict},
236
- "external": {
237
- "semantic_scholar": {
238
- "url": sem.get("url"),
239
- "venue": sem.get("venue"),
240
- "year": sem.get("year"),
241
- "citationCount": sem.get("citationCount"),
242
- "referenceCount": sem.get("referenceCount"),
243
- "openAccessPdf": (sem.get("openAccessPdf") or {}).get("url") if sem else None
244
- }
245
- }
246
- }
247
 
248
- # ----------------- UI
249
  INDEX_HTML = """
250
- <!doctype html><html><head><meta charset="utf-8"/><title>CRAAP Bot</title>
251
  <meta name="viewport" content="width=device-width, initial-scale=1">
252
  <style>
253
  body{font:16px system-ui,Segoe UI,Roboto,sans-serif;max-width:880px;margin:2rem auto;padding:0 1rem}
@@ -255,17 +193,18 @@ form,.card{border:1px solid #e5e7eb;border-radius:12px;padding:1rem;margin:1rem
255
  label{display:block;font-weight:600;margin:.5rem 0 .25rem}
256
  input[type="text"],textarea{width:100%;padding:.6rem .7rem;border:1px solid #d1d5db;border-radius:8px}
257
  input[type="file"]{margin:.25rem 0 .75rem}button{background:#111827;color:#fff;border:0;padding:.6rem 1rem;border-radius:8px;cursor:pointer}
258
- .muted{color:#6b7280}.warn{padding:.6rem .8rem;background:#fff7ed;border:1px solid #fed7aa;border-radius:8px;margin:.5rem 0}
259
  .tag{display:inline-block;padding:.1rem .5rem;border-radius:999px;border:1px solid #d1d5db;margin-right:.4rem}
260
  </style></head><body>
261
- <header><h1>CRAAP Bot</h1><span class="tag">By: Nadya W</span></header>
262
  <div class="card"><form method="POST" action="{{ url_for('analyze') }}" enctype="multipart/form-data">
263
  <label>URL or DOI</label><input type="text" name="paper_source" placeholder="https://doi.org/10.xxxx/..."/>
264
  <label>Or upload PDF</label><input type="file" name="pdf" accept="application/pdf"/>
265
- <label>Assignment context (optional)</label><input type="text" name="assignment_context" placeholder="e.g., NTM plasmidome 2023-2025"/>
266
  <button type="submit">Analyze</button></form>
267
  <p class="muted">Tip: DOI or full PDF gives best results. Partial PDFs limit Accuracy/Purpose.</p></div>
268
  {% if result %}{% if warnings %}<div class="warn">⚠️ {{ warnings|join(' · ') }}</div>{% endif %}
 
269
  <div class="card"><h2>CRAAP Evaluation Summary</h2>
270
  <p><strong>{{ result.metadata.title or '[unknown title]' }}</strong></p>
271
  <p class="muted">{{ (result.metadata.authors or [])|join(', ') }} · {{ result.metadata.venue or 'unknown venue' }}{% if result.metadata.year %} · {{ result.metadata.year }}{% endif %}</p>
@@ -275,32 +214,55 @@ input[type="file"]{margin:.25rem 0 .75rem}button{background:#111827;color:#fff;b
275
  <li><strong>Authority</strong>: {{ result.craap.Authority.score }}/5 — {{ result.craap.Authority.evidence }}</li>
276
  <li><strong>Accuracy</strong>: {{ result.craap.Accuracy.score }}/5 — {{ result.craap.Accuracy.evidence }}</li>
277
  <li><strong>Purpose</strong>: {{ result.craap.Purpose.score }}/5 — {{ result.craap.Purpose.evidence }}</li>
278
- </ul><p><strong>Overall:</strong> {{ result.overall.average }} — <em>{{ result.overall.verdict }}</em></p></div>
279
-
280
- <div class="card"><h3>Research signals</h3>
281
- <ul>
282
- <li><strong>Semantic Scholar citations:</strong>
283
- {% if result.external.semantic_scholar.citationCount is not none %}
284
- {{ result.external.semantic_scholar.citationCount }}
285
- {% else %} n/a {% endif %}
286
- {% if result.external.semantic_scholar.url %} · <a href="{{ result.external.semantic_scholar.url }}" target="_blank">S2 record</a>{% endif %}
287
- {% if result.external.semantic_scholar.openAccessPdf %} · <a href="{{ result.external.semantic_scholar.openAccessPdf }}" target="_blank">OA PDF</a>{% endif %}
288
- </li>
289
- <li><strong>Venue/Year (S2):</strong>
290
- {{ result.external.semantic_scholar.venue or 'n/a' }}{% if result.external.semantic_scholar.year %} · {{ result.external.semantic_scholar.year }}{% endif %}
291
- </li>
292
- </ul></div>
293
-
294
- <div class="card"><h3>What to verify next</h3>
295
  <ol>
296
  <li>Confirm publication date & peer-review at the DOI/URL.</li>
297
- <li>Check methods/results for sample size, validation, limitations.</li>
298
- <li>Skim citing papers (via Semantic Scholar) for corroboration/critique.</li>
299
- <li>Review funding/conflicts. Look for replication or benchmarking work.</li>
300
- <li>If key to your assignment, read the full text (methods & supplements).</li>
301
- </ol></div>
302
- {% endif %}
303
- </body></html>
304
  """
305
 
306
- # -----------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io, re, json, datetime
2
  from typing import Dict, Any, List, Tuple, Optional
3
 
4
  from flask import Flask, request, jsonify, render_template_string, redirect, url_for
 
11
  CORS(app, resources={r"/api/*": {"origins": "*"}})
12
 
13
  app.config["MAX_CONTENT_LENGTH"] = 16 * 1024 * 1024 # 16 MB upload cap
14
+
15
  THIS_YEAR = datetime.date.today().year
16
  DOI_RX = re.compile(r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)", re.I)
17
 
 
18
  def _clean(s: Optional[str]) -> str:
19
  return (s or "").strip()
20
 
 
27
  return y
28
  return None
29
 
30
+ def fetch_url_metadata(url_or_doi: str):
 
31
  warnings = []
32
  url = url_or_doi
33
  m = DOI_RX.search(url_or_doi)
 
67
  text_excerpt = (abst or "")[:4000]
68
  return meta, text_excerpt, warnings
69
 
70
+ def extract_pdf_text_and_guess_meta(file_storage):
 
71
  warnings = []
72
  try:
73
  data = file_storage.read()
 
109
  except Exception as e:
110
  return {}, "", [f"Failed to parse PDF: {e}"]
111
 
112
+ def score_currency(year: Optional[int]):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  if not year:
114
  return 2, "Publication year unknown.", ["Could not find a clear date; treat with caution."]
115
  age = max(0, THIS_YEAR - year)
 
118
  if age <= 10: return 3, f"Published in {year} (~{age} years old).", []
119
  return 2, f"Published in {year} (>10 years old).", ["Potentially outdated."]
120
 
121
+ def score_authority(meta: Dict[str,Any]):
122
  score = 1
123
  notes = []
124
  if meta.get("venue"):
 
129
  a_count = len(meta["authors"])
130
  if a_count >= 3: score += 1
131
  notes.append(f"Authors: {a_count}.")
 
 
 
 
132
  return min(score,5), "; ".join(notes) if notes else "Insufficient venue/author info."
133
 
134
+ def score_accuracy(text_excerpt: str):
135
  keys_present = sum(1 for k in ["methods","materials","results","limitations","confidence interval","validation","dataset","sample size"] if k in text_excerpt.lower())
136
  if not text_excerpt:
137
  return 2, "No body text available; cannot inspect methods."
 
140
  if keys_present >= 1: return 3, "Limited methodological signals."
141
  return 2, "Minimal methodological detail detected (likely a commentary/overview)."
142
 
143
+ def score_purpose(text_excerpt: str):
144
  lower = text_excerpt.lower()
145
  bias_hits = any(w in lower for w in ["sponsored", "advertisement", "marketing"])
146
  conflicts = "conflict of interest" in lower or "competing interest" in lower
 
153
  return 4, "Academic tone with disclosures/funding statements."
154
  return 4, "Academic/educational purpose inferred."
155
 
156
+ def score_relevance(assignment_context: str, meta: Dict[str,Any], text_excerpt: str):
157
  if not assignment_context:
158
  return 4, "General relevance assumed (no assignment context provided)."
159
  ctx = assignment_context.lower()
 
164
  if hits >= 1: return 3, "Partial topical overlap."
165
  return 2, "Low topical overlap; may be tangential."
166
 
167
+ def aggregate_scores(meta: Dict[str,Any], text: str, assignment_context: str, provisional: bool):
168
  currency_score, currency_evd, currency_checks = score_currency(meta.get("year"))
169
+ authority_score, authority_evd = score_authority(meta)
170
  accuracy_score, accuracy_evd = score_accuracy(text)
171
  purpose_score, purpose_evd = score_purpose(text)
172
  relevance_score, relevance_evd = score_relevance(assignment_context, meta, text)
 
182
  }
183
  avg = round(sum(v["score"] for v in craap.values())/5, 2)
184
  verdict = "use" if avg >= 4.0 else ("use with caution" if avg >= 2.5 else "avoid")
185
+ return {"metadata": meta, "craap": craap, "overall": {"average": avg, "verdict": verdict}}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
 
187
  INDEX_HTML = """
188
+ <!doctype html><html><head><meta charset="utf-8"/><title>CRAAP Bot (Flask)</title>
189
  <meta name="viewport" content="width=device-width, initial-scale=1">
190
  <style>
191
  body{font:16px system-ui,Segoe UI,Roboto,sans-serif;max-width:880px;margin:2rem auto;padding:0 1rem}
 
193
  label{display:block;font-weight:600;margin:.5rem 0 .25rem}
194
  input[type="text"],textarea{width:100%;padding:.6rem .7rem;border:1px solid #d1d5db;border-radius:8px}
195
  input[type="file"]{margin:.25rem 0 .75rem}button{background:#111827;color:#fff;border:0;padding:.6rem 1rem;border-radius:8px;cursor:pointer}
196
+ pre{background:#0b1020;color:#d7e7ff;padding:1rem;border-radius:12px;overflow:auto}.muted{color:#6b7280}.warn{padding:.6rem .8rem;background:#fff7ed;border:1px solid #fed7aa;border-radius:8px;margin:.5rem 0}
197
  .tag{display:inline-block;padding:.1rem .5rem;border-radius:999px;border:1px solid #d1d5db;margin-right:.4rem}
198
  </style></head><body>
199
+ <header><h1>CRAAP Bot</h1><span class="tag">By: NADYA W</span></header>
200
  <div class="card"><form method="POST" action="{{ url_for('analyze') }}" enctype="multipart/form-data">
201
  <label>URL or DOI</label><input type="text" name="paper_source" placeholder="https://doi.org/10.xxxx/..."/>
202
  <label>Or upload PDF</label><input type="file" name="pdf" accept="application/pdf"/>
203
+ <label>Assignment context (optional)</label><input type="text" name="assignment_context" placeholder="e.g., AI for zoonotic disease 2023-2025"/>
204
  <button type="submit">Analyze</button></form>
205
  <p class="muted">Tip: DOI or full PDF gives best results. Partial PDFs limit Accuracy/Purpose.</p></div>
206
  {% if result %}{% if warnings %}<div class="warn">⚠️ {{ warnings|join(' · ') }}</div>{% endif %}
207
+ <div class="card"><h2>JSON</h2><pre>{{ result | tojson(indent=2) }}</pre></div>
208
  <div class="card"><h2>CRAAP Evaluation Summary</h2>
209
  <p><strong>{{ result.metadata.title or '[unknown title]' }}</strong></p>
210
  <p class="muted">{{ (result.metadata.authors or [])|join(', ') }} · {{ result.metadata.venue or 'unknown venue' }}{% if result.metadata.year %} · {{ result.metadata.year }}{% endif %}</p>
 
214
  <li><strong>Authority</strong>: {{ result.craap.Authority.score }}/5 — {{ result.craap.Authority.evidence }}</li>
215
  <li><strong>Accuracy</strong>: {{ result.craap.Accuracy.score }}/5 — {{ result.craap.Accuracy.evidence }}</li>
216
  <li><strong>Purpose</strong>: {{ result.craap.Purpose.score }}/5 — {{ result.craap.Purpose.evidence }}</li>
217
+ </ul><p><strong>Overall:</strong> {{ result.overall.average }} — <em>{{ result.overall.verdict }}</em></p>
218
+ <h3>What to verify next</h3>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  <ol>
220
  <li>Confirm publication date & peer-review at the DOI/URL.</li>
221
+ <li>Skim methods/results for sample size, validation, limitations.</li>
222
+ <li>Check author affiliations and profiles (Semantic Scholar/ORCID).</li>
223
+ <li>Look for funding/conflict-of-interest statements.</li>
224
+ <li>Search for newer papers (last 1–2 years) that cite or challenge it.</li>
225
+ </ol>
226
+ </div>{% endif %}</body></html>
 
227
  """
228
 
229
@app.route("/", methods=["GET"])
def index():
    """Render the landing page with no analysis result yet."""
    return render_template_string(INDEX_HTML, result=None, warnings=None)
232
+
233
@app.route("/analyze", methods=["POST"])
def analyze():
    """Handle the HTML form: score a URL/DOI or an uploaded PDF and render the page.

    PDF uploads are marked provisional; any extraction warning also forces the
    provisional flag when aggregating scores.
    """
    paper_source = _clean(request.form.get("paper_source", ""))
    assignment_context = _clean(request.form.get("assignment_context", ""))

    collected: List[str] = []
    provisional = False

    pdf_file = request.files.get("pdf")
    if paper_source:
        meta, text, extra = fetch_url_metadata(paper_source)
        collected.extend(extra)
    elif pdf_file is not None and pdf_file.filename:
        meta, text, extra = extract_pdf_text_and_guess_meta(pdf_file)
        collected.extend(extra)
        provisional = True
    else:
        # Neither a source string nor a file was supplied — back to the form.
        return redirect(url_for("index"))

    result = aggregate_scores(meta, text, assignment_context, provisional or bool(collected))
    if not text:
        collected.append("Full text not available — Accuracy/Purpose are provisional. Provide a DOI/URL or full PDF for deeper evaluation.")
    return render_template_string(INDEX_HTML, result=result, warnings=collected)
252
+
253
@app.route("/api/analyze", methods=["POST"])
def api_analyze():
    """JSON API: analyze a paper given by URL/DOI.

    Expects a JSON body like ``{"paper_source": "...", "assignment_context": "..."}``.
    Returns ``{"result": ..., "warnings": [...]}``, or a 400 error when no
    ``paper_source`` is supplied (PDF upload is only available via the HTML form).
    """
    # request.json raises (415/400) for a missing/non-JSON content type or a
    # malformed body, so the previous `request.json or {}` fallback never ran.
    # get_json(silent=True) returns None in those cases instead of raising.
    data = request.get_json(silent=True) or {}
    paper_source = _clean(data.get("paper_source", ""))
    assignment_context = _clean(data.get("assignment_context", ""))
    if not paper_source:
        return jsonify({"error":"Provide paper_source (URL/DOI) or use /analyze form for PDF upload"}), 400
    meta, text, warnings = fetch_url_metadata(paper_source)
    # Any fetch warning marks the aggregate score as provisional.
    result = aggregate_scores(meta, text, assignment_context, bool(warnings))
    return jsonify({"result": result, "warnings": warnings})
266
+
267
if __name__ == "__main__":
    # NOTE(review): debug=True enables the Werkzeug interactive debugger —
    # fine for local development, but it must be disabled in production.
    app.run(host="0.0.0.0", port=8000, debug=True)