nadyaw committed on
Commit
f7d03c6
·
verified ·
1 Parent(s): 871d6a8

Add semantic scholar functionality and fix html

Browse files
Files changed (1) hide show
  1. app.py +283 -39
app.py CHANGED
@@ -109,6 +109,45 @@ def extract_pdf_text_and_guess_meta(file_storage):
109
  except Exception as e:
110
  return {}, "", [f"Failed to parse PDF: {e}"]
111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  def score_currency(year: Optional[int]):
113
  if not year:
114
  return 2, "Publication year unknown.", ["Could not find a clear date; treat with caution."]
@@ -184,46 +223,243 @@ def aggregate_scores(meta: Dict[str,Any], text: str, assignment_context: str, pr
184
  verdict = "use" if avg >= 4.0 else ("use with caution" if avg >= 2.5 else "avoid")
185
  return {"metadata": meta, "craap": craap, "overall": {"average": avg, "verdict": verdict}}
186
 
 
187
  INDEX_HTML = """
188
- <!doctype html><html><head><meta charset="utf-8"/><title>CRAAP Bot (Flask)</title>
189
- <meta name="viewport" content="width=device-width, initial-scale=1">
190
- <style>
191
- body{font:16px system-ui,Segoe UI,Roboto,sans-serif;max-width:880px;margin:2rem auto;padding:0 1rem}
192
- form,.card{border:1px solid #e5e7eb;border-radius:12px;padding:1rem;margin:1rem 0;background:#fff;box-shadow:0 1px 2px rgba(0,0,0,.04)}
193
- label{display:block;font-weight:600;margin:.5rem 0 .25rem}
194
- input[type="text"],textarea{width:100%;padding:.6rem .7rem;border:1px solid #d1d5db;border-radius:8px}
195
- input[type="file"]{margin:.25rem 0 .75rem}button{background:#111827;color:#fff;border:0;padding:.6rem 1rem;border-radius:8px;cursor:pointer}
196
- pre{background:#0b1020;color:#d7e7ff;padding:1rem;border-radius:12px;overflow:auto}.muted{color:#6b7280}.warn{padding:.6rem .8rem;background:#fff7ed;border:1px solid #fed7aa;border-radius:8px;margin:.5rem 0}
197
- .tag{display:inline-block;padding:.1rem .5rem;border-radius:999px;border:1px solid #d1d5db;margin-right:.4rem}
198
- </style></head><body>
199
- <header><h1>CRAAP Bot</h1><span class="tag">By: NADYA W</span></header>
200
- <div class="card"><form method="POST" action="{{ url_for('analyze') }}" enctype="multipart/form-data">
201
- <label>URL or DOI</label><input type="text" name="paper_source" placeholder="https://doi.org/10.xxxx/..."/>
202
- <label>Or upload PDF</label><input type="file" name="pdf" accept="application/pdf"/>
203
- <label>Assignment context (optional)</label><input type="text" name="assignment_context" placeholder="e.g., AI for zoonotic disease 2023-2025"/>
204
- <button type="submit">Analyze</button></form>
205
- <p class="muted">Tip: DOI or full PDF gives best results. Partial PDFs limit Accuracy/Purpose.</p></div>
206
- {% if result %}{% if warnings %}<div class="warn">⚠️ {{ warnings|join(' · ') }}</div>{% endif %}
207
- <div class="card"><h2>JSON</h2><pre>{{ result | tojson(indent=2) }}</pre></div>
208
- <div class="card"><h2>CRAAP Evaluation Summary</h2>
209
- <p><strong>{{ result.metadata.title or '[unknown title]' }}</strong></p>
210
- <p class="muted">{{ (result.metadata.authors or [])|join(', ') }} · {{ result.metadata.venue or 'unknown venue' }}{% if result.metadata.year %} · {{ result.metadata.year }}{% endif %}</p>
211
- <ul>
212
- <li><strong>Currency</strong>: {{ result.craap.Currency.score }}/5 — {{ result.craap.Currency.evidence }}</li>
213
- <li><strong>Relevance</strong>: {{ result.craap.Relevance.score }}/5 — {{ result.craap.Relevance.evidence }}</li>
214
- <li><strong>Authority</strong>: {{ result.craap.Authority.score }}/5 — {{ result.craap.Authority.evidence }}</li>
215
- <li><strong>Accuracy</strong>: {{ result.craap.Accuracy.score }}/5 — {{ result.craap.Accuracy.evidence }}</li>
216
- <li><strong>Purpose</strong>: {{ result.craap.Purpose.score }}/5 — {{ result.craap.Purpose.evidence }}</li>
217
- </ul><p><strong>Overall:</strong> {{ result.overall.average }} — <em>{{ result.overall.verdict }}</em></p>
218
- <h3>What to verify next</h3>
219
- <ol>
220
- <li>Confirm publication date & peer-review at the DOI/URL.</li>
221
- <li>Skim methods/results for sample size, validation, limitations.</li>
222
- <li>Check author affiliations and profiles (Semantic Scholar/ORCID).</li>
223
- <li>Look for funding/conflict-of-interest statements.</li>
224
- <li>Search for newer papers (last 1–2 years) that cite or challenge it.</li>
225
- </ol>
226
- </div>{% endif %}</body></html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  """
228
 
229
  @app.route("/", methods=["GET"])
@@ -246,6 +482,10 @@ def analyze():
246
  else:
247
  return redirect(url_for("index"))
248
  result = aggregate_scores(meta, text, assignment_context, provisional or bool(warnings))
 
 
 
 
249
  if not text:
250
  warnings.append("Full text not available — Accuracy/Purpose are provisional. Provide a DOI/URL or full PDF for deeper evaluation.")
251
  return render_template_string(INDEX_HTML, result=result, warnings=warnings)
@@ -262,6 +502,10 @@ def api_analyze():
262
  else:
263
  return jsonify({"error":"Provide paper_source (URL/DOI) or use /analyze form for PDF upload"}), 400
264
  result = aggregate_scores(meta, text, assignment_context, provisional or bool(warnings))
 
 
 
 
265
  return jsonify({"result": result, "warnings": warnings})
266
 
267
  if __name__ == "__main__":
 
109
  except Exception as e:
110
  return {}, "", [f"Failed to parse PDF: {e}"]
111
 
112
def fetch_semantic_scholar(doi: str):
    """Fetch enrichment metadata from the Semantic Scholar Graph API by DOI.

    The lookup is best-effort: every failure is reported as a warning string
    instead of being raised, so callers can always unpack the result.

    Args:
        doi: DOI of the paper. Surrounding whitespace and a leading
            ``doi:`` / ``https://doi.org/`` / ``https://dx.doi.org/`` prefix
            are ignored. ``None`` or an empty value means "no DOI".

    Returns:
        A ``(enrichment, warnings)`` tuple.

        * ``enrichment`` is ``{"s2": {...}}`` on success, with the keys
          ``title``, ``year``, ``publicationDate``, ``journal``, ``url``,
          ``isOpenAccess``, ``openAccessPdf``, ``citationCount``,
          ``influentialCitationCount``, ``authors``, ``fieldsOfStudy`` and
          ``publicationTypes``. It is ``{}`` when no DOI was given, the API
          answered 404, or the lookup failed.
        * ``warnings`` is a list of human-readable messages; it is empty on
          success and on a 404 response.
    """
    doi = _normalize_doi(doi)
    if not doi:
        return {}, ["No DOI provided"]

    base = f"https://api.semanticscholar.org/graph/v1/paper/DOI:{requests.utils.quote(doi)}"
    fields = ",".join([
        "title", "year", "publicationDate", "journal", "url",
        "isOpenAccess", "openAccessPdf", "citationCount", "influentialCitationCount",
        "authors.name", "fieldsOfStudy", "publicationTypes",
    ])
    headers = {"User-Agent": "CRAAPBot/1.1"}
    # The API key is optional: the header is only sent when S2_API_KEY is set.
    if S2_API_KEY:
        headers["x-api-key"] = S2_API_KEY

    try:
        r = requests.get(base, params={"fields": fields}, headers=headers, timeout=12)
        # A 404 is returned without a warning: no enrichment, nothing to report.
        if r.status_code == 404:
            return {}, []
        r.raise_for_status()
        p = r.json()
        # Guard against a syntactically valid but unexpected payload (list,
        # string, null) so the failure is reported clearly.
        if not isinstance(p, dict):
            return {}, ["Semantic Scholar lookup failed: unexpected response format"]
        enrich = {
            "s2": {
                "title": p.get("title"),
                "year": p.get("year"),
                "publicationDate": p.get("publicationDate"),
                # "journal" and "openAccessPdf" are nested objects that may be null.
                "journal": (p.get("journal") or {}).get("name"),
                "url": p.get("url"),
                "isOpenAccess": p.get("isOpenAccess"),
                "openAccessPdf": (p.get("openAccessPdf") or {}).get("url"),
                "citationCount": p.get("citationCount"),
                "influentialCitationCount": p.get("influentialCitationCount"),
                # Skip null/malformed author entries instead of failing the
                # whole enrichment on one bad record.
                "authors": [
                    a["name"]
                    for a in (p.get("authors") or [])
                    if isinstance(a, dict) and a.get("name")
                ],
                "fieldsOfStudy": p.get("fieldsOfStudy"),
                "publicationTypes": p.get("publicationTypes"),
            }
        }
        return enrich, []
    except Exception as e:
        # Deliberately broad: enrichment is optional, so network errors, HTTP
        # errors and malformed responses all degrade to a warning.
        return {}, [f"Semantic Scholar lookup failed: {e}"]


def _normalize_doi(doi):
    """Return *doi* stripped of whitespace and common resolver prefixes ("" if missing)."""
    if not doi:
        return ""
    cleaned = str(doi).strip()
    lowered = cleaned.lower()
    prefixes = (
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
        "doi:",
    )
    for prefix in prefixes:
        if lowered.startswith(prefix):
            return cleaned[len(prefix):].strip()
    return cleaned
150
+
151
  def score_currency(year: Optional[int]):
152
  if not year:
153
  return 2, "Publication year unknown.", ["Could not find a clear date; treat with caution."]
 
223
  verdict = "use" if avg >= 4.0 else ("use with caution" if avg >= 2.5 else "avoid")
224
  return {"metadata": meta, "craap": craap, "overall": {"average": avg, "verdict": verdict}}
225
 
226
+
227
  INDEX_HTML = """
228
+ <!doctype html>
229
+ <html lang="en">
230
+ <head>
231
+ <meta charset="utf-8"/>
232
+ <title>CRAAP Bot · Flask</title>
233
+ <meta name="viewport" content="width=device-width, initial-scale=1">
234
+
235
+ <style>
236
+ :root{
237
+ --bg:#f8fafc;
238
+ --card:#ffffff;
239
+ --ink:#0f172a;
240
+ --muted:#64748b;
241
+ --line:#e2e8f0;
242
+ --brand:#111827;
243
+ --accent:#2563eb;
244
+ --warn-bg:#fff7ed;
245
+ --warn-line:#fed7aa;
246
+ --code-bg:#0b1020;
247
+ --code-ink:#d7e7ff;
248
+ --ring:#93c5fd;
249
+ --shadow:0 1px 2px rgba(0,0,0,.05), 0 10px 16px rgba(2,6,23,.04);
250
+ }
251
+ @media (prefers-color-scheme: dark){
252
+ :root{
253
+ --bg:#0b1220;
254
+ --card:#0f172a;
255
+ --ink:#e5e7eb;
256
+ --muted:#94a3b8;
257
+ --line:#1f2a44;
258
+ --brand:#e5e7eb;
259
+ --accent:#60a5fa;
260
+ --warn-bg:#2b1f12;
261
+ --warn-line:#9a5a25;
262
+ --code-bg:#030712;
263
+ --code-ink:#d7e7ff;
264
+ --ring:#2563eb;
265
+ --shadow:0 1px 2px rgba(0,0,0,.4), 0 12px 20px rgba(0,0,0,.35);
266
+ }
267
+ }
268
+
269
+ *{box-sizing:border-box}
270
+ html,body{height:100%}
271
+ body{
272
+ margin:0;
273
+ background:var(--bg);
274
+ color:var(--ink);
275
+ font:16px/1.55 system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, Apple Color Emoji, Segoe UI Emoji, Noto Color Emoji, sans-serif;
276
+ }
277
+
278
+ .wrap{max-width:980px;margin:2.2rem auto;padding:0 1rem}
279
+ header{
280
+ padding:1.25rem 1rem 1rem;
281
+ border-radius:16px;
282
+ background:linear-gradient(135deg, rgba(37,99,235,.10), rgba(2,6,23,.03));
283
+ border:1px solid var(--line);
284
+ box-shadow:var(--shadow);
285
+ }
286
+ header h1{margin:0 0 .35rem;font-weight:800;letter-spacing:.2px}
287
+ header p{margin:.25rem 0 0;color:var(--muted)}
288
+
289
+ .tag{
290
+ display:inline-flex;align-items:center;gap:.4rem;
291
+ padding:.2rem .6rem;margin-top:.5rem;margin-right:.5rem;
292
+ border:1px solid var(--line);border-radius:999px;color:var(--muted);font-size:.85rem
293
+ }
294
+
295
+ .card{
296
+ background:var(--card);border:1px solid var(--line);border-radius:16px;
297
+ padding:1.1rem 1.2rem;margin:1rem 0;box-shadow:var(--shadow)
298
+ }
299
+
300
+ label{display:block;font-weight:650;margin:.65rem 0 .35rem}
301
+ input[type="text"], input[type="file"]{
302
+ width:100%;padding:.7rem .8rem;border:1px solid var(--line);border-radius:12px;background:transparent;color:var(--ink);
303
+ outline:none;transition:border .15s, box-shadow .15s
304
+ }
305
+ input[type="text"]:focus, input[type="file"]:focus{
306
+ border-color:var(--accent); box-shadow:0 0 0 3px color-mix(in srgb, var(--ring) 35%, transparent);
307
+ }
308
+
309
+ .btn{
310
+ display:inline-block; background:var(--brand); color:#fff; text-decoration:none;
311
+ border:0; padding:.6rem .9rem; border-radius:10px; cursor:pointer;
312
+ transition:transform .06s ease, opacity .15s ease;
313
+ margin:.25rem .35rem .25rem 0; font-weight:600; font-size:.95rem
314
+ }
315
+ .btn:hover{opacity:.92; transform:translateY(-1px)}
316
+ .btn:focus{outline:3px solid color-mix(in srgb, var(--ring) 45%, transparent); outline-offset:2px}
317
+ .btn--ghost{
318
+ background:transparent;color:var(--ink);border:1px solid var(--line)
319
+ }
320
+
321
+ .muted{color:var(--muted)}
322
+ .warn{padding:.7rem .9rem;background:var(--warn-bg);border:1px solid var(--warn-line);border-radius:12px;margin:.8rem 0}
323
+
324
+ ul{padding-left:1.2rem;margin:.6rem 0}
325
+ li{margin:.25rem 0}
326
+
327
+ pre{
328
+ background:var(--code-bg);color:var(--code-ink);
329
+ padding:1rem;border-radius:12px;overflow:auto;border:1px solid #0b1220;
330
+ }
331
+
332
+ details summary{cursor:pointer; list-style:none}
333
+ details summary::marker, details summary::-webkit-details-marker{display:none}
334
+ details summary{display:flex; align-items:center; gap:.5rem; font-weight:700}
335
+ details[open] summary{opacity:.85}
336
+
337
+ .grid{
338
+ display:grid; gap:1rem;
339
+ grid-template-columns:1fr;
340
+ }
341
+ @media (min-width:860px){
342
+ .grid{grid-template-columns:1fr 1fr}
343
+ }
344
+
345
+ .meta{display:flex; flex-wrap:wrap; gap:.4rem .6rem; align-items:center}
346
+ .pill{
347
+ display:inline-flex; align-items:center; gap:.4rem;
348
+ border:1px solid var(--line); border-radius:999px; padding:.15rem .55rem; color:var(--muted); font-size:.85rem
349
+ }
350
+ </style>
351
+ </head>
352
+
353
+ <body>
354
+ <div class="wrap">
355
+ <header>
356
+ <h1>CRAAP Bot</h1>
357
+ <p class="muted">URL/DOI or PDF → quick quality check for scholarly sources</p>
358
+ <span class="tag">By: NADYA W</span>
359
+ <span class="tag">Flask</span>
360
+ </header>
361
+
362
+ <div class="card">
363
+ <form method="POST" action="{{ url_for('analyze') }}" enctype="multipart/form-data">
364
+ <label for="paper_source">URL or DOI</label>
365
+ <input id="paper_source" type="text" name="paper_source" placeholder="https://doi.org/10.xxxx/..."/>
366
+
367
+ <label for="pdf">Or upload PDF</label>
368
+ <input id="pdf" type="file" name="pdf" accept="application/pdf"/>
369
+
370
+ <label for="assignment_context">Assignment context (optional)</label>
371
+ <input id="assignment_context" type="text" name="assignment_context" placeholder="e.g., AI for zoonotic disease 2023–2025"/>
372
+
373
+ <div style="margin-top:.9rem">
374
+ <button class="btn" type="submit">Analyze</button>
375
+ <a class="btn btn--ghost" href="{{ url_for('index') }}">Reset</a>
376
+ </div>
377
+ <p class="muted" style="margin:.6rem 0 0">Tip: DOI or full PDF gives best results. Partial PDFs limit Accuracy/Purpose.</p>
378
+ </form>
379
+ </div>
380
+
381
+ {% if result %}
382
+ {% if warnings %}
383
+ <div class="warn">⚠️ {{ warnings|join(' · ') }}</div>
384
+ {% endif %}
385
+
386
+ <div class="card">
387
+ <h2 style="margin-top:0">CRAAP Evaluation Summary</h2>
388
+
389
+ <p style="margin:.25rem 0 0"><strong>{{ result.metadata.title or '[unknown title]' }}</strong></p>
390
+ <p class="muted" style="margin:.25rem 0 .75rem">
391
+ {{ (result.metadata.authors or [])|join(', ') }} · {{ result.metadata.venue or 'unknown venue' }}{% if result.metadata.year %} · {{ result.metadata.year }}{% endif %}
392
+ </p>
393
+
394
+ {% set s2 = result.enrichment.s2 if result.enrichment else None %}
395
+ {% set doi = result.metadata.identifier.doi if result.metadata and result.metadata.identifier else None %}
396
+ {% set src_url = result.metadata.identifier.url if result.metadata and result.metadata.identifier else None %}
397
+
398
+ <p>
399
+ {% if doi %}
400
+ <a class="btn" href="https://doi.org/{{ doi }}" target="_blank" rel="noopener">Open DOI</a>
401
+ {% elif src_url %}
402
+ <a class="btn" href="{{ src_url }}" target="_blank" rel="noopener">Open Source</a>
403
+ {% endif %}
404
+
405
+ {% if s2 and s2.url %}
406
+ <a class="btn" href="{{ s2.url }}" target="_blank" rel="noopener">Semantic Scholar</a>
407
+ {% endif %}
408
+
409
+ {% if s2 and s2.openAccessPdf %}
410
+ <a class="btn" href="{{ s2.openAccessPdf }}" target="_blank" rel="noopener">Open Access PDF</a>
411
+ {% endif %}
412
+
413
+ <a class="btn btn--ghost" href="https://scholar.google.com/scholar?q={{ (result.metadata.title or doi or '')|urlencode }}" target="_blank" rel="noopener">Google Scholar</a>
414
+ </p>
415
+
416
+ {% if s2 %}
417
+ <div class="meta" style="margin:.25rem 0 .75rem">
418
+ {% if s2.journal %}<span class="pill">📘 {{ s2.journal }}</span>{% endif %}
419
+ {% if s2.publicationDate %}<span class="pill">🗓 {{ s2.publicationDate }}</span>{% endif %}
420
+ <span class="pill">🔗 Citations: {{ s2.citationCount if s2.citationCount is not none else "?" }}</span>
421
+ {% if s2.influentialCitationCount is not none %}<span class="pill">⭐ Influential: {{ s2.influentialCitationCount }}</span>{% endif %}
422
+ {% if s2.isOpenAccess %}<span class="pill">🟢 Open Access</span>{% endif %}
423
+ {% if s2.publicationTypes %}<span class="pill">🧭 {{ s2.publicationTypes|join(', ') }}</span>{% endif %}
424
+ </div>
425
+ {% endif %}
426
+
427
+ <div class="grid">
428
+ <div class="card" style="margin:0">
429
+ <h3 style="margin-top:0">Scores</h3>
430
+ <ul>
431
+ <li><strong>Currency</strong>: {{ result.craap.Currency.score }}/5 — {{ result.craap.Currency.evidence }}</li>
432
+ <li><strong>Relevance</strong>: {{ result.craap.Relevance.score }}/5 — {{ result.craap.Relevance.evidence }}</li>
433
+ <li><strong>Authority</strong>: {{ result.craap.Authority.score }}/5 — {{ result.craap.Authority.evidence }}</li>
434
+ <li><strong>Accuracy</strong>: {{ result.craap.Accuracy.score }}/5 — {{ result.craap.Accuracy.evidence }}</li>
435
+ <li><strong>Purpose</strong>: {{ result.craap.Purpose.score }}/5 — {{ result.craap.Purpose.evidence }}</li>
436
+ </ul>
437
+ <p><strong>Overall:</strong> {{ result.overall.average }} — <em>{{ result.overall.verdict }}</em></p>
438
+ </div>
439
+
440
+ <div class="card" style="margin:0">
441
+ <h3 style="margin-top:0">What to verify next</h3>
442
+ <ol>
443
+ <li>Confirm publication date & peer-review at the DOI/URL.</li>
444
+ <li>Skim methods/results for sample size, validation, limitations.</li>
445
+ <li>Check author affiliations and profiles (Semantic Scholar/ORCID).</li>
446
+ <li>Look for funding/conflict-of-interest statements.</li>
447
+ <li>Search for newer papers (last 1–2 years) that cite or challenge it.</li>
448
+ </ol>
449
+ </div>
450
+ </div>
451
+ </div>
452
+
453
+ <div class="card">
454
+ <details>
455
+ <summary>View raw JSON</summary>
456
+ <pre>{{ result | tojson(indent=2) }}</pre>
457
+ </details>
458
+ </div>
459
+ {% endif %}
460
+ </div>
461
+ </body>
462
+ </html>
463
  """
464
 
465
  @app.route("/", methods=["GET"])
 
482
  else:
483
  return redirect(url_for("index"))
484
  result = aggregate_scores(meta, text, assignment_context, provisional or bool(warnings))
485
+ doi = (meta.get("identifier") or {}).get("doi")
486
+ enrichment, ewarns = fetch_semantic_scholar(doi)
487
+ result["enrichment"] = enrichment
488
+ warnings.extend(ewarns)
489
  if not text:
490
  warnings.append("Full text not available — Accuracy/Purpose are provisional. Provide a DOI/URL or full PDF for deeper evaluation.")
491
  return render_template_string(INDEX_HTML, result=result, warnings=warnings)
 
502
  else:
503
  return jsonify({"error":"Provide paper_source (URL/DOI) or use /analyze form for PDF upload"}), 400
504
  result = aggregate_scores(meta, text, assignment_context, provisional or bool(warnings))
505
+ doi = (meta.get("identifier") or {}).get("doi")
506
+ enrichment, ewarns = fetch_semantic_scholar(doi)
507
+ result["enrichment"] = enrichment
508
+ warnings.extend(ewarns)
509
  return jsonify({"result": result, "warnings": warnings})
510
 
511
  if __name__ == "__main__":