nadyaw committed on
Commit
7ba09b3
·
verified ·
1 Parent(s): 961a777

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +10 -0
  2. app.py +268 -0
  3. requirements.txt +6 -0
Dockerfile ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+ ENV PYTHONDONTWRITEBYTECODE=1 \
3
+ PYTHONUNBUFFERED=1 \
4
+ PIP_NO_CACHE_DIR=1 \
5
+ PORT=7860
6
+ WORKDIR /app
7
+ COPY requirements.txt .
8
+ RUN pip install -r requirements.txt
9
+ COPY . .
10
+ CMD ["gunicorn","app:app","--preload","-w","2","-b","0.0.0.0:7860"]
app.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io, re, json, datetime
2
+ from typing import Dict, Any, List, Tuple, Optional
3
+
4
+ from flask import Flask, request, jsonify, render_template_string, redirect, url_for
5
+ from flask_cors import CORS
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+ from PyPDF2 import PdfReader
9
+
10
+ app = Flask(__name__)
11
+ CORS(app, resources={r"/api/*": {"origins": "*"}})
12
+
13
+ app.config["MAX_CONTENT_LENGTH"] = 16 * 1024 * 1024 # 16 MB upload cap
14
+
15
+ THIS_YEAR = datetime.date.today().year
16
+ DOI_RX = re.compile(r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)", re.I)
17
+
18
+ def _clean(s: Optional[str]) -> str:
19
+ return (s or "").strip()
20
+
21
+ def year_from_any(x: str) -> Optional[int]:
22
+ if not x: return None
23
+ m = re.search(r"(19|20)\d{2}", x)
24
+ if m:
25
+ y = int(m.group(0))
26
+ if 1900 <= y <= 2100:
27
+ return y
28
+ return None
29
+
30
+ def fetch_url_metadata(url_or_doi: str):
31
+ warnings = []
32
+ url = url_or_doi
33
+ m = DOI_RX.search(url_or_doi)
34
+ if m and not url_or_doi.lower().startswith("http"):
35
+ url = f"https://doi.org/{m.group(1)}"
36
+ try:
37
+ r = requests.get(url, timeout=20, headers={"User-Agent":"CRAAPBot/1.0"})
38
+ r.raise_for_status()
39
+ except Exception as e:
40
+ return {}, "", [f"Failed to fetch URL/DOI: {e}"]
41
+ html = r.text
42
+ soup = BeautifulSoup(html, "html.parser")
43
+ meta = {}
44
+ def mget(*names):
45
+ for n in names:
46
+ tag = soup.find("meta", attrs={"name": n}) or soup.find("meta", attrs={"property": n})
47
+ if tag and tag.get("content"):
48
+ return tag["content"]
49
+ return None
50
+ meta["title"] = _clean(mget("citation_title") or (soup.title.string if soup.title else ""))
51
+ authors = soup.find_all("meta", attrs={"name":"citation_author"})
52
+ if authors:
53
+ meta["authors"] = [_clean(a.get("content","")) for a in authors if _clean(a.get("content",""))]
54
+ else:
55
+ meta["authors"] = [_clean(mget("author") or "")]
56
+ meta["authors"] = [a for a in meta["authors"] if a]
57
+ meta["venue"] = _clean(mget("citation_journal_title") or mget("og:site_name") or "")
58
+ y = year_from_any(_clean(mget("citation_publication_date") or mget("date") or mget("article:published_time") or ""))
59
+ meta["year"] = y if y else year_from_any(html)
60
+ doi = _clean(mget("citation_doi") or (DOI_RX.search(html).group(1) if DOI_RX.search(html) else ""))
61
+ meta["identifier"] = {"doi": doi if doi else None, "url": url}
62
+ abst = mget("citation_abstract")
63
+ if not abst:
64
+ absnode = soup.find(lambda tag: tag.name in ["section","div","p"] and tag.get_text(strip=True).lower().startswith("abstract"))
65
+ if absnode:
66
+ abst = absnode.get_text(" ", strip=True)
67
+ text_excerpt = (abst or "")[:4000]
68
+ return meta, text_excerpt, warnings
69
+
70
+ def extract_pdf_text_and_guess_meta(file_storage):
71
+ warnings = []
72
+ try:
73
+ data = file_storage.read()
74
+ reader = PdfReader(io.BytesIO(data))
75
+ n = len(reader.pages)
76
+ if n == 0:
77
+ return {}, "", ["PDF appears empty."]
78
+ head_pages = min(2, n)
79
+ body_pages = min(10, n)
80
+ head = []
81
+ body = []
82
+ for i in range(head_pages):
83
+ head.append(reader.pages[i].extract_text() or "")
84
+ for i in range(body_pages):
85
+ body.append(reader.pages[i].extract_text() or "")
86
+ head_txt = "\n".join(head)
87
+ body_txt = "\n".join(body)
88
+ lines = [l.strip() for l in head_txt.splitlines() if l.strip()]
89
+ title = lines[0] if lines else ""
90
+ authors_line = ""
91
+ for l in lines[0:10]:
92
+ if re.search(r"[A-Z][a-z]+(?:\s[A-Z]\.){0,3}", l) and ("," in l or " and " in l.lower()):
93
+ authors_line = l; break
94
+ authors = [a.strip() for a in re.split(r",|;| and ", authors_line) if a.strip()] if authors_line else []
95
+ venue = ""
96
+ y = year_from_any(head_txt)
97
+ m = DOI_RX.search(head_txt) or DOI_RX.search(body_txt)
98
+ doi = m.group(1) if m else None
99
+ meta = {
100
+ "title": _clean(title),
101
+ "authors": authors,
102
+ "venue": _clean(venue),
103
+ "year": y,
104
+ "identifier": {"doi": doi, "url": None}
105
+ }
106
+ if body_pages < 5:
107
+ warnings.append("Only a small portion of the PDF text was extracted; Accuracy/Purpose may be provisional.")
108
+ return meta, body_txt[:20000], warnings
109
+ except Exception as e:
110
+ return {}, "", [f"Failed to parse PDF: {e}"]
111
+
112
+ def score_currency(year: Optional[int]):
113
+ if not year:
114
+ return 2, "Publication year unknown.", ["Could not find a clear date; treat with caution."]
115
+ age = max(0, THIS_YEAR - year)
116
+ if age <= 2: return 5, f"Published in {year} (≤2 years old).", ["Recent for fast-moving fields."]
117
+ if age <= 5: return 4, f"Published in {year} (~{age} years old).", []
118
+ if age <= 10: return 3, f"Published in {year} (~{age} years old).", []
119
+ return 2, f"Published in {year} (>10 years old).", ["Potentially outdated."]
120
+
121
+ def score_authority(meta: Dict[str,Any]):
122
+ score = 1
123
+ notes = []
124
+ if meta.get("venue"):
125
+ score += 1; notes.append(f"Venue: {meta['venue']}.")
126
+ if meta.get("identifier",{}).get("doi"):
127
+ score += 1; notes.append("Has DOI.")
128
+ if meta.get("authors"):
129
+ a_count = len(meta["authors"])
130
+ if a_count >= 3: score += 1
131
+ notes.append(f"Authors: {a_count}.")
132
+ return min(score,5), "; ".join(notes) if notes else "Insufficient venue/author info."
133
+
134
+ def score_accuracy(text_excerpt: str):
135
+ keys_present = sum(1 for k in ["methods","materials","results","limitations","confidence interval","validation","dataset","sample size"] if k in text_excerpt.lower())
136
+ if not text_excerpt:
137
+ return 2, "No body text available; cannot inspect methods."
138
+ if keys_present >= 5: return 5, "Detailed methodological cues detected (methods/results/validation/etc.)."
139
+ if keys_present >= 3: return 4, "Some methodological cues present."
140
+ if keys_present >= 1: return 3, "Limited methodological signals."
141
+ return 2, "Minimal methodological detail detected (likely a commentary/overview)."
142
+
143
+ def score_purpose(text_excerpt: str):
144
+ lower = text_excerpt.lower()
145
+ bias_hits = any(w in lower for w in ["sponsored", "advertisement", "marketing"])
146
+ conflicts = "conflict of interest" in lower or "competing interest" in lower
147
+ funding = "funding" in lower or "grant" in lower
148
+ if bias_hits:
149
+ return 2, "Potential promotional language detected."
150
+ if conflicts and not funding:
151
+ return 3, "Conflicts noted, funding unclear."
152
+ if funding or conflicts:
153
+ return 4, "Academic tone with disclosures/funding statements."
154
+ return 4, "Academic/educational purpose inferred."
155
+
156
+ def score_relevance(assignment_context: str, meta: Dict[str,Any], text_excerpt: str):
157
+ if not assignment_context:
158
+ return 4, "General relevance assumed (no assignment context provided)."
159
+ ctx = assignment_context.lower()
160
+ hay = (meta.get("title","") + " " + text_excerpt).lower()
161
+ hits = sum(1 for tok in set(re.findall(r"[a-zA-Z]{4,}", ctx)) if tok in hay)
162
+ if hits >= 6: return 5, "Strong topical overlap with assignment context."
163
+ if hits >= 3: return 4, "Good topical overlap."
164
+ if hits >= 1: return 3, "Partial topical overlap."
165
+ return 2, "Low topical overlap; may be tangential."
166
+
167
+ def aggregate_scores(meta: Dict[str,Any], text: str, assignment_context: str, provisional: bool):
168
+ currency_score, currency_evd, currency_checks = score_currency(meta.get("year"))
169
+ authority_score, authority_evd = score_authority(meta)
170
+ accuracy_score, accuracy_evd = score_accuracy(text)
171
+ purpose_score, purpose_evd = score_purpose(text)
172
+ relevance_score, relevance_evd = score_relevance(assignment_context, meta, text)
173
+ if provisional:
174
+ accuracy_score = min(accuracy_score, 3)
175
+ purpose_score = min(purpose_score, 4)
176
+ craap = {
177
+ "Currency": {"score": currency_score, "evidence": currency_evd, "checks": currency_checks},
178
+ "Relevance": {"score": relevance_score, "evidence": relevance_evd},
179
+ "Authority": {"score": authority_score, "evidence": authority_evd},
180
+ "Accuracy": {"score": accuracy_score, "evidence": accuracy_evd},
181
+ "Purpose": {"score": purpose_score, "evidence": purpose_evd}
182
+ }
183
+ avg = round(sum(v["score"] for v in craap.values())/5, 2)
184
+ verdict = "use" if avg >= 4.0 else ("use with caution" if avg >= 2.5 else "avoid")
185
+ return {"metadata": meta, "craap": craap, "overall": {"average": avg, "verdict": verdict}}
186
+
187
+ INDEX_HTML = """
188
+ <!doctype html><html><head><meta charset="utf-8"/><title>CRAAP Bot (Flask)</title>
189
+ <meta name="viewport" content="width=device-width, initial-scale=1">
190
+ <style>
191
+ body{font:16px system-ui,Segoe UI,Roboto,sans-serif;max-width:880px;margin:2rem auto;padding:0 1rem}
192
+ form,.card{border:1px solid #e5e7eb;border-radius:12px;padding:1rem;margin:1rem 0;background:#fff;box-shadow:0 1px 2px rgba(0,0,0,.04)}
193
+ label{display:block;font-weight:600;margin:.5rem 0 .25rem}
194
+ input[type="text"],textarea{width:100%;padding:.6rem .7rem;border:1px solid #d1d5db;border-radius:8px}
195
+ input[type="file"]{margin:.25rem 0 .75rem}button{background:#111827;color:#fff;border:0;padding:.6rem 1rem;border-radius:8px;cursor:pointer}
196
+ pre{background:#0b1020;color:#d7e7ff;padding:1rem;border-radius:12px;overflow:auto}.muted{color:#6b7280}.warn{padding:.6rem .8rem;background:#fff7ed;border:1px solid #fed7aa;border-radius:8px;margin:.5rem 0}
197
+ .tag{display:inline-block;padding:.1rem .5rem;border-radius:999px;border:1px solid #d1d5db;margin-right:.4rem}
198
+ </style></head><body>
199
+ <header><h1>CRAAP Bot</h1><span class="tag">Flask</span><span class="tag">Hugging Face</span></header>
200
+ <div class="card"><form method="POST" action="{{ url_for('analyze') }}" enctype="multipart/form-data">
201
+ <label>URL or DOI</label><input type="text" name="paper_source" placeholder="https://doi.org/10.xxxx/..."/>
202
+ <label>Or upload PDF</label><input type="file" name="pdf" accept="application/pdf"/>
203
+ <label>Assignment context (optional)</label><input type="text" name="assignment_context" placeholder="e.g., AI for zoonotic disease 2023-2025"/>
204
+ <button type="submit">Analyze</button></form>
205
+ <p class="muted">Tip: DOI or full PDF gives best results. Partial PDFs limit Accuracy/Purpose.</p></div>
206
+ {% if result %}{% if warnings %}<div class="warn">⚠️ {{ warnings|join(' · ') }}</div>{% endif %}
207
+ <div class="card"><h2>JSON</h2><pre>{{ result | tojson(indent=2) }}</pre></div>
208
+ <div class="card"><h2>CRAAP Evaluation Summary</h2>
209
+ <p><strong>{{ result.metadata.title or '[unknown title]' }}</strong></p>
210
+ <p class="muted">{{ (result.metadata.authors or [])|join(', ') }} · {{ result.metadata.venue or 'unknown venue' }}{% if result.metadata.year %} · {{ result.metadata.year }}{% endif %}</p>
211
+ <ul>
212
+ <li><strong>Currency</strong>: {{ result.craap.Currency.score }}/5 — {{ result.craap.Currency.evidence }}</li>
213
+ <li><strong>Relevance</strong>: {{ result.craap.Relevance.score }}/5 — {{ result.craap.Relevance.evidence }}</li>
214
+ <li><strong>Authority</strong>: {{ result.craap.Authority.score }}/5 — {{ result.craap.Authority.evidence }}</li>
215
+ <li><strong>Accuracy</strong>: {{ result.craap.Accuracy.score }}/5 — {{ result.craap.Accuracy.evidence }}</li>
216
+ <li><strong>Purpose</strong>: {{ result.craap.Purpose.score }}/5 — {{ result.craap.Purpose.evidence }}</li>
217
+ </ul><p><strong>Overall:</strong> {{ result.overall.average }} — <em>{{ result.overall.verdict }}</em></p>
218
+ <h3>What to verify next</h3>
219
+ <ol>
220
+ <li>Confirm publication date & peer-review at the DOI/URL.</li>
221
+ <li>Skim methods/results for sample size, validation, limitations.</li>
222
+ <li>Check author affiliations and profiles (Semantic Scholar/ORCID).</li>
223
+ <li>Look for funding/conflict-of-interest statements.</li>
224
+ <li>Search for newer papers (last 1–2 years) that cite or challenge it.</li>
225
+ </ol>
226
+ </div>{% endif %}</body></html>
227
+ """
228
+
229
+ @app.route("/", methods=["GET"])
230
+ def index():
231
+ return render_template_string(INDEX_HTML, result=None, warnings=None)
232
+
233
+ @app.route("/analyze", methods=["POST"])
234
+ def analyze():
235
+ paper_source = _clean(request.form.get("paper_source", ""))
236
+ assignment_context = _clean(request.form.get("assignment_context", ""))
237
+ provisional = False
238
+ warnings: List[str] = []
239
+ meta, text = {}, ""
240
+ if paper_source:
241
+ meta, text, w = fetch_url_metadata(paper_source)
242
+ warnings.extend(w)
243
+ elif "pdf" in request.files and request.files["pdf"].filename:
244
+ meta, text, w = extract_pdf_text_and_guess_meta(request.files["pdf"])
245
+ warnings.extend(w); provisional = True
246
+ else:
247
+ return redirect(url_for("index"))
248
+ result = aggregate_scores(meta, text, assignment_context, provisional or bool(warnings))
249
+ if not text:
250
+ warnings.append("Full text not available — Accuracy/Purpose are provisional. Provide a DOI/URL or full PDF for deeper evaluation.")
251
+ return render_template_string(INDEX_HTML, result=result, warnings=warnings)
252
+
253
+ @app.route("/api/analyze", methods=["POST"])
254
+ def api_analyze():
255
+ data = request.json or {}
256
+ paper_source = _clean(data.get("paper_source",""))
257
+ assignment_context = _clean(data.get("assignment_context",""))
258
+ meta, text, warnings = ({}, "", [])
259
+ provisional = False
260
+ if paper_source:
261
+ meta, text, warnings = fetch_url_metadata(paper_source)
262
+ else:
263
+ return jsonify({"error":"Provide paper_source (URL/DOI) or use /analyze form for PDF upload"}), 400
264
+ result = aggregate_scores(meta, text, assignment_context, provisional or bool(warnings))
265
+ return jsonify({"result": result, "warnings": warnings})
266
+
267
+ if __name__ == "__main__":
268
+ app.run(host="0.0.0.0", port=8000, debug=True)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ flask==3.0.3
2
+ requests==2.32.3
3
+ beautifulsoup4==4.12.3
4
+ PyPDF2==3.0.1
5
+ flask-cors==4.0.1
6
+ gunicorn==22.0.0