Rudraaaa76 commited on
Commit
c18eaf2
·
verified ·
1 Parent(s): b92a1ee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +364 -708
app.py CHANGED
@@ -1,33 +1,22 @@
1
- """
2
- HackTrack Scraper v5.0 - Structured extraction, no LLM
3
- - Devfolio : reads __NEXT_DATA__ JSON blob (Next.js SSR, 100% accurate dates)
4
- - Unstop : calls api.unstop.com REST API directly (structured JSON)
5
- - Devpost : reads <time datetime> tags + structured sidebar
6
- - Others : regex fallback on bodyText
7
- """
8
-
9
  from fastapi import FastAPI
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from pydantic import BaseModel, Field
12
  import asyncio
13
  import re
14
  import sys
15
- import json
16
- import httpx
17
  from urllib.parse import urlparse
18
- from typing import List, Optional
19
- from datetime import datetime, timezone
20
 
21
  if sys.platform == "win32":
 
22
  asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
23
 
24
- app = FastAPI(title="HackTrack Scraper", version="5.0.0")
25
- playwright = None
26
- browser = None
27
 
28
- # Semaphore: only 1 Playwright scrape at a time on HuggingFace free tier.
29
- # Prevents two concurrent requests from doubling RAM usage (~1.2GB peak).
30
- _scrape_sem = asyncio.Semaphore(1)
31
 
32
  app.add_middleware(
33
  CORSMiddleware,
@@ -37,13 +26,10 @@ app.add_middleware(
37
  )
38
 
39
 
40
- # ============================================================
41
- # MODELS
42
- # ============================================================
43
-
44
  class ScrapeRequest(BaseModel):
45
  url: str
46
 
 
47
  class ScrapeResponse(BaseModel):
48
  name: str = ""
49
  platform: str = ""
@@ -62,807 +48,473 @@ class ScrapeResponse(BaseModel):
62
  url: str = ""
63
 
64
 
65
- # ============================================================
66
- # PLATFORM DETECTION
67
- # ============================================================
68
-
69
  def detect_platform(url: str) -> str:
70
  domain = urlparse(url).netloc.lower()
71
- if "devfolio" in domain: return "Devfolio"
72
- if "unstop" in domain: return "Unstop"
73
- if "devpost" in domain: return "Devpost"
74
- if "dorahacks" in domain: return "DoraHacks"
75
- if "mlh.io" in domain: return "MLH"
76
- if "hackerearth" in domain: return "HackerEarth"
 
 
77
  return "Other"
78
 
79
 
80
  # ============================================================
81
- # DATE HELPERS
82
  # ============================================================
83
-
84
- def iso_to_date(val: Optional[str]) -> str:
85
- """Convert ISO-8601 / Unix timestamp -> YYYY-MM-DD."""
86
- if not val:
87
- return ""
88
- val = str(val).strip()
89
- if re.fullmatch(r"\d{10}", val):
90
- try:
91
- return datetime.fromtimestamp(int(val), tz=timezone.utc).strftime("%Y-%m-%d")
92
- except Exception:
93
- return ""
94
- if re.fullmatch(r"\d{13}", val):
95
- try:
96
- return datetime.fromtimestamp(int(val) / 1000, tz=timezone.utc).strftime("%Y-%m-%d")
97
- except Exception:
98
- return ""
99
- m = re.match(r"(\d{4}-\d{2}-\d{2})", val)
100
- if m:
101
- return m.group(1)
102
- return ""
103
 
104
  DATE_FORMATS = [
105
  "%Y-%m-%d", "%Y/%m/%d",
106
  "%d %B %Y", "%d %b %Y", "%d %B, %Y", "%d %b, %Y",
107
  "%B %d, %Y", "%b %d, %Y", "%B %d %Y", "%b %d %Y",
108
  "%m/%d/%Y", "%d/%m/%Y",
 
109
  ]
110
 
111
- def parse_any_date(text: str) -> str:
112
- if not text: return ""
113
- text = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", text.strip())
 
 
 
 
114
  text = re.sub(r"\s+", " ", text)
 
 
 
 
115
  for fmt in DATE_FORMATS:
116
  try:
117
  dt = datetime.strptime(text, fmt)
118
- if dt.year == 1900:
119
- dt = dt.replace(year=datetime.now().year)
 
 
120
  return dt.strftime("%Y-%m-%d")
121
  except ValueError:
122
  continue
123
  return ""
124
 
 
125
  def find_dates_near(text: str, keywords: List[str], window: int = 400) -> str:
 
126
  lower = text.lower()
127
- patterns = [
128
  r"(\d{4}[-/]\d{1,2}[-/]\d{1,2})",
129
- r"(\d{1,2}\s+(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?"
130
- r"|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)"
131
- r"[,]?\s+\d{4})",
132
- r"((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?"
133
- r"|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)"
134
- r"\s+\d{1,2}(?:st|nd|rd|th)?[,]?\s*\d{0,4})",
135
  r"(\d{1,2}/\d{1,2}/\d{4})",
136
  ]
137
  for kw in keywords:
138
  idx = lower.find(kw.lower())
139
- if idx == -1: continue
140
- chunk = text[idx: idx + window]
141
- for pat in patterns:
142
- m = re.search(pat, chunk, re.IGNORECASE)
143
- if m:
144
- parsed = parse_any_date(m.group(1))
145
- if parsed: return parsed
 
 
146
  return ""
147
 
148
 
149
  # ============================================================
150
- # DEVFOLIO - __NEXT_DATA__ extraction script
151
  # ============================================================
152
 
153
- DEVFOLIO_SCRIPT = """() => {
154
- const nextEl = document.getElementById('__NEXT_DATA__');
155
- let nextData = null;
156
- if (nextEl) { try { nextData = JSON.parse(nextEl.textContent); } catch(e) {} }
157
-
158
- function findHackathon(obj, depth) {
159
- if (!obj || typeof obj !== 'object' || depth > 8) return null;
160
- if (obj.starts_at || obj.registration_deadline || obj.ends_at || obj.slug) return obj;
161
- for (const k of Object.keys(obj)) {
162
- const child = obj[k];
163
- if (child && typeof child === 'object') {
164
- const found = findHackathon(child, depth + 1);
165
- if (found) return found;
166
- }
167
- }
168
- return null;
169
- }
170
-
171
- let hackathon = nextData ? findHackathon(nextData, 0) : null;
172
-
173
- const getMeta = (n) => {
174
- const el = document.querySelector('meta[property="' + n + '"], meta[name="' + n + '"]');
175
- return el ? el.getAttribute('content') || '' : '';
176
- };
177
-
178
- let name = '';
179
- if (hackathon) name = hackathon.name || hackathon.title || '';
180
- if (!name) {
181
- const el = document.querySelector('h1');
182
- if (el && el.textContent.trim().length > 2) name = el.textContent.trim();
183
- }
184
- name = name || getMeta('og:title') || document.title.split('|')[0].trim();
185
-
186
- const banner = (hackathon && (hackathon.cover_image || hackathon.banner_image)) || getMeta('og:image') || '';
187
- const description = (hackathon && (hackathon.tagline || hackathon.description)) || getMeta('og:description') || '';
188
- const starts_at = (hackathon && (hackathon.starts_at || hackathon.start_date)) || '';
189
- const ends_at = (hackathon && (hackathon.ends_at || hackathon.end_date)) || '';
190
- const reg_dl = (hackathon && (hackathon.registration_deadline || hackathon.registration_ends_at)) || '';
191
- const sub_dl = (hackathon && (hackathon.submission_deadline || hackathon.submission_ends_at)) || '';
192
- const result_date = (hackathon && (hackathon.result_date || hackathon.results_at)) || '';
193
-
194
- let prize_pool = '';
195
- if (hackathon) {
196
- if (hackathon.prize_pool != null) prize_pool = String(hackathon.prize_pool);
197
- else if (hackathon.prize) prize_pool = String(hackathon.prize);
198
- }
199
-
200
- let team_min = 1, team_max = 4;
201
- if (hackathon) {
202
- if (hackathon.min_team_size) team_min = hackathon.min_team_size;
203
- if (hackathon.max_team_size) team_max = hackathon.max_team_size;
204
- }
205
-
206
- const ps = [];
207
- const seenPs = new Set();
208
- const tracks = hackathon ? (hackathon.tracks || hackathon.themes || hackathon.problem_statements || []) : [];
209
- if (Array.isArray(tracks)) {
210
- for (const t of tracks) {
211
- const title = t.title || t.name || t.track || (typeof t === 'string' ? t : '');
212
- const desc = t.description || t.desc || '';
213
- if (title && !seenPs.has(title.toLowerCase())) {
214
- seenPs.add(title.toLowerCase());
215
- ps.push({ track: t.track || '', title: title, description: desc });
216
- }
217
- }
218
  }
219
 
220
- const resourceLinks = [];
221
- const seenHrefs = new Set();
222
- const kws = ['problem','statement','pdf','rule','guideline','brochure','document','brief','challenge','track','theme','schedule','timeline'];
223
- document.querySelectorAll('a[href]').forEach(function(a) {
224
- const href = a.href || '';
225
- const text = a.textContent.trim();
226
- const hl = href.toLowerCase(), tl = text.toLowerCase();
227
- if (seenHrefs.has(href) || !href || href === '#') return;
228
- const isPdf = hl.endsWith('.pdf') || hl.includes('/pdf');
229
- const isDrive = hl.includes('drive.google.com') || hl.includes('docs.google.com');
230
- const isRel = kws.some(function(kw) { return tl.includes(kw) || hl.includes(kw); });
231
- if (isPdf || isDrive || isRel) {
232
- seenHrefs.add(href);
233
- resourceLinks.push({ text: text.substring(0, 150) || 'Document', url: href, type: isPdf ? 'pdf' : isDrive ? 'google_drive' : 'link' });
234
- }
235
- });
236
-
237
- return {
238
- name: name.substring(0, 200),
239
- description: description.substring(0, 2000),
240
- banner_url: banner,
241
- starts_at: starts_at,
242
- ends_at: ends_at,
243
- registration_deadline: reg_dl,
244
- submission_deadline: sub_dl,
245
- result_date: result_date,
246
- prize_pool: prize_pool,
247
- team_min: team_min,
248
- team_max: team_max,
249
- problem_statements: ps.slice(0, 20),
250
- resource_links: resourceLinks.slice(0, 30),
251
- bodyText: document.body.innerText.substring(0, 15000),
252
- found_next_data: hackathon !== null,
253
- hackathon_keys: hackathon ? Object.keys(hackathon).join(',') : ''
254
- };
255
- }"""
256
 
 
 
 
 
 
 
 
257
 
258
- def parse_devfolio(data: dict) -> dict:
259
- return {
260
- "name": data.get("name", ""),
261
- "description": data.get("description", ""),
262
- "banner_url": data.get("banner_url", ""),
263
- "start_date": iso_to_date(data.get("starts_at")),
264
- "end_date": iso_to_date(data.get("ends_at")),
265
- "registration_deadline": iso_to_date(data.get("registration_deadline")),
266
- "submission_deadline": iso_to_date(data.get("submission_deadline")),
267
- "result_date": iso_to_date(data.get("result_date")),
268
- "prize_pool": data.get("prize_pool", ""),
269
- "team_size": {"min": data.get("team_min", 1), "max": data.get("team_max", 4)},
270
- "problem_statements": data.get("problem_statements", []),
271
- "resource_links": data.get("resource_links", []),
272
- "body_text": data.get("bodyText", ""),
273
- "found_structured": data.get("found_next_data", False),
274
- }
275
 
 
 
 
 
 
276
 
277
- # ============================================================
278
- # UNSTOP - Direct REST API
279
- # ============================================================
 
 
 
 
 
 
 
280
 
281
- def extract_unstop_id(url: str) -> Optional[str]:
282
- m = re.search(r"-(\d{5,8})(?:/|$|\?)", url)
283
- if m: return m.group(1)
284
- m = re.search(r"/(\d{5,8})(?:/|$|\?)", url)
285
- return m.group(1) if m else None
 
 
 
 
286
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
 
288
- async def fetch_unstop_api(opportunity_id: str) -> Optional[dict]:
289
- headers = {
290
- "User-Agent": "Mozilla/5.0 (compatible; HackTrackBot/1.0)",
291
- "Accept": "application/json",
292
- "Referer": "https://unstop.com/hackathons",
293
- }
294
- endpoints = [
295
- f"https://unstop.com/api/public/opportunity/get-applied-detail?id={opportunity_id}",
296
- f"https://unstop.com/api/public/opportunity/{opportunity_id}",
297
- f"https://unstop.com/api/public/hackathon/{opportunity_id}",
 
 
 
 
 
298
  ]
299
- for api_url in endpoints:
300
- try:
301
- async with httpx.AsyncClient(timeout=15) as client:
302
- resp = await client.get(api_url, headers=headers)
303
- if resp.status_code != 200:
304
- continue
305
- data = resp.json()
306
- opp = data.get("data") or data
307
- if isinstance(opp, dict) and "opportunity" in opp:
308
- opp = opp["opportunity"]
309
- if isinstance(opp, dict) and (opp.get("id") or opp.get("title")):
310
- print(f"[Unstop] API success from {api_url}")
311
- print(f"[Unstop] Keys: {list(opp.keys())[:15]}")
312
- return opp
313
- except Exception as e:
314
- print(f"[Unstop] {api_url} failed: {e}")
315
- return None
316
-
317
-
318
- def parse_unstop_api(opp: dict) -> dict:
319
- start_date = iso_to_date(opp.get("start_date") or opp.get("starts_at"))
320
- end_date = iso_to_date(opp.get("end_date") or opp.get("ends_at"))
321
- reg_deadline = iso_to_date(
322
- opp.get("registrations_end_date") or
323
- opp.get("registration_deadline") or
324
- opp.get("registration_end_date")
325
- )
326
- sub_deadline = iso_to_date(
327
- opp.get("submission_end_date") or
328
- opp.get("submission_deadline") or
329
- opp.get("last_submission_date")
330
- )
331
- result_date = iso_to_date(opp.get("result_date") or opp.get("results_at"))
332
-
333
- prize_pool = ""
334
- if opp.get("prize_amount"):
335
- prize_pool = str(opp["prize_amount"])
336
- elif opp.get("prizes") and isinstance(opp["prizes"], list):
337
- total = 0
338
- for p in opp["prizes"]:
339
- try: total += float(str(p.get("amount") or 0).replace(",", ""))
340
- except: pass
341
- if total:
342
- prize_pool = "Rs. {:,}".format(int(total))
343
- elif opp.get("total_prize"):
344
- prize_pool = str(opp["total_prize"])
345
-
346
- team_min = int(opp.get("min_team_size") or opp.get("min_team") or 1)
347
- team_max = int(opp.get("max_team_size") or opp.get("max_team") or 4)
348
 
 
349
  ps = []
350
  seen_ps = set()
351
- for r in (opp.get("rounds") or opp.get("problem_statements") or []):
352
- if not isinstance(r, dict): continue
353
- title = r.get("title") or r.get("name") or r.get("round_name") or ""
354
- desc = r.get("description") or r.get("details") or ""
355
- if title and title.lower() not in seen_ps:
356
- seen_ps.add(title.lower())
357
- ps.append({"track": "", "title": title, "description": desc[:300]})
358
 
359
- tags = opp.get("tags") or opp.get("categories") or []
360
- if isinstance(tags, list) and not ps:
361
- for t in tags:
362
- label = t.get("name") or t.get("label") or (t if isinstance(t, str) else "")
363
- if label and label.lower() not in seen_ps:
364
- seen_ps.add(label.lower())
365
- ps.append({"track": "Theme", "title": label, "description": ""})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
366
 
367
- return {
368
- "name": opp.get("title") or opp.get("name", ""),
369
- "description": (opp.get("short_description") or opp.get("description") or "")[:500],
370
- "banner_url": opp.get("banner_image") or opp.get("cover_image") or opp.get("image") or "",
371
- "start_date": start_date,
372
- "end_date": end_date,
373
- "registration_deadline": reg_deadline,
374
- "submission_deadline": sub_deadline,
375
- "result_date": result_date,
376
- "prize_pool": prize_pool,
377
- "team_size": {"min": team_min, "max": team_max},
378
- "problem_statements": ps[:20],
379
- "resource_links": [],
380
- }
381
 
382
 
383
  # ============================================================
384
- # DEVPOST - <time datetime> tags + sidebar
385
  # ============================================================
386
 
387
- DEVPOST_SCRIPT = """() => {
388
- const getMeta = function(n) {
389
- const el = document.querySelector('meta[property="' + n + '"], meta[name="' + n + '"]');
390
  return el ? el.getAttribute('content') || '' : '';
391
  };
392
 
393
- let name = getMeta('og:title') || document.title.split('|')[0].trim();
394
- const h1 = document.querySelector('#challenge-title, h1.title, h1');
395
- if (h1 && h1.textContent.trim().length > 2) name = h1.textContent.trim();
396
-
397
- const banner = getMeta('og:image') || '';
398
- const description = getMeta('og:description') || '';
399
-
400
- const timeMap = {};
401
- document.querySelectorAll('time[datetime]').forEach(function(el) {
402
- const dt = el.getAttribute('datetime') || '';
403
- const parent = el.closest('.deadline, .date, li, div');
404
- const lbl = (parent ? parent.textContent : '').toLowerCase();
405
- if (lbl.includes('submission') || lbl.includes('submit')) { if (!timeMap.submission) timeMap.submission = dt; }
406
- else if (lbl.includes('registr') || lbl.includes('apply')) { if (!timeMap.registration) timeMap.registration = dt; }
407
- else if (lbl.includes('result') || lbl.includes('winner')) { if (!timeMap.result) timeMap.result = dt; }
408
- else if (lbl.includes('start') || lbl.includes('begin')) { if (!timeMap.start) timeMap.start = dt; }
409
- else if (lbl.includes('end') || lbl.includes('close')) { if (!timeMap.end) timeMap.end = dt; }
410
- else if (!timeMap.first) { timeMap.first = dt; }
411
- });
412
-
413
- let prize_pool = '';
414
- const prizeSection = document.querySelector('#prizes, .prize-amount, [id*="prize"]');
415
- if (prizeSection) {
416
- const txt = prizeSection.textContent;
417
- const m = txt.match(/([\\u20b9][\\d,]+|[$][\\d,]+(?:K|k|M)?)/);
418
- if (m) prize_pool = m[1];
419
- else prize_pool = txt.trim().substring(0, 80);
420
  }
 
 
 
 
 
 
 
 
 
 
421
 
 
422
  const themes = [];
423
- const seenT = new Set();
424
- document.querySelectorAll('a[href*="themes"], .software-list a').forEach(function(a) {
425
  const t = a.textContent.trim();
426
- if (t && t.length > 1 && t.length < 80 && !seenT.has(t)) { seenT.add(t); themes.push(t); }
427
  });
428
 
429
- const resourceLinks = [];
430
- const seenHrefs = new Set();
431
- const kws = ['problem','statement','pdf','rule','guideline','document','challenge','track','theme','schedule'];
432
- document.querySelectorAll('a[href]').forEach(function(a) {
433
- const href = a.href || '';
434
- const text = a.textContent.trim();
435
- const hl = href.toLowerCase(), tl = text.toLowerCase();
436
- if (seenHrefs.has(href) || !href || href === '#') return;
437
- const isPdf = hl.endsWith('.pdf') || hl.includes('/pdf');
438
- const isDrive = hl.includes('drive.google.com') || hl.includes('docs.google.com');
439
- const isRel = kws.some(function(kw) { return tl.includes(kw) || hl.includes(kw); });
440
- if (isPdf || isDrive || isRel) {
441
- seenHrefs.add(href);
442
- resourceLinks.push({ text: text.substring(0, 150) || 'Document', url: href, type: isPdf ? 'pdf' : isDrive ? 'google_drive' : 'link' });
443
- }
444
  });
445
 
446
- return {
447
- name: name.substring(0, 200),
448
- description: description.substring(0, 2000),
449
- banner_url: banner,
450
- time_map: timeMap,
451
- prize_pool: prize_pool,
452
- themes: themes,
453
- resource_links: resourceLinks.slice(0, 30),
454
- bodyText: document.body.innerText.substring(0, 15000)
455
- };
456
- }"""
457
-
458
-
459
- def parse_devpost(data: dict) -> dict:
460
- tm = data.get("time_map", {})
461
- ps = [{"track": "Theme", "title": t, "description": ""} for t in data.get("themes", [])]
462
- return {
463
- "name": data.get("name", ""),
464
- "description": data.get("description", ""),
465
- "banner_url": data.get("banner_url", ""),
466
- "start_date": iso_to_date(tm.get("start") or tm.get("first")),
467
- "end_date": iso_to_date(tm.get("end")),
468
- "registration_deadline": iso_to_date(tm.get("registration")),
469
- "submission_deadline": iso_to_date(tm.get("submission")),
470
- "result_date": iso_to_date(tm.get("result")),
471
- "prize_pool": data.get("prize_pool", ""),
472
- "team_size": {"min": 1, "max": 4},
473
- "problem_statements": ps,
474
- "resource_links": data.get("resource_links", []),
475
- "body_text": data.get("bodyText", ""),
476
- }
477
-
478
-
479
- # ============================================================
480
- # GENERIC SCRIPT + REGEX FALLBACK
481
- # ============================================================
482
-
483
- GENERIC_SCRIPT = """() => {
484
- const getMeta = function(n) {
485
- const el = document.querySelector('meta[property="' + n + '"], meta[name="' + n + '"]');
486
- return el ? el.getAttribute('content') || '' : '';
487
- };
488
- let name = '';
489
- const sels = ['h1','.hackathon-name','.event-name','#challenge-title','.opp-title'];
490
- for (let i = 0; i < sels.length; i++) {
491
- const el = document.querySelector(sels[i]);
492
- if (el && el.textContent.trim().length > 2) { name = el.textContent.trim(); break; }
493
- }
494
- name = name || getMeta('og:title') || document.title.split('|')[0].trim();
495
- const banner = getMeta('og:image') || '';
496
- const description = getMeta('og:description') || getMeta('description') || '';
497
  const resourceLinks = [];
498
  const seenHrefs = new Set();
499
- const kws = ['problem','statement','pdf','rule','guideline','document','challenge','track','theme','schedule'];
500
- document.querySelectorAll('a[href]').forEach(function(a) {
501
  const href = a.href || '';
502
  const text = a.textContent.trim();
503
- const hl = href.toLowerCase(), tl = text.toLowerCase();
 
504
  if (seenHrefs.has(href) || !href || href === '#') return;
505
- const isPdf = hl.endsWith('.pdf') || hl.includes('/pdf');
506
- const isDrive = hl.includes('drive.google.com') || hl.includes('docs.google.com');
507
- const isRel = kws.some(function(kw) { return tl.includes(kw) || hl.includes(kw); });
508
- if (isPdf || isDrive || isRel) {
 
 
 
509
  seenHrefs.add(href);
510
- resourceLinks.push({ text: text.substring(0, 150) || 'Document', url: href, type: isPdf ? 'pdf' : isDrive ? 'google_drive' : 'link' });
 
 
 
 
511
  }
512
  });
 
513
  return {
514
  name: name.substring(0, 200),
515
  description: description.substring(0, 2000),
516
  banner_url: banner,
517
- bodyText: document.body.innerText.substring(0, 20000),
518
- resource_links: resourceLinks.slice(0, 30)
 
 
519
  };
520
  }"""
521
 
522
 
523
- def regex_extract(body_text: str) -> dict:
524
- result = {
525
- "registration_deadline": "", "submission_deadline": "",
526
- "result_date": "", "start_date": "", "end_date": "",
527
- "prize_pool": "", "team_size": {"min": 1, "max": 4},
528
- "problem_statements": [],
529
- }
530
- result["registration_deadline"] = find_dates_near(body_text, [
531
- "registration close", "register by", "registration deadline",
532
- "applications close", "apply by", "registration ends",
533
- ])
534
- result["submission_deadline"] = find_dates_near(body_text, [
535
- "submission deadline", "submissions close", "submit by",
536
- "final submission", "project submission", "deadline",
537
- ])
538
- runs = re.search(
539
- r"(?:runs?\s+from|starts?\s+(?:on|from)?)\s*"
540
- r"((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2})"
541
- r"(?:\s*[-\u2013]\s*(\d{1,2}))?(?:[,\s]+(\d{4}))?",
542
- body_text, re.IGNORECASE,
543
- )
544
- if runs:
545
- year = runs.group(3) or str(datetime.now().year)
546
- result["start_date"] = parse_any_date(f"{runs.group(1)} {year}")
547
- if runs.group(2):
548
- month = runs.group(1).split()[0]
549
- result["end_date"] = parse_any_date(f"{month} {runs.group(2)} {year}")
550
- if not result["start_date"]:
551
- result["start_date"] = find_dates_near(body_text, ["start date","starts on","begins on"])
552
- if not result["end_date"]:
553
- result["end_date"] = find_dates_near(body_text, ["end date","ends on","hackathon ends"])
554
- result["result_date"] = find_dates_near(body_text, ["result","winners announced","results declared"])
555
-
556
- prize_patterns = [
557
- r"(Rs\.?\s*[\d,]+(?:\s*(?:Lakhs?|Crores?|K|k|L))?)",
558
- r"(INR\s*[\d,]+(?:\s*(?:Lakhs?|Crores?|K|k|L))?)",
559
- r"(\$\s*[\d,]+(?:\s*(?:K|k|M))?)",
560
- r"(\u20b9\s*[\d,]+(?:\s*(?:Lakhs?|Crores?|K|k|L))?)",
561
- ]
562
- lower = body_text.lower()
563
- for kw in ["prize","reward","worth","winning","bounty","in cash","in prizes"]:
564
- idx = lower.find(kw)
565
- if idx == -1: continue
566
- chunk = body_text[max(0, idx-200): idx+200]
567
- for pat in prize_patterns:
568
- m = re.search(pat, chunk, re.IGNORECASE)
569
- if m: result["prize_pool"] = m.group(1).strip(); break
570
- if result["prize_pool"]: break
571
- if not result["prize_pool"]:
572
- for pat in prize_patterns:
573
- m = re.search(pat, body_text)
574
- if m: result["prize_pool"] = m.group(1).strip(); break
575
-
576
- for pat in [
577
- r"team\s*size[:\s]*(\d+)\s*[-\u2013to]+\s*(\d+)",
578
- r"(\d+)\s*[-\u2013to]+\s*(\d+)\s*(?:members?|people|per team)",
579
- r"teams?\s+of\s+(?:up\s+to\s+)?(\d+)",
580
- r"max(?:imum)?\s*(?:team)?\s*size\s*[:\s]*(\d+)",
581
- ]:
582
- m = re.search(pat, body_text, re.IGNORECASE)
583
- if m:
584
- g = [x for x in m.groups() if x]
585
- result["team_size"] = (
586
- {"min": int(g[0]), "max": int(g[1])} if len(g) == 2
587
- else {"min": 1, "max": int(g[0])}
588
- )
589
- break
590
 
591
- ps, seen = [], set()
592
- dm = re.search(
593
- r"(?:domains?|themes?|tracks?|categories|verticals)[:\s]+([^\n]+)",
594
- body_text, re.IGNORECASE,
595
  )
596
- if dm:
597
- for item in re.split(r"[,|/]", dm.group(1)):
598
- item = item.strip().rstrip(".")
599
- if 3 < len(item) < 150 and item.lower() not in seen:
600
- seen.add(item.lower())
601
- ps.append({"track": "", "title": item, "description": ""})
602
- for m in re.finditer(
603
- r"(?:PS|Problem\s*Statement|Theme|Track|Challenge)\s*#?(\d+)\s*[:\-]\s*(.{5,200})",
604
- body_text, re.IGNORECASE,
605
- ):
606
- title = m.group(2).strip().split("\n")[0]
607
- if title.lower() not in seen and len(title) > 4:
608
- seen.add(title.lower())
609
- ps.append({"track": f"Track {m.group(1)}", "title": title, "description": ""})
610
- result["problem_statements"] = ps[:20]
611
- return result
612
 
613
 
614
- # ============================================================
615
- # SAFE EVALUATE
616
- # ============================================================
617
-
618
- EMPTY_DATA = {"name": "", "description": "", "banner_url": "", "bodyText": "", "resource_links": []}
619
-
620
- async def safe_evaluate(page, script: str, fallback=None) -> dict:
621
- for attempt in range(3):
622
- try:
623
- try:
624
- await page.wait_for_load_state("networkidle", timeout=8000)
625
- except Exception:
626
- pass
627
- return await page.evaluate(script)
628
- except Exception as e:
629
- err = str(e)
630
- print(f"[Scraper] evaluate attempt {attempt+1} failed: {err[:120]}")
631
- if "Execution context was destroyed" in err or "Frame was detached" in err:
632
- print("[Scraper] Redirect detected, waiting to settle...")
633
- try:
634
- await page.wait_for_load_state("domcontentloaded", timeout=12000)
635
- await asyncio.sleep(2)
636
- except Exception:
637
- await asyncio.sleep(3)
638
- continue
639
- if fallback and attempt == 1:
640
- script = fallback
641
- continue
642
- break
643
- return EMPTY_DATA
644
 
 
 
 
 
 
 
645
 
646
- # ============================================================
647
- # MAIN SCRAPER
648
- # ============================================================
 
 
 
649
 
650
  async def scrape_with_playwright(url: str, platform: str) -> dict:
 
651
  global browser
652
- if browser is None:
653
- return {"scrape_success": False, "error": "Browser not initialized"}
654
-
655
- # Unstop: try API first — no Playwright needed, saves all memory for this call
656
- if platform == "Unstop":
657
- opp_id = extract_unstop_id(url)
658
- print(f"[Unstop] Extracted ID: {opp_id}")
659
- if opp_id:
660
- opp = await fetch_unstop_api(opp_id)
661
- if opp:
662
- result = parse_unstop_api(opp)
663
- result["scrape_success"] = bool(result.get("name"))
664
- print(f"[Unstop] API: name='{result['name']}' reg={result['registration_deadline']} sub={result['submission_deadline']} ps={len(result['problem_statements'])}")
665
- return result
666
- print("[Unstop] API failed, falling back to Playwright")
667
-
668
- async with _scrape_sem:
669
- # Only one Playwright scrape runs at a time to stay within HuggingFace RAM limits.
670
- # Concurrent requests queue here and are processed sequentially.
671
- print(f"[Scraper] Semaphore acquired for {platform}")
672
- return await _do_playwright_scrape(url, platform)
673
-
674
-
675
- async def _do_playwright_scrape(url: str, platform: str) -> dict:
676
- """Inner function — runs inside the semaphore."""
677
- global browser
678
-
679
- context = await browser.new_context(
680
- user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
681
- # Smaller viewport = less GPU memory for compositing
682
- viewport={"width": 1280, "height": 800},
683
- # Block credentials/service workers to reduce overhead
684
- java_script_enabled=True,
685
- bypass_csp=False,
686
- )
687
  try:
688
- page = await context.new_page()
689
-
690
- # Block images, fonts, media, and tracking — saves 30-60% of page RAM
691
- # We only need DOM text and __NEXT_DATA__, not rendered assets
692
- async def block_resources(route, request):
693
- BLOCK_TYPES = {"image", "media", "font", "stylesheet", "other",
694
- "ping", "websocket"}
695
- BLOCK_DOMAINS = {"google-analytics", "googletagmanager", "facebook",
696
- "hotjar", "intercom", "amplitude", "segment",
697
- "cloudflare.com/beacon", "sentry.io"}
698
- if request.resource_type in BLOCK_TYPES:
699
- await route.abort()
700
- return
701
- url_lower = request.url.lower()
702
- if any(d in url_lower for d in BLOCK_DOMAINS):
703
- await route.abort()
704
- return
705
- await route.continue_()
706
-
707
- await page.route("**/*", block_resources)
708
-
709
- print(f"[Scraper] => {url} platform={platform}")
710
- wait_until = "load" if platform in ("Devfolio", "MLH") else "domcontentloaded"
711
- try:
712
- await page.goto(url, wait_until=wait_until, timeout=30000)
713
- except Exception as e:
714
- if "Timeout" not in str(e): raise
715
- print("[Scraper] goto timeout, proceeding anyway")
716
-
717
- # Reduced wait times — blocking assets means pages settle faster
718
- wait_map = {"Unstop": 6, "DoraHacks": 6, "Devfolio": 5, "MLH": 3}
719
- wait_sec = wait_map.get(platform, 4)
720
- print(f"[Scraper] Waiting {wait_sec}s for JS...")
721
- await page.wait_for_timeout(wait_sec * 1000)
722
-
723
- # Light scroll only — no heavy scroll since images are blocked anyway
724
- for frac in [0.5, 1.0, 0.0]:
725
- try:
726
- await page.evaluate(f"window.scrollTo(0, document.body.scrollHeight * {frac})")
727
- except Exception:
728
- pass
729
- await asyncio.sleep(0.4)
730
- await asyncio.sleep(0.5)
731
-
732
- if platform == "Devfolio":
733
- raw = await safe_evaluate(page, DEVFOLIO_SCRIPT, GENERIC_SCRIPT)
734
- print(f"[Devfolio] found_next_data={raw.get('found_next_data')} keys={raw.get('hackathon_keys','')[:80]}")
735
- result = parse_devfolio(raw)
736
- # Regex fill for any empty structured fields
737
- if not result.get("registration_deadline") or not result.get("start_date"):
738
- print("[Devfolio] Filling gaps with regex")
739
- regex = regex_extract(raw.get("bodyText", ""))
740
- for f in ("start_date","end_date","registration_deadline","submission_deadline","result_date","prize_pool"):
741
- if not result.get(f) and regex.get(f):
742
- result[f] = regex[f]
743
- if not result["problem_statements"]:
744
- result["problem_statements"] = regex.get("problem_statements", [])
745
- if result["team_size"] == {"min": 1, "max": 4}:
746
- result["team_size"] = regex.get("team_size", {"min": 1, "max": 4})
747
-
748
- elif platform == "Devpost":
749
- raw = await safe_evaluate(page, DEVPOST_SCRIPT, GENERIC_SCRIPT)
750
- result = parse_devpost(raw)
751
- if not result.get("submission_deadline") or not result.get("start_date"):
752
- regex = regex_extract(raw.get("bodyText", ""))
753
- for f in ("start_date","end_date","registration_deadline","submission_deadline","result_date"):
754
- if not result.get(f) and regex.get(f):
755
- result[f] = regex[f]
756
-
757
- else:
758
- raw = await safe_evaluate(page, GENERIC_SCRIPT)
759
- regex = regex_extract(raw.get("bodyText", ""))
760
- result = {
761
- "name": raw.get("name", ""),
762
- "description": raw.get("description", ""),
763
- "banner_url": raw.get("banner_url", ""),
764
- "resource_links": raw.get("resource_links", []),
765
- **regex,
766
  }
767
 
768
- result["scrape_success"] = bool(result.get("name") and len(result.get("name","")) > 2)
769
- print(f"[Scraper] Done: name='{result.get('name','')}' reg={result.get('registration_deadline','')} sub={result.get('submission_deadline','')} start={result.get('start_date','')} prize='{result.get('prize_pool','')}' ps={len(result.get('problem_statements',[]))}")
770
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
771
 
772
  except Exception as e:
 
773
  import traceback
774
  traceback.print_exc()
775
  return {"scrape_success": False, "error": str(e)}
776
- finally:
777
- await context.close()
778
 
779
 
780
  # ============================================================
781
- # APP LIFECYCLE & ROUTES
782
  # ============================================================
783
 
784
- @app.on_event("startup")
785
- async def startup() -> None:
786
- global playwright, browser
787
- from playwright.async_api import async_playwright
788
- playwright = await async_playwright().start()
789
- browser = await playwright.chromium.launch(
790
- headless=True,
791
- args=[
792
- # ── Security (required for containers) ──────────────────────────
793
- "--no-sandbox",
794
- "--disable-setuid-sandbox",
795
- # ── Memory reduction ─────────────────────────────────────────────
796
- "--disable-dev-shm-usage", # use /tmp instead of /dev/shm
797
- "--disable-gpu", # no GPU process (~50MB saved)
798
- "--no-zygote", # skip zygote process fork
799
- "--single-process", # single process mode (~150MB saved)
800
- "--disable-extensions", # no extension processes
801
- "--disable-background-networking",
802
- "--disable-background-timer-throttling",
803
- "--disable-backgrounding-occluded-windows",
804
- "--disable-breakpad", # no crash reporter
805
- "--disable-client-side-phishing-detection",
806
- "--disable-component-update",
807
- "--disable-default-apps",
808
- "--disable-domain-reliability",
809
- "--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process",
810
- "--disable-hang-monitor",
811
- "--disable-ipc-flooding-protection",
812
- "--disable-popup-blocking",
813
- "--disable-prompt-on-repost",
814
- "--disable-renderer-backgrounding",
815
- "--disable-sync",
816
- "--disable-translate",
817
- "--metrics-recording-only",
818
- "--mute-audio",
819
- "--no-first-run",
820
- "--safebrowsing-disable-auto-update",
821
- "--password-store=basic",
822
- "--use-mock-keychain",
823
- # ── Reduce per-page memory ────────────────────────────────────────
824
- "--js-flags=--max-old-space-size=256", # cap JS heap to 256MB
825
- "--renderer-process-limit=2",
826
- ],
827
- )
828
- print("[Scraper] v5.0 ready - memory-optimised Chromium on HuggingFace")
829
-
830
-
831
- @app.on_event("shutdown")
832
- async def shutdown() -> None:
833
- global playwright, browser
834
- try:
835
- if browser: await browser.close()
836
- finally:
837
- browser = None
838
- try:
839
- if playwright: await playwright.stop()
840
- finally:
841
- playwright = None
842
-
843
-
844
  @app.get("/")
845
  async def root():
846
- return {
847
- "status": "ok",
848
- "service": "HackTrack Scraper v5",
849
- "strategy": "__NEXT_DATA__ + REST API + time-tags + regex",
850
- "platforms": ["Devfolio", "Devpost", "Unstop", "DoraHacks", "MLH"],
851
- }
852
-
853
- @app.get("/health")
854
- async def health():
855
- return {"status": "ok", "timestamp": datetime.utcnow().isoformat()}
856
 
857
 
858
  @app.post("/scrape", response_model=ScrapeResponse)
859
  async def scrape(request: ScrapeRequest):
860
  url = request.url.strip()
861
  platform = detect_platform(url)
862
- print(f"\n[Scraper] === {url} platform={platform} ===")
 
863
  try:
864
  data = await scrape_with_playwright(url, platform)
865
- return ScrapeResponse(
 
866
  name=data.get("name", ""),
867
  platform=platform,
868
  banner_url=data.get("banner_url", ""),
@@ -879,6 +531,10 @@ async def scrape(request: ScrapeRequest):
879
  scrape_success=data.get("scrape_success", False),
880
  url=url,
881
  )
 
 
 
 
882
  except Exception as e:
883
  print(f"[Scraper] Endpoint error: {e}")
884
- return ScrapeResponse(platform=platform, url=url, scrape_success=False)
 
 
 
 
 
 
 
 
 
1
import asyncio
import re
import sys
from datetime import datetime
from typing import List, Optional
from urllib.parse import urlparse

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
10
 
11
# Windows' default selector event loop cannot spawn subprocesses; Playwright
# needs one for its driver, so force the Proactor policy before any loop runs.
if sys.platform == "win32":
    # Playwright launches a driver subprocess; Proactor loop supports subprocess APIs on Windows.
    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

app = FastAPI(title="HackTrack Scraper", version="3.0.0")

# Global Playwright runtime objects reused across requests.
# Populated by startup(), torn down by shutdown(); scrape_with_playwright()
# treats browser=None as "service failed to start".
playwright = None
browser = None
20
 
21
  app.add_middleware(
22
  CORSMiddleware,
 
26
  )
27
 
28
 
 
 
 
 
29
class ScrapeRequest(BaseModel):
    """Body for POST /scrape."""
    # Hackathon listing URL to scrape (Devfolio / Unstop / Devpost / ...).
    url: str
31
 
32
+
33
  class ScrapeResponse(BaseModel):
34
  name: str = ""
35
  platform: str = ""
 
48
  url: str = ""
49
 
50
 
 
 
 
 
51
def detect_platform(url: str) -> str:
    """Map a hackathon URL to its hosting platform name.

    Matching is done on a lowercased netloc substring; unknown hosts
    fall through to "Other".
    """
    domain = urlparse(url).netloc.lower()
    known_hosts = (
        ("devfolio", "Devfolio"),
        ("unstop", "Unstop"),
        ("devpost", "Devpost"),
        ("dorahacks", "DoraHacks"),
    )
    for needle, label in known_hosts:
        if needle in domain:
            return label
    return "Other"
62
 
63
 
64
# ============================================================
# DATE PARSING — robust multi-format
# ============================================================
# Month-name → month-number lookup (full names plus common abbreviations).
# NOTE(review): not referenced by any of the visible parsing helpers — confirm
# it is still used elsewhere in the file before removing.
MONTH_MAP = {
    "jan": 1, "january": 1, "feb": 2, "february": 2, "mar": 3, "march": 3,
    "apr": 4, "april": 4, "may": 5, "jun": 6, "june": 6,
    "jul": 7, "july": 7, "aug": 8, "august": 8, "sep": 9, "sept": 9, "september": 9,
    "oct": 10, "october": 10, "nov": 11, "november": 11, "dec": 12, "december": 12,
}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
# strptime formats tried in order by parse_any_date(); the year-less formats
# ("%B %d") must stay last so fully-qualified dates win when both could match.
DATE_FORMATS = [
    "%Y-%m-%d", "%Y/%m/%d",
    "%d %B %Y", "%d %b %Y", "%d %B, %Y", "%d %b, %Y",
    "%B %d, %Y", "%b %d, %Y", "%B %d %Y", "%b %d %Y",
    "%m/%d/%Y", "%d/%m/%Y",
    "%B %d", "%b %d",
]


def parse_any_date(text: str, fallback_year: Optional[int] = None) -> str:
    """Parse a free-form date string into ISO ``YYYY-MM-DD``.

    Ordinal suffixes ("21st" -> "21") are stripped and whitespace collapsed,
    then every format in DATE_FORMATS is tried in order.  A date that carries
    no year gets *fallback_year* (default: the current year); if that still
    lands in the past it is bumped one year forward, since scraped hackathon
    dates are assumed to be upcoming events.

    Args:
        text: Raw date text scraped from a page; may be empty.
        fallback_year: Year assumed for year-less dates (falsy -> current year).

    Returns:
        "YYYY-MM-DD" on success, "" when no format matches.
    """
    if not text:
        return ""
    # \b keeps the suffix strip from eating letters of ordinary words.
    text = re.sub(r"(\d+)(st|nd|rd|th)\b", r"\1", text.strip())
    text = re.sub(r"\s+", " ", text)

    if not fallback_year:
        fallback_year = datetime.now().year

    today = datetime.now().date()
    for fmt in DATE_FORMATS:
        try:
            dt = datetime.strptime(text, fmt)
            if dt.year == 1900:  # strptime default year => format had no year
                dt = dt.replace(year=fallback_year)
                # Compare calendar dates, not datetimes: strptime yields
                # midnight, which would wrongly classify *today* as "in the
                # past" and push it a whole year forward.
                if dt.date() < today:
                    dt = dt.replace(year=fallback_year + 1)
            return dt.strftime("%Y-%m-%d")
        except ValueError:
            # Also covers Feb-29 mapped onto a non-leap fallback_year.
            continue
    return ""
105
 
106
+
107
def find_dates_near(text: str, keywords: List[str], window: int = 400) -> str:
    """Return the first parseable date found shortly after a keyword.

    For each keyword in order, the *window* characters following its first
    (case-insensitive) occurrence are scanned against a set of date regexes;
    the first hit that parse_any_date() understands is returned as
    YYYY-MM-DD.  Returns "" when nothing parseable is found.
    """
    all_date_patterns = [
        r"(\d{4}[-/]\d{1,2}[-/]\d{1,2})",
        r"(\d{1,2}\s+(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)[,]?\s+\d{4})",
        r"((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?[,]?\s*\d{0,4})",
        r"(\d{1,2}/\d{1,2}/\d{4})",
    ]
    haystack = text.lower()
    for keyword in keywords:
        hit = haystack.find(keyword.lower())
        if hit == -1:
            continue
        nearby = text[hit:hit + window]
        for pattern in all_date_patterns:
            found = re.search(pattern, nearby, re.IGNORECASE)
            if not found:
                continue
            iso = parse_any_date(found.group(1))
            if iso:
                return iso
    return ""
128
 
129
 
130
# ============================================================
# EXTRACT from full page innerText (the reliable approach)
# ============================================================

def extract_all_from_text(body_text: str, platform: str) -> dict:
    """Extract hackathon details from page innerText using text patterns.

    Pure-regex parsing over the rendered page text: dates (via
    find_dates_near / parse_any_date), prize pool, team size, and problem
    statements / tracks.  *platform* is currently unused here but kept for
    interface stability.  Returns a dict with the keys initialised below;
    unmatched fields stay as their defaults.
    """
    result = {
        "registration_deadline": "",
        "submission_deadline": "",
        "result_date": "",
        "start_date": "",
        "end_date": "",
        "prize_pool": "",
        "team_size": {"min": 1, "max": 4},  # default assumption when nothing matches
        "problem_statements": [],
    }

    # ---- DATES ----
    # Registration deadline — keyword order encodes priority (first hit wins).
    result["registration_deadline"] = find_dates_near(body_text, [
        "registration close", "registrations close", "register by",
        "last date to register", "registration deadline", "applications close",
        "apply by", "registration ends", "sign up deadline",
    ])

    # Submission deadline
    result["submission_deadline"] = find_dates_near(body_text, [
        "submission deadline", "submission closes", "submissions close",
        "submit by", "last date to submit", "submission end",
        "final submission", "project submission",
        "deadline",  # generic fallback last
    ])

    # Start date — Devfolio uses "Runs from Mar 25 - 26, 2026".
    # group(1)=month+start-day, group(2)=optional end-day, group(3)=optional year.
    runs_from = re.search(
        r"(?:runs?\s+from|starts?\s+(?:on|from)?|begins?\s+(?:on)?|commences?\s+(?:on)?)\s*[:\-]?\s*"
        r"((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2})"
        r"(?:\s*[-–]\s*(\d{1,2}))?"
        r"(?:[,\s]+(\d{4}))?",
        body_text, re.IGNORECASE
    )
    if runs_from:
        start_text = runs_from.group(1)
        # Missing year defaults to the current year; parse_any_date may bump it.
        year = runs_from.group(3) or str(datetime.now().year)
        result["start_date"] = parse_any_date(f"{start_text} {year}")
        if runs_from.group(2) and runs_from.group(1):
            # End day shares the start month: "Mar 25 - 26" -> "Mar 26".
            month = runs_from.group(1).split()[0]
            result["end_date"] = parse_any_date(f"{month} {runs_from.group(2)} {year}")

    if not result["start_date"]:
        result["start_date"] = find_dates_near(body_text, [
            "start date", "starts on", "begins on", "hackathon starts",
            "event starts", "event date", "dates:",
        ])

    if not result["end_date"]:
        result["end_date"] = find_dates_near(body_text, [
            "end date", "ends on", "hackathon ends", "event ends",
        ])

    # Result date
    # NOTE(review): "result" matches any occurrence of the word, including
    # e.g. "results of the quiz" — verify precision on real pages.
    result["result_date"] = find_dates_near(body_text, [
        "result", "winners announced", "announcement", "winner announcement",
        "results declared", "shortlist",
    ])

    # ---- PRIZE POOL ----
    # Currency-amount patterns: INR symbol/words, USD, EUR, GBP.
    prize_patterns = [
        r"(₹\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
        r"(\$\s*[\d,]+(?:\.\d+)?(?:\s*(?:K|k|M|million|thousand))?)",
        r"(€\s*[\d,]+(?:\.\d+)?)",
        r"(£\s*[\d,]+(?:\.\d+)?)",
        r"(INR\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
        r"(Rs\.?\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
    ]

    # Find prize amounts near keywords like "prize", "reward", "worth", "win"
    prize_lower = body_text.lower()
    for kw in ["prize", "reward", "worth", "winning", "bounty", "in cash", "in prizes"]:
        idx = prize_lower.find(kw)
        if idx == -1:
            continue
        # Search ±200 chars around the keyword occurrence.
        start = max(0, idx - 200)
        chunk = body_text[start:idx + 200]
        for pat in prize_patterns:
            match = re.search(pat, chunk, re.IGNORECASE)
            if match:
                result["prize_pool"] = match.group(1).strip()
                break
        if result["prize_pool"]:
            break

    # Fallback: any currency amount anywhere on the page (case-sensitive here).
    if not result["prize_pool"]:
        for pat in prize_patterns:
            match = re.search(pat, body_text)
            if match:
                result["prize_pool"] = match.group(1).strip()
                break

    # ---- TEAM SIZE ----
    # NOTE(review): "[-–to]+" is a character CLASS (any of '-', '–', 't', 'o'),
    # not the word "to" — it happens to accept "2 to 4" but also odd separators
    # like "2 oo 4"; consider "(?:-|–|to)" if tightening.
    # NOTE(review): the last pattern's "$" only matches end-of-string without
    # re.MULTILINE, so the "2 - 4" FAQ case fires only at the very end of text.
    team_patterns = [
        r"team\s*size[:\s]*(\d+)\s*[-–to]+\s*(\d+)",
        r"(\d+)\s*[-–to]+\s*(\d+)\s*(?:members?|people|participants?|per team)",
        r"teams?\s+of\s+(?:up\s+to\s+)?(\d+)",
        r"max(?:imum)?\s*(?:team)?\s*(?:size)?\s*[:\s]*(\d+)",
        r"(\d+)\s*[-–]\s*(\d+)\s*$",  # in FAQ: "2 - 4"
    ]
    for pat in team_patterns:
        match = re.search(pat, body_text, re.IGNORECASE)
        if match:
            groups = [g for g in match.groups() if g]
            if len(groups) == 2:
                result["team_size"] = {"min": int(groups[0]), "max": int(groups[1])}
            elif len(groups) == 1:
                # Single number => treat as a max, min defaults to 1.
                result["team_size"] = {"min": 1, "max": int(groups[0])}
            break

    # ---- PROBLEM STATEMENTS / TRACKS / DOMAINS ----
    ps = []
    seen_ps = set()  # lowercase titles already collected (dedupe)

    # Pattern 1: "Domains: AI, ML, Web App" (Devfolio style).
    # The negated class stops the capture at a newline or a section-emoji.
    domain_match = re.search(
        r"(?:domains?|themes?|tracks?|categories|verticals|areas?)[:\s]+([^\n💡🏆🎁🎟️📍📅⏳📞🌮]+)",
        body_text, re.IGNORECASE
    )
    if domain_match:
        items = re.split(r"[,•|/]", domain_match.group(1))
        for item in items:
            item = item.strip().rstrip(".")
            if 3 < len(item) < 150 and item.lower() not in seen_ps:
                seen_ps.add(item.lower())
                ps.append({"track": "", "title": item})

    # Pattern 2: Numbered problem statements: "PS1: ...", "Problem Statement 1 - ..."
    for match in re.finditer(
        r"(?:PS|Problem\s*Statement|Theme|Track|Challenge)\s*#?(\d+)\s*[:\-–]\s*(.{5,200})",
        body_text, re.IGNORECASE
    ):
        num = match.group(1)
        title = match.group(2).strip().split("\n")[0]  # first line only
        if title.lower() not in seen_ps and len(title) > 4:
            seen_ps.add(title.lower())
            ps.append({"track": f"Track {num}", "title": title})

    # Pattern 3: Devpost-style theme tags (already in themes list from JS)
    # Pattern 4: Bulleted lists after a "Themes" / "Tracks" heading.
    for match in re.finditer(
        r"(?:themes?|tracks?|problem\s*statements?|challenges?|domains?)\s*[:\n]"
        r"((?:\s*[-•●▸]\s*.{5,200}\n?)+)",
        body_text, re.IGNORECASE
    ):
        items = re.findall(r"[-•●▸]\s*(.{5,200})", match.group(1))
        for item in items:
            item = item.strip().split("\n")[0]
            if item.lower() not in seen_ps and 4 < len(item) < 200:
                seen_ps.add(item.lower())
                ps.append({"track": "", "title": item})

    # Cap to a sane number for the API payload.
    result["problem_statements"] = ps[:20]
    return result
 
 
 
 
 
 
 
 
 
 
 
 
293
 
294
 
295
# ============================================================
# PLAYWRIGHT SCRAPER gets innerText + meta from rendered page
# ============================================================

# JavaScript evaluated in the page context by scrape_with_playwright().
# Returns a plain object: name (h1/meta/title fallbacks), og:image banner,
# description, truncated body innerText for regex parsing, Devpost theme tags,
# sidebar prize text, and de-duplicated resource links (PDF/Drive/Dropbox or
# keyword-relevant anchors).  Runtime string — do not edit casually.
EXTRACT_SCRIPT = """() => {
  const getMeta = (name) => {
    const el = document.querySelector(`meta[property="${name}"], meta[name="${name}"]`);
    return el ? el.getAttribute('content') || '' : '';
  };

  // Name: try multiple selectors
  const nameSelectors = [
    'h1',
    '.hackathon-name', '.event-name', '.challenge-title',
    '#challenge-title', '.opp-title',
  ];
  let name = '';
  for (const sel of nameSelectors) {
    const el = document.querySelector(sel);
    if (el && el.textContent.trim().length > 2) {
      name = el.textContent.trim();
      break;
    }
  }
  name = name || getMeta('og:title') || document.title.split('|')[0].trim();

  // Banner
  const banner = getMeta('og:image') || '';

  // Description
  let description = getMeta('og:description') || getMeta('description') || '';

  // Full page text for parsing
  const bodyText = document.body.innerText;

  // For Devpost: extract themes from tag links
  const themes = [];
  document.querySelectorAll('a[href*="themes"]').forEach(a => {
    const t = a.textContent.trim();
    if (t && t.length > 2 && t.length < 100) themes.push(t);
  });

  // Devpost sidebar prize text
  let sidebarPrize = '';
  document.querySelectorAll('a[href*="prizes"], .prize, [class*="prize"]').forEach(el => {
    const t = el.textContent.trim();
    if (t && t.length > 2) sidebarPrize += t + ' ';
  });

  // Resource links: PDFs, Google Drive, problem statements, rules, guidelines
  const resourceLinks = [];
  const seenHrefs = new Set();
  const linkKeywords = ['problem', 'statement', 'pdf', 'rule', 'guideline', 'brochure', 'document', 'brief', 'challenge', 'track', 'theme', 'schedule', 'timeline'];
  document.querySelectorAll('a[href]').forEach(a => {
    const href = a.href || '';
    const text = a.textContent.trim();
    const hrefLower = href.toLowerCase();
    const textLower = text.toLowerCase();
    if (seenHrefs.has(href) || !href || href === '#') return;

    const isPdf = hrefLower.endsWith('.pdf') || hrefLower.includes('/pdf');
    const isDrive = hrefLower.includes('drive.google.com') || hrefLower.includes('docs.google.com');
    const isDropbox = hrefLower.includes('dropbox.com');
    const isRelevant = linkKeywords.some(kw => textLower.includes(kw) || hrefLower.includes(kw));

    if (isPdf || isDrive || isDropbox || isRelevant) {
      seenHrefs.add(href);
      resourceLinks.push({
        text: text.substring(0, 150) || 'Document',
        url: href,
        type: isPdf ? 'pdf' : isDrive ? 'google_drive' : isDropbox ? 'dropbox' : 'link',
      });
    }
  });

  return {
    name: name.substring(0, 200),
    description: description.substring(0, 2000),
    banner_url: banner,
    bodyText: bodyText.substring(0, 30000),
    themes: themes,
    sidebarPrize: sidebarPrize.trim(),
    resourceLinks: resourceLinks.slice(0, 30),
  };
}"""
380
 
381
 
382
@app.on_event("startup")
async def startup() -> None:
    """Launch the shared headless Chromium once when the service boots."""
    global playwright, browser
    # Imported lazily so importing this module never requires Playwright
    # to be installed until the app actually starts.
    from playwright.async_api import async_playwright

    launch_args = ["--no-sandbox", "--disable-setuid-sandbox"]
    playwright = await async_playwright().start()
    browser = await playwright.chromium.launch(headless=True, args=launch_args)
    print("[Scraper] Playwright browser initialized")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
393
 
394
 
395
@app.on_event("shutdown")
async def shutdown() -> None:
    """Tear down the shared browser and Playwright driver on service exit."""
    global playwright, browser

    # Clear each global even if its close/stop call raises, so a failed
    # shutdown never leaves a dangling reference behind.
    try:
        if browser is not None:
            await browser.close()
            print("[Scraper] Browser closed")
    finally:
        browser = None

    try:
        if playwright is not None:
            await playwright.stop()
            print("[Scraper] Playwright stopped")
    finally:
        playwright = None
 
413
async def scrape_with_playwright(url: str, platform: str) -> dict:
    """Scrape using Playwright — renders JS, grabs full innerText for parsing.

    Flow: new context -> goto -> fixed render wait -> scroll passes to trigger
    lazy content -> evaluate EXTRACT_SCRIPT -> regex-parse the innerText via
    extract_all_from_text().  On any failure returns
    ``{"scrape_success": False, "error": ...}`` instead of raising, so the
    endpoint can always build a ScrapeResponse.
    """
    global browser

    try:
        if browser is None:
            # startup() never ran (or failed) — report instead of crashing.
            return {
                "scrape_success": False,
                "error": "Browser is not initialized. Service startup failed.",
            }

        # Fresh context per request: isolated cookies/cache, desktop UA.
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
            viewport={"width": 1920, "height": 1080},
        )

        try:
            page = await context.new_page()

            print(f"[Scraper] Navigating to {url} (platform: {platform})")
            await page.goto(url, wait_until="domcontentloaded", timeout=20000)

            # Wait for JS rendering — longer for heavy SPAs (Unstop).
            wait_time = 8 if platform in ("Unstop",) else 5
            print(f"[Scraper] Waiting {wait_time}s for JS rendering...")
            await page.wait_for_timeout(wait_time * 1000)

            # Scroll in thirds to trigger lazy-loaded content, then return to top.
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight / 3)")
            await asyncio.sleep(1)
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight * 2 / 3)")
            await asyncio.sleep(1)
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await asyncio.sleep(1)
            await page.evaluate("window.scrollTo(0, 0)")
            await asyncio.sleep(0.5)

            # Extract structured fields + raw innerText in one page.evaluate.
            data = await page.evaluate(EXTRACT_SCRIPT)

            body_text = data.get("bodyText", "")
            name = data.get("name", "")
            description = data.get("description", "")

            print(f"[Scraper] Extracted name: '{name}', bodyText length: {len(body_text)}")

            # Parse dates/prize/team/tracks from the full innerText.
            extracted = extract_all_from_text(body_text, platform)

            # Devpost theme tags fill in problem_statements only when the
            # text-based extraction found nothing.
            themes = data.get("themes", [])
            if themes and not extracted["problem_statements"]:
                seen = set()
                for t in themes:
                    if t.lower() not in seen:
                        seen.add(t.lower())
                        extracted["problem_statements"].append({"track": "Theme", "title": t})

            # Sidebar prize fallback (Devpost): pull a $/₹ amount out of the
            # sidebar text; failing that, keep a truncated raw snippet.
            if not extracted["prize_pool"] and data.get("sidebarPrize"):
                prize_text = data["sidebarPrize"]
                for pat in [r"(\$[\d,]+(?:\.\d+)?(?:\s*(?:K|k|M))?)", r"(₹[\d,]+)"]:
                    m = re.search(pat, prize_text)
                    if m:
                        extracted["prize_pool"] = m.group(1)
                        break
                if not extracted["prize_pool"]:
                    extracted["prize_pool"] = prize_text[:100]

            # Success requires at least a plausible (>2 char) event name.
            return {
                "name": name,
                "description": description,
                "banner_url": data.get("banner_url", ""),
                "scrape_success": bool(name and len(name) > 2),
                "resource_links": data.get("resourceLinks", []),
                **extracted,
            }
        finally:
            # Always release the per-request context (and its pages).
            await context.close()

    except Exception as e:
        print(f"[Scraper] Error: {e}")
        import traceback
        traceback.print_exc()
        return {"scrape_success": False, "error": str(e)}
 
 
497
 
498
 
499
  # ============================================================
500
+ # API ROUTES
501
  # ============================================================
502
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
503
@app.get("/")
async def root():
    """Liveness endpoint: confirms the scraper service is up."""
    payload = {"status": "ok", "service": "HackTrack Scraper v3"}
    return payload
 
 
 
 
 
 
 
 
 
506
 
507
 
508
  @app.post("/scrape", response_model=ScrapeResponse)
509
  async def scrape(request: ScrapeRequest):
510
  url = request.url.strip()
511
  platform = detect_platform(url)
512
+ print(f"\n[Scraper] === New scrape request: {url} (platform={platform}) ===")
513
+
514
  try:
515
  data = await scrape_with_playwright(url, platform)
516
+
517
+ response = ScrapeResponse(
518
  name=data.get("name", ""),
519
  platform=platform,
520
  banner_url=data.get("banner_url", ""),
 
531
  scrape_success=data.get("scrape_success", False),
532
  url=url,
533
  )
534
+
535
+ print(f"[Scraper] Result: name='{response.name}', dates=({response.start_date}, {response.end_date}, reg={response.registration_deadline}, sub={response.submission_deadline}), prize='{response.prize_pool}', team={response.team_size}, ps={len(response.problem_statements)}")
536
+ return response
537
+
538
  except Exception as e:
539
  print(f"[Scraper] Endpoint error: {e}")
540
+ return ScrapeResponse(platform=platform, url=url, scrape_success=False)