Rudraaaa76 committed
Commit 6d8a8d2 · verified · 1 Parent(s): f3449f2

Update app.py

Files changed (1): app.py +248 -516

app.py CHANGED
@@ -1,43 +1,20 @@
-"""
-HackTrack Scraper v4.0
-- Groq LLM (llama-3.3-70b-versatile) for intelligent extraction
-- Platforms: Devfolio, Unstop, Devpost, DoraHacks, MLH
-"""
-
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field
 import asyncio
 import re
 import sys
-import os
-import json
 from urllib.parse import urlparse
 from typing import List
 from datetime import datetime
 
-# Groq client — free tier, llama-3.3-70b
-from groq import Groq
-
 if sys.platform == "win32":
     asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
 
-app = FastAPI(title="HackTrack Scraper", version="4.0.0")
-
-# ── Groq setup ────────────────────────────────────────────────────────────────
-_groq_client: Groq | None = None
-
-def get_groq() -> Groq | None:
-    global _groq_client
-    if _groq_client is None:
-        key = os.environ.get("GROQ_API_KEY", "")
-        if key:
-            _groq_client = Groq(api_key=key)
-    return _groq_client
-
-GROQ_MODEL = "llama-3.3-70b-versatile"
 
-# ── Global Playwright runtime ─────────────────────────────────────────────────
 playwright = None
 browser = None

@@ -49,10 +26,6 @@ app.add_middleware(
 )
 
 
-# ══════════════════════════════════════════════════════════════════════════════
-# MODELS
-# ══════════════════════════════════════════════════════════════════════════════
-
 class ScrapeRequest(BaseModel):
     url: str

@@ -73,50 +46,55 @@ class ScrapeResponse(BaseModel):
     resource_links: List[dict] = Field(default_factory=list)
     scrape_success: bool = False
     url: str = ""
-    llm_used: bool = False  # tells frontend whether Groq enriched this
-
 
-# ══════════════════════════════════════════════════════════════════════════════
-# PLATFORM DETECTION
-# ══════════════════════════════════════════════════════════════════════════════
 
 def detect_platform(url: str) -> str:
     domain = urlparse(url).netloc.lower()
-    if "devfolio" in domain: return "Devfolio"
-    if "unstop" in domain: return "Unstop"
-    if "devpost" in domain: return "Devpost"
-    if "dorahacks" in domain: return "DoraHacks"
-    if "mlh.io" in domain: return "MLH"
-    if "hackerearth" in domain: return "HackerEarth"
-    if "hackerrank" in domain: return "HackerRank"
     return "Other"
 
 
-# ══════════════════════════════════════════════════════════════════════════════
-# DATE PARSING
-# ══════════════════════════════════════════════════════════════════════════════
 
 DATE_FORMATS = [
     "%Y-%m-%d", "%Y/%m/%d",
     "%d %B %Y", "%d %b %Y", "%d %B, %Y", "%d %b, %Y",
     "%B %d, %Y", "%b %d, %Y", "%B %d %Y", "%b %d %Y",
-    "%m/%d/%Y", "%d/%m/%Y",
-    "%B %d", "%b %d",
 ]
 
 
 def parse_any_date(text: str, fallback_year: int = None) -> str:
     if not text:
         return ""
     text = text.strip()
     text = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", text)
     text = re.sub(r"\s+", " ", text)
-    fallback_year = fallback_year or datetime.now().year
 
     for fmt in DATE_FORMATS:
         try:
             dt = datetime.strptime(text, fmt)
-            if dt.year == 1900:
                 dt = dt.replace(year=fallback_year)
                 if dt < datetime.now():
                     dt = dt.replace(year=fallback_year + 1)
@@ -127,8 +105,9 @@ def parse_any_date(text: str, fallback_year: int = None) -> str:
 
 
 def find_dates_near(text: str, keywords: List[str], window: int = 400) -> str:
     lower = text.lower()
-    patterns = [
         r"(\d{4}[-/]\d{1,2}[-/]\d{1,2})",
         r"(\d{1,2}\s+(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)[,]?\s+\d{4})",
         r"((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?[,]?\s*\d{0,4})",
@@ -138,87 +117,22 @@ def find_dates_near(text: str, keywords: List[str], window: int = 400) -> str:
         idx = lower.find(kw.lower())
         if idx == -1:
             continue
-        chunk = text[idx: idx + window]
-        for pat in patterns:
-            m = re.search(pat, chunk, re.IGNORECASE)
-            if m:
-                parsed = parse_any_date(m.group(1))
                 if parsed:
                     return parsed
     return ""
 
 
-# ══════════════════════════════════════════════════════════════════════════════
-# GROQ LLM EXTRACTION (single call, returns full structured dict)
-# ══════════════════════════════════════════════════════════════════════════════
-
-def groq_extract(body_text: str, platform: str) -> dict | None:
-    """
-    One Groq call extracts ALL fields at once.
-    Returns None if Groq is unavailable or call fails.
-    """
-    client = get_groq()
-    if not client:
-        return None
-
-    # Trim to ~5000 chars to stay within token limits comfortably
-    excerpt = body_text[:5000]
-
-    prompt = f"""You are extracting structured data from a hackathon page ({platform}).
-
-Return ONLY valid JSON — no markdown, no explanation.
-
-Schema:
-{{
-  "registration_deadline": "YYYY-MM-DD or empty string",
-  "submission_deadline": "YYYY-MM-DD or empty string",
-  "result_date": "YYYY-MM-DD or empty string",
-  "start_date": "YYYY-MM-DD or empty string",
-  "end_date": "YYYY-MM-DD or empty string",
-  "prize_pool": "raw string like ₹5,00,000 or $10,000 or empty string",
-  "team_size": {{"min": 1, "max": 4}},
-  "problem_statements": [
-    {{"track": "optional track label", "title": "PS or theme title"}}
-  ]
-}}
-
-Rules:
-- Dates: assume year {datetime.now().year} if missing; use YYYY-MM-DD format.
-- prize_pool: keep original currency symbol and denomination text (₹2 Lakh, $10K, etc.).
-- team_size: extract min/max members. Default {{"min":1,"max":4}} if not found.
-- problem_statements: list every unique track/theme/PS. Max 20 items.
-- If a field is not found, use "" or [] or the default value shown.
-
-Page text:
-{excerpt}"""
-
-    try:
-        resp = client.chat.completions.create(
-            model=GROQ_MODEL,
-            max_tokens=1200,
-            temperature=0.05,
-            messages=[
-                {
-                    "role": "system",
-                    "content": "You extract structured hackathon data. Respond with valid JSON only.",
-                },
-                {"role": "user", "content": prompt},
-            ],
-        )
-        raw = resp.choices[0].message.content.strip()
-        # Strip markdown fences if model wraps output
-        raw = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip()
-        return json.loads(raw)
-    except Exception as e:
-        print(f"[Groq] extraction failed: {e}")
-        return None
-
-
-# ══════════════════════════════════════════════════════════════════════════════
-# REGEX FALLBACK EXTRACTION (same logic as v3, kept as safety net)
-# ══════════════════════════════════════════════════════════════════════════════
 
-def regex_extract(body_text: str, platform: str) -> dict:
     result = {
         "registration_deadline": "",
         "submission_deadline": "",
@@ -230,46 +144,56 @@ def regex_extract(body_text: str, platform: str) -> dict:
         "problem_statements": [],
     }
 
-    # Dates
     result["registration_deadline"] = find_dates_near(body_text, [
         "registration close", "registrations close", "register by",
         "last date to register", "registration deadline", "applications close",
         "apply by", "registration ends", "sign up deadline",
     ])
     result["submission_deadline"] = find_dates_near(body_text, [
         "submission deadline", "submission closes", "submissions close",
         "submit by", "last date to submit", "submission end",
-        "final submission", "project submission", "deadline",
     ])
 
-    # "Runs from Mar 25 - 26, 2026"
     runs_from = re.search(
         r"(?:runs?\s+from|starts?\s+(?:on|from)?|begins?\s+(?:on)?|commences?\s+(?:on)?)\s*[:\-]?\s*"
-        r"((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|"
-        r"Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2})"
-        r"(?:\s*[-–]\s*(\d{1,2}))?(?:[,\s]+(\d{4}))?",
-        body_text, re.IGNORECASE,
     )
     if runs_from:
         year = runs_from.group(3) or str(datetime.now().year)
-        result["start_date"] = parse_any_date(f"{runs_from.group(1)} {year}")
-        if runs_from.group(2):
             month = runs_from.group(1).split()[0]
             result["end_date"] = parse_any_date(f"{month} {runs_from.group(2)} {year}")
 
     if not result["start_date"]:
         result["start_date"] = find_dates_near(body_text, [
-            "start date", "starts on", "begins on", "hackathon starts", "event starts",
         ])
     if not result["end_date"]:
         result["end_date"] = find_dates_near(body_text, [
             "end date", "ends on", "hackathon ends", "event ends",
         ])
     result["result_date"] = find_dates_near(body_text, [
-        "result", "winners announced", "announcement", "results declared",
     ])
 
-    # Prize
     prize_patterns = [
         r"(₹\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
         r"(\$\s*[\d,]+(?:\.\d+)?(?:\s*(?:K|k|M|million|thousand))?)",
@@ -278,242 +202,161 @@ def regex_extract(body_text: str, platform: str) -> dict:
         r"(INR\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
         r"(Rs\.?\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
     ]
     prize_lower = body_text.lower()
     for kw in ["prize", "reward", "worth", "winning", "bounty", "in cash", "in prizes"]:
         idx = prize_lower.find(kw)
         if idx == -1:
             continue
-        chunk = body_text[max(0, idx - 200): idx + 200]
         for pat in prize_patterns:
-            m = re.search(pat, chunk, re.IGNORECASE)
-            if m:
-                result["prize_pool"] = m.group(1).strip()
                 break
         if result["prize_pool"]:
             break
     if not result["prize_pool"]:
         for pat in prize_patterns:
-            m = re.search(pat, body_text)
-            if m:
-                result["prize_pool"] = m.group(1).strip()
                 break
 
-    # Team size
-    for pat in [
         r"team\s*size[:\s]*(\d+)\s*[-–to]+\s*(\d+)",
         r"(\d+)\s*[-–to]+\s*(\d+)\s*(?:members?|people|participants?|per team)",
         r"teams?\s+of\s+(?:up\s+to\s+)?(\d+)",
         r"max(?:imum)?\s*(?:team)?\s*(?:size)?\s*[:\s]*(\d+)",
-    ]:
-        m = re.search(pat, body_text, re.IGNORECASE)
-        if m:
-            groups = [g for g in m.groups() if g]
-            result["team_size"] = (
-                {"min": int(groups[0]), "max": int(groups[1])} if len(groups) == 2
-                else {"min": 1, "max": int(groups[0])}
-            )
             break
 
-    # Problem statements
-    ps, seen = [], set()
 
-    domain_m = re.search(
         r"(?:domains?|themes?|tracks?|categories|verticals|areas?)[:\s]+([^\n💡🏆🎁🎟️📍📅⏳📞🌮]+)",
-        body_text, re.IGNORECASE,
     )
-    if domain_m:
-        for item in re.split(r"[,•|/]", domain_m.group(1)):
             item = item.strip().rstrip(".")
-            if 3 < len(item) < 150 and item.lower() not in seen:
-                seen.add(item.lower())
                 ps.append({"track": "", "title": item})
 
-    for m in re.finditer(
         r"(?:PS|Problem\s*Statement|Theme|Track|Challenge)\s*#?(\d+)\s*[:\-–]\s*(.{5,200})",
-        body_text, re.IGNORECASE,
     ):
-        title = m.group(2).strip().split("\n")[0]
-        if title.lower() not in seen and len(title) > 4:
-            seen.add(title.lower())
-            ps.append({"track": f"Track {m.group(1)}", "title": title})
-
-    for m in re.finditer(
         r"(?:themes?|tracks?|problem\s*statements?|challenges?|domains?)\s*[:\n]"
         r"((?:\s*[-•●▸]\s*.{5,200}\n?)+)",
-        body_text, re.IGNORECASE,
     ):
-        for item in re.findall(r"[-•●▸]\s*(.{5,200})", m.group(1)):
             item = item.strip().split("\n")[0]
-            if item.lower() not in seen and 4 < len(item) < 200:
-                seen.add(item.lower())
                 ps.append({"track": "", "title": item})
 
     result["problem_statements"] = ps[:20]
     return result
 
 
-# ══════════════════════════════════════════════════════════════════════════════
-# MERGE: LLM results take precedence, regex fills gaps
-# ══════════════════════════════════════════════════════════════════════════════
-
-def merge_results(llm: dict | None, regex: dict) -> tuple[dict, bool]:
-    """
-    Prefer LLM values; fall back to regex for any blank field.
-    Returns (merged_dict, llm_was_used).
-    """
-    if llm is None:
-        return regex, False
-
-    merged = {}
-    date_fields = [
-        "registration_deadline", "submission_deadline",
-        "result_date", "start_date", "end_date",
-    ]
-    for f in date_fields:
-        merged[f] = llm.get(f) or regex.get(f, "")
-
-    merged["prize_pool"] = llm.get("prize_pool") or regex.get("prize_pool", "")
-
-    # team_size: use LLM unless it's the bare default and regex found something
-    llm_ts = llm.get("team_size", {"min": 1, "max": 4})
-    regex_ts = regex.get("team_size", {"min": 1, "max": 4})
-    if llm_ts == {"min": 1, "max": 4} and regex_ts != {"min": 1, "max": 4}:
-        merged["team_size"] = regex_ts
-    else:
-        merged["team_size"] = llm_ts
-
-    # problem_statements: prefer LLM list; fall back to regex
-    llm_ps = llm.get("problem_statements", [])
-    regex_ps = regex.get("problem_statements", [])
-    merged["problem_statements"] = llm_ps if llm_ps else regex_ps
-
-    return merged, True
-
 
-# ══════════════════════════════════════════════════════════════════════════════
-# PLATFORM-SPECIFIC JS EXTRACTION SCRIPTS
-# ══════════════════════════════════════════════════════════════════════════════
-
-# Generic script — works for Devpost, Unstop, DoraHacks, Other
-GENERIC_EXTRACT_SCRIPT = """() => {
     const getMeta = (name) => {
         const el = document.querySelector(`meta[property="${name}"], meta[name="${name}"]`);
         return el ? el.getAttribute('content') || '' : '';
     };
     const nameSelectors = [
-        'h1', '.hackathon-name', '.event-name', '.challenge-title',
-        '#challenge-title', '.opp-title', '[class*="hackathon-title"]',
-        '[class*="event-title"]', '[class*="challenge-name"]',
     ];
     let name = '';
     for (const sel of nameSelectors) {
         const el = document.querySelector(sel);
         if (el && el.textContent.trim().length > 2) {
-            name = el.textContent.trim(); break;
         }
     }
     name = name || getMeta('og:title') || document.title.split('|')[0].trim();
 
     const banner = getMeta('og:image') || '';
     let description = getMeta('og:description') || getMeta('description') || '';
     const bodyText = document.body.innerText;
 
-    // Devpost themes
     const themes = [];
     document.querySelectorAll('a[href*="themes"]').forEach(a => {
         const t = a.textContent.trim();
         if (t && t.length > 2 && t.length < 100) themes.push(t);
     });
 
-    // Prize sidebar (Devpost)
     let sidebarPrize = '';
     document.querySelectorAll('a[href*="prizes"], .prize, [class*="prize"]').forEach(el => {
         const t = el.textContent.trim();
         if (t && t.length > 2) sidebarPrize += t + ' ';
     });
 
-    // Resource links
     const resourceLinks = [];
     const seenHrefs = new Set();
-    const kws = ['problem','statement','pdf','rule','guideline','brochure',
-                 'document','brief','challenge','track','theme','schedule','timeline'];
     document.querySelectorAll('a[href]').forEach(a => {
         const href = a.href || '';
         const text = a.textContent.trim();
-        const hl = href.toLowerCase(), tl = text.toLowerCase();
         if (seenHrefs.has(href) || !href || href === '#') return;
-        const isPdf = hl.endsWith('.pdf') || hl.includes('/pdf');
-        const isDrive = hl.includes('drive.google.com') || hl.includes('docs.google.com');
-        const isDropbox = hl.includes('dropbox.com');
-        const isRelevant = kws.some(kw => tl.includes(kw) || hl.includes(kw));
-        if (isPdf || isDrive || isDropbox || isRelevant) {
-            seenHrefs.add(href);
-            resourceLinks.push({
-                text: text.substring(0, 150) || 'Document',
-                url: href,
-                type: isPdf ? 'pdf' : isDrive ? 'google_drive' : isDropbox ? 'dropbox' : 'link',
-            });
-        }
-    });
-
-    return {
-        name: name.substring(0, 200),
-        description: description.substring(0, 2000),
-        banner_url: banner,
-        bodyText: bodyText.substring(0, 30000),
-        themes,
-        sidebarPrize: sidebarPrize.trim(),
-        resourceLinks: resourceLinks.slice(0, 30),
-    };
-}"""
-
-
-# Devfolio-specific: clicks "About" tab to expose full description + dates
-DEVFOLIO_EXTRACT_SCRIPT = """async () => {
-    // Try clicking the About/Overview tab if present
-    const tabSelectors = ['a[href*="about"]', 'button[aria-label*="about" i]',
-                          '[role="tab"]', 'nav a'];
-    for (const sel of tabSelectors) {
-        const tabs = document.querySelectorAll(sel);
-        for (const tab of tabs) {
-            if (/about|overview/i.test(tab.textContent)) {
-                tab.click();
-                await new Promise(r => setTimeout(r, 1000));
-                break;
-            }
-        }
-    }
 
-    const getMeta = (name) => {
-        const el = document.querySelector(`meta[property="${name}"], meta[name="${name}"]`);
-        return el ? el.getAttribute('content') || '' : '';
-    };
-
-    let name = '';
-    for (const sel of ['h1', '.sc-hackathon-title', '[class*="title"]']) {
-        const el = document.querySelector(sel);
-        if (el && el.textContent.trim().length > 2) { name = el.textContent.trim(); break; }
-    }
-    name = name || getMeta('og:title') || document.title.split('|')[0].trim();
-
-    const bodyText = document.body.innerText;
-    const banner = getMeta('og:image') || '';
-    const description = getMeta('og:description') || getMeta('description') || '';
-
-    // Resource links
-    const resourceLinks = [];
-    const seenHrefs = new Set();
-    const kws = ['problem','statement','pdf','rule','guideline','brochure',
-                 'document','brief','challenge','track','theme','schedule','timeline'];
-    document.querySelectorAll('a[href]').forEach(a => {
-        const href = a.href || '';
-        const text = a.textContent.trim();
-        const hl = href.toLowerCase(), tl = text.toLowerCase();
-        if (seenHrefs.has(href) || !href || href === '#') return;
-        const isPdf = hl.endsWith('.pdf') || hl.includes('/pdf');
-        const isDrive = hl.includes('drive.google.com') || hl.includes('docs.google.com');
-        const isDropbox = hl.includes('dropbox.com');
-        const isRelevant = kws.some(kw => tl.includes(kw) || hl.includes(kw));
         if (isPdf || isDrive || isDropbox || isRelevant) {
             seenHrefs.add(href);
             resourceLinks.push({
@@ -529,255 +372,148 @@ DEVFOLIO_EXTRACT_SCRIPT = """async () => {
         description: description.substring(0, 2000),
         banner_url: banner,
         bodyText: bodyText.substring(0, 30000),
-        themes: [],
-        sidebarPrize: '',
         resourceLinks: resourceLinks.slice(0, 30),
     };
 }"""
 
 
-# MLH: static listing — we grab individual event pages
-MLH_EXTRACT_SCRIPT = """() => {
-    const getMeta = (name) => {
-        const el = document.querySelector(`meta[property="${name}"], meta[name="${name}"]`);
-        return el ? el.getAttribute('content') || '' : '';
-    };
-    let name = getMeta('og:title') || document.title.split('|')[0].trim();
-    const banner = getMeta('og:image') || '';
-    const description = getMeta('og:description') || '';
-    const bodyText = document.body.innerText;
-
-    const resourceLinks = [];
-    const seenHrefs = new Set();
-    document.querySelectorAll('a[href]').forEach(a => {
-        const href = a.href || '';
-        const text = a.textContent.trim();
-        const hl = href.toLowerCase();
-        if (seenHrefs.has(href) || !href || href === '#') return;
-        if (hl.endsWith('.pdf') || hl.includes('drive.google.com')) {
-            seenHrefs.add(href);
-            resourceLinks.push({ text: text.substring(0, 150) || 'Document', url: href, type: 'pdf' });
-        }
-    });
-
-    return {
-        name: name.substring(0, 200),
-        description: description.substring(0, 2000),
-        banner_url: banner,
-        bodyText: bodyText.substring(0, 30000),
-        themes: [],
-        sidebarPrize: '',
-        resourceLinks: resourceLinks.slice(0, 20),
-    };
-}"""
-
-
-def get_extract_script(platform: str) -> str:
-    if platform == "Devfolio":
-        return DEVFOLIO_EXTRACT_SCRIPT
-    if platform == "MLH":
-        return MLH_EXTRACT_SCRIPT
-    return GENERIC_EXTRACT_SCRIPT
-
-
-# ══════════════════════════════════════════════════════════════════════════════
-# PLAYWRIGHT SCRAPER
-# ══════════════════════════════════════════════════════════════════════════════
-
-EMPTY_DATA = {
-    "name": "", "description": "", "banner_url": "",
-    "bodyText": "", "themes": [], "sidebarPrize": "", "resourceLinks": [],
-}
-
-
-async def safe_evaluate(page, script: str, fallback_script: str = None) -> dict:
-    """
-    Evaluate JS safely with retry on 'Execution context was destroyed'
-    caused by Devfolio /overview redirecting to / mid-scrape.
-    """
-    for attempt in range(3):
-        try:
-            try:
-                await page.wait_for_load_state("networkidle", timeout=8000)
-            except Exception:
-                pass
-            return await page.evaluate(script)
-        except Exception as e:
-            err = str(e)
-            print(f"[Scraper] evaluate attempt {attempt + 1} failed: {err[:150]}")
-            if "Execution context was destroyed" in err or "Frame was detached" in err:
-                print("[Scraper] Redirect detected — waiting for page to settle...")
-                try:
-                    await page.wait_for_load_state("domcontentloaded", timeout=12000)
-                    await asyncio.sleep(2)
-                except Exception:
-                    await asyncio.sleep(3)
-                continue
-            if fallback_script and attempt == 1:
-                print("[Scraper] Switching to generic fallback script...")
-                script = fallback_script
-                continue
-            break
-    print("[Scraper] All evaluate attempts exhausted — returning empty data")
-    return EMPTY_DATA
-
-
-async def scrape_with_playwright(url: str, platform: str) -> dict:
-    global browser
-    if browser is None:
-        return {"scrape_success": False, "error": "Browser not initialized"}
-
-    context = await browser.new_context(
-        user_agent=(
-            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-            "AppleWebKit/537.36 (KHTML, like Gecko) "
-            "Chrome/125.0.0.0 Safari/537.36"
-        ),
-        viewport={"width": 1920, "height": 1080},
-    )
-
-    try:
-        page = await context.new_page()
-        print(f"[Scraper] → {url} (platform={platform})")
-
-        # Devfolio /overview redirects to / — wait for "load" so the redirect
-        # finishes before we evaluate JS.
-        wait_until = "load" if platform in ("Devfolio", "MLH") else "domcontentloaded"
-        try:
-            await page.goto(url, wait_until=wait_until, timeout=30000)
-        except Exception as e:
-            if "Timeout" in str(e):
-                print(f"[Scraper] goto timeout ({wait_until}) — proceeding anyway")
-            else:
-                raise
-
-        wait_map = {"Unstop": 9, "DoraHacks": 8, "Devfolio": 8, "MLH": 4}
-        wait_sec = wait_map.get(platform, 5)
-        print(f"[Scraper] Waiting {wait_sec}s for JS rendering...")
-        await page.wait_for_timeout(wait_sec * 1000)
-
-        for frac in [0.33, 0.66, 1.0, 0.0]:
-            try:
-                await page.evaluate(f"window.scrollTo(0, document.body.scrollHeight * {frac})")
-            except Exception:
-                pass
-            await asyncio.sleep(0.6)
-
-        await asyncio.sleep(1.0)
-
-        primary_script = get_extract_script(platform)
-        fallback_script = GENERIC_EXTRACT_SCRIPT if primary_script != GENERIC_EXTRACT_SCRIPT else None
-        data = await safe_evaluate(page, primary_script, fallback_script)
-
-        body_text = data.get("bodyText", "")
-        print(f"[Scraper] bodyText={len(body_text)} chars, name='{data.get('name','')}'")
-
-        # ── Extraction pipeline ───────────────────────────────────────────────
-        # 1. Regex extraction (fast, always runs)
-        regex_result = regex_extract(body_text, platform)
-
-        # 2. Groq LLM extraction (slower, enriches results)
-        llm_result = groq_extract(body_text, platform)
-
-        # 3. Merge: LLM wins, regex fills gaps
-        merged, llm_used = merge_results(llm_result, regex_result)
-
-        # 4. Platform-specific post-processing
-        # Devpost: inject sidebar themes if PS list is empty
-        themes = data.get("themes", [])
-        if themes and not merged["problem_statements"]:
-            seen = set()
-            for t in themes:
-                if t.lower() not in seen:
-                    seen.add(t.lower())
-                    merged["problem_statements"].append({"track": "Theme", "title": t})
-
-        # Devpost: sidebar prize fallback
-        sidebar_prize = data.get("sidebarPrize", "")
-        if not merged["prize_pool"] and sidebar_prize:
-            for pat in [r"(\$[\d,]+(?:\.\d+)?(?:\s*(?:K|k|M))?)", r"(₹[\d,]+)"]:
-                m = re.search(pat, sidebar_prize)
-                if m:
-                    merged["prize_pool"] = m.group(1)
-                    break
-            if not merged["prize_pool"]:
-                merged["prize_pool"] = sidebar_prize[:100]
-
-        return {
-            "name": data.get("name", ""),
-            "description": data.get("description", ""),
-            "banner_url": data.get("banner_url", ""),
-            "resource_links": data.get("resourceLinks", []),
-            "scrape_success": bool(data.get("name") and len(data.get("name", "")) > 2),
-            "llm_used": llm_used,
-            **merged,
-        }
-
-    except Exception as e:
-        import traceback
-        traceback.print_exc()
-        return {"scrape_success": False, "error": str(e)}
-    finally:
-        await context.close()
-
-
-# ══════════════════════════════════════════════════════════════════════════════
-# APP LIFECYCLE
-# ══════════════════════════════════════════════════════════════════════════════
-
 @app.on_event("startup")
 async def startup() -> None:
     global playwright, browser
     from playwright.async_api import async_playwright
     playwright = await async_playwright().start()
     browser = await playwright.chromium.launch(
         headless=True,
-        args=["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"],
     )
-    groq_ready = "✓" if get_groq() else " (set GROQ_API_KEY for LLM enrichment)"
-    print(f"[Scraper] Playwright ready. Groq={groq_ready}")
 
 
 @app.on_event("shutdown")
 async def shutdown() -> None:
     global playwright, browser
     try:
-        if browser: await browser.close()
     finally:
         browser = None
     try:
-        if playwright: await playwright.stop()
     finally:
         playwright = None
 
 
-# ══════════════════════════════════════════════════════════════════════════════
-# ROUTES
-# ══════════════════════════════════════════════════════════════════════════════
 
-@app.get("/")
-async def root():
-    return {
-        "status": "ok",
-        "service": "HackTrack Scraper v4",
-        "groq_enabled": get_groq() is not None,
-        "platforms": ["Devfolio", "Devpost", "Unstop", "DoraHacks", "MLH", "HackerEarth", "HackerRank"],
-    }
 
-@app.get("/health")
-async def health():
-    return {"status": "ok", "timestamp": datetime.utcnow().isoformat()}
 
 
 @app.post("/scrape", response_model=ScrapeResponse)
 async def scrape(request: ScrapeRequest):
     url = request.url.strip()
     platform = detect_platform(url)
-    print(f"\n[Scraper] === {url} platform={platform} ===")
 
     try:
         data = await scrape_with_playwright(url, platform)
         response = ScrapeResponse(
             name=data.get("name", ""),
             platform=platform,
@@ -794,15 +530,11 @@ async def scrape(request: ScrapeRequest):
             resource_links=data.get("resource_links", []),
             scrape_success=data.get("scrape_success", False),
             url=url,
-            llm_used=data.get("llm_used", False),
-        )
-        print(
-            f"[Scraper] Done: name='{response.name}' "
-            f"reg={response.registration_deadline} sub={response.submission_deadline} "
-            f"prize='{response.prize_pool}' ps={len(response.problem_statements)} "
-            f"llm={response.llm_used}"
         )
         return response
     except Exception as e:
         print(f"[Scraper] Endpoint error: {e}")
-        return ScrapeResponse(platform=platform, url=url, scrape_success=False)
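The central behavior this commit removes is the LLM-over-regex precedence implemented in merge_results above. A small worked illustration of that precedence, with hypothetical field values that are not from any real scrape:

# Hypothetical inputs to the removed merge_results: LLM values win,
# and regex fills any field the LLM left blank.
llm = {"registration_deadline": "2026-03-15", "prize_pool": ""}
regex = {"registration_deadline": "2026-03-10", "prize_pool": "$10K"}
# merged["registration_deadline"] -> "2026-03-15"  (LLM value non-empty, wins)
# merged["prize_pool"]            -> "$10K"        (LLM value blank, regex fills the gap)

What follows is the new contents of app.py after the commit, with added lines marked +.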
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field
 import asyncio
 import re
 import sys
 from urllib.parse import urlparse
 from typing import List
 from datetime import datetime
 
 if sys.platform == "win32":
+    # Playwright launches a driver subprocess; Proactor loop supports subprocess APIs on Windows.
     asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
 
+app = FastAPI(title="HackTrack Scraper", version="3.0.0")
 
+# Global Playwright runtime objects reused across requests.
 playwright = None
 browser = None
 
 )
 
 
 class ScrapeRequest(BaseModel):
     url: str
 
     resource_links: List[dict] = Field(default_factory=list)
     scrape_success: bool = False
     url: str = ""
 
 
 def detect_platform(url: str) -> str:
     domain = urlparse(url).netloc.lower()
+    if "devfolio" in domain:
+        return "Devfolio"
+    elif "unstop" in domain:
+        return "Unstop"
+    elif "devpost" in domain:
+        return "Devpost"
+    elif "dorahacks" in domain:
+        return "DoraHacks"
     return "Other"
 
 
+# ============================================================
+# DATE PARSING — robust multi-format
+# ============================================================
+MONTH_MAP = {
+    "jan": 1, "january": 1, "feb": 2, "february": 2, "mar": 3, "march": 3,
+    "apr": 4, "april": 4, "may": 5, "jun": 6, "june": 6,
+    "jul": 7, "july": 7, "aug": 8, "august": 8, "sep": 9, "sept": 9, "september": 9,
+    "oct": 10, "october": 10, "nov": 11, "november": 11, "dec": 12, "december": 12,
+}
 
 DATE_FORMATS = [
     "%Y-%m-%d", "%Y/%m/%d",
     "%d %B %Y", "%d %b %Y", "%d %B, %Y", "%d %b, %Y",
     "%B %d, %Y", "%b %d, %Y", "%B %d %Y", "%b %d %Y",
+    "%m/%d/%Y", "%d/%m/%Y",
+    "%B %d", "%b %d",
 ]
 
 
 def parse_any_date(text: str, fallback_year: int = None) -> str:
+    """Parse many date formats to YYYY-MM-DD. Handles partial dates."""
     if not text:
         return ""
     text = text.strip()
     text = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", text)
     text = re.sub(r"\s+", " ", text)
+
+    if not fallback_year:
+        fallback_year = datetime.now().year
 
     for fmt in DATE_FORMATS:
         try:
             dt = datetime.strptime(text, fmt)
+            if dt.year == 1900:  # no year in format
                 dt = dt.replace(year=fallback_year)
                 if dt < datetime.now():
                     dt = dt.replace(year=fallback_year + 1)
 
 
 def find_dates_near(text: str, keywords: List[str], window: int = 400) -> str:
+    """Find dates within `window` chars after any keyword."""
     lower = text.lower()
+    all_date_patterns = [
         r"(\d{4}[-/]\d{1,2}[-/]\d{1,2})",
         r"(\d{1,2}\s+(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)[,]?\s+\d{4})",
         r"((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?[,]?\s*\d{0,4})",
 
         idx = lower.find(kw.lower())
         if idx == -1:
             continue
+        chunk = text[idx:idx + window]
+        for pat in all_date_patterns:
+            match = re.search(pat, chunk, re.IGNORECASE)
+            if match:
+                parsed = parse_any_date(match.group(1))
                 if parsed:
                     return parsed
     return ""
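A quick illustration of how the two date helpers above compose; the input is hypothetical and the results are inferred from the code shown:

# find_dates_near scans a `window`-sized chunk after each keyword and
# normalizes the first regex match through parse_any_date.
text = "Registrations close on 15 March 2026 at midnight."
find_dates_near(text, ["registrations close"])  # -> "2026-03-15"

# A year-less input parses via "%B %d" to year 1900, which parse_any_date
# replaces with the fallback year (bumped by one if the date already passed).
parse_any_date("March 15", fallback_year=2026)  # -> "2026-03-15"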
 
+# ============================================================
+# EXTRACT from full page innerText (the reliable approach)
+# ============================================================
 
+def extract_all_from_text(body_text: str, platform: str) -> dict:
+    """Extract hackathon details from page innerText using text patterns."""
     result = {
         "registration_deadline": "",
         "submission_deadline": "",
         "problem_statements": [],
     }
 
+    # ---- DATES ----
+    # Registration deadline
     result["registration_deadline"] = find_dates_near(body_text, [
         "registration close", "registrations close", "register by",
         "last date to register", "registration deadline", "applications close",
         "apply by", "registration ends", "sign up deadline",
     ])
+
+    # Submission deadline
     result["submission_deadline"] = find_dates_near(body_text, [
         "submission deadline", "submission closes", "submissions close",
         "submit by", "last date to submit", "submission end",
+        "final submission", "project submission",
+        "deadline",  # generic fallback last
     ])
 
+    # Start date — Devfolio uses "Runs from Mar 25 - 26, 2026"
     runs_from = re.search(
         r"(?:runs?\s+from|starts?\s+(?:on|from)?|begins?\s+(?:on)?|commences?\s+(?:on)?)\s*[:\-]?\s*"
+        r"((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2})"
+        r"(?:\s*[-–]\s*(\d{1,2}))?"
+        r"(?:[,\s]+(\d{4}))?",
+        body_text, re.IGNORECASE
     )
     if runs_from:
+        start_text = runs_from.group(1)
         year = runs_from.group(3) or str(datetime.now().year)
+        result["start_date"] = parse_any_date(f"{start_text} {year}")
+        if runs_from.group(2) and runs_from.group(1):
             month = runs_from.group(1).split()[0]
             result["end_date"] = parse_any_date(f"{month} {runs_from.group(2)} {year}")
 
     if not result["start_date"]:
         result["start_date"] = find_dates_near(body_text, [
+            "start date", "starts on", "begins on", "hackathon starts",
+            "event starts", "event date", "dates:",
         ])
+
     if not result["end_date"]:
         result["end_date"] = find_dates_near(body_text, [
             "end date", "ends on", "hackathon ends", "event ends",
         ])
+
+    # Result date
     result["result_date"] = find_dates_near(body_text, [
+        "result", "winners announced", "announcement", "winner announcement",
+        "results declared", "shortlist",
     ])
 
+    # ---- PRIZE POOL ----
     prize_patterns = [
         r"(₹\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
         r"(\$\s*[\d,]+(?:\.\d+)?(?:\s*(?:K|k|M|million|thousand))?)",
         r"(INR\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
         r"(Rs\.?\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
     ]
+
+    # Find prize amounts near keywords like "prize", "reward", "worth", "win"
     prize_lower = body_text.lower()
     for kw in ["prize", "reward", "worth", "winning", "bounty", "in cash", "in prizes"]:
         idx = prize_lower.find(kw)
         if idx == -1:
             continue
+        # Search ±200 chars around keyword
+        start = max(0, idx - 200)
+        chunk = body_text[start:idx + 200]
         for pat in prize_patterns:
+            match = re.search(pat, chunk, re.IGNORECASE)
+            if match:
+                result["prize_pool"] = match.group(1).strip()
                 break
         if result["prize_pool"]:
             break
+
+    # Fallback: any large currency amount
     if not result["prize_pool"]:
         for pat in prize_patterns:
+            match = re.search(pat, body_text)
+            if match:
+                result["prize_pool"] = match.group(1).strip()
                 break
 
+    # ---- TEAM SIZE ----
+    team_patterns = [
         r"team\s*size[:\s]*(\d+)\s*[-–to]+\s*(\d+)",
         r"(\d+)\s*[-–to]+\s*(\d+)\s*(?:members?|people|participants?|per team)",
         r"teams?\s+of\s+(?:up\s+to\s+)?(\d+)",
         r"max(?:imum)?\s*(?:team)?\s*(?:size)?\s*[:\s]*(\d+)",
+        r"(\d+)\s*[-–]\s*(\d+)\s*$",  # in FAQ: "2 - 4"
+    ]
+    for pat in team_patterns:
+        match = re.search(pat, body_text, re.IGNORECASE)
+        if match:
+            groups = [g for g in match.groups() if g]
+            if len(groups) == 2:
+                result["team_size"] = {"min": int(groups[0]), "max": int(groups[1])}
+            elif len(groups) == 1:
+                result["team_size"] = {"min": 1, "max": int(groups[0])}
             break
 
+    # ---- PROBLEM STATEMENTS / TRACKS / DOMAINS ----
+    ps = []
+    seen_ps = set()
 
+    # Pattern 1: "Domains: AI, ML, Web App" (Devfolio style)
+    domain_match = re.search(
         r"(?:domains?|themes?|tracks?|categories|verticals|areas?)[:\s]+([^\n💡🏆🎁🎟️📍📅⏳📞🌮]+)",
+        body_text, re.IGNORECASE
     )
+    if domain_match:
+        items = re.split(r"[,•|/]", domain_match.group(1))
+        for item in items:
             item = item.strip().rstrip(".")
+            if 3 < len(item) < 150 and item.lower() not in seen_ps:
+                seen_ps.add(item.lower())
                 ps.append({"track": "", "title": item})
 
+    # Pattern 2: Numbered problem statements: "PS1: ...", "Problem Statement 1 - ..."
+    for match in re.finditer(
         r"(?:PS|Problem\s*Statement|Theme|Track|Challenge)\s*#?(\d+)\s*[:\-–]\s*(.{5,200})",
+        body_text, re.IGNORECASE
     ):
+        num = match.group(1)
+        title = match.group(2).strip().split("\n")[0]
+        if title.lower() not in seen_ps and len(title) > 4:
+            seen_ps.add(title.lower())
+            ps.append({"track": f"Track {num}", "title": title})
+
+    # Pattern 3: Devpost-style theme tags (already in themes list from JS)
+    # Pattern 4: Bulleted lists after "Themes" or "Tracks" heading
+    for match in re.finditer(
         r"(?:themes?|tracks?|problem\s*statements?|challenges?|domains?)\s*[:\n]"
         r"((?:\s*[-•●▸]\s*.{5,200}\n?)+)",
+        body_text, re.IGNORECASE
     ):
+        items = re.findall(r"[-•●▸]\s*(.{5,200})", match.group(1))
+        for item in items:
             item = item.strip().split("\n")[0]
+            if item.lower() not in seen_ps and 4 < len(item) < 200:
+                seen_ps.add(item.lower())
                 ps.append({"track": "", "title": item})
 
     result["problem_statements"] = ps[:20]
     return result
 
 
+# ============================================================
+# PLAYWRIGHT SCRAPER gets innerText + meta from rendered page
+# ============================================================
 
+EXTRACT_SCRIPT = """() => {
     const getMeta = (name) => {
         const el = document.querySelector(`meta[property="${name}"], meta[name="${name}"]`);
         return el ? el.getAttribute('content') || '' : '';
     };
+
+    // Name: try multiple selectors
     const nameSelectors = [
+        'h1',
+        '.hackathon-name', '.event-name', '.challenge-title',
+        '#challenge-title', '.opp-title',
     ];
     let name = '';
     for (const sel of nameSelectors) {
         const el = document.querySelector(sel);
         if (el && el.textContent.trim().length > 2) {
+            name = el.textContent.trim();
+            break;
         }
     }
     name = name || getMeta('og:title') || document.title.split('|')[0].trim();
 
+    // Banner
     const banner = getMeta('og:image') || '';
+
+    // Description
     let description = getMeta('og:description') || getMeta('description') || '';
+
+    // Full page text for parsing
     const bodyText = document.body.innerText;
 
+    // For Devpost: extract themes from tag links
     const themes = [];
     document.querySelectorAll('a[href*="themes"]').forEach(a => {
         const t = a.textContent.trim();
         if (t && t.length > 2 && t.length < 100) themes.push(t);
     });
 
+    // Devpost sidebar prize text
     let sidebarPrize = '';
     document.querySelectorAll('a[href*="prizes"], .prize, [class*="prize"]').forEach(el => {
         const t = el.textContent.trim();
         if (t && t.length > 2) sidebarPrize += t + ' ';
     });
 
+    // Resource links: PDFs, Google Drive, problem statements, rules, guidelines
     const resourceLinks = [];
     const seenHrefs = new Set();
+    const linkKeywords = ['problem', 'statement', 'pdf', 'rule', 'guideline', 'brochure', 'document', 'brief', 'challenge', 'track', 'theme', 'schedule', 'timeline'];
     document.querySelectorAll('a[href]').forEach(a => {
         const href = a.href || '';
         const text = a.textContent.trim();
+        const hrefLower = href.toLowerCase();
+        const textLower = text.toLowerCase();
         if (seenHrefs.has(href) || !href || href === '#') return;
 
+        const isPdf = hrefLower.endsWith('.pdf') || hrefLower.includes('/pdf');
+        const isDrive = hrefLower.includes('drive.google.com') || hrefLower.includes('docs.google.com');
+        const isDropbox = hrefLower.includes('dropbox.com');
+        const isRelevant = linkKeywords.some(kw => textLower.includes(kw) || hrefLower.includes(kw));
 
         if (isPdf || isDrive || isDropbox || isRelevant) {
             seenHrefs.add(href);
             resourceLinks.push({
 
         description: description.substring(0, 2000),
         banner_url: banner,
         bodyText: bodyText.substring(0, 30000),
+        themes: themes,
+        sidebarPrize: sidebarPrize.trim(),
         resourceLinks: resourceLinks.slice(0, 30),
     };
 }"""
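EXTRACT_SCRIPT above runs in the page context through page.evaluate and comes back to Python as a plain dict. A sketch of the shape it yields; field names follow the return statement above, and every value here is hypothetical:

# data = await page.evaluate(EXTRACT_SCRIPT)   # see scrape_with_playwright below
# data == {
#     "name": "Example Hackathon",                       # first matching selector, else og:title
#     "description": "A 48-hour student hackathon ...",  # og:description, capped at 2000 chars
#     "banner_url": "https://cdn.example.com/banner.png",
#     "bodyText": "...",                                  # page innerText, capped at 30000 chars
#     "themes": ["AI", "FinTech"],
#     "sidebarPrize": "$10,000 in prizes",
#     "resourceLinks": [{"text": "Rules PDF", "url": "https://example.com/rules.pdf", "type": "pdf"}],
# }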
 
 
 @app.on_event("startup")
 async def startup() -> None:
     global playwright, browser
     from playwright.async_api import async_playwright
+
     playwright = await async_playwright().start()
     browser = await playwright.chromium.launch(
         headless=True,
+        args=["--no-sandbox", "--disable-setuid-sandbox"],
     )
+    print("[Scraper] Playwright browser initialized")
 
 
 @app.on_event("shutdown")
 async def shutdown() -> None:
     global playwright, browser
+
     try:
+        if browser is not None:
+            await browser.close()
+            print("[Scraper] Browser closed")
     finally:
         browser = None
+
     try:
+        if playwright is not None:
+            await playwright.stop()
+            print("[Scraper] Playwright stopped")
     finally:
         playwright = None
 
+async def scrape_with_playwright(url: str, platform: str) -> dict:
+    """Scrape using Playwright — renders JS, grabs full innerText for parsing."""
+    global browser
+    try:
+        if browser is None:
+            return {
+                "scrape_success": False,
+                "error": "Browser is not initialized. Service startup failed.",
+            }
 
+        context = await browser.new_context(
+            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
+            viewport={"width": 1920, "height": 1080},
+        )
 
+        try:
+            page = await context.new_page()
+
+            print(f"[Scraper] Navigating to {url} (platform: {platform})")
+            await page.goto(url, wait_until="domcontentloaded", timeout=20000)
+
+            # Wait for JS rendering longer for SPAs
+            wait_time = 8 if platform in ("Unstop",) else 5
+            print(f"[Scraper] Waiting {wait_time}s for JS rendering...")
+            await page.wait_for_timeout(wait_time * 1000)
+
+            # Scroll to trigger lazy content
+            await page.evaluate("window.scrollTo(0, document.body.scrollHeight / 3)")
+            await asyncio.sleep(1)
+            await page.evaluate("window.scrollTo(0, document.body.scrollHeight * 2 / 3)")
+            await asyncio.sleep(1)
+            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+            await asyncio.sleep(1)
+            await page.evaluate("window.scrollTo(0, 0)")
+            await asyncio.sleep(0.5)
+
+            # Extract structured + raw text data
+            data = await page.evaluate(EXTRACT_SCRIPT)
+
+            body_text = data.get("bodyText", "")
+            name = data.get("name", "")
+            description = data.get("description", "")
+
+            print(f"[Scraper] Extracted name: '{name}', bodyText length: {len(body_text)}")
+
+            # Parse all fields from full innerText
+            extracted = extract_all_from_text(body_text, platform)
+
+            # Devpost themes from sidebar tags
+            themes = data.get("themes", [])
+            if themes and not extracted["problem_statements"]:
+                seen = set()
+                for t in themes:
+                    if t.lower() not in seen:
+                        seen.add(t.lower())
+                        extracted["problem_statements"].append({"track": "Theme", "title": t})
+
+            # Sidebar prize fallback (Devpost)
+            if not extracted["prize_pool"] and data.get("sidebarPrize"):
+                prize_text = data["sidebarPrize"]
+                for pat in [r"(\$[\d,]+(?:\.\d+)?(?:\s*(?:K|k|M))?)", r"(₹[\d,]+)"]:
+                    m = re.search(pat, prize_text)
+                    if m:
+                        extracted["prize_pool"] = m.group(1)
+                        break
+                if not extracted["prize_pool"]:
+                    extracted["prize_pool"] = prize_text[:100]
+
+            return {
+                "name": name,
+                "description": description,
+                "banner_url": data.get("banner_url", ""),
+                "scrape_success": bool(name and len(name) > 2),
+                "resource_links": data.get("resourceLinks", []),
+                **extracted,
+            }
+        finally:
+            await context.close()
 
+    except Exception as e:
+        print(f"[Scraper] Error: {e}")
+        import traceback
+        traceback.print_exc()
+        return {"scrape_success": False, "error": str(e)}
+
+
+# ============================================================
+# API ROUTES
+# ============================================================
 
+@app.get("/")
+async def root():
+    return {"status": "ok", "service": "HackTrack Scraper v3"}
 
 
 @app.post("/scrape", response_model=ScrapeResponse)
 async def scrape(request: ScrapeRequest):
     url = request.url.strip()
     platform = detect_platform(url)
+    print(f"\n[Scraper] === New scrape request: {url} (platform={platform}) ===")
 
     try:
         data = await scrape_with_playwright(url, platform)
+
         response = ScrapeResponse(
             name=data.get("name", ""),
             platform=platform,
 
             resource_links=data.get("resource_links", []),
             scrape_success=data.get("scrape_success", False),
             url=url,
         )
+
+        print(f"[Scraper] Result: name='{response.name}', dates=({response.start_date}, {response.end_date}, reg={response.registration_deadline}, sub={response.submission_deadline}), prize='{response.prize_pool}', team={response.team_size}, ps={len(response.problem_statements)}")
         return response
+
     except Exception as e:
         print(f"[Scraper] Endpoint error: {e}")
+        return ScrapeResponse(platform=platform, url=url, scrape_success=False)
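A minimal client sketch for exercising the service; the host, port, and target URL are assumptions, not part of the commit:

# Assumes the app is served locally, e.g. with `uvicorn app:app --port 8000`.
import requests

resp = requests.post(
    "http://localhost:8000/scrape",
    json={"url": "https://devpost.com/some-hackathon"},  # hypothetical hackathon page
    timeout=120,  # headless rendering plus parsing can take tens of seconds
)
data = resp.json()
print(data["name"], data["platform"], data["registration_deadline"], data["prize_pool"])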