Rudraaaa76 committed on
Commit
01ac6d7
·
verified ·
1 Parent(s): 6d8a8d2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +632 -364
app.py CHANGED
@@ -1,22 +1,29 @@
 
 
 
 
 
 
 
 
1
  from fastapi import FastAPI
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from pydantic import BaseModel, Field
4
  import asyncio
5
  import re
6
  import sys
 
 
7
  from urllib.parse import urlparse
8
- from typing import List
9
- from datetime import datetime
10
 
11
  if sys.platform == "win32":
12
- # Playwright launches a driver subprocess; Proactor loop supports subprocess APIs on Windows.
13
  asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
14
 
15
- app = FastAPI(title="HackTrack Scraper", version="3.0.0")
16
-
17
- # Global Playwright runtime objects reused across requests.
18
  playwright = None
19
- browser = None
20
 
21
  app.add_middleware(
22
  CORSMiddleware,
@@ -26,10 +33,13 @@ app.add_middleware(
26
  )
27
 
28
 
 
 
 
 
29
  class ScrapeRequest(BaseModel):
30
  url: str
31
 
32
-
33
  class ScrapeResponse(BaseModel):
34
  name: str = ""
35
  platform: str = ""
@@ -48,322 +58,384 @@ class ScrapeResponse(BaseModel):
48
  url: str = ""
49
 
50
 
 
 
 
 
51
  def detect_platform(url: str) -> str:
52
  domain = urlparse(url).netloc.lower()
53
- if "devfolio" in domain:
54
- return "Devfolio"
55
- elif "unstop" in domain:
56
- return "Unstop"
57
- elif "devpost" in domain:
58
- return "Devpost"
59
- elif "dorahacks" in domain:
60
- return "DoraHacks"
61
  return "Other"
62
 
63
 
64
  # ============================================================
65
- # DATE PARSING — robust multi-format
66
  # ============================================================
67
- MONTH_MAP = {
68
- "jan": 1, "january": 1, "feb": 2, "february": 2, "mar": 3, "march": 3,
69
- "apr": 4, "april": 4, "may": 5, "jun": 6, "june": 6,
70
- "jul": 7, "july": 7, "aug": 8, "august": 8, "sep": 9, "sept": 9, "september": 9,
71
- "oct": 10, "october": 10, "nov": 11, "november": 11, "dec": 12, "december": 12,
72
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  DATE_FORMATS = [
75
  "%Y-%m-%d", "%Y/%m/%d",
76
  "%d %B %Y", "%d %b %Y", "%d %B, %Y", "%d %b, %Y",
77
  "%B %d, %Y", "%b %d, %Y", "%B %d %Y", "%b %d %Y",
78
  "%m/%d/%Y", "%d/%m/%Y",
79
- "%B %d", "%b %d",
80
  ]
81
 
82
-
83
- def parse_any_date(text: str, fallback_year: int = None) -> str:
84
- """Parse many date formats to YYYY-MM-DD. Handles partial dates."""
85
- if not text:
86
- return ""
87
- text = text.strip()
88
- text = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", text)
89
  text = re.sub(r"\s+", " ", text)
90
-
91
- if not fallback_year:
92
- fallback_year = datetime.now().year
93
-
94
  for fmt in DATE_FORMATS:
95
  try:
96
  dt = datetime.strptime(text, fmt)
97
- if dt.year == 1900: # no year in format
98
- dt = dt.replace(year=fallback_year)
99
- if dt < datetime.now():
100
- dt = dt.replace(year=fallback_year + 1)
101
  return dt.strftime("%Y-%m-%d")
102
  except ValueError:
103
  continue
104
  return ""
105
 
106
-
107
  def find_dates_near(text: str, keywords: List[str], window: int = 400) -> str:
108
- """Find dates within `window` chars after any keyword."""
109
  lower = text.lower()
110
- all_date_patterns = [
111
  r"(\d{4}[-/]\d{1,2}[-/]\d{1,2})",
112
- r"(\d{1,2}\s+(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)[,]?\s+\d{4})",
113
- r"((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?[,]?\s*\d{0,4})",
 
 
 
 
114
  r"(\d{1,2}/\d{1,2}/\d{4})",
115
  ]
116
  for kw in keywords:
117
  idx = lower.find(kw.lower())
118
- if idx == -1:
119
- continue
120
- chunk = text[idx:idx + window]
121
- for pat in all_date_patterns:
122
- match = re.search(pat, chunk, re.IGNORECASE)
123
- if match:
124
- parsed = parse_any_date(match.group(1))
125
- if parsed:
126
- return parsed
127
  return ""
128
 
129
 
130
  # ============================================================
131
- # EXTRACT from full page innerText (the reliable approach)
132
  # ============================================================
133
 
134
- def extract_all_from_text(body_text: str, platform: str) -> dict:
135
- """Extract hackathon details from page innerText using text patterns."""
136
- result = {
137
- "registration_deadline": "",
138
- "submission_deadline": "",
139
- "result_date": "",
140
- "start_date": "",
141
- "end_date": "",
142
- "prize_pool": "",
143
- "team_size": {"min": 1, "max": 4},
144
- "problem_statements": [],
 
 
 
 
 
145
  }
146
 
147
- # ---- DATES ----
148
- # Registration deadline
149
- result["registration_deadline"] = find_dates_near(body_text, [
150
- "registration close", "registrations close", "register by",
151
- "last date to register", "registration deadline", "applications close",
152
- "apply by", "registration ends", "sign up deadline",
153
- ])
154
 
155
- # Submission deadline
156
- result["submission_deadline"] = find_dates_near(body_text, [
157
- "submission deadline", "submission closes", "submissions close",
158
- "submit by", "last date to submit", "submission end",
159
- "final submission", "project submission",
160
- "deadline", # generic fallback last
161
- ])
162
 
163
- # Start date — Devfolio uses "Runs from Mar 25 - 26, 2026"
164
- runs_from = re.search(
165
- r"(?:runs?\s+from|starts?\s+(?:on|from)?|begins?\s+(?:on)?|commences?\s+(?:on)?)\s*[:\-]?\s*"
166
- r"((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2})"
167
- r"(?:\s*[-–]\s*(\d{1,2}))?"
168
- r"(?:[,\s]+(\d{4}))?",
169
- body_text, re.IGNORECASE
170
- )
171
- if runs_from:
172
- start_text = runs_from.group(1)
173
- year = runs_from.group(3) or str(datetime.now().year)
174
- result["start_date"] = parse_any_date(f"{start_text} {year}")
175
- if runs_from.group(2) and runs_from.group(1):
176
- month = runs_from.group(1).split()[0]
177
- result["end_date"] = parse_any_date(f"{month} {runs_from.group(2)} {year}")
178
 
179
- if not result["start_date"]:
180
- result["start_date"] = find_dates_near(body_text, [
181
- "start date", "starts on", "begins on", "hackathon starts",
182
- "event starts", "event date", "dates:",
183
- ])
 
 
 
 
 
 
 
 
184
 
185
- if not result["end_date"]:
186
- result["end_date"] = find_dates_near(body_text, [
187
- "end date", "ends on", "hackathon ends", "event ends",
188
- ])
189
-
190
- # Result date
191
- result["result_date"] = find_dates_near(body_text, [
192
- "result", "winners announced", "announcement", "winner announcement",
193
- "results declared", "shortlist",
194
- ])
195
 
196
- # ---- PRIZE POOL ----
197
- prize_patterns = [
198
- r"(₹\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
199
- r"(\$\s*[\d,]+(?:\.\d+)?(?:\s*(?:K|k|M|million|thousand))?)",
200
- r"(€\s*[\d,]+(?:\.\d+)?)",
201
- r"(£\s*[\d,]+(?:\.\d+)?)",
202
- r"(INR\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
203
- r"(Rs\.?\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
204
- ]
 
 
 
 
205
 
206
- # Find prize amounts near keywords like "prize", "reward", "worth", "win"
207
- prize_lower = body_text.lower()
208
- for kw in ["prize", "reward", "worth", "winning", "bounty", "in cash", "in prizes"]:
209
- idx = prize_lower.find(kw)
210
- if idx == -1:
211
- continue
212
- # Search ±200 chars around keyword
213
- start = max(0, idx - 200)
214
- chunk = body_text[start:idx + 200]
215
- for pat in prize_patterns:
216
- match = re.search(pat, chunk, re.IGNORECASE)
217
- if match:
218
- result["prize_pool"] = match.group(1).strip()
219
- break
220
- if result["prize_pool"]:
221
- break
222
 
223
- # Fallback: any large currency amount
224
- if not result["prize_pool"]:
225
- for pat in prize_patterns:
226
- match = re.search(pat, body_text)
227
- if match:
228
- result["prize_pool"] = match.group(1).strip()
229
- break
230
-
231
- # ---- TEAM SIZE ----
232
- team_patterns = [
233
- r"team\s*size[:\s]*(\d+)\s*[-–to]+\s*(\d+)",
234
- r"(\d+)\s*[-–to]+\s*(\d+)\s*(?:members?|people|participants?|per team)",
235
- r"teams?\s+of\s+(?:up\s+to\s+)?(\d+)",
236
- r"max(?:imum)?\s*(?:team)?\s*(?:size)?\s*[:\s]*(\d+)",
237
- r"(\d+)\s*[-–]\s*(\d+)\s*$", # in FAQ: "2 - 4"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  ]
239
- for pat in team_patterns:
240
- match = re.search(pat, body_text, re.IGNORECASE)
241
- if match:
242
- groups = [g for g in match.groups() if g]
243
- if len(groups) == 2:
244
- result["team_size"] = {"min": int(groups[0]), "max": int(groups[1])}
245
- elif len(groups) == 1:
246
- result["team_size"] = {"min": 1, "max": int(groups[0])}
247
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
 
249
- # ---- PROBLEM STATEMENTS / TRACKS / DOMAINS ----
250
  ps = []
251
  seen_ps = set()
252
-
253
- # Pattern 1: "Domains: AI, ML, Web App" (Devfolio style)
254
- domain_match = re.search(
255
- r"(?:domains?|themes?|tracks?|categories|verticals|areas?)[:\s]+([^\n💡🏆🎁🎟️📍📅⏳📞🌮]+)",
256
- body_text, re.IGNORECASE
257
- )
258
- if domain_match:
259
- items = re.split(r"[,•|/]", domain_match.group(1))
260
- for item in items:
261
- item = item.strip().rstrip(".")
262
- if 3 < len(item) < 150 and item.lower() not in seen_ps:
263
- seen_ps.add(item.lower())
264
- ps.append({"track": "", "title": item})
265
-
266
- # Pattern 2: Numbered problem statements: "PS1: ...", "Problem Statement 1 - ..."
267
- for match in re.finditer(
268
- r"(?:PS|Problem\s*Statement|Theme|Track|Challenge)\s*#?(\d+)\s*[:\-–]\s*(.{5,200})",
269
- body_text, re.IGNORECASE
270
- ):
271
- num = match.group(1)
272
- title = match.group(2).strip().split("\n")[0]
273
- if title.lower() not in seen_ps and len(title) > 4:
274
  seen_ps.add(title.lower())
275
- ps.append({"track": f"Track {num}", "title": title})
276
-
277
- # Pattern 3: Devpost-style theme tags (already in themes list from JS)
278
- # Pattern 4: Bulleted lists after "Themes" or "Tracks" heading
279
- for match in re.finditer(
280
- r"(?:themes?|tracks?|problem\s*statements?|challenges?|domains?)\s*[:\n]"
281
- r"((?:\s*[-•●▸]\s*.{5,200}\n?)+)",
282
- body_text, re.IGNORECASE
283
- ):
284
- items = re.findall(r"[-•●▸]\s*(.{5,200})", match.group(1))
285
- for item in items:
286
- item = item.strip().split("\n")[0]
287
- if item.lower() not in seen_ps and 4 < len(item) < 200:
288
- seen_ps.add(item.lower())
289
- ps.append({"track": "", "title": item})
290
 
291
- result["problem_statements"] = ps[:20]
292
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
 
294
 
295
  # ============================================================
296
- # PLAYWRIGHT SCRAPER gets innerText + meta from rendered page
297
  # ============================================================
298
 
299
- EXTRACT_SCRIPT = """() => {
300
- const getMeta = (name) => {
301
- const el = document.querySelector(`meta[property="${name}"], meta[name="${name}"]`);
302
  return el ? el.getAttribute('content') || '' : '';
303
  };
304
 
305
- // Name: try multiple selectors
306
- const nameSelectors = [
307
- 'h1',
308
- '.hackathon-name', '.event-name', '.challenge-title',
309
- '#challenge-title', '.opp-title',
310
- ];
311
- let name = '';
312
- for (const sel of nameSelectors) {
313
- const el = document.querySelector(sel);
314
- if (el && el.textContent.trim().length > 2) {
315
- name = el.textContent.trim();
316
- break;
317
- }
318
- }
319
- name = name || getMeta('og:title') || document.title.split('|')[0].trim();
320
-
321
- // Banner
322
- const banner = getMeta('og:image') || '';
323
-
324
- // Description
325
- let description = getMeta('og:description') || getMeta('description') || '';
326
 
327
- // Full page text for parsing
328
- const bodyText = document.body.innerText;
 
 
 
 
 
 
329
 
330
- // For Devpost: extract themes from tag links
331
  const themes = [];
332
- document.querySelectorAll('a[href*="themes"]').forEach(a => {
 
333
  const t = a.textContent.trim();
334
- if (t && t.length > 2 && t.length < 100) themes.push(t);
335
- });
336
-
337
- // Devpost sidebar prize text
338
- let sidebarPrize = '';
339
- document.querySelectorAll('a[href*="prizes"], .prize, [class*="prize"]').forEach(el => {
340
- const t = el.textContent.trim();
341
- if (t && t.length > 2) sidebarPrize += t + ' ';
342
  });
343
 
344
- // Resource links: PDFs, Google Drive, problem statements, rules, guidelines
345
  const resourceLinks = [];
346
  const seenHrefs = new Set();
347
- const linkKeywords = ['problem', 'statement', 'pdf', 'rule', 'guideline', 'brochure', 'document', 'brief', 'challenge', 'track', 'theme', 'schedule', 'timeline'];
348
- document.querySelectorAll('a[href]').forEach(a => {
349
  const href = a.href || '';
350
  const text = a.textContent.trim();
351
- const hrefLower = href.toLowerCase();
352
- const textLower = text.toLowerCase();
353
  if (seenHrefs.has(href) || !href || href === '#') return;
354
-
355
- const isPdf = hrefLower.endsWith('.pdf') || hrefLower.includes('/pdf');
356
- const isDrive = hrefLower.includes('drive.google.com') || hrefLower.includes('docs.google.com');
357
- const isDropbox = hrefLower.includes('dropbox.com');
358
- const isRelevant = linkKeywords.some(kw => textLower.includes(kw) || hrefLower.includes(kw));
359
-
360
- if (isPdf || isDrive || isDropbox || isRelevant) {
361
  seenHrefs.add(href);
362
- resourceLinks.push({
363
- text: text.substring(0, 150) || 'Document',
364
- url: href,
365
- type: isPdf ? 'pdf' : isDrive ? 'google_drive' : isDropbox ? 'dropbox' : 'link',
366
- });
367
  }
368
  });
369
 
@@ -371,150 +443,350 @@ EXTRACT_SCRIPT = """() => {
371
  name: name.substring(0, 200),
372
  description: description.substring(0, 2000),
373
  banner_url: banner,
374
- bodyText: bodyText.substring(0, 30000),
 
375
  themes: themes,
376
- sidebarPrize: sidebarPrize.trim(),
377
- resourceLinks: resourceLinks.slice(0, 30),
378
  };
379
  }"""
380
 
381
 
382
- @app.on_event("startup")
383
- async def startup() -> None:
384
- global playwright, browser
385
- from playwright.async_api import async_playwright
 
 
 
 
 
 
 
 
 
 
 
 
 
 
386
 
387
- playwright = await async_playwright().start()
388
- browser = await playwright.chromium.launch(
389
- headless=True,
390
- args=["--no-sandbox", "--disable-setuid-sandbox"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391
  )
392
- print("[Scraper] Playwright browser initialized")
 
 
 
 
 
 
 
 
 
 
393
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
 
395
- @app.on_event("shutdown")
396
- async def shutdown() -> None:
397
- global playwright, browser
 
 
 
 
 
 
 
 
 
 
 
398
 
399
- try:
400
- if browser is not None:
401
- await browser.close()
402
- print("[Scraper] Browser closed")
403
- finally:
404
- browser = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
405
 
406
- try:
407
- if playwright is not None:
408
- await playwright.stop()
409
- print("[Scraper] Playwright stopped")
410
- finally:
411
- playwright = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
412
 
413
  async def scrape_with_playwright(url: str, platform: str) -> dict:
414
- """Scrape using Playwright — renders JS, grabs full innerText for parsing."""
415
  global browser
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
416
  try:
417
- if browser is None:
418
- return {
419
- "scrape_success": False,
420
- "error": "Browser is not initialized. Service startup failed.",
421
- }
422
-
423
- context = await browser.new_context(
424
- user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
425
- viewport={"width": 1920, "height": 1080},
426
- )
427
-
428
  try:
429
- page = await context.new_page()
430
-
431
- print(f"[Scraper] Navigating to {url} (platform: {platform})")
432
- await page.goto(url, wait_until="domcontentloaded", timeout=20000)
433
-
434
- # Wait for JS rendering longer for SPAs
435
- wait_time = 8 if platform in ("Unstop",) else 5
436
- print(f"[Scraper] Waiting {wait_time}s for JS rendering...")
437
- await page.wait_for_timeout(wait_time * 1000)
438
-
439
- # Scroll to trigger lazy content
440
- await page.evaluate("window.scrollTo(0, document.body.scrollHeight / 3)")
441
- await asyncio.sleep(1)
442
- await page.evaluate("window.scrollTo(0, document.body.scrollHeight * 2 / 3)")
443
- await asyncio.sleep(1)
444
- await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
445
- await asyncio.sleep(1)
446
- await page.evaluate("window.scrollTo(0, 0)")
447
- await asyncio.sleep(0.5)
448
-
449
- # Extract structured + raw text data
450
- data = await page.evaluate(EXTRACT_SCRIPT)
451
-
452
- body_text = data.get("bodyText", "")
453
- name = data.get("name", "")
454
- description = data.get("description", "")
455
-
456
- print(f"[Scraper] Extracted name: '{name}', bodyText length: {len(body_text)}")
457
-
458
- # Parse all fields from full innerText
459
- extracted = extract_all_from_text(body_text, platform)
460
-
461
- # Devpost themes from sidebar tags
462
- themes = data.get("themes", [])
463
- if themes and not extracted["problem_statements"]:
464
- seen = set()
465
- for t in themes:
466
- if t.lower() not in seen:
467
- seen.add(t.lower())
468
- extracted["problem_statements"].append({"track": "Theme", "title": t})
469
-
470
- # Sidebar prize fallback (Devpost)
471
- if not extracted["prize_pool"] and data.get("sidebarPrize"):
472
- prize_text = data["sidebarPrize"]
473
- for pat in [r"(\$[\d,]+(?:\.\d+)?(?:\s*(?:K|k|M))?)", r"(₹[\d,]+)"]:
474
- m = re.search(pat, prize_text)
475
- if m:
476
- extracted["prize_pool"] = m.group(1)
477
- break
478
- if not extracted["prize_pool"]:
479
- extracted["prize_pool"] = prize_text[:100]
480
-
481
- return {
482
- "name": name,
483
- "description": description,
484
- "banner_url": data.get("banner_url", ""),
485
- "scrape_success": bool(name and len(name) > 2),
486
- "resource_links": data.get("resourceLinks", []),
487
- **extracted,
488
  }
489
- finally:
490
- await context.close()
 
 
491
 
492
  except Exception as e:
493
- print(f"[Scraper] Error: {e}")
494
  import traceback
495
  traceback.print_exc()
496
  return {"scrape_success": False, "error": str(e)}
 
 
497
 
498
 
499
  # ============================================================
500
- # API ROUTES
501
  # ============================================================
502
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
503
  @app.get("/")
504
  async def root():
505
- return {"status": "ok", "service": "HackTrack Scraper v3"}
 
 
 
 
 
 
 
 
 
506
 
507
 
508
  @app.post("/scrape", response_model=ScrapeResponse)
509
  async def scrape(request: ScrapeRequest):
510
  url = request.url.strip()
511
  platform = detect_platform(url)
512
- print(f"\n[Scraper] === New scrape request: {url} (platform={platform}) ===")
513
-
514
  try:
515
  data = await scrape_with_playwright(url, platform)
516
-
517
- response = ScrapeResponse(
518
  name=data.get("name", ""),
519
  platform=platform,
520
  banner_url=data.get("banner_url", ""),
@@ -531,10 +803,6 @@ async def scrape(request: ScrapeRequest):
531
  scrape_success=data.get("scrape_success", False),
532
  url=url,
533
  )
534
-
535
- print(f"[Scraper] Result: name='{response.name}', dates=({response.start_date}, {response.end_date}, reg={response.registration_deadline}, sub={response.submission_deadline}), prize='{response.prize_pool}', team={response.team_size}, ps={len(response.problem_statements)}")
536
- return response
537
-
538
  except Exception as e:
539
  print(f"[Scraper] Endpoint error: {e}")
540
- return ScrapeResponse(platform=platform, url=url, scrape_success=False)
 
1
+ """
2
+ HackTrack Scraper v5.0 - Structured extraction, no LLM
3
+ - Devfolio : reads __NEXT_DATA__ JSON blob (Next.js SSR, 100% accurate dates)
4
+ - Unstop : calls api.unstop.com REST API directly (structured JSON)
5
+ - Devpost : reads <time datetime> tags + structured sidebar
6
+ - Others : regex fallback on bodyText
7
+ """
8
+
9
  from fastapi import FastAPI
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from pydantic import BaseModel, Field
12
  import asyncio
13
  import re
14
  import sys
15
+ import json
16
+ import httpx
17
  from urllib.parse import urlparse
18
+ from typing import List, Optional
19
+ from datetime import datetime, timezone
20
 
21
  if sys.platform == "win32":
 
22
  asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
23
 
24
+ app = FastAPI(title="HackTrack Scraper", version="5.0.0")
 
 
25
  playwright = None
26
+ browser = None
27
 
28
  app.add_middleware(
29
  CORSMiddleware,
 
33
  )
34
 
35
 
36
+ # ============================================================
37
+ # MODELS
38
+ # ============================================================
39
+
40
class ScrapeRequest(BaseModel):
    # The hackathon page URL to scrape; the platform is detected from its domain.
    url: str
42
 
 
43
  class ScrapeResponse(BaseModel):
44
  name: str = ""
45
  platform: str = ""
 
58
  url: str = ""
59
 
60
 
61
+ # ============================================================
62
+ # PLATFORM DETECTION
63
+ # ============================================================
64
+
65
def detect_platform(url: str) -> str:
    """Map a URL's domain to a known hackathon platform name.

    Returns the platform label for the first matching substring, or
    "Other" when the domain matches none of the known platforms.
    """
    domain = urlparse(url).netloc.lower()
    # Ordered substring -> label table; first hit wins, mirroring the
    # priority of the original if-chain.
    known_platforms = (
        ("devfolio", "Devfolio"),
        ("unstop", "Unstop"),
        ("devpost", "Devpost"),
        ("dorahacks", "DoraHacks"),
        ("mlh.io", "MLH"),
        ("hackerearth", "HackerEarth"),
    )
    for needle, label in known_platforms:
        if needle in domain:
            return label
    return "Other"
74
 
75
 
76
  # ============================================================
77
+ # DATE HELPERS
78
  # ============================================================
79
+
80
def iso_to_date(val: Optional[str]) -> str:
    """Normalise an ISO-8601 string or Unix timestamp into 'YYYY-MM-DD'.

    Accepts ISO strings (only the leading YYYY-MM-DD prefix is used),
    10-digit epoch seconds, or 13-digit epoch milliseconds; any value is
    first coerced via str(). Returns "" for falsy or unrecognised input.
    """
    if not val:
        return ""
    raw = str(val).strip()
    # Pure-digit values are epoch timestamps: 10 digits = seconds,
    # 13 digits = milliseconds. A matched-but-unconvertible value yields "".
    for width, divisor in ((10, 1), (13, 1000)):
        if re.fullmatch(r"\d{%d}" % width, raw):
            try:
                moment = datetime.fromtimestamp(int(raw) / divisor, tz=timezone.utc)
                return moment.strftime("%Y-%m-%d")
            except Exception:
                return ""
    # Otherwise accept anything that *starts* with a calendar date.
    prefix = re.match(r"(\d{4}-\d{2}-\d{2})", raw)
    return prefix.group(1) if prefix else ""
99
 
100
# Formats tried in order by parse_any_date; ambiguous slash dates resolve
# US-style first (%m/%d/%Y before %d/%m/%Y).
DATE_FORMATS = [
    "%Y-%m-%d", "%Y/%m/%d",
    "%d %B %Y", "%d %b %Y", "%d %B, %Y", "%d %b, %Y",
    "%B %d, %Y", "%b %d, %Y", "%B %d %Y", "%b %d %Y",
    "%m/%d/%Y", "%d/%m/%Y",
]

def parse_any_date(text: str) -> str:
    """Best-effort parse of a human-written date into 'YYYY-MM-DD'.

    Strips ordinal suffixes ("3rd" -> "3") and collapses whitespace, then
    tries each entry of DATE_FORMATS in order. Returns "" when nothing parses.
    """
    if not text:
        return ""
    cleaned = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", text.strip())
    cleaned = re.sub(r"\s+", " ", cleaned)
    for fmt in DATE_FORMATS:
        try:
            parsed = datetime.strptime(cleaned, fmt)
        except ValueError:
            continue
        # 1900 is strptime's placeholder year when the format lacked one;
        # substitute the current year in that case.
        if parsed.year == 1900:
            parsed = parsed.replace(year=datetime.now().year)
        return parsed.strftime("%Y-%m-%d")
    return ""
120
 
 
121
def find_dates_near(text: str, keywords: List[str], window: int = 400) -> str:
    """Scan `window` chars after each keyword occurrence for a parseable date.

    Keywords are matched case-insensitively against `text`; the first date
    pattern that both matches and survives parse_any_date wins. Returns
    'YYYY-MM-DD' or "" when no keyword yields a date.
    """
    haystack = text.lower()
    # Month-name alternation shared by two of the patterns below.
    months = (
        r"Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?"
        r"|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?"
    )
    patterns = [
        r"(\d{4}[-/]\d{1,2}[-/]\d{1,2})",
        r"(\d{1,2}\s+(?:" + months + r")[,]?\s+\d{4})",
        r"((?:" + months + r")\s+\d{1,2}(?:st|nd|rd|th)?[,]?\s*\d{0,4})",
        r"(\d{1,2}/\d{1,2}/\d{4})",
    ]
    for keyword in keywords:
        hit = haystack.find(keyword.lower())
        if hit < 0:
            continue
        snippet = text[hit: hit + window]
        for pattern in patterns:
            found = re.search(pattern, snippet, re.IGNORECASE)
            if not found:
                continue
            normalized = parse_any_date(found.group(1))
            if normalized:
                return normalized
    return ""
143
 
144
 
145
  # ============================================================
146
+ # DEVFOLIO - __NEXT_DATA__ extraction script
147
  # ============================================================
148
 
149
# In-page extraction script for Devfolio, evaluated via page.evaluate().
# Strategy: parse the Next.js __NEXT_DATA__ JSON blob and walk it (depth <= 8)
# for the first object that looks like a hackathon (has starts_at /
# registration_deadline / ends_at / slug). Falls back to <h1>, og:* meta tags
# and document.title when the blob is missing. Also collects problem
# statements from tracks/themes, resource links (PDF / Google Drive /
# keyword-matched anchors), a truncated innerText dump, and debug fields
# (found_next_data, hackathon_keys).
# NOTE(review): field names on the blob (cover_image, min_team_size, ...)
# reflect Devfolio's current payload — verify against a live page if scraping
# breaks after a Devfolio redesign.
DEVFOLIO_SCRIPT = """() => {
  const nextEl = document.getElementById('__NEXT_DATA__');
  let nextData = null;
  if (nextEl) { try { nextData = JSON.parse(nextEl.textContent); } catch(e) {} }

  function findHackathon(obj, depth) {
    if (!obj || typeof obj !== 'object' || depth > 8) return null;
    if (obj.starts_at || obj.registration_deadline || obj.ends_at || obj.slug) return obj;
    for (const k of Object.keys(obj)) {
      const child = obj[k];
      if (child && typeof child === 'object') {
        const found = findHackathon(child, depth + 1);
        if (found) return found;
      }
    }
    return null;
  }

  let hackathon = nextData ? findHackathon(nextData, 0) : null;

  const getMeta = (n) => {
    const el = document.querySelector('meta[property="' + n + '"], meta[name="' + n + '"]');
    return el ? el.getAttribute('content') || '' : '';
  };

  let name = '';
  if (hackathon) name = hackathon.name || hackathon.title || '';
  if (!name) {
    const el = document.querySelector('h1');
    if (el && el.textContent.trim().length > 2) name = el.textContent.trim();
  }
  name = name || getMeta('og:title') || document.title.split('|')[0].trim();

  const banner = (hackathon && (hackathon.cover_image || hackathon.banner_image)) || getMeta('og:image') || '';
  const description = (hackathon && (hackathon.tagline || hackathon.description)) || getMeta('og:description') || '';
  const starts_at = (hackathon && (hackathon.starts_at || hackathon.start_date)) || '';
  const ends_at = (hackathon && (hackathon.ends_at || hackathon.end_date)) || '';
  const reg_dl = (hackathon && (hackathon.registration_deadline || hackathon.registration_ends_at)) || '';
  const sub_dl = (hackathon && (hackathon.submission_deadline || hackathon.submission_ends_at)) || '';
  const result_date = (hackathon && (hackathon.result_date || hackathon.results_at)) || '';

  let prize_pool = '';
  if (hackathon) {
    if (hackathon.prize_pool != null) prize_pool = String(hackathon.prize_pool);
    else if (hackathon.prize) prize_pool = String(hackathon.prize);
  }

  let team_min = 1, team_max = 4;
  if (hackathon) {
    if (hackathon.min_team_size) team_min = hackathon.min_team_size;
    if (hackathon.max_team_size) team_max = hackathon.max_team_size;
  }

  const ps = [];
  const seenPs = new Set();
  const tracks = hackathon ? (hackathon.tracks || hackathon.themes || hackathon.problem_statements || []) : [];
  if (Array.isArray(tracks)) {
    for (const t of tracks) {
      const title = t.title || t.name || t.track || (typeof t === 'string' ? t : '');
      const desc = t.description || t.desc || '';
      if (title && !seenPs.has(title.toLowerCase())) {
        seenPs.add(title.toLowerCase());
        ps.push({ track: t.track || '', title: title, description: desc });
      }
    }
  }

  const resourceLinks = [];
  const seenHrefs = new Set();
  const kws = ['problem','statement','pdf','rule','guideline','brochure','document','brief','challenge','track','theme','schedule','timeline'];
  document.querySelectorAll('a[href]').forEach(function(a) {
    const href = a.href || '';
    const text = a.textContent.trim();
    const hl = href.toLowerCase(), tl = text.toLowerCase();
    if (seenHrefs.has(href) || !href || href === '#') return;
    const isPdf = hl.endsWith('.pdf') || hl.includes('/pdf');
    const isDrive = hl.includes('drive.google.com') || hl.includes('docs.google.com');
    const isRel = kws.some(function(kw) { return tl.includes(kw) || hl.includes(kw); });
    if (isPdf || isDrive || isRel) {
      seenHrefs.add(href);
      resourceLinks.push({ text: text.substring(0, 150) || 'Document', url: href, type: isPdf ? 'pdf' : isDrive ? 'google_drive' : 'link' });
    }
  });

  return {
    name: name.substring(0, 200),
    description: description.substring(0, 2000),
    banner_url: banner,
    starts_at: starts_at,
    ends_at: ends_at,
    registration_deadline: reg_dl,
    submission_deadline: sub_dl,
    result_date: result_date,
    prize_pool: prize_pool,
    team_min: team_min,
    team_max: team_max,
    problem_statements: ps.slice(0, 20),
    resource_links: resourceLinks.slice(0, 30),
    bodyText: document.body.innerText.substring(0, 15000),
    found_next_data: hackathon !== null,
    hackathon_keys: hackathon ? Object.keys(hackathon).join(',') : ''
  };
}"""
252
+
253
+
254
def parse_devfolio(data: dict) -> dict:
    """Normalise the raw DEVFOLIO_SCRIPT payload into the scraper's common shape.

    Timestamp fields are funnelled through iso_to_date; team-size bounds
    default to 1-4 when absent; missing fields fall back to "" / [] / False.
    """
    get = data.get
    team_bounds = {"min": get("team_min", 1), "max": get("team_max", 4)}
    return {
        "name": get("name", ""),
        "description": get("description", ""),
        "banner_url": get("banner_url", ""),
        # Devfolio reports ISO strings or epoch values; normalise to YYYY-MM-DD.
        "start_date": iso_to_date(get("starts_at")),
        "end_date": iso_to_date(get("ends_at")),
        "registration_deadline": iso_to_date(get("registration_deadline")),
        "submission_deadline": iso_to_date(get("submission_deadline")),
        "result_date": iso_to_date(get("result_date")),
        "prize_pool": get("prize_pool", ""),
        "team_size": team_bounds,
        "problem_statements": get("problem_statements", []),
        "resource_links": get("resource_links", []),
        "body_text": get("bodyText", ""),
        # True when the in-page script located a __NEXT_DATA__ hackathon object.
        "found_structured": get("found_next_data", False),
    }
271
+
272
+
273
+ # ============================================================
274
+ # UNSTOP - Direct REST API
275
+ # ============================================================
276
+
277
def extract_unstop_id(url: str) -> Optional[str]:
    """Pull the numeric opportunity id (5-8 digits) out of an Unstop URL.

    Tries the slug form ("...-123456") before the path-segment form
    ("/123456/"); returns None when neither matches.
    """
    for pattern in (r"-(\d{5,8})(?:/|$|\?)", r"/(\d{5,8})(?:/|$|\?)"):
        found = re.search(pattern, url)
        if found:
            return found.group(1)
    return None
282
+
283
+
284
async def fetch_unstop_api(opportunity_id: str) -> Optional[dict]:
    """Fetch structured opportunity data from Unstop's public JSON endpoints.

    Tries each candidate endpoint in order and returns the first payload that
    looks like an opportunity dict (has an "id" or "title"); returns None when
    every endpoint fails. Failures are logged and swallowed (best-effort).

    Fix: reuse a single httpx.AsyncClient (and its connection pool) across all
    endpoint attempts instead of constructing a fresh client per URL.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; HackTrackBot/1.0)",
        "Accept": "application/json",
        "Referer": "https://unstop.com/hackathons",
    }
    endpoints = [
        f"https://unstop.com/api/public/opportunity/get-applied-detail?id={opportunity_id}",
        f"https://unstop.com/api/public/opportunity/{opportunity_id}",
        f"https://unstop.com/api/public/hackathon/{opportunity_id}",
    ]
    async with httpx.AsyncClient(timeout=15) as client:
        for api_url in endpoints:
            try:
                resp = await client.get(api_url, headers=headers)
                if resp.status_code != 200:
                    continue
                data = resp.json()
                # Payload shape varies by endpoint: sometimes {"data": {...}},
                # sometimes the object itself, sometimes nested one level
                # deeper under "opportunity".
                opp = data.get("data") or data
                if isinstance(opp, dict) and "opportunity" in opp:
                    opp = opp["opportunity"]
                if isinstance(opp, dict) and (opp.get("id") or opp.get("title")):
                    print(f"[Unstop] API success from {api_url}")
                    print(f"[Unstop] Keys: {list(opp.keys())[:15]}")
                    return opp
            except Exception as e:
                # Best-effort: log and fall through to the next candidate.
                print(f"[Unstop] {api_url} failed: {e}")
    return None
312
+
313
+
314
def _coerce_int(value, default: int) -> int:
    """int() that tolerates None / non-numeric API values instead of raising."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def parse_unstop_api(opp: dict) -> dict:
    """Normalize a raw Unstop opportunity payload into HackTrack's schema.

    Field names differ between Unstop endpoints, so each target field checks
    several candidate keys. Dates go through iso_to_date; the prize pool falls
    back to summing per-prize amounts when no aggregate field exists.

    Args:
        opp: Raw opportunity dict as returned by fetch_unstop_api.

    Returns:
        Dict matching the ScrapeResponse field set (minus platform/url).
    """
    start_date = iso_to_date(opp.get("start_date") or opp.get("starts_at"))
    end_date = iso_to_date(opp.get("end_date") or opp.get("ends_at"))
    reg_deadline = iso_to_date(
        opp.get("registrations_end_date") or
        opp.get("registration_deadline") or
        opp.get("registration_end_date")
    )
    sub_deadline = iso_to_date(
        opp.get("submission_end_date") or
        opp.get("submission_deadline") or
        opp.get("last_submission_date")
    )
    result_date = iso_to_date(opp.get("result_date") or opp.get("results_at"))

    prize_pool = ""
    if opp.get("prize_amount"):
        prize_pool = str(opp["prize_amount"])
    elif opp.get("prizes") and isinstance(opp["prizes"], list):
        total = 0
        for p in opp["prizes"]:
            # Amounts may be strings with thousands separators, missing,
            # or the entry itself may not be a dict.
            try:
                total += float(str(p.get("amount") or 0).replace(",", ""))
            except (AttributeError, TypeError, ValueError):
                pass
        if total:
            prize_pool = "Rs. {:,}".format(int(total))
    elif opp.get("total_prize"):
        prize_pool = str(opp["total_prize"])

    team_min = _coerce_int(opp.get("min_team_size") or opp.get("min_team"), 1)
    team_max = _coerce_int(opp.get("max_team_size") or opp.get("max_team"), 4)

    # Problem statements: prefer explicit rounds / problem-statement entries.
    ps = []
    seen_ps = set()
    for r in (opp.get("rounds") or opp.get("problem_statements") or []):
        if not isinstance(r, dict):
            continue
        title = r.get("title") or r.get("name") or r.get("round_name") or ""
        desc = r.get("description") or r.get("details") or ""
        if title and title.lower() not in seen_ps:
            seen_ps.add(title.lower())
            ps.append({"track": "", "title": title, "description": desc[:300]})

    # Fall back to tags/categories as lightweight "Theme" entries. Tags may
    # be dicts ({"name"/"label": ...}) or plain strings.
    tags = opp.get("tags") or opp.get("categories") or []
    if isinstance(tags, list) and not ps:
        for t in tags:
            if isinstance(t, dict):
                label = t.get("name") or t.get("label") or ""
            else:
                label = t if isinstance(t, str) else ""
            if label and label.lower() not in seen_ps:
                seen_ps.add(label.lower())
                ps.append({"track": "Theme", "title": label, "description": ""})

    return {
        "name": opp.get("title") or opp.get("name", ""),
        "description": (opp.get("short_description") or opp.get("description") or "")[:500],
        "banner_url": opp.get("banner_image") or opp.get("cover_image") or opp.get("image") or "",
        "start_date": start_date,
        "end_date": end_date,
        "registration_deadline": reg_deadline,
        "submission_deadline": sub_deadline,
        "result_date": result_date,
        "prize_pool": prize_pool,
        "team_size": {"min": team_min, "max": team_max},
        "problem_statements": ps[:20],
        "resource_links": [],
    }
377
 
378
 
379
  # ============================================================
380
+ # DEVPOST - <time datetime> tags + sidebar
381
  # ============================================================
382
 
383
# JS evaluated inside the Devpost page context (via page.evaluate).
# Harvests OpenGraph metadata, classifies <time datetime> tags by nearby
# deadline keywords into a time_map, scrapes prize text, theme links, and
# PDF/Drive/keyword-matched resource links, plus a bodyText sample for the
# regex fallback. Returned object is consumed by parse_devpost().
DEVPOST_SCRIPT = """() => {
  const getMeta = function(n) {
    const el = document.querySelector('meta[property="' + n + '"], meta[name="' + n + '"]');
    return el ? el.getAttribute('content') || '' : '';
  };

  let name = getMeta('og:title') || document.title.split('|')[0].trim();
  const h1 = document.querySelector('#challenge-title, h1.title, h1');
  if (h1 && h1.textContent.trim().length > 2) name = h1.textContent.trim();

  const banner = getMeta('og:image') || '';
  const description = getMeta('og:description') || '';

  // Bucket each <time> tag by the wording of its closest labelled container.
  const timeMap = {};
  document.querySelectorAll('time[datetime]').forEach(function(el) {
    const dt = el.getAttribute('datetime') || '';
    const parent = el.closest('.deadline, .date, li, div');
    const lbl = (parent ? parent.textContent : '').toLowerCase();
    if (lbl.includes('submission') || lbl.includes('submit')) { if (!timeMap.submission) timeMap.submission = dt; }
    else if (lbl.includes('registr') || lbl.includes('apply')) { if (!timeMap.registration) timeMap.registration = dt; }
    else if (lbl.includes('result') || lbl.includes('winner')) { if (!timeMap.result) timeMap.result = dt; }
    else if (lbl.includes('start') || lbl.includes('begin')) { if (!timeMap.start) timeMap.start = dt; }
    else if (lbl.includes('end') || lbl.includes('close')) { if (!timeMap.end) timeMap.end = dt; }
    else if (!timeMap.first) { timeMap.first = dt; }
  });

  let prize_pool = '';
  const prizeSection = document.querySelector('#prizes, .prize-amount, [id*="prize"]');
  if (prizeSection) {
    const txt = prizeSection.textContent;
    const m = txt.match(/([\\u20b9][\\d,]+|[$][\\d,]+(?:K|k|M)?)/);
    if (m) prize_pool = m[1];
    else prize_pool = txt.trim().substring(0, 80);
  }

  const themes = [];
  const seenT = new Set();
  document.querySelectorAll('a[href*="themes"], .software-list a').forEach(function(a) {
    const t = a.textContent.trim();
    if (t && t.length > 1 && t.length < 80 && !seenT.has(t)) { seenT.add(t); themes.push(t); }
  });

  const resourceLinks = [];
  const seenHrefs = new Set();
  const kws = ['problem','statement','pdf','rule','guideline','document','challenge','track','theme','schedule'];
  document.querySelectorAll('a[href]').forEach(function(a) {
    const href = a.href || '';
    const text = a.textContent.trim();
    const hl = href.toLowerCase(), tl = text.toLowerCase();
    if (seenHrefs.has(href) || !href || href === '#') return;
    const isPdf = hl.endsWith('.pdf') || hl.includes('/pdf');
    const isDrive = hl.includes('drive.google.com') || hl.includes('docs.google.com');
    const isRel = kws.some(function(kw) { return tl.includes(kw) || hl.includes(kw); });
    if (isPdf || isDrive || isRel) {
      seenHrefs.add(href);
      resourceLinks.push({ text: text.substring(0, 150) || 'Document', url: href, type: isPdf ? 'pdf' : isDrive ? 'google_drive' : 'link' });
    }
  });

  return {
    name: name.substring(0, 200),
    description: description.substring(0, 2000),
    banner_url: banner,
    time_map: timeMap,
    prize_pool: prize_pool,
    themes: themes,
    resource_links: resourceLinks.slice(0, 30),
    bodyText: document.body.innerText.substring(0, 15000)
  };
}"""
453
 
454
 
455
def parse_devpost(data: dict) -> dict:
    """Map the raw Devpost page-scrape payload onto HackTrack's schema."""
    time_map = data.get("time_map", {})
    statements = []
    for theme in data.get("themes", []):
        statements.append({"track": "Theme", "title": theme, "description": ""})
    parsed = {
        "name": data.get("name", ""),
        "description": data.get("description", ""),
        "banner_url": data.get("banner_url", ""),
        # An unlabelled first <time> tag stands in for the start date.
        "start_date": iso_to_date(time_map.get("start") or time_map.get("first")),
        "end_date": iso_to_date(time_map.get("end")),
        "registration_deadline": iso_to_date(time_map.get("registration")),
        "submission_deadline": iso_to_date(time_map.get("submission")),
        "result_date": iso_to_date(time_map.get("result")),
        "prize_pool": data.get("prize_pool", ""),
        # Devpost rarely publishes team limits; use the project default.
        "team_size": {"min": 1, "max": 4},
        "problem_statements": statements,
        "resource_links": data.get("resource_links", []),
        "body_text": data.get("bodyText", ""),
    }
    return parsed
473
 
474
+
475
+ # ============================================================
476
+ # GENERIC SCRIPT + REGEX FALLBACK
477
+ # ============================================================
478
+
479
# Platform-agnostic JS evaluated in the page context. Pulls a name from
# common heading selectors (falling back to OpenGraph / <title>), banner and
# description metadata, PDF/Drive/keyword-matched resource links, and a large
# bodyText sample that regex_extract() mines for dates/prizes/teams.
GENERIC_SCRIPT = """() => {
  const getMeta = function(n) {
    const el = document.querySelector('meta[property="' + n + '"], meta[name="' + n + '"]');
    return el ? el.getAttribute('content') || '' : '';
  };
  let name = '';
  const sels = ['h1','.hackathon-name','.event-name','#challenge-title','.opp-title'];
  for (let i = 0; i < sels.length; i++) {
    const el = document.querySelector(sels[i]);
    if (el && el.textContent.trim().length > 2) { name = el.textContent.trim(); break; }
  }
  name = name || getMeta('og:title') || document.title.split('|')[0].trim();
  const banner = getMeta('og:image') || '';
  const description = getMeta('og:description') || getMeta('description') || '';
  const resourceLinks = [];
  const seenHrefs = new Set();
  const kws = ['problem','statement','pdf','rule','guideline','document','challenge','track','theme','schedule'];
  document.querySelectorAll('a[href]').forEach(function(a) {
    const href = a.href || '';
    const text = a.textContent.trim();
    const hl = href.toLowerCase(), tl = text.toLowerCase();
    if (seenHrefs.has(href) || !href || href === '#') return;
    const isPdf = hl.endsWith('.pdf') || hl.includes('/pdf');
    const isDrive = hl.includes('drive.google.com') || hl.includes('docs.google.com');
    const isRel = kws.some(function(kw) { return tl.includes(kw) || hl.includes(kw); });
    if (isPdf || isDrive || isRel) {
      seenHrefs.add(href);
      resourceLinks.push({ text: text.substring(0, 150) || 'Document', url: href, type: isPdf ? 'pdf' : isDrive ? 'google_drive' : 'link' });
    }
  });
  return {
    name: name.substring(0, 200),
    description: description.substring(0, 2000),
    banner_url: banner,
    bodyText: document.body.innerText.substring(0, 20000),
    resource_links: resourceLinks.slice(0, 30)
  };
}"""
517
+
518
+
519
# Currency patterns for prize detection (INR, Rs., $, and the rupee sign).
_PRIZE_PATTERNS = [
    r"(Rs\.?\s*[\d,]+(?:\s*(?:Lakhs?|Crores?|K|k|L))?)",
    r"(INR\s*[\d,]+(?:\s*(?:Lakhs?|Crores?|K|k|L))?)",
    r"(\$\s*[\d,]+(?:\s*(?:K|k|M))?)",
    r"(\u20b9\s*[\d,]+(?:\s*(?:Lakhs?|Crores?|K|k|L))?)",
]


def _extract_dates(body_text: str) -> dict:
    """Date fields inferred from deadline keywords and 'runs from' phrasing."""
    dates = {
        "registration_deadline": find_dates_near(body_text, [
            "registration close", "register by", "registration deadline",
            "applications close", "apply by", "registration ends",
        ]),
        "submission_deadline": find_dates_near(body_text, [
            "submission deadline", "submissions close", "submit by",
            "final submission", "project submission", "deadline",
        ]),
        "start_date": "",
        "end_date": "",
        "result_date": find_dates_near(body_text, ["result", "winners announced", "results declared"]),
    }
    # "runs from Mar 1-3, 2025" style phrasing; year defaults to the current one.
    runs = re.search(
        r"(?:runs?\s+from|starts?\s+(?:on|from)?)\s*"
        r"((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2})"
        r"(?:\s*[-\u2013]\s*(\d{1,2}))?(?:[,\s]+(\d{4}))?",
        body_text, re.IGNORECASE,
    )
    if runs:
        year = runs.group(3) or str(datetime.now().year)
        dates["start_date"] = parse_any_date(f"{runs.group(1)} {year}")
        if runs.group(2):
            month = runs.group(1).split()[0]
            dates["end_date"] = parse_any_date(f"{month} {runs.group(2)} {year}")
    if not dates["start_date"]:
        dates["start_date"] = find_dates_near(body_text, ["start date", "starts on", "begins on"])
    if not dates["end_date"]:
        dates["end_date"] = find_dates_near(body_text, ["end date", "ends on", "hackathon ends"])
    return dates


def _extract_prize(body_text: str) -> str:
    """Prize string found near prize-ish keywords first, then anywhere."""
    lower = body_text.lower()
    for kw in ["prize", "reward", "worth", "winning", "bounty", "in cash", "in prizes"]:
        idx = lower.find(kw)
        if idx == -1:
            continue
        # Search a 400-char window centered on the keyword occurrence.
        chunk = body_text[max(0, idx - 200): idx + 200]
        for pat in _PRIZE_PATTERNS:
            m = re.search(pat, chunk, re.IGNORECASE)
            if m:
                return m.group(1).strip()
    # Last resort: case-sensitive scan of the whole text (matches original behavior).
    for pat in _PRIZE_PATTERNS:
        m = re.search(pat, body_text)
        if m:
            return m.group(1).strip()
    return ""


def _extract_team_size(body_text: str) -> dict:
    """Team-size range from 'team size: 2-4' style phrases; default 1-4."""
    for pat in [
        r"team\s*size[:\s]*(\d+)\s*[-\u2013to]+\s*(\d+)",
        r"(\d+)\s*[-\u2013to]+\s*(\d+)\s*(?:members?|people|per team)",
        r"teams?\s+of\s+(?:up\s+to\s+)?(\d+)",
        r"max(?:imum)?\s*(?:team)?\s*size\s*[:\s]*(\d+)",
    ]:
        m = re.search(pat, body_text, re.IGNORECASE)
        if m:
            g = [x for x in m.groups() if x]
            if len(g) == 2:
                return {"min": int(g[0]), "max": int(g[1])}
            return {"min": 1, "max": int(g[0])}
    return {"min": 1, "max": 4}


def _extract_problem_statements(body_text: str) -> list:
    """Tracks/themes from 'Domains:' lists and numbered 'PS #n:' headings."""
    ps, seen = [], set()
    dm = re.search(
        r"(?:domains?|themes?|tracks?|categories|verticals)[:\s]+([^\n]+)",
        body_text, re.IGNORECASE,
    )
    if dm:
        for item in re.split(r"[,|/]", dm.group(1)):
            item = item.strip().rstrip(".")
            if 3 < len(item) < 150 and item.lower() not in seen:
                seen.add(item.lower())
                ps.append({"track": "", "title": item, "description": ""})
    for m in re.finditer(
        r"(?:PS|Problem\s*Statement|Theme|Track|Challenge)\s*#?(\d+)\s*[:\-]\s*(.{5,200})",
        body_text, re.IGNORECASE,
    ):
        title = m.group(2).strip().split("\n")[0]
        if title.lower() not in seen and len(title) > 4:
            seen.add(title.lower())
            ps.append({"track": f"Track {m.group(1)}", "title": title, "description": ""})
    return ps[:20]


def regex_extract(body_text: str) -> dict:
    """Heuristic fallback extractor over a page's visible text.

    Used when the platform-specific extraction left structured fields empty.
    Returns dates (empty string when not found), prize_pool, team_size
    ({"min", "max"}), and up to 20 problem_statements.
    """
    result = {
        "prize_pool": _extract_prize(body_text),
        "team_size": _extract_team_size(body_text),
        "problem_statements": _extract_problem_statements(body_text),
    }
    result.update(_extract_dates(body_text))
    return result
608
 
609
+
610
+ # ============================================================
611
+ # SAFE EVALUATE
612
+ # ============================================================
613
+
614
# Shape returned when every evaluation attempt fails; callers .get() off it.
EMPTY_DATA = {"name": "", "description": "", "banner_url": "", "bodyText": "", "resource_links": []}


async def safe_evaluate(page, script: str, fallback=None) -> dict:
    """Evaluate a JS extraction script on a page, resilient to redirects.

    Makes up to three attempts. A "context destroyed" / "frame detached"
    error (client-side redirect) waits for the page to settle and retries
    the same script. Any other failure switches to the simpler `fallback`
    script (once), if one was provided; a second non-redirect failure gives
    up.

    Bug fixed vs. the previous version: the fallback was only consulted when
    `attempt == 1`, so a non-redirect failure on the very first attempt broke
    out of the loop and the fallback never ran.

    Returns the script's result dict, or EMPTY_DATA on total failure.
    """
    current = script
    used_fallback = False
    for attempt in range(3):
        try:
            try:
                # Best effort: don't fail the whole attempt on a busy network.
                await page.wait_for_load_state("networkidle", timeout=8000)
            except Exception:
                pass
            return await page.evaluate(current)
        except Exception as e:
            err = str(e)
            print(f"[Scraper] evaluate attempt {attempt+1} failed: {err[:120]}")
            if "Execution context was destroyed" in err or "Frame was detached" in err:
                print("[Scraper] Redirect detected, waiting to settle...")
                try:
                    await page.wait_for_load_state("domcontentloaded", timeout=12000)
                    await asyncio.sleep(2)
                except Exception:
                    await asyncio.sleep(3)
                continue
            if fallback is not None and not used_fallback:
                # Non-redirect failure: fall back to the simpler script once.
                current = fallback
                used_fallback = True
                continue
            break
    return EMPTY_DATA
640
+
641
+
642
+ # ============================================================
643
+ # MAIN SCRAPER
644
+ # ============================================================
645
 
646
async def scrape_with_playwright(url: str, platform: str) -> dict:
    """Scrape one hackathon page and return a normalized result dict.

    Strategy: Unstop is tried via its REST API first (no browser needed);
    otherwise a fresh browser context loads the page, waits for client-side
    JS, scrolls to trigger lazy-loaded content, runs the platform-specific
    extraction script, and fills remaining gaps with regex heuristics over
    the page text. Always returns a dict containing 'scrape_success'; on
    exception it also carries an 'error' message.
    """
    global browser
    if browser is None:
        # Startup hook has not run (or failed) -- nothing we can do.
        return {"scrape_success": False, "error": "Browser not initialized"}

    # Unstop: try API first
    if platform == "Unstop":
        opp_id = extract_unstop_id(url)
        print(f"[Unstop] Extracted ID: {opp_id}")
        if opp_id:
            opp = await fetch_unstop_api(opp_id)
            if opp:
                result = parse_unstop_api(opp)
                result["scrape_success"] = bool(result.get("name"))
                print(f"[Unstop] API: name='{result['name']}' reg={result['registration_deadline']} sub={result['submission_deadline']} ps={len(result['problem_statements'])}")
                return result
        print("[Unstop] API failed, falling back to Playwright")

    # Fresh context per request: isolates cookies/storage between scrapes.
    context = await browser.new_context(
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
        viewport={"width": 1920, "height": 1080},
    )
    try:
        page = await context.new_page()
        print(f"[Scraper] => {url} platform={platform}")
        # Heavier SPAs wait for the full 'load' event; others can start sooner.
        wait_until = "load" if platform in ("Devfolio", "MLH") else "domcontentloaded"
        try:
            await page.goto(url, wait_until=wait_until, timeout=30000)
        except Exception as e:
            if "Timeout" not in str(e): raise
            # A slow page may still have rendered enough to scrape.
            print("[Scraper] goto timeout, proceeding anyway")

        # Per-platform settle time for client-side rendering.
        wait_map = {"Unstop": 9, "DoraHacks": 8, "Devfolio": 8, "MLH": 4}
        wait_sec = wait_map.get(platform, 5)
        print(f"[Scraper] Waiting {wait_sec}s for JS...")
        await page.wait_for_timeout(wait_sec * 1000)

        # Scroll down the page (then back to top) to trigger lazy loading.
        for frac in [0.33, 0.66, 1.0, 0.0]:
            try:
                await page.evaluate(f"window.scrollTo(0, document.body.scrollHeight * {frac})")
            except Exception:
                pass
            await asyncio.sleep(0.6)
        await asyncio.sleep(1.0)

        if platform == "Devfolio":
            raw = await safe_evaluate(page, DEVFOLIO_SCRIPT, GENERIC_SCRIPT)
            print(f"[Devfolio] found_next_data={raw.get('found_next_data')} keys={raw.get('hackathon_keys','')[:80]}")
            result = parse_devfolio(raw)
            # Regex fill for any empty structured fields
            if not result.get("registration_deadline") or not result.get("start_date"):
                print("[Devfolio] Filling gaps with regex")
                regex = regex_extract(raw.get("bodyText", ""))
                for f in ("start_date","end_date","registration_deadline","submission_deadline","result_date","prize_pool"):
                    if not result.get(f) and regex.get(f):
                        result[f] = regex[f]
                if not result["problem_statements"]:
                    result["problem_statements"] = regex.get("problem_statements", [])
                # Only override the team size if it is still the default.
                if result["team_size"] == {"min": 1, "max": 4}:
                    result["team_size"] = regex.get("team_size", {"min": 1, "max": 4})

        elif platform == "Devpost":
            raw = await safe_evaluate(page, DEVPOST_SCRIPT, GENERIC_SCRIPT)
            result = parse_devpost(raw)
            if not result.get("submission_deadline") or not result.get("start_date"):
                regex = regex_extract(raw.get("bodyText", ""))
                for f in ("start_date","end_date","registration_deadline","submission_deadline","result_date"):
                    if not result.get(f) and regex.get(f):
                        result[f] = regex[f]

        else:
            # Unknown/other platforms: generic metadata scrape + regex heuristics.
            raw = await safe_evaluate(page, GENERIC_SCRIPT)
            regex = regex_extract(raw.get("bodyText", ""))
            result = {
                "name": raw.get("name", ""),
                "description": raw.get("description", ""),
                "banner_url": raw.get("banner_url", ""),
                "resource_links": raw.get("resource_links", []),
                **regex,
            }

        # A scrape only counts as successful when we got a plausible name.
        result["scrape_success"] = bool(result.get("name") and len(result.get("name","")) > 2)
        print(f"[Scraper] Done: name='{result.get('name','')}' reg={result.get('registration_deadline','')} sub={result.get('submission_deadline','')} start={result.get('start_date','')} prize='{result.get('prize_pool','')}' ps={len(result.get('problem_statements',[]))}")
        return result

    except Exception as e:
        import traceback
        traceback.print_exc()
        return {"scrape_success": False, "error": str(e)}
    finally:
        # Always dispose of the per-request context, even on errors.
        await context.close()
737
 
738
 
739
  # ============================================================
740
+ # APP LIFECYCLE & ROUTES
741
  # ============================================================
742
 
743
+ @app.on_event("startup")
744
+ async def startup() -> None:
745
+ global playwright, browser
746
+ from playwright.async_api import async_playwright
747
+ playwright = await async_playwright().start()
748
+ browser = await playwright.chromium.launch(
749
+ headless=True,
750
+ args=["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"],
751
+ )
752
+ print("[Scraper] v5.0 ready - structured extraction (no LLM)")
753
+
754
+
755
+ @app.on_event("shutdown")
756
+ async def shutdown() -> None:
757
+ global playwright, browser
758
+ try:
759
+ if browser: await browser.close()
760
+ finally:
761
+ browser = None
762
+ try:
763
+ if playwright: await playwright.stop()
764
+ finally:
765
+ playwright = None
766
+
767
+
768
  @app.get("/")
769
  async def root():
770
+ return {
771
+ "status": "ok",
772
+ "service": "HackTrack Scraper v5",
773
+ "strategy": "__NEXT_DATA__ + REST API + time-tags + regex",
774
+ "platforms": ["Devfolio", "Devpost", "Unstop", "DoraHacks", "MLH"],
775
+ }
776
+
777
+ @app.get("/health")
778
+ async def health():
779
+ return {"status": "ok", "timestamp": datetime.utcnow().isoformat()}
780
 
781
 
782
  @app.post("/scrape", response_model=ScrapeResponse)
783
  async def scrape(request: ScrapeRequest):
784
  url = request.url.strip()
785
  platform = detect_platform(url)
786
+ print(f"\n[Scraper] === {url} platform={platform} ===")
 
787
  try:
788
  data = await scrape_with_playwright(url, platform)
789
+ return ScrapeResponse(
 
790
  name=data.get("name", ""),
791
  platform=platform,
792
  banner_url=data.get("banner_url", ""),
 
803
  scrape_success=data.get("scrape_success", False),
804
  url=url,
805
  )
 
 
 
 
806
  except Exception as e:
807
  print(f"[Scraper] Endpoint error: {e}")
808
+ return ScrapeResponse(platform=platform, url=url, scrape_success=False)