Rudraaaa76 committed on
Commit d53c11e · verified · 1 Parent(s): 777228e

Update app.py

Files changed (1): app.py (+471 −249)

app.py CHANGED
@@ -1,20 +1,44 @@
  from fastapi import FastAPI
  from fastapi.middleware.cors import CORSMiddleware
  from pydantic import BaseModel, Field
  import asyncio
  import re
  import sys
  from urllib.parse import urlparse
  from typing import List
  from datetime import datetime

  if sys.platform == "win32":
-     # Playwright launches a driver subprocess; Proactor loop supports subprocess APIs on Windows.
      asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

- app = FastAPI(title="HackTrack Scraper", version="3.0.0")

- # Global Playwright runtime objects reused across requests.
  playwright = None
  browser = None
@@ -26,6 +50,10 @@ app.add_middleware(
  )


  class ScrapeRequest(BaseModel):
      url: str
@@ -46,55 +74,50 @@ class ScrapeResponse(BaseModel):
      resource_links: List[dict] = Field(default_factory=list)
      scrape_success: bool = False
      url: str = ""


  def detect_platform(url: str) -> str:
      domain = urlparse(url).netloc.lower()
-     if "devfolio" in domain:
-         return "Devfolio"
-     elif "unstop" in domain:
-         return "Unstop"
-     elif "devpost" in domain:
-         return "Devpost"
-     elif "dorahacks" in domain:
-         return "DoraHacks"
      return "Other"


- # ============================================================
- # DATE PARSING — robust multi-format
- # ============================================================
- MONTH_MAP = {
-     "jan": 1, "january": 1, "feb": 2, "february": 2, "mar": 3, "march": 3,
-     "apr": 4, "april": 4, "may": 5, "jun": 6, "june": 6,
-     "jul": 7, "july": 7, "aug": 8, "august": 8, "sep": 9, "sept": 9, "september": 9,
-     "oct": 10, "october": 10, "nov": 11, "november": 11, "dec": 12, "december": 12,
- }

  DATE_FORMATS = [
      "%Y-%m-%d", "%Y/%m/%d",
      "%d %B %Y", "%d %b %Y", "%d %B, %Y", "%d %b, %Y",
      "%B %d, %Y", "%b %d, %Y", "%B %d %Y", "%b %d %Y",
-     "%m/%d/%Y", "%d/%m/%Y",
-     "%B %d", "%b %d",
  ]


  def parse_any_date(text: str, fallback_year: int = None) -> str:
-     """Parse many date formats to YYYY-MM-DD. Handles partial dates."""
      if not text:
          return ""
      text = text.strip()
      text = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", text)
      text = re.sub(r"\s+", " ", text)
-
-     if not fallback_year:
-         fallback_year = datetime.now().year

      for fmt in DATE_FORMATS:
          try:
              dt = datetime.strptime(text, fmt)
-             if dt.year == 1900:  # no year in format
                  dt = dt.replace(year=fallback_year)
              if dt < datetime.now():
                  dt = dt.replace(year=fallback_year + 1)
@@ -105,9 +128,8 @@ def parse_any_date(text: str, fallback_year: int = None) -> str:


  def find_dates_near(text: str, keywords: List[str], window: int = 400) -> str:
-     """Find dates within `window` chars after any keyword."""
      lower = text.lower()
-     all_date_patterns = [
          r"(\d{4}[-/]\d{1,2}[-/]\d{1,2})",
          r"(\d{1,2}\s+(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)[,]?\s+\d{4})",
          r"((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?[,]?\s*\d{0,4})",
@@ -117,22 +139,87 @@ def find_dates_near(text: str, keywords: List[str], window: int = 400) -> str:
      idx = lower.find(kw.lower())
      if idx == -1:
          continue
-     chunk = text[idx:idx + window]
-     for pat in all_date_patterns:
-         match = re.search(pat, chunk, re.IGNORECASE)
-         if match:
-             parsed = parse_any_date(match.group(1))
              if parsed:
                  return parsed
      return ""


- # ============================================================
- # EXTRACT from full page innerText (the reliable approach)
- # ============================================================

- def extract_all_from_text(body_text: str, platform: str) -> dict:
-     """Extract hackathon details from page innerText using text patterns."""
      result = {
          "registration_deadline": "",
          "submission_deadline": "",
@@ -144,56 +231,46 @@ def extract_all_from_text(body_text: str, platform: str) -> dict:
          "problem_statements": [],
      }

-     # ---- DATES ----
-     # Registration deadline
      result["registration_deadline"] = find_dates_near(body_text, [
          "registration close", "registrations close", "register by",
          "last date to register", "registration deadline", "applications close",
          "apply by", "registration ends", "sign up deadline",
      ])
-
-     # Submission deadline
      result["submission_deadline"] = find_dates_near(body_text, [
          "submission deadline", "submission closes", "submissions close",
          "submit by", "last date to submit", "submission end",
-         "final submission", "project submission",
-         "deadline",  # generic fallback last
      ])

-     # Start date — Devfolio uses "Runs from Mar 25 - 26, 2026"
      runs_from = re.search(
          r"(?:runs?\s+from|starts?\s+(?:on|from)?|begins?\s+(?:on)?|commences?\s+(?:on)?)\s*[:\-]?\s*"
-         r"((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2})"
-         r"(?:\s*[-–]\s*(\d{1,2}))?"
-         r"(?:[,\s]+(\d{4}))?",
-         body_text, re.IGNORECASE
      )
      if runs_from:
-         start_text = runs_from.group(1)
          year = runs_from.group(3) or str(datetime.now().year)
-         result["start_date"] = parse_any_date(f"{start_text} {year}")
-         if runs_from.group(2) and runs_from.group(1):
              month = runs_from.group(1).split()[0]
              result["end_date"] = parse_any_date(f"{month} {runs_from.group(2)} {year}")

      if not result["start_date"]:
          result["start_date"] = find_dates_near(body_text, [
-             "start date", "starts on", "begins on", "hackathon starts",
-             "event starts", "event date", "dates:",
          ])
-
      if not result["end_date"]:
          result["end_date"] = find_dates_near(body_text, [
              "end date", "ends on", "hackathon ends", "event ends",
          ])
-
-     # Result date
      result["result_date"] = find_dates_near(body_text, [
-         "result", "winners announced", "announcement", "winner announcement",
-         "results declared", "shortlist",
      ])

-     # ---- PRIZE POOL ----
      prize_patterns = [
          r"(₹\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
          r"(\$\s*[\d,]+(?:\.\d+)?(?:\s*(?:K|k|M|million|thousand))?)",
@@ -202,161 +279,242 @@ def extract_all_from_text(body_text: str, platform: str) -> dict:
          r"(INR\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
          r"(Rs\.?\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
      ]
-
-     # Find prize amounts near keywords like "prize", "reward", "worth", "win"
      prize_lower = body_text.lower()
      for kw in ["prize", "reward", "worth", "winning", "bounty", "in cash", "in prizes"]:
          idx = prize_lower.find(kw)
          if idx == -1:
              continue
-         # Search ±200 chars around keyword
-         start = max(0, idx - 200)
-         chunk = body_text[start:idx + 200]
          for pat in prize_patterns:
-             match = re.search(pat, chunk, re.IGNORECASE)
-             if match:
-                 result["prize_pool"] = match.group(1).strip()
                  break
          if result["prize_pool"]:
              break
-
-     # Fallback: any large currency amount
      if not result["prize_pool"]:
          for pat in prize_patterns:
-             match = re.search(pat, body_text)
-             if match:
-                 result["prize_pool"] = match.group(1).strip()
                  break

-     # ---- TEAM SIZE ----
-     team_patterns = [
          r"team\s*size[:\s]*(\d+)\s*[-–to]+\s*(\d+)",
          r"(\d+)\s*[-–to]+\s*(\d+)\s*(?:members?|people|participants?|per team)",
          r"teams?\s+of\s+(?:up\s+to\s+)?(\d+)",
          r"max(?:imum)?\s*(?:team)?\s*(?:size)?\s*[:\s]*(\d+)",
-         r"(\d+)\s*[-–]\s*(\d+)\s*$",  # in FAQ: "2 - 4"
-     ]
-     for pat in team_patterns:
-         match = re.search(pat, body_text, re.IGNORECASE)
-         if match:
-             groups = [g for g in match.groups() if g]
-             if len(groups) == 2:
-                 result["team_size"] = {"min": int(groups[0]), "max": int(groups[1])}
-             elif len(groups) == 1:
-                 result["team_size"] = {"min": 1, "max": int(groups[0])}
              break

-     # ---- PROBLEM STATEMENTS / TRACKS / DOMAINS ----
-     ps = []
-     seen_ps = set()

-     # Pattern 1: "Domains: AI, ML, Web App" (Devfolio style)
-     domain_match = re.search(
          r"(?:domains?|themes?|tracks?|categories|verticals|areas?)[:\s]+([^\n💡🏆🎁🎟️📍📅⏳📞🌮]+)",
-         body_text, re.IGNORECASE
      )
-     if domain_match:
-         items = re.split(r"[,•|/]", domain_match.group(1))
-         for item in items:
              item = item.strip().rstrip(".")
-             if 3 < len(item) < 150 and item.lower() not in seen_ps:
-                 seen_ps.add(item.lower())
                  ps.append({"track": "", "title": item})

-     # Pattern 2: Numbered problem statements: "PS1: ...", "Problem Statement 1 - ..."
-     for match in re.finditer(
          r"(?:PS|Problem\s*Statement|Theme|Track|Challenge)\s*#?(\d+)\s*[:\-–]\s*(.{5,200})",
-         body_text, re.IGNORECASE
      ):
-         num = match.group(1)
-         title = match.group(2).strip().split("\n")[0]
-         if title.lower() not in seen_ps and len(title) > 4:
-             seen_ps.add(title.lower())
-             ps.append({"track": f"Track {num}", "title": title})
-
-     # Pattern 3: Devpost-style theme tags (already in themes list from JS)
-     # Pattern 4: Bulleted lists after "Themes" or "Tracks" heading
-     for match in re.finditer(
          r"(?:themes?|tracks?|problem\s*statements?|challenges?|domains?)\s*[:\n]"
          r"((?:\s*[-•●▸]\s*.{5,200}\n?)+)",
-         body_text, re.IGNORECASE
      ):
-         items = re.findall(r"[-•●▸]\s*(.{5,200})", match.group(1))
-         for item in items:
              item = item.strip().split("\n")[0]
-             if item.lower() not in seen_ps and 4 < len(item) < 200:
-                 seen_ps.add(item.lower())
                  ps.append({"track": "", "title": item})

      result["problem_statements"] = ps[:20]
      return result


- # ============================================================
- # PLAYWRIGHT SCRAPER gets innerText + meta from rendered page
- # ============================================================

- EXTRACT_SCRIPT = """() => {
      const getMeta = (name) => {
          const el = document.querySelector(`meta[property="${name}"], meta[name="${name}"]`);
          return el ? el.getAttribute('content') || '' : '';
      };
-
-     // Name: try multiple selectors
      const nameSelectors = [
-         'h1',
-         '.hackathon-name', '.event-name', '.challenge-title',
-         '#challenge-title', '.opp-title',
      ];
      let name = '';
      for (const sel of nameSelectors) {
          const el = document.querySelector(sel);
          if (el && el.textContent.trim().length > 2) {
-             name = el.textContent.trim();
-             break;
          }
      }
      name = name || getMeta('og:title') || document.title.split('|')[0].trim();

-     // Banner
      const banner = getMeta('og:image') || '';
-
-     // Description
      let description = getMeta('og:description') || getMeta('description') || '';
-
-     // Full page text for parsing
      const bodyText = document.body.innerText;

-     // For Devpost: extract themes from tag links
      const themes = [];
      document.querySelectorAll('a[href*="themes"]').forEach(a => {
          const t = a.textContent.trim();
          if (t && t.length > 2 && t.length < 100) themes.push(t);
      });

-     // Devpost sidebar prize text
      let sidebarPrize = '';
      document.querySelectorAll('a[href*="prizes"], .prize, [class*="prize"]').forEach(el => {
          const t = el.textContent.trim();
          if (t && t.length > 2) sidebarPrize += t + ' ';
      });

-     // Resource links: PDFs, Google Drive, problem statements, rules, guidelines
      const resourceLinks = [];
      const seenHrefs = new Set();
-     const linkKeywords = ['problem', 'statement', 'pdf', 'rule', 'guideline', 'brochure', 'document', 'brief', 'challenge', 'track', 'theme', 'schedule', 'timeline'];
      document.querySelectorAll('a[href]').forEach(a => {
          const href = a.href || '';
          const text = a.textContent.trim();
-         const hrefLower = href.toLowerCase();
-         const textLower = text.toLowerCase();
          if (seenHrefs.has(href) || !href || href === '#') return;

-         const isPdf = hrefLower.endsWith('.pdf') || hrefLower.includes('/pdf');
-         const isDrive = hrefLower.includes('drive.google.com') || hrefLower.includes('docs.google.com');
-         const isDropbox = hrefLower.includes('dropbox.com');
-         const isRelevant = linkKeywords.some(kw => textLower.includes(kw) || hrefLower.includes(kw));

          if (isPdf || isDrive || isDropbox || isRelevant) {
              seenHrefs.add(href);
              resourceLinks.push({
@@ -372,148 +530,210 @@ EXTRACT_SCRIPT = """() => {
              description: description.substring(0, 2000),
              banner_url: banner,
              bodyText: bodyText.substring(0, 30000),
-             themes: themes,
-             sidebarPrize: sidebarPrize.trim(),
              resourceLinks: resourceLinks.slice(0, 30),
          };
      }"""


  @app.on_event("startup")
  async def startup() -> None:
      global playwright, browser
      from playwright.async_api import async_playwright
-
      playwright = await async_playwright().start()
      browser = await playwright.chromium.launch(
          headless=True,
-         args=["--no-sandbox", "--disable-setuid-sandbox"],
      )
-     print("[Scraper] Playwright browser initialized")


  @app.on_event("shutdown")
  async def shutdown() -> None:
      global playwright, browser
-
      try:
-         if browser is not None:
-             await browser.close()
-             print("[Scraper] Browser closed")
      finally:
          browser = None
-
      try:
-         if playwright is not None:
-             await playwright.stop()
-             print("[Scraper] Playwright stopped")
      finally:
          playwright = None

- async def scrape_with_playwright(url: str, platform: str) -> dict:
-     """Scrape using Playwright — renders JS, grabs full innerText for parsing."""
-     global browser
-     try:
-         if browser is None:
-             return {
-                 "scrape_success": False,
-                 "error": "Browser is not initialized. Service startup failed.",
-             }
-
-         context = await browser.new_context(
-             user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
-             viewport={"width": 1920, "height": 1080},
-         )
-
-         try:
-             page = await context.new_page()
-
-             print(f"[Scraper] Navigating to {url} (platform: {platform})")
-             await page.goto(url, wait_until="domcontentloaded", timeout=20000)
-
-             # Wait for JS rendering — longer for SPAs
-             wait_time = 8 if platform in ("Unstop",) else 5
-             print(f"[Scraper] Waiting {wait_time}s for JS rendering...")
-             await page.wait_for_timeout(wait_time * 1000)
-
-             # Scroll to trigger lazy content
-             await page.evaluate("window.scrollTo(0, document.body.scrollHeight / 3)")
-             await asyncio.sleep(1)
-             await page.evaluate("window.scrollTo(0, document.body.scrollHeight * 2 / 3)")
-             await asyncio.sleep(1)
-             await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
-             await asyncio.sleep(1)
-             await page.evaluate("window.scrollTo(0, 0)")
-             await asyncio.sleep(0.5)
-
-             # Extract structured + raw text data
-             data = await page.evaluate(EXTRACT_SCRIPT)
-
-             body_text = data.get("bodyText", "")
-             name = data.get("name", "")
-             description = data.get("description", "")
-
-             print(f"[Scraper] Extracted name: '{name}', bodyText length: {len(body_text)}")
-
-             # Parse all fields from full innerText
-             extracted = extract_all_from_text(body_text, platform)
-
-             # Devpost themes from sidebar tags
-             themes = data.get("themes", [])
-             if themes and not extracted["problem_statements"]:
-                 seen = set()
-                 for t in themes:
-                     if t.lower() not in seen:
-                         seen.add(t.lower())
-                         extracted["problem_statements"].append({"track": "Theme", "title": t})
-
-             # Sidebar prize fallback (Devpost)
-             if not extracted["prize_pool"] and data.get("sidebarPrize"):
-                 prize_text = data["sidebarPrize"]
-                 for pat in [r"(\$[\d,]+(?:\.\d+)?(?:\s*(?:K|k|M))?)", r"(₹[\d,]+)"]:
-                     m = re.search(pat, prize_text)
-                     if m:
-                         extracted["prize_pool"] = m.group(1)
-                         break
-                 if not extracted["prize_pool"]:
-                     extracted["prize_pool"] = prize_text[:100]
-
-             return {
-                 "name": name,
-                 "description": description,
-                 "banner_url": data.get("banner_url", ""),
-                 "scrape_success": bool(name and len(name) > 2),
-                 "resource_links": data.get("resourceLinks", []),
-                 **extracted,
-             }
-         finally:
-             await context.close()
-
-     except Exception as e:
-         print(f"[Scraper] Error: {e}")
-         import traceback
-         traceback.print_exc()
-         return {"scrape_success": False, "error": str(e)}


- # ============================================================
- # API ROUTES
- # ============================================================

  @app.get("/")
  async def root():
-     return {"status": "ok", "service": "HackTrack Scraper v3"}


  @app.post("/scrape", response_model=ScrapeResponse)
  async def scrape(request: ScrapeRequest):
      url = request.url.strip()
      platform = detect_platform(url)
-     print(f"\n[Scraper] === New scrape request: {url} (platform={platform}) ===")

      try:
          data = await scrape_with_playwright(url, platform)
-
          response = ScrapeResponse(
              name=data.get("name", ""),
              platform=platform,
@@ -530,13 +750,15 @@ async def scrape(request: ScrapeRequest):
              resource_links=data.get("resource_links", []),
              scrape_success=data.get("scrape_success", False),
              url=url,
          )
-
-         print(f"[Scraper] Result: name='{response.name}', dates=({response.start_date}, {response.end_date}, reg={response.registration_deadline}, sub={response.submission_deadline}), prize='{response.prize_pool}', team={response.team_size}, ps={len(response.problem_statements)}")
          return response
-
      except Exception as e:
          print(f"[Scraper] Endpoint error: {e}")
          return ScrapeResponse(platform=platform, url=url, scrape_success=False)

+ """
+ HackTrack Scraper v4.0
+ - Groq LLM (llama-3.3-70b-versatile) for intelligent extraction
+ - Platforms: Devfolio, Unstop, Devpost, DoraHacks, MLH
+ - Deployed on Railway/Render/Fly.io
+ """
+
  from fastapi import FastAPI
  from fastapi.middleware.cors import CORSMiddleware
  from pydantic import BaseModel, Field
  import asyncio
  import re
  import sys
+ import os
+ import json
  from urllib.parse import urlparse
  from typing import List
  from datetime import datetime

+ # Groq client — free tier, llama-3.3-70b
+ from groq import Groq
+
  if sys.platform == "win32":
      asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

+ app = FastAPI(title="HackTrack Scraper", version="4.0.0")
+
+ # ── Groq setup ────────────────────────────────────────────────────────────────
+ _groq_client: Groq | None = None
+
+ def get_groq() -> Groq | None:
+     global _groq_client
+     if _groq_client is None:
+         key = os.environ.get("GROQ_API_KEY", "")
+         if key:
+             _groq_client = Groq(api_key=key)
+     return _groq_client
+
+ GROQ_MODEL = "llama-3.3-70b-versatile"
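The client is constructed lazily on first use, so the service still boots when no key is configured and Groq simply stays off. A minimal sketch of that contract, assuming `get_groq` from the file above is in scope (the key value is a placeholder):

```python
import os

# Without GROQ_API_KEY in the environment, get_groq() returns None and
# every scrape degrades to regex-only extraction.
os.environ["GROQ_API_KEY"] = "gsk-placeholder"  # hypothetical key, illustration only

client = get_groq()           # first call builds the singleton
assert client is get_groq()   # later calls reuse the cached client
```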
 
+ # ── Global Playwright runtime ─────────────────────────────────────────────────
  playwright = None
  browser = None

  )


+ # ══════════════════════════════════════════════════════════════════════════════
+ # MODELS
+ # ══════════════════════════════════════════════════════════════════════════════
+
  class ScrapeRequest(BaseModel):
      url: str

      resource_links: List[dict] = Field(default_factory=list)
      scrape_success: bool = False
      url: str = ""
+     llm_used: bool = False  # tells frontend whether Groq enriched this


+ # ══════════════════════════════════════════════════════════════════════════════
+ # PLATFORM DETECTION
+ # ══════════════════════════════════════════════════════════════════════════════

  def detect_platform(url: str) -> str:
      domain = urlparse(url).netloc.lower()
+     if "devfolio" in domain: return "Devfolio"
+     if "unstop" in domain: return "Unstop"
+     if "devpost" in domain: return "Devpost"
+     if "dorahacks" in domain: return "DoraHacks"
+     if "mlh.io" in domain: return "MLH"
+     if "hackerearth" in domain: return "HackerEarth"
+     if "hackerrank" in domain: return "HackerRank"
      return "Other"
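A quick sanity check of the dispatcher with illustrative URLs. Note the checks are plain substring matches on the hostname, so any domain containing, say, "unstop" maps to Unstop:

```python
assert detect_platform("https://unstop.com/hackathons/example-hack") == "Unstop"
assert detect_platform("https://example-hack.devfolio.co/") == "Devfolio"
assert detect_platform("https://mlh.io/seasons/2025/events") == "MLH"
assert detect_platform("https://example.com/hackathon") == "Other"
```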
 
+ # ══════════════════════════════════════════════════════════════════════════════
+ # DATE PARSING
+ # ══════════════════════════════════════════════════════════════════════════════

  DATE_FORMATS = [
      "%Y-%m-%d", "%Y/%m/%d",
      "%d %B %Y", "%d %b %Y", "%d %B, %Y", "%d %b, %Y",
      "%B %d, %Y", "%b %d, %Y", "%B %d %Y", "%b %d %Y",
+     "%m/%d/%Y", "%d/%m/%Y",
+     "%B %d", "%b %d",
  ]


  def parse_any_date(text: str, fallback_year: int = None) -> str:
      if not text:
          return ""
      text = text.strip()
      text = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", text)
      text = re.sub(r"\s+", " ", text)
+     fallback_year = fallback_year or datetime.now().year

      for fmt in DATE_FORMATS:
          try:
              dt = datetime.strptime(text, fmt)
+             if dt.year == 1900:
                  dt = dt.replace(year=fallback_year)
              if dt < datetime.now():
                  dt = dt.replace(year=fallback_year + 1)


  def find_dates_near(text: str, keywords: List[str], window: int = 400) -> str:
      lower = text.lower()
+     patterns = [
          r"(\d{4}[-/]\d{1,2}[-/]\d{1,2})",
          r"(\d{1,2}\s+(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)[,]?\s+\d{4})",
          r"((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?[,]?\s*\d{0,4})",

      idx = lower.find(kw.lower())
      if idx == -1:
          continue
+     chunk = text[idx: idx + window]
+     for pat in patterns:
+         m = re.search(pat, chunk, re.IGNORECASE)
+         if m:
+             parsed = parse_any_date(m.group(1))
              if parsed:
                  return parsed
      return ""
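How the two date helpers compose, with illustrative inputs. The trick is that `strptime` defaults the year to 1900 when a format has no year field, which is what the `dt.year == 1900` check keys off; year-less dates also roll forward once they are in the past:

```python
# Ordinal suffixes are stripped before parsing ("25th" -> "25").
parse_any_date("25th March, 2026")            # -> "2026-03-25"

# "Mar 25" matches "%b %d"; the helper substitutes fallback_year, then
# bumps to fallback_year + 1 if the result is already behind datetime.now().
parse_any_date("Mar 25", fallback_year=2026)  # -> "2026-03-25" (or "2027-03-25" once past)

# find_dates_near only looks in a 400-char window after the keyword hit:
text = "Registration closes on 10 January 2026. See the FAQ for details."
find_dates_near(text, ["registration close"])  # -> "2026-01-10"
```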
 
+ # ══════════════════════════════════════════════════════════════════════════════
+ # GROQ LLM EXTRACTION (single call, returns full structured dict)
+ # ══════════════════════════════════════════════════════════════════════════════
+
+ def groq_extract(body_text: str, platform: str) -> dict | None:
+     """
+     One Groq call extracts ALL fields at once.
+     Returns None if Groq is unavailable or the call fails.
+     """
+     client = get_groq()
+     if not client:
+         return None
+
+     # Trim to ~5000 chars to stay within token limits comfortably
+     excerpt = body_text[:5000]
+
+     prompt = f"""You are extracting structured data from a hackathon page ({platform}).
+
+ Return ONLY valid JSON — no markdown, no explanation.
+
+ Schema:
+ {{
+   "registration_deadline": "YYYY-MM-DD or empty string",
+   "submission_deadline": "YYYY-MM-DD or empty string",
+   "result_date": "YYYY-MM-DD or empty string",
+   "start_date": "YYYY-MM-DD or empty string",
+   "end_date": "YYYY-MM-DD or empty string",
+   "prize_pool": "raw string like ₹5,00,000 or $10,000 or empty string",
+   "team_size": {{"min": 1, "max": 4}},
+   "problem_statements": [
+     {{"track": "optional track label", "title": "PS or theme title"}}
+   ]
+ }}
+
+ Rules:
+ - Dates: assume year {datetime.now().year} if missing; use YYYY-MM-DD format.
+ - prize_pool: keep original currency symbol and denomination text (₹2 Lakh, $10K, etc.).
+ - team_size: extract min/max members. Default {{"min":1,"max":4}} if not found.
+ - problem_statements: list every unique track/theme/PS. Max 20 items.
+ - If a field is not found, use "" or [] or the default value shown.
+
+ Page text:
+ {excerpt}"""
+
+     try:
+         resp = client.chat.completions.create(
+             model=GROQ_MODEL,
+             max_tokens=1200,
+             temperature=0.05,
+             messages=[
+                 {
+                     "role": "system",
+                     "content": "You extract structured hackathon data. Respond with valid JSON only.",
+                 },
+                 {"role": "user", "content": prompt},
+             ],
+         )
+         raw = resp.choices[0].message.content.strip()
+         # Strip markdown fences if the model wraps its output
+         raw = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip()
+         return json.loads(raw)
+     except Exception as e:
+         print(f"[Groq] extraction failed: {e}")
+         return None
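The contract callers rely on: one call, one dict matching the prompt schema, or None whenever the key is missing, the request fails, or the reply is not parseable JSON. A hedged usage sketch with a synthetic page text (`regex_extract` and `merge_results` are defined below):

```python
page_text = "Registrations close 10 January 2026. Prize pool worth $10,000."  # sample innerText

llm = groq_extract(page_text, "Devpost")    # dict per the schema, or None
rgx = regex_extract(page_text, "Devpost")   # always a complete dict
merged, llm_used = merge_results(llm, rgx)
print(merged["registration_deadline"], llm_used)
```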
 
+ # ══════════════════════════════════════════════════════════════════════════════
+ # REGEX FALLBACK EXTRACTION (same logic as v3, kept as safety net)
+ # ══════════════════════════════════════════════════════════════════════════════
+
+ def regex_extract(body_text: str, platform: str) -> dict:
      result = {
          "registration_deadline": "",
          "submission_deadline": "",

          "problem_statements": [],
      }

+     # Dates
      result["registration_deadline"] = find_dates_near(body_text, [
          "registration close", "registrations close", "register by",
          "last date to register", "registration deadline", "applications close",
          "apply by", "registration ends", "sign up deadline",
      ])
      result["submission_deadline"] = find_dates_near(body_text, [
          "submission deadline", "submission closes", "submissions close",
          "submit by", "last date to submit", "submission end",
+         "final submission", "project submission", "deadline",
      ])

+     # "Runs from Mar 25 - 26, 2026"
      runs_from = re.search(
          r"(?:runs?\s+from|starts?\s+(?:on|from)?|begins?\s+(?:on)?|commences?\s+(?:on)?)\s*[:\-]?\s*"
+         r"((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|"
+         r"Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2})"
+         r"(?:\s*[-–]\s*(\d{1,2}))?(?:[,\s]+(\d{4}))?",
+         body_text, re.IGNORECASE,
      )
      if runs_from:
          year = runs_from.group(3) or str(datetime.now().year)
+         result["start_date"] = parse_any_date(f"{runs_from.group(1)} {year}")
+         if runs_from.group(2):
              month = runs_from.group(1).split()[0]
              result["end_date"] = parse_any_date(f"{month} {runs_from.group(2)} {year}")

      if not result["start_date"]:
          result["start_date"] = find_dates_near(body_text, [
+             "start date", "starts on", "begins on", "hackathon starts", "event starts",
          ])
      if not result["end_date"]:
          result["end_date"] = find_dates_near(body_text, [
              "end date", "ends on", "hackathon ends", "event ends",
          ])
      result["result_date"] = find_dates_near(body_text, [
+         "result", "winners announced", "announcement", "results declared",
      ])

+     # Prize
      prize_patterns = [
          r"(₹\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
          r"(\$\s*[\d,]+(?:\.\d+)?(?:\s*(?:K|k|M|million|thousand))?)",

          r"(INR\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
          r"(Rs\.?\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
      ]
      prize_lower = body_text.lower()
      for kw in ["prize", "reward", "worth", "winning", "bounty", "in cash", "in prizes"]:
          idx = prize_lower.find(kw)
          if idx == -1:
              continue
+         chunk = body_text[max(0, idx - 200): idx + 200]
          for pat in prize_patterns:
+             m = re.search(pat, chunk, re.IGNORECASE)
+             if m:
+                 result["prize_pool"] = m.group(1).strip()
                  break
          if result["prize_pool"]:
              break
      if not result["prize_pool"]:
          for pat in prize_patterns:
+             m = re.search(pat, body_text)
+             if m:
+                 result["prize_pool"] = m.group(1).strip()
                  break

+     # Team size
+     for pat in [
          r"team\s*size[:\s]*(\d+)\s*[-–to]+\s*(\d+)",
          r"(\d+)\s*[-–to]+\s*(\d+)\s*(?:members?|people|participants?|per team)",
          r"teams?\s+of\s+(?:up\s+to\s+)?(\d+)",
          r"max(?:imum)?\s*(?:team)?\s*(?:size)?\s*[:\s]*(\d+)",
+     ]:
+         m = re.search(pat, body_text, re.IGNORECASE)
+         if m:
+             groups = [g for g in m.groups() if g]
+             result["team_size"] = (
+                 {"min": int(groups[0]), "max": int(groups[1])} if len(groups) == 2
+                 else {"min": 1, "max": int(groups[0])}
+             )
              break

+     # Problem statements
+     ps, seen = [], set()

+     domain_m = re.search(
          r"(?:domains?|themes?|tracks?|categories|verticals|areas?)[:\s]+([^\n💡🏆🎁🎟️📍📅⏳📞🌮]+)",
+         body_text, re.IGNORECASE,
      )
+     if domain_m:
+         for item in re.split(r"[,•|/]", domain_m.group(1)):
              item = item.strip().rstrip(".")
+             if 3 < len(item) < 150 and item.lower() not in seen:
+                 seen.add(item.lower())
                  ps.append({"track": "", "title": item})

+     for m in re.finditer(
          r"(?:PS|Problem\s*Statement|Theme|Track|Challenge)\s*#?(\d+)\s*[:\-–]\s*(.{5,200})",
+         body_text, re.IGNORECASE,
      ):
+         title = m.group(2).strip().split("\n")[0]
+         if title.lower() not in seen and len(title) > 4:
+             seen.add(title.lower())
+             ps.append({"track": f"Track {m.group(1)}", "title": title})
+
+     for m in re.finditer(
          r"(?:themes?|tracks?|problem\s*statements?|challenges?|domains?)\s*[:\n]"
          r"((?:\s*[-•●▸]\s*.{5,200}\n?)+)",
+         body_text, re.IGNORECASE,
      ):
+         for item in re.findall(r"[-•●▸]\s*(.{5,200})", m.group(1)):
              item = item.strip().split("\n")[0]
+             if item.lower() not in seen and 4 < len(item) < 200:
+                 seen.add(item.lower())
                  ps.append({"track": "", "title": item})

      result["problem_statements"] = ps[:20]
      return result
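A smoke test for the fallback path on a synthetic snippet, with the expected values traced by hand from the patterns above:

```python
sample = """HackExample 2026
Registration closes on 10 January 2026.
Prize pool worth ₹1,00,000 in cash.
Team size: 2 - 4 members.
Tracks: GenAI, FinTech, Open Innovation
"""
out = regex_extract(sample, "Other")
# out["registration_deadline"] == "2026-01-10"
# out["prize_pool"]            == "₹1,00,000"
# out["team_size"]             == {"min": 2, "max": 4}
# out["problem_statements"]    -> GenAI / FinTech / Open Innovation titles
```

Note the `3 < len(item)` guard on track names: a two-character item like "AI" would be dropped, which is why the sample uses "GenAI".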
 
+ # ══════════════════════════════════════════════════════════════════════════════
+ # MERGE: LLM results take precedence, regex fills gaps
+ # ══════════════════════════════════════════════════════════════════════════════
+
+ def merge_results(llm: dict | None, regex: dict) -> tuple[dict, bool]:
+     """
+     Prefer LLM values; fall back to regex for any blank field.
+     Returns (merged_dict, llm_was_used).
+     """
+     if llm is None:
+         return regex, False
+
+     merged = {}
+     date_fields = [
+         "registration_deadline", "submission_deadline",
+         "result_date", "start_date", "end_date",
+     ]
+     for f in date_fields:
+         merged[f] = llm.get(f) or regex.get(f, "")
+
+     merged["prize_pool"] = llm.get("prize_pool") or regex.get("prize_pool", "")
+
+     # team_size: use LLM unless it's the bare default and regex found something
+     llm_ts = llm.get("team_size", {"min": 1, "max": 4})
+     regex_ts = regex.get("team_size", {"min": 1, "max": 4})
+     if llm_ts == {"min": 1, "max": 4} and regex_ts != {"min": 1, "max": 4}:
+         merged["team_size"] = regex_ts
+     else:
+         merged["team_size"] = llm_ts
+
+     # problem_statements: prefer LLM list; fall back to regex
+     llm_ps = llm.get("problem_statements", [])
+     regex_ps = regex.get("problem_statements", [])
+     merged["problem_statements"] = llm_ps if llm_ps else regex_ps
+
+     return merged, True
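The precedence rules in one example (synthetic dicts): LLM values win where present, regex fills blanks, and a regex team size overrides an LLM team size that is just the schema default:

```python
llm = {"start_date": "2026-03-25", "prize_pool": "",
       "team_size": {"min": 1, "max": 4}, "problem_statements": []}
rgx = {"start_date": "", "prize_pool": "₹1,00,000",
       "team_size": {"min": 2, "max": 4},
       "problem_statements": [{"track": "", "title": "GenAI"}]}

merged, used = merge_results(llm, rgx)
# merged["start_date"]         == "2026-03-25"          (LLM wins)
# merged["prize_pool"]         == "₹1,00,000"           (regex fills the gap)
# merged["team_size"]          == {"min": 2, "max": 4}  (regex beats the bare default)
# merged["problem_statements"] == rgx list              (LLM list was empty)
# used is True
```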
+
+ # ══════════════════════════════════════════════════════════════════════════════
+ # PLATFORM-SPECIFIC JS EXTRACTION SCRIPTS
+ # ══════════════════════════════════════════════════════════════════════════════

+ # Generic script works for Devpost, Unstop, DoraHacks, Other
+ GENERIC_EXTRACT_SCRIPT = """() => {
      const getMeta = (name) => {
          const el = document.querySelector(`meta[property="${name}"], meta[name="${name}"]`);
          return el ? el.getAttribute('content') || '' : '';
      };
      const nameSelectors = [
+         'h1', '.hackathon-name', '.event-name', '.challenge-title',
+         '#challenge-title', '.opp-title', '[class*="hackathon-title"]',
+         '[class*="event-title"]', '[class*="challenge-name"]',
      ];
      let name = '';
      for (const sel of nameSelectors) {
          const el = document.querySelector(sel);
          if (el && el.textContent.trim().length > 2) {
+             name = el.textContent.trim(); break;
          }
      }
      name = name || getMeta('og:title') || document.title.split('|')[0].trim();

      const banner = getMeta('og:image') || '';
      let description = getMeta('og:description') || getMeta('description') || '';
      const bodyText = document.body.innerText;

+     // Devpost themes
      const themes = [];
      document.querySelectorAll('a[href*="themes"]').forEach(a => {
          const t = a.textContent.trim();
          if (t && t.length > 2 && t.length < 100) themes.push(t);
      });

+     // Prize sidebar (Devpost)
      let sidebarPrize = '';
      document.querySelectorAll('a[href*="prizes"], .prize, [class*="prize"]').forEach(el => {
          const t = el.textContent.trim();
          if (t && t.length > 2) sidebarPrize += t + ' ';
      });

+     // Resource links
      const resourceLinks = [];
      const seenHrefs = new Set();
+     const kws = ['problem','statement','pdf','rule','guideline','brochure',
+                  'document','brief','challenge','track','theme','schedule','timeline'];
      document.querySelectorAll('a[href]').forEach(a => {
          const href = a.href || '';
          const text = a.textContent.trim();
+         const hl = href.toLowerCase(), tl = text.toLowerCase();
          if (seenHrefs.has(href) || !href || href === '#') return;
+         const isPdf = hl.endsWith('.pdf') || hl.includes('/pdf');
+         const isDrive = hl.includes('drive.google.com') || hl.includes('docs.google.com');
+         const isDropbox = hl.includes('dropbox.com');
+         const isRelevant = kws.some(kw => tl.includes(kw) || hl.includes(kw));
+         if (isPdf || isDrive || isDropbox || isRelevant) {
+             seenHrefs.add(href);
+             resourceLinks.push({
+                 text: text.substring(0, 150) || 'Document',
+                 url: href,
+                 type: isPdf ? 'pdf' : isDrive ? 'google_drive' : isDropbox ? 'dropbox' : 'link',
+             });
+         }
+     });
+
+     return {
+         name: name.substring(0, 200),
+         description: description.substring(0, 2000),
+         banner_url: banner,
+         bodyText: bodyText.substring(0, 30000),
+         themes,
+         sidebarPrize: sidebarPrize.trim(),
+         resourceLinks: resourceLinks.slice(0, 30),
+     };
+ }"""
+
+
+ # Devfolio-specific: clicks "About" tab to expose full description + dates
+ DEVFOLIO_EXTRACT_SCRIPT = """async () => {
+     // Try clicking the About/Overview tab if present
+     const tabSelectors = ['a[href*="about"]', 'button[aria-label*="about" i]',
+                           '[role="tab"]', 'nav a'];
+     for (const sel of tabSelectors) {
+         const tabs = document.querySelectorAll(sel);
+         for (const tab of tabs) {
+             if (/about|overview/i.test(tab.textContent)) {
+                 tab.click();
+                 await new Promise(r => setTimeout(r, 1000));
+                 break;
+             }
+         }
+     }
+
+     const getMeta = (name) => {
+         const el = document.querySelector(`meta[property="${name}"], meta[name="${name}"]`);
+         return el ? el.getAttribute('content') || '' : '';
+     };

+     let name = '';
+     for (const sel of ['h1', '.sc-hackathon-title', '[class*="title"]']) {
+         const el = document.querySelector(sel);
+         if (el && el.textContent.trim().length > 2) { name = el.textContent.trim(); break; }
+     }
+     name = name || getMeta('og:title') || document.title.split('|')[0].trim();

+     const bodyText = document.body.innerText;
+     const banner = getMeta('og:image') || '';
+     const description = getMeta('og:description') || getMeta('description') || '';
+
+     // Resource links
+     const resourceLinks = [];
+     const seenHrefs = new Set();
+     const kws = ['problem','statement','pdf','rule','guideline','brochure',
+                  'document','brief','challenge','track','theme','schedule','timeline'];
+     document.querySelectorAll('a[href]').forEach(a => {
+         const href = a.href || '';
+         const text = a.textContent.trim();
+         const hl = href.toLowerCase(), tl = text.toLowerCase();
+         if (seenHrefs.has(href) || !href || href === '#') return;
+         const isPdf = hl.endsWith('.pdf') || hl.includes('/pdf');
+         const isDrive = hl.includes('drive.google.com') || hl.includes('docs.google.com');
+         const isDropbox = hl.includes('dropbox.com');
+         const isRelevant = kws.some(kw => tl.includes(kw) || hl.includes(kw));
          if (isPdf || isDrive || isDropbox || isRelevant) {
              seenHrefs.add(href);
              resourceLinks.push({

          description: description.substring(0, 2000),
          banner_url: banner,
          bodyText: bodyText.substring(0, 30000),
+         themes: [],
+         sidebarPrize: '',
          resourceLinks: resourceLinks.slice(0, 30),
      };
  }"""


+ # MLH: static listing — we grab individual event pages
+ MLH_EXTRACT_SCRIPT = """() => {
+     const getMeta = (name) => {
+         const el = document.querySelector(`meta[property="${name}"], meta[name="${name}"]`);
+         return el ? el.getAttribute('content') || '' : '';
+     };
+     let name = getMeta('og:title') || document.title.split('|')[0].trim();
+     const banner = getMeta('og:image') || '';
+     const description = getMeta('og:description') || '';
+     const bodyText = document.body.innerText;
+
+     const resourceLinks = [];
+     const seenHrefs = new Set();
+     document.querySelectorAll('a[href]').forEach(a => {
+         const href = a.href || '';
+         const text = a.textContent.trim();
+         const hl = href.toLowerCase();
+         if (seenHrefs.has(href) || !href || href === '#') return;
+         if (hl.endsWith('.pdf') || hl.includes('drive.google.com')) {
+             seenHrefs.add(href);
+             resourceLinks.push({ text: text.substring(0, 150) || 'Document', url: href, type: 'pdf' });
+         }
+     });
+
+     return {
+         name: name.substring(0, 200),
+         description: description.substring(0, 2000),
+         banner_url: banner,
+         bodyText: bodyText.substring(0, 30000),
+         themes: [],
+         sidebarPrize: '',
+         resourceLinks: resourceLinks.slice(0, 20),
+     };
+ }"""
+
+
+ def get_extract_script(platform: str) -> str:
+     if platform == "Devfolio":
+         return DEVFOLIO_EXTRACT_SCRIPT
+     if platform == "MLH":
+         return MLH_EXTRACT_SCRIPT
+     return GENERIC_EXTRACT_SCRIPT
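Dispatch is by identity, so the mapping is easy to eyeball. Note the Devfolio script is an async arrow function, which works because Playwright's `page.evaluate` awaits a returned promise:

```python
assert get_extract_script("Devfolio") is DEVFOLIO_EXTRACT_SCRIPT  # async, clicks About tab
assert get_extract_script("MLH") is MLH_EXTRACT_SCRIPT            # meta-tag driven
assert get_extract_script("Devpost") is GENERIC_EXTRACT_SCRIPT    # no Devpost-specific script
assert get_extract_script("Other") is GENERIC_EXTRACT_SCRIPT
```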
+
+ # ══════════════════════════════════════════════════════════════════════════════
+ # PLAYWRIGHT SCRAPER
+ # ══════════════════════════════════════════════════════════════════════════════
+
+ async def scrape_with_playwright(url: str, platform: str) -> dict:
+     global browser
+     if browser is None:
+         return {"scrape_success": False, "error": "Browser not initialized"}
+
+     context = await browser.new_context(
+         user_agent=(
+             "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+             "AppleWebKit/537.36 (KHTML, like Gecko) "
+             "Chrome/125.0.0.0 Safari/537.36"
+         ),
+         viewport={"width": 1920, "height": 1080},
+     )
+
+     try:
+         page = await context.new_page()
+         print(f"[Scraper] → {url} (platform={platform})")
+
+         await page.goto(url, wait_until="domcontentloaded", timeout=25000)
+
+         # Platform-specific wait times
+         wait_map = {"Unstop": 9, "DoraHacks": 8, "Devfolio": 7, "MLH": 4}
+         wait_sec = wait_map.get(platform, 5)
+         print(f"[Scraper] Waiting {wait_sec}s for JS...")
+         await page.wait_for_timeout(wait_sec * 1000)
+
+         # Scroll to trigger lazy-loaded content
+         for frac in [0.33, 0.66, 1.0, 0.0]:
+             await page.evaluate(f"window.scrollTo(0, document.body.scrollHeight * {frac})")
+             await asyncio.sleep(0.8)
+
+         # Run platform-specific extraction script
+         script = get_extract_script(platform)
+         # Devfolio script is async — evaluate handles both sync and async
+         try:
+             data = await page.evaluate(script)
+         except Exception:
+             # Fallback to generic if platform script errors
+             data = await page.evaluate(GENERIC_EXTRACT_SCRIPT)
+
+         body_text = data.get("bodyText", "")
+         print(f"[Scraper] bodyText={len(body_text)} chars, name='{data.get('name','')}'")
+
+         # ── Extraction pipeline ───────────────────────────────────────────────
+         # 1. Regex extraction (fast, always runs)
+         regex_result = regex_extract(body_text, platform)
+
+         # 2. Groq LLM extraction (slower, enriches results)
+         llm_result = groq_extract(body_text, platform)
+
+         # 3. Merge: LLM wins, regex fills gaps
+         merged, llm_used = merge_results(llm_result, regex_result)
+
+         # 4. Platform-specific post-processing
+         # Devpost: inject sidebar themes if PS list is empty
+         themes = data.get("themes", [])
+         if themes and not merged["problem_statements"]:
+             seen = set()
+             for t in themes:
+                 if t.lower() not in seen:
+                     seen.add(t.lower())
+                     merged["problem_statements"].append({"track": "Theme", "title": t})
+
+         # Devpost: sidebar prize fallback
+         sidebar_prize = data.get("sidebarPrize", "")
+         if not merged["prize_pool"] and sidebar_prize:
+             for pat in [r"(\$[\d,]+(?:\.\d+)?(?:\s*(?:K|k|M))?)", r"(₹[\d,]+)"]:
+                 m = re.search(pat, sidebar_prize)
+                 if m:
+                     merged["prize_pool"] = m.group(1)
+                     break
+             if not merged["prize_pool"]:
+                 merged["prize_pool"] = sidebar_prize[:100]
+
+         return {
+             "name": data.get("name", ""),
+             "description": data.get("description", ""),
+             "banner_url": data.get("banner_url", ""),
+             "resource_links": data.get("resourceLinks", []),
+             "scrape_success": bool(data.get("name") and len(data.get("name", "")) > 2),
+             "llm_used": llm_used,
+             **merged,
+         }
+
+     except Exception as e:
+         import traceback
+         traceback.print_exc()
+         return {"scrape_success": False, "error": str(e)}
+     finally:
+         await context.close()
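`scrape_with_playwright` expects the module-level `browser` to have been set by the startup hook below. A minimal standalone driver for local testing, assuming this file is saved as app.py and Playwright's Chromium build is installed (`playwright install chromium`); the target URL is illustrative:

```python
import asyncio

import app  # this module, saved as app.py

async def main() -> None:
    from playwright.async_api import async_playwright
    # Mirror the startup hook so app.browser exists before scraping.
    app.playwright = await async_playwright().start()
    app.browser = await app.playwright.chromium.launch(headless=True)
    try:
        data = await app.scrape_with_playwright("https://devpost.com/hackathons", "Devpost")
        print(data.get("name"), data.get("scrape_success"), data.get("llm_used"))
    finally:
        await app.browser.close()
        await app.playwright.stop()

asyncio.run(main())
```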
+
+ # ══════════════════════════════════════════════════════════════════════════════
+ # APP LIFECYCLE
+ # ══════════════════════════════════════════════════════════════════════════════
+
  @app.on_event("startup")
  async def startup() -> None:
      global playwright, browser
      from playwright.async_api import async_playwright
      playwright = await async_playwright().start()
      browser = await playwright.chromium.launch(
          headless=True,
+         args=["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"],
      )
+     groq_ready = "✓" if get_groq() else "✗ (set GROQ_API_KEY for LLM enrichment)"
+     print(f"[Scraper] Playwright ready. Groq={groq_ready}")


  @app.on_event("shutdown")
  async def shutdown() -> None:
      global playwright, browser
      try:
+         if browser: await browser.close()
      finally:
          browser = None
      try:
+         if playwright: await playwright.stop()
      finally:
          playwright = None


+ # ══════════════════════════════════════════════════════════════════════════════
+ # ROUTES
+ # ══════════════════════════════════════════════════════════════════════════════

  @app.get("/")
  async def root():
+     return {
+         "status": "ok",
+         "service": "HackTrack Scraper v4",
+         "groq_enabled": get_groq() is not None,
+         "platforms": ["Devfolio", "Devpost", "Unstop", "DoraHacks", "MLH", "HackerEarth", "HackerRank"],
+     }
+
+
+ @app.get("/health")
+ async def health():
+     return {"status": "ok", "timestamp": datetime.utcnow().isoformat()}
  @app.post("/scrape", response_model=ScrapeResponse)
  async def scrape(request: ScrapeRequest):
      url = request.url.strip()
      platform = detect_platform(url)
+     print(f"\n[Scraper] === {url} platform={platform} ===")

      try:
          data = await scrape_with_playwright(url, platform)
          response = ScrapeResponse(
              name=data.get("name", ""),
              platform=platform,

              resource_links=data.get("resource_links", []),
              scrape_success=data.get("scrape_success", False),
              url=url,
+             llm_used=data.get("llm_used", False),
+         )
+         print(
+             f"[Scraper] Done: name='{response.name}' "
+             f"reg={response.registration_deadline} sub={response.submission_deadline} "
+             f"prize='{response.prize_pool}' ps={len(response.problem_statements)} "
+             f"llm={response.llm_used}"
          )
          return response
      except Exception as e:
          print(f"[Scraper] Endpoint error: {e}")
          return ScrapeResponse(platform=platform, url=url, scrape_success=False)
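For reference, a client-side round trip against a locally running instance (`uvicorn app:app --port 8000`); the target URL is illustrative, and the call can take tens of seconds while the page renders and the optional Groq call completes:

```python
import requests

print(requests.get("http://localhost:8000/health").json())

resp = requests.post(
    "http://localhost:8000/scrape",
    json={"url": "https://devpost.com/hackathons"},  # illustrative target
    timeout=90,
)
body = resp.json()
print(body["name"], body["scrape_success"], body["llm_used"])
```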