Rudraaaa76 commited on
Commit
5058507
·
verified ·
1 Parent(s): ebc905a

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +46 -0
  2. app.py +542 -0
  3. requirements.txt +4 -0
Dockerfile ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# HackTrack Scraper — FastAPI + Playwright (Chromium) service image.
FROM python:3.11-slim

WORKDIR /app

# Unbuffered stdout so container logs stream in real time.
ENV PYTHONUNBUFFERED=1

# System libraries Chromium needs at runtime on Debian slim.
RUN apt-get update && apt-get install -y \
    ca-certificates \
    wget \
    gnupg \
    fonts-liberation \
    fonts-noto-color-emoji \
    libasound2 \
    libatk-bridge2.0-0 \
    libatk1.0-0 \
    libatspi2.0-0 \
    libcairo2 \
    libcups2 \
    libdbus-1-3 \
    libdrm2 \
    libgbm1 \
    libglib2.0-0 \
    libgtk-3-0 \
    libnss3 \
    libpango-1.0-0 \
    libx11-6 \
    libx11-xcb1 \
    libxcb1 \
    libxcomposite1 \
    libxdamage1 \
    libxext6 \
    libxfixes3 \
    libxkbcommon0 \
    libxrandr2 \
    xdg-utils \
    && rm -rf /var/lib/apt/lists/*

# Install Python deps first so this layer is cached across code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Download the Chromium build matching the pinned playwright version.
# --with-deps is belt-and-braces on top of the apt list above.
RUN playwright install --with-deps chromium

COPY . .

EXPOSE 7860

# FIX: the application module is app.py (exposes `app`), not main.py —
# "main:app" made the container exit at startup with ModuleNotFoundError.
CMD ["python", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,542 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
import asyncio
import re
import sys
from urllib.parse import urlparse
from typing import List
from datetime import datetime

if sys.platform == "win32":
    # Playwright launches a driver subprocess; Proactor loop supports subprocess APIs on Windows.
    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

app = FastAPI(title="HackTrack Scraper", version="3.0.0")

# Global Playwright runtime objects reused across requests.
# Populated by the startup hook, cleared by the shutdown hook.
playwright = None
browser = None

# NOTE(review): CORS is fully open (any origin/method/header) — confirm
# this is intentional for the deployment environment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
27
+
28
+
29
class ScrapeRequest(BaseModel):
    """Request body for POST /scrape: the hackathon page URL to fetch."""

    url: str
31
+
32
+
33
class ScrapeResponse(BaseModel):
    """Normalized hackathon details returned by POST /scrape.

    Every field defaults to an empty value so a partial scrape still
    serializes cleanly; `scrape_success` tells the caller whether the
    page yielded a usable event name.
    """

    name: str = ""
    platform: str = ""
    banner_url: str = ""
    description: str = ""
    # Date fields are "YYYY-MM-DD" strings, or "" when not found.
    registration_deadline: str = ""
    submission_deadline: str = ""
    result_date: str = ""
    start_date: str = ""
    end_date: str = ""
    prize_pool: str = ""
    # {"min": int, "max": int}; 1-4 when the page states nothing.
    team_size: dict = Field(default_factory=lambda: {"min": 1, "max": 4})
    # Each entry: {"track": str, "title": str}.
    problem_statements: List[dict] = Field(default_factory=list)
    # Each entry: {"text": str, "url": str, "type": str}.
    resource_links: List[dict] = Field(default_factory=list)
    scrape_success: bool = False
    url: str = ""
49
+
50
+
51
def detect_platform(url: str) -> str:
    """Identify the hosting platform from the URL's domain.

    Matches by substring of the (lower-cased) netloc; unknown hosts
    fall through to "Other".
    """
    domain = urlparse(url).netloc.lower()
    known_platforms = (
        ("devfolio", "Devfolio"),
        ("unstop", "Unstop"),
        ("devpost", "Devpost"),
        ("dorahacks", "DoraHacks"),
    )
    for needle, label in known_platforms:
        if needle in domain:
            return label
    return "Other"
62
+
63
+
64
# ============================================================
# DATE PARSING — robust multi-format
# ============================================================
# Month-name (full and abbreviated) to month-number lookup.
# NOTE(review): not referenced anywhere in this module — either dead
# code or intended for a future parser; candidate for removal.
MONTH_MAP = {
    "jan": 1, "january": 1, "feb": 2, "february": 2, "mar": 3, "march": 3,
    "apr": 4, "april": 4, "may": 5, "jun": 6, "june": 6,
    "jul": 7, "july": 7, "aug": 8, "august": 8, "sep": 9, "sept": 9, "september": 9,
    "oct": 10, "october": 10, "nov": 11, "november": 11, "dec": 12, "december": 12,
}
73
+
74
# strptime formats tried in order. The final two carry no year; strptime
# defaults those to 1900, which parse_any_date detects and patches.
DATE_FORMATS = [
    "%Y-%m-%d", "%Y/%m/%d",
    "%d %B %Y", "%d %b %Y", "%d %B, %Y", "%d %b, %Y",
    "%B %d, %Y", "%b %d, %Y", "%B %d %Y", "%b %d %Y",
    "%m/%d/%Y", "%d/%m/%Y",
    "%B %d", "%b %d",
]


def parse_any_date(text: str, fallback_year: int = None) -> str:
    """Normalize a free-form date string to "YYYY-MM-DD".

    Tries every entry of DATE_FORMATS in turn. Year-less matches get
    `fallback_year` (current year when omitted), bumped one year ahead
    if the resulting date already lies in the past. Returns "" when no
    format matches.
    """
    if not text:
        return ""

    # Strip ordinal suffixes ("21st" -> "21") and collapse whitespace runs.
    cleaned = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", text.strip())
    cleaned = re.sub(r"\s+", " ", cleaned)

    year = fallback_year or datetime.now().year

    for fmt in DATE_FORMATS:
        try:
            parsed = datetime.strptime(cleaned, fmt)
        except ValueError:
            continue
        if parsed.year == 1900:  # format carried no year — patch it in
            parsed = parsed.replace(year=year)
            if parsed < datetime.now():
                parsed = parsed.replace(year=year + 1)
        return parsed.strftime("%Y-%m-%d")
    return ""
105
+
106
+
107
def find_dates_near(text: str, keywords: List[str], window: int = 400) -> str:
    """Scan the `window` characters following each keyword for a date.

    Keywords are located case-insensitively (first occurrence only); the
    first snippet that contains a parseable date wins. Returns the
    normalized "YYYY-MM-DD" string, or "" when nothing is found.
    """
    haystack = text.lower()
    # ISO, "12 March 2026", "March 12(th)[, 2026]" and US slash styles.
    date_patterns = (
        r"(\d{4}[-/]\d{1,2}[-/]\d{1,2})",
        r"(\d{1,2}\s+(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)[,]?\s+\d{4})",
        r"((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?[,]?\s*\d{0,4})",
        r"(\d{1,2}/\d{1,2}/\d{4})",
    )
    for keyword in keywords:
        pos = haystack.find(keyword.lower())
        if pos < 0:
            continue
        snippet = text[pos:pos + window]
        for pattern in date_patterns:
            hit = re.search(pattern, snippet, re.IGNORECASE)
            if not hit:
                continue
            normalized = parse_any_date(hit.group(1))
            if normalized:
                return normalized
    return ""
128
+
129
+
130
+ # ============================================================
131
+ # EXTRACT from full page innerText (the reliable approach)
132
+ # ============================================================
133
+
134
def extract_all_from_text(body_text: str, platform: str) -> dict:
    """Extract hackathon details from page innerText using text patterns.

    body_text: full rendered-page innerText (capped upstream).
    platform:  detected platform label — currently unused here, kept for
               future platform-specific parsing.
    Returns a dict with date strings, prize_pool, team_size and
    problem_statements; absent fields stay at their empty defaults.
    """
    result = {
        "registration_deadline": "",
        "submission_deadline": "",
        "result_date": "",
        "start_date": "",
        "end_date": "",
        "prize_pool": "",
        "team_size": {"min": 1, "max": 4},
        "problem_statements": [],
    }

    # ---- DATES ----
    # Registration deadline
    result["registration_deadline"] = find_dates_near(body_text, [
        "registration close", "registrations close", "register by",
        "last date to register", "registration deadline", "applications close",
        "apply by", "registration ends", "sign up deadline",
    ])

    # Submission deadline
    result["submission_deadline"] = find_dates_near(body_text, [
        "submission deadline", "submission closes", "submissions close",
        "submit by", "last date to submit", "submission end",
        "final submission", "project submission",
        "deadline",  # generic fallback last
    ])

    # Start date — Devfolio uses "Runs from Mar 25 - 26, 2026".
    # Group 1 = "<Month> <day>", group 2 = optional range end day,
    # group 3 = optional year.
    runs_from = re.search(
        r"(?:runs?\s+from|starts?\s+(?:on|from)?|begins?\s+(?:on)?|commences?\s+(?:on)?)\s*[:\-]?\s*"
        r"((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2})"
        r"(?:\s*[-–]\s*(\d{1,2}))?"
        r"(?:[,\s]+(\d{4}))?",
        body_text, re.IGNORECASE
    )
    if runs_from:
        start_text = runs_from.group(1)
        year = runs_from.group(3) or str(datetime.now().year)
        result["start_date"] = parse_any_date(f"{start_text} {year}")
        # Range form "Mar 25 - 26": end date reuses the start month.
        if runs_from.group(2) and runs_from.group(1):
            month = runs_from.group(1).split()[0]
            result["end_date"] = parse_any_date(f"{month} {runs_from.group(2)} {year}")

    # Keyword fallbacks when the "runs from" pattern didn't match.
    if not result["start_date"]:
        result["start_date"] = find_dates_near(body_text, [
            "start date", "starts on", "begins on", "hackathon starts",
            "event starts", "event date", "dates:",
        ])

    if not result["end_date"]:
        result["end_date"] = find_dates_near(body_text, [
            "end date", "ends on", "hackathon ends", "event ends",
        ])

    # Result date
    result["result_date"] = find_dates_near(body_text, [
        "result", "winners announced", "announcement", "winner announcement",
        "results declared", "shortlist",
    ])

    # ---- PRIZE POOL ----
    # Currency patterns: INR symbol/words, USD, EUR, GBP, with optional
    # magnitude suffixes (Lakhs/Crores/K/M...). Group 1 is the full amount.
    prize_patterns = [
        r"(₹\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
        r"(\$\s*[\d,]+(?:\.\d+)?(?:\s*(?:K|k|M|million|thousand))?)",
        r"(€\s*[\d,]+(?:\.\d+)?)",
        r"(£\s*[\d,]+(?:\.\d+)?)",
        r"(INR\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
        r"(Rs\.?\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
    ]

    # Find prize amounts near keywords like "prize", "reward", "worth", "win"
    prize_lower = body_text.lower()
    for kw in ["prize", "reward", "worth", "winning", "bounty", "in cash", "in prizes"]:
        idx = prize_lower.find(kw)
        if idx == -1:
            continue
        # Search ±200 chars around keyword
        start = max(0, idx - 200)
        chunk = body_text[start:idx + 200]
        for pat in prize_patterns:
            match = re.search(pat, chunk, re.IGNORECASE)
            if match:
                result["prize_pool"] = match.group(1).strip()
                break
        if result["prize_pool"]:
            break

    # Fallback: any large currency amount
    if not result["prize_pool"]:
        for pat in prize_patterns:
            match = re.search(pat, body_text)
            if match:
                result["prize_pool"] = match.group(1).strip()
                break

    # ---- TEAM SIZE ----
    # NOTE(review): `[-–to]+` is a character CLASS (any run of '-', '–',
    # 't', 'o'), not the word "to" — it happens to match "3 to 5" but also
    # matches stray 't'/'o' runs; consider `(?:\s*(?:-|–|to)\s*)`.
    team_patterns = [
        r"team\s*size[:\s]*(\d+)\s*[-–to]+\s*(\d+)",
        r"(\d+)\s*[-–to]+\s*(\d+)\s*(?:members?|people|participants?|per team)",
        r"teams?\s+of\s+(?:up\s+to\s+)?(\d+)",
        r"max(?:imum)?\s*(?:team)?\s*(?:size)?\s*[:\s]*(\d+)",
        # NOTE(review): without re.MULTILINE, `$` anchors only at the very
        # end of body_text — likely intended to be per-line for FAQ "2 - 4".
        r"(\d+)\s*[-–]\s*(\d+)\s*$",  # in FAQ: "2 - 4"
    ]
    for pat in team_patterns:
        match = re.search(pat, body_text, re.IGNORECASE)
        if match:
            # Two captured numbers -> explicit min/max; one -> max only.
            groups = [g for g in match.groups() if g]
            if len(groups) == 2:
                result["team_size"] = {"min": int(groups[0]), "max": int(groups[1])}
            elif len(groups) == 1:
                result["team_size"] = {"min": 1, "max": int(groups[0])}
            break

    # ---- PROBLEM STATEMENTS / TRACKS / DOMAINS ----
    ps = []
    seen_ps = set()  # lower-cased titles already collected (dedup)

    # Pattern 1: "Domains: AI, ML, Web App" (Devfolio style).
    # The negated class stops the capture at a newline or a section emoji.
    domain_match = re.search(
        r"(?:domains?|themes?|tracks?|categories|verticals|areas?)[:\s]+([^\n💡🏆🎁🎟️📍📅⏳📞🌮]+)",
        body_text, re.IGNORECASE
    )
    if domain_match:
        items = re.split(r"[,•|/]", domain_match.group(1))
        for item in items:
            item = item.strip().rstrip(".")
            if 3 < len(item) < 150 and item.lower() not in seen_ps:
                seen_ps.add(item.lower())
                ps.append({"track": "", "title": item})

    # Pattern 2: Numbered problem statements: "PS1: ...", "Problem Statement 1 - ..."
    for match in re.finditer(
        r"(?:PS|Problem\s*Statement|Theme|Track|Challenge)\s*#?(\d+)\s*[:\-–]\s*(.{5,200})",
        body_text, re.IGNORECASE
    ):
        num = match.group(1)
        title = match.group(2).strip().split("\n")[0]
        if title.lower() not in seen_ps and len(title) > 4:
            seen_ps.add(title.lower())
            ps.append({"track": f"Track {num}", "title": title})

    # Pattern 3: Devpost-style theme tags (already in themes list from JS)
    # Pattern 4: Bulleted lists after "Themes" or "Tracks" heading
    for match in re.finditer(
        r"(?:themes?|tracks?|problem\s*statements?|challenges?|domains?)\s*[:\n]"
        r"((?:\s*[-•●▸]\s*.{5,200}\n?)+)",
        body_text, re.IGNORECASE
    ):
        items = re.findall(r"[-•●▸]\s*(.{5,200})", match.group(1))
        for item in items:
            item = item.strip().split("\n")[0]
            if item.lower() not in seen_ps and 4 < len(item) < 200:
                seen_ps.add(item.lower())
                ps.append({"track": "", "title": item})

    # Cap the list to keep responses bounded.
    result["problem_statements"] = ps[:20]
    return result
293
+
294
+
295
+ # ============================================================
296
+ # PLAYWRIGHT SCRAPER — gets innerText + meta from rendered page
297
+ # ============================================================
298
+
299
# JavaScript evaluated in the page context via page.evaluate(). Returns a
# plain object (name, description, banner_url, bodyText, themes,
# sidebarPrize, resourceLinks) that Playwright serializes to a Python dict.
# bodyText is capped at 30k chars and resourceLinks at 30 entries to keep
# the payload bounded.
EXTRACT_SCRIPT = """() => {
    const getMeta = (name) => {
        const el = document.querySelector(`meta[property="${name}"], meta[name="${name}"]`);
        return el ? el.getAttribute('content') || '' : '';
    };

    // Name: try multiple selectors
    const nameSelectors = [
        'h1',
        '.hackathon-name', '.event-name', '.challenge-title',
        '#challenge-title', '.opp-title',
    ];
    let name = '';
    for (const sel of nameSelectors) {
        const el = document.querySelector(sel);
        if (el && el.textContent.trim().length > 2) {
            name = el.textContent.trim();
            break;
        }
    }
    name = name || getMeta('og:title') || document.title.split('|')[0].trim();

    // Banner
    const banner = getMeta('og:image') || '';

    // Description
    let description = getMeta('og:description') || getMeta('description') || '';

    // Full page text for parsing
    const bodyText = document.body.innerText;

    // For Devpost: extract themes from tag links
    const themes = [];
    document.querySelectorAll('a[href*="themes"]').forEach(a => {
        const t = a.textContent.trim();
        if (t && t.length > 2 && t.length < 100) themes.push(t);
    });

    // Devpost sidebar prize text
    let sidebarPrize = '';
    document.querySelectorAll('a[href*="prizes"], .prize, [class*="prize"]').forEach(el => {
        const t = el.textContent.trim();
        if (t && t.length > 2) sidebarPrize += t + ' ';
    });

    // Resource links: PDFs, Google Drive, problem statements, rules, guidelines
    const resourceLinks = [];
    const seenHrefs = new Set();
    const linkKeywords = ['problem', 'statement', 'pdf', 'rule', 'guideline', 'brochure', 'document', 'brief', 'challenge', 'track', 'theme', 'schedule', 'timeline'];
    document.querySelectorAll('a[href]').forEach(a => {
        const href = a.href || '';
        const text = a.textContent.trim();
        const hrefLower = href.toLowerCase();
        const textLower = text.toLowerCase();
        if (seenHrefs.has(href) || !href || href === '#') return;

        const isPdf = hrefLower.endsWith('.pdf') || hrefLower.includes('/pdf');
        const isDrive = hrefLower.includes('drive.google.com') || hrefLower.includes('docs.google.com');
        const isDropbox = hrefLower.includes('dropbox.com');
        const isRelevant = linkKeywords.some(kw => textLower.includes(kw) || hrefLower.includes(kw));

        if (isPdf || isDrive || isDropbox || isRelevant) {
            seenHrefs.add(href);
            resourceLinks.push({
                text: text.substring(0, 150) || 'Document',
                url: href,
                type: isPdf ? 'pdf' : isDrive ? 'google_drive' : isDropbox ? 'dropbox' : 'link',
            });
        }
    });

    return {
        name: name.substring(0, 200),
        description: description.substring(0, 2000),
        banner_url: banner,
        bodyText: bodyText.substring(0, 30000),
        themes: themes,
        sidebarPrize: sidebarPrize.trim(),
        resourceLinks: resourceLinks.slice(0, 30),
    };
}"""
380
+
381
+
382
# NOTE(review): @app.on_event is deprecated in recent FastAPI in favor of
# lifespan handlers — works today; consider migrating.
@app.on_event("startup")
async def startup() -> None:
    """Launch one shared headless Chromium instance reused by all requests."""
    global playwright, browser
    # Imported lazily so the module can be imported without playwright installed.
    from playwright.async_api import async_playwright

    playwright = await async_playwright().start()
    browser = await playwright.chromium.launch(
        headless=True,
        # NOTE(review): sandbox disabled — common for containerized Chromium;
        # confirm the container does not run untrusted pages unprivileged.
        args=["--no-sandbox", "--disable-setuid-sandbox"],
    )
    print("[Scraper] Playwright browser initialized")
393
+
394
+
395
@app.on_event("shutdown")
async def shutdown() -> None:
    """Close the shared browser and stop the Playwright driver on exit."""
    global playwright, browser

    try:
        if browser is not None:
            await browser.close()
            print("[Scraper] Browser closed")
    finally:
        # Drop the reference even if close() raised, so it can't be reused.
        browser = None

    try:
        if playwright is not None:
            await playwright.stop()
            print("[Scraper] Playwright stopped")
    finally:
        playwright = None
412
+
413
async def scrape_with_playwright(url: str, platform: str) -> dict:
    """Scrape using Playwright — renders JS, grabs full innerText for parsing.

    Uses the module-level shared `browser`; each request gets its own
    isolated browser context, always closed in the inner `finally`.
    Returns a dict of extracted fields on success, or
    {"scrape_success": False, "error": ...} on any failure (never raises).
    """
    global browser
    try:
        if browser is None:
            # Startup hook never ran or failed — nothing to scrape with.
            return {
                "scrape_success": False,
                "error": "Browser is not initialized. Service startup failed.",
            }

        # Fresh context per request: isolated cookies/cache, desktop UA.
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
            viewport={"width": 1920, "height": 1080},
        )

        try:
            page = await context.new_page()

            print(f"[Scraper] Navigating to {url} (platform: {platform})")
            await page.goto(url, wait_until="domcontentloaded", timeout=20000)

            # Wait for JS rendering — longer for SPAs
            wait_time = 8 if platform in ("Unstop",) else 5
            print(f"[Scraper] Waiting {wait_time}s for JS rendering...")
            await page.wait_for_timeout(wait_time * 1000)

            # Scroll to trigger lazy content
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight / 3)")
            await asyncio.sleep(1)
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight * 2 / 3)")
            await asyncio.sleep(1)
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await asyncio.sleep(1)
            await page.evaluate("window.scrollTo(0, 0)")
            await asyncio.sleep(0.5)

            # Extract structured + raw text data
            data = await page.evaluate(EXTRACT_SCRIPT)

            body_text = data.get("bodyText", "")
            name = data.get("name", "")
            description = data.get("description", "")

            print(f"[Scraper] Extracted name: '{name}', bodyText length: {len(body_text)}")

            # Parse all fields from full innerText
            extracted = extract_all_from_text(body_text, platform)

            # Devpost themes from sidebar tags — only when the text parse
            # produced no problem statements.
            themes = data.get("themes", [])
            if themes and not extracted["problem_statements"]:
                seen = set()
                for t in themes:
                    if t.lower() not in seen:
                        seen.add(t.lower())
                        extracted["problem_statements"].append({"track": "Theme", "title": t})

            # Sidebar prize fallback (Devpost)
            if not extracted["prize_pool"] and data.get("sidebarPrize"):
                prize_text = data["sidebarPrize"]
                for pat in [r"(\$[\d,]+(?:\.\d+)?(?:\s*(?:K|k|M))?)", r"(₹[\d,]+)"]:
                    m = re.search(pat, prize_text)
                    if m:
                        extracted["prize_pool"] = m.group(1)
                        break
                if not extracted["prize_pool"]:
                    # No currency match — keep the raw sidebar text, truncated.
                    extracted["prize_pool"] = prize_text[:100]

            return {
                "name": name,
                "description": description,
                "banner_url": data.get("banner_url", ""),
                # Success means we at least got a plausible event name.
                "scrape_success": bool(name and len(name) > 2),
                "resource_links": data.get("resourceLinks", []),
                **extracted,
            }
        finally:
            await context.close()

    except Exception as e:
        print(f"[Scraper] Error: {e}")
        import traceback
        traceback.print_exc()
        return {"scrape_success": False, "error": str(e)}
497
+
498
+
499
+ # ============================================================
500
+ # API ROUTES
501
+ # ============================================================
502
+
503
@app.get("/")
async def root():
    """Liveness probe — confirms the service is up."""
    status_payload = {"status": "ok", "service": "HackTrack Scraper v3"}
    return status_payload
506
+
507
+
508
@app.post("/scrape", response_model=ScrapeResponse)
async def scrape(request: ScrapeRequest):
    """Scrape a hackathon page and return its normalized details.

    Never propagates an error to the client: on any failure a
    ScrapeResponse with scrape_success=False (plus the detected
    platform and echoed url) is returned.
    """
    url = request.url.strip()
    platform = detect_platform(url)
    print(f"\n[Scraper] === New scrape request: {url} (platform={platform}) ===")

    try:
        data = await scrape_with_playwright(url, platform)

        # Map the loosely-typed scraper dict onto the response model,
        # defaulting every missing field.
        response = ScrapeResponse(
            name=data.get("name", ""),
            platform=platform,
            banner_url=data.get("banner_url", ""),
            description=data.get("description", ""),
            registration_deadline=data.get("registration_deadline", ""),
            submission_deadline=data.get("submission_deadline", ""),
            result_date=data.get("result_date", ""),
            start_date=data.get("start_date", ""),
            end_date=data.get("end_date", ""),
            prize_pool=data.get("prize_pool", ""),
            team_size=data.get("team_size", {"min": 1, "max": 4}),
            problem_statements=data.get("problem_statements", []),
            resource_links=data.get("resource_links", []),
            scrape_success=data.get("scrape_success", False),
            url=url,
        )

        print(f"[Scraper] Result: name='{response.name}', dates=({response.start_date}, {response.end_date}, reg={response.registration_deadline}, sub={response.submission_deadline}), prize='{response.prize_pool}', team={response.team_size}, ps={len(response.problem_statements)}")
        return response

    except Exception as e:
        print(f"[Scraper] Endpoint error: {e}")
        return ScrapeResponse(platform=platform, url=url, scrape_success=False)
541
+
542
+
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ fastapi==0.115.0
2
+ uvicorn[standard]==0.30.0
3
+ playwright==1.45.0
4
+ pydantic==2.8.0