Rudraaaa76 committed on
Commit
54d9442
·
verified ·
1 Parent(s): e748a21

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +46 -0
  2. app.py +894 -0
  3. requirements.txt +7 -0
Dockerfile ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

WORKDIR /app

# System libraries required by headless Chromium (Playwright).
RUN apt-get update && apt-get install -y \
    ca-certificates \
    wget \
    gnupg \
    fonts-liberation \
    fonts-noto-color-emoji \
    libasound2 \
    libatk-bridge2.0-0 \
    libatk1.0-0 \
    libatspi2.0-0 \
    libcairo2 \
    libcups2 \
    libdbus-1-3 \
    libdrm2 \
    libgbm1 \
    libglib2.0-0 \
    libgtk-3-0 \
    libnss3 \
    libpango-1.0-0 \
    libx11-6 \
    libx11-xcb1 \
    libxcb1 \
    libxcomposite1 \
    libxdamage1 \
    libxext6 \
    libxfixes3 \
    libxkbcommon0 \
    libxrandr2 \
    xdg-utils \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies first so this layer caches across code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# IMPORTANT FIX: download the Chromium build Playwright drives at runtime.
RUN playwright install --with-deps chromium

COPY . .

EXPOSE 7860

# BUGFIX: uvicorn previously listened on 8000 while EXPOSE declared 7860
# (the port Hugging Face Spaces routes traffic to) — the served port must
# match the exposed one.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,894 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
import asyncio
import re
import sys
from urllib.parse import urlparse
from typing import List
from datetime import datetime

if sys.platform == "win32":
    # Playwright launches a driver subprocess; Proactor loop supports subprocess APIs on Windows.
    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

app = FastAPI(title="HackTrack Scraper", version="3.0.0")

# Global Playwright runtime objects reused across requests.
# Created in the startup handler, torn down in the shutdown handler.
playwright = None
browser = None

# NOTE(review): "*" origins/methods/headers are wide open — presumably this
# service sits behind a trusted frontend; confirm before exposing publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
27
+
28
+
29
class ScrapeRequest(BaseModel):
    """Request body for POST /scrape."""

    # Absolute URL of the hackathon page to scrape.
    url: str
31
+
32
+
33
class ScrapeResponse(BaseModel):
    """Normalized hackathon details; empty string means "not detected"."""

    name: str = ""
    # Platform label produced by detect_platform(), e.g. "Devfolio" / "Other".
    platform: str = ""
    banner_url: str = ""
    description: str = ""
    # All date fields are "YYYY-MM-DD" strings (or "" when not found).
    registration_deadline: str = ""
    submission_deadline: str = ""
    result_date: str = ""
    start_date: str = ""
    end_date: str = ""
    prize_pool: str = ""
    # {"min": int, "max": int}; defaults to 1-4 when the page states nothing.
    team_size: dict = Field(default_factory=lambda: {"min": 1, "max": 4})
    # Each entry: {"track": str, "title": str}.
    problem_statements: List[dict] = Field(default_factory=list)
    # Each entry: {"text": str, "url": str, "type": str}.
    resource_links: List[dict] = Field(default_factory=list)
    scrape_success: bool = False
    # Echo of the requested URL.
    url: str = ""
49
+
50
+
51
def detect_platform(url: str) -> str:
    """Identify which hackathon platform hosts *url* from its domain name.

    Returns one of "Devfolio", "Unstop", "Devpost", "DoraHacks", or "Other".
    """
    host = urlparse(url).netloc.lower()
    for marker, label in (
        ("devfolio", "Devfolio"),
        ("unstop", "Unstop"),
        ("devpost", "Devpost"),
        ("dorahacks", "DoraHacks"),
    ):
        if marker in host:
            return label
    return "Other"
62
+
63
+
64
# ============================================================
# DATE PARSING — robust multi-format
# ============================================================
# Month name/abbreviation -> month number lookup.
# NOTE(review): not referenced anywhere in this module's visible code —
# confirm whether it is dead before removing.
MONTH_MAP = {
    "jan": 1, "january": 1, "feb": 2, "february": 2, "mar": 3, "march": 3,
    "apr": 4, "april": 4, "may": 5, "jun": 6, "june": 6,
    "jul": 7, "july": 7, "aug": 8, "august": 8, "sep": 9, "sept": 9, "september": 9,
    "oct": 10, "october": 10, "nov": 11, "november": 11, "dec": 12, "december": 12,
}
73
+
74
# strptime formats tried in order; the last two carry no year (strptime
# defaults their year to 1900, which parse_any_date treats as "missing").
DATE_FORMATS = [
    "%Y-%m-%d", "%Y/%m/%d",
    "%d %B %Y", "%d %b %Y", "%d %B, %Y", "%d %b, %Y",
    "%B %d, %Y", "%b %d, %Y", "%B %d %Y", "%b %d %Y",
    "%m/%d/%Y", "%d/%m/%Y",
    "%B %d", "%b %d",
]


def parse_any_date(text: str, fallback_year: int = None) -> str:
    """Parse many date formats to YYYY-MM-DD. Handles partial dates.

    Ordinal suffixes ("1st", "22nd", ...) and extra whitespace are stripped
    first. When the text carries no year, *fallback_year* (default: current
    year) is substituted; if the result still lies in the past it is rolled
    to the following year, since hackathon pages list upcoming dates.

    Returns "" when the text matches none of DATE_FORMATS.
    """
    if not text:
        return ""
    text = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", text.strip())
    text = re.sub(r"\s+", " ", text)

    if not fallback_year:
        fallback_year = datetime.now().year

    for fmt in DATE_FORMATS:
        try:
            dt = datetime.strptime(text, fmt)
        except ValueError:
            continue
        if dt.year == 1900:  # the matched format had no year component
            try:
                dt = dt.replace(year=fallback_year)
                # BUGFIX: roll forward only when the year was guessed. The
                # original applied this to every parse, so an explicit past
                # year (e.g. "15 March 2023") was silently rewritten.
                if dt < datetime.now():
                    dt = dt.replace(year=fallback_year + 1)
            except ValueError:  # e.g. Feb 29 against a non-leap fallback year
                continue
        return dt.strftime("%Y-%m-%d")
    return ""
105
+
106
+
107
def find_dates_near(text: str, keywords: List[str], window: int = 400) -> str:
    """Find dates within `window` chars after any keyword.

    Keywords are matched case-insensitively against *text*; the first
    parseable date found in the window after a keyword is returned as
    YYYY-MM-DD, or "" when nothing parses.
    """
    haystack = text.lower()
    patterns = [
        r"(\d{4}[-/]\d{1,2}[-/]\d{1,2})",
        r"(\d{1,2}\s+(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)[,]?\s+\d{4})",
        r"((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?[,]?\s*\d{0,4})",
        r"(\d{1,2}/\d{1,2}/\d{4})",
    ]
    for keyword in keywords:
        pos = haystack.find(keyword.lower())
        if pos < 0:
            continue
        snippet = text[pos:pos + window]
        for pattern in patterns:
            hit = re.search(pattern, snippet, re.IGNORECASE)
            if hit is None:
                continue
            candidate = parse_any_date(hit.group(1))
            if candidate:
                return candidate
    return ""
128
+
129
+
130
+ # ============================================================
131
+ # EXTRACT from full page innerText (the reliable approach)
132
+ # ============================================================
133
+
134
def extract_all_from_text(body_text: str, platform: str) -> dict:
    """Extract hackathon details from page innerText using text patterns.

    Pure keyword-proximity heuristics over the rendered page text:
    dates near deadline-like phrases, currency amounts near prize-like
    phrases, team-size ranges, and track/theme lists.

    NOTE(review): `platform` is accepted but never read in this body —
    presumably reserved for per-platform tuning.

    Returns a dict of date strings ("YYYY-MM-DD" or ""), a prize string,
    a {"min", "max"} team size, and up to 20 problem statements.
    """
    result = {
        "registration_deadline": "",
        "submission_deadline": "",
        "result_date": "",
        "start_date": "",
        "end_date": "",
        "prize_pool": "",
        "team_size": {"min": 1, "max": 4},
        "problem_statements": [],
    }

    # ---- DATES ----
    # Registration deadline
    result["registration_deadline"] = find_dates_near(body_text, [
        "registration close", "registrations close", "register by",
        "last date to register", "registration deadline", "applications close",
        "apply by", "registration ends", "sign up deadline",
    ])

    # Submission deadline
    result["submission_deadline"] = find_dates_near(body_text, [
        "submission deadline", "submission closes", "submissions close",
        "submit by", "last date to submit", "submission end",
        "final submission", "project submission",
        "deadline",  # generic fallback last
    ])

    # Start date — Devfolio uses "Runs from Mar 25 - 26, 2026"
    runs_from = re.search(
        r"(?:runs?\s+from|starts?\s+(?:on|from)?|begins?\s+(?:on)?|commences?\s+(?:on)?)\s*[:\-]?\s*"
        r"((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2})"
        r"(?:\s*[-–]\s*(\d{1,2}))?"
        r"(?:[,\s]+(\d{4}))?",
        body_text, re.IGNORECASE
    )
    if runs_from:
        start_text = runs_from.group(1)
        year = runs_from.group(3) or str(datetime.now().year)
        result["start_date"] = parse_any_date(f"{start_text} {year}")
        # "Mar 25 - 26" ranges: group(2) is the end day in the same month.
        if runs_from.group(2) and runs_from.group(1):
            month = runs_from.group(1).split()[0]
            result["end_date"] = parse_any_date(f"{month} {runs_from.group(2)} {year}")

    if not result["start_date"]:
        result["start_date"] = find_dates_near(body_text, [
            "start date", "starts on", "begins on", "hackathon starts",
            "event starts", "event date", "dates:",
        ])

    if not result["end_date"]:
        result["end_date"] = find_dates_near(body_text, [
            "end date", "ends on", "hackathon ends", "event ends",
        ])

    # Result date
    result["result_date"] = find_dates_near(body_text, [
        "result", "winners announced", "announcement", "winner announcement",
        "results declared", "shortlist",
    ])

    # ---- PRIZE POOL ----
    # Currency patterns tried in order: INR symbol, USD, EUR, GBP, "INR", "Rs".
    prize_patterns = [
        r"(₹\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
        r"(\$\s*[\d,]+(?:\.\d+)?(?:\s*(?:K|k|M|million|thousand))?)",
        r"(€\s*[\d,]+(?:\.\d+)?)",
        r"(£\s*[\d,]+(?:\.\d+)?)",
        r"(INR\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
        r"(Rs\.?\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|Cr|K|k|L))?)",
    ]

    # Find prize amounts near keywords like "prize", "reward", "worth", "win"
    prize_lower = body_text.lower()
    for kw in ["prize", "reward", "worth", "winning", "bounty", "in cash", "in prizes"]:
        idx = prize_lower.find(kw)
        if idx == -1:
            continue
        # Search ±200 chars around keyword
        start = max(0, idx - 200)
        chunk = body_text[start:idx + 200]
        for pat in prize_patterns:
            match = re.search(pat, chunk, re.IGNORECASE)
            if match:
                result["prize_pool"] = match.group(1).strip()
                break
        if result["prize_pool"]:
            break

    # Fallback: any large currency amount
    if not result["prize_pool"]:
        for pat in prize_patterns:
            match = re.search(pat, body_text)
            if match:
                result["prize_pool"] = match.group(1).strip()
                break

    # ---- TEAM SIZE ----
    # NOTE(review): "[-–to]+" is a character class (any run of '-', '–', 't',
    # 'o'), not the literal word "to" — it matches "2 to 4" but is loose.
    team_patterns = [
        r"team\s*size[:\s]*(\d+)\s*[-–to]+\s*(\d+)",
        r"(\d+)\s*[-–to]+\s*(\d+)\s*(?:members?|people|participants?|per team)",
        r"teams?\s+of\s+(?:up\s+to\s+)?(\d+)",
        r"max(?:imum)?\s*(?:team)?\s*(?:size)?\s*[:\s]*(\d+)",
        r"(\d+)\s*[-–]\s*(\d+)\s*$",  # in FAQ: "2 - 4"
    ]
    for pat in team_patterns:
        match = re.search(pat, body_text, re.IGNORECASE)
        if match:
            groups = [g for g in match.groups() if g]
            if len(groups) == 2:
                result["team_size"] = {"min": int(groups[0]), "max": int(groups[1])}
            elif len(groups) == 1:
                # Single number means "max"; min defaults to 1.
                result["team_size"] = {"min": 1, "max": int(groups[0])}
            break

    # ---- PROBLEM STATEMENTS / TRACKS / DOMAINS ----
    ps = []
    seen_ps = set()  # lowercase titles already collected (dedup)

    # Pattern 1: "Domains: AI, ML, Web App" (Devfolio style).
    # The negated class stops the capture at a newline or a section emoji.
    domain_match = re.search(
        r"(?:domains?|themes?|tracks?|categories|verticals|areas?)[:\s]+([^\n💡🏆🎁🎟️📍📅⏳📞🌮]+)",
        body_text, re.IGNORECASE
    )
    if domain_match:
        items = re.split(r"[,•|/]", domain_match.group(1))
        for item in items:
            item = item.strip().rstrip(".")
            if 3 < len(item) < 150 and item.lower() not in seen_ps:
                seen_ps.add(item.lower())
                ps.append({"track": "", "title": item})

    # Pattern 2: Numbered problem statements: "PS1: ...", "Problem Statement 1 - ..."
    for match in re.finditer(
        r"(?:PS|Problem\s*Statement|Theme|Track|Challenge)\s*#?(\d+)\s*[:\-–]\s*(.{5,200})",
        body_text, re.IGNORECASE
    ):
        num = match.group(1)
        title = match.group(2).strip().split("\n")[0]
        if title.lower() not in seen_ps and len(title) > 4:
            seen_ps.add(title.lower())
            ps.append({"track": f"Track {num}", "title": title})

    # Pattern 3: Devpost-style theme tags (already in themes list from JS)
    # Pattern 4: Bulleted lists after "Themes" or "Tracks" heading
    for match in re.finditer(
        r"(?:themes?|tracks?|problem\s*statements?|challenges?|domains?)\s*[:\n]"
        r"((?:\s*[-•●▸]\s*.{5,200}\n?)+)",
        body_text, re.IGNORECASE
    ):
        items = re.findall(r"[-•●▸]\s*(.{5,200})", match.group(1))
        for item in items:
            item = item.strip().split("\n")[0]
            if item.lower() not in seen_ps and 4 < len(item) < 200:
                seen_ps.add(item.lower())
                ps.append({"track": "", "title": item})

    # Cap at 20 entries to keep responses bounded.
    result["problem_statements"] = ps[:20]
    return result
293
+
294
+
295
+ # ============================================================
296
+ # PLAYWRIGHT SCRAPER — gets innerText + meta from rendered page
297
+ # ============================================================
298
+
299
# JavaScript evaluated inside the rendered page (page.evaluate). Returns a
# plain object with the page title/meta, the full innerText (capped at 30k
# chars) for Python-side parsing, Devpost theme tags, sidebar prize text,
# and up to 30 resource links (PDFs, Drive/Dropbox docs, rules, schedules).
EXTRACT_SCRIPT = """() => {
    const getMeta = (name) => {
        const el = document.querySelector(`meta[property="${name}"], meta[name="${name}"]`);
        return el ? el.getAttribute('content') || '' : '';
    };

    // Name: try multiple selectors
    const nameSelectors = [
        'h1',
        '.hackathon-name', '.event-name', '.challenge-title',
        '#challenge-title', '.opp-title',
    ];
    let name = '';
    for (const sel of nameSelectors) {
        const el = document.querySelector(sel);
        if (el && el.textContent.trim().length > 2) {
            name = el.textContent.trim();
            break;
        }
    }
    name = name || getMeta('og:title') || document.title.split('|')[0].trim();

    // Banner
    const banner = getMeta('og:image') || '';

    // Description
    let description = getMeta('og:description') || getMeta('description') || '';

    // Full page text for parsing
    const bodyText = document.body.innerText;

    // For Devpost: extract themes from tag links
    const themes = [];
    document.querySelectorAll('a[href*="themes"]').forEach(a => {
        const t = a.textContent.trim();
        if (t && t.length > 2 && t.length < 100) themes.push(t);
    });

    // Devpost sidebar prize text
    let sidebarPrize = '';
    document.querySelectorAll('a[href*="prizes"], .prize, [class*="prize"]').forEach(el => {
        const t = el.textContent.trim();
        if (t && t.length > 2) sidebarPrize += t + ' ';
    });

    // Resource links: PDFs, Google Drive, problem statements, rules, guidelines
    const resourceLinks = [];
    const seenHrefs = new Set();
    const linkKeywords = ['problem', 'statement', 'pdf', 'rule', 'guideline', 'brochure', 'document', 'brief', 'challenge', 'track', 'theme', 'schedule', 'timeline'];
    document.querySelectorAll('a[href]').forEach(a => {
        const href = a.href || '';
        const text = a.textContent.trim();
        const hrefLower = href.toLowerCase();
        const textLower = text.toLowerCase();
        if (seenHrefs.has(href) || !href || href === '#') return;

        const isPdf = hrefLower.endsWith('.pdf') || hrefLower.includes('/pdf');
        const isDrive = hrefLower.includes('drive.google.com') || hrefLower.includes('docs.google.com');
        const isDropbox = hrefLower.includes('dropbox.com');
        const isRelevant = linkKeywords.some(kw => textLower.includes(kw) || hrefLower.includes(kw));

        if (isPdf || isDrive || isDropbox || isRelevant) {
            seenHrefs.add(href);
            resourceLinks.push({
                text: text.substring(0, 150) || 'Document',
                url: href,
                type: isPdf ? 'pdf' : isDrive ? 'google_drive' : isDropbox ? 'dropbox' : 'link',
            });
        }
    });

    return {
        name: name.substring(0, 200),
        description: description.substring(0, 2000),
        banner_url: banner,
        bodyText: bodyText.substring(0, 30000),
        themes: themes,
        sidebarPrize: sidebarPrize.trim(),
        resourceLinks: resourceLinks.slice(0, 30),
    };
}"""
380
+
381
+
382
@app.on_event("startup")
async def startup() -> None:
    """Launch one shared headless Chromium instance reused by all requests."""
    global playwright, browser
    # Imported lazily so the module itself can be imported (e.g. for tests)
    # without Playwright being installed.
    from playwright.async_api import async_playwright

    playwright = await async_playwright().start()
    browser = await playwright.chromium.launch(
        headless=True,
        # Sandbox flags are typically required to run Chromium as root in a
        # container — presumably the deployment case here; confirm.
        args=["--no-sandbox", "--disable-setuid-sandbox"],
    )
    print("[Scraper] Playwright browser initialized")
393
+
394
+
395
@app.on_event("shutdown")
async def shutdown() -> None:
    """Release the shared Playwright resources created in `startup`.

    Each teardown step is isolated: a failure while closing the browser must
    not prevent stopping the Playwright driver, and the globals are always
    reset so a later startup begins from a clean state.
    """
    global playwright, browser

    try:
        if browser is not None:
            await browser.close()
            print("[Scraper] Browser closed")
    except Exception as e:
        # BUGFIX: previously an exception from browser.close() propagated
        # past the try/finally and skipped playwright.stop(), leaking the
        # driver subprocess. Log and continue instead.
        print(f"[Scraper] Error closing browser: {e}")
    finally:
        browser = None

    try:
        if playwright is not None:
            await playwright.stop()
            print("[Scraper] Playwright stopped")
    except Exception as e:
        print(f"[Scraper] Error stopping Playwright: {e}")
    finally:
        playwright = None
412
+
413
async def scrape_with_playwright(url: str, platform: str) -> dict:
    """Scrape using Playwright — renders JS, grabs full innerText for parsing.

    Opens a fresh browser context per request (isolated cookies/storage),
    waits for client-side rendering, scrolls to force lazy content, then
    evaluates EXTRACT_SCRIPT in the page and post-processes the raw text
    with extract_all_from_text. Returns a dict of scraped fields; on any
    failure returns {"scrape_success": False, "error": <message>}.
    """
    global browser
    try:
        if browser is None:
            # startup() never ran or failed — nothing to scrape with.
            return {
                "scrape_success": False,
                "error": "Browser is not initialized. Service startup failed.",
            }

        # Desktop UA + viewport so sites serve the full (non-mobile) layout.
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
            viewport={"width": 1920, "height": 1080},
        )

        try:
            page = await context.new_page()

            print(f"[Scraper] Navigating to {url} (platform: {platform})")
            await page.goto(url, wait_until="domcontentloaded", timeout=20000)

            # Wait for JS rendering — longer for SPAs
            wait_time = 8 if platform in ("Unstop",) else 5
            print(f"[Scraper] Waiting {wait_time}s for JS rendering...")
            await page.wait_for_timeout(wait_time * 1000)

            # Scroll to trigger lazy content (thirds, then bottom, then back up)
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight / 3)")
            await asyncio.sleep(1)
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight * 2 / 3)")
            await asyncio.sleep(1)
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await asyncio.sleep(1)
            await page.evaluate("window.scrollTo(0, 0)")
            await asyncio.sleep(0.5)

            # Extract structured + raw text data
            data = await page.evaluate(EXTRACT_SCRIPT)

            body_text = data.get("bodyText", "")
            name = data.get("name", "")
            description = data.get("description", "")

            print(f"[Scraper] Extracted name: '{name}', bodyText length: {len(body_text)}")

            # Parse all fields from full innerText
            extracted = extract_all_from_text(body_text, platform)

            # Devpost themes from sidebar tags — used only when text parsing
            # found no problem statements.
            themes = data.get("themes", [])
            if themes and not extracted["problem_statements"]:
                seen = set()
                for t in themes:
                    if t.lower() not in seen:
                        seen.add(t.lower())
                        extracted["problem_statements"].append({"track": "Theme", "title": t})

            # Sidebar prize fallback (Devpost)
            if not extracted["prize_pool"] and data.get("sidebarPrize"):
                prize_text = data["sidebarPrize"]
                for pat in [r"(\$[\d,]+(?:\.\d+)?(?:\s*(?:K|k|M))?)", r"(₹[\d,]+)"]:
                    m = re.search(pat, prize_text)
                    if m:
                        extracted["prize_pool"] = m.group(1)
                        break
                if not extracted["prize_pool"]:
                    # No currency match — keep a truncated raw snippet.
                    extracted["prize_pool"] = prize_text[:100]

            # Success simply means a plausible name was found.
            return {
                "name": name,
                "description": description,
                "banner_url": data.get("banner_url", ""),
                "scrape_success": bool(name and len(name) > 2),
                "resource_links": data.get("resourceLinks", []),
                **extracted,
            }
        finally:
            # Always dispose the per-request context so pages don't accumulate.
            await context.close()

    except Exception as e:
        print(f"[Scraper] Error: {e}")
        import traceback
        traceback.print_exc()
        return {"scrape_success": False, "error": str(e)}
497
+
498
+
499
+ # ============================================================
500
+ # API ROUTES
501
+ # ============================================================
502
+
503
@app.get("/")
async def root():
    """Liveness probe: report that the scraper service is up."""
    payload = {"status": "ok", "service": "HackTrack Scraper v3"}
    return payload
506
+
507
+
508
@app.post("/scrape", response_model=ScrapeResponse)
async def scrape(request: ScrapeRequest):
    """Scrape one hackathon page and return normalized details.

    Never raises to the client: any failure yields a ScrapeResponse with
    scrape_success=False and empty fields (HTTP 200 either way).
    """
    url = request.url.strip()
    platform = detect_platform(url)
    print(f"\n[Scraper] === New scrape request: {url} (platform={platform}) ===")

    try:
        data = await scrape_with_playwright(url, platform)

        # Map the loose dict from the scraper onto the typed response,
        # defaulting every missing key.
        response = ScrapeResponse(
            name=data.get("name", ""),
            platform=platform,
            banner_url=data.get("banner_url", ""),
            description=data.get("description", ""),
            registration_deadline=data.get("registration_deadline", ""),
            submission_deadline=data.get("submission_deadline", ""),
            result_date=data.get("result_date", ""),
            start_date=data.get("start_date", ""),
            end_date=data.get("end_date", ""),
            prize_pool=data.get("prize_pool", ""),
            team_size=data.get("team_size", {"min": 1, "max": 4}),
            problem_statements=data.get("problem_statements", []),
            resource_links=data.get("resource_links", []),
            scrape_success=data.get("scrape_success", False),
            url=url,
        )

        print(f"[Scraper] Result: name='{response.name}', dates=({response.start_date}, {response.end_date}, reg={response.registration_deadline}, sub={response.submission_deadline}), prize='{response.prize_pool}', team={response.team_size}, ps={len(response.problem_statements)}")
        return response

    except Exception as e:
        print(f"[Scraper] Endpoint error: {e}")
        # Degrade to an empty-but-valid response rather than a 500.
        return ScrapeResponse(platform=platform, url=url, scrape_success=False)
541
+
542
+ # ============================================================
543
+ # LISTING PAGE CRAWLERS — for discovery / public_hackathons
544
+ # ============================================================
545
+
546
class CrawledHackathon(BaseModel):
    """One hackathon card discovered on a platform listing page."""

    name: str = ""
    platform: str = ""
    banner_url: str = ""
    description: str = ""
    # Dates as "YYYY-MM-DD" strings ("" when the card showed none).
    start_date: str = ""
    end_date: str = ""
    registration_deadline: str = ""
    prize_pool: str = ""
    tags: List[str] = Field(default_factory=list)
    # Link to the hackathon's own page (used for a follow-up /scrape).
    source_url: str = ""
    # Listing crawls only return currently-open events.
    status: str = "open"
558
+
559
+
560
class CrawlResponse(BaseModel):
    """Response for the /crawl/* listing endpoints."""

    platform: str
    # Number of entries in `hackathons`.
    count: int = 0
    hackathons: List[CrawledHackathon] = Field(default_factory=list)
    # Non-empty when the crawl failed; hackathons is then empty.
    error: str = ""
565
+
566
+
567
# Page-context extractor for the Devfolio listing page: collects hackathon
# cards by their *.devfolio.co subdomain links and scrapes name, banner,
# description, prize, date range, and tags from each card's container.
DEVFOLIO_EXTRACT = """() => {
    // Devfolio uses subdomain links like https://code-recet-3.devfolio.co/
    const allLinks = document.querySelectorAll('a[href*=".devfolio.co"]');
    const results = [];
    const seen = new Set();

    // Also grab any links that contain h3 tags (hackathon card pattern)
    const h3Links = document.querySelectorAll('a:has(h3)');
    const combined = new Set([...allLinks, ...h3Links]);

    combined.forEach(card => {
        try {
            const href = card.href || '';
            if (!href || seen.has(href)) return;

            // Skip non-hackathon links
            const hostname = new URL(href).hostname;
            if (hostname === 'devfolio.co' || hostname === 'www.devfolio.co') return;
            if (!hostname.endsWith('.devfolio.co')) return;
            // Skip common non-hackathon subdomains
            if (['api', 'docs', 'blog', 'app'].some(s => hostname.startsWith(s + '.'))) return;

            seen.add(href);

            const nameEl = card.querySelector('h3, h2, [class*="name"], [class*="title"]');
            const name = nameEl ? nameEl.textContent.trim() : '';
            if (!name || name.length < 3) return;

            // Walk up to the card container to find banner and other data
            const container = card.closest('div') || card.parentElement?.closest('div') || card;

            const imgEl = container.querySelector('img') || card.querySelector('img');
            const banner = imgEl ? (imgEl.src || imgEl.getAttribute('data-src') || '') : '';

            const descEl = container.querySelector('p') || card.querySelector('p');
            const description = descEl ? descEl.textContent.trim().substring(0, 500) : '';

            const allText = (container.textContent || card.textContent || '');

            // Extract prize
            let prize = '';
            const prizeMatch = allText.match(/[\u20B9$\u20AC\u00A3]\s*[\d,]+(?:\.\d+)?(?:\s*(?:Lakhs?|Lacs?|Crores?|K|k|L|M))?/);
            if (prizeMatch) prize = prizeMatch[0].trim();

            // Extract dates like "Mar 25 - 27, 2026" or "Runs from ..."
            let startDate = '';
            let endDate = '';
            const dateMatch = allText.match(/((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2})(?:\s*[-\u2013]\s*(\d{1,2}))?(?:[,\s]+(\d{4}))?/i);
            if (dateMatch) {
                const year = dateMatch[3] || new Date().getFullYear().toString();
                startDate = dateMatch[1] + ' ' + year;
                if (dateMatch[2]) {
                    const month = dateMatch[1].split(/\s+/)[0];
                    endDate = month + ' ' + dateMatch[2] + ' ' + year;
                }
            }

            // Extract tags from spans/badges
            const tags = [];
            const tagEls = container.querySelectorAll('span, [class*="tag"], [class*="badge"], [class*="chip"], [class*="Pill"]');
            tagEls.forEach(el => {
                const t = el.textContent.trim();
                if (t && t.length > 1 && t.length < 50 && !t.includes('\u20B9') && !t.includes('$') && t !== name) {
                    tags.push(t);
                }
            });

            results.push({
                name,
                source_url: href,
                banner_url: banner,
                description,
                prize_pool: prize,
                start_date: startDate,
                end_date: endDate,
                tags: [...new Set(tags)].slice(0, 10),
            });
        } catch(e) {}
    });

    return results;
}"""
649
+
650
+
651
# Page-context extractor for the Devpost listing page. Backslashes are
# doubled because this is a plain (non-raw) Python string holding JS regexes.
DEVPOST_EXTRACT = """() => {
    const cards = document.querySelectorAll('.hackathon-tile, a[data-hackathon-slug], [class*="hackathon"]');
    const results = [];
    const seen = new Set();

    // Fallback: also try generic link approach
    const allLinks = document.querySelectorAll('a[href*="devpost.com/hackathons/"]');
    const combined = [...cards, ...allLinks];

    combined.forEach(card => {
        try {
            let href = card.href || card.querySelector('a')?.href || '';
            if (!href.startsWith('http')) {
                const aEl = card.closest('a') || card.querySelector('a');
                if (aEl) href = aEl.href;
            }
            if (!href || seen.has(href)) return;
            if (href.endsWith('/hackathons') || href.endsWith('/hackathons/')) return;
            seen.add(href);

            const nameEl = card.querySelector('h2, h3, .title, [class*="title"], [class*="name"]');
            const name = nameEl ? nameEl.textContent.trim() : (card.textContent || '').split('\\n')[0].trim().substring(0, 100);
            if (!name || name.length < 3) return;

            const imgEl = card.querySelector('img');
            const banner = imgEl ? (imgEl.src || '') : '';

            const descEl = card.querySelector('.tagline, .description, p');
            const description = descEl ? descEl.textContent.trim().substring(0, 500) : '';

            const allText = card.textContent || '';
            let prize = '';
            const prizeMatch = allText.match(/\\$\\s*[\\d,]+(?:\\.\\d+)?(?:\\s*(?:K|k|M|million))?/);
            if (prizeMatch) prize = prizeMatch[0].trim();

            // Dates
            let deadline = '';
            const dateMatch = allText.match(/(?:Submission|Deadline|Ends?)[:\\s]+([A-Za-z]+ \\d{1,2},?\\s*\\d{4})/i);
            if (dateMatch) deadline = dateMatch[1];

            const tags = [];
            card.querySelectorAll('.themes a, [class*="tag"], [class*="theme"]').forEach(el => {
                const t = el.textContent.trim();
                if (t && t.length > 1 && t.length < 50) tags.push(t);
            });

            results.push({
                name,
                source_url: href,
                banner_url: banner,
                description,
                prize_pool: prize,
                registration_deadline: deadline,
                tags: tags.slice(0, 10),
            });
        } catch(e) {}
    });

    return results;
}"""
711
+
712
+
713
# Page-context extractor for the Unstop listing page: accepts any card or
# link whose URL mentions "hackathon"/"competition" and pulls name, banner,
# INR prize, tags, and a short description from each card.
UNSTOP_EXTRACT = """() => {
    const cards = document.querySelectorAll('[class*="card"], [class*="listing"], a[href*="/hackathons/"], a[href*="/competition/"]');
    const results = [];
    const seen = new Set();

    cards.forEach(card => {
        try {
            let href = card.href || '';
            if (!href.startsWith('http')) {
                const aEl = card.querySelector('a[href*="hackathon"], a[href*="competition"]');
                if (aEl) href = aEl.href;
            }
            if (!href || seen.has(href)) return;
            if (!href.includes('hackathon') && !href.includes('competition')) return;
            seen.add(href);

            const nameEl = card.querySelector('h3, h2, .title, [class*="title"], [class*="name"], p.semi-bold');
            const name = nameEl ? nameEl.textContent.trim() : '';
            if (!name || name.length < 3) return;

            const imgEl = card.querySelector('img');
            const banner = imgEl ? (imgEl.src || '') : '';

            const allText = card.textContent || '';

            let prize = '';
            const prizeMatch = allText.match(/(?:₹|INR|Rs\\.?)\\s*[\\d,]+(?:\\.\\d+)?(?:\\s*(?:Lakhs?|Lacs?|Crores?|K|k|L))?/i);
            if (prizeMatch) prize = prizeMatch[0].trim();

            const tags = [];
            card.querySelectorAll('[class*="chip"], [class*="tag"], [class*="badge"]').forEach(el => {
                const t = el.textContent.trim();
                if (t && t.length > 1 && t.length < 50 && !t.includes('₹')) tags.push(t);
            });

            const descEl = card.querySelector('p:not(.semi-bold)');
            const description = descEl ? descEl.textContent.trim().substring(0, 500) : '';

            results.push({
                name,
                source_url: href,
                banner_url: banner,
                description,
                prize_pool: prize,
                tags: tags.slice(0, 10),
            });
        } catch(e) {}
    });

    return results;
}"""
764
+
765
+
766
async def crawl_listing_page(url: str, platform: str, extract_script: str, scroll_count: int = 5, wait_secs: int = 5) -> List[dict]:
    """Generic listing page crawler: navigate, scroll to load lazy cards, extract.

    `extract_script` is a platform-specific JS function (see *_EXTRACT above)
    evaluated in the page; its raw card dicts are normalized here (dates run
    through parse_any_date, status forced to "open"). Returns [] on any
    failure or when the shared browser is unavailable.
    """
    global browser
    if browser is None:
        return []

    # Fresh context per crawl: desktop UA + viewport for the full layout.
    context = await browser.new_context(
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
        viewport={"width": 1920, "height": 1080},
    )

    try:
        page = await context.new_page()
        print(f"[Crawler] Navigating to {url} ({platform})")
        await page.goto(url, wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(wait_secs * 1000)

        # Scroll multiple times to trigger lazy loading
        for i in range(scroll_count):
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await asyncio.sleep(2)
            # Try clicking "Load More" / "Show More" buttons
            for selector in ['button:has-text("Load More")', 'button:has-text("Show More")', 'button:has-text("View More")', '[class*="load-more"]', '[class*="show-more"]']:
                try:
                    btn = page.locator(selector).first
                    if await btn.is_visible(timeout=500):
                        await btn.click()
                        await asyncio.sleep(2)
                # NOTE(review): bare except deliberately swallows missing /
                # non-clickable buttons; consider narrowing to Exception.
                except:
                    pass

        await page.evaluate("window.scrollTo(0, 0)")
        await asyncio.sleep(1)

        raw = await page.evaluate(extract_script)
        print(f"[Crawler] {platform}: extracted {len(raw)} entries")

        hackathons = []
        for item in raw:
            name = item.get("name", "").strip()
            source_url = item.get("source_url", "").strip()
            # A card is only useful if it has both a name and a link.
            if not name or not source_url:
                continue

            # Parse dates if present
            reg_deadline = ""
            if item.get("registration_deadline"):
                reg_deadline = parse_any_date(item["registration_deadline"])

            hackathons.append({
                "name": name,
                "platform": platform,
                "banner_url": item.get("banner_url", ""),
                "description": item.get("description", ""),
                "start_date": parse_any_date(item.get("start_date", "")),
                "end_date": parse_any_date(item.get("end_date", "")),
                "registration_deadline": reg_deadline,
                "prize_pool": item.get("prize_pool", ""),
                "tags": item.get("tags", []),
                "source_url": source_url,
                "status": "open",
            })

        return hackathons
    except Exception as e:
        print(f"[Crawler] {platform} error: {e}")
        import traceback
        traceback.print_exc()
        return []
    finally:
        # Always dispose the context, success or failure.
        await context.close()
837
+
838
+
839
@app.post("/crawl/devfolio", response_model=CrawlResponse)
async def crawl_devfolio():
    """Crawl Devfolio's open-hackathons listing and return normalized results."""
    scraped = await crawl_listing_page(
        url="https://devfolio.co/hackathons/open",
        platform="Devfolio",
        extract_script=DEVFOLIO_EXTRACT,
        scroll_count=5,
        wait_secs=6,
    )
    entries = [CrawledHackathon(**item) for item in scraped]
    return CrawlResponse(platform="Devfolio", count=len(entries), hackathons=entries)
849
+
850
+
851
@app.post("/crawl/devpost", response_model=CrawlResponse)
async def crawl_devpost():
    """Crawl DevPost's open public hackathons listing and return normalized results."""
    scraped = await crawl_listing_page(
        url="https://devpost.com/hackathons?open_to[]=public&status[]=open",
        platform="DevPost",
        extract_script=DEVPOST_EXTRACT,
        scroll_count=4,
        wait_secs=5,
    )
    entries = [CrawledHackathon(**item) for item in scraped]
    return CrawlResponse(platform="DevPost", count=len(entries), hackathons=entries)
861
+
862
+
863
@app.post("/crawl/unstop", response_model=CrawlResponse)
async def crawl_unstop():
    """Crawl Unstop's hackathons listing and return normalized results."""
    scraped = await crawl_listing_page(
        url="https://unstop.com/hackathons",
        platform="Unstop",
        extract_script=UNSTOP_EXTRACT,
        scroll_count=5,
        wait_secs=8,
    )
    entries = [CrawledHackathon(**item) for item in scraped]
    return CrawlResponse(platform="Unstop", count=len(entries), hackathons=entries)
873
+
874
+
875
@app.post("/crawl/all")
async def crawl_all():
    """Crawl all platforms concurrently and return combined results."""
    print("\n[Crawler] === Starting full crawl ===")
    # Launch the three platform crawls in parallel; each returns [] on error,
    # so a plain gather cannot raise here.
    crawl_jobs = (
        crawl_listing_page("https://devfolio.co/hackathons/open", "Devfolio", DEVFOLIO_EXTRACT, 5, 6),
        crawl_listing_page("https://devpost.com/hackathons?open_to[]=public&status[]=open", "DevPost", DEVPOST_EXTRACT, 4, 5),
        crawl_listing_page("https://unstop.com/hackathons", "Unstop", UNSTOP_EXTRACT, 5, 8),
    )
    devfolio, devpost, unstop = await asyncio.gather(*crawl_jobs)
    combined = devfolio + devpost + unstop
    print(f"[Crawler] === Full crawl complete: {len(combined)} hackathons ===")
    return {
        "total": len(combined),
        "by_platform": {
            "devfolio": len(devfolio),
            "devpost": len(devpost),
            "unstop": len(unstop),
        },
        "hackathons": combined,
    }
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi==0.115.0
2
+ uvicorn[standard]==0.30.0
3
+ playwright==1.45.0
4
+ pydantic==2.8.0
5
+ groq==0.9.0
6
+ python-dotenv==1.0.1
7
+