Spaces:
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -35,7 +35,7 @@ class ChallengeResponse(BaseModel):
|
|
| 35 |
# Scraper
|
| 36 |
# -------------------------
|
| 37 |
def scrape_with_requests(url: str) -> Dict[str, Any]:
|
| 38 |
-
"""Scrape a webpage and extract visible + hidden info."""
|
| 39 |
try:
|
| 40 |
headers = {
|
| 41 |
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) "
|
|
@@ -77,16 +77,33 @@ def scrape_with_requests(url: str) -> Dict[str, Any]:
|
|
| 77 |
if k.startswith("data-") and isinstance(v, str) and v.strip():
|
| 78 |
hidden_values.append(f"{k}={v.strip()}")
|
| 79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
return {
|
| 81 |
"title": title,
|
| 82 |
"visible_text": visible_text,
|
| 83 |
-
"hidden_values": hidden_values[:
|
| 84 |
}
|
|
|
|
| 85 |
except Exception as e:
|
| 86 |
logger.error(f"Request scraping failed for {url}: {e}")
|
| 87 |
return {}
|
| 88 |
|
| 89 |
|
|
|
|
| 90 |
def answer_question(question: str, content: Dict[str, Any]) -> str:
|
| 91 |
"""Simple rule-based extraction for Round 5 questions."""
|
| 92 |
ql = question.lower()
|
|
|
|
| 35 |
# Scraper
|
| 36 |
# -------------------------
|
| 37 |
def scrape_with_requests(url: str) -> Dict[str, Any]:
|
| 38 |
+
"""Scrape a webpage and extract visible + hidden info (expanded)."""
|
| 39 |
try:
|
| 40 |
headers = {
|
| 41 |
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) "
|
|
|
|
| 77 |
if k.startswith("data-") and isinstance(v, str) and v.strip():
|
| 78 |
hidden_values.append(f"{k}={v.strip()}")
|
| 79 |
|
| 80 |
+
# Script tags (look for JSON-like challenge info)
|
| 81 |
+
for script in soup.find_all("script"):
|
| 82 |
+
txt = script.get_text(" ", strip=True)
|
| 83 |
+
if txt:
|
| 84 |
+
# Look for "challengeId", "completionCode", etc.
|
| 85 |
+
matches = re.findall(r"(challenge\w*|code)\s*[:=]\s*['\"]?([A-Za-z0-9\-_]+)", txt, flags=re.I)
|
| 86 |
+
for k, v in matches:
|
| 87 |
+
hidden_values.append(f"script {k}={v}")
|
| 88 |
+
|
| 89 |
+
# Regex tokens (catch suspicious long strings)
|
| 90 |
+
tokens = re.findall(r"[A-Za-z0-9_\-]{8,}", html)
|
| 91 |
+
for t in tokens:
|
| 92 |
+
if any(x in t.lower() for x in ["chall", "code", "id"]):
|
| 93 |
+
hidden_values.append(f"token {t}")
|
| 94 |
+
|
| 95 |
return {
|
| 96 |
"title": title,
|
| 97 |
"visible_text": visible_text,
|
| 98 |
+
"hidden_values": hidden_values[:500], # keep cap
|
| 99 |
}
|
| 100 |
+
|
| 101 |
except Exception as e:
|
| 102 |
logger.error(f"Request scraping failed for {url}: {e}")
|
| 103 |
return {}
|
| 104 |
|
| 105 |
|
| 106 |
+
|
| 107 |
def answer_question(question: str, content: Dict[str, Any]) -> str:
|
| 108 |
"""Simple rule-based extraction for Round 5 questions."""
|
| 109 |
ql = question.lower()
|