Spaces:
Build error
Build error
Commit
·
d12e55c
1
Parent(s):
5ed1355
added logs scraping
Browse files- Dockerfile +8 -10
- main.py +187 -31
Dockerfile
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
FROM python:3.11
|
| 2 |
|
| 3 |
-
# Install system dependencies
|
| 4 |
RUN apt-get update && apt-get install -y \
|
| 5 |
wget \
|
| 6 |
gnupg \
|
|
@@ -8,21 +8,19 @@ RUN apt-get update && apt-get install -y \
|
|
| 8 |
curl \
|
| 9 |
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
|
| 11 |
-
#
|
| 12 |
-
|
| 13 |
-
RUN wget -q -O - https://dl.google.com/linux/linux_signing_key.pub \
|
| 14 |
-
| gpg --dearmor -o /usr/share/keyrings/google-chrome.gpg \
|
| 15 |
&& echo "deb [arch=amd64 signed-by=/usr/share/keyrings/google-chrome.gpg] http://dl.google.com/linux/chrome/deb/ stable main" \
|
| 16 |
> /etc/apt/sources.list.d/google-chrome.list \
|
| 17 |
&& apt-get update \
|
| 18 |
&& apt-get install -y google-chrome-stable \
|
| 19 |
&& rm -rf /var/lib/apt/lists/*
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
&& wget -O /tmp/chromedriver.zip
|
| 25 |
-
&& unzip /tmp/chromedriver.zip
|
| 26 |
&& rm /tmp/chromedriver.zip \
|
| 27 |
&& chmod +x /usr/local/bin/chromedriver
|
| 28 |
|
|
|
|
| 1 |
FROM python:3.11
|
| 2 |
|
| 3 |
+
# Install system dependencies
|
| 4 |
RUN apt-get update && apt-get install -y \
|
| 5 |
wget \
|
| 6 |
gnupg \
|
|
|
|
| 8 |
curl \
|
| 9 |
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
|
| 11 |
+
# Add Google Chrome repo (Bookworm-safe, no apt-key).
# The published signing key is ASCII-armored. A keyring referenced through
# signed-by under a .gpg name must be in binary form, so the key is
# de-armored with gpg (installed above via gnupg) instead of being saved as-is.
RUN wget -q -O - https://dl.google.com/linux/linux_signing_key.pub \
    | gpg --dearmor -o /usr/share/keyrings/google-chrome.gpg \
    && echo "deb [arch=amd64 signed-by=/usr/share/keyrings/google-chrome.gpg] http://dl.google.com/linux/chrome/deb/ stable main" \
    > /etc/apt/sources.list.d/google-chrome.list \
    && apt-get update \
    && apt-get install -y google-chrome-stable \
    && rm -rf /var/lib/apt/lists/*
|
| 18 |
|
| 19 |
+
# Install ChromeDriver (matching the installed Chrome major version).
# Drivers for Chrome 115+ are published through "Chrome for Testing"; the
# legacy chromedriver.storage.googleapis.com bucket stops at version 114, so
# looking up LATEST_RELEASE_<major> there fails for current Chrome stable.
# curl -f makes a failed version lookup abort the build instead of silently
# producing a bogus DRIVER_VERSION.
# The Chrome for Testing archive nests the binary under chromedriver-linux64/,
# so unzip -j extracts just that file directly into /usr/local/bin.
RUN CHROME_VERSION=$(google-chrome --version | awk '{print $3}' | cut -d. -f1) \
    && DRIVER_VERSION=$(curl -fsSL "https://googlechromelabs.github.io/chrome-for-testing/LATEST_RELEASE_${CHROME_VERSION}") \
    && wget -q -O /tmp/chromedriver.zip "https://storage.googleapis.com/chrome-for-testing-public/${DRIVER_VERSION}/linux64/chromedriver-linux64.zip" \
    && unzip -j -o /tmp/chromedriver.zip chromedriver-linux64/chromedriver -d /usr/local/bin/ \
    && rm /tmp/chromedriver.zip \
    && chmod +x /usr/local/bin/chromedriver
|
| 26 |
|
main.py
CHANGED
|
@@ -5,9 +5,16 @@ import requests
|
|
| 5 |
import base64
|
| 6 |
import json
|
| 7 |
import os
|
|
|
|
|
|
|
| 8 |
from bs4 import BeautifulSoup
|
| 9 |
import logging
|
| 10 |
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
logging.basicConfig(level=logging.INFO)
|
| 13 |
logger = logging.getLogger(__name__)
|
|
@@ -49,6 +56,94 @@ def call_llm(messages: List[dict], max_tokens: int = 150) -> str:
|
|
| 49 |
logger.error(f"LLM API call failed: {e}")
|
| 50 |
return ""
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
def extract_hidden_elements(html_content: str) -> List[str]:
|
| 53 |
"""Extract hidden elements from HTML"""
|
| 54 |
soup = BeautifulSoup(html_content, 'html.parser')
|
|
@@ -79,37 +174,48 @@ def extract_hidden_elements(html_content: str) -> List[str]:
|
|
| 79 |
|
| 80 |
return hidden_elements
|
| 81 |
|
| 82 |
-
def
|
| 83 |
-
"""Enhanced scraping with
|
| 84 |
try:
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
| 88 |
-
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
| 89 |
-
'Accept-Language': 'en-US,en;q=0.5',
|
| 90 |
-
'Accept-Encoding': 'gzip, deflate',
|
| 91 |
-
'Connection': 'keep-alive'
|
| 92 |
-
})
|
| 93 |
-
|
| 94 |
-
response = session.get(url, timeout=30)
|
| 95 |
-
response.raise_for_status()
|
| 96 |
|
| 97 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
title = soup.find('title')
|
| 100 |
title_text = title.get_text().strip() if title else "No title"
|
| 101 |
|
| 102 |
visible_text = soup.get_text(separator=' ', strip=True)
|
| 103 |
|
| 104 |
-
hidden_elements = extract_hidden_elements(
|
| 105 |
|
| 106 |
scripts = soup.find_all('script')
|
| 107 |
script_data = []
|
| 108 |
for script in scripts:
|
| 109 |
if script.string:
|
| 110 |
script_content = script.string.strip()
|
| 111 |
-
if any(keyword in script_content.lower() for keyword in ['challenge', 'code', 'answer', 'hidden']):
|
| 112 |
-
script_data.append(f"Script data: {script_content[:
|
| 113 |
|
| 114 |
# Look for meta tags
|
| 115 |
meta_data = []
|
|
@@ -123,25 +229,46 @@ def advanced_scrape(url: str) -> dict:
|
|
| 123 |
'visible_text': visible_text[:2000],
|
| 124 |
'hidden_elements': hidden_elements,
|
| 125 |
'script_data': script_data,
|
| 126 |
-
'meta_data': meta_data[:5],
|
| 127 |
-
'
|
|
|
|
| 128 |
}
|
| 129 |
|
| 130 |
except Exception as e:
|
| 131 |
-
logger.error(f"Advanced scraping failed for {url}: {e}")
|
| 132 |
return {}
|
| 133 |
|
| 134 |
def analyze_content_intelligently(content: dict, question: str) -> str:
|
| 135 |
-
"""Intelligent content analysis with
|
| 136 |
if not content:
|
| 137 |
return "Unable to access page content"
|
| 138 |
|
| 139 |
-
# Strategy 1:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
if "challenge name" in question.lower():
|
| 141 |
# Look in title first
|
| 142 |
if content.get('title') and content['title'] != "No title":
|
| 143 |
return content['title']
|
| 144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
# Look in hidden elements
|
| 146 |
for element in content.get('hidden_elements', []):
|
| 147 |
if 'challenge' in element.lower():
|
|
@@ -162,7 +289,7 @@ def analyze_content_intelligently(content: dict, question: str) -> str:
|
|
| 162 |
if match:
|
| 163 |
return match.group(1).strip()
|
| 164 |
|
| 165 |
-
# Strategy
|
| 166 |
context_parts = []
|
| 167 |
|
| 168 |
if content.get('title'):
|
|
@@ -171,6 +298,9 @@ def analyze_content_intelligently(content: dict, question: str) -> str:
|
|
| 171 |
if content.get('visible_text'):
|
| 172 |
context_parts.append(f"Text: {content['visible_text'][:800]}")
|
| 173 |
|
|
|
|
|
|
|
|
|
|
| 174 |
if content.get('hidden_elements'):
|
| 175 |
context_parts.append(f"Hidden: {'; '.join(content['hidden_elements'][:3])}")
|
| 176 |
|
|
@@ -182,7 +312,7 @@ def analyze_content_intelligently(content: dict, question: str) -> str:
|
|
| 182 |
messages = [
|
| 183 |
{
|
| 184 |
"role": "system",
|
| 185 |
-
"content": "Extract the specific answer from webpage content. Be direct and concise. Focus on challenge names, codes, or specific elements requested."
|
| 186 |
},
|
| 187 |
{
|
| 188 |
"role": "user",
|
|
@@ -192,8 +322,14 @@ def analyze_content_intelligently(content: dict, question: str) -> str:
|
|
| 192 |
|
| 193 |
llm_answer = call_llm(messages, max_tokens=50)
|
| 194 |
|
| 195 |
-
# Strategy
|
| 196 |
if not llm_answer or len(llm_answer.strip()) < 3:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
for element in content.get('hidden_elements', []):
|
| 198 |
if len(element.split(':')) > 1:
|
| 199 |
return element.split(':')[-1].strip()
|
|
@@ -202,7 +338,7 @@ def analyze_content_intelligently(content: dict, question: str) -> str:
|
|
| 202 |
|
| 203 |
@app.post("/challenge", response_model=ChallengeResponse)
|
| 204 |
async def solve_challenge(request: ChallengeRequest):
|
| 205 |
-
"""Main endpoint to solve HackRx challenges"""
|
| 206 |
logger.info(f"Received challenge request - URL: {request.url}")
|
| 207 |
logger.info(f"Questions: {request.questions}")
|
| 208 |
|
|
@@ -212,8 +348,12 @@ async def solve_challenge(request: ChallengeRequest):
|
|
| 212 |
for question in request.questions:
|
| 213 |
logger.info(f"Processing question: {question}")
|
| 214 |
|
| 215 |
-
# Scrape the page
|
| 216 |
-
page_content =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
|
| 218 |
# Analyze and get answer
|
| 219 |
answer = analyze_content_intelligently(page_content, question)
|
|
@@ -229,13 +369,29 @@ async def solve_challenge(request: ChallengeRequest):
|
|
| 229 |
|
| 230 |
@app.get("/health")
|
| 231 |
async def health_check():
|
| 232 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
|
| 234 |
@app.get("/")
|
| 235 |
async def root():
|
| 236 |
return {
|
| 237 |
-
"message": "HackRx Mission API - Ready for action!",
|
| 238 |
-
"mode": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
"endpoints": {
|
| 240 |
"challenge": "/challenge (POST)",
|
| 241 |
"health": "/health (GET)"
|
|
|
|
| 5 |
import base64
|
| 6 |
import json
|
| 7 |
import os
|
| 8 |
+
import time
|
| 9 |
+
import asyncio
|
| 10 |
from bs4 import BeautifulSoup
|
| 11 |
import logging
|
| 12 |
import re
|
| 13 |
+
from selenium import webdriver
|
| 14 |
+
from selenium.webdriver.chrome.options import Options
|
| 15 |
+
from selenium.webdriver.common.by import By
|
| 16 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 17 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 18 |
|
| 19 |
logging.basicConfig(level=logging.INFO)
|
| 20 |
logger = logging.getLogger(__name__)
|
|
|
|
| 56 |
logger.error(f"LLM API call failed: {e}")
|
| 57 |
return ""
|
| 58 |
|
| 59 |
+
def get_chrome_driver():
    """Create a headless Chrome WebDriver with browser logging enabled.

    Returns:
        The started driver, or None when setup fails (the error is logged).
    """
    launch_flags = (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        # Logging-related switches
        "--enable-logging",
        "--log-level=0",
    )
    try:
        opts = Options()
        for flag in launch_flags:
            opts.add_argument(flag)

        # Ask ChromeDriver to record browser and performance log entries so
        # they can be read back later through driver.get_log(...).
        opts.set_capability('goog:loggingPrefs', {'browser': 'ALL', 'performance': 'ALL'})

        return webdriver.Chrome(options=opts)
    except Exception as e:
        logger.error(f"Failed to setup Chrome driver: {e}")
        return None
|
| 80 |
+
|
| 81 |
+
def extract_console_logs_with_selenium(url: str) -> dict:
    """Load a page in headless Chrome and collect its console output.

    Args:
        url: Address of the page to load.

    Returns:
        A dict with 'page_source' (HTML after the wait) and 'console_logs'
        (list of formatted log strings), or an empty dict when the driver
        could not be created or the page load failed.
    """
    # Wraps console.log and records every call in window.capturedConsoleOutput.
    # It has to be installed BEFORE the page's own scripts run; injecting it
    # after the load (as was done previously) can never see messages that were
    # already printed, and reading the buffer before it was assigned always
    # yielded an empty list.
    console_hook_script = """
    (function () {
        var captured = [];
        window.capturedConsoleOutput = captured;
        var originalLog = console.log;
        console.log = function () {
            captured.push(Array.from(arguments).join(' '));
            originalLog.apply(console, arguments);
        };
    })();
    """

    driver = None
    try:
        driver = get_chrome_driver()
        if not driver:
            return {}

        # Best effort: register the hook for every new document via CDP.
        # If this fails, the browser log below is still collected.
        try:
            driver.execute_cdp_cmd(
                "Page.addScriptToEvaluateOnNewDocument",
                {"source": console_hook_script},
            )
        except Exception as hook_error:
            logger.warning(f"Could not install console hook: {hook_error}")

        logger.info(f"Loading page with Selenium: {url}")
        driver.get(url)

        # Wait for 3 seconds for console logs to happen
        time.sleep(3)

        # Console entries recorded by ChromeDriver itself
        console_logs = []
        try:
            for log in driver.get_log('browser'):
                if log['level'] in ('INFO', 'WARNING', 'SEVERE'):
                    console_logs.append(f"Console {log['level']}: {log['message']}")
        except Exception as log_error:
            logger.warning(f"Could not retrieve console logs: {log_error}")

        # Get page source after waiting
        page_source = driver.page_source

        # Raw console.log arguments recorded by the injected hook
        try:
            captured_output = driver.execute_script(
                "return window.capturedConsoleOutput || [];"
            )
            for output in captured_output or []:
                console_logs.append(f"Captured console: {output}")
        except Exception as js_error:
            logger.warning(f"JavaScript execution failed: {js_error}")

        return {
            'page_source': page_source,
            'console_logs': console_logs
        }

    except Exception as e:
        logger.error(f"Selenium extraction failed: {e}")
        return {}
    finally:
        if driver:
            # A failing quit() must not replace the result being returned.
            try:
                driver.quit()
            except Exception as quit_error:
                logger.warning(f"Failed to quit Chrome driver: {quit_error}")
|
| 146 |
+
|
| 147 |
def extract_hidden_elements(html_content: str) -> List[str]:
|
| 148 |
"""Extract hidden elements from HTML"""
|
| 149 |
soup = BeautifulSoup(html_content, 'html.parser')
|
|
|
|
| 174 |
|
| 175 |
return hidden_elements
|
| 176 |
|
| 177 |
+
def advanced_scrape_with_console(url: str) -> dict:
|
| 178 |
+
"""Enhanced scraping with console log extraction"""
|
| 179 |
try:
|
| 180 |
+
# First try with Selenium for console logs
|
| 181 |
+
selenium_data = extract_console_logs_with_selenium(url)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
|
| 183 |
+
# Fallback to requests if Selenium fails
|
| 184 |
+
if not selenium_data:
|
| 185 |
+
logger.info("Selenium failed, falling back to requests")
|
| 186 |
+
session = requests.Session()
|
| 187 |
+
session.headers.update({
|
| 188 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
| 189 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
| 190 |
+
'Accept-Language': 'en-US,en;q=0.5',
|
| 191 |
+
'Accept-Encoding': 'gzip, deflate',
|
| 192 |
+
'Connection': 'keep-alive'
|
| 193 |
+
})
|
| 194 |
+
|
| 195 |
+
response = session.get(url, timeout=30)
|
| 196 |
+
response.raise_for_status()
|
| 197 |
+
html_content = response.text
|
| 198 |
+
console_logs = []
|
| 199 |
+
else:
|
| 200 |
+
html_content = selenium_data.get('page_source', '')
|
| 201 |
+
console_logs = selenium_data.get('console_logs', [])
|
| 202 |
+
|
| 203 |
+
soup = BeautifulSoup(html_content, 'html.parser')
|
| 204 |
|
| 205 |
title = soup.find('title')
|
| 206 |
title_text = title.get_text().strip() if title else "No title"
|
| 207 |
|
| 208 |
visible_text = soup.get_text(separator=' ', strip=True)
|
| 209 |
|
| 210 |
+
hidden_elements = extract_hidden_elements(html_content)
|
| 211 |
|
| 212 |
scripts = soup.find_all('script')
|
| 213 |
script_data = []
|
| 214 |
for script in scripts:
|
| 215 |
if script.string:
|
| 216 |
script_content = script.string.strip()
|
| 217 |
+
if any(keyword in script_content.lower() for keyword in ['challenge', 'code', 'answer', 'hidden', 'console.log']):
|
| 218 |
+
script_data.append(f"Script data: {script_content[:300]}")
|
| 219 |
|
| 220 |
# Look for meta tags
|
| 221 |
meta_data = []
|
|
|
|
| 229 |
'visible_text': visible_text[:2000],
|
| 230 |
'hidden_elements': hidden_elements,
|
| 231 |
'script_data': script_data,
|
| 232 |
+
'meta_data': meta_data[:5],
|
| 233 |
+
'console_logs': console_logs,
|
| 234 |
+
'html': html_content
|
| 235 |
}
|
| 236 |
|
| 237 |
except Exception as e:
|
| 238 |
+
logger.error(f"Advanced scraping with console failed for {url}: {e}")
|
| 239 |
return {}
|
| 240 |
|
| 241 |
def analyze_content_intelligently(content: dict, question: str) -> str:
|
| 242 |
+
"""Intelligent content analysis with console log support"""
|
| 243 |
if not content:
|
| 244 |
return "Unable to access page content"
|
| 245 |
|
| 246 |
+
# Strategy 1: Check console logs first for direct answers
|
| 247 |
+
console_logs = content.get('console_logs', [])
|
| 248 |
+
if console_logs:
|
| 249 |
+
logger.info(f"Found {len(console_logs)} console logs")
|
| 250 |
+
for log in console_logs:
|
| 251 |
+
if any(keyword in log.lower() for keyword in ['challenge', 'answer', 'code', 'name']):
|
| 252 |
+
# Extract potential answer from console log
|
| 253 |
+
parts = log.split(':')
|
| 254 |
+
if len(parts) > 1:
|
| 255 |
+
potential_answer = parts[-1].strip().strip('"').strip("'")
|
| 256 |
+
if len(potential_answer) > 2:
|
| 257 |
+
return potential_answer
|
| 258 |
+
|
| 259 |
+
# Strategy 2: Direct pattern matching for common questions
|
| 260 |
if "challenge name" in question.lower():
|
| 261 |
# Look in title first
|
| 262 |
if content.get('title') and content['title'] != "No title":
|
| 263 |
return content['title']
|
| 264 |
|
| 265 |
+
# Look in console logs
|
| 266 |
+
for log in console_logs:
|
| 267 |
+
if 'challenge' in log.lower() or 'name' in log.lower():
|
| 268 |
+
parts = log.split(':')
|
| 269 |
+
if len(parts) > 1:
|
| 270 |
+
return parts[-1].strip().strip('"').strip("'")
|
| 271 |
+
|
| 272 |
# Look in hidden elements
|
| 273 |
for element in content.get('hidden_elements', []):
|
| 274 |
if 'challenge' in element.lower():
|
|
|
|
| 289 |
if match:
|
| 290 |
return match.group(1).strip()
|
| 291 |
|
| 292 |
+
# Strategy 3: Use LLM for complex analysis including console logs
|
| 293 |
context_parts = []
|
| 294 |
|
| 295 |
if content.get('title'):
|
|
|
|
| 298 |
if content.get('visible_text'):
|
| 299 |
context_parts.append(f"Text: {content['visible_text'][:800]}")
|
| 300 |
|
| 301 |
+
if console_logs:
|
| 302 |
+
context_parts.append(f"Console Logs: {'; '.join(console_logs[:5])}")
|
| 303 |
+
|
| 304 |
if content.get('hidden_elements'):
|
| 305 |
context_parts.append(f"Hidden: {'; '.join(content['hidden_elements'][:3])}")
|
| 306 |
|
|
|
|
| 312 |
messages = [
|
| 313 |
{
|
| 314 |
"role": "system",
|
| 315 |
+
"content": "Extract the specific answer from webpage content including console logs. Be direct and concise. Focus on challenge names, codes, or specific elements requested. Console logs often contain the answer."
|
| 316 |
},
|
| 317 |
{
|
| 318 |
"role": "user",
|
|
|
|
| 322 |
|
| 323 |
llm_answer = call_llm(messages, max_tokens=50)
|
| 324 |
|
| 325 |
+
# Strategy 4: Fallback to first meaningful console log or hidden element
|
| 326 |
if not llm_answer or len(llm_answer.strip()) < 3:
|
| 327 |
+
# Try console logs first
|
| 328 |
+
for log in console_logs:
|
| 329 |
+
if len(log.split(':')) > 1:
|
| 330 |
+
return log.split(':')[-1].strip()
|
| 331 |
+
|
| 332 |
+
# Then try hidden elements
|
| 333 |
for element in content.get('hidden_elements', []):
|
| 334 |
if len(element.split(':')) > 1:
|
| 335 |
return element.split(':')[-1].strip()
|
|
|
|
| 338 |
|
| 339 |
@app.post("/challenge", response_model=ChallengeResponse)
|
| 340 |
async def solve_challenge(request: ChallengeRequest):
|
| 341 |
+
"""Main endpoint to solve HackRx challenges with console log support"""
|
| 342 |
logger.info(f"Received challenge request - URL: {request.url}")
|
| 343 |
logger.info(f"Questions: {request.questions}")
|
| 344 |
|
|
|
|
| 348 |
for question in request.questions:
|
| 349 |
logger.info(f"Processing question: {question}")
|
| 350 |
|
| 351 |
+
# Scrape the page with console log extraction
|
| 352 |
+
page_content = advanced_scrape_with_console(request.url)
|
| 353 |
+
|
| 354 |
+
# Log console output for debugging
|
| 355 |
+
if page_content.get('console_logs'):
|
| 356 |
+
logger.info(f"Console logs found: {page_content['console_logs']}")
|
| 357 |
|
| 358 |
# Analyze and get answer
|
| 359 |
answer = analyze_content_intelligently(page_content, question)
|
|
|
|
| 369 |
|
| 370 |
@app.get("/health")
async def health_check():
    """Health check with Selenium availability.

    Returns:
        A dict with a constant "status" of "healthy" and a boolean
        "selenium_available" that is True when a Chrome driver could be
        started for this probe.
    """
    # NOTE: every call starts (and stops) a Chrome driver via
    # get_chrome_driver(), so this endpoint is not a cheap liveness ping.
    selenium_available = False
    driver = None
    try:
        driver = get_chrome_driver()
        selenium_available = bool(driver)
    except Exception as e:
        # Keep the endpoint best-effort, but record why the probe failed
        # instead of swallowing the error silently.
        logger.warning(f"Selenium health probe failed: {e}")
    finally:
        if driver:
            try:
                driver.quit()
            except Exception as quit_error:
                logger.warning(f"Failed to quit Chrome driver: {quit_error}")

    return {"status": "healthy", "selenium_available": selenium_available}
|
| 383 |
|
| 384 |
@app.get("/")
|
| 385 |
async def root():
|
| 386 |
return {
|
| 387 |
+
"message": "HackRx Mission API - Ready for action with Console Log Support!",
|
| 388 |
+
"mode": "selenium-enhanced",
|
| 389 |
+
"features": [
|
| 390 |
+
"Console log extraction",
|
| 391 |
+
"3-second wait for dynamic content",
|
| 392 |
+
"Hidden element detection",
|
| 393 |
+
"JavaScript execution"
|
| 394 |
+
],
|
| 395 |
"endpoints": {
|
| 396 |
"challenge": "/challenge (POST)",
|
| 397 |
"health": "/health (GET)"
|