Rahul-Samedavar commited on
Commit
d12e55c
·
1 Parent(s): 5ed1355

added logs scraping

Browse files
Files changed (2) hide show
  1. Dockerfile +8 -10
  2. main.py +187 -31
Dockerfile CHANGED
@@ -1,6 +1,6 @@
1
  FROM python:3.11
2
 
3
- # Install system dependencies for Chrome
4
  RUN apt-get update && apt-get install -y \
5
  wget \
6
  gnupg \
@@ -8,21 +8,19 @@ RUN apt-get update && apt-get install -y \
8
  curl \
9
  && rm -rf /var/lib/apt/lists/*
10
 
11
- # Install Chrome
12
- # Install Chrome
13
- RUN wget -q -O - https://dl.google.com/linux/linux_signing_key.pub \
14
- | gpg --dearmor -o /usr/share/keyrings/google-chrome.gpg \
15
  && echo "deb [arch=amd64 signed-by=/usr/share/keyrings/google-chrome.gpg] http://dl.google.com/linux/chrome/deb/ stable main" \
16
  > /etc/apt/sources.list.d/google-chrome.list \
17
  && apt-get update \
18
  && apt-get install -y google-chrome-stable \
19
  && rm -rf /var/lib/apt/lists/*
20
 
21
-
22
- # Install ChromeDriver
23
- RUN CHROME_DRIVER_VERSION=`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE` \
24
- && wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/$CHROME_DRIVER_VERSION/chromedriver_linux64.zip \
25
- && unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/ \
26
  && rm /tmp/chromedriver.zip \
27
  && chmod +x /usr/local/bin/chromedriver
28
 
 
1
  FROM python:3.11
2
 
3
+ # Install system dependencies
4
  RUN apt-get update && apt-get install -y \
5
  wget \
6
  gnupg \
 
8
  curl \
9
  && rm -rf /var/lib/apt/lists/*
10
 
11
+ # Add Google Chrome repo (Bookworm-safe, no apt-key)
12
+ RUN wget -q -O /usr/share/keyrings/google-chrome.gpg https://dl.google.com/linux/linux_signing_key.pub \
 
 
13
  && echo "deb [arch=amd64 signed-by=/usr/share/keyrings/google-chrome.gpg] http://dl.google.com/linux/chrome/deb/ stable main" \
14
  > /etc/apt/sources.list.d/google-chrome.list \
15
  && apt-get update \
16
  && apt-get install -y google-chrome-stable \
17
  && rm -rf /var/lib/apt/lists/*
18
 
19
+ # Install ChromeDriver (matching Chrome version)
20
+ RUN CHROME_VERSION=$(google-chrome --version | awk '{print $3}' | cut -d. -f1) \
21
+ && DRIVER_VERSION=$(curl -s "https://chromedriver.storage.googleapis.com/LATEST_RELEASE_${CHROME_VERSION}") \
22
+ && wget -O /tmp/chromedriver.zip "https://chromedriver.storage.googleapis.com/${DRIVER_VERSION}/chromedriver_linux64.zip" \
23
+ && unzip /tmp/chromedriver.zip -d /usr/local/bin/ \
24
  && rm /tmp/chromedriver.zip \
25
  && chmod +x /usr/local/bin/chromedriver
26
 
main.py CHANGED
@@ -5,9 +5,16 @@ import requests
5
  import base64
6
  import json
7
  import os
 
 
8
  from bs4 import BeautifulSoup
9
  import logging
10
  import re
 
 
 
 
 
11
 
12
  logging.basicConfig(level=logging.INFO)
13
  logger = logging.getLogger(__name__)
@@ -49,6 +56,94 @@ def call_llm(messages: List[dict], max_tokens: int = 150) -> str:
49
  logger.error(f"LLM API call failed: {e}")
50
  return ""
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  def extract_hidden_elements(html_content: str) -> List[str]:
53
  """Extract hidden elements from HTML"""
54
  soup = BeautifulSoup(html_content, 'html.parser')
@@ -79,37 +174,48 @@ def extract_hidden_elements(html_content: str) -> List[str]:
79
 
80
  return hidden_elements
81
 
82
- def advanced_scrape(url: str) -> dict:
83
- """Enhanced scraping with better hidden element detection"""
84
  try:
85
- session = requests.Session()
86
- session.headers.update({
87
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
88
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
89
- 'Accept-Language': 'en-US,en;q=0.5',
90
- 'Accept-Encoding': 'gzip, deflate',
91
- 'Connection': 'keep-alive'
92
- })
93
-
94
- response = session.get(url, timeout=30)
95
- response.raise_for_status()
96
 
97
- soup = BeautifulSoup(response.text, 'html.parser')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
  title = soup.find('title')
100
  title_text = title.get_text().strip() if title else "No title"
101
 
102
  visible_text = soup.get_text(separator=' ', strip=True)
103
 
104
- hidden_elements = extract_hidden_elements(response.text)
105
 
106
  scripts = soup.find_all('script')
107
  script_data = []
108
  for script in scripts:
109
  if script.string:
110
  script_content = script.string.strip()
111
- if any(keyword in script_content.lower() for keyword in ['challenge', 'code', 'answer', 'hidden']):
112
- script_data.append(f"Script data: {script_content[:200]}")
113
 
114
  # Look for meta tags
115
  meta_data = []
@@ -123,25 +229,46 @@ def advanced_scrape(url: str) -> dict:
123
  'visible_text': visible_text[:2000],
124
  'hidden_elements': hidden_elements,
125
  'script_data': script_data,
126
- 'meta_data': meta_data[:5], # Limit meta data
127
- 'html': response.text
 
128
  }
129
 
130
  except Exception as e:
131
- logger.error(f"Advanced scraping failed for {url}: {e}")
132
  return {}
133
 
134
  def analyze_content_intelligently(content: dict, question: str) -> str:
135
- """Intelligent content analysis with multiple strategies"""
136
  if not content:
137
  return "Unable to access page content"
138
 
139
- # Strategy 1: Direct pattern matching for common questions
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  if "challenge name" in question.lower():
141
  # Look in title first
142
  if content.get('title') and content['title'] != "No title":
143
  return content['title']
144
 
 
 
 
 
 
 
 
145
  # Look in hidden elements
146
  for element in content.get('hidden_elements', []):
147
  if 'challenge' in element.lower():
@@ -162,7 +289,7 @@ def analyze_content_intelligently(content: dict, question: str) -> str:
162
  if match:
163
  return match.group(1).strip()
164
 
165
- # Strategy 2: Use LLM for complex analysis
166
  context_parts = []
167
 
168
  if content.get('title'):
@@ -171,6 +298,9 @@ def analyze_content_intelligently(content: dict, question: str) -> str:
171
  if content.get('visible_text'):
172
  context_parts.append(f"Text: {content['visible_text'][:800]}")
173
 
 
 
 
174
  if content.get('hidden_elements'):
175
  context_parts.append(f"Hidden: {'; '.join(content['hidden_elements'][:3])}")
176
 
@@ -182,7 +312,7 @@ def analyze_content_intelligently(content: dict, question: str) -> str:
182
  messages = [
183
  {
184
  "role": "system",
185
- "content": "Extract the specific answer from webpage content. Be direct and concise. Focus on challenge names, codes, or specific elements requested."
186
  },
187
  {
188
  "role": "user",
@@ -192,8 +322,14 @@ def analyze_content_intelligently(content: dict, question: str) -> str:
192
 
193
  llm_answer = call_llm(messages, max_tokens=50)
194
 
195
- # Strategy 3: Fallback to first meaningful hidden element
196
  if not llm_answer or len(llm_answer.strip()) < 3:
 
 
 
 
 
 
197
  for element in content.get('hidden_elements', []):
198
  if len(element.split(':')) > 1:
199
  return element.split(':')[-1].strip()
@@ -202,7 +338,7 @@ def analyze_content_intelligently(content: dict, question: str) -> str:
202
 
203
  @app.post("/challenge", response_model=ChallengeResponse)
204
  async def solve_challenge(request: ChallengeRequest):
205
- """Main endpoint to solve HackRx challenges"""
206
  logger.info(f"Received challenge request - URL: {request.url}")
207
  logger.info(f"Questions: {request.questions}")
208
 
@@ -212,8 +348,12 @@ async def solve_challenge(request: ChallengeRequest):
212
  for question in request.questions:
213
  logger.info(f"Processing question: {question}")
214
 
215
- # Scrape the page
216
- page_content = advanced_scrape(request.url)
 
 
 
 
217
 
218
  # Analyze and get answer
219
  answer = analyze_content_intelligently(page_content, question)
@@ -229,13 +369,29 @@ async def solve_challenge(request: ChallengeRequest):
229
 
230
  @app.get("/health")
231
  async def health_check():
232
- return {"status": "healthy", "selenium_available": False}
 
 
 
 
 
 
 
 
 
 
233
 
234
  @app.get("/")
235
  async def root():
236
  return {
237
- "message": "HackRx Mission API - Ready for action!",
238
- "mode": "requests-only",
 
 
 
 
 
 
239
  "endpoints": {
240
  "challenge": "/challenge (POST)",
241
  "health": "/health (GET)"
 
5
  import base64
6
  import json
7
  import os
8
+ import time
9
+ import asyncio
10
  from bs4 import BeautifulSoup
11
  import logging
12
  import re
13
+ from selenium import webdriver
14
+ from selenium.webdriver.chrome.options import Options
15
+ from selenium.webdriver.common.by import By
16
+ from selenium.webdriver.support.ui import WebDriverWait
17
+ from selenium.webdriver.support import expected_conditions as EC
18
 
19
  logging.basicConfig(level=logging.INFO)
20
  logger = logging.getLogger(__name__)
 
56
  logger.error(f"LLM API call failed: {e}")
57
  return ""
58
 
59
def get_chrome_driver():
    """Create a headless Chrome WebDriver configured to capture console logs.

    Returns:
        A selenium ``webdriver.Chrome`` instance on success, or ``None`` if
        the browser/driver could not be started (missing Chrome, bad
        chromedriver, etc.).
    """
    try:
        opts = Options()
        # Container-friendly flags: headless, sandbox off, /dev/shm workaround.
        for flag in (
            "--headless",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu",
            "--window-size=1920,1080",
            "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            # Turn on Chrome's own logging at the most verbose level.
            "--enable-logging",
            "--log-level=0",
        ):
            opts.add_argument(flag)

        # Ask Chrome to buffer browser-console and performance log entries so
        # they can be read back later via driver.get_log('browser').
        opts.set_capability('goog:loggingPrefs', {'browser': 'ALL', 'performance': 'ALL'})

        return webdriver.Chrome(options=opts)
    except Exception as e:
        logger.error(f"Failed to setup Chrome driver: {e}")
        return None
80
+
81
def extract_console_logs_with_selenium(url: str) -> dict:
    """Load *url* in headless Chrome and collect its browser console output.

    Args:
        url: Page to load.

    Returns:
        ``{'page_source': <HTML after JS ran>, 'console_logs': [<str>, ...]}``
        on success, or ``{}`` when the driver cannot start or the page load
        fails. The browser is always shut down, even on error.
    """
    driver = None
    try:
        driver = get_chrome_driver()
        if not driver:
            return {}

        logger.info(f"Loading page with Selenium: {url}")
        driver.get(url)

        # Give the page's scripts a moment to run and emit console output.
        time.sleep(3)

        # goog:loggingPrefs (set in get_chrome_driver) makes Chrome buffer
        # console messages; read them back here. Some driver builds don't
        # support get_log, so failures are non-fatal.
        console_logs = []
        try:
            for entry in driver.get_log('browser'):
                if entry['level'] in ['INFO', 'WARNING', 'SEVERE']:
                    console_logs.append(f"Console {entry['level']}: {entry['message']}")
        except Exception as log_error:
            logger.warning(f"Could not retrieve console logs: {log_error}")

        # Snapshot the DOM after dynamic content has rendered.
        page_source = driver.page_source

        # NOTE(fix): the previous revision injected a console.log override via
        # execute_script and returned window.capturedConsoleOutput. That could
        # never capture anything: the override was installed *after* the
        # page's own scripts had already run, and the synchronous `return`
        # executed before the setTimeout populated the array, so it always
        # produced []. The dead injection has been removed. To capture logs
        # emitted during page load beyond what get_log('browser') buffers,
        # inject the hook *before* navigation with CDP
        # (Page.addScriptToEvaluateOnNewDocument via driver.execute_cdp_cmd).

        return {
            'page_source': page_source,
            'console_logs': console_logs
        }

    except Exception as e:
        logger.error(f"Selenium extraction failed: {e}")
        return {}
    finally:
        # Always release the browser process.
        if driver:
            driver.quit()
146
+
147
  def extract_hidden_elements(html_content: str) -> List[str]:
148
  """Extract hidden elements from HTML"""
149
  soup = BeautifulSoup(html_content, 'html.parser')
 
174
 
175
  return hidden_elements
176
 
177
+ def advanced_scrape_with_console(url: str) -> dict:
178
+ """Enhanced scraping with console log extraction"""
179
  try:
180
+ # First try with Selenium for console logs
181
+ selenium_data = extract_console_logs_with_selenium(url)
 
 
 
 
 
 
 
 
 
182
 
183
+ # Fallback to requests if Selenium fails
184
+ if not selenium_data:
185
+ logger.info("Selenium failed, falling back to requests")
186
+ session = requests.Session()
187
+ session.headers.update({
188
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
189
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
190
+ 'Accept-Language': 'en-US,en;q=0.5',
191
+ 'Accept-Encoding': 'gzip, deflate',
192
+ 'Connection': 'keep-alive'
193
+ })
194
+
195
+ response = session.get(url, timeout=30)
196
+ response.raise_for_status()
197
+ html_content = response.text
198
+ console_logs = []
199
+ else:
200
+ html_content = selenium_data.get('page_source', '')
201
+ console_logs = selenium_data.get('console_logs', [])
202
+
203
+ soup = BeautifulSoup(html_content, 'html.parser')
204
 
205
  title = soup.find('title')
206
  title_text = title.get_text().strip() if title else "No title"
207
 
208
  visible_text = soup.get_text(separator=' ', strip=True)
209
 
210
+ hidden_elements = extract_hidden_elements(html_content)
211
 
212
  scripts = soup.find_all('script')
213
  script_data = []
214
  for script in scripts:
215
  if script.string:
216
  script_content = script.string.strip()
217
+ if any(keyword in script_content.lower() for keyword in ['challenge', 'code', 'answer', 'hidden', 'console.log']):
218
+ script_data.append(f"Script data: {script_content[:300]}")
219
 
220
  # Look for meta tags
221
  meta_data = []
 
229
  'visible_text': visible_text[:2000],
230
  'hidden_elements': hidden_elements,
231
  'script_data': script_data,
232
+ 'meta_data': meta_data[:5],
233
+ 'console_logs': console_logs,
234
+ 'html': html_content
235
  }
236
 
237
  except Exception as e:
238
+ logger.error(f"Advanced scraping with console failed for {url}: {e}")
239
  return {}
240
 
241
  def analyze_content_intelligently(content: dict, question: str) -> str:
242
+ """Intelligent content analysis with console log support"""
243
  if not content:
244
  return "Unable to access page content"
245
 
246
+ # Strategy 1: Check console logs first for direct answers
247
+ console_logs = content.get('console_logs', [])
248
+ if console_logs:
249
+ logger.info(f"Found {len(console_logs)} console logs")
250
+ for log in console_logs:
251
+ if any(keyword in log.lower() for keyword in ['challenge', 'answer', 'code', 'name']):
252
+ # Extract potential answer from console log
253
+ parts = log.split(':')
254
+ if len(parts) > 1:
255
+ potential_answer = parts[-1].strip().strip('"').strip("'")
256
+ if len(potential_answer) > 2:
257
+ return potential_answer
258
+
259
+ # Strategy 2: Direct pattern matching for common questions
260
  if "challenge name" in question.lower():
261
  # Look in title first
262
  if content.get('title') and content['title'] != "No title":
263
  return content['title']
264
 
265
+ # Look in console logs
266
+ for log in console_logs:
267
+ if 'challenge' in log.lower() or 'name' in log.lower():
268
+ parts = log.split(':')
269
+ if len(parts) > 1:
270
+ return parts[-1].strip().strip('"').strip("'")
271
+
272
  # Look in hidden elements
273
  for element in content.get('hidden_elements', []):
274
  if 'challenge' in element.lower():
 
289
  if match:
290
  return match.group(1).strip()
291
 
292
+ # Strategy 3: Use LLM for complex analysis including console logs
293
  context_parts = []
294
 
295
  if content.get('title'):
 
298
  if content.get('visible_text'):
299
  context_parts.append(f"Text: {content['visible_text'][:800]}")
300
 
301
+ if console_logs:
302
+ context_parts.append(f"Console Logs: {'; '.join(console_logs[:5])}")
303
+
304
  if content.get('hidden_elements'):
305
  context_parts.append(f"Hidden: {'; '.join(content['hidden_elements'][:3])}")
306
 
 
312
  messages = [
313
  {
314
  "role": "system",
315
+ "content": "Extract the specific answer from webpage content including console logs. Be direct and concise. Focus on challenge names, codes, or specific elements requested. Console logs often contain the answer."
316
  },
317
  {
318
  "role": "user",
 
322
 
323
  llm_answer = call_llm(messages, max_tokens=50)
324
 
325
+ # Strategy 4: Fallback to first meaningful console log or hidden element
326
  if not llm_answer or len(llm_answer.strip()) < 3:
327
+ # Try console logs first
328
+ for log in console_logs:
329
+ if len(log.split(':')) > 1:
330
+ return log.split(':')[-1].strip()
331
+
332
+ # Then try hidden elements
333
  for element in content.get('hidden_elements', []):
334
  if len(element.split(':')) > 1:
335
  return element.split(':')[-1].strip()
 
338
 
339
  @app.post("/challenge", response_model=ChallengeResponse)
340
  async def solve_challenge(request: ChallengeRequest):
341
+ """Main endpoint to solve HackRx challenges with console log support"""
342
  logger.info(f"Received challenge request - URL: {request.url}")
343
  logger.info(f"Questions: {request.questions}")
344
 
 
348
  for question in request.questions:
349
  logger.info(f"Processing question: {question}")
350
 
351
+ # Scrape the page with console log extraction
352
+ page_content = advanced_scrape_with_console(request.url)
353
+
354
+ # Log console output for debugging
355
+ if page_content.get('console_logs'):
356
+ logger.info(f"Console logs found: {page_content['console_logs']}")
357
 
358
  # Analyze and get answer
359
  answer = analyze_content_intelligently(page_content, question)
 
369
 
370
@app.get("/health")
async def health_check():
    """Health check that also probes whether a Chrome driver can start.

    Starting (and immediately quitting) a real driver is the only reliable
    probe, but it is comparatively slow — avoid polling this endpoint
    aggressively.
    """
    selenium_available = False
    driver = None
    try:
        driver = get_chrome_driver()
        selenium_available = driver is not None
    except Exception as exc:
        # Was a bare `except: pass`, which also swallowed SystemExit /
        # KeyboardInterrupt and hid the failure reason. Narrow the catch and
        # log it so a broken Chrome install is diagnosable.
        logger.warning(f"Selenium health probe failed: {exc}")
    finally:
        # Quit in finally so the probe browser is released even if a later
        # step raises; the original only quit on the success path.
        if driver:
            try:
                driver.quit()
            except Exception:
                pass

    return {"status": "healthy", "selenium_available": selenium_available}
383
 
384
  @app.get("/")
385
  async def root():
386
  return {
387
+ "message": "HackRx Mission API - Ready for action with Console Log Support!",
388
+ "mode": "selenium-enhanced",
389
+ "features": [
390
+ "Console log extraction",
391
+ "3-second wait for dynamic content",
392
+ "Hidden element detection",
393
+ "JavaScript execution"
394
+ ],
395
  "endpoints": {
396
  "challenge": "/challenge (POST)",
397
  "health": "/health (GET)"