Rahul-Samedavar commited on
Commit
d12e55c
·
1 Parent(s): 5ed1355

added logs scraping

Browse files
Files changed (2) hide show
  1. Dockerfile +8 -10
  2. main.py +187 -31
Dockerfile CHANGED
@@ -1,6 +1,6 @@
1
  FROM python:3.11
2
 
3
- # Install system dependencies for Chrome
4
  RUN apt-get update && apt-get install -y \
5
  wget \
6
  gnupg \
@@ -8,21 +8,19 @@ RUN apt-get update && apt-get install -y \
8
  curl \
9
  && rm -rf /var/lib/apt/lists/*
10
 
11
- # Install Chrome
12
- # Install Chrome
13
- RUN wget -q -O - https://dl.google.com/linux/linux_signing_key.pub \
14
- | gpg --dearmor -o /usr/share/keyrings/google-chrome.gpg \
15
  && echo "deb [arch=amd64 signed-by=/usr/share/keyrings/google-chrome.gpg] http://dl.google.com/linux/chrome/deb/ stable main" \
16
  > /etc/apt/sources.list.d/google-chrome.list \
17
  && apt-get update \
18
  && apt-get install -y google-chrome-stable \
19
  && rm -rf /var/lib/apt/lists/*
20
 
21
-
22
- # Install ChromeDriver
23
- RUN CHROME_DRIVER_VERSION=`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE` \
24
- && wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/$CHROME_DRIVER_VERSION/chromedriver_linux64.zip \
25
- && unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/ \
26
  && rm /tmp/chromedriver.zip \
27
  && chmod +x /usr/local/bin/chromedriver
28
 
 
1
  FROM python:3.11
2
 
3
+ # Install system dependencies
4
  RUN apt-get update && apt-get install -y \
5
  wget \
6
  gnupg \
 
8
  curl \
9
  && rm -rf /var/lib/apt/lists/*
10
 
11
+ # Add Google Chrome repo (Bookworm-safe, no apt-key)
12
+ RUN wget -q -O /usr/share/keyrings/google-chrome.gpg https://dl.google.com/linux/linux_signing_key.pub \
 
 
13
  && echo "deb [arch=amd64 signed-by=/usr/share/keyrings/google-chrome.gpg] http://dl.google.com/linux/chrome/deb/ stable main" \
14
  > /etc/apt/sources.list.d/google-chrome.list \
15
  && apt-get update \
16
  && apt-get install -y google-chrome-stable \
17
  && rm -rf /var/lib/apt/lists/*
18
 
19
+ # Install ChromeDriver (matching Chrome version)
20
+ RUN CHROME_VERSION=$(google-chrome --version | awk '{print $3}' | cut -d. -f1) \
21
+ && DRIVER_VERSION=$(curl -s "https://chromedriver.storage.googleapis.com/LATEST_RELEASE_${CHROME_VERSION}") \
22
+ && wget -O /tmp/chromedriver.zip "https://chromedriver.storage.googleapis.com/${DRIVER_VERSION}/chromedriver_linux64.zip" \
23
+ && unzip /tmp/chromedriver.zip -d /usr/local/bin/ \
24
  && rm /tmp/chromedriver.zip \
25
  && chmod +x /usr/local/bin/chromedriver
26
 
main.py CHANGED
@@ -5,9 +5,16 @@ import requests
5
  import base64
6
  import json
7
  import os
 
 
8
  from bs4 import BeautifulSoup
9
  import logging
10
  import re
 
 
 
 
 
11
 
12
  logging.basicConfig(level=logging.INFO)
13
  logger = logging.getLogger(__name__)
@@ -49,6 +56,94 @@ def call_llm(messages: List[dict], max_tokens: int = 150) -> str:
49
  logger.error(f"LLM API call failed: {e}")
50
  return ""
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  def extract_hidden_elements(html_content: str) -> List[str]:
53
  """Extract hidden elements from HTML"""
54
  soup = BeautifulSoup(html_content, 'html.parser')
@@ -79,37 +174,48 @@ def extract_hidden_elements(html_content: str) -> List[str]:
79
 
80
  return hidden_elements
81
 
82
- def advanced_scrape(url: str) -> dict:
83
- """Enhanced scraping with better hidden element detection"""
84
  try:
85
- session = requests.Session()
86
- session.headers.update({
87
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
88
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
89
- 'Accept-Language': 'en-US,en;q=0.5',
90
- 'Accept-Encoding': 'gzip, deflate',
91
- 'Connection': 'keep-alive'
92
- })
93
-
94
- response = session.get(url, timeout=30)
95
- response.raise_for_status()
96
 
97
- soup = BeautifulSoup(response.text, 'html.parser')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
  title = soup.find('title')
100
  title_text = title.get_text().strip() if title else "No title"
101
 
102
  visible_text = soup.get_text(separator=' ', strip=True)
103
 
104
- hidden_elements = extract_hidden_elements(response.text)
105
 
106
  scripts = soup.find_all('script')
107
  script_data = []
108
  for script in scripts:
109
  if script.string:
110
  script_content = script.string.strip()
111
- if any(keyword in script_content.lower() for keyword in ['challenge', 'code', 'answer', 'hidden']):
112
- script_data.append(f"Script data: {script_content[:200]}")
113
 
114
  # Look for meta tags
115
  meta_data = []
@@ -123,25 +229,46 @@ def advanced_scrape(url: str) -> dict:
123
  'visible_text': visible_text[:2000],
124
  'hidden_elements': hidden_elements,
125
  'script_data': script_data,
126
- 'meta_data': meta_data[:5], # Limit meta data
127
- 'html': response.text
 
128
  }
129
 
130
  except Exception as e:
131
- logger.error(f"Advanced scraping failed for {url}: {e}")
132
  return {}
133
 
134
  def analyze_content_intelligently(content: dict, question: str) -> str:
135
- """Intelligent content analysis with multiple strategies"""
136
  if not content:
137
  return "Unable to access page content"
138
 
139
- # Strategy 1: Direct pattern matching for common questions
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  if "challenge name" in question.lower():
141
  # Look in title first
142
  if content.get('title') and content['title'] != "No title":
143
  return content['title']
144
 
 
 
 
 
 
 
 
145
  # Look in hidden elements
146
  for element in content.get('hidden_elements', []):
147
  if 'challenge' in element.lower():
@@ -162,7 +289,7 @@ def analyze_content_intelligently(content: dict, question: str) -> str:
162
  if match:
163
  return match.group(1).strip()
164
 
165
- # Strategy 2: Use LLM for complex analysis
166
  context_parts = []
167
 
168
  if content.get('title'):
@@ -171,6 +298,9 @@ def analyze_content_intelligently(content: dict, question: str) -> str:
171
  if content.get('visible_text'):
172
  context_parts.append(f"Text: {content['visible_text'][:800]}")
173
 
 
 
 
174
  if content.get('hidden_elements'):
175
  context_parts.append(f"Hidden: {'; '.join(content['hidden_elements'][:3])}")
176
 
@@ -182,7 +312,7 @@ def analyze_content_intelligently(content: dict, question: str) -> str:
182
  messages = [
183
  {
184
  "role": "system",
185
- "content": "Extract the specific answer from webpage content. Be direct and concise. Focus on challenge names, codes, or specific elements requested."
186
  },
187
  {
188
  "role": "user",
@@ -192,8 +322,14 @@ def analyze_content_intelligently(content: dict, question: str) -> str:
192
 
193
  llm_answer = call_llm(messages, max_tokens=50)
194
 
195
- # Strategy 3: Fallback to first meaningful hidden element
196
  if not llm_answer or len(llm_answer.strip()) < 3:
 
 
 
 
 
 
197
  for element in content.get('hidden_elements', []):
198
  if len(element.split(':')) > 1:
199
  return element.split(':')[-1].strip()
@@ -202,7 +338,7 @@ def analyze_content_intelligently(content: dict, question: str) -> str:
202
 
203
  @app.post("/challenge", response_model=ChallengeResponse)
204
  async def solve_challenge(request: ChallengeRequest):
205
- """Main endpoint to solve HackRx challenges"""
206
  logger.info(f"Received challenge request - URL: {request.url}")
207
  logger.info(f"Questions: {request.questions}")
208
 
@@ -212,8 +348,12 @@ async def solve_challenge(request: ChallengeRequest):
212
  for question in request.questions:
213
  logger.info(f"Processing question: {question}")
214
 
215
- # Scrape the page
216
- page_content = advanced_scrape(request.url)
 
 
 
 
217
 
218
  # Analyze and get answer
219
  answer = analyze_content_intelligently(page_content, question)
@@ -229,13 +369,29 @@ async def solve_challenge(request: ChallengeRequest):
229
 
230
  @app.get("/health")
231
  async def health_check():
232
- return {"status": "healthy", "selenium_available": False}
 
 
 
 
 
 
 
 
 
 
233
 
234
  @app.get("/")
235
  async def root():
236
  return {
237
- "message": "HackRx Mission API - Ready for action!",
238
- "mode": "requests-only",
 
 
 
 
 
 
239
  "endpoints": {
240
  "challenge": "/challenge (POST)",
241
  "health": "/health (GET)"
 
5
  import base64
6
  import json
7
  import os
8
+ import time
9
+ import asyncio
10
  from bs4 import BeautifulSoup
11
  import logging
12
  import re
13
+ from selenium import webdriver
14
+ from selenium.webdriver.chrome.options import Options
15
+ from selenium.webdriver.common.by import By
16
+ from selenium.webdriver.support.ui import WebDriverWait
17
+ from selenium.webdriver.support import expected_conditions as EC
18
 
19
  logging.basicConfig(level=logging.INFO)
20
  logger = logging.getLogger(__name__)
 
56
  logger.error(f"LLM API call failed: {e}")
57
  return ""
58
 
59
def get_chrome_driver():
    """Create a headless Chrome WebDriver configured to capture console logs.

    Returns:
        A selenium ``webdriver.Chrome`` instance on success, or ``None`` if
        the browser/driver could not be started (missing Chrome, bad
        chromedriver, etc.).
    """
    try:
        opts = Options()
        # Container-friendly flags: headless, sandbox off, /dev/shm workaround.
        for flag in (
            "--headless",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu",
            "--window-size=1920,1080",
            "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            # Turn on Chrome's own logging at the most verbose level.
            "--enable-logging",
            "--log-level=0",
        ):
            opts.add_argument(flag)

        # Ask Chrome to buffer browser-console and performance log entries so
        # they can be read back later via driver.get_log('browser').
        opts.set_capability('goog:loggingPrefs', {'browser': 'ALL', 'performance': 'ALL'})

        return webdriver.Chrome(options=opts)
    except Exception as e:
        logger.error(f"Failed to setup Chrome driver: {e}")
        return None
80
+
81
def extract_console_logs_with_selenium(url: str) -> dict:
    """Load *url* in headless Chrome and collect its browser console output.

    Args:
        url: Page to load.

    Returns:
        ``{'page_source': <HTML after JS ran>, 'console_logs': [<str>, ...]}``
        on success, or ``{}`` when the driver cannot start or the page load
        fails. The browser is always shut down, even on error.
    """
    driver = None
    try:
        driver = get_chrome_driver()
        if not driver:
            return {}

        logger.info(f"Loading page with Selenium: {url}")
        driver.get(url)

        # Give the page's scripts a moment to run and emit console output.
        time.sleep(3)

        # goog:loggingPrefs (set in get_chrome_driver) makes Chrome buffer
        # console messages; read them back here. Some driver builds don't
        # support get_log, so failures are non-fatal.
        console_logs = []
        try:
            for entry in driver.get_log('browser'):
                if entry['level'] in ['INFO', 'WARNING', 'SEVERE']:
                    console_logs.append(f"Console {entry['level']}: {entry['message']}")
        except Exception as log_error:
            logger.warning(f"Could not retrieve console logs: {log_error}")

        # Snapshot the DOM after dynamic content has rendered.
        page_source = driver.page_source

        # NOTE(fix): the previous revision injected a console.log override via
        # execute_script and returned window.capturedConsoleOutput. That could
        # never capture anything: the override was installed *after* the
        # page's own scripts had already run, and the synchronous `return`
        # executed before the setTimeout populated the array, so it always
        # produced []. The dead injection has been removed. To capture logs
        # emitted during page load beyond what get_log('browser') buffers,
        # inject the hook *before* navigation with CDP
        # (Page.addScriptToEvaluateOnNewDocument via driver.execute_cdp_cmd).

        return {
            'page_source': page_source,
            'console_logs': console_logs
        }

    except Exception as e:
        logger.error(f"Selenium extraction failed: {e}")
        return {}
    finally:
        # Always release the browser process.
        if driver:
            driver.quit()
146
+
147
  def extract_hidden_elements(html_content: str) -> List[str]:
148
  """Extract hidden elements from HTML"""
149
  soup = BeautifulSoup(html_content, 'html.parser')
 
174
 
175
  return hidden_elements
176
 
177
+ def advanced_scrape_with_console(url: str) -> dict:
178
+ """Enhanced scraping with console log extraction"""
179
  try:
180
+ # First try with Selenium for console logs
181
+ selenium_data = extract_console_logs_with_selenium(url)
 
 
 
 
 
 
 
 
 
182
 
183
+ # Fallback to requests if Selenium fails
184
+ if not selenium_data:
185
+ logger.info("Selenium failed, falling back to requests")
186
+ session = requests.Session()
187
+ session.headers.update({
188
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
189
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
190
+ 'Accept-Language': 'en-US,en;q=0.5',
191
+ 'Accept-Encoding': 'gzip, deflate',
192
+ 'Connection': 'keep-alive'
193
+ })
194
+
195
+ response = session.get(url, timeout=30)
196
+ response.raise_for_status()
197
+ html_content = response.text
198
+ console_logs = []
199
+ else:
200
+ html_content = selenium_data.get('page_source', '')
201
+ console_logs = selenium_data.get('console_logs', [])
202
+
203
+ soup = BeautifulSoup(html_content, 'html.parser')
204
 
205
  title = soup.find('title')
206
  title_text = title.get_text().strip() if title else "No title"
207
 
208
  visible_text = soup.get_text(separator=' ', strip=True)
209
 
210
+ hidden_elements = extract_hidden_elements(html_content)
211
 
212
  scripts = soup.find_all('script')
213
  script_data = []
214
  for script in scripts:
215
  if script.string:
216
  script_content = script.string.strip()
217
+ if any(keyword in script_content.lower() for keyword in ['challenge', 'code', 'answer', 'hidden', 'console.log']):
218
+ script_data.append(f"Script data: {script_content[:300]}")
219
 
220
  # Look for meta tags
221
  meta_data = []
 
229
  'visible_text': visible_text[:2000],
230
  'hidden_elements': hidden_elements,
231
  'script_data': script_data,
232
+ 'meta_data': meta_data[:5],
233
+ 'console_logs': console_logs,
234
+ 'html': html_content
235
  }
236
 
237
  except Exception as e:
238
+ logger.error(f"Advanced scraping with console failed for {url}: {e}")
239
  return {}
240
 
241
  def analyze_content_intelligently(content: dict, question: str) -> str:
242
+ """Intelligent content analysis with console log support"""
243
  if not content:
244
  return "Unable to access page content"
245
 
246
+ # Strategy 1: Check console logs first for direct answers
247
+ console_logs = content.get('console_logs', [])
248
+ if console_logs:
249
+ logger.info(f"Found {len(console_logs)} console logs")
250
+ for log in console_logs:
251
+ if any(keyword in log.lower() for keyword in ['challenge', 'answer', 'code', 'name']):
252
+ # Extract potential answer from console log
253
+ parts = log.split(':')
254
+ if len(parts) > 1:
255
+ potential_answer = parts[-1].strip().strip('"').strip("'")
256
+ if len(potential_answer) > 2:
257
+ return potential_answer
258
+
259
+ # Strategy 2: Direct pattern matching for common questions
260
  if "challenge name" in question.lower():
261
  # Look in title first
262
  if content.get('title') and content['title'] != "No title":
263
  return content['title']
264
 
265
+ # Look in console logs
266
+ for log in console_logs:
267
+ if 'challenge' in log.lower() or 'name' in log.lower():
268
+ parts = log.split(':')
269
+ if len(parts) > 1:
270
+ return parts[-1].strip().strip('"').strip("'")
271
+
272
  # Look in hidden elements
273
  for element in content.get('hidden_elements', []):
274
  if 'challenge' in element.lower():
 
289
  if match:
290
  return match.group(1).strip()
291
 
292
+ # Strategy 3: Use LLM for complex analysis including console logs
293
  context_parts = []
294
 
295
  if content.get('title'):
 
298
  if content.get('visible_text'):
299
  context_parts.append(f"Text: {content['visible_text'][:800]}")
300
 
301
+ if console_logs:
302
+ context_parts.append(f"Console Logs: {'; '.join(console_logs[:5])}")
303
+
304
  if content.get('hidden_elements'):
305
  context_parts.append(f"Hidden: {'; '.join(content['hidden_elements'][:3])}")
306
 
 
312
  messages = [
313
  {
314
  "role": "system",
315
+ "content": "Extract the specific answer from webpage content including console logs. Be direct and concise. Focus on challenge names, codes, or specific elements requested. Console logs often contain the answer."
316
  },
317
  {
318
  "role": "user",
 
322
 
323
  llm_answer = call_llm(messages, max_tokens=50)
324
 
325
+ # Strategy 4: Fallback to first meaningful console log or hidden element
326
  if not llm_answer or len(llm_answer.strip()) < 3:
327
+ # Try console logs first
328
+ for log in console_logs:
329
+ if len(log.split(':')) > 1:
330
+ return log.split(':')[-1].strip()
331
+
332
+ # Then try hidden elements
333
  for element in content.get('hidden_elements', []):
334
  if len(element.split(':')) > 1:
335
  return element.split(':')[-1].strip()
 
338
 
339
  @app.post("/challenge", response_model=ChallengeResponse)
340
  async def solve_challenge(request: ChallengeRequest):
341
+ """Main endpoint to solve HackRx challenges with console log support"""
342
  logger.info(f"Received challenge request - URL: {request.url}")
343
  logger.info(f"Questions: {request.questions}")
344
 
 
348
  for question in request.questions:
349
  logger.info(f"Processing question: {question}")
350
 
351
+ # Scrape the page with console log extraction
352
+ page_content = advanced_scrape_with_console(request.url)
353
+
354
+ # Log console output for debugging
355
+ if page_content.get('console_logs'):
356
+ logger.info(f"Console logs found: {page_content['console_logs']}")
357
 
358
  # Analyze and get answer
359
  answer = analyze_content_intelligently(page_content, question)
 
369
 
370
@app.get("/health")
async def health_check():
    """Health check that also probes whether a Chrome driver can start.

    Starting (and immediately quitting) a real driver is the only reliable
    probe, but it is comparatively slow — avoid polling this endpoint
    aggressively.
    """
    selenium_available = False
    driver = None
    try:
        driver = get_chrome_driver()
        selenium_available = driver is not None
    except Exception as exc:
        # Was a bare `except: pass`, which also swallowed SystemExit /
        # KeyboardInterrupt and hid the failure reason. Narrow the catch and
        # log it so a broken Chrome install is diagnosable.
        logger.warning(f"Selenium health probe failed: {exc}")
    finally:
        # Quit in finally so the probe browser is released even if a later
        # step raises; the original only quit on the success path.
        if driver:
            try:
                driver.quit()
            except Exception:
                pass

    return {"status": "healthy", "selenium_available": selenium_available}
383
 
384
  @app.get("/")
385
  async def root():
386
  return {
387
+ "message": "HackRx Mission API - Ready for action with Console Log Support!",
388
+ "mode": "selenium-enhanced",
389
+ "features": [
390
+ "Console log extraction",
391
+ "3-second wait for dynamic content",
392
+ "Hidden element detection",
393
+ "JavaScript execution"
394
+ ],
395
  "endpoints": {
396
  "challenge": "/challenge (POST)",
397
  "health": "/health (GET)"