Rahul-Samedavar commited on
Commit
a4704d5
·
1 Parent(s): f865d99
Files changed (3) hide show
  1. .gitignore +2 -0
  2. Dockerfile +16 -0
  3. app.py +282 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ env
2
+ .env
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.9

# Run as a non-root user (uid 1000), as required by Hugging Face Spaces.
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Copy and install requirements first so this layer is cached
# across source-code-only changes.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
# Spaces expects the app to listen on port 7860.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
+ from typing import List
4
+ import requests
5
+ import base64
6
+ import json
7
+ import os
8
+ from bs4 import BeautifulSoup
9
+ import logging
10
+ import re
11
+ from selenium import webdriver
12
+ from selenium.webdriver.common.by import By
13
+ from selenium.webdriver.support.ui import WebDriverWait
14
+ from selenium.webdriver.support import expected_conditions as EC
15
+ from selenium.webdriver.chrome.options import Options
16
+ import time
17
+
18
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)  # module-level logger used by all handlers below

app = FastAPI(title="HackRx Mission API", version="1.0.0")
23
+
24
class ChallengeRequest(BaseModel):
    # Request payload for POST /challenge: the page to scrape plus the
    # questions to answer about it.
    url: str
    questions: List[str]
27
+
28
class ChallengeResponse(BaseModel):
    # Response payload: one answer string per question, in request order.
    answers: List[str]
30
+
31
# LLM API configuration
LLM_URL = "https://register.hackrx.in/llm/openai"
# Key is read from the environment; the literal fallback is a masked
# placeholder, not a working key — set SUBSCRIPTION_KEY in deployment.
SUBSCRIPTION_KEY = os.getenv("SUBSCRIPTION_KEY", "sk-****")
34
+
35
def call_llm(messages: List[dict], max_tokens: int = 150) -> str:
    """Call the hackathon LLM endpoint and return the completion text.

    Args:
        messages: Chat messages in OpenAI format ({"role": ..., "content": ...}).
        max_tokens: Completion length cap, kept small to conserve tokens.

    Returns:
        The assistant's reply text, or "" on any transport or parse failure
        (callers treat "" as "no answer" rather than crashing).
    """
    try:
        headers = {
            'Content-Type': 'application/json',
            'x-subscription-key': SUBSCRIPTION_KEY,
        }

        data = {
            "messages": messages,
            "model": "gpt-5-nano",
            "max_tokens": max_tokens,
            "temperature": 0.1,  # Low temperature for consistent responses
        }

        # Fix: the original call had no timeout, so a stalled endpoint could
        # hang the request thread forever. 30s matches the scraping timeout.
        response = requests.post(LLM_URL, headers=headers, json=data, timeout=30)
        response.raise_for_status()

        result = response.json()
        # Fix: guard against a present-but-empty "choices" list, which made
        # the original `result.get('choices', [{}])[0]` raise IndexError and
        # fall through to the error path with a misleading log message.
        choices = result.get('choices') or [{}]
        return choices[0].get('message', {}).get('content', '')

    except Exception as e:
        logger.error(f"LLM API call failed: {e}")
        return ""
59
+
60
def setup_selenium_driver():
    """Build a headless Chrome WebDriver, or return None if Chrome fails to start."""
    opts = Options()
    for flag in (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
    ):
        opts.add_argument(flag)

    try:
        return webdriver.Chrome(options=opts)
    except Exception as e:
        logger.error(f"Failed to setup selenium driver: {e}")
        return None
75
+
76
def extract_hidden_elements(html_content: str) -> List[str]:
    """Extract hidden content from HTML: hidden inputs, HTML comments,
    display:none elements, and data-code attributes.

    Args:
        html_content: Raw HTML source of the page.

    Returns:
        Human-readable "label: value" strings, one per hidden item found.
    """
    # Local import: only BeautifulSoup is imported at module level.
    from bs4 import Comment

    soup = BeautifulSoup(html_content, 'html.parser')
    hidden_elements = []

    # Hidden <input> fields with a value
    for inp in soup.find_all('input', {'type': 'hidden'}):
        if inp.get('value'):
            hidden_elements.append(f"Hidden input: {inp.get('name', 'unnamed')} = {inp.get('value')}")

    # Fix: BeautifulSoup parses comments into Comment nodes with the
    # "<!--"/"-->" delimiters stripped, so the original check
    # `'<!--' in text` never matched and comments were never extracted.
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        if comment.strip():
            hidden_elements.append(f"Comment: {comment.strip()}")

    # Elements hidden via an inline display:none style
    for div in soup.find_all(attrs={'style': re.compile(r'display\s*:\s*none', re.I)}):
        if div.get_text(strip=True):
            hidden_elements.append(f"Hidden element: {div.get_text(strip=True)}")

    # data-code attributes that might hold challenge codes
    for elem in soup.find_all(attrs={'data-code': True}):
        hidden_elements.append(f"Data code: {elem.get('data-code')}")

    return hidden_elements
105
+
106
def scrape_with_requests(url: str) -> dict:
    """Fetch *url* over plain HTTP and summarize its content.

    Returns a dict with 'title', truncated 'visible_text', 'hidden_elements'
    and raw 'html', or an empty dict when the fetch fails.
    """
    try:
        ua = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        resp = requests.get(url, headers=ua, timeout=30)
        resp.raise_for_status()

        html = resp.text
        soup = BeautifulSoup(html, 'html.parser')
        title_tag = soup.find('title')

        return {
            'title': title_tag.get_text() if title_tag else "No title",
            # Cap visible text to save LLM tokens downstream.
            'visible_text': soup.get_text(separator=' ', strip=True)[:2000],
            'hidden_elements': extract_hidden_elements(html),
            'html': html,
        }

    except Exception as e:
        logger.error(f"Request scraping failed for {url}: {e}")
        return {}
137
+
138
def scrape_with_selenium(url: str) -> dict:
    """Render *url* in headless Chrome and summarize the post-JavaScript content.

    Returns the same shape as scrape_with_requests plus 'clickable_elements',
    or an empty dict when the driver is unavailable or the page load fails.
    """
    driver = setup_selenium_driver()
    if driver is None:
        return {}

    try:
        driver.get(url)
        time.sleep(3)  # crude wait for client-side rendering to settle

        html = driver.page_source  # source after JavaScript execution
        soup = BeautifulSoup(html, 'html.parser')

        # Visible buttons the page exposes (possible interaction targets).
        visible_buttons = [
            f"Button: {btn.text}"
            for btn in driver.find_elements(By.TAG_NAME, "button")
            if btn.is_displayed()
        ]

        return {
            'title': driver.title,
            'visible_text': soup.get_text(separator=' ', strip=True)[:2000],
            'hidden_elements': extract_hidden_elements(html),
            'clickable_elements': visible_buttons,
            'html': html,
        }

    except Exception as e:
        logger.error(f"Selenium scraping failed for {url}: {e}")
        return {}

    finally:
        if driver:
            driver.quit()
181
+
182
def analyze_page_content(content: dict, question: str) -> str:
    """Ask the LLM *question* about the scraped page *content*.

    Returns the LLM's answer text, or a fixed message when *content* is empty.
    """
    if not content:
        return "Unable to access page content"

    # Assemble a compact context; every section is truncated to limit tokens.
    sections = []
    if content.get('title'):
        sections.append(f"Page Title: {content['title']}")
    if content.get('visible_text'):
        sections.append(f"Visible Text: {content['visible_text'][:800]}")
    if content.get('hidden_elements'):
        sections.append(f"Hidden Elements: {'; '.join(content['hidden_elements'][:5])}")
    if content.get('clickable_elements'):
        sections.append(f"Buttons: {'; '.join(content['clickable_elements'][:3])}")

    page_summary = "\n".join(sections)

    system_msg = {
        "role": "system",
        "content": "You are analyzing a webpage for a challenge. Be concise and direct in your answers. Look for challenge names, codes, or specific elements mentioned in the question.",
    }
    user_msg = {
        "role": "user",
        "content": f"Question: {question}\n\nPage Content:\n{page_summary}\n\nProvide a direct answer based on the page content.",
    }

    return call_llm([system_msg, user_msg], max_tokens=100)
216
+
217
@app.post("/challenge", response_model=ChallengeResponse)
async def solve_challenge(request: ChallengeRequest):
    """Main endpoint to solve HackRx challenges.

    Scrapes the requested URL (requests first, selenium fallback), asks the
    LLM each question about the scraped content, and applies heuristic
    fallbacks when the LLM yields no usable answer.

    Raises:
        HTTPException(500) if processing fails unexpectedly.
    """
    logger.info(f"Received challenge request - URL: {request.url}")
    logger.info(f"Questions: {request.questions}")

    answers = []

    try:
        # Fix: the original re-scraped the same URL once per question.
        # Fetch once with requests (fast path) and reuse for every question;
        # the selenium fallback is likewise cached so it runs at most once.
        requests_content = scrape_with_requests(request.url)
        selenium_content = None

        for question in request.questions:
            logger.info(f"Processing question: {question}")

            page_content = requests_content

            # Fall back to selenium when the static fetch failed, or the
            # question mentions hidden content that requests did not find.
            if not page_content or (not page_content.get('hidden_elements') and "hidden" in question.lower()):
                if selenium_content is None:
                    logger.info("Trying selenium for dynamic content...")
                    selenium_content = scrape_with_selenium(request.url)
                page_content = selenium_content

            # Analyze content with the LLM.
            answer = analyze_page_content(page_content, question)

            # Heuristic fallbacks when the LLM gives no usable answer.
            if not answer or len(answer.strip()) < 3:
                # 1) Look for challenge-related terms in hidden elements.
                for element in page_content.get('hidden_elements', []):
                    if any(term in element.lower() for term in ['challenge', 'name', 'code', 'hidden']):
                        answer = element.split(':')[-1].strip()
                        break

                # 2) For "challenge name" questions, fall back to the page title.
                if not answer and "challenge name" in question.lower() and page_content.get('title'):
                    answer = page_content['title']

            answers.append(answer.strip() if answer else "Challenge information not found")
            logger.info(f"Answer found: {answers[-1]}")

    except Exception as e:
        logger.error(f"Error processing challenge: {e}")
        raise HTTPException(status_code=500, detail=f"Challenge processing failed: {str(e)}")

    return ChallengeResponse(answers=answers)
263
+
264
@app.get("/health")
async def health_check():
    """Liveness probe: confirms the API process is up and responding."""
    payload = {"status": "healthy", "message": "HackRx Mission API is running"}
    return payload
268
+
269
@app.get("/")
async def root():
    """Describe the API and the endpoints it exposes."""
    endpoints = {
        "challenge": "/challenge (POST) - Main challenge solving endpoint",
        "health": "/health (GET) - Health check",
    }
    return {
        "message": "HackRx Mission API - Ready for action!",
        "endpoints": endpoints,
    }
279
+
280
if __name__ == "__main__":
    # Local development entry point on port 8000; the Docker image instead
    # launches uvicorn on port 7860 via its CMD.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)