Rahul-Samedavar committed on
Commit
68e7a9e
·
1 Parent(s): 74c7e34

fixed selenium

Browse files
Files changed (5) hide show
  1. Dockerfile +34 -11
  2. app.py +104 -131
  3. main.py +247 -0
  4. packages.txt +2 -0
  5. requirements.txt +7 -6
Dockerfile CHANGED
@@ -1,16 +1,39 @@
1
- # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
- # you will also find guides on how best to write your Dockerfile
3
 
4
- FROM python:3.9
 
 
 
 
 
 
5
 
6
- RUN useradd -m -u 1000 user
7
- USER user
8
- ENV PATH="/home/user/.local/bin:$PATH"
 
 
 
9
 
10
- WORKDIR /app
 
 
 
 
 
11
 
12
- COPY --chown=user ./requirements.txt requirements.txt
13
- RUN pip install --no-cache-dir --upgrade -r requirements.txt
14
 
15
- COPY --chown=user . /app
16
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11

# System tools needed to fetch and unpack Chrome and ChromeDriver.
RUN apt-get update && apt-get install -y \
    wget \
    gnupg \
    unzip \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Install Google Chrome (stable channel) from Google's apt repository.
# apt-key is deprecated; use an explicit signed-by keyring instead.
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor > /usr/share/keyrings/google-chrome.gpg \
    && echo "deb [signed-by=/usr/share/keyrings/google-chrome.gpg] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google.list \
    && apt-get update \
    && apt-get install -y google-chrome-stable \
    && rm -rf /var/lib/apt/lists/*

# Install a ChromeDriver that matches the installed Chrome.
# The old chromedriver.storage.googleapis.com/LATEST_RELEASE endpoint is frozen
# at Chrome 114, while google-chrome-stable is far newer — the previous recipe
# produced a browser/driver version mismatch. Chrome 115+ drivers ship on the
# Chrome for Testing CDN, versioned to match the browser exactly.
RUN CHROME_VERSION=$(google-chrome --version | awk '{print $3}') \
    && wget -O /tmp/chromedriver.zip "https://storage.googleapis.com/chrome-for-testing-public/${CHROME_VERSION}/linux64/chromedriver-linux64.zip" \
    && unzip /tmp/chromedriver.zip -d /tmp/ \
    && mv /tmp/chromedriver-linux64/chromedriver /usr/local/bin/chromedriver \
    && rm -rf /tmp/chromedriver.zip /tmp/chromedriver-linux64 \
    && chmod +x /usr/local/bin/chromedriver

# Set up the working directory
WORKDIR /code

# Copy requirements first so dependency layers cache independently of app code.
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Copy the application
COPY . /code/

# Expose port
EXPOSE 7860

# Command to run the application
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -8,12 +8,6 @@ import os
8
  from bs4 import BeautifulSoup
9
  import logging
10
  import re
11
- from selenium import webdriver
12
- from selenium.webdriver.common.by import By
13
- from selenium.webdriver.support.ui import WebDriverWait
14
- from selenium.webdriver.support import expected_conditions as EC
15
- from selenium.webdriver.chrome.options import Options
16
- import time
17
 
18
  # Configure logging
19
  logging.basicConfig(level=logging.INFO)
@@ -44,7 +38,7 @@ def call_llm(messages: List[dict], max_tokens: int = 150) -> str:
44
  "messages": messages,
45
  "model": "gpt-5-nano",
46
  "max_tokens": max_tokens,
47
- "temperature": 0.1 # Low temperature for consistent responses
48
  }
49
 
50
  response = requests.post(LLM_URL, headers=headers, json=data)
@@ -57,162 +51,164 @@ def call_llm(messages: List[dict], max_tokens: int = 150) -> str:
57
  logger.error(f"LLM API call failed: {e}")
58
  return ""
59
 
60
- def setup_selenium_driver():
61
- """Setup selenium driver with headless chrome"""
62
- chrome_options = Options()
63
- chrome_options.add_argument("--headless")
64
- chrome_options.add_argument("--no-sandbox")
65
- chrome_options.add_argument("--disable-dev-shm-usage")
66
- chrome_options.add_argument("--disable-gpu")
67
- chrome_options.add_argument("--window-size=1920,1080")
68
-
69
- try:
70
- driver = webdriver.Chrome(options=chrome_options)
71
- return driver
72
- except Exception as e:
73
- logger.error(f"Failed to setup selenium driver: {e}")
74
- return None
75
-
76
  def extract_hidden_elements(html_content: str) -> List[str]:
77
  """Extract hidden elements from HTML"""
78
  soup = BeautifulSoup(html_content, 'html.parser')
79
  hidden_elements = []
80
 
81
- # Look for hidden inputs, comments, and elements with display:none
82
  hidden_inputs = soup.find_all('input', {'type': 'hidden'})
83
  for inp in hidden_inputs:
84
  if inp.get('value'):
85
  hidden_elements.append(f"Hidden input: {inp.get('name', 'unnamed')} = {inp.get('value')}")
86
 
87
  # Look for HTML comments
88
- comments = soup.find_all(string=lambda text: isinstance(text, str) and '<!--' in text)
89
  for comment in comments:
90
- if comment.strip():
91
- hidden_elements.append(f"Comment: {comment.strip()}")
 
92
 
93
- # Look for elements with style="display:none" or hidden attribute
94
  hidden_divs = soup.find_all(attrs={'style': re.compile(r'display\s*:\s*none', re.I)})
95
  for div in hidden_divs:
96
- if div.get_text(strip=True):
97
- hidden_elements.append(f"Hidden element: {div.get_text(strip=True)}")
 
98
 
99
- # Look for data attributes that might contain codes
100
- elements_with_data = soup.find_all(attrs={'data-code': True})
101
  for elem in elements_with_data:
102
- hidden_elements.append(f"Data code: {elem.get('data-code')}")
 
 
103
 
104
  return hidden_elements
105
 
106
- def scrape_with_requests(url: str) -> dict:
107
- """Scrape webpage using requests"""
108
  try:
109
- headers = {
110
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
111
- }
112
- response = requests.get(url, headers=headers, timeout=30)
 
 
 
 
 
 
113
  response.raise_for_status()
114
 
115
  soup = BeautifulSoup(response.text, 'html.parser')
116
 
117
- # Extract basic info
118
  title = soup.find('title')
119
- title_text = title.get_text() if title else "No title"
120
 
121
- # Extract visible text
122
  visible_text = soup.get_text(separator=' ', strip=True)
123
 
124
  # Extract hidden elements
125
  hidden_elements = extract_hidden_elements(response.text)
126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  return {
128
  'title': title_text,
129
- 'visible_text': visible_text[:2000], # Limit text to save tokens
130
  'hidden_elements': hidden_elements,
 
 
131
  'html': response.text
132
  }
133
 
134
  except Exception as e:
135
- logger.error(f"Request scraping failed for {url}: {e}")
136
  return {}
137
 
138
- def scrape_with_selenium(url: str) -> dict:
139
- """Scrape webpage using selenium for dynamic content"""
140
- driver = setup_selenium_driver()
141
- if not driver:
142
- return {}
143
 
144
- try:
145
- driver.get(url)
146
- time.sleep(3) # Wait for page to load
147
-
148
- # Get page source after JavaScript execution
149
- html_content = driver.page_source
150
- soup = BeautifulSoup(html_content, 'html.parser')
151
-
152
- # Extract basic info
153
- title = driver.title
154
- visible_text = soup.get_text(separator=' ', strip=True)
155
 
156
- # Extract hidden elements
157
- hidden_elements = extract_hidden_elements(html_content)
 
 
 
 
158
 
159
- # Look for buttons or interactive elements
160
- buttons = driver.find_elements(By.TAG_NAME, "button")
161
- clickable_elements = []
162
- for btn in buttons:
163
- if btn.is_displayed():
164
- clickable_elements.append(f"Button: {btn.text}")
 
165
 
166
- return {
167
- 'title': title,
168
- 'visible_text': visible_text[:2000],
169
- 'hidden_elements': hidden_elements,
170
- 'clickable_elements': clickable_elements,
171
- 'html': html_content
172
- }
173
-
174
- except Exception as e:
175
- logger.error(f"Selenium scraping failed for {url}: {e}")
176
- return {}
177
 
178
- finally:
179
- if driver:
180
- driver.quit()
181
-
182
- def analyze_page_content(content: dict, question: str) -> str:
183
- """Use LLM to analyze page content and answer questions"""
184
- if not content:
185
- return "Unable to access page content"
186
-
187
- # Prepare context for LLM (keep it concise to save tokens)
188
  context_parts = []
189
 
190
  if content.get('title'):
191
- context_parts.append(f"Page Title: {content['title']}")
192
 
193
  if content.get('visible_text'):
194
- context_parts.append(f"Visible Text: {content['visible_text'][:800]}")
195
 
196
  if content.get('hidden_elements'):
197
- context_parts.append(f"Hidden Elements: {'; '.join(content['hidden_elements'][:5])}")
198
 
199
- if content.get('clickable_elements'):
200
- context_parts.append(f"Buttons: {'; '.join(content['clickable_elements'][:3])}")
201
 
202
  context = "\n".join(context_parts)
203
 
204
  messages = [
205
  {
206
  "role": "system",
207
- "content": "You are analyzing a webpage for a challenge. Be concise and direct in your answers. Look for challenge names, codes, or specific elements mentioned in the question."
208
  },
209
  {
210
  "role": "user",
211
- "content": f"Question: {question}\n\nPage Content:\n{context}\n\nProvide a direct answer based on the page content."
212
  }
213
  ]
214
 
215
- return call_llm(messages, max_tokens=100)
 
 
 
 
 
 
 
 
216
 
217
  @app.post("/challenge", response_model=ChallengeResponse)
218
  async def solve_challenge(request: ChallengeRequest):
@@ -220,42 +216,20 @@ async def solve_challenge(request: ChallengeRequest):
220
  logger.info(f"Received challenge request - URL: {request.url}")
221
  logger.info(f"Questions: {request.questions}")
222
 
223
- print("URL:", request.url)
224
  answers = []
225
 
226
  try:
227
  for question in request.questions:
228
  logger.info(f"Processing question: {question}")
229
 
230
- # First try with requests (faster)
231
- page_content = scrape_with_requests(request.url)
232
-
233
- # If requests fails or doesn't find enough info, try selenium
234
- if not page_content or (not page_content.get('hidden_elements') and "hidden" in question.lower()):
235
- logger.info("Trying selenium for dynamic content...")
236
- page_content = scrape_with_selenium(request.url)
237
 
238
- # Analyze content with LLM
239
- answer = analyze_page_content(page_content, question)
240
-
241
- # If no clear answer, try to extract from hidden elements directly
242
- if not answer or len(answer.strip()) < 3:
243
- if page_content.get('hidden_elements'):
244
- # Look for challenge-related terms
245
- for element in page_content['hidden_elements']:
246
- if any(term in element.lower() for term in ['challenge', 'name', 'code', 'hidden']):
247
- answer = element.split(':')[-1].strip()
248
- break
249
-
250
- if not answer and "challenge name" in question.lower():
251
- # Extract from title or visible text
252
- if page_content.get('title'):
253
- answer = page_content['title']
254
-
255
- print("Answers: ", answer)
256
 
257
- answers.append(answer.strip() if answer else "Challenge information not found")
258
- logger.info(f"Answer found: {answers[-1]}")
259
 
260
  except Exception as e:
261
  logger.error(f"Error processing challenge: {e}")
@@ -265,20 +239,19 @@ async def solve_challenge(request: ChallengeRequest):
265
 
266
  @app.get("/health")
267
  async def health_check():
268
- """Health check endpoint"""
269
- return {"status": "healthy", "message": "HackRx Mission API is running"}
270
 
271
  @app.get("/")
272
  async def root():
273
- """Root endpoint with API information"""
274
  return {
275
  "message": "HackRx Mission API - Ready for action!",
 
276
  "endpoints": {
277
- "challenge": "/challenge (POST) - Main challenge solving endpoint",
278
- "health": "/health (GET) - Health check"
279
  }
280
  }
281
 
282
  if __name__ == "__main__":
283
  import uvicorn
284
- uvicorn.run(app, host="0.0.0.0", port=8000)
 
8
  from bs4 import BeautifulSoup
9
  import logging
10
  import re
 
 
 
 
 
 
11
 
12
  # Configure logging
13
  logging.basicConfig(level=logging.INFO)
 
38
  "messages": messages,
39
  "model": "gpt-5-nano",
40
  "max_tokens": max_tokens,
41
+ "temperature": 0.1
42
  }
43
 
44
  response = requests.post(LLM_URL, headers=headers, json=data)
 
51
  logger.error(f"LLM API call failed: {e}")
52
  return ""
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  def extract_hidden_elements(html_content: str) -> List[str]:
55
  """Extract hidden elements from HTML"""
56
  soup = BeautifulSoup(html_content, 'html.parser')
57
  hidden_elements = []
58
 
59
+ # Look for hidden inputs
60
  hidden_inputs = soup.find_all('input', {'type': 'hidden'})
61
  for inp in hidden_inputs:
62
  if inp.get('value'):
63
  hidden_elements.append(f"Hidden input: {inp.get('name', 'unnamed')} = {inp.get('value')}")
64
 
65
  # Look for HTML comments
66
+ comments = soup.find_all(string=lambda text: isinstance(text, str) and text.strip().startswith('<!--'))
67
  for comment in comments:
68
+ clean_comment = comment.strip().replace('<!--', '').replace('-->', '').strip()
69
+ if clean_comment:
70
+ hidden_elements.append(f"Comment: {clean_comment}")
71
 
72
+ # Look for elements with display:none
73
  hidden_divs = soup.find_all(attrs={'style': re.compile(r'display\s*:\s*none', re.I)})
74
  for div in hidden_divs:
75
+ text = div.get_text(strip=True)
76
+ if text:
77
+ hidden_elements.append(f"Hidden element: {text}")
78
 
79
+ # Look for data attributes
80
+ elements_with_data = soup.find_all(attrs=lambda x: x and any(key.startswith('data-') for key in x.keys()))
81
  for elem in elements_with_data:
82
+ for attr, value in elem.attrs.items():
83
+ if attr.startswith('data-') and value:
84
+ hidden_elements.append(f"Data attribute {attr}: {value}")
85
 
86
  return hidden_elements
87
 
88
+ def advanced_scrape(url: str) -> dict:
89
+ """Enhanced scraping with better hidden element detection"""
90
  try:
91
+ session = requests.Session()
92
+ session.headers.update({
93
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
94
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
95
+ 'Accept-Language': 'en-US,en;q=0.5',
96
+ 'Accept-Encoding': 'gzip, deflate',
97
+ 'Connection': 'keep-alive'
98
+ })
99
+
100
+ response = session.get(url, timeout=30)
101
  response.raise_for_status()
102
 
103
  soup = BeautifulSoup(response.text, 'html.parser')
104
 
105
+ # Extract comprehensive information
106
  title = soup.find('title')
107
+ title_text = title.get_text().strip() if title else "No title"
108
 
109
+ # Get all text content
110
  visible_text = soup.get_text(separator=' ', strip=True)
111
 
112
  # Extract hidden elements
113
  hidden_elements = extract_hidden_elements(response.text)
114
 
115
+ # Look for scripts that might contain data
116
+ scripts = soup.find_all('script')
117
+ script_data = []
118
+ for script in scripts:
119
+ if script.string:
120
+ script_content = script.string.strip()
121
+ if any(keyword in script_content.lower() for keyword in ['challenge', 'code', 'answer', 'hidden']):
122
+ script_data.append(f"Script data: {script_content[:200]}")
123
+
124
+ # Look for meta tags
125
+ meta_data = []
126
+ meta_tags = soup.find_all('meta')
127
+ for meta in meta_tags:
128
+ if meta.get('content'):
129
+ meta_data.append(f"Meta {meta.get('name', 'unknown')}: {meta.get('content')}")
130
+
131
  return {
132
  'title': title_text,
133
+ 'visible_text': visible_text[:2000],
134
  'hidden_elements': hidden_elements,
135
+ 'script_data': script_data,
136
+ 'meta_data': meta_data[:5], # Limit meta data
137
  'html': response.text
138
  }
139
 
140
  except Exception as e:
141
+ logger.error(f"Advanced scraping failed for {url}: {e}")
142
  return {}
143
 
144
+ def analyze_content_intelligently(content: dict, question: str) -> str:
145
+ """Intelligent content analysis with multiple strategies"""
146
+ if not content:
147
+ return "Unable to access page content"
 
148
 
149
+ # Strategy 1: Direct pattern matching for common questions
150
+ if "challenge name" in question.lower():
151
+ # Look in title first
152
+ if content.get('title') and content['title'] != "No title":
153
+ return content['title']
 
 
 
 
 
 
154
 
155
+ # Look in hidden elements
156
+ for element in content.get('hidden_elements', []):
157
+ if 'challenge' in element.lower():
158
+ parts = element.split(':')
159
+ if len(parts) > 1:
160
+ return parts[-1].strip().strip('"').strip("'")
161
 
162
+ # Look in visible text for patterns
163
+ visible = content.get('visible_text', '')
164
+ challenge_patterns = [
165
+ r'challenge[:\s]+([^.\n]+)',
166
+ r'name[:\s]+([^.\n]+)',
167
+ r'title[:\s]+([^.\n]+)'
168
+ ]
169
 
170
+ for pattern in challenge_patterns:
171
+ match = re.search(pattern, visible, re.IGNORECASE)
172
+ if match:
173
+ return match.group(1).strip()
 
 
 
 
 
 
 
174
 
175
+ # Strategy 2: Use LLM for complex analysis
 
 
 
 
 
 
 
 
 
176
  context_parts = []
177
 
178
  if content.get('title'):
179
+ context_parts.append(f"Title: {content['title']}")
180
 
181
  if content.get('visible_text'):
182
+ context_parts.append(f"Text: {content['visible_text'][:800]}")
183
 
184
  if content.get('hidden_elements'):
185
+ context_parts.append(f"Hidden: {'; '.join(content['hidden_elements'][:3])}")
186
 
187
+ if content.get('script_data'):
188
+ context_parts.append(f"Scripts: {'; '.join(content['script_data'][:2])}")
189
 
190
  context = "\n".join(context_parts)
191
 
192
  messages = [
193
  {
194
  "role": "system",
195
+ "content": "Extract the specific answer from webpage content. Be direct and concise. Focus on challenge names, codes, or specific elements requested."
196
  },
197
  {
198
  "role": "user",
199
+ "content": f"Question: {question}\n\nContent:\n{context}\n\nAnswer:"
200
  }
201
  ]
202
 
203
+ llm_answer = call_llm(messages, max_tokens=50)
204
+
205
+ # Strategy 3: Fallback to first meaningful hidden element
206
+ if not llm_answer or len(llm_answer.strip()) < 3:
207
+ for element in content.get('hidden_elements', []):
208
+ if len(element.split(':')) > 1:
209
+ return element.split(':')[-1].strip()
210
+
211
+ return llm_answer.strip() if llm_answer else "Information not found"
212
 
213
  @app.post("/challenge", response_model=ChallengeResponse)
214
  async def solve_challenge(request: ChallengeRequest):
 
216
  logger.info(f"Received challenge request - URL: {request.url}")
217
  logger.info(f"Questions: {request.questions}")
218
 
 
219
  answers = []
220
 
221
  try:
222
  for question in request.questions:
223
  logger.info(f"Processing question: {question}")
224
 
225
+ # Scrape the page
226
+ page_content = advanced_scrape(request.url)
 
 
 
 
 
227
 
228
+ # Analyze and get answer
229
+ answer = analyze_content_intelligently(page_content, question)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
 
231
+ answers.append(answer)
232
+ logger.info(f"Answer found: {answer}")
233
 
234
  except Exception as e:
235
  logger.error(f"Error processing challenge: {e}")
 
239
 
240
  @app.get("/health")
241
  async def health_check():
242
+ return {"status": "healthy", "selenium_available": False}
 
243
 
244
  @app.get("/")
245
  async def root():
 
246
  return {
247
  "message": "HackRx Mission API - Ready for action!",
248
+ "mode": "requests-only",
249
  "endpoints": {
250
+ "challenge": "/challenge (POST)",
251
+ "health": "/health (GET)"
252
  }
253
  }
254
 
255
  if __name__ == "__main__":
256
  import uvicorn
257
+ uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 8000)))
main.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
+ from typing import List
4
+ import requests
5
+ import base64
6
+ import json
7
+ import os
8
+ from bs4 import BeautifulSoup
9
+ import logging
10
+ import re
11
+
12
+ logging.basicConfig(level=logging.INFO)
13
+ logger = logging.getLogger(__name__)
14
+
15
+ app = FastAPI(title="HackRx Mission API", version="1.0.0")
16
+
17
class ChallengeRequest(BaseModel):
    """Request payload: the page URL to scrape and the questions to answer."""
    url: str
    questions: List[str]
20
+
21
class ChallengeResponse(BaseModel):
    """Response payload: one answer string per incoming question."""
    answers: List[str]
23
+
24
+ LLM_URL = "https://register.hackrx.in/llm/openai"
25
+ SUBSCRIPTION_KEY = os.getenv("SUBSCRIPTION_KEY", "sk-****")
26
+
27
def call_llm(messages: List[dict], max_tokens: int = 150) -> str:
    """Call the LLM API with token optimization.

    Args:
        messages: Chat messages in OpenAI format ({"role": ..., "content": ...}).
        max_tokens: Completion-length cap forwarded to the API.

    Returns:
        The generated text, or "" when the request or response parsing fails
        (callers treat "" as "no answer from the LLM").
    """
    try:
        headers = {
            'Content-Type': 'application/json',
            'x-subscription-key': SUBSCRIPTION_KEY
        }

        data = {
            "messages": messages,
            "model": "gpt-5-nano",
            "max_tokens": max_tokens,
            "temperature": 0.1
        }

        # Explicit timeout: without one, a stalled LLM gateway would hang the
        # request (and the /challenge endpoint) indefinitely.
        response = requests.post(LLM_URL, headers=headers, json=data, timeout=60)
        response.raise_for_status()

        result = response.json()
        # Defensive chained .get() so a malformed payload yields "" not a KeyError.
        return result.get('choices', [{}])[0].get('message', {}).get('content', '')

    except Exception as e:
        logger.error(f"LLM API call failed: {e}")
        return ""
51
+
52
def extract_hidden_elements(html_content: str) -> List[str]:
    """Extract hidden elements from HTML.

    Collects hidden <input> values, HTML comments, inline display:none
    elements, and data-* attributes, each formatted as a "label: value" string.

    Args:
        html_content: Raw HTML to scan.

    Returns:
        A list of human-readable descriptions of hidden content.
    """
    from bs4 import Comment  # node type for real HTML-comment detection

    soup = BeautifulSoup(html_content, 'html.parser')
    hidden_elements = []

    # Hidden form inputs often carry challenge codes.
    hidden_inputs = soup.find_all('input', {'type': 'hidden'})
    for inp in hidden_inputs:
        if inp.get('value'):
            hidden_elements.append(f"Hidden input: {inp.get('name', 'unnamed')} = {inp.get('value')}")

    # HTML comments: BeautifulSoup strips the <!-- --> delimiters and yields
    # Comment nodes, so the previous check for a literal '<!--' prefix never
    # matched anything. Match on the node type instead.
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    for comment in comments:
        clean_comment = comment.strip()
        if clean_comment:
            hidden_elements.append(f"Comment: {clean_comment}")

    # Elements hidden with an inline display:none style.
    hidden_divs = soup.find_all(attrs={'style': re.compile(r'display\s*:\s*none', re.I)})
    for div in hidden_divs:
        text = div.get_text(strip=True)
        if text:
            hidden_elements.append(f"Hidden element: {text}")

    # data-* attributes: find_all(attrs=<callable>) is not a supported bs4
    # filter (a non-dict attrs value is treated as a class filter and breaks
    # on tags that have a class attribute), so walk every tag and inspect
    # its attribute dict directly.
    for elem in soup.find_all(True):
        for attr, value in elem.attrs.items():
            if attr.startswith('data-') and value:
                hidden_elements.append(f"Data attribute {attr}: {value}")

    return hidden_elements
81
+
82
def advanced_scrape(url: str) -> dict:
    """Fetch *url* with browser-like headers and extract page signals.

    Returns a dict with the page title, truncated visible text, hidden
    elements, keyword-matching inline-script snippets, meta-tag contents,
    and the raw HTML. Returns {} on any failure.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }

    try:
        session = requests.Session()
        session.headers.update(browser_headers)

        resp = session.get(url, timeout=30)
        resp.raise_for_status()

        page = BeautifulSoup(resp.text, 'html.parser')

        # Title (falls back to a sentinel so callers can test for "no title").
        title_tag = page.find('title')
        page_title = title_tag.get_text().strip() if title_tag else "No title"

        # Full visible text, flattened to a single space-separated string.
        all_text = page.get_text(separator=' ', strip=True)

        hidden = extract_hidden_elements(resp.text)

        # Inline scripts mentioning challenge-ish keywords (first 200 chars each).
        keywords = ('challenge', 'code', 'answer', 'hidden')
        script_snippets = []
        for tag in page.find_all('script'):
            if not tag.string:
                continue
            body = tag.string.strip()
            if any(word in body.lower() for word in keywords):
                script_snippets.append(f"Script data: {body[:200]}")

        # Meta tags that carry a content attribute.
        meta_entries = [
            f"Meta {tag.get('name', 'unknown')}: {tag.get('content')}"
            for tag in page.find_all('meta')
            if tag.get('content')
        ]

        return {
            'title': page_title,
            'visible_text': all_text[:2000],
            'hidden_elements': hidden,
            'script_data': script_snippets,
            'meta_data': meta_entries[:5],  # Limit meta data
            'html': resp.text
        }

    except Exception as e:
        logger.error(f"Advanced scraping failed for {url}: {e}")
        return {}
133
+
134
def analyze_content_intelligently(content: dict, question: str) -> str:
    """Answer *question* from scraped *content* using layered strategies.

    Order of attack: direct pattern matching for "challenge name" questions,
    then an LLM call over a compact context, then a fallback to the first
    colon-separated hidden element. Returns "Information not found" when
    every strategy comes up empty.
    """
    if not content:
        return "Unable to access page content"

    # Strategy 1: Direct pattern matching for common questions
    if "challenge name" in question.lower():
        # The page title is the most likely place for the name.
        page_title = content.get('title')
        if page_title and page_title != "No title":
            return page_title

        # Next, any hidden element mentioning "challenge".
        for element in content.get('hidden_elements', []):
            if 'challenge' not in element.lower():
                continue
            pieces = element.split(':')
            if len(pieces) > 1:
                return pieces[-1].strip().strip('"').strip("'")

        # Finally, regex patterns over the visible text.
        visible = content.get('visible_text', '')
        for pattern in (r'challenge[:\s]+([^.\n]+)',
                        r'name[:\s]+([^.\n]+)',
                        r'title[:\s]+([^.\n]+)'):
            found = re.search(pattern, visible, re.IGNORECASE)
            if found:
                return found.group(1).strip()

    # Strategy 2: Use LLM for complex analysis over a compact context.
    context_parts = []
    if content.get('title'):
        context_parts.append(f"Title: {content['title']}")
    if content.get('visible_text'):
        context_parts.append(f"Text: {content['visible_text'][:800]}")
    if content.get('hidden_elements'):
        context_parts.append(f"Hidden: {'; '.join(content['hidden_elements'][:3])}")
    if content.get('script_data'):
        context_parts.append(f"Scripts: {'; '.join(content['script_data'][:2])}")

    context = "\n".join(context_parts)

    messages = [
        {
            "role": "system",
            "content": "Extract the specific answer from webpage content. Be direct and concise. Focus on challenge names, codes, or specific elements requested."
        },
        {
            "role": "user",
            "content": f"Question: {question}\n\nContent:\n{context}\n\nAnswer:"
        }
    ]

    llm_answer = call_llm(messages, max_tokens=50)

    # Strategy 3: Fallback to the first "label: value" hidden element.
    if not llm_answer or len(llm_answer.strip()) < 3:
        for element in content.get('hidden_elements', []):
            if len(element.split(':')) > 1:
                return element.split(':')[-1].strip()

    return llm_answer.strip() if llm_answer else "Information not found"
202
+
203
+ @app.post("/challenge", response_model=ChallengeResponse)
204
+ async def solve_challenge(request: ChallengeRequest):
205
+ """Main endpoint to solve HackRx challenges"""
206
+ logger.info(f"Received challenge request - URL: {request.url}")
207
+ logger.info(f"Questions: {request.questions}")
208
+
209
+ answers = []
210
+
211
+ try:
212
+ for question in request.questions:
213
+ logger.info(f"Processing question: {question}")
214
+
215
+ # Scrape the page
216
+ page_content = advanced_scrape(request.url)
217
+
218
+ # Analyze and get answer
219
+ answer = analyze_content_intelligently(page_content, question)
220
+
221
+ answers.append(answer)
222
+ logger.info(f"Answer found: {answer}")
223
+
224
+ except Exception as e:
225
+ logger.error(f"Error processing challenge: {e}")
226
+ raise HTTPException(status_code=500, detail=f"Challenge processing failed: {str(e)}")
227
+
228
+ return ChallengeResponse(answers=answers)
229
+
230
+ @app.get("/health")
231
+ async def health_check():
232
+ return {"status": "healthy", "selenium_available": False}
233
+
234
+ @app.get("/")
235
+ async def root():
236
+ return {
237
+ "message": "HackRx Mission API - Ready for action!",
238
+ "mode": "requests-only",
239
+ "endpoints": {
240
+ "challenge": "/challenge (POST)",
241
+ "health": "/health (GET)"
242
+ }
243
+ }
244
+
245
if __name__ == "__main__":
    # Local development entry point; in the container uvicorn is started by
    # the Dockerfile CMD instead.
    import uvicorn

    serve_port = int(os.getenv("PORT", 8000))
    uvicorn.run(app, host="0.0.0.0", port=serve_port)
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ chromium
2
+ chromium-driver
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
- fastapi
2
- uvicorn
3
- requests
4
- beautifulsoup4
5
- selenium
6
- pydantic
 
 
1
+ fastapi==0.104.1
2
+ uvicorn[standard]==0.24.0
3
+ requests==2.31.0
4
+ beautifulsoup4==4.12.2
5
+ pydantic==2.5.0
6
+ selenium==4.15.0
7
+ python-multipart==0.0.6