Rahul-Samedavar commited on
Commit
a4704d5
·
1 Parent(s): f865d99
Files changed (3) hide show
  1. .gitignore +2 -0
  2. Dockerfile +16 -0
  3. app.py +282 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ env
2
+ .env
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.9

# Run as a non-root user (uid 1000), as required by Hugging Face Spaces.
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Copy and install requirements first so this layer is cached
# across source-code-only changes.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
# Spaces expects the app to listen on port 7860.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
+ from typing import List
4
+ import requests
5
+ import base64
6
+ import json
7
+ import os
8
+ from bs4 import BeautifulSoup
9
+ import logging
10
+ import re
11
+ from selenium import webdriver
12
+ from selenium.webdriver.common.by import By
13
+ from selenium.webdriver.support.ui import WebDriverWait
14
+ from selenium.webdriver.support import expected_conditions as EC
15
+ from selenium.webdriver.chrome.options import Options
16
+ import time
17
+
18
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)  # module-level logger used by all handlers below

app = FastAPI(title="HackRx Mission API", version="1.0.0")
23
+
24
class ChallengeRequest(BaseModel):
    # Request payload for POST /challenge: the page to scrape plus the
    # questions to answer about it.
    url: str
    questions: List[str]
27
+
28
class ChallengeResponse(BaseModel):
    # Response payload: one answer string per question, in request order.
    answers: List[str]
30
+
31
# LLM API configuration
LLM_URL = "https://register.hackrx.in/llm/openai"
# Key is read from the environment; the literal fallback is a masked
# placeholder, not a working key — set SUBSCRIPTION_KEY in deployment.
SUBSCRIPTION_KEY = os.getenv("SUBSCRIPTION_KEY", "sk-****")
34
+
35
def call_llm(messages: List[dict], max_tokens: int = 150) -> str:
    """Call the hackathon LLM endpoint and return the completion text.

    Args:
        messages: Chat messages in OpenAI format ({"role": ..., "content": ...}).
        max_tokens: Completion length cap, kept small to conserve tokens.

    Returns:
        The assistant's reply text, or "" on any transport or parse failure
        (callers treat "" as "no answer" rather than crashing).
    """
    try:
        headers = {
            'Content-Type': 'application/json',
            'x-subscription-key': SUBSCRIPTION_KEY,
        }

        data = {
            "messages": messages,
            "model": "gpt-5-nano",
            "max_tokens": max_tokens,
            "temperature": 0.1,  # Low temperature for consistent responses
        }

        # Fix: the original call had no timeout, so a stalled endpoint could
        # hang the request thread forever. 30s matches the scraping timeout.
        response = requests.post(LLM_URL, headers=headers, json=data, timeout=30)
        response.raise_for_status()

        result = response.json()
        # Fix: guard against a present-but-empty "choices" list, which made
        # the original `result.get('choices', [{}])[0]` raise IndexError and
        # fall through to the error path with a misleading log message.
        choices = result.get('choices') or [{}]
        return choices[0].get('message', {}).get('content', '')

    except Exception as e:
        logger.error(f"LLM API call failed: {e}")
        return ""
59
+
60
def setup_selenium_driver():
    """Build a headless Chrome WebDriver, or return None if Chrome fails to start."""
    opts = Options()
    for flag in (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
    ):
        opts.add_argument(flag)

    try:
        return webdriver.Chrome(options=opts)
    except Exception as e:
        logger.error(f"Failed to setup selenium driver: {e}")
        return None
75
+
76
def extract_hidden_elements(html_content: str) -> List[str]:
    """Extract hidden content from HTML: hidden inputs, HTML comments,
    display:none elements, and data-code attributes.

    Args:
        html_content: Raw HTML source of the page.

    Returns:
        Human-readable "label: value" strings, one per hidden item found.
    """
    # Local import: only BeautifulSoup is imported at module level.
    from bs4 import Comment

    soup = BeautifulSoup(html_content, 'html.parser')
    hidden_elements = []

    # Hidden <input> fields with a value
    for inp in soup.find_all('input', {'type': 'hidden'}):
        if inp.get('value'):
            hidden_elements.append(f"Hidden input: {inp.get('name', 'unnamed')} = {inp.get('value')}")

    # Fix: BeautifulSoup parses comments into Comment nodes with the
    # "<!--"/"-->" delimiters stripped, so the original check
    # `'<!--' in text` never matched and comments were never extracted.
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        if comment.strip():
            hidden_elements.append(f"Comment: {comment.strip()}")

    # Elements hidden via an inline display:none style
    for div in soup.find_all(attrs={'style': re.compile(r'display\s*:\s*none', re.I)}):
        if div.get_text(strip=True):
            hidden_elements.append(f"Hidden element: {div.get_text(strip=True)}")

    # data-code attributes that might hold challenge codes
    for elem in soup.find_all(attrs={'data-code': True}):
        hidden_elements.append(f"Data code: {elem.get('data-code')}")

    return hidden_elements
105
+
106
def scrape_with_requests(url: str) -> dict:
    """Fetch *url* over plain HTTP and summarize its content.

    Returns a dict with 'title', truncated 'visible_text', 'hidden_elements'
    and raw 'html', or an empty dict when the fetch fails.
    """
    try:
        ua = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        resp = requests.get(url, headers=ua, timeout=30)
        resp.raise_for_status()

        html = resp.text
        soup = BeautifulSoup(html, 'html.parser')
        title_tag = soup.find('title')

        return {
            'title': title_tag.get_text() if title_tag else "No title",
            # Cap visible text to save LLM tokens downstream.
            'visible_text': soup.get_text(separator=' ', strip=True)[:2000],
            'hidden_elements': extract_hidden_elements(html),
            'html': html,
        }

    except Exception as e:
        logger.error(f"Request scraping failed for {url}: {e}")
        return {}
137
+
138
def scrape_with_selenium(url: str) -> dict:
    """Render *url* in headless Chrome and summarize the post-JavaScript content.

    Returns the same shape as scrape_with_requests plus 'clickable_elements',
    or an empty dict when the driver is unavailable or the page load fails.
    """
    driver = setup_selenium_driver()
    if driver is None:
        return {}

    try:
        driver.get(url)
        time.sleep(3)  # crude wait for client-side rendering to settle

        html = driver.page_source  # source after JavaScript execution
        soup = BeautifulSoup(html, 'html.parser')

        # Visible buttons the page exposes (possible interaction targets).
        visible_buttons = [
            f"Button: {btn.text}"
            for btn in driver.find_elements(By.TAG_NAME, "button")
            if btn.is_displayed()
        ]

        return {
            'title': driver.title,
            'visible_text': soup.get_text(separator=' ', strip=True)[:2000],
            'hidden_elements': extract_hidden_elements(html),
            'clickable_elements': visible_buttons,
            'html': html,
        }

    except Exception as e:
        logger.error(f"Selenium scraping failed for {url}: {e}")
        return {}

    finally:
        if driver:
            driver.quit()
181
+
182
def analyze_page_content(content: dict, question: str) -> str:
    """Ask the LLM *question* about the scraped page *content*.

    Returns the LLM's answer text, or a fixed message when *content* is empty.
    """
    if not content:
        return "Unable to access page content"

    # Assemble a compact context; every section is truncated to limit tokens.
    sections = []
    if content.get('title'):
        sections.append(f"Page Title: {content['title']}")
    if content.get('visible_text'):
        sections.append(f"Visible Text: {content['visible_text'][:800]}")
    if content.get('hidden_elements'):
        sections.append(f"Hidden Elements: {'; '.join(content['hidden_elements'][:5])}")
    if content.get('clickable_elements'):
        sections.append(f"Buttons: {'; '.join(content['clickable_elements'][:3])}")

    page_summary = "\n".join(sections)

    system_msg = {
        "role": "system",
        "content": "You are analyzing a webpage for a challenge. Be concise and direct in your answers. Look for challenge names, codes, or specific elements mentioned in the question.",
    }
    user_msg = {
        "role": "user",
        "content": f"Question: {question}\n\nPage Content:\n{page_summary}\n\nProvide a direct answer based on the page content.",
    }

    return call_llm([system_msg, user_msg], max_tokens=100)
216
+
217
@app.post("/challenge", response_model=ChallengeResponse)
async def solve_challenge(request: ChallengeRequest):
    """Main endpoint to solve HackRx challenges.

    Scrapes the requested URL (requests first, selenium fallback), asks the
    LLM each question about the scraped content, and applies heuristic
    fallbacks when the LLM yields no usable answer.

    Raises:
        HTTPException(500) if processing fails unexpectedly.
    """
    logger.info(f"Received challenge request - URL: {request.url}")
    logger.info(f"Questions: {request.questions}")

    answers = []

    try:
        # Fix: the original re-scraped the same URL once per question.
        # Fetch once with requests (fast path) and reuse for every question;
        # the selenium fallback is likewise cached so it runs at most once.
        requests_content = scrape_with_requests(request.url)
        selenium_content = None

        for question in request.questions:
            logger.info(f"Processing question: {question}")

            page_content = requests_content

            # Fall back to selenium when the static fetch failed, or the
            # question mentions hidden content that requests did not find.
            if not page_content or (not page_content.get('hidden_elements') and "hidden" in question.lower()):
                if selenium_content is None:
                    logger.info("Trying selenium for dynamic content...")
                    selenium_content = scrape_with_selenium(request.url)
                page_content = selenium_content

            # Analyze content with the LLM.
            answer = analyze_page_content(page_content, question)

            # Heuristic fallbacks when the LLM gives no usable answer.
            if not answer or len(answer.strip()) < 3:
                # 1) Look for challenge-related terms in hidden elements.
                for element in page_content.get('hidden_elements', []):
                    if any(term in element.lower() for term in ['challenge', 'name', 'code', 'hidden']):
                        answer = element.split(':')[-1].strip()
                        break

                # 2) For "challenge name" questions, fall back to the page title.
                if not answer and "challenge name" in question.lower() and page_content.get('title'):
                    answer = page_content['title']

            answers.append(answer.strip() if answer else "Challenge information not found")
            logger.info(f"Answer found: {answers[-1]}")

    except Exception as e:
        logger.error(f"Error processing challenge: {e}")
        raise HTTPException(status_code=500, detail=f"Challenge processing failed: {str(e)}")

    return ChallengeResponse(answers=answers)
263
+
264
@app.get("/health")
async def health_check():
    """Liveness probe: confirms the API process is up and responding."""
    payload = {"status": "healthy", "message": "HackRx Mission API is running"}
    return payload
268
+
269
@app.get("/")
async def root():
    """Describe the API and the endpoints it exposes."""
    endpoints = {
        "challenge": "/challenge (POST) - Main challenge solving endpoint",
        "health": "/health (GET) - Health check",
    }
    return {
        "message": "HackRx Mission API - Ready for action!",
        "endpoints": endpoints,
    }
279
+
280
if __name__ == "__main__":
    # Local development entry point on port 8000; the Docker image instead
    # launches uvicorn on port 7860 via its CMD.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)