Madras1 committed on
Commit
4c22969
·
verified ·
1 Parent(s): 0407e5c

Upload 59 files

Browse files
Files changed (3) hide show
  1. Dockerfile +1 -28
  2. app/agents/browser_agent_stealth.py +205 -249
  3. pyproject.toml +0 -4
Dockerfile CHANGED
@@ -2,30 +2,9 @@ FROM python:3.11-slim
2
 
3
  WORKDIR /app
4
 
5
- # Install system dependencies (including Playwright/Camoufox deps)
6
  RUN apt-get update && apt-get install -y --no-install-recommends \
7
  build-essential \
8
- curl \
9
- wget \
10
- # Playwright/Camoufox browser dependencies
11
- libnss3 \
12
- libnspr4 \
13
- libatk1.0-0 \
14
- libatk-bridge2.0-0 \
15
- libcups2 \
16
- libdrm2 \
17
- libdbus-1-3 \
18
- libxkbcommon0 \
19
- libxcomposite1 \
20
- libxdamage1 \
21
- libxfixes3 \
22
- libxrandr2 \
23
- libgbm1 \
24
- libasound2 \
25
- libpango-1.0-0 \
26
- libcairo2 \
27
- libatspi2.0-0 \
28
- libgtk-3-0 \
29
  && rm -rf /var/lib/apt/lists/*
30
 
31
  # Copy all files first
@@ -45,14 +24,8 @@ RUN pip install --no-cache-dir \
45
  numpy \
46
  sentence-transformers \
47
  e2b-desktop>=2.2.0 \
48
- # Camoufox stealth browser
49
- camoufox[geoip] \
50
- playwright \
51
  && pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu
52
 
53
- # Download Camoufox browser
54
- RUN camoufox fetch
55
-
56
  # HuggingFace Spaces uses port 7860
57
  EXPOSE 7860
58
 
 
2
 
3
  WORKDIR /app
4
 
5
+ # Install system dependencies
6
  RUN apt-get update && apt-get install -y --no-install-recommends \
7
  build-essential \
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  && rm -rf /var/lib/apt/lists/*
9
 
10
  # Copy all files first
 
24
  numpy \
25
  sentence-transformers \
26
  e2b-desktop>=2.2.0 \
 
 
 
27
  && pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu
28
 
 
 
 
29
  # HuggingFace Spaces uses port 7860
30
  EXPOSE 7860
31
 
app/agents/browser_agent_stealth.py CHANGED
@@ -1,297 +1,253 @@
1
  """Browser Agent with Camoufox - Stealth Firefox for anti-bot bypass.
2
 
 
 
3
  Camoufox is a modified Firefox that:
4
  - Generates unique device fingerprints
5
  - Hides automation markers
6
  - Passes bot detection (Cloudflare, etc)
7
-
8
- Requires: pip install camoufox[geoip] playwright
9
- Then: camoufox fetch (downloads the stealth browser)
10
  """
11
 
12
  import os
13
  import logging
 
14
  import time
15
  from typing import AsyncGenerator, Optional
16
 
17
  from app.config import get_settings
18
- from app.agents.graph.simple_agent import (
19
- SimpleState,
20
- think_and_act,
21
- generate_final_response,
22
- )
23
  from app.agents.llm_client import generate_completion
24
 
25
  logger = logging.getLogger(__name__)
26
 
27
 
28
- class CamoufoxState(SimpleState):
29
- """State for Camoufox-based agent (uses Playwright page instead of E2B desktop)."""
30
-
31
- def __init__(self, task: str, url: Optional[str], page, timeout: float = 300):
32
- # We pass None for desktop, will use page directly
33
- super().__init__(task, url, desktop=None, timeout=timeout)
34
- self.page = page # Playwright page
35
-
36
-
37
  async def run_browser_agent_stealth(
38
  task: str,
39
  url: Optional[str] = None,
40
  ) -> AsyncGenerator[dict, None]:
41
- """Run browser agent with Camoufox stealth browser."""
42
  settings = get_settings()
43
 
44
- yield {"type": "status", "message": "🦊 Initializing stealth browser..."}
45
-
46
- try:
47
- from camoufox.async_api import AsyncCamoufox
48
- except ImportError:
49
- yield {"type": "error", "message": "Camoufox not installed. Run: pip install camoufox[geoip] && camoufox fetch"}
50
  return
51
 
52
- browser = None
 
 
53
 
54
  try:
55
- yield {"type": "status", "message": "🛡️ Starting anti-detect Firefox..."}
56
 
57
- async with AsyncCamoufox(headless=True) as browser:
58
- page = await browser.new_page()
59
-
60
- # Initialize state
61
- state = CamoufoxState(
62
- task=task,
63
- url=url,
64
- page=page,
65
- timeout=300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  )
 
67
 
68
- # Go to initial URL or search
69
- start_url = url or "https://html.duckduckgo.com/html/"
70
- yield {"type": "status", "message": f"🌐 Navigating to {start_url[:50]}..."}
71
-
72
- try:
73
- await page.goto(start_url, timeout=30000)
74
- await page.wait_for_timeout(2000)
75
-
76
- # Extract initial content
77
- content = await _extract_page_content(page)
78
- state.add_page(start_url, content)
79
- except Exception as e:
80
- logger.warning(f"Initial navigation failed: {e}")
81
-
82
- cycle = 0
83
- max_cycles = 15
84
-
85
- # Main loop
86
- while state.should_continue() and cycle < max_cycles:
87
- cycle += 1
88
- remaining = int(state.remaining())
89
-
90
- yield {"type": "status", "message": f"🤔 Analyzing... (cycle {cycle}, {remaining}s left)"}
91
-
92
- # Think and decide
93
- action, params = await think_and_act_camoufox(state)
94
-
95
- # Execute action
96
- if action == "complete":
97
- state.final_result = params.get("result", "")
98
- state.done = True
99
- break
100
-
101
- elif action == "search":
102
- query = params.get("query", task)
103
- search_url = f"https://html.duckduckgo.com/html/?q={query.replace(' ', '+')}"
104
-
105
- yield {"type": "status", "message": f"🔍 Searching: {query[:40]}..."}
106
-
107
- # Check cache
108
- if search_url in state.visited_urls:
109
- continue
110
-
111
- try:
112
- await page.goto(search_url, timeout=20000)
113
- await page.wait_for_timeout(2000)
114
- content = await _extract_page_content(page)
115
- state.add_page(search_url, content)
116
- state.action_history.append(f"search:{query[:30]}")
117
- except Exception as e:
118
- logger.warning(f"Search failed: {e}")
119
- state.action_history.append(f"search(fail):{query[:20]}")
120
-
121
- elif action == "navigate":
122
- nav_url = params.get("url", "")
123
- if not nav_url.startswith("http"):
124
- continue
125
-
126
- # Check if already visited
127
- if nav_url in state.visited_urls:
128
- continue
129
-
130
- yield {"type": "status", "message": f"🌐 Visiting: {nav_url[:40]}..."}
131
-
132
- try:
133
- await page.goto(nav_url, timeout=20000)
134
- await page.wait_for_timeout(2000)
135
- content = await _extract_page_content(page)
136
-
137
- # Check for blocks
138
- if _is_blocked(content):
139
- logger.warning(f"Blocked at {nav_url[:50]}")
140
- state.visited_urls.append(nav_url)
141
- state.content_cache[nav_url] = "[BLOCKED]"
142
- state.action_history.append(f"nav(blocked):{nav_url[:25]}")
143
- else:
144
- state.add_page(nav_url, content)
145
- state.action_history.append(f"nav:{nav_url[:30]}")
146
- except Exception as e:
147
- logger.warning(f"Navigation failed: {e}")
148
- state.visited_urls.append(nav_url)
149
- state.action_history.append(f"nav(fail):{nav_url[:25]}")
150
-
151
- elif action == "scroll":
152
- yield {"type": "status", "message": "📜 Scrolling..."}
153
- await page.evaluate("window.scrollBy(0, 500)")
154
- await page.wait_for_timeout(1000)
155
-
156
- # Re-extract
157
- content = await _extract_page_content(page)
158
- if state.visited_urls:
159
- state.content_cache[state.visited_urls[-1]] = content[:4000]
160
- state.action_history.append("scroll")
161
-
162
- # Generate final response if needed
163
- if not state.final_result:
164
- yield {"type": "status", "message": "✅ Generating response..."}
165
- state.final_result = await generate_final_response(state)
166
 
167
- # Return result
168
- yield {
169
- "type": "result",
170
- "content": state.final_result,
171
- "links": state.visited_urls[:10],
172
- "success": bool(state.final_result)
173
- }
174
 
175
- elapsed = int(state.elapsed())
176
- yield {"type": "complete", "message": f"Completed in {elapsed}s, {cycle} cycles"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
 
178
- logger.info(f"Stealth agent complete. Cycles: {cycle}, Time: {elapsed}s")
179
-
180
- except Exception as e:
181
- logger.exception("Stealth browser agent error")
182
- yield {"type": "error", "message": f"Error: {str(e)}"}
183
 
 
184
 
185
- async def _extract_page_content(page) -> str:
186
- """Extract text content from current page."""
187
- try:
188
- # Get body text
189
- content = await page.evaluate("""
190
- () => {
191
- // Remove scripts and styles
192
- const scripts = document.querySelectorAll('script, style, noscript');
193
- scripts.forEach(s => s.remove());
194
-
195
- // Get text
196
- return document.body.innerText || document.body.textContent || '';
197
- }
198
- """)
199
- return content[:6000] if content else ""
200
- except Exception as e:
201
- logger.warning(f"Content extraction failed: {e}")
202
- return ""
203
 
 
204
 
205
- def _is_blocked(content: str) -> bool:
206
- """Check if content indicates a block."""
207
- content_lower = content.lower()
208
-
209
- # Very specific patterns
210
- block_patterns = [
211
- "checking your browser before accessing",
212
- "please wait while we verify",
213
- "cloudflare ray id",
214
- "enable javascript and cookies",
215
- "attention required! | cloudflare",
216
- "checking if the site connection is secure",
217
- "please turn javascript on",
218
- "access denied",
219
- "you have been blocked",
220
- ]
221
-
222
- # Short content with block pattern = definitely blocked
223
- if len(content) < 500:
224
- return any(p in content_lower for p in block_patterns)
225
-
226
- return False
 
 
 
 
 
 
 
 
 
 
 
227
 
228
 
229
- async def think_and_act_camoufox(state: CamoufoxState):
230
- """Think and decide action for Camoufox agent."""
231
- import json
232
 
233
- content = state.get_recent_content() or "(No content yet)"
234
- history = ", ".join(state.action_history[-5:]) if state.action_history else "(starting)"
235
- visited = "\n".join([f" - {u[:70]}" for u in state.visited_urls[-10:]]) if state.visited_urls else "(none)"
236
 
237
- prompt = f"""You are a web research agent using a stealth browser. Analyze and decide next action.
238
-
239
- TASK: {state.task}
240
-
241
- ALREADY VISITED (DO NOT visit again):
242
- {visited}
243
-
244
- CURRENT PAGE CONTENT:
245
- {content}
246
-
247
- HISTORY: {history}
248
- TIME REMAINING: {int(state.remaining())}s
249
-
250
- Return JSON with ONE action:
251
- - Search: {{"action": "search", "query": "search terms"}}
252
- - Navigate to NEW URL: {{"action": "navigate", "url": "https://..."}}
253
- - Scroll for more: {{"action": "scroll"}}
254
- - Answer ready: {{"action": "complete", "result": "Your answer with **bold** for important values."}}
255
-
256
- RULES:
257
- - DO NOT navigate to URLs in ALREADY VISITED list
258
- - Only use URLs from the content above
259
- - Use **bold** for prices, numbers, names
260
- - Be efficient
261
-
262
- Return ONLY valid JSON:"""
263
-
264
- try:
265
- response = await generate_completion(
266
- messages=[{"role": "user", "content": prompt}],
267
- max_tokens=800
268
- )
269
 
270
- response = response.strip()
271
- if response.startswith("```"):
272
- response = response.split("```")[1]
273
- if response.startswith("json"):
274
- response = response[4:]
 
 
 
275
 
276
- decision = json.loads(response)
277
- action = decision.get("action", "search")
 
 
 
 
 
 
 
 
 
 
278
 
279
- # Safety: prevent revisiting
280
- if action == "navigate":
281
- url = decision.get("url", "").rstrip("/")
282
- visited_normalized = [u.rstrip("/") for u in state.visited_urls]
283
- if url in visited_normalized:
284
- if state.content_cache:
285
- good = [c for c in state.content_cache.values() if c and c not in ["[BLOCKED]"]]
286
- if good:
287
- return "complete", {"result": state.get_recent_content()[:800]}
288
- return "search", {"query": f"{state.task} site:wikipedia.org"}
 
 
 
 
 
 
 
289
 
290
- logger.info(f"Camoufox decision: {action}")
291
- return action, decision
292
 
293
- except Exception as e:
294
- logger.error(f"Think failed: {e}")
295
- if state.content_cache:
296
- return "complete", {"result": state.get_recent_content()[:500]}
297
- return "search", {"query": state.task}
 
 
 
 
 
 
 
1
  """Browser Agent with Camoufox - Stealth Firefox for anti-bot bypass.
2
 
3
+ Runs Camoufox INSIDE the E2B sandbox (not on HF container).
4
+
5
  Camoufox is a modified Firefox that:
6
  - Generates unique device fingerprints
7
  - Hides automation markers
8
  - Passes bot detection (Cloudflare, etc)
 
 
 
9
  """
10
 
11
  import os
12
  import logging
13
+ import shlex
14
  import time
15
  from typing import AsyncGenerator, Optional
16
 
17
  from app.config import get_settings
 
 
 
 
 
18
  from app.agents.llm_client import generate_completion
19
 
20
  logger = logging.getLogger(__name__)
21
 
22
 
 
 
 
 
 
 
 
 
 
23
  async def run_browser_agent_stealth(
24
  task: str,
25
  url: Optional[str] = None,
26
  ) -> AsyncGenerator[dict, None]:
27
+ """Run browser agent with Camoufox stealth browser inside E2B sandbox."""
28
  settings = get_settings()
29
 
30
+ if not settings.e2b_api_key:
31
+ yield {"type": "error", "message": "E2B_API_KEY not configured"}
 
 
 
 
32
  return
33
 
34
+ yield {"type": "status", "message": "🦊 Initializing stealth browser in sandbox..."}
35
+
36
+ desktop = None
37
 
38
  try:
39
+ from e2b_desktop import Sandbox
40
 
41
+ os.environ["E2B_API_KEY"] = settings.e2b_api_key
42
+
43
+ yield {"type": "status", "message": "🖥️ Creating virtual desktop..."}
44
+ desktop = Sandbox.create(timeout=900) # 15 min for install
45
+
46
+ # Start streaming
47
+ stream_url = None
48
+ try:
49
+ desktop.stream.start(require_auth=True)
50
+ auth_key = desktop.stream.get_auth_key()
51
+ stream_url = desktop.stream.get_url(auth_key=auth_key)
52
+ yield {"type": "stream", "url": stream_url}
53
+ desktop.wait(2000)
54
+ except Exception as e:
55
+ logger.warning(f"Could not start stream: {e}")
56
+
57
+ # Install Camoufox inside E2B sandbox
58
+ yield {"type": "status", "message": "📦 Installing Camoufox in sandbox..."}
59
+
60
+ try:
61
+ # Install Python packages
62
+ desktop.commands.run(
63
+ "pip install camoufox playwright --quiet",
64
+ timeout=120
65
  )
66
+ desktop.wait(2000)
67
 
68
+ # Download Camoufox browser
69
+ yield {"type": "status", "message": "🔽 Downloading stealth browser..."}
70
+ desktop.commands.run(
71
+ "camoufox fetch",
72
+ timeout=180
73
+ )
74
+ desktop.wait(2000)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
+ # Install Playwright deps
77
+ desktop.commands.run(
78
+ "playwright install-deps firefox --quiet 2>/dev/null || true",
79
+ timeout=60
80
+ )
 
 
81
 
82
+ logger.info("Camoufox installed in sandbox")
83
+ except Exception as e:
84
+ logger.error(f"Failed to install Camoufox: {e}")
85
+ yield {"type": "error", "message": f"Failed to install Camoufox: {e}"}
86
+ return
87
+
88
+ # Now run Camoufox via Python script inside sandbox
89
+ yield {"type": "status", "message": "🛡️ Starting stealth Firefox..."}
90
+
91
+ # Create Python script to run Camoufox
92
+ script = _create_camoufox_script(task, url)
93
+
94
+ # Write script to sandbox
95
+ desktop.commands.run(f"cat > /tmp/scrape.py << 'SCRIPT_EOF'\n{script}\nSCRIPT_EOF", timeout=10)
96
+
97
+ # Run the script
98
+ yield {"type": "status", "message": f"🔍 Researching: {task[:40]}..."}
99
+
100
+ result = desktop.commands.run(
101
+ "python /tmp/scrape.py",
102
+ timeout=300
103
+ )
104
+
105
+ output = result.stdout if hasattr(result, 'stdout') else ""
106
+
107
+ if not output:
108
+ yield {"type": "error", "message": "No output from Camoufox script"}
109
+ return
110
+
111
+ # Parse result
112
+ import json
113
+ try:
114
+ data = json.loads(output)
115
+ final_result = data.get("result", "")
116
+ visited_urls = data.get("urls", [])
117
+ except json.JSONDecodeError:
118
+ # Raw text output
119
+ final_result = output[:3000]
120
+ visited_urls = []
121
+
122
+ # Generate response with LLM if needed
123
+ if len(final_result) > 500 and not final_result.startswith("Error"):
124
+ yield {"type": "status", "message": "✨ Synthesizing response..."}
125
 
126
+ prompt = f"""Analise o conteúdo abaixo e responda à pergunta.
 
 
 
 
127
 
128
+ PERGUNTA: {task}
129
 
130
+ CONTEÚDO COLETADO:
131
+ {final_result[:4000]}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
+ Responda de forma clara e concisa. Use **negrito** para valores importantes."""
134
 
135
+ response = await generate_completion(
136
+ messages=[{"role": "user", "content": prompt}],
137
+ max_tokens=1000
138
+ )
139
+ final_result = response.strip() if response else final_result
140
+
141
+ # Yield result
142
+ yield {"type": "stream_end", "message": "Stream ended"}
143
+
144
+ yield {
145
+ "type": "result",
146
+ "content": final_result,
147
+ "links": visited_urls[:10],
148
+ "success": bool(final_result)
149
+ }
150
+
151
+ yield {"type": "complete", "message": f"Completed with Camoufox stealth browser"}
152
+
153
+ except ImportError as e:
154
+ yield {"type": "error", "message": "e2b-desktop not installed"}
155
+ except Exception as e:
156
+ logger.exception("Stealth browser agent error")
157
+ yield {"type": "error", "message": f"Error: {str(e)}"}
158
+ finally:
159
+ if desktop:
160
+ try:
161
+ desktop.stream.stop()
162
+ except Exception:
163
+ pass
164
+ try:
165
+ desktop.kill()
166
+ except Exception:
167
+ pass
168
 
169
 
170
+ def _create_camoufox_script(task: str, url: Optional[str] = None) -> str:
171
+ """Create Python script to run Camoufox inside E2B sandbox."""
 
172
 
173
+ # Escape task for Python string
174
+ task_escaped = task.replace("'", "\\'").replace('"', '\\"')
175
+ start_url = url or f"https://html.duckduckgo.com/html/?q={task.replace(' ', '+')}"
176
 
177
+ script = f'''
178
+ import json
179
+ import sys
180
+
181
+ try:
182
+ from camoufox.sync_api import Camoufox
183
+ except ImportError:
184
+ print(json.dumps({{"error": "Camoufox not installed"}}))
185
+ sys.exit(1)
186
+
187
+ visited_urls = []
188
+ all_content = []
189
+
190
+ try:
191
+ with Camoufox(headless=True) as browser:
192
+ page = browser.new_page()
193
+
194
+ # Go to search
195
+ start_url = "{start_url}"
196
+ page.goto(start_url, timeout=30000)
197
+ page.wait_for_timeout(2000)
198
+
199
+ visited_urls.append(start_url)
 
 
 
 
 
 
 
 
 
200
 
201
+ # Extract content
202
+ content = page.evaluate("""
203
+ () => {{
204
+ const scripts = document.querySelectorAll('script, style, noscript');
205
+ scripts.forEach(s => s.remove());
206
+ return document.body.innerText || '';
207
+ }}
208
+ """)
209
 
210
+ all_content.append(content[:3000])
211
+
212
+ # Try to find and visit first relevant link
213
+ links = page.evaluate("""
214
+ () => {{
215
+ const links = Array.from(document.querySelectorAll('a[href^="http"]'));
216
+ return links
217
+ .map(a => a.href)
218
+ .filter(h => !h.includes('duckduckgo') && !h.includes('google'))
219
+ .slice(0, 3);
220
+ }}
221
+ """)
222
 
223
+ # Visit first link
224
+ if links and len(links) > 0:
225
+ try:
226
+ page.goto(links[0], timeout=20000)
227
+ page.wait_for_timeout(2000)
228
+ visited_urls.append(links[0])
229
+
230
+ content2 = page.evaluate("""
231
+ () => {{
232
+ const scripts = document.querySelectorAll('script, style, noscript');
233
+ scripts.forEach(s => s.remove());
234
+ return document.body.innerText || '';
235
+ }}
236
+ """)
237
+ all_content.append(content2[:3000])
238
+ except Exception as e:
239
+ pass
240
 
241
+ result = "\\n\\n---\\n\\n".join(all_content)
 
242
 
243
+ print(json.dumps({{
244
+ "result": result[:6000],
245
+ "urls": visited_urls
246
+ }}))
247
+
248
+ except Exception as e:
249
+ print(json.dumps({{"error": str(e)}}))
250
+ sys.exit(1)
251
+ '''
252
+
253
+ return script
pyproject.toml CHANGED
@@ -30,10 +30,6 @@ dev = [
30
  "pytest-asyncio>=0.24.0",
31
  "pytest-cov>=6.0.0",
32
  ]
33
- stealth = [
34
- "camoufox[geoip]>=0.4.0",
35
- "playwright>=1.40.0",
36
- ]
37
 
38
  [build-system]
39
  requires = ["hatchling"]
 
30
  "pytest-asyncio>=0.24.0",
31
  "pytest-cov>=6.0.0",
32
  ]
 
 
 
 
33
 
34
  [build-system]
35
  requires = ["hatchling"]