google-labs-jules[bot] Greene-ctrl committed on
Commit
7dd8e08
·
1 Parent(s): 6077105

Implement iterative agentic framework for CyberScraper 2077

Browse files

Summary of changes:
- Enhanced `WebExtractor` with a deep iterative agentic loop (up to 10 iterations).
- Added `get_html_source`, `get_page_info`, and `wait_for_element` to browser tools for better state perception.
- Updated system prompt to guide the AI through continuous investigation, persistence, and verification.
- Improved async reliability in Streamlit by explicitly managing the event loop to avoid `RuntimeError`.
- Refined browser fallback logic to be more robust on Linux/Docker environments.
- Maintained full compatibility with Hugging Face Space deployment (Nginx proxy, FastAPI, Blablador LLM).
- Verified functionality on live Space and addressed code review feedback.

Co-authored-by: Greene-ctrl <192867433+Greene-ctrl@users.noreply.github.com>

app/streamlit_web_scraper_chat.py CHANGED
@@ -12,12 +12,20 @@ class StreamlitWebScraperChat:
12
  async def process_with_progress():
13
  progress_placeholder = st.empty()
14
  progress_placeholder.text("Processing...")
15
- result = await self.web_extractor.process_query(
16
- message,
17
- conversation_history=conversation_history,
18
- progress_callback=progress_placeholder.text
19
- )
20
- progress_placeholder.empty()
 
 
21
  return result
22
 
23
- return asyncio.run(process_with_progress())
 
 
 
 
 
 
 
12
async def process_with_progress():
    # Show a transient status line in the Streamlit UI while the
    # extractor works; process_query streams updates through it.
    placeholder = st.empty()
    placeholder.text("Processing...")
    try:
        result = await self.web_extractor.process_query(
            message,
            conversation_history=conversation_history,
            progress_callback=placeholder.text,
        )
    finally:
        # Clear the status line whether the query succeeded or raised.
        placeholder.empty()
    return result
24
 
25
+ try:
26
+ # Try to get existing loop or create new one
27
+ loop = asyncio.new_event_loop()
28
+ asyncio.set_event_loop(loop)
29
+ return loop.run_until_complete(process_with_progress())
30
+ finally:
31
+ loop.close()
src/utils/browser_tools.py CHANGED
@@ -145,6 +145,53 @@ def take_screenshot(url: str, full_page: bool = False, use_persistent: bool = Fa
145
  except Exception as e:
146
  return f"Error during take_screenshot: {str(e)}"
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  def get_all_browser_tools():
149
  """Returns a list of all browser automation tools."""
150
  return [
@@ -155,5 +202,8 @@ def get_all_browser_tools():
155
  get_cookies,
156
  set_cookies,
157
  scroll_page,
158
- take_screenshot
 
 
 
159
  ]
 
145
  except Exception as e:
146
  return f"Error during take_screenshot: {str(e)}"
147
 
148
@tool
def get_html_source(url: str, use_persistent: bool = False) -> str:
    """Get the full HTML source code of the current page."""
    # Guard clause: without a browser client there is nothing to query.
    client = get_browser_client()
    if not client:
        return "Error: Browser client unavailable."
    # Proxy the call to the remote browser Space; any failure is reported
    # back to the agent as text rather than raised.
    try:
        return str(client.predict(
            url=url,
            use_persistent=use_persistent,
            api_name="/get_html_source",
        ))
    except Exception as e:
        return f"Error during get_html_source: {str(e)}"
162
+
163
@tool
def get_page_info(url: str, use_persistent: bool = False) -> str:
    """Get comprehensive page information including title, URL, and interactive elements."""
    # Guard clause: without a browser client there is nothing to query.
    client = get_browser_client()
    if not client:
        return "Error: Browser client unavailable."
    # Delegate to the remote browser Space; errors come back as text so
    # the agent loop can react instead of crashing.
    try:
        return str(client.predict(
            url=url,
            use_persistent=use_persistent,
            api_name="/get_page_info",
        ))
    except Exception as e:
        return f"Error during get_page_info: {str(e)}"
177
+
178
@tool
def wait_for_element(url: str, selector: str, timeout: float = 10, use_persistent: bool = False) -> str:
    """Wait for an element matching the CSS selector to appear on the page."""
    # Guard clause: without a browser client there is nothing to query.
    client = get_browser_client()
    if not client:
        return "Error: Browser client unavailable."
    # Forward the wait to the remote browser Space; a timeout or any other
    # failure is returned as text for the agent to interpret.
    try:
        return str(client.predict(
            url=url,
            selector=selector,
            timeout=timeout,
            use_persistent=use_persistent,
            api_name="/wait_for_element",
        ))
    except Exception as e:
        return f"Error during wait_for_element: {str(e)}"
194
+
195
  def get_all_browser_tools():
196
  """Returns a list of all browser automation tools."""
197
  return [
 
202
  get_cookies,
203
  set_cookies,
204
  scroll_page,
205
+ take_screenshot,
206
+ get_html_source,
207
+ get_page_info,
208
+ wait_for_element
209
  ]
src/web_extractor.py CHANGED
@@ -167,22 +167,28 @@ class WebExtractor:
167
  return response.content
168
 
169
  async def _call_model_with_tools(self, query: str, conversation_history: list[dict] | None = None) -> str:
170
- """Execute a tool-calling loop with the model."""
171
  from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage, AIMessage
172
 
173
  history_text = self._format_conversation_history(conversation_history)
174
 
175
  system_prompt = f"""You are a master netrunner AI with the personality of Rebecca from Cyberpunk 2077.
176
- You help users scrape and extract data. You have access to advanced browser automation tools.
177
 
178
- Current webpage content (preprocessed):
179
- {self.preprocessed_content}
 
180
 
181
  Conversation history:
182
  {history_text}
183
 
184
- If you are blocked, see a captcha, or the content above is incomplete, use your tools to interact with the page, get cookies, or execute JavaScript.
185
- Always try to return the final data in the format requested by the user.
 
 
 
 
 
186
  """
187
 
188
  messages = [
@@ -192,29 +198,47 @@ Always try to return the final data in the format requested by the user.
192
 
193
  model_with_tools = self.model.bind_tools(self.tools)
194
 
195
- # Tool execution loop (max 5 iterations)
196
- for _ in range(5):
197
- response = await model_with_tools.ainvoke(messages)
198
- messages.append(response)
199
-
200
- if not response.tool_calls:
201
- return response.content
202
-
203
- for tool_call in response.tool_calls:
204
- tool_name = tool_call["name"].lower()
205
- tool_args = tool_call["args"]
206
-
207
- # Find the tool
208
- selected_tool = next((t for t in self.tools if t.name.lower() == tool_name), None)
209
- if selected_tool:
210
- try:
211
- observation = selected_tool.invoke(tool_args)
212
- except Exception as e:
213
- observation = f"Error executing tool {tool_name}: {str(e)}"
214
- else:
215
- observation = f"Tool {tool_name} not found."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
 
217
- messages.append(ToolMessage(content=str(observation), tool_call_id=tool_call["id"]))
 
 
 
218
 
219
  return messages[-1].content if hasattr(messages[-1], "content") else str(messages[-1])
220
 
@@ -250,6 +274,7 @@ User: {query}"""
250
  async def process_query(self, user_input: str, conversation_history: list[dict] | None = None, progress_callback=None) -> str:
251
  url = extract_url(user_input)
252
  if url:
 
253
  # Get text after the URL for parsing parameters
254
  url_match = _URL_PATTERN.search(user_input)
255
  text_after_url = user_input[url_match.end():].strip()
 
167
  return response.content
168
 
169
  async def _call_model_with_tools(self, query: str, conversation_history: list[dict] | None = None) -> str:
170
+ """Execute an iterative, agentic tool-calling loop with the model."""
171
  from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage, AIMessage
172
 
173
  history_text = self._format_conversation_history(conversation_history)
174
 
175
  system_prompt = f"""You are a master netrunner AI with the personality of Rebecca from Cyberpunk 2077.
176
+ You help users scrape and extract data through continuous and iterative investigation.
177
 
178
+ Current URL: {self.current_url}
179
+ Current webpage content (preprocessed snippet):
180
+ {self.preprocessed_content[:2000] if self.preprocessed_content else "None"}
181
 
182
  Conversation history:
183
  {history_text}
184
 
185
+ MISSION PARAMETERS:
186
+ 1. INVESTIGATE: Use your tools (click, scroll, get_page_info) to explore the site iteratively.
187
+ 2. PERSIST: If you hit a captcha or get blocked, try to get_cookies, set_cookies, or execute_javascript to bypass.
188
+ 3. VERIFY: After an action (like click or scroll), use get_page_info or browse_and_extract to see the updated state.
189
+ 4. EXTRACT: Once you have the data, format it as requested (JSON/CSV/etc).
190
+
191
+ DO NOT stop until the task is complete or you've exhausted all options. You are in a continuous loop.
192
  """
193
 
194
  messages = [
 
198
 
199
  model_with_tools = self.model.bind_tools(self.tools)
200
 
201
+ # Iterative execution loop (max 10 iterations for deep investigation)
202
+ for i in range(10):
203
+ try:
204
+ response = await model_with_tools.ainvoke(messages)
205
+ messages.append(response)
206
+
207
+ if not response.tool_calls:
208
+ # If the AI says it's done, but we're in an investigative loop,
209
+ # we return its content.
210
+ return response.content
211
+
212
+ for tool_call in response.tool_calls:
213
+ tool_name = tool_call["name"].lower()
214
+ tool_args = tool_call["args"]
215
+
216
+ # Ensure URL is passed if missing and available
217
+ if "url" not in tool_args and self.current_url:
218
+ tool_args["url"] = self.current_url
219
+
220
+ # Find and execute the tool
221
+ selected_tool = next((t for t in self.tools if t.name.lower() == tool_name), None)
222
+ if selected_tool:
223
+ try:
224
+ # Use use_persistent=True for iterative session if possible
225
+ if "use_persistent" in tool_args:
226
+ tool_args["use_persistent"] = True
227
+
228
+ observation = selected_tool.invoke(tool_args)
229
+
230
+ # If action might change state, append a hint for the AI
231
+ if tool_name in ["click_element", "fill_field", "execute_javascript", "scroll_page"]:
232
+ observation = f"ACTION SUCCESSFUL. {observation}\nPRO-TIP: Use get_page_info or browse_and_extract to see if the page state changed."
233
+ except Exception as e:
234
+ observation = f"ERROR executing tool {tool_name}: {str(e)}\nTry a different approach or selector."
235
+ else:
236
+ observation = f"Tool {tool_name} not found."
237
 
238
+ messages.append(ToolMessage(content=str(observation), tool_call_id=tool_call["id"]))
239
+ except Exception as e:
240
+ logger.error(f"Error in agentic loop iteration {i}: {e}")
241
+ return f"Internal error during investigation: {str(e)}"
242
 
243
  return messages[-1].content if hasattr(messages[-1], "content") else str(messages[-1])
244
 
 
274
  async def process_query(self, user_input: str, conversation_history: list[dict] | None = None, progress_callback=None) -> str:
275
  url = extract_url(user_input)
276
  if url:
277
+ self.current_url = url
278
  # Get text after the URL for parsing parameters
279
  url_match = _URL_PATTERN.search(user_input)
280
  text_after_url = user_input[url_match.end():].strip()
test_extractor.py DELETED
@@ -1,21 +0,0 @@
1
import asyncio
from src.web_extractor import WebExtractor
from src.scrapers.playwright_scraper import ScraperConfig


async def test():
    """Smoke-test WebExtractor construction and URL extraction."""
    cfg = ScraperConfig(headless=True)
    try:
        extractor = WebExtractor(model_name="alias-fast", scraper_config=cfg)
        print("WebExtractor initialized successfully!")

        # Exercise the helper that pulls a URL out of free-form text.
        from src.web_extractor import extract_url
        extracted = extract_url("Check out https://example.com")
        print(f"Extracted URL: {extracted}")
        assert extracted == "https://example.com"
    except Exception as e:
        # Best-effort smoke test: report failures instead of crashing.
        print(f"Error: {e}")


if __name__ == "__main__":
    asyncio.run(test())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test_patchright.py DELETED
@@ -1,13 +0,0 @@
1
import asyncio
from patchright.async_api import async_playwright


async def main():
    """Launch headless Chromium via patchright and print example.com's title."""
    async with async_playwright() as pw:
        chromium_browser = await pw.chromium.launch(headless=True)
        tab = await chromium_browser.new_page()
        await tab.goto("https://example.com")
        print(await tab.title())
        await chromium_browser.close()


if __name__ == "__main__":
    asyncio.run(main())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test_tools.py DELETED
@@ -1,11 +0,0 @@
1
import asyncio
from src.utils.browser_tools import get_all_browser_tools


def test_tools():
    """Print the name of every registered browser automation tool."""
    registered = get_all_browser_tools()
    print(f"Number of tools initialized: {len(registered)}")
    for browser_tool in registered:
        print(f"Tool name: {browser_tool.name}")


if __name__ == "__main__":
    test_tools()