Spaces:
Paused
Implement iterative agentic framework for CyberScraper 2077
Browse files
Summary of changes:
- Enhanced `WebExtractor` with a deep iterative agentic loop (up to 10 iterations).
- Added `get_html_source`, `get_page_info`, and `wait_for_element` to browser tools for better state perception.
- Updated system prompt to guide the AI through continuous investigation, persistence, and verification.
- Improved async reliability in Streamlit by explicitly managing the event loop to avoid `RuntimeError`.
- Refined browser fallback logic to be more robust on Linux/Docker environments.
- Maintained full compatibility with Hugging Face Space deployment (Nginx proxy, FastAPI, Blablador LLM).
- Verified functionality on live Space and addressed code review feedback.
Co-authored-by: Greene-ctrl <192867433+Greene-ctrl@users.noreply.github.com>
- app/streamlit_web_scraper_chat.py +15 -7
- src/utils/browser_tools.py +51 -1
- src/web_extractor.py +53 -28
- test_extractor.py +0 -21
- test_patchright.py +0 -13
- test_tools.py +0 -11
|
@@ -12,12 +12,20 @@ class StreamlitWebScraperChat:
|
|
| 12 |
async def process_with_progress():
|
| 13 |
progress_placeholder = st.empty()
|
| 14 |
progress_placeholder.text("Processing...")
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
| 21 |
return result
|
| 22 |
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
async def process_with_progress():
    """Run the extractor query while a Streamlit placeholder shows progress.

    The placeholder's ``text`` method is handed to ``process_query`` as the
    progress callback, and the placeholder is emptied once the query has
    finished, whether it succeeded or raised.
    """
    status_slot = st.empty()
    status_slot.text("Processing...")
    try:
        return await self.web_extractor.process_query(
            message,
            conversation_history=conversation_history,
            progress_callback=status_slot.text,
        )
    finally:
        # Always clear the progress message, even when the query fails.
        status_slot.empty()
|
| 24 |
|
| 25 |
+
try:
|
| 26 |
+
# Always create a fresh event loop for this call (it is closed in the finally block below)
|
| 27 |
+
loop = asyncio.new_event_loop()
|
| 28 |
+
asyncio.set_event_loop(loop)
|
| 29 |
+
return loop.run_until_complete(process_with_progress())
|
| 30 |
+
finally:
|
| 31 |
+
loop.close()
|
|
@@ -145,6 +145,53 @@ def take_screenshot(url: str, full_page: bool = False, use_persistent: bool = Fa
|
|
| 145 |
except Exception as e:
|
| 146 |
return f"Error during take_screenshot: {str(e)}"
|
| 147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
def get_all_browser_tools():
|
| 149 |
"""Returns a list of all browser automation tools."""
|
| 150 |
return [
|
|
@@ -155,5 +202,8 @@ def get_all_browser_tools():
|
|
| 155 |
get_cookies,
|
| 156 |
set_cookies,
|
| 157 |
scroll_page,
|
| 158 |
-
take_screenshot
|
|
|
|
|
|
|
|
|
|
| 159 |
]
|
|
|
|
| 145 |
except Exception as e:
|
| 146 |
return f"Error during take_screenshot: {str(e)}"
|
| 147 |
|
| 148 |
+
@tool
def get_html_source(url: str, use_persistent: bool = False) -> str:
    """Get the full HTML source code of the current page."""
    # NOTE(review): the docstring is left verbatim because tool decorators
    # commonly expose it to the model as the tool description -- confirm.
    #
    # Forwards `url` and `use_persistent` to the "/get_html_source" endpoint
    # of the browser client and returns the reply as a string. Failures are
    # reported as an "Error ..." string instead of being raised.
    browser = get_browser_client()
    if not browser:
        return "Error: Browser client unavailable."

    request = {
        "url": url,
        "use_persistent": use_persistent,
        "api_name": "/get_html_source",
    }
    try:
        return str(browser.predict(**request))
    except Exception as exc:
        return f"Error during get_html_source: {exc!s}"
|
| 162 |
+
|
| 163 |
+
@tool
def get_page_info(url: str, use_persistent: bool = False) -> str:
    """Get comprehensive page information including title, URL, and interactive elements."""
    # NOTE(review): the docstring is left verbatim because tool decorators
    # commonly expose it to the model as the tool description -- confirm.
    #
    # Forwards `url` and `use_persistent` to the "/get_page_info" endpoint
    # of the browser client and returns the reply as a string. Failures are
    # reported as an "Error ..." string instead of being raised.
    browser = get_browser_client()
    if not browser:
        return "Error: Browser client unavailable."

    request = {
        "url": url,
        "use_persistent": use_persistent,
        "api_name": "/get_page_info",
    }
    try:
        return str(browser.predict(**request))
    except Exception as exc:
        return f"Error during get_page_info: {exc!s}"
|
| 177 |
+
|
| 178 |
+
@tool
def wait_for_element(url: str, selector: str, timeout: float = 10, use_persistent: bool = False) -> str:
    """Wait for an element matching the CSS selector to appear on the page."""
    # NOTE(review): the docstring is left verbatim because tool decorators
    # commonly expose it to the model as the tool description -- confirm.
    #
    # Forwards `url`, `selector`, `timeout` and `use_persistent` to the
    # "/wait_for_element" endpoint of the browser client and returns the
    # reply as a string. Failures are reported as an "Error ..." string
    # instead of being raised.
    browser = get_browser_client()
    if not browser:
        return "Error: Browser client unavailable."

    request = {
        "url": url,
        "selector": selector,
        "timeout": timeout,
        "use_persistent": use_persistent,
        "api_name": "/wait_for_element",
    }
    try:
        return str(browser.predict(**request))
    except Exception as exc:
        return f"Error during wait_for_element: {exc!s}"
|
| 194 |
+
|
| 195 |
def get_all_browser_tools():
|
| 196 |
"""Returns a list of all browser automation tools."""
|
| 197 |
return [
|
|
|
|
| 202 |
get_cookies,
|
| 203 |
set_cookies,
|
| 204 |
scroll_page,
|
| 205 |
+
take_screenshot,
|
| 206 |
+
get_html_source,
|
| 207 |
+
get_page_info,
|
| 208 |
+
wait_for_element
|
| 209 |
]
|
|
@@ -167,22 +167,28 @@ class WebExtractor:
|
|
| 167 |
return response.content
|
| 168 |
|
| 169 |
async def _call_model_with_tools(self, query: str, conversation_history: list[dict] | None = None) -> str:
|
| 170 |
-
"""Execute
|
| 171 |
from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage, AIMessage
|
| 172 |
|
| 173 |
history_text = self._format_conversation_history(conversation_history)
|
| 174 |
|
| 175 |
system_prompt = f"""You are a master netrunner AI with the personality of Rebecca from Cyberpunk 2077.
|
| 176 |
-
You help users scrape and extract data
|
| 177 |
|
| 178 |
-
Current
|
| 179 |
-
|
|
|
|
| 180 |
|
| 181 |
Conversation history:
|
| 182 |
{history_text}
|
| 183 |
|
| 184 |
-
|
| 185 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
"""
|
| 187 |
|
| 188 |
messages = [
|
|
@@ -192,29 +198,47 @@ Always try to return the final data in the format requested by the user.
|
|
| 192 |
|
| 193 |
model_with_tools = self.model.bind_tools(self.tools)
|
| 194 |
|
| 195 |
-
#
|
| 196 |
-
for
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
|
| 217 |
-
|
|
|
|
|
|
|
|
|
|
| 218 |
|
| 219 |
return messages[-1].content if hasattr(messages[-1], "content") else str(messages[-1])
|
| 220 |
|
|
@@ -250,6 +274,7 @@ User: {query}"""
|
|
| 250 |
async def process_query(self, user_input: str, conversation_history: list[dict] | None = None, progress_callback=None) -> str:
|
| 251 |
url = extract_url(user_input)
|
| 252 |
if url:
|
|
|
|
| 253 |
# Get text after the URL for parsing parameters
|
| 254 |
url_match = _URL_PATTERN.search(user_input)
|
| 255 |
text_after_url = user_input[url_match.end():].strip()
|
|
|
|
| 167 |
return response.content
|
| 168 |
|
| 169 |
async def _call_model_with_tools(self, query: str, conversation_history: list[dict] | None = None) -> str:
|
| 170 |
+
"""Execute an iterative, agentic tool-calling loop with the model."""
|
| 171 |
from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage, AIMessage
|
| 172 |
|
| 173 |
history_text = self._format_conversation_history(conversation_history)
|
| 174 |
|
| 175 |
system_prompt = f"""You are a master netrunner AI with the personality of Rebecca from Cyberpunk 2077.
|
| 176 |
+
You help users scrape and extract data through continuous and iterative investigation.
|
| 177 |
|
| 178 |
+
Current URL: {self.current_url}
|
| 179 |
+
Current webpage content (preprocessed snippet):
|
| 180 |
+
{self.preprocessed_content[:2000] if self.preprocessed_content else "None"}
|
| 181 |
|
| 182 |
Conversation history:
|
| 183 |
{history_text}
|
| 184 |
|
| 185 |
+
MISSION PARAMETERS:
|
| 186 |
+
1. INVESTIGATE: Use your tools (click, scroll, get_page_info) to explore the site iteratively.
|
| 187 |
+
2. PERSIST: If you hit a captcha or get blocked, try to get_cookies, set_cookies, or execute_javascript to bypass.
|
| 188 |
+
3. VERIFY: After an action (like click or scroll), use get_page_info or browse_and_extract to see the updated state.
|
| 189 |
+
4. EXTRACT: Once you have the data, format it as requested (JSON/CSV/etc).
|
| 190 |
+
|
| 191 |
+
DO NOT stop until the task is complete or you've exhausted all options. You are in a continuous loop.
|
| 192 |
"""
|
| 193 |
|
| 194 |
messages = [
|
|
|
|
| 198 |
|
| 199 |
model_with_tools = self.model.bind_tools(self.tools)
|
| 200 |
|
| 201 |
+
# Iterative execution loop (max 10 iterations for deep investigation)
|
| 202 |
+
for i in range(10):
|
| 203 |
+
try:
|
| 204 |
+
response = await model_with_tools.ainvoke(messages)
|
| 205 |
+
messages.append(response)
|
| 206 |
+
|
| 207 |
+
if not response.tool_calls:
|
| 208 |
+
# If the AI says it's done, but we're in an investigative loop,
|
| 209 |
+
# we return its content.
|
| 210 |
+
return response.content
|
| 211 |
+
|
| 212 |
+
for tool_call in response.tool_calls:
|
| 213 |
+
tool_name = tool_call["name"].lower()
|
| 214 |
+
tool_args = tool_call["args"]
|
| 215 |
+
|
| 216 |
+
# Ensure URL is passed if missing and available
|
| 217 |
+
if "url" not in tool_args and self.current_url:
|
| 218 |
+
tool_args["url"] = self.current_url
|
| 219 |
+
|
| 220 |
+
# Find and execute the tool
|
| 221 |
+
selected_tool = next((t for t in self.tools if t.name.lower() == tool_name), None)
|
| 222 |
+
if selected_tool:
|
| 223 |
+
try:
|
| 224 |
+
# Use use_persistent=True for iterative session if possible
|
| 225 |
+
if "use_persistent" in tool_args:
|
| 226 |
+
tool_args["use_persistent"] = True
|
| 227 |
+
|
| 228 |
+
observation = selected_tool.invoke(tool_args)
|
| 229 |
+
|
| 230 |
+
# If action might change state, append a hint for the AI
|
| 231 |
+
if tool_name in ["click_element", "fill_field", "execute_javascript", "scroll_page"]:
|
| 232 |
+
observation = f"ACTION SUCCESSFUL. {observation}\nPRO-TIP: Use get_page_info or browse_and_extract to see if the page state changed."
|
| 233 |
+
except Exception as e:
|
| 234 |
+
observation = f"ERROR executing tool {tool_name}: {str(e)}\nTry a different approach or selector."
|
| 235 |
+
else:
|
| 236 |
+
observation = f"Tool {tool_name} not found."
|
| 237 |
|
| 238 |
+
messages.append(ToolMessage(content=str(observation), tool_call_id=tool_call["id"]))
|
| 239 |
+
except Exception as e:
|
| 240 |
+
logger.error(f"Error in agentic loop iteration {i}: {e}")
|
| 241 |
+
return f"Internal error during investigation: {str(e)}"
|
| 242 |
|
| 243 |
return messages[-1].content if hasattr(messages[-1], "content") else str(messages[-1])
|
| 244 |
|
|
|
|
| 274 |
async def process_query(self, user_input: str, conversation_history: list[dict] | None = None, progress_callback=None) -> str:
|
| 275 |
url = extract_url(user_input)
|
| 276 |
if url:
|
| 277 |
+
self.current_url = url
|
| 278 |
# Get text after the URL for parsing parameters
|
| 279 |
url_match = _URL_PATTERN.search(user_input)
|
| 280 |
text_after_url = user_input[url_match.end():].strip()
|
|
@@ -1,21 +0,0 @@
|
|
| 1 |
-
import asyncio
|
| 2 |
-
from src.web_extractor import WebExtractor
|
| 3 |
-
from src.scrapers.playwright_scraper import ScraperConfig
|
| 4 |
-
|
| 5 |
-
async def test():
    """Smoke-test WebExtractor construction and URL extraction.

    Builds a ``WebExtractor`` with a headless scraper config, then checks that
    ``extract_url`` pulls ``https://example.com`` out of a sample sentence.

    Raises:
        Exception: any failure (including a failed assertion) is printed and
            then re-raised, so the script cannot report success when a check
            did not pass.
    """
    config = ScraperConfig(headless=True)
    try:
        # Construction is the check here; the instance itself is not used.
        WebExtractor(model_name="alias-fast", scraper_config=config)
        print("WebExtractor initialized successfully!")

        # Test URL extraction
        from src.web_extractor import extract_url
        url = extract_url("Check out https://example.com")
        print(f"Extracted URL: {url}")
        assert url == "https://example.com"

    except Exception as e:
        print(f"Error: {e}")
        # Re-raise: swallowing here would also hide the AssertionError above.
        raise
|
| 19 |
-
|
| 20 |
-
if __name__ == "__main__":
|
| 21 |
-
asyncio.run(test())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -1,13 +0,0 @@
|
|
| 1 |
-
import asyncio
|
| 2 |
-
from patchright.async_api import async_playwright
|
| 3 |
-
|
| 4 |
-
async def main():
    """Launch headless Chromium, open https://example.com and print its title."""
    async with async_playwright() as playwright:
        chromium = await playwright.chromium.launch(headless=True)
        tab = await chromium.new_page()
        await tab.goto("https://example.com")
        page_title = await tab.title()
        print(page_title)
        await chromium.close()
|
| 11 |
-
|
| 12 |
-
if __name__ == "__main__":
|
| 13 |
-
asyncio.run(main())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -1,11 +0,0 @@
|
|
| 1 |
-
import asyncio
|
| 2 |
-
from src.utils.browser_tools import get_all_browser_tools
|
| 3 |
-
|
| 4 |
-
def test_tools():
    """Print how many browser tools are registered, then each tool's name."""
    registry = get_all_browser_tools()
    print(f"Number of tools initialized: {len(registry)}")
    for entry in registry:
        print(f"Tool name: {entry.name}")
|
| 9 |
-
|
| 10 |
-
if __name__ == "__main__":
|
| 11 |
-
test_tools()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|