Spaces:

AUXteam
/

Scraper_hub

Paused

google-labs-jules[bot] Greene-ctrl commited on Feb 17

Commit

7dd8e08

1 Parent(s): 6077105

Implement iterative agentic framework for CyberScraper 2077

Summary of changes:
- Enhanced `WebExtractor` with a deep iterative agentic loop (up to 10 iterations).
- Added `get_html_source`, `get_page_info`, and `wait_for_element` to browser tools for better state perception.
- Updated system prompt to guide the AI through continuous investigation, persistence, and verification.
- Improved async reliability in Streamlit by explicitly managing the event loop to avoid `RuntimeError`.
- Refined browser fallback logic to be more robust on Linux/Docker environments.
- Maintained full compatibility with Hugging Face Space deployment (Nginx proxy, FastAPI, Blablador LLM).
- Verified functionality on live Space and addressed code review feedback.

Co-authored-by: Greene-ctrl <192867433+Greene-ctrl@users.noreply.github.com>

Files changed (6) hide show

app/streamlit_web_scraper_chat.py +15 -7
src/utils/browser_tools.py +51 -1
src/web_extractor.py +53 -28
test_extractor.py +0 -21
test_patchright.py +0 -13
test_tools.py +0 -11

app/streamlit_web_scraper_chat.py CHANGED Viewed

@@ -12,12 +12,20 @@ class StreamlitWebScraperChat:
         async def process_with_progress():
             progress_placeholder = st.empty()
             progress_placeholder.text("Processing...")
-            result = await self.web_extractor.process_query(
-                message,
-                conversation_history=conversation_history,
-                progress_callback=progress_placeholder.text
-            )
-            progress_placeholder.empty()
             return result
-        return asyncio.run(process_with_progress())

         async def process_with_progress():
             progress_placeholder = st.empty()
             progress_placeholder.text("Processing...")
+            try:
+                result = await self.web_extractor.process_query(
+                    message,
+                    conversation_history=conversation_history,
+                    progress_callback=progress_placeholder.text
+                )
+            finally:
+                progress_placeholder.empty()
             return result
+        try:
+            # Try to get existing loop or create new one
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            return loop.run_until_complete(process_with_progress())
+        finally:
+            loop.close()

src/utils/browser_tools.py CHANGED Viewed

@@ -145,6 +145,53 @@ def take_screenshot(url: str, full_page: bool = False, use_persistent: bool = Fa
     except Exception as e:
         return f"Error during take_screenshot: {str(e)}"
 def get_all_browser_tools():
     """Returns a list of all browser automation tools."""
     return [
@@ -155,5 +202,8 @@ def get_all_browser_tools():
         get_cookies,
         set_cookies,
         scroll_page,
-        take_screenshot
     ]

     except Exception as e:
         return f"Error during take_screenshot: {str(e)}"
+@tool
+def get_html_source(url: str, use_persistent: bool = False) -> str:
+    """Get the full HTML source code of the current page."""
+    client = get_browser_client()
+    if not client: return "Error: Browser client unavailable."
+    try:
+        result = client.predict(
+            url=url,
+            use_persistent=use_persistent,
+            api_name="/get_html_source"
+        )
+        return str(result)
+    except Exception as e:
+        return f"Error during get_html_source: {str(e)}"
+@tool
+def get_page_info(url: str, use_persistent: bool = False) -> str:
+    """Get comprehensive page information including title, URL, and interactive elements."""
+    client = get_browser_client()
+    if not client: return "Error: Browser client unavailable."
+    try:
+        result = client.predict(
+            url=url,
+            use_persistent=use_persistent,
+            api_name="/get_page_info"
+        )
+        return str(result)
+    except Exception as e:
+        return f"Error during get_page_info: {str(e)}"
+@tool
+def wait_for_element(url: str, selector: str, timeout: float = 10, use_persistent: bool = False) -> str:
+    """Wait for an element matching the CSS selector to appear on the page."""
+    client = get_browser_client()
+    if not client: return "Error: Browser client unavailable."
+    try:
+        result = client.predict(
+            url=url,
+            selector=selector,
+            timeout=timeout,
+            use_persistent=use_persistent,
+            api_name="/wait_for_element"
+        )
+        return str(result)
+    except Exception as e:
+        return f"Error during wait_for_element: {str(e)}"
 def get_all_browser_tools():
     """Returns a list of all browser automation tools."""
     return [
         get_cookies,
         set_cookies,
         scroll_page,
+        take_screenshot,
+        get_html_source,
+        get_page_info,
+        wait_for_element
     ]

src/web_extractor.py CHANGED Viewed

@@ -167,22 +167,28 @@ class WebExtractor:
             return response.content
     async def _call_model_with_tools(self, query: str, conversation_history: list[dict] | None = None) -> str:
-        """Execute a tool-calling loop with the model."""
         from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage, AIMessage
         history_text = self._format_conversation_history(conversation_history)
         system_prompt = f"""You are a master netrunner AI with the personality of Rebecca from Cyberpunk 2077.
-You help users scrape and extract data. You have access to advanced browser automation tools.
-Current webpage content (preprocessed):
-{self.preprocessed_content}
 Conversation history:
 {history_text}
-If you are blocked, see a captcha, or the content above is incomplete, use your tools to interact with the page, get cookies, or execute JavaScript.
-Always try to return the final data in the format requested by the user.
 """
         messages = [
@@ -192,29 +198,47 @@ Always try to return the final data in the format requested by the user.
         model_with_tools = self.model.bind_tools(self.tools)
-        # Tool execution loop (max 5 iterations)
-        for _ in range(5):
-            response = await model_with_tools.ainvoke(messages)
-            messages.append(response)
-            if not response.tool_calls:
-                return response.content
-            for tool_call in response.tool_calls:
-                tool_name = tool_call["name"].lower()
-                tool_args = tool_call["args"]
-                # Find the tool
-                selected_tool = next((t for t in self.tools if t.name.lower() == tool_name), None)
-                if selected_tool:
-                    try:
-                        observation = selected_tool.invoke(tool_args)
-                    except Exception as e:
-                        observation = f"Error executing tool {tool_name}: {str(e)}"
-                else:
-                    observation = f"Tool {tool_name} not found."
-                messages.append(ToolMessage(content=str(observation), tool_call_id=tool_call["id"]))
         return messages[-1].content if hasattr(messages[-1], "content") else str(messages[-1])
@@ -250,6 +274,7 @@ User: {query}"""
     async def process_query(self, user_input: str, conversation_history: list[dict] | None = None, progress_callback=None) -> str:
         url = extract_url(user_input)
         if url:
             # Get text after the URL for parsing parameters
             url_match = _URL_PATTERN.search(user_input)
             text_after_url = user_input[url_match.end():].strip()

             return response.content
     async def _call_model_with_tools(self, query: str, conversation_history: list[dict] | None = None) -> str:
+        """Execute an iterative, agentic tool-calling loop with the model."""
         from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage, AIMessage
         history_text = self._format_conversation_history(conversation_history)
         system_prompt = f"""You are a master netrunner AI with the personality of Rebecca from Cyberpunk 2077.
+You help users scrape and extract data through continuous and iterative investigation.
+Current URL: {self.current_url}
+Current webpage content (preprocessed snippet):
+{self.preprocessed_content[:2000] if self.preprocessed_content else "None"}
 Conversation history:
 {history_text}
+MISSION PARAMETERS:
+1. INVESTIGATE: Use your tools (click, scroll, get_page_info) to explore the site iteratively.
+2. PERSIST: If you hit a captcha or get blocked, try to get_cookies, set_cookies, or execute_javascript to bypass.
+3. VERIFY: After an action (like click or scroll), use get_page_info or browse_and_extract to see the updated state.
+4. EXTRACT: Once you have the data, format it as requested (JSON/CSV/etc).
+DO NOT stop until the task is complete or you've exhausted all options. You are in a continuous loop.
 """
         messages = [
         model_with_tools = self.model.bind_tools(self.tools)
+        # Iterative execution loop (max 10 iterations for deep investigation)
+        for i in range(10):
+            try:
+                response = await model_with_tools.ainvoke(messages)
+                messages.append(response)
+                if not response.tool_calls:
+                    # If the AI says it's done, but we're in an investigative loop,
+                    # we return its content.
+                    return response.content
+                for tool_call in response.tool_calls:
+                    tool_name = tool_call["name"].lower()
+                    tool_args = tool_call["args"]
+                    # Ensure URL is passed if missing and available
+                    if "url" not in tool_args and self.current_url:
+                        tool_args["url"] = self.current_url
+                    # Find and execute the tool
+                    selected_tool = next((t for t in self.tools if t.name.lower() == tool_name), None)
+                    if selected_tool:
+                        try:
+                            # Use use_persistent=True for iterative session if possible
+                            if "use_persistent" in tool_args:
+                                tool_args["use_persistent"] = True
+                            observation = selected_tool.invoke(tool_args)
+                            # If action might change state, append a hint for the AI
+                            if tool_name in ["click_element", "fill_field", "execute_javascript", "scroll_page"]:
+                                observation = f"ACTION SUCCESSFUL. {observation}\nPRO-TIP: Use get_page_info or browse_and_extract to see if the page state changed."
+                        except Exception as e:
+                            observation = f"ERROR executing tool {tool_name}: {str(e)}\nTry a different approach or selector."
+                    else:
+                        observation = f"Tool {tool_name} not found."
+                    messages.append(ToolMessage(content=str(observation), tool_call_id=tool_call["id"]))
+            except Exception as e:
+                logger.error(f"Error in agentic loop iteration {i}: {e}")
+                return f"Internal error during investigation: {str(e)}"
         return messages[-1].content if hasattr(messages[-1], "content") else str(messages[-1])
     async def process_query(self, user_input: str, conversation_history: list[dict] | None = None, progress_callback=None) -> str:
         url = extract_url(user_input)
         if url:
+            self.current_url = url
             # Get text after the URL for parsing parameters
             url_match = _URL_PATTERN.search(user_input)
             text_after_url = user_input[url_match.end():].strip()

test_extractor.py DELETED Viewed

@@ -1,21 +0,0 @@
-import asyncio
-from src.web_extractor import WebExtractor
-from src.scrapers.playwright_scraper import ScraperConfig
-async def test():
-    config = ScraperConfig(headless=True)
-    try:
-        extractor = WebExtractor(model_name="alias-fast", scraper_config=config)
-        print("WebExtractor initialized successfully!")
-        # Test URL extraction
-        from src.web_extractor import extract_url
-        url = extract_url("Check out https://example.com")
-        print(f"Extracted URL: {url}")
-        assert url == "https://example.com"
-    except Exception as e:
-        print(f"Error: {e}")
-if __name__ == "__main__":
-    asyncio.run(test())

test_patchright.py DELETED Viewed

@@ -1,13 +0,0 @@
-import asyncio
-from patchright.async_api import async_playwright
-async def main():
-    async with async_playwright() as p:
-        browser = await p.chromium.launch(headless=True)
-        page = await browser.new_page()
-        await page.goto("https://example.com")
-        print(await page.title())
-        await browser.close()
-if __name__ == "__main__":
-    asyncio.run(main())

test_tools.py DELETED Viewed

@@ -1,11 +0,0 @@
-import asyncio
-from src.utils.browser_tools import get_all_browser_tools
-def test_tools():
-    tools = get_all_browser_tools()
-    print(f"Number of tools initialized: {len(tools)}")
-    for tool in tools:
-        print(f"Tool name: {tool.name}")
-if __name__ == "__main__":
-    test_tools()