# NexDatawork-Mini-Agent/src/agents/scraping_agent.py
# svar-chandak
# Revamp docs, add requirements, and modularize agents
# 5a3fcad
"""
Web Scraping Agent
This module implements the AI-powered web scraping agent that extracts
structured data from web pages using the SmartScraperTool.
The agent uses natural language instructions to:
1. Identify target data based on keywords
2. Extract specified quantities of items
3. Structure results as pandas DataFrames
Example:
>>> from src.agents import web_scraping
>>> result = web_scraping("Find top 5 AI startups and their funding", model=llm)
"""
import io
import contextlib
from typing import Optional, Any
from langchain.agents import initialize_agent, AgentType
from ..prompts import get_scraping_prompt
class WebScrapingAgent:
    """
    LLM-driven agent that extracts structured data from web pages.

    A natural language request is handed to a LangChain structured-chat
    agent whose only tool is SmartScraperTool.

    Attributes:
        model: The LLM passed to the LangChain agent.
        tools: Scraping tools available to the agent; an empty list when
            langchain-scrapegraph could not be imported.

    Example:
        >>> agent = WebScrapingAgent(model=azure_llm)
        >>> data = agent.scrape("List 10 popular Python libraries")
    """

    def __init__(self, model: Any):
        """
        Store the model and build the scraping tool list.

        Args:
            model: The LLM model instance used to interpret requests.

        Note:
            SmartScraperTool needs the SGAI_API_KEY environment variable
            to be set.
        """
        self.model = model
        self.tools = self._create_tools()

    def _create_tools(self):
        """
        Build the list of LangChain tools used for scraping.

        Returns:
            A one-element list holding a SmartScraperTool, or an empty
            list (after printing a warning) if an ImportError occurs.

        Note:
            SmartScraperTool comes from the langchain-scrapegraph package
            and needs a valid ScrapeGraphAI API key.
        """
        available = []
        try:
            # Imported lazily so the module still loads without the package.
            from langchain_scrapegraph.tools import SmartScraperTool
            available.append(SmartScraperTool())
        except ImportError:
            print("Warning: langchain-scrapegraph not installed")
        return available

    def _build_executor(self):
        """Assemble the structured-chat LangChain agent over ``self.tools``."""
        return initialize_agent(
            tools=self.tools,
            llm=self.model,
            agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
            verbose=True,
        )

    def scrape(self, query: str) -> str:
        """
        Run the scraping agent on a natural language request.

        The query is expanded into a full prompt by ``get_scraping_prompt``
        and executed by a freshly built LangChain agent. Anything the agent
        prints to stdout while running is captured and discarded.

        Args:
            query: Natural language description of the data to extract.
                Example: "Find top 10 AI companies and their funding"

        Returns:
            str: The agent's response, or an error message when no tools
            are available or any exception is raised along the way.

        Example:
            >>> result = agent.scrape("List 5 trending GitHub repos")
        """
        if not self.tools:
            return "Error: Web scraping tools not available. Install langchain-scrapegraph."

        try:
            executor = self._build_executor()
            prompt = get_scraping_prompt(query)
            # The agent is verbose; keep its trace off the caller's stdout.
            with contextlib.redirect_stdout(io.StringIO()):
                return executor.run(prompt)
        except Exception as exc:
            return f"Web scraping error: {exc}"
def web_scraping(
    question: str,
    model: Optional[Any] = None
) -> str:
    """
    Extract data from the web using natural language instructions.

    Convenience wrapper that builds a WebScrapingAgent for a single
    request and returns whatever its ``scrape`` method returns.

    Args:
        question: Natural language description of data to extract.
        model: LLM model instance. Although the parameter is optional in
            the signature, there is no fallback model: passing None
            yields an error message.

    Returns:
        str: Extracted data, or an error message when no model is given
        or the agent cannot be constructed.

    Example:
        >>> data = web_scraping("Find 5 popular ML frameworks", model)
    """
    if model is None:
        return "Error: No LLM model provided."

    # Building the agent also builds its tools, which may raise. Report
    # that as an error string, in the same format scrape() uses, so this
    # function keeps its "always returns str" contract.
    try:
        agent = WebScrapingAgent(model=model)
    except Exception as e:
        return f"Web scraping error: {e}"

    return agent.scrape(question)