"""
Web Scraping Agent
This module implements the AI-powered web scraping agent that extracts
structured data from web pages using the SmartScraperTool.
The agent uses natural language instructions to:
1. Identify target data based on keywords
2. Extract specified quantities of items
3. Structure results as pandas DataFrames
Example:
>>> from src.agents import web_scraping
>>> result = web_scraping("Find top 5 AI startups and their funding")
"""
import io
import contextlib
from typing import Optional, Any
from langchain.agents import initialize_agent, AgentType
from ..prompts import get_scraping_prompt
class WebScrapingAgent:
    """AI-powered agent that extracts structured data from web pages.

    Wraps a LangChain structured-chat agent around SmartScraperTool so
    callers can describe the data they want in plain English.

    Attributes:
        model: LLM used to interpret extraction requests.
        tools: Scraping tools available to the agent; empty when the
            optional langchain-scrapegraph package is not installed.

    Example:
        >>> agent = WebScrapingAgent(model=azure_llm)
        >>> data = agent.scrape("List 10 popular Python libraries")
    """

    def __init__(self, model: Any):
        """Store the LLM and build the scraping toolset.

        Args:
            model: LLM instance used to understand scrape requests.

        Note:
            SmartScraperTool requires the SGAI_API_KEY environment
            variable to be set.
        """
        self.model = model
        self.tools = self._create_tools()

    def _create_tools(self):
        """Build the list of web scraping tools.

        Returns:
            A one-element list holding SmartScraperTool, or an empty
            list when langchain-scrapegraph is not importable.
        """
        try:
            from langchain_scrapegraph.tools import SmartScraperTool
        except ImportError:
            # Optional dependency missing — degrade to an empty toolset.
            print("Warning: langchain-scrapegraph not installed")
            return []
        return [SmartScraperTool()]

    def scrape(self, query: str) -> str:
        """Run a natural-language extraction request against the web.

        Args:
            query: Plain-English description of the data to extract.
                Example: "Find top 10 AI companies and their funding"

        Returns:
            str: Structured results from the agent, or an error message
            when tools are unavailable or the run fails.

        Example:
            >>> result = agent.scrape("List 5 trending GitHub repos")
        """
        if not self.tools:
            return "Error: Web scraping tools not available. Install langchain-scrapegraph."

        try:
            # Structured-chat agent gives the tool its JSON-schema inputs.
            scraper = initialize_agent(
                tools=self.tools,
                llm=self.model,
                agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
                verbose=True,
            )
            prompt = get_scraping_prompt(query)
            # Suppress the agent's verbose console chatter; only the
            # final answer is surfaced to the caller.
            sink = io.StringIO()
            with contextlib.redirect_stdout(sink):
                return scraper.run(prompt)
        except Exception as e:
            return f"Web scraping error: {e}"
def web_scraping(
    question: str,
    model: Optional[Any] = None
) -> str:
    """Extract data from the web using natural language instructions.

    Convenience wrapper around WebScrapingAgent for one-off extraction
    tasks.

    Args:
        question: Natural language description of data to extract.
        model: LLM model to use. Required: when omitted or None, an
            error string is returned (there is no global-model fallback).

    Returns:
        str: Extracted data, or an error message when no model is given.

    Example:
        >>> data = web_scraping("Find 5 popular ML frameworks", model)
    """
    # Fail fast with a clear message instead of raising deep inside
    # agent construction.
    if model is None:
        return "Error: No LLM model provided."
    agent = WebScrapingAgent(model=model)
    return agent.scrape(question)