File size: 4,464 Bytes
5a3fcad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
"""
Web Scraping Agent

This module implements the AI-powered web scraping agent that extracts
structured data from web pages using the SmartScraperTool.

The agent uses natural language instructions to:
1. Identify target data based on keywords
2. Extract specified quantities of items
3. Structure results as pandas DataFrames

Example:
    >>> from src.agents import web_scraping
    >>> result = web_scraping("Find top 5 AI startups and their funding")
"""

import io
import contextlib
from typing import Optional, Any

from langchain.agents import initialize_agent, AgentType

from ..prompts import get_scraping_prompt


class WebScrapingAgent:
    """
    AI-driven agent that pulls structured data out of web pages.

    Wraps LangChain's structured-chat agent around SmartScraperTool so
    that callers can describe the desired data in plain English.

    Attributes:
        model: LLM instance used to interpret extraction requests.
        tools: Scraping tools available to the agent (may be empty if
            the optional dependency is missing).

    Example:
        >>> agent = WebScrapingAgent(model=azure_llm)
        >>> data = agent.scrape("List 10 popular Python libraries")
    """

    def __init__(self, model: Any):
        """
        Set up the agent with a model and its scraping tools.

        Args:
            model: LLM instance for understanding requests.

        Note:
            SmartScraperTool needs the SGAI_API_KEY environment
            variable to function.
        """
        self.model = model
        self.tools = self._create_tools()

    def _create_tools(self):
        """
        Build the list of scraping tools.

        Returns:
            A list holding SmartScraperTool, or an empty list when the
            langchain-scrapegraph package is not installed.

        Note:
            SmartScraperTool also requires a valid ScrapeGraphAI API key.
        """
        try:
            from langchain_scrapegraph.tools import SmartScraperTool
        except ImportError:
            print("Warning: langchain-scrapegraph not installed")
            return []
        return [SmartScraperTool()]

    def scrape(self, query: str) -> str:
        """
        Run a natural-language extraction request against the web.

        The agent interprets the query (keywords, item counts), issues
        targeted scraping calls, and returns the structured result.

        Args:
            query: Plain-English description of the data to extract,
                e.g. "Find top 10 AI companies and their funding".

        Returns:
            str: Structured extraction result, or an error message.

        Example:
            >>> result = agent.scrape("List 5 trending GitHub repos")
        """
        # Without tools there is nothing to run — bail out early.
        if not self.tools:
            return "Error: Web scraping tools not available. Install langchain-scrapegraph."

        try:
            # Structured-chat agent so the tool's schema is respected.
            executor = initialize_agent(
                tools=self.tools,
                llm=self.model,
                agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
                verbose=True
            )

            # Wrap the raw query in the project's scraping instructions.
            instructions = get_scraping_prompt(query)

            # Swallow the verbose agent chatter so it doesn't hit stdout.
            sink = io.StringIO()
            with contextlib.redirect_stdout(sink):
                answer = executor.run(instructions)

            return answer

        except Exception as e:
            return f"Web scraping error: {e}"


def web_scraping(
    question: str,
    model: Optional[Any] = None
) -> str:
    """
    Convenience entry point for natural-language web data extraction.

    Thin wrapper that builds a one-off WebScrapingAgent and forwards
    the question to it.

    Args:
        question: Plain-English description of the data to extract.
        model: LLM instance; when omitted an error string is returned.

    Returns:
        str: Extracted data, or an error message.

    Example:
        >>> data = web_scraping("Find 5 popular ML frameworks", model)
    """
    # Guard clause: an agent cannot be built without a model.
    if model is None:
        return "Error: No LLM model provided."

    return WebScrapingAgent(model=model).scrape(question)