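"""CrewAI web-scraping tool: searches Bing for a query, fetches the top
results through the r.jina.ai reader proxy, and summarizes long pages
with a DistilBART summarization pipeline."""
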
import requests
from bs4 import BeautifulSoup
from crewai_tools import tool
from transformers import pipeline

@tool
def web_scraper_tool(query: str) -> str:
    """
    A tool to scrape web data based on the given query.

    Parameters:
    - query (str): The search query for web scraping.

    Returns:
    - str: The collected data including titles, links, snippets, and full page content.
    """
    def scrape_jina_ai(url: str) -> str:
        """Fetch a readable text rendering of the page via the r.jina.ai proxy."""
        response = requests.get("https://r.jina.ai/" + url, timeout=30)
        if response.status_code != 200:
            return f"Failed to retrieve content: {response.status_code}"
        return response.text

    url = 'https://www.bing.com/search'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    params = {'q': query}
    response = requests.get(url, headers=headers, params=params, timeout=30)

    if response.status_code != 200:
        return f"Failed to retrieve results: {response.status_code}"

    soup = BeautifulSoup(response.text, 'html.parser')

    results = []
    for result in soup.find_all('li', class_='b_algo'):
        title_tag, link_tag = result.find('h2'), result.find('a')
        if title_tag is None or link_tag is None:
            continue  # Skip malformed result blocks
        snippet_tag = result.find('p')
        snippet = snippet_tag.text if snippet_tag else 'No description available'
        results.append({'title': title_tag.text, 'link': link_tag['href'], 'snippet': snippet})

    # Limit to first 2 results
    results = results[:2]

    collected_data = ""
    for result in results:
        link = result['link']
        page_content = scrape_jina_ai(link)
        
        # Truncate very long pages to a manageable size (word-based)
        words = page_content.split()
        if len(words) > 10000:  # Adjust as needed
            page_content = ' '.join(words[:10000])

        # Summarize content if it is still long
        if len(words) > 500:  # Adjust the limit as needed
            page_content = summarize_content(page_content)

        collected_data += f"Title: {result['title']}\nLink: {link}\nSnippet: {result['snippet']}\nContent: {page_content}\n\n"

    return collected_data

_summarizer = None  # Lazily-created summarization pipeline, reused across calls

def summarize_content(content: str) -> str:
    global _summarizer
    if _summarizer is None:
        # Loading the model is expensive, so create the pipeline once and reuse it
        _summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
    summarizer = _summarizer

    def chunk_text(text: str, max_chunk_size: int) -> list:
        """Divide text into chunks of a maximum size."""
        words = text.split()
        return [' '.join(words[i:i + max_chunk_size]) for i in range(0, len(words), max_chunk_size)]

    # Keep each chunk under the model's 1024-token input limit;
    # ~700 words leaves headroom since a word often maps to several tokens
    chunk_size = 700
    chunks = chunk_text(content, max_chunk_size=chunk_size)

    # Summarize each chunk
    chunk_summaries = []
    for chunk in chunks:
        try:
            summary = summarizer(chunk, max_length=150, min_length=30, do_sample=False, truncation=True)
            chunk_summaries.append(summary[0]['summary_text'])
        except Exception as e:
            print(f"Error summarizing chunk: {e}")

    # Condense the combined chunk summaries into a single summary;
    # truncation guards against the joined text exceeding the input limit
    if chunk_summaries:
        try:
            combined_summary = summarizer(' '.join(chunk_summaries), max_length=300, min_length=30, do_sample=False, truncation=True)
            return combined_summary[0]['summary_text']
        except Exception as e:
            print(f"Error summarizing combined summaries: {e}")
            return 'Summary could not be generated.'
    else:
        return 'No content to summarize.'
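

# A minimal usage sketch, run only when this file is executed directly.
# The @tool decorator wraps the function in a tool object, so .run() is
# used here; depending on the installed crewai_tools version a plain
# call may also work. The query string is just an example.
if __name__ == "__main__":
    print(web_scraper_tool.run("latest developments in retrieval-augmented generation"))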