import requests
from bs4 import BeautifulSoup
from crewai_tools import tool
from transformers import pipeline


@tool
def web_scraper_tool(query: str) -> str:
    """
    A tool that scrapes web data for the given query.

    Parameters:
    - query (str): The search query for web scraping.

    Returns:
    - str: The collected data, including titles, links, snippets, and
      (summarized) full page content for the top results.
    """
    def scrape_jina_ai(url: str) -> str:
        """Fetch a page as clean text via the r.jina.ai reader proxy."""
        response = requests.get("https://r.jina.ai/" + url)
        if response.status_code != 200:
            return f"Failed to retrieve content: {response.status_code}"
        return response.text

    # Query Bing's search results page with a desktop User-Agent so we get
    # the standard HTML layout that the selectors below expect.
    url = 'https://www.bing.com/search'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    params = {'q': query}
    response = requests.get(url, headers=headers, params=params)
    if response.status_code != 200:
        return f"Failed to retrieve results: {response.status_code}"

    # Each organic Bing result sits in an <li class="b_algo"> element.
    soup = BeautifulSoup(response.text, 'html.parser')
    results = []
    for result in soup.find_all('li', class_='b_algo'):
        title_tag = result.find('h2')
        link_tag = result.find('a')
        if not title_tag or not link_tag or not link_tag.get('href'):
            continue  # Skip malformed results rather than crashing
        snippet_tag = result.find('p')
        snippet = snippet_tag.text if snippet_tag else 'No description available'
        results.append({'title': title_tag.text, 'link': link_tag['href'], 'snippet': snippet})

    # Limit to the first 2 results
    results = results[:2]

    collected_data = ""
    for result in results:
        link = result['link']
        page_content = scrape_jina_ai(link)

        # Initial truncation to a more manageable size (by words, to match
        # the word-count check)
        words = page_content.split()
        if len(words) > 10000:  # Adjust as needed
            page_content = ' '.join(words[:10000])

        # Summarize content if it is still long
        if len(page_content.split()) > 500:  # Adjust the limit as needed
            page_content = summarize_content(page_content)

        collected_data += (
            f"Title: {result['title']}\nLink: {link}\n"
            f"Snippet: {result['snippet']}\nContent: {page_content}\n\n"
        )

    return collected_data


def summarize_content(content: str) -> str:
    # Note: this loads the model on every call; cache the pipeline at module
    # level if the tool is invoked frequently.
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

    def chunk_text(text: str, max_chunk_size: int) -> list:
        """Divide text into chunks of at most max_chunk_size words."""
        words = text.split()
        return [' '.join(words[i:i + max_chunk_size])
                for i in range(0, len(words), max_chunk_size)]

    # distilbart-cnn-12-6 accepts at most 1024 input tokens, so keep chunks
    # comfortably below that (a word can tokenize to more than one token).
    chunk_size = 700
    chunks = chunk_text(content, max_chunk_size=chunk_size)

    # Summarize each chunk; truncation=True guards against chunks that still
    # tokenize past the model's input limit.
    chunk_summaries = []
    for chunk in chunks:
        try:
            summary = summarizer(chunk, max_length=256, min_length=30,
                                 do_sample=False, truncation=True)
            chunk_summaries.append(summary[0]['summary_text'])
        except Exception as e:
            print(f"Error summarizing chunk: {e}")

    # Further condense the combined chunk summaries into a single summary.
    if chunk_summaries:
        try:
            combined_summary = summarizer(' '.join(chunk_summaries),
                                          max_length=512, min_length=30,
                                          do_sample=False, truncation=True)
            return combined_summary[0]['summary_text']
        except Exception as e:
            print(f"Error summarizing combined summaries: {e}")
            return 'Summary could not be generated.'
    else:
        return 'No content to summarize.'
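
# --- Usage sketch (not part of the original tool) --------------------------
# A minimal smoke test, assuming the crewai_tools @tool decorator exposes the
# wrapped function through .run() (the exact invocation can vary by version),
# and that the query string below is just an illustrative example. In a real
# crew you would instead pass web_scraper_tool in an Agent's tools=[...] list.
if __name__ == "__main__":
    output = web_scraper_tool.run(query="latest developments in AI agents")
    print(output)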