Spaces:
Build error
Build error
| import requests | |
| from bs4 import BeautifulSoup | |
| from crewai_tools import tool | |
| from transformers import pipeline | |
def web_scraper_tool(query: str) -> str:
    """
    Scrape web data based on the given query.

    Searches Bing for *query*, keeps the top two organic results, fetches a
    readable rendering of each result page via the r.jina.ai proxy,
    summarizes pages that are still long, and concatenates everything into
    one report string.

    Parameters:
    - query (str): The search query for web scraping.

    Returns:
    - str: The collected data including titles, links, snippets, and full
      page content, or an error message if the search request fails.
    """
    def scrape_jina_ai(url: str) -> str:
        # r.jina.ai returns a plain-text readable rendering of the target page.
        try:
            # Always set a timeout: requests waits forever by default.
            response = requests.get("https://r.jina.ai/" + url, timeout=30)
        except requests.RequestException as exc:
            return f"Failed to retrieve content: {exc}"
        if response.status_code != 200:
            return f"Failed to retrieve content: {response.status_code}"
        return response.text

    url = 'https://www.bing.com/search'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    params = {'q': query}
    response = requests.get(url, headers=headers, params=params, timeout=30)
    if response.status_code != 200:
        return f"Failed to retrieve results: {response.status_code}"

    soup = BeautifulSoup(response.text, 'html.parser')
    results = []
    for result in soup.find_all('li', class_='b_algo'):
        # Skip malformed result entries instead of crashing on a missing tag.
        title_tag = result.find('h2')
        link_tag = result.find('a')
        if title_tag is None or link_tag is None or not link_tag.get('href'):
            continue
        snippet_tag = result.find('p')
        results.append({
            'title': title_tag.text,
            'link': link_tag['href'],
            'snippet': snippet_tag.text if snippet_tag else 'No description available',
        })

    # Limit to first 2 results
    results = results[:2]

    collected_data = ""
    for result in results:
        link = result['link']
        page_content = scrape_jina_ai(link)
        # Initial truncation to a manageable size. Fix: the original tested
        # the WORD count but sliced CHARACTERS; truncate on words so the
        # check and the cut use the same unit.
        words = page_content.split()
        if len(words) > 10000:  # Adjust as needed
            page_content = ' '.join(words[:10000])
        # Summarize content if it is still long
        if len(page_content.split()) > 500:  # Adjust the limit as needed
            page_content = summarize_content(page_content)
        collected_data += (
            f"Title: {result['title']}\nLink: {link}\n"
            f"Snippet: {result['snippet']}\nContent: {page_content}\n\n"
        )
    return collected_data
def summarize_content(content: str) -> str:
    """
    Summarize long text with the distilbart-cnn-12-6 summarization model.

    The text is split into word chunks small enough for the model, each
    chunk is summarized independently, and the chunk summaries are then
    summarized once more into a single final summary.

    Parameters:
    - content (str): The text to summarize.

    Returns:
    - str: The final summary, 'Summary could not be generated.' if the
      final pass fails, or 'No content to summarize.' when no chunk could
      be summarized.
    """
    # Build the pipeline once and cache it on the function object: loading
    # the model is expensive, and the original rebuilt it on every call.
    summarizer = getattr(summarize_content, "_summarizer", None)
    if summarizer is None:
        summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
        summarize_content._summarizer = summarizer

    def chunk_text(text: str, max_chunk_size: int) -> list:
        """Divide text into chunks of at most max_chunk_size words."""
        words = text.split()
        return [' '.join(words[i:i + max_chunk_size]) for i in range(0, len(words), max_chunk_size)]

    # distilbart-cnn-12-6 accepts at most 1024 input tokens. The original
    # 2500-word chunks overflow that limit; since one English word is
    # roughly 1.3 tokens, 700 words stays safely below it.
    chunk_size = 700
    chunks = chunk_text(content, max_chunk_size=chunk_size)

    # Summarize each chunk; a failing chunk is logged and skipped so the
    # rest of the document can still be summarized (best-effort behavior).
    chunk_summaries = []
    for chunk in chunks:
        try:
            # max_length caps the generated tokens; the original 1024/4096
            # values exceed the model's generation window.
            summary = summarizer(chunk, max_length=512, min_length=30, do_sample=False)
            chunk_summaries.append(summary[0]['summary_text'])
        except Exception as e:
            print(f"Error summarizing chunk: {e}")

    if not chunk_summaries:
        return 'No content to summarize.'

    # Further summarize the combined summaries into one final result.
    try:
        combined_summary = summarizer(' '.join(chunk_summaries), max_length=512, min_length=30, do_sample=False)
        return combined_summary[0]['summary_text']
    except Exception as e:
        print(f"Error summarizing combined summaries: {e}")
        return 'Summary could not be generated.'