import requests
from bs4 import BeautifulSoup
from crewai_tools import tool
from transformers import pipeline
@tool
def web_scraper_tool(query: str) -> str:
    """
    A tool to scrape web data based on the given query.

    Parameters:
    - query (str): The search query for web scraping.

    Returns:
    - str: The collected data including titles, links, snippets, and full page content.
    """
    def scrape_jina_ai(url: str) -> str:
        # Fetch a reader-friendly plain-text rendering of the page via the Jina AI reader.
        response = requests.get("https://r.jina.ai/" + url, timeout=30)
        if response.status_code != 200:
            return f"Failed to retrieve content: {response.status_code}"
        return response.text
    url = 'https://www.bing.com/search'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    params = {'q': query}
    response = requests.get(url, headers=headers, params=params, timeout=30)
    if response.status_code != 200:
        return f"Failed to retrieve results: {response.status_code}"

    soup = BeautifulSoup(response.text, 'html.parser')
    results = []
    for result in soup.find_all('li', class_='b_algo'):
        title_tag = result.find('h2')
        link_tag = result.find('a')
        if title_tag is None or link_tag is None:
            continue  # Skip malformed result blocks instead of raising AttributeError
        snippet_tag = result.find('p')
        snippet = snippet_tag.text if snippet_tag else 'No description available'
        results.append({'title': title_tag.text, 'link': link_tag['href'], 'snippet': snippet})
    # Limit to the first 2 results
    results = results[:2]

    collected_data = ""
    for result in results:
        link = result['link']
        page_content = scrape_jina_ai(link)

        # Initial truncation to a more manageable size (word-based, matching the check above)
        words = page_content.split()
        if len(words) > 10000:  # Adjust as needed
            page_content = ' '.join(words[:10000])

        # Summarize content if necessary
        if len(page_content.split()) > 500:  # Adjust the limit as needed
            page_content = summarize_content(page_content)

        collected_data += (
            f"Title: {result['title']}\nLink: {link}\n"
            f"Snippet: {result['snippet']}\nContent: {page_content}\n\n"
        )
    return collected_data
def summarize_content(content: str) -> str:
    # Note: building the pipeline on every call is slow; hoist it to module
    # scope if the tool is invoked repeatedly.
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

    def chunk_text(text: str, max_chunk_size: int) -> list:
        """Divide text into chunks of at most max_chunk_size words."""
        words = text.split()
        return [' '.join(words[i:i + max_chunk_size]) for i in range(0, len(words), max_chunk_size)]

    # DistilBART's encoder accepts at most 1024 tokens, so each chunk must stay
    # well below that; ~500 words usually tokenizes to fewer than 1024 tokens.
    chunk_size = 500
    chunks = chunk_text(content, max_chunk_size=chunk_size)
    # Summarize each chunk
    chunk_summaries = []
    for chunk in chunks:
        try:
            summary = summarizer(chunk, max_length=150, min_length=30, do_sample=False)
            chunk_summaries.append(summary[0]['summary_text'])
        except Exception as e:
            print(f"Error summarizing chunk: {e}")

    # Further summarize the combined summaries; max_length must stay within the
    # model's 1024-token decoder limit.
    if chunk_summaries:
        try:
            combined_summary = summarizer(' '.join(chunk_summaries), max_length=256, min_length=30, do_sample=False)
            return combined_summary[0]['summary_text']
        except Exception as e:
            print(f"Error summarizing combined summaries: {e}")
            return 'Summary could not be generated.'
    else:
        return 'No content to summarize.'
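
# --- Example usage (a minimal sketch, not part of the original tool) ---
# Assumes network access and that the crewai_tools `@tool` decorator exposes
# the wrapped function through the tool object's `.run()` method; if your
# crewai_tools version differs, register the tool with a CrewAI Agent via
# `tools=[web_scraper_tool]` and let the agent invoke it instead.
if __name__ == "__main__":
    print(web_scraper_tool.run("latest advances in retrieval augmented generation"))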