Spaces:
Sleeping
Sleeping
| import requests | |
| from bs4 import BeautifulSoup | |
def get_urls(query, proxies=None):
    """Search Google for *query* and return the titles and links it finds.

    Args:
        query (str): The search terms; sent as the ``q`` URL parameter.
        proxies (dict | None): Optional proxy mapping forwarded to
            ``requests.get``.

    Returns:
        list[dict]: One ``{'title': ..., 'link': ...}`` dict per ``div.g``
        block whose first anchor points at an ``http``/``https`` URL. The
        title is an empty string when the block has no ``<h3>`` heading.

    Raises:
        requests.exceptions.RequestException: If the search request fails
            or does not complete within the timeout.
    """
    # Imported here so the function does not rely on the file's import block.
    from urllib.parse import parse_qs, urlparse

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'}
    # Pass the query through ``params`` so requests URL-encodes it; building
    # the URL by hand breaks on characters such as '&', '#' or '+'.
    # The timeout keeps an unresponsive server from hanging the caller forever.
    response = requests.get(
        "https://www.google.com/search",
        params={'q': query},
        headers=headers,
        proxies=proxies,
        timeout=8,
    )
    soup = BeautifulSoup(response.content, 'html.parser')

    results = []
    for g in soup.find_all('div', class_='g'):
        anchor = g.find('a')
        # Skip blocks with no anchor, or whose first anchor has no href,
        # instead of raising KeyError.
        link = anchor.get('href') if anchor is not None else None
        if not link:
            continue
        if link.startswith('/url?q='):
            # Redirect link: the real target is the ``q`` parameter and is
            # followed by tracking parameters ('&sa=...&ved=...'). Parsing the
            # query string extracts and URL-decodes just the target.
            targets = parse_qs(urlparse(link).query).get('q')
            if not targets:
                continue
            link = targets[0]
        if not link.startswith('http'):
            continue
        heading = g.find('h3')
        # A block without a heading keeps its link but gets an empty title
        # rather than crashing on ``None.text``.
        title = heading.text if heading is not None else ''
        results.append({'title': title, 'link': link})
    return results
def scrape_text(url, proxies=None) -> str:
    """Scrape the text content from a webpage.

    Args:
        url (str): The URL to scrape text from.
        proxies (dict | None): Optional proxy mapping forwarded to
            ``requests.get``.

    Returns:
        str: The page text with ``<script>`` and ``<style>`` content removed,
        whitespace trimmed, and one non-empty chunk per line. If the request
        fails, the literal string "Unable to connect to the server" is
        returned instead of raising.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36',
        'Content-Type': 'text/plain',
    }
    try:
        response = requests.get(url, headers=headers, proxies=proxies, timeout=8)
    except requests.exceptions.RequestException:
        # Only request failures are treated as "cannot connect"; a bare
        # ``except`` would also swallow KeyboardInterrupt and SystemExit.
        return "Unable to connect to the server"

    # requests falls back to ISO-8859-1 when the server declares no charset;
    # in that case trust the encoding detected from the body instead.
    if response.encoding == "ISO-8859-1":
        response.encoding = response.apparent_encoding

    soup = BeautifulSoup(response.text, "html.parser")
    # Drop script and style elements so their source does not end up in the text.
    for tag in soup(["script", "style"]):
        tag.extract()

    text = soup.get_text()
    # Trim each line, then break lines apart on runs of two spaces (which
    # typically separate adjacent headings or cells). Splitting on a single
    # space would put every word on its own line.
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # Discard empty chunks and emit one chunk per line.
    return "\n".join(chunk for chunk in chunks if chunk)
if __name__ == '__main__':
    # Manual smoke test: run one sample search, then print each hit followed
    # by the text scraped from its page.
    proxies = None
    txt = "What is LSTM?"
    max_search_result = 10

    hits = get_urls(txt, proxies)
    for hit in hits[:max_search_result]:
        print(hit)
        page_text = scrape_text(hit['link'], proxies)
        print(page_text)
        print("\n\n")