from fake_useragent import UserAgent
import copy
import time
import asyncio
import urllib.parse
import requests
import json
import re
from bs4 import BeautifulSoup as bs
from concurrent.futures import ThreadPoolExecutor
from patterns import patterns
from urllib.parse import urlparse, parse_qs

ua = UserAgent()          # random User-Agent strings for the page-extraction requests
patterns = patterns()     # site-specific extraction patterns (see note below)
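# Assumption (inferred from how `patterns` is used in extract_web below): each entry
# is a dict along the lines of
#   {'recode': r'https?://(www\.)?example\.com/.*',  # regex matched against the result URL
#    'function': some_extractor,                     # callable that takes the requests.Response
#    'Title': 'Example extractor', 'id': 1}
# The real definitions live in the local `patterns` module and are not shown here.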
async def bing_serach(query, collection, ifextract=False, start: int = 0):
    """Scrape Google's lightweight HTML results page for `query` (despite the
    function name) and, when `ifextract` is set, fetch and cache the linked
    pages through the given MongoDB `collection`."""
    count = "none"
    r = requests.get(f"https://www.google.com/search?q={urllib.parse.quote_plus(query)}")
    soup = bs(r.text, 'html.parser')
    main = soup.find_all("div", id='main')[0]
    blocks = [z for z in main.children if z.name == "div"]
    results = []
    for z in blocks:
        kCrYT = z.find_all("div", class_="kCrYT")
        if len(kCrYT) > 0:
            try:
                # Layout 1: title link in the first kCrYT block, abstract in the second.
                if len(kCrYT[0].find_all("a")[0].find_all(class_="DnJfK")) > 0:
                    abstract = kCrYT[1].text
                    title = kCrYT[0].find_all("a")[0].find_all(class_="DnJfK")[0].find_all("h3")[0].text
                    url = kCrYT[0].find_all("a")[0].get('href')
                    results.append({
                        'Abstract': abstract,
                        'Title': title,
                        # The href is a Google redirect; the real target sits in its `q` parameter.
                        'URL': parse_qs(urlparse(url).query).get('q', [""])[0]
                    })
            except Exception as e:
                try:
                    # Layout 2: abstract in the first kCrYT block, title link in the second.
                    if len(kCrYT[1].find_all("a")[0].find_all("span")) > 1:
                        abstract = kCrYT[0].text
                        title = kCrYT[1].find_all("a")[0].find_all("span")[0].text
                        url = kCrYT[1].find_all("a")[0].get('href')
                        results.append({
                            'Abstract': abstract,
                            'Title': title,
                            'URL': parse_qs(urlparse(url).query).get('q', [""])[0]
                        })
                    else:
                        print("Method 2 failed")
                except Exception:
                    print(e)
    if ifextract:
        for i, result in enumerate(results):
            previous_data = collection.find_one({"URL": result['URL']})
            if previous_data is None:
                # Not cached yet: download and parse the page concurrently.
                result['webpage'] = asyncio.create_task(extract_web(result))
                result['time'] = time.time()
            else:
                print(f"Taken from cache: {result['URL']}\n")
                result['webpage'] = previous_data['webpage']
                try:
                    result['embedding_data'] = previous_data['embedding_data']
                except KeyError:
                    print(f"embedding_data does not exist for {result['URL']}")
                result['from'] = "cache"
        for result in results:
            try:
                result['webpage'] = await result['webpage']
            except Exception:
                # Cached results already hold a plain value rather than a task.
                pass
            if result.get('from') != "cache":
                # Only newly fetched pages are written back; cached ones are already stored.
                dummy_result = copy.deepcopy(result)  # independent copy, safe to insert
                collection.insert_one(dummy_result)
        return {'count': count, 'result': results}
    else:
        return {'count': count, 'result': results}
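# Illustration of the redirect unwrapping used above (the URL is made up for the example).
# Google's lightweight result links look like "/url?q=<target>&sa=U&...", so the real
# destination is read from the `q` query parameter:
#   parse_qs(urlparse("/url?q=https://example.com/page&sa=U").query).get('q', [""])[0]
#   -> "https://example.com/page"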
async def extract_web(result):
    """Download result['URL'] and run the first matching site pattern on the response."""
    try:
        try:
            headers = {'User-Agent': ua.random}
            content = requests.get(result['URL'], headers=headers, verify=False)
        except Exception:
            print("Error while making the initial request")
            return 'Some Error while Extracting'
        print(content.status_code)
        for pattern in patterns:
            if re.match(pattern['recode'], result['URL']):
                thisr = pattern['function'](content)
                result['webpage'] = thisr
                result['time'] = time.time()
                return thisr
        # No pattern matched this URL.
        return None
    except Exception as e:
        print(str(e))
        return 'Some Error while Extracting'
        # return ('There is some error with this pattern\n', 'Pattern Name', pattern['Title'], '\nPattern Id', pattern['id'])
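# Minimal usage sketch, assuming a local MongoDB instance; the connection string,
# database, and collection names below are placeholders, not part of the original code.
if __name__ == "__main__":
    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")   # assumed connection string
    cache = client["scraper"]["search_cache"]           # assumed database/collection names
    out = asyncio.run(bing_serach("python web scraping", cache, ifextract=False))
    for item in out['result']:
        print(item['Title'], '->', item['URL'])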