from fake_useragent import UserAgent
import copy
import time
import asyncio
import re
import urllib.parse
from urllib.parse import urlparse, parse_qs

import requests
from bs4 import BeautifulSoup as bs

from patterns import patterns

ua = UserAgent()
# patterns() is expected to return entries with a 'recode' regex and a
# 'function' extractor, as consumed by extract_web below.
patterns = patterns()


async def bing_search(query, collection, ifextract=False, start: int = 0):
    """Scrape search results (despite the name, this hits Google's HTML
    endpoint) and optionally extract and cache each result page."""
    count = "none"  # this endpoint does not expose a total result count
    r = requests.get(
        f"https://www.google.com/search?q={urllib.parse.quote_plus(query)}&start={start}"
    )

    soup = bs(r.text, 'html.parser')
    main = soup.find_all("div", id='main')[0]
    blocks = [z for z in main.children if z.name == "div"]

    results = []
    for z in blocks:
        kCrYT = z.find_all("div", class_="kCrYT")
        if not kCrYT:
            continue
        try:
            # Method 1: the title sits in an <h3> inside a DnJfK container.
            if len(kCrYT[0].find_all("a")[0].find_all(class_="DnJfK")) > 0:
                abstract = kCrYT[1].text
                title = kCrYT[0].find_all("a")[0].find_all(class_="DnJfK")[0].find_all("h3")[0].text
                url = kCrYT[0].find_all("a")[0].get('href')
                results.append({
                    'Abstract': abstract,
                    'Title': title,
                    # hrefs look like /url?q=<target>&...; recover the q parameter.
                    'URL': parse_qs(urlparse(url).query).get('q', [""])[0],
                })
        except Exception as e:
            try:
                # Method 2: the title sits in the first <span> of the second block.
                if len(kCrYT[1].find_all("a")[0].find_all("span")) > 1:
                    abstract = kCrYT[0].text
                    title = kCrYT[1].find_all("a")[0].find_all("span")[0].text
                    url = kCrYT[1].find_all("a")[0].get('href')
                    results.append({
                        'Abstract': abstract,
                        'Title': title,
                        'URL': parse_qs(urlparse(url).query).get('q', [""])[0],
                    })
                else:
                    print("Method 2 failed")
            except Exception:
                print(e)

    if not ifextract:
        return {'count': count, 'result': results}

    # Schedule extraction for uncached URLs; reuse cached pages otherwise.
    for result in results:
        previous_data = collection.find_one({"URL": result['URL']})
        if previous_data is None:
            result['webpage'] = asyncio.create_task(extract_web(result))
            result['time'] = time.time()
        else:
            print(f"Taken from cache: {result['URL']}\n")
            result['webpage'] = previous_data['webpage']
            if 'embedding_data' in previous_data:
                result['embedding_data'] = previous_data['embedding_data']
            else:
                print(f"embedding_data does not exist for {result['URL']}")
            result['from'] = "cache"

    for result in results:
        if isinstance(result['webpage'], asyncio.Task):
            result['webpage'] = await result['webpage']
            # Write only freshly fetched pages back to the cache; deepcopy keeps
            # the _id that insert_one adds out of the returned results.
            dummy_result = copy.deepcopy(result)
            collection.insert_one(dummy_result)

    return {'count': count, 'result': results}


async def extract_web(result):
    """Fetch a result URL and run the first extractor whose regex matches it."""
    try:
        headers = {'User-Agent': ua.random}
        # verify=False disables TLS certificate checks, as in the original request.
        content = requests.get(result['URL'], headers=headers, verify=False)
        print(content.status_code)
        for pattern in patterns:
            if re.match(pattern['recode'], result['URL']):
                thisr = pattern['function'](content)
                result['webpage'] = thisr
                result['time'] = time.time()
                return thisr
        return 'No extraction pattern matched'
    except Exception as e:
        print(str(e))
        return 'Some error while extracting'
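

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the module's confirmed API). It assumes a
# local MongoDB instance reachable through pymongo and that the `patterns`
# module above is importable; the database and collection names below are
# hypothetical placeholders. The cache collection only needs the find_one and
# insert_one methods that bing_search calls.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")
    cache = client["search"]["web_cache"]  # hypothetical cache collection

    async def demo():
        # Run one query, extracting and caching each result page.
        out = await bing_search("python asyncio tutorial", cache, ifextract=True)
        for item in out['result']:
            print(item['Title'], '->', item['URL'])

    asyncio.run(demo())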