# WebAPI / main.py
from fake_useragent import UserAgent
import copy
import time
import asyncio
import urllib.parse
import requests
import json
import re
from bs4 import BeautifulSoup as bs
import urllib
from concurrent.futures import ThreadPoolExecutor
from patterns import patterns
from urllib.parse import urlparse, parse_qs
ua = UserAgent()
patterns = patterns()
async def bing_serach(query, collection, ifextract=False, start: int = 0):
    """Run a web search for *query* and return parsed results, optionally
    extracting each result page and caching it in *collection*.

    NOTE(review): despite the name, this queries google.com (not Bing), and
    *start* is currently unused — confirm both are intentional.

    Args:
        query: Search terms; URL-encoded before the request.
        collection: MongoDB-style cache keyed by URL (must support
            find_one / insert_one).
        ifextract: When True, fetch each result page via extract_web and
            cache the extracted content in *collection*.
        start: Unused pagination offset, kept for interface compatibility.

    Returns:
        dict with keys 'count' (always the string "none" — no result count
        is parsed from the page) and 'result' (list of dicts with
        'Abstract', 'Title', 'URL', plus 'webpage'/'time'/'from' fields
        when ifextract is True).
    """
    count = "none"
    r = requests.get(
        f"https://www.google.com/search?q={urllib.parse.quote_plus(query)}"
    )
    soup = bs(r.text, 'html.parser')
    main = soup.find_all("div", id='main')[0]
    result_divs = [z for z in main.children if z.name == "div"]
    results = []
    for z in result_divs:
        parsed = _parse_search_div(z)
        if parsed is not None:
            results.append(parsed)
    if not ifextract:
        return {'count': count, 'result': results}
    # Resolve each result either from the cache collection or by scheduling
    # a fresh extraction task.
    for result in results:
        previous_data = collection.find_one({"URL": result['URL']})
        if previous_data is None:
            # NOTE(review): extract_web performs a blocking requests.get,
            # so these tasks do not actually run concurrently — confirm.
            result['webpage'] = asyncio.create_task(extract_web(result))
            result['time'] = time.time()
        else:
            print(f"This is Taken from cache {result['URL']}\n\n")
            result['webpage'] = previous_data['webpage']
            # Older cache documents may predate the embedding_data field.
            if 'embedding_data' in previous_data:
                result['embedding_data'] = previous_data['embedding_data']
            else:
                print(f"embedding_data not exist in {result['URL']}")
            result['from'] = "cache"
    for result in results:
        if result.get('from') == "cache":
            # Bug fix: cached entries were previously re-inserted into the
            # collection on every search, creating duplicate documents.
            continue
        try:
            result['webpage'] = await result['webpage']
        except Exception as e:
            print(f"extraction task failed for {result['URL']}: {e}")
        # Deep-copy so the stored document is independent of the returned
        # result (insert_one also mutates its argument by adding '_id').
        collection.insert_one(copy.deepcopy(result))
    return {'count': count, 'result': results}


def _parse_search_div(z):
    """Parse one top-level result <div> from the lite-HTML search page.

    Tries the 'DnJfK' layout first; on an exception falls back to a
    span-based layout (mirroring the original try/except control flow).
    Returns a {'Abstract', 'Title', 'URL'} dict, or None when the div is
    not a recognizable result.
    """
    kCrYT = z.find_all("div", class_="kCrYT")
    if not kCrYT:
        return None
    try:
        anchor = kCrYT[0].find_all("a")[0]
        heads = anchor.find_all(class_="DnJfK")
        if not heads:
            # Layout 1 absent without an error: original code skipped the
            # div entirely in this case (no fallback attempt).
            return None
        return {
            'Abstract': kCrYT[1].text,
            'Title': heads[0].find_all("h3")[0].text,
            # Result hrefs are redirect links; the target URL lives in
            # the 'q' query parameter.
            'URL': parse_qs(urlparse(anchor.get('href')).query).get('q', [""])[0],
        }
    except Exception as e:
        try:
            anchor = kCrYT[1].find_all("a")[0]
            if len(anchor.find_all("span")) > 1:
                return {
                    'Abstract': kCrYT[0].text,
                    'Title': anchor.find_all("span")[0].text,
                    'URL': parse_qs(urlparse(anchor.get('href')).query).get('q', [""])[0],
                }
            print("Method 2 Failed")
        except Exception:
            # Report the layout-1 error that triggered the fallback.
            print(e)
    return None
async def extract_web(result):
    """Fetch result['URL'] and extract its content via the first pattern
    in the module-level *patterns* list whose 'recode' regex matches.

    On success the extracted content is also stored on the dict under
    result['webpage'] with a timestamp in result['time'] (mutates
    *result* in place).

    Returns:
        The extracted content on success, the string
        'Some Error while Extracting' on any failure, or None when no
        pattern matches the URL (preserving the original implicit return).
    """
    try:
        headers = {
            'User-Agent': ua.random
        }
        try:
            # SECURITY NOTE(review): verify=False disables TLS certificate
            # validation — confirm this is intentional.
            content = requests.get(result['URL'], headers=headers, verify=False)
        except requests.RequestException as e:
            # Bug fix: the original printed a message but fell through with
            # *content* undefined, raising NameError on the next line.
            print(f"some Error While Initial Request: {e}")
            return 'Some Error while Extracting'
        print(content.status_code)
        for pattern in patterns:
            if re.match(pattern['recode'], result['URL']):
                extracted = pattern['function'](content)
                result['webpage'] = extracted
                result['time'] = time.time()
                return extracted
        # No pattern matched the URL: return None explicitly.
        return None
    except Exception as e:
        print(str(e))
        return 'Some Error while Extracting'