from fake_useragent import UserAgent
import copy
import time
import asyncio
import re
import urllib.parse
from urllib.parse import urlparse, parse_qs

import requests
from bs4 import BeautifulSoup as bs

from patterns import patterns as load_patterns

ua = UserAgent()
patterns = load_patterns()
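# The shape below is inferred from how extract_web() consumes the external
# `patterns` module; the module itself is not shown here, so treat this as an
# assumption. Each pattern appears to pair a URL regex ("recode") with an
# extractor that receives the raw requests.Response:
#
#   patterns = [
#       {"recode": r"https?://(www\.)?example\.com/.*",   # hypothetical entry
#        "function": lambda response: response.text},     # extractor sketch
#   ]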
async def bing_search(query, collection, ifextract=False, start: int = 0):
    # Note: despite the name, this scrapes Google's no-JavaScript results page,
    # whose markup uses the kCrYT/DnJfK class names parsed below.
    count = "none"  # the result count is not parsed from this page
    # `start` is Google's pagination offset parameter.
    r = requests.get(
        f"https://www.google.com/search?q={urllib.parse.quote_plus(query)}&start={start}"
    )
    soup = bs(r.text, "html.parser")
    main = soup.find("div", id="main")
    results = []
    for z in (child for child in main.children if child.name == "div"):
        kCrYT = z.find_all("div", class_="kCrYT")
        if not kCrYT:
            continue
        try:
            # Layout 1: the title sits in an <h3> inside a DnJfK wrapper of the first block.
            anchor = kCrYT[0].find_all("a")[0]
            if anchor.find_all(class_="DnJfK"):
                results.append({
                    'Abstract': kCrYT[1].text,
                    'Title': anchor.find_all(class_="DnJfK")[0].find_all("h3")[0].text,
                    # Google links through a redirect; the real target is the `q` query param.
                    'URL': parse_qs(urlparse(anchor.get('href')).query).get('q', [""])[0],
                })
        except Exception as e:
            try:
                # Layout 2: the abstract comes first and the title is a <span> in the second block.
                anchor = kCrYT[1].find_all("a")[0]
                if len(anchor.find_all("span")) > 1:
                    results.append({
                        'Abstract': kCrYT[0].text,
                        'Title': anchor.find_all("span")[0].text,
                        'URL': parse_qs(urlparse(anchor.get('href')).query).get('q', [""])[0],
                    })
                else:
                    print("Method 2 Failed")
            except Exception:
                print(e)
    if ifextract:
        for result in results:
            previous_data = collection.find_one({"URL": result['URL']})
            if previous_data is None:
                # Not cached yet: schedule the fetch so all pages download concurrently.
                result['webpage'] = asyncio.create_task(extract_web(result))
                result['time'] = time.time()
            else:
                print(f"Taken from cache: {result['URL']}\n")
                result['webpage'] = previous_data['webpage']
                try:
                    result['embedding_data'] = previous_data['embedding_data']
                except KeyError:
                    print(f"embedding_data does not exist for {result['URL']}")
                result['from'] = "cache"
        for result in results:
            if isinstance(result['webpage'], asyncio.Task):
                result['webpage'] = await result['webpage']
            if result.get('from') != "cache":
                # Deep-copy before inserting so the `_id` that insert_one adds does
                # not leak into the returned results. Cache hits are not re-inserted,
                # which would otherwise duplicate documents in the collection.
                collection.insert_one(copy.deepcopy(result))
    return {'count': count, 'result': results}




async def extract_web(result):
    try:
        headers = {
            'User-Agent': ua.random
        }
        try:
            # verify=False tolerates broken certificates; timeout guards against hangs.
            content = requests.get(result['URL'], headers=headers, verify=False, timeout=10)
        except Exception as e:
            print(f"Error during initial request: {e}")
            return 'Some Error while Extracting'
        print(content.status_code)
        for pattern in patterns:
            if re.match(pattern['recode'], result['URL']):
                webpage = pattern['function'](content)
                result['webpage'] = webpage
                result['time'] = time.time()
                return webpage
        # No extraction pattern matched this URL.
        return None
    except Exception as e:
        print(str(e))
        return 'Some Error while Extracting'
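

# --- Usage sketch ---
# A minimal example of driving bing_search, not part of the original module.
# It assumes the cache collection comes from pymongo; the connection string and
# the "search_cache"/"pages" names below are placeholders.
if __name__ == "__main__":
    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")  # hypothetical local MongoDB
    cache = client["search_cache"]["pages"]            # hypothetical db/collection

    async def demo():
        # With ifextract=False only the result list is returned; nothing is cached.
        out = await bing_search("python asyncio tutorial", cache, ifextract=False)
        for item in out['result']:
            print(item['Title'], "->", item['URL'])

    asyncio.run(demo())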