#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Nov  9 14:41:00 2023

@author: peter
"""

import urllib.parse
import urllib.robotparser
import re
import threading
import time
import heapdict
import requests
import bs4
import transformers
import tokenizers
import spacy
import torch
from allennlp.predictors.predictor import Predictor
import Statement
from vectordb import HNSWVectorDB
from docarray import DocList
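
# The local `Statement` module is not shown in this file. From its use
# below, it is assumed to define a docarray document schema roughly like
# the following sketch (field names are taken from the constructor calls
# in crawler_thread; the vector width and the negation operator used by
# `-statement` are guesses, not confirmed by the source):
#
#     from docarray import BaseDoc
#     from docarray.typing import NdArray
#
#     class Statement(BaseDoc):
#         url: str = ''
#         title: str = ''
#         vector: NdArray[768] = None
#
#         def __neg__(self):
#             return Statement(url=self.url, title=self.title,
#                              vector=-self.vector)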



class Crawler(object):
    
    def __init__(self, start):
        # Priority queue of URLs to crawl; heapdict pops the *lowest*
        # priority first, so more-negative scores are fetched sooner.
        self.frontier = heapdict.heapdict()
        self.frontier[start] = -1
        self.policies = {}
        # heapdict and set are not thread-safe, so shared state is
        # guarded by this lock in crawler_thread.
        self.lock = threading.Lock()
        self.tokenizer = tokenizers.Tokenizer.from_pretrained('roberta-base')
        self.pad_token = self.tokenizer.token_to_id('<pad>')
        # `transformers` has no `Transformer` class; AutoModel is the
        # standard loader (assumed to work for this checkpoint). The model
        # is moved to the GPU to match the token tensors built below.
        self.encoder = transformers.AutoModel.from_pretrained(
            'PlayfulTechnology/qarac-roberta-answer-encoder').to('cuda')
        self.db = HNSWVectorDB[Statement.Statement](space='cosine')
        model_url = "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz"
        self.predictor = Predictor.from_path(model_url)
        self.nlp = spacy.load('en_core_web_trf')
        self.visited = set()
    def candidates(self):
        """Yield frontier URLs whose scores mark them as worth fetching."""
        while len(self.frontier) > 0:
            (candidate, score) = self.frontier.popitem()
            if score < 0:
                yield candidate
                
    def __call__(self):
        # Stagger the worker start-up so early threads can populate the
        # frontier before later ones begin polling it.
        threads = [threading.Thread(target=self.crawler_thread)
                   for _ in range(16)]
        for thread in threads:
            thread.start()
            time.sleep(60)
        for thread in threads:
            thread.join()
        
    def crawler_thread(self):
        """Worker loop: pop the most promising URL, fetch and score its
        content, and feed its outgoing links back into the frontier."""
        running = True
        while running:
            with self.lock:
                if len(self.frontier) == 0:
                    running = False
                    continue
                (candidate, score) = self.frontier.popitem()
                self.visited.add(candidate)
            if score < 0:
                components = urllib.parse.urlparse(candidate)
                domain = '{0}://{1}'.format(components.scheme,
                                            components.netloc)
                # Fetch and cache the domain's robots.txt policy.
                if domain not in self.policies:
                    self.policies[domain] = urllib.robotparser.RobotFileParser(
                        domain + '/robots.txt')
                    self.policies[domain].read()
                if self.policies[domain].can_fetch('*', candidate):
                    response = requests.get(candidate)
                    if (response.status_code == 200
                            and response.headers.get('content-type',
                                                     '').startswith('text/html')):
                        soup = bs4.BeautifulSoup(response.text, 'html.parser')
                        if soup.html is not None and soup.html.get('lang') == 'en':
                            text = soup.get_text()
                            # Replace pronouns with their antecedents so
                            # each sentence can stand alone when encoded.
                            resolved = self.predictor.coref_resolved(text)
                            sentences = [self.tokenizer.encode(sentence.text)
                                         for sentence in self.nlp(resolved).sents]
                            maxlen = max(len(sentence)
                                         for sentence in sentences)
                            for sentence in sentences:
                                sentence.pad(maxlen, pad_id=self.pad_token)
                            tokens = torch.tensor([sentence.ids
                                                   for sentence in sentences],
                                                  device='cuda')
                            # The encoder is assumed to return one vector
                            # per sentence; move the result to the CPU
                            # before converting to numpy.
                            with torch.no_grad():
                                vectors = self.encoder(tokens).cpu().numpy()
                            N = vectors.shape[0]
                            reliability = 0.0
                            title = (soup.title.get_text()
                                     if soup.title is not None else '')
                            statements = [Statement.Statement(url=candidate,
                                                              title=title,
                                                              vector=vector)
                                          for vector in vectors]
                            for statement in statements:
                                # Query with the negated vector (the local
                                # Statement module is assumed to define
                                # negation): its nearest neighbour is the
                                # most opposed statement in the index. A
                                # non-negative score there means something
                                # already indexed contradicts this one.
                                furthest = self.db.search(query=-statement,
                                                          limit=1)
                                if (len(furthest[0].matches) == 0
                                        or furthest[0].scores[0] < 0):
                                    reliability += 1.0
                                    self.db.index(DocList([statement]))
                                else:
                                    reliability -= 1.0
                            reliability /= N
                            # Reliable pages promote their links in the
                            # frontier; unreliable ones demote them.
                            with self.lock:
                                for url in self.get_urls(candidate, soup):
                                    self.frontier.setdefault(url, 0.0)
                                    self.frontier[url] -= reliability
                                    
    def get_urls(self, base, soup):
        """Yield each distinct outgoing link, resolved against the page's
        own URL and stripped of query string and fragment."""
        seen = set()
        for link in soup.find_all('a'):
            dest = link.get('href')
            if dest is not None:
                # Resolve relative links against the page URL before
                # normalising away params, query and fragment.
                parsed = urllib.parse.urlparse(urllib.parse.urljoin(base,
                                                                    dest))
                cleaned = urllib.parse.urlunparse((parsed.scheme,
                                                   parsed.netloc,
                                                   parsed.path,
                                                   '',
                                                   '',
                                                   ''))
                if cleaned not in seen and cleaned not in self.visited:
                    yield cleaned
                    seen.add(cleaned)
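

# Minimal usage sketch: the seed URL below is an arbitrary example, not
# part of the original source. The crawl runs until the frontier empties.
if __name__ == '__main__':
    crawler = Crawler('https://en.wikipedia.org/wiki/Main_Page')
    crawler()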