Liyan06 committed
Commit 4ec6f2d · Parent(s): 70dbc11
update web retrieval quality
Browse files:
- handler.py +4 -19
- web_retrieval.py +47 -18
handler.py CHANGED

@@ -3,16 +3,6 @@ from web_retrieval import *
 from nltk.tokenize import sent_tokenize
 import evaluate
 
-import spacy
-from spacy.cli import download
-
-try:
-    nlp = spacy.load("en_core_web_lg")
-except:
-    # If loading fails, download the model
-    download("en_core_web_lg")
-    nlp = spacy.load("en_core_web_lg")
-
 
 def sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk):
     '''
@@ -31,12 +21,6 @@ def sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk):
     return ranked_docs, scores
 
 
-def extract_entities(text):
-    text = nlp(text)
-    ents = list({ent.text for ent in text.ents})
-    return ents
-
-
 class EndpointHandler():
     def __init__(self, path="./"):
         self.scorer = MiniCheck(path=path)
@@ -94,17 +78,18 @@ class EndpointHandler():
         return outputs
 
 
-    def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=
+    def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=10, allow_duplicated_urls=False):
 
         search_results = search_google(claim, timeout=timeout)
 
         print('Searching webpages...')
         start = time()
         with concurrent.futures.ThreadPoolExecutor() as e:
-            scraped_results = e.map(scrape_url, search_results
+            scraped_results = e.map(scrape_url, search_results)
         end = time()
+
         print(f"Finished searching in {round((end - start), 1)} seconds.\n")
-        scraped_results = [(r[0][:20000], r[1]) for r in scraped_results if r[0] and '��' not in r[0]
+        scraped_results = [(r[0][:20000], r[1]) for r in scraped_results if r[0] and '��' not in r[0]]
 
         retrieved_docs, urls = zip(*scraped_results[:max_search_results_per_query])
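For context on the scraping step above: `Executor.map` fans `scrape_url` out across worker threads but yields results in the same order as its input, which is what lets the handler later run `zip(*scraped_results)` to recover aligned docs and URLs. A minimal sketch of the pattern, with a stand-in `fetch` function and placeholder URLs that are not part of the commit:

import concurrent.futures

def fetch(url):
    # Stand-in for scrape_url: return (page_text, url), or (None, url) on failure.
    return f"contents of {url}", url

urls = ["https://example.com/a", "https://example.com/b"]

with concurrent.futures.ThreadPoolExecutor() as e:
    # map() runs fetch concurrently yet yields results in the order of `urls`.
    scraped = list(e.map(fetch, urls))

# Mirror the handler's post-filter: drop failed fetches, cap each text at 20,000 chars.
scraped = [(text[:20000], url) for text, url in scraped if text]
docs, kept_urls = zip(*scraped)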
web_retrieval.py CHANGED

@@ -9,6 +9,25 @@ import itertools
 import numpy as np
 from time import time
 
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+import spacy
+from spacy.cli import download
+
+try:
+    nlp = spacy.load("en_core_web_lg")
+except:
+    # If loading fails, download the model
+    download("en_core_web_lg")
+    nlp = spacy.load("en_core_web_lg")
+
+
+def extract_entities(text):
+    text = nlp(text)
+    ents = list({ent.text for ent in text.ents})
+    return ents
+
 
 def is_tag_visible(element: bs4.element) -> bool:
     """Determines if an HTML element is visible.
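The `extract_entities` helper added above (moved here from handler.py) drives the query expansion in `search_google` further down: the claim itself plus each named entity found in it becomes its own Google query. A hypothetical usage sketch; the exact entities depend on the `en_core_web_lg` model, so the values in the comment are illustrative only:

claim = "Titanic was directed by James Cameron and released in 1997."
queries = set([claim] + extract_entities(claim))
# Plausibly yields the claim plus entity strings such as "Titanic",
# "James Cameron", and "1997", each issued to Google as a separate query.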
@@ -30,7 +49,7 @@ def is_tag_visible(element: bs4.element) -> bool:
     return True
 
 
-def scrape_url(url: str, timeout: float = 3) -> Tuple[str, str]:
+def scrape_url(url: str) -> Tuple[str, str]:
     """Scrapes a URL for all text information.
 
     Args:
@@ -42,9 +61,13 @@ def scrape_url(url: str, timeout: float = 3) -> Tuple[str, str]:
     """
    # Scrape the URL
     try:
-
-
-
+        session = requests.Session()
+        retry = Retry(connect=3, backoff_factor=0.5)
+        adapter = HTTPAdapter(max_retries=retry)
+        session.mount('http://', adapter)
+        session.mount('https://', adapter)
+        response = session.get(url)
+    except Exception as _:
         return None, url
 
     # Extract out all text from the tags
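The session mounted above retries failed connections up to three times; with `backoff_factor=0.5`, urllib3 sleeps roughly 0.5 s, 1 s, 2 s between attempts (the exact schedule varies by urllib3 version) before the broad `except` returns `(None, url)`. Note that the new `session.get(url)` call passes no per-request timeout, unlike the removed `timeout` parameter in the old signature, so a hung server now blocks until the socket gives up on its own. A hedged usage sketch, assuming the module is importable as `web_retrieval` and using placeholder hosts:

from web_retrieval import scrape_url

# An unreachable host exhausts the connection retries, then the broad
# except clause returns (None, url) instead of raising.
text, url = scrape_url("https://nonexistent.invalid/")
assert text is None

# A reachable page returns its visible text alongside the URL.
text, url = scrape_url("https://example.com/")
if text is not None:
    print(text[:200])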
@@ -84,25 +107,31 @@ def search_google(query:str, num_web_pages:int=10, timeout:int=6, save_url:str='
     lang = "en"
 
     # scrape google results
-
-    for
-
-
-
-
-
-
-
-
-
+    all_urls = []
+    for search_query in set([query] + list(set(extract_entities(query)))):
+        for page in range(0, num_web_pages, 10):
+            # here page is google search's bottom page meaning, click 2 -> start=10
+            # url = "https://www.google.com/search?q={}&start={}".format(query, page)
+            url = "https://www.google.com/search?q={}&lr=lang_{}&hl={}&start={}".format(search_query, lang, lang, page)
+            r = requests.get(url, headers=headers, timeout=timeout)
+            # collect all urls by regular expression
+            # how to do if I just want to have the returned top-k pages?
+            urls = re.findall('href="(https?://.*?)"', r.text)
+            urls = [url for url in urls if 'google.com' not in url and '.pdf' not in url]  # can be improved based on TF-IDF later
+
+            all_urls.extend(urls)
+
+    all_urls_final = []
+    for url in all_urls:
+        if url not in all_urls_final:
+            all_urls_final.append(url)
 
     # save all url into a txt file
     if not save_url == "":
         with open(save_url, 'w') as file:
-            for url in
+            for url in all_urls_final:
                 file.write(url + '\n')
-    return
+    return all_urls_final
 
 
 def order_doc_score_url(used_chunk, support_prob_per_chunk, urls, allow_duplicated_urls=False):
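The membership loop in the hunk above deduplicates `all_urls` while preserving first-seen order, which matters because earlier search hits should stay ranked first. For comparison only (the commit keeps the explicit loop), the same result in one line, relying on dicts preserving insertion order in Python 3.7+ and avoiding the loop's O(n^2) membership scans:

all_urls_final = list(dict.fromkeys(all_urls))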