lytang
/

MiniCheck-Flan-T5-Large

Text Classification

text2text-generation

text-generation-inference

Model card Files Files and versions

Liyan06 committed on May 22, 2024

Commit

c191acc

·

1 Parent(s): 4ec6f2d

web retrieval update

Files changed (2) hide show

handler.py +2 -2
web_retrieval.py +4 -8

handler.py CHANGED Viewed

@@ -85,11 +85,11 @@ class EndpointHandler():
         print('Searching webpages...')
         start = time()
         with concurrent.futures.ThreadPoolExecutor() as e:
-            scraped_results = e.map(scrape_url, search_results)
         end = time()
         print(f"Finished searching in {round((end - start), 1)} seconds.\n")
-        scraped_results = [(r[0][:20000], r[1]) for r in scraped_results if r[0] and '��' not in r[0]]
         retrieved_docs, urls = zip(*scraped_results[:max_search_results_per_query])

         print('Searching webpages...')
         start = time()
         with concurrent.futures.ThreadPoolExecutor() as e:
+            scraped_results = e.map(scrape_url, search_results, itertools.repeat(timeout))
         end = time()
         print(f"Finished searching in {round((end - start), 1)} seconds.\n")
+        scraped_results = [(r[0][:20000], r[1]) for r in scraped_results if r[0] and '��' not in r[0]]   # those can be ranked based on TF-IDF to be more efficient
         retrieved_docs, urls = zip(*scraped_results[:max_search_results_per_query])

web_retrieval.py CHANGED Viewed

@@ -49,7 +49,7 @@ def is_tag_visible(element: bs4.element) -> bool:
     return True
-def scrape_url(url: str) -> Tuple[str, str]:
     """Scrapes a URL for all text information.
     Args:
@@ -61,13 +61,9 @@ def scrape_url(url: str) -> Tuple[str, str]:
     """
     # Scrape the URL
     try:
-        session = requests.Session()
-        retry = Retry(connect=3, backoff_factor=0.5)
-        adapter = HTTPAdapter(max_retries=retry)
-        session.mount('http://', adapter)
-        session.mount('https://', adapter)
-        response = session.get(url)
-    except Exception as _:
         return None, url
     # Extract out all text from the tags

     return True
+def scrape_url(url: str, timeout=10) -> Tuple[str, str]:
     """Scrapes a URL for all text information.
     Args:
     """
     # Scrape the URL
     try:
+        response = requests.get(url, timeout=timeout)
+        response.raise_for_status()
+    except requests.exceptions.RequestException as _:
         return None, url
     # Extract out all text from the tags