| import sys |
| from time import sleep |
| import trafilatura |
| from trafilatura.meta import reset_caches |
| from trafilatura.settings import DEFAULT_CONFIG |
| import spacy |
|
|
|
|
| # Shared spaCy pipeline, loaded once at import time. line_correction() runs it |
| # on over-long lines and iterates the resulting `doc.sents` to split them. |
| nlp = spacy.load("en_core_web_lg") |
|
|
|
|
| # Intended cap on downloaded page size, applied to the trafilatura config that |
| # is passed to fetch_url() and extract() below. |
| # NOTE(review): trafilatura's DEFAULT_CONFIG appears to be a ConfigParser whose |
| # options are read with getint("DEFAULT", ...); if so, this attribute |
| # assignment has no effect and DEFAULT_CONFIG.set("DEFAULT", "MAX_FILE_SIZE", |
| # "50000") would be needed to enforce the limit - confirm before changing. |
| DEFAULT_CONFIG.MAX_FILE_SIZE = 50000 |
| # Lines shorter than MIN_CHAR characters are dropped by line_correction(). |
| MIN_CHAR = 50 |
| # Only the first MAX_CHAR characters of an over-long line are passed to spaCy. |
| MAX_CHAR = 5000 |
|
|
|
|
def get_page(url, retries=3, delay=3):
    """Download *url* with trafilatura, retrying when the fetch fails.

    Args:
        url: Address of the page to download.
        retries: Maximum number of download attempts (default 3).
        delay: Seconds to wait between two consecutive attempts (default 3).

    Returns:
        Whatever ``trafilatura.fetch_url`` returned on the first attempt that
        did not yield ``None``, or ``None`` when every attempt failed.
    """
    page = None
    for attempt in range(retries):
        try:
            page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
        except Exception:
            # Best-effort download: any fetch error counts as a failed attempt.
            # KeyboardInterrupt / SystemExit are deliberately not caught so the
            # caller can still abort a long crawl.
            page = None

        # Explicit check instead of `assert`, which is stripped under `-O`.
        if page is not None:
            print("Fetched " + url, file=sys.stderr)
            break

        # Back off before the next attempt, but not after the last one.
        if attempt < retries - 1:
            sleep(delay)
    return page
|
|
|
|
def url2lines(url):
    """Fetch *url* and return the text extracted from it as a list of lines.

    An empty list is returned when the page could not be downloaded.
    """
    html = get_page(url)
    return [] if html is None else html2lines(html)
|
|
|
|
def _split_long_line(line, max_size):
    """Regroup the sentences of *line* into chunks longer than *max_size*."""
    chunks = []
    buffer = ""
    # Only the first MAX_CHAR characters are handed to the spaCy pipeline.
    for sentence in nlp(line[:MAX_CHAR]).sents:
        piece = str(sentence).strip()
        buffer = buffer + " " + piece if len(buffer) > 0 else piece
        if len(buffer) > max_size:
            chunks.append(buffer)
            buffer = ""

    # Keep the trailing remainder only when it is longer than MIN_CHAR.
    if len(buffer) > MIN_CHAR:
        chunks.append(buffer)
    return chunks


def line_correction(lines, max_size=100):
    """Filter out short lines and break over-long lines at sentence boundaries.

    Args:
        lines: Iterable of text lines.
        max_size: Lines longer than this many characters are re-segmented
            into sentence groups; shorter ones are kept unchanged.

    Returns:
        A new list containing the kept and re-segmented lines, in order.
    """
    corrected = []
    for line in lines:
        # Lines below the minimum length are discarded.
        if len(line) < MIN_CHAR:
            continue

        if len(line) <= max_size:
            corrected.append(line)
        else:
            corrected.extend(_split_long_line(line, max_size))
    return corrected
|
|
|
|
def html2lines(page):
    """Extract the main text of an HTML document and split it into lines.

    Args:
        page: HTML source as a string, or ``None``.

    Returns:
        The extracted text split on ``"\\n"``; an empty list when *page* is
        ``None``, blank, or when trafilatura extracts nothing from it.
    """
    # Test for None first: calling .strip() on None would raise AttributeError.
    if page is None or not page.strip():
        return []

    text = trafilatura.extract(page, config=DEFAULT_CONFIG)
    # Clear trafilatura's caches after every extraction (presumably to keep
    # memory bounded over many pages - confirm against trafilatura docs).
    reset_caches()

    if text is None:
        return []

    return text.split("\n")
|
|