File size: 1,115 Bytes
ad06298
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
from bs4 import BeautifulSoup as bs
import re
def extract_1(content):
    """Extract the readable text lines from the ``<body>`` of an HTML page.

    Non-content elements (scripts, styles, embedded objects, ...) are removed
    from every ``<body>`` element, whitespace in the remaining text is
    normalised, and only lines made of more than three space-separated words
    are kept. Prints ``content Extracted`` to stdout once parsing is done.

    Args:
        content: Object exposing the HTML markup through a ``.text``
            attribute (presumably an HTTP response object -- confirm against
            callers). A raw HTML string is accepted as well.

    Returns:
        str: The kept lines, each stripped of surrounding whitespace and
        joined with newline characters; an empty string if no line qualifies.
    """
    toremove = ['link', 'script', 'style', 'iframe', 'object', 'noscript',
                'param', 'embed', 'meta', 'base', 'canvas', 'svg']
    collapse_spaces = re.compile(r' {2,}').sub

    # Objects without a ``.text`` attribute (e.g. a plain str) are parsed as-is.
    markup = getattr(content, 'text', content)
    content_soup = bs(markup, 'html.parser')

    kept_lines = []
    for soup_body in content_soup.find_all('body'):
        # Drop elements whose text is never meant to be read by a person.
        for remove_tag in toremove:
            for trash_tag in soup_body.find_all(remove_tag):
                trash_tag.decompose()

        # Tabs become spaces (not nothing) so tab-separated words stay
        # separate; runs of spaces are then squeezed to a single space.
        body_text = collapse_spaces(' ', soup_body.get_text().replace('\t', ' '))

        # Lines are collected per body, so the last line of one body can never
        # fuse with the first line of the next. Short lines (blank lines,
        # single characters, menu labels, ...) are discarded by the word-count
        # filter, which makes any separate blank-line clean-up unnecessary.
        for raw_line in body_text.split('\n'):
            line = raw_line.strip()
            if len(line.split(' ')) > 3:
                kept_lines.append(line)

    print('content Extracted')
    return "\n".join(kept_lines)