Spaces:
Paused
Paused
File size: 1,115 Bytes
ad06298 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 |
from bs4 import BeautifulSoup as bs
import re
def extract_1(content):
    """Extract the readable text lines from the ``<body>`` of an HTML document.

    Every ``<body>`` element is stripped of non-content tags (scripts, styles,
    embeds, ...), its text is whitespace-normalised, and only lines holding
    more than three space-separated words are kept.

    Args:
        content: Object exposing the HTML markup through a ``text`` attribute
            (presumably an HTTP response object -- confirm against callers).

    Returns:
        The retained lines, each stripped of surrounding whitespace and joined
        with newline characters. An empty string when no line qualifies.
    """
    toremove = ('link', 'script', 'style', 'iframe', 'object', 'noscript',
                'param', 'embed', 'meta', 'base', 'canvas', 'svg')
    content_soup = bs(content.text, 'html.parser')

    body_texts = []
    for soup_body in content_soup.find_all('body'):
        # Drop elements that never carry readable text before extracting it.
        for remove_tag in toremove:
            for trash_tag in soup_body.find_all(remove_tag):
                trash_tag.decompose()

        thisbody = soup_body.get_text().replace("\t", '')
        # Remove lines made of a single word character.
        thisbody = re.sub(r"\n\w\n", '\n', thisbody)
        # Collapse runs of spaces so the word count below is not inflated by
        # the empty tokens that ``split(" ")`` yields for consecutive spaces.
        thisbody = re.sub(r" {2,}", ' ', thisbody)
        # Collapse runs of blank lines.
        thisbody = re.sub(r"\n{2,}", '\n', thisbody)
        body_texts.append(thisbody)

    finalcontent = ''.join(body_texts)
    print('content Extracted')

    stripped_lines = (z.strip() for z in finalcontent.split("\n"))
    # Keep only lines with more than three words; shorter ones are treated as
    # navigation/boilerplate fragments.
    return "\n".join(
        line for line in stripped_lines if len(line.split(" ")) > 3
    )