Spaces:

mangoman7002
/

WebAPI

Paused

WebAPI / pattern_functions.py

Upload 6 files

ad06298 verified about 1 year ago

1.12 kB

	from bs4 import BeautifulSoup as bs
	import re
	def extract_1(content):
	    """Extract the readable text of an HTML document's ``<body>`` element(s).

	    The markup is read from ``content.text`` and parsed with the
	    ``html.parser`` backend. Non-content elements (scripts, styles, embeds,
	    metadata, ...) are removed from every ``<body>``, whitespace is
	    normalised, and only lines holding more than three space-separated
	    words are kept.

	    Args:
	        content: Object exposing the HTML markup through a ``text``
	            attribute (presumably a ``requests.Response`` -- confirm
	            against the callers).

	    Returns:
	        str: The retained lines, each stripped of surrounding whitespace and
	        joined with newlines. Empty when no line qualifies.

	    Side effects:
	        Prints ``content Extracted`` to stdout once extraction is done.
	    """
	    # Elements whose content is never human-readable page text.
	    toremove = ['link', 'script', 'style', 'iframe', 'object', 'noscript',
	                'param', 'embed', 'meta', 'base', 'canvas', 'svg']
	    content_soup = bs(content.text, 'html.parser')

	    # Collect the cleaned bodies and join once instead of repeated "+=".
	    body_texts = []
	    for soup_body in content_soup.find_all('body'):
	        for remove_tag in toremove:
	            for trash_tag in soup_body.find_all(remove_tag):
	                trash_tag.decompose()

	        thisbody = soup_body.get_text().replace("\t", '')
	        # Drop lines consisting of a single word character (stray bullets,
	        # pagination digits and the like).
	        thisbody = re.sub(r"\n\w\n", '\n', thisbody)
	        # Collapse runs of spaces. The previous loop replaced a space with a
	        # space and therefore did nothing, which let the empty tokens
	        # produced by split(" ") below inflate the per-line word count.
	        thisbody = re.sub(r" {2,}", ' ', thisbody)
	        # Collapse runs of blank lines into a single line break.
	        thisbody = re.sub(r"\n{2,}", '\n', thisbody)
	        body_texts.append(thisbody)

	    finalcontent = ''.join(body_texts)
	    print('content Extracted')

	    # Keep only lines with more than three words; shorter lines are assumed
	    # to be navigation labels or other boilerplate rather than content.
	    kept_lines = []
	    for raw_line in finalcontent.split("\n"):
	        line = raw_line.strip()
	        if len(line.split(" ")) > 3:
	            kept_lines.append(line)
	    return "\n".join(kept_lines)