Spaces:

philipp-zettl
/

qa-generator

Sleeping

App Files Files Community

qa-generator / src /text.py

philipp-zettl

Update src/text.py

5362638 verified over 1 year ago

raw

history blame contribute delete

4.24 kB

	from markdownify import markdownify as md
	from bs4 import BeautifulSoup as BS
	from urllib.parse import urljoin
	from newspaper import Article
	import re
	import markdown


	def clean(s):
	    """Make tabs and newlines in ``s`` visible as escape sequences.

	    Each tab character becomes the two characters backslash + ``t`` and each
	    newline becomes backslash + ``n``; everything else is left untouched.
	    """
	    escapes = (("\t", "\\t"), ("\n", "\\n"))
	    for raw, visible in escapes:
	        s = s.replace(raw, visible)
	    return s

	class DocTree:
	    """Nested-list view of a document split by heading level.

	    ``content`` is stored as given. The methods treat it as lists nested
	    inside lists with strings at the leaves.
	    """

	    def __init__(self, content):
	        self.content = content
	        # Not read by any method of this class.
	        self.max_depth = 6

	    def get_sections(self, *location_ids):
	        """Walk ``content`` by successive indices and return what is found.

	        With no indices the whole content is returned.
	        """
	        out = self.content
	        for id_ in location_ids:
	            out = out[id_]
	        return out

	    def merge_sections(self, elems):
	        """Recursively join a (possibly nested) list of strings into one string.

	        Siblings are joined with a blank line followed by a single space.
	        Results of nested lists are passed through ``clean`` before joining.
	        An empty list yields an empty string instead of raising IndexError.
	        """
	        if not elems:
	            return ''
	        # Only the first element is inspected to decide leaf vs. nested level.
	        if not isinstance(elems[0], list):
	            return '\n\n '.join(elems)
	        out = [self.merge_sections(e) for e in elems]
	        return '\n\n '.join(map(clean, out))

	    def get_merged_sections(self, *location_ids):
	        """Merge every child of the section addressed by ``location_ids``."""
	        return [self.merge_sections(s) for s in self.get_sections(*location_ids)]

	    def as_markdown(self, content):
	        """Convert ``content`` to Markdown with markdownify."""
	        return md(content)

	    def get_sections_by_depth(self, depth):
	        """Flatten the stored content by ``depth`` levels."""
	        return self._get_sections_by_depth(self.content, depth)

	    @staticmethod
	    def _get_sections_by_depth(content, depth):
	        """Return ``content`` flattened by ``depth`` levels of nesting.

	        ``depth == 0`` returns ``content`` unchanged. Strings are leaves: when
	        the requested depth reaches them they are kept whole rather than being
	        expanded into single characters.
	        """
	        if depth == 0:
	            return content
	        out = []
	        for elem in content:
	            if isinstance(elem, str):
	                # ``out += elem`` would splat the string into characters.
	                out.append(elem)
	            else:
	                out += DocTree._get_sections_by_depth(elem, depth - 1)
	        return out


	def fix_relative_links(url, article_content):
	    """Rewrite root-relative Markdown links so they point at the site of ``url``.

	    Every ``[text](target)`` whose target starts with ``/`` is resolved
	    against the scheme and host of ``url``. Other links are left as they are.

	    Args:
	        url: Address of the page the content came from.
	        article_content: Markdown text to scan.

	    Returns:
	        The Markdown text with root-relative link targets made absolute. The
	        text is returned unchanged when it holds no links, or when ``url`` has
	        no http(s) scheme and therefore offers no base to resolve against.
	    """
	    # Lazy ``.*?`` so multi-character link texts and targets are matched.
	    pattern = re.compile(r'\[(.*?)\]\((.*?)\)')
	    if not pattern.search(article_content):
	        print('not found')
	        return article_content

	    if not url.lower().startswith(('http://', 'https://')):
	        # Without scheme and host there is nothing to join against.
	        return article_content
	    # "https://host/a/b".split('/') -> ['https:', '', 'host', 'a', 'b']
	    base_url = '/'.join(url.split('/')[:3])

	    def _absolutize(match):
	        text, target = match.groups()
	        if not target.startswith('/'):
	            return match.group(0)
	        return f'[{text}]({urljoin(base_url, target)})'

	    return pattern.sub(_absolutize, article_content)


	def extract_article(url):
	    """Fetch the page at ``url`` and return it as a parsed ``Article``.

	    The ``Article`` is built from ``url``, then downloaded and parsed, and
	    the resulting object is handed back to the caller.
	    """
	    fetched = Article(url)
	    fetched.download()
	    fetched.parse()
	    return fetched


	def select_content(html_code, elem_class, class_name):
	    """Pick one element out of ``html_code`` and return it as Markdown.

	    Args:
	        html_code: HTML source to search.
	        elem_class: Tag name (e.g. ``'div'``) used to narrow the lookup when a
	            selector is given.
	        class_name: ``'.name'`` selects by CSS class, ``'#name'`` selects by
	            id. Any other value applies no attribute filter and the first
	            tag of the document is used.

	    Returns:
	        Markdown for the selected element, or an empty string when a selector
	        was given but nothing in the document matches it.
	    """
	    print(f'Calling select_content with {elem_class}, {class_name}')
	    kwargs = {}
	    if class_name.startswith('.'):
	        kwargs = {'class_': class_name[1:]}
	    elif class_name.startswith('#'):
	        kwargs = {'id': class_name[1:]}

	    soup = BS(html_code, features="lxml")
	    if not kwargs:
	        selected = soup.find()
	    else:
	        # Prefer a match on the requested tag; fall back to any tag carrying
	        # the attribute so a mismatched ``elem_class`` still finds content.
	        selected = soup.find(elem_class, **kwargs) if elem_class else None
	        if selected is None:
	            selected = soup.find(**kwargs)

	    if selected is None:
	        # ``str(None)`` would otherwise leak the text "None" into the content.
	        return ''
	    return md(str(selected))


	def split_by_heading(html_content, _i):
	    """Recursively split HTML into nested lists, one level per heading rank.

	    The text is cut at every ``<h{_i}`` opening tag, and each non-empty piece
	    is split again at the next rank. Once ``_i`` reaches 7 the text is
	    returned as a plain string, so leaves are strings.

	    Args:
	        html_content: HTML text to split.
	        _i: Heading rank to split on at this level (1 for ``<h1``, ...).

	    Returns:
	        ``html_content`` itself when ``_i >= 7``, otherwise a list with one
	        entry per non-empty piece.
	    """
	    if _i >= 7:
	        return html_content
	    tag = f'<h{_i}'
	    # Only pieces after the first were preceded by the tag; the first piece is
	    # whatever came before the first heading. Deciding by position instead of
	    # by a leading '>' keeps headings that carry attributes intact.
	    preamble, *headed = html_content.split(tag)
	    elems = []
	    if preamble:
	        elems.append(split_by_heading(preamble, _i + 1))
	    for chunk in headed:
	        if chunk:
	            elems.append(split_by_heading(tag + chunk, _i + 1))
	    return elems

	def doctree_from_url(url, elem_class='div', class_name='article-body'):
	    """Download an article and build a ``DocTree`` from its selected content.

	    Args:
	        url: Address of the article.
	        elem_class: Tag name forwarded to ``select_content``.
	        class_name: Selector forwarded to ``select_content``.

	    Returns:
	        A ``DocTree`` wrapping the heading-split HTML of the article.
	    """
	    article = extract_article(url)
	    # convert to MD to handle splitting better
	    article_content = select_content(article.html, elem_class, class_name)

	    # The title is prepended only when the content has no top-level "# " line.
	    has_title = any(
	        line.strip().startswith('# ') for line in article_content.split('\n')
	    )
	    if not has_title:
	        print('Didn\'t find title, will add it manually...')
	        article_content = f"# {article.title}\n\n{article_content}"

	    # NOTE(review): every '#' is swapped for a placeholder before the Markdown
	    # conversion and restored afterwards, so '#'-style headings are not seen
	    # as headings by the converter - confirm this is intended.
	    article_content = article_content.replace('\n\n', '\n').replace('#', '%%@@%%')
	    # fix relative website links
	    article_content = fix_relative_links(url, article_content)
	    # convert back to HTML
	    html_content = markdown.markdown(article_content).replace('%%@@%%', '#')
	    doc_tree = DocTree(split_by_heading(html_content, 1))

	    #assert doc_tree.merge_sections(doc_tree.get_sections(0)).replace('\n', '').replace(html_content.replace('\n', ''), '') == '', 'Document inconsistent. Manual adjustments required.'
	    return doc_tree


	def get_selectors_for_class(url, elem_class):
	    """Collect the class and id selectors used by ``elem_class`` tags on a page.

	    Args:
	        url: Address of the page to inspect.
	        elem_class: Tag name to look for (passed to ``find_all``).

	    Returns:
	        A set of strings: ``'.name'`` for every class and ``'#name'`` for
	        every id found on the matching tags.
	    """
	    article = extract_article(url)

	    soup = BS(article.html, features="lxml")
	    classes = set()
	    ids = set()
	    for elem in soup.find_all(elem_class):
	        # ``get('class')`` is falsy when the tag has no class attribute.
	        classes.update(f".{c}" for c in elem.get('class') or ())
	        if elem.get('id'):
	            ids.add(f"#{elem.get('id')}")

	    return ids | classes