Spaces:
Sleeping
Sleeping
| from markdownify import markdownify as md | |
| from bs4 import BeautifulSoup as BS | |
| from urllib.parse import urljoin | |
| from newspaper import Article | |
| import re | |
| import markdown | |
def clean(s):
    """Return ``s`` with tab and newline characters shown as ``\\t`` / ``\\n``.

    Each real tab becomes the two characters backslash + ``t`` and each real
    newline becomes backslash + ``n``; every other character is left alone.
    """
    escapes = (("\t", "\\t"), ("\n", "\\n"))
    for raw, visible in escapes:
        s = s.replace(raw, visible)
    return s
class DocTree:
    """Nested-list view of a document.

    ``content`` is expected to be a list of lists (of lists ...) whose leaves
    are strings, such as the structure produced by ``split_by_heading``: each
    list layer corresponds to one heading level.
    """

    def __init__(self, content):
        self.content = content
        # Not read by any method of this class; kept for callers that may
        # inspect it.
        self.max_depth = 6

    def get_sections(self, *location_ids):
        """Return the sub-tree reached by indexing ``content`` with each id in turn.

        With no ids the whole ``content`` is returned. Raises ``IndexError`` /
        ``TypeError`` exactly as plain indexing would for an invalid path.
        """
        out = self.content
        for id_ in location_ids:
            out = out[id_]
        return out

    def merge_sections(self, elems):
        """Flatten a (possibly nested) list of section strings into one string.

        Sibling sections are joined with ``'\\n\\n '``. Results coming back from
        a nested level are passed through ``clean`` before being joined.
        """
        if isinstance(elems, str):
            # A leaf is already "merged"; joining it would interleave the
            # separator between its individual characters.
            return elems
        if not elems:
            # Empty section list: nothing to join (and elems[0] would raise).
            return ''
        if not isinstance(elems[0], list):
            return '\n\n '.join(elems)
        return '\n\n '.join(clean(self.merge_sections(e)) for e in elems)

    def get_merged_sections(self, *location_ids):
        """Return one merged string per child of the sub-tree at ``location_ids``."""
        return [self.merge_sections(s) for s in self.get_sections(*location_ids)]

    def as_markdown(self, content):
        """Convert an HTML string to Markdown using ``markdownify``."""
        return md(content)

    def get_sections_by_depth(self, depth):
        """Return the sections found ``depth`` list-levels below the root."""
        return self._get_sections_by_depth(self.content, depth)

    @staticmethod
    def _get_sections_by_depth(content, depth):
        """Return ``content`` with its first ``depth`` nesting levels flattened.

        ``depth == 0`` returns ``content`` unchanged; each extra level
        concatenates the children of the current level into one list. The
        sections are not merged into strings; use ``merge_sections`` for that.

        Declared static because it takes no ``self``; without the decorator
        the call from ``get_sections_by_depth`` passed three arguments to a
        two-parameter function.
        """
        if depth <= 0 or isinstance(content, str):
            return content
        out = []
        for elem in content:
            if isinstance(elem, str):
                # Leaf reached before the requested depth: keep the section
                # whole instead of letting ``+=`` explode it into characters.
                out.append(elem)
            else:
                out += DocTree._get_sections_by_depth(elem, depth - 1)
        return out
def fix_relative_links(url, article_content):
    """Rewrite root-relative Markdown links in ``article_content`` as absolute.

    Only link targets that start with ``/`` are touched; every other link is
    left exactly as written.

    Args:
        url: Address of the page the content came from. When it carries a
            scheme (``scheme://host/...``) links are resolved against
            ``scheme://host`` with ``urljoin``. Otherwise the text before the
            first ``/`` is treated as the host and simply prefixed.
        article_content: Markdown text containing ``[text](target)`` links.

    Returns:
        The Markdown text with root-relative targets made absolute. Prints
        ``not found`` and returns the text unchanged when it has no links.
    """
    has_scheme = '://' in url
    if has_scheme:
        # 'https://host/a/b'.split('/') -> ['https:', '', 'host', ...]
        base_url = '/'.join(url.split('/')[:3])
    else:
        # Bare 'host/a/b' form: keep the host as a string (a list here would
        # make urljoin raise TypeError).
        base_url = url.split('/')[0]

    pat = re.compile(r'\[(.*?)\]\((.*?)\)')
    if pat.search(article_content) is None:
        print('not found')
        return article_content

    def _absolutize(match):
        text, target = match.groups()
        if not target.startswith('/'):
            return match.group(0)
        if has_scheme:
            resolved = urljoin(base_url, target)
        else:
            resolved = base_url + target
        return f'[{text}]({resolved})'

    # Single pass over the text; each link is rewritten where it was matched.
    return pat.sub(_absolutize, article_content)
def extract_article(url):
    """Build an ``Article`` for ``url``, download it, parse it and return it."""
    fetched = Article(url)
    fetched.download()  # must run before parse()
    fetched.parse()
    return fetched
def select_content(html_code, elem_class, class_name):
    """Pick one element out of ``html_code`` and return it as Markdown.

    Args:
        html_code: HTML source to search.
        elem_class: Tag name to look for (e.g. ``'div'``). A falsy value
            matches any tag.
        class_name: Selector for the element. ``'#name'`` matches by id;
            ``'.name'`` or a bare ``'name'`` matches by CSS class. A falsy
            value applies no attribute filter.

    Returns:
        Markdown for the first matching element, or for the whole document
        when nothing matches.
    """
    print(f'Calling select_content with {elem_class}, {class_name}')
    kwargs = {}
    if class_name:
        if class_name.startswith('#'):
            kwargs = {'id': class_name[1:]}
        elif class_name.startswith('.'):
            kwargs = {'class_': class_name[1:]}
        else:
            # A bare name (as in the default 'article-body' used by
            # doctree_from_url) is treated as a class name.
            kwargs = {'class_': class_name}

    soup = BS(html_code, features="lxml")
    # The tag name was previously ignored, so any element with the right
    # class/id matched regardless of its type.
    selected = soup.find(elem_class or None, **kwargs)
    if selected is None:
        # Avoid converting the literal text "None"; fall back to everything.
        selected = soup
    return md(str(selected))
def split_by_heading(html_content, _i):
    """Recursively split HTML into nested lists, one list layer per heading level.

    Args:
        html_content: HTML string to split.
        _i: Heading level to split on at this layer (1 for ``<h1``). Levels
            above 6 end the recursion.

    Returns:
        ``html_content`` itself when ``_i >= 7``; otherwise a list with one
        entry per non-empty chunk at this level, each entry being the result
        of splitting that chunk on the next heading level.
    """
    if _i >= 7:
        return html_content
    marker = f'<h{_i}'
    elems = []
    for idx, piece in enumerate(html_content.split(marker)):
        if not piece:
            continue
        if idx > 0:
            # Everything after the first split point followed a marker that
            # str.split removed; put it back. Indexing the unfiltered pieces
            # keeps this right for headings with attributes ('<h1 id=...>')
            # and for leading text that merely starts with '>'.
            piece = marker + piece
        elems.append(split_by_heading(piece, _i + 1))
    return elems
def doctree_from_url(url, elem_class='div', class_name='article-body'):
    """Fetch ``url`` and build a ``DocTree`` from the selected article element.

    Args:
        url: Page to download.
        elem_class: Tag name passed on to ``select_content``.
        class_name: Class/id selector passed on to ``select_content``.

    Returns:
        A ``DocTree`` wrapping the result of ``split_by_heading`` on the
        re-rendered HTML.
    """
    article = extract_article(url)
    # convert to MD to handle splitting better
    article_content = select_content(article.html, elem_class, class_name)
    # The title is only missing when no line is a top-level '# ' heading.
    # (The check used to be inverted and prepended a second title to content
    # that already had one.)
    has_title = any(
        line.strip().startswith('# ') for line in article_content.split('\n')
    )
    if not has_title:
        print("Didn't find title, will add it manually...")
        article_content = f"# {article.title}\n\n{article_content}"
    # NOTE(review): every '#' is swapped for a placeholder before
    # markdown.markdown() runs and restored afterwards, so Markdown headings
    # presumably reach the output as literal '#' text rather than <hN> tags -
    # confirm this is intended given that split_by_heading looks for '<hN'.
    article_content = article_content.replace('\n\n', '\n').replace('#', '%%@@%%')
    # fix relative website links
    article_content = fix_relative_links(url, article_content)
    # convert back to HTML
    html_content = markdown.markdown(article_content).replace('%%@@%%', '#')
    doc_tree = DocTree(split_by_heading(html_content, 1))
    return doc_tree
def get_selectors_for_class(url, elem_class):
    """Collect the class and id selectors used by ``elem_class`` tags on a page.

    Downloads ``url``, visits every ``elem_class`` element and returns a set
    holding ``'.name'`` for each class seen and ``'#name'`` for each id seen.
    """
    soup = BS(extract_article(url).html, features="lxml")
    class_selectors = set()
    id_selectors = set()
    for tag in soup.find_all(elem_class):
        # A missing or empty class attribute contributes nothing.
        class_selectors.update(f".{name}" for name in tag.get('class') or ())
        tag_id = tag.get('id')
        if tag_id:
            id_selectors.add(f"#{tag_id}")
    return id_selectors | class_selectors