Spaces:
Runtime error
Runtime error
| from pyquery import PyQuery as pq | |
| from src.model.paragraph import Paragraph | |
| from bs4 import BeautifulSoup | |
| from src.tools.readers_pdf import Reader_illumio | |
| from src.tools.table_converter import table_converter | |
class Reader_HTML:
    """Read an HTML file and expose its text as a list of ``Paragraph`` objects.

    On construction the file is parsed with :meth:`read_html_2` and the result
    is stored in ``self.paragraphs``. Each paragraph's ``font_style`` is the
    name of the HTML tag it came from, or ``"table"`` / ``"list"`` for the
    aggregated paragraphs built by :meth:`create_table` and
    :meth:`create_list`.
    """

    def __init__(self, path):
        """Parse the HTML file at *path* and keep the resulting paragraphs."""
        self.path = path
        self.paragraphs = self.read_html_2(path)

    # Without BeautifulSoup. The original author notes that this variant does
    # not work well; it is not used by __init__.
    def read_html(self, path):
        """Return one ``Paragraph`` per non-empty leaf element, using pyquery.

        ``<script>`` and ``<style>`` elements are removed first. The
        paragraph id is the element's position in the ``doc('*')`` traversal.
        """
        with open(path, 'r') as html_file:
            doc = pq(html_file.read())
        # Remove script and style elements
        doc('script').remove()
        doc('style').remove()
        paragraphs = []
        for index, elem in enumerate(doc('*')):
            # Only keep leaves (elements that contain no other elements)
            if pq(elem).find('*'):
                continue
            text = pq(elem).text().strip()
            if text:
                paragraphs.append(
                    Paragraph(text=text, font_style=elem.tag, id_=index, page_id=1)
                )
        return paragraphs

    # With BeautifulSoup
    def read_html_2(self, path):
        """Return the paragraphs of the HTML file at *path*, using BeautifulSoup.

        Steps: drop non-content tags, collect the text of every leaf tag,
        group the result with
        :meth:`concatenate_paragraphs_with_same_font_style`, then call
        ``rearrange_paragraph()`` on each paragraph.
        """
        # Context manager so the file handle is always closed.
        with open(path, "r") as html_file:
            markup = html_file.read()

        soup = BeautifulSoup(markup, 'html.parser')
        for tag in soup(['style', 'script', 'footer', 'header', 'nav', 'aside', 'form']):
            tag.decompose()

        # html.parser does not synthesise a <body>; for a fragment without
        # one, walk the whole document instead of failing on None.
        root = soup.body if soup.body is not None else soup

        # Leaf tags only: text nodes have name None, containers have child tags.
        leaf_elements = [
            elem for elem in root.descendants
            if elem.name is not None and not elem.find_all()
        ]

        paragraphs = []
        for index, elem in enumerate(leaf_elements):
            text = elem.get_text(strip=True, separator='\n')
            if text:
                paragraphs.append(
                    Paragraph(text=text, font_style=elem.name, id_=index, page_id=1)
                )

        paragraphs = self.concatenate_paragraphs_with_same_font_style(paragraphs)
        return [p.rearrange_paragraph() for p in paragraphs]

    def concatenate_paragraphs_with_same_font_style(self, paragraphs: "list[Paragraph]"):
        """Group adjacent paragraphs and return the (mutated) list.

        * a run starting with ``"th"`` is replaced by one table paragraph;
        * a run of ``"li"`` is replaced by one list paragraph;
        * any other two neighbours with the same ``font_style`` are merged,
          their texts joined with a newline.
        """
        i = 0
        # Visit the last element too, so a trailing "th"/"li" is still
        # converted; the neighbour comparison below guards its own index.
        while i < len(paragraphs):
            style = paragraphs[i].font_style
            if style == "th":
                paragraphs = self.create_table(paragraphs, i)
                i += 1  # step over the table paragraph inserted at i
            elif style == "li":
                paragraphs, i = self.create_list(paragraphs, i)
                i += 1  # step over the list paragraph inserted at i
            elif i + 1 < len(paragraphs) and style == paragraphs[i + 1].font_style:
                # Merge the neighbour into the current paragraph and stay on
                # i, so a longer run keeps collapsing into it.
                paragraphs[i].text += "\n" + paragraphs[i + 1].text
                paragraphs.pop(i + 1)
            else:
                i += 1
        return paragraphs

    def create_table(self, paragraphs, i: int):
        """Replace the ``th``/``td`` run starting at index *i* by one table paragraph.

        The consecutive ``"th"`` paragraphs become the header row; the
        consecutive ``"td"`` paragraphs that follow are split into rows as
        wide as the header. The rows are passed to ``table_converter`` and
        the result becomes the text of a ``font_style="table"`` paragraph
        inserted at *i*. *paragraphs* is modified in place and returned.
        """
        titles = []
        while i < len(paragraphs) and paragraphs[i].font_style == "th":
            titles.append(paragraphs.pop(i).text)

        cells = []
        while i < len(paragraphs) and paragraphs[i].font_style == "td":
            cells.append(paragraphs.pop(i).text)

        # Header first, then the body rows; a final partial row is kept as-is
        # and no empty row is added when the cells divide evenly.
        table = [titles] + self._rows_from_cells(cells, len(titles))
        paragraphs.insert(
            i,
            Paragraph(text=table_converter(table), font_style="table", id_=i, page_id=1),
        )
        return paragraphs

    @staticmethod
    def _rows_from_cells(cells, width):
        """Split the flat *cells* list into consecutive rows of *width* items.

        The last row may be shorter. A *width* below 1 (no header cells) is
        treated as 1 so every cell still ends up in a row.
        """
        width = max(width, 1)
        return [cells[start:start + width] for start in range(0, len(cells), width)]

    def create_list(self, paragraphs, i: int):
        """Replace the run of ``li`` paragraphs starting at *i* by one list paragraph.

        The texts of the consecutive ``"li"`` paragraphs are formatted with
        :meth:`format_list` and stored in a ``font_style="list"`` paragraph
        inserted at *i*. A text-only ``"ul"``/``"ol"`` paragraph ends the run
        and is left untouched.

        Returns ``(paragraphs, i)``; the new list paragraph is at index *i*.
        """
        list_content = []
        while i < len(paragraphs) and paragraphs[i].font_style == "li":
            list_content.append(paragraphs.pop(i).text)

        list_paragraph = Paragraph(
            text=self.format_list(list_content), font_style="list", id_=i, page_id=1
        )
        # Always insert so the caller can rely on the list being at index i.
        paragraphs.insert(i, list_paragraph)
        return paragraphs, i

    def format_list(self, list_content):
        """Render *list_content* as numbered lines (``"1. item\\n"``).

        Items that are strings (including ``str`` subclasses) are written
        as-is; any other item is treated as a nested list and rendered
        recursively after its own number.
        """
        lines = []
        for number, item in enumerate(list_content, start=1):
            rendered = item if isinstance(item, str) else self.format_list(item)
            lines.append(f"{number}. {rendered}\n")
        return "".join(lines)