| import pandas as pd |
|
|
def add_pi_if_missing(input_string):
    """Insert " pi" after the first "se 3.14 for" when no "pi" follows it.

    The text after the first occurrence of the marker is checked with a
    plain substring test, so any later word containing the letters "pi"
    also counts as "pi already present".

    Args:
        input_string: Text to inspect.

    Returns:
        The text with " pi" inserted directly after the marker, or the
        unchanged input when the marker is absent or "pi" already appears
        somewhere after it.
    """
    marker = "se 3.14 for"
    # partition() splits at the first occurrence, which is the same position
    # str.index() would report; `found` is empty when the marker is absent.
    head, found, tail = input_string.partition(marker)
    if found and "pi" not in tail:
        return f"{head}{marker} pi{tail}"
    return input_string
|
|
def convert_mathml_to_fraction(mathml_content):
    """Flatten a MathML-ish string into plain text with inline fractions.

    The conversion is purely textual (no XML parsing): a fixed list of
    substring replacements is applied in order, then trailing "/" characters
    are stripped.

    Args:
        mathml_content: String that may contain MathML tags, HTML entities
            and WIRIS-style encoded fragments.

    Returns:
        The string with the known tags removed, ``<mfrac>`` rendered as
        ``(numerator/denominator)``, the listed entities decoded and any
        trailing "/" removed.
    """
    # Order matters: '</mn><mn>' must become '/' before the bare <mn> tags
    # are stripped, and '&amp;' is decoded last so that '&amp;lt;' is not
    # decoded twice.
    replacements = (
        ('<ast-r type="text" marker="1">', '___'),
        ('<mfrac>', '('),
        ('</mfrac>', ')'),
        ('</mn><mn>', '/'),
        ('<mn>', ''),
        ('</mn>', ''),
        ('<mi>', ''),
        ('</mi>', ''),
        ('<mo>', ''),
        ('</mo>', ''),
        ('<math>', ''),
        ('</math>', ''),
        # Presumably the WIRIS-encoded pi entity (code point 960); it is
        # dropped here rather than translated.
        ('<mi mathvariant=¨normal¨>§#960;', ''),
        # Entity names are spelled out explicitly: replacing ' ' with ' '
        # or '>' with '>' would be a no-op. A raw no-break space is
        # normalised as well.
        ('&nbsp;', ' '),
        ('\u00a0', ' '),
        ('§#160;', ' '),
        ('&gt;', '>'),
        ('&lt;', '<'),
        ('&amp;', '&'),
        ('«/math', ''),
    )
    for old, new in replacements:
        mathml_content = mathml_content.replace(old, new)

    # A dangling separator can be left behind at the end of the string.
    return mathml_content.rstrip('/')
|
|
|
|
| from bs4 import BeautifulSoup |
def alt_text(html: str) -> str:
    """Replace every ``<img>`` that has a non-empty ``alt`` with that text.

    Args:
        html: HTML fragment to process.

    Returns:
        The fragment re-serialised by BeautifulSoup. Images whose ``alt``
        attribute is missing or empty are left in place as tags.
    """
    soup = BeautifulSoup(html, 'html.parser')

    for img in soup.find_all('img'):
        alt = img.get('alt')
        if alt:
            # Swap the tag for its textual description.
            img.replace_with(alt)

    return str(soup)
|
|
def mathml_to_text(html):
    """Replace ``<math>`` elements with plain text and strip all markup.

    Each ``<math>`` element becomes ``numerator/denominator`` when its first
    ``<mfrac>`` contains exactly two ``<mn>`` descendants; otherwise it
    becomes the element's concatenated text.

    Args:
        html: HTML fragment that may contain MathML.

    Returns:
        The text of the whole fragment: text nodes are stripped and joined
        with single spaces, so no tags remain in the result.
    """
    soup = BeautifulSoup(html, 'html.parser')

    for math in soup.find_all('math'):
        frac = math.find('mfrac')
        nums = frac.find_all('mn') if frac else []
        if len(nums) == 2:
            # NOTE(review): the whole <math> element is replaced by the
            # fraction alone, so sibling content inside the same element
            # (e.g. a whole-number part) is dropped. Confirm that is intended.
            replacement = f"{nums[0].text}/{nums[1].text}"
        else:
            # No fraction, or one that is not a simple number pair.
            replacement = math.get_text()
        math.replace_with(replacement)

    return soup.get_text(separator=" ", strip=True)
|
|
|
|
def clean_text(input_text):
    """Turn an HTML/MathML snippet into cleaned plain text.

    Steps, in order: normalise no-break spaces, flatten MathML and strip
    markup (``mathml_to_text``), replace images with their alt text
    (``alt_text``), expand WIRIS formula images from ``data-mathml``, extract
    text, apply the textual MathML clean-up (``convert_mathml_to_fraction``)
    and finally ``add_pi_if_missing``.

    Args:
        input_text: The raw snippet, or a missing value (anything
            ``pd.isna`` reports as NA).

    Returns:
        The cleaned text, or "" when the input is missing or contains only
        whitespace.
    """
    if pd.isna(input_text) or input_text.strip() == "":
        return ""

    # Normalise no-break spaces before parsing; the parser would otherwise
    # decode '&nbsp;' to U+00A0 and leave it inside the text.
    input_text = input_text.replace('&nbsp;', ' ').replace('\u00a0', ' ')

    # NOTE(review): mathml_to_text returns tag-free text, so the image
    # handling below only sees <img> markup that arrived entity-escaped and
    # was decoded by that first pass. Confirm the input format before
    # reordering these steps.
    input_text = mathml_to_text(input_text)
    input_text = alt_text(input_text)

    soup = BeautifulSoup(input_text, 'html.parser')
    for img in soup.find_all('img', class_='Wirisformula'):
        mathml_formula = img.get('data-mathml')
        if not mathml_formula:
            continue

        # NOTE(review): when a marker is absent, find() returns -1 and the
        # slice below becomes [5:-1] instead of failing. Verify against real
        # data-mathml values before tightening this.
        start_index = mathml_formula.find('<math>') + len('<math>')
        end_index = mathml_formula.find('</math>')
        formula_body = mathml_formula[start_index:end_index]
        formula_body = formula_body.replace(
            'xmlns=¨http://www.w3.org/1998/Math/MathML¨»', '')
        img.replace_with(convert_mathml_to_fraction(formula_body))

    text = soup.get_text(separator=' ', strip=True)
    text = convert_mathml_to_fraction(text)
    return add_pi_if_missing(text)