File size: 4,157 Bytes
6256eb9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 | import pandas as pd
def add_pi_if_missing(input_string):
    """Insert " pi" after the phrase "se 3.14 for" when no "pi" follows it.

    The marker deliberately omits the first letter so that both "Use" and
    "use" match. Only the first occurrence of the marker is considered.

    Args:
        input_string: Text to inspect.

    Returns:
        ``input_string`` with " pi" inserted directly after the first marker,
        or the unchanged input when the marker is absent or the substring
        "pi" already appears anywhere after it.
    """
    marker = "se 3.14 for"
    if marker not in input_string:
        return input_string

    # Split once at the end of the first marker occurrence.
    cut = input_string.index(marker) + len(marker)
    before, after = input_string[:cut], input_string[cut:]

    # Plain substring test: any "pi" after the marker counts as present.
    if "pi" in after:
        return input_string
    return before + " pi" + after
def convert_mathml_to_fraction(mathml_content):
    """Flatten a MathML fragment into plain text with "(a/b)" fractions.

    The conversion is a fixed, ordered sequence of literal substring
    replacements (no parsing): ``<mfrac>``/``</mfrac>`` become parentheses,
    the boundary between two adjacent ``<mn>`` elements becomes ``/``, the
    remaining ``<mn>``, ``<mi>``, ``<mo>`` and ``<math>`` tags are dropped,
    and a handful of entities are decoded. A trailing ``/`` is stripped from
    the result.

    Args:
        mathml_content: String that may contain MathML markup.

    Returns:
        The string after all replacements, with trailing "/" removed.
    """
    # Order matters:
    #  * '</mn><mn>' must be rewritten to '/' before the bare <mn> tags are
    #    removed, otherwise the numerator/denominator boundary is lost.
    #  * '&amp;' is decoded last so '&amp;lt;' yields '&lt;' rather than '<'.
    replacements = (
        ('<ast-r type="text" marker="1">', "___"),
        ('<mfrac>', '('),
        ('</mfrac>', ')'),
        ('</mn><mn>', '/'),
        ('<mn>', ''),
        ('</mn>', ''),
        ('<mi>', ''),
        ('</mi>', ''),
        ('<mo>', ''),
        ('</mo>', ''),
        ('<math>', ''),
        ('</math>', ''),
        # '§' / '¨' / '«' variants: presumably an alternate MathML encoding
        # in which these stand in for '&', '"' and '<' -- TODO confirm.
        ('<mi mathvariant=¨normal¨>§#960;', ''),
        # Non-breaking spaces, both as an entity and as the raw character.
        # (Previously the search and replacement strings were identical,
        # which made these entity replacements no-ops.)
        ('&nbsp;', ' '),
        ('\xa0', ' '),
        ('§#160;', ' '),
        ('&gt;', '>'),
        ('&lt;', '<'),
        ('&amp;', '&'),
        ('«/math', ''),
    )
    for old, new in replacements:
        mathml_content = mathml_content.replace(old, new)
    return mathml_content.rstrip('/')
from bs4 import BeautifulSoup
def alt_text(html: str) -> str:
    """Swap every ``<img>`` that has a non-empty ``alt`` for that alt text.

    Images whose ``alt`` attribute is missing or empty are left in place.

    Args:
        html: Markup to process.

    Returns:
        The parsed document serialised back to a string.
    """
    document = BeautifulSoup(html, 'html.parser')
    for image in document.find_all('img'):
        description = image.get('alt')
        if not description:
            # No usable alt text: keep the <img> tag as it is.
            continue
        image.replace_with(description)
    return str(document)
def mathml_to_text(html):
    """Replace ``<math>`` elements with plain text and strip all other markup.

    Inside each ``<math>`` element, every ``<mfrac>`` containing exactly two
    ``<mn>`` descendants is rewritten in place as "numerator/denominator";
    the ``<math>`` element is then replaced by its text content. Any other
    ``<mfrac>`` is left alone and falls back to its concatenated text.

    Substituting each fraction in place (instead of replacing the whole
    ``<math>`` with the first fraction found) keeps sibling content such as
    the whole part of a mixed number, operators, and additional fractions.

    Args:
        html: Markup that may contain MathML.

    Returns:
        The text of the whole document, with each text fragment stripped and
        the fragments joined by single spaces.
    """
    soup = BeautifulSoup(html, 'html.parser')

    for math_tag in soup.find_all('math'):
        for frac in math_tag.find_all('mfrac'):
            nums = frac.find_all('mn')
            if len(nums) == 2:
                # Pad with spaces so a neighbouring number does not fuse with
                # the fraction ("2" + "1/2" must not read as "21/2"). For a
                # lone fraction the padding is stripped by get_text() below.
                frac.replace_with(f" {nums[0].text}/{nums[1].text} ")
        # Non-fraction math, and fractions that did not match, are reduced to
        # their raw text content.
        math_tag.replace_with(math_tag.get_text())

    # Return clean text
    return soup.get_text(separator=" ", strip=True)
def clean_text(input_text):
    """Turn an HTML/MathML snippet into plain text.

    Steps, in order:
      1. Return "" for missing (``pd.isna``) or whitespace-only input.
      2. Normalise non-breaking spaces to ordinary spaces.
      3. ``mathml_to_text``: flatten ``<math>`` elements and strip markup.
      4. ``alt_text``: replace ``<img>`` tags with their alt text.
      5. Replace remaining ``<img class="Wirisformula">`` tags with a text
         rendering of their ``data-mathml`` attribute.
      6. Extract text, run ``convert_mathml_to_fraction`` over it, and apply
         ``add_pi_if_missing``.

    Args:
        input_text: The raw cell value; expected to be a string or a missing
            value.

    Returns:
        The cleaned text, or "" for missing/blank input.
    """
    if pd.isna(input_text) or input_text.strip() == "":
        return ""

    # Replace and decode HTML entities.
    # The original call replaced a plain space with a plain space (a no-op);
    # normalise both the '&nbsp;' entity and the raw U+00A0 character so the
    # parser does not carry non-breaking spaces into the output.
    input_text = input_text.replace('&nbsp;', ' ').replace('\xa0', ' ')

    # NOTE(review): mathml_to_text() returns get_text() output, so steps 4-5
    # only see <img> tags that reappear after entity decoding (e.g. markup
    # that was escaped in the input) -- confirm this ordering is intended.
    input_text = mathml_to_text(input_text)
    # Replace <img> tags with their alt text
    input_text = alt_text(input_text)

    # Convert MathML if it exists
    soup = BeautifulSoup(input_text, 'html.parser')
    for img in soup.find_all('img', class_='Wirisformula'):
        mathml_formula = img.get('data-mathml')
        if mathml_formula:
            # Extract inner MathML content.
            # NOTE(review): str.find() returns -1 when the literal tag is
            # absent, which turns this slice into [5:-1]. That happens to
            # drop a leading '«math' and a trailing '»'; presumably the
            # attribute uses that '«…»' encoding -- verify before changing.
            start_index = mathml_formula.find('<math>') + len('<math>')
            end_index = mathml_formula.find('</math>')
            mathml_formula_content = mathml_formula[start_index:end_index]
            mathml_formula_content_cleaned = mathml_formula_content.replace(
                'xmlns=¨http://www.w3.org/1998/Math/MathML¨»', '')
            fraction = convert_mathml_to_fraction(mathml_formula_content_cleaned)
            img.replace_with(fraction)

    text = soup.get_text(separator=' ', strip=True)
    text = convert_mathml_to_fraction(text)
    text = add_pi_if_missing(text)
    return text