File size: 4,157 Bytes
6256eb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import pandas as pd

def add_pi_if_missing(input_string):
    """Insert " pi" after the first "se 3.14 for" when no "pi" follows it.

    The string is split at the first occurrence of the marker
    "se 3.14 for". If the marker is present and the remainder of the string
    does not contain the substring "pi", " pi" is inserted directly after
    the marker. In every other case the input is returned unchanged.

    Args:
        input_string: Text to inspect.

    Returns:
        The text, with " pi" inserted after the marker when it was missing.
    """
    marker = "se 3.14 for"
    before, found, after = input_string.partition(marker)
    # `found` is empty when the marker does not occur at all.
    if found and "pi" not in after:
        return before + found + " pi" + after
    return input_string

def convert_mathml_to_fraction(mathml_content):
    """Strip a fixed set of MathML tags and entities from a string.

    A sequence of literal substring substitutions is applied in order:
    ``<mfrac>``/``</mfrac>`` become parentheses, an adjacent
    ``</mn><mn>`` pair becomes ``/``, the remaining ``mn``/``mi``/``mo``/
    ``math`` tags are removed, a handful of entities are decoded, and
    finally any trailing ``/`` characters are stripped.

    Args:
        mathml_content: Text that may contain MathML markup.

    Returns:
        The text after all substitutions, without trailing ``/`` characters.
    """
    # Applied strictly top to bottom; each step sees the previous step's
    # output, so the order (and the repeated <mn>/</mn> pass, which can
    # catch tags re-formed by the removals in between) must be kept.
    substitutions = (
        ('<ast-r type="text" marker="1">', "___"),
        ('<mfrac>', '('),
        ('</mfrac>', ')'),
        ('</mn><mn>', '/'),
        ('<mn>', ''),
        ('</mn>', ''),
        ('<mi>', ''),
        ('</mi>', ''),
        ('<mo>', ''),
        ('</mo>', ''),
        ('<mn>', ''),
        ('</mn>', ''),
        ('<math>', ''),
        ('</math>', ''),
        ('<mi mathvariant=¨normal¨>§#960;', ''),
        ('&nbsp;', ' '),
        ('§#160;', ' '),
        ('&gt;', '>'),
        ('&lt;', '<'),
        # '&amp;' is decoded last, so '&amp;lt;' ends up as '&lt;', not '<'.
        ('&amp;', '&'),
        ('«/math', ''),
    )

    converted = mathml_content
    for needle, replacement in substitutions:
        converted = converted.replace(needle, replacement)

    return converted.rstrip('/')


from bs4 import BeautifulSoup
def alt_text(html: str) -> str:
    """Replace each ``<img>`` that has a non-empty ``alt`` with that text.

    Images whose ``alt`` attribute is missing or empty are left in place.

    Args:
        html: HTML markup to process.

    Returns:
        The markup, serialised back to a string after the substitutions.
    """
    document = BeautifulSoup(html, 'html.parser')

    for image in document.find_all('img'):
        description = image.get('alt')
        if not description:
            # No usable alt text: keep the original tag.
            continue
        image.replace_with(description)

    return str(document)

def mathml_to_text(html):
    """Flatten HTML to plain text, rendering simple MathML fractions as ``a/b``.

    Inside each ``<math>`` element, every ``<mfrac>`` that contains exactly
    two ``<mn>`` elements is rewritten as ``numerator/denominator``. The
    ``<math>`` element is then replaced by its text content, so anything
    next to the fraction (the whole part of a mixed number, operators,
    identifiers, further fractions) is kept instead of being discarded
    together with the rest of the element.

    Args:
        html: HTML markup, possibly containing MathML.

    Returns:
        The document's text, with each text node stripped and the nodes
        joined by single spaces.
    """
    soup = BeautifulSoup(html, 'html.parser')

    for math in soup.find_all('math'):
        rewrote_fraction = False
        for frac in math.find_all('mfrac'):
            nums = frac.find_all('mn')
            if len(nums) == 2:
                # Pad with spaces so neighbouring tokens do not fuse with
                # the fraction (e.g. 3 and 1/2 must not become "31/2").
                frac.replace_with(f" {nums[0].text}/{nums[1].text} ")
                rewrote_fraction = True
            # Anything else is not a plain numeric fraction; it is left
            # as-is and falls through to the get_text() below.

        math_text = math.get_text()
        if rewrote_fraction:
            # Collapse the padding introduced above into single spaces.
            math_text = " ".join(math_text.split())
        math.replace_with(math_text)

    # Return clean text
    return soup.get_text(separator=" ", strip=True)


def clean_text(input_text):
    """Reduce an HTML/MathML value to plain text.

    Returns "" when ``pd.isna`` reports the value as missing or when it is
    blank after stripping. Otherwise the value is passed, in this order,
    through: ``&nbsp;`` replacement, ``mathml_to_text``, ``alt_text``,
    substitution of ``img.Wirisformula`` elements carrying a
    ``data-mathml`` attribute, text extraction,
    ``convert_mathml_to_fraction`` and ``add_pi_if_missing``.

    Args:
        input_text: Raw cell value (a string, or a missing value).

    Returns:
        The cleaned plain-text string.
    """
    if pd.isna(input_text):
        return ""
    if input_text.strip() == "":
        return ""

    markup = input_text.replace('&nbsp;', ' ')

    # NOTE(review): mathml_to_text returns tag-stripped text, so the <img>
    # handling in alt_text and in the loop below appears to fire only when
    # markup survives as text (e.g. entity-escaped tags) — confirm that
    # this call order is intended.
    markup = alt_text(mathml_to_text(markup))

    soup = BeautifulSoup(markup, 'html.parser')
    open_tag = '<math>'
    close_tag = '</math>'
    for formula_img in soup.find_all('img', class_='Wirisformula'):
        encoded = formula_img.get('data-mathml')
        if not encoded:
            continue
        # NOTE(review): str.find returns -1 when a tag is absent, which
        # turns this slice into [5:-1]; unclear whether that is relied
        # upon — confirm before changing.
        inner = encoded[encoded.find(open_tag) + len(open_tag):encoded.find(close_tag)]
        inner = inner.replace('xmlns=¨http://www.w3.org/1998/Math/MathML¨»', '')
        formula_img.replace_with(convert_mathml_to_fraction(inner))

    flattened = soup.get_text(separator=' ', strip=True)
    return add_pi_if_missing(convert_mathml_to_fraction(flattened))