Spaces:

Jiangxz01
/

SuryaOCR

Running

App Files Files

SuryaOCR / surya /postprocessing /math /latex.py

Jiangxz01

Upload 56 files

52f1bcb verified about 1 year ago

raw

history blame

3.31 kB

	import re
	from ftfy import fix_text


	def contains_math(text):
	    """Report whether ``text`` has a ``$`` as its first or its last character.

	    Only the two ends of the string are inspected; a ``$`` in the middle
	    does not count. An empty string yields ``False``.
	    """
	    first_char, last_char = text[:1], text[-1:]
	    return first_char == "$" or last_char == "$"


	def fix_math(text):
	    """Clean up a math string by running it through a fixed chain of passes.

	    The passes are applied in this order, each one receiving the output
	    of the previous one:

	    1. ``fix_text`` (from ftfy) - repairs problems in the raw text.
	    2. ``remove_labels`` - drops LaTeX labels and references.
	    3. ``replace_katex_invalid`` - rewrites constructs KaTeX cannot render.
	    4. ``fix_fences`` - balances the ``$`` delimiters at both ends.

	    Returns the cleaned string.
	    """
	    cleanup_passes = (fix_text, remove_labels, replace_katex_invalid, fix_fences)
	    for cleanup in cleanup_passes:
	        text = cleanup(text)
	    return text


	def remove_labels(text):
	    r"""Delete ``\label{...}``, ``\ref{...}`` and ``\pageref{...}`` from ``text``.

	    Each command is removed together with its braced argument. The
	    argument is matched up to the first ``}``, so nested braces are not
	    supported. The commands are processed one after another, in the order
	    label, ref, pageref.
	    """
	    for command in ("label", "ref", "pageref"):
	        # Builds e.g. \\label\{[^}]*\} - the command name followed by {...}.
	        text = re.sub(r'\\' + command + r'\{[^}]*\}', '', text)
	    return text


	def replace_katex_invalid(string):
	    r"""Rewrite LaTeX constructs that KaTeX cannot render.

	    Applied in order:

	    * ``\tag{...}`` is removed entirely.
	    * ``\Big{...}``, ``\Bigg{...}``, ``\big{...}`` and ``\bigg{...}`` are
	      replaced by their braced content.
	    * ``\quad\mbox{...}`` and ``\mbox{...}`` are replaced by their braced
	      content.
	    * Stray ``$`` characters inside ``$$ ... $$`` blocks are dropped via
	      ``remove_inner_dollars``.

	    Braced arguments are matched non-greedily up to the first ``}``, and
	    ``.`` does not cross newlines in these patterns.

	    Returns the rewritten string.
	    """
	    string = re.sub(r'\\tag\{.*?\}', '', string)
	    # A bare ``|`` is required for alternation; an escaped ``\|`` would only
	    # match a literal pipe character and the sizing commands would be kept.
	    string = re.sub(r'\\(?:Bigg?|bigg?)\{(.*?)\}', r'\1', string)
	    # The \quad form must run before the plain \mbox form so that the
	    # leading \quad is consumed together with the box.
	    string = re.sub(r'\\quad\\mbox\{(.*?)\}', r'\1', string)
	    string = re.sub(r'\\mbox\{(.*?)\}', r'\1', string)
	    return remove_inner_dollars(string)


	def remove_inner_dollars(text):
	    """Drop stray ``$`` characters found inside ``$$ ... $$`` blocks.

	    Every non-greedy ``$$ ... $$`` span (which may extend over several
	    lines) is rebuilt with its outer ``$$`` fences kept and any ``$`` in
	    between removed. Text outside such spans is returned unchanged.
	    """
	    display_block = re.compile(r'\$\$(.*?)\$\$', re.DOTALL)
	    return display_block.sub(
	        lambda found: '$$' + found.group(1).replace('$', '') + '$$',
	        text,
	    )


	def extract_latex_with_positions(text):
	    """Find every ``$$ ... $$`` and ``$ ... $`` span in ``text``.

	    Spans are matched non-greedily and may extend over several lines.
	    The ``$$`` alternative is listed first so that a display block is not
	    mistaken for two empty inline spans.

	    Returns a list of ``(matched_text, start, end)`` tuples in order of
	    appearance, where ``start``/``end`` are indices into ``text`` such
	    that ``text[start:end] == matched_text``. The list is empty when no
	    span is found.
	    """
	    # ``.*?`` (not ``.?``) lets the body have any length, and the bare ``|``
	    # makes the two fence styles real alternatives.
	    pattern = r'(\$\$.*?\$\$|\$.*?\$)'
	    return [
	        (found.group(), found.start(), found.end())
	        for found in re.finditer(pattern, text, re.DOTALL)
	    ]


	def slice_latex(text):
	    """Split ``text`` into an ordered list of plain-text and LaTeX chunks.

	    The LaTeX spans come from ``extract_latex_with_positions``. Each chunk
	    is a dict with two keys: ``"text"`` (the substring) and ``"type"``
	    (either ``"text"`` or ``"latex"``). Concatenating the ``"text"`` values
	    in order reproduces the input; empty plain-text chunks are not emitted.
	    """
	    pieces = []
	    cursor = 0  # index just past the last LaTeX span consumed so far
	    for latex, begin, finish in extract_latex_with_positions(text):
	        # Plain text sitting between the previous span and this one.
	        if cursor < begin:
	            pieces.append({"text": text[cursor:begin], "type": "text"})
	        pieces.append({"text": latex, "type": "latex"})
	        cursor = finish

	    # Whatever follows the final LaTeX span (or the whole input if none).
	    trailing = text[cursor:]
	    if trailing:
	        pieces.append({"text": trailing, "type": "text"})
	    return pieces


	def is_latex(text):
	    r"""Heuristically decide whether ``text`` contains LaTeX markup.

	    Returns ``True`` if any of the following occurs anywhere in ``text``
	    (spans may cross newlines), otherwise ``False``:

	    * a ``\begin{name}`` or ``\end{name}`` environment marker,
	    * a ``$ ... $`` or ``$$ ... $$`` span,
	    * a backslash followed by letters (a command such as ``\alpha``),
	    * a backslash followed by a single non-letter character.
	    """
	    latex_patterns = [
	        r'\\(?:begin|end)\{[a-zA-Z]*\}',
	        r'\$.*?\$',
	        r'\$\$.*?\$\$',
	        r'\\[a-zA-Z]+',
	        r'\\[^a-zA-Z]',
	    ]

	    # Join with a bare ``|`` so the entries are alternatives; joining with
	    # ``\|`` would glue them into one pattern separated by literal pipes.
	    combined_pattern = '|'.join(latex_patterns)
	    return re.search(combined_pattern, text, re.DOTALL) is not None


	def fix_fences(text):
	    """Balance the ``$`` fences at the two ends of ``text``.

	    Exactly one of the following applies, checked in this order:

	    * starts with ``$$`` but does not end with ``$$``: the end is completed
	      to ``$$`` (one ``$`` is appended if the text already ends with ``$``,
	      otherwise two);
	    * ends with ``$$`` but does not start with ``$$``: the start is
	      completed to ``$$`` in the same way;
	    * starts with ``$`` and does not end with ``$``: ``$`` is prepended and
	      ``$$`` appended;
	    * ends with ``$`` and does not start with ``$``: ``$$`` is prepended
	      and ``$`` appended.

	    In every other case (including the empty string) ``text`` is returned
	    unchanged.
	    """
	    opens_double, closes_double = text.startswith("$$"), text.endswith("$$")
	    opens_single, closes_single = text.startswith("$"), text.endswith("$")

	    if opens_double and not closes_double:
	        return text + ("$" if closes_single else "$$")

	    if closes_double and not opens_double:
	        return ("$" if opens_single else "$$") + text

	    if opens_single and not closes_single:
	        return "$" + text + "$$"

	    if closes_single and not opens_single:
	        return "$$" + text + "$"

	    return text


	def strip_fences(text):
	    """Return ``text`` with every leading and trailing ``$`` removed.

	    ``$`` characters in the interior of the string are kept.
	    """
	    return text.strip("$")