Spaces:

Andreas99
/

LitBench-UI

Build error

App Files Files Community

LitBench-UI / src /utils /utils.py

Andreas99

Upload 22 files

908351f verified 11 months ago

raw

history blame contribute delete

23.5 kB

	import sys
	import regex
	import yaml
	import shutil
	import bibtexparser
	from charset_normalizer import from_path
	from langdetect import detect
	import os
	import subprocess
	import numpy as np
	import networkx as nx
	import re


	def is_venv():
	    """Tell whether the interpreter prefixes indicate a virtual environment.

	    Returns True when ``sys`` has a ``real_prefix`` attribute, or when it has
	    a ``base_prefix`` attribute that differs from ``sys.prefix``.
	    """
	    if hasattr(sys, 'real_prefix'):
	        return True
	    if not hasattr(sys, 'base_prefix'):
	        return False
	    return sys.base_prefix != sys.prefix

	def read_yaml_file(file_path):
	    """Parse the YAML document stored at ``file_path``.

	    Returns whatever ``yaml.safe_load`` produces. If the parser raises
	    ``yaml.YAMLError`` the error is printed and ``None`` is returned.
	    """
	    with open(file_path, 'r') as stream:
	        try:
	            return yaml.safe_load(stream)
	        except yaml.YAMLError as e:
	            print(f"Error reading YAML file: {e}")
	    return None

	def read_tex_file(file_path):
	    """Return the full contents of ``file_path`` decoded as UTF-8."""
	    with open(file_path, mode='r', encoding='utf-8') as handle:
	        return handle.read()

	def write_tex_file(file_path, s):
	    """Overwrite ``file_path`` with the string ``s`` encoded as UTF-8."""
	    with open(file_path, mode='w', encoding='utf-8') as handle:
	        handle.write(s)

	def get_core(s):
	    """Return the text between ``\\begin{document}`` and ``\\end{document}``.

	    The first ``\\begin{document}`` and the last ``\\end{document}`` delimit
	    the result. A missing marker no longer corrupts the slice: without the
	    begin marker the core starts at the beginning of ``s``; without the end
	    marker it runs to the end of ``s``.
	    """
	    start = '\\begin{document}'
	    end = '\\end{document}'
	    beginning_doc = s.find(start)
	    end_doc = s.rfind(end)
	    # ``find``/``rfind`` return -1 when the marker is absent; using that value
	    # directly as a slice bound would skip 15 characters or drop the last one.
	    core_start = 0 if beginning_doc == -1 else beginning_doc + len(start)
	    core_end = len(s) if end_doc == -1 else end_doc
	    return s[core_start:core_end]


	def retrieve_text(text, command, keep_text=False):
	    """Removes '\\command{*}' from the string 'text', or extracts its argument.

	    Args:
	        text: The LaTeX source to search.
	        command: Command name without the leading backslash. It is inserted
	            into the regular expression as-is (not escaped).
	        keep_text: When True, return the content of the braces of the first
	            ``\\command{...}`` occurrence instead of removing anything.

	    Returns:
	        With ``keep_text=True``: the brace content of the first match, or
	        ``None`` when the command does not occur.
	        With ``keep_text=False``: ``text`` with every match removed. A match
	        followed only by whitespace up to the next newline is replaced by
	        ``%`` so that the line structure is not altered.

	    Regex `base_pattern` used to match balanced parentheses taken from:
	    https://stackoverflow.com/questions/546433/regular-expression-to-match-balanced-parentheses/35271017#35271017
	    """
	    # Optional ``[...]`` groups, one balanced ``{...}`` argument (``(?1)``
	    # recurses into capture group 1), then optional ``[...]`` groups again.
	    base_pattern = (
	        r'\\' + command + r"(?:\[(?:.*?)\])*\{((?:[^{}]+|\{(?1)\})*)\}(?:\[(?:.*?)\])*"
	    )

	    def extract_text_inside_curly_braces(text):
	        """Extract text inside of {} from command string"""
	        pattern = r"\{((?:[^{}]|(?R))*)\}"
	        match = regex.search(pattern, text)
	        return match.group(1) if match else ""

	    if keep_text:
	        first = regex.search(base_pattern, text)
	        if first is None:
	            return None
	        return extract_text_inside_curly_braces(first.group(0))

	    all_substitutions = []
	    for match in regex.finditer(base_pattern, text):
	        start, end = match.span()
	        new_substring = ""
	        # In case there are only spaces or nothing up to the following newline,
	        # adds a percent, not to alter the newlines.
	        if end < len(text):
	            next_newline = text.find("\n", end)
	            if next_newline != -1:
	                text_until_newline = text[end:next_newline]
	                if not text_until_newline or text_until_newline.isspace():
	                    new_substring = "%"
	        all_substitutions.append((start, end, new_substring))

	    # Apply from the end so that earlier spans keep their offsets.
	    for start, end, new_substring in reversed(all_substitutions):
	        text = text[:start] + new_substring + text[end:]
	    return text


	def reduce_linebreaks(s):
	    """Collapse runs of two or more line breaks into a single blank line.

	    Each line break may be followed by any number of spaces or tabs; the whole
	    run (including that trailing whitespace) is replaced by ``'\\n\\n'``.
	    A lone line break is left untouched.
	    """
	    # ``[ \t]*`` (not a single ``[ \t]``) so that plain ``\n\n\n`` runs and
	    # whitespace-only lines of any width are both collapsed.
	    return re.sub(r'(?:\n[ \t]*){2,}', '\n\n', s)


	def replace_percentage(s):
	    """Drop a ``%`` (plus any spaces after it) that sits directly before a newline."""
	    trailing_percent = re.compile(r'% *\n')
	    return trailing_percent.sub('\n', s)


	def reduce_spaces(s):
	    """Squeeze every run of consecutive spaces in ``s`` down to one space."""
	    space_run = re.compile(' +')
	    return space_run.sub(' ', s)


	def delete_urls(s):
	    """Remove every ``http`` token (``http`` followed by non-whitespace) from ``s``."""
	    url_token = re.compile(r'http\S+')
	    return url_token.sub('', s)


	def remove_tilde(s):
	    """Tidy punctuation spacing and drop empty brace pairs.

	    Applied in order: a single ``~`` or space directly before ``.`` is
	    removed, a single ``~`` or space directly before ``,`` is removed, and
	    every literal ``{}`` is deleted.
	    """
	    substitutions = (
	        (r'[~ ]\.', '.'),
	        (r'[~ ],', ','),
	        (r'{}', ''),
	    )
	    for pattern, replacement in substitutions:
	        s = re.sub(pattern, replacement, s)
	    return s


	def remove_verbatim_words(s):
	    """Strip configured LaTeX commands from ``s`` and normalise the abstract.

	    The command lists are read from ``configs/latex_commands.yaml`` (relative
	    to the current working directory) on every call. Keys used:

	    * ``verbatim_to_delete``: literal strings removed as-is.
	    * ``two_arguments`` / ``three_arguments``: commands removed together with
	      two / three flat ``{...}`` arguments (no nested braces).
	    * ``two_arguments_elaborate`` / ``three_arguments_elaborate``: commands
	      removed through ``remove_multargument``, which counts nested braces.
	    * ``replace_comments``: commands replaced by ``%``.

	    Finally ``\\end{abstract}`` is removed and ``\\begin{abstract}`` becomes
	    the heading ``Abstract`` (case-insensitive, optional whitespace inside
	    the braces).

	    Command names from the config are inserted into regular expressions
	    unescaped.
	    """
	    with open("configs/latex_commands.yaml", "r") as stream:
	        read_config = yaml.safe_load(stream)

	    for command in read_config['verbatim_to_delete']:
	        s = s.replace(command, '')

	    # One flat brace group: ``{``, any run of non-``}`` characters, ``}``.
	    # The ``*`` matters: without it only one-character arguments match.
	    brace_group = r'{[^}]*}'

	    for command in read_config['two_arguments']:
	        s = re.sub(r'\\' + command + brace_group * 2, '', s)

	    for command in read_config['three_arguments']:
	        s = re.sub(r'\\' + command + brace_group * 3, '', s)

	    for command in read_config['two_arguments_elaborate']:
	        s = remove_multargument(s, '\\' + command, 2)

	    for command in read_config['three_arguments_elaborate']:
	        s = remove_multargument(s, '\\' + command, 3)

	    for command in read_config['replace_comments']:
	        s = re.sub(r'\\' + command, '%', s)

	    s = re.sub(
	        r'\\end{[\s]*abstract[\s]*}',
	        '',
	        s,
	        flags=re.IGNORECASE
	    )

	    s = re.sub(
	        r'\\begin{[\s]*abstract[\s]*}',
	        'Abstract\n\n',
	        s,
	        flags=re.IGNORECASE
	    )
	    return s


	def yes_or_no(s):
	    """Map an answer prefix to a code: ``Yes`` -> 1, ``No`` -> 0, anything else -> -1.

	    Only the first characters are inspected and the comparison is
	    case-sensitive.
	    """
	    if s[0:3] == "Yes":
	        return 1
	    if s[0:2] == "No":
	        return 0
	    return -1


	def get_main(directory):
	    """Pick the main ``.tex`` file of the LaTeX project under ``directory``.

	    All ``.tex`` files below ``directory`` are collected. With none, ``None``
	    is returned; with exactly one, that file is returned. Otherwise a directed
	    graph is built with an edge from each file to every project file it pulls
	    in via ``\\input``/``\\include`` (after ``replace_imports`` has rewritten
	    ``\\import``, ``\\subfile`` and ``\\include*`` in the file on disk), and
	    the largest weakly connected component(s) are inspected:

	    * one largest component: its first node without predecessors is returned;
	    * several of equal size: among their nodes without predecessors, those
	      for which ``check_begin`` returns True are kept and the biggest file
	      (by size on disk) is returned. A ``check_begin`` failure counts as
	      False.

	    Included files are matched by base name only, with ``.tex`` appended
	    when missing.
	    """
	    latex_paths = [
	        os.path.join(root, name)
	        for root, _, files in os.walk(directory)
	        for name in files
	        if name.endswith('.tex')
	    ]
	    number_tex = len(latex_paths)
	    if number_tex == 0:
	        return None
	    if number_tex == 1:
	        return latex_paths[0]

	    adjacency = np.zeros((number_tex, number_tex))
	    keys = [os.path.basename(path) for path in latex_paths]
	    # ``[*]`` is a literal star (``\include*{...}``); each ``(.*?)`` lazily
	    # captures the argument up to the first closing brace.
	    reg_ex = r'\\input{(.*?)}|\\include{(.*?)}|\\import{(.*?)}|\\subfile{(.*?)}|\\include[*]{(.*?)}'
	    reg_ex_clean = r'\\input{(.*?)}|\\include{(.*?)}'
	    for i, file in enumerate(latex_paths):
	        content = read_tex_file(file)
	        # Skip files that reference nothing (matches with only empty groups
	        # do not count).
	        if not any(any(groups) for groups in re.findall(reg_ex, content)):
	            continue
	        content = replace_imports(file, content)
	        for groups in re.findall(reg_ex_clean, content):
	            for included in filter(None, groups):
	                base_match = os.path.basename(included)
	                if not base_match.endswith('.tex'):
	                    base_match = base_match + '.tex'
	                if base_match in keys:
	                    adjacency[i][keys.index(base_match)] = 1

	    G = nx.from_numpy_array(adjacency, create_using=nx.DiGraph)
	    connected_components = list(nx.weakly_connected_components(G))
	    maximum_size = max(len(x) for x in connected_components)
	    biggest_connected = [x for x in connected_components if len(x) == maximum_size]

	    if len(biggest_connected) > 1:
	        roots = [n for connected in biggest_connected for n in connected if not list(G.predecessors(n))]
	        _check = []
	        for r in roots:
	            try:
	                _check.append(check_begin(latex_paths[r]))
	            except Exception:
	                # Unreadable or undetectable files are simply not candidates.
	                _check.append(False)
	        potentials_files = [latex_paths[x] for x, y in zip(roots, _check) if y]
	        sizes_files = [os.path.getsize(x) for x in potentials_files]
	        return potentials_files[sizes_files.index(max(sizes_files))]

	    roots = [n for n in biggest_connected[0] if not list(G.predecessors(n))]
	    return latex_paths[roots[0]]


	def initial_clean(directory, config):
	    """Run ``arxiv_latex_cleaner`` on ``directory`` and replace it with the result.

	    Args:
	        directory: Project folder to clean. Trailing slashes are ignored when
	            deriving the sibling ``<directory>_temp`` backup and the
	            ``<directory>_arXiv`` output folder names.
	        config: When truthy, ``--config configs/cleaning_config.yaml`` is
	            passed to the cleaner.

	    A backup copy of ``directory`` is made first. If the cleaner exits with a
	    non-zero status (or anything else in that step raises), ``directory`` is
	    restored from the backup, every ``.tex`` file whose encoding
	    ``charset_normalizer`` can detect is rewritten as UTF-8, and the cleaner
	    is run once more. Finally ``directory`` is deleted and the
	    ``<directory>_arXiv`` folder is renamed to take its place.

	    The cleaner is invoked with an argument list (no shell), so paths that
	    contain spaces or shell metacharacters are passed through unchanged.
	    """
	    base = directory.rstrip('/')
	    temp_dir = base + '_temp' + '/'
	    cleaned_directory = base + '_arXiv'

	    cleaner_cmd = ['arxiv_latex_cleaner', '--keep_bib', directory]
	    if config:
	        cleaner_cmd += ['--config', 'configs/cleaning_config.yaml']

	    shutil.copytree(directory, temp_dir)
	    try:
	        if subprocess.run(cleaner_cmd).returncode != 0:
	            raise Exception('Error cleaning')
	        shutil.rmtree(temp_dir)
	    except Exception:
	        # Start over from the untouched backup.
	        shutil.rmtree(directory)
	        os.rename(temp_dir, directory)
	        for root, _, files in os.walk(directory):
	            for name in files:
	                if not name.endswith('.tex'):
	                    continue
	                p = os.path.join(root, name)
	                best = from_path(p).best()
	                if best is None:
	                    # No encoding guess: keep the file rather than
	                    # overwriting it with the text "None".
	                    continue
	                with open(p, 'w', encoding='utf-8') as f:
	                    f.write(str(best))
	        subprocess.run(cleaner_cmd)

	    shutil.rmtree(directory)
	    os.rename(cleaned_directory, directory)


	def check_begin(directory):
	    """Check that a TeX file has ``\\begin{document}`` and is detected as English.

	    ``directory`` is the path of the file to read (despite its name). The
	    language is detected before the marker is looked up, so a detection
	    error propagates even for files without the marker.
	    """
	    content = read_tex_file(directory)
	    english = detect(content) == 'en'
	    if not re.findall(r'\\begin{document}', content):
	        return False
	    return english


	def post_processing(extracted_dir, file):
	    """Flatten, de-macro and clean a LaTeX project down to one ``.tex`` file.

	    Steps, in order:

	    1. ``perl_expand`` merges the project into ``merged_latexpand.tex`` next
	       to ``file``.
	    2. ``de_macro``, ``def_handle``, ``declare_operator`` and ``de_macro``
	       again are each attempted; a failure in any of them is ignored and the
	       pipeline continues with the current file. A successful ``de_macro``
	       switches to the ``-clean`` file name it is expected to produce.
	    3. ``initial_clean`` runs on the file's folder, with and then without the
	       cleaning config.
	    4. The text is passed through ``remove_verbatim_words``,
	       ``replace_percentage``, ``reduce_linebreaks``, ``remove_tilde``,
	       ``delete_urls`` and ``reduce_spaces``, then stripped.
	    5. ``extracted_dir`` is wiped and recreated, the result is written to
	       ``final_cleaned.tex`` inside it, and ``initial_clean`` runs once more.

	    ``extracted_dir`` is concatenated with the file name directly, so it is
	    expected to end with a path separator.

	    Returns:
	        The path ``extracted_dir + 'final_cleaned.tex'``.
	    """
	    _dir = os.path.dirname(file) + '/'
	    perl_expand(file)
	    file = _dir + 'merged_latexpand.tex'

	    try:
	        de_macro(file)
	    except Exception:
	        pass
	    else:
	        file = _dir + 'merged_latexpand-clean.tex'

	    try:
	        def_handle(file)
	    except Exception:
	        pass

	    try:
	        declare_operator(file) # has additional add-ons
	    except Exception:
	        pass

	    try:
	        de_macro(file)
	    except Exception:
	        pass
	    else:
	        stem = os.path.splitext(os.path.basename(file))[0]
	        file = _dir + stem + '-clean' + '.tex'

	    initial_clean(_dir, config=True)
	    initial_clean(_dir, config=False)

	    final_tex = read_tex_file(file)
	    cleaning_steps = (
	        remove_verbatim_words,
	        replace_percentage,
	        reduce_linebreaks,
	        remove_tilde,
	        delete_urls,
	        reduce_spaces,
	    )
	    for step in cleaning_steps:
	        final_tex = step(final_tex)
	    final_tex = final_tex.strip()

	    output_path = extracted_dir + 'final_cleaned.tex'
	    shutil.rmtree(extracted_dir)
	    os.makedirs(extracted_dir)
	    write_tex_file(output_path, final_tex)
	    initial_clean(extracted_dir, config=False)
	    return extracted_dir + 'final_cleaned.tex'


	def perl_expand(file):
	    """Run the bundled ``latexpand`` Perl script on ``file``.

	    The script is copied from ``./src/utils/latexpand`` (relative to the
	    current working directory) into the folder of ``file`` and executed
	    there. Its standard output is written to ``merged_latexpand.tex`` in that
	    folder; standard error is discarded and the exit status is not checked.

	    The working directory is changed for the duration of the call and is
	    always restored, even when opening the output file or starting ``perl``
	    fails.
	    """
	    # Save the current working directory
	    oldpwd = os.getcwd()
	    target_dir = os.path.dirname(file) + '/'
	    target = os.path.join(target_dir, 'latexpand')
	    src = './src/utils/latexpand'
	    # Copy the `latexpand` script to the target directory
	    shutil.copyfile(src, target)
	    # Change to the target directory
	    os.chdir(target_dir)
	    try:
	        # Run the perl command without shell=True and handle redirection within Python
	        with open('merged_latexpand.tex', 'w') as output_file:
	            subprocess.run(['perl', 'latexpand', os.path.basename(file)],
	                           stdout=output_file, stderr=subprocess.DEVNULL)
	    finally:
	        # Always return to the original directory
	        os.chdir(oldpwd)


	def de_macro(file):
	    """Run the bundled ``de-macro.py`` script on ``file``.

	    The script is copied from ``./src/utils/de-macro.py`` (relative to the
	    current working directory) into the folder of ``file`` and executed there
	    with ``python3``; its output streams are discarded.

	    Raises:
	        Exception: ``"Error de-macro: ..."`` when the script exits with a
	            non-zero status.
	    """
	    # Save the current working directory
	    oldpwd = os.getcwd()
	    target_dir = os.path.dirname(file) + '/'
	    # Construct the target path
	    target = os.path.join(target_dir, 'de-macro.py')
	    # Same ``./src/utils`` location that ``perl_expand`` and ``def_handle``
	    # use for their scripts.
	    src = './src/utils/de-macro.py'

	    # Copy the `de-macro.py` script to the target directory
	    shutil.copyfile(src, target)
	    # Change to the target directory
	    os.chdir(target_dir)

	    # Run the de-macro script without os.system and capture errors
	    try:
	        subprocess.run(['python3', 'de-macro.py', os.path.basename(file)],
	                       stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
	    except subprocess.CalledProcessError as e:
	        raise Exception(f"Error de-macro: {e}") from e
	    finally:
	        # Always return to the original directory
	        os.chdir(oldpwd)


	def def_handle(file):
	    """Run ``src/utils/def_handle.py`` on ``file``, rewriting it in place.

	    The script is started with ``python3`` and receives ``file`` both as input
	    and as ``--output``. Arguments are passed as a list (no shell), so paths
	    containing spaces or shell metacharacters are handled correctly.

	    Raises:
	        Exception: ``'Error def handle'`` when the script exits with a
	            non-zero status.
	    """
	    result = subprocess.run(
	        ['python3', 'src/utils/def_handle.py', file, '--output', file]
	    )
	    if result.returncode != 0:
	        raise Exception('Error def handle')


	def declare_operator(file):
	    """Normalise macro, title, citation, reference and sectioning commands.

	    The file at ``file`` is read, rewritten and written back in place:

	    1. ``\\DeclareMathOperator`` and starred / ``provide`` definition commands
	       are mapped to ``\\newcommand`` / ``\\renewcommand`` / environments.
	    2. Title command variants are renamed to ``\\title``. The first title's
	       text gets ``\\\\``, newlines and ``~`` replaced by spaces; if the first
	       ``\\title`` sits before ``\\begin{document}``, every ``\\maketitle``
	       is replaced by that cleaned title text.
	    3. Citation variants become ``\\cite``, reference variants ``\\ref``, and
	       abbreviated theorem-like environment names are expanded.
	    4. Sectioning names (``subsection``, ``chapter``, starred forms, ...) are
	       collapsed to ``section``.
	    5. Only the document body (see ``get_core``) is kept, and the title
	       cleanup is applied once more to it.

	    All substitution tables are applied sequentially in the order written,
	    which matters because later patterns see the output of earlier ones.
	    """
	    s = read_tex_file(file)

	    ## Operators
	    s = re.sub(r'\\DeclareMathOperator', r'\\newcommand', s)
	    s = re.sub(r'\\end +', r'\\end', s)
	    s = _substitute_in_order(s, {
	        r'\\newcommand\*': r'\\newcommand',
	        r'\\providecommand\*': r'\\newcommand',
	        r'\\providecommand': r'\\newcommand',
	        r'\\renewcommand\*': r'\\renewcommand',
	        r'\\newenvironment\*': r'\\newenvironment',
	        r'\\renewenvironment\*': r'\\renewenvironment'
	    })

	    ## Title
	    s = _substitute_in_order(s, {
	        r'\\icmltitlerunning\*': r'\\title',
	        r'\\icmltitlerunning': r'\\title',
	        r'\\inlinetitle\*': r'\\title',
	        r'\\icmltitle\*': r'\\title',
	        r'\\inlinetitle': r'\\title',
	        r'\\icmltitle': r'\\title',
	        r'\\titlerunning\*': r'\\title',
	        r'\\titlerunning': r'\\title',
	        r'\\toctitle': r'\\title',
	        r'\\title\*': r'\\title',
	        r'\\TITLE\*': r'\\title',
	        r'\\TITLE': r'\\title',
	        r'\\Title\*': r'\\title',
	        r'\\Title': r'\\title',
	    })
	    # Both offsets are taken after the renames above so that they refer to
	    # the same version of the text.
	    beginning_doc = s.find('\\begin{document}')
	    find_potential = s.find('\\title')

	    ## Remove \\
	    title_content = retrieve_text(s, 'title', keep_text=True)
	    if title_content is not None:
	        cleaned_title = _flatten_title(title_content)
	        s = s.replace(title_content, cleaned_title)
	        # A title declared in the preamble would be lost when the preamble
	        # is cut off below, so it is put where \maketitle was.
	        if find_potential != -1 and find_potential < beginning_doc:
	            s = s.replace('\\maketitle', cleaned_title)

	    ## Cite and ref commands
	    s = _substitute_in_order(s, {
	        r'\\citep\*': r'\\cite',
	        r'\\citet\*': r'\\cite',
	        r'\\citep': r'\\cite',
	        r'\\citet': r'\\cite',
	        r'\\cite\*': r'\\cite',
	        r'\\citealt\*': r'\\cite',
	        r'\\citealt': r'\\cite',
	        r'\\citealp\*': r'\\cite',
	        r'\\citealp': r'\\cite',
	        r'\\citeyear\*': r'\\cite',
	        r'\\citeyear': r'\\cite',
	        r'\\citeauthor\*': r'\\cite',
	        r'\\citeauthor': r'\\cite',
	        r'\\citenum\*': r'\\cite',
	        r'\\citenum': r'\\cite',
	        r'\\cref': r'\\ref',
	        r'\\Cref': r'\\ref',
	        r'\\factref': r'\\ref',
	        r'\\appref': r'\\ref',
	        r'\\thmref': r'\\ref',
	        r'\\secref': r'\\ref',
	        r'\\lemref': r'\\ref',
	        r'\\corref': r'\\ref',
	        r'\\eqref': r'\\ref',
	        r'\\autoref': r'\\ref',
	        r'begin{thm}': r'begin{theorem}',
	        r'begin{lem}': r'begin{lemma}',
	        r'begin{cor}': r'begin{corollary}',
	        r'begin{exm}': r'begin{example}',
	        r'begin{defi}': r'begin{definition}',
	        r'begin{rem}': r'begin{remark}',
	        r'begin{prop}': r'begin{proposition}',
	        r'end{thm}': r'end{theorem}',
	        r'end{lem}': r'end{lemma}',
	        r'end{cor}': r'end{corollary}',
	        r'end{exm}': r'end{example}',
	        r'end{defi}': r'end{definition}',
	        r'end{rem}': r'end{remark}',
	        r'end{prop}': r'end{proposition}',
	    })

	    # NOTE(review): these patterns are not anchored to a backslash, so the
	    # same words in running text are rewritten too ("subsection" ->
	    # "section", "section " loses its space) - confirm this is intended.
	    s = _substitute_in_order(s, {
	        r'subsubsection': r'section',
	        r'subsubsection ': r'section',
	        r'subsubsection\*': r'section',
	        r'subsubsection\* ': r'section',
	        r'subsection': r'section',
	        r'subsection ': r'section',
	        r'subsection\*': r'section',
	        r'subsection\* ': r'section',
	        r'section ': r'section',
	        r'section\*': r'section',
	        r'section\* ': r'section',
	        r'chapter': r'section',
	        r'chapter ': r'section',
	        r'chapter\*': r'section',
	        r'chapter\* ': r'section',
	        r'mysubsubsection': r'section',
	        r'mysubsection': r'section',
	        r'mysection': r'section',
	    })

	    # In case any new commands for appendix/appendices
	    s = re.sub(r'newcommand{\\appendix}', '', s)
	    s = re.sub(r'newcommand{\\appendices}', '', s)
	    s = get_core(s)

	    ## In case of double titles being defined
	    title_content = retrieve_text(s, 'title', keep_text=True)
	    if title_content is not None:
	        s = s.replace(title_content, _flatten_title(title_content))
	    write_tex_file(file, s)


	def _substitute_in_order(s, table):
	    """Apply every ``pattern -> replacement`` pair of ``table`` to ``s`` in insertion order."""
	    for key, replacement in table.items():
	        s = re.sub(key, replacement, s)
	    return s


	def _flatten_title(title):
	    """Replace LaTeX line breaks (``\\\\``), newlines and ``~`` in ``title`` by spaces."""
	    title = re.sub(r'\\\\', ' ', title)
	    title = re.sub(r'\n', ' ', title)
	    return re.sub(r'\~', ' ', title)


	def replace_imports(file, s):
	    """Rewrite ``\\import``, ``\\subfile`` and ``\\include*`` as ``\\input``.

	    * ``\\import{dir}{name}`` becomes ``\\input{dirname}`` (both arguments
	      concatenated),
	    * ``\\subfile{name}`` becomes ``\\input{name}``,
	    * ``\\include*{name}`` becomes ``\\input{name}``.

	    The rewritten text is written back to ``file`` and also returned.
	    """
	    regex_p1 = r'\\import{(.*?)}{(.*?)}'
	    s = re.sub(regex_p1, r"\\input{\1\2}", s)
	    regex_p2 = r'\\subfile{(.*?)}'
	    s = re.sub(regex_p2, r"\\input{\1}", s)
	    # ``[*]`` is a character class holding a literal star.
	    regex_p3 = r'\\include[*]{(.*?)}'
	    s = re.sub(regex_p3, r"\\input{\1}", s)
	    write_tex_file(file, s)
	    return s


	def remove_multargument(s, target, k):
	    """Remove every occurrence of ``target`` together with its ``k`` brace arguments.

	    Starting right after ``target``, characters are scanned while counting
	    ``{`` and ``}``; each time the two counts become equal (and non-zero) one
	    argument is complete. Scanning stops once ``k`` arguments have been seen,
	    and everything from the start of ``target`` up to that point is deleted.
	    Nested braces inside an argument are therefore handled.

	    If fewer than ``k`` arguments follow, everything up to the end of the
	    string is removed. If ``target`` is the very last thing in ``s``, only
	    ``target`` itself is removed.

	    Args:
	        s: Text to process.
	        target: Literal command text to look for (including the backslash).
	        k: Number of brace-delimited arguments to remove with it.

	    Returns:
	        The text without any remaining occurrence of ``target``.
	    """
	    ind = s.find(target)
	    while ind != -1:
	        start_ind = ind + len(target)
	        stack_open = 0
	        stack_close = 0
	        track_arg = 0
	        # Stays -1 when nothing follows the target, so that only the target
	        # is cut instead of reusing an index from a previous round.
	        i = -1
	        for i, char in enumerate(s[start_ind:]):
	            if char == '{':
	                stack_open += 1
	            if char == '}':
	                stack_close += 1
	            if stack_open != 0 and stack_open == stack_close:
	                track_arg += 1
	                stack_open = 0
	                stack_close = 0
	            if track_arg == k:
	                break
	        s = s[:ind] + s[start_ind + i + 1:]
	        ind = s.find(target)
	    return s


	def fix_citations(s):
	    """Rewrite citation command variants in ``s`` to plain ``\\cite``.

	    Handles ``\\citep``, ``\\citet``, ``\\citealt``, ``\\citealp``,
	    ``\\citeyear``, ``\\citeauthor`` and ``\\citenum`` (each with or without
	    a trailing ``*``) as well as ``\\cite*``. The patterns are applied one
	    after another in the order listed; starred forms come before their
	    unstarred counterparts so that the star is consumed too.
	    """
	    pattern = {
	        r'\\citep\*': r'\\cite',
	        r'\\citet\*': r'\\cite',
	        r'\\citep': r'\\cite',
	        r'\\citet': r'\\cite',
	        r'\\cite\*': r'\\cite',
	        r'\\citealt\*': r'\\cite',
	        r'\\citealt': r'\\cite',
	        r'\\citealp\*': r'\\cite',
	        r'\\citealp': r'\\cite',
	        r'\\citeyear\*': r'\\cite',
	        r'\\citeyear': r'\\cite',
	        r'\\citeauthor\*': r'\\cite',
	        r'\\citeauthor': r'\\cite',
	        r'\\citenum\*': r'\\cite',
	        r'\\citenum': r'\\cite'
	    }
	    for key, replacement in pattern.items():
	        s = re.sub(key, replacement, s)
	    return s

	def find_bib(directory):
	    """List the paths of all ``.bib`` files found anywhere below ``directory``."""
	    return [
	        candidate
	        for root, _, files in os.walk(directory)
	        for candidate in (os.path.join(root, name) for name in files)
	        if candidate.endswith('.bib')
	    ]

	def create_bib_from_bbl(bibfile):
	    """Parse a ``.bib`` file generated from a ``.bbl`` and derive entry titles.

	    Every parsed block that is not a duplicate-key, failed or
	    implicit-comment block is turned into a ``{field key: field value}``
	    dict. A ``title`` is then derived from the ``note`` field:

	    * if the note contains both an opening `` and a closing '' quote mark,
	      the text between the first of each is used;
	    * otherwise, if the note contains exactly two ``\\newblock`` markers,
	      they are treated as the opening and closing quote marks.

	    Entries for which no title can be derived (including entries without a
	    ``note`` field) are kept without a ``title`` key.

	    Returns:
	        A dict mapping each block key to its field dict.
	    """
	    with open(bibfile, 'r') as f:
	        content = f.read()
	    library_raw = bibtexparser.parse_string(content)
	    library = {}
	    for block in library_raw.blocks:
	        if isinstance(
	            block,
	            (bibtexparser.model.DuplicateBlockKeyBlock, bibtexparser.model.ParsingFailedBlock, bibtexparser.model.ImplicitComment)
	        ):
	            continue
	        fields = {}
	        for field in block.fields:
	            fields[field.key] = field.value

	        ## Get a good title one ##
	        # A missing note simply yields no title instead of a KeyError.
	        field_content = fields.get("note", "")
	        field_content = field_content.replace("\n", " ")
	        field_content = re.sub(" +", " ", field_content)
	        title = _title_between_quotes(field_content)
	        if title is None and field_content.count("\\newblock") == 2:
	            field_content = field_content.replace("\\newblock", "``", 1)
	            field_content = field_content.replace("\\newblock", "''", 1)
	            title = _title_between_quotes(field_content)
	        if title is not None:
	            fields['title'] = title
	        library[block.key] = fields
	    return library


	def _title_between_quotes(note):
	    """Return the normalised text between the first `` and the first '' of ``note``, or None."""
	    opening = note.find("``")
	    closing = note.find("''")
	    if opening == -1 or closing == -1:
	        return None
	    return (
	        note[opening + 2:closing]
	        .replace("\\emph", "")
	        .replace("\\emp", "")
	        .replace("\\em", "")
	        .replace(",", "")
	        .replace("{", "")
	        .replace("}", "")
	        .replace("``", "")
	        .replace("''", "")
	        .strip(".")
	        .strip()
	        .strip(".")
	        .lower()
	    )


	def create_bib(bibfile):
	    """Parse a ``.bib`` file into ``{entry key: {field key: value}}``.

	    Duplicate-key, failed and implicit-comment blocks are skipped. Braces are
	    removed from every field value. The ``title`` field is additionally
	    flattened to one line, stripped of ``\\emph``/``\\emp``/``\\em``, commas,
	    braces and surrounding dots/whitespace, and lower-cased. Entries without
	    a ``title`` field are left out of the result.
	    """
	    with open(bibfile, 'r') as handle:
	        parsed = bibtexparser.parse_string(handle.read())

	    library = {}
	    for block in parsed.blocks:
	        unusable = isinstance(
	            block,
	            (bibtexparser.model.DuplicateBlockKeyBlock, bibtexparser.model.ParsingFailedBlock, bibtexparser.model.ImplicitComment)
	        )
	        if unusable:
	            continue

	        fields = {}
	        for field in block.fields:
	            fields[field.key] = field.value.replace('{', '').replace('}', '')
	            if field.key != 'title':
	                continue
	            one_line = re.sub(r'[\n]+', ' ', field.value) # keep only one \n
	            one_line = re.sub(r' +', ' ', one_line)
	            for marker in ("\\emph", "\\emp", "\\em", ",", "{", "}"):
	                one_line = one_line.replace(marker, "")
	            fields[field.key] = one_line.strip(".").strip().strip(".").lower()

	        if 'title' in fields:
	            library[block.key] = fields
	    return library


	def find_bbl(directory):
	    """List the paths of all ``.bbl`` files found anywhere below ``directory``."""
	    return [
	        candidate
	        for root, _, files in os.walk(directory)
	        for candidate in (os.path.join(root, name) for name in files)
	        if candidate.endswith('.bbl')
	    ]


	def textobib(file):
	    """Convert a ``.bbl`` file to ``.bib`` with the bundled ``tex2bib`` Perl script.

	    The script is copied from ``./tex2bib`` (relative to the current working
	    directory) into the folder of ``file`` and run there with
	    ``-i <file name> -o <file stem>.bib``. The exit status is not checked.

	    The working directory is changed for the duration of the call and always
	    restored. The script is started with an argument list (no shell), so file
	    names containing spaces or shell metacharacters are passed intact.

	    Returns:
	        The path of the ``.bib`` file the script is asked to write.
	    """
	    oldpwd = os.getcwd()
	    target_dir = os.path.dirname(file) + '/'
	    target = target_dir + 'tex2bib'
	    src = './tex2bib'
	    shutil.copyfile(src, target)
	    input_name = os.path.basename(file)
	    output_file = os.path.splitext(input_name)[0] + '.bib'
	    os.chdir(target_dir)
	    try:
	        subprocess.run(['perl', 'tex2bib', '-i', input_name, '-o', output_file])
	    finally:
	        os.chdir(oldpwd)
	    return target_dir + output_file


	def get_library_bib(bib_files):
	    """Merge the libraries parsed by ``create_bib`` from all ``bib_files``.

	    All files are parsed first; the results are then merged in order, so for
	    a key present in several files the last file wins.
	    """
	    parsed_libraries = [create_bib(path) for path in bib_files]
	    final_library = {}
	    for entries in parsed_libraries:
	        final_library.update(entries)
	    return final_library


	def get_library_bbl(bbl_files):
	    """Build one merged library from ``.bbl`` files.

	    Every file is first converted with ``textobib``; the resulting ``.bib``
	    files are then parsed with ``create_bib_from_bbl`` and merged in order,
	    so for a key present in several files the last file wins.
	    """
	    converted = [textobib(path) for path in bbl_files]
	    parsed_libraries = [create_bib_from_bbl(path) for path in converted]
	    final_library = {}
	    for entries in parsed_libraries:
	        final_library.update(entries)
	    return final_library