JoanneAB
/

translation_fr-als

Model card Files Files and versions

translation_fr-als / functions.py

JoanneAB's picture

Upload functions.py with huggingface_hub

8b2cd79 verified 3 months ago

history blame contribute delete

2.18 kB

	#!/usr/bin/env python3

	import re

	# Pre-compiled, non-greedy pattern: matches from a "<" up to the nearest ">"
	# (no newline in between, since "." does not match newlines by default).
	# Used by remove_html_tags() to strip markup.
	CLEANR = re.compile('<.*?>')

	# --------------------------------------------------------------------------------------------------
	def extract_between_tags(line, tag):
		"""
		Return the text found inside a tag pair: <tag>bla bla bla</tag> -> "bla bla bla".

		The text is taken after the first opening tag and runs up to the next
		opening tag or closing tag, whichever comes first. If the closing tag is
		missing, everything after the opening tag (up to a possible second
		opening tag) is returned. If the opening tag is missing, `line` is
		returned unchanged.
		"""
		pieces = line.split("<%s>" % tag)

		# Fewer than two pieces means the opening tag never occurred.
		if len(pieces) < 2:
			return line

		inner = pieces[1]
		# Keep only what precedes the closing tag.
		return inner.split("</%s>" % tag)[0]

	# --------------------------------------------------------------------------------------------------
	def remove_html_tags(line):
		"""
		Delete every match of the module-level CLEANR pattern (HTML tags) from
		`line` and return the result with surrounding whitespace trimmed.
		"""
		without_tags = CLEANR.sub('', line)
		return without_tags.strip()

	# --------------------------------------------------------------------------------------------------
	def remove_parenthesis(line):
		"""
		Delete each "(...)" group from `line` and trim surrounding whitespace.

		A group runs from "(" to the first following ")", so nested parentheses
		are not balanced. Whitespace left inside the text by a removed group is
		kept as is; only the two ends are trimmed.
		"""
		parenthesised = r'\([^)]*\)'
		without_groups = re.sub(parenthesised, '', line)
		return without_groups.strip()

	# --------------------------------------------------------------------------------------------------
	def clean_line(line):
		"""
		Clean one raw line of text and return it trimmed.

		Steps, applied in this order:
		- remove every "..., "
		- if "=" is in the text, drop it and everything after it (what follows
		  is a French explanation of the Alsatian text)
		- replace every "-" with a space
		- if "→" is in the text, drop it and everything after it: it marks a
		  contraction, and what follows is sometimes a second option and
		  sometimes an explanation, with no way to tell them apart
		- remove content in "()", which covers the "(1)" / "(2)" markers and
		  second options given in parentheses (they may apply to a single word
		  or to the whole text, in both French and Alsatian, so they are dropped)
		- trim leading and trailing whitespace

		The result is always trimmed, whether or not the line contained
		parentheses, so cutting at "=" or "→" leaves no stray whitespace.
		"""
		truncated = line.replace("..., ", "").split("=")[0].replace("-", " ").split("→")[0]

		# remove_parenthesis() also strips the ends. Calling it unconditionally
		# (it is a no-op on the parentheses side when there is no "(") keeps
		# the output trimmed for every input.
		return remove_parenthesis(truncated)

	# --------------------------------------------------------------------------------------------------
	def clean_html(line):
		"""
		Clean a line taken from an HTML file produced by pdftohtml.

		The special space character is first replaced by a plain space, then the
		HTML tags (such as <br/>) are removed and the result is trimmed by
		remove_html_tags().

		NOTE(review): the first literal passed to replace() looks like a
		non-breaking space (U+00A0) - confirm against the original file.
		"""
		with_plain_spaces = line.replace(' ', ' ')
		return remove_html_tags(with_plain_spaces)

	# --------------------------------------------------------------------------------------------------
	def postprocess_text(preds, labels):
		"""
		Trim whitespace around predictions and labels.

		Returns a tuple (preds, labels) where `preds` is a list of stripped
		predictions and `labels` is a list of one-element lists, each holding
		one stripped label.
		NOTE(review): the one-element wrapping presumably matches a metric that
		expects a list of references per prediction - confirm against the caller.
		"""
		stripped_preds = []
		for prediction in preds:
			stripped_preds.append(prediction.strip())

		wrapped_labels = []
		for reference in labels:
			wrapped_labels.append([reference.strip()])

		return stripped_preds, wrapped_labels