Spaces:

BaselMousi
/

fanar-doc-translator

Sleeping

App Files Files Community

fanar-doc-translator / utils.py

BaselMousi

Initial Doc Translator Demo

21d68ff 8 months ago

raw

history blame contribute delete

2.37 kB

	import re

	def copy_run_formatting(source_run, target_run):
	    """Mirror the character formatting of ``source_run`` onto ``target_run``.

	    Bold, italic and underline are always assigned exactly as found on the
	    source (including ``None``). Font size, font name, RGB colour and
	    highlight colour are assigned only when the source value is truthy, so
	    the target keeps whatever it already had otherwise.

	    Args:
	        source_run: Object exposing ``bold``, ``italic``, ``underline`` and a
	            ``font`` with ``size``, ``name``, ``color.rgb`` and
	            ``highlight_color``.
	        target_run: Object with the same attribute layout; mutated in place.

	    Returns:
	        None.
	    """
	    # Toggle attributes are transferred unconditionally, in a fixed order.
	    for toggle in ("bold", "italic", "underline"):
	        setattr(target_run, toggle, getattr(source_run, toggle))

	    src_font = source_run.font

	    # Simple font attributes: skip falsy values so the target is untouched.
	    for font_attr in ("size", "name"):
	        value = getattr(src_font, font_attr)
	        if value:
	            setattr(target_run.font, font_attr, value)

	    rgb = src_font.color.rgb
	    if rgb:
	        target_run.font.color.rgb = rgb

	    highlight = src_font.highlight_color
	    if highlight:
	        target_run.font.highlight_color = highlight

	def segment_text(text):
	    """Split text into segments that should or should not be translated.

	    Spans matching any "preserve" pattern (runs of three or more
	    underscores, phone numbers, e-mail addresses, URLs, time ranges, times
	    and plain numbers) are flagged as non-translatable; everything between
	    them is flagged as translatable.

	    Args:
	        text: The string to segment.

	    Returns:
	        A list of ``(translatable, chunk)`` tuples in source order, where
	        ``translatable`` is ``False`` for preserved spans and ``True`` for
	        the text between them. Empty chunks are never emitted, and joining
	        all chunks reproduces ``text``. An empty input yields ``[]``.
	    """
	    # Dash variants accepted between the two ends of a time range. The
	    # ASCII hyphen comes first so it is literal inside the character class.
	    dash_chars = r'-–—−‐‑‒⁃'

	    # Times like '9:15a', '10:30a', '1:00p', '10:30am'.
	    time_pattern = r'\b\d{1,2}:\d{2}[aApP][mM]?\b'

	    # Ranges like '10:00a – 10:30a' (single whitespace around the dash).
	    time_range_pattern = fr'{time_pattern}\s[{dash_chars}]\s{time_pattern}'

	    email_pattern = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
	    phone_pattern = r'\+?\d[\d\s\-]{7,}\d\b'
	    # URLs run until whitespace or a comma, so trailing commas are excluded.
	    # The '|' must be a real alternation: an escaped '\|' would only match
	    # a literal pipe character.
	    url_pattern = r'https?://[^\s,]+|www\.[^\s,]+'

	    # Integers and simple decimals such as '42' or '3.14'.
	    number_pattern = r'\b\d+(?:\.\d+)?\b'

	    # Order matters: at a given position the first alternative that matches
	    # wins, so longer/more specific patterns precede the generic ones.
	    patterns = [
	        r'_{3,}',            # Three or more underscores
	        phone_pattern,       # Phone numbers
	        email_pattern,       # Email addresses
	        url_pattern,         # URLs
	        time_range_pattern,  # Time ranges
	        time_pattern,        # Times
	        number_pattern,      # Numbers
	    ]

	    # Wrap each alternative in a non-capturing group so a pattern that
	    # itself contains '|' (the URL pattern) cannot leak into its neighbours.
	    combined_pattern = '|'.join(f'(?:{pattern})' for pattern in patterns)
	    regex = re.compile(combined_pattern, re.IGNORECASE)

	    segments = []
	    last_end = 0
	    for match in regex.finditer(text):
	        start, end = match.span()
	        if start > last_end:
	            # Text between the previous preserved span and this one.
	            segments.append((True, text[last_end:start]))
	        segments.append((False, text[start:end]))
	        last_end = end
	    if last_end < len(text):
	        # Trailing translatable text after the final preserved span.
	        segments.append((True, text[last_end:]))
	    return segments

	def is_phone_number(text):
	    """Report whether ``text``, ignoring surrounding whitespace, is a phone number.

	    A match is an optional leading ``+``, a digit, at least three further
	    digits, hyphens or whitespace characters, and a final digit, with
	    nothing else before or after.

	    Args:
	        text: The string to test.

	    Returns:
	        ``True`` if the stripped text matches the pattern, else ``False``.
	    """
	    candidate = text.strip()
	    # A match object is always truthy, so bool() maps match/None to True/False.
	    return bool(re.match(r'^\+?\d[\d\-\s]{3,}\d$', candidate))