BaselMousi's picture
Initial Doc Translator Demo
21d68ff
import re
def copy_run_formatting(source_run, target_run):
    """Mirror the character formatting of ``source_run`` onto ``target_run``.

    The ``bold``, ``italic`` and ``underline`` attributes are always
    transferred, whatever their value. The font ``size``, ``name``,
    ``color.rgb`` and ``highlight_color`` are transferred only when the
    source value is truthy; otherwise the target keeps what it already has.

    Args:
        source_run: Object to read formatting from.
        target_run: Object to write formatting to; modified in place.
    """
    # Run-level toggles: copied unconditionally, in the original order.
    for toggle in ("bold", "italic", "underline"):
        setattr(target_run, toggle, getattr(source_run, toggle))

    # Simple font attributes: only overwrite the target when the source
    # actually carries a (truthy) value.
    for font_attr in ("size", "name"):
        font_value = getattr(source_run.font, font_attr)
        if font_value:
            setattr(target_run.font, font_attr, font_value)

    # The colour lives one level deeper, on ``font.color``.
    rgb_value = source_run.font.color.rgb
    if rgb_value:
        target_run.font.color.rgb = rgb_value

    highlight = source_run.font.highlight_color
    if highlight:
        target_run.font.highlight_color = highlight
def segment_text(text):
    """Split text into segments that are translatable or not.

    The text is scanned for spans that must be preserved verbatim: runs of
    three or more underscores, phone numbers, email addresses, URLs, times,
    time ranges and plain numbers. Everything between those spans is
    considered translatable.

    Args:
        text: The string to segment.

    Returns:
        A list of ``(translatable, substring)`` tuples in document order.
        ``translatable`` is ``True`` for free text and ``False`` for a
        preserved span. Joining the substrings reproduces ``text``; an empty
        input yields an empty list.
    """
    # Dash-like characters allowed between the two ends of a time range.
    # They are spelled as escapes so the set cannot be corrupted by a
    # re-encoding of this file. The ASCII hyphen stays first so that it is
    # a literal (not a range operator) inside the character class.
    dash_chars = (
        '-'        # hyphen-minus
        '\u2013'   # en dash
        '\u2014'   # em dash
        '\u2212'   # minus sign
        '\u2010'   # hyphen
        '\u2011'   # non-breaking hyphen
        '\u2012'   # figure dash
        '\u2043'   # hyphen bullet
    )
    # Times like '9:15a', '10:30a', '1:00p', '10:30am'
    time_pattern = r'\b\d{1,2}:\d{2}[aApP][mM]?\b'
    # Ranges like '10:00a – 10:30a'
    time_range_pattern = fr'{time_pattern}\s*[{dash_chars}]\s*{time_pattern}'
    email_pattern = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
    phone_pattern = r'\+?\d[\d\s\-]{7,}\d\b'
    # URLs run until whitespace or a comma, so a trailing comma is excluded
    url_pattern = r'https?://[^\s,]+|www\.[^\s,]+'
    # Integers and simple decimals
    number_pattern = r'\b\d+(?:\.\d+)?\b'

    # Order matters: alternatives are tried left to right at each position,
    # so the longer/more specific patterns must come before the generic
    # ones (time range before time, time before bare number).
    patterns = [
        r'_{3,}',            # Three or more underscores
        phone_pattern,       # Phone numbers
        email_pattern,       # Email addresses
        url_pattern,         # URLs
        time_range_pattern,  # Time ranges
        time_pattern,        # Times
        number_pattern,      # Numbers
    ]
    regex = re.compile('|'.join(patterns), re.IGNORECASE)

    segments = []
    last_end = 0
    for match in regex.finditer(text):
        start, end = match.span()
        if start > last_end:
            # Free text between the previous preserved span and this one.
            segments.append((True, text[last_end:start]))
        segments.append((False, text[start:end]))
        last_end = end
    if last_end < len(text):
        # Trailing free text after the final preserved span.
        segments.append((True, text[last_end:]))
    return segments
def is_phone_number(text):
    """Report whether ``text`` looks like a phone number.

    Surrounding whitespace is ignored. The remaining string must consist of
    an optional leading ``+``, a digit, at least three characters drawn from
    digits, hyphens, whitespace and parentheses, and a final digit.

    Args:
        text: The string to test.

    Returns:
        ``True`` if the whole stripped string fits that shape, else ``False``.
    """
    candidate = text.strip()
    # A match object is always truthy, so bool() maps match/None to True/False.
    return bool(re.match(r'^\+?\d[\d\-\s\(\)]{3,}\d$', candidate))