Spaces:

garyd1
/

text_translator

Sleeping

App Files Files Community

text_translator / app.py

garyd1

Update app.py

fd72a6e verified 10 months ago

raw

history blame

5.93 kB

	import os
	import re
	import openai
	import streamlit as st
	import pandas as pd
	import torch
	import nltk
	import time
	import subprocess
	from concurrent.futures import ThreadPoolExecutor

	from langchain_openai import ChatOpenAI
	from langchain.schema import SystemMessage, HumanMessage
	from sentence_transformers import SentenceTransformer, util

	# Ensure necessary NLP models are available
	import sys

	import spacy

	try:
	    nltk.data.find("tokenizers/punkt")
	except LookupError:
	    print("Downloading NLTK punkt tokenizer...")
	    nltk.download("punkt")

	try:
	    nlp = spacy.load("en_core_web_sm")
	except OSError:
	    # The model package is not installed yet: fetch it, then load again.
	    print("Downloading SpaCy model...")
	    # Run the download with the interpreter executing this script, so the
	    # model is installed into the same environment that will import it (a
	    # bare "python" on PATH may be a different installation). check=True
	    # makes a failed download raise here instead of failing later on load.
	    subprocess.run(
	        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
	        check=True,
	    )
	    nlp = spacy.load("en_core_web_sm")

	# Load AI models
	@st.cache_resource
	def _load_models():
	    """Construct the chat translator and the sentence-embedding model.

	    Streamlit re-executes this script on every widget interaction; wrapping
	    the construction in ``st.cache_resource`` lets reruns reuse the already
	    built objects instead of instantiating both models again each time.

	    Returns:
	        A ``(translator, model)`` tuple: the ``ChatOpenAI`` client and the
	        ``SentenceTransformer`` instance.
	    """
	    return ChatOpenAI(model="gpt-3.5-turbo"), SentenceTransformer('all-MiniLM-L6-v2')


	translator, model = _load_models()

	@st.cache_data
	def load_glossary_from_excel(glossary_file_bytes) -> dict:
	    """Load an English -> Canadian French glossary from an Excel file.

	    The sheet must provide ``English`` and ``CanadianFrench`` columns. Rows
	    where either cell is null or blank are skipped. Each English term is
	    lower-cased and stored both in its surface form and in its lemmatized
	    form, so that downstream literal matching can find either spelling.

	    Args:
	        glossary_file_bytes: Anything ``pandas.read_excel`` accepts (here the
	            file object returned by the Streamlit uploader).

	    Returns:
	        A dict mapping English terms to French terms, ordered from the
	        longest English key to the shortest.

	    Raises:
	        KeyError: If one of the two required columns is missing.
	    """
	    df = pd.read_excel(glossary_file_bytes)
	    glossary = {}

	    for english, french in zip(df['English'], df['CanadianFrench']):
	        if pd.isnull(english) or pd.isnull(french):
	            continue

	        # Cells can hold numbers or other non-string values; go through str()
	        # so .strip() cannot fail on them.
	        english_term = str(english).strip().lower()
	        french_term = str(french).strip()
	        # A blank key would later become an empty regex that matches at
	        # every position, so blank entries are dropped here.
	        if not english_term or not french_term:
	            continue

	        if nlp:
	            lemmatized_term = " ".join(token.lemma_ for token in nlp(english_term))
	        else:
	            lemmatized_term = english_term

	        # The surface form is what literally appears in user text, so it is
	        # always registered and takes precedence; the lemma is an additional
	        # key that never overrides an explicitly listed term.
	        glossary[english_term] = french_term
	        if lemmatized_term:
	            glossary.setdefault(lemmatized_term, french_term)

	    # Longest keys first so multi-word terms are handled before their parts.
	    return dict(sorted(glossary.items(), key=lambda item: len(item[0]), reverse=True))

	@st.cache_data
	def compute_glossary_embeddings_cached(glossary_items: tuple):
	    """Encode the glossary keys with the sentence-embedding model.

	    The result is memoized through ``st.cache_data``.

	    Args:
	        glossary_items: Tuple of ``(term, translation)`` pairs, converted
	            back into a dict to recover the list of terms.

	    Returns:
	        A ``(terms, embeddings)`` pair: the list of glossary keys and the
	        value returned by ``model.encode(terms, convert_to_tensor=True)``.
	    """
	    term_lookup = dict(glossary_items)
	    terms = [*term_lookup]
	    return terms, model.encode(terms, convert_to_tensor=True)

	def enforce_glossary_pre_translation(text: str, glossary: dict) -> str:
	    """Upper-case every glossary term found in the English text.

	    Each key of ``glossary`` is searched case-insensitively as a whole term
	    and replaced by its upper-cased spelling, which marks it for the later
	    post-translation substitution. The glossary values are not used here.

	    Args:
	        text: English source text.
	        glossary: Mapping whose keys are the English terms to emphasize.

	    Returns:
	        The text with every whole-term occurrence upper-cased.
	    """
	    for eng_term in glossary:
	        if not eng_term:
	            # An empty term would produce a pattern matching everywhere.
	            continue
	        # Lookarounds behave like \b for terms that start and end with a word
	        # character, and still work for terms such as "c++" where \b cannot
	        # match next to punctuation.
	        pattern = r'(?<!\w)' + re.escape(eng_term) + r'(?!\w)'
	        emphasized = eng_term.upper()
	        # A callable replacement inserts the text literally; a plain string
	        # would have its backslashes parsed as regex template escapes.
	        text = re.sub(pattern, lambda _match: emphasized, text, flags=re.IGNORECASE)
	    return text

	def retry_translate_text(text: str, max_retries=3) -> str:
	    """Translate text to Canadian French, retrying when the call fails.

	    Args:
	        text: Source text; glossary terms are expected to have been
	            upper-cased by the pre-translation step.
	        max_retries: Maximum number of attempts before giving up.

	    Returns:
	        The stripped model response, or the fixed message
	        "Translation failed. Please try again later." when every attempt
	        raised. This function does not propagate the underlying exception.
	    """
	    for attempt in range(max_retries):
	        try:
	            messages = [
	                # The post-translation step looks for the upper-cased English
	                # glossary terms, so the model is told explicitly to leave
	                # them untouched (the previous prompt mentioned "these
	                # specific terms" without ever identifying them).
	                SystemMessage(content="You are a professional translator. Translate the following text to Canadian French while preserving its meaning. Terms written in ALL CAPS are protected glossary terms: keep them exactly as written and do not translate them."),
	                HumanMessage(content=text)
	            ]
	            # .invoke() is the supported entry point; calling the chat model
	            # object directly is deprecated in LangChain.
	            response = translator.invoke(messages)
	            return response.content.strip()
	        except Exception as e:
	            print(f"Error in translation (attempt {attempt+1}): {e}")
	            # Pause only when another attempt follows; waiting after the
	            # final failure would just delay the error message.
	            if attempt + 1 < max_retries:
	                time.sleep(2)
	    return "Translation failed. Please try again later."

	def enforce_glossary_post_translation(text: str, glossary: dict) -> str:
	    """Replace English glossary terms left in the translation by their French form.

	    All terms are substituted in a single pass, longest term first, so that
	    text inserted for one term is never re-scanned and rewritten by a
	    shorter term. Matching is case-insensitive and whole-term only.

	    Args:
	        text: Translated text that may still contain (upper-cased) English
	            glossary terms.
	        glossary: Mapping of English term -> French replacement.

	    Returns:
	        The text with every matched term replaced by its French value.
	    """
	    # Empty terms are ignored: they would match at every position.
	    entries = sorted(
	        ((eng_term, fr_term) for eng_term, fr_term in glossary.items() if eng_term),
	        key=lambda entry: len(entry[0]),
	        reverse=True,
	    )
	    if not entries:
	        return text

	    # One capturing group per term: match.lastindex then identifies which
	    # term matched without depending on case-folding the matched text.
	    alternatives = "|".join(
	        "(" + re.escape(eng_term.upper()) + ")" for eng_term, _ in entries
	    )
	    # Lookarounds act as whole-term anchors that also work when a term
	    # starts or ends with punctuation, where \b would never match.
	    pattern = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)', flags=re.IGNORECASE)

	    # A callable replacement inserts the French text literally; a plain
	    # string would have backslashes parsed as regex template escapes.
	    return pattern.sub(lambda match: str(entries[match.lastindex - 1][1]), text)

	def enforce_glossary_with_semantics(text: str, glossary: dict, threshold: float) -> str:
	    """Apply a glossary replacement to sentences that are semantically close to a term.

	    The text is split into sentences; each non-blank sentence is embedded
	    and compared (cosine similarity) with the cached glossary-term
	    embeddings. When the best score reaches ``threshold``, whole-term
	    occurrences of that best-matching term in the sentence are replaced by
	    its glossary value.

	    Args:
	        text: Text to process.
	        glossary: Mapping of term -> replacement.
	        threshold: Minimum cosine similarity required to apply a replacement.

	    Returns:
	        The processed sentences joined by single spaces. When the glossary is
	        empty or the text is blank, ``text`` is returned unchanged.
	    """
	    # Without glossary terms there are no embeddings to compare against.
	    if not glossary or not text.strip():
	        return text

	    # A sorted tuple gives the cached helper a stable argument for the same glossary.
	    glossary_items = tuple(sorted(glossary.items()))
	    glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)

	    sentences = nltk.tokenize.sent_tokenize(text) if not nlp else [sent.text for sent in nlp(text).sents]

	    # Blank sentences are passed through untouched and never scored.
	    scored_positions = [pos for pos, sentence in enumerate(sentences) if sentence.strip()]
	    if not scored_positions:
	        return " ".join(sentences)

	    # Encode every sentence in one batched call rather than one encode()
	    # call per sentence spread over a thread pool.
	    sentence_embeddings = model.encode(
	        [sentences[pos] for pos in scored_positions], convert_to_tensor=True
	    )
	    cos_scores = util.pytorch_cos_sim(sentence_embeddings, glossary_embeddings)
	    max_scores, max_indices = torch.max(cos_scores, dim=1)

	    updated_sentences = list(sentences)
	    for pos, score, term_idx in zip(scored_positions, max_scores.tolist(), max_indices.tolist()):
	        sentence = sentences[pos]
	        if score >= threshold:
	            term = glossary_terms[term_idx]
	            replacement = glossary[term]
	            # Lookarounds act as whole-term anchors that also work for terms
	            # starting or ending with punctuation.
	            pattern = r'(?<!\w)' + re.escape(term) + r'(?!\w)'
	            # Callable replacement: insert the glossary value literally
	            # instead of parsing it as a regex replacement template.
	            sentence = re.sub(pattern, lambda _match: replacement, sentence, flags=re.IGNORECASE)
	        updated_sentences[pos] = sentence.strip()

	    return " ".join(updated_sentences)

	# Streamlit UI
	st.title("AI-Powered English to Canadian French Translator")
	st.write("This version ensures glossary priority, improves enforcement, and validates meaning.")

	input_text = st.text_area("Enter text to translate:")
	glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
	threshold = st.slider("Semantic Matching Threshold", 0.5, 1.0, 0.85)

	if st.button("Translate"):
	    if not input_text.strip():
	        st.error("Please enter text to translate.")
	    elif glossary_file is None:
	        st.error("Glossary file is required.")
	    else:
	        try:
	            glossary = load_glossary_from_excel(glossary_file)
	        except KeyError:
	            # Raised when the sheet lacks one of the two expected columns.
	            glossary = None
	            st.error("Glossary file must contain 'English' and 'CanadianFrench' columns.")
	        else:
	            if not glossary:
	                st.error("Glossary file contains no usable entries.")

	        # Run the pipeline only with a non-empty glossary; the semantic
	        # matching step cannot score sentences against zero terms.
	        if glossary:
	            # Step 1: Enforce Glossary Before Translation
	            pre_translated_text = enforce_glossary_pre_translation(input_text, glossary)

	            # Step 2: Translate Text with OpenAI
	            translated_text = retry_translate_text(pre_translated_text)

	            # retry_translate_text reports failure by returning this exact
	            # message rather than raising; show it as an error instead of
	            # presenting it as a translation.
	            if translated_text == "Translation failed. Please try again later.":
	                st.error(translated_text)
	            else:
	                # Step 3: Enforce Glossary After Translation
	                post_translated_text = enforce_glossary_post_translation(translated_text, glossary)

	                # Step 4: Apply Semantic Matching to Catch Any Missed Glossary Terms
	                glossary_enforced_text = enforce_glossary_with_semantics(post_translated_text, glossary, threshold)

	                st.subheader("Final Translated Text:")
	                st.write(glossary_enforced_text)