Spaces:

garyd1
/

text_translator

Sleeping

App Files Files Community

text_translator / app.py

garyd1

Update app.py

57ec4e3 verified 10 months ago

raw

history blame

5.99 kB

	import os
	import re
	import openai
	import streamlit as st
	import pandas as pd
	import torch
	import nltk
	import time
	from concurrent.futures import ThreadPoolExecutor

	from langchain.chat_models import ChatOpenAI
	from langchain.schema import SystemMessage, HumanMessage
	from sentence_transformers import SentenceTransformer, util

	# Pick the NLP backend: spaCy when it is importable and its English model
	# loads, otherwise NLTK's "punkt" tokenizer data is downloaded instead.
	try:
	    import spacy
	    nlp = spacy.load("en_core_web_sm")
	except Exception:
	    # Any failure while importing spaCy or loading the model lands here.
	    st.warning("SpaCy model not found, falling back to NLTK for tokenization.")
	    nltk.download("punkt")
	    use_spacy = False
	else:
	    # Only reached when both the import and the model load succeeded.
	    use_spacy = True

	# Load AI models
	#
	# Streamlit re-executes this script from the top on every widget interaction.
	# Building the clients inside st.cache_resource loaders means the chat client
	# and the sentence-embedding model are constructed once and then reused,
	# instead of being re-created on each rerun.
	@st.cache_resource
	def _load_translator():
	    """Return the shared ChatOpenAI client used for all GPT calls."""
	    return ChatOpenAI(model="gpt-3.5-turbo")


	@st.cache_resource
	def _load_embedding_model():
	    """Return the shared SentenceTransformer used for glossary matching."""
	    return SentenceTransformer('all-MiniLM-L6-v2')


	translator = _load_translator()
	model = _load_embedding_model()

	@st.cache_data
	def load_glossary_from_excel(glossary_file_bytes) -> dict:
	    """Load an English -> Canadian French glossary from an Excel workbook.

	    The workbook is read with ``pd.read_excel`` and is expected to have the
	    columns ``English`` and ``CanadianFrench``. Each English term is stripped
	    and lower-cased; when spaCy is active it is additionally lemmatized token
	    by token. The French term is stripped only.

	    Rows are skipped when either cell is null or when either term is blank
	    after stripping. A blank English key would otherwise produce a pattern
	    that matches at every word boundary during glossary enforcement.

	    Args:
	        glossary_file_bytes: Whatever ``pd.read_excel`` accepts; the app
	            passes the uploaded file object.

	    Returns:
	        A dict mapping the normalized English term to its French term,
	        ordered from the longest key to the shortest.

	    Raises:
	        KeyError: Surfaces from the row lookup when a required column is
	            absent and at least one row is processed.
	    """
	    df = pd.read_excel(glossary_file_bytes)
	    glossary = {}

	    for _, row in df.iterrows():
	        if pd.isnull(row['English']) or pd.isnull(row['CanadianFrench']):
	            continue

	        # str() keeps numeric or other non-text cells from crashing .strip().
	        english_term = str(row['English']).strip().lower()
	        french_term = str(row['CanadianFrench']).strip()
	        if not english_term or not french_term:
	            continue

	        if use_spacy:
	            key = " ".join(token.lemma_ for token in nlp(english_term))
	        else:
	            key = english_term
	        glossary[key] = french_term

	    return dict(sorted(glossary.items(), key=lambda item: len(item[0]), reverse=True))

	@st.cache_data
	def compute_glossary_embeddings_cached(glossary_items: tuple):
	    """Encode the glossary terms and return them with their embeddings.

	    ``glossary_items`` is a tuple of ``(term, translation)`` pairs; a tuple is
	    used so the argument can serve as a cache key. The result is a pair
	    ``(terms, embeddings)`` where ``terms`` is the list of distinct terms in
	    first-seen order and ``embeddings`` is what ``model.encode`` returns for
	    that list with ``convert_to_tensor=True``.
	    """
	    # Going through dict() collapses repeated terms while keeping their order.
	    terms = [*dict(glossary_items)]
	    return terms, model.encode(terms, convert_to_tensor=True)

	def retry_translate_text(text: str, max_retries=3) -> str:
	    """Translate ``text`` to Canadian French, retrying when the call fails.

	    Each attempt sends a system prompt plus ``text`` to the module-level
	    ``translator``. Any exception is printed and followed by a 2-second pause
	    before the next attempt; no pause is taken after the final attempt, since
	    nothing follows it.

	    Args:
	        text: The text to translate.
	        max_retries: Maximum number of attempts.

	    Returns:
	        The stripped model reply on success, or the literal message
	        ``"Translation failed. Please try again later."`` once every attempt
	        has failed (or when ``max_retries`` is not positive). Failure is
	        reported through that return value, not an exception.
	    """
	    for attempt in range(max_retries):
	        try:
	            messages = [
	                SystemMessage(content="You are a professional translator. Translate the following text to Canadian French while preserving its meaning and context."),
	                HumanMessage(content=text)
	            ]
	            response = translator(messages)
	            return response.content.strip()
	        except Exception as e:
	            print(f"Error in translation (attempt {attempt+1}): {e}")
	            # Only wait when another attempt will actually be made.
	            if attempt + 1 < max_retries:
	                time.sleep(2)
	    return "Translation failed. Please try again later."

	def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
	    """Apply glossary replacements to ``text`` one sentence at a time.

	    The text is split into sentences (spaCy when available, NLTK otherwise).
	    Each non-blank sentence is embedded and compared against the cached
	    glossary-term embeddings by cosine similarity. When the best score reaches
	    the sentence's threshold, whole-word, case-insensitive occurrences of that
	    single best-matching term are replaced with its glossary translation.

	    The effective threshold is ``threshold + 0.05`` for sentences longer than
	    ten whitespace-separated words and ``threshold - 0.05`` otherwise, clamped
	    to [0, 1]. At a ``threshold`` of 0.8 this gives the 0.85 / 0.75 pair that
	    was previously hard-coded regardless of the argument.

	    Args:
	        text: The text to post-process.
	        glossary: Mapping of term to replacement.
	        threshold: Base cosine-similarity threshold for applying a replacement.

	    Returns:
	        The processed sentences, stripped and joined with single spaces. If
	        ``glossary`` is empty, ``text`` is returned unchanged.
	    """
	    if not glossary:
	        # Nothing to enforce; also avoids a max over an empty score matrix.
	        return text

	    glossary_items = tuple(sorted(glossary.items()))
	    glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)

	    if use_spacy:
	        sentences = [sent.text for sent in nlp(text).sents]
	    else:
	        sentences = nltk.tokenize.sent_tokenize(text)

	    # Longer sentences must match more strongly than short ones.
	    long_sentence_threshold = min(1.0, threshold + 0.05)
	    short_sentence_threshold = max(0.0, threshold - 0.05)

	    def process_sentence(sentence):
	        """Processes a single sentence with glossary enforcement."""
	        if not sentence.strip():
	            return sentence

	        sentence_length = len(sentence.split())
	        if sentence_length > 10:
	            dynamic_threshold = long_sentence_threshold
	        else:
	            dynamic_threshold = short_sentence_threshold

	        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
	        cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
	        max_score, max_idx = torch.max(cos_scores, dim=1)

	        if max_score.item() >= dynamic_threshold:
	            # .item() turns the one-element index tensor into a plain int.
	            term = glossary_terms[max_idx.item()]
	            replacement = glossary[term]
	            pattern = r'\b' + re.escape(term) + r'\b'
	            # A callable replacement inserts the text literally; a plain string
	            # would have backslashes and group references interpreted by re.sub.
	            sentence = re.sub(pattern, lambda _match: replacement, sentence, flags=re.IGNORECASE)

	        return sentence.strip()

	    # Process sentences in parallel for speed
	    with ThreadPoolExecutor() as executor:
	        updated_sentences = list(executor.map(process_sentence, sentences))

	    return " ".join(updated_sentences)

	def validate_translation(original_text, final_text):
	    """Ask the chat model whether ``final_text`` keeps the meaning of ``original_text``.

	    Both texts are sent in a single human message after a proofreader system
	    prompt. The model's reply is returned with surrounding whitespace stripped.
	    """
	    proofreader_prompt = SystemMessage(content="You are an AI proofreader. Compare the original and final translation. Does the final translation retain the original meaning?")
	    comparison = HumanMessage(content=f"Original Text: {original_text}\nFinal Translation: {final_text}\n")
	    verdict = translator([proofreader_prompt, comparison])
	    return verdict.content.strip()

	def grammar_correction(text: str) -> str:
	    """Send ``text`` to the chat model with a French-grammar-expert prompt.

	    Returns the model's reply with surrounding whitespace stripped.
	    """
	    grammar_prompt = SystemMessage(content="You are a French grammar expert. Correct any grammatical mistakes in the following text.")
	    draft = HumanMessage(content=text)
	    corrected = translator([grammar_prompt, draft])
	    return corrected.content.strip()

	# Streamlit UI
	st.title("Optimized AI-Powered English to Canadian French Translator")
	st.write("This version includes retries, batch processing, glossary tuning, and grammar correction.")

	input_text = st.text_area("Enter text to translate:")
	glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
	threshold = st.slider("Semantic Matching Threshold", 0.5, 1.0, 0.8)

	if st.button("Translate"):
	    if not input_text.strip():
	        st.error("Please enter text to translate.")
	    elif glossary_file is None:
	        st.error("Glossary file is required.")
	    else:
	        glossary = load_glossary_from_excel(glossary_file)
	        translated_text = retry_translate_text(input_text)

	        # retry_translate_text reports exhausted retries by returning this exact
	        # message rather than raising. Stop here so the message is not fed
	        # through glossary enforcement, grammar correction and validation as
	        # though it were a translation.
	        if translated_text == "Translation failed. Please try again later.":
	            st.error(translated_text)
	        else:
	            glossary_enforced_text = enforce_glossary(translated_text, glossary, threshold)
	            corrected_text = grammar_correction(glossary_enforced_text)
	            validation_result = validate_translation(input_text, corrected_text)

	            st.subheader("Final Translated Text:")
	            st.write(corrected_text)

	            st.subheader("Validation Check:")
	            st.write(validation_result)