Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| import openai | |
| import streamlit as st | |
| import pandas as pd | |
| import torch | |
| import nltk | |
| import time | |
| import subprocess | |
| from concurrent.futures import ThreadPoolExecutor | |
| from langchain_openai import ChatOpenAI | |
| from langchain.schema import SystemMessage, HumanMessage | |
| from sentence_transformers import SentenceTransformer, util | |
# Ensure necessary NLP models are available.
# Both resources are checked at import time so a missing model is downloaded
# once up front rather than failing in the middle of a translation request.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    print("Downloading NLTK punkt tokenizer...")
    nltk.download("punkt")

try:
    import spacy

    nlp = spacy.load("en_core_web_sm")
except OSError:
    # The model package is not installed yet: fetch it, then load again.
    import sys

    print("Downloading SpaCy model...")
    # Run the download with the interpreter that is executing this app (a bare
    # "python" on PATH may belong to a different environment), and raise
    # CalledProcessError if the download fails instead of continuing to a
    # second, less informative load error.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")

# Load AI models
# `translator` is the chat model used for the actual translation;
# `model` is the sentence-embedding model used for semantic glossary matching.
translator = ChatOpenAI(model="gpt-3.5-turbo")
model = SentenceTransformer('all-MiniLM-L6-v2')
def load_glossary_from_excel(glossary_file_bytes) -> dict:
    """Load an English -> Canadian French glossary from an Excel sheet.

    The sheet must contain the columns ``English`` and ``CanadianFrench``.
    English terms are stripped, lower-cased and, when the spaCy pipeline
    ``nlp`` is available, replaced by the space-joined lemmas of their tokens.
    French terms are stripped only.

    Args:
        glossary_file_bytes: Anything ``pandas.read_excel`` accepts (a path or
            a file-like object such as a Streamlit upload).

    Returns:
        A dict mapping the normalized English term to its French term, ordered
        from the longest key to the shortest so that sequential replacement
        tries longer terms first.

    Raises:
        KeyError: If the sheet lacks an ``English`` or ``CanadianFrench`` column.
    """
    df = pd.read_excel(glossary_file_bytes)
    glossary = {}
    for english, french in zip(df["English"], df["CanadianFrench"]):
        if pd.isnull(english) or pd.isnull(french):
            continue

        # Cells may hold numbers or other non-string values; coerce to text
        # instead of failing on a missing .strip().
        english_term = str(english).strip().lower()
        french_term = str(french).strip()

        # A blank key would later compile to a pattern that matches at every
        # word boundary, and a blank value would silently delete the term.
        if not english_term or not french_term:
            continue

        if nlp:
            lemmatized_term = " ".join(token.lemma_ for token in nlp(english_term))
        else:
            lemmatized_term = english_term
        if not lemmatized_term:
            continue

        glossary[lemmatized_term] = french_term

    return dict(sorted(glossary.items(), key=lambda item: len(item[0]), reverse=True))
def compute_glossary_embeddings_cached(glossary_items: tuple):
    """Encode the glossary's English terms with the sentence-embedding model.

    Args:
        glossary_items: Tuple of ``(english_term, french_term)`` pairs.

    Returns:
        A ``(terms, embeddings)`` pair: the list of distinct English terms in
        first-seen order, and the result of encoding that list as a tensor.

    Note:
        Despite the name, this function does not memoize anything itself;
        every call re-encodes the terms.
    """
    # Round-tripping through dict de-duplicates terms while keeping order.
    terms = list(dict(glossary_items))
    return terms, model.encode(terms, convert_to_tensor=True)
def enforce_glossary_pre_translation(text: str, glossary: dict) -> str:
    """Upper-case every glossary term found in the English source text.

    Each key of ``glossary`` is searched case-insensitively as a whole term
    and every occurrence is replaced by the key in upper case, marking it for
    emphasis before the text is sent for translation. The glossary values are
    not used here.

    Args:
        text: English text to mark up.
        glossary: Mapping whose keys are the English terms to emphasize. Terms
            are applied in the mapping's iteration order.

    Returns:
        The text with all matched terms upper-cased.
    """
    for eng_term in glossary:
        if not eng_term:
            # An empty term would match at every position; nothing to mark.
            continue

        emphasized = eng_term.upper()
        # Look-arounds instead of \b: identical for terms that start and end
        # with a word character, but they also match terms with punctuation
        # at the edges (e.g. "c++"), which \b can never match.
        pattern = r"(?<!\w)" + re.escape(eng_term) + r"(?!\w)"
        # A callable replacement is inserted literally; a plain string would
        # be parsed as a regex template and break on backslashes in the term.
        text = re.sub(
            pattern,
            lambda _match, repl=emphasized: repl,
            text,
            flags=re.IGNORECASE,
        )
    return text
def retry_translate_text(text: str, max_retries=3) -> str:
    """Translate ``text`` to Canadian French, retrying on failure.

    The text is sent to the module-level chat model ``translator`` together
    with a fixed system prompt. Any exception raised by an attempt is printed
    and the call is retried after a 2-second pause.

    Args:
        text: English text to translate.
        max_retries: Maximum number of attempts. Values below 1 make no
            attempt at all.

    Returns:
        The stripped model response, or the sentence
        ``"Translation failed. Please try again later."`` when every attempt
        failed. No exception is propagated to the caller.
    """
    failure_message = "Translation failed. Please try again later."
    if max_retries < 1:
        return failure_message

    # The prompt is identical for every attempt, so build it once.
    messages = [
        SystemMessage(content="You are a professional translator. Translate the following text to Canadian French while preserving its meaning and respecting these specific terms."),
        HumanMessage(content=text),
    ]

    for attempt in range(max_retries):
        try:
            # .invoke() is the supported entry point; calling the model object
            # directly is deprecated in current LangChain releases.
            response = translator.invoke(messages)
            return response.content.strip()
        except Exception as e:
            print(f"Error in translation (attempt {attempt+1}): {e}")
            # Only wait when another attempt will actually follow.
            if attempt + 1 < max_retries:
                time.sleep(2)

    return failure_message
def enforce_glossary_post_translation(text: str, glossary: dict) -> str:
    """Replace English glossary terms left in the translated text.

    For every ``english -> french`` pair, each whole-term occurrence of the
    English term (searched case-insensitively, so the upper-cased marker form
    matches too) is replaced by the French term, inserted literally.

    Args:
        text: Translated text that may still contain English glossary terms.
        glossary: Mapping of English term to French term. Pairs are applied
            one after another in the mapping's iteration order.

    Returns:
        The text with every matched English term replaced by its French term.
    """
    for eng_term, fr_term in glossary.items():
        if not eng_term:
            # An empty term matches at every position and would inject the
            # French term throughout the text.
            continue

        # Look-arounds instead of \b: identical for terms that start and end
        # with a word character, but they also match terms with punctuation
        # at the edges (e.g. "C++"), which \b can never match.
        pattern = r"(?<!\w)" + re.escape(eng_term.upper()) + r"(?!\w)"
        # A callable replacement is inserted literally; a plain string would
        # be parsed as a regex template and break on backslashes.
        text = re.sub(
            pattern,
            lambda _match, repl=fr_term: repl,
            text,
            flags=re.IGNORECASE,
        )
    return text
def enforce_glossary_with_semantics(text: str, glossary: dict, threshold: float) -> str:
    """Apply glossary replacements sentence by sentence, gated by similarity.

    The text is split into sentences (with the spaCy pipeline ``nlp`` when it
    is available, otherwise with NLTK). Each sentence is embedded and compared
    against the embeddings of all glossary terms; when the best cosine
    similarity reaches ``threshold``, whole-term occurrences of that single
    best-matching term are replaced by its glossary value.

    Args:
        text: Text to process.
        glossary: Mapping of term to replacement.
        threshold: Minimum cosine similarity required to apply a replacement.

    Returns:
        The processed sentences, stripped and joined with single spaces. When
        ``glossary`` is empty the text is returned unchanged.
    """
    if not glossary:
        # Nothing to compare against; taking the maximum over an empty
        # similarity matrix would raise.
        return text

    glossary_items = tuple(sorted(glossary.items()))
    glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)

    if nlp:
        sentences = [sent.text for sent in nlp(text).sents]
    else:
        sentences = nltk.tokenize.sent_tokenize(text)

    def process_sentence(sentence):
        """Replace the best-matching glossary term in one sentence, if any."""
        if not sentence.strip():
            return sentence

        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
        max_score, max_idx = torch.max(cos_scores, dim=1)

        if max_score.item() >= threshold:
            # Convert the index tensor to a plain int before indexing the list.
            term = glossary_terms[max_idx.item()]
            replacement = glossary[term]
            # Look-arounds behave like \b for ordinary terms but also match
            # terms that start or end with punctuation.
            pattern = r"(?<!\w)" + re.escape(term) + r"(?!\w)"
            # Callable replacement: insert the glossary value literally rather
            # than as a regex template.
            sentence = re.sub(
                pattern,
                lambda _match: replacement,
                sentence,
                flags=re.IGNORECASE,
            )
        return sentence.strip()

    with ThreadPoolExecutor() as executor:
        updated_sentences = list(executor.map(process_sentence, sentences))

    return " ".join(updated_sentences)
# Streamlit UI
# Collects the source text, a glossary workbook and a similarity threshold,
# then runs the four-step pipeline when the user presses "Translate".
st.title("AI-Powered English to Canadian French Translator")
st.write("This version ensures glossary priority, improves enforcement, and validates meaning.")

input_text = st.text_area("Enter text to translate:")
glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
threshold = st.slider("Semantic Matching Threshold", 0.5, 1.0, 0.85)

if st.button("Translate"):
    if not input_text.strip():
        st.error("Please enter text to translate.")
    elif glossary_file is None:
        st.error("Glossary file is required.")
    else:
        # UI boundary: an unreadable workbook or a missing column should be
        # reported to the user rather than crash the app with a traceback.
        try:
            glossary = load_glossary_from_excel(glossary_file)
        except Exception as exc:
            glossary = {}
            st.error(f"Could not load the glossary file: {exc}")
        else:
            if not glossary:
                st.error("The glossary file contains no usable English/CanadianFrench rows.")

        if glossary:
            # Step 1: Enforce Glossary Before Translation
            pre_translated_text = enforce_glossary_pre_translation(input_text, glossary)

            # Step 2: Translate Text with OpenAI
            translated_text = retry_translate_text(pre_translated_text)

            # retry_translate_text signals failure by returning this sentence;
            # show it as an error instead of presenting it as a translation.
            if translated_text == "Translation failed. Please try again later.":
                st.error(translated_text)
            else:
                # Step 3: Enforce Glossary After Translation
                post_translated_text = enforce_glossary_post_translation(translated_text, glossary)

                # Step 4: Apply Semantic Matching to Catch Any Missed Glossary Terms
                glossary_enforced_text = enforce_glossary_with_semantics(post_translated_text, glossary, threshold)

                st.subheader("Final Translated Text:")
                st.write(glossary_enforced_text)