import os
import uuid
import tempfile
import re
import requests
import pandas as pd
from tika import parser
from docx import Document
from sentence_transformers import SentenceTransformer, util
import torch
import streamlit as st
from io import BytesIO
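# NOTE (assumed environment, not confirmed by the original Space): the imports
# above rely on third-party packages that must be installed for the app to
# build. A hypothetical requirements.txt sketch would be:
#   streamlit
#   pandas
#   openpyxl        # needed by pandas for .xlsx files
#   requests
#   tika
#   python-docx
#   sentence-transformers
#   torch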
# Load the pre-trained embedding model for semantic matching.
model = SentenceTransformer('all-MiniLM-L6-v2')
# -----------------------------
# Glossary Loader and Enforcement
# -----------------------------
def load_glossary(glossary_file) -> dict:
    """
    Load the company glossary from an Excel file.
    Expects columns: 'English' and 'CanadianFrench'.
    """
    try:
        # Use pandas to read directly from the uploaded file (BytesIO).
        df = pd.read_excel(glossary_file)
        glossary = {
            row['English'].strip().lower(): row['CanadianFrench'].strip()
            for _, row in df.iterrows()
            if pd.notnull(row['English']) and pd.notnull(row['CanadianFrench'])
        }
        return glossary
    except Exception as e:
        raise Exception(f"Error loading glossary: {str(e)}")
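# Illustrative glossary layout (example values only, not from the source):
#   | English   | CanadianFrench   |
#   |-----------|------------------|
#   | invoice   | facture          |
#   | timesheet | feuille de temps |
# Dictionary keys are the lower-cased English terms; values keep the casing
# given in the spreadsheet.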
def apply_glossary(text: str, glossary: dict) -> str:
    """
    Replace occurrences of glossary terms (exact whole-word match) with the
    preferred Canadian French terms.
    """
    for eng_term, fr_term in glossary.items():
        pattern = r'\b' + re.escape(eng_term) + r'\b'
        text = re.sub(pattern, fr_term, text, flags=re.IGNORECASE)
    return text
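# Example (hypothetical values): with the glossary {"invoice": "facture"},
# apply_glossary("The invoice is attached", {"invoice": "facture"})
# returns "The facture is attached" -- whole words only, case-insensitive.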
# -----------------------------
# Semantic Glossary Enforcement
# -----------------------------
def compute_glossary_embeddings(glossary: dict):
    """
    Precompute embeddings for the glossary keys.
    """
    glossary_terms = list(glossary.keys())
    embeddings = model.encode(glossary_terms, convert_to_tensor=True)
    return glossary_terms, embeddings
def apply_semantic_glossary(text: str, glossary: dict, threshold: float = 0.8) -> str:
    """
    Enhance glossary enforcement using semantic similarity.
    Splits the text into sentences, computes an embedding for each sentence,
    and if a sentence is semantically similar to a glossary term (cosine
    similarity above the threshold), applies the glossary replacement to it.
    """
    glossary_terms, glossary_embeddings = compute_glossary_embeddings(glossary)
    sentences = text.split('.')
    updated_sentences = []
    for sentence in sentences:
        if not sentence.strip():
            continue
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
        max_score, max_idx = torch.max(cos_scores, dim=1)
        if max_score.item() >= threshold:
            term = glossary_terms[max_idx.item()]
            replacement = glossary[term]
            pattern = r'\b' + re.escape(term) + r'\b'
            sentence = re.sub(pattern, replacement, sentence, flags=re.IGNORECASE)
        updated_sentences.append(sentence.strip())
    final_text = '. '.join(updated_sentences)
    return final_text
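# Sketch of the matching step (illustrative values): a sentence such as
# "Please send the invoice" is embedded and compared against every glossary
# key; only if the best cosine score reaches `threshold` is the usual
# whole-word regex replacement applied to that sentence. Splitting on '.' is
# a simple heuristic; it ignores '!' and '?' and drops the final period.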
# -----------------------------
# Translation using Azure Translator API
# -----------------------------
def translate_text_azure(text: str) -> str:
    """
    Translate text to Canadian French using the Azure Translator API.
    """
    subscription_key = os.getenv("AZURE_TRANSLATOR_KEY")
    region = os.getenv("AZURE_TRANSLATOR_REGION")
    if not subscription_key or not region:
        raise Exception("Azure Translator credentials not set.")
    endpoint = "https://api.cognitive.microsofttranslator.com/translate"
    params = {"api-version": "3.0", "to": "fr-CA"}
    headers = {
        "Ocp-Apim-Subscription-Key": subscription_key,
        "Ocp-Apim-Subscription-Region": region,
        "Content-type": "application/json",
        "X-ClientTraceId": str(uuid.uuid4())
    }
    body = [{"text": text}]
    response = requests.post(endpoint, params=params, headers=headers, json=body)
    if response.status_code != 200:
        raise Exception(f"Translation API error: {response.text}")
    result = response.json()
    translated_text = result[0]['translations'][0]['text']
    return translated_text
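# The two environment variables above must be set before the app starts,
# e.g. in the shell (placeholder values):
#   export AZURE_TRANSLATOR_KEY="<your-key>"
#   export AZURE_TRANSLATOR_REGION="<your-region>"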
# -----------------------------
# Document Parsing & Reconstruction
# -----------------------------
def parse_document(file_path: str) -> str:
    """
    Extract text content from a document using Apache Tika.
    """
    parsed = parser.from_file(file_path)
    text = parsed.get("content", "")
    if not text:
        raise Exception("No text content found in the document.")
    return text
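# Note: the `tika` Python package talks to an Apache Tika server; on first use
# it downloads the server jar and starts it locally, which requires a Java
# runtime to be available in the deployment environment.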
def rebuild_document(text: str) -> bytes:
    """
    Rebuild a DOCX document from the provided text.
    Returns the document as bytes.
    """
    document = Document()
    for line in text.split("\n"):
        if line.strip():
            document.add_paragraph(line)
    bio = BytesIO()
    document.save(bio)
    bio.seek(0)
    return bio.getvalue()
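# The rebuilt DOCX contains one plain paragraph per non-empty line of text;
# the original document's formatting (styles, tables, images) is not preserved.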
# -----------------------------
# Processing Pipeline
# -----------------------------
def process_translation(doc_file, glossary_file) -> bytes:
    try:
        # Write uploaded document to a temporary file
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_doc:
            tmp_doc.write(doc_file.read())
            doc_path = tmp_doc.name
        # Load glossary from the uploaded Excel file
        glossary = load_glossary(glossary_file)
        # Parse document text
        raw_text = parse_document(doc_path)
        # Translate text via Azure Translator
        translated_text = translate_text_azure(raw_text)
        # Apply exact glossary enforcement
        final_text = apply_glossary(translated_text, glossary)
        # Apply semantic glossary enforcement
        final_text = apply_semantic_glossary(final_text, glossary, threshold=0.8)
        # Rebuild document to DOCX and get bytes
        output_bytes = rebuild_document(final_text)
        # Clean up temporary file
        os.unlink(doc_path)
        return output_bytes
    except Exception as e:
        st.error(f"Error: {str(e)}")
        return None
# -----------------------------
# Streamlit App UI
# -----------------------------
def main():
    st.title("English to Canadian Quebec French Translator")
    st.write("Upload an English document (Word or PDF) and your company glossary (Excel) to translate.")
    doc_file = st.file_uploader("Upload English Document", type=["doc", "docx", "pdf"])
    glossary_file = st.file_uploader("Upload Company Glossary (Excel)", type=["xlsx"])
    if st.button("Translate Document"):
        if doc_file is None or glossary_file is None:
            st.error("Please upload both the document and glossary files.")
        else:
            with st.spinner("Translating..."):
                result = process_translation(doc_file, glossary_file)
            if result is not None:
                st.download_button(
                    label="Download Translated DOCX",
                    data=result,
                    file_name="translated.docx",
                    mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                )

if __name__ == "__main__":
    main()