Spaces:
Build error
Build error
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import uuid
|
| 3 |
+
import tempfile
|
| 4 |
+
import re
|
| 5 |
+
import requests
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from tika import parser
|
| 8 |
+
from docx import Document
|
| 9 |
+
from sentence_transformers import SentenceTransformer, util
|
| 10 |
+
import torch
|
| 11 |
+
import streamlit as st
|
| 12 |
+
from io import BytesIO
|
| 13 |
+
|
| 14 |
+
# Load the pre-trained embedding model for semantic matching.
|
| 15 |
+
# Shared module-level encoder instance: compute_glossary_embeddings() and
# apply_semantic_glossary() both call model.encode() on it.
model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 16 |
+
|
| 17 |
+
# -----------------------------
|
| 18 |
+
# Glossary Loader and Enforcement
|
| 19 |
+
# -----------------------------
|
| 20 |
+
def load_glossary(glossary_file) -> dict:
    """
    Load the company glossary from an Excel file.

    Expects columns: 'English' and 'CanadianFrench'.

    Args:
        glossary_file: Path or file-like object accepted by ``pd.read_excel``.

    Returns:
        Mapping of lower-cased, stripped English term -> stripped Canadian
        French term. Rows where either cell is null or blank are skipped.
        If the same English term appears on several rows, the last row wins.

    Raises:
        Exception: If the file cannot be read or a required column is
            missing. The underlying error is chained as ``__cause__``.
    """
    try:
        df = pd.read_excel(glossary_file)

        missing = [col for col in ('English', 'CanadianFrench') if col not in df.columns]
        if missing:
            raise ValueError(f"missing required column(s): {', '.join(missing)}")

        glossary = {}
        for english, french in zip(df['English'], df['CanadianFrench']):
            if pd.isnull(english) or pd.isnull(french):
                continue
            # Cells may be numeric (e.g. a part number); coerce before strip().
            key = str(english).strip().lower()
            value = str(french).strip()
            # A blank key would later turn into a pattern that matches at
            # every word boundary, so blank entries are dropped here.
            if key and value:
                glossary[key] = value
        return glossary
    except Exception as e:
        raise Exception(f"Error loading glossary: {str(e)}") from e
|
| 36 |
+
|
| 37 |
+
def apply_glossary(text: str, glossary: dict) -> str:
    """
    Replace occurrences of glossary terms (whole-term, case-insensitive match)
    with the preferred Canadian French terms.

    All terms are substituted in a single pass, so:
      * longer terms take precedence over shorter ones that they contain
        ("credit card" is matched before "credit");
      * text produced by one replacement is never re-translated by another;
      * replacement text is inserted literally (backslashes are not
        interpreted as regex escapes or group references).

    Args:
        text: Text to post-process.
        glossary: Mapping of English term -> preferred French term. Empty or
            blank keys are ignored. If two keys differ only by case, the
            first one in iteration order wins.

    Returns:
        The text with every glossary term replaced.
    """
    lookup = {}
    for eng_term, fr_term in glossary.items():
        if eng_term and eng_term.strip():
            lookup.setdefault(eng_term.lower(), fr_term)

    if not text or not lookup:
        return text

    # Longest first: regex alternation takes the first alternative that matches.
    ordered_terms = sorted(lookup, key=len, reverse=True)
    # Lookarounds instead of \b so terms edged by punctuation ("c++") match too.
    pattern = re.compile(
        r'(?<!\w)(?:' + '|'.join(re.escape(term) for term in ordered_terms) + r')(?!\w)',
        flags=re.IGNORECASE,
    )

    def _replacement(match):
        found = match.group(0)
        try:
            return lookup[found.lower()]
        except KeyError:
            # Rare case-folding mismatch between str.lower() and the regex
            # engine: fall back to asking the engine which term matched.
            for term in ordered_terms:
                if re.fullmatch(re.escape(term), found, flags=re.IGNORECASE):
                    return lookup[term]
            return found

    return pattern.sub(_replacement, text)
|
| 45 |
+
|
| 46 |
+
# -----------------------------
|
| 47 |
+
# Semantic Glossary Enforcement
|
| 48 |
+
# -----------------------------
|
| 49 |
+
def compute_glossary_embeddings(glossary: dict):
    """
    Encode every glossary key with the module-level sentence encoder.

    Returns:
        A ``(terms, embeddings)`` pair: the glossary keys as a list, and the
        tensor produced by encoding that list (same order).
    """
    terms = [*glossary]
    return terms, model.encode(terms, convert_to_tensor=True)
|
| 56 |
+
|
| 57 |
+
def apply_semantic_glossary(text: str, glossary: dict, threshold: float = 0.8) -> str:
    """
    Enhance glossary enforcement using semantic similarity.

    The text is cut at '.' characters; each non-blank segment is embedded and
    compared against the glossary-term embeddings. When the best cosine
    similarity for a segment reaches ``threshold``, occurrences of that
    best-matching term inside the segment are replaced by its glossary value.

    The segments are re-joined with the same '.' separators and no
    whitespace is stripped, so punctuation, line breaks and decimals in the
    input are preserved exactly; only matched terms change.

    Args:
        text: Text to post-process.
        glossary: Mapping of term -> preferred replacement.
        threshold: Minimum cosine similarity for a segment to be rewritten.

    Returns:
        The text with semantic glossary replacements applied. Empty text or
        an empty glossary returns ``text`` unchanged without encoding.
    """
    if not text or not glossary:
        return text

    glossary_terms, glossary_embeddings = compute_glossary_embeddings(glossary)

    segments = text.split('.')
    candidate_positions = [i for i, segment in enumerate(segments) if segment.strip()]
    if not candidate_positions:
        return text

    # Encode all candidate segments in one batch instead of one call each.
    segment_embeddings = model.encode(
        [segments[i] for i in candidate_positions], convert_to_tensor=True
    )
    cos_scores = util.pytorch_cos_sim(segment_embeddings, glossary_embeddings)
    best_scores, best_indices = cos_scores.max(dim=1)

    for row, position in enumerate(candidate_positions):
        if best_scores[row].item() < threshold:
            continue
        term = glossary_terms[int(best_indices[row].item())]
        if not term:
            continue
        replacement = glossary[term]
        pattern = r'(?<!\w)' + re.escape(term) + r'(?!\w)'
        # Callable replacement: insert the glossary value literally, without
        # regex-template processing of any backslashes it contains.
        segments[position] = re.sub(
            pattern,
            lambda _match, _value=replacement: _value,
            segments[position],
            flags=re.IGNORECASE,
        )

    return '.'.join(segments)
|
| 80 |
+
|
| 81 |
+
# -----------------------------
|
| 82 |
+
# Translation using Azure Translator API
|
| 83 |
+
# -----------------------------
|
| 84 |
+
# Azure Translator rejects requests whose text exceeds this many characters.
_AZURE_MAX_REQUEST_CHARS = 50000
# Seconds to wait for the service before giving up instead of hanging the app.
_AZURE_REQUEST_TIMEOUT = 30


def _split_for_translation(text, limit):
    """Split text into chunks of at most ``limit`` chars, cutting at line ends when possible."""
    chunks = []
    current = ""
    for line in text.splitlines(keepends=True):
        # A single line longer than the limit has to be hard-split.
        while len(line) > limit:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(line[:limit])
            line = line[limit:]
        if len(current) + len(line) > limit:
            chunks.append(current)
            current = ""
        current += line
    if current:
        chunks.append(current)
    return chunks


def translate_text_azure(text: str) -> str:
    """
    Translate text to Canadian French using the Azure Translator API.

    Credentials are read from the ``AZURE_TRANSLATOR_KEY`` and
    ``AZURE_TRANSLATOR_REGION`` environment variables. Text longer than the
    per-request character limit is sent as several requests (split at line
    boundaries where possible) and the translations are concatenated in
    order. Whitespace-only chunks are passed through without an API call.

    Args:
        text: Source text to translate.

    Returns:
        The translated text.

    Raises:
        Exception: If credentials are missing or the API returns a non-200
            status. Network failures and timeouts propagate as the
            ``requests`` exceptions raised by ``requests.post``.
    """
    subscription_key = os.getenv("AZURE_TRANSLATOR_KEY")
    region = os.getenv("AZURE_TRANSLATOR_REGION")
    if not subscription_key or not region:
        raise Exception("Azure Translator credentials not set.")

    endpoint = "https://api.cognitive.microsofttranslator.com/translate"
    params = {"api-version": "3.0", "to": "fr-CA"}

    translated_parts = []
    for chunk in _split_for_translation(text, _AZURE_MAX_REQUEST_CHARS):
        if not chunk.strip():
            translated_parts.append(chunk)
            continue
        headers = {
            "Ocp-Apim-Subscription-Key": subscription_key,
            "Ocp-Apim-Subscription-Region": region,
            "Content-type": "application/json",
            # Fresh trace id per request so each call can be traced on its own.
            "X-ClientTraceId": str(uuid.uuid4())
        }
        response = requests.post(
            endpoint,
            params=params,
            headers=headers,
            json=[{"text": chunk}],
            timeout=_AZURE_REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            raise Exception(f"Translation API error: {response.text}")
        result = response.json()
        translated_parts.append(result[0]['translations'][0]['text'])

    return "".join(translated_parts)
|
| 108 |
+
|
| 109 |
+
# -----------------------------
|
| 110 |
+
# Document Parsing & Reconstruction
|
| 111 |
+
# -----------------------------
|
| 112 |
+
def parse_document(file_path: str) -> str:
    """
    Run the file at ``file_path`` through Tika and return its text content.

    Raises:
        Exception: If the parse result has no (or empty) "content".
    """
    content = parser.from_file(file_path).get("content", "")
    if content:
        return content
    raise Exception("No text content found in the document.")
|
| 121 |
+
|
| 122 |
+
def rebuild_document(text: str) -> bytes:
    """
    Build a DOCX file in memory from plain text and return its raw bytes.

    Each non-blank line of ``text`` (split on "\\n") becomes one paragraph;
    blank and whitespace-only lines are dropped.
    """
    docx_file = Document()
    # filter(str.strip, ...) keeps the original line whenever it is non-blank.
    for paragraph_text in filter(str.strip, text.split("\n")):
        docx_file.add_paragraph(paragraph_text)

    with BytesIO() as buffer:
        docx_file.save(buffer)
        return buffer.getvalue()
|
| 135 |
+
|
| 136 |
+
# -----------------------------
|
| 137 |
+
# Processing Pipeline
|
| 138 |
+
# -----------------------------
|
| 139 |
+
def process_translation(doc_file, glossary_file) -> bytes:
    """
    Run the full pipeline: parse -> translate -> enforce glossary -> DOCX.

    Args:
        doc_file: Uploaded document; must provide ``read()``. If it has a
            ``name`` attribute, its extension is used for the temporary file
            (falls back to ".pdf").
        glossary_file: Uploaded Excel glossary, passed to ``load_glossary``.

    Returns:
        The translated document as DOCX bytes, or ``None`` if any step
        failed (the error is reported through ``st.error``).
    """
    doc_path = None
    try:
        # Keep the upload's real extension rather than always claiming PDF.
        upload_name = getattr(doc_file, "name", "") or ""
        suffix = os.path.splitext(upload_name)[1] or ".pdf"

        # Write uploaded document to a temporary file
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_doc:
            doc_path = tmp_doc.name
            tmp_doc.write(doc_file.read())

        # Load glossary from the uploaded Excel file
        glossary = load_glossary(glossary_file)

        # Parse document text
        raw_text = parse_document(doc_path)

        # Translate text via Azure Translator
        translated_text = translate_text_azure(raw_text)

        # Apply exact glossary enforcement
        final_text = apply_glossary(translated_text, glossary)

        # Apply semantic glossary enforcement
        final_text = apply_semantic_glossary(final_text, glossary, threshold=0.8)

        # Rebuild document to DOCX and get bytes
        return rebuild_document(final_text)
    except Exception as e:
        st.error(f"Error: {str(e)}")
        return None
    finally:
        # Remove the temporary file on success and on failure alike.
        if doc_path is not None:
            try:
                os.unlink(doc_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not mask the
                # pipeline's result or its original error.
                pass
|
| 170 |
+
|
| 171 |
+
# -----------------------------
|
| 172 |
+
# Streamlit App UI
|
| 173 |
+
# -----------------------------
|
| 174 |
+
def main():
    """
    Render the Streamlit page: two uploaders, a translate button, and, after
    a successful run, a download button for the translated DOCX.
    """
    st.title("English to Canadian Quebec French Translator")
    st.write("Upload an English document (Word or PDF) and your company glossary (Excel) to translate.")

    doc_file = st.file_uploader("Upload English Document", type=["doc", "docx", "pdf"])
    glossary_file = st.file_uploader("Upload Company Glossary (Excel)", type=["xlsx"])

    # Nothing more to do until the user asks for a translation.
    if not st.button("Translate Document"):
        return

    if doc_file is None or glossary_file is None:
        st.error("Please upload both the document and glossary files.")
        return

    with st.spinner("Translating..."):
        result = process_translation(doc_file, glossary_file)

    # process_translation has already reported the error when it returns None.
    if result is None:
        return

    st.download_button(
        label="Download Translated DOCX",
        data=result,
        file_name="translated.docx",
        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )


if __name__ == "__main__":
    main()
|