Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -6,23 +6,27 @@ import pandas as pd
|
|
| 6 |
import torch
|
| 7 |
import nltk
|
| 8 |
import time
|
|
|
|
| 9 |
from concurrent.futures import ThreadPoolExecutor
|
| 10 |
|
| 11 |
-
|
| 12 |
from langchain_openai import ChatOpenAI
|
| 13 |
-
|
| 14 |
from langchain.schema import SystemMessage, HumanMessage
|
| 15 |
from sentence_transformers import SentenceTransformer, util
|
| 16 |
|
| 17 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
try:
|
| 19 |
import spacy
|
| 20 |
nlp = spacy.load("en_core_web_sm")
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
use_spacy = False
|
| 26 |
|
| 27 |
# Load AI models
|
| 28 |
translator = ChatOpenAI(model="gpt-3.5-turbo")
|
|
@@ -38,8 +42,8 @@ def load_glossary_from_excel(glossary_file_bytes) -> dict:
|
|
| 38 |
if pd.notnull(row['English']) and pd.notnull(row['CanadianFrench']):
|
| 39 |
english_term = row['English'].strip().lower()
|
| 40 |
french_term = row['CanadianFrench'].strip()
|
| 41 |
-
doc = nlp(english_term) if
|
| 42 |
-
lemmatized_term = " ".join([token.lemma_ for token in doc]) if
|
| 43 |
glossary[lemmatized_term] = french_term
|
| 44 |
|
| 45 |
return dict(sorted(glossary.items(), key=lambda item: len(item[0]), reverse=True))
|
|
@@ -64,7 +68,7 @@ def retry_translate_text(text: str, max_retries=3) -> str:
|
|
| 64 |
return response.content.strip()
|
| 65 |
except Exception as e:
|
| 66 |
print(f"Error in translation (attempt {attempt+1}): {e}")
|
| 67 |
-
time.sleep(2)
|
| 68 |
return "Translation failed. Please try again later."
|
| 69 |
|
| 70 |
def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
|
|
@@ -72,7 +76,7 @@ def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
|
|
| 72 |
glossary_items = tuple(sorted(glossary.items()))
|
| 73 |
glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)
|
| 74 |
|
| 75 |
-
sentences = nltk.tokenize.sent_tokenize(text) if not
|
| 76 |
|
| 77 |
def process_sentence(sentence):
|
| 78 |
"""Processes a single sentence with glossary enforcement."""
|
|
@@ -95,7 +99,6 @@ def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
|
|
| 95 |
|
| 96 |
return sentence.strip()
|
| 97 |
|
| 98 |
-
# Process sentences in parallel for speed
|
| 99 |
with ThreadPoolExecutor() as executor:
|
| 100 |
updated_sentences = list(executor.map(process_sentence, sentences))
|
| 101 |
|
|
|
|
| 6 |
import torch
|
| 7 |
import nltk
|
| 8 |
import time
|
| 9 |
+
import subprocess
|
| 10 |
from concurrent.futures import ThreadPoolExecutor
|
| 11 |
|
|
|
|
| 12 |
from langchain_openai import ChatOpenAI
|
|
|
|
| 13 |
from langchain.schema import SystemMessage, HumanMessage
|
| 14 |
from sentence_transformers import SentenceTransformer, util
|
| 15 |
|
| 16 |
+
# Ensure necessary NLP models are available
|
| 17 |
+
try:
|
| 18 |
+
nltk.data.find("tokenizers/punkt")
|
| 19 |
+
except LookupError:
|
| 20 |
+
print("Downloading NLTK punkt tokenizer...")
|
| 21 |
+
nltk.download("punkt")
|
| 22 |
+
|
| 23 |
try:
|
| 24 |
import spacy
|
| 25 |
nlp = spacy.load("en_core_web_sm")
|
| 26 |
+
except OSError:
|
| 27 |
+
print("Downloading SpaCy model...")
|
| 28 |
+
subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
|
| 29 |
+
nlp = spacy.load("en_core_web_sm")
|
|
|
|
| 30 |
|
| 31 |
# Load AI models
|
| 32 |
translator = ChatOpenAI(model="gpt-3.5-turbo")
|
|
|
|
| 42 |
if pd.notnull(row['English']) and pd.notnull(row['CanadianFrench']):
|
| 43 |
english_term = row['English'].strip().lower()
|
| 44 |
french_term = row['CanadianFrench'].strip()
|
| 45 |
+
doc = nlp(english_term) if nlp else english_term.split()
|
| 46 |
+
lemmatized_term = " ".join([token.lemma_ for token in doc]) if nlp else english_term
|
| 47 |
glossary[lemmatized_term] = french_term
|
| 48 |
|
| 49 |
return dict(sorted(glossary.items(), key=lambda item: len(item[0]), reverse=True))
|
|
|
|
| 68 |
return response.content.strip()
|
| 69 |
except Exception as e:
|
| 70 |
print(f"Error in translation (attempt {attempt+1}): {e}")
|
| 71 |
+
time.sleep(2)
|
| 72 |
return "Translation failed. Please try again later."
|
| 73 |
|
| 74 |
def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
|
|
|
|
| 76 |
glossary_items = tuple(sorted(glossary.items()))
|
| 77 |
glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)
|
| 78 |
|
| 79 |
+
sentences = nltk.tokenize.sent_tokenize(text) if not nlp else [sent.text for sent in nlp(text).sents]
|
| 80 |
|
| 81 |
def process_sentence(sentence):
|
| 82 |
"""Processes a single sentence with glossary enforcement."""
|
|
|
|
| 99 |
|
| 100 |
return sentence.strip()
|
| 101 |
|
|
|
|
| 102 |
with ThreadPoolExecutor() as executor:
|
| 103 |
updated_sentences = list(executor.map(process_sentence, sentences))
|
| 104 |
|