|
|
import gradio as gr |
|
|
from simplemma import simple_tokenizer |
|
|
from difflib import Differ |
|
|
from icecream import ic |
|
|
from app.webui.patch import model_load,num_tokens_in_string,one_chunk_initial_translation, one_chunk_reflect_on_translation, one_chunk_improve_translation |
|
|
from app.webui.patch import calculate_chunk_size, multichunk_initial_translation, multichunk_reflect_on_translation, multichunk_improve_translation |
|
|
|
|
|
from llama_index.core.node_parser import SentenceSplitter |
|
|
|
|
|
from translatepy.translators.google import GoogleTranslate |
|
|
from translatepy.exceptions import UnknownLanguage |
|
|
from translatepy import Language |
|
|
|
|
|
# Shared Google Translate client, used as an optional machine-translation
# first pass inside translator_sec() when its `gt` flag is set.
gtranslator = GoogleTranslate()




# Module-level Gradio progress tracker reused by translator() and
# translator_sec() to report the multi-step workflow in the webui.
progress=gr.Progress()
|
|
|
|
|
def tokenize(text):
    """Split *text* into word tokens with single-space separators restored.

    Uses simplemma's ``simple_tokenizer`` to obtain the words, then
    re-inserts a ``' '`` token after every word except apostrophe-attached
    forms (e.g. ``"'s"``), so the stream renders back close to the original
    spacing when highlighted.

    Args:
        text: Input string to tokenize.

    Returns:
        A list of word (and single-space) tokens. If *text* contains no
        space at all, the raw word list is returned unchanged.
    """
    words = simple_tokenizer(text)

    # Space-free input (e.g. CJK text or a single word): nothing to interleave.
    if ' ' not in text:
        return words

    tokens = []
    for word in words:
        tokens.append(word)
        # Apostrophe clitics stay glued to their neighbour: no space after.
        if not word.startswith("'") and not word.endswith("'"):
            tokens.append(' ')

    # Bug fix: the original returned tokens[:-1] unconditionally, which
    # dropped the final WORD (not a separator) whenever the last word was an
    # apostrophe form and thus had no trailing space appended. Only strip a
    # genuine trailing space.
    if tokens and tokens[-1] == ' ':
        tokens.pop()
    return tokens
|
|
|
|
|
def diff_texts(text1, text2):
    """Compute a token-level diff of two texts for highlighted display.

    Args:
        text1: The "before" text.
        text2: The "after" text.

    Returns:
        A list of ``(token, category)`` pairs where category is ``'added'``,
        ``'removed'``, or ``None`` for unchanged tokens — the format expected
        by Gradio's HighlightedText component.
    """
    # Differ prefixes every entry with a two-character code: '+ ', '- ',
    # '  ' (unchanged) or '? ' (intraline hint, which we drop entirely).
    marker_to_category = {'+': 'added', '-': 'removed'}

    comparison = Differ().compare(tokenize(text1), tokenize(text2))

    highlighted = []
    for entry in comparison:
        flag = entry[0]
        if flag == '?':
            continue
        highlighted.append((entry[2:], marker_to_category.get(flag)))
    return highlighted
|
|
|
|
|
|
|
|
def translator(
    source_lang: str,
    target_lang: str,
    source_text: str,
    country: str,
    max_tokens: int = 1000,
):
    """Translate *source_text* from *source_lang* to *target_lang*.

    Runs the three-step agentic workflow (initial translation, reflection,
    improved translation). Texts whose token count reaches *max_tokens* are
    split into sentence chunks first and each step is applied chunk-wise.

    Args:
        source_lang: Language of the input text.
        target_lang: Language to translate into.
        source_text: The text to translate.
        country: Country hint used to tailor the reflection step.
        max_tokens: Token budget above which the text is chunked.

    Returns:
        Tuple ``(init_translation, reflection, final_translation)`` of strings.
    """
    num_tokens_in_text = num_tokens_in_string(source_text)

    ic(num_tokens_in_text)

    if num_tokens_in_text < max_tokens:
        ic("Translating text as single chunk")

        progress((1, 3), desc="First translation...")
        init_translation = one_chunk_initial_translation(
            source_lang, target_lang, source_text
        )

        # Typo fix: progress label previously read "Reflecton...".
        progress((2, 3), desc="Reflection...")
        reflection = one_chunk_reflect_on_translation(
            source_lang, target_lang, source_text, init_translation, country
        )

        progress((3, 3), desc="Second translation...")
        final_translation = one_chunk_improve_translation(
            source_lang, target_lang, source_text, init_translation, reflection
        )

        return init_translation, reflection, final_translation

    else:
        ic("Translating text as multiple chunks")

        progress((1, 5), desc="Calculate chunk size...")
        token_size = calculate_chunk_size(
            token_count=num_tokens_in_text, token_limit=max_tokens
        )

        ic(token_size)

        text_parser = SentenceSplitter(
            chunk_size=token_size,
        )

        # Typo fix: progress label previously read "Spilt source text...".
        progress((2, 5), desc="Split source text...")
        source_text_chunks = text_parser.split_text(source_text)

        progress((3, 5), desc="First translation...")
        translation_1_chunks = multichunk_initial_translation(
            source_lang, target_lang, source_text_chunks
        )

        init_translation = "".join(translation_1_chunks)

        progress((4, 5), desc="Reflection...")
        reflection_chunks = multichunk_reflect_on_translation(
            source_lang,
            target_lang,
            source_text_chunks,
            translation_1_chunks,
            country,
        )

        reflection = "".join(reflection_chunks)

        progress((5, 5), desc="Second translation...")
        translation_2_chunks = multichunk_improve_translation(
            source_lang,
            target_lang,
            source_text_chunks,
            translation_1_chunks,
            reflection_chunks,
        )

        final_translation = "".join(translation_2_chunks)

        return init_translation, reflection, final_translation
|
|
|
|
|
|
|
|
def translator_sec(
    endpoint2: str,
    model2: str,
    api_key2: str,
    context_window: int,
    num_output: int,
    source_lang: str,
    target_lang: str,
    source_text: str,
    country: str,
    max_tokens: int = 1000,
    gt: bool = False,
):
    """Translate *source_text* using a second endpoint/model for refinement.

    The initial translation is produced either by Google Translate (when
    *gt* is True) or by the currently loaded model; the second model
    (*endpoint2*/*model2*) is then loaded and used for the reflection and
    improvement steps. Long texts are chunked, as in :func:`translator`.

    Args:
        endpoint2: Endpoint of the second (refinement) model.
        model2: Name of the second model.
        api_key2: API key for the second endpoint.
        context_window: Context window to configure on the second model.
        num_output: Max output tokens to configure on the second model.
        source_lang: Language of the input text.
        target_lang: Language to translate into.
        source_text: The text to translate.
        country: Country hint used to tailor the reflection step.
        max_tokens: Token budget above which the text is chunked.
        gt: If True, use Google Translate for the initial translation.

    Returns:
        Tuple ``(init_translation, reflection, final_translation)`` of strings.

    Raises:
        gr.Error: If the target language is unknown or the second model
            fails to load.
    """
    num_tokens_in_text = num_tokens_in_string(source_text)

    ic(num_tokens_in_text)

    if num_tokens_in_text < max_tokens:
        ic("Translating text as single chunk")

        progress((1, 3), desc="First translation...")
        if gt:
            try:
                language = Language(target_lang)
            except Exception as e:
                raise gr.Error(f"An unexpected error occurred: {e}")
            init_translation = gtranslator.translate(source_text, language).result
        else:
            init_translation = one_chunk_initial_translation(
                source_lang, target_lang, source_text
            )
        # Switch to the second model for the reflection/improvement steps.
        try:
            model_load(endpoint2, model2, api_key2, context_window, num_output)
        except Exception as e:
            raise gr.Error(f"An unexpected error occurred: {e}")

        # Typo fix: progress label previously read "Reflecton...".
        progress((2, 3), desc="Reflection...")
        reflection = one_chunk_reflect_on_translation(
            source_lang, target_lang, source_text, init_translation, country
        )

        progress((3, 3), desc="Second translation...")
        final_translation = one_chunk_improve_translation(
            source_lang, target_lang, source_text, init_translation, reflection
        )

        return init_translation, reflection, final_translation

    else:
        ic("Translating text as multiple chunks")

        progress((1, 5), desc="Calculate chunk size...")
        token_size = calculate_chunk_size(
            token_count=num_tokens_in_text, token_limit=max_tokens
        )

        ic(token_size)

        text_parser = SentenceSplitter(
            chunk_size=token_size,
        )

        # Typo fix: progress label previously read "Spilt source text...".
        progress((2, 5), desc="Split source text...")
        source_text_chunks = text_parser.split_text(source_text)

        progress((3, 5), desc="First translation...")
        if gt:
            try:
                language = Language(target_lang)
            except Exception as e:
                raise gr.Error(f"An unexpected error occurred: {e}")
            # Bug fix: the original passed the whole chunk LIST to
            # gtranslator.translate(), which expects a single string; the
            # result also failed to stay aligned chunk-by-chunk with
            # source_text_chunks for the reflection step. Translate each
            # chunk separately so the two lists remain parallel.
            translation_1_chunks = [
                gtranslator.translate(chunk, language).result
                for chunk in source_text_chunks
            ]
        else:
            translation_1_chunks = multichunk_initial_translation(
                source_lang, target_lang, source_text_chunks
            )
        # Switch to the second model for the reflection/improvement steps.
        try:
            model_load(endpoint2, model2, api_key2, context_window, num_output)
        except Exception as e:
            raise gr.Error(f"An unexpected error occurred: {e}")

        init_translation = "".join(translation_1_chunks)

        progress((4, 5), desc="Reflection...")
        reflection_chunks = multichunk_reflect_on_translation(
            source_lang,
            target_lang,
            source_text_chunks,
            translation_1_chunks,
            country,
        )

        reflection = "".join(reflection_chunks)

        progress((5, 5), desc="Second translation...")
        translation_2_chunks = multichunk_improve_translation(
            source_lang,
            target_lang,
            source_text_chunks,
            translation_1_chunks,
            reflection_chunks,
        )

        final_translation = "".join(translation_2_chunks)

        return init_translation, reflection, final_translation