Translator_app / translator_app.py
Petch DS
fix bug doesn't find original translated
76fa87f
import pandas as pd
import os
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda
import gradio as gr
import pandas as pd
from docx import Document
def using_model(chosen_model, api_key):
if chosen_model == 'ChatGPT (4o-mini)':
model = chat_gpt_4o_mini(api_key = api_key)
else:
pass
return model
def chat_gpt_4o_mini(api_key = None):
model = ChatOpenAI(model_name="gpt-4o-mini", api_key=api_key)
str_prompt ="""
You will be provided with a sentence in {source_lang}, and your task is to translate it into {target_lang}.
Answer in Json format with key 'translated'
Sentence: {sentence}
"""
output_parser = JsonOutputParser()
prompt = PromptTemplate(
template = str_prompt,
input_variables=["source_lang","target_lang","sentence"],
partial_variables={"format_instructions": output_parser.get_format_instructions()}
)
def get_class(x:dict)->str:
return x["translated"]
chain = prompt | model | output_parser | RunnableLambda(get_class)
return chain
def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lang, where_to_place, keep_original, chosen_model, api_key = None, progress=gr.Progress(), return_output = 'file'):
if where_to_place is None:
where_to_place = 'append_all'
model = using_model(chosen_model = chosen_model, api_key = api_key)
if isinstance(file, pd.DataFrame):
df = file.copy()
output_file = f"{file.name.unique()[0].split('.')[0]}_translated.xlsx"
df = df.drop(columns=['name'])
elif isinstance(file, str):
df = pd.read_excel(file, sheet_name=sheet_name, header=0)
output_file = f"{file.split('.')[0]}_translated.xlsx"
else:
df = pd.read_excel(file.name, sheet_name=sheet_name, header=0)
output_file = f"{file.name.split('.')[0]}_translated.xlsx"
original_col = df.columns
total_columns = len(df.columns)
current_step = 0
progress(0, desc="Starting translation process...")
# Automatically detect string columns if col_name is None
if col_name is None:
col_name = [col for col in df.columns if df[col].dtype == 'object']
# Determine columns that are not selected for translation
remain_col = [col for col in df.columns if col not in col_name]
# Dictionary to store unique values and their translations
translation_map = {}
trans_col_name = []
# Process the selected columns for translation
for idx, col in enumerate(col_name):
current_step += 1
progress(current_step / total_columns, desc=f"Translating {col} ({current_step}/{len(col_name)})...")
try:
# Extract unique values from the column
unique_values = df[col].dropna().unique()
unique_values = list(set(unique_values)) # Ensure uniqueness
# Prepare data for translation
zh_sentence = [{"sentence": value, "source_lang": source_lang, "target_lang": target_lang} for value in unique_values]
# Translate unique values
answers = model.batch(zh_sentence, config={"max_concurrency": 3})
# Create a mapping from original values to translated values
translations = dict(zip(unique_values, answers))
translation_map[col] = translations
trans_col_name.append(col + "_translated")
# Map translations back to the original DataFrame
df[col + "_translated"] = df[col].map(translations).fillna(df[col])
except Exception as e:
print(f"Error in column {col}: {e}")
continue
# Process remaining columns
# for column in remain_col:
# current_step += 1
# progress(current_step / total_columns, desc=f"Translating column name: {column} ({current_step}/{total_columns})...")
# try:
# # We do not translate all_col which remaining col
# # all_col_translation = chain.batch([{"sentence": column, "source_lang": source_lang, "target_lang": target_lang}])
# name_col = column + '_translated' # Assuming the translation returns a list of translations
# df.loc[:, name_col] = df.loc[:, column]
# except Exception as e:
# print(f"Error in column {column}: {e}")
# continue
if not os.path.exists(output_file):
pd.DataFrame().to_excel(output_file, index=False)
if keep_original == 'keep original':
output_col = original_col
else:
output_col = col_name
try:
if where_to_place == 'append_all (ต่อ column สุดท้าย)':
final_cols = list(output_col) + [col for col in trans_col_name]
result = df[final_cols]
result.to_excel(output_file, index=False)
elif where_to_place == 'append_compare (เปรียบเทียบ column by column)':
final_cols = []
for col in output_col:
for trans_col in trans_col_name:
if col + '_translated' == trans_col:
final_cols = final_cols + [col, trans_col]
else:
final_cols = final_cols + [col]
result = df[final_cols]
result.to_excel(output_file, index=False)
elif where_to_place == 'replace':
final_cols = []
for col in output_col:
for trans_col in trans_col_name:
if col + '_translated' == trans_col:
final_cols = final_cols + [trans_col]
else:
final_cols = final_cols + [col]
result = df[final_cols]
result.to_excel(output_file, index=False)
elif where_to_place == 'new_sheet':
final_cols = [col for col in output_col]
new_tab_cols = trans_col_name
result = df[final_cols]
result1 = df[new_tab_cols]
# Use ExcelWriter to write multiple sheets
with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
result.to_excel(writer, sheet_name=sheet_name, index=False) # First sheet
result1.to_excel(writer, sheet_name=f'{sheet_name}_translated', index=False) # Second sheet
progress(1.0, desc="Saving translated file... Completed!")
except Exception as e:
print(f"Error saving the file: {e}")
raise gr.Error(f"Error saving the file: {e}")
progress(1.0, desc="Completed all tasks!")
if return_output == 'file':
return output_file
elif return_output == 'df':
return result
else:
return output_file
def extract_word_content_to_excel(file_path):
""" ดึงเนื้อหา + รูปภาพจากไฟล์ Word และบันทึกเป็น Excel """
doc = Document(file_path)
data = []
table_dict = {}
paragraph_count = 0
for element in doc.element.body:
if element.tag.endswith("p"): # Paragraph
paragraph_text = element.text.strip()
paragraph_count += 1
data.append([paragraph_count, paragraph_text]) # บันทึกพารากราฟ
elif element.tag.endswith("tbl"): # Table (ถ้ามี)
paragraph_count += 1
data.append([paragraph_count, "[Table]"])
# Extract table content
table = doc.tables[len(table_dict)] # Get current table
table_data = []
for row in table.rows:
row_data = [cell.text.strip() for cell in row.cells]
table_data.append(row_data)
# Generate dynamic column names ('object_0', 'object_1', ...)
max_cols = max(len(row) for row in table_data) if table_data else 0
column_names = [f"object_{i}" for i in range(max_cols)]
# Store table as DataFrame
table_dict[paragraph_count] = pd.DataFrame(table_data, columns=column_names)
elif element.tag.endswith("drawing"): # Image (รูปภาพ)
paragraph_count += 1
data.append([paragraph_count, "[Image]"])
# สร้าง DataFrame
df = pd.DataFrame(data, columns=["paragraph", "original"])
df['name'] = file_path.split('/')[-1]
with pd.ExcelWriter("extracted_tables.xlsx") as writer:
for key, table_df in table_dict.items():
table_df.to_excel(writer, sheet_name=f"Table_{key}", index=False)
return df, table_dict
def reconstruct_word(paragraph_df, translated_tables, file_path):
"""Reconstruct Word Document from translated content"""
doc = Document()
output_path=f"{file_path.split('.')[0]}_translated.docx"
for _, row in paragraph_df.iterrows():
if row["original"] == "[Table]": # Insert Table
table_number = row["paragraph"]
table_df = translated_tables.get(table_number)
if table_df is not None:
# Filter only columns that contain '_translated'
translated_cols = [col for col in table_df.columns if '_translated' in col]
if translated_cols:
table_df = table_df[translated_cols] # Keep only translated columns
# Create a table with the filtered columns
table = doc.add_table(rows=len(table_df), cols=len(table_df.columns))
for i, row_data in enumerate(table_df.values):
for j, cell_text in enumerate(row_data):
table.cell(i, j).text = cell_text
else:
print(f"⚠ Warning: No '_translated' columns found for table at paragraph {table_number}")
else:
if "original_translated" in row:
doc.add_paragraph(row["original_translated"])
else:
doc.add_paragraph("")
doc.save(output_path)
return output_path
def chat_gpt_translate_word(file, sheet_name, col_name, source_lang, target_lang, where_to_place, keep_original, chosen_model, api_key = None, progress=gr.Progress()):
word_to_excel_file, word_table = extract_word_content_to_excel(file)
base_translated = chat_gpt_translate_excel(word_to_excel_file,
sheet_name="Sheet1",
col_name = ['original'],
source_lang = source_lang,
target_lang = target_lang,
where_to_place="append_all (ต่อ column สุดท้าย)",
keep_original="keep original",
chosen_model = chosen_model,
api_key = api_key,
return_output='df'
)
# Translate Tables
translated_tables = {}
for key, table_df in word_table.items():
translated_tables[key] = chat_gpt_translate_excel(
file="extracted_tables.xlsx",
sheet_name=f"Table_{key}",
col_name=table_df.columns.tolist(),
source_lang=source_lang,
target_lang=target_lang,
where_to_place="append_all (ต่อ column สุดท้าย)",
keep_original="keep original",
chosen_model=chosen_model,
api_key=api_key,
return_output='df'
)
output_file = reconstruct_word(base_translated, translated_tables, file)
if os.path.exists('extracted_tables.xlsx'):
os.remove('extracted_tables.xlsx')
if os.path.exists('extracted_tables_translated.xlsx'):
os.remove('extracted_tables_translated.xlsx')
# for deploy huggingface
if os.path.exists(f"{file.split('.')[0]}_translated.xlsx"):
os.remove(f"{file.split('.')[0]}_translated.xlsx")
return output_file
if __name__ == "__main__" :
with gr.Blocks() as iface:
gr.Markdown("## Excel Translation Interface")
excel_file = gr.File(label="Upload Excel File")
sheet_name = gr.Dropdown(label="Select Sheet", interactive=True)
column_name= gr.Dropdown(label = "Select Column to Translate (Not require)", multiselect=True, interactive=True)
with gr.Row():
source_language = gr.Textbox(label="Source Language Code")
target_language = gr.Textbox(label="Target Language Code")
with gr.Row():
where_to_place = gr.Dropdown(multiselect=False ,label="How translated columns should be placed"
, choices = ['replace',
'append_all (ต่อ column สุดท้าย)',
'append_compare (เปรียบเทียบ column by column)',
'new_sheet']
, interactive=True
)
keep_original = gr.Dropdown(multiselect=False ,label="You want to keep original column or just only the translated column"
, choices = ['keep original',
'translated_column']
, interactive=True
)
def check_file_type(file):
""" ตรวจสอบว่าไฟล์ที่อัปโหลดเป็น Word หรือ Excel """
file_extension = os.path.splitext(file.name)[-1].lower()
if file_extension in [".docx", ".doc"]:
return gr.update(choices=['all paragraphs only', 'specified paragraph or page (Developing ...)']
, interactive=False
)
elif file_extension in [".xlsx", ".xls"]:
return update_sheets(file)
else:
return "Unknown"
def check_uploaded_file(file):
""" ฟังก์ชันรับไฟล์ที่อัปโหลด แล้วตรวจสอบประเภท """
if file is None:
return "No file uploaded"
return check_file_type(file)
def get_sheet_names(file):
xls = pd.ExcelFile(file.name)
return xls.sheet_names
def update_sheets(file):
sheets = get_sheet_names(file)
return gr.update(choices=sheets)
def update_columns(file, sheet_name):
if os.path.splitext(file.name)[-1].lower() in [".docx", ".doc"]:
return gr.update(choices=['original'], interactive=False)
elif os.path.splitext(file.name)[-1].lower() in [".xlsx", ".xls"]:
columns = get_column_names(file, sheet_name)
return gr.update(choices=columns)
else:
return "error"
def get_column_names(file, sheet_name):
dd = pd.read_excel(file.name, sheet_name=sheet_name)
return list(dd.columns)
excel_file.change(fn=check_uploaded_file, inputs=excel_file, outputs=sheet_name)
sheet_name.change(fn=update_columns, inputs=[excel_file, sheet_name], outputs=column_name)
model_choosing = gr.Dropdown(multiselect = False ,
label = "Choosing Model you want",
choices = ['ChatGPT (4o-mini)', 'Deepseek (developing ...)', 'another (In Progress)']
, interactive=True
)
needed_require = gr.Textbox(label="API Key(require if Chatgpt)")
translate_button = gr.Button("Translate")
output_file = gr.File(label="Download Translated Excel File", interactive=True)
# Unified translation function
def translate_excel(
file, sheet_name, columns, source_lang, target_lang, place_option, keep_opt, model, api_key
):
if os.path.splitext(file.name)[-1].lower() in [".xlsx", ".xls"]:
if model == "ChatGPT (4o-mini)":
# Call ChatGPT-based translation
return chat_gpt_translate_excel(
file, sheet_name, columns, source_lang, target_lang, place_option, keep_opt, model, api_key
)
else:
# Handle other models (currently in progress)
raise gr.Error("Translation with the selected model is not yet implemented.")
elif os.path.splitext(file.name)[-1].lower() in [".docx", ".doc"]:
if model == "ChatGPT (4o-mini)":
# Call ChatGPT-based translation
return chat_gpt_translate_word(file, sheet_name, columns, source_lang, target_lang, place_option, keep_opt, model, api_key)
else:
# Handle other models (currently in progress)
raise gr.Error("Translation with the selected model is not yet implemented.")
else:
print('No Type of Input Supported')
# Register button click
translate_button.click(
fn=translate_excel,
inputs=[
excel_file,
sheet_name,
column_name,
source_language,
target_language,
where_to_place,
keep_original,
model_choosing,
needed_require,
],
outputs=output_file,
)
iface.launch(debug=True, share=True,
server_port= 7860,
server_name="0.0.0.0"
)