Spaces:

petchsko
/

Translator_app

Sleeping

Translator_app / translator_app.py

Petch DS

fix bug doesn't find original translated

76fa87f about 1 year ago

18.1 kB

	import pandas as pd
	import os
	from langchain_openai import ChatOpenAI
	from langchain_core.output_parsers import JsonOutputParser
	from langchain_core.prompts import PromptTemplate
	from langchain_core.runnables import RunnableLambda
	import gradio as gr
	import pandas as pd
	from docx import Document

	def using_model(chosen_model, api_key):
	    """Return the translation chain for the model selected in the UI.

	    Args:
	        chosen_model: Label of the model picked in the dropdown. Only
	            'ChatGPT (4o-mini)' is implemented.
	        api_key: API key forwarded to ``chat_gpt_4o_mini``.

	    Returns:
	        The chain built by ``chat_gpt_4o_mini``.

	    Raises:
	        ValueError: If ``chosen_model`` is not a supported model label.
	    """
	    if chosen_model == 'ChatGPT (4o-mini)':
	        return chat_gpt_4o_mini(api_key=api_key)
	    # The old code fell through to ``return model`` with ``model`` never
	    # assigned, which surfaced as an opaque UnboundLocalError.
	    raise ValueError(f"Unsupported model: {chosen_model!r}")

	def chat_gpt_4o_mini(api_key = None):
	    """Build the gpt-4o-mini translation chain.

	    The chain is ``prompt | model | JSON parser | extract 'translated'``.
	    It expects a dict with the keys ``source_lang``, ``target_lang`` and
	    ``sentence`` and yields the value stored under the ``'translated'`` key
	    of the model's JSON answer.

	    Args:
	        api_key: Key passed to ``ChatOpenAI``.

	    Returns:
	        The composed runnable chain.
	    """
	    model = ChatOpenAI(model_name="gpt-4o-mini", api_key=api_key)

	    # ``{format_instructions}`` was supplied as a partial variable but never
	    # referenced by the template, so the parser's instructions were dropped.
	    str_prompt ="""
	You will be provided with a sentence in {source_lang}, and your task is to translate it into {target_lang}.
	Answer in Json format with key 'translated'
	{format_instructions}
	Sentence: {sentence}
	"""

	    output_parser = JsonOutputParser()
	    prompt = PromptTemplate(
	        template=str_prompt,
	        input_variables=["source_lang", "target_lang", "sentence"],
	        partial_variables={"format_instructions": output_parser.get_format_instructions()},
	    )

	    def get_class(x: dict) -> str:
	        """Pull the translated sentence out of the parsed JSON answer."""
	        return x["translated"]

	    chain = prompt | model | output_parser | RunnableLambda(get_class)

	    return chain


	def _placement_key(where_to_place):
	    """Reduce a placement label such as 'append_all (...)' to its bare key."""
	    if not where_to_place:
	        return 'append_all'
	    return str(where_to_place).split(' ')[0]


	def _arrange_columns(output_col, trans_col_name, placement):
	    """Order source and translated columns for the single-sheet layouts."""
	    if placement == 'append_all':
	        return list(output_col) + list(trans_col_name)

	    translated = set(trans_col_name)
	    final_cols = []
	    for col in output_col:
	        trans_col = f"{col}_translated"
	        has_translation = trans_col in translated
	        if placement == 'append_compare':
	            # Each source column appears once, followed by its translation.
	            final_cols.append(col)
	            if has_translation:
	                final_cols.append(trans_col)
	        else:  # 'replace'
	            final_cols.append(trans_col if has_translation else col)
	    return final_cols


	def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lang, where_to_place, keep_original, chosen_model, api_key = None, progress=gr.Progress(), return_output = 'file'):
	    """Translate columns of a sheet and write ``<input>_translated.xlsx``.

	    Args:
	        file: A DataFrame carrying a ``name`` column (used only to derive
	            the output file name), a path string, or an object with a
	            ``.name`` path attribute.
	        sheet_name: Sheet to read; the first sheet is used when it is
	            ``None`` or empty. Ignored when ``file`` is a DataFrame.
	        col_name: Columns to translate. ``None`` or an empty selection
	            means every column with ``object`` dtype.
	        source_lang: Source language passed to the prompt.
	        target_lang: Target language passed to the prompt.
	        where_to_place: One of 'replace', 'append_all', 'append_compare'
	            or 'new_sheet'; the UI labels with a trailing description are
	            accepted as well. Defaults to 'append_all'.
	        keep_original: 'keep original' keeps every input column in the
	            output; any other value keeps only the selected columns.
	        chosen_model: Model label forwarded to ``using_model``.
	        api_key: API key forwarded to ``using_model``.
	        progress: Progress callback.
	        return_output: 'df' returns the resulting DataFrame; anything else
	            returns the output file path.

	    Returns:
	        The output file path, or the result DataFrame when
	        ``return_output == 'df'`` (for 'new_sheet' that is the first sheet).

	    Raises:
	        gr.Error: If ``where_to_place`` is unknown or saving fails.
	    """
	    placement = _placement_key(where_to_place)
	    if placement not in ('replace', 'append_all', 'append_compare', 'new_sheet'):
	        raise gr.Error(f"Unknown placement option: {where_to_place}")

	    model = using_model(chosen_model = chosen_model, api_key = api_key)

	    # read_excel(sheet_name=None) returns a dict of every sheet, not a frame.
	    no_sheet = sheet_name is None or sheet_name == ""
	    sheet_to_read = 0 if no_sheet else sheet_name

	    if isinstance(file, pd.DataFrame):
	        df = file.copy()
	        source_name = str(file['name'].unique()[0])
	        df = df.drop(columns=['name'])
	    elif isinstance(file, str):
	        source_name = file
	        df = pd.read_excel(file, sheet_name=sheet_to_read, header=0)
	    else:
	        source_name = file.name
	        df = pd.read_excel(file.name, sheet_name=sheet_to_read, header=0)
	    # splitext only strips the extension; split('.')[0] truncated any path
	    # containing another dot (e.g. './book.xlsx' became '').
	    output_file = f"{os.path.splitext(source_name)[0]}_translated.xlsx"

	    original_col = list(df.columns)

	    progress(0, desc="Starting translation process...")

	    # A multiselect with nothing picked arrives as [], not None.
	    if isinstance(col_name, str):
	        col_name = [col_name] if col_name else []
	    if col_name is None or len(col_name) == 0:
	        col_name = [col for col in df.columns if df[col].dtype == 'object']
	    else:
	        col_name = list(col_name)

	    trans_col_name = []
	    total_steps = len(col_name)

	    for step, col in enumerate(col_name, start=1):
	        progress(step / total_steps, desc=f"Translating {col} ({step}/{total_steps})...")

	        try:
	            # Translate each distinct value once, then map it back.
	            unique_values = df[col].dropna().unique().tolist()
	            requests = [
	                {"sentence": value, "source_lang": source_lang, "target_lang": target_lang}
	                for value in unique_values
	            ]
	            answers = model.batch(requests, config={"max_concurrency": 3})
	            translations = dict(zip(unique_values, answers))

	            trans_col = f"{col}_translated"
	            df[trans_col] = df[col].map(translations).fillna(df[col])
	            # Register the column only once it exists in the frame.
	            trans_col_name.append(trans_col)
	        except Exception as e:
	            # Best effort: a failing column is reported and skipped.
	            print(f"Error in column {col}: {e}")
	            continue

	    if keep_original == 'keep original':
	        output_col = original_col
	    else:
	        output_col = col_name

	    try:
	        if placement == 'new_sheet':
	            result = df[list(output_col)]
	            translated_part = df[trans_col_name]
	            base_sheet = "Sheet1" if no_sheet else str(sheet_name)
	            with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
	                result.to_excel(writer, sheet_name=base_sheet, index=False)
	                # Excel limits sheet names to 31 characters.
	                translated_part.to_excel(writer, sheet_name=f"{base_sheet[:20]}_translated", index=False)
	        else:
	            result = df[_arrange_columns(output_col, trans_col_name, placement)]
	            result.to_excel(output_file, index=False)

	        progress(1.0, desc="Saving translated file... Completed!")
	    except Exception as e:
	        print(f"Error saving the file: {e}")
	        raise gr.Error(f"Error saving the file: {e}") from e

	    progress(1.0, desc="Completed all tasks!")

	    if return_output == 'df':
	        return result
	    return output_file


	def extract_word_content_to_excel(file_path):
	    """Extract paragraphs and tables from a Word file.

	    Walks the document body in order. Each paragraph becomes a row holding
	    its text; each table becomes a ``"[Table]"`` placeholder row and its
	    cells are stored as a DataFrame. When the document has tables, they are
	    also written to ``extracted_tables.xlsx`` (one ``Table_<n>`` sheet each)
	    in the current working directory.

	    Args:
	        file_path: Path of the .docx file, or an object whose ``.name``
	            attribute is that path.

	    Returns:
	        A tuple ``(df, table_dict)``: ``df`` has the columns ``paragraph``
	        (1-based position in the body), ``original`` (text or placeholder)
	        and ``name`` (base name of the input file); ``table_dict`` maps a
	        table's position to its DataFrame with columns ``object_0``,
	        ``object_1``, ...
	    """
	    from docx.table import Table
	    from docx.text.paragraph import Paragraph

	    # An upload may arrive as a file wrapper rather than a plain path.
	    file_path = getattr(file_path, "name", file_path)
	    doc = Document(file_path)

	    data = []
	    table_dict = {}
	    paragraph_count = 0

	    for element in doc.element.body:
	        tag = element.tag
	        if tag.endswith("}p"):  # Paragraph
	            paragraph_count += 1
	            # Use the public proxy rather than the raw element's ``.text``.
	            paragraph_text = Paragraph(element, doc).text.strip()
	            data.append([paragraph_count, paragraph_text])

	        elif tag.endswith("}tbl"):  # Table
	            paragraph_count += 1
	            data.append([paragraph_count, "[Table]"])

	            table = Table(element, doc)
	            table_data = [
	                [cell.text.strip() for cell in row.cells]
	                for row in table.rows
	            ]

	            # Generate dynamic column names ('object_0', 'object_1', ...)
	            max_cols = max((len(row) for row in table_data), default=0)
	            column_names = [f"object_{i}" for i in range(max_cols)]

	            table_dict[paragraph_count] = pd.DataFrame(table_data, columns=column_names)

	        elif tag.endswith("}drawing"):  # Image
	            # NOTE(review): drawings presumably sit inside runs, not directly
	            # under the body, so this branch may never match - confirm.
	            paragraph_count += 1
	            data.append([paragraph_count, "[Image]"])

	    df = pd.DataFrame(data, columns=["paragraph", "original"])
	    df['name'] = os.path.basename(file_path)

	    # Skip the workbook when there is nothing to write.
	    if table_dict:
	        with pd.ExcelWriter("extracted_tables.xlsx") as writer:
	            for key, table_df in table_dict.items():
	                table_df.to_excel(writer, sheet_name=f"Table_{key}", index=False)

	    return df, table_dict

	def _docx_text(value):
	    """Convert a DataFrame cell to the string python-docx expects ('' if missing)."""
	    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
	        return ""
	    return str(value)


	def reconstruct_word(paragraph_df, translated_tables, file_path):
	    """Build a new Word document from the translated paragraphs and tables.

	    Args:
	        paragraph_df: Rows with ``paragraph`` and ``original`` columns and,
	            when translation succeeded, ``original_translated``.
	        translated_tables: Mapping from a table's ``paragraph`` number to
	            its translated DataFrame; only columns whose name contains
	            ``_translated`` are written.
	        file_path: Path of the source document (or an object with a
	            ``.name`` path); the output is ``<stem>_translated.docx``.

	    Returns:
	        The path of the saved document.
	    """
	    file_path = getattr(file_path, "name", file_path)
	    doc = Document()
	    output_path = f"{os.path.splitext(file_path)[0]}_translated.docx"

	    for _, row in paragraph_df.iterrows():
	        if row["original"] == "[Table]":  # Insert Table
	            table_number = row["paragraph"]
	            table_df = translated_tables.get(table_number)
	            if table_df is None:
	                continue

	            translated_cols = [col for col in table_df.columns if '_translated' in col]
	            if not translated_cols:
	                print(f"⚠ Warning: No '_translated' columns found for table at paragraph {table_number}")
	                continue

	            table_df = table_df[translated_cols]  # Keep only translated columns
	            table = doc.add_table(rows=len(table_df), cols=len(table_df.columns))

	            for i, row_data in enumerate(table_df.values):
	                for j, cell_text in enumerate(row_data):
	                    # Blank cells come back from the Excel round trip as NaN,
	                    # so every value is normalised to text first.
	                    table.cell(i, j).text = _docx_text(cell_text)
	        elif "original_translated" in row:
	            doc.add_paragraph(_docx_text(row["original_translated"]))
	        else:
	            doc.add_paragraph("")

	    doc.save(output_path)
	    return output_path

	def chat_gpt_translate_word(file, sheet_name, col_name, source_lang, target_lang, where_to_place, keep_original, chosen_model, api_key = None, progress=gr.Progress()):
	    """Translate a Word document and return the path of the translated copy.

	    Paragraphs and tables are extracted, translated through
	    ``chat_gpt_translate_excel`` and reassembled with ``reconstruct_word``.
	    The intermediate workbooks are removed afterwards, even on failure.

	    Args:
	        file: Path of the .docx file, or an object with a ``.name`` path.
	        sheet_name: Unused; kept for signature parity with the Excel path.
	        col_name: Unused; the ``original`` column is always translated.
	        source_lang: Source language passed to the prompt.
	        target_lang: Target language passed to the prompt.
	        where_to_place: Unused; translations are always appended.
	        keep_original: Unused; original columns are always kept.
	        chosen_model: Model label forwarded to the translator.
	        api_key: API key forwarded to the translator.
	        progress: Progress callback forwarded to the translator.

	    Returns:
	        The path of the translated .docx file.
	    """
	    file_path = getattr(file, "name", file)
	    base_name = os.path.basename(file_path)

	    # The paragraph workbook is named after the document, next to it or in
	    # the working directory depending on how the stem was derived, so every
	    # candidate location is cleaned up.
	    stems = {
	        file_path.split('.')[0],
	        os.path.splitext(file_path)[0],
	        base_name.split('.')[0],
	        os.path.splitext(base_name)[0],
	    }
	    temp_files = ['extracted_tables.xlsx', 'extracted_tables_translated.xlsx']
	    temp_files += sorted(f"{stem}_translated.xlsx" for stem in stems)

	    try:
	        paragraph_df, word_table = extract_word_content_to_excel(file_path)
	        base_translated = chat_gpt_translate_excel(
	            paragraph_df,
	            sheet_name="Sheet1",
	            col_name=['original'],
	            source_lang=source_lang,
	            target_lang=target_lang,
	            where_to_place="append_all (ต่อ column สุดท้าย)",
	            keep_original="keep original",
	            chosen_model=chosen_model,
	            api_key=api_key,
	            progress=progress,
	            return_output='df',
	        )

	        # Translate Tables
	        translated_tables = {}
	        for key, table_df in word_table.items():
	            translated_tables[key] = chat_gpt_translate_excel(
	                file="extracted_tables.xlsx",
	                sheet_name=f"Table_{key}",
	                col_name=table_df.columns.tolist(),
	                source_lang=source_lang,
	                target_lang=target_lang,
	                where_to_place="append_all (ต่อ column สุดท้าย)",
	                keep_original="keep original",
	                chosen_model=chosen_model,
	                api_key=api_key,
	                progress=progress,
	                return_output='df',
	            )

	        return reconstruct_word(base_translated, translated_tables, file_path)
	    finally:
	        for path in temp_files:
	            if os.path.exists(path):
	                os.remove(path)

	if __name__ == "__main__" :
	    # Gradio front end: upload a file, choose sheet, columns and options,
	    # then download the translated file.

	    with gr.Blocks() as iface:
	        gr.Markdown("## Excel Translation Interface")

	        excel_file = gr.File(label="Upload Excel File")
	        sheet_name = gr.Dropdown(label="Select Sheet", interactive=True)
	        column_name= gr.Dropdown(label = "Select Column to Translate (Not require)", multiselect=True, interactive=True)

	        with gr.Row():
	            source_language = gr.Textbox(label="Source Language Code")
	            target_language = gr.Textbox(label="Target Language Code")
	        with gr.Row():
	            where_to_place = gr.Dropdown(
	                multiselect=False,
	                label="How translated columns should be placed",
	                choices=[
	                    'replace',
	                    'append_all (ต่อ column สุดท้าย)',
	                    'append_compare (เปรียบเทียบ column by column)',
	                    'new_sheet',
	                ],
	                interactive=True,
	            )
	            keep_original = gr.Dropdown(
	                multiselect=False,
	                label="You want to keep original column or just only the translated column",
	                choices=['keep original', 'translated_column'],
	                interactive=True,
	            )

	        WORD_EXTENSIONS = (".docx", ".doc")
	        EXCEL_EXTENSIONS = (".xlsx", ".xls")

	        def file_extension(file):
	            """Return the lower-cased extension of an uploaded file."""
	            return os.path.splitext(file.name)[-1].lower()

	        def get_sheet_names(file):
	            """List the sheet names of the uploaded workbook."""
	            # The context manager closes the workbook handle after reading.
	            with pd.ExcelFile(file.name) as xls:
	                return xls.sheet_names

	        def update_sheets(file):
	            """Fill the sheet dropdown with the workbook's sheets."""
	            sheets = get_sheet_names(file)
	            return gr.update(choices=sheets)

	        def check_file_type(file):
	            """Check whether the uploaded file is Word or Excel and update the sheet dropdown."""
	            extension = file_extension(file)

	            if extension in WORD_EXTENSIONS:
	                return gr.update(
	                    choices=['all paragraphs only', 'specified paragraph or page (Developing ...)'],
	                    interactive=False,
	                )
	            if extension in EXCEL_EXTENSIONS:
	                return update_sheets(file)
	            # The output is a Dropdown, so clear it instead of returning a
	            # bare string that is not one of its choices.
	            return gr.update(choices=[], value=None)

	        def check_uploaded_file(file):
	            """Take the uploaded file and check its type; clear the dropdown when there is none."""
	            if file is None:
	                return gr.update(choices=[], value=None)
	            return check_file_type(file)

	        def get_column_names(file, sheet_name):
	            """Return the column names of the chosen sheet."""
	            dd = pd.read_excel(file.name, sheet_name=sheet_name)
	            return list(dd.columns)

	        def update_columns(file, sheet_name):
	            """Fill the column dropdown for the chosen sheet."""
	            # This also fires when the file is cleared or no sheet is set.
	            if file is None or sheet_name is None:
	                return gr.update(choices=[], value=[])
	            extension = file_extension(file)
	            if extension in WORD_EXTENSIONS:
	                return gr.update(choices=['original'], interactive=False)
	            if extension in EXCEL_EXTENSIONS:
	                columns = get_column_names(file, sheet_name)
	                return gr.update(choices=columns)
	            return gr.update(choices=[], value=[])

	        excel_file.change(fn=check_uploaded_file, inputs=excel_file, outputs=sheet_name)
	        sheet_name.change(fn=update_columns, inputs=[excel_file, sheet_name], outputs=column_name)

	        model_choosing = gr.Dropdown(
	            multiselect=False,
	            label="Choosing Model you want",
	            choices=['ChatGPT (4o-mini)', 'Deepseek (developing ...)', 'another (In Progress)'],
	            interactive=True,
	        )

	        # Masked so the key is not shown in clear text.
	        needed_require = gr.Textbox(label="API Key(require if Chatgpt)", type="password")
	        translate_button = gr.Button("Translate")
	        output_file = gr.File(label="Download Translated Excel File", interactive=True)

	        # Unified translation function
	        def translate_excel(
	            file, sheet_name, columns, source_lang, target_lang, place_option, keep_opt, model, api_key
	        ):
	            """Dispatch the upload to the Excel or Word translator."""
	            if file is None:
	                raise gr.Error("Please upload a file first.")

	            extension = file_extension(file)
	            if extension not in EXCEL_EXTENSIONS and extension not in WORD_EXTENSIONS:
	                # Previously this was only printed and the click returned
	                # nothing.
	                raise gr.Error('No Type of Input Supported')

	            if model != "ChatGPT (4o-mini)":
	                # Handle other models (currently in progress)
	                raise gr.Error("Translation with the selected model is not yet implemented.")

	            if extension in EXCEL_EXTENSIONS:
	                return chat_gpt_translate_excel(
	                    file, sheet_name, columns, source_lang, target_lang, place_option, keep_opt, model, api_key
	                )
	            # The Word pipeline works on a path string.
	            return chat_gpt_translate_word(
	                file.name, sheet_name, columns, source_lang, target_lang, place_option, keep_opt, model, api_key
	            )

	        # Register button click
	        translate_button.click(
	            fn=translate_excel,
	            inputs=[
	                excel_file,
	                sheet_name,
	                column_name,
	                source_language,
	                target_language,
	                where_to_place,
	                keep_original,
	                model_choosing,
	                needed_require,
	            ],
	            outputs=output_file,
	        )

	    iface.launch(debug=True, share=True,
	        server_port= 7860,
	        server_name="0.0.0.0"
	    )