import os

import pandas as pd
import gradio as gr
from docx import Document
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda

# Canonical placement labels — must match the Gradio dropdown choices exactly,
# otherwise the save branches in chat_gpt_translate_excel never run.
APPEND_ALL = 'append_all (ต่อ column สุดท้าย)'
APPEND_COMPARE = 'append_compare (เปรียบเทียบ column by column)'


def using_model(chosen_model, api_key):
    """Return a translation chain for the selected model label.

    Raises ValueError for unimplemented models instead of silently
    falling through (the original `pass` branch left `model` unbound and
    produced an UnboundLocalError at `return model`).
    """
    if chosen_model == 'ChatGPT (4o-mini)':
        return chat_gpt_4o_mini(api_key=api_key)
    raise ValueError(f"Model '{chosen_model}' is not supported yet.")


def chat_gpt_4o_mini(api_key=None):
    """Build a LangChain pipeline: prompt -> gpt-4o-mini -> JSON -> string.

    The returned chain expects dicts with keys 'sentence', 'source_lang'
    and 'target_lang' and yields the translated sentence as a plain str.
    """
    model = ChatOpenAI(model_name="gpt-4o-mini", api_key=api_key)
    str_prompt = """
    You will be provided with a sentence in {source_lang}, and your task is to translate it into {target_lang}.
    Answer in Json format with key 'translated'
    Sentence: {sentence}
    """
    output_parser = JsonOutputParser()
    prompt = PromptTemplate(
        template=str_prompt,
        input_variables=["source_lang", "target_lang", "sentence"],
        partial_variables={"format_instructions": output_parser.get_format_instructions()},
    )

    def get_class(x: dict) -> str:
        # Unwrap {'translated': ...} into the bare translated string.
        return x["translated"]

    return prompt | model | output_parser | RunnableLambda(get_class)


def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lang,
                             where_to_place, keep_original, chosen_model,
                             api_key=None, progress=gr.Progress(), return_output='file'):
    """Translate selected columns of an Excel sheet (or DataFrame).

    Parameters
    ----------
    file : pd.DataFrame | str | file-like with `.name`
        Source data; a DataFrame must carry a 'name' column holding the
        original filename (used to derive the output filename).
    col_name : list[str] | None
        Columns to translate; empty/None means "all object-dtype columns".
    where_to_place : str | None
        One of the UI placement labels; None defaults to APPEND_ALL.
    return_output : str
        'file' returns the written xlsx path, 'df' returns the result frame.

    Returns the output file path or the resulting DataFrame.
    Raises gr.Error if the file cannot be saved.
    """
    if where_to_place is None:
        # Default must be the full dropdown label; the bare 'append_all'
        # default matched no branch and left `result` unbound.
        where_to_place = APPEND_ALL

    model = using_model(chosen_model=chosen_model, api_key=api_key)

    # Accept a DataFrame (Word pipeline), a path string, or a Gradio file object.
    if isinstance(file, pd.DataFrame):
        df = file.copy()
        output_file = f"{df['name'].unique()[0].split('.')[0]}_translated.xlsx"
        df = df.drop(columns=['name'])
    elif isinstance(file, str):
        df = pd.read_excel(file, sheet_name=sheet_name, header=0)
        output_file = f"{file.split('.')[0]}_translated.xlsx"
    else:
        df = pd.read_excel(file.name, sheet_name=sheet_name, header=0)
        output_file = f"{file.name.split('.')[0]}_translated.xlsx"

    original_col = df.columns
    total_columns = len(df.columns)
    current_step = 0
    progress(0, desc="Starting translation process...")

    # Auto-detect string columns when none selected. `not col_name` also
    # catches [] from the Gradio multiselect, which `is None` missed.
    if not col_name:
        col_name = [col for col in df.columns if df[col].dtype == 'object']

    translation_map = {}   # per-column {original value: translated value}
    trans_col_name = []    # names of the generated *_translated columns

    for col in col_name:
        current_step += 1
        progress(current_step / total_columns,
                 desc=f"Translating {col} ({current_step}/{len(col_name)})...")
        try:
            # Translate each distinct value once, then map back, so
            # repeated cell values cost a single API call.
            unique_values = list(set(df[col].dropna().unique()))
            requests = [{"sentence": value,
                         "source_lang": source_lang,
                         "target_lang": target_lang}
                        for value in unique_values]
            answers = model.batch(requests, config={"max_concurrency": 3})
            translations = dict(zip(unique_values, answers))
            translation_map[col] = translations
            trans_col_name.append(col + "_translated")
            # Cells that were NaN (not translated) keep their original value.
            df[col + "_translated"] = df[col].map(translations).fillna(df[col])
        except Exception as e:
            # Best-effort per column: log and keep going.
            print(f"Error in column {col}: {e}")
            continue

    if not os.path.exists(output_file):
        pd.DataFrame().to_excel(output_file, index=False)

    output_col = original_col if keep_original == 'keep original' else col_name

    try:
        if where_to_place == APPEND_ALL:
            result = df[list(output_col) + trans_col_name]
            result.to_excel(output_file, index=False)
        elif where_to_place == APPEND_COMPARE:
            # Place each translated column right after its source column.
            # (The original nested loop appended the source column once per
            # unrelated translated column, duplicating columns.)
            final_cols = []
            for col in output_col:
                final_cols.append(col)
                if col + '_translated' in trans_col_name:
                    final_cols.append(col + '_translated')
            result = df[final_cols]
            result.to_excel(output_file, index=False)
        elif where_to_place == 'replace':
            final_cols = [col + '_translated' if col + '_translated' in trans_col_name else col
                          for col in output_col]
            result = df[final_cols]
            result.to_excel(output_file, index=False)
        elif where_to_place == 'new_sheet':
            result = df[list(output_col)]
            result1 = df[trans_col_name]
            # Use ExcelWriter to write originals and translations as two sheets.
            with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
                result.to_excel(writer, sheet_name=sheet_name, index=False)
                result1.to_excel(writer, sheet_name=f'{sheet_name}_translated', index=False)
        else:
            raise gr.Error(f"Unknown placement option: {where_to_place}")
        progress(1.0, desc="Saving translated file... Completed!")
    except Exception as e:
        print(f"Error saving the file: {e}")
        raise gr.Error(f"Error saving the file: {e}")

    progress(1.0, desc="Completed all tasks!")
    if return_output == 'df':
        return result
    return output_file


def extract_word_content_to_excel(file_path):
    """Extract paragraphs, tables and image placeholders from a Word file.

    Returns (df, table_dict): df has columns ['paragraph', 'original',
    'name']; table_dict maps the paragraph index of each table to a
    DataFrame of its cell text. All tables are also dumped to
    'extracted_tables.xlsx' (one sheet per table) so they can be fed
    through the Excel translation pipeline.
    """
    doc = Document(file_path)
    data = []
    table_dict = {}
    paragraph_count = 0

    for element in doc.element.body:
        if element.tag.endswith("p"):  # paragraph
            paragraph_count += 1
            # `.text` can be None on lxml elements; guard before strip().
            data.append([paragraph_count, (element.text or "").strip()])
        elif element.tag.endswith("tbl"):  # table
            paragraph_count += 1
            data.append([paragraph_count, "[Table]"])
            # Tables appear in document order, so the number of tables
            # collected so far indexes the current one.
            table = doc.tables[len(table_dict)]
            table_data = [[cell.text.strip() for cell in row.cells] for row in table.rows]
            # Dynamic column names: 'object_0', 'object_1', ...
            max_cols = max((len(row) for row in table_data), default=0)
            column_names = [f"object_{i}" for i in range(max_cols)]
            table_dict[paragraph_count] = pd.DataFrame(table_data, columns=column_names)
        elif element.tag.endswith("drawing"):  # image placeholder
            paragraph_count += 1
            data.append([paragraph_count, "[Image]"])

    df = pd.DataFrame(data, columns=["paragraph", "original"])
    # basename handles both '/' and OS-specific separators.
    df['name'] = os.path.basename(str(file_path))

    with pd.ExcelWriter("extracted_tables.xlsx") as writer:
        for key, table_df in table_dict.items():
            table_df.to_excel(writer, sheet_name=f"Table_{key}", index=False)

    return df, table_dict


def reconstruct_word(paragraph_df, translated_tables, file_path):
    """Reconstruct a Word document from translated paragraphs and tables.

    paragraph_df rows marked "[Table]" are replaced by the matching
    translated table; all other rows become paragraphs using the
    'original_translated' column when present. Returns the saved path.
    """
    doc = Document()
    output_path = f"{file_path.split('.')[0]}_translated.docx"

    for _, row in paragraph_df.iterrows():
        if row["original"] == "[Table]":
            table_number = row["paragraph"]
            table_df = translated_tables.get(table_number)
            if table_df is not None:
                # Keep only the '_translated' columns for the rebuilt table.
                translated_cols = [c for c in table_df.columns if '_translated' in c]
                if translated_cols:
                    table_df = table_df[translated_cols]
                    table = doc.add_table(rows=len(table_df), cols=len(table_df.columns))
                    for i, row_data in enumerate(table_df.values):
                        for j, cell_text in enumerate(row_data):
                            # str() guards against NaN / non-string cells,
                            # which python-docx rejects.
                            table.cell(i, j).text = str(cell_text)
                else:
                    print(f"⚠ Warning: No '_translated' columns found for table at paragraph {table_number}")
        else:
            # `in row` tests the Series index, i.e. column presence.
            if "original_translated" in row:
                doc.add_paragraph(row["original_translated"])
            else:
                doc.add_paragraph("")

    doc.save(output_path)
    return output_path


def chat_gpt_translate_word(file, sheet_name, col_name, source_lang, target_lang,
                            where_to_place, keep_original, chosen_model,
                            api_key=None, progress=gr.Progress()):
    """Translate a Word document end-to-end.

    Extracts content to tabular form, runs the Excel translation pipeline
    on the paragraphs and on every table, then rebuilds a .docx.
    Intermediate xlsx artifacts are deleted afterwards (needed for
    Hugging Face deployments). Returns the translated .docx path.
    """
    # Gradio may hand over a file wrapper; use its path when present so the
    # string operations below ('.split', basename) work either way.
    file_path = getattr(file, "name", file)

    word_to_excel_file, word_table = extract_word_content_to_excel(file_path)

    base_translated = chat_gpt_translate_excel(
        word_to_excel_file,
        sheet_name="Sheet1",
        col_name=['original'],
        source_lang=source_lang,
        target_lang=target_lang,
        where_to_place=APPEND_ALL,
        keep_original="keep original",
        chosen_model=chosen_model,
        api_key=api_key,
        return_output='df',
    )

    # Translate every extracted table through the same pipeline.
    translated_tables = {}
    for key, table_df in word_table.items():
        translated_tables[key] = chat_gpt_translate_excel(
            file="extracted_tables.xlsx",
            sheet_name=f"Table_{key}",
            col_name=table_df.columns.tolist(),
            source_lang=source_lang,
            target_lang=target_lang,
            where_to_place=APPEND_ALL,
            keep_original="keep original",
            chosen_model=chosen_model,
            api_key=api_key,
            return_output='df',
        )

    output_file = reconstruct_word(base_translated, translated_tables, file_path)

    # Clean up intermediate artifacts.
    for tmp in ('extracted_tables.xlsx',
                'extracted_tables_translated.xlsx',
                f"{file_path.split('.')[0]}_translated.xlsx"):
        if os.path.exists(tmp):
            os.remove(tmp)

    return output_file


if __name__ == "__main__":
    with gr.Blocks() as iface:
        gr.Markdown("## Excel Translation Interface")
        excel_file = gr.File(label="Upload Excel File")
        sheet_name = gr.Dropdown(label="Select Sheet", interactive=True)
        column_name = gr.Dropdown(label="Select Column to Translate (Not require)",
                                  multiselect=True, interactive=True)
        with gr.Row():
            source_language = gr.Textbox(label="Source Language Code")
            target_language = gr.Textbox(label="Target Language Code")
        with gr.Row():
            where_to_place = gr.Dropdown(
                multiselect=False,
                label="How translated columns should be placed",
                choices=['replace', APPEND_ALL, APPEND_COMPARE, 'new_sheet'],
                interactive=True,
            )
            keep_original = gr.Dropdown(
                multiselect=False,
                label="You want to keep original column or just only the translated column",
                choices=['keep original', 'translated_column'],
                interactive=True,
            )

        def check_file_type(file):
            """Route by extension: Word gets fixed choices, Excel gets its sheet names."""
            file_extension = os.path.splitext(file.name)[-1].lower()
            if file_extension in [".docx", ".doc"]:
                return gr.update(choices=['all paragraphs only',
                                          'specified paragraph or page (Developing ...)'],
                                 interactive=False)
            elif file_extension in [".xlsx", ".xls"]:
                return update_sheets(file)
            else:
                return "Unknown"

        def check_uploaded_file(file):
            """Validate that a file was uploaded before inspecting it."""
            if file is None:
                return "No file uploaded"
            return check_file_type(file)

        def get_sheet_names(file):
            xls = pd.ExcelFile(file.name)
            return xls.sheet_names

        def update_sheets(file):
            return gr.update(choices=get_sheet_names(file))

        def update_columns(file, sheet_name):
            """Populate the column dropdown for Excel; fixed choice for Word."""
            ext = os.path.splitext(file.name)[-1].lower()
            if ext in [".docx", ".doc"]:
                return gr.update(choices=['original'], interactive=False)
            elif ext in [".xlsx", ".xls"]:
                return gr.update(choices=get_column_names(file, sheet_name))
            else:
                return "error"

        def get_column_names(file, sheet_name):
            dd = pd.read_excel(file.name, sheet_name=sheet_name)
            return list(dd.columns)

        excel_file.change(fn=check_uploaded_file, inputs=excel_file, outputs=sheet_name)
        sheet_name.change(fn=update_columns, inputs=[excel_file, sheet_name], outputs=column_name)

        model_choosing = gr.Dropdown(
            multiselect=False,
            label="Choosing Model you want",
            choices=['ChatGPT (4o-mini)', 'Deepseek (developing ...)', 'another (In Progress)'],
            interactive=True,
        )
        needed_require = gr.Textbox(label="API Key(require if Chatgpt)")
        translate_button = gr.Button("Translate")
        output_file = gr.File(label="Download Translated Excel File", interactive=True)

        def translate_excel(file, sheet_name, columns, source_lang, target_lang,
                            place_option, keep_opt, model, api_key):
            """Unified entry point: dispatch to the Excel or Word pipeline by extension."""
            ext = os.path.splitext(file.name)[-1].lower()
            if ext in [".xlsx", ".xls"]:
                if model == "ChatGPT (4o-mini)":
                    return chat_gpt_translate_excel(file, sheet_name, columns,
                                                    source_lang, target_lang,
                                                    place_option, keep_opt, model, api_key)
                raise gr.Error("Translation with the selected model is not yet implemented.")
            elif ext in [".docx", ".doc"]:
                if model == "ChatGPT (4o-mini)":
                    return chat_gpt_translate_word(file, sheet_name, columns,
                                                   source_lang, target_lang,
                                                   place_option, keep_opt, model, api_key)
                raise gr.Error("Translation with the selected model is not yet implemented.")
            else:
                print('No Type of Input Supported')

        translate_button.click(
            fn=translate_excel,
            inputs=[
                excel_file,
                sheet_name,
                column_name,
                source_language,
                target_language,
                where_to_place,
                keep_original,
                model_choosing,
                needed_require,
            ],
            outputs=output_file,
        )

    iface.launch(debug=True, share=True, server_port=7860, server_name="0.0.0.0")