Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import os | |
| from langchain_openai import ChatOpenAI | |
| from langchain_core.output_parsers import JsonOutputParser | |
| from langchain_core.prompts import PromptTemplate | |
| from langchain_core.runnables import RunnableLambda | |
| import gradio as gr | |
| import pandas as pd | |
| from docx import Document | |
def using_model(chosen_model, api_key):
    """Return a translation chain for the chosen model display name.

    Parameters
    ----------
    chosen_model : str
        Model display name from the UI, e.g. 'ChatGPT (4o-mini)'.
    api_key : str
        API key forwarded to the model constructor.

    Returns
    -------
    A LangChain runnable that maps {'sentence', 'source_lang',
    'target_lang'} dicts to translated strings.

    Raises
    ------
    ValueError
        If the model name is not supported.
    """
    if chosen_model == 'ChatGPT (4o-mini)':
        return chat_gpt_4o_mini(api_key=api_key)
    # BUG FIX: the old `else: pass` left `model` unbound and crashed with
    # UnboundLocalError on `return model`; fail with a clear message instead.
    raise ValueError(f"Unsupported model: {chosen_model!r}")
def chat_gpt_4o_mini(api_key=None):
    """Build a LangChain pipeline that translates one sentence with gpt-4o-mini.

    The returned chain expects a dict with keys 'source_lang', 'target_lang'
    and 'sentence'; it prompts the model to answer as JSON with the key
    'translated', parses that JSON, and yields the translated string.
    """
    llm = ChatOpenAI(model_name="gpt-4o-mini", api_key=api_key)
    parser = JsonOutputParser()

    translate_prompt = PromptTemplate(
        template="""
You will be provided with a sentence in {source_lang}, and your task is to translate it into {target_lang}.
Answer in Json format with key 'translated'
Sentence: {sentence}
""",
        input_variables=["source_lang", "target_lang", "sentence"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )

    # Pull the 'translated' field out of the parsed JSON answer.
    extract_translated = RunnableLambda(lambda payload: payload["translated"])

    return translate_prompt | llm | parser | extract_translated
def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lang, where_to_place, keep_original, chosen_model, api_key=None, progress=gr.Progress(), return_output='file'):
    """Translate selected columns of an Excel sheet (or DataFrame) with an LLM.

    Parameters
    ----------
    file : pandas.DataFrame | str | file-like
        A DataFrame carrying a 'name' column (Word pipeline), a path to an
        .xlsx file, or an uploaded file object exposing ``.name``.
    sheet_name : str
        Sheet to read when ``file`` is a path/upload; also the output sheet
        name for the 'new_sheet' placement.
    col_name : list[str] | None
        Columns to translate; ``None`` auto-selects all object-dtype columns.
    source_lang, target_lang : str
        Language codes interpolated into the translation prompt.
    where_to_place : str | None
        One of the UI choices ('replace', 'append_all (...)',
        'append_compare (...)', 'new_sheet'); ``None`` falls back to
        append-all behaviour.
    keep_original : str
        'keep original' keeps all source columns in the output; anything
        else keeps only the columns selected for translation.
    chosen_model : str
        Model display name passed to ``using_model``.
    api_key : str, optional
        Forwarded to the model constructor.
    progress : gr.Progress
        Gradio progress tracker (default instance is the gradio idiom).
    return_output : str
        'file' returns the written path, 'df' returns the result DataFrame.
    """
    # BUG FIX: the old fallback 'append_all' matched none of the placement
    # branches below, so no file was written and `result` stayed unbound
    # when return_output == 'df'.
    if where_to_place is None:
        where_to_place = 'append_all (ต่อ column สุดท้าย)'
    model = using_model(chosen_model=chosen_model, api_key=api_key)

    # Resolve the input into a DataFrame plus an output file name.
    if isinstance(file, pd.DataFrame):
        df = file.copy()
        output_file = f"{file.name.unique()[0].split('.')[0]}_translated.xlsx"
        df = df.drop(columns=['name'])
    elif isinstance(file, str):
        df = pd.read_excel(file, sheet_name=sheet_name, header=0)
        output_file = f"{file.split('.')[0]}_translated.xlsx"
    else:  # uploaded file object (gradio) exposing .name
        df = pd.read_excel(file.name, sheet_name=sheet_name, header=0)
        output_file = f"{file.name.split('.')[0]}_translated.xlsx"

    original_col = df.columns
    total_columns = len(df.columns)
    current_step = 0
    progress(0, desc="Starting translation process...")

    # Automatically detect string columns if col_name is None
    if col_name is None:
        col_name = [col for col in df.columns if df[col].dtype == 'object']

    translation_map = {}   # per-column {original value: translated value}
    trans_col_name = []    # names of the generated *_translated columns

    # Process the selected columns for translation
    for idx, col in enumerate(col_name):
        current_step += 1
        progress(current_step / total_columns, desc=f"Translating {col} ({current_step}/{len(col_name)})...")
        try:
            # Translate each distinct value once, then map it back — saves
            # API calls on columns with repeated values.
            unique_values = list(set(df[col].dropna().unique()))
            requests = [{"sentence": value, "source_lang": source_lang, "target_lang": target_lang} for value in unique_values]
            answers = model.batch(requests, config={"max_concurrency": 3})
            translations = dict(zip(unique_values, answers))
            translation_map[col] = translations
            trans_col_name.append(col + "_translated")
            # Untranslated (NaN) cells fall back to the original value.
            df[col + "_translated"] = df[col].map(translations).fillna(df[col])
        except Exception as e:
            print(f"Error in column {col}: {e}")
            continue

    if not os.path.exists(output_file):
        pd.DataFrame().to_excel(output_file, index=False)

    output_col = original_col if keep_original == 'keep original' else col_name

    try:
        if where_to_place == 'append_all (ต่อ column สุดท้าย)':
            # All originals first, then all translated columns at the end.
            result = df[list(output_col) + trans_col_name]
            result.to_excel(output_file, index=False)
        elif where_to_place == 'append_compare (เปรียบเทียบ column by column)':
            # BUG FIX: the old nested loop appended each source column once
            # per translated column, duplicating columns in the output (and
            # dropping everything when no column was translated).
            final_cols = []
            for col in output_col:
                final_cols.append(col)
                if col + '_translated' in trans_col_name:
                    final_cols.append(col + '_translated')
            result = df[final_cols]
            result.to_excel(output_file, index=False)
        elif where_to_place == 'replace':
            # Same duplication bug fixed here: swap each translated column
            # in place of its source, keep the rest unchanged.
            final_cols = []
            for col in output_col:
                if col + '_translated' in trans_col_name:
                    final_cols.append(col + '_translated')
                else:
                    final_cols.append(col)
            result = df[final_cols]
            result.to_excel(output_file, index=False)
        elif where_to_place == 'new_sheet':
            result = df[list(output_col)]
            result1 = df[trans_col_name]
            # Use ExcelWriter to write originals and translations as two sheets
            with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
                result.to_excel(writer, sheet_name=sheet_name, index=False)  # First sheet
                result1.to_excel(writer, sheet_name=f'{sheet_name}_translated', index=False)  # Second sheet
        progress(1.0, desc="Saving translated file... Completed!")
    except Exception as e:
        print(f"Error saving the file: {e}")
        raise gr.Error(f"Error saving the file: {e}")

    progress(1.0, desc="Completed all tasks!")
    if return_output == 'df':
        return result
    return output_file
def extract_word_content_to_excel(file_path):
    """Extract paragraphs, tables and image placeholders from a Word file.

    Each body element becomes a row [paragraph_number, text] in the returned
    DataFrame; tables are additionally captured as DataFrames keyed by their
    paragraph number and written to 'extracted_tables.xlsx' (one sheet per
    table, named 'Table_<n>').

    Returns:
        (df, table_dict): df has columns ['paragraph', 'original', 'name'];
        table_dict maps paragraph number -> DataFrame of the table cells.
    """
    doc = Document(file_path)
    data = []            # rows of [paragraph_number, text-or-placeholder]
    table_dict = {}      # paragraph_number -> table DataFrame
    paragraph_count = 0
    # Walk the raw XML body so paragraphs and tables keep document order.
    for element in doc.element.body:
        if element.tag.endswith("p"):  # Paragraph
            # NOTE(review): element.text is the raw lxml accessor —
            # presumably yields the paragraph text here; confirm it covers
            # text split across multiple runs.
            paragraph_text = element.text.strip()
            paragraph_count += 1
            data.append([paragraph_count, paragraph_text])  # record the paragraph
        elif element.tag.endswith("tbl"):  # Table (if any)
            paragraph_count += 1
            data.append([paragraph_count, "[Table]"])  # placeholder row in the paragraph stream
            # Extract table content; doc.tables is in document order, so the
            # count of tables seen so far indexes the current one.
            table = doc.tables[len(table_dict)]  # Get current table
            table_data = []
            for row in table.rows:
                row_data = [cell.text.strip() for cell in row.cells]
                table_data.append(row_data)
            # Generate dynamic column names ('object_0', 'object_1', ...)
            max_cols = max(len(row) for row in table_data) if table_data else 0
            column_names = [f"object_{i}" for i in range(max_cols)]
            # Store table as DataFrame
            table_dict[paragraph_count] = pd.DataFrame(table_data, columns=column_names)
        elif element.tag.endswith("drawing"):  # Image
            # NOTE(review): drawings normally sit nested inside runs, not as
            # direct body children — this branch may never match for inline
            # images; verify with a sample document.
            paragraph_count += 1
            data.append([paragraph_count, "[Image]"])
    # Build the paragraph DataFrame
    df = pd.DataFrame(data, columns=["paragraph", "original"])
    df['name'] = file_path.split('/')[-1]  # keep only the source file's basename
    with pd.ExcelWriter("extracted_tables.xlsx") as writer:
        for key, table_df in table_dict.items():
            table_df.to_excel(writer, sheet_name=f"Table_{key}", index=False)
    return df, table_dict
def reconstruct_word(paragraph_df, translated_tables, file_path):
    """Rebuild a .docx from translated paragraphs and tables.

    Parameters
    ----------
    paragraph_df : pandas.DataFrame
        Paragraph stream with columns 'paragraph', 'original' and (after
        translation) 'original_translated'; '[Table]' rows mark table slots.
    translated_tables : dict[int, pandas.DataFrame]
        Paragraph number -> translated table; only columns whose name
        contains '_translated' are written.
    file_path : str
        Source file path; output is '<stem>_translated.docx'.

    Returns
    -------
    str
        Path of the saved document.
    """
    doc = Document()
    output_path = f"{file_path.split('.')[0]}_translated.docx"
    for _, row in paragraph_df.iterrows():
        if row["original"] == "[Table]":  # Insert Table
            table_number = row["paragraph"]
            table_df = translated_tables.get(table_number)
            if table_df is not None:
                # Filter only columns that contain '_translated'
                translated_cols = [col for col in table_df.columns if '_translated' in col]
                if translated_cols:
                    table_df = table_df[translated_cols]  # Keep only translated columns
                    # Create a table with the filtered columns
                    table = doc.add_table(rows=len(table_df), cols=len(table_df.columns))
                    for i, row_data in enumerate(table_df.values):
                        for j, cell_text in enumerate(row_data):
                            # BUG FIX: python-docx only accepts str; numeric
                            # or NaN cells used to raise here.
                            table.cell(i, j).text = "" if pd.isna(cell_text) else str(cell_text)
                else:
                    print(f"⚠ Warning: No '_translated' columns found for table at paragraph {table_number}")
        else:
            if "original_translated" in row:
                value = row["original_translated"]
                # Guard against NaN paragraphs (empty source cells that were
                # never translated) — add_paragraph requires str.
                doc.add_paragraph("" if pd.isna(value) else str(value))
            else:
                doc.add_paragraph("")
    doc.save(output_path)
    return output_path
def chat_gpt_translate_word(file, sheet_name, col_name, source_lang, target_lang, where_to_place, keep_original, chosen_model, api_key=None, progress=gr.Progress()):
    """Translate a Word document — paragraphs and tables — then rebuild it.

    Pipeline:
      1. extract_word_content_to_excel -> paragraph DataFrame + table dict
         (also writes the intermediate 'extracted_tables.xlsx').
      2. chat_gpt_translate_excel on the paragraph stream and on each
         extracted table sheet.
      3. reconstruct_word writes '<stem>_translated.docx'.
      4. Intermediate .xlsx artifacts are deleted (keeps the working
         directory clean, e.g. on Hugging Face Spaces).

    NOTE(review): `file` is treated as a str path (``file.split``); confirm
    the Gradio callers always pass a path rather than a file object.
    """
    word_to_excel_file, word_table = extract_word_content_to_excel(file)
    base_translated = chat_gpt_translate_excel(word_to_excel_file,
                                               sheet_name="Sheet1",
                                               col_name=['original'],
                                               source_lang=source_lang,
                                               target_lang=target_lang,
                                               where_to_place="append_all (ต่อ column สุดท้าย)",
                                               keep_original="keep original",
                                               chosen_model=chosen_model,
                                               api_key=api_key,
                                               return_output='df'
                                               )
    # Translate every extracted table, sheet by sheet.
    translated_tables = {}
    for key, table_df in word_table.items():
        translated_tables[key] = chat_gpt_translate_excel(
            file="extracted_tables.xlsx",
            sheet_name=f"Table_{key}",
            col_name=table_df.columns.tolist(),
            source_lang=source_lang,
            target_lang=target_lang,
            where_to_place="append_all (ต่อ column สุดท้าย)",
            keep_original="keep original",
            chosen_model=chosen_model,
            api_key=api_key,
            return_output='df'
        )
    output_file = reconstruct_word(base_translated, translated_tables, file)
    # Clean up intermediate artifacts.
    if os.path.exists('extracted_tables.xlsx'):
        os.remove('extracted_tables.xlsx')
    if os.path.exists('extracted_tables_translated.xlsx'):
        os.remove('extracted_tables_translated.xlsx')
    # BUG FIX: the paragraph intermediate is named after the file's
    # *basename* (extract_word_content_to_excel stores df['name'] as the
    # basename), so removing only "<full path stem>_translated.xlsx"
    # missed it whenever `file` contained directory components. Try both.
    base_name = os.path.basename(file).split('.')[0]
    for leftover in {f"{base_name}_translated.xlsx", f"{file.split('.')[0]}_translated.xlsx"}:
        if os.path.exists(leftover):
            os.remove(leftover)
    return output_file
if __name__ == "__main__":
    # Gradio front-end: upload an Excel/Word file, pick a sheet and columns,
    # choose placement options, and translate with the selected model.
    with gr.Blocks() as iface:
        gr.Markdown("## Excel Translation Interface")
        excel_file = gr.File(label="Upload Excel File")
        sheet_name = gr.Dropdown(label="Select Sheet", interactive=True)
        column_name = gr.Dropdown(label="Select Column to Translate (Not require)", multiselect=True, interactive=True)
        with gr.Row():
            source_language = gr.Textbox(label="Source Language Code")
            target_language = gr.Textbox(label="Target Language Code")
        with gr.Row():
            # NOTE: chat_gpt_translate_excel matches these choice strings
            # (including the Thai labels) verbatim — keep them in sync.
            where_to_place = gr.Dropdown(multiselect=False, label="How translated columns should be placed"
                                         , choices=['replace',
                                                    'append_all (ต่อ column สุดท้าย)',
                                                    'append_compare (เปรียบเทียบ column by column)',
                                                    'new_sheet']
                                         , interactive=True
                                         )
            keep_original = gr.Dropdown(multiselect=False, label="You want to keep original column or just only the translated column"
                                        , choices=['keep original',
                                                   'translated_column']
                                        , interactive=True
                                        )

        def check_file_type(file):
            """Check whether the uploaded file is Word or Excel and update the sheet dropdown accordingly."""
            file_extension = os.path.splitext(file.name)[-1].lower()
            if file_extension in [".docx", ".doc"]:
                # Word files have no sheets; show placeholder choices only.
                return gr.update(choices=['all paragraphs only', 'specified paragraph or page (Developing ...)']
                                 , interactive=False
                                 )
            elif file_extension in [".xlsx", ".xls"]:
                return update_sheets(file)
            else:
                # NOTE(review): returning a bare string to a Dropdown output
                # is questionable — presumably ignored by Gradio; verify.
                return "Unknown"

        def check_uploaded_file(file):
            """Validate the uploaded file, then dispatch on its type."""
            if file is None:
                return "No file uploaded"
            return check_file_type(file)

        def get_sheet_names(file):
            # List the workbook's sheet names for the sheet dropdown.
            xls = pd.ExcelFile(file.name)
            return xls.sheet_names

        def update_sheets(file):
            sheets = get_sheet_names(file)
            return gr.update(choices=sheets)

        def update_columns(file, sheet_name):
            # Word: the only translatable "column" is the paragraph text.
            if os.path.splitext(file.name)[-1].lower() in [".docx", ".doc"]:
                return gr.update(choices=['original'], interactive=False)
            elif os.path.splitext(file.name)[-1].lower() in [".xlsx", ".xls"]:
                columns = get_column_names(file, sheet_name)
                return gr.update(choices=columns)
            else:
                return "error"

        def get_column_names(file, sheet_name):
            # Read the sheet once just to list its header columns.
            dd = pd.read_excel(file.name, sheet_name=sheet_name)
            return list(dd.columns)

        # Re-populate the dependent dropdowns when the file or sheet changes.
        excel_file.change(fn=check_uploaded_file, inputs=excel_file, outputs=sheet_name)
        sheet_name.change(fn=update_columns, inputs=[excel_file, sheet_name], outputs=column_name)

        model_choosing = gr.Dropdown(multiselect=False,
                                     label="Choosing Model you want",
                                     choices=['ChatGPT (4o-mini)', 'Deepseek (developing ...)', 'another (In Progress)']
                                     , interactive=True
                                     )
        needed_require = gr.Textbox(label="API Key(require if Chatgpt)")
        translate_button = gr.Button("Translate")
        output_file = gr.File(label="Download Translated Excel File", interactive=True)

        # Unified translation function
        def translate_excel(
            file, sheet_name, columns, source_lang, target_lang, place_option, keep_opt, model, api_key
        ):
            # Dispatch on the file extension; each branch currently supports
            # only the ChatGPT backend.
            if os.path.splitext(file.name)[-1].lower() in [".xlsx", ".xls"]:
                if model == "ChatGPT (4o-mini)":
                    # Call ChatGPT-based translation
                    return chat_gpt_translate_excel(
                        file, sheet_name, columns, source_lang, target_lang, place_option, keep_opt, model, api_key
                    )
                else:
                    # Handle other models (currently in progress)
                    raise gr.Error("Translation with the selected model is not yet implemented.")
            elif os.path.splitext(file.name)[-1].lower() in [".docx", ".doc"]:
                if model == "ChatGPT (4o-mini)":
                    # Call ChatGPT-based translation
                    return chat_gpt_translate_word(file, sheet_name, columns, source_lang, target_lang, place_option, keep_opt, model, api_key)
                else:
                    # Handle other models (currently in progress)
                    raise gr.Error("Translation with the selected model is not yet implemented.")
            else:
                # NOTE(review): unsupported types only print and return None;
                # the UI shows no error — consider raising gr.Error here.
                print('No Type of Input Supported')

        # Register button click
        translate_button.click(
            fn=translate_excel,
            inputs=[
                excel_file,
                sheet_name,
                column_name,
                source_language,
                target_language,
                where_to_place,
                keep_original,
                model_choosing,
                needed_require,
            ],
            outputs=output_file,
        )

    # share=True plus 0.0.0.0:7860 is the usual Hugging Face Spaces launch.
    iface.launch(debug=True, share=True,
                 server_port=7860,
                 server_name="0.0.0.0"
                 )