Spaces:

petchsko
/

Translator_app

Sleeping

App Files Community

Petch DS committed on Feb 6, 2025

Commit

2ddff94

1 Parent(s): ce1ab91

Final update add docx

Browse files

Files changed (1) hide show

translator_app.py +115 -19

translator_app.py CHANGED Viewed

@@ -38,7 +38,7 @@ def chat_gpt_4o_mini(api_key = None):
     return chain
-def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lang, where_to_place, keep_original, chosen_model, api_key = None, progress=gr.Progress()):
     if where_to_place is None:
         where_to_place = 'append_all'
@@ -48,10 +48,13 @@ def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lan
         df = file.copy()
         output_file = f"{file.name.unique()[0].split('.')[0]}_translated.xlsx"
         df = df.drop(columns=['name'])
     else:
         df = pd.read_excel(file.name, sheet_name=sheet_name, header=0)
         output_file = f"{file.name.split('.')[0]}_translated.xlsx"
     original_col = df.columns
     total_columns = len(df.columns)
     current_step = 0
@@ -69,10 +72,11 @@ def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lan
     translation_map = {}
     trans_col_name = []
     # Process the selected columns for translation
     for idx, col in enumerate(col_name):
         current_step += 1
-        progress(current_step / total_columns, desc=f"Translating {col} ({current_step}/{total_columns})...")
         try:
             # Extract unique values from the column
@@ -97,14 +101,14 @@ def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lan
             print(f"Error in column {col}: {e}")
             continue
-    # # Process remaining columns
     # for column in remain_col:
     #     current_step += 1
-    #     progress(current_step / total_columns, desc=f"Translating column name: {column} ({current_step}/{len(remain_col)})...")
     #     try:
-    #         # We do not translate remain_col which remaining col
-    #         # remain_col = chain.batch([{"sentence": column, "source_lang": source_lang, "target_lang": target_lang}])
     #         name_col = column + '_translated'  # Assuming the translation returns a list of translations
     #         df.loc[:, name_col] = df.loc[:, column]
@@ -119,6 +123,7 @@ def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lan
         output_col = original_col
     else:
         output_col = col_name
     try:
         if where_to_place == 'append_all (ต่อ column สุดท้าย)':
             final_cols = list(output_col) + [col for col in trans_col_name]
@@ -162,13 +167,21 @@ def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lan
         raise gr.Error(f"Error saving the file: {e}")
     progress(1.0, desc="Completed all tasks!")
-    return output_file
 def extract_word_content_to_excel(file_path):
     """ ดึงเนื้อหา + รูปภาพจากไฟล์ Word และบันทึกเป็น Excel """
     doc = Document(file_path)
     data = []
     paragraph_count = 0
     for element in doc.element.body:
@@ -181,6 +194,21 @@ def extract_word_content_to_excel(file_path):
             paragraph_count += 1
             data.append([paragraph_count, "[Table]"])
         elif element.tag.endswith("drawing"):  # Image (รูปภาพ)
             paragraph_count += 1
             data.append([paragraph_count, "[Image]"])
@@ -188,11 +216,47 @@ def extract_word_content_to_excel(file_path):
     # สร้าง DataFrame
     df = pd.DataFrame(data, columns=["paragraph", "original"])
     df['name'] = file_path.split('/')[-1]
-    return df
 def chat_gpt_translate_word(file, sheet_name, col_name, source_lang, target_lang, where_to_place, keep_original, chosen_model, api_key = None, progress=gr.Progress()):
-    word_to_excel_file = extract_word_content_to_excel(file)
-    return chat_gpt_translate_excel(word_to_excel_file,
                              sheet_name="Sheet1",
                              col_name = ['original'],
                              source_lang = source_lang,
@@ -200,8 +264,34 @@ def chat_gpt_translate_word(file, sheet_name, col_name, source_lang, target_lang
                              where_to_place="append_all (ต่อ column สุดท้าย)",
                              keep_original="keep original",
                              chosen_model = chosen_model,
-                             api_key = api_key
                              )
 if __name__ == "__main__" :
@@ -228,12 +318,15 @@ if __name__ == "__main__" :
                                                     'translated_column']
                                                     , interactive=True
                                         )
         def check_file_type(file):
             """ ตรวจสอบว่าไฟล์ที่อัปโหลดเป็น Word หรือ Excel """
             file_extension = os.path.splitext(file.name)[-1].lower()
             if file_extension in [".docx", ".doc"]:
-                return gr.update(choices=['all paragraphs only', 'specified paragraph or page (Developing ...)'], interactive=False)
             elif file_extension in [".xlsx", ".xls"]:
                 return update_sheets(file)
             else:
@@ -244,7 +337,8 @@ if __name__ == "__main__" :
             if file is None:
                 return "No file uploaded"
             return check_file_type(file)
         def get_sheet_names(file):
             xls = pd.ExcelFile(file.name)
             return xls.sheet_names
@@ -272,7 +366,7 @@ if __name__ == "__main__" :
         model_choosing = gr.Dropdown(multiselect = False ,
                                     label = "Choosing Model you want",
-                                    choices = ['ChatGPT (4o-mini)', 'DeepSeek (developing...)', 'another (In Progress)']
                                     , interactive=True
                                     )
@@ -282,8 +376,8 @@ if __name__ == "__main__" :
         # Unified translation function
         def translate_excel(
-        file, sheet_name, columns, source_lang, target_lang, place_option, keep_opt, model, api_key
-    ):
             if os.path.splitext(file.name)[-1].lower() in [".xlsx", ".xls"]:
                 if model == "ChatGPT (4o-mini)":
                     # Call ChatGPT-based translation
@@ -300,6 +394,10 @@ if __name__ == "__main__" :
                 else:
                     # Handle other models (currently in progress)
                     raise gr.Error("Translation with the selected model is not yet implemented.")
         # Register button click
         translate_button.click(
             fn=translate_excel,
@@ -316,8 +414,6 @@ if __name__ == "__main__" :
             ],
             outputs=output_file,
         )
     iface.launch(debug=True, share=True,
                  server_port= 7861,

     return chain
+def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lang, where_to_place, keep_original, chosen_model, api_key = None, progress=gr.Progress(), return_output = 'file'):
     if where_to_place is None:
         where_to_place = 'append_all'
         df = file.copy()
         output_file = f"{file.name.unique()[0].split('.')[0]}_translated.xlsx"
         df = df.drop(columns=['name'])
+    elif isinstance(file, str):
+        df = pd.read_excel(file, sheet_name=sheet_name, header=0)
+        output_file = f"{file.split('.')[0]}_translated.xlsx"
     else:
         df = pd.read_excel(file.name, sheet_name=sheet_name, header=0)
         output_file = f"{file.name.split('.')[0]}_translated.xlsx"
     original_col = df.columns
     total_columns = len(df.columns)
     current_step = 0
     translation_map = {}
     trans_col_name = []
     # Process the selected columns for translation
     for idx, col in enumerate(col_name):
         current_step += 1
+        progress(current_step / total_columns, desc=f"Translating {col} ({current_step}/{len(col_name)})...")
         try:
             # Extract unique values from the column
             print(f"Error in column {col}: {e}")
             continue
+    # Process remaining columns
     # for column in remain_col:
     #     current_step += 1
+    #     progress(current_step / total_columns, desc=f"Translating column name: {column} ({current_step}/{total_columns})...")
     #     try:
+    #         # We do not translate all_col which remaining col
+    #         # all_col_translation = chain.batch([{"sentence": column, "source_lang": source_lang, "target_lang": target_lang}])
     #         name_col = column + '_translated'  # Assuming the translation returns a list of translations
     #         df.loc[:, name_col] = df.loc[:, column]
         output_col = original_col
     else:
         output_col = col_name
     try:
         if where_to_place == 'append_all (ต่อ column สุดท้าย)':
             final_cols = list(output_col) + [col for col in trans_col_name]
         raise gr.Error(f"Error saving the file: {e}")
     progress(1.0, desc="Completed all tasks!")
+    if return_output == 'file':
+        return output_file
+    elif return_output == 'df':
+        return result
+    else:
+        return output_file
 def extract_word_content_to_excel(file_path):
     """ ดึงเนื้อหา + รูปภาพจากไฟล์ Word และบันทึกเป็น Excel """
     doc = Document(file_path)
     data = []
+    table_dict = {}
     paragraph_count = 0
     for element in doc.element.body:
             paragraph_count += 1
             data.append([paragraph_count, "[Table]"])
+            # Extract table content
+            table = doc.tables[len(table_dict)]  # Get current table
+            table_data = []
+            for row in table.rows:
+                row_data = [cell.text.strip() for cell in row.cells]
+                table_data.append(row_data)
+            # Generate dynamic column names ('object_0', 'object_1', ...)
+            max_cols = max(len(row) for row in table_data) if table_data else 0
+            column_names = [f"object_{i}" for i in range(max_cols)]
+            # Store table as DataFrame
+            table_dict[paragraph_count] = pd.DataFrame(table_data, columns=column_names)
         elif element.tag.endswith("drawing"):  # Image (รูปภาพ)
             paragraph_count += 1
             data.append([paragraph_count, "[Image]"])
     # สร้าง DataFrame
     df = pd.DataFrame(data, columns=["paragraph", "original"])
     df['name'] = file_path.split('/')[-1]
+    with pd.ExcelWriter("extracted_tables.xlsx") as writer:
+        for key, table_df in table_dict.items():
+            table_df.to_excel(writer, sheet_name=f"Table_{key}", index=False)
+    return df, table_dict
+def reconstruct_word(paragraph_df, translated_tables, file_path):
+    """Reconstruct Word Document from translated content"""
+    doc = Document()
+    output_path=f"{file_path.split('.')[0]}_translated.docx"
+    for _, row in paragraph_df.iterrows():
+        if row["original"] == "[Table]":  # Insert Table
+            table_number = row["paragraph"]
+            table_df = translated_tables.get(table_number)
+            if table_df is not None:
+                # Filter only columns that contain '_translated'
+                translated_cols = [col for col in table_df.columns if '_translated' in col]
+                if translated_cols:
+                    table_df = table_df[translated_cols]  # Keep only translated columns
+                    # Create a table with the filtered columns
+                    table = doc.add_table(rows=len(table_df), cols=len(table_df.columns))
+                    for i, row_data in enumerate(table_df.values):
+                        for j, cell_text in enumerate(row_data):
+                            table.cell(i, j).text = cell_text
+                else:
+                    print(f"⚠ Warning: No '_translated' columns found for table at paragraph {table_number}")
+        else:
+            doc.add_paragraph(row["original_translated"])
+    doc.save(output_path)
+    return output_path
 def chat_gpt_translate_word(file, sheet_name, col_name, source_lang, target_lang, where_to_place, keep_original, chosen_model, api_key = None, progress=gr.Progress()):
+    word_to_excel_file, word_table = extract_word_content_to_excel(file)
+    base_translated = chat_gpt_translate_excel(word_to_excel_file,
                              sheet_name="Sheet1",
                              col_name = ['original'],
                              source_lang = source_lang,
                              where_to_place="append_all (ต่อ column สุดท้าย)",
                              keep_original="keep original",
                              chosen_model = chosen_model,
+                             api_key = api_key,
+                             return_output='df'
                              )
+    # Translate Tables
+    translated_tables = {}
+    for key, table_df in word_table.items():
+        translated_tables[key] = chat_gpt_translate_excel(
+            file="extracted_tables.xlsx",
+            sheet_name=f"Table_{key}",
+            col_name=table_df.columns.tolist(),
+            source_lang=source_lang,
+            target_lang=target_lang,
+            where_to_place="append_all (ต่อ column สุดท้าย)",
+            keep_original="keep original",
+            chosen_model=chosen_model,
+            api_key=api_key,
+            return_output='df'
+        )
+    output_file = reconstruct_word(base_translated, translated_tables, file)
+    if os.path.exists('extracted_tables.xlsx'):
+        os.remove('extracted_tables.xlsx')
+    if os.path.exists('extracted_tables_translated.xlsx'):
+        os.remove('extracted_tables_translated.xlsx')
+    if os.path.exists(f"{file.split('/')[-1].split('.')[0]}_translated.xlsx"):
+        os.remove(f"{file.split('/')[-1].split('.')[0]}_translated.xlsx")
+    return output_file
 if __name__ == "__main__" :
                                                     'translated_column']
                                                     , interactive=True
                                         )
         def check_file_type(file):
             """ ตรวจสอบว่าไฟล์ที่อัปโหลดเป็น Word หรือ Excel """
             file_extension = os.path.splitext(file.name)[-1].lower()
             if file_extension in [".docx", ".doc"]:
+                return gr.update(choices=['all paragraphs only', 'specified paragraph or page (Developing ...)']
+                                , interactive=False
+                                )
             elif file_extension in [".xlsx", ".xls"]:
                 return update_sheets(file)
             else:
             if file is None:
                 return "No file uploaded"
             return check_file_type(file)
         def get_sheet_names(file):
             xls = pd.ExcelFile(file.name)
             return xls.sheet_names
         model_choosing = gr.Dropdown(multiselect = False ,
                                     label = "Choosing Model you want",
+                                    choices = ['ChatGPT (4o-mini)', 'Deepseek (developing ...)', 'another (In Progress)']
                                     , interactive=True
                                     )
         # Unified translation function
         def translate_excel(
+            file, sheet_name, columns, source_lang, target_lang, place_option, keep_opt, model, api_key
+        ):
             if os.path.splitext(file.name)[-1].lower() in [".xlsx", ".xls"]:
                 if model == "ChatGPT (4o-mini)":
                     # Call ChatGPT-based translation
                 else:
                     # Handle other models (currently in progress)
                     raise gr.Error("Translation with the selected model is not yet implemented.")
+            else:
+                print('No Type of Input Supported')
         # Register button click
         translate_button.click(
             fn=translate_excel,
             ],
             outputs=output_file,
         )
     iface.launch(debug=True, share=True,
                  server_port= 7861,