Spaces:
Sleeping
Sleeping
Petch DS commited on
Commit ·
2ddff94
1
Parent(s): ce1ab91
Final update add docx
Browse files- translator_app.py +115 -19
translator_app.py
CHANGED
|
@@ -38,7 +38,7 @@ def chat_gpt_4o_mini(api_key = None):
|
|
| 38 |
return chain
|
| 39 |
|
| 40 |
|
| 41 |
-
def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lang, where_to_place, keep_original, chosen_model, api_key = None, progress=gr.Progress()):
|
| 42 |
if where_to_place is None:
|
| 43 |
where_to_place = 'append_all'
|
| 44 |
|
|
@@ -48,10 +48,13 @@ def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lan
|
|
| 48 |
df = file.copy()
|
| 49 |
output_file = f"{file.name.unique()[0].split('.')[0]}_translated.xlsx"
|
| 50 |
df = df.drop(columns=['name'])
|
|
|
|
|
|
|
|
|
|
| 51 |
else:
|
| 52 |
df = pd.read_excel(file.name, sheet_name=sheet_name, header=0)
|
| 53 |
output_file = f"{file.name.split('.')[0]}_translated.xlsx"
|
| 54 |
-
|
| 55 |
original_col = df.columns
|
| 56 |
total_columns = len(df.columns)
|
| 57 |
current_step = 0
|
|
@@ -69,10 +72,11 @@ def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lan
|
|
| 69 |
translation_map = {}
|
| 70 |
trans_col_name = []
|
| 71 |
|
|
|
|
| 72 |
# Process the selected columns for translation
|
| 73 |
for idx, col in enumerate(col_name):
|
| 74 |
current_step += 1
|
| 75 |
-
progress(current_step / total_columns, desc=f"Translating {col} ({current_step}/{
|
| 76 |
|
| 77 |
try:
|
| 78 |
# Extract unique values from the column
|
|
@@ -97,14 +101,14 @@ def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lan
|
|
| 97 |
print(f"Error in column {col}: {e}")
|
| 98 |
continue
|
| 99 |
|
| 100 |
-
#
|
| 101 |
# for column in remain_col:
|
| 102 |
# current_step += 1
|
| 103 |
-
# progress(current_step / total_columns, desc=f"Translating column name: {column} ({current_step}/{
|
| 104 |
|
| 105 |
# try:
|
| 106 |
-
# # We do not translate
|
| 107 |
-
# #
|
| 108 |
# name_col = column + '_translated' # Assuming the translation returns a list of translations
|
| 109 |
# df.loc[:, name_col] = df.loc[:, column]
|
| 110 |
|
|
@@ -119,6 +123,7 @@ def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lan
|
|
| 119 |
output_col = original_col
|
| 120 |
else:
|
| 121 |
output_col = col_name
|
|
|
|
| 122 |
try:
|
| 123 |
if where_to_place == 'append_all (ต่อ column สุดท้าย)':
|
| 124 |
final_cols = list(output_col) + [col for col in trans_col_name]
|
|
@@ -162,13 +167,21 @@ def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lan
|
|
| 162 |
raise gr.Error(f"Error saving the file: {e}")
|
| 163 |
|
| 164 |
progress(1.0, desc="Completed all tasks!")
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
def extract_word_content_to_excel(file_path):
|
| 168 |
""" ดึงเนื้อหา + รูปภาพจากไฟล์ Word และบันทึกเป็น Excel """
|
| 169 |
doc = Document(file_path)
|
| 170 |
|
| 171 |
data = []
|
|
|
|
| 172 |
paragraph_count = 0
|
| 173 |
|
| 174 |
for element in doc.element.body:
|
|
@@ -181,6 +194,21 @@ def extract_word_content_to_excel(file_path):
|
|
| 181 |
paragraph_count += 1
|
| 182 |
data.append([paragraph_count, "[Table]"])
|
| 183 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
elif element.tag.endswith("drawing"): # Image (รูปภาพ)
|
| 185 |
paragraph_count += 1
|
| 186 |
data.append([paragraph_count, "[Image]"])
|
|
@@ -188,11 +216,47 @@ def extract_word_content_to_excel(file_path):
|
|
| 188 |
# สร้าง DataFrame
|
| 189 |
df = pd.DataFrame(data, columns=["paragraph", "original"])
|
| 190 |
df['name'] = file_path.split('/')[-1]
|
| 191 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
|
| 193 |
def chat_gpt_translate_word(file, sheet_name, col_name, source_lang, target_lang, where_to_place, keep_original, chosen_model, api_key = None, progress=gr.Progress()):
|
| 194 |
-
word_to_excel_file = extract_word_content_to_excel(file)
|
| 195 |
-
|
| 196 |
sheet_name="Sheet1",
|
| 197 |
col_name = ['original'],
|
| 198 |
source_lang = source_lang,
|
|
@@ -200,8 +264,34 @@ def chat_gpt_translate_word(file, sheet_name, col_name, source_lang, target_lang
|
|
| 200 |
where_to_place="append_all (ต่อ column สุดท้าย)",
|
| 201 |
keep_original="keep original",
|
| 202 |
chosen_model = chosen_model,
|
| 203 |
-
api_key = api_key
|
|
|
|
| 204 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
|
| 206 |
if __name__ == "__main__" :
|
| 207 |
|
|
@@ -228,12 +318,15 @@ if __name__ == "__main__" :
|
|
| 228 |
'translated_column']
|
| 229 |
, interactive=True
|
| 230 |
)
|
|
|
|
| 231 |
def check_file_type(file):
|
| 232 |
""" ตรวจสอบว่าไฟล์ที่อัปโหลดเป็น Word หรือ Excel """
|
| 233 |
file_extension = os.path.splitext(file.name)[-1].lower()
|
| 234 |
|
| 235 |
if file_extension in [".docx", ".doc"]:
|
| 236 |
-
return gr.update(choices=['all paragraphs only', 'specified paragraph or page (Developing ...)']
|
|
|
|
|
|
|
| 237 |
elif file_extension in [".xlsx", ".xls"]:
|
| 238 |
return update_sheets(file)
|
| 239 |
else:
|
|
@@ -244,7 +337,8 @@ if __name__ == "__main__" :
|
|
| 244 |
if file is None:
|
| 245 |
return "No file uploaded"
|
| 246 |
return check_file_type(file)
|
| 247 |
-
|
|
|
|
| 248 |
def get_sheet_names(file):
    """Return the worksheet names of the uploaded Excel workbook.

    Args:
        file: Object whose ``name`` attribute is the path of the workbook.

    Returns:
        The sheet names reported by ``pd.ExcelFile.sheet_names``.
    """
    # Open the workbook as a context manager so its file handle is closed as
    # soon as the names are read, rather than whenever it is garbage-collected.
    with pd.ExcelFile(file.name) as workbook:
        return workbook.sheet_names
|
|
@@ -272,7 +366,7 @@ if __name__ == "__main__" :
|
|
| 272 |
|
| 273 |
model_choosing = gr.Dropdown(multiselect = False ,
|
| 274 |
label = "Choosing Model you want",
|
| 275 |
-
choices = ['ChatGPT (4o-mini)', '
|
| 276 |
, interactive=True
|
| 277 |
)
|
| 278 |
|
|
@@ -282,8 +376,8 @@ if __name__ == "__main__" :
|
|
| 282 |
|
| 283 |
# Unified translation function
|
| 284 |
def translate_excel(
|
| 285 |
-
|
| 286 |
-
|
| 287 |
if os.path.splitext(file.name)[-1].lower() in [".xlsx", ".xls"]:
|
| 288 |
if model == "ChatGPT (4o-mini)":
|
| 289 |
# Call ChatGPT-based translation
|
|
@@ -300,6 +394,10 @@ if __name__ == "__main__" :
|
|
| 300 |
else:
|
| 301 |
# Handle other models (currently in progress)
|
| 302 |
raise gr.Error("Translation with the selected model is not yet implemented.")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
# Register button click
|
| 304 |
translate_button.click(
|
| 305 |
fn=translate_excel,
|
|
@@ -316,8 +414,6 @@ if __name__ == "__main__" :
|
|
| 316 |
],
|
| 317 |
outputs=output_file,
|
| 318 |
)
|
| 319 |
-
|
| 320 |
-
|
| 321 |
|
| 322 |
iface.launch(debug=True, share=True,
|
| 323 |
server_port= 7861,
|
|
|
|
| 38 |
return chain
|
| 39 |
|
| 40 |
|
| 41 |
+
def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lang, where_to_place, keep_original, chosen_model, api_key = None, progress=gr.Progress(), return_output = 'file'):
|
| 42 |
if where_to_place is None:
|
| 43 |
where_to_place = 'append_all'
|
| 44 |
|
|
|
|
| 48 |
df = file.copy()
|
| 49 |
output_file = f"{file.name.unique()[0].split('.')[0]}_translated.xlsx"
|
| 50 |
df = df.drop(columns=['name'])
|
| 51 |
+
elif isinstance(file, str):
|
| 52 |
+
df = pd.read_excel(file, sheet_name=sheet_name, header=0)
|
| 53 |
+
output_file = f"{file.split('.')[0]}_translated.xlsx"
|
| 54 |
else:
|
| 55 |
df = pd.read_excel(file.name, sheet_name=sheet_name, header=0)
|
| 56 |
output_file = f"{file.name.split('.')[0]}_translated.xlsx"
|
| 57 |
+
|
| 58 |
original_col = df.columns
|
| 59 |
total_columns = len(df.columns)
|
| 60 |
current_step = 0
|
|
|
|
| 72 |
translation_map = {}
|
| 73 |
trans_col_name = []
|
| 74 |
|
| 75 |
+
|
| 76 |
# Process the selected columns for translation
|
| 77 |
for idx, col in enumerate(col_name):
|
| 78 |
current_step += 1
|
| 79 |
+
progress(current_step / total_columns, desc=f"Translating {col} ({current_step}/{len(col_name)})...")
|
| 80 |
|
| 81 |
try:
|
| 82 |
# Extract unique values from the column
|
|
|
|
| 101 |
print(f"Error in column {col}: {e}")
|
| 102 |
continue
|
| 103 |
|
| 104 |
+
# Process remaining columns
|
| 105 |
# for column in remain_col:
|
| 106 |
# current_step += 1
|
| 107 |
+
# progress(current_step / total_columns, desc=f"Translating column name: {column} ({current_step}/{total_columns})...")
|
| 108 |
|
| 109 |
# try:
|
| 110 |
+
# # We do not translate all_col which remaining col
|
| 111 |
+
# # all_col_translation = chain.batch([{"sentence": column, "source_lang": source_lang, "target_lang": target_lang}])
|
| 112 |
# name_col = column + '_translated' # Assuming the translation returns a list of translations
|
| 113 |
# df.loc[:, name_col] = df.loc[:, column]
|
| 114 |
|
|
|
|
| 123 |
output_col = original_col
|
| 124 |
else:
|
| 125 |
output_col = col_name
|
| 126 |
+
|
| 127 |
try:
|
| 128 |
if where_to_place == 'append_all (ต่อ column สุดท้าย)':
|
| 129 |
final_cols = list(output_col) + [col for col in trans_col_name]
|
|
|
|
| 167 |
raise gr.Error(f"Error saving the file: {e}")
|
| 168 |
|
| 169 |
progress(1.0, desc="Completed all tasks!")
|
| 170 |
+
|
| 171 |
+
if return_output == 'file':
|
| 172 |
+
return output_file
|
| 173 |
+
elif return_output == 'df':
|
| 174 |
+
return result
|
| 175 |
+
else:
|
| 176 |
+
return output_file
|
| 177 |
+
|
| 178 |
|
| 179 |
def extract_word_content_to_excel(file_path):
|
| 180 |
""" ดึงเนื้อหา + รูปภาพจากไฟล์ Word และบันทึกเป็น Excel """
|
| 181 |
doc = Document(file_path)
|
| 182 |
|
| 183 |
data = []
|
| 184 |
+
table_dict = {}
|
| 185 |
paragraph_count = 0
|
| 186 |
|
| 187 |
for element in doc.element.body:
|
|
|
|
| 194 |
paragraph_count += 1
|
| 195 |
data.append([paragraph_count, "[Table]"])
|
| 196 |
|
| 197 |
+
# Extract table content
|
| 198 |
+
table = doc.tables[len(table_dict)] # Get current table
|
| 199 |
+
table_data = []
|
| 200 |
+
for row in table.rows:
|
| 201 |
+
row_data = [cell.text.strip() for cell in row.cells]
|
| 202 |
+
table_data.append(row_data)
|
| 203 |
+
|
| 204 |
+
# Generate dynamic column names ('object_0', 'object_1', ...)
|
| 205 |
+
max_cols = max(len(row) for row in table_data) if table_data else 0
|
| 206 |
+
column_names = [f"object_{i}" for i in range(max_cols)]
|
| 207 |
+
|
| 208 |
+
# Store table as DataFrame
|
| 209 |
+
table_dict[paragraph_count] = pd.DataFrame(table_data, columns=column_names)
|
| 210 |
+
|
| 211 |
+
|
| 212 |
elif element.tag.endswith("drawing"): # Image (รูปภาพ)
|
| 213 |
paragraph_count += 1
|
| 214 |
data.append([paragraph_count, "[Image]"])
|
|
|
|
| 216 |
# สร้าง DataFrame
|
| 217 |
df = pd.DataFrame(data, columns=["paragraph", "original"])
|
| 218 |
df['name'] = file_path.split('/')[-1]
|
| 219 |
+
|
| 220 |
+
with pd.ExcelWriter("extracted_tables.xlsx") as writer:
|
| 221 |
+
for key, table_df in table_dict.items():
|
| 222 |
+
table_df.to_excel(writer, sheet_name=f"Table_{key}", index=False)
|
| 223 |
+
|
| 224 |
+
return df, table_dict
|
| 225 |
+
|
| 226 |
+
def _docx_text(value):
    """Return *value* as a string python-docx can write; missing values become ''."""
    if value is None:
        return ""
    if isinstance(value, str):
        return value
    # NaN / NaT / pd.NA appear for empty cells; writing them raw raises TypeError.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value)


def reconstruct_word(paragraph_df, translated_tables, file_path):
    """Rebuild a Word document from translated paragraphs and tables.

    Rows of ``paragraph_df`` are written in order. A row whose ``original``
    value is ``"[Table]"`` is replaced by the table stored in
    ``translated_tables`` under that row's ``paragraph`` value, keeping only
    the columns whose label contains ``"_translated"``. Every other row is
    written as a paragraph holding its ``original_translated`` value.

    Args:
        paragraph_df: DataFrame with ``paragraph``, ``original`` and
            ``original_translated`` columns.
        translated_tables: Mapping of paragraph number to a translated
            table DataFrame.
        file_path: Path of the source document; used only to derive the
            output file name.

    Returns:
        Path of the saved ``*_translated.docx`` file.
    """
    doc = Document()
    # splitext removes only the final extension, so dots in directory names
    # or in the file stem (e.g. "./report.v2.docx") no longer truncate the path.
    output_path = f"{os.path.splitext(file_path)[0]}_translated.docx"

    for _, row in paragraph_df.iterrows():
        if row["original"] != "[Table]":
            doc.add_paragraph(_docx_text(row["original_translated"]))
            continue

        table_number = row["paragraph"]
        table_df = translated_tables.get(table_number)
        if table_df is None:
            # No table was captured for this placeholder; nothing to insert.
            continue

        # str() keeps the filter safe when a column label is not a string.
        translated_cols = [col for col in table_df.columns if "_translated" in str(col)]
        if not translated_cols:
            print(f"⚠ Warning: No '_translated' columns found for table at paragraph {table_number}")
            continue

        table_df = table_df[translated_cols]  # keep only the translated columns
        table = doc.add_table(rows=len(table_df), cols=len(table_df.columns))
        for i, row_data in enumerate(table_df.values):
            for j, cell_text in enumerate(row_data):
                table.cell(i, j).text = _docx_text(cell_text)

    doc.save(output_path)
    return output_path
|
| 256 |
|
| 257 |
def chat_gpt_translate_word(file, sheet_name, col_name, source_lang, target_lang, where_to_place, keep_original, chosen_model, api_key = None, progress=gr.Progress()):
|
| 258 |
+
word_to_excel_file, word_table = extract_word_content_to_excel(file)
|
| 259 |
+
base_translated = chat_gpt_translate_excel(word_to_excel_file,
|
| 260 |
sheet_name="Sheet1",
|
| 261 |
col_name = ['original'],
|
| 262 |
source_lang = source_lang,
|
|
|
|
| 264 |
where_to_place="append_all (ต่อ column สุดท้าย)",
|
| 265 |
keep_original="keep original",
|
| 266 |
chosen_model = chosen_model,
|
| 267 |
+
api_key = api_key,
|
| 268 |
+
return_output='df'
|
| 269 |
)
|
| 270 |
+
# Translate Tables
|
| 271 |
+
translated_tables = {}
|
| 272 |
+
for key, table_df in word_table.items():
|
| 273 |
+
translated_tables[key] = chat_gpt_translate_excel(
|
| 274 |
+
file="extracted_tables.xlsx",
|
| 275 |
+
sheet_name=f"Table_{key}",
|
| 276 |
+
col_name=table_df.columns.tolist(),
|
| 277 |
+
source_lang=source_lang,
|
| 278 |
+
target_lang=target_lang,
|
| 279 |
+
where_to_place="append_all (ต่อ column สุดท้าย)",
|
| 280 |
+
keep_original="keep original",
|
| 281 |
+
chosen_model=chosen_model,
|
| 282 |
+
api_key=api_key,
|
| 283 |
+
return_output='df'
|
| 284 |
+
)
|
| 285 |
+
output_file = reconstruct_word(base_translated, translated_tables, file)
|
| 286 |
+
|
| 287 |
+
if os.path.exists('extracted_tables.xlsx'):
|
| 288 |
+
os.remove('extracted_tables.xlsx')
|
| 289 |
+
if os.path.exists('extracted_tables_translated.xlsx'):
|
| 290 |
+
os.remove('extracted_tables_translated.xlsx')
|
| 291 |
+
|
| 292 |
+
if os.path.exists(f"{file.split('/')[-1].split('.')[0]}_translated.xlsx"):
|
| 293 |
+
os.remove(f"{file.split('/')[-1].split('.')[0]}_translated.xlsx")
|
| 294 |
+
return output_file
|
| 295 |
|
| 296 |
if __name__ == "__main__" :
|
| 297 |
|
|
|
|
| 318 |
'translated_column']
|
| 319 |
, interactive=True
|
| 320 |
)
|
| 321 |
+
|
| 322 |
def check_file_type(file):
|
| 323 |
""" ตรวจสอบว่าไฟล์ที่อัปโหลดเป็น Word หรือ Excel """
|
| 324 |
file_extension = os.path.splitext(file.name)[-1].lower()
|
| 325 |
|
| 326 |
if file_extension in [".docx", ".doc"]:
|
| 327 |
+
return gr.update(choices=['all paragraphs only', 'specified paragraph or page (Developing ...)']
|
| 328 |
+
, interactive=False
|
| 329 |
+
)
|
| 330 |
elif file_extension in [".xlsx", ".xls"]:
|
| 331 |
return update_sheets(file)
|
| 332 |
else:
|
|
|
|
| 337 |
if file is None:
|
| 338 |
return "No file uploaded"
|
| 339 |
return check_file_type(file)
|
| 340 |
+
|
| 341 |
+
|
| 342 |
def get_sheet_names(file):
    """Return the worksheet names of the uploaded Excel workbook.

    Args:
        file: Object whose ``name`` attribute is the path of the workbook.

    Returns:
        The sheet names reported by ``pd.ExcelFile.sheet_names``.
    """
    # Open the workbook as a context manager so its file handle is closed as
    # soon as the names are read, rather than whenever it is garbage-collected.
    with pd.ExcelFile(file.name) as workbook:
        return workbook.sheet_names
|
|
|
|
| 366 |
|
| 367 |
model_choosing = gr.Dropdown(multiselect = False ,
|
| 368 |
label = "Choosing Model you want",
|
| 369 |
+
choices = ['ChatGPT (4o-mini)', 'Deepseek (developing ...)', 'another (In Progress)']
|
| 370 |
, interactive=True
|
| 371 |
)
|
| 372 |
|
|
|
|
| 376 |
|
| 377 |
# Unified translation function
|
| 378 |
def translate_excel(
|
| 379 |
+
file, sheet_name, columns, source_lang, target_lang, place_option, keep_opt, model, api_key
|
| 380 |
+
):
|
| 381 |
if os.path.splitext(file.name)[-1].lower() in [".xlsx", ".xls"]:
|
| 382 |
if model == "ChatGPT (4o-mini)":
|
| 383 |
# Call ChatGPT-based translation
|
|
|
|
| 394 |
else:
|
| 395 |
# Handle other models (currently in progress)
|
| 396 |
raise gr.Error("Translation with the selected model is not yet implemented.")
|
| 397 |
+
|
| 398 |
+
else:
|
| 399 |
+
print('No Type of Input Supported')
|
| 400 |
+
|
| 401 |
# Register button click
|
| 402 |
translate_button.click(
|
| 403 |
fn=translate_excel,
|
|
|
|
| 414 |
],
|
| 415 |
outputs=output_file,
|
| 416 |
)
|
|
|
|
|
|
|
| 417 |
|
| 418 |
iface.launch(debug=True, share=True,
|
| 419 |
server_port= 7861,
|