Petch DS committed on
Commit
2ddff94
·
1 Parent(s): ce1ab91

Final update add docx

Browse files
Files changed (1) hide show
  1. translator_app.py +115 -19
translator_app.py CHANGED
@@ -38,7 +38,7 @@ def chat_gpt_4o_mini(api_key = None):
38
  return chain
39
 
40
 
41
- def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lang, where_to_place, keep_original, chosen_model, api_key = None, progress=gr.Progress()):
42
  if where_to_place is None:
43
  where_to_place = 'append_all'
44
 
@@ -48,10 +48,13 @@ def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lan
48
  df = file.copy()
49
  output_file = f"{file.name.unique()[0].split('.')[0]}_translated.xlsx"
50
  df = df.drop(columns=['name'])
 
 
 
51
  else:
52
  df = pd.read_excel(file.name, sheet_name=sheet_name, header=0)
53
  output_file = f"{file.name.split('.')[0]}_translated.xlsx"
54
-
55
  original_col = df.columns
56
  total_columns = len(df.columns)
57
  current_step = 0
@@ -69,10 +72,11 @@ def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lan
69
  translation_map = {}
70
  trans_col_name = []
71
 
 
72
  # Process the selected columns for translation
73
  for idx, col in enumerate(col_name):
74
  current_step += 1
75
- progress(current_step / total_columns, desc=f"Translating {col} ({current_step}/{total_columns})...")
76
 
77
  try:
78
  # Extract unique values from the column
@@ -97,14 +101,14 @@ def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lan
97
  print(f"Error in column {col}: {e}")
98
  continue
99
 
100
- # # Process remaining columns
101
  # for column in remain_col:
102
  # current_step += 1
103
- # progress(current_step / total_columns, desc=f"Translating column name: {column} ({current_step}/{len(remain_col)})...")
104
 
105
  # try:
106
- # # We do not translate remain_col which remaining col
107
- # # remain_col = chain.batch([{"sentence": column, "source_lang": source_lang, "target_lang": target_lang}])
108
  # name_col = column + '_translated' # Assuming the translation returns a list of translations
109
  # df.loc[:, name_col] = df.loc[:, column]
110
 
@@ -119,6 +123,7 @@ def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lan
119
  output_col = original_col
120
  else:
121
  output_col = col_name
 
122
  try:
123
  if where_to_place == 'append_all (ต่อ column สุดท้าย)':
124
  final_cols = list(output_col) + [col for col in trans_col_name]
@@ -162,13 +167,21 @@ def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lan
162
  raise gr.Error(f"Error saving the file: {e}")
163
 
164
  progress(1.0, desc="Completed all tasks!")
165
- return output_file
 
 
 
 
 
 
 
166
 
167
  def extract_word_content_to_excel(file_path):
168
  """ ดึงเนื้อหา + รูปภาพจากไฟล์ Word และบันทึกเป็น Excel """
169
  doc = Document(file_path)
170
 
171
  data = []
 
172
  paragraph_count = 0
173
 
174
  for element in doc.element.body:
@@ -181,6 +194,21 @@ def extract_word_content_to_excel(file_path):
181
  paragraph_count += 1
182
  data.append([paragraph_count, "[Table]"])
183
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  elif element.tag.endswith("drawing"): # Image (รูปภาพ)
185
  paragraph_count += 1
186
  data.append([paragraph_count, "[Image]"])
@@ -188,11 +216,47 @@ def extract_word_content_to_excel(file_path):
188
  # สร้าง DataFrame
189
  df = pd.DataFrame(data, columns=["paragraph", "original"])
190
  df['name'] = file_path.split('/')[-1]
191
- return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
 
193
  def chat_gpt_translate_word(file, sheet_name, col_name, source_lang, target_lang, where_to_place, keep_original, chosen_model, api_key = None, progress=gr.Progress()):
194
- word_to_excel_file = extract_word_content_to_excel(file)
195
- return chat_gpt_translate_excel(word_to_excel_file,
196
  sheet_name="Sheet1",
197
  col_name = ['original'],
198
  source_lang = source_lang,
@@ -200,8 +264,34 @@ def chat_gpt_translate_word(file, sheet_name, col_name, source_lang, target_lang
200
  where_to_place="append_all (ต่อ column สุดท้าย)",
201
  keep_original="keep original",
202
  chosen_model = chosen_model,
203
- api_key = api_key
 
204
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
  if __name__ == "__main__" :
207
 
@@ -228,12 +318,15 @@ if __name__ == "__main__" :
228
  'translated_column']
229
  , interactive=True
230
  )
 
231
  def check_file_type(file):
232
  """ ตรวจสอบว่าไฟล์ที่อัปโหลดเป็น Word หรือ Excel """
233
  file_extension = os.path.splitext(file.name)[-1].lower()
234
 
235
  if file_extension in [".docx", ".doc"]:
236
- return gr.update(choices=['all paragraphs only', 'specified paragraph or page (Developing ...)'], interactive=False)
 
 
237
  elif file_extension in [".xlsx", ".xls"]:
238
  return update_sheets(file)
239
  else:
@@ -244,7 +337,8 @@ if __name__ == "__main__" :
244
  if file is None:
245
  return "No file uploaded"
246
  return check_file_type(file)
247
-
 
248
  def get_sheet_names(file):
249
  xls = pd.ExcelFile(file.name)
250
  return xls.sheet_names
@@ -272,7 +366,7 @@ if __name__ == "__main__" :
272
 
273
  model_choosing = gr.Dropdown(multiselect = False ,
274
  label = "Choosing Model you want",
275
- choices = ['ChatGPT (4o-mini)', 'DeepSeek (developing...)', 'another (In Progress)']
276
  , interactive=True
277
  )
278
 
@@ -282,8 +376,8 @@ if __name__ == "__main__" :
282
 
283
  # Unified translation function
284
  def translate_excel(
285
- file, sheet_name, columns, source_lang, target_lang, place_option, keep_opt, model, api_key
286
- ):
287
  if os.path.splitext(file.name)[-1].lower() in [".xlsx", ".xls"]:
288
  if model == "ChatGPT (4o-mini)":
289
  # Call ChatGPT-based translation
@@ -300,6 +394,10 @@ if __name__ == "__main__" :
300
  else:
301
  # Handle other models (currently in progress)
302
  raise gr.Error("Translation with the selected model is not yet implemented.")
 
 
 
 
303
  # Register button click
304
  translate_button.click(
305
  fn=translate_excel,
@@ -316,8 +414,6 @@ if __name__ == "__main__" :
316
  ],
317
  outputs=output_file,
318
  )
319
-
320
-
321
 
322
  iface.launch(debug=True, share=True,
323
  server_port= 7861,
 
38
  return chain
39
 
40
 
41
+ def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lang, where_to_place, keep_original, chosen_model, api_key = None, progress=gr.Progress(), return_output = 'file'):
42
  if where_to_place is None:
43
  where_to_place = 'append_all'
44
 
 
48
  df = file.copy()
49
  output_file = f"{file.name.unique()[0].split('.')[0]}_translated.xlsx"
50
  df = df.drop(columns=['name'])
51
+ elif isinstance(file, str):
52
+ df = pd.read_excel(file, sheet_name=sheet_name, header=0)
53
+ output_file = f"{file.split('.')[0]}_translated.xlsx"
54
  else:
55
  df = pd.read_excel(file.name, sheet_name=sheet_name, header=0)
56
  output_file = f"{file.name.split('.')[0]}_translated.xlsx"
57
+
58
  original_col = df.columns
59
  total_columns = len(df.columns)
60
  current_step = 0
 
72
  translation_map = {}
73
  trans_col_name = []
74
 
75
+
76
  # Process the selected columns for translation
77
  for idx, col in enumerate(col_name):
78
  current_step += 1
79
+ progress(current_step / total_columns, desc=f"Translating {col} ({current_step}/{len(col_name)})...")
80
 
81
  try:
82
  # Extract unique values from the column
 
101
  print(f"Error in column {col}: {e}")
102
  continue
103
 
104
+ # Process remaining columns
105
  # for column in remain_col:
106
  # current_step += 1
107
+ # progress(current_step / total_columns, desc=f"Translating column name: {column} ({current_step}/{total_columns})...")
108
 
109
  # try:
110
+ # # We do not translate all_col which remaining col
111
+ # # all_col_translation = chain.batch([{"sentence": column, "source_lang": source_lang, "target_lang": target_lang}])
112
  # name_col = column + '_translated' # Assuming the translation returns a list of translations
113
  # df.loc[:, name_col] = df.loc[:, column]
114
 
 
123
  output_col = original_col
124
  else:
125
  output_col = col_name
126
+
127
  try:
128
  if where_to_place == 'append_all (ต่อ column สุดท้าย)':
129
  final_cols = list(output_col) + [col for col in trans_col_name]
 
167
  raise gr.Error(f"Error saving the file: {e}")
168
 
169
  progress(1.0, desc="Completed all tasks!")
170
+
171
+ if return_output == 'file':
172
+ return output_file
173
+ elif return_output == 'df':
174
+ return result
175
+ else:
176
+ return output_file
177
+
178
 
179
  def extract_word_content_to_excel(file_path):
180
  """ ดึงเนื้อหา + รูปภาพจากไฟล์ Word และบันทึกเป็น Excel """
181
  doc = Document(file_path)
182
 
183
  data = []
184
+ table_dict = {}
185
  paragraph_count = 0
186
 
187
  for element in doc.element.body:
 
194
  paragraph_count += 1
195
  data.append([paragraph_count, "[Table]"])
196
 
197
+ # Extract table content
198
+ table = doc.tables[len(table_dict)] # Get current table
199
+ table_data = []
200
+ for row in table.rows:
201
+ row_data = [cell.text.strip() for cell in row.cells]
202
+ table_data.append(row_data)
203
+
204
+ # Generate dynamic column names ('object_0', 'object_1', ...)
205
+ max_cols = max(len(row) for row in table_data) if table_data else 0
206
+ column_names = [f"object_{i}" for i in range(max_cols)]
207
+
208
+ # Store table as DataFrame
209
+ table_dict[paragraph_count] = pd.DataFrame(table_data, columns=column_names)
210
+
211
+
212
  elif element.tag.endswith("drawing"): # Image (รูปภาพ)
213
  paragraph_count += 1
214
  data.append([paragraph_count, "[Image]"])
 
216
  # สร้าง DataFrame
217
  df = pd.DataFrame(data, columns=["paragraph", "original"])
218
  df['name'] = file_path.split('/')[-1]
219
+
220
+ with pd.ExcelWriter("extracted_tables.xlsx") as writer:
221
+ for key, table_df in table_dict.items():
222
+ table_df.to_excel(writer, sheet_name=f"Table_{key}", index=False)
223
+
224
+ return df, table_dict
225
+
226
def _as_docx_text(value):
    """Return *value* as a plain string suitable for python-docx text fields.

    Strings pass through unchanged, missing values (None / NaN / NA) become
    an empty string, and anything else is converted with ``str``.
    """
    if isinstance(value, str):
        return value
    # Cell values coming out of a DataFrame are scalars, so pd.isna yields a bool here.
    if value is None or pd.isna(value):
        return ""
    return str(value)


def reconstruct_word(paragraph_df, translated_tables, file_path):
    """Rebuild a Word document from translated paragraphs and tables.

    Args:
        paragraph_df: DataFrame that is iterated row by row; each row is read
            through its "paragraph", "original" and "original_translated"
            columns. Rows whose "original" value is "[Table]" mark a table
            position.
        translated_tables: Mapping from the "paragraph" value of a "[Table]"
            row to a DataFrame; only columns whose name contains
            "_translated" are written into the document.
        file_path: Path of the source document. The result is saved next to
            it with the extension replaced by "_translated.docx".

    Returns:
        The path of the saved .docx file.
    """
    doc = Document()
    # splitext strips only the final extension, so dots in directory names or
    # in the file stem (e.g. "./docs/report.v2.docx") no longer truncate the path.
    base_path, _ = os.path.splitext(file_path)
    output_path = f"{base_path}_translated.docx"

    for _, row in paragraph_df.iterrows():
        if row["original"] != "[Table]":
            # Every non-table row is emitted as one paragraph of translated text.
            doc.add_paragraph(_as_docx_text(row["original_translated"]))
            continue

        table_number = row["paragraph"]
        table_df = translated_tables.get(table_number)
        if table_df is None:
            # No translated table registered for this position: nothing to insert.
            continue

        # Keep only the columns that hold translated text.
        translated_cols = [col for col in table_df.columns if '_translated' in col]
        if not translated_cols:
            print(f"⚠ Warning: No '_translated' columns found for table at paragraph {table_number}")
            continue

        table_df = table_df[translated_cols]
        table = doc.add_table(rows=len(table_df), cols=len(table_df.columns))

        for i, row_data in enumerate(table_df.values):
            for j, cell_value in enumerate(row_data):
                # Coerce to str: a NaN or numeric cell would otherwise be
                # rejected when assigned as cell text.
                table.cell(i, j).text = _as_docx_text(cell_value)

    doc.save(output_path)
    return output_path
256
 
257
  def chat_gpt_translate_word(file, sheet_name, col_name, source_lang, target_lang, where_to_place, keep_original, chosen_model, api_key = None, progress=gr.Progress()):
258
+ word_to_excel_file, word_table = extract_word_content_to_excel(file)
259
+ base_translated = chat_gpt_translate_excel(word_to_excel_file,
260
  sheet_name="Sheet1",
261
  col_name = ['original'],
262
  source_lang = source_lang,
 
264
  where_to_place="append_all (ต่อ column สุดท้าย)",
265
  keep_original="keep original",
266
  chosen_model = chosen_model,
267
+ api_key = api_key,
268
+ return_output='df'
269
  )
270
+ # Translate Tables
271
+ translated_tables = {}
272
+ for key, table_df in word_table.items():
273
+ translated_tables[key] = chat_gpt_translate_excel(
274
+ file="extracted_tables.xlsx",
275
+ sheet_name=f"Table_{key}",
276
+ col_name=table_df.columns.tolist(),
277
+ source_lang=source_lang,
278
+ target_lang=target_lang,
279
+ where_to_place="append_all (ต่อ column สุดท้าย)",
280
+ keep_original="keep original",
281
+ chosen_model=chosen_model,
282
+ api_key=api_key,
283
+ return_output='df'
284
+ )
285
+ output_file = reconstruct_word(base_translated, translated_tables, file)
286
+
287
+ if os.path.exists('extracted_tables.xlsx'):
288
+ os.remove('extracted_tables.xlsx')
289
+ if os.path.exists('extracted_tables_translated.xlsx'):
290
+ os.remove('extracted_tables_translated.xlsx')
291
+
292
+ if os.path.exists(f"{file.split('/')[-1].split('.')[0]}_translated.xlsx"):
293
+ os.remove(f"{file.split('/')[-1].split('.')[0]}_translated.xlsx")
294
+ return output_file
295
 
296
  if __name__ == "__main__" :
297
 
 
318
  'translated_column']
319
  , interactive=True
320
  )
321
+
322
  def check_file_type(file):
323
  """ ตรวจสอบว่าไฟล์ที่อัปโหลดเป็น Word หรือ Excel """
324
  file_extension = os.path.splitext(file.name)[-1].lower()
325
 
326
  if file_extension in [".docx", ".doc"]:
327
+ return gr.update(choices=['all paragraphs only', 'specified paragraph or page (Developing ...)']
328
+ , interactive=False
329
+ )
330
  elif file_extension in [".xlsx", ".xls"]:
331
  return update_sheets(file)
332
  else:
 
337
  if file is None:
338
  return "No file uploaded"
339
  return check_file_type(file)
340
+
341
+
342
  def get_sheet_names(file):
343
  xls = pd.ExcelFile(file.name)
344
  return xls.sheet_names
 
366
 
367
  model_choosing = gr.Dropdown(multiselect = False ,
368
  label = "Choosing Model you want",
369
+ choices = ['ChatGPT (4o-mini)', 'Deepseek (developing ...)', 'another (In Progress)']
370
  , interactive=True
371
  )
372
 
 
376
 
377
  # Unified translation function
378
  def translate_excel(
379
+ file, sheet_name, columns, source_lang, target_lang, place_option, keep_opt, model, api_key
380
+ ):
381
  if os.path.splitext(file.name)[-1].lower() in [".xlsx", ".xls"]:
382
  if model == "ChatGPT (4o-mini)":
383
  # Call ChatGPT-based translation
 
394
  else:
395
  # Handle other models (currently in progress)
396
  raise gr.Error("Translation with the selected model is not yet implemented.")
397
+
398
+ else:
399
+ print('No Type of Input Supported')
400
+
401
  # Register button click
402
  translate_button.click(
403
  fn=translate_excel,
 
414
  ],
415
  outputs=output_file,
416
  )
 
 
417
 
418
  iface.launch(debug=True, share=True,
419
  server_port= 7861,