Spaces:

petchsko
/

Translator_app

Sleeping

App Files Files Community

Petch DS committed on Feb 5, 2025

Commit

0a3b2e2

1 Parent(s): 74a4b1d

fix output format

Browse files

Files changed (3) hide show

requirements.txt +1 -1
translator_app.ipynb +132 -37
translator_app.py +35 -22

requirements.txt CHANGED Viewed

@@ -4,4 +4,4 @@ langchain-openai
 xlsxwriter==3.2.0
 pandas==2.0.3
 numpy==1.24.3
-openpyxl==3.1.5

 xlsxwriter==3.2.0
 pandas==2.0.3
 numpy==1.24.3
+openpyxl==3.1.5

translator_app.ipynb CHANGED Viewed

@@ -20,7 +20,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -30,9 +30,77 @@
     "from langchain_core.prompts import PromptTemplate\n",
     "from langchain_core.runnables import RunnableLambda\n",
     "import gradio as gr\n",
-    "import pandas as pd"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -42,7 +110,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -53,7 +121,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Process"
    ]
   },
   {
@@ -65,16 +133,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
-    "def using_model(chosen_model, api_key):\n",
-    "    if chosen_model == 'ChatGPT (4o-mini)':\n",
-    "        model = chat_gpt_4o_mini(api_key = api_key)\n",
-    "    else:\n",
-    "        pass\n",
-    "    return model\n",
     "\n",
     "def chat_gpt_4o_mini(api_key = None):\n",
     "    model = ChatOpenAI(model_name=\"gpt-4o-mini\", api_key=api_key)\n",
@@ -96,7 +158,22 @@
     "\n",
     "    chain = prompt | model | output_parser | RunnableLambda(get_class)  \n",
     "\n",
-    "    return chain"
    ]
   },
   {
@@ -108,7 +185,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/",
@@ -122,11 +199,12 @@
     "\n",
     "def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lang, where_to_place, keep_original, chosen_model, api_key = None, progress=gr.Progress()):\n",
     "    if where_to_place is None:\n",
-    "        where_to_place = 'append_all'\n",
     "\n",
     "    model = using_model(chosen_model = chosen_model, api_key = api_key)\n",
     "\n",
     "    df = pd.read_excel(file.name, sheet_name=sheet_name, header=0)\n",
     "    original_col = df.columns\n",
     "    total_columns = len(df.columns)\n",
     "    current_step = 0\n",
@@ -134,20 +212,23 @@
     "    progress(0, desc=\"Starting translation process...\")\n",
     "\n",
     "    # Automatically detect string columns if col_name is None\n",
     "    if col_name is None:\n",
     "        col_name = [col for col in df.columns if df[col].dtype == 'object']\n",
     "\n",
     "    # Determine columns that are not selected for translation\n",
-    "    all_col = [col for col in df.columns if col not in col_name]\n",
     "\n",
     "    # Dictionary to store unique values and their translations\n",
     "    translation_map = {}\n",
     "    print(col_name)\n",
     "\n",
     "    # Process the selected columns for translation\n",
     "    for idx, col in enumerate(col_name):\n",
     "        current_step += 1\n",
-    "        progress(current_step / total_columns, desc=f\"Translating {col} ({current_step}/{total_columns})...\")\n",
     "\n",
     "        try:\n",
     "            # Extract unique values from the column\n",
@@ -164,6 +245,7 @@
     "            translations = dict(zip(unique_values, answers))\n",
     "            translation_map[col] = translations\n",
     "\n",
     "            # Map translations back to the original DataFrame\n",
     "            df[col + \"_translated\"] = df[col].map(translations).fillna(df[col])\n",
     "\n",
@@ -171,50 +253,63 @@
     "            print(f\"Error in column {col}: {e}\")\n",
     "            continue\n",
     "\n",
-    "    # Process remaining columns\n",
-    "    for column in all_col:\n",
-    "        current_step += 1\n",
-    "        progress(current_step / total_columns, desc=f\"Translating column name: {column} ({current_step}/{total_columns})...\")\n",
     "\n",
-    "        try:\n",
-    "            # We do not translate all_col which remaining col\n",
-    "            # all_col_translation = chain.batch([{\"sentence\": column, \"source_lang\": source_lang, \"target_lang\": target_lang}])\n",
-    "            name_col = column + '_translated'  # Assuming the translation returns a list of translations\n",
-    "            df.loc[:, name_col] = df.loc[:, column]\n",
     "\n",
-    "        except Exception as e:\n",
-    "            print(f\"Error in column {column}: {e}\")\n",
-    "            continue\n",
     "\n",
     "    \n",
-    "    output_file = \"translated_output.xlsx\"\n",
     "    if not os.path.exists(output_file):\n",
     "        pd.DataFrame().to_excel(output_file, index=False)\n",
     "\n",
     "    if keep_original == 'keep original':\n",
     "        output_col = original_col\n",
     "    else:\n",
     "        output_col = col_name\n",
     "\n",
     "        \n",
     "    try:\n",
     "        if where_to_place == 'append_all (ต่อ column สุดท้าย)':\n",
-    "            final_cols = list(output_col) + [col + '_translated' for col in output_col]\n",
     "            result = df[final_cols]\n",
     "            result.to_excel(output_file, index=False)\n",
     "        elif where_to_place == 'append_compare (เปรียบเทียบ column by column)':\n",
     "            final_cols = []\n",
     "            for col in output_col:\n",
-    "                final_cols = final_cols + [col, col + '_translated']\n",
     "            result = df[final_cols]\n",
     "            result.to_excel(output_file, index=False)\n",
     "        elif where_to_place == 'replace':\n",
-    "            final_cols = [col + '_translated' for col in output_col] \n",
     "            result = df[final_cols]\n",
     "            result.to_excel(output_file, index=False)\n",
     "        elif where_to_place == 'new_sheet':\n",
     "            final_cols = [col for col in output_col]\n",
-    "            new_tab_cols = [col + '_translated' for col in output_col]\n",
     "\n",
     "            result = df[final_cols]\n",
     "            result1 = df[new_tab_cols]\n",
@@ -242,7 +337,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {
     "id": "x8Njoc4fROSp"
    },
@@ -281,7 +376,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "['Thai', 'English', 'ABC']\n",
       "Keyboard interruption in main thread... closing server.\n"
      ]
     },
@@ -289,7 +384,7 @@
      "data": {
       "text/plain": []
      },
-     "execution_count": 31,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -341,7 +436,7 @@
     "\n",
     "    model_choosing = gr.Dropdown(multiselect = False , \n",
     "                                 label = \"Choosing Model you want\", \n",
-    "                                 choices = ['ChatGPT (4o-mini)', 'another (In Progress)']\n",
     "                                 , interactive=True\n",
     "                                 )\n",
     "\n",

   },
   {
    "cell_type": "code",
+   "execution_count": 21,
    "metadata": {},
    "outputs": [],
    "source": [
     "from langchain_core.prompts import PromptTemplate\n",
     "from langchain_core.runnables import RunnableLambda\n",
     "import gradio as gr\n",
+    "import pandas as pd\n",
+    "from transformers import T5Tokenizer, T5ForConditionalGeneration\n",
+    "import torch\n"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# from docx import Document\n",
+    "\n",
+    "# # โหลดไฟล์ Word\n",
+    "# doc = Document('test_file.docx')\n",
+    "\n",
+    "# # อ่านทุก paragraph และแสดงเนื้อหา\n",
+    "# for para in doc.paragraphs:\n",
+    "#     print(para.text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # อ่านทุกตารางในเอกสาร\n",
+    "# for table in doc.tables:\n",
+    "#     for row in table.rows:\n",
+    "#         for cell in row.cells:\n",
+    "#             print(cell.text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# for element in doc.element.body:\n",
+    "#     if element.tag.endswith('tbl'):\n",
+    "#         # ถ้าเป็นตาราง\n",
+    "#         print('Table found')\n",
+    "#     elif element.tag.endswith('p'):\n",
+    "#         # ถ้าเป็นพารากราฟ\n",
+    "#         print('Paragraph found')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # นับรูปภาพ\n",
+    "# images = doc.inline_shapes\n",
+    "# print(\"Found\", len(images), \"images\")\n",
+    "\n",
+    "# # ตัวอย่างวิธีดึงข้อมูลพื้นฐานของรูปภาพแต่ละรูป\n",
+    "# for image in images:\n",
+    "#     print(\"Image size:\", image.width.pt, \"x\", image.height.pt) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
   },
   {
    "cell_type": "code",
+   "execution_count": 26,
    "metadata": {},
    "outputs": [],
    "source": [
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "# Process for Each Model"
    ]
   },
   {
   },
   {
    "cell_type": "code",
+   "execution_count": 27,
    "metadata": {},
    "outputs": [],
    "source": [
     "\n",
     "def chat_gpt_4o_mini(api_key = None):\n",
     "    model = ChatOpenAI(model_name=\"gpt-4o-mini\", api_key=api_key)\n",
     "\n",
     "    chain = prompt | model | output_parser | RunnableLambda(get_class)  \n",
     "\n",
+    "    return chain\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "def using_model(chosen_model, api_key=None):\n",
+    "    if chosen_model == 'ChatGPT (4o-mini)':\n",
+    "        return chat_gpt_4o_mini(api_key=api_key)\n",
+    "    else:\n",
+    "        raise ValueError(\"Unsupported model selected\")\n"
    ]
   },
   {
   },
   {
    "cell_type": "code",
+   "execution_count": 35,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/",
     "\n",
     "def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lang, where_to_place, keep_original, chosen_model, api_key = None, progress=gr.Progress()):\n",
     "    if where_to_place is None:\n",
+    "        where_to_place = 'append_all (ต่อ column สุดท้าย)'\n",
     "\n",
     "    model = using_model(chosen_model = chosen_model, api_key = api_key)\n",
     "\n",
     "    df = pd.read_excel(file.name, sheet_name=sheet_name, header=0)\n",
+    "    # original columns = col_name + remain_col\n",
     "    original_col = df.columns\n",
     "    total_columns = len(df.columns)\n",
     "    current_step = 0\n",
     "    progress(0, desc=\"Starting translation process...\")\n",
     "\n",
     "    # Automatically detect string columns if col_name is None\n",
+    "    # col_name is column we want to translate\n",
     "    if col_name is None:\n",
     "        col_name = [col for col in df.columns if df[col].dtype == 'object']\n",
     "\n",
     "    # Determine columns that are not selected for translation\n",
+    "    # remain_col is column we do not want to translate\n",
+    "    remain_col = [col for col in df.columns if col not in col_name]\n",
     "\n",
     "    # Dictionary to store unique values and their translations\n",
     "    translation_map = {}\n",
+    "    trans_col_name = []\n",
     "    print(col_name)\n",
     "\n",
     "    # Process the selected columns for translation\n",
     "    for idx, col in enumerate(col_name):\n",
     "        current_step += 1\n",
+    "        progress(current_step / total_columns, desc=f\"Translating {col} ({current_step}/{len(col_name)})...\")\n",
     "\n",
     "        try:\n",
     "            # Extract unique values from the column\n",
     "            translations = dict(zip(unique_values, answers))\n",
     "            translation_map[col] = translations\n",
     "\n",
+    "            trans_col_name.append(col + \"_translated\")\n",
     "            # Map translations back to the original DataFrame\n",
     "            df[col + \"_translated\"] = df[col].map(translations).fillna(df[col])\n",
     "\n",
     "            print(f\"Error in column {col}: {e}\")\n",
     "            continue\n",
     "\n",
+    "    # # Process remaining columns\n",
+    "    # for column in remain_col:\n",
+    "    #     current_step += 1\n",
+    "    #     progress(current_step / total_columns, desc=f\"Translating column name: {column} ({current_step}/{len(remain_col)})...\")\n",
     "\n",
+    "    #     try:\n",
+    "    #         # We do not translate remain_col which remaining col\n",
+    "    #         # remain_col = chain.batch([{\"sentence\": column, \"source_lang\": source_lang, \"target_lang\": target_lang}])\n",
+    "    #         name_col = column + '_translated'  # Assuming the translation returns a list of translations\n",
+    "    #         df.loc[:, name_col] = df.loc[:, column]\n",
     "\n",
+    "    #     except Exception as e:\n",
+    "    #         print(f\"Error in column {column}: {e}\")\n",
+    "    #         continue\n",
     "\n",
     "    \n",
+    "    output_file = f\"{file.name.split('.')[0]}_translated.xlsx\"\n",
     "    if not os.path.exists(output_file):\n",
     "        pd.DataFrame().to_excel(output_file, index=False)\n",
     "\n",
     "    if keep_original == 'keep original':\n",
+    "        # have the all columns\n",
     "        output_col = original_col\n",
     "    else:\n",
+    "        # only translated column\n",
     "        output_col = col_name\n",
     "\n",
     "        \n",
     "    try:\n",
     "        if where_to_place == 'append_all (ต่อ column สุดท้าย)':\n",
+    "            final_cols = list(output_col) + [col for col in trans_col_name]\n",
     "            result = df[final_cols]\n",
     "            result.to_excel(output_file, index=False)\n",
     "        elif where_to_place == 'append_compare (เปรียบเทียบ column by column)':\n",
     "            final_cols = []\n",
     "            for col in output_col:\n",
+    "                for trans_col in trans_col_name:\n",
+    "                    if col + '_translated' == trans_col:\n",
+    "                        final_cols = final_cols + [col, trans_col]\n",
+    "                    else:\n",
+    "                        final_cols = final_cols + [col]\n",
     "            result = df[final_cols]\n",
     "            result.to_excel(output_file, index=False)\n",
     "        elif where_to_place == 'replace':\n",
+    "            final_cols = []\n",
+    "            for col in output_col:\n",
+    "                for trans_col in trans_col_name:\n",
+    "                    if col + '_translated' == trans_col:\n",
+    "                        final_cols = final_cols + [trans_col]\n",
+    "                    else:\n",
+    "                        final_cols = final_cols + [col]\n",
     "            result = df[final_cols]\n",
     "            result.to_excel(output_file, index=False)\n",
+    "\n",
     "        elif where_to_place == 'new_sheet':\n",
     "            final_cols = [col for col in output_col]\n",
+    "            new_tab_cols = trans_col_name\n",
     "\n",
     "            result = df[final_cols]\n",
     "            result1 = df[new_tab_cols]\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 36,
    "metadata": {
     "id": "x8Njoc4fROSp"
    },
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "['control type']\n",
       "Keyboard interruption in main thread... closing server.\n"
      ]
     },
      "data": {
       "text/plain": []
      },
+     "execution_count": 36,
      "metadata": {},
      "output_type": "execute_result"
     }
     "\n",
     "    model_choosing = gr.Dropdown(multiselect = False , \n",
     "                                 label = \"Choosing Model you want\", \n",
+    "                                 choices = ['ChatGPT (4o-mini)', 'DeepSeek (developing...)','another (In Progress)']\n",
     "                                 , interactive=True\n",
     "                                 )\n",
     "\n",

translator_app.py CHANGED Viewed

@@ -56,11 +56,11 @@ def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lan
         col_name = [col for col in df.columns if df[col].dtype == 'object']
     # Determine columns that are not selected for translation
-    all_col = [col for col in df.columns if col not in col_name]
     # Dictionary to store unique values and their translations
     translation_map = {}
-    print(col_name)
     # Process the selected columns for translation
     for idx, col in enumerate(col_name):
@@ -82,6 +82,7 @@ def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lan
             translations = dict(zip(unique_values, answers))
             translation_map[col] = translations
             # Map translations back to the original DataFrame
             df[col + "_translated"] = df[col].map(translations).fillna(df[col])
@@ -89,23 +90,23 @@ def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lan
             print(f"Error in column {col}: {e}")
             continue
-    # Process remaining columns
-    for column in all_col:
-        current_step += 1
-        progress(current_step / total_columns, desc=f"Translating column name: {column} ({current_step}/{total_columns})...")
-        try:
-            # We do not translate all_col which remaining col
-            # all_col_translation = chain.batch([{"sentence": column, "source_lang": source_lang, "target_lang": target_lang}])
-            name_col = column + '_translated'  # Assuming the translation returns a list of translations
-            df.loc[:, name_col] = df.loc[:, column]
-        except Exception as e:
-            print(f"Error in column {column}: {e}")
-            continue
-    output_file = "translated_output.xlsx"
     if not os.path.exists(output_file):
         pd.DataFrame().to_excel(output_file, index=False)
@@ -115,22 +116,33 @@ def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lan
         output_col = col_name
     try:
         if where_to_place == 'append_all (ต่อ column สุดท้าย)':
-            final_cols = list(output_col) + [col + '_translated' for col in output_col]
             result = df[final_cols]
             result.to_excel(output_file, index=False)
         elif where_to_place == 'append_compare (เปรียบเทียบ column by column)':
             final_cols = []
             for col in output_col:
-                final_cols = final_cols + [col, col + '_translated']
             result = df[final_cols]
             result.to_excel(output_file, index=False)
         elif where_to_place == 'replace':
-            final_cols = [col + '_translated' for col in output_col]
             result = df[final_cols]
             result.to_excel(output_file, index=False)
         elif where_to_place == 'new_sheet':
             final_cols = [col for col in output_col]
-            new_tab_cols = [col + '_translated' for col in output_col]
             result = df[final_cols]
             result1 = df[new_tab_cols]
@@ -195,7 +207,7 @@ if __name__ == "__main__" :
         model_choosing = gr.Dropdown(multiselect = False ,
                                     label = "Choosing Model you want",
-                                    choices = ['ChatGPT (4o-mini)', 'another (In Progress)']
                                     , interactive=True
                                     )
@@ -232,5 +244,6 @@ if __name__ == "__main__" :
             ],
             outputs=output_file,
         )
-    iface.launch(debug=True, share=True, server_port= 7860, server_name="0.0.0.0")

         col_name = [col for col in df.columns if df[col].dtype == 'object']
     # Determine columns that are not selected for translation
+    remain_col = [col for col in df.columns if col not in col_name]
     # Dictionary to store unique values and their translations
     translation_map = {}
+    trans_col_name = []
     # Process the selected columns for translation
     for idx, col in enumerate(col_name):
             translations = dict(zip(unique_values, answers))
             translation_map[col] = translations
+            trans_col_name.append(col + "_translated")
             # Map translations back to the original DataFrame
             df[col + "_translated"] = df[col].map(translations).fillna(df[col])
             print(f"Error in column {col}: {e}")
             continue
+    # # Process remaining columns
+    # for column in remain_col:
+    #     current_step += 1
+    #     progress(current_step / total_columns, desc=f"Translating column name: {column} ({current_step}/{len(remain_col)})...")
+    #     try:
+    #         # We do not translate remain_col which remaining col
+    #         # remain_col = chain.batch([{"sentence": column, "source_lang": source_lang, "target_lang": target_lang}])
+    #         name_col = column + '_translated'  # Assuming the translation returns a list of translations
+    #         df.loc[:, name_col] = df.loc[:, column]
+    #     except Exception as e:
+    #         print(f"Error in column {column}: {e}")
+    #         continue
+    output_file = f"{file.name}_translated.xlsx"
     if not os.path.exists(output_file):
         pd.DataFrame().to_excel(output_file, index=False)
         output_col = col_name
     try:
         if where_to_place == 'append_all (ต่อ column สุดท้าย)':
+            final_cols = list(output_col) + [col for col in trans_col_name]
             result = df[final_cols]
             result.to_excel(output_file, index=False)
         elif where_to_place == 'append_compare (เปรียบเทียบ column by column)':
             final_cols = []
             for col in output_col:
+                for trans_col in trans_col_name:
+                    if col + '_translated' == trans_col:
+                        final_cols = final_cols + [col, trans_col]
+                    else:
+                        final_cols = final_cols + [col]
             result = df[final_cols]
             result.to_excel(output_file, index=False)
         elif where_to_place == 'replace':
+            final_cols = []
+            for col in output_col:
+                for trans_col in trans_col_name:
+                    if col + '_translated' == trans_col:
+                        final_cols = final_cols + [trans_col]
+                    else:
+                        final_cols = final_cols + [col]
             result = df[final_cols]
             result.to_excel(output_file, index=False)
         elif where_to_place == 'new_sheet':
             final_cols = [col for col in output_col]
+            new_tab_cols = trans_col_name
             result = df[final_cols]
             result1 = df[new_tab_cols]
         model_choosing = gr.Dropdown(multiselect = False ,
                                     label = "Choosing Model you want",
+                                    choices = ['ChatGPT (4o-mini)', 'DeepSeek (developing...)', 'another (In Progress)']
                                     , interactive=True
                                     )
             ],
             outputs=output_file,
         )
+    iface.launch(debug=True, share=True, server_port= 7860,
+                 server_name="0.0.0.0"
+                 )