Spaces:

petchsko
/

Translator_app

Sleeping

File size: 18,138 Bytes

import pandas as pd
import os
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda
import gradio as gr
import pandas as pd
from docx import Document

def using_model(chosen_model, api_key):
    if chosen_model == 'ChatGPT (4o-mini)':
        model = chat_gpt_4o_mini(api_key = api_key)
    else:
        pass
    return model

def chat_gpt_4o_mini(api_key = None):
    model = ChatOpenAI(model_name="gpt-4o-mini", api_key=api_key)

    str_prompt ="""
    You will be provided with a sentence in {source_lang}, and your task is to translate it into {target_lang}.
    Answer in Json format with key 'translated'
    Sentence: {sentence}
    """

    output_parser = JsonOutputParser()
    prompt = PromptTemplate(
        template = str_prompt,
        input_variables=["source_lang","target_lang","sentence"],
        partial_variables={"format_instructions": output_parser.get_format_instructions()}
    )
    def get_class(x:dict)->str:
        return x["translated"]

    chain = prompt | model | output_parser | RunnableLambda(get_class)  

    return chain


def chat_gpt_translate_excel(file, sheet_name, col_name, source_lang, target_lang, where_to_place, keep_original, chosen_model, api_key = None, progress=gr.Progress(), return_output = 'file'):
    if where_to_place is None:
        where_to_place = 'append_all'

    model = using_model(chosen_model = chosen_model, api_key = api_key)

    if isinstance(file, pd.DataFrame):
        df = file.copy()
        output_file = f"{file.name.unique()[0].split('.')[0]}_translated.xlsx"
        df = df.drop(columns=['name'])
    elif isinstance(file, str):
        df = pd.read_excel(file, sheet_name=sheet_name, header=0)
        output_file = f"{file.split('.')[0]}_translated.xlsx"
    else:
        df = pd.read_excel(file.name, sheet_name=sheet_name, header=0)
        output_file = f"{file.name.split('.')[0]}_translated.xlsx"

    original_col = df.columns
    total_columns = len(df.columns)
    current_step = 0

    progress(0, desc="Starting translation process...")

    # Automatically detect string columns if col_name is None
    if col_name is None:
        col_name = [col for col in df.columns if df[col].dtype == 'object']

    # Determine columns that are not selected for translation
    remain_col = [col for col in df.columns if col not in col_name]

    # Dictionary to store unique values and their translations
    translation_map = {}
    trans_col_name = []


    # Process the selected columns for translation
    for idx, col in enumerate(col_name):
        current_step += 1
        progress(current_step / total_columns, desc=f"Translating {col} ({current_step}/{len(col_name)})...")

        try:
            # Extract unique values from the column
            unique_values = df[col].dropna().unique()
            unique_values = list(set(unique_values))  # Ensure uniqueness

            # Prepare data for translation
            zh_sentence = [{"sentence": value, "source_lang": source_lang, "target_lang": target_lang} for value in unique_values]

            # Translate unique values
            answers = model.batch(zh_sentence, config={"max_concurrency": 3})
            
            # Create a mapping from original values to translated values
            translations = dict(zip(unique_values, answers))
            translation_map[col] = translations

            trans_col_name.append(col + "_translated")
            # Map translations back to the original DataFrame
            df[col + "_translated"] = df[col].map(translations).fillna(df[col])

        except Exception as e:
            print(f"Error in column {col}: {e}")
            continue

    # Process remaining columns
    # for column in remain_col:
    #     current_step += 1
    #     progress(current_step / total_columns, desc=f"Translating column name: {column} ({current_step}/{total_columns})...")

    #     try:
    #         # We do not translate all_col which remaining col
    #         # all_col_translation = chain.batch([{"sentence": column, "source_lang": source_lang, "target_lang": target_lang}])
    #         name_col = column + '_translated'  # Assuming the translation returns a list of translations
    #         df.loc[:, name_col] = df.loc[:, column]

    #     except Exception as e:
    #         print(f"Error in column {column}: {e}")
    #         continue

    if not os.path.exists(output_file):
        pd.DataFrame().to_excel(output_file, index=False)

    if keep_original == 'keep original':
        output_col = original_col
    else:
        output_col = col_name

    try:
        if where_to_place == 'append_all (ต่อ column สุดท้าย)':
            final_cols = list(output_col) + [col for col in trans_col_name]
            result = df[final_cols]
            result.to_excel(output_file, index=False)
        elif where_to_place == 'append_compare (เปรียบเทียบ column by column)':
            final_cols = []
            for col in output_col:
                for trans_col in trans_col_name:
                    if col + '_translated' == trans_col:
                        final_cols = final_cols + [col, trans_col]
                    else:
                        final_cols = final_cols + [col]
            result = df[final_cols]
            result.to_excel(output_file, index=False)
        elif where_to_place == 'replace':
            final_cols = []
            for col in output_col:
                for trans_col in trans_col_name:
                    if col + '_translated' == trans_col:
                        final_cols = final_cols + [trans_col]
                    else:
                        final_cols = final_cols + [col]
            result = df[final_cols]
            result.to_excel(output_file, index=False)

        elif where_to_place == 'new_sheet':
            final_cols = [col for col in output_col]
            new_tab_cols = trans_col_name

            result = df[final_cols]
            result1 = df[new_tab_cols]
            # Use ExcelWriter to write multiple sheets
            with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
                result.to_excel(writer, sheet_name=sheet_name, index=False)  # First sheet
                result1.to_excel(writer, sheet_name=f'{sheet_name}_translated', index=False)  # Second sheet

        progress(1.0, desc="Saving translated file... Completed!")
    except Exception as e:
        print(f"Error saving the file: {e}")
        raise gr.Error(f"Error saving the file: {e}")

    progress(1.0, desc="Completed all tasks!")

    if return_output == 'file':
        return output_file
    elif return_output == 'df':
        return result
    else:
        return output_file


def extract_word_content_to_excel(file_path):
    """ ดึงเนื้อหา + รูปภาพจากไฟล์ Word และบันทึกเป็น Excel """
    doc = Document(file_path)
    
    data = []
    table_dict = {}
    paragraph_count = 0

    for element in doc.element.body:
        if element.tag.endswith("p"):  # Paragraph
            paragraph_text = element.text.strip()
            paragraph_count += 1
            data.append([paragraph_count, paragraph_text])  # บันทึกพารากราฟ

        elif element.tag.endswith("tbl"):  # Table (ถ้ามี)
            paragraph_count += 1
            data.append([paragraph_count, "[Table]"])

            # Extract table content
            table = doc.tables[len(table_dict)]  # Get current table
            table_data = []
            for row in table.rows:
                row_data = [cell.text.strip() for cell in row.cells]
                table_data.append(row_data)
            
            # Generate dynamic column names ('object_0', 'object_1', ...)
            max_cols = max(len(row) for row in table_data) if table_data else 0
            column_names = [f"object_{i}" for i in range(max_cols)]

            # Store table as DataFrame
            table_dict[paragraph_count] = pd.DataFrame(table_data, columns=column_names)


        elif element.tag.endswith("drawing"):  # Image (รูปภาพ)
            paragraph_count += 1
            data.append([paragraph_count, "[Image]"])

    # สร้าง DataFrame
    df = pd.DataFrame(data, columns=["paragraph", "original"])
    df['name'] = file_path.split('/')[-1]

    with pd.ExcelWriter("extracted_tables.xlsx") as writer:
        for key, table_df in table_dict.items():
            table_df.to_excel(writer, sheet_name=f"Table_{key}", index=False)

    return df, table_dict

def reconstruct_word(paragraph_df, translated_tables, file_path):
    """Reconstruct Word Document from translated content"""
    doc = Document()
    output_path=f"{file_path.split('.')[0]}_translated.docx"

    for _, row in paragraph_df.iterrows():
        if row["original"] == "[Table]":  # Insert Table
            table_number = row["paragraph"]
            table_df = translated_tables.get(table_number)

            if table_df is not None:
                # Filter only columns that contain '_translated'
                translated_cols = [col for col in table_df.columns if '_translated' in col]

                if translated_cols:
                    table_df = table_df[translated_cols]  # Keep only translated columns

                    # Create a table with the filtered columns
                    table = doc.add_table(rows=len(table_df), cols=len(table_df.columns))

                    for i, row_data in enumerate(table_df.values):
                        for j, cell_text in enumerate(row_data):
                            table.cell(i, j).text = cell_text
                else:
                    print(f"⚠ Warning: No '_translated' columns found for table at paragraph {table_number}")
        else:
            if "original_translated" in row:
                doc.add_paragraph(row["original_translated"])
            else:
                doc.add_paragraph("")

    doc.save(output_path)
    return output_path

def chat_gpt_translate_word(file, sheet_name, col_name, source_lang, target_lang, where_to_place, keep_original, chosen_model, api_key = None, progress=gr.Progress()):
    word_to_excel_file, word_table = extract_word_content_to_excel(file)
    base_translated = chat_gpt_translate_excel(word_to_excel_file, 
                             sheet_name="Sheet1", 
                             col_name = ['original'], 
                             source_lang = source_lang, 
                             target_lang = target_lang, 
                             where_to_place="append_all (ต่อ column สุดท้าย)", 
                             keep_original="keep original", 
                             chosen_model = chosen_model, 
                             api_key = api_key,
                             return_output='df'
                             )
    # Translate Tables
    translated_tables = {}
    for key, table_df in word_table.items():
        translated_tables[key] = chat_gpt_translate_excel(
            file="extracted_tables.xlsx",
            sheet_name=f"Table_{key}",
            col_name=table_df.columns.tolist(),
            source_lang=source_lang,
            target_lang=target_lang,
            where_to_place="append_all (ต่อ column สุดท้าย)",
            keep_original="keep original",
            chosen_model=chosen_model,
            api_key=api_key, 
            return_output='df'
        )
    output_file = reconstruct_word(base_translated, translated_tables, file)

    if os.path.exists('extracted_tables.xlsx'):
        os.remove('extracted_tables.xlsx')
    if os.path.exists('extracted_tables_translated.xlsx'):
        os.remove('extracted_tables_translated.xlsx')

    # for deploy huggingface
    if os.path.exists(f"{file.split('.')[0]}_translated.xlsx"):
        os.remove(f"{file.split('.')[0]}_translated.xlsx")

    return output_file

if __name__ == "__main__" :

    with gr.Blocks() as iface:
        gr.Markdown("## Excel Translation Interface")

        excel_file = gr.File(label="Upload Excel File")
        sheet_name = gr.Dropdown(label="Select Sheet", interactive=True)
        column_name= gr.Dropdown(label = "Select Column to Translate (Not require)", multiselect=True, interactive=True)
                                
        with gr.Row():
            source_language = gr.Textbox(label="Source Language Code")
            target_language = gr.Textbox(label="Target Language Code")
        with gr.Row():
            where_to_place = gr.Dropdown(multiselect=False ,label="How translated columns should be placed"
                                        , choices = ['replace', 
                                                    'append_all (ต่อ column สุดท้าย)', 
                                                    'append_compare (เปรียบเทียบ column by column)', 
                                                    'new_sheet']
                                        , interactive=True
                                        )
            keep_original = gr.Dropdown(multiselect=False ,label="You want to keep original column or just only the translated column"
                                        , choices = ['keep original', 
                                                    'translated_column']
                                                    , interactive=True
                                        )

        def check_file_type(file):
            """ ตรวจสอบว่าไฟล์ที่อัปโหลดเป็น Word หรือ Excel """
            file_extension = os.path.splitext(file.name)[-1].lower()

            if file_extension in [".docx", ".doc"]:
                return gr.update(choices=['all paragraphs only', 'specified paragraph or page (Developing ...)']
                                , interactive=False
                                )
            elif file_extension in [".xlsx", ".xls"]:
                return update_sheets(file)
            else:
                return "Unknown"
            
        def check_uploaded_file(file):
            """ ฟังก์ชันรับไฟล์ที่อัปโหลด แล้วตรวจสอบประเภท """
            if file is None:
                return "No file uploaded"
            return check_file_type(file)

    
        def get_sheet_names(file):
            xls = pd.ExcelFile(file.name)
            return xls.sheet_names

        def update_sheets(file):
            sheets = get_sheet_names(file)
            return gr.update(choices=sheets)

        def update_columns(file, sheet_name):
            if os.path.splitext(file.name)[-1].lower() in [".docx", ".doc"]:
                return gr.update(choices=['original'], interactive=False)
            elif os.path.splitext(file.name)[-1].lower() in [".xlsx", ".xls"]:
                columns = get_column_names(file, sheet_name)
                return gr.update(choices=columns)
            else:
                return "error"

        def get_column_names(file, sheet_name):
            dd = pd.read_excel(file.name, sheet_name=sheet_name)
            return list(dd.columns)
        

        excel_file.change(fn=check_uploaded_file, inputs=excel_file, outputs=sheet_name)
        sheet_name.change(fn=update_columns, inputs=[excel_file, sheet_name], outputs=column_name)

        model_choosing = gr.Dropdown(multiselect = False , 
                                    label = "Choosing Model you want", 
                                    choices = ['ChatGPT (4o-mini)', 'Deepseek (developing ...)', 'another (In Progress)']
                                    , interactive=True
                                    )

        needed_require = gr.Textbox(label="API Key(require if Chatgpt)")
        translate_button = gr.Button("Translate")
        output_file = gr.File(label="Download Translated Excel File", interactive=True)

        # Unified translation function
        def translate_excel(
            file, sheet_name, columns, source_lang, target_lang, place_option, keep_opt, model, api_key
        ):
            if os.path.splitext(file.name)[-1].lower() in [".xlsx", ".xls"]:
                if model == "ChatGPT (4o-mini)":
                    # Call ChatGPT-based translation
                    return chat_gpt_translate_excel(
                        file, sheet_name, columns, source_lang, target_lang, place_option, keep_opt, model, api_key
                    )
                else:
                    # Handle other models (currently in progress)
                    raise gr.Error("Translation with the selected model is not yet implemented.")
            elif os.path.splitext(file.name)[-1].lower() in [".docx", ".doc"]:
                if model == "ChatGPT (4o-mini)":
                    # Call ChatGPT-based translation
                    return chat_gpt_translate_word(file, sheet_name, columns, source_lang, target_lang, place_option, keep_opt, model, api_key)
                else:
                    # Handle other models (currently in progress)
                    raise gr.Error("Translation with the selected model is not yet implemented.")
                
            else:
                print('No Type of Input Supported')

        # Register button click
        translate_button.click(
            fn=translate_excel,
            inputs=[
                excel_file,
                sheet_name,
                column_name,
                source_language,
                target_language,
                where_to_place,
                keep_original,
                model_choosing,
                needed_require,
            ],
            outputs=output_file,
        )
    
    iface.launch(debug=True, share=True, 
                 server_port= 7860, 
                 server_name="0.0.0.0"
                 )