Spaces:
Sleeping
Sleeping
| import os | |
| from typing import List | |
| from typing import Union | |
| import openai | |
| import tiktoken | |
| from dotenv import load_dotenv | |
| from icecream import ic | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
load_dotenv()  # read local .env file

# Model name is configurable via the environment; falls back to gpt-4-turbo.
model = os.getenv("OPENAI_MODEL") or "gpt-4-turbo"

# Shared OpenAI client; base_url allows pointing at a compatible proxy endpoint.
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"), base_url=os.getenv("OPENAI_BASE_URL"))

MAX_TOKENS_PER_CHUNK = (
    1000  # if text is more than this many tokens, we'll break it up into
)
# discrete chunks to translate one chunk at a time
def get_completion(
    prompt: str,
    system_message: str = "You are a helpful assistant.",
    model: str = model,
    temperature: float = 0.3,
    json_mode: bool = False,
) -> Union[str, dict]:
    """
    Generate a completion using the OpenAI API.

    Args:
        prompt (str): The user's prompt or query.
        system_message (str, optional): The system message to set the context for
            the assistant. Defaults to "You are a helpful assistant.".
        model (str, optional): The name of the OpenAI model to use for generating
            the completion. Defaults to the module-level ``model``.
        temperature (float, optional): The sampling temperature for controlling the
            randomness of the generated text. Defaults to 0.3.
        json_mode (bool, optional): Whether to request a JSON-formatted response
            from the API. Defaults to False.

    Returns:
        Union[str, dict]: The generated message content. NOTE: even when
        json_mode is True, this returns the raw message content (a JSON-encoded
        string), not a parsed dict — callers must json.loads() it themselves.
    """
    # Both the JSON and the plain path are identical except for response_format,
    # so build the call arguments once instead of duplicating the request.
    request_kwargs = dict(
        model=model,
        temperature=temperature,
        top_p=1,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt},
        ],
    )
    if json_mode:
        # Constrain the model output to a syntactically valid JSON object.
        request_kwargs["response_format"] = {"type": "json_object"}

    response = client.chat.completions.create(**request_kwargs)
    return response.choices[0].message.content
def one_chunk_initial_translation(
    source_lang: str, target_lang: str, source_text: str
) -> str:
    """
    Translate the entire text as one chunk using an LLM.

    Args:
        source_lang (str): The source language of the text.
        target_lang (str): The target language for translation.
        source_text (str): The text to be translated.

    Returns:
        str: The translated text.
    """
    system_message = f"You are an expert linguist, specializing in translation from {source_lang} to {target_lang}."

    translation_prompt = f"""This is an {source_lang} to {target_lang} translation, please provide the {target_lang} translation for this text. \
Do not provide any explanations or text apart from the translation.
{source_lang}: {source_text}
{target_lang}:"""

    # BUG FIX: translation_prompt is an f-string, so every placeholder above is
    # already interpolated. The original code additionally called
    # translation_prompt.format(source_text=source_text), which raises
    # KeyError/ValueError whenever source_text contains literal braces.
    # The prompt is therefore sent as-is.
    translation = get_completion(translation_prompt, system_message=system_message)

    return translation
def one_chunk_reflect_on_translation(
    source_lang: str,
    target_lang: str,
    source_text: str,
    translation_1: str,
    country: str = "",
) -> str:
    """
    Use an LLM to reflect on the translation, treating the entire text as one chunk.

    Args:
        source_lang (str): The source language of the text.
        target_lang (str): The target language of the translation.
        source_text (str): The original text in the source language.
        translation_1 (str): The initial translation of the source text.
        country (str): Country specified for target language; when non-empty the
            critique also targets the colloquial style of that country.

    Returns:
        str: The LLM's reflection on the translation, providing constructive
        criticism and suggestions for improvement.
    """
    system_message = f"You are an expert linguist specializing in translation from {source_lang} to {target_lang}. \
You will be provided with a source text and its translation and your goal is to improve the translation."

    if country != "":
        reflection_prompt = f"""Your task is to carefully read a source text and a translation from {source_lang} to {target_lang}, and then give constructive criticism and helpful suggestions to improve the translation. \
The final style and tone of the translation should match the style of {target_lang} colloquially spoken in {country}.
The source text and initial translation, delimited by XML tags <SOURCE_TEXT></SOURCE_TEXT> and <TRANSLATION></TRANSLATION>, are as follows:
<SOURCE_TEXT>
{source_text}
</SOURCE_TEXT>
<TRANSLATION>
{translation_1}
</TRANSLATION>
When writing suggestions, pay attention to whether there are ways to improve the translation's \n\
(i) accuracy (by correcting errors of addition, mistranslation, omission, or untranslated text),\n\
(ii) fluency (by applying {target_lang} grammar, spelling and punctuation rules, and ensuring there are no unnecessary repetitions),\n\
(iii) style (by ensuring the translations reflect the style of the source text and takes into account any cultural context),\n\
(iv) terminology (by ensuring terminology use is consistent and reflects the source text domain; and by only ensuring you use equivalent idioms {target_lang}).\n\
Write a list of specific, helpful and constructive suggestions for improving the translation.
Each suggestion should address one specific part of the translation.
Output only the suggestions and nothing else."""
    else:
        reflection_prompt = f"""Your task is to carefully read a source text and a translation from {source_lang} to {target_lang}, and then give constructive criticism and helpful suggestions to improve the translation. \
The source text and initial translation, delimited by XML tags <SOURCE_TEXT></SOURCE_TEXT> and <TRANSLATION></TRANSLATION>, are as follows:
<SOURCE_TEXT>
{source_text}
</SOURCE_TEXT>
<TRANSLATION>
{translation_1}
</TRANSLATION>
When writing suggestions, pay attention to whether there are ways to improve the translation's \n\
(i) accuracy (by correcting errors of addition, mistranslation, omission, or untranslated text),\n\
(ii) fluency (by applying {target_lang} grammar, spelling and punctuation rules, and ensuring there are no unnecessary repetitions),\n\
(iii) style (by ensuring the translations reflect the style of the source text and takes into account any cultural context),\n\
(iv) terminology (by ensuring terminology use is consistent and reflects the source text domain; and by only ensuring you use equivalent idioms {target_lang}).\n\
Write a list of specific, helpful and constructive suggestions for improving the translation.
Each suggestion should address one specific part of the translation.
Output only the suggestions and nothing else."""

    # BUG FIX: the templates above are f-strings and are already fully
    # interpolated. The original code re-ran str.format() on the result, which
    # raises KeyError/ValueError whenever the user's text contains literal
    # braces. The prompt is therefore sent as-is.
    reflection = get_completion(reflection_prompt, system_message=system_message)
    return reflection
def one_chunk_improve_translation(
    source_lang: str,
    target_lang: str,
    source_text: str,
    translation_1: str,
    reflection: str,
) -> str:
    """
    Apply the reviewer's critique to produce a revised single-chunk translation.

    Args:
        source_lang (str): Language of the original text.
        target_lang (str): Language of the translation.
        source_text (str): The original text.
        translation_1 (str): The first-pass translation being edited.
        reflection (str): Expert critique produced by the reflection step.

    Returns:
        str: The edited translation incorporating the critique.
    """
    system_message = f"You are an expert linguist, specializing in translation editing from {source_lang} to {target_lang}."

    editing_prompt = f"""Your task is to carefully read, then edit, a translation from {source_lang} to {target_lang}, taking into
account a list of expert suggestions and constructive criticisms.
The source text, the initial translation, and the expert linguist suggestions are delimited by XML tags <SOURCE_TEXT></SOURCE_TEXT>, <TRANSLATION></TRANSLATION> and <EXPERT_SUGGESTIONS></EXPERT_SUGGESTIONS> \
as follows:
<SOURCE_TEXT>
{source_text}
</SOURCE_TEXT>
<TRANSLATION>
{translation_1}
</TRANSLATION>
<EXPERT_SUGGESTIONS>
{reflection}
</EXPERT_SUGGESTIONS>
Please take into account the expert suggestions when editing the translation. Edit the translation by ensuring:
(i) accuracy (by correcting errors of addition, mistranslation, omission, or untranslated text),
(ii) fluency (by applying {target_lang} grammar, spelling and punctuation rules and ensuring there are no unnecessary repetitions), \
(iii) style (by ensuring the translations reflect the style of the source text)
(iv) terminology (inappropriate for context, inconsistent use), or
(v) other errors.
Output only the new translation and nothing else."""

    revised_translation = get_completion(editing_prompt, system_message=system_message)
    return revised_translation
def one_chunk_translate_text(
    source_lang: str, target_lang: str, source_text: str, country: str = ""
) -> str:
    """
    Run the full translate -> reflect -> improve pipeline on one chunk of text.

    Steps:
        1. Produce a first-pass translation.
        2. Ask an LLM reviewer to critique it.
        3. Re-edit the translation using that critique.

    Args:
        source_lang (str): The source language of the text.
        target_lang (str): The target language for the translation.
        source_text (str): The text to be translated.
        country (str): Country specified for target language.

    Returns:
        str: The improved translation of the source text.
    """
    draft = one_chunk_initial_translation(source_lang, target_lang, source_text)

    critique = one_chunk_reflect_on_translation(
        source_lang, target_lang, source_text, draft, country
    )

    return one_chunk_improve_translation(
        source_lang, target_lang, source_text, draft, critique
    )
def num_tokens_in_string(
    input_str: str, encoding_name: str = "cl100k_base"
) -> int:
    """
    Calculate the number of tokens in a given string using a specified encoding.

    Args:
        input_str (str): The input string to be tokenized.
        encoding_name (str, optional): The name of the encoding to use. Defaults
            to "cl100k_base", which is the most commonly used encoder (used by
            GPT-4).

    Returns:
        int: The number of tokens in the input string.

    Example:
        >>> text = "Hello, how are you?"
        >>> num_tokens = num_tokens_in_string(text)
        >>> print(num_tokens)
        5
    """
    # Docstring fix: the parameter was previously documented as "str", which
    # neither matched the signature nor avoided shadowing the builtin.
    encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(input_str))
def multichunk_initial_translation(
    source_lang: str, target_lang: str, source_text_chunks: List[str]
) -> List[str]:
    """
    Produce a first-pass translation for each chunk of a multi-chunk text.

    Each request embeds the whole document for context, with only the current
    chunk marked for translation.

    Args:
        source_lang (str): The source language of the text.
        target_lang (str): The target language for translation.
        source_text_chunks (List[str]): A list of text chunks to be translated.

    Returns:
        List[str]: A list of translated text chunks, in order.
    """
    system_message = f"You are an expert linguist, specializing in translation from {source_lang} to {target_lang}."

    translation_prompt = """Your task is provide a professional translation from {source_lang} to {target_lang} of PART of a text.
The source text is below, delimited by XML tags <SOURCE_TEXT> and </SOURCE_TEXT>. Translate only the part within the source text
delimited by <TRANSLATE_THIS> and </TRANSLATE_THIS>. You can use the rest of the source text as context, but do not translate any
of the other text. Do not output anything other than the translation of the indicated part of the text.
<SOURCE_TEXT>
{tagged_text}
</SOURCE_TEXT>
To reiterate, you should translate only this part of the text, shown here again between <TRANSLATE_THIS> and </TRANSLATE_THIS>:
<TRANSLATE_THIS>
{chunk_to_translate}
</TRANSLATE_THIS>
Output only the translation of the portion you are asked to translate, and nothing else.
"""

    translated_chunks = []
    for idx, chunk in enumerate(source_text_chunks):
        # Reassemble the full document with the current chunk tagged for translation.
        tagged_text = (
            "".join(source_text_chunks[:idx])
            + "<TRANSLATE_THIS>"
            + chunk
            + "</TRANSLATE_THIS>"
            + "".join(source_text_chunks[idx + 1:])
        )

        prompt = translation_prompt.format(
            source_lang=source_lang,
            target_lang=target_lang,
            tagged_text=tagged_text,
            chunk_to_translate=chunk,
        )

        translated_chunks.append(get_completion(prompt, system_message=system_message))

    return translated_chunks
def multichunk_reflect_on_translation(
    source_lang: str,
    target_lang: str,
    source_text_chunks: List[str],
    translation_1_chunks: List[str],
    country: str = "",
) -> List[str]:
    """
    Provides constructive criticism and suggestions for improving a partial translation.

    Args:
        source_lang (str): The source language of the text.
        target_lang (str): The target language of the translation.
        source_text_chunks (List[str]): The source text divided into chunks.
        translation_1_chunks (List[str]): The translated chunks corresponding to
            the source text chunks.
        country (str): Country specified for target language; when non-empty the
            critique also targets the colloquial style of that country.

    Returns:
        List[str]: A list of reflections containing suggestions for improving
        each translated chunk.
    """
    system_message = f"You are an expert linguist specializing in translation from {source_lang} to {target_lang}. \
You will be provided with a source text and its translation and your goal is to improve the translation."

    if country != "":
        reflection_prompt = """Your task is to carefully read a source text and part of a translation of that text from {source_lang} to {target_lang}, and then give constructive criticism and helpful suggestions for improving the translation.
The final style and tone of the translation should match the style of {target_lang} colloquially spoken in {country}.
The source text is below, delimited by XML tags <SOURCE_TEXT> and </SOURCE_TEXT>, and the part that has been translated
is delimited by <TRANSLATE_THIS> and </TRANSLATE_THIS> within the source text. You can use the rest of the source text
as context for critiquing the translated part.
<SOURCE_TEXT>
{tagged_text}
</SOURCE_TEXT>
To reiterate, only part of the text is being translated, shown here again between <TRANSLATE_THIS> and </TRANSLATE_THIS>:
<TRANSLATE_THIS>
{chunk_to_translate}
</TRANSLATE_THIS>
The translation of the indicated part, delimited below by <TRANSLATION> and </TRANSLATION>, is as follows:
<TRANSLATION>
{translation_1_chunk}
</TRANSLATION>
When writing suggestions, pay attention to whether there are ways to improve the translation's:\n\
(i) accuracy (by correcting errors of addition, mistranslation, omission, or untranslated text),\n\
(ii) fluency (by applying {target_lang} grammar, spelling and punctuation rules, and ensuring there are no unnecessary repetitions),\n\
(iii) style (by ensuring the translations reflect the style of the source text and takes into account any cultural context),\n\
(iv) terminology (by ensuring terminology use is consistent and reflects the source text domain; and by only ensuring you use equivalent idioms {target_lang}).\n\
Write a list of specific, helpful and constructive suggestions for improving the translation.
Each suggestion should address one specific part of the translation.
Output only the suggestions and nothing else."""
    else:
        reflection_prompt = """Your task is to carefully read a source text and part of a translation of that text from {source_lang} to {target_lang}, and then give constructive criticism and helpful suggestions for improving the translation.
The source text is below, delimited by XML tags <SOURCE_TEXT> and </SOURCE_TEXT>, and the part that has been translated
is delimited by <TRANSLATE_THIS> and </TRANSLATE_THIS> within the source text. You can use the rest of the source text
as context for critiquing the translated part.
<SOURCE_TEXT>
{tagged_text}
</SOURCE_TEXT>
To reiterate, only part of the text is being translated, shown here again between <TRANSLATE_THIS> and </TRANSLATE_THIS>:
<TRANSLATE_THIS>
{chunk_to_translate}
</TRANSLATE_THIS>
The translation of the indicated part, delimited below by <TRANSLATION> and </TRANSLATION>, is as follows:
<TRANSLATION>
{translation_1_chunk}
</TRANSLATION>
When writing suggestions, pay attention to whether there are ways to improve the translation's:\n\
(i) accuracy (by correcting errors of addition, mistranslation, omission, or untranslated text),\n\
(ii) fluency (by applying {target_lang} grammar, spelling and punctuation rules, and ensuring there are no unnecessary repetitions),\n\
(iii) style (by ensuring the translations reflect the style of the source text and takes into account any cultural context),\n\
(iv) terminology (by ensuring terminology use is consistent and reflects the source text domain; and by only ensuring you use equivalent idioms {target_lang}).\n\
Write a list of specific, helpful and constructive suggestions for improving the translation.
Each suggestion should address one specific part of the translation.
Output only the suggestions and nothing else."""

    reflection_chunks = []
    for i, chunk in enumerate(source_text_chunks):
        # Reassemble the full document with the current chunk tagged for critique.
        tagged_text = (
            "".join(source_text_chunks[:i])
            + "<TRANSLATE_THIS>"
            + chunk
            + "</TRANSLATE_THIS>"
            + "".join(source_text_chunks[i + 1:])
        )

        # Simplification over the original: str.format() ignores unused keyword
        # arguments, so country can be passed unconditionally — the no-country
        # template simply has no {country} placeholder. This removes a second,
        # duplicated format() call that differed only in that one argument.
        prompt = reflection_prompt.format(
            source_lang=source_lang,
            target_lang=target_lang,
            tagged_text=tagged_text,
            chunk_to_translate=chunk,
            translation_1_chunk=translation_1_chunks[i],
            country=country,
        )

        reflection = get_completion(prompt, system_message=system_message)
        reflection_chunks.append(reflection)

    return reflection_chunks
def multichunk_improve_translation(
    source_lang: str,
    target_lang: str,
    source_text_chunks: List[str],
    translation_1_chunks: List[str],
    reflection_chunks: List[str],
) -> List[str]:
    """
    Re-edit each chunk's translation using the matching expert critique.

    Args:
        source_lang (str): The source language of the text.
        target_lang (str): The target language for translation.
        source_text_chunks (List[str]): The source text divided into chunks.
        translation_1_chunks (List[str]): The initial translation of each chunk.
        reflection_chunks (List[str]): Expert suggestions for improving each
            translated chunk.

    Returns:
        List[str]: The improved translation of each chunk, in order.
    """
    system_message = f"You are an expert linguist, specializing in translation editing from {source_lang} to {target_lang}."

    improvement_prompt = """Your task is to carefully read, then improve, a translation from {source_lang} to {target_lang}, taking into
account a set of expert suggestions and constructive critisms. Below, the source text, initial translation, and expert suggestions are provided.
The source text is below, delimited by XML tags <SOURCE_TEXT> and </SOURCE_TEXT>, and the part that has been translated
is delimited by <TRANSLATE_THIS> and </TRANSLATE_THIS> within the source text. You can use the rest of the source text
as context, but need to provide a translation only of the part indicated by <TRANSLATE_THIS> and </TRANSLATE_THIS>.
<SOURCE_TEXT>
{tagged_text}
</SOURCE_TEXT>
To reiterate, only part of the text is being translated, shown here again between <TRANSLATE_THIS> and </TRANSLATE_THIS>:
<TRANSLATE_THIS>
{chunk_to_translate}
</TRANSLATE_THIS>
The translation of the indicated part, delimited below by <TRANSLATION> and </TRANSLATION>, is as follows:
<TRANSLATION>
{translation_1_chunk}
</TRANSLATION>
The expert translations of the indicated part, delimited below by <EXPERT_SUGGESTIONS> and </EXPERT_SUGGESTIONS>, is as follows:
<EXPERT_SUGGESTIONS>
{reflection_chunk}
</EXPERT_SUGGESTIONS>
Taking into account the expert suggestions rewrite the translation to improve it, paying attention
to whether there are ways to improve the translation's
(i) accuracy (by correcting errors of addition, mistranslation, omission, or untranslated text),
(ii) fluency (by applying {target_lang} grammar, spelling and punctuation rules and ensuring there are no unnecessary repetitions), \
(iii) style (by ensuring the translations reflect the style of the source text)
(iv) terminology (inappropriate for context, inconsistent use), or
(v) other errors.
Output only the new translation of the indicated part and nothing else."""

    revised_chunks = []
    for idx, chunk in enumerate(source_text_chunks):
        # Reassemble the full document with the current chunk tagged for editing.
        tagged_text = (
            "".join(source_text_chunks[:idx])
            + "<TRANSLATE_THIS>"
            + chunk
            + "</TRANSLATE_THIS>"
            + "".join(source_text_chunks[idx + 1:])
        )

        prompt = improvement_prompt.format(
            source_lang=source_lang,
            target_lang=target_lang,
            tagged_text=tagged_text,
            chunk_to_translate=chunk,
            translation_1_chunk=translation_1_chunks[idx],
            reflection_chunk=reflection_chunks[idx],
        )

        revised_chunks.append(get_completion(prompt, system_message=system_message))

    return revised_chunks
def multichunk_translation(
    source_lang: str, target_lang: str, source_text_chunks: List[str], country: str = ""
) -> List[str]:
    """
    Run the translate -> reflect -> improve pipeline over a list of text chunks.

    Args:
        source_lang (str): The source language of the text chunks.
        target_lang (str): The target language for translation.
        source_text_chunks (List[str]): The list of source text chunks to be translated.
        country (str): Country specified for target language.

    Returns:
        List[str]: The list of improved translations for each source text chunk.

    Note:
        Docstring fix: the original documented ``translation_1_chunks`` and
        ``reflection_chunks`` as parameters, but they are intermediate results
        computed here, not arguments.
    """
    translation_1_chunks = multichunk_initial_translation(
        source_lang, target_lang, source_text_chunks
    )

    reflection_chunks = multichunk_reflect_on_translation(
        source_lang,
        target_lang,
        source_text_chunks,
        translation_1_chunks,
        country,
    )

    translation_2_chunks = multichunk_improve_translation(
        source_lang,
        target_lang,
        source_text_chunks,
        translation_1_chunks,
        reflection_chunks,
    )

    return translation_2_chunks
def calculate_chunk_size(token_count: int, token_limit: int) -> int:
    """
    Pick a per-chunk token size for splitting a text of token_count tokens.

    If the whole text fits within token_limit, the text is one chunk and the
    chunk size is simply token_count. Otherwise the text is spread across the
    fewest chunks that respect the limit, and the size is nudged upward by a
    share of the final partial chunk so the chunks come out more even.

    Args:
        token_count (int): The total number of tokens.
        token_limit (int): The maximum number of tokens allowed per chunk.

    Returns:
        int: The calculated chunk size.

    Example:
        >>> calculate_chunk_size(1000, 500)
        500
        >>> calculate_chunk_size(1530, 500)
        389
        >>> calculate_chunk_size(2242, 500)
        496
    """
    if token_count <= token_limit:
        return token_count

    # Ceiling division: the fewest chunks that keep each under the limit.
    num_chunks = -(-token_count // token_limit)
    base_size = token_count // num_chunks

    # Distribute the final partial chunk's tokens across all chunks.
    leftover = token_count % token_limit
    if leftover:
        base_size += leftover // num_chunks

    return base_size
def translate(
    source_lang,
    target_lang,
    source_text,
    country,
    max_tokens=MAX_TOKENS_PER_CHUNK,
):
    """Translate the source_text from source_lang to target_lang.

    Short texts (under max_tokens tokens) go through the single-chunk pipeline;
    longer texts are split on token boundaries and translated chunk by chunk.
    """
    num_tokens_in_text = num_tokens_in_string(source_text)
    ic(num_tokens_in_text)

    # Guard clause: small texts need no splitting at all.
    if num_tokens_in_text < max_tokens:
        ic("Translating text as single chunk")
        return one_chunk_translate_text(source_lang, target_lang, source_text, country)

    ic("Translating text as multiple chunks")

    token_size = calculate_chunk_size(
        token_count=num_tokens_in_text, token_limit=max_tokens
    )
    ic(token_size)

    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name="gpt-4",
        chunk_size=token_size,
        chunk_overlap=0,
    )
    chunks = splitter.split_text(source_text)

    translated_chunks = multichunk_translation(
        source_lang, target_lang, chunks, country
    )

    return "".join(translated_chunks)