Spaces:
Running
Running
| import os | |
| import re | |
| from datetime import datetime | |
| from typing import Dict | |
| import gradio | |
| import sign_language_translator as slt | |
# Markdown shown under the page heading. Paragraphs and the note block are
# separated by blank lines so that Markdown renders them as distinct blocks, and
# the instructions name the "Translate" button that the UI actually provides.
DESCRIPTION = """Enter your text and select languages from the dropdowns, then click Translate to generate a video. [`Library Repository`](https://github.com/sign-language-translator/sign-language-translator)

The text is preprocessed, tokenized and rearranged, and then each token is mapped to a prerecorded video; these videos are concatenated and returned. [`Model Code`](https://github.com/sign-language-translator/sign-language-translator/blob/main/sign_language_translator/models/text_to_sign/concatenative_synthesis.py)

> **NOTE**
> - This model only supports a fixed vocabulary. See the [`*-dictionary-mapping.json`](https://github.com/sign-language-translator/sign-language-datasets/tree/main/parallel_texts) files for supported words.
> - This version needs to re-encode the generated video, so it will take some extra time after translation.
> - Since this is a rule-based model, you will have to add **context** to ambiguous words (e.g. glass(material) vs glass(container)).
> - Some signs correspond to words very specific in a particular language so their mapping in other languages will not make sense (e.g. in pakistan-sign-language, signs were recorded in reference to common Urdu words, hence English words "for" & "to" etc do not map well to their original Urdu words "کے لئے" and "کو" etc).
""".strip()

# Used both as the page title and as the top-level heading of the app.
TITLE = "Concatenative Synthesis: Rule Based Text to Sign Language Translator"
# JavaScript placed in the page head (passed as `head=` to `gradio.Blocks`).
# It defines `updateTextareaDir(language)`, which the UI calls whenever the text
# language changes: it sets the writing direction of the source textbox and
# installs a key handler that remaps Latin keys to the selected script.
CUSTOM_JS = """<script>
const rtlLanguages = ["urdu", "arabic"];

// Per-language map from the key pressed to the character that should be typed.
const keyMap = {
    "urdu": {
        "1": "۱",
        "2": "۲",
        "3": "۳",
        "4": "۴",
        "5": "۵",
        "6": "۶",
        "7": "۷",
        "8": "۸",
        "9": "۹",
        "0": "۰",
        "q": "ق",
        "w": "و",
        "e": "ع",
        "r": "ر",
        "t": "ت",
        "y": "ے",
        "u": "ء",
        "i": "ی",
        "o": "ہ",
        "p": "پ",
        "a": "ا",
        "s": "س",
        "d": "د",
        "f": "ف",
        "g": "گ",
        "h": "ح",
        "j": "ج",
        "k": "ک",
        "l": "ل",
        "z": "ز",
        "x": "ش",
        "c": "چ",
        "v": "ط",
        "b": "ب",
        "n": "ن",
        "m": "م",
        "R": "ڑ",
        "T": "ٹ",
        "Y": "َ",
        "U": "ئ",
        "I": "ِ",
        "P": "ُ",
        "A": "آ",
        "S": "ص",
        "D": "ڈ",
        "F": "أ",
        "G": "غ",
        "H": "ھ",
        "J": "ض",
        "K": "خ",
        "Z": "ذ",
        "X": "ژ",
        "C": "ث",
        "V": "ظ",
        "N": "ں",
        ",": "،",
        ".": "۔",
        "?": "؟",
        ";": "؛",
    },
    "hindi": {
        "1": "१",
        "2": "२",
        "3": "३",
        "4": "४",
        "5": "५",
        "6": "६",
        "7": "७",
        "8": "८",
        "9": "९",
        "0": "०",
        "=": "ृ",
        "!": "ऍ",
        "@": "ॅ",
        "#": "्र",
        "$": "र्",
        "%": "ज्ञ",
        "^": "त्र",
        "&": "क्ष",
        "*": "श्र",
        "_": "ः",
        "+": "ऋ",
        "q": "ौ",
        "w": "ै",
        "e": "ा",
        "r": "ी",
        "t": "ू",
        "y": "ब",
        "u": "ह",
        "i": "ग",
        "o": "द",
        "p": "ज",
        "[": "ड",
        "]": "़",
        '\\\\': "ॉ",
        "Q": "औ",
        "W": "ऐ",
        "E": "आ",
        "R": "ई",
        "T": "ऊ",
        "Y": "भ",
        "U": "ङ",
        "I": "घ",
        "O": "ध",
        "P": "झ",
        "{": "ढ",
        "}": "ञ",
        "|": "ऑ",
        "a": "ो",
        "s": "े",
        "d": "्",
        "f": "ि",
        "g": "ु",
        "h": "प",
        "j": "र",
        "k": "क",
        "l": "त",
        ";": "च",
        "'": "ट",
        "A": "ओ",
        "S": "ए",
        "D": "अ",
        "F": "इ",
        "G": "उ",
        "H": "फ",
        "J": "ऱ",
        "K": "ख",
        "L": "थ",
        ":": "छ",
        '"': "ठ",
        "z": "ॆ",
        "x": "ं",
        "c": "म",
        "v": "न",
        "b": "व",
        "n": "ल",
        "m": "स",
        ".": "।",
        "/": "य",
        "Z": "ऎ",
        "X": "ँ",
        "C": "ण",
        "V": "ऩ",
        "B": "ऴ",
        "N": "ळ",
        "M": "श",
        "<": "ष",
        ">": "य़",
        // "?":"य़",
    }
};

function updateTextareaDir(language) {
    const container = document.getElementById("source-textbox");
    const sourceTextarea = container ? container.querySelector("textarea") : null;
    if (!sourceTextarea) {
        // The textbox is not in the page (yet); nothing to configure.
        return;
    }

    sourceTextarea.setAttribute("dir", rtlLanguages.includes(language) ? "rtl" : "ltr");

    // Undefined for languages without a key map (e.g. english): keys are then typed as-is.
    const languageKeys = keyMap[language];

    function keypressHandler(event) {
        if (!languageKeys) {
            return;
        }
        // Leave keyboard shortcuts (Ctrl/Cmd/Alt + key) to the browser.
        if (event.ctrlKey || event.metaKey || event.altKey) {
            return;
        }
        const key = event.key;
        if (!Object.prototype.hasOwnProperty.call(languageKeys, key)) {
            return;
        }

        event.preventDefault();
        const mappedValue = languageKeys[key];
        const start = sourceTextarea.selectionStart;
        const end = sourceTextarea.selectionEnd;
        sourceTextarea.value = sourceTextarea.value.slice(0, start) + mappedValue + sourceTextarea.value.slice(end);
        sourceTextarea.selectionStart = sourceTextarea.selectionEnd = start + mappedValue.length;
        // Assigning to .value fires no event, so announce the edit explicitly;
        // otherwise the app may not notice the characters inserted here.
        sourceTextarea.dispatchEvent(new Event("input", { bubbles: true }));
    }

    // Replace the handler installed for the previously selected language, if any.
    sourceTextarea.removeEventListener("keypress", sourceTextarea.keypressHandler);
    sourceTextarea.addEventListener("keypress", keypressHandler);
    // Save the handler function to the textarea element for future removal
    sourceTextarea.keypressHandler = keypressHandler;
}
</script>
"""
# todo: add dropdown keyboard custom component with key mapping
# todo: output full height

# Extra style rules for the page: `.reverse-row` reverses a row's flex direction
# and `#auto-complete-button` sets the border colour of the element with that id.
CUSTOM_CSS = """
.reverse-row {
flex-direction: row-reverse;
}
#auto-complete-button {
border-color: var(--button-primary-border-color-hover);
}
"""

HF_TOKEN = os.getenv("HF_TOKEN")

# Requests are flagged through `request_logger`: a HuggingFaceDatasetSaver when
# HF_TOKEN is set in the environment, otherwise a CSVLogger.
if HF_TOKEN:
    request_logger = gradio.HuggingFaceDatasetSaver(
        HF_TOKEN,
        "sltAI/crowdsourced-text-to-sign-language-rule-based-translation-corpus",
    )
else:
    request_logger = gradio.CSVLogger()

# One translation model instance is shared by every request.
translation_model = slt.models.ConcatenativeSynthesis("ur", "pk-sl", "video")

# Samplers created on demand, keyed by model code.
language_models: Dict[str, slt.models.BeamSampling] = {}

# Language names shown in the UI <-> the short codes used for translation.
full_to_short = {"english": "en", "urdu": "ur", "hindi": "hi"}
short_to_full = {code: name for name, code in full_to_short.items()}
def auto_complete_text(model_code: str, text: str):
    """Extend *text* with a continuation sampled from a language model.

    The sampler for each ``model_code`` is created on first use and kept in the
    module-level ``language_models`` dict for later calls.

    Args:
        model_code: Code passed to ``slt.get_model`` to load the language model.
        text: The text typed so far; may be empty.

    Returns:
        The completed tokens joined into one string, with a leading
        start-of-sequence token and a trailing end-of-sequence token removed.
        An empty string is returned when nothing remains after removing them.
    """
    if model_code not in language_models:
        base_model = slt.get_model(model_code)
        language_models[model_code] = slt.models.BeamSampling(
            base_model,  # type: ignore
            start_of_sequence_token=getattr(base_model, "start_of_sequence_token", "<"),  # type: ignore
            end_of_sequence_token=getattr(base_model, "end_of_sequence_token", ">"),  # type: ignore
        )
    sampler = language_models[model_code]

    # todo: better tokenize/detokenize
    # Splitting on word boundaries keeps whitespace and punctuation as tokens,
    # so joining the completion reproduces the original spacing.
    tokens = [piece for piece in re.split(r"\b", text or "") if piece]

    # Allow the sampler to add at most 10 tokens to what was typed.
    sampler.max_length = len(tokens) + 10
    completion, _ = sampler.complete(tokens or None)

    # Guard each check: the completion may be empty, or hold only the start
    # token, in which case indexing it would raise IndexError.
    if completion and completion[0] == sampler.start_of_sequence_token:  # type: ignore
        completion = completion[1:]  # type: ignore
    if completion and completion[-1] == sampler.end_of_sequence_token:  # type: ignore
        completion = completion[:-1]  # type: ignore

    return "".join(completion)
def text_to_video(
    text: str,
    text_language: str,
    sign_language: str,
    sign_format: str = "video",
    output_path: str = "output.mp4",
    codec="h264",  # ToDo: install h264 codec for opencv
):
    """Translate *text* with the shared ``translation_model`` and save the result.

    The model is reconfigured in place for the requested languages and format
    before translating. A ``slt.Landmarks`` result is saved as an animation;
    any other result is saved with ``sign.save`` using *codec*.

    Args:
        text: Sentence to translate.
        text_language: Text language assigned to the model.
        sign_language: Sign language assigned to the model.
        sign_format: Output format assigned to the model; ``"landmarks"``
            additionally selects the ``"mediapipe-world"`` embedding model.
        output_path: File the result is written to (overwritten if present).
        codec: Codec passed to ``sign.save`` for non-landmark output.
    """
    model = translation_model
    model.text_language = text_language
    model.sign_language = sign_language
    model.sign_format = sign_format
    if sign_format == "landmarks":
        model.sign_embedding_model = "mediapipe-world"

    sign = model.translate(text)

    if not isinstance(sign, slt.Landmarks):
        sign.save(output_path, overwrite=True, codec=codec)
        # ToDo: video.watermark("Sign Language Translator\nAI Generated Video")
        return

    # Alternative kept from the author: large hands on sides
    # sign.data[:, 33:] *= 2
    # sign.data[:, 33:54, 0] += 0.25
    # sign.data[:, 54:, 0] -= 0.25

    # hands moved to pose wrists: shift each hand's landmark range so that its
    # first landmark coincides with the matching pose landmark.
    # NOTE(review): presumably 33 pose landmarks followed by two 21-point hands,
    # with pose indices 15/16 being the wrists (MediaPipe layout) — confirm.
    hand_anchors = (
        (slice(33, 54), 33, 15),
        (slice(54, None), 54, 16),
    )
    for hand, hand_root, pose_wrist in hand_anchors:
        offset = (
            -sign.data[:, hand_root : hand_root + 1, :3]
            + sign.data[:, pose_wrist : pose_wrist + 1, :3]
        )
        sign.data[:, hand, :3] += offset

    sign.save_animation(output_path, overwrite=True)
    # ToDo: video.watermark("Sign Language Translator\nAI Generated Video")
def translate(text: str, text_lang: str, sign_lang: str, sign_format: str):
    """Translate *text* to sign language and return the path of the output file.

    Every request is recorded through ``request_logger`` as
    ``[text, text language code, sign language, error message or None, timestamp]``.

    Args:
        text: Sentence entered by the user (logged as entered).
        text_lang: Full language name (e.g. ``"english"``) or a short code;
            names found in ``full_to_short`` are converted to their code.
        sign_lang: Target sign language.
        sign_format: Output format forwarded to ``text_to_video``.

    Returns:
        Path of the generated file (``"output.mp4"``).

    Raises:
        gradio.Error: If the translation fails; the original exception is
            attached as the cause and its message is logged.
    """
    text_lang = full_to_short.get(text_lang, text_lang)
    log = [
        text,
        text_lang,
        sign_lang,
        None,  # filled with the error message if translation fails
        datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f"),
    ]
    path = "output.mp4"

    # Only the translation itself is guarded, so a failure of the logger on the
    # success path is not reported (and logged a second time) as a translation
    # error.
    try:
        if text_lang == "en":
            # Lower-case the first letter before translating English text.
            text = text[:1].lower() + text[1:]

        text_to_video(
            text,
            text_lang,
            sign_lang,
            sign_format=sign_format,
            output_path=path,
            codec="mp4v",
        )
    except Exception as exc:
        log[3] = str(exc)
        request_logger.flag(log)
        # Chain the cause so the original traceback is kept for debugging.
        raise gradio.Error(f"Error during translation: {exc}") from exc
    else:
        request_logger.flag(log)
        return path
# Build the Gradio UI. TITLE is the page title, CUSTOM_JS is placed in the page
# head and CUSTOM_CSS supplies the extra style rules.
with gradio.Blocks(title=TITLE, head=CUSTOM_JS, css=CUSTOM_CSS) as gradio_app:
    gradio.Markdown(f"# {TITLE}")
    gradio.Markdown(DESCRIPTION)
    # The "reverse-row" class (see CUSTOM_CSS) reverses the row's flex direction,
    # so the Outputs column declared first is laid out after the Inputs column.
    with gradio.Row(elem_classes=["reverse-row"]):  # Inputs and Outputs
        with gradio.Column():  # Outputs
            gradio.Markdown("## Output Sign Language")
            output_video = gradio.Video(
                format="mp4",
                label="Synthesized Sign Language Video",
                autoplay=True,
                show_download_button=True,
                include_audio=False,
            )
        with gradio.Column():  # Inputs
            gradio.Markdown("## Select Languages")
            with gradio.Row():
                # Choices are full language names where `short_to_full` has one,
                # otherwise the raw language code; Urdu is preselected.
                text_lang_dropdown = gradio.Dropdown(
                    choices=[
                        short_to_full.get(code.value, code.value)
                        for code in slt.TextLanguageCodes
                    ],
                    value=short_to_full.get(
                        slt.TextLanguageCodes.URDU.value,
                        slt.TextLanguageCodes.URDU.value,
                    ),
                    label="Text Language",
                    elem_id="text-lang-dropdown",
                )
                # Browser-side only (no Python function): the JS function from
                # CUSTOM_JS receives the newly selected language.
                text_lang_dropdown.change(
                    None, inputs=text_lang_dropdown, js="updateTextareaDir"
                )
                sign_lang_dropdown = gradio.Dropdown(
                    choices=[code.value for code in slt.SignLanguageCodes],
                    value=slt.SignLanguageCodes.PAKISTAN_SIGN_LANGUAGE.value,
                    label="Sign Language",
                )
                output_format_dropdown = gradio.Dropdown(
                    choices=[
                        slt.SignFormatCodes.VIDEO.value,
                        slt.SignFormatCodes.LANDMARKS.value,
                    ],
                    value=slt.SignFormatCodes.VIDEO.value,
                    label="Output Format",
                )
                # todo: sign format: video/landmarks (tabs?)
            gradio.Markdown("## Input Text")
            with gradio.Row():  # Source TextArea
                # elem_id must stay "source-textbox": CUSTOM_JS looks the
                # textarea up by this id.
                source_textbox = gradio.Textbox(
                    lines=4,
                    placeholder="Enter Text Here...",
                    label="Spoken Language Sentence",
                    show_copy_button=True,
                    elem_id="source-textbox",
                )
            with gradio.Row():  # clear/auto-complete/Language Model
                language_model_dropdown = gradio.Dropdown(
                    choices=[
                        slt.ModelCodes.MIXER_LM_NGRAM_URDU.value,
                        slt.ModelCodes.TRANSFORMER_LM_UR_SUPPORTED.value,
                    ],
                    value=slt.ModelCodes.MIXER_LM_NGRAM_URDU.value,
                    label="Select language model to Generate sample text",
                )
                # elem_id matches the `#auto-complete-button` rule in CUSTOM_CSS.
                auto_complete_button = gradio.Button(
                    "Auto-Complete", elem_id="auto-complete-button"
                )
                # The completed text is written back into the source textbox.
                auto_complete_button.click(
                    auto_complete_text,
                    inputs=[language_model_dropdown, source_textbox],
                    outputs=[source_textbox],
                    api_name=False,
                )
                clear_button = gradio.ClearButton(source_textbox, api_name=False)
            with gradio.Row():  # Translate Button
                translate_button = gradio.Button("Translate", variant="primary")
                # Input order matches the parameters of `translate`:
                # text, text language, sign language, output format.
                translate_button.click(
                    translate,
                    inputs=[
                        source_textbox,
                        text_lang_dropdown,
                        sign_lang_dropdown,
                        output_format_dropdown,
                    ],
                    outputs=[output_video],
                    api_name="translate",
                )
    # NOTE(review): the examples, logger setup and load hook are assumed to sit
    # directly under the Blocks context (outside the Row above) — confirm.
    # Each example row is [text, text language, sign language, output format].
    gradio.Examples(
        [
            ["We are here to use this.", "english", "pakistan-sign-language", "video"],
            ["i(me) admire art.", "english", "pakistan-sign-language", "landmarks"],
            ["یہ بہت اچھا ہے۔", "urdu", "pakistan-sign-language", "video"],
            ["وہ کام آسان تھا۔", "urdu", "pakistan-sign-language", "landmarks"],
            ["कैसे हैं आप?", "hindi", "pakistan-sign-language", "video"],
            ["पाँच घंटे।", "hindi", "pakistan-sign-language", "landmarks"],
        ],
        inputs=[
            source_textbox,
            text_lang_dropdown,
            sign_lang_dropdown,
            output_format_dropdown,
        ],
        outputs=output_video,
    )
    # Five components, one per entry of the `log` list that `translate` passes
    # to `request_logger.flag`: text, text language, sign language, exception,
    # timestamp.
    request_logger.setup(
        [
            source_textbox,
            text_lang_dropdown,
            sign_lang_dropdown,
            gradio.Markdown(label="Exception"),
            gradio.Markdown(label="Timestamp"),
        ],
        "flagged",
    )
    # Run the JS once on page load so the textbox is configured for the
    # initially selected text language.
    gradio_app.load(None, inputs=[text_lang_dropdown], js="updateTextareaDir")

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    gradio_app.launch()