|
|
|
|
|
from enum import Enum |
|
|
from fastapi import FastAPI, Query |
|
|
from rich.console import Console |
|
|
import ctranslate2 |
|
|
import transformers |
|
|
from time import perf_counter |
|
|
import gradio as gr |
|
|
from pydantic import BaseModel, Field |
|
|
from lang_codes import flores_codes |
|
|
import logging |
|
|
from typing import Union |
|
|
|
|
|
|
|
|
|
|
|
# Model handles. Both stay None until load_model() assigns them at startup.
translator = None
tokenizer = None

# Sorted display names of the languages, taken from the keys of flores_codes;
# used as the documented choices for the src/tgt query parameters.
human_readable_languages = sorted(flores_codes)

# Timings in seconds: model loading (set once at startup) and the most
# recent translate_batch call (overwritten by every request).
delta_load_model = None
delta_translate = None
|
|
|
|
|
|
|
|
# Markdown shown as the API description in the generated docs (passed to
# FastAPI(description=...)). A plain string literal: nothing is interpolated,
# so no f-prefix is needed.
description = """
🚀 This API will take a text from user, src language and target language and then translate 📜 ➡️ 🗒️ it.

Backend is C-translate2 which is C++ accelerate backend for transformers.

## Languages supported

Acehnese (Arabic script)
Acehnese (Latin script)
Afrikaans
Akan
Amharic
Armenian
Assamese
Asturian
Awadhi
Ayacucho Quechua
Balinese
Bambara
Banjar (Arabic script)
Banjar (Latin script)
Bashkir
Basque
Belarusian
Bemba
Bengali
Bhojpuri
Bosnian
Buginese
Bulgarian
Burmese
Catalan
Cebuano
Central Atlas Tamazight
Central Aymara
Central Kanuri (Arabic script)
Central Kanuri (Latin script)
Central Kurdish
Chhattisgarhi
Chinese (Simplified)
Chinese (Traditional)
Chokwe
Crimean Tatar
Croatian
Czech
Danish
Dari
Dutch
Dyula
Dzongkha
Eastern Panjabi
Eastern Yiddish
Egyptian Arabic
English
Esperanto
Estonian
Ewe
Faroese
Fijian
Finnish
Fon
French
Friulian
Galician
Ganda
Georgian
German
Greek
Guarani
Gujarati
Haitian Creole
Halh Mongolian
Hausa
Hebrew
Hindi
Hungarian
Icelandic
Igbo
Ilocano
Indonesian
Irish
Italian
Japanese
Javanese
Jingpho
Kabiyè
Kabuverdianu
Kabyle
Kamba
Kannada
Kashmiri (Arabic script)
Kashmiri (Devanagari script)
Kazakh
Khmer
Kikongo
Kikuyu
Kimbundu
Kinyarwanda
Korean
Kyrgyz
Lao
Latgalian
Ligurian
Limburgish
Lingala
Lithuanian
Lombard
Luba-Kasai
Luo
Luxembourgish
Macedonian
Magahi
Maithili
Malayalam
Maltese
Maori
Marathi
Meitei (Bengali script)
Mesopotamian Arabic
Minangkabau (Arabic script)
Minangkabau (Latin script)
Mizo
Modern Standard Arabic
Modern Standard Arabic (Romanized)
Moroccan Arabic
Mossi
Najdi Arabic
Nepali
Nigerian Fulfulde
North Azerbaijani
North Levantine Arabic
Northern Kurdish
Northern Sotho
Northern Uzbek
Norwegian Bokmål
Norwegian Nynorsk
Nuer
Nyanja
Occitan
Odia
Pangasinan
Papiamento
Plateau Malagasy
Polish
Portuguese
Romanian
Rundi
Russian
Samoan
Sango
Sanskrit
Santali
Sardinian
Scottish Gaelic
Serbian
Shan
Shona
Sicilian
Silesian
Sindhi
Sinhala
Slovak
Slovenian
Somali
South Azerbaijani
South Levantine Arabic
Southern Pashto
Southern Sotho
Southwestern Dinka
Spanish
Standard Latvian
Standard Malay
Standard Tibetan
Sundanese
Swahili
Swati
Swedish
Tagalog
Tajik
Tamasheq (Latin script)
Tamasheq (Tifinagh script)
Tamil
Tatar
Ta’izzi-Adeni Arabic
Telugu
Thai
Tigrinya
Tok Pisin
Tosk Albanian
Tsonga
Tswana
Tumbuka
Tunisian Arabic
Turkish
Turkmen
Twi
Ukrainian
Umbundu
Urdu
Uyghur
Venetian
Vietnamese
Waray
Welsh
West Central Oromo
Western Persian
Wolof
Xhosa
Yoruba
Yue Chinese
Zulu


"""
|
|
|
|
|
# Tag definitions handed to FastAPI as `openapi_tags`; the single "translate"
# tag is the one the /translate/ endpoint is registered under.
tags_metadata = [
    {"name": "translate", "description": "endpoints related to translation"},
]
|
|
|
|
|
|
|
|
# The ASGI application. Every keyword below is documentation metadata for the
# generated OpenAPI schema.
# NOTE(review): license_info["url"] points at the company site rather than
# the Apache 2.0 license text, and terms_of_service is an example.com
# placeholder -- confirm both are intended.
app = FastAPI(
    title="Translator-txt2txt",
    description=description,
    version="0.0.1",
    terms_of_service="http://example.com/terms/",
    contact=dict(name="chooch.ai", url="https://chooch.ai/"),
    license_info=dict(name="Apache 2.0", url="https://chooch.ai/"),
    openapi_tags=tags_metadata,
)
|
|
|
|
|
class MetaData(BaseModel):
    # Echo of the request parameters, returned under the "metadata" key of
    # every translation response.
    #
    # NOTE: "target_langauge" is misspelled, but it is the field name the
    # /translate/ handler emits and clients receive; renaming it would change
    # the response contract, so it is kept as-is.
    text: str
    source_language: str
    target_langauge: str
|
|
|
|
|
|
|
|
class TranslationResponse(BaseModel):
    # Response schema of POST /translate/ (used as its response_model).
    metadata: MetaData  # the request parameters, echoed back
    translated_text: str  # decoded translation output
    time_load_model_sec: float  # model load time measured at startup
    time_translate_sec: float  # time spent in translate_batch for this call
|
|
|
|
|
|
|
|
|
|
|
@app.on_event("startup")
async def load_model():
    """Load the translator and tokenizer once, when the application starts.

    Assigns the module-level ``translator`` (a CTranslate2 model read from
    the ``ct2_model_nllb`` directory, on CUDA device 0 in float16) and
    ``tokenizer`` (the ``facebook/nllb-200-3.3B`` tokenizer), then records the
    elapsed wall-clock time in ``delta_load_model``.
    """
    global translator, tokenizer, delta_load_model

    started_at = perf_counter()

    translator = ctranslate2.Translator(
        "ct2_model_nllb",
        device="cuda",
        device_index=0,
        compute_type='float16',
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained("facebook/nllb-200-3.3B")
    Console().log("model has been loaded", style='red on green')

    # The measured window includes the log call above, as it always has.
    finished_at = perf_counter()
    delta_load_model = finished_at - started_at
|
|
|
|
|
|
|
|
|
|
|
@app.post("/translate/", tags=["translate"], response_model=TranslationResponse)
async def translate(
    text: str = Query(default='', description="text that needs to be translated from src to tgt language"),
    src: str = Query(default='English', enum=human_readable_languages, description="source language to translate text From"),
    tgt: str = Query(default='Turkish', enum=human_readable_languages, description="target language to translate text To"),
):
    '''
    🔴 translate the text from src language to target language
    '''
    # The docstring above is left verbatim because it doubles as the endpoint
    # description in the generated API docs; implementation notes live in
    # comments instead.
    #
    # Parameters
    #   text: the text to translate.
    #   src / tgt: human-readable language names; they must be keys of
    #       flores_codes, which maps them to the codes the model expects.
    # Returns
    #   A dict matching TranslationResponse: the translated text, an echo of
    #   the request parameters, and the load/translate timings in seconds.
    # Raises
    #   HTTPException(503) if the model has not been loaded yet.
    #   HTTPException(422) if src or tgt is not a known language name.

    # Imported here so the handler is self-contained with respect to the
    # module-level import block.
    from fastapi import HTTPException

    global delta_translate

    # load_model() assigns these at startup; without them there is nothing
    # to translate with, so answer with a clear error instead of crashing
    # on a None attribute access.
    if translator is None or tokenizer is None:
        raise HTTPException(status_code=503, detail="translation model is not loaded yet")

    # NOTE(review): the `enum=` argument on the Query parameters appears to
    # only annotate the schema and not reject other values -- confirm. Either
    # way, an unknown name would fail the dictionary lookup below with a
    # KeyError, so reject it explicitly with a client error.
    unknown = [name for name in (src, tgt) if name not in flores_codes]
    if unknown:
        raise HTTPException(
            status_code=422,
            detail=f"unsupported language(s): {', '.join(unknown)}",
        )

    source = flores_codes[src]
    target = flores_codes[tgt]

    console = Console()
    console.log(f"Source language is {source}", style='bold red')
    console.log(f"Destination language is {target}", style='bold red')

    # The tokenizer is shared module state; set the source language before
    # encoding this request's text.
    tokenizer.src_lang = source
    source_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(text))
    target_prefix = [target]

    # Time only the model call itself.
    t2 = perf_counter()
    results = translator.translate_batch([source_tokens], target_prefix=[target_prefix])
    t3 = perf_counter()

    # The first token of the hypothesis is skipped before decoding; with a
    # target_prefix supplied this is presumably the target-language code.
    target_tokens = results[0].hypotheses[0][1:]
    translated = tokenizer.decode(tokenizer.convert_tokens_to_ids(target_tokens))

    # Keep publishing the latest timing at module level as before, but
    # report this request's own measurement in the response.
    elapsed = t3 - t2
    delta_translate = elapsed

    console.log(f"hypotheses are {translated}")

    return {
        'translated_text': translated,
        'metadata': {
            'text': text,
            'source_language': src,
            'target_langauge': tgt,
        },
        'time_load_model_sec': delta_load_model,
        'time_translate_sec': elapsed,
    }
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on port 8124, listening
    # on all interfaces. uvicorn is imported only on this path.
    import uvicorn

    uvicorn.run(app, port=8124, host="0.0.0.0")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|