Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,12 +5,196 @@ import re
|
|
| 5 |
|
| 6 |
app = FastAPI()
|
| 7 |
|
| 8 |
-
MODEL_NAME = "facebook/nllb-200-distilled-600M"
|
| 9 |
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
|
| 10 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
def split_text_lines(text, max_chunk_length=900):
|
| 13 |
-
# تقسيم ذكي مع الحفاظ على أسطر strings
|
| 14 |
lines = text.splitlines()
|
| 15 |
chunks = []
|
| 16 |
chunk = ""
|
|
@@ -21,31 +205,34 @@ def split_text_lines(text, max_chunk_length=900):
|
|
| 21 |
if chunk.strip():
|
| 22 |
chunks.append(chunk.strip())
|
| 23 |
chunk = line + "\n"
|
| 24 |
-
if chunk.strip():
|
| 25 |
-
chunks.append(chunk.strip())
|
| 26 |
return chunks
|
| 27 |
|
| 28 |
def batch_translate(texts, src_lang, tgt_lang):
|
| 29 |
-
# ترجمة سريعة batch
|
| 30 |
results = []
|
| 31 |
-
batch_size = 8
|
| 32 |
for i in range(0, len(texts), batch_size):
|
| 33 |
batch = texts[i:i+batch_size]
|
| 34 |
tokenizer.src_lang = src_lang
|
| 35 |
inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=1024)
|
| 36 |
-
generated = model.generate(**inputs, forced_bos_token_id=tokenizer.
|
| 37 |
translated = tokenizer.batch_decode(generated, skip_special_tokens=True)
|
| 38 |
results.extend(translated)
|
| 39 |
return results
|
| 40 |
|
| 41 |
def detect_language(text):
|
| 42 |
-
# كشف لغة ذكي (يعمل على أول chunk)
|
| 43 |
sample = text[:2000] if len(text) > 2000 else text
|
| 44 |
lang = detect(sample)
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
| 47 |
return lang_map.get(lang, "eng_Latn")
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
@app.post("/translate-text")
|
| 50 |
async def translate_text(
|
| 51 |
text: str = Form(...),
|
|
|
|
| 5 |
|
| 6 |
app = FastAPI()

# Lightest NLLB checkpoint -- chosen to stay within memory limits.
MODEL_NAME = "facebook/nllb-200-distilled-600M"
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 11 |
|
| 12 |
+
# All supported languages, mapping NLLB code -> display name.
NLLB_LANGS = {
    "afr_Latn": "Afrikaans",
    "amh_Ethi": "Amharic",
    "arb_Arab": "Arabic",
    "ary_Arab": "Moroccan Arabic",
    "arz_Arab": "Egyptian Arabic",
    "asm_Beng": "Assamese",
    "ast_Latn": "Asturian",
    "awa_Deva": "Awadhi",
    "ayr_Latn": "Aymara",
    "azb_Arab": "South Azerbaijani",
    "azj_Latn": "North Azerbaijani",
    "bak_Cyrl": "Bashkir",
    "bam_Latn": "Bambara",
    "ban_Latn": "Balinese",
    "bel_Cyrl": "Belarusian",
    "bem_Latn": "Bemba",
    "ben_Beng": "Bengali",
    "bho_Deva": "Bhojpuri",
    "bjn_Arab": "Banjar (Arabic)",
    "bjn_Latn": "Banjar (Latin)",
    "bod_Tibt": "Standard Tibetan",
    "bos_Latn": "Bosnian",
    "bug_Latn": "Buginese",
    "bul_Cyrl": "Bulgarian",
    "cat_Latn": "Catalan",
    "ceb_Latn": "Cebuano",
    "ces_Latn": "Czech",
    "cjk_Latn": "Chokwe",
    "ckb_Arab": "Sorani Kurdish",
    "crh_Latn": "Crimean Turkish",
    "csb_Latn": "Kashubian",
    "cym_Latn": "Welsh",
    "dan_Latn": "Danish",
    "deu_Latn": "German",
    "dik_Latn": "Dinka",
    "dyu_Latn": "Dyula",
    "dzo_Tibt": "Dzongkha",
    "ell_Grek": "Greek",
    "eng_Latn": "English",
    "epo_Latn": "Esperanto",
    "est_Latn": "Estonian",
    "eus_Latn": "Basque",
    "ewe_Latn": "Ewe",
    "fao_Latn": "Faroese",
    "fij_Latn": "Fijian",
    "fin_Latn": "Finnish",
    "fon_Latn": "Fon",
    "fra_Latn": "French",
    "fur_Latn": "Friulian",
    "fuv_Latn": "Nigerian Fulfulde",
    "gla_Latn": "Scottish Gaelic",
    "gle_Latn": "Irish",
    "glg_Latn": "Galician",
    "grn_Latn": "Guarani",
    "guj_Gujr": "Gujarati",
    "hat_Latn": "Haitian Creole",
    "hau_Latn": "Hausa",
    "heb_Hebr": "Hebrew",
    "hin_Deva": "Hindi",
    "hne_Deva": "Chhattisgarhi",
    "hrv_Latn": "Croatian",
    "hun_Latn": "Hungarian",
    "hye_Armn": "Armenian",
    "ibo_Latn": "Igbo",
    "ilo_Latn": "Ilocano",
    "ind_Latn": "Indonesian",
    "isl_Latn": "Icelandic",
    "ita_Latn": "Italian",
    "jav_Latn": "Javanese",
    "jpn_Jpan": "Japanese",
    "kab_Latn": "Kabyle",
    "kac_Latn": "Jingpho",
    "kam_Latn": "Kamba",
    "kan_Knda": "Kannada",
    "kas_Arab": "Kashmiri (Arabic)",
    "kas_Deva": "Kashmiri (Devanagari)",
    "kat_Geor": "Georgian",
    "kaz_Cyrl": "Kazakh",
    "kbp_Latn": "Kabiyè",
    "kea_Latn": "Kabuverdianu",
    "khm_Khmr": "Khmer",
    "kik_Latn": "Kikuyu",
    "kin_Latn": "Kinyarwanda",
    "kir_Cyrl": "Kyrgyz",
    "kmb_Latn": "Kimbundu",
    "kmr_Latn": "Kurmanji Kurdish",
    "kon_Latn": "Kikongo",
    "kor_Hang": "Korean",
    "lao_Laoo": "Lao",
    "lij_Latn": "Ligurian",
    "lim_Latn": "Limburgish",
    "lin_Latn": "Lingala",
    "lit_Latn": "Lithuanian",
    "lmo_Latn": "Lombard",
    "ltg_Latn": "Latgalian",
    "ltz_Latn": "Luxembourgish",
    "lua_Latn": "Luba-Kasai",
    "lug_Latn": "Ganda",
    "luo_Latn": "Luo",
    "lus_Latn": "Mizo",
    "mag_Deva": "Magahi",
    "mai_Deva": "Maithili",
    "mal_Mlym": "Malayalam",
    "mar_Deva": "Marathi",
    "min_Latn": "Minangkabau",
    "mkd_Cyrl": "Macedonian",
    "plt_Latn": "Plateau Malagasy",
    "mlt_Latn": "Maltese",
    "mni_Beng": "Manipuri",
    "khk_Cyrl": "Halh Mongolian",
    "mos_Latn": "Mossi",
    "mri_Latn": "Maori",
    "msa_Latn": "Malay",
    "mya_Mymr": "Burmese",
    "nld_Latn": "Dutch",
    "nno_Latn": "Norwegian Nynorsk",
    "nob_Latn": "Norwegian Bokmål",
    "npi_Deva": "Nepali",
    "nso_Latn": "Northern Sotho",
    "nya_Latn": "Nyanja",
    "oci_Latn": "Occitan",
    "ory_Orya": "Odia",
    "pag_Latn": "Pangasinan",
    "pan_Guru": "Punjabi",
    "pap_Latn": "Papiamento",
    "pol_Latn": "Polish",
    "por_Latn": "Portuguese",
    "ron_Latn": "Romanian",
    "run_Latn": "Rundi",
    "rus_Cyrl": "Russian",
    "sag_Latn": "Sango",
    "san_Deva": "Sanskrit",
    "sat_Beng": "Santali",
    "scn_Latn": "Sicilian",
    "shn_Mymr": "Shan",
    "sin_Sinh": "Sinhala",
    "slk_Latn": "Slovak",
    "slv_Latn": "Slovenian",
    "sna_Latn": "Shona",
    "snd_Arab": "Sindhi",
    "som_Latn": "Somali",
    "spa_Latn": "Spanish",
    "als_Latn": "Tosk Albanian",
    "sqi_Latn": "Albanian",
    "srp_Cyrl": "Serbian",
    "ssw_Latn": "Swazi",
    "sun_Latn": "Sundanese",
    "swe_Latn": "Swedish",
    "swh_Latn": "Swahili",
    "szl_Latn": "Silesian",
    "tam_Taml": "Tamil",
    "tat_Cyrl": "Tatar",
    "tel_Telu": "Telugu",
    "tgk_Cyrl": "Tajik",
    "tgl_Latn": "Tagalog",
    "tha_Thai": "Thai",
    "tir_Ethi": "Tigrinya",
    "taq_Latn": "Tamasheq (Latin)",
    "taq_Tfng": "Tamasheq (Tifinagh)",
    "tpi_Latn": "Tok Pisin",
    "tsn_Latn": "Tswana",
    "tso_Latn": "Tsonga",
    "tur_Latn": "Turkish",
    "twi_Latn": "Twi",
    "tzm_Tfng": "Central Atlas Tamazight",
    "uig_Arab": "Uyghur",
    "ukr_Cyrl": "Ukrainian",
    "umb_Latn": "Umbundu",
    "urd_Arab": "Urdu",
    "uzn_Latn": "Northern Uzbek",
    "vec_Latn": "Venetian",
    "vie_Latn": "Vietnamese",
    "war_Latn": "Waray",
    "wol_Latn": "Wolof",
    "xho_Latn": "Xhosa",
    "ydd_Hebr": "Eastern Yiddish",
    "yor_Latn": "Yoruba",
    "yue_Hant": "Cantonese",
    "zho_Hans": "Chinese (Simplified)",
    "zho_Hant": "Chinese (Traditional)",
    "zul_Latn": "Zulu"
}
|
| 196 |
+
|
| 197 |
def split_text_lines(text, max_chunk_length=900):
    """Split *text* into chunks of at most ~max_chunk_length characters.

    Chunks are built on line boundaries so that no line is cut in half;
    each emitted chunk is stripped of surrounding whitespace, and chunks
    that would be empty/whitespace-only are dropped. Empty input yields
    an empty list. A single line longer than max_chunk_length is kept
    whole (lines are never split internally).

    NOTE(review): the diff view hid part of the original loop body
    (lines 201-204); this reconstruction honors the visible contract —
    confirm against the full file.
    """
    chunks = []
    chunk = ""
    for line in text.splitlines():
        # +1 accounts for the newline appended below.
        if len(chunk) + len(line) + 1 > max_chunk_length and chunk.strip():
            chunks.append(chunk.strip())
            chunk = ""
        chunk += line + "\n"
    if chunk.strip():
        chunks.append(chunk.strip())
    return chunks
|
| 210 |
|
| 211 |
def batch_translate(texts, src_lang, tgt_lang):
    """Translate a list of strings from src_lang to tgt_lang (NLLB codes).

    Processes *texts* in batches of 8 through the module-level NLLB
    model/tokenizer and returns the translations in the same order.
    An empty input list returns an empty list.
    """
    import torch  # local import: file-level import block is outside this view

    results = []
    batch_size = 8
    # Source language and target BOS token are loop-invariant — set once.
    tokenizer.src_lang = src_lang
    # `tokenizer.lang_code_to_id` was removed in newer transformers
    # releases; `convert_tokens_to_ids` works across versions.
    forced_bos_id = tokenizer.convert_tokens_to_ids(tgt_lang)
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True,
                           truncation=True, max_length=1024)
        # Inference only: disable autograd to cut memory use.
        with torch.no_grad():
            generated = model.generate(**inputs, forced_bos_token_id=forced_bos_id)
        results.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))
    return results
|
| 222 |
|
| 223 |
def detect_language(text):
    """Best-effort language detection, returning an NLLB language code.

    Only the first 2000 characters are sampled for speed. Languages not
    in the map — or input on which detection fails (e.g. empty or
    symbol-only text, where langdetect raises) — fall back to English.
    """
    sample = text[:2000]  # slicing already handles shorter strings
    try:
        lang = detect(sample)
    except Exception:
        # langdetect raises LangDetectException on feature-less input;
        # the class isn't imported here, so catch broadly and default.
        return "eng_Latn"
    lang_map = {
        "en": "eng_Latn", "ar": "arb_Arab", "fr": "fra_Latn",
        "hi": "hin_Deva", "es": "spa_Latn", "de": "deu_Latn",
        # Add further ISO-639-1 -> NLLB mappings here as needed.
    }
    return lang_map.get(lang, "eng_Latn")
|
| 231 |
|
| 232 |
+
@app.get("/supported-languages")
def supported_languages():
    """Return the full mapping of supported NLLB codes to language names."""
    return NLLB_LANGS
|
| 235 |
+
|
| 236 |
@app.post("/translate-text")
|
| 237 |
async def translate_text(
|
| 238 |
text: str = Form(...),
|