DarkMo0o committed on
Commit
69b53c5
·
verified ·
1 Parent(s): b173433

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +197 -10
app.py CHANGED
@@ -5,12 +5,196 @@ import re
5
 
6
app = FastAPI()

# Translation model and tokenizer are loaded once at import time, so the
# first request does not pay the (large) model-download/load cost.
# NOTE(review): distilled 600M is presumably chosen as the lightest NLLB
# checkpoint — confirm server memory budget.
MODEL_NAME = "facebook/nllb-200-distilled-600M"
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  def split_text_lines(text, max_chunk_length=900):
13
- # تقسيم ذكي مع الحفاظ على أسطر strings
14
  lines = text.splitlines()
15
  chunks = []
16
  chunk = ""
@@ -21,31 +205,34 @@ def split_text_lines(text, max_chunk_length=900):
21
  if chunk.strip():
22
  chunks.append(chunk.strip())
23
  chunk = line + "\n"
24
- if chunk.strip():
25
- chunks.append(chunk.strip())
26
  return chunks
27
 
28
def batch_translate(texts, src_lang, tgt_lang):
    """Translate a list of strings from ``src_lang`` to ``tgt_lang`` in batches.

    Args:
        texts: list of source strings to translate.
        src_lang: NLLB source-language code (e.g. "eng_Latn").
        tgt_lang: NLLB target-language code (e.g. "arb_Arab").

    Returns:
        List of translated strings, in the same order as ``texts``.
    """
    results = []
    batch_size = 8  # increase according to server resources
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        tokenizer.src_lang = src_lang
        inputs = tokenizer(batch, return_tensors="pt", padding=True,
                           truncation=True, max_length=1024)
        # Bug fix: tokenizers have no ``convert_lang_to_id`` method (the
        # original line raised AttributeError). ``convert_tokens_to_ids``
        # resolves the language code to its BOS token id and works across
        # transformers versions.
        generated = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
        )
        results.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))
    return results
40
 
41
def detect_language(text):
    """Guess the language of *text* and return the matching NLLB code.

    Falls back to English ("eng_Latn") for languages outside the map.
    """
    # Detection only needs the leading characters of long inputs.
    snippet = text[:2000] if len(text) > 2000 else text
    iso_to_nllb = {"en": "eng_Latn", "ar": "arb_Arab", "fr": "fra_Latn", "hi": "hin_Deva", "es": "spa_Latn", "de": "deu_Latn"}
    return iso_to_nllb.get(detect(snippet), "eng_Latn")
48
 
 
 
 
 
49
  @app.post("/translate-text")
50
  async def translate_text(
51
  text: str = Form(...),
 
5
 
6
app = FastAPI()

# Loaded once at import time so requests don't pay the model-load cost.
MODEL_NAME = "facebook/nllb-200-distilled-600M"  # the lightest checkpoint; avoids memory problems
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
11
 
12
# All supported languages, as {NLLB code: display name}.
NLLB_LANGS = {
    "afr_Latn": "Afrikaans",
    "amh_Ethi": "Amharic",
    "arb_Arab": "Arabic",
    "ary_Arab": "Moroccan Arabic",
    "arz_Arab": "Egyptian Arabic",
    "asm_Beng": "Assamese",
    "ast_Latn": "Asturian",
    "awa_Deva": "Awadhi",
    "ayr_Latn": "Aymara",
    "azb_Arab": "South Azerbaijani",
    "azj_Latn": "North Azerbaijani",
    "bak_Cyrl": "Bashkir",
    "bam_Latn": "Bambara",
    "ban_Latn": "Balinese",
    "bel_Cyrl": "Belarusian",
    "bem_Latn": "Bemba",
    "ben_Beng": "Bengali",
    "bho_Deva": "Bhojpuri",
    "bjn_Arab": "Banjar (Arabic)",
    "bjn_Latn": "Banjar (Latin)",
    "bod_Tibt": "Standard Tibetan",
    "bos_Latn": "Bosnian",
    "bug_Latn": "Buginese",
    "bul_Cyrl": "Bulgarian",
    "cat_Latn": "Catalan",
    "ceb_Latn": "Cebuano",
    "ces_Latn": "Czech",
    "cjk_Latn": "Chokwe",
    "ckb_Arab": "Sorani Kurdish",
    "crh_Latn": "Crimean Turkish",
    "csb_Latn": "Kashubian",
    "cym_Latn": "Welsh",
    "dan_Latn": "Danish",
    "deu_Latn": "German",
    "dik_Latn": "Dinka",
    "dyu_Latn": "Dyula",
    "dzo_Tibt": "Dzongkha",
    "ell_Grek": "Greek",
    "eng_Latn": "English",
    "epo_Latn": "Esperanto",
    "est_Latn": "Estonian",
    "eus_Latn": "Basque",
    "ewe_Latn": "Ewe",
    "fao_Latn": "Faroese",
    "fij_Latn": "Fijian",
    "fin_Latn": "Finnish",
    "fon_Latn": "Fon",
    "fra_Latn": "French",
    "fur_Latn": "Friulian",
    "fuv_Latn": "Nigerian Fulfulde",
    "gla_Latn": "Scottish Gaelic",
    "gle_Latn": "Irish",
    "glg_Latn": "Galician",
    "grn_Latn": "Guarani",
    "guj_Gujr": "Gujarati",
    "hat_Latn": "Haitian Creole",
    "hau_Latn": "Hausa",
    "heb_Hebr": "Hebrew",
    "hin_Deva": "Hindi",
    "hne_Deva": "Chhattisgarhi",
    "hrv_Latn": "Croatian",
    "hun_Latn": "Hungarian",
    "hye_Armn": "Armenian",
    "ibo_Latn": "Igbo",
    "ilo_Latn": "Ilocano",
    "ind_Latn": "Indonesian",
    "isl_Latn": "Icelandic",
    "ita_Latn": "Italian",
    "jav_Latn": "Javanese",
    "jpn_Jpan": "Japanese",
    "kab_Latn": "Kabyle",
    "kac_Latn": "Jingpho",
    "kam_Latn": "Kamba",
    "kan_Knda": "Kannada",
    "kas_Arab": "Kashmiri (Arabic)",
    "kas_Deva": "Kashmiri (Devanagari)",
    "kat_Geor": "Georgian",
    "kaz_Cyrl": "Kazakh",
    "kbp_Latn": "Kabiyè",
    "kea_Latn": "Kabuverdianu",
    "khm_Khmr": "Khmer",
    "kik_Latn": "Kikuyu",
    "kin_Latn": "Kinyarwanda",
    "kir_Cyrl": "Kyrgyz",
    "kmb_Latn": "Kimbundu",
    "kmr_Latn": "Kurmanji Kurdish",
    "kon_Latn": "Kikongo",
    "kor_Hang": "Korean",
    "lao_Laoo": "Lao",
    "lij_Latn": "Ligurian",
    "lim_Latn": "Limburgish",
    "lin_Latn": "Lingala",
    "lit_Latn": "Lithuanian",
    "lmo_Latn": "Lombard",
    "ltg_Latn": "Latgalian",
    "ltz_Latn": "Luxembourgish",
    "lua_Latn": "Luba-Kasai",
    "lug_Latn": "Ganda",
    "luo_Latn": "Luo",
    "lus_Latn": "Mizo",
    "mag_Deva": "Magahi",
    "mai_Deva": "Maithili",
    "mal_Mlym": "Malayalam",
    "mar_Deva": "Marathi",
    "min_Latn": "Minangkabau",
    "mkd_Cyrl": "Macedonian",
    "plt_Latn": "Plateau Malagasy",
    "mlt_Latn": "Maltese",
    "mni_Beng": "Manipuri",
    "khk_Cyrl": "Halh Mongolian",
    "mos_Latn": "Mossi",
    "mri_Latn": "Maori",
    "msa_Latn": "Malay",
    "mya_Mymr": "Burmese",
    "nld_Latn": "Dutch",
    "nno_Latn": "Norwegian Nynorsk",
    "nob_Latn": "Norwegian Bokmål",
    "npi_Deva": "Nepali",
    "nso_Latn": "Northern Sotho",
    "nya_Latn": "Nyanja",
    "oci_Latn": "Occitan",
    "ory_Orya": "Odia",
    "pag_Latn": "Pangasinan",
    "pan_Guru": "Punjabi",
    "pap_Latn": "Papiamento",
    "pol_Latn": "Polish",
    "por_Latn": "Portuguese",
    "ron_Latn": "Romanian",
    "run_Latn": "Rundi",
    "rus_Cyrl": "Russian",
    "sag_Latn": "Sango",
    "san_Deva": "Sanskrit",
    "sat_Beng": "Santali",
    "scn_Latn": "Sicilian",
    "shn_Mymr": "Shan",
    "sin_Sinh": "Sinhala",
    "slk_Latn": "Slovak",
    "slv_Latn": "Slovenian",
    "sna_Latn": "Shona",
    "snd_Arab": "Sindhi",
    "som_Latn": "Somali",
    "spa_Latn": "Spanish",
    "als_Latn": "Tosk Albanian",
    "sqi_Latn": "Albanian",
    "srp_Cyrl": "Serbian",
    "ssw_Latn": "Swazi",
    "sun_Latn": "Sundanese",
    "swe_Latn": "Swedish",
    "swh_Latn": "Swahili",
    "szl_Latn": "Silesian",
    "tam_Taml": "Tamil",
    "tat_Cyrl": "Tatar",
    "tel_Telu": "Telugu",
    "tgk_Cyrl": "Tajik",
    "tgl_Latn": "Tagalog",
    "tha_Thai": "Thai",
    "tir_Ethi": "Tigrinya",
    "taq_Latn": "Tamasheq (Latin)",
    "taq_Tfng": "Tamasheq (Tifinagh)",
    "tpi_Latn": "Tok Pisin",
    "tsn_Latn": "Tswana",
    "tso_Latn": "Tsonga",
    "tur_Latn": "Turkish",
    "twi_Latn": "Twi",
    "tzm_Tfng": "Central Atlas Tamazight",
    "uig_Arab": "Uyghur",
    "ukr_Cyrl": "Ukrainian",
    "umb_Latn": "Umbundu",
    "urd_Arab": "Urdu",
    "uzn_Latn": "Northern Uzbek",
    "vec_Latn": "Venetian",
    "vie_Latn": "Vietnamese",
    "war_Latn": "Waray",
    "wol_Latn": "Wolof",
    "xho_Latn": "Xhosa",
    "ydd_Hebr": "Eastern Yiddish",
    "yor_Latn": "Yoruba",
    "yue_Hant": "Cantonese",
    "zho_Hans": "Chinese (Simplified)",
    "zho_Hant": "Chinese (Traditional)",
    "zul_Latn": "Zulu"
}
196
+
197
  def split_text_lines(text, max_chunk_length=900):
 
198
  lines = text.splitlines()
199
  chunks = []
200
  chunk = ""
 
205
  if chunk.strip():
206
  chunks.append(chunk.strip())
207
  chunk = line + "\n"
208
+ if chunk.strip(): chunks.append(chunk.strip())
 
209
  return chunks
210
 
211
def batch_translate(texts, src_lang, tgt_lang, batch_size=8):
    """Translate a list of strings from ``src_lang`` to ``tgt_lang`` in batches.

    Args:
        texts: list of source strings to translate.
        src_lang: NLLB source-language code (e.g. "eng_Latn").
        tgt_lang: NLLB target-language code (e.g. "arb_Arab").
        batch_size: strings per model call; raise according to server resources.

    Returns:
        List of translated strings, in the same order as ``texts``.
    """
    results = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        tokenizer.src_lang = src_lang
        inputs = tokenizer(batch, return_tensors="pt", padding=True,
                           truncation=True, max_length=1024)
        # ``tokenizer.lang_code_to_id`` was removed from NLLB tokenizers in
        # transformers >= 4.38; ``convert_tokens_to_ids`` resolves the target
        # language's BOS token id on all versions.
        generated = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
        )
        results.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))
    return results
222
 
223
def detect_language(text):
    """Detect the language of *text* and return the matching NLLB code.

    Detection runs on at most the first 2000 characters. Unmapped or
    undetectable text falls back to English ("eng_Latn").
    """
    sample = text[:2000]  # slicing is safe even when text is shorter
    lang_map = {
        "en": "eng_Latn", "ar": "arb_Arab", "fr": "fra_Latn", "hi": "hin_Deva", "es": "spa_Latn", "de": "deu_Latn",
        # Add your preferred ISO-639-1 -> NLLB codes here (or wire in the full table as needed).
    }
    try:
        lang = detect(sample)
    except Exception:
        # langdetect raises on empty or featureless input (e.g. only digits
        # or punctuation); fall back instead of returning HTTP 500.
        return "eng_Latn"
    return lang_map.get(lang, "eng_Latn")
231
 
232
@app.get("/supported-languages")
def supported_languages():
    """Return the full mapping of supported NLLB language codes to names."""
    return NLLB_LANGS
235
+
236
  @app.post("/translate-text")
237
  async def translate_text(
238
  text: str = Form(...),