DarkMo0o committed on
Commit
a36c3e2
·
verified ·
1 Parent(s): 69b53c5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -258
app.py CHANGED
@@ -1,265 +1,63 @@
1
- from fastapi import FastAPI, File, UploadFile, Form
2
- from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
3
- from langdetect import detect
4
- import re
5
 
6
- app = FastAPI()
 
7
 
8
- MODEL_NAME = "facebook/nllb-200-distilled-600M" # الأخف ولمشاكل الذاكرة
9
- model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
10
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
11
 
12
- # قائمة جميع اللغات المدعومة، الكود: الاسم
13
- NLLB_LANGS = {
14
- "afr_Latn": "Afrikaans",
15
- "amh_Ethi": "Amharic",
16
- "arb_Arab": "Arabic",
17
- "ary_Arab": "Moroccan Arabic",
18
- "arz_Arab": "Egyptian Arabic",
19
- "asm_Beng": "Assamese",
20
- "ast_Latn": "Asturian",
21
- "awa_Deva": "Awadhi",
22
- "ayr_Latn": "Aymara",
23
- "azb_Arab": "South Azerbaijani",
24
- "azj_Latn": "North Azerbaijani",
25
- "bak_Cyrl": "Bashkir",
26
- "bam_Latn": "Bambara",
27
- "ban_Latn": "Balinese",
28
- "bel_Cyrl": "Belarusian",
29
- "bem_Latn": "Bemba",
30
- "ben_Beng": "Bengali",
31
- "bho_Deva": "Bhojpuri",
32
- "bjn_Arab": "Banjar (Arabic)",
33
- "bjn_Latn": "Banjar (Latin)",
34
- "bod_Tibt": "Standard Tibetan",
35
- "bos_Latn": "Bosnian",
36
- "bug_Latn": "Buginese",
37
- "bul_Cyrl": "Bulgarian",
38
- "cat_Latn": "Catalan",
39
- "ceb_Latn": "Cebuano",
40
- "ces_Latn": "Czech",
41
- "cjk_Latn": "Chokwe",
42
- "ckb_Arab": "Sorani Kurdish",
43
- "crh_Latn": "Crimean Turkish",
44
- "csb_Latn": "Kashubian",
45
- "cym_Latn": "Welsh",
46
- "dan_Latn": "Danish",
47
- "deu_Latn": "German",
48
- "dik_Latn": "Dinka",
49
- "dyu_Latn": "Dyula",
50
- "dzo_Tibt": "Dzongkha",
51
- "ell_Grek": "Greek",
52
- "eng_Latn": "English",
53
- "epo_Latn": "Esperanto",
54
- "est_Latn": "Estonian",
55
- "eus_Latn": "Basque",
56
- "ewe_Latn": "Ewe",
57
- "fao_Latn": "Faroese",
58
- "fij_Latn": "Fijian",
59
- "fin_Latn": "Finnish",
60
- "fon_Latn": "Fon",
61
- "fra_Latn": "French",
62
- "fur_Latn": "Friulian",
63
- "fuv_Latn": "Nigerian Fulfulde",
64
- "gla_Latn": "Scottish Gaelic",
65
- "gle_Latn": "Irish",
66
- "glg_Latn": "Galician",
67
- "grn_Latn": "Guarani",
68
- "guj_Gujr": "Gujarati",
69
- "hat_Latn": "Haitian Creole",
70
- "hau_Latn": "Hausa",
71
- "heb_Hebr": "Hebrew",
72
- "hin_Deva": "Hindi",
73
- "hne_Deva": "Chhattisgarhi",
74
- "hrv_Latn": "Croatian",
75
- "hun_Latn": "Hungarian",
76
- "hye_Armn": "Armenian",
77
- "ibo_Latn": "Igbo",
78
- "ilo_Latn": "Ilocano",
79
- "ind_Latn": "Indonesian",
80
- "isl_Latn": "Icelandic",
81
- "ita_Latn": "Italian",
82
- "jav_Latn": "Javanese",
83
- "jpn_Jpan": "Japanese",
84
- "kab_Latn": "Kabyle",
85
- "kac_Latn": "Jingpho",
86
- "kam_Latn": "Kamba",
87
- "kan_Knda": "Kannada",
88
- "kas_Arab": "Kashmiri (Arabic)",
89
- "kas_Deva": "Kashmiri (Devanagari)",
90
- "kat_Geor": "Georgian",
91
- "kaz_Cyrl": "Kazakh",
92
- "kbp_Latn": "Kabiyè",
93
- "kea_Latn": "Kabuverdianu",
94
- "khm_Khmr": "Khmer",
95
- "kik_Latn": "Kikuyu",
96
- "kin_Latn": "Kinyarwanda",
97
- "kir_Cyrl": "Kyrgyz",
98
- "kmb_Latn": "Kimbundu",
99
- "kmr_Latn": "Kurmanji Kurdish",
100
- "kon_Latn": "Kikongo",
101
- "kor_Hang": "Korean",
102
- "lao_Laoo": "Lao",
103
- "lij_Latn": "Ligurian",
104
- "lim_Latn": "Limburgish",
105
- "lin_Latn": "Lingala",
106
- "lit_Latn": "Lithuanian",
107
- "lmo_Latn": "Lombard",
108
- "ltg_Latn": "Latgalian",
109
- "ltz_Latn": "Luxembourgish",
110
- "lua_Latn": "Luba-Kasai",
111
- "lug_Latn": "Ganda",
112
- "luo_Latn": "Luo",
113
- "lus_Latn": "Mizo",
114
- "mag_Deva": "Magahi",
115
- "mai_Deva": "Maithili",
116
- "mal_Mlym": "Malayalam",
117
- "mar_Deva": "Marathi",
118
- "min_Latn": "Minangkabau",
119
- "mkd_Cyrl": "Macedonian",
120
- "plt_Latn": "Plateau Malagasy",
121
- "mlt_Latn": "Maltese",
122
- "mni_Beng": "Manipuri",
123
- "khk_Cyrl": "Halh Mongolian",
124
- "mos_Latn": "Mossi",
125
- "mri_Latn": "Maori",
126
- "msa_Latn": "Malay",
127
- "mya_Mymr": "Burmese",
128
- "nld_Latn": "Dutch",
129
- "nno_Latn": "Norwegian Nynorsk",
130
- "nob_Latn": "Norwegian Bokmål",
131
- "npi_Deva": "Nepali",
132
- "nso_Latn": "Northern Sotho",
133
- "nya_Latn": "Nyanja",
134
- "oci_Latn": "Occitan",
135
- "ory_Orya": "Odia",
136
- "pag_Latn": "Pangasinan",
137
- "pan_Guru": "Punjabi",
138
- "pap_Latn": "Papiamento",
139
- "pol_Latn": "Polish",
140
- "por_Latn": "Portuguese",
141
- "ron_Latn": "Romanian",
142
- "run_Latn": "Rundi",
143
- "rus_Cyrl": "Russian",
144
- "sag_Latn": "Sango",
145
- "san_Deva": "Sanskrit",
146
- "sat_Beng": "Santali",
147
- "scn_Latn": "Sicilian",
148
- "shn_Mymr": "Shan",
149
- "sin_Sinh": "Sinhala",
150
- "slk_Latn": "Slovak",
151
- "slv_Latn": "Slovenian",
152
- "sna_Latn": "Shona",
153
- "snd_Arab": "Sindhi",
154
- "som_Latn": "Somali",
155
- "spa_Latn": "Spanish",
156
- "als_Latn": "Tosk Albanian",
157
- "sqi_Latn": "Albanian",
158
- "srp_Cyrl": "Serbian",
159
- "ssw_Latn": "Swazi",
160
- "sun_Latn": "Sundanese",
161
- "swe_Latn": "Swedish",
162
- "swh_Latn": "Swahili",
163
- "szl_Latn": "Silesian",
164
- "tam_Taml": "Tamil",
165
- "tat_Cyrl": "Tatar",
166
- "tel_Telu": "Telugu",
167
- "tgk_Cyrl": "Tajik",
168
- "tgl_Latn": "Tagalog",
169
- "tha_Thai": "Thai",
170
- "tir_Ethi": "Tigrinya",
171
- "taq_Latn": "Tamasheq (Latin)",
172
- "taq_Tfng": "Tamasheq (Tifinagh)",
173
- "tpi_Latn": "Tok Pisin",
174
- "tsn_Latn": "Tswana",
175
- "tso_Latn": "Tsonga",
176
- "tur_Latn": "Turkish",
177
- "twi_Latn": "Twi",
178
- "tzm_Tfng": "Central Atlas Tamazight",
179
- "uig_Arab": "Uyghur",
180
- "ukr_Cyrl": "Ukrainian",
181
- "umb_Latn": "Umbundu",
182
- "urd_Arab": "Urdu",
183
- "uzn_Latn": "Northern Uzbek",
184
- "vec_Latn": "Venetian",
185
- "vie_Latn": "Vietnamese",
186
- "war_Latn": "Waray",
187
- "wol_Latn": "Wolof",
188
- "xho_Latn": "Xhosa",
189
- "ydd_Hebr": "Eastern Yiddish",
190
- "yor_Latn": "Yoruba",
191
- "yue_Hant": "Cantonese",
192
- "zho_Hans": "Chinese (Simplified)",
193
- "zho_Hant": "Chinese (Traditional)",
194
- "zul_Latn": "Zulu"
195
- }
196
 
197
- def split_text_lines(text, max_chunk_length=900):
198
- lines = text.splitlines()
199
- chunks = []
200
- chunk = ""
201
- for line in lines:
202
- if len(chunk) + len(line) < max_chunk_length:
203
- chunk += line + "\n"
204
- else:
205
- if chunk.strip():
206
- chunks.append(chunk.strip())
207
- chunk = line + "\n"
208
- if chunk.strip(): chunks.append(chunk.strip())
209
- return chunks
210
 
211
- def batch_translate(texts, src_lang, tgt_lang):
212
- results = []
213
- batch_size = 8
214
- for i in range(0, len(texts), batch_size):
215
- batch = texts[i:i+batch_size]
216
- tokenizer.src_lang = src_lang
217
- inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=1024)
218
- generated = model.generate(**inputs, forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang])
219
- translated = tokenizer.batch_decode(generated, skip_special_tokens=True)
220
- results.extend(translated)
221
- return results
222
 
223
  def detect_language(text):
224
- sample = text[:2000] if len(text) > 2000 else text
225
- lang = detect(sample)
226
- lang_map = {
227
- "en": "eng_Latn", "ar": "arb_Arab", "fr": "fra_Latn", "hi": "hin_Deva", "es": "spa_Latn", "de": "deu_Latn",
228
- # أضف أكوادك المفضلة هنا (أو استخدم الجدول الكامل تلقائياً حسب الحاجة)
229
- }
230
- return lang_map.get(lang, "eng_Latn")
231
-
232
- @app.get("/supported-languages")
233
- def supported_languages():
234
- return NLLB_LANGS
235
-
236
- @app.post("/translate-text")
237
- async def translate_text(
238
- text: str = Form(...),
239
- target_lang: str = Form(...)
240
- ):
241
- source_lang = detect_language(text)
242
- texts = re.split(r'(?<=[.!?\n])\s+', text.strip())
243
- chunks = []
244
- cur_chunk = ""
245
- for sentence in texts:
246
- if len(cur_chunk) + len(sentence) < 900:
247
- cur_chunk += sentence + " "
248
- else:
249
- chunks.append(cur_chunk.strip())
250
- cur_chunk = sentence + " "
251
- if cur_chunk.strip(): chunks.append(cur_chunk.strip())
252
- translated = batch_translate(chunks, source_lang, target_lang)
253
- return {"translated_text": "\n".join(translated)}
254
-
255
- @app.post("/translate-file")
256
- async def translate_file(
257
- file: UploadFile = File(...),
258
- target_lang: str = Form(...)
259
- ):
260
- contents = await file.read()
261
- original_text = contents.decode()
262
- source_lang = detect_language(original_text)
263
- lines = split_text_lines(original_text)
264
- translated_lines = batch_translate(lines, source_lang, target_lang)
265
- return {"translated_text": "\n".join(translated_lines)}
 
1
+ # قم بتشغيل هذا الكود على جهازك أو سيرفرك (Python 3.8+ مطلوب)
2
+ # install: pip install transformers sentencepiece flask
 
 
3
 
4
+ from transformers import pipeline
5
+ from flask import Flask, request, jsonify
6
 
7
+ app = Flask(__name__)
 
 
8
 
9
+ # النموذج متعدد اللغات الأفضل: facebook/m2m100_418M
10
+ translator = pipeline("translation", model="facebook/m2m100_418M")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
+ # دالة فحص اللغة المدخلة (افضل نموذج: papluca/xlm-roberta-base-language-detection)
13
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
14
+ import torch
 
 
 
 
 
 
 
 
 
 
15
 
16
+ lang_tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
17
+ lang_model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")
18
+ lang_labels = [
19
+ "af", "am", "ar", "as", "az", "be", "bg", "bn", "bo", "bs", "ca", "ceb", "co", "cs", "cy", "da", "de", "dv",
20
+ "el", "en", "es", "et", "eu", "fa", "fi", "fo", "fr", "fy", "ga", "gd", "gl", "gn", "gu", "ha", "haw", "he",
21
+ "hi", "hr", "ht", "hu", "hy", "id", "is", "it", "ja", "jv", "ka", "kk", "km", "kn", "ko", "ku", "ky", "la",
22
+ "lb", "lo", "lt", "lv", "mg", "mi", "mk", "ml", "mn", "mr", "ms", "mt", "my", "ne", "nl", "no", "ny", "oc",
23
+ "om", "or", "pa", "pl", "ps", "pt", "qu", "rm", "ro", "ru", "rw", "sd", "se", "sg", "sh", "si", "sk", "sl",
24
+ "sm", "sn", "so", "sq", "sr", "ss", "st", "su", "sv", "sw", "ta", "te", "tg", "th", "ti", "tk", "tl", "tn",
25
+ "tr", "ts", "tt", "tw", "ug", "uk", "ur", "uz", "vi", "vo", "wa", "wo", "xh", "yi", "yo", "zh"
26
+ ]
27
 
28
  def detect_language(text):
29
+ inputs = lang_tokenizer(text, return_tensors="pt", truncation=True)
30
+ with torch.no_grad():
31
+ logits = lang_model(**inputs).logits
32
+ predicted = torch.argmax(logits, dim=1)
33
+ lang_code = lang_labels[predicted.item()]
34
+ return lang_code
35
+
36
+ @app.route('/translate', methods=['POST'])
37
+ def translate():
38
+ data = request.json or {}
39
+ text = data.get('text', '')
40
+ detected = detect_language(text)
41
+ # إذا اللغة إنجليزية لا تترجم
42
+ if detected == "en":
43
+ return jsonify({
44
+ "success": True,
45
+ "translatedText": text,
46
+ "originalLanguage": "en"
47
+ })
48
+ translated = translator(text, src_lang=detected, tgt_lang="en")[0]["translation_text"]
49
+ return jsonify({
50
+ "success": True,
51
+ "translatedText": translated,
52
+ "originalLanguage": detected
53
+ })
54
+
55
+ @app.route('/detect', methods=['POST'])
56
+ def langdetect():
57
+ data = request.json or {}
58
+ text = data.get('text', '')
59
+ detected = detect_language(text)
60
+ return jsonify({"language": detected})
61
+
62
+ if __name__ == "__main__":
63
+ app.run(host="0.0.0.0", port=5005)