Spaces: Running
Update app.py
app.py CHANGED
@@ -1,265 +1,63 @@
- from langdetect import detect
- import re

- model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

- #
-     "afr_Latn": "Afrikaans",
-     "amh_Ethi": "Amharic",
-     "arb_Arab": "Arabic",
-     "ary_Arab": "Moroccan Arabic",
-     "arz_Arab": "Egyptian Arabic",
-     "asm_Beng": "Assamese",
-     "ast_Latn": "Asturian",
-     "awa_Deva": "Awadhi",
-     "ayr_Latn": "Aymara",
-     "azb_Arab": "South Azerbaijani",
-     "azj_Latn": "North Azerbaijani",
-     "bak_Cyrl": "Bashkir",
-     "bam_Latn": "Bambara",
-     "ban_Latn": "Balinese",
-     "bel_Cyrl": "Belarusian",
-     "bem_Latn": "Bemba",
-     "ben_Beng": "Bengali",
-     "bho_Deva": "Bhojpuri",
-     "bjn_Arab": "Banjar (Arabic)",
-     "bjn_Latn": "Banjar (Latin)",
-     "bod_Tibt": "Standard Tibetan",
-     "bos_Latn": "Bosnian",
-     "bug_Latn": "Buginese",
-     "bul_Cyrl": "Bulgarian",
-     "cat_Latn": "Catalan",
-     "ceb_Latn": "Cebuano",
-     "ces_Latn": "Czech",
-     "cjk_Latn": "Chokwe",
-     "ckb_Arab": "Sorani Kurdish",
-     "crh_Latn": "Crimean Turkish",
-     "csb_Latn": "Kashubian",
-     "cym_Latn": "Welsh",
-     "dan_Latn": "Danish",
-     "deu_Latn": "German",
-     "dik_Latn": "Dinka",
-     "dyu_Latn": "Dyula",
-     "dzo_Tibt": "Dzongkha",
-     "ell_Grek": "Greek",
-     "eng_Latn": "English",
-     "epo_Latn": "Esperanto",
-     "est_Latn": "Estonian",
-     "eus_Latn": "Basque",
-     "ewe_Latn": "Ewe",
-     "fao_Latn": "Faroese",
-     "fij_Latn": "Fijian",
-     "fin_Latn": "Finnish",
-     "fon_Latn": "Fon",
-     "fra_Latn": "French",
-     "fur_Latn": "Friulian",
-     "fuv_Latn": "Nigerian Fulfulde",
-     "gla_Latn": "Scottish Gaelic",
-     "gle_Latn": "Irish",
-     "glg_Latn": "Galician",
-     "grn_Latn": "Guarani",
-     "guj_Gujr": "Gujarati",
-     "hat_Latn": "Haitian Creole",
-     "hau_Latn": "Hausa",
-     "heb_Hebr": "Hebrew",
-     "hin_Deva": "Hindi",
-     "hne_Deva": "Chhattisgarhi",
-     "hrv_Latn": "Croatian",
-     "hun_Latn": "Hungarian",
-     "hye_Armn": "Armenian",
-     "ibo_Latn": "Igbo",
-     "ilo_Latn": "Ilocano",
-     "ind_Latn": "Indonesian",
-     "isl_Latn": "Icelandic",
-     "ita_Latn": "Italian",
-     "jav_Latn": "Javanese",
-     "jpn_Jpan": "Japanese",
-     "kab_Latn": "Kabyle",
-     "kac_Latn": "Jingpho",
-     "kam_Latn": "Kamba",
-     "kan_Knda": "Kannada",
-     "kas_Arab": "Kashmiri (Arabic)",
-     "kas_Deva": "Kashmiri (Devanagari)",
-     "kat_Geor": "Georgian",
-     "kaz_Cyrl": "Kazakh",
-     "kbp_Latn": "Kabiyè",
-     "kea_Latn": "Kabuverdianu",
-     "khm_Khmr": "Khmer",
-     "kik_Latn": "Kikuyu",
-     "kin_Latn": "Kinyarwanda",
-     "kir_Cyrl": "Kyrgyz",
-     "kmb_Latn": "Kimbundu",
-     "kmr_Latn": "Kurmanji Kurdish",
-     "kon_Latn": "Kikongo",
-     "kor_Hang": "Korean",
-     "lao_Laoo": "Lao",
-     "lij_Latn": "Ligurian",
-     "lim_Latn": "Limburgish",
-     "lin_Latn": "Lingala",
-     "lit_Latn": "Lithuanian",
-     "lmo_Latn": "Lombard",
-     "ltg_Latn": "Latgalian",
-     "ltz_Latn": "Luxembourgish",
-     "lua_Latn": "Luba-Kasai",
-     "lug_Latn": "Ganda",
-     "luo_Latn": "Luo",
-     "lus_Latn": "Mizo",
-     "mag_Deva": "Magahi",
-     "mai_Deva": "Maithili",
-     "mal_Mlym": "Malayalam",
-     "mar_Deva": "Marathi",
-     "min_Latn": "Minangkabau",
-     "mkd_Cyrl": "Macedonian",
-     "plt_Latn": "Plateau Malagasy",
-     "mlt_Latn": "Maltese",
-     "mni_Beng": "Manipuri",
-     "khk_Cyrl": "Halh Mongolian",
-     "mos_Latn": "Mossi",
-     "mri_Latn": "Maori",
-     "msa_Latn": "Malay",
-     "mya_Mymr": "Burmese",
-     "nld_Latn": "Dutch",
-     "nno_Latn": "Norwegian Nynorsk",
-     "nob_Latn": "Norwegian Bokmål",
-     "npi_Deva": "Nepali",
-     "nso_Latn": "Northern Sotho",
-     "nya_Latn": "Nyanja",
-     "oci_Latn": "Occitan",
-     "ory_Orya": "Odia",
-     "pag_Latn": "Pangasinan",
-     "pan_Guru": "Punjabi",
-     "pap_Latn": "Papiamento",
-     "pol_Latn": "Polish",
-     "por_Latn": "Portuguese",
-     "ron_Latn": "Romanian",
-     "run_Latn": "Rundi",
-     "rus_Cyrl": "Russian",
-     "sag_Latn": "Sango",
-     "san_Deva": "Sanskrit",
-     "sat_Beng": "Santali",
-     "scn_Latn": "Sicilian",
-     "shn_Mymr": "Shan",
-     "sin_Sinh": "Sinhala",
-     "slk_Latn": "Slovak",
-     "slv_Latn": "Slovenian",
-     "sna_Latn": "Shona",
-     "snd_Arab": "Sindhi",
-     "som_Latn": "Somali",
-     "spa_Latn": "Spanish",
-     "als_Latn": "Tosk Albanian",
-     "sqi_Latn": "Albanian",
-     "srp_Cyrl": "Serbian",
-     "ssw_Latn": "Swazi",
-     "sun_Latn": "Sundanese",
-     "swe_Latn": "Swedish",
-     "swh_Latn": "Swahili",
-     "szl_Latn": "Silesian",
-     "tam_Taml": "Tamil",
-     "tat_Cyrl": "Tatar",
-     "tel_Telu": "Telugu",
-     "tgk_Cyrl": "Tajik",
-     "tgl_Latn": "Tagalog",
-     "tha_Thai": "Thai",
-     "tir_Ethi": "Tigrinya",
-     "taq_Latn": "Tamasheq (Latin)",
-     "taq_Tfng": "Tamasheq (Tifinagh)",
-     "tpi_Latn": "Tok Pisin",
-     "tsn_Latn": "Tswana",
-     "tso_Latn": "Tsonga",
-     "tur_Latn": "Turkish",
-     "twi_Latn": "Twi",
-     "tzm_Tfng": "Central Atlas Tamazight",
-     "uig_Arab": "Uyghur",
-     "ukr_Cyrl": "Ukrainian",
-     "umb_Latn": "Umbundu",
-     "urd_Arab": "Urdu",
-     "uzn_Latn": "Northern Uzbek",
-     "vec_Latn": "Venetian",
-     "vie_Latn": "Vietnamese",
-     "war_Latn": "Waray",
-     "wol_Latn": "Wolof",
-     "xho_Latn": "Xhosa",
-     "ydd_Hebr": "Eastern Yiddish",
-     "yor_Latn": "Yoruba",
-     "yue_Hant": "Cantonese",
-     "zho_Hans": "Chinese (Simplified)",
-     "zho_Hant": "Chinese (Traditional)",
-     "zul_Latn": "Zulu"
- }

-     chunk = ""
-     for line in lines:
-         if len(chunk) + len(line) < max_chunk_length:
-             chunk += line + "\n"
-         else:
-             if chunk.strip():
-                 chunks.append(chunk.strip())
-             chunk = line + "\n"
-     if chunk.strip(): chunks.append(chunk.strip())
-     return chunks

  def detect_language(text):

- ):
-     contents = await file.read()
-     original_text = contents.decode()
-     source_lang = detect_language(original_text)
-     lines = split_text_lines(original_text)
-     translated_lines = batch_translate(lines, source_lang, target_lang)
-     return {"translated_text": "\n".join(translated_lines)}
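Only fragments of the removed helpers survive; the chunking loop above reads as a greedy line packer. A self-contained sketch of it, for reference (the name split_text_lines comes from the removed endpoint code, while the max_chunk_length default of 512 is an assumption):

def split_text_lines(text, max_chunk_length=512):  # default length is assumed
    # Greedily pack whole lines into chunks shorter than max_chunk_length,
    # so each piece stays within the translation model's input budget.
    chunks = []
    chunk = ""
    for line in text.splitlines():
        if len(chunk) + len(line) < max_chunk_length:
            chunk += line + "\n"
        else:
            if chunk.strip():
                chunks.append(chunk.strip())
            chunk = line + "\n"
    if chunk.strip():
        chunks.append(chunk.strip())
    return chunks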
+ # Run this code on your own machine or server (Python 3.8+ required)
+ # install: pip install transformers sentencepiece flask

+ from transformers import pipeline
+ from flask import Flask, request, jsonify

+ app = Flask(__name__)

+ # The multilingual translation model (best choice: facebook/m2m100_418M)
+ translator = pipeline("translation", model="facebook/m2m100_418M")
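For M2M100, the translation pipeline takes per-call src_lang/tgt_lang ISO-639-1 codes. A quick standalone check, as a sketch (assumes the translator above is loaded; not part of the app):

result = translator("Bonjour tout le monde", src_lang="fr", tgt_lang="en")
print(result[0]["translation_text"])  # expected: an English rendering such as "Hello everyone"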

+ # Detect the input language (best model: papluca/xlm-roberta-base-language-detection)
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ import torch

+ lang_tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
+ lang_model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")
+ # Use the checkpoint's own id2label mapping (it covers 20 languages);
+ # a hand-maintained label list can drift out of sync with the output indices.

  def detect_language(text):
+     inputs = lang_tokenizer(text, return_tensors="pt", truncation=True)
+     with torch.no_grad():
+         logits = lang_model(**inputs).logits
+     predicted = torch.argmax(logits, dim=1)
+     lang_code = lang_model.config.id2label[predicted.item()]
+     return lang_code
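A one-off check of the detector, as a sketch (assumes the two models above are loaded; the expected outputs depend on the checkpoint's label set):

print(detect_language("Bonjour tout le monde"))  # expected: "fr"
print(detect_language("مرحبا بالعالم"))           # expected: "ar"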

+ @app.route('/translate', methods=['POST'])
+ def translate():
+     data = request.get_json(silent=True) or {}
+     text = data.get('text', '')
+     detected = detect_language(text)
+     # If the language is English, do not translate
+     if detected == "en":
+         return jsonify({
+             "success": True,
+             "translatedText": text,
+             "originalLanguage": "en"
+         })
+     translated = translator(text, src_lang=detected, tgt_lang="en")[0]["translation_text"]
+     return jsonify({
+         "success": True,
+         "translatedText": translated,
+         "originalLanguage": detected
+     })
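A minimal client call once the server is running, as a sketch (uses the requests package and the port configured below):

import requests

resp = requests.post("http://localhost:5005/translate", json={"text": "مرحبا بالعالم"})
print(resp.json())  # e.g. {"success": true, "translatedText": "Hello world", "originalLanguage": "ar"}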

+ @app.route('/detect', methods=['POST'])
+ def langdetect():
+     data = request.get_json(silent=True) or {}
+     text = data.get('text', '')
+     detected = detect_language(text)
+     return jsonify({"language": detected})

+ if __name__ == "__main__":
+     app.run(host="0.0.0.0", port=5005)
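Started with `python app.py`, the service listens on 0.0.0.0:5005 and exposes two JSON endpoints, POST /translate and POST /detect, each expecting a body of the form {"text": "..."}.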