File size: 2,874 Bytes
a36c3e2
c242fc2
3c0b058
 
 
 
db9642a
871c16a
a36c3e2
db9642a
e7cfc43
db9642a
 
69b53c5
db9642a
 
 
 
 
3c0b058
db9642a
a36c3e2
3c0b058
 
 
 
 
 
 
 
 
db9642a
 
 
 
 
3c0b058
 
 
db9642a
 
3c0b058
 
 
c242fc2
3c0b058
 
 
 
 
db9642a
3c0b058
 
 
 
db9642a
3c0b058
db9642a
3c0b058
 
c242fc2
db9642a
 
 
 
 
 
 
 
 
 
 
 
 
 
d314308
a36c3e2
c242fc2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from flask import Flask, request, jsonify
from flask_cors import CORS
from transformers import (
    MBartForConditionalGeneration, MBart50TokenizerFast, pipeline,
    MarianMTModel, MarianTokenizer
)
import torch

app = Flask(__name__)
CORS(app)  # allow cross-origin requests so a browser frontend can call this API

# Language-detection pipeline (XLM-RoBERTa fine-tuned for language ID).
# NOTE: downloaded from the Hugging Face hub on first run — requires network access.
lang_detector = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection")

# Map ISO 639-1 codes (the labels emitted by the detector) to MBART-50 language codes.
# Languages absent from this map fall back to English downstream.
LANG_CODE_MAP = {
    "ar": "ar_AR", "en": "en_XX", "es": "es_XX", "fr": "fr_XX",
    "de": "de_DE", "it": "it_IT", "ru": "ru_RU", "zh": "zh_CN",
    "ja": "ja_XX", "ko": "ko_KR", "tr": "tr_TR", "pt": "pt_XX",
    # add more if you like
}

# MBART-50 many-to-one model: translates any supported source language into English.
mbart_model_name = "facebook/mbart-large-50-many-to-one-mmt"
mbart_tokenizer = MBart50TokenizerFast.from_pretrained(mbart_model_name)
mbart_model = MBartForConditionalGeneration.from_pretrained(mbart_model_name)

# Dedicated Arabic->English Marian model, preferred over MBART for Arabic input.
helsinki_model_name = "Helsinki-NLP/opus-mt-ar-en"
helsinki_tokenizer = MarianTokenizer.from_pretrained(helsinki_model_name)
helsinki_model = MarianMTModel.from_pretrained(helsinki_model_name)

def detect_language(text: str):
    """Best-effort language identification for *text*.

    Returns an ISO 639-1 code that is guaranteed to be a key of
    ``LANG_CODE_MAP``; returns ``"en"`` when detection fails or the
    detected language is not supported.
    """
    try:
        label = lang_detector(text)[0]["label"]
    except Exception:
        # Detector failed for any reason — degrade gracefully to English.
        return "en"
    if label in LANG_CODE_MAP:
        return label
    return "en"

def translate_to_english(text: str):
    """Translate *text* into English.

    Arabic input is routed through the dedicated Helsinki-NLP model;
    every other detected language goes through MBART-50 many-to-one.
    On any model/tokenizer failure a human-readable error string is
    returned instead of raising (callers rely on this contract).
    """
    source = detect_language(text)
    print(f"Detected language: {source}")

    try:
        if source == "ar":
            # Dedicated Arabic->English model.
            batch = helsinki_tokenizer(text, return_tensors="pt", padding=True)
            output_ids = helsinki_model.generate(**batch)
            return helsinki_tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # MBART path for all other languages; force English as the target.
        mbart_tokenizer.src_lang = LANG_CODE_MAP.get(source, "en_XX")
        batch = mbart_tokenizer(text, return_tensors="pt")
        output_ids = mbart_model.generate(
            **batch,
            forced_bos_token_id=mbart_tokenizer.lang_code_to_id["en_XX"]
        )
        return mbart_tokenizer.decode(output_ids[0], skip_special_tokens=True)

    except Exception as e:
        return f"❌ Translation Error: {str(e)}"

@app.route("/")
def home():
    """Health-check endpoint describing how to use the service."""
    payload = {
        "status": "Translation service is running!",
        "usage": "POST /translate with JSON {'text': 'your text here'}",
    }
    return jsonify(payload)

@app.route("/translate", methods=["POST"])
def translate_text():
    """Translate posted text to English.

    Expects JSON ``{"text": "..."}``. Returns
    ``{"success": True, "translatedText": "..."}`` on success, or
    ``{"success": False, "error": "..."}`` with HTTP 400 on bad input.
    """
    # silent=True: a missing/malformed JSON body yields None instead of a
    # framework-generated HTML 400, so clients always receive JSON errors.
    data = request.get_json(silent=True)
    if not data or 'text' not in data:
        return jsonify({"success": False, "error": "No text provided"}), 400

    text = data["text"]
    # Reject non-string or blank payloads before they reach the models.
    if not isinstance(text, str) or not text.strip():
        return jsonify({"success": False, "error": "No text provided"}), 400

    translated = translate_to_english(text)
    return jsonify({"success": True, "translatedText": translated})

if __name__ == "__main__":
    # Listen on all interfaces; NOTE(review): 7860 looks like a Hugging Face
    # Spaces deployment port — confirm against the actual deployment target.
    app.run(host="0.0.0.0", port=7860)