abood-bak committed on
Commit
b9a0eeb
·
1 Parent(s): 3c17bf7

the translation models

Browse files
Files changed (3) hide show
  1. .gitignore +34 -0
  2. requirements.txt +7 -0
  3. transliteration_api.py +643 -0
.gitignore ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environment variables
2
+ .env
3
+ .env.local
4
+ .env.development
5
+ .env.production
6
+
7
+ # Virtual environments
8
+ venv/
9
+ env/
10
+ .venv/
11
+ ENV/
12
+
13
+ # Byte-compiled / optimized / DLL files
14
+ __pycache__/
15
+ *.py[cod]
16
+ *$py.class
17
+
18
+ # Distribution / packaging
19
+ dist/
20
+ build/
21
+ *.egg-info/
22
+ *.egg
23
+
24
+ # IDE files
25
+ .vscode/
26
+ .idea/
27
+ *.sublime-project
28
+ *.sublime-workspace
29
+
30
+ # Logs
31
+ *.log
32
+ logs/
33
+
34
+ myenv/
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ flask==3.0.0
2
+ flask-caching==2.1.0
3
+ transformers==4.36.0
4
+ torch==2.1.0
5
+ sentencepiece==0.1.99
6
+ redis==5.0.1
7
+ accelerate==0.25.0
transliteration_api.py ADDED
@@ -0,0 +1,643 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Flask API with BETTER Translation Models + Redis Cache
Uses high-quality models for both directions with Redis caching
"""

from dotenv import load_dotenv

# Load .env before any os.getenv() below reads configuration.
load_dotenv()

import os
import re
import time
import warnings
from functools import lru_cache
from typing import List

from flask import Flask, jsonify, request
from flask_caching import Cache

warnings.filterwarnings("ignore")

app = Flask(__name__)

# Redis cache configuration (single-URL form, e.g. an Upstash endpoint).
REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0")
CACHE_DEFAULT_TIMEOUT = int(os.getenv("CACHE_DEFAULT_TIMEOUT", 3600))

cache_config = {
    "CACHE_TYPE": "RedisCache",
    "CACHE_REDIS_URL": REDIS_URL,  # MUST be rediss:// for TLS endpoints
    "CACHE_DEFAULT_TIMEOUT": CACHE_DEFAULT_TIMEOUT,
    "CACHE_KEY_PREFIX": "transliteration:",
}

print(f"📦 Redis Configuration:")
print(f"   URL: {REDIS_URL.split('@')[-1]}")  # Print only the endpoint for security
print(f"   Timeout: {CACHE_DEFAULT_TIMEOUT}s")

try:
    cache = Cache(app, config=cache_config)
    # Cache() alone does not connect; prove reachability with a round trip.
    with app.app_context():
        cache.set("ping", "pong", timeout=10)
        if cache.get("ping") == "pong":
            print("✅ Upstash Redis connected successfully")
        else:
            raise Exception("Ping test failed")
except Exception as e:
    # Degrade gracefully to an in-process cache so the API still starts.
    print(f"⚠️ Redis connection failed: {e}")
    print("   Falling back to SimpleCache")
    cache = Cache(
        app,
        config={
            "CACHE_TYPE": "SimpleCache",
            "CACHE_DEFAULT_TIMEOUT": CACHE_DEFAULT_TIMEOUT,
            "CACHE_THRESHOLD": 10000,
        },
    )

# Global model/tokenizer handles; populated by load_models() at startup.
ar_en_model = None
en_ar_model = None
ar_en_tokenizer = None
en_ar_tokenizer = None
device = None
USE_TRANSFORMERS = True

# Which models to use
MODEL_CHOICE = os.getenv("MODEL_CHOICE", "opus-big")  # Options: "opus-big", "marefa"

# In-process performance metrics (reset on restart, not shared across workers).
metrics = {
    "cache_hits": 0,
    "cache_misses": 0,
    "model_requests": 0,
    "total_requests": 0,
    "avg_response_time": 0.0,
}

print("🚀 Starting Transliteration API with Better Models + Redis Cache...")
+
112
+ def load_models():
113
+ """Load BETTER translation models"""
114
+ global \
115
+ ar_en_model, \
116
+ en_ar_model, \
117
+ ar_en_tokenizer, \
118
+ en_ar_tokenizer, \
119
+ device, \
120
+ USE_TRANSFORMERS, \
121
+ MODEL_CHOICE
122
+
123
+ try:
124
+ import torch
125
+ from transformers import MarianMTModel, MarianTokenizer
126
+
127
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
128
+ print(f"🖥️ Using device: {device}")
129
+
130
+ if MODEL_CHOICE == "opus-big":
131
+ # OPTION 1: OPUS Big Models (BEST QUALITY - Recommended)
132
+ print("📥 Loading OPUS-MT-BIG models (high quality)...")
133
+
134
+ # Arabic to English (OPUS Big)
135
+ print(" Loading AR→EN (opus-mt-tc-big-ar-en)...")
136
+ ar_en_model_name = "Helsinki-NLP/opus-mt-tc-big-ar-en"
137
+ ar_en_tokenizer = MarianTokenizer.from_pretrained(ar_en_model_name)
138
+ ar_en_model = MarianMTModel.from_pretrained(ar_en_model_name).to(device)
139
+ ar_en_model.eval()
140
+ print(" ✅ AR→EN loaded")
141
+
142
+ # English to Arabic (OPUS Big) - MUCH BETTER than basic model
143
+ print(" Loading EN→AR (opus-mt-tc-big-en-ar)...")
144
+ en_ar_model_name = "Helsinki-NLP/opus-mt-tc-big-en-ar"
145
+ en_ar_tokenizer = MarianTokenizer.from_pretrained(en_ar_model_name)
146
+ en_ar_model = MarianMTModel.from_pretrained(en_ar_model_name).to(device)
147
+ en_ar_model.eval()
148
+ print(" ✅ EN→AR loaded")
149
+
150
+ print("🎉 OPUS-MT-BIG models loaded successfully!")
151
+
152
+ elif MODEL_CHOICE == "marefa":
153
+ # OPTION 2: Marefa Model (Specialized for Arabic)
154
+ print("📥 Loading Marefa models (Arabic-specialized)...")
155
+
156
+ # Arabic to English (OPUS Big - still best for this direction)
157
+ print(" Loading AR→EN (opus-mt-tc-big-ar-en)...")
158
+ ar_en_model_name = "Helsinki-NLP/opus-mt-tc-big-ar-en"
159
+ ar_en_tokenizer = MarianTokenizer.from_pretrained(ar_en_model_name)
160
+ ar_en_model = MarianMTModel.from_pretrained(ar_en_model_name).to(device)
161
+ ar_en_model.eval()
162
+ print(" ✅ AR→EN loaded")
163
+
164
+ # English to Arabic (Marefa - Arabic specialized)
165
+ print(" Loading EN→AR (marefa-mt-en-ar)...")
166
+ en_ar_model_name = "marefa-nlp/marefa-mt-en-ar"
167
+ en_ar_tokenizer = MarianTokenizer.from_pretrained(en_ar_model_name)
168
+ en_ar_model = MarianMTModel.from_pretrained(en_ar_model_name).to(device)
169
+ en_ar_model.eval()
170
+ print(" ✅ EN→AR loaded (Marefa)")
171
+
172
+ print("🎉 Marefa models loaded successfully!")
173
+
174
+ USE_TRANSFORMERS = True
175
+ return True
176
+
177
+ except Exception as e:
178
+ print(f"⚠️ Error loading models: {str(e)}")
179
+ print("💡 Make sure you have enough memory and internet connection")
180
+ USE_TRANSFORMERS = False
181
+ return False
182
+
183
+
184
# Single C-level pass replacing the old regex + six chained .replace() calls:
# hamza-carrying alifs → bare alif, alif maqsura → ya, taa marbuta → ha,
# hamza on waw/ya → waw/ya, and deletion of tashkeel (U+064B..U+065F).
_AR_NORMALIZE_TABLE = str.maketrans(
    "أإآٱىةؤئ",
    "اااايهوي",
    "".join(map(chr, range(0x064B, 0x0660))),
)


@lru_cache(maxsize=10000)
def normalize_arabic(text: str) -> str:
    """Normalize Arabic text (cached in memory).

    Strips diacritics, folds hamza/alif/ya/ha variants to canonical forms,
    and trims surrounding whitespace.
    """
    return text.translate(_AR_NORMALIZE_TABLE).strip()
194
+
195
+
196
@lru_cache(maxsize=10000)
def get_phonetic_variants(text: str) -> List[str]:
    """Return up to five phonetic spelling variants of *text* (memoized).

    Covers common Latin-transliteration confusions: a/e vowel swaps,
    the -een/-ain ending pair, and collapsed doubled letters.
    """
    base = text.lower()
    collected = {base}

    substitutions = (
        (r"a", "e"),
        (r"e", "a"),
        (r"een$", "ain"),
        (r"ain$", "een"),
        (r"(.)\1", r"\1"),
    )

    for pat, rep in substitutions:
        if not re.search(pat, base):
            continue
        candidate = re.sub(pat, rep, base)
        # Keep only genuinely different, non-trivial spellings.
        if candidate != base and len(candidate) >= 2:
            collected.add(candidate)

    return list(collected)[:5]
217
+
218
+
219
def transliterate_with_model(text: str, from_lang: str, to_lang: str) -> List[str]:
    """Generate up to six transliteration variants using the loaded MT models.

    Pools distinct outputs from three decoding strategies (greedy, beam
    search, top-k/top-p sampling). English-bound results are lowercased.
    Returns [] when models are disabled, the language pair is unsupported,
    or generation fails.
    """
    global ar_en_model, en_ar_model, ar_en_tokenizer, en_ar_tokenizer, device

    if not USE_TRANSFORMERS:
        return []

    try:
        import torch

        # Select model and tokenizer for the requested direction.
        if from_lang == "ar" and to_lang == "en":
            model = ar_en_model
            tokenizer = ar_en_tokenizer
        elif from_lang == "en" and to_lang == "ar":
            model = en_ar_model
            tokenizer = en_ar_tokenizer

            # The OPUS "big" multilingual checkpoint needs an explicit
            # target-language token prefixed to the source text.
            if MODEL_CHOICE == "opus-big":
                text = ">>ara<< " + text
        else:
            # Unsupported language pair.
            return []

        # Single-item batch; tensors moved to the model's device.
        inputs = tokenizer([text], return_tensors="pt", padding=True).to(device)
        variants = set()

        with torch.no_grad():
            # Method 1: Greedy decoding — fast single best hypothesis.
            outputs = model.generate(
                **inputs, max_length=50, num_beams=1, do_sample=False
            )
            result = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
            if result:
                variants.add(result.lower() if to_lang == "en" else result)

            # Method 2: Beam search (best quality) — keep the top 3 beams.
            outputs = model.generate(
                **inputs,
                max_length=50,
                num_beams=5,
                num_return_sequences=3,
                do_sample=False,
                early_stopping=True,
            )
            for output in outputs:
                result = tokenizer.decode(output, skip_special_tokens=True).strip()
                if result:
                    variants.add(result.lower() if to_lang == "en" else result)

            # Method 3: top-k/top-p (nucleus) sampling for extra diversity.
            # NOTE(review): no RNG seed is set, so these samples — and hence
            # the cached variant list — are nondeterministic across calls.
            outputs = model.generate(
                **inputs,
                max_length=50,
                do_sample=True,
                top_k=50,
                top_p=0.95,
                temperature=0.8,
                num_return_sequences=2,
            )
            for output in outputs:
                result = tokenizer.decode(output, skip_special_tokens=True).strip()
                if result:
                    variants.add(result.lower() if to_lang == "en" else result)

        # Set iteration order is arbitrary; cap at six variants.
        return list(variants)[:6]

    except Exception as e:
        # Best-effort: log and fall back to an empty variant list.
        print(f"Model error: {str(e)}")
        return []
290
+
291
+
292
def add_arabic_variants(text: str) -> List[str]:
    """Return *text* together with common Arabic orthographic variants.

    Covers the usual spelling alternations: initial bare alif vs.
    hamza-carrying alif, final ya vs. alif maqsura, and final ha vs.
    taa marbuta.
    """
    out = {text}
    tail = text[1:]

    # Initial bare alif → both hamza-above and hamza-below forms.
    if text[:1] == "ا":
        out.add("أ" + tail)
        out.add("إ" + tail)

    stem, last = text[:-1], text[-1:]

    # Final ya ↔ alif maqsura.
    if last == "ي":
        out.add(stem + "ى")

    # Final ha ↔ taa marbuta.
    if last == "ه":
        out.add(stem + "ة")

    return list(out)
310
+
311
+
312
+ # ============================================================================
313
+ # API ENDPOINTS
314
+ # ============================================================================
315
+
316
+
317
@app.route("/health", methods=["GET"])
def health_check():
    """Health check endpoint.

    Reports model/device status, cache backend health (verified with a
    short-lived set/get round trip), and the in-process metrics counters.
    """
    redis_status = "connected"
    try:
        # Round-trip a throwaway key to prove the cache backend is reachable.
        cache.set("health_check", "ok", timeout=5)
        test_val = cache.get("health_check")
        if test_val != "ok":
            redis_status = "error"
    except Exception as e:
        redis_status = f"error: {str(e)}"

    return jsonify(
        {
            "status": "healthy",
            # True only when both translation directions are loaded.
            "models_loaded": ar_en_model is not None and en_ar_model is not None,
            "model_type": MODEL_CHOICE,
            "using_transformers": USE_TRANSFORMERS,
            "device": str(device) if device else "none",
            "ar_en_model": "opus-mt-tc-big-ar-en",
            "en_ar_model": "opus-mt-tc-big-en-ar"
            if MODEL_CHOICE == "opus-big"
            else "marefa-mt-en-ar",
            "cache": {
                # NOTE(review): reported as "Redis" even when startup fell
                # back to SimpleCache — confirm whether that is intended.
                "type": "Redis",
                "status": redis_status,
                "timeout": CACHE_DEFAULT_TIMEOUT,
            },
            "metrics": metrics,
        }
    )
349
+
350
+
351
@app.route("/stats", methods=["GET"])
def get_stats():
    """Performance statistics: in-process counters plus Redis server stats."""
    cache_total = metrics["cache_hits"] + metrics["cache_misses"]
    hit_rate = metrics["cache_hits"] / cache_total if cache_total > 0 else 0

    # Get Redis info if available
    redis_info = {}
    try:
        if cache_config.get("CACHE_TYPE") == "RedisCache":
            # NOTE(review): `_write_client` is a private flask-caching
            # attribute and may change between versions — verify on upgrade.
            redis_client = cache.cache._write_client
            info = redis_client.info("stats")
            redis_info = {
                "total_connections_received": info.get("total_connections_received", 0),
                "total_commands_processed": info.get("total_commands_processed", 0),
                "keyspace_hits": info.get("keyspace_hits", 0),
                "keyspace_misses": info.get("keyspace_misses", 0),
            }
    except Exception as e:
        # Surface the failure in the payload rather than breaking the endpoint.
        redis_info = {"error": str(e)}

    return jsonify(
        {
            "cache": {
                "hits": metrics["cache_hits"],
                "misses": metrics["cache_misses"],
                "hit_rate": f"{hit_rate * 100:.2f}%",
                "redis": redis_info,
            },
            "requests": {
                "total": metrics["total_requests"],
                "model_requests": metrics["model_requests"],
            },
            "performance": {
                "avg_response_time_ms": f"{metrics['avg_response_time']:.2f}",
            },
            "model_info": {
                "type": MODEL_CHOICE,
                "ar_en": "opus-mt-tc-big-ar-en",
                "en_ar": "opus-mt-tc-big-en-ar"
                if MODEL_CHOICE == "opus-big"
                else "marefa-mt-en-ar",
            },
        }
    )
396
+
397
+
398
@app.route("/transliterate", methods=["POST"])
def transliterate():
    """Main transliteration endpoint with Redis caching.

    Request JSON: {"text": str, "from": "ar"|"en", "to": "ar"|"en"}.
    Returns {"variants": [...]} plus cache/timing metadata.
    """
    start_time = time.time()
    metrics["total_requests"] += 1

    try:
        # Tolerate a missing or non-JSON body instead of raising.
        data = request.get_json(silent=True) or {}
        text = data.get("text", "").strip()
        from_lang = data.get("from", "en")
        to_lang = data.get("to", "ar")

        if not text or len(text) < 2:
            return jsonify({"variants": []})

        # Redis cache key
        cache_key = f"{from_lang}-{to_lang}-{text}"

        # BUG FIX: compare against None — a cached empty list is a valid
        # hit and must not be recomputed on every request.
        cached = cache.get(cache_key)
        if cached is not None:
            metrics["cache_hits"] += 1
            return jsonify(
                {"variants": cached, "cached": True, "cache_source": "redis"}
            )

        metrics["cache_misses"] += 1
        metrics["model_requests"] += 1

        # Seed with the input itself, then expand with model output.
        variants = set([text, text.lower()])

        if from_lang == "ar":
            # Normalize Arabic before translating for more stable output.
            normalized = normalize_arabic(text)
            if normalized != text:
                variants.add(normalized)

            # Use model
            model_variants = transliterate_with_model(normalized, from_lang, to_lang)
            variants.update(model_variants)

            # Add phonetic variants for purely Latin-letter results.
            for v in list(variants):
                if re.match(r"^[a-z]+$", v):
                    variants.update(get_phonetic_variants(v))

        else:  # English to Arabic
            # Use model
            model_variants = transliterate_with_model(text, from_lang, to_lang)
            variants.update(model_variants)

            # Add orthographic variants for each Arabic-script result.
            for v in list(variants):
                if re.search(r"[\u0600-\u06FF]", v):
                    variants.update(add_arabic_variants(v))

        # Filter and limit
        result = [v for v in variants if v and len(v) >= 2][:6]

        # Cache result in Redis
        cache.set(cache_key, result, timeout=CACHE_DEFAULT_TIMEOUT)

        # Running average across all requests (including cache hits).
        response_time = (time.time() - start_time) * 1000
        metrics["avg_response_time"] = (
            metrics["avg_response_time"] * (metrics["total_requests"] - 1)
            + response_time
        ) / metrics["total_requests"]

        return jsonify(
            {
                "variants": result,
                "cached": False,
                "response_time_ms": round(response_time, 2),
            }
        )

    except Exception as e:
        return jsonify({"error": str(e), "variants": []}), 500
478
+
479
+
480
@app.route("/transliterate/batch", methods=["POST"])
def transliterate_batch():
    """Batch transliteration with Redis caching.

    Request JSON: {"texts": [str, ...], "from": "ar"|"en", "to": "ar"|"en"}.
    Returns per-text variant lists plus batch cache statistics.
    """
    start_time = time.time()

    try:
        # Tolerate a missing or non-JSON body instead of raising.
        data = request.get_json(silent=True) or {}
        texts = data.get("texts", [])
        from_lang = data.get("from", "en")
        to_lang = data.get("to", "ar")

        results = {}
        cache_hits = 0
        cache_misses = 0

        for text in texts:
            cache_key = f"{from_lang}-{to_lang}-{text}"
            cached = cache.get(cache_key)

            # BUG FIX: compare against None — a cached empty list is a
            # valid hit and must not be recomputed (matches /transliterate).
            if cached is not None:
                results[text] = cached
                metrics["cache_hits"] += 1
                cache_hits += 1
            else:
                metrics["cache_misses"] += 1
                cache_misses += 1
                variants = set([text, text.lower()])

                # Use models
                model_variants = transliterate_with_model(text, from_lang, to_lang)
                variants.update(model_variants)

                # Add script-appropriate variants.
                if to_lang == "ar":
                    for v in list(variants):
                        if re.search(r"[\u0600-\u06FF]", v):
                            variants.update(add_arabic_variants(v))
                else:
                    for v in list(variants):
                        if re.match(r"^[a-z]+$", v):
                            variants.update(get_phonetic_variants(v))

                result = [v for v in variants if v and len(v) >= 2][:6]
                results[text] = result

                # Cache in Redis
                cache.set(cache_key, result, timeout=CACHE_DEFAULT_TIMEOUT)

        metrics["total_requests"] += len(texts)
        response_time = (time.time() - start_time) * 1000

        return jsonify(
            {
                "results": results,
                "count": len(results),
                "cache_hits": cache_hits,
                "cache_misses": cache_misses,
                "response_time_ms": round(response_time, 2),
            }
        )

    except Exception as e:
        return jsonify({"error": str(e), "results": {}}), 500
543
+
544
+
545
@app.route("/cache/clear", methods=["POST"])
def clear_cache():
    """Flush the entire cache and reset the in-process hit/miss counters."""
    try:
        cache.clear()
        # Hit/miss counters are meaningless after a flush, so zero them too.
        metrics["cache_hits"] = 0
        metrics["cache_misses"] = 0
    except Exception as exc:
        return jsonify({"error": str(exc)}), 500
    return jsonify({"status": "cache cleared", "cache_type": "redis"})
555
+
556
+
557
@app.route("/cache/warm", methods=["POST"])
def warm_cache():
    """Warm the cache with translations for a list of common terms.

    Request JSON: {"terms": [str, ...]}. Direction is inferred per term:
    Arabic-script terms are warmed ar→en, everything else en→ar.
    Returns counts of warmed and failed terms.
    """
    # ROBUSTNESS FIX: a missing/non-JSON body previously raised
    # AttributeError on data.get().
    data = request.get_json(silent=True) or {}
    terms = data.get("terms", [])

    warmed = 0
    failed = 0

    for term in terms:
        try:
            # Infer translation direction from the term's script, then share
            # one warming path (previously duplicated per branch).
            if re.search(r"[\u0600-\u06FF]", term):
                from_lang, to_lang = "ar", "en"
            else:
                from_lang, to_lang = "en", "ar"

            cache_key = f"{from_lang}-{to_lang}-{term}"
            # BUG FIX: compare against None — a cached empty list is a valid
            # entry and must not be recomputed on every warm call.
            if cache.get(cache_key) is None:
                variants = transliterate_with_model(term, from_lang, to_lang)
                cache.set(cache_key, variants, timeout=CACHE_DEFAULT_TIMEOUT)
                warmed += 1
        except Exception as e:
            print(f"Failed to warm cache for '{term}': {e}")
            failed += 1

    return jsonify(
        {"status": "success", "warmed": warmed, "failed": failed, "cache_type": "redis"}
    )
587
+
588
+
589
@app.route("/cache/info", methods=["GET"])
def cache_info():
    """Get Redis cache information (version, memory, clients, key counts)."""
    try:
        if cache_config.get("CACHE_TYPE") == "RedisCache":
            # NOTE(review): `_write_client` is a private flask-caching
            # attribute and may change between versions — verify on upgrade.
            redis_client = cache.cache._write_client
            info = redis_client.info()

            return jsonify(
                {
                    "cache_type": "Redis",
                    "redis_version": info.get("redis_version"),
                    "used_memory_human": info.get("used_memory_human"),
                    "connected_clients": info.get("connected_clients"),
                    "total_commands_processed": info.get("total_commands_processed"),
                    "keyspace_hits": info.get("keyspace_hits"),
                    "keyspace_misses": info.get("keyspace_misses"),
                    # DBSIZE counts all keys in the DB, not just this app's prefix.
                    "keys": redis_client.dbsize(),
                }
            )
        else:
            return jsonify({"cache_type": "SimpleCache", "message": "Not using Redis"})
    except Exception as e:
        return jsonify({"error": str(e)}), 500
613
+
614
+
615
if __name__ == "__main__":
    # Startup banner.
    print("🚀 Starting Flask Transliteration API...")
    print(f"📦 Model choice: {MODEL_CHOICE}")
    print("")

    # Load better models
    success = load_models()

    if success:
        print("\n✅ High-quality models loaded!")
        print("   AR→EN: opus-mt-tc-big-ar-en")
        if MODEL_CHOICE == "opus-big":
            print("   EN→AR: opus-mt-tc-big-en-ar (MUCH better than basic!)")
        else:
            print("   EN→AR: marefa-mt-en-ar (Arabic-specialized)")
    else:
        # The API still starts; model-backed endpoints return empty variants.
        print("\n⚠️ Models failed to load")

    print("\n✅ Server starting on http://localhost:5000")
    print("📝 Endpoints:")
    print("   POST /transliterate")
    print("   POST /transliterate/batch")
    print("   GET  /health")
    print("   GET  /stats")
    print("   GET  /cache/info")
    print("   POST /cache/clear")
    print("   POST /cache/warm")

    # threaded=True: worker threads share the module-level models and metrics.
    app.run(host="0.0.0.0", port=5000, debug=False, threaded=True)