asierfg794 committed on
Commit
79daab6
·
1 Parent(s): 4ef86bb

EasyOCR + Traductor

Browse files
Files changed (2) hide show
  1. app.py +399 -4
  2. requirements.txt +5 -1
app.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
- app.py β€” FastAPI + EasyOCR + Gemini zerbitzaria
3
- Postzuzenketa testuingurua ulertzen duen LLM bidez.
4
  """
5
 
6
  import io
@@ -11,16 +11,21 @@ from contextlib import asynccontextmanager
11
  import easyocr
12
  import httpx
13
  import numpy as np
 
14
  from deskew import determine_skew
15
  from fastapi import FastAPI, File, Form, HTTPException, UploadFile
16
  from fastapi.middleware.cors import CORSMiddleware
17
  from fastapi.responses import JSONResponse
18
  from PIL import Image
19
  from skimage.transform import rotate
 
20
 
21
  logging.basicConfig(level=logging.INFO)
22
  logger = logging.getLogger(__name__)
23
 
 
 
 
24
  SCRIPTS = {
25
  "latin": ["en","es","fr","de","it","pt","nl","pl","cs","sk","hr",
26
  "ro","hu","lt","lv","et","sv","da","no","is","mt","sq","tr","vi"],
@@ -35,7 +40,9 @@ SCRIPTS = {
35
 
36
  MAX_SIDE = 1500
37
 
 
38
  # Gemini konfigurazioa
 
39
  GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
40
  GEMINI_MODEL = "gemini-2.5-flash"
41
  GEMINI_URL = (
@@ -105,7 +112,230 @@ Corrected text:"""
105
 
106
  readers: dict = {}
107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
 
 
 
 
109
  def _resize(img: Image.Image) -> Image.Image:
110
  w, h = img.size
111
  longest = max(w, h)
@@ -220,21 +450,149 @@ async def _gemini_correct(text: str) -> str:
220
  return text
221
 
222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  @asynccontextmanager
224
  async def lifespan(app: FastAPI):
 
225
  for name, langs in SCRIPTS.items():
226
  logger.info("Reader kargatzen: %s %s", name, langs)
227
  readers[name] = easyocr.Reader(langs, gpu=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  if GEMINI_API_KEY:
229
  logger.info("Gemini konfiguratuta: %s", GEMINI_MODEL)
230
  else:
231
  logger.warning("Gemini API key gabe β€” zuzenketak ez dira aplikatuko")
 
232
  logger.info("Sistema prest.")
233
  yield
234
  readers.clear()
 
235
 
236
 
237
- app = FastAPI(title="OCR API", version="13.0.0", lifespan=lifespan)
238
 
239
  app.add_middleware(
240
  CORSMiddleware,
@@ -244,12 +602,17 @@ app.add_middleware(
244
  )
245
 
246
 
 
 
 
247
  @app.get("/")
248
  async def health_check():
249
  return {
250
  "status": "ok",
251
  "scripts": list(readers.keys()),
252
  "gemini": bool(GEMINI_API_KEY),
 
 
253
  }
254
 
255
 
@@ -285,4 +648,36 @@ async def predict(
285
  text = raw_text
286
 
287
  logger.info("[RESPONSE] Karaktereak: %d", len(text))
288
- return JSONResponse(content={"text": text})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ app.py — FastAPI + EasyOCR + Gemini + NLLB + HiTZ server
3
+ OCR + post-correction (Gemini 2.5 Flash) + translation (NLLB-200 + HiTZ).
4
  """
5
 
6
  import io
 
11
  import easyocr
12
  import httpx
13
  import numpy as np
14
+ import torch
15
  from deskew import determine_skew
16
  from fastapi import FastAPI, File, Form, HTTPException, UploadFile
17
  from fastapi.middleware.cors import CORSMiddleware
18
  from fastapi.responses import JSONResponse
19
  from PIL import Image
20
  from skimage.transform import rotate
21
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
22
 
23
  logging.basicConfig(level=logging.INFO)
24
  logger = logging.getLogger(__name__)
25
 
26
+ # ────────────────────────────────────────────────────────────────────────────
27
+ # OCR konfigurazioa
28
+ # ────────────────────────────────────────────────────────────────────────────
29
  SCRIPTS = {
30
  "latin": ["en","es","fr","de","it","pt","nl","pl","cs","sk","hr",
31
  "ro","hu","lt","lv","et","sv","da","no","is","mt","sq","tr","vi"],
 
40
 
41
  MAX_SIDE = 1500
42
 
43
+ # ────────────────────────────────────────────────────────────────────────────
44
  # Gemini konfigurazioa
45
+ # ────────────────────────────────────────────────────────────────────────────
46
  GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
47
  GEMINI_MODEL = "gemini-2.5-flash"
48
  GEMINI_URL = (
 
112
 
113
  readers: dict = {}
114
 
115
# ────────────────────────────────────────────────────────────────────────────
# Translation model configuration
# ────────────────────────────────────────────────────────────────────────────
NLLB_MODEL_NAME = "facebook/nllb-200-distilled-600M"

# (source, target) → HiTZ model repository. Every repository name follows the
# same "HiTZ/mt-hitz-<source>-<target>" pattern, so only the supported
# directions are listed. gl and ca are available towards Basque only.
HITZ_PAIRS = {
    (source, target): f"HiTZ/mt-hitz-{source}-{target}"
    for source, target in (
        ("en", "eu"),
        ("eu", "en"),
        ("es", "eu"),
        ("eu", "es"),
        ("gl", "eu"),
        ("ca", "eu"),
    )
}
128
+
129
+ # ISO 639-1 (or ISO 639-3 when there is no two-letter alternative) → full NLLB-200 code
130
+ ISO_TO_NLLB = {
131
+ "ace": "ace_Latn",
132
+ "acm": "acm_Arab",
133
+ "acq": "acq_Arab",
134
+ "aeb": "aeb_Arab",
135
+ "af": "afr_Latn",
136
+ "ajp": "ajp_Arab",
137
+ "ak": "aka_Latn",
138
+ "am": "amh_Ethi",
139
+ "apc": "apc_Arab",
140
+ "arb": "arb_Arab",
141
+ "ars": "ars_Arab",
142
+ "ary": "ary_Arab",
143
+ "arz": "arz_Arab",
144
+ "as": "asm_Beng",
145
+ "ast": "ast_Latn",
146
+ "awa": "awa_Deva",
147
+ "ay": "ayr_Latn",
148
+ "azb": "azb_Arab",
149
+ "az": "azj_Latn",
150
+ "ba": "bak_Cyrl",
151
+ "bm": "bam_Latn",
152
+ "ban": "ban_Latn",
153
+ "be": "bel_Cyrl",
154
+ "bem": "bem_Latn",
155
+ "bn": "ben_Beng",
156
+ "bho": "bho_Deva",
157
+ "bjn": "bjn_Latn",
158
+ "bo": "bod_Tibt",
159
+ "bs": "bos_Latn",
160
+ "bug": "bug_Latn",
161
+ "bg": "bul_Cyrl",
162
+ "ca": "cat_Latn",
163
+ "ceb": "ceb_Latn",
164
+ "cs": "ces_Latn",
165
+ "cjk": "cjk_Latn",
166
+ "ckb": "ckb_Arab",
167
+ "crh": "crh_Latn",
168
+ "cy": "cym_Latn",
169
+ "da": "dan_Latn",
170
+ "de": "deu_Latn",
171
+ "dik": "dik_Latn",
172
+ "dyu": "dyu_Latn",
173
+ "dz": "dzo_Tibt",
174
+ "el": "ell_Grek",
175
+ "en": "eng_Latn",
176
+ "eo": "epo_Latn",
177
+ "et": "est_Latn",
178
+ "eu": "eus_Latn",
179
+ "ee": "ewe_Latn",
180
+ "fo": "fao_Latn",
181
+ "fj": "fij_Latn",
182
+ "fi": "fin_Latn",
183
+ "fon": "fon_Latn",
184
+ "fr": "fra_Latn",
185
+ "fur": "fur_Latn",
186
+ "fuv": "fuv_Latn",
187
+ "gd": "gla_Latn",
188
+ "ga": "gle_Latn",
189
+ "gl": "glg_Latn",
190
+ "gn": "grn_Latn",
191
+ "gu": "guj_Gujr",
192
+ "ht": "hat_Latn",
193
+ "ha": "hau_Latn",
194
+ "he": "heb_Hebr",
195
+ "hi": "hin_Deva",
196
+ "hne": "hne_Deva",
197
+ "hr": "hrv_Latn",
198
+ "hu": "hun_Latn",
199
+ "hy": "hye_Armn",
200
+ "ig": "ibo_Latn",
201
+ "ilo": "ilo_Latn",
202
+ "id": "ind_Latn",
203
+ "is": "isl_Latn",
204
+ "it": "ita_Latn",
205
+ "jv": "jav_Latn",
206
+ "ja": "jpn_Jpan",
207
+ "kab": "kab_Latn",
208
+ "kac": "kac_Latn",
209
+ "kam": "kam_Latn",
210
+ "kn": "kan_Knda",
211
+ "ks": "kas_Arab",
212
+ "ka": "kat_Geor",
213
+ "knc": "knc_Latn",
214
+ "kk": "kaz_Cyrl",
215
+ "kbp": "kbp_Latn",
216
+ "kea": "kea_Latn",
217
+ "km": "khm_Khmr",
218
+ "ki": "kik_Latn",
219
+ "rw": "kin_Latn",
220
+ "ky": "kir_Cyrl",
221
+ "kmb": "kmb_Latn",
222
+ "kmr": "kmr_Latn",
223
+ "kg": "kon_Latn",
224
+ "ko": "kor_Hang",
225
+ "lo": "lao_Laoo",
226
+ "lij": "lij_Latn",
227
+ "li": "lim_Latn",
228
+ "ln": "lin_Latn",
229
+ "lt": "lit_Latn",
230
+ "lmo": "lmo_Latn",
231
+ "ltg": "ltg_Latn",
232
+ "lb": "ltz_Latn",
233
+ "lua": "lua_Latn",
234
+ "lg": "lug_Latn",
235
+ "luo": "luo_Latn",
236
+ "lus": "lus_Latn",
237
+ "lv": "lvs_Latn",
238
+ "mag": "mag_Deva",
239
+ "mai": "mai_Deva",
240
+ "ml": "mal_Mlym",
241
+ "mr": "mar_Deva",
242
+ "min": "min_Latn",
243
+ "mk": "mkd_Cyrl",
244
+ "mg": "plt_Latn",
245
+ "mt": "mlt_Latn",
246
+ "mni": "mni_Beng",
247
+ "mn": "khk_Cyrl",
248
+ "mos": "mos_Latn",
249
+ "mi": "mri_Latn",
250
+ "my": "mya_Mymr",
251
+ "nl": "nld_Latn",
252
+ "nn": "nno_Latn",
253
+ "nb": "nob_Latn",
254
+ "ne": "npi_Deva",
255
+ "nso": "nso_Latn",
256
+ "nus": "nus_Latn",
257
+ "ny": "nya_Latn",
258
+ "oc": "oci_Latn",
259
+ "om": "gaz_Latn",
260
+ "or": "ory_Orya",
261
+ "pag": "pag_Latn",
262
+ "pa": "pan_Guru",
263
+ "pap": "pap_Latn",
264
+ "fa": "pes_Arab",
265
+ "pl": "pol_Latn",
266
+ "pt": "por_Latn",
267
+ "prs": "prs_Arab",
268
+ "ps": "pbt_Arab",
269
+ "qu": "quy_Latn",
270
+ "ro": "ron_Latn",
271
+ "rn": "run_Latn",
272
+ "ru": "rus_Cyrl",
273
+ "sg": "sag_Latn",
274
+ "sa": "san_Deva",
275
+ "sat": "sat_Olck",
276
+ "scn": "scn_Latn",
277
+ "shn": "shn_Mymr",
278
+ "si": "sin_Sinh",
279
+ "sk": "slk_Latn",
280
+ "sl": "slv_Latn",
281
+ "sm": "smo_Latn",
282
+ "sn": "sna_Latn",
283
+ "sd": "snd_Arab",
284
+ "so": "som_Latn",
285
+ "st": "sot_Latn",
286
+ "es": "spa_Latn",
287
+ "sq": "als_Latn",
288
+ "sc": "srd_Latn",
289
+ "sr": "srp_Cyrl",
290
+ "ss": "ssw_Latn",
291
+ "su": "sun_Latn",
292
+ "sv": "swe_Latn",
293
+ "sw": "swh_Latn",
294
+ "szl": "szl_Latn",
295
+ "ta": "tam_Taml",
296
+ "tt": "tat_Cyrl",
297
+ "te": "tel_Telu",
298
+ "tg": "tgk_Cyrl",
299
+ "tl": "tgl_Latn",
300
+ "th": "tha_Thai",
301
+ "ti": "tir_Ethi",
302
+ "taq": "taq_Latn",
303
+ "tpi": "tpi_Latn",
304
+ "tn": "tsn_Latn",
305
+ "ts": "tso_Latn",
306
+ "tk": "tuk_Latn",
307
+ "tum": "tum_Latn",
308
+ "tr": "tur_Latn",
309
+ "tw": "twi_Latn",
310
+ "tzm": "tzm_Tfng",
311
+ "ug": "uig_Arab",
312
+ "uk": "ukr_Cyrl",
313
+ "umb": "umb_Latn",
314
+ "ur": "urd_Arab",
315
+ "uz": "uzn_Latn",
316
+ "vec": "vec_Latn",
317
+ "vi": "vie_Latn",
318
+ "war": "war_Latn",
319
+ "wo": "wol_Latn",
320
+ "xh": "xho_Latn",
321
+ "yi": "ydd_Hebr",
322
+ "yo": "yor_Latn",
323
+ "yue": "yue_Hant",
324
+ "zh": "zho_Hans",
325
+ "zht": "zho_Hant",
326
+ "ms": "zsm_Latn",
327
+ "zu": "zul_Latn",
328
+ }
329
+
330
# Global translation models (loaded in lifespan)
nllb_model = None
nllb_tokenizer = None
hitz_models: dict = {}  # (src, tgt) -> {"tokenizer": ..., "model": ...}
334
 
335
+
336
+ # ────────────────────────────────────────────────────────────────────────────
337
+ # OCR laguntzaileak
338
+ # ────────────────────────────────────────────────────────────────────────────
339
  def _resize(img: Image.Image) -> Image.Image:
340
  w, h = img.size
341
  longest = max(w, h)
 
450
  return text
451
 
452
 
453
+ # ────────────────────────────────────────────────────────────────────────────
454
+ # Itzulpen laguntzaileak
455
+ # ────────────────────────────────────────────────────────────────────────────
456
+ def _nllb_translate(text: str, src_nllb: str, tgt_nllb: str) -> str:
457
+ """NLLB-200 ereduarekin itzuli."""
458
+ if not text.strip():
459
+ return text
460
+ nllb_tokenizer.src_lang = src_nllb
461
+ inputs = nllb_tokenizer(
462
+ text, return_tensors="pt", truncation=True, max_length=512
463
+ )
464
+ forced_bos_token_id = nllb_tokenizer.convert_tokens_to_ids(tgt_nllb)
465
+ with torch.no_grad():
466
+ outputs = nllb_model.generate(
467
+ **inputs,
468
+ forced_bos_token_id=forced_bos_token_id,
469
+ max_length=512,
470
+ num_beams=4,
471
+ )
472
+ return nllb_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
473
+
474
+
475
def _hitz_translate(text: str, src: str, tgt: str) -> str:
    """Translate ``text`` with the dedicated HiTZ model for ``src`` → ``tgt``.

    Args:
        text: Text to translate. Empty or whitespace-only text is returned
            unchanged.
        src: Source language code, first half of the ``hitz_models`` key.
        tgt: Target language code, second half of the ``hitz_models`` key.

    Returns:
        The first decoded output sequence, with special tokens removed.

    Raises:
        ValueError: If no model is loaded for the ``(src, tgt)`` pair.
    """
    if not text.strip():
        return text

    entry = hitz_models.get((src, tgt))
    if entry is None:
        raise ValueError(f"HiTZ bikote ezezaguna: {src}β†’{tgt}")

    pair_tokenizer, pair_model = entry["tokenizer"], entry["model"]

    # Input is truncated to 512 tokens; generation is capped at the same length.
    encoded = pair_tokenizer(
        text, return_tensors="pt", truncation=True, max_length=512
    )
    with torch.no_grad():
        generated = pair_model.generate(**encoded, max_length=512, num_beams=4)

    decoded = pair_tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]
492
+
493
+
494
def translate(text: str, src: str, tgt: str) -> str:
    """Route a translation request to the right model or chain of models.

    Language codes are compared case-insensitively and with surrounding
    whitespace removed, so ``"EN"`` and ``" en "`` are treated as ``"en"``.

    Routing, in order:
        1. ``src == tgt``: the text is returned unchanged (the code itself
           is not validated in this case).
        2. Basque (``eu``) is not involved: NLLB directly.
        3. ``(src, tgt)`` is listed in ``HITZ_PAIRS``: the dedicated HiTZ
           model (eu ↔ es/en, and gl/ca → eu).
        4. Any other pair involving Basque: pivot through English, using HiTZ
           for the eu ↔ en leg and NLLB for the other leg.

    Args:
        text: Text to translate.
        src: Source language code; must be a key of ``ISO_TO_NLLB``.
        tgt: Target language code; must be a key of ``ISO_TO_NLLB``.

    Returns:
        The translated text.

    Raises:
        HTTPException: 400 if ``src`` or ``tgt`` is not in ``ISO_TO_NLLB``.
    """
    # Form values often arrive with stray whitespace or upper-case letters;
    # every key in ISO_TO_NLLB is lower-case.
    src = src.strip().lower()
    tgt = tgt.strip().lower()

    # Same language on both sides: nothing to do.
    if src == tgt:
        logger.info("[TRANSLATE] src==tgt (%s) β†’ aldaketarik gabe", src)
        return text

    # Validate the source first, then the target.
    for code in (src, tgt):
        if code not in ISO_TO_NLLB:
            raise HTTPException(
                status_code=400,
                detail=f"Hizkuntza ez da onartzen: {code}",
            )

    # No Basque involved: NLLB handles the pair directly.
    if "eu" not in (src, tgt):
        logger.info("[TRANSLATE] NLLB zuzenean: %s β†’ %s", src, tgt)
        return _nllb_translate(text, ISO_TO_NLLB[src], ISO_TO_NLLB[tgt])

    # A dedicated HiTZ model exists for this exact direction. HITZ_PAIRS is
    # the single source of truth, so the routing cannot drift from the list
    # of models that get loaded.
    if (src, tgt) in HITZ_PAIRS:
        logger.info("[TRANSLATE] HiTZ zuzenean: %s β†’ %s", src, tgt)
        return _hitz_translate(text, src, tgt)

    # Remaining Basque pairs: pivot through English.
    logger.info("[TRANSLATE] Pibotea: %s β†’ en β†’ %s", src, tgt)
    if src == "eu":
        intermediate = _hitz_translate(text, "eu", "en")
        return _nllb_translate(
            intermediate, ISO_TO_NLLB["en"], ISO_TO_NLLB[tgt]
        )

    # tgt == "eu"
    intermediate = _nllb_translate(
        text, ISO_TO_NLLB[src], ISO_TO_NLLB["en"]
    )
    return _hitz_translate(intermediate, "en", "eu")
553
+
554
+
555
# ────────────────────────────────────────────────────────────────────────────
# Lifespan: model loading
# ────────────────────────────────────────────────────────────────────────────
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load every model before the app starts serving and release them after.

    Startup fills ``readers`` (one EasyOCR reader per entry of ``SCRIPTS``),
    sets the ``nllb_model`` / ``nllb_tokenizer`` globals, and fills
    ``hitz_models`` (one tokenizer/model bundle per entry of ``HITZ_PAIRS``).
    Shutdown empties all of them again.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).
    """
    global nllb_model, nllb_tokenizer

    # EasyOCR readers
    for name, langs in SCRIPTS.items():
        logger.info("Reader kargatzen: %s %s", name, langs)
        readers[name] = easyocr.Reader(langs, gpu=False)

    # NLLB-200
    logger.info("[TRANSLATE] NLLB eredua kargatzen: %s", NLLB_MODEL_NAME)
    nllb_tokenizer = AutoTokenizer.from_pretrained(NLLB_MODEL_NAME)
    nllb_model = AutoModelForSeq2SeqLM.from_pretrained(NLLB_MODEL_NAME)
    nllb_model.eval()
    logger.info("[TRANSLATE] NLLB prest (CPU).")

    # HiTZ
    for (src, tgt), repo in HITZ_PAIRS.items():
        logger.info(
            "[TRANSLATE] HiTZ eredua kargatzen: %s (%s→%s)", repo, src, tgt
        )
        tok = AutoTokenizer.from_pretrained(repo)
        mod = AutoModelForSeq2SeqLM.from_pretrained(repo)
        mod.eval()
        hitz_models[(src, tgt)] = {"tokenizer": tok, "model": mod}
    logger.info("[TRANSLATE] HiTZ eredu guztiak prest (%d).", len(hitz_models))

    if GEMINI_API_KEY:
        logger.info("Gemini konfiguratuta: %s", GEMINI_MODEL)
    else:
        logger.warning("Gemini API key gabe β€” zuzenketak ez dira aplikatuko")

    logger.info("Sistema prest.")
    try:
        yield
    finally:
        # Release everything even if the serving phase ends with an error.
        # The NLLB globals are reset too, so the health check does not keep
        # reporting a model that is being torn down.
        readers.clear()
        hitz_models.clear()
        nllb_model = None
        nllb_tokenizer = None
593
 
594
 
595
+ app = FastAPI(title="OCR + Itzulpena API", version="14.0.0", lifespan=lifespan)
596
 
597
  app.add_middleware(
598
  CORSMiddleware,
 
602
  )
603
 
604
 
605
# ────────────────────────────────────────────────────────────────────────────
# Endpoints
# ────────────────────────────────────────────────────────────────────────────
@app.get("/")
async def health_check():
    """Report service status and which models are currently loaded.

    Returns:
        A dict with a fixed ``"status": "ok"``, the names of the loaded OCR
        readers, whether a Gemini API key is configured, whether the NLLB
        model is loaded, and the loaded HiTZ directions as ``"src-tgt"``.
    """
    loaded_scripts = list(readers)
    loaded_pairs = [f"{source}-{target}" for source, target in hitz_models]
    gemini_enabled = bool(GEMINI_API_KEY)
    nllb_loaded = nllb_model is not None

    return {
        "status": "ok",
        "scripts": loaded_scripts,
        "gemini": gemini_enabled,
        "nllb": nllb_loaded,
        "hitz_pairs": loaded_pairs,
    }
617
 
618
 
 
648
  text = raw_text
649
 
650
  logger.info("[RESPONSE] Karaktereak: %d", len(text))
651
+ return JSONResponse(content={"text": text})
652
+
653
+
654
@app.post("/translate")
def translate_endpoint(
    text: str = Form(...),
    source_lang: str = Form(...),
    target_lang: str = Form(...),
):
    """Translate a piece of text.

    Form parameters:
        text: Text to translate.
        source_lang: Source language (ISO 639-1, or ISO 639-3 when there is
            no two-letter alternative).
        target_lang: Target language, same code convention.

    Response body: ``{"translation": "..."}``.

    Raises:
        HTTPException: Any ``HTTPException`` raised by ``translate`` is
            passed through unchanged; every other exception is logged with
            its traceback and reported as a 500.
    """
    logger.info(
        "[TRANSLATE] Eskaria jaso: %s β†’ %s (%d kar.)",
        source_lang, target_lang, len(text),
    )

    try:
        translation = translate(text, source_lang, target_lang)
    except HTTPException:
        # Already a well-formed HTTP error (e.g. unsupported language).
        raise
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception("[TRANSLATE] Errorea: %s", e)
        # NOTE(review): the exception text is echoed to the client; confirm
        # that exposing internal error details is acceptable.
        raise HTTPException(
            status_code=500, detail=f"Itzulpen-errorea: {e}"
        ) from e

    logger.info("[TRANSLATE] Egina (%d kar.)", len(translation))
    return JSONResponse(content={"translation": translation})
requirements.txt CHANGED
@@ -6,4 +6,8 @@ easyocr==1.7.1
6
  numpy==1.26.4
7
  deskew==1.3.2
8
  scikit-image==0.22.0
9
- httpx==0.27.0
 
 
 
 
 
6
  numpy==1.26.4
7
  deskew==1.3.2
8
  scikit-image==0.22.0
9
+ httpx==0.27.0
10
+ transformers==4.44.2
11
+ torch==2.2.2
12
+ sentencepiece==0.2.0
13
+ sacremoses==0.1.1