asierfg794 committed on
Commit
affc051
·
1 Parent(s): 5fc502d

EasyOCR + NLLB optimizado

Browse files
Files changed (4) hide show
  1. .gitignore +0 -3
  2. Dockerfile +0 -70
  3. app.py +45 -88
  4. requirements.txt +1 -2
.gitignore CHANGED
@@ -1,9 +1,6 @@
1
  __pycache__/
2
  *.pyc
3
 
4
- # CTranslate2 bihurtutako NLLB modeloa (Docker build-ean sortzen da)
5
- nllb-200-distilled-600M-ct2-int8/
6
-
7
  # HuggingFace cachea (modeloak deskargatzean)
8
  .cache/
9
  huggingface/
 
1
  __pycache__/
2
  *.pyc
3
 
 
 
 
4
  # HuggingFace cachea (modeloak deskargatzean)
5
  .cache/
6
  huggingface/
Dockerfile CHANGED
@@ -14,53 +14,6 @@ WORKDIR /app
14
  COPY requirements.txt .
15
  RUN pip install --no-cache-dir -r requirements.txt
16
 
17
- # CTranslate2-ren .so-ari "executable stack" bandera kendu (kernel berriek errefusatzen dute).
18
- # Debian trixie-k ez du execstack paketea, beraz Python script batekin egiten dugu:
19
- # PT_GNU_STACK program-headerraren p_flags-eko PF_X bita (0x1) zerora ezarri.
20
- COPY <<'EOF' /tmp/fix_execstack.py
21
- import os, struct, sys
22
-
23
- base = '/usr/local/lib/python3.11/site-packages/ctranslate2'
24
- PT_GNU_STACK_LE = b'\x51\xe5\x74\x64' # 0x6474e551 little-endian
25
- total = 0
26
- for root, _, files in os.walk(base):
27
- for fname in files:
28
- if not (fname.endswith('.so') or '.so.' in fname):
29
- continue
30
- path = os.path.join(root, fname)
31
- with open(path, 'rb') as fp:
32
- data = bytearray(fp.read())
33
- changed = False
34
- i = 0
35
- while True:
36
- i = data.find(PT_GNU_STACK_LE, i)
37
- if i < 0:
38
- break
39
- # ELF64 program header: p_type(4) p_flags(4) ...
40
- flags_off = i + 4
41
- (flags,) = struct.unpack_from('<I', data, flags_off)
42
- if flags & 0x1:
43
- struct.pack_into('<I', data, flags_off, flags & ~0x1)
44
- changed = True
45
- total += 1
46
- print(f'[fix_execstack] {path} offset {i} flags {flags:#x} -> {flags & ~0x1:#x}')
47
- i += 4
48
- if changed:
49
- with open(path, 'wb') as fp:
50
- fp.write(bytes(data))
51
- print(f'[fix_execstack] Aldaketak: {total}')
52
- EOF
53
- RUN python /tmp/fix_execstack.py && rm /tmp/fix_execstack.py
54
-
55
- # NLLB-200 CTranslate2 formatura bihurtu (INT8 kuantizazioa CPUrako)
56
- # Build-denboran egiten da: irudia handiagoa baina abiaraztea askoz azkarragoa
57
- RUN python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('facebook/nllb-200-distilled-600M')" && \
58
- ct2-transformers-converter \
59
- --model facebook/nllb-200-distilled-600M \
60
- --output_dir /app/nllb-200-distilled-600M-ct2-int8 \
61
- --quantization int8 \
62
- --force
63
-
64
  # Aplikazioaren kodea kopiatu
65
  COPY app.py .
66
 
@@ -69,26 +22,3 @@ EXPOSE 7860
69
 
70
  # Zerbitzaria abiarazi
71
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
72
-
73
-
74
-
75
-
76
- #FROM python:3.11-slim
77
-
78
- #RUN apt-get update && apt-get install -y --no-install-recommends \
79
- # libglib2.0-0 \
80
- # libgl1 \
81
- # libgomp1 \
82
- # libgthread-2.0-0 \
83
- # && rm -rf /var/lib/apt/lists/*
84
-
85
- #WORKDIR /app
86
-
87
- #COPY requirements.txt .
88
- #RUN pip install --no-cache-dir -r requirements.txt
89
-
90
- #COPY app.py .
91
-
92
- #EXPOSE 7860
93
-
94
- #CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
14
  COPY requirements.txt .
15
  RUN pip install --no-cache-dir -r requirements.txt
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  # Aplikazioaren kodea kopiatu
18
  COPY app.py .
19
 
 
22
 
23
  # Zerbitzaria abiarazi
24
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -3,7 +3,6 @@ app.py — FastAPI + EasyOCR + Gemini + NLLB + HiTZ zerbitzaria
3
  OCR + postzuzenketa (Gemini 2.5 Flash) + itzulpena (NLLB-200 + HiTZ Marian).
4
  """
5
 
6
- import asyncio
7
  import io
8
  import logging
9
  import os
@@ -11,18 +10,22 @@ import re
11
  import time
12
  from contextlib import asynccontextmanager
13
 
14
- import ctranslate2
15
  import easyocr
16
  import httpx
17
  import numpy as np
18
- import torch # HiTZ Marian-erako bakarrik (oraindik desaktibatuta)
19
  from deskew import determine_skew
20
  from fastapi import FastAPI, File, Form, HTTPException, UploadFile
21
  from fastapi.middleware.cors import CORSMiddleware
22
  from fastapi.responses import JSONResponse
23
  from PIL import Image
24
  from skimage.transform import rotate
25
- from transformers import AutoTokenizer
 
 
 
 
 
26
 
27
  logging.basicConfig(level=logging.INFO)
28
  logger = logging.getLogger(__name__)
@@ -74,13 +77,6 @@ Corrected text:"""
74
  readers: dict = {}
75
 
76
  NLLB_MODEL_NAME = "facebook/nllb-200-distilled-600M"
77
- # Modeloaren karpeta app.py-ren ondoan dago (lokalean zein Docker barruan)
78
- NLLB_CT2_DIR = os.path.join(
79
- os.path.dirname(os.path.abspath(__file__)),
80
- "nllb-200-distilled-600M-ct2-int8",
81
- )
82
- CT2_INTRA_THREADS = int(os.environ.get("CT2_INTRA_THREADS", "2"))
83
- CT2_INTER_THREADS = int(os.environ.get("CT2_INTER_THREADS", "1"))
84
 
85
  HITZ_PAIRS = {
86
  ("en", "eu"): "HiTZ/mt-hitz-en-eu",
@@ -144,7 +140,7 @@ ISO_TO_NLLB = {
144
  "zu": "zul_Latn",
145
  }
146
 
147
- nllb_translator: ctranslate2.Translator | None = None
148
  nllb_tokenizer = None
149
  hitz_models: dict = {}
150
 
@@ -309,23 +305,8 @@ def _adaptive_max_tokens(sentence: str) -> int:
309
  return min(512, max(32, int(approx_src_tokens * 1.8)))
310
 
311
 
312
- _TRANSLATION_CACHE: "dict[tuple[str, str, str], str]" = {}
313
- _TRANSLATION_CACHE_MAX = 1024
314
-
315
-
316
- def _cache_get(sentence: str, src: str, tgt: str) -> str | None:
317
- return _TRANSLATION_CACHE.get((sentence, src, tgt))
318
-
319
-
320
- def _cache_put(sentence: str, src: str, tgt: str, value: str) -> None:
321
- if len(_TRANSLATION_CACHE) >= _TRANSLATION_CACHE_MAX:
322
- # FIFO sinplea: lehen sartutakoa kendu
323
- _TRANSLATION_CACHE.pop(next(iter(_TRANSLATION_CACHE)))
324
- _TRANSLATION_CACHE[(sentence, src, tgt)] = value
325
-
326
-
327
  def _nllb_translate(text: str, src_nllb: str, tgt_nllb: str) -> str:
328
- """NLLB-200 ereduarekin itzuli CTranslate2 motorra erabiliz."""
329
  if not text.strip():
330
  return text
331
  blocks = _flatten_to_sentences(text)
@@ -333,51 +314,37 @@ def _nllb_translate(text: str, src_nllb: str, tgt_nllb: str) -> str:
333
  if not to_translate:
334
  return text
335
 
336
- logger.info("[NLLB-CT2] %s -> %s | %d esaldi", src_nllb, tgt_nllb, len(to_translate))
337
  t0 = time.time()
338
-
339
  nllb_tokenizer.src_lang = src_nllb
340
- translations: list[str] = [""] * len(to_translate)
341
-
342
- # 1) Cache-tik bete daitezkeenak bete
343
- pending: list[tuple[int, str]] = []
344
- cache_hits = 0
345
- for i, sentence in enumerate(to_translate):
346
- cached = _cache_get(sentence, src_nllb, tgt_nllb)
347
- if cached is not None:
348
- translations[i] = cached
349
- cache_hits += 1
350
- else:
351
- pending.append((i, sentence))
352
 
353
- # 2) Falta direnak batch-ean itzuli CTranslate2-rekin
354
  BATCH = 8
355
- for batch_start in range(0, len(pending), BATCH):
356
- chunk = pending[batch_start:batch_start + BATCH]
357
- chunk_sentences = [s for _, s in chunk]
358
- source_tokens_batch = [
359
- nllb_tokenizer.convert_ids_to_tokens(nllb_tokenizer.encode(s))
360
- for s in chunk_sentences
361
- ]
362
- max_len = max(_adaptive_max_tokens(s) for s in chunk_sentences)
363
- results = nllb_translator.translate_batch(
364
- source_tokens_batch,
365
- target_prefix=[[tgt_nllb]] * len(chunk_sentences),
366
- beam_size=1,
367
- max_decoding_length=max_len,
368
- no_repeat_ngram_size=3,
369
  )
370
- for (idx, src_s), res in zip(chunk, results):
371
- target_ids = nllb_tokenizer.convert_tokens_to_ids(res.hypotheses[0][1:])
372
- out_s = nllb_tokenizer.decode(target_ids, skip_special_tokens=True).strip()
373
- translations[idx] = out_s
374
- _cache_put(src_s, src_nllb, tgt_nllb, out_s)
375
- logger.info("[NLLB-CT2] %r -> %r", src_s[:60], out_s[:60])
376
-
377
- logger.info("[NLLB-CT2] Egina %.1fs-tan (cache hits: %d/%d, tamaina: %d)",
378
- time.time() - t0, cache_hits, len(to_translate),
379
- len(_TRANSLATION_CACHE))
380
- return _rebuild(blocks, translations)
 
 
 
 
 
381
 
382
 
383
  def _hitz_translate(text: str, src: str, tgt: str) -> str:
@@ -436,19 +403,14 @@ async def lifespan(app: FastAPI):
436
  logger.info("Reader kargatzen (quantize=True): %s %s", name, langs)
437
  readers[name] = easyocr.Reader(langs, gpu=False, quantize=True)
438
 
439
- global nllb_translator, nllb_tokenizer
440
- logger.info("[LOAD] NLLB tokenizer kargatzen: %s", NLLB_MODEL_NAME)
441
  nllb_tokenizer = AutoTokenizer.from_pretrained(NLLB_MODEL_NAME)
442
- logger.info("[LOAD] NLLB CTranslate2 (INT8) kargatzen: %s", NLLB_CT2_DIR)
443
- nllb_translator = ctranslate2.Translator(
444
- NLLB_CT2_DIR,
445
- device="cpu",
446
- compute_type="int8",
447
- intra_threads=CT2_INTRA_THREADS,
448
- inter_threads=CT2_INTER_THREADS,
449
- )
450
- logger.info("[LOAD] NLLB-CT2 prest | intra=%d inter=%d",
451
- CT2_INTRA_THREADS, CT2_INTER_THREADS)
452
 
453
  # HiTZ aldi baterako desaktibatuta (transformers bateragarritasun arazoak)
454
  logger.info("[LOAD] HiTZ karga saltatzen (NLLB soilik modua)")
@@ -462,7 +424,6 @@ async def lifespan(app: FastAPI):
462
  yield
463
  readers.clear()
464
  hitz_models.clear()
465
- _TRANSLATION_CACHE.clear()
466
 
467
 
468
  app = FastAPI(title="OCR + Itzulpena API", version="16.0.0", lifespan=lifespan)
@@ -480,9 +441,8 @@ async def health_check():
480
  "status": "ok",
481
  "scripts": list(readers.keys()),
482
  "gemini": bool(GEMINI_API_KEY),
483
- "nllb": nllb_translator is not None,
484
- "nllb_backend": "ctranslate2-int8",
485
- "translation_cache_size": len(_TRANSLATION_CACHE),
486
  "hitz_pairs": [f"{s}-{t}" for (s, t) in hitz_models.keys()],
487
  }
488
 
@@ -508,10 +468,7 @@ async def predict(
508
  img_array = np.array(pil_image)
509
  img_array = _deskew(img_array)
510
  reader = readers[script]
511
- loop = asyncio.get_running_loop()
512
- results = await loop.run_in_executor(
513
- None, lambda: reader.readtext(img_array, detail=1, paragraph=False)
514
- )
515
  raw_text = _group_into_lines(results)
516
  logger.info("[OCR] Egina %.1fs-tan, %d karaktere", time.time() - t0, len(raw_text))
517
 
 
3
  OCR + postzuzenketa (Gemini 2.5 Flash) + itzulpena (NLLB-200 + HiTZ Marian).
4
  """
5
 
 
6
  import io
7
  import logging
8
  import os
 
10
  import time
11
  from contextlib import asynccontextmanager
12
 
 
13
  import easyocr
14
  import httpx
15
  import numpy as np
16
+ import torch
17
  from deskew import determine_skew
18
  from fastapi import FastAPI, File, Form, HTTPException, UploadFile
19
  from fastapi.middleware.cors import CORSMiddleware
20
  from fastapi.responses import JSONResponse
21
  from PIL import Image
22
  from skimage.transform import rotate
23
+ from transformers import (
24
+ AutoModelForSeq2SeqLM,
25
+ AutoTokenizer,
26
+ MarianMTModel,
27
+ MarianTokenizer,
28
+ )
29
 
30
  logging.basicConfig(level=logging.INFO)
31
  logger = logging.getLogger(__name__)
 
77
  readers: dict = {}
78
 
79
  NLLB_MODEL_NAME = "facebook/nllb-200-distilled-600M"
 
 
 
 
 
 
 
80
 
81
  HITZ_PAIRS = {
82
  ("en", "eu"): "HiTZ/mt-hitz-en-eu",
 
140
  "zu": "zul_Latn",
141
  }
142
 
143
+ nllb_model = None
144
  nllb_tokenizer = None
145
  hitz_models: dict = {}
146
 
 
305
  return min(512, max(32, int(approx_src_tokens * 1.8)))
306
 
307
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
  def _nllb_translate(text: str, src_nllb: str, tgt_nllb: str) -> str:
309
+ """NLLB-200 ereduarekin itzuli, esaldika eta batch-ean."""
310
  if not text.strip():
311
  return text
312
  blocks = _flatten_to_sentences(text)
 
314
  if not to_translate:
315
  return text
316
 
317
+ logger.info("[NLLB] %s -> %s | %d esaldi", src_nllb, tgt_nllb, len(to_translate))
318
  t0 = time.time()
 
319
  nllb_tokenizer.src_lang = src_nllb
320
+ forced_bos = nllb_tokenizer.convert_tokens_to_ids(tgt_nllb)
321
+ logger.info("[NLLB] forced_bos_token_id(%s) = %s", tgt_nllb, forced_bos)
 
 
 
 
 
 
 
 
 
 
322
 
323
+ translations = []
324
  BATCH = 8
325
+ for i in range(0, len(to_translate), BATCH):
326
+ chunk = to_translate[i:i + BATCH]
327
+ max_new = max(_adaptive_max_tokens(s) for s in chunk)
328
+ inputs = nllb_tokenizer(
329
+ chunk, return_tensors="pt", padding=True,
330
+ truncation=True, max_length=512,
 
 
 
 
 
 
 
 
331
  )
332
+ with torch.no_grad():
333
+ outputs = nllb_model.generate(
334
+ **inputs,
335
+ forced_bos_token_id=forced_bos,
336
+ max_new_tokens=max_new,
337
+ num_beams=2,
338
+ no_repeat_ngram_size=3,
339
+ early_stopping=True,
340
+ )
341
+ decoded = nllb_tokenizer.batch_decode(outputs, skip_special_tokens=True)
342
+ for src_s, out_s in zip(chunk, decoded):
343
+ logger.info("[NLLB] %r -> %r", src_s[:60], out_s[:60])
344
+ translations.extend(decoded)
345
+
346
+ logger.info("[NLLB] Egina %.1fs-tan", time.time() - t0)
347
+ return _rebuild(blocks, [t.strip() for t in translations])
348
 
349
 
350
  def _hitz_translate(text: str, src: str, tgt: str) -> str:
 
403
  logger.info("Reader kargatzen (quantize=True): %s %s", name, langs)
404
  readers[name] = easyocr.Reader(langs, gpu=False, quantize=True)
405
 
406
+ global nllb_model, nllb_tokenizer
407
+ logger.info("[LOAD] NLLB eredua kargatzen: %s", NLLB_MODEL_NAME)
408
  nllb_tokenizer = AutoTokenizer.from_pretrained(NLLB_MODEL_NAME)
409
+ nllb_model = AutoModelForSeq2SeqLM.from_pretrained(NLLB_MODEL_NAME)
410
+ nllb_model.eval()
411
+ logger.info("[LOAD] NLLB mota: %s | tokenizer: %s",
412
+ nllb_model.__class__.__name__,
413
+ nllb_tokenizer.__class__.__name__)
 
 
 
 
 
414
 
415
  # HiTZ aldi baterako desaktibatuta (transformers bateragarritasun arazoak)
416
  logger.info("[LOAD] HiTZ karga saltatzen (NLLB soilik modua)")
 
424
  yield
425
  readers.clear()
426
  hitz_models.clear()
 
427
 
428
 
429
  app = FastAPI(title="OCR + Itzulpena API", version="16.0.0", lifespan=lifespan)
 
441
  "status": "ok",
442
  "scripts": list(readers.keys()),
443
  "gemini": bool(GEMINI_API_KEY),
444
+ "nllb": nllb_model is not None,
445
+ "nllb_class": nllb_model.__class__.__name__ if nllb_model else None,
 
446
  "hitz_pairs": [f"{s}-{t}" for (s, t) in hitz_models.keys()],
447
  }
448
 
 
468
  img_array = np.array(pil_image)
469
  img_array = _deskew(img_array)
470
  reader = readers[script]
471
+ results = reader.readtext(img_array, detail=1, paragraph=False)
 
 
 
472
  raw_text = _group_into_lines(results)
473
  logger.info("[OCR] Egina %.1fs-tan, %d karaktere", time.time() - t0, len(raw_text))
474
 
requirements.txt CHANGED
@@ -10,5 +10,4 @@ httpx==0.27.0
10
  transformers==4.37.0
11
  torch==2.2.2
12
  sentencepiece==0.2.0
13
- sacremoses==0.1.1
14
- ctranslate2==4.5.0
 
10
  transformers==4.37.0
11
  torch==2.2.2
12
  sentencepiece==0.2.0
13
+ sacremoses==0.1.1