Spaces:

sterepando
/

MandreOCR

Paused

App Files Files Community

sterepando committed on Nov 28, 2025

Commit

31b0bf1

verified ·

1 Parent(s): fc0d9d2

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -46

app.py CHANGED Viewed

@@ -3,35 +3,33 @@ import uvicorn
 from PIL import Image
 from fastapi import FastAPI, UploadFile, File, Response
 import torch
-from transformers import AutoModelForImageTextToText, AutoTokenizer, AutoImageProcessor
 # --- 1. Глобальная загрузка компонентов ---
 model = None
-tokenizer = None
-image_processor = None
 device = "cpu"
 try:
-    print(">>> Инициализация загрузки LightOnOCR-1B (Fixed VLM pipeline)...")
     device = "cuda" if torch.cuda.is_available() else "cpu"
     print(f">>> Устройство: {device}")
     repo_id = "lightonai/LightOnOCR-1B-1025"
-    # 1. Загружаем токенизатор
-    tokenizer = AutoTokenizer.from_pretrained(repo_id)
-    # 2. Загружаем обработчик изображений
-    # Используем AutoImageProcessor, он должен вернуть правильный класс
-    image_processor = AutoImageProcessor.from_pretrained(repo_id)
-    # 3. Загружаем модель
     dtype = torch.bfloat16 if device == "cuda" else torch.float32
     model = AutoModelForImageTextToText.from_pretrained(
         repo_id,
         torch_dtype=dtype,
-        low_cpu_mem_usage=True
     ).to(device)
     print(">>> Все компоненты успешно загружены!")
@@ -39,11 +37,11 @@ try:
 except Exception as e:
     print(f"КРИТИЧЕСКАЯ ОШИБКА загрузки: {e}")
-app = FastAPI(title="LightOnOCR Robust API", version="4.0.0")
 @app.post("/api/ocr")
 async def run_ocr(file: UploadFile = File(...)):
-    if model is None:
         return Response(content="Сервер не готов.", status_code=503)
     try:
@@ -51,51 +49,43 @@ async def run_ocr(file: UploadFile = File(...)):
         contents = await file.read()
         image = Image.open(io.BytesIO(contents)).convert("RGB")
-        # 2. Подготовка визуальных данных
-        # ВАЖНО: Мы не просто берем pixel_values, мы берем ВСЕ, что вернет процессор.
-        # Современные модели требуют 'image_sizes' или 'aspect_ratio_ids'.
-        vision_outputs = image_processor(images=image, return_tensors="pt")
-        # Переносим тензоры на устройство (GPU/CPU)
-        # Создаем словарь аргументов для генерации
-        gen_kwargs = {
-            "max_new_tokens": 1024,
-            "do_sample": False,
-            "pad_token_id": tokenizer.pad_token_id
-        }
-        # Автоматически добавляем все выходы процессора (pixel_values, image_sizes и т.д.)
-        for key, value in vision_outputs.items():
-            if isinstance(value, torch.Tensor):
-                gen_kwargs[key] = value.to(device)
-            else:
-                gen_kwargs[key] = value
-        # 3. Подготовка текста
-        # Стандартный формат промпта для LLaVA-подобных моделей
         prompt = "<image>\nTranscribe the text in this image."
-        text_inputs = tokenizer(prompt, return_tensors="pt")
-        gen_kwargs["input_ids"] = text_inputs["input_ids"].to(device)
-        gen_kwargs["attention_mask"] = text_inputs["attention_mask"].to(device)
         # 4. Генерация
-        # Теперь gen_kwargs содержит и pixel_values, и image_sizes (если они нужны модели)
         with torch.inference_mode():
-            generated_ids = model.generate(**gen_kwargs)
         # 5. Декодирование
-        generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        # Очистка от артефактов промпта (опционально)
-        # Часто модель возвращает "Transcribe... \n Результат". Уберем промпт.
-        clean_text = generated_text.replace("Transcribe the text in this image.", "").strip()
         return {"text": clean_text}
     except Exception as e:
         import traceback
-        traceback.print_exc() # Печатаем полный лог ошибки в консоль сервера
         return Response(content=f"Server Error: {str(e)}", status_code=500)
 @app.get("/")

 from PIL import Image
 from fastapi import FastAPI, UploadFile, File, Response
 import torch
+from transformers import AutoModelForImageTextToText, AutoProcessor
 # --- 1. Глобальная загрузка компонентов ---
 model = None
+processor = None
 device = "cpu"
 try:
+    print(">>> Инициализация загрузки LightOnOCR-1B (с trust_remote_code=True)...")
     device = "cuda" if torch.cuda.is_available() else "cpu"
     print(f">>> Устройство: {device}")
     repo_id = "lightonai/LightOnOCR-1B-1025"
+    # 1. Загружаем процессор
+    # ВАЖНО: trust_remote_code=True позволяет загрузить кастомный код процессора из репозитория,
+    # который умеет правильно обрабатывать аргумент 'images' и вставлять токены.
+    processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
+    # 2. Загружаем модель
     dtype = torch.bfloat16 if device == "cuda" else torch.float32
     model = AutoModelForImageTextToText.from_pretrained(
         repo_id,
         torch_dtype=dtype,
+        low_cpu_mem_usage=True,
+        trust_remote_code=True
     ).to(device)
     print(">>> Все компоненты успешно загружены!")
 except Exception as e:
     print(f"КРИТИЧЕСКАЯ ОШИБКА загрузки: {e}")
+app = FastAPI(title="LightOnOCR Final API", version="5.0.0")
 @app.post("/api/ocr")
 async def run_ocr(file: UploadFile = File(...)):
+    if model is None or processor is None:
         return Response(content="Сервер не готов.", status_code=503)
     try:
         contents = await file.read()
         image = Image.open(io.BytesIO(contents)).convert("RGB")
+        # 2. Формирование промпта
+        # Для этой модели обычно достаточно простого промпта, но важно,
+        # чтобы процессор сам обработал вставку <image> токенов.
         prompt = "<image>\nTranscribe the text in this image."
+        # 3. Обработка через процессор
+        # Теперь, с trust_remote_code=True, этот вызов должен работать корректно
+        # и вернуть input_ids, pixel_values и, возможно, image_sizes.
+        inputs = processor(text=prompt, images=image, return_tensors="pt")
+        # Переносим все тензоры на устройство
+        inputs = {k: v.to(device) for k, v in inputs.items() if isinstance(v, torch.Tensor)}
         # 4. Генерация
         with torch.inference_mode():
+            generated_ids = model.generate(
+                **inputs,
+                max_new_tokens=1024,
+                do_sample=False,
+                pad_token_id=processor.tokenizer.pad_token_id
+            )
         # 5. Декодирование
+        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        # Очистка результата от промпта (простая эвристика)
+        clean_text = generated_text.replace(prompt.replace("<image>", ""), "").strip()
+        # Дополнительная очистка, если модель возвращает мусор в начале
+        if "Transcribe" in clean_text:
+             clean_text = clean_text.split("image.")[-1].strip()
         return {"text": clean_text}
     except Exception as e:
         import traceback
+        traceback.print_exc()
         return Response(content=f"Server Error: {str(e)}", status_code=500)
 @app.get("/")