Spaces:

tddf
/

end

Sleeping

App Files Files Community

tddf committed on Mar 30

Commit

a345f45

verified ·

1 Parent(s): 22cde00

Update Main.py

Browse files

Files changed (1) hide show

Main.py +17 -28

Main.py CHANGED Viewed

@@ -5,6 +5,7 @@ import torch
 from transformers import LightOnOcrForConditionalGeneration, LightOnOcrProcessor
 from PIL import Image
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 st.set_page_config(
@@ -60,7 +61,7 @@ def load_image():
         return Image.open(io.BytesIO(image_data)).convert('RGB')
     return None
-# ==================== Главный интерфейс ====================
 st.markdown('<div class="header-emoji">📄✨</div>', unsafe_allow_html=True)
 st.title("LightOnOCR")
 st.markdown("**Распознавание текста с изображений**")
@@ -78,42 +79,34 @@ if st.button("🔍 Распознать текст", use_container_width=True, t
     if img is None:
         st.error("Сначала загрузите изображение")
     else:
-        with st.spinner("Распознавание текста... (может занять 5–30 сек на CPU)"):
             conversation = [
                 {
                     "role": "user",
                     "content": [
                         {"type": "image"},
-                        {"type": "text", "text": "Extract ALL visible text from the image as accurately as possible. Include every word, number, and line. Preserve formatting and tables."}
                     ]
                 }
             ]
-            # Шаблон чата
             inputs = processor.apply_chat_template(
                 conversation,
                 add_generation_prompt=True,
                 tokenize=True,
                 return_dict=True,
-                return_tensors="pt"
             )
-            # Обработка изображения
-            image_inputs = processor.image_processor(img, return_tensors="pt")
-            pixel_values = image_inputs.pixel_values.to(device=device, dtype=dtype)
-            # Размеры изображения (критично для модели)
-            height, width = img.size[1], img.size[0]   # PIL: (width, height) → height, width
-            image_sizes = torch.tensor([[height, width]], dtype=torch.long, device=device)
-            inputs["pixel_values"] = pixel_values
-            inputs["image_sizes"] = image_sizes
-            # Перенос остальных тензоров
-            for k, v in inputs.items():
-                if isinstance(v, torch.Tensor) and k not in ["pixel_values", "image_sizes"]:
-                    inputs[k] = v.to(device=device)
             # Генерация
             output_ids = model.generate(
@@ -126,22 +119,18 @@ if st.button("🔍 Распознать текст", use_container_width=True, t
                 eos_token_id=processor.tokenizer.eos_token_id,
             )
-            # Извлекаем только сгенерированную часть
             prompt_length = inputs["input_ids"].shape[1]
             generated_ids = output_ids[0, prompt_length:]
             generated_text = processor.decode(
                 generated_ids,
                 skip_special_tokens=True,
                 clean_up_tokenization_spaces=True
             ).strip()
-            # Отладка (временно показываем длину)
-            st.info(f"Сгенерировано токенов: {len(generated_ids)} | Длина текста: {len(generated_text)} символов")
-            if not generated_text or len(generated_text) < 5:
-                st.warning("Модель вернула очень короткий или пустой текст. Попробуйте другое изображение с чётким английским текстом.")
-                st.code("Результат пустой или слишком короткий.", language=None)
             else:
                 st.success("✅ Распознавание завершено!")
                 st.markdown('<div class="result-box">', unsafe_allow_html=True)

 from transformers import LightOnOcrForConditionalGeneration, LightOnOcrProcessor
 from PIL import Image
+# Ускоряем скачивание
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 st.set_page_config(
         return Image.open(io.BytesIO(image_data)).convert('RGB')
     return None
+# ==================== Интерфейс ====================
 st.markdown('<div class="header-emoji">📄✨</div>', unsafe_allow_html=True)
 st.title("LightOnOCR")
 st.markdown("**Распознавание текста с изображений**")
     if img is None:
         st.error("Сначала загрузите изображение")
     else:
+        with st.spinner("Распознавание текста... (5–30 сек на CPU)"):
+            # Правильный формат разговора (как в официальных примерах)
             conversation = [
                 {
                     "role": "user",
                     "content": [
                         {"type": "image"},
+                        {"type": "text", "text": "Extract all the text from this image as accurately as possible. Preserve line breaks, formatting and tables."}
                     ]
                 }
             ]
+            # Применяем шаблон + передаём само изображение
             inputs = processor.apply_chat_template(
                 conversation,
                 add_generation_prompt=True,
                 tokenize=True,
                 return_dict=True,
+                return_tensors="pt",
+                images=img   # ← Это ключевой момент
             )
+            # Переносим все тензоры на устройство
+            inputs = {
+                k: (v.to(device=device, dtype=dtype) if v.is_floating_point() else v.to(device))
+                for k, v in inputs.items()
+            }
             # Генерация
             output_ids = model.generate(
                 eos_token_id=processor.tokenizer.eos_token_id,
             )
+            # Убираем промпт, оставляем только сгенерированный текст
             prompt_length = inputs["input_ids"].shape[1]
             generated_ids = output_ids[0, prompt_length:]
             generated_text = processor.decode(
                 generated_ids,
                 skip_special_tokens=True,
                 clean_up_tokenization_spaces=True
             ).strip()
+            if not generated_text:
+                st.warning("Модель не смогла извлечь текст. Попробуйте более чёткое изображение с английским текстом.")
             else:
                 st.success("✅ Распознавание завершено!")
                 st.markdown('<div class="result-box">', unsafe_allow_html=True)