Spaces:

Ane4ka
/

NoteMaker

Sleeping

App Files Community

ASureevaA committed on Dec 4, 2025

Commit

fa051f7

1 Parent(s): 35e85d1

edit

Browse files

Files changed (2) hide show

app.py +62 -59
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -8,12 +8,12 @@ import torch
 import gradio as gradio_module
 from PIL import Image
 from transformers import (
-    TrOCRProcessor,
-    VisionEncoderDecoderModel,
     pipeline,
-    VitsTokenizer,
     VitsModel,
 )
 # ============================
 # 1. Настройки устройства
@@ -26,62 +26,66 @@ device_string: str = "cuda" if torch.cuda.is_available() else "cpu"
 # 2. Модели
 # ============================
-# OCR: печатный английский текст
-# Модель: microsoft/trocr-small-printed
-ocr_processor: TrOCRProcessor = TrOCRProcessor.from_pretrained(
-    "microsoft/trocr-small-printed"
-)
-ocr_model: VisionEncoderDecoderModel = VisionEncoderDecoderModel.from_pretrained(
-    "microsoft/trocr-small-printed"
-)
-ocr_model.to(device_string)
-# Суммаризация: английский новостной/общий текст
-# Модель: sshleifer/distilbart-cnn-12-6
 summary_pipeline = pipeline(
     task="summarization",
     model="sshleifer/distilbart-cnn-12-6",
 )
-# TTS: английская MMS VITS
-# Модель: facebook/mms-tts-eng
 tts_model: VitsModel = VitsModel.from_pretrained("facebook/mms-tts-eng")
-tts_tokenizer: VitsTokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
 tts_model.to(device_string)
 # ============================
-# 3. OCR
 # ============================
 def run_ocr(image_object: Image.Image) -> str:
     """
-    Распознавание печатного английского текста с изображения.
-    Используем TrOCR (microsoft/trocr-small-printed).
-    Ожидается более-менее читаемый printed text
-    (скриншоты, документы, слайды и т.п.).
     """
     if image_object is None:
         return ""
-    rgb_image_object: Image.Image = image_object.convert("RGB")
-    processor_output = ocr_processor(
-        images=rgb_image_object,
-        return_tensors="pt",
-    )
-    pixel_values_tensor = processor_output.pixel_values.to(device_string)
-    with torch.no_grad():
-        generated_id_tensor = ocr_model.generate(pixel_values_tensor)
-    decoded_text_list = ocr_processor.batch_decode(
-        generated_id_tensor,
-        skip_special_tokens=True,
-    )
-    recognized_text: str = decoded_text_list[0].strip()
     return recognized_text
@@ -103,13 +107,14 @@ def run_summarization(
     word_count: int = len(cleaned_text.split())
-    # Простая адаптация длины под размер текста,
-    # чтобы не было бессмысленных max_length >> input_length.
     dynamic_max_length: int = min(
         max_summary_tokens,
         max(32, word_count + 20),
     )
     summary_result_list = summary_pipeline(
         cleaned_text,
         max_length=dynamic_max_length,
@@ -129,10 +134,8 @@ def run_tts(summary_text: str) -> Optional[str]:
     """
     Озвучка английского текста конспекта через VitsModel (facebook/mms-tts-eng).
-    ВАЖНО:
-    - защищаемся от пустого ввода;
-    - ловим RuntimeError изнутри модели (бывают краши на редких входах);
-      в этом случае просто возвращаем None, чтобы не ронять весь Space.
     """
     cleaned_text: str = summary_text.strip()
     if not cleaned_text:
@@ -142,7 +145,6 @@ def run_tts(summary_text: str) -> Optional[str]:
         cleaned_text,
         return_tensors="pt",
     )
     tokenized_inputs = {
         key: value.to(device_string)
         for key, value in tokenized_inputs.items()
@@ -151,14 +153,13 @@ def run_tts(summary_text: str) -> Optional[str]:
     input_ids_tensor = tokenized_inputs.get("input_ids")
     if input_ids_tensor is None:
         return None
     if input_ids_tensor.numel() == 0 or input_ids_tensor.shape[1] == 0:
         return None
     try:
         with torch.no_grad():
             model_output = tts_model(**tokenized_inputs)
-            waveform_tensor = model_output.waveform  # shape: (batch, n_samples)
     except RuntimeError as runtime_error:
         print(f"[WARN] TTS RuntimeError: {runtime_error}")
         return None
@@ -190,9 +191,9 @@ def full_flow(
 ) -> Tuple[str, str, Optional[str]]:
     """
     Полный пайплайн:
-    1) OCR: изображение -> исходный текст (английский)
-    2) Суммаризация: текст -> краткое резюме
-    3) TTS: резюме -> .wav файл (или None, если TTS не смог)
     """
     recognized_text: str = run_ocr(image_object=image_object)
@@ -207,7 +208,7 @@ def full_flow(
 # ============================
-# 7. Gradio UI
 # ============================
 gradio_interface = gradio_module.Interface(
@@ -215,35 +216,37 @@ gradio_interface = gradio_module.Interface(
     inputs=[
         gradio_module.Image(
             type="pil",
-            label="Image with printed English text",
         ),
         gradio_module.Slider(
             minimum=32,
             maximum=256,
             value=128,
             step=16,
-            label="Maximum summary length (tokens, approx)",
         ),
     ],
     outputs=[
         gradio_module.Textbox(
-            label="Recognized text (OCR)",
-            lines=6,
         ),
         gradio_module.Textbox(
-            label="Summary (English)",
             lines=6,
         ),
         gradio_module.Audio(
-            label="Summary narration (MMS VITS, en)",
             type="filepath",
         ),
     ],
-    title="Image → Text → Summary → Speech (English models)",
     description=(
-        "1) English OCR transformer recognizes printed text from the image.\n"
-        "2) English summarization transformer creates a short summary.\n"
-        "3) English VITS (facebook/mms-tts-eng) reads the summary aloud."
     ),
 )

 import gradio as gradio_module
 from PIL import Image
 from transformers import (
     pipeline,
     VitsModel,
+    AutoTokenizer,
 )
+from nemotron_ocr.inference.pipeline import NemotronOCR  # <-- Nemotron OCR v1
 # ============================
 # 1. Настройки устройства
 # 2. Модели
 # ============================
+ocr_engine: NemotronOCR = NemotronOCR()
 summary_pipeline = pipeline(
     task="summarization",
     model="sshleifer/distilbart-cnn-12-6",
 )
 tts_model: VitsModel = VitsModel.from_pretrained("facebook/mms-tts-eng")
+tts_tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
 tts_model.to(device_string)
 # ============================
+# 3. OCR через NemotronOCR
 # ============================
 def run_ocr(image_object: Image.Image) -> str:
     """
+    OCR для печатного (и вообще любого) английского текста с картины.
+    Используем NemotronOCR из nvidia/nemotron-ocr-v1.
+    Модель сама делает:
+    - детекцию текстовых блоков,
+    - распознавание текста,
+    - анализ порядка чтения.
+    На выходе NemotronOCR даёт список dict:
+    [
+        {
+            "text": "...",
+            "confidence": float,
+            "left": float,
+            "upper": float,
+            "right": float,
+            "lower": float,
+            ...
+        },
+        ...
+    ]
     """
     if image_object is None:
         return ""
+    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temporary_file:
+        image_object.save(temporary_file.name)
+        image_path: str = temporary_file.name
+    predictions = ocr_engine(image_path)
+    text_parts = []
+    for prediction in predictions:
+        text_value = prediction.get("text", "")
+        if not text_value:
+            continue
+        text_parts.append(str(text_value))
+    recognized_text: str = "\n".join(text_parts).strip()
     return recognized_text
     word_count: int = len(cleaned_text.split())
     dynamic_max_length: int = min(
         max_summary_tokens,
         max(32, word_count + 20),
     )
+    if word_count < 8:
+        return cleaned_text
     summary_result_list = summary_pipeline(
         cleaned_text,
         max_length=dynamic_max_length,
     """
     Озвучка английского текста конспекта через VitsModel (facebook/mms-tts-eng).
+    Если модель внутри упадёт (известный баг на некоторых странных инпутах),
+    мы просто вернём None и не будем ронять всё приложение.
     """
     cleaned_text: str = summary_text.strip()
     if not cleaned_text:
         cleaned_text,
         return_tensors="pt",
     )
     tokenized_inputs = {
         key: value.to(device_string)
         for key, value in tokenized_inputs.items()
     input_ids_tensor = tokenized_inputs.get("input_ids")
     if input_ids_tensor is None:
         return None
     if input_ids_tensor.numel() == 0 or input_ids_tensor.shape[1] == 0:
         return None
     try:
         with torch.no_grad():
             model_output = tts_model(**tokenized_inputs)
+            waveform_tensor = model_output.waveform  # (batch, n_samples)
     except RuntimeError as runtime_error:
         print(f"[WARN] TTS RuntimeError: {runtime_error}")
         return None
 ) -> Tuple[str, str, Optional[str]]:
     """
     Полный пайплайн:
+    1) OCR: изображение -> исходный английский текст
+    2) Суммаризация: текст -> конспект (английский)
+    3) TTS: конспект -> .wav файл (или None, если TTS не смог)
     """
     recognized_text: str = run_ocr(image_object=image_object)
 # ============================
+# 7. Gradio UI (на русском)
 # ============================
 gradio_interface = gradio_module.Interface(
     inputs=[
         gradio_module.Image(
             type="pil",
+            label="Изображение с напечатанным английским текстом",
         ),
         gradio_module.Slider(
             minimum=32,
             maximum=256,
             value=128,
             step=16,
+            label="Максимальная длина конспекта (токены, примерно)",
         ),
     ],
     outputs=[
         gradio_module.Textbox(
+            label="Распознанный текст (Nemotron OCR)",
+            lines=8,
         ),
         gradio_module.Textbox(
+            label="Конспект (английский текст)",
             lines=6,
         ),
         gradio_module.Audio(
+            label="Озвучка конспекта (английский TTS)",
             type="filepath",
         ),
     ],
+    title="Картинка → Текст → Конспект → Озвучка (Nemotron OCR + английские модели)",
     description=(
+        "1) Nemotron OCR v1 (nvidia/nemotron-ocr-v1) распознаёт текст с документа.\n"
+        "2) Английский трансформер суммаризации делает краткий пересказ.\n"
+        "3) VITS-модель MMS (facebook/mms-tts-eng) озвучивает конспект.\n\n"
+        "Если озвучка не сгенерировалась, значит конкретный текст не понравился TTS-модели "
+        "и она упала внутри — пайплайн просто пропустит аудио."
     ),
 )

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-transformers>=4.33.0
 torch
 sentencepiece
 gradio

+transformers>=4.40.0
 torch
 sentencepiece
 gradio