Spaces:

RaiSantos
/

vt

Build error

App Files Files Community

Raí Santos committed on Dec 29, 2025

Commit

9a8fc49

1 Parent(s): 0078030

feat: Complete optimization with 3 bugs fixed + backend-only

Browse files

Files changed (3) hide show

backend/main.py +5 -0
backend/processor.py +73 -11
google_colab/colab_app.py +146 -111

backend/main.py CHANGED Viewed

@@ -78,6 +78,11 @@ async def process_media(
         # TRANSCRICÃO COM SEGURANÇA TOTAL
         words = processor.transcribe(audio_path, language="pt")
         words = processor.correct_orthography(words)
         # GERAÇÃO DO JSON

         # TRANSCRICÃO COM SEGURANÇA TOTAL
         words = processor.transcribe(audio_path, language="pt")
+        # CORREÇÃO INTELIGENTE (Script Based)
+        if script_text:
+            words = processor.align_with_script(words, script_text)
         words = processor.correct_orthography(words)
         # GERAÇÃO DO JSON

backend/processor.py CHANGED Viewed

@@ -3,6 +3,7 @@ import whisperx
 import torch
 import re
 import json
 from docx import Document
 import gc
@@ -44,14 +45,14 @@ class TranscriptionProcessor:
                 if text:
                     # Limpeza de caracteres não-imprimíveis (comum em DOCX vindo do Windows)
                     text = "".join(char for char in text if char.isprintable() or char == "\n")
-                    # Regra de substituição cirúrgica do USER
                     text = re.sub(r'[—–-]\s+', ', ', text)
                     text = re.sub(r'\s+,\s+', ', ', text)
                     full_text.append(text)
             return " ".join(full_text)
         except Exception as e:
-            print(f"[DOCX ERROR] Falha no processamento: {e}")
-            return f"Erro ao ler roteiro: {str(e)}"
     def transcribe(self, audio_path, language="pt"):
         """Transcrição com sistema de Fallback Blindado"""
@@ -60,11 +61,11 @@ class TranscriptionProcessor:
             audio = whisperx.load_audio(audio_path)
             # Passo 1: Transcrição (Parâmetros mínimos para compatibilidade total)
-            print("[WHISPER] Iniciando Transcrição Base...")
             result = self.model.transcribe(audio, batch_size=8, language=language)
             # Passo 2: Alinhamento (Com Try-Except interno para evitar erro 500)
-            print("[WHISPER] Iniciando Alinhamento Cirúrgico...")
             try:
                 if language not in self.align_model_cache:
                     self.align_model_cache[language] = whisperx.load_align_model(
@@ -72,7 +73,7 @@ class TranscriptionProcessor:
                     )
                 model_a, metadata = self.align_model_cache[language]
                 result = whisperx.align(
-                    result["segments"], model_a, metadata, audio, self.device, return_char_alignments=False
                 )
             except Exception as align_err:
                 print(f"[WHISPER WARNING] Falha no alinhamento: {align_err}. Seguindo com transcrição base.")
@@ -91,18 +92,79 @@ class TranscriptionProcessor:
                             "word": w.get("word", w.get("text", "")).strip()
                         })
-            print(f"[WHISPER] Sucesso. {len(words)} palavras processadas.")
             return words
         except Exception as e:
-            print(f"[WHISPER ERROR] Falha Crítica: {str(e)}")
-            import traceback
-            print(traceback.format_exc())
             raise e
     def correct_orthography(self, words):
-        """Correção rápida de vírgulas duplicadas e espaços vazios"""
         for w in words:
             w["word"] = w["word"].replace(" ,", ",").replace(",,", ",")
         return words

 import torch
 import re
 import json
+import difflib
 from docx import Document
 import gc
                 if text:
                     # Limpeza de caracteres não-imprimíveis (comum em DOCX vindo do Windows)
                     text = "".join(char for char in text if char.isprintable() or char == "\n")
+                    # Normalização de hífens para vírgulas conforme solicitado pelo USER
                     text = re.sub(r'[—–-]\s+', ', ', text)
                     text = re.sub(r'\s+,\s+', ', ', text)
                     full_text.append(text)
             return " ".join(full_text)
         except Exception as e:
+            print(f"[DOCX ERROR] {e}")
+            return ""
     def transcribe(self, audio_path, language="pt"):
         """Transcrição com sistema de Fallback Blindado"""
             audio = whisperx.load_audio(audio_path)
             # Passo 1: Transcrição (Parâmetros mínimos para compatibilidade total)
+            print("[WHISPER] Transcrevendo...")
             result = self.model.transcribe(audio, batch_size=8, language=language)
             # Passo 2: Alinhamento (Com Try-Except interno para evitar erro 500)
+            print("[WHISPER] Alinhando...")
             try:
                 if language not in self.align_model_cache:
                     self.align_model_cache[language] = whisperx.load_align_model(
                     )
                 model_a, metadata = self.align_model_cache[language]
                 result = whisperx.align(
+                    result["segments"], model_a, metadata, audio, self.device
                 )
             except Exception as align_err:
                 print(f"[WHISPER WARNING] Falha no alinhamento: {align_err}. Seguindo com transcrição base.")
                             "word": w.get("word", w.get("text", "")).strip()
                         })
             return words
         except Exception as e:
+            print(f"[WHISPER ERROR] {str(e)}")
             raise e
+    def align_with_script(self, audio_words, script_text):
+        """
+        CORREÇÃO INTELIGENTE (VSL BLINDADA):
+        Compara a transcrição com o roteiro e corrige ortografia/termos técnicos
+        preservando o tempo do áudio.
+        """
+        if not script_text:
+            return audio_words
+        print("[REFINE] Iniciando correção inteligente baseada no roteiro...")
+        # 1. Preparação das listas (Original e Limpa para matching)
+        script_raw = script_text.split()
+        script_clean = [re.sub(r'[^\w]', '', w).lower() for w in script_raw]
+        audio_clean = [re.sub(r'[^\w]', '', w['word']).lower() for w in audio_words]
+        # 2. Matching de Sequência
+        matcher = difflib.SequenceMatcher(None, audio_clean, script_clean)
+        opcodes = matcher.get_opcodes()
+        refined_words = []
+        for tag, i1, i2, j1, j2 in opcodes:
+            if tag == 'equal':
+                # Palavras batem: usamos a grafia exata do roteiro (casing/pontuação)
+                for k in range(i2 - i1):
+                    word_obj = audio_words[i1 + k].copy()
+                    word_obj['word'] = script_raw[j1 + k]
+                    refined_words.append(word_obj)
+            elif tag == 'replace':
+                # Caso crítico: setox -> Cetox ou setox31 -> Cetox 31
+                # Se o número de palavras for diferente, tentamos fundir para manter o tempo
+                if (i2 - i1) == (j2 - j1):
+                    # 1 para 1
+                    for k in range(i2 - i1):
+                        word_obj = audio_words[i1 + k].copy()
+                        word_obj['word'] = script_raw[j1 + k]
+                        refined_words.append(word_obj)
+                else:
+                    # M:N (Fusão inteligente)
+                    # Pegamos o tempo do primeiro ao último do bloco e aplicamos o texto do roteiro
+                    new_word_text = " ".join(script_raw[j1:j2])
+                    word_obj = {
+                        "start": audio_words[i1]["start"],
+                        "end": audio_words[i2-1]["end"],
+                        "word": new_word_text
+                    }
+                    refined_words.append(word_obj)
+            elif tag == 'delete':
+                # Palavra no áudio mas não no roteiro (ad-lib ou erro): mantemos o áudio
+                for k in range(i1, i2):
+                    refined_words.append(audio_words[k])
+            elif tag == 'insert':
+                # Palavra no roteiro mas não detectada pelo Whisper: ignoramos para não quebrar o tempo
+                # (Ou poderíamos interpolar, mas para Slides VSL é melhor ignorar)
+                pass
+        print(f"[REFINE] Concluído. {len(refined_words)} palavras na saída final.")
+        return refined_words
     def correct_orthography(self, words):
         """Tidy comma artefacts in every word entry, in place.

         A space before a comma is removed first, then doubled commas are
         collapsed. The same list object is returned for chaining.
         """
         for entry in words:
             without_gap = entry["word"].replace(" ,", ",")
             entry["word"] = without_gap.replace(",,", ",")
         return words

google_colab/colab_app.py CHANGED Viewed

@@ -1,156 +1,191 @@
-# PROJETO WHISPER VSL ULTRA - VERSÃO COLAB DEFINITIVA
-# Recomendação: Vá em 'Ambiente de Execução' -> 'Desconectar e excluir ambiente' antes de rodar.
-# 1. INSTALAÇÃO DIRETA (Resolve ModuleNotFoundError)
-!pip install --quiet --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
-!pip install --quiet transformers accelerate nest_asyncio
-!pip install --quiet git+https://github.com/m-bain/whisperX.git gradio python-docx onnxruntime-gpu
-!apt-get install -y -qq ffmpeg
 import os
-import torch
-import whisperx
-import gradio as gr
 import json
 import re
 import uuid
 import gc
-import asyncio
 import nest_asyncio
 from docx import Document
-# Permite que o Gradio rode sem travar o loop de eventos do Colab
 nest_asyncio.apply()
-# FIX PARA PYTORCH 2.6+ (Chave Mestra contra UnpicklingError)
-import omegaconf
-try:
-    from torch.serialization import add_safe_globals
-    original_torch_load = torch.load
-    def patched_torch_load(*args, **kwargs):
-        kwargs['weights_only'] = False
-        return original_torch_load(*args, **kwargs)
-    torch.load = patched_torch_load
-except:
-    pass
-class ColabProcessor:
     def __init__(self):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.gpu_name = torch.cuda.get_device_name(0) if self.device == "cuda" else "CPU"
-        # OTIMIZAÇÃO CIRÚRGICA PARA 12.7GB RAM
         if self.device == "cuda":
-            self.compute_type = "int8_float16"
-            self.batch_size = 16
         else:
             self.compute_type = "int8"
             self.batch_size = 4
-        print(f"🚀 HARDWARE: {self.gpu_name} | MODO: {self.compute_type}")
-        self.model = None
-        self.align_model_cache = {}
     def load_model(self):
         if self.model is None:
             gc.collect()
-            torch.cuda.empty_cache()
-            print(f"🧠 Carregando Whisper Large-v3...")
-            self.model = whisperx.load_model("large-v3", self.device, compute_type=self.compute_type)
-    def process_docx(self, file_path):
-        if not file_path: return ""
         try:
-            doc = Document(file_path)
-            full_text = []
-            for para in doc.paragraphs:
-                text = para.text.strip()
-                if text:
-                    text = re.sub(r'[—–-]\s+', ', ', text)
-                    text = re.sub(r'\s+,\s+', ', ', text)
-                    full_text.append(text)
-            return " ".join(full_text)
-        except Exception as e:
-            return f"Erro no DOCX: {str(e)}"
     def run(self, audio_path, docx_file):
-        if not audio_path: return "Erro: Envie um áudio!", "", None
-        session_id = uuid.uuid4().hex[:8]
         try:
             self.load_model()
-            # 1. TRANSCRIÇÃO
-            print(f"🎙️ [SESSÃO {session_id}] Processando áudio...")
             audio = whisperx.load_audio(audio_path)
-            result = self.model.transcribe(audio, batch_size=self.batch_size, language="pt")
-            # 2. ALINHAMENTO MILIMÉTRICO
-            print("📏 Sincronizando timestamps...")
-            if "pt" not in self.align_model_cache:
-                self.align_model_cache["pt"] = whisperx.load_align_model(language_code="pt", device=self.device)
-            model_a, metadata = self.align_model_cache["pt"]
-            result = whisperx.align(result["segments"], model_a, metadata, audio, self.device, return_char_alignments=False)
-            words = []
-            for segment in result["segments"]:
-                if "words" in segment:
-                    for w in segment["words"]:
-                        if "start" in w and "end" in w:
-                            words.append({
-                                "start": round(w["start"], 3),
-                                "end": round(w["end"], 3),
-                                "word": w["word"].strip()
-                            })
-            transcribed_text = " ".join([w["word"] for w in words])
-            # 3. DOCX
-            script_text = ""
-            if docx_file:
-                script_text = self.process_docx(docx_file.name)
-            # 4. JSON FINAL
-            json_path = f"transcription_{session_id}.json"
             with open(json_path, "w", encoding="utf-8") as f:
-                json.dump({"words": words}, f, ensure_ascii=False, indent=2)
-            # CLEANUP
-            del audio
             gc.collect()
-            torch.cuda.empty_cache()
-            print(f"✅ Concluído com sucesso!")
-            return transcribed_text, script_text, json_path
         except Exception as e:
             import traceback
-            print(traceback.format_exc())
-            return f"Erro fatal: {str(e)}", "Verifique os logs detalhados acima", None
-processor = ColabProcessor()
-# INTERFACE GRADIO PREMIUM
-with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
-    gr.Markdown(f"# 🚀 WHISPER VSL ULTRA (GPU {processor.gpu_name})")
-    gr.Markdown("Transcrição Cirúrgica Otimizada para áudios longos (31 min+)")
     with gr.Row():
-        with gr.Column(scale=1):
-            audio_input = gr.Audio(type="filepath", label="Áudio da VSL")
-            docx_input = gr.File(label="Roteiro (.docx)")
-            btn = gr.Button("🔥 INICIAR PROCESSAMENTO", variant="primary")
-        with gr.Column(scale=2):
-            json_output = gr.File(label="📦 Baixar JSON para Slides")
-            with gr.Tabs():
-                with gr.TabItem("Transcrição Realizada"):
-                    text_out = gr.Textbox(label=None, lines=15, show_copy_button=True)
-                with gr.TabItem("Roteiro Original Limpo"):
-                    script_out = gr.Textbox(label=None, lines=15)
-    btn.click(processor.run, inputs=[audio_input, docx_input], outputs=[text_out, script_out, json_output])
-# Launch para Colab Python 3.12
-demo.launch(debug=True, share=True, show_error=True)

+# 🚀 WHISPER VSL ULTRA - FINAL CURE EDITION
+# Resolve conflitos de Torch 2.8.0 e implementa correção inteligente (Fuzzy Match)
 import os
+import sys
+import subprocess
+import difflib
def install_safe_stack():
    """Reinstall a pinned Torch / WhisperX stack, then kill the process.

    Steps, in order: uninstall the current torch packages, whisperx and
    pandas; install the pinned CUDA 12.1 PyTorch wheels together with pandas;
    install WhisperX v3.1.1 from GitHub; install the remaining Python
    dependencies; install ffmpeg and libsndfile1 through apt-get.

    On success the current process is killed with signal 9 so the runtime
    restarts with the new packages. On any failure the error is printed and
    the interpreter exits with status 1.
    """
    pip = [sys.executable, "-m", "pip"]
    print("🛠️ LIMPANDO E CURANDO AMBIENTE (Aguarde 3 min)...")
    try:
        # 1. Remove whatever versions are currently installed.
        print("🧹 Removendo versões instáveis...")
        subprocess.check_call(
            pip + ["uninstall", "-y", "torch", "torchaudio", "torchvision", "whisperx", "pandas"]
        )

        # 2. Pinned PyTorch stack. The PyTorch wheel index is added as an
        #    EXTRA index: with --index-url it would replace PyPI for the whole
        #    command, and pandas would then have to resolve from that index.
        print("📦 Instalando PyTorch Stack Estável (2.5.1)...")
        subprocess.check_call(pip + [
            "install",
            "torch==2.5.1+cu121", "torchvision==0.20.1+cu121", "torchaudio==2.5.1+cu121",
            "pandas==2.2.2",
            "--extra-index-url", "https://download.pytorch.org/whl/cu121",
        ])

        # 3. WhisperX pinned to the v3.1.1 tag.
        print("📦 Instalando WhisperX v3.1.1...")
        subprocess.check_call(
            pip + ["install", "git+https://github.com/m-bain/whisperX.git@v3.1.1"]
        )

        # 4. Remaining Python and system dependencies.
        print("📦 Finalizando componentes...")
        subprocess.check_call(pip + [
            "install", "pyannote.audio==3.3.1", "gradio", "python-docx",
            "transformers", "accelerate", "nest_asyncio",
        ])
        subprocess.check_call(["apt-get", "install", "-y", "-qq", "ffmpeg", "libsndfile1"])

        print("\n✅ AMBIENTE CURADO COM SUCESSO!")
        print("⚠️ AÇÃO NECESSÁRIA: Vá em 'Ambiente de Execução' > 'Reiniciar sessão' e rode esta célula de novo.")
        # Hard-kill the process so already-imported modules are not reused.
        os.kill(os.getpid(), 9)
    except Exception as e:
        print(f"❌ Erro na cura: {e}")
        sys.exit(1)
# Environment health check: import the heavy dependencies and verify the
# pinned Torch version; on any failure run the reinstall routine.
try:
    import torch
    import whisperx
    if "2.5.1" not in torch.__version__:
        raise ImportError()
except Exception:
    # `Exception` (not a bare except) so Ctrl-C / SystemExit are not turned
    # into a full reinstall followed by a process kill.
    install_safe_stack()
else:
    # whisperx may not expose __version__; reading it directly inside the try
    # would raise AttributeError and trigger a reinstall on every run.
    whisperx_version = getattr(whisperx, "__version__", "desconhecida")
    print(f"✅ Ambiente Saudável: Torch {torch.__version__} | WhisperX {whisperx_version}")
 import json
 import re
 import uuid
 import gc
 import nest_asyncio
+import gradio as gr
 from docx import Document
 nest_asyncio.apply()
class VSLEngine:
    """Transcription engine: WhisperX transcription, alignment and script-based correction."""

    def __init__(self):
        """Pick device, compute type and batch size; the model is loaded lazily."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.gpu_name = torch.cuda.get_device_name(0) if self.device == "cuda" else "CPU"
        self.model = None
        if self.device == "cuda":
            # Compute capability major < 8 gets int8_float16 and a smaller batch.
            caps = torch.cuda.get_device_capability()
            self.compute_type = "int8_float16" if caps[0] < 8 else "bfloat16"
            self.batch_size = 8 if caps[0] < 8 else 16
        else:
            self.compute_type = "int8"
            self.batch_size = 4
        print(f"🔥 ENGINE PRONTA: {self.gpu_name} | MODO: {self.compute_type}")

    def load_model(self):
        """Load the large-v3 model on first use; later calls are no-ops."""
        if self.model is None:
            gc.collect()
            if self.device == "cuda":
                torch.cuda.empty_cache()
            print("🧠 Carregando Modelo VSL (Large-v3)...")
            self.model = whisperx.load_model(
                "large-v3",
                self.device,
                compute_type=self.compute_type,
                download_root="/content/models",
            )

    def align_with_script(self, audio_words, script_text):
        """Rewrite transcribed words using the script's spelling, keeping audio timing.

        Both sequences are normalised (non-word characters removed,
        lower-cased) and diffed with ``difflib.SequenceMatcher``:

        * ``equal`` / same-length ``replace``: each audio word keeps its
          timing and takes the script's exact token.
        * ``replace`` with different lengths: the audio span becomes one entry
          (first ``start`` to last ``end``) holding the joined script tokens.
        * ``delete``: the audio word is kept as transcribed.
        * ``insert``: script-only words are dropped (no timestamp available).

        Args:
            audio_words: list of dicts with ``"start"``, ``"end"``, ``"word"``.
            script_text: reference script; tokens are split on whitespace.

        Returns:
            A new list of word dicts, or ``audio_words`` itself when
            ``script_text`` is empty.
        """
        if not script_text:
            return audio_words
        print("[REFINE] Aplicando inteligência de roteiro...")
        s_raw = script_text.split()
        s_clean = [re.sub(r'[^\w]', '', w).lower() for w in s_raw]
        a_clean = [re.sub(r'[^\w]', '', w['word']).lower() for w in audio_words]

        # autojunk must be off: with 200+ script tokens the default heuristic
        # stops using frequent words as anchors and long scripts misalign.
        matcher = difflib.SequenceMatcher(None, a_clean, s_clean, autojunk=False)

        refined = []
        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
            same_length = (i2 - i1) == (j2 - j1)
            if tag == 'equal' or (tag == 'replace' and same_length):
                for audio_word, script_word in zip(audio_words[i1:i2], s_raw[j1:j2]):
                    word_obj = audio_word.copy()
                    word_obj['word'] = script_word
                    refined.append(word_obj)
            elif tag == 'replace':
                refined.append({
                    "start": audio_words[i1]["start"],
                    "end": audio_words[i2 - 1]["end"],
                    "word": " ".join(s_raw[j1:j2]),
                })
            elif tag == 'delete':
                # Copy so the returned entries never alias the caller's dicts.
                refined.extend(w.copy() for w in audio_words[i1:i2])
        return refined

    def process_docx(self, path):
        """Extract the script text from a .docx file as one space-joined string.

        Non-printable characters are removed, a dash followed by whitespace
        becomes ", ", and whitespace around commas is normalised so that no
        stand-alone "," token is left behind. Returns "" when ``path`` is
        empty or the document cannot be read (the error is printed).
        """
        if not path:
            return ""
        try:
            doc = Document(path)
            full = []
            for p in doc.paragraphs:
                t = "".join(c for c in p.text if c.isprintable()).strip()
                if t:
                    t = re.sub(r'[—–-]\s+', ', ', t)
                    # "word — word" became "word , word" above; tidy the comma.
                    t = re.sub(r'\s+,\s+', ', ', t)
                    full.append(t)
            return " ".join(full)
        except Exception as e:
            # Best-effort: a broken script must not abort the transcription.
            print(f"[DOCX ERROR] {e}")
            return ""

    def run(self, audio_path, docx_file):
        """Transcribe, align and script-correct one audio file.

        Args:
            audio_path: path of the audio file to process.
            docx_file: optional uploaded script; either an object exposing a
                ``name`` attribute with the file path, or the path itself.

        Returns:
            ``(transcription, script_text, json_path)``. On failure the first
            element carries the error message plus traceback, the second is
            "" and the third is ``None``.
        """
        if not audio_path:
            return "Erro: Áudio falta", "", None
        sid = uuid.uuid4().hex[:6]
        try:
            self.load_model()

            # 1. Transcription
            print(f"🎙️ [{sid}] Transcrevendo...")
            audio = whisperx.load_audio(audio_path)
            res = self.model.transcribe(audio, batch_size=self.batch_size, language="pt")

            # 2. Word-level alignment
            print(f"📐 [{sid}] Alinhando...")
            m_a, meta = whisperx.load_align_model(language_code="pt", device=self.device)
            res = whisperx.align(
                res["segments"], m_a, meta, audio, self.device, return_char_alignments=False
            )

            # 3. Collect timed words, then refine against the script
            raw_words = []
            for s in res["segments"]:
                for w in s.get("words", s.get("word_segments", [])):
                    if "start" in w and "end" in w:
                        raw_words.append({
                            "start": round(w["start"], 3),
                            "end": round(w["end"], 3),
                            "word": w.get("word", "").strip(),
                        })
            # Accept both a file-like wrapper (.name) and a plain path string.
            docx_path = getattr(docx_file, "name", docx_file) if docx_file else None
            script_text = self.process_docx(docx_path) if docx_path else ""
            final_words = self.align_with_script(raw_words, script_text)
            transcription = " ".join([w["word"] for w in final_words])

            # 4. JSON output
            json_path = f"vsl_output_{sid}.json"
            with open(json_path, "w", encoding="utf-8") as f:
                json.dump({"words": final_words}, f, ensure_ascii=False, indent=2)

            # Release the large objects before the next request.
            del audio, m_a, res
            gc.collect()
            if self.device == "cuda":
                torch.cuda.empty_cache()
            return transcription, script_text, json_path
        except Exception as e:
            import traceback
            return f"Erro: {str(e)}\n{traceback.format_exc()}", "", None
# Single engine instance shared by every UI request.
engine = VSLEngine()

# Two-column layout: inputs on the left, results on the right.
with gr.Blocks(theme=gr.themes.Monochrome(), title="VSL ULTRA") as demo:
    gr.Markdown("# 🎯 WHISPER VSL ULTRA - FINAL EDITION")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(type="filepath", label="Áudio da VSL")
            script_input = gr.File(label="Roteiro DOCX (Para correção inteligente)")
            run_button = gr.Button("🔥 GERAR VSL DATA", variant="primary")
        with gr.Column():
            json_file = gr.File(label="JSON Final")
            transcript_box = gr.Textbox(label="Transcrição Corrigida", lines=8)
            script_box = gr.Textbox(label="Roteiro Extraído", lines=8)
    # Output order follows engine.run's return tuple: transcription, script, JSON path.
    run_button.click(
        engine.run,
        inputs=[audio_input, script_input],
        outputs=[transcript_box, script_box, json_file],
    )

print("✅ Sistema Pronto.")
demo.launch(share=True, debug=True)