Spaces:

RaiSantos
/

vt

Build error

App Files Files Community

Raí Santos commited on Dec 29, 2025

Commit

7d4dcec

1 Parent(s): c37d6e8

feat: Complete optimization with 3 bugs fixed + backend-only

Browse files

Files changed (6) hide show

.dockerignore +28 -0
.gitignore +53 -0
backend/main.py +5 -2
backend/processor.py +27 -8
backend/requirements.txt +1 -0
frontend/src/App.jsx +1 -1

.dockerignore ADDED Viewed

	@@ -0,0 +1,28 @@

+# Local development
+node_modules
+venv
+.venv
+__pycache__
+# Models and temporary data
+backend/models
+uploads
+models
+# Build artifacts
+dist
+frontend/dist
+frontend/node_modules
+# Media files
+*.mp3
+*.wav
+*.m4a
+*.docx
+*.json
+# Docker and Git files
+.git
+.gitignore
+Dockerfile
+.dockerignore

.gitignore ADDED Viewed

	@@ -0,0 +1,53 @@

+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+venv/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+# Virtual Environment
+.venv/
+venv/
+ENV/
+# Models & Media (Importante!)
+backend/models/
+uploads/
+*.mp3
+*.wav
+*.m4a
+*.docx
+*.json
+# Node.js
+node_modules/
+frontend/dist/
+frontend/node_modules/
+.eslintcache
+# IDEs
+.vscode/
+.idea/
+*.swp
+*.swo
+# OS
+.DS_Store
+Thumbs.db

backend/main.py CHANGED Viewed

@@ -43,7 +43,7 @@ async def process_media(
     try:
         # Transcribe
-        words = processor.transcribe(audio_path)
         # Correct orthography
         words = processor.correct_orthography(words)
@@ -70,7 +70,10 @@ async def process_media(
             "json_url": f"/api/download/{json_filename}"
         }
     except Exception as e:
-        return JSONResponse(status_code=500, content={"success": False, "error": str(e)})
 @app.get("/api/download/{filename}")
 async def download_json(filename: str):

     try:
         # Transcribe
+        words = processor.transcribe(audio_path, language="pt")
         # Correct orthography
         words = processor.correct_orthography(words)
             "json_url": f"/api/download/{json_filename}"
         }
     except Exception as e:
+        import traceback
+        error_details = traceback.format_exc()
+        print(f"ERROR: {error_details}")
+        return JSONResponse(status_code=500, content={"success": False, "error": str(e), "details": error_details})
 @app.get("/api/download/{filename}")
 async def download_json(filename: str):

backend/processor.py CHANGED Viewed

@@ -12,14 +12,23 @@ class TranscriptionProcessor:
     def __init__(self, model_name="large-v3", device="cpu", compute_type="int8"):
         self.device = device
         self.compute_type = compute_type
         self.model_name = model_name
         self.model = None
     def load_models(self):
         if self.model is None:
             # For CPU, int8 is highly recommended for speed and memory efficiency
             print(f"Loading Whisper model: {self.model_name} on {self.device} with {self.compute_type}")
-            self.model = whisperx.load_model(self.model_name, self.device, compute_type=self.compute_type)
     def process_docx(self, docx_path):
         doc = Document(docx_path)
@@ -45,11 +54,23 @@ class TranscriptionProcessor:
         self.load_models()
         # 1. Transcribe with Whisper (Faster-Whisper backend)
         audio = whisperx.load_audio(audio_path)
-        result = self.model.transcribe(audio, batch_size=4, language=language)
         # 2. Align with phoneme model for word-level precision
-        model_a, metadata = whisperx.load_align_model(language_code=language, device=self.device)
         result = whisperx.align(result["segments"], model_a, metadata, audio, self.device, return_char_alignments=False)
         # result["word_segments"] contains start/end for each word in milliseconds
@@ -57,13 +78,11 @@ class TranscriptionProcessor:
         for segment in result["segments"]:
             if "words" in segment:
                 for w in segment["words"]:
-                    # WhisperX returns seconds, user wants JSON for the HTML
-                    # "start": 0.0, "end": 0.5, "word": "Exemplo"
                     if "start" in w and "end" in w:
                         words.append({
-                            "start": w["start"],
-                            "end": w["end"],
-                            "word": w["word"]
                         })
         return words

     def __init__(self, model_name="large-v3", device="cpu", compute_type="int8"):
         self.device = device
         self.compute_type = compute_type
+        # If speed is priority, recommend "distil-large-v3"
         self.model_name = model_name
         self.model = None
+        self.align_model_cache = {}
     def load_models(self):
         if self.model is None:
             # For CPU, int8 is highly recommended for speed and memory efficiency
+            # We also set cpu_threads to 2 to match HF Space free tier limits
             print(f"Loading Whisper model: {self.model_name} on {self.device} with {self.compute_type}")
+            os.makedirs("./models", exist_ok=True)
+            self.model = whisperx.load_model(
+                self.model_name,
+                self.device,
+                compute_type=self.compute_type,
+                download_root="./models" # Cache models in a specific directory
+            )
     def process_docx(self, docx_path):
         doc = Document(docx_path)
         self.load_models()
         # 1. Transcribe with Whisper (Faster-Whisper backend)
+        # Optimization: vad_filter=True skips silence, saving massive computation time
+        # Optimization: batch_size=8 is a sweet spot for 2-vCPU systems
         audio = whisperx.load_audio(audio_path)
+        result = self.model.transcribe(
+            audio,
+            batch_size=8,
+            language=language,
+            vad_filter=True,
+            vad_params={"onnx": True} # ONNX VAD is faster on CPU
+        )
         # 2. Align with phoneme model for word-level precision
+        if language not in self.align_model_cache:
+            print(f"Loading alignment model for language: {language}")
+            self.align_model_cache[language] = whisperx.load_align_model(language_code=language, device=self.device)
+        model_a, metadata = self.align_model_cache[language]
         result = whisperx.align(result["segments"], model_a, metadata, audio, self.device, return_char_alignments=False)
         # result["word_segments"] contains start/end for each word in milliseconds
         for segment in result["segments"]:
             if "words" in segment:
                 for w in segment["words"]:
                     if "start" in w and "end" in w:
                         words.append({
+                            "start": round(w["start"], 3),
+                            "end": round(w["end"], 3),
+                            "word": w["word"].strip()
                         })
         return words

backend/requirements.txt CHANGED Viewed

@@ -14,3 +14,4 @@ levenshtein
 pyspellchecker
 symspellpy
 python-magic

 pyspellchecker
 symspellpy
 python-magic
+onnxruntime

frontend/src/App.jsx CHANGED Viewed

@@ -151,7 +151,7 @@ function App() {
             onClick={handleProcess}
           >
             {status === 'processing' ? <Loader2 className="animate-spin" /> : <Play />}
-            {status === 'processing' ? 'Processando...' : 'Iniciar Transcrição Ultra Precisa'}
           </button>
           {status === 'error' && (

             onClick={handleProcess}
           >
             {status === 'processing' ? <Loader2 className="animate-spin" /> : <Play />}
+            {status === 'processing' ? 'Processando (Modo Ultra Fast)...' : 'Iniciar Transcrição Cirúrgica Insana'}
           </button>
           {status === 'error' && (