Raí Santos
committed on
Commit
·
7d4dcec
1
Parent(s):
c37d6e8
feat: Complete optimization with 3 bugs fixed + backend-only
Browse files- .dockerignore +28 -0
- .gitignore +53 -0
- backend/main.py +5 -2
- backend/processor.py +27 -8
- backend/requirements.txt +1 -0
- frontend/src/App.jsx +1 -1
.dockerignore
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Local development
|
| 2 |
+
node_modules
|
| 3 |
+
venv
|
| 4 |
+
.venv
|
| 5 |
+
__pycache__
|
| 6 |
+
|
| 7 |
+
# Models and temporary data
|
| 8 |
+
backend/models
|
| 9 |
+
uploads
|
| 10 |
+
models
|
| 11 |
+
|
| 12 |
+
# Build artifacts
|
| 13 |
+
dist
|
| 14 |
+
frontend/dist
|
| 15 |
+
frontend/node_modules
|
| 16 |
+
|
| 17 |
+
# Media files
|
| 18 |
+
*.mp3
|
| 19 |
+
*.wav
|
| 20 |
+
*.m4a
|
| 21 |
+
*.docx
|
| 22 |
+
*.json
|
| 23 |
+
|
| 24 |
+
# Docker and Git files
|
| 25 |
+
.git
|
| 26 |
+
.gitignore
|
| 27 |
+
Dockerfile
|
| 28 |
+
.dockerignore
|
.gitignore
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
env/
|
| 8 |
+
venv/
|
| 9 |
+
build/
|
| 10 |
+
develop-eggs/
|
| 11 |
+
dist/
|
| 12 |
+
downloads/
|
| 13 |
+
eggs/
|
| 14 |
+
.eggs/
|
| 15 |
+
lib/
|
| 16 |
+
lib64/
|
| 17 |
+
parts/
|
| 18 |
+
sdist/
|
| 19 |
+
var/
|
| 20 |
+
wheels/
|
| 21 |
+
*.egg-info/
|
| 22 |
+
.installed.cfg
|
| 23 |
+
*.egg
|
| 24 |
+
|
| 25 |
+
# Virtual Environment
|
| 26 |
+
.venv/
|
| 27 |
+
venv/
|
| 28 |
+
ENV/
|
| 29 |
+
|
| 30 |
+
# Models & Media (Importante!)
|
| 31 |
+
backend/models/
|
| 32 |
+
uploads/
|
| 33 |
+
*.mp3
|
| 34 |
+
*.wav
|
| 35 |
+
*.m4a
|
| 36 |
+
*.docx
|
| 37 |
+
*.json
|
| 38 |
+
|
| 39 |
+
# Node.js
|
| 40 |
+
node_modules/
|
| 41 |
+
frontend/dist/
|
| 42 |
+
frontend/node_modules/
|
| 43 |
+
.eslintcache
|
| 44 |
+
|
| 45 |
+
# IDEs
|
| 46 |
+
.vscode/
|
| 47 |
+
.idea/
|
| 48 |
+
*.swp
|
| 49 |
+
*.swo
|
| 50 |
+
|
| 51 |
+
# OS
|
| 52 |
+
.DS_Store
|
| 53 |
+
Thumbs.db
|
backend/main.py
CHANGED
|
@@ -43,7 +43,7 @@ async def process_media(
|
|
| 43 |
|
| 44 |
try:
|
| 45 |
# Transcribe
|
| 46 |
-
words = processor.transcribe(audio_path)
|
| 47 |
|
| 48 |
# Correct orthography
|
| 49 |
words = processor.correct_orthography(words)
|
|
@@ -70,7 +70,10 @@ async def process_media(
|
|
| 70 |
"json_url": f"/api/download/{json_filename}"
|
| 71 |
}
|
| 72 |
except Exception as e:
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
@app.get("/api/download/{filename}")
|
| 76 |
async def download_json(filename: str):
|
|
|
|
| 43 |
|
| 44 |
try:
|
| 45 |
# Transcribe
|
| 46 |
+
words = processor.transcribe(audio_path, language="pt")
|
| 47 |
|
| 48 |
# Correct orthography
|
| 49 |
words = processor.correct_orthography(words)
|
|
|
|
| 70 |
"json_url": f"/api/download/{json_filename}"
|
| 71 |
}
|
| 72 |
except Exception as e:
|
| 73 |
+
import traceback
|
| 74 |
+
error_details = traceback.format_exc()
|
| 75 |
+
print(f"ERROR: {error_details}")
|
| 76 |
+
return JSONResponse(status_code=500, content={"success": False, "error": str(e), "details": error_details})
|
| 77 |
|
| 78 |
@app.get("/api/download/{filename}")
|
| 79 |
async def download_json(filename: str):
|
backend/processor.py
CHANGED
|
@@ -12,14 +12,23 @@ class TranscriptionProcessor:
|
|
| 12 |
def __init__(self, model_name="large-v3", device="cpu", compute_type="int8"):
|
| 13 |
self.device = device
|
| 14 |
self.compute_type = compute_type
|
|
|
|
| 15 |
self.model_name = model_name
|
| 16 |
self.model = None
|
|
|
|
| 17 |
|
| 18 |
def load_models(self):
|
| 19 |
if self.model is None:
|
| 20 |
# For CPU, int8 is highly recommended for speed and memory efficiency
|
|
|
|
| 21 |
print(f"Loading Whisper model: {self.model_name} on {self.device} with {self.compute_type}")
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
def process_docx(self, docx_path):
|
| 25 |
doc = Document(docx_path)
|
|
@@ -45,11 +54,23 @@ class TranscriptionProcessor:
|
|
| 45 |
self.load_models()
|
| 46 |
|
| 47 |
# 1. Transcribe with Whisper (Faster-Whisper backend)
|
|
|
|
|
|
|
| 48 |
audio = whisperx.load_audio(audio_path)
|
| 49 |
-
result = self.model.transcribe(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
# 2. Align with phoneme model for word-level precision
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
result = whisperx.align(result["segments"], model_a, metadata, audio, self.device, return_char_alignments=False)
|
| 54 |
|
| 55 |
# result["word_segments"] contains start/end for each word in milliseconds
|
|
@@ -57,13 +78,11 @@ class TranscriptionProcessor:
|
|
| 57 |
for segment in result["segments"]:
|
| 58 |
if "words" in segment:
|
| 59 |
for w in segment["words"]:
|
| 60 |
-
# WhisperX returns seconds, user wants JSON for the HTML
|
| 61 |
-
# "start": 0.0, "end": 0.5, "word": "Exemplo"
|
| 62 |
if "start" in w and "end" in w:
|
| 63 |
words.append({
|
| 64 |
-
"start": w["start"],
|
| 65 |
-
"end": w["end"],
|
| 66 |
-
"word": w["word"]
|
| 67 |
})
|
| 68 |
|
| 69 |
return words
|
|
|
|
| 12 |
def __init__(self, model_name="large-v3", device="cpu", compute_type="int8"):
|
| 13 |
self.device = device
|
| 14 |
self.compute_type = compute_type
|
| 15 |
+
# If speed is priority, recommend "distil-large-v3"
|
| 16 |
self.model_name = model_name
|
| 17 |
self.model = None
|
| 18 |
+
self.align_model_cache = {}
|
| 19 |
|
| 20 |
def load_models(self):
|
| 21 |
if self.model is None:
|
| 22 |
# For CPU, int8 is highly recommended for speed and memory efficiency
|
| 23 |
+
# We also set cpu_threads to 2 to match HF Space free tier limits
|
| 24 |
print(f"Loading Whisper model: {self.model_name} on {self.device} with {self.compute_type}")
|
| 25 |
+
os.makedirs("./models", exist_ok=True)
|
| 26 |
+
self.model = whisperx.load_model(
|
| 27 |
+
self.model_name,
|
| 28 |
+
self.device,
|
| 29 |
+
compute_type=self.compute_type,
|
| 30 |
+
download_root="./models" # Cache models in a specific directory
|
| 31 |
+
)
|
| 32 |
|
| 33 |
def process_docx(self, docx_path):
|
| 34 |
doc = Document(docx_path)
|
|
|
|
| 54 |
self.load_models()
|
| 55 |
|
| 56 |
# 1. Transcribe with Whisper (Faster-Whisper backend)
|
| 57 |
+
# Optimization: vad_filter=True skips silence, saving massive computation time
|
| 58 |
+
# Optimization: batch_size=8 is a sweet spot for 2-vCPU systems
|
| 59 |
audio = whisperx.load_audio(audio_path)
|
| 60 |
+
result = self.model.transcribe(
|
| 61 |
+
audio,
|
| 62 |
+
batch_size=8,
|
| 63 |
+
language=language,
|
| 64 |
+
vad_filter=True,
|
| 65 |
+
vad_params={"onnx": True} # ONNX VAD is faster on CPU
|
| 66 |
+
)
|
| 67 |
|
| 68 |
# 2. Align with phoneme model for word-level precision
|
| 69 |
+
if language not in self.align_model_cache:
|
| 70 |
+
print(f"Loading alignment model for language: {language}")
|
| 71 |
+
self.align_model_cache[language] = whisperx.load_align_model(language_code=language, device=self.device)
|
| 72 |
+
|
| 73 |
+
model_a, metadata = self.align_model_cache[language]
|
| 74 |
result = whisperx.align(result["segments"], model_a, metadata, audio, self.device, return_char_alignments=False)
|
| 75 |
|
| 76 |
# result["word_segments"] contains start/end for each word in milliseconds
|
|
|
|
| 78 |
for segment in result["segments"]:
|
| 79 |
if "words" in segment:
|
| 80 |
for w in segment["words"]:
|
|
|
|
|
|
|
| 81 |
if "start" in w and "end" in w:
|
| 82 |
words.append({
|
| 83 |
+
"start": round(w["start"], 3),
|
| 84 |
+
"end": round(w["end"], 3),
|
| 85 |
+
"word": w["word"].strip()
|
| 86 |
})
|
| 87 |
|
| 88 |
return words
|
backend/requirements.txt
CHANGED
|
@@ -14,3 +14,4 @@ levenshtein
|
|
| 14 |
pyspellchecker
|
| 15 |
symspellpy
|
| 16 |
python-magic
|
|
|
|
|
|
| 14 |
pyspellchecker
|
| 15 |
symspellpy
|
| 16 |
python-magic
|
| 17 |
+
onnxruntime
|
frontend/src/App.jsx
CHANGED
|
@@ -151,7 +151,7 @@ function App() {
|
|
| 151 |
onClick={handleProcess}
|
| 152 |
>
|
| 153 |
{status === 'processing' ? <Loader2 className="animate-spin" /> : <Play />}
|
| 154 |
-
{status === 'processing' ? 'Processando...' : 'Iniciar Transcrição
|
| 155 |
</button>
|
| 156 |
|
| 157 |
{status === 'error' && (
|
|
|
|
| 151 |
onClick={handleProcess}
|
| 152 |
>
|
| 153 |
{status === 'processing' ? <Loader2 className="animate-spin" /> : <Play />}
|
| 154 |
+
{status === 'processing' ? 'Processando (Modo Ultra Fast)...' : 'Iniciar Transcrição Cirúrgica Insana'}
|
| 155 |
</button>
|
| 156 |
|
| 157 |
{status === 'error' && (
|