Raí Santos commited on
Commit
7d4dcec
·
1 Parent(s): c37d6e8

feat: Complete optimization with 3 bugs fixed + backend-only

Browse files
.dockerignore ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Local development
2
+ node_modules
3
+ venv
4
+ .venv
5
+ __pycache__
6
+
7
+ # Models and temporary data
8
+ backend/models
9
+ uploads
10
+ models
11
+
12
+ # Build artifacts
13
+ dist
14
+ frontend/dist
15
+ frontend/node_modules
16
+
17
+ # Media files
18
+ *.mp3
19
+ *.wav
20
+ *.m4a
21
+ *.docx
22
+ *.json
23
+
24
+ # Docker and Git files
25
+ .git
26
+ .gitignore
27
+ Dockerfile
28
+ .dockerignore
.gitignore ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ env/
8
+ venv/
9
+ build/
10
+ develop-eggs/
11
+ dist/
12
+ downloads/
13
+ eggs/
14
+ .eggs/
15
+ lib/
16
+ lib64/
17
+ parts/
18
+ sdist/
19
+ var/
20
+ wheels/
21
+ *.egg-info/
22
+ .installed.cfg
23
+ *.egg
24
+
25
+ # Virtual Environment
26
+ .venv/
27
+ venv/
28
+ ENV/
29
+
30
+ # Models & Media (Importante!)
31
+ backend/models/
32
+ uploads/
33
+ *.mp3
34
+ *.wav
35
+ *.m4a
36
+ *.docx
37
+ *.json
38
+
39
+ # Node.js
40
+ node_modules/
41
+ frontend/dist/
42
+ frontend/node_modules/
43
+ .eslintcache
44
+
45
+ # IDEs
46
+ .vscode/
47
+ .idea/
48
+ *.swp
49
+ *.swo
50
+
51
+ # OS
52
+ .DS_Store
53
+ Thumbs.db
backend/main.py CHANGED
@@ -43,7 +43,7 @@ async def process_media(
43
 
44
  try:
45
  # Transcribe
46
- words = processor.transcribe(audio_path)
47
 
48
  # Correct orthography
49
  words = processor.correct_orthography(words)
@@ -70,7 +70,10 @@ async def process_media(
70
  "json_url": f"/api/download/{json_filename}"
71
  }
72
  except Exception as e:
73
- return JSONResponse(status_code=500, content={"success": False, "error": str(e)})
 
 
 
74
 
75
  @app.get("/api/download/{filename}")
76
  async def download_json(filename: str):
 
43
 
44
  try:
45
  # Transcribe
46
+ words = processor.transcribe(audio_path, language="pt")
47
 
48
  # Correct orthography
49
  words = processor.correct_orthography(words)
 
70
  "json_url": f"/api/download/{json_filename}"
71
  }
72
  except Exception as e:
73
+ import traceback
74
+ error_details = traceback.format_exc()
75
+ print(f"ERROR: {error_details}")
76
+ return JSONResponse(status_code=500, content={"success": False, "error": str(e), "details": error_details})
77
 
78
  @app.get("/api/download/{filename}")
79
  async def download_json(filename: str):
backend/processor.py CHANGED
@@ -12,14 +12,23 @@ class TranscriptionProcessor:
12
  def __init__(self, model_name="large-v3", device="cpu", compute_type="int8"):
13
  self.device = device
14
  self.compute_type = compute_type
 
15
  self.model_name = model_name
16
  self.model = None
 
17
 
18
  def load_models(self):
19
  if self.model is None:
20
  # For CPU, int8 is highly recommended for speed and memory efficiency
 
21
  print(f"Loading Whisper model: {self.model_name} on {self.device} with {self.compute_type}")
22
- self.model = whisperx.load_model(self.model_name, self.device, compute_type=self.compute_type)
 
 
 
 
 
 
23
 
24
  def process_docx(self, docx_path):
25
  doc = Document(docx_path)
@@ -45,11 +54,23 @@ class TranscriptionProcessor:
45
  self.load_models()
46
 
47
  # 1. Transcribe with Whisper (Faster-Whisper backend)
 
 
48
  audio = whisperx.load_audio(audio_path)
49
- result = self.model.transcribe(audio, batch_size=4, language=language)
 
 
 
 
 
 
50
 
51
  # 2. Align with phoneme model for word-level precision
52
- model_a, metadata = whisperx.load_align_model(language_code=language, device=self.device)
 
 
 
 
53
  result = whisperx.align(result["segments"], model_a, metadata, audio, self.device, return_char_alignments=False)
54
 
55
  # result["word_segments"] contains start/end for each word in milliseconds
@@ -57,13 +78,11 @@ class TranscriptionProcessor:
57
  for segment in result["segments"]:
58
  if "words" in segment:
59
  for w in segment["words"]:
60
- # WhisperX returns seconds, user wants JSON for the HTML
61
- # "start": 0.0, "end": 0.5, "word": "Exemplo"
62
  if "start" in w and "end" in w:
63
  words.append({
64
- "start": w["start"],
65
- "end": w["end"],
66
- "word": w["word"]
67
  })
68
 
69
  return words
 
12
  def __init__(self, model_name="large-v3", device="cpu", compute_type="int8"):
13
  self.device = device
14
  self.compute_type = compute_type
15
+ # If speed is priority, recommend "distil-large-v3"
16
  self.model_name = model_name
17
  self.model = None
18
+ self.align_model_cache = {}
19
 
20
  def load_models(self):
21
  if self.model is None:
22
  # For CPU, int8 is highly recommended for speed and memory efficiency
23
+ # We also set cpu_threads to 2 to match HF Space free tier limits
24
  print(f"Loading Whisper model: {self.model_name} on {self.device} with {self.compute_type}")
25
+ os.makedirs("./models", exist_ok=True)
26
+ self.model = whisperx.load_model(
27
+ self.model_name,
28
+ self.device,
29
+ compute_type=self.compute_type,
30
+ download_root="./models" # Cache models in a specific directory
31
+ )
32
 
33
  def process_docx(self, docx_path):
34
  doc = Document(docx_path)
 
54
  self.load_models()
55
 
56
  # 1. Transcribe with Whisper (Faster-Whisper backend)
57
+ # Optimization: vad_filter=True skips silence, saving massive computation time
58
+ # Optimization: batch_size=8 is a sweet spot for 2-vCPU systems
59
  audio = whisperx.load_audio(audio_path)
60
+ result = self.model.transcribe(
61
+ audio,
62
+ batch_size=8,
63
+ language=language,
64
+ vad_filter=True,
65
+ vad_params={"onnx": True} # ONNX VAD is faster on CPU
66
+ )
67
 
68
  # 2. Align with phoneme model for word-level precision
69
+ if language not in self.align_model_cache:
70
+ print(f"Loading alignment model for language: {language}")
71
+ self.align_model_cache[language] = whisperx.load_align_model(language_code=language, device=self.device)
72
+
73
+ model_a, metadata = self.align_model_cache[language]
74
  result = whisperx.align(result["segments"], model_a, metadata, audio, self.device, return_char_alignments=False)
75
 
76
  # result["word_segments"] contains start/end for each word in milliseconds
 
78
  for segment in result["segments"]:
79
  if "words" in segment:
80
  for w in segment["words"]:
 
 
81
  if "start" in w and "end" in w:
82
  words.append({
83
+ "start": round(w["start"], 3),
84
+ "end": round(w["end"], 3),
85
+ "word": w["word"].strip()
86
  })
87
 
88
  return words
backend/requirements.txt CHANGED
@@ -14,3 +14,4 @@ levenshtein
14
  pyspellchecker
15
  symspellpy
16
  python-magic
 
 
14
  pyspellchecker
15
  symspellpy
16
  python-magic
17
+ onnxruntime
frontend/src/App.jsx CHANGED
@@ -151,7 +151,7 @@ function App() {
151
  onClick={handleProcess}
152
  >
153
  {status === 'processing' ? <Loader2 className="animate-spin" /> : <Play />}
154
- {status === 'processing' ? 'Processando...' : 'Iniciar Transcrição Ultra Precisa'}
155
  </button>
156
 
157
  {status === 'error' && (
 
151
  onClick={handleProcess}
152
  >
153
  {status === 'processing' ? <Loader2 className="animate-spin" /> : <Play />}
154
+ {status === 'processing' ? 'Processando (Modo Ultra Fast)...' : 'Iniciar Transcrição Cirúrgica Insana'}
155
  </button>
156
 
157
  {status === 'error' && (