Clearwave48 committed on
Commit
2fcb053
·
verified ·
1 Parent(s): c42513e

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +64 -153
main.py CHANGED
@@ -1,11 +1,18 @@
1
  """
2
- ClearWave AI — API Space (FastAPI only)
3
- Handles /api/health and /api/process-url
4
-
5
- Audio enhancement : Cleanvoice API (noise, fillers, stutters, silences, breaths)
6
- Transcription : Groq Whisper large-v3 (primary) / faster-whisper (fallback)
7
- Translation : NLLB-200-1.3B (primary) / Google Translate (fallback)
8
- Summary : Extractive (position-scored)
 
 
 
 
 
 
 
9
  """
10
 
11
  import os
@@ -20,23 +27,22 @@ from fastapi import FastAPI, Request
20
  from fastapi.responses import StreamingResponse, JSONResponse
21
  from fastapi.middleware.cors import CORSMiddleware
22
 
23
- # ── Cloudinary config ─────────────────────────────────────────────────────────
 
 
 
 
24
  cloudinary.config(
25
  cloud_name = os.environ.get("CLOUD_NAME"),
26
  api_key = os.environ.get("API_KEY"),
27
  api_secret = os.environ.get("API_SECRET"),
28
  )
29
 
30
- # ── Cleanvoice config ─────────────────────────────────────────────────────────
31
- CLEANVOICE_API_KEY = os.environ.get("CLEANVOICE_API_KEY")
32
- CLEANVOICE_BASE = "https://api.cleanvoice.ai/v2"
33
-
34
  logging.basicConfig(level=logging.INFO)
35
  logger = logging.getLogger(__name__)
36
 
37
- from transcriber import Transcriber
38
- from translator import Translator
39
-
40
  transcriber = Transcriber()
41
  translator = Translator()
42
 
@@ -50,126 +56,6 @@ app.add_middleware(
50
  )
51
 
52
 
53
- # ══════════════════════════════════════════════════════════════════════════════
54
- # CLEANVOICE HELPER
55
- # ══════════════════════════════════════════════════════════════════════════════
56
-
57
- def cleanvoice_enhance(audio_path: str, out_dir: str,
58
- opt_fillers: bool = True,
59
- opt_stutters: bool = True,
60
- opt_silences: bool = True,
61
- opt_breaths: bool = True,
62
- opt_mouth: bool = True) -> dict:
63
- """
64
- Full Cleanvoice enhancement pipeline:
65
- 1. Upload audio file → get signed URL
66
- 2. Submit edit job → configure which features to enable
67
- 3. Poll until done → max 30 attempts × 10s = 5 minutes
68
- 4. Download result → save to out_dir
69
- Returns: {"audio_path": str, "stats": dict}
70
- Raises RuntimeError on failure so run_pipeline() can catch and report it.
71
- """
72
- if not CLEANVOICE_API_KEY:
73
- raise RuntimeError("CLEANVOICE_API_KEY is not set in HF Space secrets.")
74
-
75
- headers = {"X-API-Key": CLEANVOICE_API_KEY}
76
-
77
- # ── Step 1: Upload ────────────────────────────────────────────────────────
78
- logger.info("[Cleanvoice] Uploading audio...")
79
- with open(audio_path, "rb") as f:
80
- up_resp = requests.post(
81
- f"{CLEANVOICE_BASE}/uploads",
82
- headers=headers,
83
- files={"file": (os.path.basename(audio_path), f)},
84
- timeout=120,
85
- )
86
- up_resp.raise_for_status()
87
- file_url = up_resp.json().get("url") or up_resp.json().get("signedUrl")
88
- if not file_url:
89
- raise RuntimeError(f"Cleanvoice upload gave no URL: {up_resp.json()}")
90
- logger.info(f"[Cleanvoice] Upload done β†’ {file_url[:60]}...")
91
-
92
- # ── Step 2: Submit edit job ───────────────────────────────────────────────
93
- # Cleanvoice config flags — map your pipeline options to Cleanvoice features
94
- config = {
95
- "enhance_speech": True, # always on β€” core noise removal
96
- "remove_filler_words": opt_fillers, # um, uh, like, basically...
97
- "remove_stutters": opt_stutters, # word repetitions
98
- "remove_silence": opt_silences, # long pauses
99
- "remove_breathing": opt_breaths, # breath sounds
100
- "remove_mouth_sounds": opt_mouth, # clicks, pops, smacks
101
- }
102
- logger.info(f"[Cleanvoice] Submitting edit job with config: {config}")
103
- edit_resp = requests.post(
104
- f"{CLEANVOICE_BASE}/edits",
105
- headers={**headers, "Content-Type": "application/json"},
106
- json={"input": {"files": [file_url], "config": config}},
107
- timeout=30,
108
- )
109
- edit_resp.raise_for_status()
110
- edit_data = edit_resp.json()
111
- edit_id = edit_data.get("id") or edit_data.get("editId")
112
- if not edit_id:
113
- raise RuntimeError(f"Cleanvoice edit job gave no ID: {edit_data}")
114
- logger.info(f"[Cleanvoice] Edit job submitted β†’ id={edit_id}")
115
-
116
- # ── Step 3: Poll until done ───────────────────────────────────────────────
117
- max_attempts = 36 # 36 × 10s = 6 minutes max
118
- for attempt in range(1, max_attempts + 1):
119
- time.sleep(10)
120
- status_resp = requests.get(
121
- f"{CLEANVOICE_BASE}/edits/{edit_id}",
122
- headers=headers,
123
- timeout=15,
124
- )
125
- status_resp.raise_for_status()
126
- status_data = status_resp.json()
127
- status = status_data.get("status", "unknown")
128
- logger.info(f"[Cleanvoice] Poll {attempt}/{max_attempts} β†’ status={status}")
129
-
130
- if status == "completed":
131
- # Grab the output URL — try common key names
132
- output = status_data.get("output") or {}
133
- enhanced_dl = (
134
- output.get("url")
135
- or output.get("downloadUrl")
136
- or status_data.get("downloadUrl")
137
- )
138
- if not enhanced_dl:
139
- raise RuntimeError(f"Cleanvoice completed but no download URL: {status_data}")
140
-
141
- # ── Step 4: Download enhanced audio ──────────────────────────────
142
- logger.info(f"[Cleanvoice] Downloading result from {enhanced_dl[:60]}...")
143
- dl = requests.get(enhanced_dl, timeout=120)
144
- dl.raise_for_status()
145
-
146
- # Preserve original extension if possible, default to .mp3
147
- ext = os.path.splitext(enhanced_dl.split("?")[0])[-1] or ".mp3"
148
- out_path = os.path.join(out_dir, f"cleanvoice_enhanced{ext}")
149
- with open(out_path, "wb") as f:
150
- f.write(dl.content)
151
- logger.info(f"[Cleanvoice] βœ… Enhanced audio saved β†’ {out_path}")
152
-
153
- return {
154
- "audio_path": out_path,
155
- "stats": {
156
- "noise_method": "Cleanvoice API",
157
- "fillers_removed": "yes" if opt_fillers else "no",
158
- "stutters_removed": "yes" if opt_stutters else "no",
159
- "silences_removed_sec": "yes" if opt_silences else "no",
160
- "breaths_reduced": opt_breaths,
161
- "mouth_sounds_removed": "yes" if opt_mouth else "no",
162
- },
163
- }
164
-
165
- elif status in ("error", "failed"):
166
- raise RuntimeError(f"Cleanvoice job failed: {status_data.get('message', status_data)}")
167
-
168
- # still processing — keep polling
169
-
170
- raise RuntimeError(f"Cleanvoice timed out after {max_attempts * 10}s (edit_id={edit_id})")
171
-
172
-
173
  # ══════════════════════════════════════════════════════════════════════════════
174
  # PIPELINE
175
  # ══════════════════════════════════════════════════════════════════════════════
@@ -177,9 +63,12 @@ def cleanvoice_enhance(audio_path: str, out_dir: str,
177
  def run_pipeline(audio_path, src_lang="auto", tgt_lang="te",
178
  opt_fillers=True, opt_stutters=True, opt_silences=True,
179
  opt_breaths=True, opt_mouth=True):
180
-
181
- out_dir = tempfile.mkdtemp()
182
- stats = {}
 
 
 
183
  word_segs = []
184
 
185
  try:
@@ -187,16 +76,23 @@ def run_pipeline(audio_path, src_lang="auto", tgt_lang="te",
187
  yield {"status": "processing", "step": 1,
188
  "message": "Step 1/4 β€” Enhancing audio with Cleanvoice..."}
189
  try:
190
- result = cleanvoice_enhance(
191
  audio_path, out_dir,
192
- opt_fillers=opt_fillers,
193
- opt_stutters=opt_stutters,
194
- opt_silences=opt_silences,
195
- opt_breaths=opt_breaths,
196
- opt_mouth=opt_mouth,
197
  )
198
  clean1 = result["audio_path"]
199
- stats = result["stats"]
 
 
 
 
 
 
 
200
  logger.info("[Pipeline] Cleanvoice enhancement complete")
201
  except Exception as e:
202
  # Cleanvoice failed — log it and continue with original audio
@@ -216,6 +112,11 @@ def run_pipeline(audio_path, src_lang="auto", tgt_lang="te",
216
  "message": "Step 2/4 β€” Transcribing..."}
217
  transcript, detected_lang, t_method = transcriber.transcribe(clean1, src_lang)
218
  word_segs = transcriber._last_segments
 
 
 
 
 
219
  logger.info(f"[Pipeline] Transcription done: {len(transcript.split())} words, lang={detected_lang}")
220
 
221
  # ── Step 3: Translate ─────────────────────────────────────────────────
@@ -235,6 +136,7 @@ def run_pipeline(audio_path, src_lang="auto", tgt_lang="te",
235
  "message": "Step 4/4 β€” Summarizing & uploading..."}
236
  summary = translator.summarize(transcript)
237
 
 
238
  try:
239
  upload_result = cloudinary.uploader.upload(
240
  clean1,
@@ -245,7 +147,6 @@ def run_pipeline(audio_path, src_lang="auto", tgt_lang="te",
245
  logger.info(f"[Pipeline] Cloudinary upload done: {enhanced_url}")
246
  except Exception as e:
247
  logger.error(f"[Pipeline] Cloudinary upload failed: {e}")
248
- enhanced_url = None
249
 
250
  # ── Done ──────────────────────────────────────────────────────────────
251
  yield {
@@ -266,14 +167,13 @@ def run_pipeline(audio_path, src_lang="auto", tgt_lang="te",
266
  "mouth_sounds_removed": stats.get("mouth_sounds_removed", 0),
267
  "transcription_method": t_method,
268
  "translation_method": tl_method,
269
- "processing_sec": 0,
270
  "word_segments": len(word_segs),
271
  "transcript_words": len(transcript.split()),
272
  },
273
  }
274
 
275
  except Exception as e:
276
- logger.error(f"Pipeline failed: {e}", exc_info=True)
277
  yield {"status": "error", "message": f"Error: {str(e)}"}
278
 
279
 
@@ -311,13 +211,23 @@ async def process_url(request: Request):
311
 
312
  yield sse({"status": "processing", "step": 0, "message": "Downloading audio..."})
313
 
 
314
  try:
315
  resp = requests.get(audio_url, timeout=60, stream=True)
316
  resp.raise_for_status()
317
- suffix = ".wav" if "wav" in audio_url.lower() else ".mp3"
318
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
 
 
 
 
 
 
 
 
 
319
  downloaded = 0
320
- total = int(resp.headers.get("content-length", 0))
321
  for chunk in resp.iter_content(chunk_size=65536):
322
  if chunk:
323
  tmp.write(chunk)
@@ -325,12 +235,13 @@ async def process_url(request: Request):
325
  if total:
326
  pct = int(downloaded * 100 / total)
327
  yield sse({"status": "processing", "step": 0,
328
- "message": "Downloading... " + str(pct) + "%"})
329
  tmp.close()
330
  except Exception as e:
331
- yield sse({"status": "error", "message": "Download failed: " + str(e)})
332
  return
333
 
 
334
  for result in run_pipeline(tmp.name, src_lang, tgt_lang,
335
  opt_fillers, opt_stutters, opt_silences,
336
  opt_breaths, opt_mouth):
 
1
  """
2
+ ClearWave AI — API Space (FastAPI)
3
+ ===================================
4
+ Endpoints: /api/health | /api/process-url
5
+
6
+ Pipeline:
7
+ 1. Download audio from URL
8
+ 2. Denoise / enhance → Denoiser (Cleanvoice SDK)
9
+ 3. Transcribe → Groq Whisper large-v3 (primary) / faster-whisper (fallback)
10
+ 4. Translate → NLLB-200-1.3B (primary) / Google Translate (fallback)
11
+ 5. Summarize → Extractive (position-scored)
12
+ 6. Upload result → Cloudinary
13
+
14
+ All secrets read from HF Space environment variables:
15
+ CLEANVOICE_API_KEY, CLOUD_NAME, API_KEY, API_SECRET, GROQ_API_KEY
16
  """
17
 
18
  import os
 
27
  from fastapi.responses import StreamingResponse, JSONResponse
28
  from fastapi.middleware.cors import CORSMiddleware
29
 
30
+ from denoiser import Denoiser
31
+ from transcriber import Transcriber
32
+ from translator import Translator
33
+
34
+ # ── Cloudinary config ──────────────────────────────────────────────────────────
35
  cloudinary.config(
36
  cloud_name = os.environ.get("CLOUD_NAME"),
37
  api_key = os.environ.get("API_KEY"),
38
  api_secret = os.environ.get("API_SECRET"),
39
  )
40
 
 
 
 
 
41
  logging.basicConfig(level=logging.INFO)
42
  logger = logging.getLogger(__name__)
43
 
44
+ # ── Singleton instances (loaded once at startup) ───────────────────────────────
45
+ denoiser = Denoiser()
 
46
  transcriber = Transcriber()
47
  translator = Translator()
48
 
 
56
  )
57
 
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  # ══════════════════════════════════════════════════════════════════════════════
60
  # PIPELINE
61
  # ══════════════════════════════════════════════════════════════════════════════
 
63
  def run_pipeline(audio_path, src_lang="auto", tgt_lang="te",
64
  opt_fillers=True, opt_stutters=True, opt_silences=True,
65
  opt_breaths=True, opt_mouth=True):
66
+ """
67
+ Generator — yields SSE-style dicts at each step.
68
+ Caller wraps each dict in "data: <json>\n\n"
69
+ """
70
+ out_dir = tempfile.mkdtemp()
71
+ stats = {}
72
  word_segs = []
73
 
74
  try:
 
76
  yield {"status": "processing", "step": 1,
77
  "message": "Step 1/4 β€” Enhancing audio with Cleanvoice..."}
78
  try:
79
+ result = denoiser.process(
80
  audio_path, out_dir,
81
+ fillers=opt_fillers,
82
+ stutters=opt_stutters,
83
+ long_silences=opt_silences,
84
+ breaths=opt_breaths,
85
+ mouth_sounds=opt_mouth,
86
  )
87
  clean1 = result["audio_path"]
88
+ stats = {
89
+ "noise_method": "Cleanvoice API",
90
+ "fillers_removed": "yes" if opt_fillers else "no",
91
+ "stutters_removed": "yes" if opt_stutters else "no",
92
+ "silences_removed_sec": "yes" if opt_silences else "no",
93
+ "breaths_reduced": opt_breaths,
94
+ "mouth_sounds_removed": "yes" if opt_mouth else "no",
95
+ }
96
  logger.info("[Pipeline] Cleanvoice enhancement complete")
97
  except Exception as e:
98
  # Cleanvoice failed — log it and continue with original audio
 
112
  "message": "Step 2/4 β€” Transcribing..."}
113
  transcript, detected_lang, t_method = transcriber.transcribe(clean1, src_lang)
114
  word_segs = transcriber._last_segments
115
+
116
+ # Clean filler words from transcript text too
117
+ if opt_fillers:
118
+ transcript = denoiser.clean_transcript_fillers(transcript)
119
+
120
  logger.info(f"[Pipeline] Transcription done: {len(transcript.split())} words, lang={detected_lang}")
121
 
122
  # ── Step 3: Translate ─────────────────────────────────────────────────
 
136
  "message": "Step 4/4 β€” Summarizing & uploading..."}
137
  summary = translator.summarize(transcript)
138
 
139
+ enhanced_url = None
140
  try:
141
  upload_result = cloudinary.uploader.upload(
142
  clean1,
 
147
  logger.info(f"[Pipeline] Cloudinary upload done: {enhanced_url}")
148
  except Exception as e:
149
  logger.error(f"[Pipeline] Cloudinary upload failed: {e}")
 
150
 
151
  # ── Done ──────────────────────────────────────────────────────────────
152
  yield {
 
167
  "mouth_sounds_removed": stats.get("mouth_sounds_removed", 0),
168
  "transcription_method": t_method,
169
  "translation_method": tl_method,
 
170
  "word_segments": len(word_segs),
171
  "transcript_words": len(transcript.split()),
172
  },
173
  }
174
 
175
  except Exception as e:
176
+ logger.error(f"[Pipeline] Fatal error: {e}", exc_info=True)
177
  yield {"status": "error", "message": f"Error: {str(e)}"}
178
 
179
 
 
211
 
212
  yield sse({"status": "processing", "step": 0, "message": "Downloading audio..."})
213
 
214
+ # ── Download audio from URL ───────────────────────────────────────────
215
  try:
216
  resp = requests.get(audio_url, timeout=60, stream=True)
217
  resp.raise_for_status()
218
+
219
+ # Detect extension — support WhatsApp .opus and common formats
220
+ lower_url = audio_url.lower().split("?")[0]
221
+ if ".opus" in lower_url: suffix = ".opus"
222
+ elif ".ogg" in lower_url: suffix = ".ogg"
223
+ elif ".aac" in lower_url: suffix = ".aac"
224
+ elif ".m4a" in lower_url: suffix = ".m4a"
225
+ elif ".wav" in lower_url: suffix = ".wav"
226
+ else: suffix = ".mp3"
227
+
228
+ tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
229
  downloaded = 0
230
+ total = int(resp.headers.get("content-length", 0))
231
  for chunk in resp.iter_content(chunk_size=65536):
232
  if chunk:
233
  tmp.write(chunk)
 
235
  if total:
236
  pct = int(downloaded * 100 / total)
237
  yield sse({"status": "processing", "step": 0,
238
+ "message": f"Downloading... {pct}%"})
239
  tmp.close()
240
  except Exception as e:
241
+ yield sse({"status": "error", "message": f"Download failed: {e}"})
242
  return
243
 
244
+ # ── Run pipeline ──────────────────────────────────────────────────────
245
  for result in run_pipeline(tmp.name, src_lang, tgt_lang,
246
  opt_fillers, opt_stutters, opt_silences,
247
  opt_breaths, opt_mouth):