factorstudios commited on
Commit
e330188
·
verified ·
1 Parent(s): 8b5393c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -21
app.py CHANGED
@@ -32,8 +32,9 @@ LOCAL_STATE_FOLDER.mkdir(exist_ok=True)
32
  # Directory within the HF dataset where the audio files are located
33
  AUDIO_FILE_PREFIX = "audio/"
34
 
 
35
  WHISPER_SERVERS = [
36
- f"https://makeitfr-mineo-{i}.hf.space/transcribe" for i in range(1, 21)
37
  ]
38
 
39
  # Temporary storage for audio files
@@ -48,7 +49,6 @@ class WhisperServer:
48
  def __init__(self, url: str):
49
  self.url = url
50
  self.is_processing = False
51
- self.current_file_index: Optional[int] = None
52
  self.total_processed = 0
53
  self.total_time = 0.0
54
 
@@ -60,11 +60,11 @@ class WhisperServer:
60
  def release(self):
61
  """Release the server for a new file"""
62
  self.is_processing = False
63
- self.current_file_index = None
64
 
65
  # Global state for whisper servers
66
  servers = [WhisperServer(url) for url in WHISPER_SERVERS]
67
  server_lock = asyncio.Lock()
 
68
 
69
  # --- Progress and State Management Functions ---
70
 
@@ -181,21 +181,22 @@ async def transcribe_with_server(server: WhisperServer, wav_path: Path) -> Optio
181
  print(f"[{FLOW_ID}] Error transcribing with {server.url}: {e}")
182
  return None
183
 
184
- async def process_file_task(wav_file: str, state: Dict, progress: Dict):
185
- server = None
186
- while server is None:
187
  async with server_lock:
188
- for s in servers:
 
 
 
189
  if not s.is_processing:
190
  s.is_processing = True
191
- server = s
192
- break
193
- if server is None:
194
- await asyncio.sleep(1)
195
 
 
 
196
  try:
197
- # FIX: Ensure we use the correct local path for the downloaded file
198
- # hf_hub_download returns the absolute path to the downloaded file
199
  print(f"[{FLOW_ID}] Downloading {wav_file}...")
200
  downloaded_path_str = hf_hub_download(
201
  repo_id=HF_AUDIO_DATASET_ID,
@@ -233,7 +234,6 @@ async def process_file_task(wav_file: str, state: Dict, progress: Dict):
233
  state["file_states"][wav_file] = "failed_transcription"
234
  print(f"[{FLOW_ID}] ❌ Failed: {wav_file}")
235
 
236
- # Cleanup
237
  if wav_path.exists():
238
  wav_path.unlink()
239
 
@@ -256,7 +256,6 @@ async def main_processing_loop():
256
  await asyncio.sleep(60)
257
  continue
258
 
259
- # Check HF_OUTPUT_DATASET_ID for existing JSON outputs
260
  print(f"[{FLOW_ID}] Checking {HF_OUTPUT_DATASET_ID} for existing JSON outputs...")
261
  try:
262
  api = HfApi(token=HF_TOKEN)
@@ -267,10 +266,7 @@ async def main_processing_loop():
267
  print(f"[{FLOW_ID}] Warning: Could not fetch existing files: {e}")
268
  existing_json_files = set()
269
 
270
- # 1. Handpick failed_transcription files
271
  failed_files = [f for f, s in state.get("file_states", {}).items() if s == "failed_transcription"]
272
-
273
- # 2. Check for new files
274
  next_idx = state.get("next_download_index", 0)
275
  new_files_chunk = file_list[next_idx:next_idx + 1000]
276
 
@@ -298,10 +294,7 @@ async def main_processing_loop():
298
  state["next_download_index"] = current_idx + 1
299
  continue
300
 
301
- # Found an UNPROCESSED file
302
  print(f"[{FLOW_ID}] Found unprocessed file: {f}")
303
-
304
- # Upload skipped state before processing
305
  if state_changed_locally:
306
  print(f"[{FLOW_ID}] Synchronizing skipped files to HF state...")
307
  await upload_hf_state(state)
 
32
  # Directory within the HF dataset where the audio files are located
33
  AUDIO_FILE_PREFIX = "audio/"
34
 
35
+ # FIX: Updated server list based on the logs showing 'eliasishere' prefix
36
  WHISPER_SERVERS = [
37
+ f"https://eliasishere-makeitfr-mineo-{i}.hf.space/transcribe" for i in range(1, 21)
38
  ]
39
 
40
  # Temporary storage for audio files
 
49
  def __init__(self, url: str):
50
  self.url = url
51
  self.is_processing = False
 
52
  self.total_processed = 0
53
  self.total_time = 0.0
54
 
 
60
  def release(self):
61
  """Release the server for a new file"""
62
  self.is_processing = False
 
63
 
64
  # Global state for whisper servers
65
  servers = [WhisperServer(url) for url in WHISPER_SERVERS]
66
  server_lock = asyncio.Lock()
67
+ server_index = 0 # For round-robin selection
68
 
69
  # --- Progress and State Management Functions ---
70
 
 
181
  print(f"[{FLOW_ID}] Error transcribing with {server.url}: {e}")
182
  return None
183
 
184
+ async def get_available_server() -> WhisperServer:
185
+ global server_index
186
+ while True:
187
  async with server_lock:
188
+ # Round-robin check
189
+ for _ in range(len(servers)):
190
+ s = servers[server_index]
191
+ server_index = (server_index + 1) % len(servers)
192
  if not s.is_processing:
193
  s.is_processing = True
194
+ return s
195
+ await asyncio.sleep(1)
 
 
196
 
197
+ async def process_file_task(wav_file: str, state: Dict, progress: Dict):
198
+ server = await get_available_server()
199
  try:
 
 
200
  print(f"[{FLOW_ID}] Downloading {wav_file}...")
201
  downloaded_path_str = hf_hub_download(
202
  repo_id=HF_AUDIO_DATASET_ID,
 
234
  state["file_states"][wav_file] = "failed_transcription"
235
  print(f"[{FLOW_ID}] ❌ Failed: {wav_file}")
236
 
 
237
  if wav_path.exists():
238
  wav_path.unlink()
239
 
 
256
  await asyncio.sleep(60)
257
  continue
258
 
 
259
  print(f"[{FLOW_ID}] Checking {HF_OUTPUT_DATASET_ID} for existing JSON outputs...")
260
  try:
261
  api = HfApi(token=HF_TOKEN)
 
266
  print(f"[{FLOW_ID}] Warning: Could not fetch existing files: {e}")
267
  existing_json_files = set()
268
 
 
269
  failed_files = [f for f, s in state.get("file_states", {}).items() if s == "failed_transcription"]
 
 
270
  next_idx = state.get("next_download_index", 0)
271
  new_files_chunk = file_list[next_idx:next_idx + 1000]
272
 
 
294
  state["next_download_index"] = current_idx + 1
295
  continue
296
 
 
297
  print(f"[{FLOW_ID}] Found unprocessed file: {f}")
 
 
298
  if state_changed_locally:
299
  print(f"[{FLOW_ID}] Synchronizing skipped files to HF state...")
300
  await upload_hf_state(state)