Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -32,8 +32,9 @@ LOCAL_STATE_FOLDER.mkdir(exist_ok=True)
|
|
| 32 |
# Directory within the HF dataset where the audio files are located
|
| 33 |
AUDIO_FILE_PREFIX = "audio/"
|
| 34 |
|
|
|
|
| 35 |
WHISPER_SERVERS = [
|
| 36 |
-
f"https://makeitfr-mineo-{i}.hf.space/transcribe" for i in range(1, 21)
|
| 37 |
]
|
| 38 |
|
| 39 |
# Temporary storage for audio files
|
|
@@ -48,7 +49,6 @@ class WhisperServer:
|
|
| 48 |
def __init__(self, url: str):
|
| 49 |
self.url = url
|
| 50 |
self.is_processing = False
|
| 51 |
-
self.current_file_index: Optional[int] = None
|
| 52 |
self.total_processed = 0
|
| 53 |
self.total_time = 0.0
|
| 54 |
|
|
@@ -60,11 +60,11 @@ class WhisperServer:
|
|
| 60 |
def release(self):
|
| 61 |
"""Release the server for a new file"""
|
| 62 |
self.is_processing = False
|
| 63 |
-
self.current_file_index = None
|
| 64 |
|
| 65 |
# Global state for whisper servers
|
| 66 |
servers = [WhisperServer(url) for url in WHISPER_SERVERS]
|
| 67 |
server_lock = asyncio.Lock()
|
|
|
|
| 68 |
|
| 69 |
# --- Progress and State Management Functions ---
|
| 70 |
|
|
@@ -181,21 +181,22 @@ async def transcribe_with_server(server: WhisperServer, wav_path: Path) -> Optio
|
|
| 181 |
print(f"[{FLOW_ID}] Error transcribing with {server.url}: {e}")
|
| 182 |
return None
|
| 183 |
|
| 184 |
-
async def
|
| 185 |
-
|
| 186 |
-
while
|
| 187 |
async with server_lock:
|
| 188 |
-
|
|
|
|
|
|
|
|
|
|
| 189 |
if not s.is_processing:
|
| 190 |
s.is_processing = True
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
if server is None:
|
| 194 |
-
await asyncio.sleep(1)
|
| 195 |
|
|
|
|
|
|
|
| 196 |
try:
|
| 197 |
-
# FIX: Ensure we use the correct local path for the downloaded file
|
| 198 |
-
# hf_hub_download returns the absolute path to the downloaded file
|
| 199 |
print(f"[{FLOW_ID}] Downloading {wav_file}...")
|
| 200 |
downloaded_path_str = hf_hub_download(
|
| 201 |
repo_id=HF_AUDIO_DATASET_ID,
|
|
@@ -233,7 +234,6 @@ async def process_file_task(wav_file: str, state: Dict, progress: Dict):
|
|
| 233 |
state["file_states"][wav_file] = "failed_transcription"
|
| 234 |
print(f"[{FLOW_ID}] ❌ Failed: {wav_file}")
|
| 235 |
|
| 236 |
-
# Cleanup
|
| 237 |
if wav_path.exists():
|
| 238 |
wav_path.unlink()
|
| 239 |
|
|
@@ -256,7 +256,6 @@ async def main_processing_loop():
|
|
| 256 |
await asyncio.sleep(60)
|
| 257 |
continue
|
| 258 |
|
| 259 |
-
# Check HF_OUTPUT_DATASET_ID for existing JSON outputs
|
| 260 |
print(f"[{FLOW_ID}] Checking {HF_OUTPUT_DATASET_ID} for existing JSON outputs...")
|
| 261 |
try:
|
| 262 |
api = HfApi(token=HF_TOKEN)
|
|
@@ -267,10 +266,7 @@ async def main_processing_loop():
|
|
| 267 |
print(f"[{FLOW_ID}] Warning: Could not fetch existing files: {e}")
|
| 268 |
existing_json_files = set()
|
| 269 |
|
| 270 |
-
# 1. Handpick failed_transcription files
|
| 271 |
failed_files = [f for f, s in state.get("file_states", {}).items() if s == "failed_transcription"]
|
| 272 |
-
|
| 273 |
-
# 2. Check for new files
|
| 274 |
next_idx = state.get("next_download_index", 0)
|
| 275 |
new_files_chunk = file_list[next_idx:next_idx + 1000]
|
| 276 |
|
|
@@ -298,10 +294,7 @@ async def main_processing_loop():
|
|
| 298 |
state["next_download_index"] = current_idx + 1
|
| 299 |
continue
|
| 300 |
|
| 301 |
-
# Found an UNPROCESSED file
|
| 302 |
print(f"[{FLOW_ID}] Found unprocessed file: {f}")
|
| 303 |
-
|
| 304 |
-
# Upload skipped state before processing
|
| 305 |
if state_changed_locally:
|
| 306 |
print(f"[{FLOW_ID}] Synchronizing skipped files to HF state...")
|
| 307 |
await upload_hf_state(state)
|
|
|
|
| 32 |
# Directory within the HF dataset where the audio files are located
|
| 33 |
AUDIO_FILE_PREFIX = "audio/"
|
| 34 |
|
| 35 |
+
# FIX: Updated server list based on the logs showing 'eliasishere' prefix
|
| 36 |
WHISPER_SERVERS = [
|
| 37 |
+
f"https://eliasishere-makeitfr-mineo-{i}.hf.space/transcribe" for i in range(1, 21)
|
| 38 |
]
|
| 39 |
|
| 40 |
# Temporary storage for audio files
|
|
|
|
| 49 |
def __init__(self, url: str):
|
| 50 |
self.url = url
|
| 51 |
self.is_processing = False
|
|
|
|
| 52 |
self.total_processed = 0
|
| 53 |
self.total_time = 0.0
|
| 54 |
|
|
|
|
| 60 |
def release(self):
|
| 61 |
"""Release the server for a new file"""
|
| 62 |
self.is_processing = False
|
|
|
|
| 63 |
|
| 64 |
# Global state for whisper servers
|
| 65 |
servers = [WhisperServer(url) for url in WHISPER_SERVERS]
|
| 66 |
server_lock = asyncio.Lock()
|
| 67 |
+
server_index = 0 # For round-robin selection
|
| 68 |
|
| 69 |
# --- Progress and State Management Functions ---
|
| 70 |
|
|
|
|
| 181 |
print(f"[{FLOW_ID}] Error transcribing with {server.url}: {e}")
|
| 182 |
return None
|
| 183 |
|
| 184 |
+
async def get_available_server() -> WhisperServer:
|
| 185 |
+
global server_index
|
| 186 |
+
while True:
|
| 187 |
async with server_lock:
|
| 188 |
+
# Round-robin check
|
| 189 |
+
for _ in range(len(servers)):
|
| 190 |
+
s = servers[server_index]
|
| 191 |
+
server_index = (server_index + 1) % len(servers)
|
| 192 |
if not s.is_processing:
|
| 193 |
s.is_processing = True
|
| 194 |
+
return s
|
| 195 |
+
await asyncio.sleep(1)
|
|
|
|
|
|
|
| 196 |
|
| 197 |
+
async def process_file_task(wav_file: str, state: Dict, progress: Dict):
|
| 198 |
+
server = await get_available_server()
|
| 199 |
try:
|
|
|
|
|
|
|
| 200 |
print(f"[{FLOW_ID}] Downloading {wav_file}...")
|
| 201 |
downloaded_path_str = hf_hub_download(
|
| 202 |
repo_id=HF_AUDIO_DATASET_ID,
|
|
|
|
| 234 |
state["file_states"][wav_file] = "failed_transcription"
|
| 235 |
print(f"[{FLOW_ID}] ❌ Failed: {wav_file}")
|
| 236 |
|
|
|
|
| 237 |
if wav_path.exists():
|
| 238 |
wav_path.unlink()
|
| 239 |
|
|
|
|
| 256 |
await asyncio.sleep(60)
|
| 257 |
continue
|
| 258 |
|
|
|
|
| 259 |
print(f"[{FLOW_ID}] Checking {HF_OUTPUT_DATASET_ID} for existing JSON outputs...")
|
| 260 |
try:
|
| 261 |
api = HfApi(token=HF_TOKEN)
|
|
|
|
| 266 |
print(f"[{FLOW_ID}] Warning: Could not fetch existing files: {e}")
|
| 267 |
existing_json_files = set()
|
| 268 |
|
|
|
|
| 269 |
failed_files = [f for f, s in state.get("file_states", {}).items() if s == "failed_transcription"]
|
|
|
|
|
|
|
| 270 |
next_idx = state.get("next_download_index", 0)
|
| 271 |
new_files_chunk = file_list[next_idx:next_idx + 1000]
|
| 272 |
|
|
|
|
| 294 |
state["next_download_index"] = current_idx + 1
|
| 295 |
continue
|
| 296 |
|
|
|
|
| 297 |
print(f"[{FLOW_ID}] Found unprocessed file: {f}")
|
|
|
|
|
|
|
| 298 |
if state_changed_locally:
|
| 299 |
print(f"[{FLOW_ID}] Synchronizing skipped files to HF state...")
|
| 300 |
await upload_hf_state(state)
|