Samfredoly committed on
Commit
2280692
·
verified ·
1 Parent(s): f48dbca

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +45 -0
  2. app.py +704 -0
  3. requirements.txt +17 -0
Dockerfile ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim-bullseye

# Install system dependencies.
# contrib/non-free are enabled because `unrar` lives in non-free on Debian.
RUN sed -i 's/main/main contrib non-free/' /etc/apt/sources.list && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
        unrar \
        libgl1 \
        libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Upgrade pip and install core packaging tools first
RUN pip install --no-cache-dir --upgrade pip setuptools wheel packaging

# Copy requirements and install them.
# The PyTorch CPU wheel location is a package *index*, not a flat page of
# archives, so it has to be passed with --extra-index-url (PyPI stays the
# default index). This lets pip pick the "+cpu" builds of torch/torchaudio.
COPY requirements.txt .
RUN pip install --no-cache-dir \
        -r requirements.txt \
        --extra-index-url https://download.pytorch.org/whl/cpu

# Pin the versions of the model stack used by the application
RUN pip install --no-cache-dir \
        accelerate \
        transformers==4.36.2 \
        timm==0.9.12 \
        einops==0.7.0

# Copy application code
COPY . .

# Create non-root user and give it ownership of the application directory
RUN useradd -m -u 1000 user && \
    chown -R user:user /app

USER user

# Environment variables to suppress warnings
ENV HF_HUB_DISABLE_PROGRESS=1
ENV TF_CPP_MIN_LOG_LEVEL=3

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,704 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import requests
4
+ import subprocess
5
+ import shutil
6
+ import time
7
+ import sys
8
+ import threading
9
+ from typing import Dict, List, Optional, Any
10
+ from huggingface_hub import HfApi, hf_hub_url
11
+ from fastapi import FastAPI, HTTPException
12
+ from fastapi.responses import JSONResponse
13
+ import uvicorn
14
+
15
# On Windows, re-wrap stdout as UTF-8 so non-ASCII log text can be printed
if sys.platform == 'win32':
    import io
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# FastAPI application object served by uvicorn
app = FastAPI(
    title="Audio Transcriber",
    description="Audio transcription and upload service",
)

# ==== CONFIGURATION ====
HF_TOKEN = os.getenv("HF_TOKEN", "")
SOURCE_REPO_ID = "Samfredoly/BG_Vid"  # audio files are fetched from this dataset
TARGET_REPO_ID = "samfred2/A_Text"  # transcriptions and state are uploaded here
REFERENCE_REPO_ID = "Fred808/BG3"  # filenames here are matched against audio names

# Working directories (created at import time)
DOWNLOAD_FOLDER = "downloads_audio"
TRANSCRIPTIONS_FOLDER = "transcriptions"
LOCAL_STATE_FOLDER = ".state_audio"

for _folder in (DOWNLOAD_FOLDER, TRANSCRIPTIONS_FOLDER, LOCAL_STATE_FOLDER):
    os.makedirs(_folder, exist_ok=True)

# Names of the persistent log/state files
FAILED_FILES_LOG = "failed_audio_files.log"
HF_STATE_FILE = "processing_audio_state.json"

# Processing parameters
PROCESSING_DELAY = 2
MAX_RETRIES = 3
MIN_FREE_SPACE_GB = 1
WHISPER_MODEL = "small"  # Whisper model size

# Hugging Face API client shared by every helper in this module
hf_api = HfApi(token=HF_TOKEN)

# Mutable status shared between the processing loop and the HTTP endpoints
processing_status = dict(
    is_running=False,
    current_file=None,
    total_files=0,
    processed_files=0,
    failed_files=0,
    transcribed_files=0,
    last_update=None,
    logs=[],
)
62
+
63
def log_message(message: str, level: str = "INFO"):
    """Print a timestamped log line and record it in the shared status.

    The formatted line is appended to ``processing_status["logs"]`` (only the
    newest 100 entries are kept) and ``processing_status["last_update"]`` is
    set to the timestamp of this call.
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{stamp}] {level}: {message}"
    print(line)

    history = processing_status["logs"]
    history.append(line)
    processing_status["last_update"] = stamp

    # Bound the in-memory history to the most recent 100 lines.
    if len(history) > 100:
        processing_status["logs"] = history[-100:]
72
+
73
def log_failed_file(filename: str, error: str):
    """Append a failure record for *filename* to the persistent failure log.

    Each record is one line: ``<timestamp> - <filename>: <error>``. The log is
    written as UTF-8 so that non-ASCII filenames or error text cannot fail on
    platforms whose default encoding is not UTF-8.
    """
    stamp = time.strftime('%Y-%m-%d %H:%M:%S')
    with open(FAILED_FILES_LOG, "a", encoding="utf-8") as f:
        f.write(f"{stamp} - {filename}: {error}\n")
77
+
78
def get_disk_usage(path: str) -> Dict[str, float]:
    """Return disk usage for the filesystem containing *path*, in GB.

    Uses ``shutil.disk_usage`` so the function also works on Windows, where
    ``os.statvfs`` is not available.

    Returns:
        A dict with ``"total"``, ``"free"`` and ``"used"`` (all in GiB).
        ``"free"`` is the space available to the current user and ``"used"``
        is ``total - free``.
    """
    gib = 1024 ** 3
    usage = shutil.disk_usage(path)
    total = usage.total / gib
    free = usage.free / gib
    return {"total": total, "free": free, "used": total - free}
85
+
86
def check_disk_space(path: str = ".") -> bool:
    """Tell whether at least ``MIN_FREE_SPACE_GB`` is free at *path*.

    Logs a low-space message and returns ``False`` when the free space is
    below the threshold; returns ``True`` otherwise.
    """
    usage = get_disk_usage(path)
    if not usage["free"] < MIN_FREE_SPACE_GB:
        return True

    log_message(f'⚠️ Low disk space: {usage["free"]:.2f}GB free, {usage["used"]:.2f}GB used')
    return False
93
+
94
def cleanup_temp_files():
    """Delete leftover ``.wav``/``.mp3`` downloads to free disk space.

    The file currently being processed is kept. ``current_file`` holds the
    path inside the source repo (which may include folders), while downloads
    are stored under their basename, so the comparison is done on basenames.
    Removal is best-effort: failures are logged and do not stop the cleanup.
    """
    log_message("🧹 Cleaning up temporary files...", "INFO")

    current_file = processing_status.get("current_file")
    keep_name = os.path.basename(current_file) if current_file else None

    for file in os.listdir(DOWNLOAD_FOLDER):
        if file == keep_name or not file.endswith((".wav", ".mp3")):
            continue
        try:
            os.remove(os.path.join(DOWNLOAD_FOLDER, file))
            log_message(f"πŸ—‘οΈ Removed old download: {file}", "INFO")
        except OSError as e:
            log_message(f"⚠️ Could not remove old download {file}: {str(e)}", "WARNING")
106
+
107
def load_json_state(file_path: str, default_value: Dict[str, Any]) -> Dict[str, Any]:
    """Load the processing state from a JSON file, filling in missing keys.

    Args:
        file_path: Path of the JSON state file.
        default_value: Returned as-is when the file is missing or unusable.

    Returns:
        The loaded state with a dict under ``"file_states"`` and an integer
        ``"next_download_index"`` (added with defaults when absent), or
        *default_value* when the file does not exist, is not valid JSON, or
        does not contain a JSON object at the top level.
    """
    if not os.path.exists(file_path):
        return default_value

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        log_message(f"⚠️ Corrupted state file: {file_path}", "WARNING")
        return default_value

    # A state file must be a JSON object; anything else cannot be migrated.
    if not isinstance(data, dict):
        log_message(f"⚠️ Corrupted state file: {file_path}", "WARNING")
        return default_value

    # Migration: older or partial files may lack the per-file state mapping.
    if not isinstance(data.get("file_states"), dict):
        log_message("ℹ️ Initializing 'file_states' dictionary.", "INFO")
        data["file_states"] = {}

    data.setdefault("next_download_index", 0)
    return data
125
+
126
def save_json_state(file_path: str, data: Dict[str, Any]):
    """Write *data* to *file_path* as indented JSON, atomically.

    The JSON is first written to a sibling ``.tmp`` file and then moved over
    the target with ``os.replace``, so an interrupted write cannot leave a
    truncated state file behind.

    Raises:
        OSError: If the file cannot be written or replaced.
        TypeError: If *data* is not JSON-serializable.
    """
    tmp_path = file_path + ".tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        os.replace(tmp_path, file_path)
    except BaseException:
        # Do not leave a half-written temporary file around on failure.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
130
+
131
def download_hf_state(repo_id: str, filename: str) -> Dict[str, Any]:
    """Fetch the state file from a Hugging Face dataset repo.

    Returns the parsed state, or a fresh default state
    (``next_download_index`` 0, empty ``file_states``) when the file is not
    in the repo or anything goes wrong while listing/downloading it.
    """
    fallback = {"next_download_index": 0, "file_states": {}}
    state_path = os.path.join(LOCAL_STATE_FOLDER, filename)

    try:
        remote_files = hf_api.list_repo_files(repo_id=repo_id, repo_type="dataset")
        if filename in remote_files:
            from huggingface_hub import hf_hub_download

            # Download into the local state folder, then parse from there.
            hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                repo_type="dataset",
                local_dir=LOCAL_STATE_FOLDER,
                local_dir_use_symlinks=False,
            )
            log_message(f"βœ… Successfully downloaded state file from {repo_id}.", "INFO")
            return load_json_state(state_path, fallback)

        log_message(f"ℹ️ State file {filename} not found in {repo_id}. Starting from default state.", "INFO")
        return fallback

    except Exception as exc:
        log_message(f"⚠️ Failed to download state file from Hugging Face: {str(exc)}. Starting from default state.", "WARNING")
        return fallback
157
+
158
def upload_hf_state(repo_id: str, filename: str, state: Dict[str, Any]) -> bool:
    """Save *state* locally and push it to a Hugging Face dataset repo.

    Returns ``True`` when the upload succeeded and ``False`` (after logging
    the error) when saving or uploading raised.
    """
    state_path = os.path.join(LOCAL_STATE_FOLDER, filename)

    try:
        save_json_state(state_path, state)

        message = f"Update audio processing state: next_index={state['next_download_index']}"
        hf_api.upload_file(
            path_or_fileobj=state_path,
            path_in_repo=filename,
            repo_id=repo_id,
            repo_type="dataset",
            commit_message=message,
        )
    except Exception as exc:
        log_message(f"❌ Failed to upload state file to Hugging Face: {str(exc)}", "ERROR")
        return False

    log_message(f"βœ… Successfully uploaded updated state file to {repo_id}", "INFO")
    return True
177
+
178
def lock_file_for_processing(wav_filename: str, state: Dict[str, Any]) -> bool:
    """Mark *wav_filename* as ``'processing'`` in *state* and upload the state.

    Returns ``True`` when the state upload succeeded. On failure the entry for
    *wav_filename* is removed from ``state["file_states"]`` again and
    ``False`` is returned.
    """
    log_message(f"πŸ”’ Attempting to lock file: {wav_filename} (Marking as 'processing')", "INFO")

    state["file_states"][wav_filename] = "processing"
    locked = upload_hf_state(TARGET_REPO_ID, HF_STATE_FILE, state)

    if not locked:
        log_message(f"❌ Failed to upload lock for file: {wav_filename}. Aborting processing.", "ERROR")
        # Roll the in-memory marker back since the remote never saw it.
        if wav_filename in state["file_states"]:
            del state["file_states"][wav_filename]
        return False

    log_message(f"βœ… Successfully locked file: {wav_filename}", "INFO")
    return True
192
+
193
def unlock_file_as_processed(wav_filename: str, state: Dict[str, Any], next_index: int) -> bool:
    """Mark *wav_filename* as ``'processed'``, store *next_index*, and upload.

    Returns ``True`` when the updated state was uploaded, ``False`` otherwise.
    """
    log_message(f"πŸ”“ Attempting to unlock file: {wav_filename} (Marking as 'processed')", "INFO")

    state["next_download_index"] = next_index
    state["file_states"][wav_filename] = "processed"

    uploaded = upload_hf_state(TARGET_REPO_ID, HF_STATE_FILE, state)
    if uploaded:
        log_message(f"βœ… Successfully unlocked and marked as processed: {wav_filename}", "INFO")
    else:
        log_message(f"❌ Failed to upload final state for file: {wav_filename}.", "ERROR")
    return bool(uploaded)
206
+
207
def download_with_retry(url: str, dest_path: str, max_retries: int = 3) -> bool:
    """Download *url* to *dest_path*, retrying on network errors.

    Before downloading, free disk space is checked (with one cleanup attempt
    if it is low).

    Args:
        url: File URL; the HF token is sent as a Bearer header when set.
        dest_path: Local destination path. Its parent directory is created
            when the path has a directory component.
        max_retries: Maximum number of download attempts.

    Returns:
        ``True`` when the file was fully written, ``False`` otherwise. A
        partially written file is removed after a failed attempt.
    """
    if not check_disk_space():
        cleanup_temp_files()
        if not check_disk_space():
            log_message("❌ Insufficient disk space even after cleanup", "ERROR")
            return False

    # os.makedirs("") raises, so only create a directory when there is one.
    dest_dir = os.path.dirname(dest_path)
    if dest_dir:
        try:
            os.makedirs(dest_dir, exist_ok=True)
        except Exception as e:
            log_message(f"❌ Failed to create directory for download path {dest_dir}: {str(e)}", "ERROR")
            return False

    def _discard_partial():
        """Remove an incomplete download so it is never mistaken for a good file."""
        try:
            if os.path.exists(dest_path):
                os.remove(dest_path)
        except OSError:
            pass  # best effort; the next attempt overwrites the file anyway

    # An empty token would produce an invalid "Bearer " header.
    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}

    for attempt in range(max_retries):
        try:
            # (connect, read) timeouts so a stalled connection cannot hang forever.
            with requests.get(url, headers=headers, stream=True, timeout=(10, 60)) as r:
                r.raise_for_status()

                with open(dest_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

            log_message(f"βœ… Download successful: {dest_path}", "INFO")
            return True

        except requests.exceptions.RequestException as e:
            log_message(f"❌ Download attempt {attempt + 1} failed for {url}: {str(e)}", "WARNING")
            _discard_partial()
            time.sleep(PROCESSING_DELAY)
        except Exception as e:
            log_message(f"❌ An unexpected error occurred during download: {str(e)}", "ERROR")
            _discard_partial()
            return False

    log_message(f"❌ Failed to download {url} after {max_retries} attempts.", "ERROR")
    return False
244
+
245
def fetch_reference_files(repo_id: str) -> Dict[str, str]:
    """List every file in the reference dataset repo, keyed by base name.

    Returns a mapping from each file's base name (no folders, no extension)
    to its full path in the repo; when several files share a base name the
    last one listed wins. Returns an empty dict if listing the repo fails.
    """
    log_message(f"πŸ“‹ Fetching file list from {repo_id}...", "INFO")

    try:
        # All file types are included (zip, rar, wav, mp3, etc.)
        repo_files = hf_api.list_repo_files(repo_id=repo_id, repo_type="dataset")

        filename_map = {
            os.path.splitext(os.path.basename(repo_path))[0]: repo_path
            for repo_path in repo_files
        }
    except Exception as exc:
        log_message(f"❌ Failed to fetch reference files: {str(exc)}", "ERROR")
        return {}

    log_message(f"βœ… Found {len(filename_map)} files in reference repo", "INFO")
    return filename_map
267
+
268
def find_matching_filename(transcribed_filename: str, reference_map: Dict[str, str]) -> Optional[str]:
    """Look up the reference path that corresponds to an audio filename.

    The audio name (without extension) is first looked up exactly in
    *reference_map*; failing that, the first reference whose base name
    contains it, or is contained in it (case-insensitive), is used.

    Returns the matching full reference path, or ``None`` when nothing matches.
    """
    base_name = os.path.splitext(transcribed_filename)[0]

    # 1) Exact match on the base name
    if base_name in reference_map:
        exact_path = reference_map[base_name]
        print(f"\nβœ… MATCH FOUND:")
        print(f" Audio: {transcribed_filename}")
        print(f" File: {exact_path}")
        log_message(f"βœ… Found exact match: {transcribed_filename} -> {exact_path}", "INFO")
        return exact_path

    # 2) Case-insensitive substring match in either direction; first hit wins
    needle = base_name.lower()
    first_hit = next(
        (
            (candidate, candidate_path)
            for candidate, candidate_path in reference_map.items()
            if needle in candidate.lower() or candidate.lower() in needle
        ),
        None,
    )
    if first_hit is not None:
        _, partial_path = first_hit
        print(f"\nβœ… PARTIAL MATCH FOUND:")
        print(f" Audio: {transcribed_filename}")
        print(f" File: {partial_path}")
        log_message(f"βœ… Found partial match: {transcribed_filename} -> {partial_path}", "INFO")
        return partial_path

    # 3) Nothing matched
    print(f"\n❌ NO MATCH FOUND:")
    print(f" Audio: {transcribed_filename}")
    log_message(f"⚠️ No matching filename found for: {transcribed_filename}", "WARNING")
    return None
300
+
301
def _get_whisper_pipeline():
    """Return the shared Whisper ASR pipeline, building it on first use.

    Loading the model is expensive, so the pipeline is created once and
    cached on this function instead of being rebuilt for every audio file.
    """
    cached = getattr(_get_whisper_pipeline, "_pipe", None)
    if cached is None:
        import torch
        from transformers import pipeline

        log_message(f"Loading Whisper {WHISPER_MODEL} model from Transformers...", "INFO")
        cached = pipeline(
            "automatic-speech-recognition",
            model=f"openai/whisper-{WHISPER_MODEL}",
            device=0 if torch.cuda.is_available() else -1,  # GPU if available, else CPU
            # Process long recordings in 30 s windows instead of only the
            # first window of audio.
            chunk_length_s=30,
        )
        _get_whisper_pipeline._pipe = cached
    return cached


def transcribe_audio(wav_path: str) -> Optional[Dict[str, Any]]:
    """Transcribe an audio file using Whisper from Transformers.

    The audio is loaded with librosa at 16 kHz and passed to the shared
    Whisper pipeline.

    Returns:
        ``{"text": <transcript>, "segments": [{"text": <transcript>}]}`` on
        success, or ``None`` when a required library is missing or the
        transcription fails (the error is logged).
    """
    log_message(f"🎀 Transcribing audio file: {wav_path}", "INFO")

    try:
        import librosa

        # Load audio with librosa
        log_message(f"Loading audio file: {wav_path}", "INFO")
        audio, sr = librosa.load(wav_path, sr=16000)

        pipe = _get_whisper_pipeline()

        # Transcribe
        log_message("Transcribing audio...", "INFO")
        result = pipe(audio)

        # Format result to match openai-whisper format
        formatted_result = {
            "text": result["text"],
            "segments": [{"text": result["text"]}],
        }

        log_message(f"βœ… Successfully transcribed: {wav_path}", "INFO")
        return formatted_result

    except ImportError as e:
        log_message(f"❌ Missing library. Install with: pip install transformers librosa torch torchaudio", "ERROR")
        log_message(f" Error: {str(e)}", "ERROR")
        return None
    except Exception as e:
        log_message(f"❌ Failed to transcribe {wav_path}: {str(e)}", "ERROR")
        return None
342
+
343
def process_audio_file(wav_path: str, reference_map: Dict[str, str], matched_filename: str) -> bool:
    """Transcribe one audio file and upload the result to the target repo.

    Steps:
        1. Transcribe using Whisper.
        2. Save the transcription as JSON under ``TRANSCRIPTIONS_FOLDER``.
        3. Upload the JSON to ``transcriptions/`` in the target dataset.
        4. Remove the local JSON (also when saving or uploading failed).

    Args:
        wav_path: Local path of the downloaded audio file.
        reference_map: Unused here; kept so existing callers keep working.
        matched_filename: Reference path the output file name is derived from.

    Returns:
        ``True`` when the transcription was uploaded; ``False`` otherwise
        (the failure is logged and recorded in the failed-files log).
    """
    wav_filename = os.path.basename(wav_path)

    # 1. Transcribe audio
    transcription = transcribe_audio(wav_path)
    if transcription is None:
        log_failed_file(wav_filename, "Transcription failed")
        return False

    json_filename = os.path.splitext(matched_filename)[0] + "_transcription.json"
    json_output_path = os.path.join(TRANSCRIPTIONS_FOLDER, json_filename)

    try:
        # 2. Save transcription as JSON
        try:
            os.makedirs(os.path.dirname(json_output_path), exist_ok=True)

            with open(json_output_path, "w", encoding="utf-8") as f:
                json.dump(transcription, f, indent=2, ensure_ascii=False)

            log_message(f"βœ… Saved transcription: {json_output_path}", "INFO")

        except Exception as e:
            log_message(f"❌ Failed to save transcription JSON: {str(e)}", "ERROR")
            log_failed_file(wav_filename, f"Failed to save JSON: {str(e)}")
            return False

        # 3. Upload to HF dataset
        try:
            hf_api.upload_file(
                path_or_fileobj=json_output_path,
                path_in_repo=f"transcriptions/{json_filename}",
                repo_id=TARGET_REPO_ID,
                repo_type="dataset",
                commit_message=f"Add transcription for: {matched_filename}",
            )
            log_message(f"βœ… Successfully uploaded transcription: {json_filename}", "INFO")
            processing_status["transcribed_files"] += 1

        except Exception as e:
            log_message(f"❌ Failed to upload transcription to HF: {str(e)}", "ERROR")
            log_failed_file(wav_filename, f"Failed to upload: {str(e)}")
            return False

        return True

    finally:
        # 4. Clean up the local JSON on every exit path so failed runs do not
        # accumulate files on disk. Removal itself is best-effort.
        if os.path.exists(json_output_path):
            try:
                os.remove(json_output_path)
                log_message(f"πŸ—‘οΈ Cleaned up local transcription file: {json_output_path}", "INFO")
            except OSError as e:
                log_message(f"⚠️ Could not remove local transcription file {json_output_path}: {str(e)}", "WARNING")
404
+
405
def get_next_file_to_process(repo_id: str, state: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Pick the next audio file in the source repo that still needs work.

    The repo's ``.wav``/``.mp3`` files are sorted in descending name order and
    scanned starting at ``state["next_download_index"]``. The first file with
    no recorded state, or with state ``"failed"``, is selected.

    Returns: { 'filename': str, 'url': str, 'index': int } or None. When the
    start index is at or past the end of the list, the index in *state* is
    reset to 0 and the state is uploaded before returning ``None``.
    """
    log_message(f"πŸ” Searching for next audio file to process in {repo_id}", "INFO")

    try:
        repo_files = hf_api.list_repo_files(repo_id=repo_id, repo_type="dataset")

        # Keep audio files only, in reverse (descending) order
        audio_files = [name for name in repo_files if name.endswith(('.wav', '.mp3'))]
        audio_files.sort(reverse=True)

        if not audio_files:
            log_message("ℹ️ No audio files found in the source repository.", "INFO")
            return None

        file_count = len(audio_files)
        processing_status["total_files"] = file_count

        start_index = state.get("next_download_index", 0)

        for index in range(start_index, file_count):
            filename = audio_files[index]
            file_state = state["file_states"].get(filename)

            if file_state == "processing":
                log_message(f"⚠️ File {filename} is currently marked as 'processing'. Skipping for now.", "WARNING")
                continue
            if file_state == "processed":
                log_message(f"ℹ️ File {filename} already processed. Skipping.", "INFO")
                continue
            if file_state is not None and file_state != "failed":
                # Unknown state value: skipped silently.
                continue

            url = hf_hub_url(repo_id=repo_id, filename=filename, repo_type="dataset", subfolder=None)
            log_message(f"βœ… Found next audio file: {filename} at index {index}", "INFO")
            return {'filename': filename, 'url': url, 'index': index}

        log_message("ℹ️ All files up to the current index have been processed or skipped.", "INFO")

        if start_index >= file_count:
            log_message("ℹ️ Reached end of file list. Resetting index to 0 for next loop.", "INFO")
            state["next_download_index"] = 0
            upload_hf_state(TARGET_REPO_ID, HF_STATE_FILE, state)

        return None

    except Exception as exc:
        log_message(f"❌ Failed to list files from Hugging Face: {str(exc)}", "ERROR")
        return None
458
+
459
def main_processing_loop():
    """Run the download -> transcribe -> upload cycle until stopped.

    The loop runs while ``processing_status["is_running"]`` is true (the
    ``/stop`` endpoint clears it). Each iteration downloads the shared state,
    picks the next file, locks it, processes it, and then records the file as
    ``processed`` or ``failed`` while advancing ``next_download_index``.

    A file whose lock could not be uploaded is skipped without touching its
    state or any local file; only files that were actually locked go through
    the bookkeeping in the ``finally`` clause.
    """
    if processing_status["is_running"]:
        log_message("⚠️ Processing loop is already running.", "WARNING")
        return

    processing_status["is_running"] = True

    try:
        log_message("πŸš€ Starting audio transcription processing loop...", "INFO")

        # Fetch the reference file list (REFERENCE_REPO_ID) once at the start
        reference_map = fetch_reference_files(REFERENCE_REPO_ID)

        if not reference_map:
            log_message("❌ No reference files found. Cannot proceed.", "ERROR")
            return

        while processing_status["is_running"]:

            current_state = download_hf_state(TARGET_REPO_ID, HF_STATE_FILE)
            next_file_info = get_next_file_to_process(SOURCE_REPO_ID, current_state)

            if next_file_info is None:
                log_message("πŸ’€ No new audio files to process. Sleeping for a while...", "INFO")
                time.sleep(PROCESSING_DELAY * 5)
                continue

            target_file = next_file_info['filename']
            audio_url = next_file_info['url']
            target_index = next_file_info['index']
            # Computed up front so the cleanup below can always refer to it.
            local_wav_path = os.path.join(DOWNLOAD_FOLDER, os.path.basename(target_file))

            processing_status["current_file"] = target_file

            # Lock before entering try/finally: if the lock fails, nothing was
            # claimed, so there is nothing to mark as failed or clean up.
            if not lock_file_for_processing(target_file, current_state):
                log_message(f"❌ Failed to lock file {target_file}. Skipping.", "ERROR")
                time.sleep(PROCESSING_DELAY)
                continue

            success = False

            try:
                log_message(f"⬇️ Downloading audio file: {target_file}", "INFO")

                if download_with_retry(audio_url, local_wav_path):

                    # Extract base filename for matching
                    base_filename = os.path.basename(target_file)
                    matched_filename = find_matching_filename(base_filename, reference_map)

                    if matched_filename:
                        if process_audio_file(local_wav_path, reference_map, matched_filename):
                            success = True
                            log_message(f"βœ… Finished processing: {target_file}", "INFO")
                        else:
                            log_message(f"❌ Processing failed for: {target_file}", "ERROR")
                    else:
                        log_message(f"❌ No matching filename found for: {base_filename}", "ERROR")
                        log_failed_file(target_file, "No matching reference filename")
                else:
                    log_message(f"❌ Download failed for: {target_file}", "ERROR")

            except Exception as e:
                log_message(f"πŸ”₯ An unhandled error occurred while processing {target_file}: {str(e)}", "ERROR")
                log_failed_file(target_file, str(e))

            finally:
                next_index_to_save = target_index + 1
                # Re-read the remote state before writing the final result.
                current_state = download_hf_state(TARGET_REPO_ID, HF_STATE_FILE)

                if success:
                    unlock_file_as_processed(target_file, current_state, next_index_to_save)
                    processing_status["processed_files"] += 1
                else:
                    log_message(f"⚠️ Processing failed for {target_file}. Marking as 'failed' and advancing index.", "WARNING")
                    current_state["file_states"][target_file] = "failed"
                    current_state["next_download_index"] = next_index_to_save
                    upload_hf_state(TARGET_REPO_ID, HF_STATE_FILE, current_state)
                    processing_status["failed_files"] += 1

                if os.path.exists(local_wav_path):
                    try:
                        os.remove(local_wav_path)
                        log_message(f"πŸ—‘οΈ Cleaned up local file: {local_wav_path}", "INFO")
                    except OSError as e:
                        # A leftover download must not abort the whole loop.
                        log_message(f"⚠️ Could not remove local file {local_wav_path}: {str(e)}", "WARNING")

            time.sleep(PROCESSING_DELAY)

        log_message("πŸŽ‰ Processing complete!", "INFO")
        log_message(f"πŸ“Š Final stats: {processing_status['transcribed_files']} audio files transcribed, {processing_status['processed_files']} files processed", "INFO")

    except KeyboardInterrupt:
        log_message("⏹️ Processing interrupted by user", "WARNING")
    except Exception as e:
        log_message(f"❌ Fatal error: {str(e)}", "ERROR")
    finally:
        processing_status["is_running"] = False
        cleanup_temp_files()
557
+
558
# NOTE: the processing loop is intentionally NOT started here. Calling
# main_processing_loop() at this point would block before the FastAPI routes
# below are registered and before the server is launched, so the API would
# never come up when the file is run as a script. The loop is started by the
# FastAPI startup event (and can be restarted via POST /start).
560
+
561
+ # ===== FASTAPI ENDPOINTS =====
562
+
563
@app.get("/")
async def root():
    """Describe the service and list its HTTP endpoints."""
    endpoints = {
        "status": "/status",
        "start": "/start",
        "stop": "/stop",
        "process": "/process/{filename}",
        "logs": "/logs",
    }
    info = {"service": "Audio Transcriber", "status": "running", "version": "1.0.0"}
    info["endpoints"] = endpoints
    return info
578
+
579
@app.get("/status")
async def get_status():
    """Report the processing counters plus the ten most recent log lines."""
    reported_keys = (
        "is_running",
        "current_file",
        "total_files",
        "processed_files",
        "transcribed_files",
        "failed_files",
        "last_update",
    )
    snapshot = {key: processing_status[key] for key in reported_keys}
    snapshot["recent_logs"] = processing_status["logs"][-10:]
    return snapshot
592
+
593
@app.post("/start")
async def start_processing():
    """Launch the main processing loop in a background daemon thread.

    Responds with HTTP 400 when the loop is already running.
    """
    already_running = processing_status["is_running"]
    if already_running:
        raise HTTPException(status_code=400, detail="Processing already running")

    # The loop blocks, so it runs on its own daemon thread.
    threading.Thread(target=main_processing_loop, daemon=True).start()

    return {"message": "Processing started", "status": "started"}
607
+
608
@app.post("/stop")
async def stop_processing():
    """Ask the main processing loop to stop by clearing ``is_running``.

    Responds with HTTP 400 when the loop is not running.
    """
    if processing_status["is_running"]:
        # The loop checks this flag at the top of each iteration.
        processing_status["is_running"] = False
        return {"message": "Processing stopped", "status": "stopped"}

    raise HTTPException(status_code=400, detail="Processing not running")
620
+
621
@app.get("/logs")
async def get_logs(limit: int = 50):
    """Return the newest *limit* log lines and the total number kept.

    A *limit* of zero or less yields an empty ``recent_logs`` list. (A plain
    ``logs[-limit:]`` would return every line for ``limit=0`` and slice from
    the wrong end for negative values.)
    """
    all_logs = processing_status["logs"]
    recent = all_logs[-limit:] if limit > 0 else []
    return {
        "total_logs": len(all_logs),
        "recent_logs": recent,
    }
629
+
630
@app.post("/process/{filename}")
async def process_single_file(filename: str):
    """Download, transcribe and upload a single audio file on request.

    Responses:
        200: the file was transcribed and uploaded.
        404: no reference filename matches the audio file.
        500: the reference list, the download, or the processing failed.

    The downloaded audio file is removed on every exit path. The
    ``transcribed_files`` counter is updated by ``process_audio_file`` itself
    and is therefore not incremented again here.
    """
    log_message(f"🎯 Manual processing requested for: {filename}", "INFO")
    local_wav_path = os.path.join(DOWNLOAD_FOLDER, os.path.basename(filename))

    try:
        reference_map = fetch_reference_files(REFERENCE_REPO_ID)
        if not reference_map:
            raise HTTPException(status_code=500, detail="Could not fetch reference files")

        # Get file URL and download
        audio_url = hf_hub_url(repo_id=SOURCE_REPO_ID, filename=filename, repo_type="dataset", subfolder=None)
        if not download_with_retry(audio_url, local_wav_path):
            raise HTTPException(status_code=500, detail="Failed to download file")

        # Find match
        base_filename = os.path.basename(filename)
        matched_filename = find_matching_filename(base_filename, reference_map)
        if not matched_filename:
            raise HTTPException(status_code=404, detail="No matching filename found")

        # Process
        if not process_audio_file(local_wav_path, reference_map, matched_filename):
            raise HTTPException(status_code=500, detail="Processing failed")

        return {
            "status": "success",
            "file": filename,
            "matched": matched_filename,
            "message": "Audio transcribed and uploaded successfully",
        }

    except HTTPException as e:
        # Keep the intended status code (e.g. 404) instead of wrapping it in a 500.
        log_message(f"❌ Manual processing error: {e.detail}", "ERROR")
        raise
    except Exception as e:
        log_message(f"❌ Manual processing error: {str(e)}", "ERROR")
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        if os.path.exists(local_wav_path):
            try:
                os.remove(local_wav_path)
            except OSError as e:
                log_message(f"⚠️ Could not remove local file {local_wav_path}: {str(e)}", "WARNING")
678
+
679
@app.on_event("startup")
async def startup_event():
    """On server startup, check for Transformers and launch the processing loop."""
    log_message("πŸš€ Server startup: Checking dependencies...", "INFO")

    try:
        import transformers
    except ImportError:
        log_message("⚠️ WARNING: Transformers not installed!", "WARNING")
        log_message(" Install with: pip install transformers librosa torch torchaudio", "WARNING")
    else:
        log_message("βœ… Transformers found", "INFO")

    log_message("πŸš€ Server startup: Auto-starting processing loop", "INFO")

    # The loop blocks, so it runs on a background daemon thread.
    worker = threading.Thread(target=main_processing_loop, daemon=True)
    worker.start()
696
+
697
def run_api(host: str = "0.0.0.0", port: int = 8000):
    """Serve the FastAPI app with uvicorn on *host*:*port* (blocks until exit)."""
    log_message(f"πŸš€ Starting FastAPI server on {host}:{port}", "INFO")
    uvicorn.run(app, host=host, port=port)


# Script entry point: start the API server; the processing loop is started
# by the server's startup event.
if __name__ == "__main__":
    run_api()
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
accelerate
fastapi
uvicorn
opencv-python-headless
numpy
# NOTE: "pathlib" was removed: it is part of the Python 3 standard library,
# and the PyPI package of that name is an obsolete backport.
huggingface_hub
pillow
rarfile
python-multipart
openai-whisper
ffmpeg-python
transformers
librosa
torch
torchaudio
# Imported directly by app.py, so it is declared explicitly rather than
# relied on as a transitive dependency.
requests