Samfredoly commited on
Commit
445d058
·
verified ·
1 Parent(s): 5a560c5

Rename download_api.py to app.py

Browse files
Files changed (2) hide show
  1. app.py +534 -0
  2. download_api.py +0 -407
app.py ADDED
@@ -0,0 +1,534 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import requests
4
+ import subprocess
5
+ import shutil
6
+ import time
7
+ import sys
8
+ from typing import Dict, List, Optional, Any
9
+ from huggingface_hub import HfApi, hf_hub_url
10
+
11
+ # Fix Unicode encoding for Windows
12
+ if sys.platform == 'win32':
13
+ import io
14
+ sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
15
+
16
# ==== CONFIGURATION ====
# NOTE(review): token is blank — hf_api below runs unauthenticated unless this
# is filled in; prefer reading it from an environment variable over hard-coding.
HF_TOKEN = ""
SOURCE_REPO_ID = "Fred808/BG3" # Fetch audio files from here
TARGET_REPO_ID = "Samfredoly/BG_Tran" # Upload transcriptions here
REFERENCE_REPO_ID = "Samfredoly/BG_Vid" # Reference repo to match audio filenames

# Path Configuration
DOWNLOAD_FOLDER = "downloads_audio"       # temporary local audio downloads
TRANSCRIPTIONS_FOLDER = "transcriptions"  # local JSON output before upload
LOCAL_STATE_FOLDER = ".state_audio"       # local copy of the shared Hub state file

# Created eagerly at import time so later code can assume they exist.
os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)
os.makedirs(TRANSCRIPTIONS_FOLDER, exist_ok=True)
os.makedirs(LOCAL_STATE_FOLDER, exist_ok=True)

# State Files
FAILED_FILES_LOG = "failed_audio_files.log"    # append-only failure log (log_failed_file)
HF_STATE_FILE = "processing_audio_state.json"  # shared state pushed to TARGET_REPO_ID

# Processing Parameters
PROCESSING_DELAY = 2   # seconds between loop iterations / download retries
MAX_RETRIES = 3        # NOTE(review): unused — download_with_retry has its own default of 3
MIN_FREE_SPACE_GB = 1  # threshold used by check_disk_space
WHISPER_MODEL = "small" # Whisper model size

# Initialize HF API
hf_api = HfApi(token=HF_TOKEN)

# Global State: in-process status dict, mutated by log_message and the main loop.
processing_status = {
    "is_running": False,   # guards against starting the loop twice
    "current_file": None,  # file currently protected from cleanup_temp_files
    "total_files": 0,
    "processed_files": 0,
    "failed_files": 0,
    "transcribed_files": 0,
    "last_update": None,
    "logs": []             # rolling window of the most recent 100 log lines
}
55
+
56
def log_message(message: str, level: str = "INFO"):
    """Print a timestamped log line and record it in the shared status dict."""
    now = time.strftime("%Y-%m-%d %H:%M:%S")
    entry = f"[{now}] {level}: {message}"
    print(entry)
    logs = processing_status["logs"]
    logs.append(entry)
    processing_status["last_update"] = now
    # Keep only the most recent 100 entries to bound memory use.
    if len(logs) > 100:
        processing_status["logs"] = logs[-100:]
65
+
66
def log_failed_file(filename: str, error: str):
    """Append a failure record (timestamp, filename, error) to the persistent log.

    Bug fix: the original wrote a literal "(unknown)" placeholder instead of
    the actual filename (the parameter was unused), making the failure log
    useless for triage and retries.
    """
    with open(FAILED_FILES_LOG, "a", encoding="utf-8") as f:
        f.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {filename}: {error}\n")
70
+
71
def get_disk_usage(path: str) -> Dict[str, float]:
    """Return disk usage for *path* in GiB as {"total", "free", "used"}.

    Uses shutil.disk_usage, which works on both POSIX and Windows.
    The previous implementation used os.statvfs, which does not exist on
    Windows even though this script explicitly supports win32 (see the
    stdout re-wrapping at the top of the file).

    "free" is space available to the current user (statvfs f_bavail
    semantics, same as before); "used" is total - free, preserving the
    original definition.
    """
    usage = shutil.disk_usage(path)
    gib = 1024 ** 3
    total = usage.total / gib
    free = usage.free / gib
    return {"total": total, "free": free, "used": total - free}
78
+
79
def check_disk_space(path: str = ".") -> bool:
    """Return True when at least MIN_FREE_SPACE_GB of space is free at *path*.

    Logs a warning with the current free/used figures when space is low.
    """
    usage = get_disk_usage(path)
    if usage["free"] >= MIN_FREE_SPACE_GB:
        return True
    log_message(f'⚠️ Low disk space: {usage["free"]:.2f}GB free, {usage["used"]:.2f}GB used')
    return False
86
+
87
def cleanup_temp_files():
    """Delete stale audio downloads, sparing the file currently in flight.

    Only .wav/.mp3 files inside DOWNLOAD_FOLDER are touched. Removal errors
    are still ignored (deliberate best-effort cleanup), but the bare
    ``except:`` is narrowed to OSError so programming errors are no longer
    silently swallowed.
    """
    log_message("🧹 Cleaning up temporary files...", "INFO")

    current_file = processing_status.get("current_file")
    for name in os.listdir(DOWNLOAD_FOLDER):
        if name == current_file or not name.endswith((".wav", ".mp3")):
            continue
        try:
            os.remove(os.path.join(DOWNLOAD_FOLDER, name))
            log_message(f"🗑️ Removed old download: {name}", "INFO")
        except OSError:
            # File vanished or is locked — safe to skip; it will be retried.
            pass
99
+
100
def load_json_state(file_path: str, default_value: Dict[str, Any]) -> Dict[str, Any]:
    """Load processing state from a JSON file, migrating older layouts.

    Guarantees the returned dict contains a "file_states" dict and a
    "next_download_index" key. Returns *default_value* when the file is
    missing, unreadable, or corrupt.

    Fix: the original caught only json.JSONDecodeError, so an OSError on
    open (file vanished, permissions) crashed the whole processing loop.
    """
    if not os.path.exists(file_path):
        return default_value

    try:
        with open(file_path, "r") as f:
            data = json.load(f)
    except (json.JSONDecodeError, OSError):
        log_message(f"⚠️ Corrupted state file: {file_path}", "WARNING")
        return default_value

    # Migration: older state files predate these two keys.
    if "file_states" not in data or not isinstance(data["file_states"], dict):
        log_message("ℹ️ Initializing 'file_states' dictionary.", "INFO")
        data["file_states"] = {}
    if "next_download_index" not in data:
        data["next_download_index"] = 0

    return data
118
+
119
def save_json_state(file_path: str, data: Dict[str, Any]):
    """Serialize *data* to *file_path* as pretty-printed JSON."""
    with open(file_path, "w") as handle:
        json.dump(data, handle, indent=2)
123
+
124
def download_hf_state(repo_id: str, filename: str) -> Dict[str, Any]:
    """Downloads the state file from Hugging Face or returns a default state.

    Lists the repo first so a missing state file is handled without raising;
    any Hub/network error also falls back to the default (index 0, no file
    states), i.e. the loop degrades to a fresh start rather than crashing.
    """
    local_path = os.path.join(LOCAL_STATE_FOLDER, filename)
    default_state = {"next_download_index": 0, "file_states": {}}

    try:
        files = hf_api.list_repo_files(repo_id=repo_id, repo_type="dataset")
        if filename not in files:
            # NOTE(review): "(unknown)" below looks like a scrubbed placeholder
            # for the filename — confirm against the original source.
            log_message(f"ℹ️ State file (unknown) not found in {repo_id}. Starting from default state.", "INFO")
            return default_state

        # Imported locally; top of file only imports HfApi/hf_hub_url.
        from huggingface_hub import hf_hub_download
        hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            repo_type="dataset",
            local_dir=LOCAL_STATE_FOLDER,
            # NOTE(review): local_dir_use_symlinks is deprecated in newer
            # huggingface_hub releases — verify the installed version accepts it.
            local_dir_use_symlinks=False
        )

        log_message(f"✅ Successfully downloaded state file from {repo_id}.", "INFO")
        # load_json_state also migrates older layouts and survives corruption.
        return load_json_state(local_path, default_state)

    except Exception as e:
        log_message(f"⚠️ Failed to download state file from Hugging Face: {str(e)}. Starting from default state.", "WARNING")
        return default_state
150
+
151
def upload_hf_state(repo_id: str, filename: str, state: Dict[str, Any]) -> bool:
    """Uploads the state file to Hugging Face.

    Writes *state* to the local state folder first, then pushes that file to
    the dataset repo. Returns True on success, False on any failure (the
    caller decides whether a failed upload aborts or rolls back).
    """
    local_path = os.path.join(LOCAL_STATE_FOLDER, filename)

    try:
        # Persist locally first so the upload reads a complete, valid file.
        save_json_state(local_path, state)

        hf_api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=filename,
            repo_id=repo_id,
            repo_type="dataset",
            commit_message=f"Update audio processing state: next_index={state['next_download_index']}"
        )
        log_message(f"✅ Successfully uploaded state file to {repo_id}", "INFO")
        return True
    except Exception as e:
        log_message(f"❌ Failed to upload state file to Hugging Face: {str(e)}", "ERROR")
        return False
170
+
171
def lock_file_for_processing(wav_filename: str, state: Dict[str, Any]) -> bool:
    """Mark *wav_filename* as 'processing' in *state* and push the lock to the Hub.

    When the upload fails, the local entry is rolled back so *state* is left
    exactly as it was, and False is returned.
    """
    log_message(f"🔒 Attempting to lock file: {wav_filename} (Marking as 'processing')", "INFO")

    state["file_states"][wav_filename] = "processing"

    if upload_hf_state(TARGET_REPO_ID, HF_STATE_FILE, state):
        log_message(f"✅ Successfully locked file: {wav_filename}", "INFO")
        return True

    log_message(f"❌ Failed to upload lock for file: {wav_filename}. Aborting processing.", "ERROR")
    # Roll back the optimistic local mutation.
    state["file_states"].pop(wav_filename, None)
    return False
185
+
186
def unlock_file_as_processed(wav_filename: str, state: Dict[str, Any], next_index: int) -> bool:
    """Mark *wav_filename* as 'processed', advance the index, and push the state."""
    log_message(f"🔓 Attempting to unlock file: {wav_filename} (Marking as 'processed')", "INFO")

    state["file_states"][wav_filename] = "processed"
    state["next_download_index"] = next_index

    if not upload_hf_state(TARGET_REPO_ID, HF_STATE_FILE, state):
        log_message(f"❌ Failed to upload final state for file: {wav_filename}.", "ERROR")
        return False

    log_message(f"✅ Successfully unlocked and marked as processed: {wav_filename}", "INFO")
    return True
199
+
200
def download_with_retry(url: str, dest_path: str, max_retries: int = 3) -> bool:
    """Stream *url* to *dest_path*, retrying transient HTTP failures.

    Checks free disk space first (running a cleanup pass when low), ensures
    the destination directory exists, then downloads in 8 KiB chunks with up
    to *max_retries* attempts. Returns True on success, False otherwise.

    Fixes over the original:
    - requests.get now carries a (connect, read) timeout so a stalled
      connection can no longer hang the worker forever;
    - a partially written file is removed when the download ultimately
      fails, so a later run cannot mistake it for a complete download;
    - os.makedirs is skipped when dest_path has no directory component
      (os.makedirs("") raises FileNotFoundError).
    """
    if not check_disk_space():
        cleanup_temp_files()
        if not check_disk_space():
            log_message("❌ Insufficient disk space even after cleanup", "ERROR")
            return False

    dest_dir = os.path.dirname(dest_path)
    if dest_dir:
        try:
            os.makedirs(dest_dir, exist_ok=True)
        except Exception as e:
            log_message(f"❌ Failed to create directory for download path {dest_dir}: {str(e)}", "ERROR")
            return False

    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    for attempt in range(max_retries):
        try:
            with requests.get(url, headers=headers, stream=True, timeout=(10, 60)) as r:
                r.raise_for_status()
                with open(dest_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:  # skip keep-alive chunks
                            f.write(chunk)

            log_message(f"✅ Download successful: {dest_path}", "INFO")
            return True

        except requests.exceptions.RequestException as e:
            log_message(f"❌ Download attempt {attempt + 1} failed for {url}: {str(e)}", "WARNING")
            time.sleep(PROCESSING_DELAY)
        except Exception as e:
            # Non-HTTP failure (e.g. disk full mid-write): retrying won't help.
            log_message(f"❌ An unexpected error occurred during download: {str(e)}", "ERROR")
            break

    # All attempts failed — remove any truncated partial file (best effort).
    try:
        if os.path.exists(dest_path):
            os.remove(dest_path)
    except OSError:
        pass

    log_message(f"❌ Failed to download {url} after {max_retries} attempts.", "ERROR")
    return False
237
+
238
def fetch_reference_files(repo_id: str) -> Dict[str, str]:
    """Map base filename (no directory, no extension) -> full repo path for
    every file in *repo_id*.

    All file types are eligible (zip, rar, wav, mp3, ...). Returns an empty
    dict when the listing fails, which the caller treats as fatal.

    NOTE(review): if two repo files share a base name, the later listing
    entry silently wins — confirm that's acceptable for this dataset.
    """
    log_message(f"📋 Fetching file list from {repo_id}...", "INFO")

    try:
        files_list = hf_api.list_repo_files(repo_id=repo_id, repo_type="dataset")

        # Dict comprehension replaces the original's redundant full-list copy
        # followed by a manual loop.
        filename_map = {
            os.path.splitext(os.path.basename(file_path))[0]: file_path
            for file_path in files_list
        }

        log_message(f"✅ Found {len(filename_map)} files in reference repo", "INFO")
        return filename_map

    except Exception as e:
        log_message(f"❌ Failed to fetch reference files: {str(e)}", "ERROR")
        return {}
260
+
261
def find_matching_filename(transcribed_filename: str, reference_map: Dict[str, str]) -> Optional[str]:
    """Resolve an audio filename to its full path in the reference repo.

    Tries an exact base-name match first, then falls back to the first
    case-insensitive substring match (in either direction). Returns the
    matched repo path, or None when nothing matches.
    """
    base_name = os.path.splitext(transcribed_filename)[0]

    # Exact base-name hit wins outright.
    exact = reference_map.get(base_name)
    if exact is not None:
        print(f"\n✅ MATCH FOUND:")
        print(f" Audio: {transcribed_filename}")
        print(f" File: {exact}")
        log_message(f"✅ Found exact match: {transcribed_filename} -> {exact}", "INFO")
        return exact

    # Fuzzy pass: substring containment either way, case-insensitive.
    # Returns on the first hit instead of collecting every candidate.
    lowered = base_name.lower()
    for ref_base, ref_full_path in reference_map.items():
        ref_lower = ref_base.lower()
        if lowered in ref_lower or ref_lower in lowered:
            print(f"\n✅ PARTIAL MATCH FOUND:")
            print(f" Audio: {transcribed_filename}")
            print(f" File: {ref_full_path}")
            log_message(f"✅ Found partial match: {transcribed_filename} -> {ref_full_path}", "INFO")
            return ref_full_path

    print(f"\n❌ NO MATCH FOUND:")
    print(f" Audio: {transcribed_filename}")
    log_message(f"⚠️ No matching filename found for: {transcribed_filename}", "WARNING")
    return None
293
+
294
def transcribe_audio(wav_path: str) -> Optional[Dict[str, Any]]:
    """Transcribe an audio file with Whisper and return its result dict.

    Performance fix: the Whisper model is now loaded once and cached on the
    function object, instead of being re-loaded from disk for every single
    audio file (model loading dominated per-file cost in the original).
    The cache assumes WHISPER_MODEL does not change at runtime.

    Returns None when Whisper is not installed or transcription fails.
    """
    log_message(f"🎤 Transcribing audio file: {wav_path}", "INFO")

    try:
        import whisper

        model = getattr(transcribe_audio, "_model", None)
        if model is None:
            log_message(f"Loading Whisper {WHISPER_MODEL} model...", "INFO")
            model = whisper.load_model(WHISPER_MODEL)
            transcribe_audio._model = model

        result = model.transcribe(wav_path)

        log_message(f"✅ Successfully transcribed: {wav_path}", "INFO")
        return result

    except ImportError:
        log_message("❌ Whisper not installed. Install with: pip install openai-whisper", "ERROR")
        return None
    except Exception as e:
        log_message(f"❌ Failed to transcribe {wav_path}: {str(e)}", "ERROR")
        return None
317
+
318
def process_audio_file(wav_path: str, reference_map: Dict[str, str], matched_filename: str) -> bool:
    """
    Main processing logic for a single audio file:
    1. Transcribe using Whisper
    2. Save transcription as JSON (named after the matched reference file)
    3. Upload to HF dataset
    4. Clean up local files

    Returns True only when transcription, save, and upload all succeed.
    *reference_map* is unused here but kept for interface compatibility with
    the caller in main_processing_loop.
    """
    wav_filename = os.path.basename(wav_path)

    # 1. Transcribe audio
    transcription = transcribe_audio(wav_path)
    if transcription is None:
        log_failed_file(wav_filename, "Transcription failed")
        return False

    # 2. Save transcription as JSON. matched_filename may carry a repo
    # subdirectory, so the dirname is created before writing.
    json_filename = os.path.splitext(matched_filename)[0] + "_transcription.json"
    json_output_path = os.path.join(TRANSCRIPTIONS_FOLDER, json_filename)

    try:
        os.makedirs(os.path.dirname(json_output_path), exist_ok=True)

        with open(json_output_path, "w", encoding="utf-8") as f:
            json.dump(transcription, f, indent=2, ensure_ascii=False)

        log_message(f"✅ Saved transcription: {json_output_path}", "INFO")

    except Exception as e:
        log_message(f"❌ Failed to save transcription JSON: {str(e)}", "ERROR")
        log_failed_file(wav_filename, f"Failed to save JSON: {str(e)}")
        return False

    # 3. Upload to HF dataset
    try:
        path_in_repo = f"transcriptions/{json_filename}"

        hf_api.upload_file(
            path_or_fileobj=json_output_path,
            path_in_repo=path_in_repo,
            repo_id=TARGET_REPO_ID,
            repo_type="dataset",
            commit_message=f"Add transcription for: {matched_filename}"
        )
        log_message(f"✅ Successfully uploaded transcription: {json_filename}", "INFO")
        processing_status["transcribed_files"] += 1

    except Exception as e:
        log_message(f"❌ Failed to upload transcription to HF: {str(e)}", "ERROR")
        log_failed_file(wav_filename, f"Failed to upload: {str(e)}")
        return False

    # 4. Clean up the local JSON copy. Best-effort, but narrowed from the
    # original bare except so real bugs are not swallowed.
    try:
        os.remove(json_output_path)
        log_message(f"🗑️ Cleaned up local transcription file: {json_output_path}", "INFO")
    except OSError:
        pass

    return True
379
+
380
def get_next_file_to_process(repo_id: str, state: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Finds the next audio file to process from the source repo.

    Audio files (.wav/.mp3) are sorted in descending name order and scanned
    from state["next_download_index"] onward; any file that is unseen or
    previously 'failed' is eligible. Files marked 'processing' (held by
    another worker) or 'processed' are skipped.

    Returns: { 'filename': str, 'url': str, 'index': int } or None.

    Fix: log messages now include the actual filename — the original emitted
    a literal "(unknown)" placeholder in three places.
    """
    log_message(f"🔍 Searching for next audio file to process in {repo_id}", "INFO")

    try:
        files_list = hf_api.list_repo_files(repo_id=repo_id, repo_type="dataset")

        # Filter for audio files and sort in reverse order (descending).
        audio_files = sorted((f for f in files_list if f.endswith(('.wav', '.mp3'))), reverse=True)

        if not audio_files:
            log_message("ℹ️ No audio files found in the source repository.", "INFO")
            return None

        processing_status["total_files"] = len(audio_files)

        start_index = state.get("next_download_index", 0)

        for index in range(start_index, len(audio_files)):
            filename = audio_files[index]
            file_state = state["file_states"].get(filename)

            if file_state is None or file_state == "failed":
                url = hf_hub_url(repo_id=repo_id, filename=filename, repo_type="dataset", subfolder=None)

                log_message(f"✅ Found next audio file: {filename} at index {index}", "INFO")
                return {
                    'filename': filename,
                    'url': url,
                    'index': index
                }

            elif file_state == "processing":
                log_message(f"⚠️ File {filename} is currently marked as 'processing'. Skipping for now.", "WARNING")

            elif file_state == "processed":
                log_message(f"ℹ️ File {filename} already processed. Skipping.", "INFO")

        log_message("ℹ️ All files up to the current index have been processed or skipped.", "INFO")

        # Wrap around once the index has run off the end of the listing.
        if start_index >= len(audio_files):
            log_message("ℹ️ Reached end of file list. Resetting index to 0 for next loop.", "INFO")
            state["next_download_index"] = 0
            upload_hf_state(TARGET_REPO_ID, HF_STATE_FILE, state)

        return None

    except Exception as e:
        log_message(f"❌ Failed to list files from Hugging Face: {str(e)}", "ERROR")
        return None
433
+
434
def main_processing_loop():
    """The main loop that orchestrates the download, transcription, and upload cycle.

    Runs until interrupted: fetch shared state, pick the next audio file,
    lock it, download, transcribe, upload the JSON result, then publish the
    final per-file state ('processed' or 'failed') and advance the index.

    Bug fix: in the original, a failed lock executed ``continue`` inside the
    inner try, which ran the ``finally`` block — reading ``local_wav_path``
    before assignment (NameError) and wrongly marking the never-processed
    file as 'failed' while advancing the index. The lock attempt now happens
    BEFORE the try/finally, and ``local_wav_path`` is always bound first.
    """
    if processing_status["is_running"]:
        log_message("⚠️ Processing loop is already running.", "WARNING")
        return

    processing_status["is_running"] = True

    try:
        log_message("🚀 Starting audio transcription processing loop...", "INFO")

        # Fetch reference files from BG_Vid repo once at the start.
        reference_map = fetch_reference_files(REFERENCE_REPO_ID)
        if not reference_map:
            log_message("❌ No reference files found. Cannot proceed.", "ERROR")
            return

        while processing_status["is_running"]:

            current_state = download_hf_state(TARGET_REPO_ID, HF_STATE_FILE)
            next_file_info = get_next_file_to_process(SOURCE_REPO_ID, current_state)

            if next_file_info is None:
                log_message("💤 No new audio files to process. Sleeping for a while...", "INFO")
                time.sleep(PROCESSING_DELAY * 5)
                continue

            target_file = next_file_info['filename']
            audio_url = next_file_info['url']
            target_index = next_file_info['index']

            processing_status["current_file"] = target_file
            success = False

            # Lock outside the try/finally: a lock failure must NOT trigger
            # the finally's "mark failed and advance index" path.
            if not lock_file_for_processing(target_file, current_state):
                log_message(f"❌ Failed to lock file {target_file}. Skipping.", "ERROR")
                time.sleep(PROCESSING_DELAY)
                continue

            local_wav_path = os.path.join(DOWNLOAD_FOLDER, os.path.basename(target_file))

            try:
                log_message(f"⬇️ Downloading audio file: {target_file}", "INFO")

                if download_with_retry(audio_url, local_wav_path):
                    base_filename = os.path.basename(target_file)
                    matched_filename = find_matching_filename(base_filename, reference_map)

                    if matched_filename:
                        if process_audio_file(local_wav_path, reference_map, matched_filename):
                            success = True
                            log_message(f"✅ Finished processing: {target_file}", "INFO")
                        else:
                            log_message(f"❌ Processing failed for: {target_file}", "ERROR")
                    else:
                        log_message(f"❌ No matching filename found for: {base_filename}", "ERROR")
                        log_failed_file(target_file, "No matching reference filename")
                else:
                    log_message(f"❌ Download failed for: {target_file}", "ERROR")

            except Exception as e:
                log_message(f"🔥 An unhandled error occurred while processing {target_file}: {str(e)}", "ERROR")
                log_failed_file(target_file, str(e))

            finally:
                next_index_to_save = target_index + 1
                # Re-fetch: another worker may have pushed state while we
                # were transcribing.
                current_state = download_hf_state(TARGET_REPO_ID, HF_STATE_FILE)

                if success:
                    unlock_file_as_processed(target_file, current_state, next_index_to_save)
                    processing_status["processed_files"] += 1
                else:
                    log_message(f"⚠️ Processing failed for {target_file}. Marking as 'failed' and advancing index.", "WARNING")
                    current_state["file_states"][target_file] = "failed"
                    current_state["next_download_index"] = next_index_to_save
                    upload_hf_state(TARGET_REPO_ID, HF_STATE_FILE, current_state)
                    processing_status["failed_files"] += 1

                if os.path.exists(local_wav_path):
                    os.remove(local_wav_path)
                    log_message(f"🗑️ Cleaned up local file: {local_wav_path}", "INFO")

            time.sleep(PROCESSING_DELAY)

        log_message("🎉 Processing complete!", "INFO")
        log_message(f"📊 Final stats: {processing_status['transcribed_files']} audio files transcribed, {processing_status['processed_files']} files processed", "INFO")

    except KeyboardInterrupt:
        log_message("⏹️ Processing interrupted by user", "WARNING")
    except Exception as e:
        log_message(f"❌ Fatal error: {str(e)}", "ERROR")
    finally:
        processing_status["is_running"] = False
        cleanup_temp_files()
533
# Script entry point: run the blocking processing loop until interrupted.
if __name__ == "__main__":
    main_processing_loop()
download_api.py DELETED
@@ -1,407 +0,0 @@
1
- import os
2
- import json
3
- import time
4
- import threading
5
- import asyncio
6
- from fastapi import FastAPI, HTTPException, BackgroundTasks
7
- from fastapi.middleware.cors import CORSMiddleware
8
- from fastapi.responses import JSONResponse, FileResponse
9
- from fastapi.staticfiles import StaticFiles
10
- import uvicorn
11
- from typing import Dict
12
- from pathlib import Path
13
- import subprocess
14
- from datetime import datetime
15
-
16
- import torch
17
-
18
- # Import core functionality
19
- from vision_analyzer import (
20
- main_processing_loop,
21
- processing_status,
22
- log_message,
23
- FRAMES_OUTPUT_FOLDER
24
- )
25
-
26
- # FastAPI App Definition
27
- app = FastAPI(title="Video Analysis API",
28
- description="API to access video frame analysis results and extracted images",
29
- version="1.0.0")
30
-
31
- # Add CORS middleware to allow cross-origin requests
32
- app.add_middleware(
33
- CORSMiddleware,
34
- allow_origins=["*"], # Allows all origins
35
- allow_credentials=True,
36
- allow_methods=["*"], # Allows all methods
37
- allow_headers=["*"],
38
- )
39
-
40
- # Global variables for processing and frame tracking
41
- processing_thread = None
42
- frame_locks = {} # Dict to track frame locks: {course: {frame: {"locked_by": id, "locked_at": timestamp}}}
43
- processed_frames = {} # Dict to track processed frames: {course: {frame: {"processed_by": id, "processed_at": timestamp}}}
44
- LOCK_TIMEOUT = 300 # 5 minutes timeout for locks
45
- TRACKING_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "frame_tracking.json")
46
-
47
- def save_tracking_state():
48
- """Save frame tracking state to disk"""
49
- state = {
50
- "frame_locks": frame_locks,
51
- "processed_frames": processed_frames
52
- }
53
- try:
54
- with open(TRACKING_FILE, "w") as f:
55
- json.dump(state, f, indent=2)
56
- except Exception as e:
57
- log_message(f"Error saving tracking state: {e}")
58
-
59
- def load_tracking_state():
60
- """Load frame tracking state from disk"""
61
- global frame_locks, processed_frames
62
- try:
63
- with open(TRACKING_FILE, "r") as f:
64
- state = json.load(f)
65
- frame_locks = state.get("frame_locks", {})
66
- processed_frames = state.get("processed_frames", {})
67
- except FileNotFoundError:
68
- log_message("No previous tracking state found")
69
- except Exception as e:
70
- log_message(f"Error loading tracking state: {e}")
71
-
72
- def check_frame_lock(course: str, frame: str) -> bool:
73
- """Check if frame is locked and lock hasn't expired"""
74
- if course in frame_locks and frame in frame_locks[course]:
75
- lock = frame_locks[course][frame]
76
- if time.time() - lock["locked_at"] < LOCK_TIMEOUT:
77
- return True
78
- # Lock expired, remove it
79
- del frame_locks[course][frame]
80
- save_tracking_state()
81
- return False
82
-
83
- def lock_frame(course: str, frame: str, requester_id: str) -> bool:
84
- """Attempt to lock a frame for processing"""
85
- if check_frame_lock(course, frame):
86
- return False
87
-
88
- if course not in frame_locks:
89
- frame_locks[course] = {}
90
-
91
- frame_locks[course][frame] = {
92
- "locked_by": requester_id,
93
- "locked_at": time.time()
94
- }
95
- save_tracking_state()
96
- return True
97
-
98
- def mark_frame_processed(course: str, frame: str, requester_id: str):
99
- """Mark a frame as successfully processed"""
100
- if course not in processed_frames:
101
- processed_frames[course] = {}
102
-
103
- processed_frames[course][frame] = {
104
- "processed_by": requester_id,
105
- "processed_at": time.time()
106
- }
107
-
108
- # Remove the lock if it exists
109
- if course in frame_locks and frame in frame_locks[course]:
110
- del frame_locks[course][frame]
111
-
112
- save_tracking_state()
113
-
114
- def log_message(message):
115
- """Add a log message with timestamp"""
116
- timestamp = datetime.now().strftime("%H:%M:%S")
117
- log_entry = f"[{timestamp}] {message}"
118
- processing_status["logs"].append(log_entry)
119
-
120
- # Keep only the last 100 logs
121
- if len(processing_status["logs"]) > 100:
122
- processing_status["logs"] = processing_status["logs"][-100:]
123
-
124
- print(log_entry)
125
-
126
- @app.on_event("startup")
127
- async def startup_event():
128
- """Initialize frame tracking and start processing loop"""
129
- # Load frame tracking state
130
- load_tracking_state()
131
- log_message("✓ Loaded frame tracking state")
132
-
133
- # Start processing thread
134
- global processing_thread
135
- if not (processing_thread and processing_thread.is_alive()):
136
- log_message("🚀 Starting RAR extraction, frame extraction, and vision analysis pipeline in background...")
137
- processing_thread = threading.Thread(target=main_processing_loop)
138
- processing_thread.daemon = True
139
- processing_thread.start()
140
-
141
- @app.get("/")
142
- async def root():
143
- """Root endpoint that returns basic info"""
144
- return {
145
- "message": "Video Analysis API",
146
- "status": "running",
147
- "endpoints": {
148
- "/status": "Get processing status",
149
- "/courses": "List all available course folders",
150
- "/images/{course_folder}": "List images in a course folder",
151
- "/images/{course_folder}/{frame_filename}": "Get specific frame image",
152
- "/start-processing": "Start processing pipeline",
153
- "/stop-processing": "Stop processing pipeline"
154
- }
155
- }
156
-
157
- @app.get("/status")
158
- async def get_status():
159
- """Get current processing status"""
160
- return {
161
- "processing_status": processing_status,
162
- "frames_folder": FRAMES_OUTPUT_FOLDER,
163
- "frames_folder_exists": os.path.exists(FRAMES_OUTPUT_FOLDER)
164
- }
165
-
166
- # ===== NEW IMAGE SERVING ENDPOINTS =====
167
-
168
- @app.get("/middleware/next/course")
169
- async def get_next_course(requester_id: str):
170
- """Get next available course for processing"""
171
- if not os.path.exists(FRAMES_OUTPUT_FOLDER):
172
- raise HTTPException(status_code=404, detail="No courses available")
173
-
174
- # Load latest state
175
- load_tracking_state()
176
-
177
- # Find a course with unprocessed frames
178
- for folder in os.listdir(FRAMES_OUTPUT_FOLDER):
179
- folder_path = os.path.join(FRAMES_OUTPUT_FOLDER, folder)
180
- if not os.path.isdir(folder_path):
181
- continue
182
-
183
- # Check if course has any unprocessed frames
184
- image_files = [f for f in os.listdir(folder_path)
185
- if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
186
-
187
- for image in image_files:
188
- if (folder not in processed_frames or
189
- image not in processed_frames[folder]):
190
- return {"course": folder}
191
-
192
- raise HTTPException(status_code=404, detail="No courses with unprocessed frames")
193
-
194
- @app.get("/middleware/next/image/{course_folder}")
195
- async def get_next_image(course_folder: str, requester_id: str):
196
- """Get next available image from a course"""
197
- folder_path = os.path.join(FRAMES_OUTPUT_FOLDER, course_folder)
198
-
199
- if not os.path.exists(folder_path):
200
- raise HTTPException(status_code=404, detail=f"Course not found: {course_folder}")
201
-
202
- # Load latest state
203
- load_tracking_state()
204
-
205
- # Find first unprocessed and unlocked frame
206
- for file in sorted(os.listdir(folder_path)):
207
- if not file.lower().endswith(('.png', '.jpg', '.jpeg')):
208
- continue
209
-
210
- # Skip if processed
211
- if (course_folder in processed_frames and
212
- file in processed_frames[course_folder]):
213
- continue
214
-
215
- # Skip if locked by another requester
216
- if check_frame_lock(course_folder, file):
217
- continue
218
-
219
- # Try to lock the frame
220
- if lock_frame(course_folder, file, requester_id):
221
- file_path = os.path.join(folder_path, file)
222
- file_stats = os.stat(file_path)
223
- return {
224
- "file_id": f"frame:{course_folder}/{file}",
225
- "frame": file,
226
- "video": os.path.splitext(file)[0],
227
- "size_bytes": file_stats.st_size,
228
- "modified_time": time.ctime(file_stats.st_mtime),
229
- "url": f"/images/{course_folder}/{file}"
230
- }
231
-
232
- raise HTTPException(status_code=404, detail="No available frames in course")
233
-
234
- @app.post("/middleware/release/frame/{course_folder}/{video}/{frame}")
235
- async def release_frame(course_folder: str, video: str, frame: str, requester_id: str):
236
- """Release a frame lock"""
237
- if course_folder in frame_locks and frame in frame_locks[course_folder]:
238
- lock = frame_locks[course_folder][frame]
239
- if lock["locked_by"] == requester_id:
240
- del frame_locks[course_folder][frame]
241
- save_tracking_state()
242
- return {"status": "released"}
243
- return {"status": "not_found"}
244
-
245
- @app.post("/middleware/release/course/{course_folder}")
246
- async def release_course(course_folder: str, requester_id: str):
247
- """Release all frame locks for a course"""
248
- if course_folder in frame_locks:
249
- # Only release frames locked by this requester
250
- frames_to_release = [
251
- frame for frame, lock in frame_locks[course_folder].items()
252
- if lock["locked_by"] == requester_id
253
- ]
254
- for frame in frames_to_release:
255
- del frame_locks[course_folder][frame]
256
- save_tracking_state()
257
- return {"status": "released"}
258
-
259
- @app.get("/images/{course_folder}/{frame_filename}")
260
- async def get_frame_image(course_folder: str, frame_filename: str, requester_id: str = None):
261
- """
262
- Serve extracted frame images from course folders with locking
263
-
264
- Args:
265
- course_folder: The course folder name (e.g., "course1_video1_mp4_frames")
266
- frame_filename: The frame file name (e.g., "0001.png")
267
- requester_id: Optional requester ID for frame locking
268
- """
269
- # Load latest state
270
- load_tracking_state()
271
-
272
- # Construct the full path to the image
273
- image_path = os.path.join(FRAMES_OUTPUT_FOLDER, course_folder, frame_filename)
274
-
275
- # Check if file exists
276
- if not os.path.exists(image_path):
277
- raise HTTPException(status_code=404, detail=f"Image not found: {course_folder}/{frame_filename}")
278
-
279
- # Verify it's an image file
280
- if not frame_filename.lower().endswith(('.png', '.jpg', '.jpeg')):
281
- raise HTTPException(status_code=400, detail="File must be an image (PNG, JPG, JPEG)")
282
-
283
- # If requester_id provided, verify frame lock
284
- if requester_id:
285
- if check_frame_lock(course_folder, frame_filename):
286
- lock = frame_locks[course_folder][frame_filename]
287
- if lock["locked_by"] != requester_id:
288
- raise HTTPException(status_code=423, detail="Frame is locked by another requester")
289
-
290
- # Return the image file
291
- return FileResponse(image_path)
292
-
293
- @app.get("/images/{course_folder}")
294
- async def list_course_images(course_folder: str):
295
- """
296
- List all available images in a specific course folder
297
-
298
- Args:
299
- course_folder: The course folder name
300
- """
301
- folder_path = os.path.join(FRAMES_OUTPUT_FOLDER, course_folder)
302
-
303
- if not os.path.exists(folder_path):
304
- raise HTTPException(status_code=404, detail=f"Course folder not found: {course_folder}")
305
-
306
- # Get all image files
307
- image_files = []
308
- for file in os.listdir(folder_path):
309
- if file.lower().endswith(('.png', '.jpg', '.jpeg')):
310
- file_path = os.path.join(folder_path, file)
311
- file_stats = os.stat(file_path)
312
- image_files.append({
313
- "filename": file,
314
- "size_bytes": file_stats.st_size,
315
- "modified_time": time.ctime(file_stats.st_mtime),
316
- "url": f"/images/{course_folder}/{file}"
317
- })
318
-
319
- return {
320
- "course_folder": course_folder,
321
- "total_images": len(image_files),
322
- "images": image_files
323
- }
324
-
325
- @app.get("/courses")
326
- async def list_all_courses():
327
- """
328
- List all available course folders with their image counts
329
- """
330
- if not os.path.exists(FRAMES_OUTPUT_FOLDER):
331
- return {"courses": [], "message": "Frames output folder does not exist yet"}
332
-
333
- courses = []
334
- for folder in os.listdir(FRAMES_OUTPUT_FOLDER):
335
- folder_path = os.path.join(FRAMES_OUTPUT_FOLDER, folder)
336
- if os.path.isdir(folder_path):
337
- # Count image files
338
- image_count = len([f for f in os.listdir(folder_path)
339
- if f.lower().endswith(('.png', '.jpg', '.jpeg'))])
340
- courses.append({
341
- "course_folder": folder,
342
- "image_count": image_count,
343
- "images_url": f"/images/{folder}",
344
- "sample_image_url": f"/images/{folder}/0001.png" if image_count > 0 else None
345
- })
346
-
347
- return {
348
- "total_courses": len(courses),
349
- "courses": courses
350
- }
351
-
352
-
353
# Signal handlers to prevent accidental shutdown
def handle_shutdown(signum, frame):
    """Swallow SIGINT/SIGTERM so the server keeps running."""
    # The handler only logs and never raises, so the process survives the signal.
    print(f"\n⚠️ Received signal {signum}. Server will continue running.")
    print("Use Ctrl+Break or kill -9 to force stop.")

# Route both interrupt and terminate signals through the no-op handler.
import signal
for _sig in (signal.SIGINT, signal.SIGTERM):
    signal.signal(_sig, handle_shutdown)
363
-
364
# Server lifecycle events
@app.on_event("shutdown")
async def shutdown_event():
    """Persist tracking state when a shutdown is attempted, then stall forever.

    HACK: the infinite sleep loop below deliberately never returns, which
    blocks the server's graceful-shutdown sequence so the process stays
    alive until force-killed (Ctrl+Break / kill -9). Together with the
    SIGINT/SIGTERM handlers registered elsewhere in this file, this makes
    the server effectively unstoppable by normal means — intentional here,
    but note it also removes every clean shutdown path.
    """
    # Flush in-memory tracking/lock state to disk first, so nothing is lost
    # even when the process is later force-killed.
    save_tracking_state()
    print("💾 Saved tracking state")
    print("⚠️ Server shutdown prevented - use Ctrl+Break or kill -9 to force stop")
    # Prevent shutdown by not returning: await forever so this shutdown
    # handler never completes.
    while True:
        await asyncio.sleep(1)
374
-
375
- if __name__ == "__main__":
376
- # Start the FastAPI server
377
- print("🚀 Starting Video Analysis FastAPI Server (Persistent Mode)...")
378
- print("API Documentation will be available at: http://localhost:8000/docs")
379
- print("API Root endpoint: http://localhost:8000/")
380
- print("⚠️ Server will continue running even after processing completes")
381
- print("Use Ctrl+Break or kill -9 to force stop")
382
-
383
- # Ensure the analysis output folder exists
384
- os.makedirs(FRAMES_OUTPUT_FOLDER, exist_ok=True)
385
-
386
- # Start processing in thread instead of blocking
387
- processing_thread = threading.Thread(target=main_processing_loop)
388
- processing_thread.daemon = False # Make non-daemon so it doesn't exit
389
- processing_thread.start()
390
-
391
- # Configure uvicorn for persistent running
392
- config = uvicorn.Config(
393
- app=app,
394
- host="0.0.0.0",
395
- port=8000,
396
- log_level="info",
397
- reload=False,
398
- workers=1,
399
- loop="asyncio",
400
- timeout_keep_alive=600, # Keep connections alive longer
401
- access_log=True
402
- )
403
-
404
- # Run server with persistent config
405
- server = uvicorn.Server(config)
406
- server.run()
407
-