Fred808 commited on
Commit
c64d671
·
verified ·
1 Parent(s): e79b1b8

Upload modified_video_processor.py

Browse files
Files changed (1) hide show
  1. modified_video_processor.py +866 -0
modified_video_processor.py ADDED
@@ -0,0 +1,866 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import requests
4
+ import subprocess
5
+ import shutil
6
+ import asyncio
7
+ import threading
8
+ import time
9
+ import hashlib
10
+ import zipfile
11
+ import uvicorn
12
+ from typing import Dict, List, Set, Optional
13
+ from fastapi import FastAPI, BackgroundTasks, HTTPException, Form
14
+ from fastapi.responses import HTMLResponse, JSONResponse, FileResponse
15
+ from fastapi.middleware.cors import CORSMiddleware
16
+ from huggingface_hub import HfApi, list_repo_files
17
+ from huggingface_hub.utils import HfHubHTTPError
18
+
19
+ # ==== CONFIGURATION ====
20
+ HF_TOKEN = os.getenv("HF_TOKEN", "")
21
+ SOURCE_REPO_ID = os.getenv("SOURCE_REPO", "Fred808/BG1") # Source for RARs
22
+ DEST_REPO_ID_RAR = os.getenv("DEST_REPO_RAR", "") # Destination for extracted RAR contents (set to empty string if not needed)
23
+ # DEST_REPO_ID_VIDEO = os.getenv("DEST_REPO_VIDEO", "Fred808/BG3") # Destination for zipped video frames - DISABLED FOR DOWNLOAD MODE
24
+
25
+ DOWNLOAD_FOLDER = "downloads"
26
+ EXTRACT_FOLDER = "extracted_tmp"
27
+ VIDEO_FRAMES_EXTRACT_FOLDER = "video_frames_tmp"
28
+ ZIPPED_FRAMES_FOLDER = "zipped_frames" # This will now store files for download instead of upload
29
+
30
+ DOWNLOAD_STATE_FILE = "download_progress.json"
31
+ PROCESS_STATE_FILE = "process_progress.json"
32
+ UPLOADED_FOLDERS_FILE = "uploaded_folders.json" # Track uploaded folder hashes for BG2
33
+ PROCESSED_VIDEO_COURSES_FILE = "processed_video_courses.json" # Track processed video course folders for BG3
34
+ FAILED_FILES_LOG = "failed_files.txt"
35
+
36
+ CHUNK_SIZE = 3 # Smaller chunks for Space environment
37
+ PROCESSING_DELAY = 2 # Delay between processing files (seconds)
38
+ VIDEO_FRAME_FPS = 3 # Frames per second to extract from videos
39
+
40
+ os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)
41
+ os.makedirs(EXTRACT_FOLDER, exist_ok=True)
42
+ os.makedirs(VIDEO_FRAMES_EXTRACT_FOLDER, exist_ok=True)
43
+ os.makedirs(ZIPPED_FRAMES_FOLDER, exist_ok=True)
44
+
45
+ api = HfApi(token=HF_TOKEN)
46
+
47
+ # Global state
48
+ processing_status = {
49
+ "is_running": False,
50
+ "current_file": None,
51
+ "total_files": 0,
52
+ "processed_files": 0,
53
+ "failed_files": 0,
54
+ "uploaded_rar_folders": 0,
55
+ "extracted_video_courses": 0, # Changed from uploaded_video_courses to extracted_video_courses
56
+ "last_update": None,
57
+ "logs": []
58
+ }
59
+
60
+ app = FastAPI(title="RAR & Video Processing Service", description="Automated RAR extraction and video frame extraction service with download capability")
61
+
62
+ # Add CORS middleware
63
+ app.add_middleware(
64
+ CORSMiddleware,
65
+ allow_origins=["*"],
66
+ allow_credentials=True,
67
+ allow_methods=["*"],
68
+ allow_headers=["*"],
69
+ )
70
+
71
+ def log_message(message: str):
72
+ """Add message to logs and print it"""
73
+ timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
74
+ log_entry = f"[{timestamp}] {message}"
75
+ print(log_entry)
76
+ processing_status["logs"].append(log_entry)
77
+ processing_status["last_update"] = timestamp
78
+ # Keep only last 100 log entries
79
+ if len(processing_status["logs"]) > 100:
80
+ processing_status["logs"] = processing_status["logs"][-100:]
81
+
82
+ def log_failed_file(filename: str, error_msg: str):
83
+ """Log failed files to a separate file for later review"""
84
+ with open(FAILED_FILES_LOG, "a") as f:
85
+ f.write(f"{filename}: {error_msg}\n")
86
+ log_message(f"❌ Failed: {filename} - {error_msg}")
87
+
88
+ def get_folder_hash(folder_name: str) -> str:
89
+ """Generate a hash for the folder name to use as a unique identifier"""
90
+ return hashlib.md5(folder_name.encode()).hexdigest()
91
+
92
+ def load_uploaded_folders() -> Set[str]:
93
+ """Load set of uploaded folder hashes for BG2"""
94
+ if os.path.exists(UPLOADED_FOLDERS_FILE):
95
+ try:
96
+ with open(UPLOADED_FOLDERS_FILE, "r") as f:
97
+ data = json.load(f)
98
+ return set(data.get("uploaded_folder_hashes", []))
99
+ except json.JSONDecodeError:
100
+ log_message(f"⚠️ Warning: Could not decode {UPLOADED_FOLDERS_FILE}. Starting with empty set.")
101
+ return set()
102
+ return set()
103
+
104
+ def save_uploaded_folders(uploaded_set: Set[str]):
105
+ """Save set of uploaded folder hashes for BG2"""
106
+ with open(UPLOADED_FOLDERS_FILE, "w") as f:
107
+ json.dump({"uploaded_folder_hashes": list(uploaded_set)}, f)
108
+
109
+ def load_processed_video_courses() -> Set[str]:
110
+ """Loads the set of processed video course folder names."""
111
+ if os.path.exists(PROCESSED_VIDEO_COURSES_FILE):
112
+ try:
113
+ with open(PROCESSED_VIDEO_COURSES_FILE, "r") as f:
114
+ return set(json.load(f))
115
+ except json.JSONDecodeError:
116
+ log_message(f"⚠️ Warning: Could not decode {PROCESSED_VIDEO_COURSES_FILE}. Starting with empty set.")
117
+ return set()
118
+ return set()
119
+
120
+ def save_processed_video_courses(processed_set: set):
121
+ """Saves the set of processed video course folder names to a file."""
122
+ with open(PROCESSED_VIDEO_COURSES_FILE, "w") as f:
123
+ json.dump(list(processed_set), f)
124
+
125
+ def load_download_state() -> int:
126
+ """Load download progress state"""
127
+ if os.path.exists(DOWNLOAD_STATE_FILE):
128
+ try:
129
+ with open(DOWNLOAD_STATE_FILE, "r") as f:
130
+ return json.load(f).get("next_download_index", 0)
131
+ except json.JSONDecodeError:
132
+ log_message(f"⚠️ Warning: Could not decode {DOWNLOAD_STATE_FILE}. Starting download from index 0.")
133
+ return 0
134
+ return 0
135
+
136
+ def save_download_state(next_index: int):
137
+ """Save download progress state"""
138
+ with open(DOWNLOAD_STATE_FILE, "w") as f:
139
+ json.dump({"next_download_index": next_index}, f)
140
+
141
+ def load_processed_files_state() -> set:
142
+ """Load processed files from the state file"""
143
+ if os.path.exists(PROCESS_STATE_FILE):
144
+ try:
145
+ with open(PROCESS_STATE_FILE, "r") as f:
146
+ data = json.load(f)
147
+ return set(data.get("processed_rars", []))
148
+ except json.JSONDecodeError:
149
+ log_message(f"⚠️ Warning: Could not decode {PROCESS_STATE_FILE}. Starting with empty processed list.")
150
+ return set()
151
+ return set()
152
+
153
+ def save_processed_files_state(processed_set: set):
154
+ """Save processed files to the state file"""
155
+ with open(PROCESS_STATE_FILE, "w") as f:
156
+ json.dump({"processed_rars": list(processed_set)}, f)
157
+
158
+ def download_rar_files(start_index: int, chunk_size: int) -> tuple:
159
+ """Downloads a batch of RAR files from the source dataset"""
160
+ try:
161
+ all_files = list_repo_files(repo_id=SOURCE_REPO_ID, repo_type="dataset", token=HF_TOKEN)
162
+ # Filter for .rar files and exclude the specific one
163
+ rar_files_in_repo = sorted([f for f in all_files if f.endswith(".rar") and "ZBrush/3DConceptArtist_TheUltimateZbrushGuide_DownloadPirate.com.rar" not in f])
164
+
165
+ end_index = start_index + chunk_size
166
+ files_to_download_metadata = rar_files_in_repo[start_index:end_index]
167
+
168
+ if not files_to_download_metadata:
169
+ log_message("✅ No more RAR files to download.")
170
+ return [], start_index
171
+
172
+ log_message(f"📥 Downloading RAR files {start_index + 1} to {end_index} from {SOURCE_REPO_ID}")
173
+
174
+ downloaded_paths = []
175
+ for file_path_in_repo in files_to_download_metadata:
176
+ filename = os.path.basename(file_path_in_repo)
177
+ dest_path = os.path.join(DOWNLOAD_FOLDER, filename)
178
+ file_url = f"https://huggingface.co/datasets/{SOURCE_REPO_ID}/resolve/main/{file_path_in_repo}"
179
+ headers = {"Authorization": f"Bearer {HF_TOKEN}"}
180
+
181
+ if os.path.exists(dest_path):
182
+ log_message(f"⏩ Already exists, skipping: {filename}")
183
+ downloaded_paths.append(dest_path)
184
+ continue
185
+
186
+ log_message(f"🔽 Downloading: {file_path_in_repo}")
187
+ try:
188
+ with requests.get(file_url, headers=headers, stream=True) as r:
189
+ r.raise_for_status()
190
+ with open(dest_path, "wb") as f:
191
+ for chunk in r.iter_content(chunk_size=8192):
192
+ f.write(chunk)
193
+ log_message(f"✅ Downloaded: {filename}")
194
+ downloaded_paths.append(dest_path)
195
+ except Exception as e:
196
+ log_message(f"❌ Failed to download {file_path_in_repo}: {e}")
197
+ log_failed_file(file_path_in_repo, f"Download failed: {e}")
198
+
199
+ return downloaded_paths, end_index
200
+ except Exception as e:
201
+ log_message(f"❌ Error in download_rar_files: {e}")
202
+ return [], start_index
203
+
204
+ def extract_frames(video_path: str, output_folder: str, fps: int) -> bool:
205
+ """Extracts frames from a video using ffmpeg."""
206
+ os.makedirs(output_folder, exist_ok=True)
207
+
208
+ # Ensure ffmpeg is available
209
+ if shutil.which("ffmpeg") is None:
210
+ log_message("❌ ffmpeg not found. Please install ffmpeg.")
211
+ return False
212
+
213
+ # ffmpeg command to extract frames at specified FPS
214
+ command = [
215
+ "ffmpeg",
216
+ "-i", video_path,
217
+ "-vf", f"fps={fps}",
218
+ os.path.join(output_folder, "frame_%04d.png")
219
+ ]
220
+
221
+ log_message(f"🖼️ Extracting frames from {os.path.basename(video_path)} to {output_folder} at {fps} FPS...")
222
+ try:
223
+ subprocess.run(command, check=True, capture_output=True, text=True)
224
+ log_message(f"✅ Successfully extracted frames from {os.path.basename(video_path)}")
225
+ return True
226
+ except subprocess.CalledProcessError as e:
227
+ log_message(f"❌ Error extracting frames from {os.path.basename(video_path)}: {e.stderr}")
228
+ return False
229
+
230
+ def zip_folder(folder_path: str, output_zip_path: str) -> bool:
231
+ """Zips the contents of a folder."""
232
+ log_message(f"📦 Compressing {folder_path} to {output_zip_path}...")
233
+ try:
234
+ with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
235
+ for root, _, files in os.walk(folder_path):
236
+ for file in files:
237
+ file_path = os.path.join(root, file)
238
+ arcname = os.path.relpath(file_path, folder_path)
239
+ zipf.write(file_path, arcname)
240
+ log_message(f"✅ Successfully zipped {folder_path}")
241
+ return True
242
+ except Exception as e:
243
+ log_message(f"❌ Error zipping {folder_path}: {e}")
244
+ return False
245
+
246
+ def upload_file_to_hf(local_path: str, path_in_repo: str, repo_id: str, max_retries: int = 5, initial_delay: int = 5) -> bool:
247
+ """Uploads a single file to Hugging Face Hub with retry logic and exponential backoff."""
248
+ log_message(f"⬆️ Uploading {os.path.basename(local_path)} to {repo_id}/{path_in_repo}")
249
+
250
+ for attempt in range(max_retries):
251
+ try:
252
+ api.upload_file(
253
+ path_or_fileobj=local_path,
254
+ path_in_repo=path_in_repo,
255
+ repo_id=repo_id,
256
+ repo_type="dataset"
257
+ )
258
+ log_message(f"✅ Uploaded: {os.path.basename(local_path)}")
259
+ return True
260
+ except HfHubHTTPError as e:
261
+ if e.response.status_code == 429 and attempt < max_retries - 1: # Too Many Requests
262
+ delay = initial_delay * (2 ** attempt)
263
+ log_message(f"⚠️ Rate limit hit. Retrying in {delay} seconds... (Attempt {attempt + 1}/{max_retries})")
264
+ time.sleep(delay)
265
+ else:
266
+ log_message(f"❌ Hugging Face Hub error uploading {os.path.basename(local_path)}: {e}")
267
+ return False
268
+ except Exception as e:
269
+ log_message(f"❌ Error uploading {os.path.basename(local_path)}: {e}")
270
+ return False
271
+ log_message(f"❌ Failed to upload {os.path.basename(local_path)} after {max_retries} attempts.")
272
+ return False
273
+
274
+ def process_video_frames_for_download(extracted_rar_folder: str, processed_video_courses_set: Set[str]) -> bool:
275
+ """Scans an extracted RAR folder for MP4s, extracts frames, zips, and saves for download."""
276
+ video_processed_successfully = False
277
+
278
+ # Use the top-level folder name of the extracted RAR as the course folder name
279
+ course_folder_name = os.path.basename(extracted_rar_folder)
280
+
281
+ if course_folder_name in processed_video_courses_set:
282
+ log_message(f"⏩ Video frames for course '{course_folder_name}' already processed. Skipping.")
283
+ return True
284
+
285
+ log_message(f"🎬 Processing videos in extracted RAR folder: {course_folder_name}")
286
+
287
+ video_files_found = []
288
+ for root, _, files in os.walk(extracted_rar_folder):
289
+ for file in files:
290
+ # Check for common video file extensions
291
+ if file.lower().endswith(('.mp4', '.avi', '.mov', '.mkv', '.flv', '.wmv')):
292
+ video_files_found.append(os.path.join(root, file))
293
+
294
+ if not video_files_found:
295
+ log_message(f"⚠️ No video files found in {course_folder_name}. Skipping video frame extraction.")
296
+ return False # Indicate no video processing was done
297
+
298
+ course_video_extract_dir = os.path.join(VIDEO_FRAMES_EXTRACT_FOLDER, course_folder_name)
299
+ os.makedirs(course_video_extract_dir, exist_ok=True)
300
+
301
+ frames_extracted_count = 0
302
+ for video_path in video_files_found:
303
+ video_basename = os.path.splitext(os.path.basename(video_path))[0]
304
+ # Create a unique output folder for frames from this video within the course's frame directory
305
+ video_output_folder = os.path.join(course_video_extract_dir, video_basename)
306
+
307
+ if extract_frames(video_path, video_output_folder, VIDEO_FRAME_FPS):
308
+ frames_extracted_count += 1
309
+ else:
310
+ log_message(f"❌ Failed to extract frames from {os.path.basename(video_path)}. Continuing with other videos.")
311
+ # Clean up partially extracted frames for this video
312
+ if os.path.exists(video_output_folder):
313
+ shutil.rmtree(video_output_folder)
314
+
315
+ # Check if any frames were extracted for the entire course folder
316
+ if frames_extracted_count == 0:
317
+ log_message(f"⚠️ No frames extracted for any video in {course_folder_name}. Skipping zipping.")
318
+ if os.path.exists(course_video_extract_dir):
319
+ shutil.rmtree(course_video_extract_dir)
320
+ return False
321
+
322
+ course_zip_path = os.path.join(ZIPPED_FRAMES_FOLDER, f"{course_folder_name}_frames.zip")
323
+ if zip_folder(course_video_extract_dir, course_zip_path):
324
+ log_message(f"✅ Successfully processed video frames and saved {course_folder_name}_frames.zip for download")
325
+ processed_video_courses_set.add(course_folder_name) # Mark as processed
326
+ save_processed_video_courses(processed_video_courses_set) # Save state
327
+ video_processed_successfully = True
328
+
329
+ # Clean up the temporary extraction folder but keep the zip file for download
330
+ log_message(f"🧹 Cleaning up temporary video frame files for {course_folder_name}")
331
+ if os.path.exists(course_video_extract_dir):
332
+ shutil.rmtree(course_video_extract_dir)
333
+ else:
334
+ log_message(f"❌ Failed to zip video frames for {course_folder_name}")
335
+
336
+ return video_processed_successfully
337
+
338
+ def extract_and_upload_rar(rar_path: str, processed_rars_set: set, uploaded_folders_set: Set[str], processed_video_courses_set: Set[str]) -> bool:
339
+ """Extracts a single RAR file, uploads its contents to BG2 (if DEST_REPO_ID_RAR is set), and then processes videos for download"""
340
+ filename = os.path.basename(rar_path)
341
+ processing_status["current_file"] = filename
342
+
343
+ folder_name = filename.replace(".rar", "")
344
+ folder_hash = get_folder_hash(folder_name)
345
+ current_extract_folder = os.path.join(EXTRACT_FOLDER, f"{folder_name}_extracted")
346
+
347
+ # Check if RAR is already processed (uploaded to BG2 or video frames processed)
348
+ # This logic needs to be careful. If BG2 is not set, we only care about video processing.
349
+ # If video processing is not needed, we only care about BG2 upload.
350
+ is_bg2_processed = (not DEST_REPO_ID_RAR) or (folder_hash in uploaded_folders_set)
351
+ is_bg3_processed = (folder_name in processed_video_courses_set)
352
+
353
+ if filename in processed_rars_set and is_bg2_processed and is_bg3_processed:
354
+ log_message(f"⏩ {filename} already fully processed, skipping.")
355
+ return True
356
+
357
+ # If BG2 upload is enabled and folder already uploaded to BG2, skip RAR extraction/upload to BG2
358
+ # but still proceed to video processing if not already done.
359
+ if DEST_REPO_ID_RAR and folder_hash in uploaded_folders_set and not is_bg3_processed:
360
+ log_message(f"🔒 Folder '{folder_name}' already uploaded to BG2 (hash: {folder_hash[:8]}...), skipping RAR upload.")
361
+ # If the extracted folder doesn't exist, we can't process videos from it.
362
+ # This scenario might happen if the previous run was interrupted after BG2 upload but before video processing cleanup.
363
+ if not os.path.exists(current_extract_folder):
364
+ log_message(f"⚠️ Extracted folder {current_extract_folder} not found for video processing. Attempting re-extraction for video processing.")
365
+ # Fall through to re-extract and process videos
366
+ else:
367
+ # Proceed to video processing if not already done
368
+ log_message(f"Continuing with video processing for {filename}.")
369
+ video_processed = process_video_frames_for_download(current_extract_folder, processed_video_courses_set)
370
+ if video_processed:
371
+ processed_rars_set.add(filename)
372
+ save_processed_files_state(processed_rars_set)
373
+ return video_processed
374
+
375
+ log_message(f"📦 Attempting to extract: {filename}")
376
+ os.makedirs(current_extract_folder, exist_ok=True)
377
+
378
+ try:
379
+ if shutil.which("unrar") is None:
380
+ raise RuntimeError("unrar command not found. Please install unrar.")
381
+
382
+ # Use -idq to suppress query messages and -o+ to overwrite without prompting
383
+ unrar_command = ["unrar", "x", "-o+", rar_path, current_extract_folder]
384
+ log_message(f"Running command: {' '.join(unrar_command)}")
385
+
386
+ result = subprocess.run(
387
+ unrar_command,
388
+ check=True,
389
+ capture_output=True,
390
+ text=True,
391
+ encoding='utf-8'
392
+ )
393
+
394
+ extracted_contents = os.listdir(current_extract_folder)
395
+ if not extracted_contents:
396
+ raise Exception("Extraction completed but no files were produced in the target directory.")
397
+ log_message(f"Successfully extracted {len(extracted_contents)} items")
398
+
399
+ # Upload extracted files to BG2 if DEST_REPO_ID_RAR is set
400
+ if DEST_REPO_ID_RAR:
401
+ upload_count = 0
402
+ for root, _, files in os.walk(current_extract_folder):
403
+ for file in files:
404
+ local_path = os.path.join(root, file)
405
+ # Construct path in repo relative to the extracted content's root
406
+ path_in_repo = os.path.join(folder_name, os.path.relpath(local_path, current_extract_folder))
407
+ log_message(f"⬆️ Uploading to BG2: {path_in_repo}")
408
+
409
+ try:
410
+ if upload_file_to_hf(
411
+ local_path=local_path,
412
+ path_in_repo=path_in_repo,
413
+ repo_id=DEST_REPO_ID_RAR
414
+ ):
415
+ upload_count += 1
416
+ else:
417
+ log_message(f"❌ Failed to upload {path_in_repo} to BG2. Skipping remaining uploads for this RAR.")
418
+ # Consider if you want to fail the whole RAR processing here or continue.
419
+ # For now, we'll continue but log the failure.
420
+
421
+ except Exception as upload_error:
422
+ log_message(f"❌ Failed to upload {path_in_repo} to BG2: {upload_error}")
423
+ # Don't re-raise, allow other files to be attempted
424
+
425
+ if upload_count > 0: # Only mark as uploaded if at least one file was successfully uploaded
426
+ log_message(f"✅ Successfully uploaded {upload_count} files from {filename} to BG2")
427
+ # Mark folder as uploaded to BG2 using hash
428
+ uploaded_folders_set.add(folder_hash)
429
+ save_uploaded_folders(uploaded_folders_set)
430
+ processing_status["uploaded_rar_folders"] = len(uploaded_folders_set)
431
+ log_message(f"🔒 Folder '{folder_name}' locked in BG2 repo (hash: {folder_hash[:8]}...)")
432
+ else:
433
+ log_message(f"⚠️ No files were successfully uploaded from {filename} to BG2.")
434
+ else:
435
+ log_message("Skipping upload to BG2 as DEST_REPO_ID_RAR is not set.")
436
+
437
+ # Now process video frames from the extracted content for download
438
+ video_processed = process_video_frames_for_download(current_extract_folder, processed_video_courses_set)
439
+
440
+ # Mark RAR as processed only if both BG2 (if enabled) and video processing are successful
441
+ # Or if BG2 is not enabled, only video processing needs to be successful
442
+ if (not DEST_REPO_ID_RAR or (folder_hash in uploaded_folders_set)) and video_processed:
443
+ processed_rars_set.add(filename)
444
+ save_processed_files_state(processed_rars_set)
445
+ processing_status["processed_files"] = len(processed_rars_set)
446
+ processing_status["extracted_video_courses"] = len(processed_video_courses_set)
447
+ return True
448
+ elif DEST_REPO_ID_RAR and not (folder_hash in uploaded_folders_set):
449
+ log_message(f"❌ RAR processing failed for {filename}: BG2 upload was not successful.")
450
+ return False
451
+ elif not video_processed:
452
+ log_message(f"❌ RAR processing failed for {filename}: Video frame processing was not successful.")
453
+ return False
454
+ else:
455
+ # This case should ideally not be reached if the above logic is exhaustive
456
+ log_message(f"❌ RAR processing failed for {filename}: Unknown reason.")
457
+ return False
458
+
459
+ except subprocess.CalledProcessError as e:
460
+ error_msg = f"RAR extraction failed (exit {e.returncode}): {e.stderr.strip()}"
461
+ log_failed_file(filename, error_msg)
462
+ processing_status["failed_files"] += 1
463
+ return False
464
+ except Exception as e:
465
+ error_msg = f"Unexpected error during processing {filename}: {str(e)}"
466
+ log_failed_file(filename, error_msg)
467
+ processing_status["failed_files"] += 1
468
+ return False
469
+ finally:
470
+ # Always cleanup the extraction folder after processing (success or failure)
471
+ if os.path.exists(current_extract_folder):
472
+ log_message(f"🧹 Cleaning up extracted RAR files in {current_extract_folder}")
473
+ try:
474
+ shutil.rmtree(current_extract_folder)
475
+ log_message(f"✅ Cleaned up RAR extraction folder")
476
+ except Exception as e:
477
+ log_message(f"⚠️ Could not clean up RAR extraction folder {current_extract_folder}: {e}")
478
+
479
+ def continuous_processing(start_download_index: Optional[int] = None):
480
+ """Main processing loop that runs continuously, with an optional starting download index"""
481
+ processing_status["is_running"] = True
482
+ log_message("🚀 Starting continuous RAR and Video processing...")
483
+
484
+ try:
485
+ # Load uploaded folders tracking for BG2
486
+ uploaded_folders = load_uploaded_folders()
487
+ processing_status["uploaded_rar_folders"] = len(uploaded_folders)
488
+
489
+ # Load processed video courses tracking
490
+ processed_video_courses = load_processed_video_courses()
491
+ processing_status["extracted_video_courses"] = len(processed_video_courses)
492
+
493
+ if start_download_index is not None:
494
+ log_message(f"Starting download from index: {start_download_index}")
495
+ save_download_state(start_download_index) # Set download state to start from this index
496
+ else:
497
+ log_message("Starting download from saved state or beginning.")
498
+
499
+ while processing_status["is_running"]:
500
+ # 1. Download a batch of RAR files
501
+ download_start_index = load_download_state()
502
+ downloaded_rar_paths, next_download_index = download_rar_files(download_start_index, CHUNK_SIZE)
503
+ save_download_state(next_download_index)
504
+
505
+ # 2. Process all available RAR files (downloaded + existing)
506
+ all_local_rars = sorted([os.path.join(DOWNLOAD_FOLDER, f) for f in os.listdir(DOWNLOAD_FOLDER) if f.endswith(".rar")])
507
+ processed_rars = load_processed_files_state()
508
+ processing_status["total_files"] = len(all_local_rars)
509
+ # Recalculate processed_files based on actual processed_rars_set to be accurate
510
+ processing_status["processed_files"] = len(processed_rars)
511
+
512
+ # Filter out RARs that are already fully processed based on current state
513
+ rars_to_process = []
514
+ for rar_file_path in all_local_rars:
515
+ filename = os.path.basename(rar_file_path)
516
+ folder_name = filename.replace(".rar", "")
517
+ is_bg2_processed = (not DEST_REPO_ID_RAR) or (get_folder_hash(folder_name) in uploaded_folders)
518
+ is_bg3_processed = (folder_name in processed_video_courses)
519
+
520
+ if not (filename in processed_rars and is_bg2_processed and is_bg3_processed):
521
+ rars_to_process.append(rar_file_path)
522
+
523
+ if not downloaded_rar_paths and not rars_to_process:
524
+ log_message("✅ No more RAR files to download or process. Stopping...")
525
+ break
526
+
527
+ for rar_file_path in rars_to_process:
528
+ if not processing_status["is_running"]:
529
+ break
530
+
531
+ filename = os.path.basename(rar_file_path)
532
+ success = extract_and_upload_rar(rar_file_path, processed_rars, uploaded_folders, processed_video_courses)
533
+ if success:
534
+ # Delete the RAR file after successful processing
535
+ log_message(f"🗑️ Deleting processed RAR: {filename}")
536
+ try:
537
+ os.remove(rar_file_path)
538
+ log_message(f"✅ Deleted RAR file: {filename}")
539
+ except Exception as e:
540
+ log_message(f"⚠️ Could not delete {rar_file_path}: {e}")
541
+
542
+ # Add delay between processing files
543
+ time.sleep(PROCESSING_DELAY)
544
+
545
+ # If no new files were downloaded and all local files are processed, we're done
546
+ if not downloaded_rar_paths and not rars_to_process:
547
+ break
548
+
549
+ except Exception as e:
550
+ log_message(f"❌ Error in continuous processing: {e}")
551
+ finally:
552
+ processing_status["is_running"] = False
553
+ processing_status["current_file"] = None
554
+ log_message("🏁 Processing stopped")
555
+
556
+ @app.get("/", response_class=HTMLResponse)
557
+ async def root():
558
+ """Serve the main HTML interface"""
559
+ html_content = """
560
+ <!DOCTYPE html>
561
+ <html>
562
+ <head>
563
+ <title>RAR & Video Processing Service</title>
564
+ <meta charset=\"utf-8\">
565
+ <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\">
566
+ <style>
567
+ body { font-family: Arial, sans-serif; margin: 20px; background-color: #f5f5f5; }
568
+ .container { max-width: 1200px; margin: 0 auto; background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
569
+ .status-card { background: #e3f2fd; padding: 15px; border-radius: 5px; margin: 10px 0; }
570
+ .logs { background: #f5f5f5; padding: 15px; border-radius: 5px; height: 400px; overflow-y: auto; font-family: monospace; font-size: 12px; }
571
+ .button { background: #2196F3; color: white; padding: 10px 20px; border: none; border-radius: 5px; cursor: pointer; margin: 5px; }
572
+ .button:hover { background: #1976D2; }
573
+ .button:disabled { background: #ccc; cursor: not-allowed; }
574
+ .stop-button { background: #f44336; }
575
+ .stop-button:hover { background: #d32f2f; }
576
+ .download-button { background: #4CAF50; }
577
+ .download-button:hover { background: #45a049; }
578
+ .stats { display: flex; gap: 20px; margin: 20px 0; }
579
+ .stat-item { background: #f0f0f0; padding: 10px; border-radius: 5px; text-align: center; flex: 1; }
580
+ .start-form { margin-top: 20px; padding: 15px; border: 1px solid #ddd; border-radius: 5px; background: #f9f9f9; }
581
+ .start-form input[type=\"number\"] { width: calc(100% - 120px); padding: 8px; margin-right: 10px; border: 1px solid #ccc; border-radius: 4px; }
582
+ .start-form button { padding: 8px 15px; background: #4CAF50; color: white; border: none; border-radius: 4px; cursor: pointer; }
583
+ .start-form button:hover { background: #45a049; }
584
+ .downloads-section { margin-top: 30px; padding: 20px; border: 1px solid #ddd; border-radius: 5px; background: #f9f9f9; }
585
+ .download-list { max-height: 300px; overflow-y: auto; }
586
+ .download-item { display: flex; justify-content: space-between; align-items: center; padding: 10px; border-bottom: 1px solid #eee; }
587
+ .download-item:last-child { border-bottom: none; }
588
+ </style>
589
+ </head>
590
+ <body>
591
+ <div class=\"container\">
592
+ <h1>🔄 RAR & Video Processing Service</h1>
593
+ <p>Automated extraction and upload of RAR files from BG1 to BG2 dataset, and video frame extraction for download</p>
594
+
595
+ <div class=\"status-card\">
596
+ <h3>Status: <span id=\"status\">Stopped</span></h3>
597
+ <p>Current File: <span id=\"current-file\">None</span></p>
598
+ <p>Last Update: <span id=\"last-update\">Never</span></p>
599
+ </div>
600
+
601
+ <div class=\"stats\">
602
+ <div class=\"stat-item\">
603
+ <h4>Total Files (RARs)</h4>
604
+ <span id=\"total-files\">0</span>
605
+ </div>
606
+ <div class=\"stat-item\">
607
+ <h4>Processed (RARs)</h4>
608
+ <span id=\"processed-files\">0</span>
609
+ </div>
610
+ <div class=\"stat-item\">
611
+ <h4>Uploaded Folders (BG2)</h4>
612
+ <span id=\"uploaded-rar-folders\">0</span>
613
+ </div>
614
+ <div class=\"stat-item\">
615
+ <h4>Extracted Video Courses</h4>
616
+ <span id=\"extracted-video-courses\">0</span>
617
+ </div>
618
+ <div class=\"stat-item\">
619
+ <h4>Failed</h4>
620
+ <span id=\"failed-files\">0</span>
621
+ </div>
622
+ </div>
623
+
624
+ <div class=\"start-form\">
625
+ <h3>Start Processing from Specific Download Index</h3>
626
+ <input type=\"number\" id=\"start-index-input\" placeholder=\"Enter start index (e.g., 0)\" value=\"0\">
627
+ <button onclick=\"startProcessingWithIndex()\">Start from Index</button>
628
+ </div>
629
+
630
+ <div>
631
+ <button class=\"button\" onclick=\"startProcessing()\" id=\"start-btn\">Start Processing (from last saved index)</button>
632
+ <button class=\"button stop-button\" onclick=\"stopProcessing()\" id=\"stop-btn\" disabled>Stop Processing</button>
633
+ <button class=\"button\" onclick=\"refreshStatus()\">Refresh Status</button>
634
+ <button class=\"button download-button\" onclick=\"refreshDownloads()\">Refresh Downloads</button>
635
+ </div>
636
+
637
+ <div class=\"downloads-section\">
638
+ <h3>Available Downloads</h3>
639
+ <div class=\"download-list\" id=\"download-list\">
640
+ <p>Loading...</p>
641
+ </div>
642
+ </div>
643
+
644
+ <h3>Logs</h3>
645
+ <div class=\"logs\" id=\"logs\">Loading...</div>
646
+ </div>
647
+
648
+ <script>
649
+ async function startProcessing() {
650
+ try {
651
+ const response = await fetch(\"/start\", { method: \"POST\" });
652
+ const result = await response.json();
653
+ alert(result.message);
654
+ refreshStatus();
655
+ } catch (error) {
656
+ alert(\"Error starting processing: \" + error.message);
657
+ }
658
+ }
659
+
660
+ async function startProcessingWithIndex() {
661
+ const index = document.getElementById(\"start-index-input\").value;
662
+ if (index === \"\" || isNaN(index)) {
663
+ alert(\"Please enter a valid number for the start index.\");
664
+ return;
665
+ }
666
+ try {
667
+ const response = await fetch(\"/start_from_index\", {
668
+ method: \"POST\",
669
+ headers: { \"Content-Type\": \"application/x-www-form-urlencoded\" },
670
+ body: `start_index=${parseInt(index)}`
671
+ });
672
+ const result = await response.json();
673
+ alert(result.message);
674
+ refreshStatus();
675
+ } catch (error) {
676
+ alert(\"Error starting processing from index: \" + error.message);
677
+ }
678
+ }
679
+
680
+ async function stopProcessing() {
681
+ try {
682
+ const response = await fetch(\"/stop\", { method: \"POST\" });
683
+ const result = await response.json();
684
+ alert(result.message);
685
+ refreshStatus();
686
+ } catch (error) {
687
+ alert(\"Error stopping processing: \" + error.message);
688
+ }
689
+ }
690
+
691
+ async function refreshStatus() {
692
+ try {
693
+ const response = await fetch(\"/status\");
694
+ const status = await response.json();
695
+
696
+ document.getElementById(\"status\").textContent = status.is_running ? \"Running\" : \"Stopped\";
697
+ document.getElementById(\"current-file\").textContent = status.current_file || \"None\";
698
+ document.getElementById(\"last-update\").textContent = status.last_update || \"Never\";
699
+ document.getElementById(\"total-files\").textContent = status.total_files;
700
+ document.getElementById(\"processed-files\").textContent = status.processed_files;
701
+ document.getElementById(\"uploaded-rar-folders\").textContent = status.uploaded_rar_folders;
702
+ document.getElementById(\"extracted-video-courses\").textContent = status.extracted_video_courses;
703
+ document.getElementById(\"failed-files\").textContent = status.failed_files;
704
+
705
+ document.getElementById(\"start-btn\").disabled = status.is_running;
706
+ document.getElementById(\"stop-btn\").disabled = !status.is_running;
707
+
708
+ const logsDiv = document.getElementById(\"logs\");
709
+ logsDiv.innerHTML = status.logs.join(\"<br>\");
710
+ logsDiv.scrollTop = logsDiv.scrollHeight;
711
+ } catch (error) {
712
+ console.error(\"Error refreshing status:\", error);
713
+ }
714
+ }
715
+
716
+ async function refreshDownloads() {
717
+ try {
718
+ const response = await fetch(\"/downloads\");
719
+ const downloads = await response.json();
720
+
721
+ const downloadList = document.getElementById(\"download-list\");
722
+ if (downloads.files.length === 0) {
723
+ downloadList.innerHTML = \"<p>No downloads available yet.</p>\";
724
+ } else {
725
+ downloadList.innerHTML = downloads.files.map(file =>
726
+ `<div class=\"download-item\">
727
+ <span>${file.name} (${file.size})</span>
728
+ <a href=\"/download/${file.name}\" class=\"button download-button\" download>Download</a>
729
+ </div>`
730
+ ).join(\"\");
731
+ }
732
+ } catch (error) {
733
+ console.error(\"Error refreshing downloads:\", error);
734
+ document.getElementById(\"download-list\").innerHTML = \"<p>Error loading downloads.</p>\";
735
+ }
736
+ }
737
+
738
+ // Auto-refresh every 5 seconds
739
+ setInterval(refreshStatus, 5000);
740
+ setInterval(refreshDownloads, 10000);
741
+
742
+ // Initial load
743
+ refreshStatus();
744
+ refreshDownloads();
745
+ </script>
746
+ </body>
747
+ </html>
748
+ """
749
+ return HTMLResponse(content=html_content)
750
+
751
+ @app.get("/status")
752
+ async def get_status():
753
+ """Get current processing status"""
754
+ return JSONResponse(content=processing_status)
755
+
756
+ @app.post("/start")
757
+ async def start_processing(background_tasks: BackgroundTasks):
758
+ """Start the processing in background"""
759
+ if processing_status["is_running"]:
760
+ return {"message": "Processing is already running"}
761
+
762
+ background_tasks.add_task(continuous_processing)
763
+ return {"message": "Processing started"}
764
+
765
+ @app.post("/start_from_index")
766
+ async def start_processing_from_index(background_tasks: BackgroundTasks, start_index: int = Form(...)):
767
+ """Start the processing from a specific download index in background"""
768
+ if processing_status["is_running"]:
769
+ return {"message": "Processing is already running"}
770
+
771
+ if start_index < 0:
772
+ return {"message": "Start index cannot be negative."}
773
+
774
+ background_tasks.add_task(continuous_processing, start_download_index=start_index)
775
+ return {"message": f"Processing started from download index: {start_index}"}
776
+
777
+ @app.post("/stop")
778
+ async def stop_processing():
779
+ """Stop the processing"""
780
+ if not processing_status["is_running"]:
781
+ return {"message": "Processing is not running"}
782
+
783
+ processing_status["is_running"] = False
784
+ return {"message": "Processing stop requested"}
785
+
786
+ @app.get("/logs")
787
+ async def get_logs():
788
+ """Get processing logs"""
789
+ return {"logs": processing_status["logs"]}
790
+
791
+ @app.get("/uploaded-folders")
792
+ async def get_uploaded_folders():
793
+ """Get list of uploaded folder hashes for BG2"""
794
+ uploaded_folders = load_uploaded_folders()
795
+ return {"uploaded_folder_count": len(uploaded_folders), "folder_hashes": list(uploaded_folders)}
796
+
797
+ @app.get("/processed-video-courses")
798
+ async def get_processed_video_courses():
799
+ """Get list of processed video course folder names"""
800
+ processed_video_courses = load_processed_video_courses()
801
+ return {"processed_video_course_count": len(processed_video_courses), "course_names": list(processed_video_courses)}
802
+
803
+ @app.get("/downloads")
804
+ async def list_downloads():
805
+ """List available frame downloads"""
806
+ try:
807
+ if not os.path.exists(ZIPPED_FRAMES_FOLDER):
808
+ return {"files": []}
809
+
810
+ files = []
811
+ for filename in os.listdir(ZIPPED_FRAMES_FOLDER):
812
+ if filename.endswith('.zip'):
813
+ file_path = os.path.join(ZIPPED_FRAMES_FOLDER, filename)
814
+ file_size = os.path.getsize(file_path)
815
+ # Convert size to human readable format
816
+ if file_size < 1024:
817
+ size_str = f"{file_size} B"
818
+ elif file_size < 1024 * 1024:
819
+ size_str = f"{file_size / 1024:.1f} KB"
820
+ elif file_size < 1024 * 1024 * 1024:
821
+ size_str = f"{file_size / (1024 * 1024):.1f} MB"
822
+ else:
823
+ size_str = f"{file_size / (1024 * 1024 * 1024):.1f} GB"
824
+
825
+ files.append({
826
+ "name": filename,
827
+ "size": size_str,
828
+ "path": file_path
829
+ })
830
+
831
+ # Sort by filename
832
+ files.sort(key=lambda x: x["name"])
833
+ return {"files": files}
834
+ except Exception as e:
835
+ log_message(f"❌ Error listing downloads: {e}")
836
+ return {"files": [], "error": str(e)}
837
+
838
+ @app.get("/download/{filename}")
839
+ async def download_file(filename: str):
840
+ """Download a specific frame zip file"""
841
+ try:
842
+ # Sanitize filename to prevent directory traversal
843
+ safe_filename = os.path.basename(filename)
844
+ file_path = os.path.join(ZIPPED_FRAMES_FOLDER, safe_filename)
845
+
846
+ if not os.path.exists(file_path):
847
+ raise HTTPException(status_code=404, detail="File not found")
848
+
849
+ if not file_path.endswith('.zip'):
850
+ raise HTTPException(status_code=400, detail="Only zip files can be downloaded")
851
+
852
+ log_message(f"📥 Serving download: {safe_filename}")
853
+ return FileResponse(
854
+ path=file_path,
855
+ filename=safe_filename,
856
+ media_type='application/zip'
857
+ )
858
+ except HTTPException:
859
+ raise
860
+ except Exception as e:
861
+ log_message(f"❌ Error serving download {filename}: {e}")
862
+ raise HTTPException(status_code=500, detail="Internal server error")
863
+
864
+ if __name__ == "__main__":
865
+ uvicorn.run(app, host="0.0.0.0", port=7860)
866
+