Fred808 commited on
Commit
72b57da
·
verified ·
1 Parent(s): 9cb8b0b

Create App.py

Browse files
Files changed (1) hide show
  1. App.py +736 -0
App.py ADDED
@@ -0,0 +1,736 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import requests
4
+ import subprocess
5
+ import shutil
6
+ import asyncio
7
+ import threading
8
+ import time
9
+ import hashlib
10
+ import zipfile
11
+ from typing import Dict, List, Set, Optional
12
+ from fastapi import FastAPI, BackgroundTasks, HTTPException, Form
13
+ from fastapi.responses import HTMLResponse, JSONResponse
14
+ from fastapi.middleware.cors import CORSMiddleware
15
+ from huggingface_hub import HfApi, list_repo_files, HfHubHTTPError
16
+
17
+ # ==== CONFIGURATION ====
18
+ HF_TOKEN = os.getenv("HF_TOKEN", "")
19
+ SOURCE_REPO_ID = os.getenv("SOURCE_REPO", "Fred808/BG1") # Source for RARs
20
+ DEST_REPO_ID_RAR = os.getenv("DEST_REPO_RAR", "") # Destination for extracted RAR contents (set to empty string if not needed)
21
+ DEST_REPO_ID_VIDEO = os.getenv("DEST_REPO_VIDEO", "Fred808/BG3") # Destination for zipped video frames
22
+
23
+ DOWNLOAD_FOLDER = "downloads"
24
+ EXTRACT_FOLDER = "extracted_tmp"
25
+ VIDEO_FRAMES_EXTRACT_FOLDER = "video_frames_tmp"
26
+ ZIPPED_FRAMES_FOLDER = "zipped_frames"
27
+
28
+ DOWNLOAD_STATE_FILE = "download_progress.json"
29
+ PROCESS_STATE_FILE = "process_progress.json"
30
+ UPLOADED_FOLDERS_FILE = "uploaded_folders.json" # Track uploaded folder hashes for BG2
31
+ PROCESSED_VIDEO_COURSES_FILE = "processed_video_courses.json" # Track processed video course folders for BG3
32
+ FAILED_FILES_LOG = "failed_files.txt"
33
+
34
+ CHUNK_SIZE = 3 # Smaller chunks for Space environment
35
+ PROCESSING_DELAY = 2 # Delay between processing files (seconds)
36
+ VIDEO_FRAME_FPS = 3 # Frames per second to extract from videos
37
+
38
+ os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)
39
+ os.makedirs(EXTRACT_FOLDER, exist_ok=True)
40
+ os.makedirs(VIDEO_FRAMES_EXTRACT_FOLDER, exist_ok=True)
41
+ os.makedirs(ZIPPED_FRAMES_FOLDER, exist_ok=True)
42
+
43
+ api = HfApi(token=HF_TOKEN)
44
+
45
+ # Global state
46
+ processing_status = {
47
+ "is_running": False,
48
+ "current_file": None,
49
+ "total_files": 0,
50
+ "processed_files": 0,
51
+ "failed_files": 0,
52
+ "uploaded_rar_folders": 0,
53
+ "uploaded_video_courses": 0,
54
+ "last_update": None,
55
+ "logs": []
56
+ }
57
+
58
+ app = FastAPI(title="RAR & Video Processing Service", description="Automated RAR extraction and video frame upload service")
59
+
60
+ # Add CORS middleware
61
+ app.add_middleware(
62
+ CORSMiddleware,
63
+ allow_origins=["*"],
64
+ allow_credentials=True,
65
+ allow_methods=["*"],
66
+ allow_headers=["*"],
67
+ )
68
+
69
+ def log_message(message: str):
70
+ """Add message to logs and print it"""
71
+ timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
72
+ log_entry = f"[{timestamp}] {message}"
73
+ print(log_entry)
74
+ processing_status["logs"].append(log_entry)
75
+ processing_status["last_update"] = timestamp
76
+ # Keep only last 100 log entries
77
+ if len(processing_status["logs"]) > 100:
78
+ processing_status["logs"] = processing_status["logs"][-100:]
79
+
80
+ def log_failed_file(filename: str, error_msg: str):
81
+ """Log failed files to a separate file for later review"""
82
+ with open(FAILED_FILES_LOG, "a") as f:
83
+ f.write(f"{filename}: {error_msg}\n")
84
+ log_message(f"❌ Failed: {filename} - {error_msg}")
85
+
86
+ def get_folder_hash(folder_name: str) -> str:
87
+ """Generate a hash for the folder name to use as a unique identifier"""
88
+ return hashlib.md5(folder_name.encode()).hexdigest()
89
+
90
+ def load_uploaded_folders() -> Set[str]:
91
+ """Load set of uploaded folder hashes for BG2"""
92
+ if os.path.exists(UPLOADED_FOLDERS_FILE):
93
+ try:
94
+ with open(UPLOADED_FOLDERS_FILE, "r") as f:
95
+ data = json.load(f)
96
+ return set(data.get("uploaded_folder_hashes", []))
97
+ except json.JSONDecodeError:
98
+ log_message(f"⚠️ Warning: Could not decode {UPLOADED_FOLDERS_FILE}. Starting with empty set.")
99
+ return set()
100
+ return set()
101
+
102
+ def save_uploaded_folders(uploaded_set: Set[str]):
103
+ """Save set of uploaded folder hashes for BG2"""
104
+ with open(UPLOADED_FOLDERS_FILE, "w") as f:
105
+ json.dump({"uploaded_folder_hashes": list(uploaded_set)}, f)
106
+
107
+ def load_processed_video_courses() -> Set[str]:
108
+ """Loads the set of processed video course folder names for BG3."""
109
+ if os.path.exists(PROCESSED_VIDEO_COURSES_FILE):
110
+ try:
111
+ with open(PROCESSED_VIDEO_COURSES_FILE, "r") as f:
112
+ return set(json.load(f))
113
+ except json.JSONDecodeError:
114
+ log_message(f"⚠️ Warning: Could not decode {PROCESSED_VIDEO_COURSES_FILE}. Starting with empty set.")
115
+ return set()
116
+ return set()
117
+
118
+ def save_processed_video_courses(processed_set: set):
119
+ """Saves the set of processed video course folder names for BG3 to a file."""
120
+ with open(PROCESSED_VIDEO_COURSES_FILE, "w") as f:
121
+ json.dump(list(processed_set), f)
122
+
123
+ def load_download_state() -> int:
124
+ """Load download progress state"""
125
+ if os.path.exists(DOWNLOAD_STATE_FILE):
126
+ try:
127
+ with open(DOWNLOAD_STATE_FILE, "r") as f:
128
+ return json.load(f).get("next_download_index", 0)
129
+ except json.JSONDecodeError:
130
+ log_message(f"⚠️ Warning: Could not decode {DOWNLOAD_STATE_FILE}. Starting download from index 0.")
131
+ return 0
132
+ return 0
133
+
134
+ def save_download_state(next_index: int):
135
+ """Save download progress state"""
136
+ with open(DOWNLOAD_STATE_FILE, "w") as f:
137
+ json.dump({"next_download_index": next_index}, f)
138
+
139
+ def load_processed_files_state() -> set:
140
+ """Load processed files from the state file"""
141
+ if os.path.exists(PROCESS_STATE_FILE):
142
+ try:
143
+ with open(PROCESS_STATE_FILE, "r") as f:
144
+ data = json.load(f)
145
+ return set(data.get("processed_rars", []))
146
+ except json.JSONDecodeError:
147
+ log_message(f"⚠️ Warning: Could not decode {PROCESS_STATE_FILE}. Starting with empty processed list.")
148
+ return set()
149
+ return set()
150
+
151
+ def save_processed_files_state(processed_set: set):
152
+ """Save processed files to the state file"""
153
+ with open(PROCESS_STATE_FILE, "w") as f:
154
+ json.dump({"processed_rars": list(processed_set)}, f)
155
+
156
+ def download_rar_files(start_index: int, chunk_size: int) -> tuple:
157
+ """Downloads a batch of RAR files from the source dataset"""
158
+ try:
159
+ all_files = list_repo_files(repo_id=SOURCE_REPO_ID, repo_type="dataset", token=HF_TOKEN)
160
+ # Filter for .rar files and exclude the specific one
161
+ rar_files_in_repo = sorted([f for f in all_files if f.endswith(".rar") and "ZBrush/3DConceptArtist_TheUltimateZbrushGuide_DownloadPirate.com.rar" not in f])
162
+
163
+ end_index = start_index + chunk_size
164
+ files_to_download_metadata = rar_files_in_repo[start_index:end_index]
165
+
166
+ if not files_to_download_metadata:
167
+ log_message("✅ No more RAR files to download.")
168
+ return [], start_index
169
+
170
+ log_message(f"📥 Downloading RAR files {start_index + 1} to {end_index} from {SOURCE_REPO_ID}")
171
+
172
+ downloaded_paths = []
173
+ for file_path_in_repo in files_to_download_metadata:
174
+ filename = os.path.basename(file_path_in_repo)
175
+ dest_path = os.path.join(DOWNLOAD_FOLDER, filename)
176
+ file_url = f"https://huggingface.co/datasets/{SOURCE_REPO_ID}/resolve/main/{file_path_in_repo}"
177
+ headers = {"Authorization": f"Bearer {HF_TOKEN}"}
178
+
179
+ if os.path.exists(dest_path):
180
+ log_message(f"⏩ Already exists, skipping: {filename}")
181
+ downloaded_paths.append(dest_path)
182
+ continue
183
+
184
+ log_message(f"🔽 Downloading: {file_path_in_repo}")
185
+ try:
186
+ with requests.get(file_url, headers=headers, stream=True) as r:
187
+ r.raise_for_status()
188
+ with open(dest_path, "wb") as f:
189
+ for chunk in r.iter_content(chunk_size=8192):
190
+ f.write(chunk)
191
+ log_message(f"✅ Downloaded: {filename}")
192
+ downloaded_paths.append(dest_path)
193
+ except Exception as e:
194
+ log_message(f"❌ Failed to download {file_path_in_repo}: {e}")
195
+ log_failed_file(file_path_in_repo, f"Download failed: {e}")
196
+
197
+ return downloaded_paths, end_index
198
+ except Exception as e:
199
+ log_message(f"❌ Error in download_rar_files: {e}")
200
+ return [], start_index
201
+
202
+ def extract_frames(video_path: str, output_folder: str, fps: int) -> bool:
203
+ """Extracts frames from a video using ffmpeg."""
204
+ os.makedirs(output_folder, exist_ok=True)
205
+
206
+ # Ensure ffmpeg is available
207
+ if shutil.which("ffmpeg") is None:
208
+ log_message("❌ ffmpeg not found. Please install ffmpeg.")
209
+ return False
210
+
211
+ # ffmpeg command to extract frames at specified FPS
212
+ command = [
213
+ "ffmpeg",
214
+ "-i", video_path,
215
+ "-vf", f"fps={fps}",
216
+ os.path.join(output_folder, "frame_%04d.png")
217
+ ]
218
+
219
+ log_message(f"🖼️ Extracting frames from {os.path.basename(video_path)} to {output_folder} at {fps} FPS...")
220
+ try:
221
+ subprocess.run(command, check=True, capture_output=True, text=True)
222
+ log_message(f"✅ Successfully extracted frames from {os.path.basename(video_path)}")
223
+ return True
224
+ except subprocess.CalledProcessError as e:
225
+ log_message(f"❌ Error extracting frames from {os.path.basename(video_path)}: {e.stderr}")
226
+ return False
227
+
228
+ def zip_folder(folder_path: str, output_zip_path: str) -> bool:
229
+ """Zips the contents of a folder."""
230
+ log_message(f" compressing {folder_path} to {output_zip_path}...")
231
+ try:
232
+ with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
233
+ for root, _, files in os.walk(folder_path):
234
+ for file in files:
235
+ file_path = os.path.join(root, file)
236
+ arcname = os.path.relpath(file_path, folder_path)
237
+ zipf.write(file_path, arcname)
238
+ log_message(f"✅ Successfully zipped {folder_path}")
239
+ return True
240
+ except Exception as e:
241
+ log_message(f"❌ Error zipping {folder_path}: {e}")
242
+ return False
243
+
244
+ def upload_file_to_hf(local_path: str, path_in_repo: str, repo_id: str, max_retries: int = 5, initial_delay: int = 5) -> bool:
245
+ """Uploads a single file to Hugging Face Hub with retry logic and exponential backoff."""
246
+ log_message(f"⬆️ Uploading {os.path.basename(local_path)} to {repo_id}/{path_in_repo}")
247
+
248
+ for attempt in range(max_retries):
249
+ try:
250
+ api.upload_file(
251
+ path_or_fileobj=local_path,
252
+ path_in_repo=path_in_repo,
253
+ repo_id=repo_id,
254
+ repo_type="dataset"
255
+ )
256
+ log_message(f"✅ Uploaded: {os.path.basename(local_path)}")
257
+ return True
258
+ except HfHubHTTPError as e:
259
+ if e.response.status_code == 429 and attempt < max_retries - 1: # Too Many Requests
260
+ delay = initial_delay * (2 ** attempt)
261
+ log_message(f"⚠️ Rate limit hit. Retrying in {delay} seconds... (Attempt {attempt + 1}/{max_retries})")
262
+ time.sleep(delay)
263
+ else:
264
+ log_message(f"❌ Hugging Face Hub error uploading {os.path.basename(local_path)}: {e}")
265
+ return False
266
+ except Exception as e:
267
+ log_message(f"❌ Error uploading {os.path.basename(local_path)}: {e}")
268
+ return False
269
+ log_message(f"❌ Failed to upload {os.path.basename(local_path)} after {max_retries} attempts.")
270
+ return False
271
+
272
+ def process_video_frames_and_upload(extracted_rar_folder: str, processed_video_courses_set: Set[str]) -> bool:
273
+ """Scans an extracted RAR folder for MP4s, extracts frames, zips, and uploads to BG3."""
274
+ video_processed_successfully = False
275
+
276
+ # Use the top-level folder name of the extracted RAR as the course folder name
277
+ course_folder_name = os.path.basename(extracted_rar_folder)
278
+
279
+ if course_folder_name in processed_video_courses_set:
280
+ log_message(f"⏩ Video frames for course \'{course_folder_name}\' already processed. Skipping.")
281
+ return True
282
+
283
+ log_message(f"🎬 Processing videos in extracted RAR folder: {course_folder_name}")
284
+
285
+ mp4_files_found = []
286
+ for root, _, files in os.walk(extracted_rar_folder):
287
+ for file in files:
288
+ if file.lower().endswith(".mp4"):
289
+ mp4_files_found.append(os.path.join(root, file))
290
+
291
+ if not mp4_files_found:
292
+ log_message(f"⚠️ No MP4 files found in {course_folder_name}. Skipping video frame extraction.")
293
+ return False # Indicate no video processing was done
294
+
295
+ course_video_extract_dir = os.path.join(VIDEO_FRAMES_EXTRACT_FOLDER, course_folder_name)
296
+ os.makedirs(course_video_extract_dir, exist_ok=True)
297
+
298
+ for mp4_path in mp4_files_found:
299
+ video_basename = os.path.splitext(os.path.basename(mp4_path))[0]
300
+ # Create a unique output folder for frames from this video within the course's frame directory
301
+ video_output_folder = os.path.join(course_video_extract_dir, video_basename)
302
+
303
+ if not extract_frames(mp4_path, video_output_folder, VIDEO_FRAME_FPS):
304
+ log_message(f"❌ Failed to extract frames from {os.path.basename(mp4_path)}. Continuing with other videos.")
305
+ # Clean up partially extracted frames for this video
306
+ if os.path.exists(video_output_folder):
307
+ shutil.rmtree(video_output_folder)
308
+
309
+ # Check if any frames were extracted for the entire course folder
310
+ if not os.listdir(course_video_extract_dir):
311
+ log_message(f"⚠️ No frames extracted for any video in {course_folder_name}. Skipping zipping and upload to BG3.")
312
+ shutil.rmtree(course_video_extract_dir)
313
+ return False
314
+
315
+ course_zip_path = os.path.join(ZIPPED_FRAMES_FOLDER, f"{course_folder_name}_frames.zip")
316
+ if zip_folder(course_video_extract_dir, course_zip_path):
317
+ path_in_repo = f"{course_folder_name}_frames.zip"
318
+ if upload_file_to_hf(course_zip_path, path_in_repo, DEST_REPO_ID_VIDEO):
319
+ log_message(f"✅ Successfully processed video frames and uploaded {course_folder_name}_frames.zip to BG3")
320
+ processed_video_courses_set.add(course_folder_name) # Mark as processed
321
+ save_processed_video_courses(processed_video_courses_set) # Save state
322
+ video_processed_successfully = True
323
+ else:
324
+ log_message(f"❌ Failed to upload zipped video frames for {course_folder_name} to BG3")
325
+ else:
326
+ log_message(f"❌ Failed to zip video frames for {course_folder_name}")
327
+
328
+ # Cleanup local video frame files
329
+ log_message(f"🧹 Cleaning up local video frame files for {course_folder_name}")
330
+ if os.path.exists(course_video_extract_dir):
331
+ shutil.rmtree(course_video_extract_dir)
332
+ if os.path.exists(course_zip_path):
333
+ os.remove(course_zip_path)
334
+
335
+ return video_processed_successfully
336
+
337
+ def extract_and_upload_rar(rar_path: str, processed_rars_set: set, uploaded_folders_set: Set[str], processed_video_courses_set: Set[str]) -> bool:
338
+ """Extracts a single RAR file, uploads its contents to BG2 (if DEST_REPO_ID_RAR is set), and then processes videos for BG3"""
339
+ filename = os.path.basename(rar_path)
340
+ processing_status["current_file"] = filename
341
+
342
+ folder_name = filename.replace(".rar", "")
343
+ current_extract_folder = os.path.join(EXTRACT_FOLDER, f"{folder_name}_extracted")
344
+
345
+ # Check if RAR is already processed (uploaded to BG2 or video frames processed)
346
+ if filename in processed_rars_set and (not DEST_REPO_ID_RAR or get_folder_hash(folder_name) in uploaded_folders_set) and folder_name in processed_video_courses_set:
347
+ log_message(f"⏩ {filename} already fully processed, skipping.")
348
+ return True
349
+
350
+ # Generate folder name and hash for tracking
351
+ folder_hash = get_folder_hash(folder_name)
352
+
353
+ # If BG2 upload is enabled and folder already uploaded to BG2, skip RAR extraction/upload to BG2
354
+ if DEST_REPO_ID_RAR and folder_hash in uploaded_folders_set:
355
+ log_message(f"🔒 Folder \'{folder_name}\' already uploaded to BG2 (hash: {folder_hash[:8]}...), skipping RAR upload.")
356
+ processed_rars_set.add(filename)
357
+ save_processed_files_state(processed_rars_set)
358
+ # Proceed to video processing if not already done
359
+ if not os.path.exists(current_extract_folder): # If extracted folder doesn't exist, we can't process videos
360
+ log_message(f"⚠️ Extracted folder {current_extract_folder} not found for video processing. Skipping.")
361
+ return False
362
+ process_video_frames_and_upload(current_extract_folder, processed_video_courses_set)
363
+ return True
364
+
365
+ log_message(f"📦 Attempting to extract: {filename}")
366
+ os.makedirs(current_extract_folder, exist_ok=True)
367
+
368
+ try:
369
+ if shutil.which("unrar") is None:
370
+ raise RuntimeError("unrar command not found. Please install unrar.")
371
+
372
+ # Use -idq to suppress query messages and -o+ to overwrite without prompting
373
+ unrar_command = ["unrar", "x", "-o+", rar_path, current_extract_folder]
374
+ log_message(f"Running command: {' '.join(unrar_command)}")
375
+
376
+ result = subprocess.run(
377
+ unrar_command,
378
+ check=True,
379
+ capture_output=True,
380
+ text=True,
381
+ encoding='utf-8'
382
+ )
383
+
384
+ extracted_contents = os.listdir(current_extract_folder)
385
+ if not extracted_contents:
386
+ raise Exception("Extraction completed but no files were produced in the target directory.")
387
+ log_message(f"Successfully extracted {len(extracted_contents)} items")
388
+
389
+ # Upload extracted files to BG2 if DEST_REPO_ID_RAR is set
390
+ if DEST_REPO_ID_RAR:
391
+ upload_count = 0
392
+ for root, _, files in os.walk(current_extract_folder):
393
+ for file in files:
394
+ local_path = os.path.join(root, file)
395
+ # Construct path in repo relative to the extracted content's root
396
+ path_in_repo = os.path.join(folder_name, os.path.relpath(local_path, current_extract_folder))
397
+ log_message(f"⬆️ Uploading to BG2: {path_in_repo}")
398
+
399
+ try:
400
+ upload_file_to_hf(
401
+ local_path=local_path,
402
+ path_in_repo=path_in_repo,
403
+ repo_id=DEST_REPO_ID_RAR
404
+ )
405
+ upload_count += 1
406
+ except Exception as upload_error:
407
+ log_message(f"❌ Failed to upload {path_in_repo} to BG2: {upload_error}")
408
+ raise upload_error
409
+
410
+ log_message(f"✅ Successfully uploaded {upload_count} files from {filename} to BG2")
411
+
412
+ # Mark folder as uploaded to BG2 using hash
413
+ uploaded_folders_set.add(folder_hash)
414
+ save_uploaded_folders(uploaded_folders_set)
415
+ processing_status["uploaded_rar_folders"] = len(uploaded_folders_set)
416
+
417
+ log_message(f"🔒 Folder \'{folder_name}\' locked in BG2 repo (hash: {folder_hash[:8]}...)")
418
+ else:
419
+ log_message("Skipping upload to BG2 as DEST_REPO_ID_RAR is not set.")
420
+
421
+ # Now process video frames from the extracted content and upload to BG3
422
+ process_video_frames_and_upload(current_extract_folder, processed_video_courses_set)
423
+
424
+ return True
425
+
426
+ except subprocess.CalledProcessError as e:
427
+ error_msg = f"RAR extraction failed (exit {e.returncode}): {e.stderr.strip()}"
428
+ log_failed_file(filename, error_msg)
429
+ processing_status["failed_files"] += 1
430
+ return False
431
+ except Exception as e:
432
+ error_msg = f"Unexpected error during processing {filename}: {str(e)}"
433
+ log_failed_file(filename, error_msg)
434
+ processing_status["failed_files"] += 1
435
+ return False
436
+ finally:
437
+ # Always cleanup the extraction folder after processing (success or failure)
438
+ if os.path.exists(current_extract_folder):
439
+ log_message(f"🧹 Cleaning up extracted RAR files in {current_extract_folder}")
440
+ try:
441
+ shutil.rmtree(current_extract_folder)
442
+ log_message(f"✅ Cleaned up RAR extraction folder")
443
+ except Exception as e:
444
+ log_message(f"⚠️ Could not clean up RAR extraction folder {current_extract_folder}: {e}")
445
+
446
+ def continuous_processing(start_download_index: Optional[int] = None):
447
+ """Main processing loop that runs continuously, with an optional starting download index"""
448
+ processing_status["is_running"] = True
449
+ log_message("🚀 Starting continuous RAR and Video processing...")
450
+
451
+ try:
452
+ # Load uploaded folders tracking for BG2
453
+ uploaded_folders = load_uploaded_folders()
454
+ processing_status["uploaded_rar_folders"] = len(uploaded_folders)
455
+
456
+ # Load processed video courses tracking for BG3
457
+ processed_video_courses = load_processed_video_courses()
458
+ processing_status["uploaded_video_courses"] = len(processed_video_courses)
459
+
460
+ if start_download_index is not None:
461
+ log_message(f"Starting download from index: {start_download_index}")
462
+ save_download_state(start_download_index) # Set download state to start from this index
463
+ else:
464
+ log_message("Starting download from saved state or beginning.")
465
+
466
+ while processing_status["is_running"]:
467
+ # 1. Download a batch of RAR files
468
+ download_start_index = load_download_state()
469
+ downloaded_rar_paths, next_download_index = download_rar_files(download_start_index, CHUNK_SIZE)
470
+ save_download_state(next_download_index)
471
+
472
+ if not downloaded_rar_paths:
473
+ # Check if there are any local RAR files to process
474
+ all_local_rars = sorted([os.path.join(DOWNLOAD_FOLDER, f) for f in os.listdir(DOWNLOAD_FOLDER) if f.endswith(".rar")])
475
+ processed_rars = load_processed_files_state()
476
+ unprocessed_rars = [rar for rar in all_local_rars if os.path.basename(rar) not in processed_rars]
477
+
478
+ if not unprocessed_rars:
479
+ log_message("✅ No more RAR files to download. Stopping...")
480
+ break
481
+ else:
482
+ log_message(f"📋 Found {len(unprocessed_rars)} unprocessed local RAR files")
483
+
484
+ # 2. Process all available RAR files (downloaded + existing)
485
+ all_local_rars = sorted([os.path.join(DOWNLOAD_FOLDER, f) for f in os.listdir(DOWNLOAD_FOLDER) if f.endswith(".rar")])
486
+ processed_rars = load_processed_files_state()
487
+ processing_status["total_files"] = len(all_local_rars)
488
+ processing_status["processed_files"] = len(processed_rars)
489
+
490
+ for rar_file_path in all_local_rars:
491
+ if not processing_status["is_running"]:
492
+ break
493
+
494
+ filename = os.path.basename(rar_file_path)
495
+ if filename not in processed_rars:
496
+ success = extract_and_upload_rar(rar_file_path, processed_rars, uploaded_folders, processed_video_courses)
497
+ if success:
498
+ processed_rars.add(filename)
499
+ save_processed_files_state(processed_rars)
500
+ processing_status["processed_files"] += 1
501
+
502
+ # Delete the RAR file after successful processing
503
+ log_message(f"🗑️ Deleting processed RAR: {filename}")
504
+ try:
505
+ os.remove(rar_file_path)
506
+ log_message(f"✅ Deleted RAR file: {filename}")
507
+ except Exception as e:
508
+ log_message(f"⚠️ Could not delete {rar_file_path}: {e}")
509
+
510
+ # Add delay between processing files
511
+ time.sleep(PROCESSING_DELAY)
512
+
513
+ # If no new files were downloaded and all local files are processed, we're done
514
+ if not downloaded_rar_paths:
515
+ break
516
+
517
+ except Exception as e:
518
+ log_message(f"❌ Error in continuous processing: {e}")
519
+ finally:
520
+ processing_status["is_running"] = False
521
+ processing_status["current_file"] = None
522
+ log_message("🏁 Processing stopped")
523
+
524
+ @app.get("/", response_class=HTMLResponse)
525
+ async def root():
526
+ """Serve the main HTML interface"""
527
+ html_content = """
528
+ <!DOCTYPE html>
529
+ <html>
530
+ <head>
531
+ <title>RAR & Video Processing Service</title>
532
+ <meta charset="utf-8">
533
+ <meta name="viewport" content="width=device-width, initial-scale=1">
534
+ <style>
535
+ body { font-family: Arial, sans-serif; margin: 20px; background-color: #f5f5f5; }
536
+ .container { max-width: 1200px; margin: 0 auto; background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
537
+ .status-card { background: #e3f2fd; padding: 15px; border-radius: 5px; margin: 10px 0; }
538
+ .logs { background: #f5f5f5; padding: 15px; border-radius: 5px; height: 400px; overflow-y: auto; font-family: monospace; font-size: 12px; }
539
+ .button { background: #2196F3; color: white; padding: 10px 20px; border: none; border-radius: 5px; cursor: pointer; margin: 5px; }
540
+ .button:hover { background: #1976D2; }
541
+ .button:disabled { background: #ccc; cursor: not-allowed; }
542
+ .stop-button { background: #f44336; }
543
+ .stop-button:hover { background: #d32f2f; }
544
+ .stats { display: flex; gap: 20px; margin: 20px 0; }
545
+ .stat-item { background: #f0f0f0; padding: 10px; border-radius: 5px; text-align: center; flex: 1; }
546
+ .start-form { margin-top: 20px; padding: 15px; border: 1px solid #ddd; border-radius: 5px; background: #f9f9f9; }
547
+ .start-form input[type="number"] { width: calc(100% - 120px); padding: 8px; margin-right: 10px; border: 1px solid #ccc; border-radius: 4px; }
548
+ .start-form button { padding: 8px 15px; background: #4CAF50; color: white; border: none; border-radius: 4px; cursor: pointer; }
549
+ .start-form button:hover { background: #45a049; }
550
+ </style>
551
+ </head>
552
+ <body>
553
+ <div class="container">
554
+ <h1>🔄 RAR & Video Processing Service</h1>
555
+ <p>Automated extraction and upload of RAR files from BG1 to BG2 dataset, and video frame extraction/upload to BG3 dataset</p>
556
+
557
+ <div class="status-card">
558
+ <h3>Status: <span id="status">Stopped</span></h3>
559
+ <p>Current File: <span id="current-file">None</span></p>
560
+ <p>Last Update: <span id="last-update">Never</span></p>
561
+ </div>
562
+
563
+ <div class="stats">
564
+ <div class="stat-item">
565
+ <h4>Total Files (RARs)</h4>
566
+ <span id="total-files">0</span>
567
+ </div>
568
+ <div class="stat-item">
569
+ <h4>Processed (RARs)</h4>
570
+ <span id="processed-files">0</span>
571
+ </div>
572
+ <div class="stat-item">
573
+ <h4>Uploaded Folders (BG2)</h4>
574
+ <span id="uploaded-rar-folders">0</span>
575
+ </div>
576
+ <div class="stat-item">
577
+ <h4>Uploaded Video Courses (BG3)</h4>
578
+ <span id="uploaded-video-courses">0</span>
579
+ </div>
580
+ <div class="stat-item">
581
+ <h4>Failed</h4>
582
+ <span id="failed-files">0</span>
583
+ </div>
584
+ </div>
585
+
586
+ <div class="start-form">
587
+ <h3>Start Processing from Specific Download Index</h3>
588
+ <input type="number" id="start-index-input" placeholder="Enter start index (e.g., 0)" value="0">
589
+ <button onclick="startProcessingWithIndex()">Start from Index</button>
590
+ </div>
591
+
592
+ <div>
593
+ <button class="button" onclick="startProcessing()" id="start-btn">Start Processing (from last saved index)</button>
594
+ <button class="button stop-button" onclick="stopProcessing()" id="stop-btn" disabled>Stop Processing</button>
595
+ <button class="button" onclick="refreshStatus()">Refresh Status</button>
596
+ </div>
597
+
598
+ <h3>Logs</h3>
599
+ <div class="logs" id="logs">Loading...</div>
600
+ </div>
601
+
602
+ <script>
603
+ async function startProcessing() {
604
+ try {
605
+ const response = await fetch(\"/start\", { method: \"POST\" });
606
+ const result = await response.json();
607
+ alert(result.message);
608
+ refreshStatus();
609
+ } catch (error) {
610
+ alert(\"Error starting processing: \" + error.message);
611
+ }
612
+ }
613
+
614
+ async function startProcessingWithIndex() {
615
+ const index = document.getElementById(\"start-index-input\").value;
616
+ if (index === "" || isNaN(index)) {
617
+ alert(\"Please enter a valid number for the start index.\");
618
+ return;
619
+ }
620
+ try {
621
+ const response = await fetch(\"/start_from_index\", {
622
+ method: \"POST\",
623
+ headers: { \"Content-Type\": \"application/x-www-form-urlencoded\" },
624
+ body: `start_index=${parseInt(index)}`
625
+ });
626
+ const result = await response.json();
627
+ alert(result.message);
628
+ refreshStatus();
629
+ } catch (error) {
630
+ alert(\"Error starting processing from index: \" + error.message);
631
+ }
632
+ }
633
+
634
+ async function stopProcessing() {
635
+ try {
636
+ const response = await fetch(\"/stop\", { method: \"POST\" });
637
+ const result = await response.json();
638
+ alert(result.message);
639
+ refreshStatus();
640
+ } catch (error) {
641
+ alert(\"Error stopping processing: \" + error.message);
642
+ }
643
+ }
644
+
645
+ async function refreshStatus() {
646
+ try {
647
+ const response = await fetch(\"/status\");
648
+ const status = await response.json();
649
+
650
+ document.getElementById(\"status\").textContent = status.is_running ? \"Running\" : \"Stopped\";
651
+ document.getElementById(\"current-file\").textContent = status.current_file || \"None\";
652
+ document.getElementById(\"last-update\").textContent = status.last_update || \"Never\";
653
+ document.getElementById(\"total-files\").textContent = status.total_files;
654
+ document.getElementById(\"processed-files\").textContent = status.processed_files;
655
+ document.getElementById(\"uploaded-rar-folders\").textContent = status.uploaded_rar_folders;
656
+ document.getElementById(\"uploaded-video-courses\").textContent = status.uploaded_video_courses;
657
+ document.getElementById(\"failed-files\").textContent = status.failed_files;
658
+
659
+ document.getElementById(\"start-btn\").disabled = status.is_running;
660
+ document.getElementById(\"stop-btn\").disabled = !status.is_running;
661
+
662
+ const logsDiv = document.getElementById(\"logs\");
663
+ logsDiv.innerHTML = status.logs.join(\"<br>\");
664
+ logsDiv.scrollTop = logsDiv.scrollHeight;
665
+ } catch (error) {
666
+ console.error(\"Error refreshing status:\", error);
667
+ }
668
+ }
669
+
670
+ // Auto-refresh every 5 seconds
671
+ setInterval(refreshStatus, 5000);
672
+
673
+ // Initial load
674
+ refreshStatus();
675
+ </script>
676
+ </body>
677
+ </html>
678
+ """
679
+ return HTMLResponse(content=html_content)
680
+
681
+ @app.get("/status")
682
+ async def get_status():
683
+ """Get current processing status"""
684
+ return JSONResponse(content=processing_status)
685
+
686
+ @app.post("/start")
687
+ async def start_processing(background_tasks: BackgroundTasks):
688
+ """Start the processing in background"""
689
+ if processing_status["is_running"]:
690
+ return {"message": "Processing is already running"}
691
+
692
+ background_tasks.add_task(continuous_processing)
693
+ return {"message": "Processing started"}
694
+
695
+ @app.post("/start_from_index")
696
+ async def start_processing_from_index(background_tasks: BackgroundTasks, start_index: int = Form(...)):
697
+ """Start the processing from a specific download index in background"""
698
+ if processing_status["is_running"]:
699
+ return {"message": "Processing is already running"}
700
+
701
+ if start_index < 0:
702
+ return {"message": "Start index cannot be negative."}
703
+
704
+ background_tasks.add_task(continuous_processing, start_download_index=start_index)
705
+ return {"message": f"Processing started from download index: {start_index}"}
706
+
707
+ @app.post("/stop")
708
+ async def stop_processing():
709
+ """Stop the processing"""
710
+ if not processing_status["is_running"]:
711
+ return {"message": "Processing is not running"}
712
+
713
+ processing_status["is_running"] = False
714
+ return {"message": "Processing stop requested"}
715
+
716
+ @app.get("/logs")
717
+ async def get_logs():
718
+ """Get processing logs"""
719
+ return {"logs": processing_status["logs"]}
720
+
721
+ @app.get("/uploaded-folders")
722
+ async def get_uploaded_folders():
723
+ """Get list of uploaded folder hashes for BG2"""
724
+ uploaded_folders = load_uploaded_folders()
725
+ return {"uploaded_folder_count": len(uploaded_folders), "folder_hashes": list(uploaded_folders)}
726
+
727
+ @app.get("/processed-video-courses")
728
+ async def get_processed_video_courses():
729
+ """Get list of processed video course folder names for BG3"""
730
+ processed_video_courses = load_processed_video_courses()
731
+ return {"processed_video_course_count": len(processed_video_courses), "course_names": list(processed_video_courses)}
732
+
733
+ if __name__ == "__main__":
734
+ uvicorn.run(app, host="0.0.0.0", port=7860)
735
+
736
+