Fred808 committed on
Commit
53d888f
Β·
verified Β·
1 Parent(s): bd8bf7b

Update vision_analyzer.py

Browse files
Files changed (1) hide show
  1. vision_analyzer.py +411 -380
vision_analyzer.py CHANGED
@@ -1,407 +1,438 @@
1
  import os
2
  import json
 
 
 
3
  import time
 
4
  import threading
5
- import asyncio
6
- from fastapi import FastAPI, HTTPException, BackgroundTasks
7
- from fastapi.middleware.cors import CORSMiddleware
8
- from fastapi.responses import JSONResponse, FileResponse
9
- from fastapi.staticfiles import StaticFiles
10
- import uvicorn
11
- from typing import Dict
12
  from pathlib import Path
13
- import subprocess
14
- from datetime import datetime
15
-
16
- import torch
17
-
18
- # Import from vision_analyzer (previously cursor_tracker)
19
- from vision_analyzer import (
20
- main_processing_loop,
21
- processing_status,
22
- log_message,
23
- FRAMES_OUTPUT_FOLDER # Add this import for frames directory
24
- )
25
-
26
- # FastAPI App Definition
27
- app = FastAPI(title="Video Analysis API",
28
- description="API to access video frame analysis results and extracted images",
29
- version="1.0.0")
30
-
31
- # Add CORS middleware to allow cross-origin requests
32
- app.add_middleware(
33
- CORSMiddleware,
34
- allow_origins=["*"], # Allows all origins
35
- allow_credentials=True,
36
- allow_methods=["*"], # Allows all methods
37
- allow_headers=["*"],
38
- )
39
-
40
- # Global variables for processing and frame tracking
41
- processing_thread = None
42
- frame_locks = {} # Dict to track frame locks: {course: {frame: {"locked_by": id, "locked_at": timestamp}}}
43
- processed_frames = {} # Dict to track processed frames: {course: {frame: {"processed_by": id, "processed_at": timestamp}}}
44
- LOCK_TIMEOUT = 300 # 5 minutes timeout for locks
45
- TRACKING_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "frame_tracking.json")
46
-
47
- def save_tracking_state():
48
- """Save frame tracking state to disk"""
49
- state = {
50
- "frame_locks": frame_locks,
51
- "processed_frames": processed_frames
52
- }
53
- try:
54
- with open(TRACKING_FILE, "w") as f:
55
- json.dump(state, f, indent=2)
56
- except Exception as e:
57
- log_message(f"Error saving tracking state: {e}")
58
 
59
- def load_tracking_state():
60
- """Load frame tracking state from disk"""
61
- global frame_locks, processed_frames
62
- try:
63
- with open(TRACKING_FILE, "r") as f:
64
- state = json.load(f)
65
- frame_locks = state.get("frame_locks", {})
66
- processed_frames = state.get("processed_frames", {})
67
- except FileNotFoundError:
68
- log_message("No previous tracking state found")
69
- except Exception as e:
70
- log_message(f"Error loading tracking state: {e}")
71
 
72
- def check_frame_lock(course: str, frame: str) -> bool:
73
- """Check if frame is locked and lock hasn't expired"""
74
- if course in frame_locks and frame in frame_locks[course]:
75
- lock = frame_locks[course][frame]
76
- if time.time() - lock["locked_at"] < LOCK_TIMEOUT:
77
- return True
78
- # Lock expired, remove it
79
- del frame_locks[course][frame]
80
- save_tracking_state()
81
- return False
82
 
83
- def lock_frame(course: str, frame: str, requester_id: str) -> bool:
84
- """Attempt to lock a frame for processing"""
85
- if check_frame_lock(course, frame):
86
- return False
87
-
88
- if course not in frame_locks:
89
- frame_locks[course] = {}
90
-
91
- frame_locks[course][frame] = {
92
- "locked_by": requester_id,
93
- "locked_at": time.time()
94
- }
95
- save_tracking_state()
96
- return True
97
 
98
- def mark_frame_processed(course: str, frame: str, requester_id: str):
99
- """Mark a frame as successfully processed"""
100
- if course not in processed_frames:
101
- processed_frames[course] = {}
102
-
103
- processed_frames[course][frame] = {
104
- "processed_by": requester_id,
105
- "processed_at": time.time()
106
- }
107
-
108
- # Remove the lock if it exists
109
- if course in frame_locks and frame in frame_locks[course]:
110
- del frame_locks[course][frame]
111
-
112
- save_tracking_state()
113
 
114
- def log_message(message):
115
- """Add a log message with timestamp"""
116
- timestamp = datetime.now().strftime("%H:%M:%S")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  log_entry = f"[{timestamp}] {message}"
 
118
  processing_status["logs"].append(log_entry)
119
-
120
- # Keep only the last 100 logs
121
  if len(processing_status["logs"]) > 100:
122
  processing_status["logs"] = processing_status["logs"][-100:]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
 
124
- print(log_entry)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
- @app.on_event("startup")
127
- async def startup_event():
128
- """Initialize frame tracking and start processing loop"""
129
- # Load frame tracking state
130
- load_tracking_state()
131
- log_message("βœ“ Loaded frame tracking state")
 
 
 
 
 
 
132
 
133
- # Start processing thread
134
- global processing_thread
135
- if not (processing_thread and processing_thread.is_alive()):
136
- log_message("πŸš€ Starting RAR extraction, frame extraction, and vision analysis pipeline in background...")
137
- processing_thread = threading.Thread(target=main_processing_loop)
138
- processing_thread.daemon = True
139
- processing_thread.start()
140
-
141
- @app.get("/")
142
- async def root():
143
- """Root endpoint that returns basic info"""
144
- return {
145
- "message": "Video Analysis API",
146
- "status": "running",
147
- "endpoints": {
148
- "/status": "Get processing status",
149
- "/courses": "List all available course folders",
150
- "/images/{course_folder}": "List images in a course folder",
151
- "/images/{course_folder}/{frame_filename}": "Get specific frame image",
152
- "/start-processing": "Start processing pipeline",
153
- "/stop-processing": "Stop processing pipeline"
154
- }
155
- }
156
-
157
- @app.get("/status")
158
- async def get_status():
159
- """Get current processing status"""
160
- return {
161
- "processing_status": processing_status,
162
- "frames_folder": FRAMES_OUTPUT_FOLDER,
163
- "frames_folder_exists": os.path.exists(FRAMES_OUTPUT_FOLDER)
164
- }
165
-
166
- # ===== NEW IMAGE SERVING ENDPOINTS =====
167
-
168
- @app.get("/middleware/next/course")
169
- async def get_next_course(requester_id: str):
170
- """Get next available course for processing"""
171
- if not os.path.exists(FRAMES_OUTPUT_FOLDER):
172
- raise HTTPException(status_code=404, detail="No courses available")
173
 
174
- # Load latest state
175
- load_tracking_state()
 
 
 
 
 
 
 
 
 
 
176
 
177
- # Find a course with unprocessed frames
178
- for folder in os.listdir(FRAMES_OUTPUT_FOLDER):
179
- folder_path = os.path.join(FRAMES_OUTPUT_FOLDER, folder)
180
- if not os.path.isdir(folder_path):
181
- continue
 
 
 
 
 
182
 
183
- # Check if course has any unprocessed frames
184
- image_files = [f for f in os.listdir(folder_path)
185
- if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
186
-
187
- for image in image_files:
188
- if (folder not in processed_frames or
189
- image not in processed_frames[folder]):
190
- return {"course": folder}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
- raise HTTPException(status_code=404, detail="No courses with unprocessed frames")
 
 
 
 
193
 
194
- @app.get("/middleware/next/image/{course_folder}")
195
- async def get_next_image(course_folder: str, requester_id: str):
196
- """Get next available image from a course"""
197
- folder_path = os.path.join(FRAMES_OUTPUT_FOLDER, course_folder)
198
-
199
- if not os.path.exists(folder_path):
200
- raise HTTPException(status_code=404, detail=f"Course not found: {course_folder}")
201
-
202
- # Load latest state
203
- load_tracking_state()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
- # Find first unprocessed and unlocked frame
206
- for file in sorted(os.listdir(folder_path)):
207
- if not file.lower().endswith(('.png', '.jpg', '.jpeg')):
208
- continue
 
 
 
 
 
 
 
 
 
 
 
209
 
210
- # Skip if processed
211
- if (course_folder in processed_frames and
212
- file in processed_frames[course_folder]):
213
- continue
214
 
215
- # Skip if locked by another requester
216
- if check_frame_lock(course_folder, file):
217
- continue
 
 
 
 
 
 
 
 
 
218
 
219
- # Try to lock the frame
220
- if lock_frame(course_folder, file, requester_id):
221
- file_path = os.path.join(folder_path, file)
222
- file_stats = os.stat(file_path)
223
- return {
224
- "file_id": f"frame:{course_folder}/{file}",
225
- "frame": file,
226
- "video": os.path.splitext(file)[0],
227
- "size_bytes": file_stats.st_size,
228
- "modified_time": time.ctime(file_stats.st_mtime),
229
- "url": f"/images/{course_folder}/{file}"
230
- }
231
-
232
- raise HTTPException(status_code=404, detail="No available frames in course")
233
-
234
- @app.post("/middleware/release/frame/{course_folder}/{video}/{frame}")
235
- async def release_frame(course_folder: str, video: str, frame: str, requester_id: str):
236
- """Release a frame lock"""
237
- if course_folder in frame_locks and frame in frame_locks[course_folder]:
238
- lock = frame_locks[course_folder][frame]
239
- if lock["locked_by"] == requester_id:
240
- del frame_locks[course_folder][frame]
241
- save_tracking_state()
242
- return {"status": "released"}
243
- return {"status": "not_found"}
244
-
245
- @app.post("/middleware/release/course/{course_folder}")
246
- async def release_course(course_folder: str, requester_id: str):
247
- """Release all frame locks for a course"""
248
- if course_folder in frame_locks:
249
- # Only release frames locked by this requester
250
- frames_to_release = [
251
- frame for frame, lock in frame_locks[course_folder].items()
252
- if lock["locked_by"] == requester_id
253
- ]
254
- for frame in frames_to_release:
255
- del frame_locks[course_folder][frame]
256
- save_tracking_state()
257
- return {"status": "released"}
258
-
259
- @app.get("/images/{course_folder}/{frame_filename}")
260
- async def get_frame_image(course_folder: str, frame_filename: str, requester_id: str = None):
261
- """
262
- Serve extracted frame images from course folders with locking
263
-
264
- Args:
265
- course_folder: The course folder name (e.g., "course1_video1_mp4_frames")
266
- frame_filename: The frame file name (e.g., "0001.png")
267
- requester_id: Optional requester ID for frame locking
268
- """
269
- # Load latest state
270
- load_tracking_state()
271
-
272
- # Construct the full path to the image
273
- image_path = os.path.join(FRAMES_OUTPUT_FOLDER, course_folder, frame_filename)
274
-
275
- # Check if file exists
276
- if not os.path.exists(image_path):
277
- raise HTTPException(status_code=404, detail=f"Image not found: {course_folder}/{frame_filename}")
278
-
279
- # Verify it's an image file
280
- if not frame_filename.lower().endswith(('.png', '.jpg', '.jpeg')):
281
- raise HTTPException(status_code=400, detail="File must be an image (PNG, JPG, JPEG)")
282
-
283
- # If requester_id provided, verify frame lock
284
- if requester_id:
285
- if check_frame_lock(course_folder, frame_filename):
286
- lock = frame_locks[course_folder][frame_filename]
287
- if lock["locked_by"] != requester_id:
288
- raise HTTPException(status_code=423, detail="Frame is locked by another requester")
289
-
290
- # Return the image file
291
- return FileResponse(image_path)
292
-
293
- @app.get("/images/{course_folder}")
294
- async def list_course_images(course_folder: str):
295
- """
296
- List all available images in a specific course folder
297
-
298
- Args:
299
- course_folder: The course folder name
300
- """
301
- folder_path = os.path.join(FRAMES_OUTPUT_FOLDER, course_folder)
302
-
303
- if not os.path.exists(folder_path):
304
- raise HTTPException(status_code=404, detail=f"Course folder not found: {course_folder}")
305
-
306
- # Get all image files
307
- image_files = []
308
- for file in os.listdir(folder_path):
309
- if file.lower().endswith(('.png', '.jpg', '.jpeg')):
310
- file_path = os.path.join(folder_path, file)
311
- file_stats = os.stat(file_path)
312
- image_files.append({
313
- "filename": file,
314
- "size_bytes": file_stats.st_size,
315
- "modified_time": time.ctime(file_stats.st_mtime),
316
- "url": f"/images/{course_folder}/{file}"
317
- })
318
-
319
- return {
320
- "course_folder": course_folder,
321
- "total_images": len(image_files),
322
- "images": image_files
323
- }
324
-
325
- @app.get("/courses")
326
- async def list_all_courses():
327
- """
328
- List all available course folders with their image counts
329
- """
330
- if not os.path.exists(FRAMES_OUTPUT_FOLDER):
331
- return {"courses": [], "message": "Frames output folder does not exist yet"}
332
-
333
- courses = []
334
- for folder in os.listdir(FRAMES_OUTPUT_FOLDER):
335
- folder_path = os.path.join(FRAMES_OUTPUT_FOLDER, folder)
336
- if os.path.isdir(folder_path):
337
- # Count image files
338
- image_count = len([f for f in os.listdir(folder_path)
339
- if f.lower().endswith(('.png', '.jpg', '.jpeg'))])
340
- courses.append({
341
- "course_folder": folder,
342
- "image_count": image_count,
343
- "images_url": f"/images/{folder}",
344
- "sample_image_url": f"/images/{folder}/0001.png" if image_count > 0 else None
345
- })
346
-
347
- return {
348
- "total_courses": len(courses),
349
- "courses": courses
350
- }
351
-
352
-
353
- # Signal handlers to prevent accidental shutdown
354
- def handle_shutdown(signum, frame):
355
- """Prevent shutdown on SIGTERM/SIGINT"""
356
- print(f"\n⚠️ Received signal {signum}. Server will continue running.")
357
- print("Use Ctrl+Break or kill -9 to force stop.")
358
-
359
- # Setup signal handlers for graceful shutdown prevention
360
- import signal
361
- signal.signal(signal.SIGINT, handle_shutdown)
362
- signal.signal(signal.SIGTERM, handle_shutdown)
363
-
364
- # Server lifecycle events
365
- @app.on_event("shutdown")
366
- async def shutdown_event():
367
- """Save state on shutdown attempt"""
368
- save_tracking_state()
369
- print("πŸ’Ύ Saved tracking state")
370
- print("⚠️ Server shutdown prevented - use Ctrl+Break or kill -9 to force stop")
371
- # Prevent shutdown by not returning
372
- while True:
373
- await asyncio.sleep(1)
374
-
375
- if __name__ == "__main__":
376
- # Start the FastAPI server
377
- print("πŸš€ Starting Video Analysis FastAPI Server (Persistent Mode)...")
378
- print("API Documentation will be available at: http://localhost:8000/docs")
379
- print("API Root endpoint: http://localhost:8000/")
380
- print("⚠️ Server will continue running even after processing completes")
381
- print("Use Ctrl+Break or kill -9 to force stop")
382
-
383
- # Ensure the analysis output folder exists
384
- os.makedirs(FRAMES_OUTPUT_FOLDER, exist_ok=True)
385
-
386
- # Start processing in thread instead of blocking
387
- processing_thread = threading.Thread(target=main_processing_loop)
388
- processing_thread.daemon = False # Make non-daemon so it doesn't exit
389
- processing_thread.start()
390
-
391
- # Configure uvicorn for persistent running
392
- config = uvicorn.Config(
393
- app=app,
394
- host="0.0.0.0",
395
- port=8000,
396
- log_level="info",
397
- reload=False,
398
- workers=1,
399
- loop="asyncio",
400
- timeout_keep_alive=600, # Keep connections alive longer
401
- access_log=True
402
- )
403
-
404
- # Run server with persistent config
405
- server = uvicorn.Server(config)
406
- server.run()
407
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import json
3
+ import requests
4
+ import subprocess
5
+ import shutil
6
  import time
7
+ import re
8
  import threading
9
+ from typing import Dict, List, Set, Optional
10
+ from huggingface_hub import HfApi, list_repo_files
11
+
12
+ import cv2
13
+ import numpy as np
 
 
14
  from pathlib import Path
15
+ import smtplib
16
+ from email.message import EmailMessage
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
# ==== CONFIGURATION ====
# Hugging Face access token is read from the environment so no secret is committed.
HF_TOKEN = os.getenv("HF_TOKEN", "")
# Dataset repository the RAR archives are downloaded from.
SOURCE_REPO_ID = os.getenv("SOURCE_REPO", "Fred808/BG1")

# Path Configuration
DOWNLOAD_FOLDER = "downloads"              # downloaded .rar archives land here
EXTRACT_FOLDER = "extracted"               # archives are unpacked here, one folder per course
FRAMES_OUTPUT_FOLDER = "extracted_frames"  # per-video frame folders are written here

# Create working directories eagerly so later code can assume they exist.
os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)
os.makedirs(EXTRACT_FOLDER, exist_ok=True)
os.makedirs(FRAMES_OUTPUT_FOLDER, exist_ok=True)

# State Files (persisted across runs; the loop processes one archive per run)
DOWNLOAD_STATE_FILE = "download_progress.json"   # next repository index to download
PROCESS_STATE_FILE = "process_progress.json"     # list of archives already processed
FAILED_FILES_LOG = "failed_files.log"            # append-only failure log

# Processing Parameters
CHUNK_SIZE = 1        # NOTE(review): not referenced in this file — presumably used by a caller; confirm
PROCESSING_DELAY = 2  # NOTE(review): not referenced in this file — confirm before removing
MAX_RETRIES = 3
MIN_FREE_SPACE_GB = 2 # Minimum free space in GB before processing

# Frame Extraction Parameters
DEFAULT_FPS = 3 # Default frames per second for extraction

# Cursor Tracking Parameters

# Initialize HF API
hf_api = HfApi(token=HF_TOKEN)

# Global State — shared, mutable status dict read by the API layer.
processing_status = {
    "is_running": False,        # True while main_processing_loop is active
    "current_file": None,       # basename of the archive currently processed
    "total_files": 0,           # RAR count discovered in the source repo
    "processed_files": 0,
    "failed_files": 0,
    "extracted_courses": 0,     # archives successfully unpacked
    "extracted_videos": 0,      # videos that went through frame extraction
    "last_update": None,        # timestamp of the most recent log entry
    "logs": []                  # rolling window of the last 100 log lines
}
63
+
64
def log_message(message: str):
    """Print *message* with a timestamp and record it in the shared status.

    The entry is appended to ``processing_status["logs"]`` (capped at the
    most recent 100 entries) and ``last_update`` is refreshed.
    """
    now = time.strftime("%Y-%m-%d %H:%M:%S")
    entry = f"[{now}] {message}"
    print(entry)
    processing_status["logs"].append(entry)
    processing_status["last_update"] = now
    # Keep only the newest 100 entries so the status payload stays bounded.
    if len(processing_status["logs"]) > 100:
        processing_status["logs"] = processing_status["logs"][-100:]
73
+
74
def log_failed_file(filename: str, error: str):
    """Append a failure record to FAILED_FILES_LOG.

    Args:
        filename: Name of the file that failed processing.
        error: Human-readable error description.
    """
    # Bug fix: the original wrote a literal "(unknown)" placeholder and never
    # used the *filename* argument, making the failure log useless.
    with open(FAILED_FILES_LOG, "a") as f:
        f.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {filename}: {error}\n")
78
+
79
def get_disk_usage(path: str) -> Dict[str, float]:
    """Return total/free/used disk space for *path*, in gigabytes.

    Uses ``os.statvfs`` (POSIX only); *free* reflects the space available
    to unprivileged processes (``f_bavail``).
    """
    vfs = os.statvfs(path)
    gib = 1024 ** 3
    total_gb = vfs.f_frsize * vfs.f_blocks / gib
    free_gb = vfs.f_frsize * vfs.f_bavail / gib
    return {"total": total_gb, "free": free_gb, "used": total_gb - free_gb}
86
+
87
def check_disk_space(path: str = ".") -> bool:
    """Return True when *path* has at least MIN_FREE_SPACE_GB gigabytes free.

    Logs a warning and returns False otherwise.
    """
    stats = get_disk_usage(path)
    if stats["free"] >= MIN_FREE_SPACE_GB:
        return True
    log_message(f'⚠️ Low disk space: {stats["free"]:.2f}GB free, {stats["used"]:.2f}GB used')
    return False
94
+
95
def cleanup_temp_files():
    """Delete stale archive downloads to reclaim disk space.

    Removes every ``.rar``/``.zip`` file in DOWNLOAD_FOLDER except the one
    currently being processed (per ``processing_status["current_file"]``).
    Removal failures are ignored — this is a best-effort cleanup.
    """
    log_message("🧹 Cleaning up temporary files...")

    # Clean old downloads (keep only the current processing file)
    current_file = processing_status.get("current_file")
    for name in os.listdir(DOWNLOAD_FOLDER):
        if name == current_file or not name.endswith((".rar", ".zip")):
            continue
        try:
            os.remove(os.path.join(DOWNLOAD_FOLDER, name))
            log_message(f"πŸ—‘οΈ Removed old download: {name}")
        except OSError:
            # Bug fix: the original bare `except:` also swallowed
            # KeyboardInterrupt/SystemExit; only filesystem errors are expected.
            pass
108
+
109
def load_json_state(file_path: str, default_value):
    """Load persisted JSON state from *file_path*.

    Args:
        file_path: Path of the JSON state file.
        default_value: Fallback returned on any load failure.

    Returns:
        The parsed JSON value, or *default_value* when the file is missing,
        unreadable, or contains invalid JSON.
    """
    # EAFP: open directly instead of the original exists()+open(), which
    # raced with concurrent deletion and crashed on non-JSON I/O errors.
    try:
        with open(file_path, "r") as f:
            return json.load(f)
    except FileNotFoundError:
        pass  # first run: no state saved yet
    except json.JSONDecodeError:
        log_message(f"⚠️ Corrupted state file: {file_path}")
    except OSError as e:
        # Bug fix: permission/transient I/O errors previously propagated.
        log_message(f"⚠️ Could not read state file {file_path}: {e}")
    return default_value
118
 
119
def save_json_state(file_path: str, data):
    """Persist *data* to *file_path* as indented JSON (overwrites the file)."""
    with open(file_path, "w") as handle:
        handle.write(json.dumps(data, indent=2))
123
+
124
def download_with_retry(url: str, dest_path: str, max_retries: int = 3) -> bool:
    """Stream-download *url* to *dest_path* with retries and disk checks.

    Before downloading, verifies free disk space (running a cleanup pass if
    low) and, when the server advertises Content-Length, aborts if the file
    would not fit with a 0.5 GB buffer. Retries use exponential backoff
    (1s, 2s, ...).

    Args:
        url: Fully-qualified download URL.
        dest_path: Local file path to write to (overwritten on retry).
        max_retries: Attempts before giving up.

    Returns:
        True on success, False on any failure (space, HTTP error, I/O).
    """
    if not check_disk_space():
        cleanup_temp_files()
        if not check_disk_space():
            log_message("❌ Insufficient disk space even after cleanup")
            return False

    # Bearer auth for private Hugging Face repos; harmless if token is empty.
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    for attempt in range(max_retries):
        try:
            # stream=True avoids loading the whole archive into memory.
            with requests.get(url, headers=headers, stream=True) as r:
                r.raise_for_status()

                # Check content length if available
                content_length = r.headers.get("content-length")
                if content_length:
                    size_gb = int(content_length) / (1024**3)
                    disk_info = get_disk_usage(".")
                    if size_gb > disk_info["free"] - 0.5: # Leave 0.5GB buffer
                        log_message(f'❌ File too large: {size_gb:.2f}GB, only {disk_info["free"]:.2f}GB free')
                        return False

                # NOTE(review): a failed attempt may leave a partial file at
                # dest_path until the next successful attempt overwrites it.
                with open(dest_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
                return True
        except Exception as e:
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # exponential backoff between retries
                continue
            log_message(f"❌ Download failed after {max_retries} attempts: {e}")
            return False
    return False
158
+
159
def is_multipart_rar(filename: str) -> bool:
    """Return True when *filename* looks like one part of a split RAR set."""
    lowered = filename.lower()
    return lowered.endswith(".rar") and ".part" in lowered
162
+
163
def get_rar_part_base(filename: str) -> str:
    """Return the archive base name shared by all parts of a RAR set.

    ``"course.part01.rar"`` -> ``"course"``; single-part ``"course.rar"``
    -> ``"course"``; anything else is returned unchanged.
    """
    lowered = filename.lower()
    if ".part" in lowered:
        # Bug fix: the original split on the literal ".part", so an
        # upper-case ".PART01" passed the check but was never stripped.
        return filename[:lowered.index(".part")]
    if lowered.endswith(".rar"):
        # Bug fix: str.replace removed ".rar" anywhere in the name;
        # only the extension should be stripped.
        return filename[:-4]
    return filename
168
+
169
def extract_with_retry(rar_path: str, output_dir: str, max_retries: int = 2) -> bool:
    """Extract a RAR archive via the external ``unrar`` binary, with retries.

    For multi-part archives, extraction is started from ``*.part01.rar``
    (unrar then consumes the remaining parts itself). Each attempt first
    runs ``unrar t`` (integrity test) and then ``unrar x -o+`` (extract,
    overwrite). From the second attempt on, ``-kb`` keeps broken extracted
    files as a recovery measure.

    Args:
        rar_path: Path to any part of the archive.
        output_dir: Directory to extract into.
        max_retries: Number of test+extract attempts.

    Returns:
        True on successful extraction, False otherwise.
    """
    filename = os.path.basename(rar_path)

    # For multi-part RARs, we need the first part
    if is_multipart_rar(filename):
        base_name = get_rar_part_base(filename)
        # assumes parts are named ".part01" (two digits) — TODO confirm
        first_part = f"{base_name}.part01.rar"
        first_part_path = os.path.join(os.path.dirname(rar_path), first_part)

        if not os.path.exists(first_part_path):
            log_message(f"⚠️ Multi-part RAR detected but first part not found: {first_part}")
            return False

        rar_path = first_part_path
        log_message(f"πŸ“¦ Processing multi-part RAR starting with: {first_part}")

    for attempt in range(max_retries):
        try:
            # Test RAR first
            test_cmd = ["unrar", "t", rar_path]
            test_result = subprocess.run(test_cmd, capture_output=True, text=True)
            if test_result.returncode != 0:
                log_message(f"⚠️ RAR test failed: {test_result.stderr}")
                if attempt == max_retries - 1:
                    return False
                continue

            # Extract RAR
            cmd = ["unrar", "x", "-o+", rar_path, output_dir]
            if attempt > 0: # Try recovery on subsequent attempts
                cmd.insert(2, "-kb")  # keep broken extracted files

            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode == 0:
                log_message(f"βœ… Successfully extracted: {os.path.basename(rar_path)}")
                return True
            else:
                error_msg = result.stderr or result.stdout
                log_message(f"⚠️ Extraction attempt {attempt + 1} failed: {error_msg}")

                # CRC/checksum failures are retried; other exit codes below
                # are treated as per unrar's documented meanings.
                if "checksum error" in error_msg.lower() or "CRC failed" in error_msg:
                    log_message(f"⚠️ Data corruption detected, attempt {attempt + 1}")
                elif result.returncode == 10:
                    log_message(f"⚠️ No files to extract (exit code 10)")
                    return False
                elif result.returncode == 1:
                    log_message(f"⚠️ Non-fatal error (exit code 1)")

        except Exception as e:
            log_message(f"❌ Extraction exception: {str(e)}")
            if attempt == max_retries - 1:
                return False
            time.sleep(1)

    return False
225
+
226
+ # --- Frame Extraction Utilities ---
227
def ensure_dir(path):
    """Create *path* (including missing parents) if it does not exist."""
    Path(path).mkdir(parents=True, exist_ok=True)
229
 
230
def extract_frames(video_path, output_dir, fps=DEFAULT_FPS):
    """Extract frames from video at the specified frames per second (fps).

    Frames are written to *output_dir* as zero-padded PNGs (0001.png, ...).

    Args:
        video_path: Path to the source video file.
        output_dir: Directory that will receive the extracted frames.
        fps: Target sampling rate in frames per second (must be > 0).

    Returns:
        Number of frames written (0 if the video could not be opened).
    """
    log_message(f"[INFO] Extracting frames from {video_path} to {output_dir} at {fps} fps...")
    ensure_dir(output_dir)
    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        log_message(f"[ERROR] Failed to open video file: {video_path}")
        return 0
    video_fps = cap.get(cv2.CAP_PROP_FPS)
    if not video_fps or video_fps <= 0:
        video_fps = 30 # fallback if FPS is not available
        log_message(f"[WARN] Using fallback FPS: {video_fps}")
    # Bug fix: when the requested fps exceeds ~2x the video fps,
    # int(round(video_fps / fps)) was 0 and the modulo below raised
    # ZeroDivisionError. Clamp to 1 so every frame is sampled instead.
    frame_interval = max(1, int(round(video_fps / fps)))
    frame_idx = 0
    saved_idx = 1  # 1-based so file names start at 0001.png
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    log_message(f"[DEBUG] Total frames in video: {total_frames}")
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_idx % frame_interval == 0:
            frame_name = f"{saved_idx:04d}.png"
            cv2.imwrite(str(Path(output_dir) / frame_name), frame)
            saved_idx += 1
        frame_idx += 1
    cap.release()
    log_message(f"Extracted {saved_idx-1} frames from {video_path} to {output_dir}")
    return saved_idx - 1
263
+
264
+
265
+
266
+
267
+
268
def process_rar_file(rar_path: str) -> bool:
    """Extract one RAR archive and pull frames from every video inside it.

    The archive is unpacked into ``EXTRACT_FOLDER/<course_name>``; each
    contained video then has its frames extracted into a dedicated folder
    under FRAMES_OUTPUT_FOLDER. Progress counters in ``processing_status``
    are updated along the way, and failures are appended to the failure log.

    Args:
        rar_path: Path to the downloaded .rar file (any part of a set).

    Returns:
        True when extraction and frame processing succeeded, False otherwise.
    """
    filename = os.path.basename(rar_path)
    processing_status["current_file"] = filename

    # Handle multi-part RAR naming
    if is_multipart_rar(filename):
        course_name = get_rar_part_base(filename)
    else:
        course_name = filename.replace(".rar", "")

    extract_dir = os.path.join(EXTRACT_FOLDER, course_name)

    try:
        # Bug fix: this log previously printed a literal "(unknown)"
        # instead of the file actually being processed.
        log_message(f"πŸ”„ Processing: {filename}")

        # Clean up any existing directory so stale files never mix in
        if os.path.exists(extract_dir):
            shutil.rmtree(extract_dir, ignore_errors=True)

        # Extract RAR
        os.makedirs(extract_dir, exist_ok=True)
        if not extract_with_retry(rar_path, extract_dir):
            raise Exception("RAR extraction failed")

        # Count extracted files and collect the videos to process
        file_count = 0
        video_files_found = []
        for root, dirs, files in os.walk(extract_dir):
            for file in files:
                file_count += 1
                if file.lower().endswith((".mp4", ".avi", ".mov", ".mkv")):
                    video_files_found.append(os.path.join(root, file))

        processing_status["extracted_courses"] += 1
        log_message(f"βœ… Successfully extracted '{course_name}' ({file_count} files, {len(video_files_found)} videos)")

        # Process video files for frame extraction
        for video_path in video_files_found:
            video_filename = Path(video_path).name
            # Unique output directory per video ("." replaced to keep one level)
            frames_output_dir = os.path.join(
                FRAMES_OUTPUT_FOLDER,
                f"{course_name}_{video_filename.replace('.', '_')}_frames"
            )
            ensure_dir(frames_output_dir)

            frame_count = extract_frames(video_path, frames_output_dir, fps=DEFAULT_FPS)
            processing_status["extracted_videos"] += 1

            if frame_count == 0:
                log_message(f"⚠️ No frames extracted from {video_filename}")
            else:
                log_message(f"βœ… {frame_count} frames extracted from {video_filename}")

        return True

    except Exception as e:
        error_msg = str(e)
        log_message(f"❌ Processing failed: {error_msg}")
        log_failed_file(filename, error_msg)
        return False

    finally:
        # Always clear the in-flight marker, success or failure.
        processing_status["current_file"] = None
334
+
335
+
336
def main_processing_loop(start_index: int = 0):
    """Main processing workflow - download, extraction, and frame extraction.

    Processes at most ONE RAR file per invocation: resolves the next
    repository index (from *start_index* or the persisted download state),
    downloads that archive, extracts it with frame processing, then saves
    the advanced index for the next run.

    Args:
        start_index: Optional override for the repository file index to
            start from; 0 means "use the saved state".
    """
    processing_status["is_running"] = True

    try:
        # Load persisted state
        processed_rars = load_json_state(PROCESS_STATE_FILE, {"processed_rars": []})["processed_rars"]
        download_state = load_json_state(DOWNLOAD_STATE_FILE, {"next_download_index": 3})

        # Use start_index if provided, otherwise use the saved state
        next_index = start_index if start_index > 0 else download_state["next_download_index"]

        log_message(f"πŸ“Š Starting from index {next_index}")
        log_message(f"πŸ“Š Previously processed: {len(processed_rars)} files")

        # Get file list from the source repository
        try:
            files = list(hf_api.list_repo_files(repo_id=SOURCE_REPO_ID, repo_type="dataset"))
            rar_files = sorted([f for f in files if f.endswith(".rar")])

            processing_status["total_files"] = len(rar_files)
            log_message(f"πŸ“ Found {len(rar_files)} RAR files in repository")

            if next_index >= len(rar_files):
                log_message("βœ… All files have been processed!")
                return

        except Exception as e:
            log_message(f"❌ Failed to get file list: {str(e)}")
            return

        # Process only one file per run
        if next_index < len(rar_files):
            rar_file = rar_files[next_index]
            filename = os.path.basename(rar_file)

            if filename in processed_rars:
                # Bug fix: these log lines previously printed a literal
                # "(unknown)" placeholder instead of the file name.
                log_message(f"⏭️ Skipping already processed: {filename}")
                processing_status["processed_files"] += 1
                # Move to next file
                next_index += 1
                save_json_state(DOWNLOAD_STATE_FILE, {"next_download_index": next_index})
                log_message(f"πŸ“Š Moving to next file. Progress: {next_index}/{len(rar_files)}")
                return

            log_message(f"πŸ“₯ Downloading: {filename}")
            dest_path = os.path.join(DOWNLOAD_FOLDER, filename)

            # Download file
            download_url = f"https://huggingface.co/datasets/{SOURCE_REPO_ID}/resolve/main/{rar_file}"
            if download_with_retry(download_url, dest_path):
                # Process file
                if process_rar_file(dest_path):
                    processed_rars.append(filename)
                    save_json_state(PROCESS_STATE_FILE, {"processed_rars": processed_rars})
                    log_message(f"βœ… Successfully processed: {filename}")
                    processing_status["processed_files"] += 1
                else:
                    log_message(f"❌ Failed to process: {filename}")
                    processing_status["failed_files"] += 1

                # Clean up downloaded file to reclaim disk space
                try:
                    os.remove(dest_path)
                    log_message(f"πŸ—‘οΈ Cleaned up download: {filename}")
                except OSError:
                    # Bug fix: bare `except:` also swallowed KeyboardInterrupt.
                    pass
            else:
                log_message(f"❌ Failed to download: {filename}")
                processing_status["failed_files"] += 1

            # Update download state for next run
            next_index += 1
            save_json_state(DOWNLOAD_STATE_FILE, {"next_download_index": next_index})

            if next_index < len(rar_files):
                log_message(f"πŸ”„ Run the script again to process the next file: {os.path.basename(rar_files[next_index])}")
            else:
                log_message("πŸŽ‰ All files have been processed!")
        else:
            log_message("βœ… All files have been processed!")

        log_message("πŸŽ‰ Processing complete!")
        log_message(f'πŸ“Š Final stats: {processing_status["extracted_courses"]} courses extracted, {processing_status["extracted_videos"]} videos processed, frames extracted')

    except KeyboardInterrupt:
        log_message("⏹️ Processing interrupted by user")
    except Exception as e:
        log_message(f"❌ Fatal error: {str(e)}")
    finally:
        processing_status["is_running"] = False
        cleanup_temp_files()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
429
 
430
# Expose necessary functions and variables for download_api.py
# NOTE(review): keep this list in sync with what the API layer imports;
# FRAMES_OUTPUT_FOLDER is presumably also imported by callers — verify.
__all__ = [
    "main_processing_loop",
    "processing_status",
    "log_message",
    "extract_frames",
    "DEFAULT_FPS",
    "ensure_dir"
]