Factor Studios commited on
Commit
2e7ef3c
Β·
verified Β·
1 Parent(s): aeb6030

Update cursor_tracker.py

Browse files
Files changed (1) hide show
  1. cursor_tracker.py +601 -601
cursor_tracker.py CHANGED
@@ -1,601 +1,601 @@
1
- import os
2
- import json
3
- import requests
4
- import subprocess
5
- import shutil
6
- import time
7
- import re
8
- import threading
9
- from typing import Dict, List, Set, Optional
10
- from huggingface_hub import HfApi, list_repo_files
11
-
12
- import cv2
13
- import numpy as np
14
- from pathlib import Path
15
- import smtplib
16
- from email.message import EmailMessage
17
-
18
- # ==== CONFIGURATION ====
19
- HF_TOKEN = os.getenv("HF_TOKEN", "")
20
- SOURCE_REPO_ID = os.getenv("SOURCE_REPO", "Fred808/BG1")
21
-
22
- # Path Configuration
23
- DOWNLOAD_FOLDER = "downloads"
24
- EXTRACT_FOLDER = "extracted"
25
- FRAMES_OUTPUT_FOLDER = "extracted_frames" # New folder for extracted frames
26
- CURSOR_TRACKING_OUTPUT_FOLDER = "cursor_tracking_results" # New folder for cursor tracking results
27
- CURSOR_TEMPLATES_DIR = "cursors"
28
-
29
- os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)
30
- os.makedirs(EXTRACT_FOLDER, exist_ok=True)
31
- os.makedirs(FRAMES_OUTPUT_FOLDER, exist_ok=True)
32
- os.makedirs(CURSOR_TRACKING_OUTPUT_FOLDER, exist_ok=True)
33
- os.makedirs(CURSOR_TEMPLATES_DIR, exist_ok=True) # Ensure cursor templates directory exists
34
-
35
- # State Files
36
- DOWNLOAD_STATE_FILE = "download_progress.json"
37
- PROCESS_STATE_FILE = "process_progress.json"
38
- FAILED_FILES_LOG = "failed_files.log"
39
-
40
- # Processing Parameters
41
- CHUNK_SIZE = 1
42
- PROCESSING_DELAY = 2
43
- MAX_RETRIES = 3
44
- MIN_FREE_SPACE_GB = 2 # Minimum free space in GB before processing
45
-
46
- # Frame Extraction Parameters
47
- DEFAULT_FPS = 3 # Default frames per second for extraction
48
-
49
- # Cursor Tracking Parameters
50
- CURSOR_THRESHOLD = 0.8
51
-
52
- # Initialize HF API
53
- hf_api = HfApi(token=HF_TOKEN)
54
-
55
- # Global State
56
- processing_status = {
57
- "is_running": False,
58
- "current_file": None,
59
- "total_files": 0,
60
- "processed_files": 0,
61
- "failed_files": 0,
62
- "extracted_courses": 0,
63
- "extracted_videos": 0,
64
- "extracted_frames_count": 0,
65
- "tracked_cursors_count": 0,
66
- "last_update": None,
67
- "logs": []
68
- }
69
-
70
- def log_message(message: str):
71
- """Log messages with timestamp"""
72
- timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
73
- log_entry = f"[{timestamp}] {message}"
74
- print(log_entry)
75
- processing_status["logs"].append(log_entry)
76
- processing_status["last_update"] = timestamp
77
- if len(processing_status["logs"]) > 100:
78
- processing_status["logs"] = processing_status["logs"][-100:]
79
-
80
- def log_failed_file(filename: str, error: str):
81
- """Log failed files to persistent file"""
82
- with open(FAILED_FILES_LOG, "a") as f:
83
- f.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {filename}: {error}\n")
84
-
85
- def get_disk_usage(path: str) -> Dict[str, float]:
86
- """Get disk usage statistics in GB"""
87
- statvfs = os.statvfs(path)
88
- total = statvfs.f_frsize * statvfs.f_blocks / (1024**3)
89
- free = statvfs.f_frsize * statvfs.f_bavail / (1024**3)
90
- used = total - free
91
- return {"total": total, "free": free, "used": used}
92
-
93
- def check_disk_space(path: str = ".") -> bool:
94
- """Check if there's enough disk space"""
95
- disk_info = get_disk_usage(path)
96
- if disk_info["free"] < MIN_FREE_SPACE_GB:
97
- log_message(f'⚠️ Low disk space: {disk_info["free"]:.2f}GB free, {disk_info["used"]:.2f}GB used')
98
- return False
99
- return True
100
-
101
- def cleanup_temp_files():
102
- """Clean up temporary files to free space"""
103
- log_message("🧹 Cleaning up temporary files...")
104
-
105
- # Clean old downloads (keep only current processing file)
106
- current_file = processing_status.get("current_file")
107
- for file in os.listdir(DOWNLOAD_FOLDER):
108
- if file != current_file and file.endswith((".rar", ".zip")):
109
- try:
110
- os.remove(os.path.join(DOWNLOAD_FOLDER, file))
111
- log_message(f"πŸ—‘οΈ Removed old download: {file}")
112
- except:
113
- pass
114
-
115
- def load_json_state(file_path: str, default_value):
116
- """Load state from JSON file"""
117
- if os.path.exists(file_path):
118
- try:
119
- with open(file_path, "r") as f:
120
- return json.load(f)
121
- except json.JSONDecodeError:
122
- log_message(f"⚠️ Corrupted state file: {file_path}")
123
- return default_value
124
-
125
- def save_json_state(file_path: str, data):
126
- """Save state to JSON file"""
127
- with open(file_path, "w") as f:
128
- json.dump(data, f, indent=2)
129
-
130
- def download_with_retry(url: str, dest_path: str, max_retries: int = 3) -> bool:
131
- """Download file with retry logic and disk space checking"""
132
- if not check_disk_space():
133
- cleanup_temp_files()
134
- if not check_disk_space():
135
- log_message("❌ Insufficient disk space even after cleanup")
136
- return False
137
-
138
- headers = {"Authorization": f"Bearer {HF_TOKEN}"}
139
- for attempt in range(max_retries):
140
- try:
141
- with requests.get(url, headers=headers, stream=True) as r:
142
- r.raise_for_status()
143
-
144
- # Check content length if available
145
- content_length = r.headers.get("content-length")
146
- if content_length:
147
- size_gb = int(content_length) / (1024**3)
148
- disk_info = get_disk_usage(".")
149
- if size_gb > disk_info["free"] - 0.5: # Leave 0.5GB buffer
150
- log_message(f'❌ File too large: {size_gb:.2f}GB, only {disk_info["free"]:.2f}GB free')
151
- return False
152
-
153
- with open(dest_path, "wb") as f:
154
- for chunk in r.iter_content(chunk_size=8192):
155
- f.write(chunk)
156
- return True
157
- except Exception as e:
158
- if attempt < max_retries - 1:
159
- time.sleep(2 ** attempt)
160
- continue
161
- log_message(f"❌ Download failed after {max_retries} attempts: {e}")
162
- return False
163
- return False
164
-
165
- def is_multipart_rar(filename: str) -> bool:
166
- """Check if this is a multi-part RAR file"""
167
- return ".part" in filename.lower() and filename.lower().endswith(".rar")
168
-
169
- def get_rar_part_base(filename: str) -> str:
170
- """Get the base name for multi-part RAR files"""
171
- if ".part" in filename.lower():
172
- return filename.split(".part")[0]
173
- return filename.replace(".rar", "")
174
-
175
- def extract_with_retry(rar_path: str, output_dir: str, max_retries: int = 2) -> bool:
176
- """Extract RAR with retry and recovery, handling multi-part archives"""
177
- filename = os.path.basename(rar_path)
178
-
179
- # For multi-part RARs, we need the first part
180
- if is_multipart_rar(filename):
181
- base_name = get_rar_part_base(filename)
182
- first_part = f"{base_name}.part01.rar"
183
- first_part_path = os.path.join(os.path.dirname(rar_path), first_part)
184
-
185
- if not os.path.exists(first_part_path):
186
- log_message(f"⚠️ Multi-part RAR detected but first part not found: {first_part}")
187
- return False
188
-
189
- rar_path = first_part_path
190
- log_message(f"πŸ“¦ Processing multi-part RAR starting with: {first_part}")
191
-
192
- for attempt in range(max_retries):
193
- try:
194
- # Test RAR first
195
- test_cmd = ["unrar", "t", rar_path]
196
- test_result = subprocess.run(test_cmd, capture_output=True, text=True)
197
- if test_result.returncode != 0:
198
- log_message(f"⚠️ RAR test failed: {test_result.stderr}")
199
- if attempt == max_retries - 1:
200
- return False
201
- continue
202
-
203
- # Extract RAR
204
- cmd = ["unrar", "x", "-o+", rar_path, output_dir]
205
- if attempt > 0: # Try recovery on subsequent attempts
206
- cmd.insert(2, "-kb")
207
-
208
- result = subprocess.run(cmd, capture_output=True, text=True)
209
- if result.returncode == 0:
210
- log_message(f"βœ… Successfully extracted: {os.path.basename(rar_path)}")
211
- return True
212
- else:
213
- error_msg = result.stderr or result.stdout
214
- log_message(f"⚠️ Extraction attempt {attempt + 1} failed: {error_msg}")
215
-
216
- if "checksum error" in error_msg.lower() or "CRC failed" in error_msg:
217
- log_message(f"⚠️ Data corruption detected, attempt {attempt + 1}")
218
- elif result.returncode == 10:
219
- log_message(f"⚠️ No files to extract (exit code 10)")
220
- return False
221
- elif result.returncode == 1:
222
- log_message(f"⚠️ Non-fatal error (exit code 1)")
223
-
224
- except Exception as e:
225
- log_message(f"❌ Extraction exception: {str(e)}")
226
- if attempt == max_retries - 1:
227
- return False
228
- time.sleep(1)
229
-
230
- return False
231
-
232
- # --- Frame Extraction Utilities ---
233
- def ensure_dir(path):
234
- os.makedirs(path, exist_ok=True)
235
-
236
- def extract_frames(video_path, output_dir, fps=DEFAULT_FPS):
237
- """Extract frames from video at the specified frames per second (fps)."""
238
- log_message(f"[INFO] Extracting frames from {video_path} to {output_dir} at {fps} fps...")
239
- ensure_dir(output_dir)
240
- cap = cv2.VideoCapture(str(video_path))
241
- if not cap.isOpened():
242
- log_message(f"[ERROR] Failed to open video file: {video_path}")
243
- return 0
244
- video_fps = cap.get(cv2.CAP_PROP_FPS)
245
- # log_message(f"[DEBUG] Video FPS: {video_fps}")
246
- if not video_fps or video_fps <= 0:
247
- video_fps = 30 # fallback if FPS is not available
248
- log_message(f"[WARN] Using fallback FPS: {video_fps}")
249
- frame_interval = int(round(video_fps / fps))
250
- # log_message(f"[DEBUG] Frame interval: {frame_interval}")
251
- frame_idx = 0
252
- saved_idx = 1
253
- total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
254
- log_message(f"[DEBUG] Total frames in video: {total_frames}")
255
- while cap.isOpened():
256
- ret, frame = cap.read()
257
- if not ret:
258
- # log_message(f"[DEBUG] No more frames to read at frame_idx {frame_idx}.")
259
- break
260
- if frame_idx % frame_interval == 0:
261
- frame_name = f"{saved_idx:04d}.png"
262
- cv2.imwrite(str(Path(output_dir) / frame_name), frame)
263
- # log_message(f"[DEBUG] Saved frame {frame_idx} as {frame_name}")
264
- saved_idx += 1
265
- frame_idx += 1
266
- cap.release()
267
- log_message(f"Extracted {saved_idx-1} frames from {video_path} to {output_dir}")
268
- return saved_idx - 1
269
-
270
- # --- Cursor Tracking Utilities ---
271
- def to_rgb(img):
272
- if img is None:
273
- return None
274
- if len(img.shape) == 2:
275
- return cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
276
- if img.shape[2] == 4:
277
- return cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)
278
- return img
279
-
280
- def get_mask_from_alpha(template_img):
281
- if template_img is not None and len(template_img.shape) == 3 and template_img.shape[2] == 4:
282
- # Use alpha channel as mask (nonzero alpha = 255)
283
- return (template_img[:, :, 3] > 0).astype(np.uint8) * 255
284
- return None
285
-
286
- def detect_cursor_in_frame_multi(frame, cursor_templates, threshold=CURSOR_THRESHOLD):
287
- """Detect cursor position in a frame using multiple templates. Returns best match above threshold."""
288
- best_pos = None
289
- best_conf = -1
290
- best_template_name = None
291
- frame_rgb = to_rgb(frame)
292
- for template_name, cursor_template in cursor_templates.items():
293
- template_rgb = to_rgb(cursor_template)
294
- mask = get_mask_from_alpha(cursor_template)
295
- if template_rgb is None or frame_rgb is None or template_rgb.shape[2] != frame_rgb.shape[2]:
296
- log_message(f"[WARN] Skipping template {template_name} due to channel mismatch or load error.")
297
- continue
298
- try:
299
- result = cv2.matchTemplate(frame_rgb, template_rgb, cv2.TM_CCOEFF_NORMED, mask=mask)
300
- except Exception as e:
301
- log_message(f"[WARN] matchTemplate failed for {template_name}: {e}")
302
- continue
303
- min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result)
304
- if max_val > best_conf:
305
- best_conf = max_val
306
- if max_val >= threshold:
307
- cursor_w, cursor_h = template_rgb.shape[1], template_rgb.shape[0]
308
- cursor_x = max_loc[0] + cursor_w // 2
309
- cursor_y = max_loc[1] + cursor_h // 2
310
- best_pos = (cursor_x, cursor_y)
311
- best_template_name = template_name
312
- if best_conf >= threshold:
313
- return best_pos, best_conf, best_template_name
314
- return None, best_conf, None
315
-
316
- def send_email_with_attachment(subject, body, to_email, from_email, app_password, attachment_path):
317
- msg = EmailMessage()
318
- msg["Subject"] = subject
319
- msg["From"] = from_email
320
- msg["To"] = to_email
321
- msg.set_content(body)
322
- with open(attachment_path, "rb") as f:
323
- file_data = f.read()
324
- file_name = Path(attachment_path).name
325
- msg.add_attachment(file_data, maintype="application", subtype="octet-stream", filename=file_name)
326
- try:
327
- with smtplib.SMTP_SSL("smtp.gmail.com", 465) as smtp:
328
- smtp.login(from_email, app_password)
329
- smtp.send_message(msg)
330
- log_message(f"[SUCCESS] Email sent to {to_email}")
331
- except Exception as e:
332
- log_message(f"[ERROR] Failed to send email: {e}")
333
-
334
- def track_cursor(frames_dir, cursor_templates_dir, output_json_path, threshold=CURSOR_THRESHOLD, start_frame=1, email_results=False):
335
- """Detect cursor in each frame using multiple templates, print status, and write positions to a JSON file."""
336
- log_message(f"[INFO] Tracking cursors in {frames_dir}...")
337
- frames_dir = Path(frames_dir).resolve()
338
- output_json_path = Path(output_json_path).resolve()
339
- cursor_templates_dir = Path(cursor_templates_dir).resolve()
340
- # log_message(f"[DEBUG] frames_dir: {frames_dir}")
341
- # log_message(f"[DEBUG] cursor_templates_dir: {cursor_templates_dir}")
342
- # log_message(f"[DEBUG] output_json_path: {output_json_path}")
343
- ensure_dir(frames_dir)
344
- ensure_dir(output_json_path.parent)
345
- # Load all PNG templates from the cursor_templates_dir
346
- cursor_templates = {}
347
- for template_file in cursor_templates_dir.glob("*.png"):
348
- template_img = cv2.imread(str(template_file), cv2.IMREAD_UNCHANGED)
349
- if template_img is not None:
350
- cursor_templates[template_file.name] = template_img
351
- else:
352
- log_message(f"[WARN] Could not load template: {template_file}")
353
- if not cursor_templates:
354
- log_message(f"[ERROR] No cursor templates found in: {cursor_templates_dir}")
355
- return 0
356
- results = []
357
- tracked_count = 0
358
- for frame_file in sorted(frames_dir.glob("*.png")):
359
- frame_num = int(frame_file.stem)
360
- if frame_num < start_frame:
361
- continue
362
- frame = cv2.imread(str(frame_file), cv2.IMREAD_UNCHANGED)
363
- if frame is None:
364
- log_message(f"[WARN] Could not load frame: {frame_file}")
365
- continue
366
- pos, conf, template_name = detect_cursor_in_frame_multi(frame, cursor_templates, threshold)
367
- if pos is not None:
368
- # log_message(f"{frame_file.name}: Cursor at {pos} (template: {template_name})")
369
- results.append({
370
- "frame": frame_file.name,
371
- "cursor_active": True,
372
- "x": pos[0],
373
- "y": pos[1],
374
- "confidence": conf,
375
- "template": template_name
376
- })
377
- tracked_count += 1
378
- else:
379
- # log_message(f"{frame_file.name}: Cursor disabled")
380
- results.append({
381
- "frame": frame_file.name,
382
- "cursor_active": False,
383
- "x": None,
384
- "y": None,
385
- "confidence": conf,
386
- "template": None
387
- })
388
- try:
389
- with open(output_json_path, "w") as f:
390
- json.dump(results, f, indent=2)
391
- log_message(f"[SUCCESS] Cursor tracking results saved to {output_json_path}")
392
- if email_results:
393
- log_message("[INFO] Preparing to email results...")
394
- to_email = os.environ.get("TO_EMAIL")
395
- from_email = os.environ.get("FROM_EMAIL")
396
- app_password = os.environ.get("GMAIL_APP_PASSWORD")
397
- if not (to_email and from_email and app_password):
398
- log_message("[ERROR] Email environment variables not set. Please set TO_EMAIL, FROM_EMAIL, and GMAIL_APP_PASSWORD.")
399
- # return tracked_count # Don't return here, just log error
400
- else:
401
- send_email_with_attachment(
402
- subject="Cursor Tracking Results",
403
- body="See attached JSON results.",
404
- to_email=to_email,
405
- from_email=from_email,
406
- app_password=app_password,
407
- attachment_path=output_json_path
408
- )
409
- except Exception as e:
410
- log_message(f"[ERROR] Failed to write output JSON: {e}")
411
- # raise # Don't raise, just log error
412
- return tracked_count
413
-
414
- def process_rar_file(rar_path: str) -> bool:
415
- """Process a single RAR file - extract, then process videos for frames and cursor tracking"""
416
- filename = os.path.basename(rar_path)
417
- processing_status["current_file"] = filename
418
-
419
- # Handle multi-part RAR naming
420
- if is_multipart_rar(filename):
421
- course_name = get_rar_part_base(filename)
422
- else:
423
- course_name = filename.replace(".rar", "")
424
-
425
- extract_dir = os.path.join(EXTRACT_FOLDER, course_name)
426
-
427
- try:
428
- log_message(f"πŸ”„ Processing: {filename}")
429
-
430
- # Clean up any existing directory
431
- if os.path.exists(extract_dir):
432
- shutil.rmtree(extract_dir, ignore_errors=True)
433
-
434
- # Extract RAR
435
- os.makedirs(extract_dir, exist_ok=True)
436
- if not extract_with_retry(rar_path, extract_dir):
437
- raise Exception("RAR extraction failed")
438
-
439
- # Count extracted files
440
- file_count = 0
441
- video_files_found = []
442
- for root, dirs, files in os.walk(extract_dir):
443
- for file in files:
444
- file_count += 1
445
- if file.lower().endswith((".mp4", ".avi", ".mov", ".mkv")):
446
- video_files_found.append(os.path.join(root, file))
447
-
448
- processing_status["extracted_courses"] += 1
449
- log_message(f"βœ… Successfully extracted \'{course_name}\' ({file_count} files, {len(video_files_found)} videos)")
450
-
451
- # Process video files for frame extraction and cursor tracking
452
- for video_path in video_files_found:
453
- video_filename = Path(video_path).name
454
- # Create a unique output directory for frames for each video
455
- frames_output_dir = os.path.join(FRAMES_OUTPUT_FOLDER, f"{course_name}_{video_filename.replace('.', '_')}_frames")
456
- ensure_dir(frames_output_dir)
457
-
458
- extracted_frames_count = extract_frames(video_path, frames_output_dir, fps=DEFAULT_FPS)
459
- processing_status["extracted_frames_count"] += extracted_frames_count
460
- if extracted_frames_count > 0:
461
- processing_status["extracted_videos"] += 1
462
- log_message(f"[INFO] Extracted {extracted_frames_count} frames from {video_filename}")
463
-
464
- # Perform cursor tracking on the extracted frames
465
- cursor_output_json = os.path.join(CURSOR_TRACKING_OUTPUT_FOLDER, f"{course_name}_{video_filename.replace('.', '_')}_cursor_data.json")
466
- tracked_cursors = track_cursor(frames_output_dir, CURSOR_TEMPLATES_DIR, cursor_output_json, threshold=CURSOR_THRESHOLD)
467
- processing_status["tracked_cursors_count"] += tracked_cursors
468
- log_message(f"[INFO] Tracked {tracked_cursors} cursors in frames from {video_filename}")
469
- else:
470
- log_message(f"[WARN] No frames extracted from {video_filename}")
471
-
472
- return True
473
-
474
- except Exception as e:
475
- error_msg = str(e)
476
- log_message(f"❌ Processing failed: {error_msg}")
477
- log_failed_file(filename, error_msg)
478
- return False
479
-
480
- finally:
481
- processing_status["current_file"] = None
482
-
483
- def main_processing_loop(start_index: int = 0):
484
- """Main processing workflow - extraction, frame extraction, and cursor tracking"""
485
- processing_status["is_running"] = True
486
-
487
- try:
488
- # Load state
489
- processed_rars = load_json_state(PROCESS_STATE_FILE, {"processed_rars": []})["processed_rars"]
490
- download_state = load_json_state(DOWNLOAD_STATE_FILE, {"next_download_index": 0})
491
-
492
- # Use start_index if provided, otherwise use the saved state
493
- next_index = start_index if start_index > 0 else download_state["next_download_index"]
494
-
495
- log_message(f"πŸ“Š Starting from index {next_index}")
496
- log_message(f"πŸ“Š Previously processed: {len(processed_rars)} files")
497
-
498
- # Get file list
499
- try:
500
- files = list(hf_api.list_repo_files(repo_id=SOURCE_REPO_ID, repo_type="dataset"))
501
- rar_files = sorted([f for f in files if f.endswith(".rar")])
502
-
503
- processing_status["total_files"] = len(rar_files)
504
- log_message(f"πŸ“ Found {len(rar_files)} RAR files in repository")
505
-
506
- if next_index >= len(rar_files):
507
- log_message("βœ… All files have been processed!")
508
- return
509
-
510
- except Exception as e:
511
- log_message(f"❌ Failed to get file list: {str(e)}")
512
- return
513
-
514
- # Process only one file per run
515
- if next_index < len(rar_files):
516
- rar_file = rar_files[next_index]
517
- filename = os.path.basename(rar_file)
518
-
519
- if filename in processed_rars:
520
- log_message(f"⏭️ Skipping already processed: {filename}")
521
- processing_status["processed_files"] += 1
522
- # Move to next file
523
- next_index += 1
524
- save_json_state(DOWNLOAD_STATE_FILE, {"next_download_index": next_index})
525
- log_message(f"πŸ“Š Moving to next file. Progress: {next_index}/{len(rar_files)}")
526
- return
527
-
528
- log_message(f"πŸ“₯ Downloading: {filename}")
529
- dest_path = os.path.join(DOWNLOAD_FOLDER, filename)
530
-
531
- # Download file
532
- download_url = f"https://huggingface.co/datasets/{SOURCE_REPO_ID}/resolve/main/{rar_file}"
533
- if download_with_retry(download_url, dest_path):
534
- # Process file
535
- if process_rar_file(dest_path):
536
- processed_rars.append(filename)
537
- save_json_state(PROCESS_STATE_FILE, {"processed_rars": processed_rars})
538
- log_message(f"βœ… Successfully processed: {filename}")
539
- processing_status["processed_files"] += 1
540
- else:
541
- log_message(f"❌ Failed to process: {filename}")
542
- processing_status["failed_files"] += 1
543
-
544
- # Clean up downloaded file
545
- try:
546
- os.remove(dest_path)
547
- log_message(f"πŸ—‘οΈ Cleaned up download: {filename}")
548
- except:
549
- pass
550
- else:
551
- log_message(f"❌ Failed to download: {filename}")
552
- processing_status["failed_files"] += 1
553
-
554
- # Update download state for next run
555
- next_index += 1
556
- save_json_state(DOWNLOAD_STATE_FILE, {"next_download_index": next_index})
557
-
558
- # Status update
559
- log_message(f"πŸ“Š Progress: {next_index}/{len(rar_files)} files processed")
560
- log_message(f'πŸ“Š Extracted: {processing_status["extracted_courses"]} courses')
561
- log_message(f'πŸ“Š Videos Processed: {processing_status["extracted_videos"]}')
562
- log_message(f'πŸ“Š Frames Extracted: {processing_status["extracted_frames_count"]}')
563
- log_message(f'πŸ“Š Cursors Tracked: {processing_status["tracked_cursors_count"]}')
564
- log_message(f'πŸ“Š Failed: {processing_status["failed_files"]} files')
565
-
566
- if next_index < len(rar_files):
567
- log_message(f"πŸ”„ Run the script again to process the next file: {os.path.basename(rar_files[next_index])}")
568
- else:
569
- log_message("πŸŽ‰ All files have been processed!")
570
- else:
571
- log_message("βœ… All files have been processed!")
572
-
573
- log_message("πŸŽ‰ Processing complete!")
574
- log_message(f'πŸ“Š Final stats: {processing_status["extracted_courses"]} courses extracted, {processing_status["extracted_videos"]} videos processed, {processing_status["extracted_frames_count"]} frames extracted, {processing_status["tracked_cursors_count"]} cursors tracked')
575
-
576
- except KeyboardInterrupt:
577
- log_message("⏹️ Processing interrupted by user")
578
- except Exception as e:
579
- log_message(f"❌ Fatal error: {str(e)}")
580
- finally:
581
- processing_status["is_running"] = False
582
- cleanup_temp_files()
583
-
584
- # Expose necessary functions and variables for download_api.py
585
- __all__ = [
586
- "main_processing_loop",
587
- "processing_status",
588
- "CURSOR_TRACKING_OUTPUT_FOLDER",
589
- "CURSOR_TEMPLATES_DIR",
590
- "log_message",
591
- "send_email_with_attachment",
592
- "track_cursor",
593
- "extract_frames",
594
- "DEFAULT_FPS",
595
- "CURSOR_THRESHOLD",
596
- "ensure_dir"
597
- ]
598
-
599
-
600
-
601
-
 
1
+ import os
2
+ import json
3
+ import requests
4
+ import subprocess
5
+ import shutil
6
+ import time
7
+ import re
8
+ import threading
9
+ from typing import Dict, List, Set, Optional
10
+ from huggingface_hub import HfApi, list_repo_files
11
+
12
+ import cv2
13
+ import numpy as np
14
+ from pathlib import Path
15
+ import smtplib
16
+ from email.message import EmailMessage
17
+
18
+ # ==== CONFIGURATION ====
19
+ HF_TOKEN = os.getenv("HF_TOKEN", "")
20
+ SOURCE_REPO_ID = os.getenv("SOURCE_REPO", "Fred808/BG1")
21
+
22
+ # Path Configuration
23
+ DOWNLOAD_FOLDER = "downloads"
24
+ EXTRACT_FOLDER = "extracted"
25
+ FRAMES_OUTPUT_FOLDER = "extracted_frames" # New folder for extracted frames
26
+ CURSOR_TRACKING_OUTPUT_FOLDER = "cursor_tracking_results" # New folder for cursor tracking results
27
+ CURSOR_TEMPLATES_DIR = "cursors"
28
+
29
+ os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)
30
+ os.makedirs(EXTRACT_FOLDER, exist_ok=True)
31
+ os.makedirs(FRAMES_OUTPUT_FOLDER, exist_ok=True)
32
+ os.makedirs(CURSOR_TRACKING_OUTPUT_FOLDER, exist_ok=True)
33
+ os.makedirs(CURSOR_TEMPLATES_DIR, exist_ok=True) # Ensure cursor templates directory exists
34
+
35
+ # State Files
36
+ DOWNLOAD_STATE_FILE = "download_progress.json"
37
+ PROCESS_STATE_FILE = "process_progress.json"
38
+ FAILED_FILES_LOG = "failed_files.log"
39
+
40
+ # Processing Parameters
41
+ CHUNK_SIZE = 1
42
+ PROCESSING_DELAY = 2
43
+ MAX_RETRIES = 3
44
+ MIN_FREE_SPACE_GB = 2 # Minimum free space in GB before processing
45
+
46
+ # Frame Extraction Parameters
47
+ DEFAULT_FPS = 3 # Default frames per second for extraction
48
+
49
+ # Cursor Tracking Parameters
50
+ CURSOR_THRESHOLD = 0.8
51
+
52
+ # Initialize HF API
53
+ hf_api = HfApi(token=HF_TOKEN)
54
+
55
+ # Global State
56
+ processing_status = {
57
+ "is_running": False,
58
+ "current_file": None,
59
+ "total_files": 0,
60
+ "processed_files": 0,
61
+ "failed_files": 0,
62
+ "extracted_courses": 0,
63
+ "extracted_videos": 0,
64
+ "extracted_frames_count": 0,
65
+ "tracked_cursors_count": 0,
66
+ "last_update": None,
67
+ "logs": []
68
+ }
69
+
70
+ def log_message(message: str):
71
+ """Log messages with timestamp"""
72
+ timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
73
+ log_entry = f"[{timestamp}] {message}"
74
+ print(log_entry)
75
+ processing_status["logs"].append(log_entry)
76
+ processing_status["last_update"] = timestamp
77
+ if len(processing_status["logs"]) > 100:
78
+ processing_status["logs"] = processing_status["logs"][-100:]
79
+
80
+ def log_failed_file(filename: str, error: str):
81
+ """Log failed files to persistent file"""
82
+ with open(FAILED_FILES_LOG, "a") as f:
83
+ f.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {filename}: {error}\n")
84
+
85
+ def get_disk_usage(path: str) -> Dict[str, float]:
86
+ """Get disk usage statistics in GB"""
87
+ statvfs = os.statvfs(path)
88
+ total = statvfs.f_frsize * statvfs.f_blocks / (1024**3)
89
+ free = statvfs.f_frsize * statvfs.f_bavail / (1024**3)
90
+ used = total - free
91
+ return {"total": total, "free": free, "used": used}
92
+
93
+ def check_disk_space(path: str = ".") -> bool:
94
+ """Check if there's enough disk space"""
95
+ disk_info = get_disk_usage(path)
96
+ if disk_info["free"] < MIN_FREE_SPACE_GB:
97
+ log_message(f'⚠️ Low disk space: {disk_info["free"]:.2f}GB free, {disk_info["used"]:.2f}GB used')
98
+ return False
99
+ return True
100
+
101
+ def cleanup_temp_files():
102
+ """Clean up temporary files to free space"""
103
+ log_message("🧹 Cleaning up temporary files...")
104
+
105
+ # Clean old downloads (keep only current processing file)
106
+ current_file = processing_status.get("current_file")
107
+ for file in os.listdir(DOWNLOAD_FOLDER):
108
+ if file != current_file and file.endswith((".rar", ".zip")):
109
+ try:
110
+ os.remove(os.path.join(DOWNLOAD_FOLDER, file))
111
+ log_message(f"πŸ—‘οΈ Removed old download: {file}")
112
+ except:
113
+ pass
114
+
115
+ def load_json_state(file_path: str, default_value):
116
+ """Load state from JSON file"""
117
+ if os.path.exists(file_path):
118
+ try:
119
+ with open(file_path, "r") as f:
120
+ return json.load(f)
121
+ except json.JSONDecodeError:
122
+ log_message(f"⚠️ Corrupted state file: {file_path}")
123
+ return default_value
124
+
125
+ def save_json_state(file_path: str, data):
126
+ """Save state to JSON file"""
127
+ with open(file_path, "w") as f:
128
+ json.dump(data, f, indent=2)
129
+
130
+ def download_with_retry(url: str, dest_path: str, max_retries: int = 3) -> bool:
131
+ """Download file with retry logic and disk space checking"""
132
+ if not check_disk_space():
133
+ cleanup_temp_files()
134
+ if not check_disk_space():
135
+ log_message("❌ Insufficient disk space even after cleanup")
136
+ return False
137
+
138
+ headers = {"Authorization": f"Bearer {HF_TOKEN}"}
139
+ for attempt in range(max_retries):
140
+ try:
141
+ with requests.get(url, headers=headers, stream=True) as r:
142
+ r.raise_for_status()
143
+
144
+ # Check content length if available
145
+ content_length = r.headers.get("content-length")
146
+ if content_length:
147
+ size_gb = int(content_length) / (1024**3)
148
+ disk_info = get_disk_usage(".")
149
+ if size_gb > disk_info["free"] - 0.5: # Leave 0.5GB buffer
150
+ log_message(f'❌ File too large: {size_gb:.2f}GB, only {disk_info["free"]:.2f}GB free')
151
+ return False
152
+
153
+ with open(dest_path, "wb") as f:
154
+ for chunk in r.iter_content(chunk_size=8192):
155
+ f.write(chunk)
156
+ return True
157
+ except Exception as e:
158
+ if attempt < max_retries - 1:
159
+ time.sleep(2 ** attempt)
160
+ continue
161
+ log_message(f"❌ Download failed after {max_retries} attempts: {e}")
162
+ return False
163
+ return False
164
+
165
+ def is_multipart_rar(filename: str) -> bool:
166
+ """Check if this is a multi-part RAR file"""
167
+ return ".part" in filename.lower() and filename.lower().endswith(".rar")
168
+
169
+ def get_rar_part_base(filename: str) -> str:
170
+ """Get the base name for multi-part RAR files"""
171
+ if ".part" in filename.lower():
172
+ return filename.split(".part")[0]
173
+ return filename.replace(".rar", "")
174
+
175
+ def extract_with_retry(rar_path: str, output_dir: str, max_retries: int = 2) -> bool:
176
+ """Extract RAR with retry and recovery, handling multi-part archives"""
177
+ filename = os.path.basename(rar_path)
178
+
179
+ # For multi-part RARs, we need the first part
180
+ if is_multipart_rar(filename):
181
+ base_name = get_rar_part_base(filename)
182
+ first_part = f"{base_name}.part01.rar"
183
+ first_part_path = os.path.join(os.path.dirname(rar_path), first_part)
184
+
185
+ if not os.path.exists(first_part_path):
186
+ log_message(f"⚠️ Multi-part RAR detected but first part not found: {first_part}")
187
+ return False
188
+
189
+ rar_path = first_part_path
190
+ log_message(f"πŸ“¦ Processing multi-part RAR starting with: {first_part}")
191
+
192
+ for attempt in range(max_retries):
193
+ try:
194
+ # Test RAR first
195
+ test_cmd = ["unrar", "t", rar_path]
196
+ test_result = subprocess.run(test_cmd, capture_output=True, text=True)
197
+ if test_result.returncode != 0:
198
+ log_message(f"⚠️ RAR test failed: {test_result.stderr}")
199
+ if attempt == max_retries - 1:
200
+ return False
201
+ continue
202
+
203
+ # Extract RAR
204
+ cmd = ["unrar", "x", "-o+", rar_path, output_dir]
205
+ if attempt > 0: # Try recovery on subsequent attempts
206
+ cmd.insert(2, "-kb")
207
+
208
+ result = subprocess.run(cmd, capture_output=True, text=True)
209
+ if result.returncode == 0:
210
+ log_message(f"βœ… Successfully extracted: {os.path.basename(rar_path)}")
211
+ return True
212
+ else:
213
+ error_msg = result.stderr or result.stdout
214
+ log_message(f"⚠️ Extraction attempt {attempt + 1} failed: {error_msg}")
215
+
216
+ if "checksum error" in error_msg.lower() or "CRC failed" in error_msg:
217
+ log_message(f"⚠️ Data corruption detected, attempt {attempt + 1}")
218
+ elif result.returncode == 10:
219
+ log_message(f"⚠️ No files to extract (exit code 10)")
220
+ return False
221
+ elif result.returncode == 1:
222
+ log_message(f"⚠️ Non-fatal error (exit code 1)")
223
+
224
+ except Exception as e:
225
+ log_message(f"❌ Extraction exception: {str(e)}")
226
+ if attempt == max_retries - 1:
227
+ return False
228
+ time.sleep(1)
229
+
230
+ return False
231
+
232
+ # --- Frame Extraction Utilities ---
233
+ def ensure_dir(path):
234
+ os.makedirs(path, exist_ok=True)
235
+
236
+ def extract_frames(video_path, output_dir, fps=DEFAULT_FPS):
237
+ """Extract frames from video at the specified frames per second (fps)."""
238
+ log_message(f"[INFO] Extracting frames from {video_path} to {output_dir} at {fps} fps...")
239
+ ensure_dir(output_dir)
240
+ cap = cv2.VideoCapture(str(video_path))
241
+ if not cap.isOpened():
242
+ log_message(f"[ERROR] Failed to open video file: {video_path}")
243
+ return 0
244
+ video_fps = cap.get(cv2.CAP_PROP_FPS)
245
+ # log_message(f"[DEBUG] Video FPS: {video_fps}")
246
+ if not video_fps or video_fps <= 0:
247
+ video_fps = 30 # fallback if FPS is not available
248
+ log_message(f"[WARN] Using fallback FPS: {video_fps}")
249
+ frame_interval = int(round(video_fps / fps))
250
+ # log_message(f"[DEBUG] Frame interval: {frame_interval}")
251
+ frame_idx = 0
252
+ saved_idx = 1
253
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
254
+ log_message(f"[DEBUG] Total frames in video: {total_frames}")
255
+ while cap.isOpened():
256
+ ret, frame = cap.read()
257
+ if not ret:
258
+ # log_message(f"[DEBUG] No more frames to read at frame_idx {frame_idx}.")
259
+ break
260
+ if frame_idx % frame_interval == 0:
261
+ frame_name = f"{saved_idx:04d}.png"
262
+ cv2.imwrite(str(Path(output_dir) / frame_name), frame)
263
+ # log_message(f"[DEBUG] Saved frame {frame_idx} as {frame_name}")
264
+ saved_idx += 1
265
+ frame_idx += 1
266
+ cap.release()
267
+ log_message(f"Extracted {saved_idx-1} frames from {video_path} to {output_dir}")
268
+ return saved_idx - 1
269
+
270
+ # --- Cursor Tracking Utilities ---
271
+ def to_rgb(img):
272
+ if img is None:
273
+ return None
274
+ if len(img.shape) == 2:
275
+ return cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
276
+ if img.shape[2] == 4:
277
+ return cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)
278
+ return img
279
+
280
+ def get_mask_from_alpha(template_img):
281
+ if template_img is not None and len(template_img.shape) == 3 and template_img.shape[2] == 4:
282
+ # Use alpha channel as mask (nonzero alpha = 255)
283
+ return (template_img[:, :, 3] > 0).astype(np.uint8) * 255
284
+ return None
285
+
286
+ def detect_cursor_in_frame_multi(frame, cursor_templates, threshold=CURSOR_THRESHOLD):
287
+ """Detect cursor position in a frame using multiple templates. Returns best match above threshold."""
288
+ best_pos = None
289
+ best_conf = -1
290
+ best_template_name = None
291
+ frame_rgb = to_rgb(frame)
292
+ for template_name, cursor_template in cursor_templates.items():
293
+ template_rgb = to_rgb(cursor_template)
294
+ mask = get_mask_from_alpha(cursor_template)
295
+ if template_rgb is None or frame_rgb is None or template_rgb.shape[2] != frame_rgb.shape[2]:
296
+ log_message(f"[WARN] Skipping template {template_name} due to channel mismatch or load error.")
297
+ continue
298
+ try:
299
+ result = cv2.matchTemplate(frame_rgb, template_rgb, cv2.TM_CCOEFF_NORMED, mask=mask)
300
+ except Exception as e:
301
+ log_message(f"[WARN] matchTemplate failed for {template_name}: {e}")
302
+ continue
303
+ min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result)
304
+ if max_val > best_conf:
305
+ best_conf = max_val
306
+ if max_val >= threshold:
307
+ cursor_w, cursor_h = template_rgb.shape[1], template_rgb.shape[0]
308
+ cursor_x = max_loc[0] + cursor_w // 2
309
+ cursor_y = max_loc[1] + cursor_h // 2
310
+ best_pos = (cursor_x, cursor_y)
311
+ best_template_name = template_name
312
+ if best_conf >= threshold:
313
+ return best_pos, best_conf, best_template_name
314
+ return None, best_conf, None
315
+
316
+ def send_email_with_attachment(subject, body, to_email, from_email, app_password, attachment_path):
317
+ msg = EmailMessage()
318
+ msg["Subject"] = subject
319
+ msg["From"] = from_email
320
+ msg["To"] = to_email
321
+ msg.set_content(body)
322
+ with open(attachment_path, "rb") as f:
323
+ file_data = f.read()
324
+ file_name = Path(attachment_path).name
325
+ msg.add_attachment(file_data, maintype="application", subtype="octet-stream", filename=file_name)
326
+ try:
327
+ with smtplib.SMTP_SSL("smtp.gmail.com", 465) as smtp:
328
+ smtp.login(from_email, app_password)
329
+ smtp.send_message(msg)
330
+ log_message(f"[SUCCESS] Email sent to {to_email}")
331
+ except Exception as e:
332
+ log_message(f"[ERROR] Failed to send email: {e}")
333
+
334
+ def track_cursor(frames_dir, cursor_templates_dir, output_json_path, threshold=CURSOR_THRESHOLD, start_frame=1, email_results=False):
335
+ """Detect cursor in each frame using multiple templates, print status, and write positions to a JSON file."""
336
+ log_message(f"[INFO] Tracking cursors in {frames_dir}...")
337
+ frames_dir = Path(frames_dir).resolve()
338
+ output_json_path = Path(output_json_path).resolve()
339
+ cursor_templates_dir = Path(cursor_templates_dir).resolve()
340
+ # log_message(f"[DEBUG] frames_dir: {frames_dir}")
341
+ # log_message(f"[DEBUG] cursor_templates_dir: {cursor_templates_dir}")
342
+ # log_message(f"[DEBUG] output_json_path: {output_json_path}")
343
+ ensure_dir(frames_dir)
344
+ ensure_dir(output_json_path.parent)
345
+ # Load all PNG templates from the cursor_templates_dir
346
+ cursor_templates = {}
347
+ for template_file in cursor_templates_dir.glob("*.png"):
348
+ template_img = cv2.imread(str(template_file), cv2.IMREAD_UNCHANGED)
349
+ if template_img is not None:
350
+ cursor_templates[template_file.name] = template_img
351
+ else:
352
+ log_message(f"[WARN] Could not load template: {template_file}")
353
+ if not cursor_templates:
354
+ log_message(f"[ERROR] No cursor templates found in: {cursor_templates_dir}")
355
+ return 0
356
+ results = []
357
+ tracked_count = 0
358
+ for frame_file in sorted(frames_dir.glob("*.png")):
359
+ frame_num = int(frame_file.stem)
360
+ if frame_num < start_frame:
361
+ continue
362
+ frame = cv2.imread(str(frame_file), cv2.IMREAD_UNCHANGED)
363
+ if frame is None:
364
+ log_message(f"[WARN] Could not load frame: {frame_file}")
365
+ continue
366
+ pos, conf, template_name = detect_cursor_in_frame_multi(frame, cursor_templates, threshold)
367
+ if pos is not None:
368
+ # log_message(f"{frame_file.name}: Cursor at {pos} (template: {template_name})")
369
+ results.append({
370
+ "frame": frame_file.name,
371
+ "cursor_active": True,
372
+ "x": pos[0],
373
+ "y": pos[1],
374
+ "confidence": conf,
375
+ "template": template_name
376
+ })
377
+ tracked_count += 1
378
+ else:
379
+ # log_message(f"{frame_file.name}: Cursor disabled")
380
+ results.append({
381
+ "frame": frame_file.name,
382
+ "cursor_active": False,
383
+ "x": None,
384
+ "y": None,
385
+ "confidence": conf,
386
+ "template": None
387
+ })
388
+ try:
389
+ with open(output_json_path, "w") as f:
390
+ json.dump(results, f, indent=2)
391
+ log_message(f"[SUCCESS] Cursor tracking results saved to {output_json_path}")
392
+ if email_results:
393
+ log_message("[INFO] Preparing to email results...")
394
+ to_email = os.environ.get("TO_EMAIL")
395
+ from_email = os.environ.get("FROM_EMAIL")
396
+ app_password = os.environ.get("GMAIL_APP_PASSWORD")
397
+ if not (to_email and from_email and app_password):
398
+ log_message("[ERROR] Email environment variables not set. Please set TO_EMAIL, FROM_EMAIL, and GMAIL_APP_PASSWORD.")
399
+ # return tracked_count # Don't return here, just log error
400
+ else:
401
+ send_email_with_attachment(
402
+ subject="Cursor Tracking Results",
403
+ body="See attached JSON results.",
404
+ to_email=to_email,
405
+ from_email=from_email,
406
+ app_password=app_password,
407
+ attachment_path=output_json_path
408
+ )
409
+ except Exception as e:
410
+ log_message(f"[ERROR] Failed to write output JSON: {e}")
411
+ # raise # Don't raise, just log error
412
+ return tracked_count
413
+
414
+ def process_rar_file(rar_path: str) -> bool:
415
+ """Process a single RAR file - extract, then process videos for frames and cursor tracking"""
416
+ filename = os.path.basename(rar_path)
417
+ processing_status["current_file"] = filename
418
+
419
+ # Handle multi-part RAR naming
420
+ if is_multipart_rar(filename):
421
+ course_name = get_rar_part_base(filename)
422
+ else:
423
+ course_name = filename.replace(".rar", "")
424
+
425
+ extract_dir = os.path.join(EXTRACT_FOLDER, course_name)
426
+
427
+ try:
428
+ log_message(f"πŸ”„ Processing: {filename}")
429
+
430
+ # Clean up any existing directory
431
+ if os.path.exists(extract_dir):
432
+ shutil.rmtree(extract_dir, ignore_errors=True)
433
+
434
+ # Extract RAR
435
+ os.makedirs(extract_dir, exist_ok=True)
436
+ if not extract_with_retry(rar_path, extract_dir):
437
+ raise Exception("RAR extraction failed")
438
+
439
+ # Count extracted files
440
+ file_count = 0
441
+ video_files_found = []
442
+ for root, dirs, files in os.walk(extract_dir):
443
+ for file in files:
444
+ file_count += 1
445
+ if file.lower().endswith((".mp4", ".avi", ".mov", ".mkv")):
446
+ video_files_found.append(os.path.join(root, file))
447
+
448
+ processing_status["extracted_courses"] += 1
449
+ log_message(f"βœ… Successfully extracted \'{course_name}\' ({file_count} files, {len(video_files_found)} videos)")
450
+
451
+ # Process video files for frame extraction and cursor tracking
452
+ for video_path in video_files_found:
453
+ video_filename = Path(video_path).name
454
+ # Create a unique output directory for frames for each video
455
+ frames_output_dir = os.path.join(FRAMES_OUTPUT_FOLDER, f"{course_name}_{video_filename.replace('.', '_')}_frames")
456
+ ensure_dir(frames_output_dir)
457
+
458
+ extracted_frames_count = extract_frames(video_path, frames_output_dir, fps=DEFAULT_FPS)
459
+ processing_status["extracted_frames_count"] += extracted_frames_count
460
+ if extracted_frames_count > 0:
461
+ processing_status["extracted_videos"] += 1
462
+ log_message(f"[INFO] Extracted {extracted_frames_count} frames from {video_filename}")
463
+
464
+ # Perform cursor tracking on the extracted frames
465
+ cursor_output_json = os.path.join(CURSOR_TRACKING_OUTPUT_FOLDER, f"{course_name}_{video_filename.replace('.', '_')}_cursor_data.json")
466
+ tracked_cursors = track_cursor(frames_output_dir, CURSOR_TEMPLATES_DIR, cursor_output_json, threshold=CURSOR_THRESHOLD)
467
+ processing_status["tracked_cursors_count"] += tracked_cursors
468
+ log_message(f"[INFO] Tracked {tracked_cursors} cursors in frames from {video_filename}")
469
+ else:
470
+ log_message(f"[WARN] No frames extracted from {video_filename}")
471
+
472
+ return True
473
+
474
+ except Exception as e:
475
+ error_msg = str(e)
476
+ log_message(f"❌ Processing failed: {error_msg}")
477
+ log_failed_file(filename, error_msg)
478
+ return False
479
+
480
+ finally:
481
+ processing_status["current_file"] = None
482
+
483
+ def main_processing_loop(start_index: int = 0):
484
+ """Main processing workflow - extraction, frame extraction, and cursor tracking"""
485
+ processing_status["is_running"] = True
486
+
487
+ try:
488
+ # Load state
489
+ processed_rars = load_json_state(PROCESS_STATE_FILE, {"processed_rars": []})["processed_rars"]
490
+ download_state = load_json_state(DOWNLOAD_STATE_FILE, {"next_download_index": 4})
491
+
492
+ # Use start_index if provided, otherwise use the saved state
493
+ next_index = start_index if start_index > 0 else download_state["next_download_index"]
494
+
495
+ log_message(f"πŸ“Š Starting from index {next_index}")
496
+ log_message(f"πŸ“Š Previously processed: {len(processed_rars)} files")
497
+
498
+ # Get file list
499
+ try:
500
+ files = list(hf_api.list_repo_files(repo_id=SOURCE_REPO_ID, repo_type="dataset"))
501
+ rar_files = sorted([f for f in files if f.endswith(".rar")])
502
+
503
+ processing_status["total_files"] = len(rar_files)
504
+ log_message(f"πŸ“ Found {len(rar_files)} RAR files in repository")
505
+
506
+ if next_index >= len(rar_files):
507
+ log_message("βœ… All files have been processed!")
508
+ return
509
+
510
+ except Exception as e:
511
+ log_message(f"❌ Failed to get file list: {str(e)}")
512
+ return
513
+
514
+ # Process only one file per run
515
+ if next_index < len(rar_files):
516
+ rar_file = rar_files[next_index]
517
+ filename = os.path.basename(rar_file)
518
+
519
+ if filename in processed_rars:
520
+ log_message(f"⏭️ Skipping already processed: {filename}")
521
+ processing_status["processed_files"] += 1
522
+ # Move to next file
523
+ next_index += 1
524
+ save_json_state(DOWNLOAD_STATE_FILE, {"next_download_index": next_index})
525
+ log_message(f"πŸ“Š Moving to next file. Progress: {next_index}/{len(rar_files)}")
526
+ return
527
+
528
+ log_message(f"πŸ“₯ Downloading: {filename}")
529
+ dest_path = os.path.join(DOWNLOAD_FOLDER, filename)
530
+
531
+ # Download file
532
+ download_url = f"https://huggingface.co/datasets/{SOURCE_REPO_ID}/resolve/main/{rar_file}"
533
+ if download_with_retry(download_url, dest_path):
534
+ # Process file
535
+ if process_rar_file(dest_path):
536
+ processed_rars.append(filename)
537
+ save_json_state(PROCESS_STATE_FILE, {"processed_rars": processed_rars})
538
+ log_message(f"βœ… Successfully processed: {filename}")
539
+ processing_status["processed_files"] += 1
540
+ else:
541
+ log_message(f"❌ Failed to process: {filename}")
542
+ processing_status["failed_files"] += 1
543
+
544
+ # Clean up downloaded file
545
+ try:
546
+ os.remove(dest_path)
547
+ log_message(f"πŸ—‘οΈ Cleaned up download: {filename}")
548
+ except:
549
+ pass
550
+ else:
551
+ log_message(f"❌ Failed to download: {filename}")
552
+ processing_status["failed_files"] += 1
553
+
554
+ # Update download state for next run
555
+ next_index += 1
556
+ save_json_state(DOWNLOAD_STATE_FILE, {"next_download_index": next_index})
557
+
558
+ # Status update
559
+ log_message(f"πŸ“Š Progress: {next_index}/{len(rar_files)} files processed")
560
+ log_message(f'πŸ“Š Extracted: {processing_status["extracted_courses"]} courses')
561
+ log_message(f'πŸ“Š Videos Processed: {processing_status["extracted_videos"]}')
562
+ log_message(f'πŸ“Š Frames Extracted: {processing_status["extracted_frames_count"]}')
563
+ log_message(f'πŸ“Š Cursors Tracked: {processing_status["tracked_cursors_count"]}')
564
+ log_message(f'πŸ“Š Failed: {processing_status["failed_files"]} files')
565
+
566
+ if next_index < len(rar_files):
567
+ log_message(f"πŸ”„ Run the script again to process the next file: {os.path.basename(rar_files[next_index])}")
568
+ else:
569
+ log_message("πŸŽ‰ All files have been processed!")
570
+ else:
571
+ log_message("βœ… All files have been processed!")
572
+
573
+ log_message("πŸŽ‰ Processing complete!")
574
+ log_message(f'πŸ“Š Final stats: {processing_status["extracted_courses"]} courses extracted, {processing_status["extracted_videos"]} videos processed, {processing_status["extracted_frames_count"]} frames extracted, {processing_status["tracked_cursors_count"]} cursors tracked')
575
+
576
+ except KeyboardInterrupt:
577
+ log_message("⏹️ Processing interrupted by user")
578
+ except Exception as e:
579
+ log_message(f"❌ Fatal error: {str(e)}")
580
+ finally:
581
+ processing_status["is_running"] = False
582
+ cleanup_temp_files()
583
+
584
+ # Expose necessary functions and variables for download_api.py
585
+ __all__ = [
586
+ "main_processing_loop",
587
+ "processing_status",
588
+ "CURSOR_TRACKING_OUTPUT_FOLDER",
589
+ "CURSOR_TEMPLATES_DIR",
590
+ "log_message",
591
+ "send_email_with_attachment",
592
+ "track_cursor",
593
+ "extract_frames",
594
+ "DEFAULT_FPS",
595
+ "CURSOR_THRESHOLD",
596
+ "ensure_dir"
597
+ ]
598
+
599
+
600
+
601
+