Fred808 committed on
Commit
dac6bd5
·
verified ·
1 Parent(s): 15fc156

Update cursor_tracker.py

Files changed (1)
  1. cursor_tracker.py +266 -136
cursor_tracker.py CHANGED
@@ -1,4 +1,3 @@
-import os
 import json
 import requests
 import subprocess
@@ -14,6 +13,7 @@ import numpy as np
 from pathlib import Path
 import smtplib
 from email.message import EmailMessage
+import multiprocessing
 
 # ==== CONFIGURATION ====
 HF_TOKEN = os.getenv("HF_TOKEN", "")
@@ -22,15 +22,15 @@ SOURCE_REPO_ID = os.getenv("SOURCE_REPO", "Fred808/BG1")
 # Path Configuration
 DOWNLOAD_FOLDER = "downloads"
 EXTRACT_FOLDER = "extracted"
-FRAMES_OUTPUT_FOLDER = "extracted_frames" # New folder for extracted frames
-CURSOR_TRACKING_OUTPUT_FOLDER = "cursor_tracking_results" # New folder for cursor tracking results
+FRAMES_OUTPUT_FOLDER = "extracted_frames" # New folder for extracted frames
+CURSOR_TRACKING_OUTPUT_FOLDER = "cursor_tracking_results" # New folder for cursor tracking results
 CURSOR_TEMPLATES_DIR = "cursors"
 
 os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)
 os.makedirs(EXTRACT_FOLDER, exist_ok=True)
 os.makedirs(FRAMES_OUTPUT_FOLDER, exist_ok=True)
 os.makedirs(CURSOR_TRACKING_OUTPUT_FOLDER, exist_ok=True)
-os.makedirs(CURSOR_TEMPLATES_DIR, exist_ok=True) # Ensure cursor templates directory exists
+os.makedirs(CURSOR_TEMPLATES_DIR, exist_ok=True) # Ensure cursor templates directory exists
 
 # State Files
 DOWNLOAD_STATE_FILE = "download_progress.json"
@@ -44,7 +44,7 @@ MAX_RETRIES = 3
 MIN_FREE_SPACE_GB = 2 # Minimum free space in GB before processing
 
 # Frame Extraction Parameters
-DEFAULT_FPS = 3 # Default frames per second for extraction
+DEFAULT_FPS = 3 # Default frames per second for extraction
 
 # Cursor Tracking Parameters
 CURSOR_THRESHOLD = 0.8
@@ -67,6 +67,7 @@ processing_status = {
     "logs": []
 }
 
+
 def log_message(message: str):
     """Log messages with timestamp"""
     timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
@@ -77,11 +78,13 @@ def log_message(message: str):
     if len(processing_status["logs"]) > 100:
         processing_status["logs"] = processing_status["logs"][-100:]
 
+
 def log_failed_file(filename: str, error: str):
     """Log failed files to persistent file"""
     with open(FAILED_FILES_LOG, "a") as f:
         f.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {filename}: {error}\n")
 
+
 def get_disk_usage(path: str) -> Dict[str, float]:
     """Get disk usage statistics in GB"""
     statvfs = os.statvfs(path)
@@ -90,6 +93,7 @@ def get_disk_usage(path: str) -> Dict[str, float]:
     used = total - free
     return {"total": total, "free": free, "used": used}
 
+
 def check_disk_space(path: str = ".") -> bool:
     """Check if there's enough disk space"""
     disk_info = get_disk_usage(path)
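
The free-space math above rests on `os.statvfs`; free bytes are typically computed as `f_frsize * f_bavail`, which is our assumption here since the body of `get_disk_usage` is only partially shown. A minimal POSIX-only sketch of the same check, with helper names that are ours rather than the module's (`shutil.disk_usage` is a portable cross-check):

import os
import shutil

MIN_FREE_SPACE_GB = 2  # same threshold the module uses

def free_gb(path: str = ".") -> float:
    st = os.statvfs(path)  # POSIX only, like the module's get_disk_usage
    return (st.f_frsize * st.f_bavail) / (1024 ** 3)

def enough_space(path: str = ".") -> bool:
    return free_gb(path) >= MIN_FREE_SPACE_GB

print(f"{free_gb():.2f} GB free, ok={enough_space()}")
print(shutil.disk_usage("."))  # portable equivalent, reported in bytes
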
@@ -98,10 +102,11 @@ def check_disk_space(path: str = ".") -> bool:
         return False
     return True
 
+
 def cleanup_temp_files():
     """Clean up temporary files to free space"""
     log_message("🧹 Cleaning up temporary files...")
-
+
     # Clean old downloads (keep only current processing file)
     current_file = processing_status.get("current_file")
     for file in os.listdir(DOWNLOAD_FOLDER):
@@ -109,9 +114,10 @@ def cleanup_temp_files():
         try:
             os.remove(os.path.join(DOWNLOAD_FOLDER, file))
             log_message(f"🗑️ Removed old download: {file}")
-        except:
+        except Exception:
             pass
 
+
 def load_json_state(file_path: str, default_value):
     """Load state from JSON file"""
     if os.path.exists(file_path):
@@ -122,11 +128,13 @@
             log_message(f"⚠️ Corrupted state file: {file_path}")
     return default_value
 
+
 def save_json_state(file_path: str, data):
     """Save state to JSON file"""
     with open(file_path, "w") as f:
         json.dump(data, f, indent=2)
 
+
 def download_with_retry(url: str, dest_path: str, max_retries: int = 3) -> bool:
     """Download file with retry logic and disk space checking"""
     if not check_disk_space():
@@ -134,13 +142,13 @@ def download_with_retry(url: str, dest_path: str, max_retries: int = 3) -> bool:
         if not check_disk_space():
             log_message("❌ Insufficient disk space even after cleanup")
             return False
-
-    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
+
+    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
     for attempt in range(max_retries):
         try:
             with requests.get(url, headers=headers, stream=True) as r:
                 r.raise_for_status()
-
+
                 # Check content length if available
                 content_length = r.headers.get("content-length")
                 if content_length:
@@ -149,10 +157,11 @@
                     if size_gb > disk_info["free"] - 0.5: # Leave 0.5GB buffer
                         log_message(f'❌ File too large: {size_gb:.2f}GB, only {disk_info["free"]:.2f}GB free')
                         return False
-
+
                 with open(dest_path, "wb") as f:
                     for chunk in r.iter_content(chunk_size=8192):
-                        f.write(chunk)
+                        if chunk:
+                            f.write(chunk)
                 return True
         except Exception as e:
             if attempt < max_retries - 1:
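
Seen in isolation, the download path combines an optional bearer header, a streamed GET so a multi-gigabyte RAR never sits in memory, chunked writes, and a retry loop. A condensed standalone sketch of the same pattern (the URL/token are placeholders, and the exponential backoff is our choice; the module's sleep logic between retries is not fully shown):

import time
import requests

def fetch(url: str, dest: str, token: str = "", retries: int = 3) -> bool:
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    for attempt in range(retries):
        try:
            with requests.get(url, headers=headers, stream=True, timeout=60) as r:
                r.raise_for_status()
                with open(dest, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:  # skip keep-alive chunks
                            f.write(chunk)
            return True
        except requests.RequestException:
            if attempt < retries - 1:
                time.sleep(2 ** attempt)  # back off, then retry
    return False
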
@@ -162,33 +171,36 @@
                 return False
     return False
 
+
 def is_multipart_rar(filename: str) -> bool:
     """Check if this is a multi-part RAR file"""
     return ".part" in filename.lower() and filename.lower().endswith(".rar")
 
+
 def get_rar_part_base(filename: str) -> str:
     """Get the base name for multi-part RAR files"""
     if ".part" in filename.lower():
         return filename.split(".part")[0]
     return filename.replace(".rar", "")
 
+
 def extract_with_retry(rar_path: str, output_dir: str, max_retries: int = 2) -> bool:
     """Extract RAR with retry and recovery, handling multi-part archives"""
     filename = os.path.basename(rar_path)
-
+
     # For multi-part RARs, we need the first part
     if is_multipart_rar(filename):
         base_name = get_rar_part_base(filename)
         first_part = f"{base_name}.part01.rar"
         first_part_path = os.path.join(os.path.dirname(rar_path), first_part)
-
+
         if not os.path.exists(first_part_path):
             log_message(f"⚠️ Multi-part RAR detected but first part not found: {first_part}")
             return False
-
+
         rar_path = first_part_path
         log_message(f"📦 Processing multi-part RAR starting with: {first_part}")
-
+
     for attempt in range(max_retries):
         try:
             # Test RAR first
@@ -199,12 +211,12 @@ def extract_with_retry(rar_path: str, output_dir: str, max_retries: int = 2) ->
                 if attempt == max_retries - 1:
                     return False
                 continue
-
+
             # Extract RAR
             cmd = ["unrar", "x", "-o+", rar_path, output_dir]
             if attempt > 0: # Try recovery on subsequent attempts
                 cmd.insert(2, "-kb")
-
+
             result = subprocess.run(cmd, capture_output=True, text=True)
             if result.returncode == 0:
                 log_message(f"✅ Successfully extracted: {os.path.basename(rar_path)}")
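
The extraction step shells out to `unrar`: `t` verifies the archive without writing anything, `x` extracts with full paths, and `-o+` overwrites silently; for multi-volume sets, pointing `unrar` at the `.part01` file lets it consume the remaining volumes on its own. A minimal sketch of that test-then-extract sequence (requires the `unrar` binary on PATH; paths are placeholders):

import subprocess

def test_and_extract(rar_path: str, output_dir: str) -> bool:
    # "t" = integrity test, no files written
    if subprocess.run(["unrar", "t", rar_path], capture_output=True).returncode != 0:
        return False
    # "x" = extract with paths, "-o+" = overwrite existing files without prompting
    result = subprocess.run(["unrar", "x", "-o+", rar_path, output_dir],
                            capture_output=True, text=True)
    return result.returncode == 0

print(test_and_extract("downloads/course.part01.rar", "extracted/course"))
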
@@ -212,7 +224,7 @@
             else:
                 error_msg = result.stderr or result.stdout
                 log_message(f"⚠️ Extraction attempt {attempt + 1} failed: {error_msg}")
-
+
                 if "checksum error" in error_msg.lower() or "CRC failed" in error_msg:
                     log_message(f"⚠️ Data corruption detected, attempt {attempt + 1}")
                 elif result.returncode == 10:
@@ -220,19 +232,21 @@
                     return False
                 elif result.returncode == 1:
                     log_message(f"⚠️ Non-fatal error (exit code 1)")
-
+
         except Exception as e:
             log_message(f"❌ Extraction exception: {str(e)}")
             if attempt == max_retries - 1:
                 return False
             time.sleep(1)
-
+
     return False
-
+
+
 # --- Frame Extraction Utilities ---
 def ensure_dir(path):
     os.makedirs(path, exist_ok=True)
 
+
 def extract_frames(video_path, output_dir, fps=DEFAULT_FPS):
     """Extract frames from video at the specified frames per second (fps)."""
     log_message(f"[INFO] Extracting frames from {video_path} to {output_dir} at {fps} fps...")
@@ -242,12 +256,12 @@ def extract_frames(video_path, output_dir, fps=DEFAULT_FPS):
         log_message(f"[ERROR] Failed to open video file: {video_path}")
         return 0
     video_fps = cap.get(cv2.CAP_PROP_FPS)
-    # log_message(f"[DEBUG] Video FPS: {video_fps}")
     if not video_fps or video_fps <= 0:
         video_fps = 30 # fallback if FPS is not available
         log_message(f"[WARN] Using fallback FPS: {video_fps}")
     frame_interval = int(round(video_fps / fps))
-    # log_message(f"[DEBUG] Frame interval: {frame_interval}")
+    if frame_interval <= 0:
+        frame_interval = 1
    frame_idx = 0
    saved_idx = 1
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
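
The sampling arithmetic: a 30 fps source at the default `fps=3` gives `frame_interval = round(30 / 3) = 10`, so every tenth frame is written. The new guard matters for sources that report an FPS at or below the requested rate, where the interval would round to 0 and the later `frame_idx % frame_interval` would raise `ZeroDivisionError`. The same formula, checked in two asserts:

def frame_interval(video_fps: float, target_fps: float) -> int:
    return max(int(round(video_fps / target_fps)), 1)  # guard against a 0 step

assert frame_interval(30, 3) == 10  # keep every 10th frame
assert frame_interval(1, 3) == 1    # round(1/3) == 0 without the guard
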
@@ -255,19 +269,18 @@ def extract_frames(video_path, output_dir, fps=DEFAULT_FPS):
     while cap.isOpened():
         ret, frame = cap.read()
         if not ret:
-            # log_message(f"[DEBUG] No more frames to read at frame_idx {frame_idx}.")
             break
         if frame_idx % frame_interval == 0:
             frame_name = f"{saved_idx:04d}.png"
             cv2.imwrite(str(Path(output_dir) / frame_name), frame)
-            # log_message(f"[DEBUG] Saved frame {frame_idx} as {frame_name}")
             saved_idx += 1
         frame_idx += 1
     cap.release()
     log_message(f"Extracted {saved_idx-1} frames from {video_path} to {output_dir}")
     return saved_idx - 1
 
+
-# --- Cursor Tracking Utilities ---
+# --- Cursor Tracking Utilities (multiprocessing) ---
 def to_rgb(img):
     if img is None:
         return None
@@ -277,12 +290,14 @@ def to_rgb(img):
         return cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)
     return img
 
+
 def get_mask_from_alpha(template_img):
     if template_img is not None and len(template_img.shape) == 3 and template_img.shape[2] == 4:
         # Use alpha channel as mask (nonzero alpha = 255)
         return (template_img[:, :, 3] > 0).astype(np.uint8) * 255
     return None
 
+
 def detect_cursor_in_frame_multi(frame, cursor_templates, threshold=CURSOR_THRESHOLD):
     """Detect cursor position in a frame using multiple templates. Returns best match above threshold."""
     best_pos = None
@@ -293,12 +308,11 @@
         template_rgb = to_rgb(cursor_template)
         mask = get_mask_from_alpha(cursor_template)
         if template_rgb is None or frame_rgb is None or template_rgb.shape[2] != frame_rgb.shape[2]:
-            log_message(f"[WARN] Skipping template {template_name} due to channel mismatch or load error.")
+            # Channel mismatch or load error
             continue
         try:
             result = cv2.matchTemplate(frame_rgb, template_rgb, cv2.TM_CCOEFF_NORMED, mask=mask)
-        except Exception as e:
-            log_message(f"[WARN] matchTemplate failed for {template_name}: {e}")
+        except Exception:
             continue
         min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result)
         if max_val > best_conf:
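
Detection rests on OpenCV's masked template matching: with `TM_CCOEFF_NORMED`, pixels where the mask is zero (the transparent cursor background) are excluded from the score, and `cv2.minMaxLoc` yields the best-scoring location. A self-contained toy run with a synthetic frame and template; note that mask support for `TM_CCOEFF_NORMED` requires a reasonably recent OpenCV (to our knowledge, 4.4+; older builds only masked `TM_SQDIFF`/`TM_CCORR_NORMED`):

import cv2
import numpy as np

# A 20x20 "cursor" patch with some structure, stamped into a black frame
patch = np.zeros((20, 20, 3), np.uint8)
patch[5:15, 5:15] = 255
frame = np.zeros((200, 200, 3), np.uint8)
frame[80:100, 60:80] = patch

template = patch.copy()
mask = np.full((20, 20), 255, np.uint8)  # 255 = pixel counts toward the score

result = cv2.matchTemplate(frame, template, cv2.TM_CCOEFF_NORMED, mask=mask)
_, max_val, _, max_loc = cv2.minMaxLoc(result)
print(max_loc, round(max_val, 3))  # expect (60, 80) and a score near 1.0
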
@@ -313,91 +327,173 @@
         return best_pos, best_conf, best_template_name
     return None, best_conf, None
 
-def send_email_with_attachment(subject, body, to_email, from_email, app_password, attachment_path):
-    msg = EmailMessage()
-    msg["Subject"] = subject
-    msg["From"] = from_email
-    msg["To"] = to_email
-    msg.set_content(body)
-    with open(attachment_path, "rb") as f:
-        file_data = f.read()
-        file_name = Path(attachment_path).name
-    msg.add_attachment(file_data, maintype="application", subtype="octet-stream", filename=file_name)
+
+# Multiprocessing worker init and worker function
+# These globals are loaded in each worker process via initializer for efficiency
+_WORKER_CURSOR_TEMPLATES = None
+_WORKER_THRESHOLD = None
+
+
+def _init_worker(template_paths, threshold):
+    """Initializer for pool workers: load templates into process-local global variable"""
+    global _WORKER_CURSOR_TEMPLATES
+    global _WORKER_THRESHOLD
+    _WORKER_CURSOR_TEMPLATES = {}
+    for tp in template_paths:
+        try:
+            img = cv2.imread(tp, cv2.IMREAD_UNCHANGED)
+            if img is not None:
+                _WORKER_CURSOR_TEMPLATES[os.path.basename(tp)] = img
+        except Exception:
+            pass
+    _WORKER_THRESHOLD = threshold
+
+
+def track_cursor_worker(frame_file, cursor_templates, threshold, log_queue):
+    """Worker function that tracks cursor and sends logs back."""
+    frame = cv2.imread(str(frame_file), cv2.IMREAD_UNCHANGED)
+    if frame is None:
+        log_queue.put(f"[WARN] Frame unreadable: {frame_file.name}")
+        return {
+            "frame": frame_file.name,
+            "cursor_active": False,
+            "x": None,
+            "y": None,
+            "confidence": -1,
+            "template": None
+        }
+
+    pos, conf, template_name = detect_cursor_in_frame_multi(frame, cursor_templates, threshold)
+
+    if pos is not None:
+        log_queue.put(
+            f"[FRAME] {frame_file.name} → FOUND cursor at ({pos[0]},{pos[1]}) conf={conf:.3f} template={template_name}"
+        )
+        return {
+            "frame": frame_file.name,
+            "cursor_active": True,
+            "x": pos[0],
+            "y": pos[1],
+            "confidence": conf,
+            "template": template_name
+        }
+    else:
+        log_queue.put(
+            f"[FRAME] {frame_file.name} → NO cursor (max_conf={conf:.3f})"
+        )
+        return {
+            "frame": frame_file.name,
+            "cursor_active": False,
+            "x": None,
+            "y": None,
+            "confidence": conf,
+            "template": None
+        }
+
+
+
+def upload_to_hf_dataset(local_path, dataset_repo_id="Fred808/data", hf_token=None):
+    """Upload JSON tracking results to Hugging Face dataset repo"""
     try:
-        with smtplib.SMTP_SSL("smtp.gmail.com", 465) as smtp:
-            smtp.login(from_email, app_password)
-            smtp.send_message(msg)
-            log_message(f"[SUCCESS] Email sent to {to_email}")
+        api = HfApi(token=hf_token or HF_TOKEN)
+        filename = os.path.basename(local_path)
+        repo_path = f"results/{filename}"
+        api.upload_file(
+            path_or_fileobj=local_path,
+            path_in_repo=repo_path,
+            repo_id=dataset_repo_id,
+            repo_type="dataset"
+        )
+        log_message(f"[UPLOAD] ✅ Uploaded {filename} to {dataset_repo_id}/{repo_path}")
     except Exception as e:
-        log_message(f"[ERROR] Failed to send email: {e}")
+        log_message(f"[UPLOAD ERROR] {e}")
+
+def log_listener(log_queue):
+    """Continuously print log messages from worker processes."""
+    while True:
+        msg = log_queue.get()
+        if msg == "STOP":
+            break
+        log_message(msg)
+
+def track_cursor_parallel(frames_dir, cursor_templates_dir, output_json_path,
+                          threshold=CURSOR_THRESHOLD, start_frame=1,
+                          batch_size=100, email_results=False):
+    """Parallelized cursor tracking with real-time logging"""
+    log_message(f"[INFO] Tracking cursors in {frames_dir} with real-time logging...")
 
-def track_cursor(frames_dir, cursor_templates_dir, output_json_path, threshold=CURSOR_THRESHOLD, start_frame=1, email_results=False):
-    """Detect cursor in each frame using multiple templates, print status, and write positions to a JSON file."""
-    log_message(f"[INFO] Tracking cursors in {frames_dir}...")
     frames_dir = Path(frames_dir).resolve()
     output_json_path = Path(output_json_path).resolve()
     cursor_templates_dir = Path(cursor_templates_dir).resolve()
-    # log_message(f"[DEBUG] frames_dir: {frames_dir}")
-    # log_message(f"[DEBUG] cursor_templates_dir: {cursor_templates_dir}")
-    # log_message(f"[DEBUG] output_json_path: {output_json_path}")
-    ensure_dir(frames_dir)
     ensure_dir(output_json_path.parent)
-    # Load all PNG templates from the cursor_templates_dir
+
+    # Load cursor templates
     cursor_templates = {}
     for template_file in cursor_templates_dir.glob("*.png"):
-        template_img = cv2.imread(str(template_file), cv2.IMREAD_UNCHANGED)
-        if template_img is not None:
-            cursor_templates[template_file.name] = template_img
-        else:
-            log_message(f"[WARN] Could not load template: {template_file}")
+        img = cv2.imread(str(template_file), cv2.IMREAD_UNCHANGED)
+        if img is not None:
+            cursor_templates[template_file.name] = img
     if not cursor_templates:
-        log_message(f"[ERROR] No cursor templates found in: {cursor_templates_dir}")
+        log_message(f"[ERROR] No cursor templates found in {cursor_templates_dir}")
         return 0
+
+    # List frames
+    all_frames = sorted(frames_dir.glob("*.png"))
+    all_frames = [f for f in all_frames if int(f.stem) >= start_frame]
+    total_frames = len(all_frames)
+    if not total_frames:
+        log_message("[WARN] No frames found to process.")
+        return 0
+
+    log_message(f"[INFO] Total frames to track: {total_frames}")
+
+    # Multiprocessing setup
+    manager = multiprocessing.Manager()
+    log_queue = manager.Queue()
+    listener = multiprocessing.Process(target=log_listener, args=(log_queue,))
+    listener.start()
+
+    pool = multiprocessing.Pool(multiprocessing.cpu_count())
     results = []
-    tracked_count = 0
-    for frame_file in sorted(frames_dir.glob("*.png")):
-        frame_num = int(frame_file.stem)
-        if frame_num < start_frame:
-            continue
-        frame = cv2.imread(str(frame_file), cv2.IMREAD_UNCHANGED)
-        if frame is None:
-            log_message(f"[WARN] Could not load frame: {frame_file}")
-            continue
-        pos, conf, template_name = detect_cursor_in_frame_multi(frame, cursor_templates, threshold)
-        if pos is not None:
-            # log_message(f"{frame_file.name}: Cursor at {pos} (template: {template_name})")
-            results.append({
-                "frame": frame_file.name,
-                "cursor_active": True,
-                "x": pos[0],
-                "y": pos[1],
-                "confidence": conf,
-                "template": template_name
-            })
-            tracked_count += 1
-        else:
-            # log_message(f"{frame_file.name}: Cursor disabled")
-            results.append({
-                "frame": frame_file.name,
-                "cursor_active": False,
-                "x": None,
-                "y": None,
-                "confidence": conf,
-                "template": None
-            })
+    processed = 0
+
     try:
+        # Feed tasks to pool in batches
+        for i in range(0, total_frames, batch_size):
+            batch = all_frames[i:i + batch_size]
+            tasks = [
+                pool.apply_async(
+                    track_cursor_worker,
+                    (frame_file, cursor_templates, threshold, log_queue)
+                )
+                for frame_file in batch
+            ]
+
+            for t in tasks:
+                res = t.get()
+                results.append(res)
+                processed += 1
+                if processed % 50 == 0 or processed == total_frames:
+                    log_message(f"[PROGRESS] {processed}/{total_frames} frames processed")
+                    with open(output_json_path, "w") as f:
+                        json.dump(results, f, indent=2)
+
+        pool.close()
+        pool.join()
+
+        # Final write
         with open(output_json_path, "w") as f:
             json.dump(results, f, indent=2)
         log_message(f"[SUCCESS] Cursor tracking results saved to {output_json_path}")
+
+        upload_to_hf_dataset(output_json_path, dataset_repo_id="Fred808/data", hf_token=HF_TOKEN)
+
         if email_results:
-            log_message("[INFO] Preparing to email results...")
+            log_message("[INFO] Sending email results (if configured)...")
             to_email = os.environ.get("TO_EMAIL")
             from_email = os.environ.get("FROM_EMAIL")
             app_password = os.environ.get("GMAIL_APP_PASSWORD")
-            if not (to_email and from_email and app_password):
-                log_message("[ERROR] Email environment variables not set. Please set TO_EMAIL, FROM_EMAIL, and GMAIL_APP_PASSWORD.")
-                # return tracked_count # Don't return here, just log error
-            else:
+            if to_email and from_email and app_password:
                 send_email_with_attachment(
                     subject="Cursor Tracking Results",
                     body="See attached JSON results.",
@@ -406,36 +502,71 @@
                     app_password=app_password,
                     attachment_path=output_json_path
                 )
+
+    except Exception as e:
+        log_message(f"[ERROR] Exception during parallel tracking: {e}")
+        pool.terminate()
+
+    finally:
+        log_queue.put("STOP")
+        listener.join()
+
+    active = len([r for r in results if r["cursor_active"]])
+    log_message(f"[DONE] {active}/{total_frames} frames contained cursors.")
+    return active
+
+def send_email_with_attachment(subject, body, to_email, from_email, app_password, attachment_path):
+    msg = EmailMessage()
+    msg["Subject"] = subject
+    msg["From"] = from_email
+    msg["To"] = to_email
+    msg.set_content(body)
+    with open(attachment_path, "rb") as f:
+        file_data = f.read()
+        file_name = Path(attachment_path).name
+    msg.add_attachment(file_data, maintype="application", subtype="octet-stream", filename=file_name)
+    try:
+        with smtplib.SMTP_SSL("smtp.gmail.com", 465) as smtp:
+            smtp.login(from_email, app_password)
+            smtp.send_message(msg)
+            log_message(f"[SUCCESS] Email sent to {to_email}")
     except Exception as e:
-        log_message(f"[ERROR] Failed to write output JSON: {e}")
-        # raise # Don't raise, just log error
-    return tracked_count
+        log_message(f"[ERROR] Failed to send email: {e}")
+
+
+def track_cursor(frames_dir, cursor_templates_dir, output_json_path, threshold=CURSOR_THRESHOLD, start_frame=1, batch_size=100, email_results=False):
+    """
+    Backwards-compatible wrapper that calls the parallel implementation.
+    Keep this name so other parts of your code that call track_cursor() keep working.
+    """
+    return track_cursor_parallel(frames_dir, cursor_templates_dir, output_json_path, threshold, start_frame, batch_size, email_results)
+
 
 def process_rar_file(rar_path: str) -> bool:
     """Process a single RAR file - extract, then process videos for frames and cursor tracking"""
     filename = os.path.basename(rar_path)
     processing_status["current_file"] = filename
-
+
     # Handle multi-part RAR naming
     if is_multipart_rar(filename):
         course_name = get_rar_part_base(filename)
     else:
         course_name = filename.replace(".rar", "")
-
+
     extract_dir = os.path.join(EXTRACT_FOLDER, course_name)
-
+
     try:
         log_message(f"🔄 Processing: {filename}")
-
+
         # Clean up any existing directory
         if os.path.exists(extract_dir):
             shutil.rmtree(extract_dir, ignore_errors=True)
-
+
         # Extract RAR
         os.makedirs(extract_dir, exist_ok=True)
         if not extract_with_retry(rar_path, extract_dir):
             raise Exception("RAR extraction failed")
-
+
         # Count extracted files
         file_count = 0
         video_files_found = []
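
The `upload_to_hf_dataset` helper added above uses `huggingface_hub`'s single-file upload, which pushes straight to a repo without a local clone; the token needs write access to the target dataset. The call pattern, stripped to essentials (repo id, token, and file names are placeholders):

from huggingface_hub import HfApi

api = HfApi(token="hf_...")  # write-scoped token
api.upload_file(
    path_or_fileobj="cursor_data.json",       # local file
    path_in_repo="results/cursor_data.json",  # destination path inside the repo
    repo_id="user/some-dataset",              # target dataset repo
    repo_type="dataset",
)
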
@@ -444,17 +575,18 @@ def process_rar_file(rar_path: str) -> bool:
                 file_count += 1
                 if file.lower().endswith((".mp4", ".avi", ".mov", ".mkv")):
                     video_files_found.append(os.path.join(root, file))
-
+
         processing_status["extracted_courses"] += 1
-        log_message(f"✅ Successfully extracted \'{course_name}\' ({file_count} files, {len(video_files_found)} videos)")
+        log_message(f"✅ Successfully extracted '{course_name}' ({file_count} files, {len(video_files_found)} videos)")
 
         # Process video files for frame extraction and cursor tracking
         for video_path in video_files_found:
             video_filename = Path(video_path).name
             # Create a unique output directory for frames for each video
-            frames_output_dir = os.path.join(FRAMES_OUTPUT_FOLDER, f"{course_name}_{video_filename.replace('.', '_')}_frames")
+            safe_video_name = video_filename.replace(".", "_")
+            frames_output_dir = os.path.join(FRAMES_OUTPUT_FOLDER, f"{course_name}_{safe_video_name}_frames")
             ensure_dir(frames_output_dir)
-
+
             extracted_frames_count = extract_frames(video_path, frames_output_dir, fps=DEFAULT_FPS)
             processing_status["extracted_frames_count"] += extracted_frames_count
             if extracted_frames_count > 0:
@@ -462,60 +594,61 @@ def process_rar_file(rar_path: str) -> bool:
                 log_message(f"[INFO] Extracted {extracted_frames_count} frames from {video_filename}")
 
                 # Perform cursor tracking on the extracted frames
-                cursor_output_json = os.path.join(CURSOR_TRACKING_OUTPUT_FOLDER, f"{course_name}_{video_filename.replace('.', '_')}_cursor_data.json")
-                tracked_cursors = track_cursor(frames_output_dir, CURSOR_TEMPLATES_DIR, cursor_output_json, threshold=CURSOR_THRESHOLD)
+                cursor_output_json = os.path.join(CURSOR_TRACKING_OUTPUT_FOLDER, f"{course_name}_{safe_video_name}_cursor_data.json")
+                tracked_cursors = track_cursor(frames_output_dir, CURSOR_TEMPLATES_DIR, cursor_output_json, threshold=CURSOR_THRESHOLD, batch_size=100)
                 processing_status["tracked_cursors_count"] += tracked_cursors
                 log_message(f"[INFO] Tracked {tracked_cursors} cursors in frames from {video_filename}")
             else:
                 log_message(f"[WARN] No frames extracted from {video_filename}")
 
         return True
-
+
     except Exception as e:
         error_msg = str(e)
         log_message(f"❌ Processing failed: {error_msg}")
         log_failed_file(filename, error_msg)
         return False
-
+
     finally:
         processing_status["current_file"] = None
 
+
 def main_processing_loop(start_index: int = 0):
     """Main processing workflow - extraction, frame extraction, and cursor tracking"""
     processing_status["is_running"] = True
-
+
     try:
         # Load state
         processed_rars = load_json_state(PROCESS_STATE_FILE, {"processed_rars": []})["processed_rars"]
-        download_state = load_json_state(DOWNLOAD_STATE_FILE, {"next_download_index": 4})
-
+        download_state = load_json_state(DOWNLOAD_STATE_FILE, {"next_download_index": 5})
+
         # Use start_index if provided, otherwise use the saved state
         next_index = start_index if start_index > 0 else download_state["next_download_index"]
-
+
         log_message(f"📊 Starting from index {next_index}")
         log_message(f"📊 Previously processed: {len(processed_rars)} files")
-
+
         # Get file list
         try:
             files = list(hf_api.list_repo_files(repo_id=SOURCE_REPO_ID, repo_type="dataset"))
             rar_files = sorted([f for f in files if f.endswith(".rar")])
-
+
             processing_status["total_files"] = len(rar_files)
             log_message(f"📁 Found {len(rar_files)} RAR files in repository")
-
+
             if next_index >= len(rar_files):
                 log_message("✅ All files have been processed!")
                 return
-
+
         except Exception as e:
             log_message(f"❌ Failed to get file list: {str(e)}")
             return
-
-        # Process only one file per run
+
+        # Process one file per run
         if next_index < len(rar_files):
             rar_file = rar_files[next_index]
             filename = os.path.basename(rar_file)
-
+
             if filename in processed_rars:
                 log_message(f"⏭️ Skipping already processed: {filename}")
                 processing_status["processed_files"] += 1
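
The one-file-per-run design hangs off a tiny JSON cursor: read `next_download_index`, handle that file, write back index + 1, and exit; re-running the script resumes where the last run stopped. The pattern in isolation (file and key names are placeholders):

import json
import os

STATE = "progress.json"

def load_index(default: int = 0) -> int:
    if os.path.exists(STATE):
        try:
            with open(STATE) as f:
                return json.load(f)["next_index"]
        except (json.JSONDecodeError, KeyError):
            pass  # corrupted state file: start from the default
    return default

def save_index(i: int) -> None:
    with open(STATE, "w") as f:
        json.dump({"next_index": i}, f, indent=2)

i = load_index()
# ... process item i here ...
save_index(i + 1)  # the next run picks up from here
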
@@ -524,10 +657,10 @@ def main_processing_loop(start_index: int = 0):
                 save_json_state(DOWNLOAD_STATE_FILE, {"next_download_index": next_index})
                 log_message(f"📊 Moving to next file. Progress: {next_index}/{len(rar_files)}")
                 return
-
+
             log_message(f"📥 Downloading: {filename}")
             dest_path = os.path.join(DOWNLOAD_FOLDER, filename)
-
+
             # Download file
             download_url = f"https://huggingface.co/datasets/{SOURCE_REPO_ID}/resolve/main/{rar_file}"
             if download_with_retry(download_url, dest_path):
@@ -540,21 +673,21 @@ def main_processing_loop(start_index: int = 0):
                 else:
                     log_message(f"❌ Failed to process: {filename}")
                     processing_status["failed_files"] += 1
-
+
                 # Clean up downloaded file
                 try:
                     os.remove(dest_path)
                     log_message(f"🗑️ Cleaned up download: {filename}")
-                except:
+                except Exception:
                     pass
             else:
                 log_message(f"❌ Failed to download: {filename}")
                 processing_status["failed_files"] += 1
-
+
             # Update download state for next run
             next_index += 1
             save_json_state(DOWNLOAD_STATE_FILE, {"next_download_index": next_index})
-
+
             # Status update
             log_message(f"📊 Progress: {next_index}/{len(rar_files)} files processed")
             log_message(f'📊 Extracted: {processing_status["extracted_courses"]} courses')
@@ -562,17 +695,17 @@ def main_processing_loop(start_index: int = 0):
             log_message(f'📊 Frames Extracted: {processing_status["extracted_frames_count"]}')
             log_message(f'📊 Cursors Tracked: {processing_status["tracked_cursors_count"]}')
             log_message(f'📊 Failed: {processing_status["failed_files"]} files')
-
+
             if next_index < len(rar_files):
                 log_message(f"🔄 Run the script again to process the next file: {os.path.basename(rar_files[next_index])}")
             else:
                 log_message("🎉 All files have been processed!")
         else:
             log_message("✅ All files have been processed!")
-
+
         log_message("🎉 Processing complete!")
         log_message(f'📊 Final stats: {processing_status["extracted_courses"]} courses extracted, {processing_status["extracted_videos"]} videos processed, {processing_status["extracted_frames_count"]} frames extracted, {processing_status["tracked_cursors_count"]} cursors tracked')
-
+
     except KeyboardInterrupt:
         log_message("⏹️ Processing interrupted by user")
     except Exception as e:
@@ -581,6 +714,7 @@ def main_processing_loop(start_index: int = 0):
         processing_status["is_running"] = False
         cleanup_temp_files()
 
+
 # Expose necessary functions and variables for download_api.py
 __all__ = [
     "main_processing_loop",
@@ -595,7 +729,3 @@ __all__ = [
     "CURSOR_THRESHOLD",
     "ensure_dir"
 ]
-
-
-
-