Fred808 commited on
Commit
a93e011
Β·
verified Β·
1 Parent(s): e829dee

Upload 6 files

Browse files
Files changed (6) hide show
  1. .gitattributes +35 -35
  2. Dockerfile +45 -0
  3. download_api.py +407 -0
  4. index.html +201 -0
  5. requirements.txt +11 -0
  6. vision_analyzer.py +564 -0
.gitattributes CHANGED
@@ -1,35 +1,35 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim-bullseye
2
+
3
+ # Install system dependencies
4
+ RUN sed -i 's/main/main contrib non-free/' /etc/apt/sources.list && \
5
+ apt-get update && \
6
+ apt-get install -y --no-install-recommends \
7
+ unrar \
8
+ libgl1 \
9
+ libglib2.0-0 \
10
+ && rm -rf /var/lib/apt/lists/*
11
+
12
+ WORKDIR /app
13
+
14
+ # Upgrade pip and install core dependencies first
15
+ RUN pip install --no-cache-dir --upgrade pip setuptools wheel packaging
16
+
17
+ # Install CPU-only PyTorch first
18
+
19
+ # Copy requirements and install with special handling for flash_attn
20
+ COPY requirements.txt .
21
+ RUN pip install --no-cache-dir \
22
+ -r requirements.txt \
23
+ --find-links https://download.pytorch.org/whl/cpu \
24
+ --extra-index-url https://pypi.org/simple && \
25
+ # Install remaining packages that might have been skipped
26
+ pip install --no-cache-dir \
27
+ accelerate \
28
+ transformers==4.36.2 \
29
+ timm==0.9.12 \
30
+ einops==0.7.0
31
+
32
+ # Copy application code
33
+ COPY . .
34
+
35
+ # Create non-root user
36
+ RUN useradd -m -u 1000 user && \
37
+ chown -R user:user /app
38
+
39
+ USER user
40
+
41
+ # Environment variables to suppress warnings
42
+ ENV HF_HUB_DISABLE_PROGRESS=1
43
+ ENV TF_CPP_MIN_LOG_LEVEL=3
44
+
45
+ CMD ["uvicorn", "download_api:app", "--host", "0.0.0.0", "--port", "7860"]
download_api.py ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import time
4
+ import threading
5
+ import asyncio
6
+ from fastapi import FastAPI, HTTPException, BackgroundTasks
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from fastapi.responses import JSONResponse, FileResponse
9
+ from fastapi.staticfiles import StaticFiles
10
+ import uvicorn
11
+ from typing import Dict
12
+ from pathlib import Path
13
+ import subprocess
14
+ from datetime import datetime
15
+
16
+ import torch
17
+
18
+ # Import core functionality
19
+ from vision_analyzer import (
20
+ main_processing_loop,
21
+ processing_status,
22
+ log_message,
23
+ FRAMES_OUTPUT_FOLDER
24
+ )
25
+
26
# FastAPI App Definition
app = FastAPI(title="Video Analysis API",
              description="API to access video frame analysis results and extracted images",
              version="1.0.0")

# Add CORS middleware to allow cross-origin requests.
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# the most permissive combination β€” confirm this API is not exposed publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allows all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods
    allow_headers=["*"],
)

# Global variables for processing and frame tracking.
# These dicts are shared by all request handlers and persisted to
# TRACKING_FILE by save_tracking_state()/load_tracking_state().
processing_thread = None  # background thread running main_processing_loop
frame_locks = {}  # {course: {frame: {"locked_by": id, "locked_at": timestamp}}}
processed_frames = {}  # {course: {frame: {"processed_by": id, "processed_at": timestamp}}}
LOCK_TIMEOUT = 300  # seconds (5 minutes) before a frame lock is considered stale
TRACKING_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "frame_tracking.json")
46
+
47
def save_tracking_state():
    """Persist the lock and processed-frame maps to TRACKING_FILE as JSON.

    Failures are logged and swallowed so a disk hiccup never breaks the
    request that triggered the save.
    """
    snapshot = {
        "frame_locks": frame_locks,
        "processed_frames": processed_frames,
    }
    try:
        with open(TRACKING_FILE, "w") as fh:
            json.dump(snapshot, fh, indent=2)
    except Exception as err:
        log_message(f"Error saving tracking state: {err}")
58
+
59
def load_tracking_state():
    """Re-read TRACKING_FILE into the module-level tracking dicts.

    A missing file is normal on first run; any other failure is logged and
    the in-memory state is left untouched.
    """
    global frame_locks, processed_frames
    try:
        with open(TRACKING_FILE, "r") as fh:
            saved = json.load(fh)
        frame_locks = saved.get("frame_locks", {})
        processed_frames = saved.get("processed_frames", {})
    except FileNotFoundError:
        log_message("No previous tracking state found")
    except Exception as err:
        log_message(f"Error loading tracking state: {err}")
71
+
72
def check_frame_lock(course: str, frame: str) -> bool:
    """Return True when *frame* in *course* holds an unexpired lock.

    An expired lock is removed as a side effect (and the tracking state is
    re-saved), so callers always see a live view of the lock table.
    """
    course_locks = frame_locks.get(course)
    if not course_locks or frame not in course_locks:
        return False
    age = time.time() - course_locks[frame]["locked_at"]
    if age < LOCK_TIMEOUT:
        return True
    # Stale lock: drop it and persist the cleanup.
    del course_locks[frame]
    save_tracking_state()
    return False
82
+
83
def lock_frame(course: str, frame: str, requester_id: str) -> bool:
    """Try to acquire the processing lock on one frame for *requester_id*.

    Returns False when another requester holds a live lock.
    NOTE(review): this check-then-set sequence is not guarded by a
    threading lock, so two concurrent requests could in principle both
    succeed β€” confirm the single-worker deployment assumption.
    """
    if check_frame_lock(course, frame):
        return False

    frame_locks.setdefault(course, {})[frame] = {
        "locked_by": requester_id,
        "locked_at": time.time(),
    }
    save_tracking_state()
    return True
97
+
98
def mark_frame_processed(course: str, frame: str, requester_id: str):
    """Record a frame as successfully processed and drop any lock it held."""
    processed_frames.setdefault(course, {})[frame] = {
        "processed_by": requester_id,
        "processed_at": time.time(),
    }

    # The lock (if any) is now obsolete.
    course_locks = frame_locks.get(course)
    if course_locks and frame in course_locks:
        del course_locks[frame]

    save_tracking_state()
113
+
114
def log_message(message):
    """Append a timestamped entry to the shared log ring and echo it to stdout.

    NOTE(review): this redefines the log_message imported from
    vision_analyzer at the top of this module, shadowing it for the rest of
    this file. vision_analyzer's own functions keep using their version,
    which also updates processing_status["last_update"] β€” confirm the
    divergence is intentional.
    """
    timestamp = datetime.now().strftime("%H:%M:%S")
    log_entry = f"[{timestamp}] {message}"
    processing_status["logs"].append(log_entry)

    # Keep only the last 100 logs
    if len(processing_status["logs"]) > 100:
        processing_status["logs"] = processing_status["logs"][-100:]

    print(log_entry)
125
+
126
+ @app.on_event("startup")
127
+ async def startup_event():
128
+ """Initialize frame tracking and start processing loop"""
129
+ # Load frame tracking state
130
+ load_tracking_state()
131
+ log_message("βœ“ Loaded frame tracking state")
132
+
133
+ # Start processing thread
134
+ global processing_thread
135
+ if not (processing_thread and processing_thread.is_alive()):
136
+ log_message("πŸš€ Starting RAR extraction, frame extraction, and vision analysis pipeline in background...")
137
+ processing_thread = threading.Thread(target=main_processing_loop)
138
+ processing_thread.daemon = True
139
+ processing_thread.start()
140
+
141
+ @app.get("/")
142
+ async def root():
143
+ """Root endpoint that returns basic info"""
144
+ return {
145
+ "message": "Video Analysis API",
146
+ "status": "running",
147
+ "endpoints": {
148
+ "/status": "Get processing status",
149
+ "/courses": "List all available course folders",
150
+ "/images/{course_folder}": "List images in a course folder",
151
+ "/images/{course_folder}/{frame_filename}": "Get specific frame image",
152
+ "/start-processing": "Start processing pipeline",
153
+ "/stop-processing": "Stop processing pipeline"
154
+ }
155
+ }
156
+
157
+ @app.get("/status")
158
+ async def get_status():
159
+ """Get current processing status"""
160
+ return {
161
+ "processing_status": processing_status,
162
+ "frames_folder": FRAMES_OUTPUT_FOLDER,
163
+ "frames_folder_exists": os.path.exists(FRAMES_OUTPUT_FOLDER)
164
+ }
165
+
166
+ # ===== NEW IMAGE SERVING ENDPOINTS =====
167
+
168
+ @app.get("/middleware/next/course")
169
+ async def get_next_course(requester_id: str):
170
+ """Get next available course for processing"""
171
+ if not os.path.exists(FRAMES_OUTPUT_FOLDER):
172
+ raise HTTPException(status_code=404, detail="No courses available")
173
+
174
+ # Load latest state
175
+ load_tracking_state()
176
+
177
+ # Find a course with unprocessed frames
178
+ for folder in os.listdir(FRAMES_OUTPUT_FOLDER):
179
+ folder_path = os.path.join(FRAMES_OUTPUT_FOLDER, folder)
180
+ if not os.path.isdir(folder_path):
181
+ continue
182
+
183
+ # Check if course has any unprocessed frames
184
+ image_files = [f for f in os.listdir(folder_path)
185
+ if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
186
+
187
+ for image in image_files:
188
+ if (folder not in processed_frames or
189
+ image not in processed_frames[folder]):
190
+ return {"course": folder}
191
+
192
+ raise HTTPException(status_code=404, detail="No courses with unprocessed frames")
193
+
194
+ @app.get("/middleware/next/image/{course_folder}")
195
+ async def get_next_image(course_folder: str, requester_id: str):
196
+ """Get next available image from a course"""
197
+ folder_path = os.path.join(FRAMES_OUTPUT_FOLDER, course_folder)
198
+
199
+ if not os.path.exists(folder_path):
200
+ raise HTTPException(status_code=404, detail=f"Course not found: {course_folder}")
201
+
202
+ # Load latest state
203
+ load_tracking_state()
204
+
205
+ # Find first unprocessed and unlocked frame
206
+ for file in sorted(os.listdir(folder_path)):
207
+ if not file.lower().endswith(('.png', '.jpg', '.jpeg')):
208
+ continue
209
+
210
+ # Skip if processed
211
+ if (course_folder in processed_frames and
212
+ file in processed_frames[course_folder]):
213
+ continue
214
+
215
+ # Skip if locked by another requester
216
+ if check_frame_lock(course_folder, file):
217
+ continue
218
+
219
+ # Try to lock the frame
220
+ if lock_frame(course_folder, file, requester_id):
221
+ file_path = os.path.join(folder_path, file)
222
+ file_stats = os.stat(file_path)
223
+ return {
224
+ "file_id": f"frame:{course_folder}/{file}",
225
+ "frame": file,
226
+ "video": os.path.splitext(file)[0],
227
+ "size_bytes": file_stats.st_size,
228
+ "modified_time": time.ctime(file_stats.st_mtime),
229
+ "url": f"/images/{course_folder}/{file}"
230
+ }
231
+
232
+ raise HTTPException(status_code=404, detail="No available frames in course")
233
+
234
+ @app.post("/middleware/release/frame/{course_folder}/{video}/{frame}")
235
+ async def release_frame(course_folder: str, video: str, frame: str, requester_id: str):
236
+ """Release a frame lock"""
237
+ if course_folder in frame_locks and frame in frame_locks[course_folder]:
238
+ lock = frame_locks[course_folder][frame]
239
+ if lock["locked_by"] == requester_id:
240
+ del frame_locks[course_folder][frame]
241
+ save_tracking_state()
242
+ return {"status": "released"}
243
+ return {"status": "not_found"}
244
+
245
+ @app.post("/middleware/release/course/{course_folder}")
246
+ async def release_course(course_folder: str, requester_id: str):
247
+ """Release all frame locks for a course"""
248
+ if course_folder in frame_locks:
249
+ # Only release frames locked by this requester
250
+ frames_to_release = [
251
+ frame for frame, lock in frame_locks[course_folder].items()
252
+ if lock["locked_by"] == requester_id
253
+ ]
254
+ for frame in frames_to_release:
255
+ del frame_locks[course_folder][frame]
256
+ save_tracking_state()
257
+ return {"status": "released"}
258
+
259
+ @app.get("/images/{course_folder}/{frame_filename}")
260
+ async def get_frame_image(course_folder: str, frame_filename: str, requester_id: str = None):
261
+ """
262
+ Serve extracted frame images from course folders with locking
263
+
264
+ Args:
265
+ course_folder: The course folder name (e.g., "course1_video1_mp4_frames")
266
+ frame_filename: The frame file name (e.g., "0001.png")
267
+ requester_id: Optional requester ID for frame locking
268
+ """
269
+ # Load latest state
270
+ load_tracking_state()
271
+
272
+ # Construct the full path to the image
273
+ image_path = os.path.join(FRAMES_OUTPUT_FOLDER, course_folder, frame_filename)
274
+
275
+ # Check if file exists
276
+ if not os.path.exists(image_path):
277
+ raise HTTPException(status_code=404, detail=f"Image not found: {course_folder}/{frame_filename}")
278
+
279
+ # Verify it's an image file
280
+ if not frame_filename.lower().endswith(('.png', '.jpg', '.jpeg')):
281
+ raise HTTPException(status_code=400, detail="File must be an image (PNG, JPG, JPEG)")
282
+
283
+ # If requester_id provided, verify frame lock
284
+ if requester_id:
285
+ if check_frame_lock(course_folder, frame_filename):
286
+ lock = frame_locks[course_folder][frame_filename]
287
+ if lock["locked_by"] != requester_id:
288
+ raise HTTPException(status_code=423, detail="Frame is locked by another requester")
289
+
290
+ # Return the image file
291
+ return FileResponse(image_path)
292
+
293
+ @app.get("/images/{course_folder}")
294
+ async def list_course_images(course_folder: str):
295
+ """
296
+ List all available images in a specific course folder
297
+
298
+ Args:
299
+ course_folder: The course folder name
300
+ """
301
+ folder_path = os.path.join(FRAMES_OUTPUT_FOLDER, course_folder)
302
+
303
+ if not os.path.exists(folder_path):
304
+ raise HTTPException(status_code=404, detail=f"Course folder not found: {course_folder}")
305
+
306
+ # Get all image files
307
+ image_files = []
308
+ for file in os.listdir(folder_path):
309
+ if file.lower().endswith(('.png', '.jpg', '.jpeg')):
310
+ file_path = os.path.join(folder_path, file)
311
+ file_stats = os.stat(file_path)
312
+ image_files.append({
313
+ "filename": file,
314
+ "size_bytes": file_stats.st_size,
315
+ "modified_time": time.ctime(file_stats.st_mtime),
316
+ "url": f"/images/{course_folder}/{file}"
317
+ })
318
+
319
+ return {
320
+ "course_folder": course_folder,
321
+ "total_images": len(image_files),
322
+ "images": image_files
323
+ }
324
+
325
+ @app.get("/courses")
326
+ async def list_all_courses():
327
+ """
328
+ List all available course folders with their image counts
329
+ """
330
+ if not os.path.exists(FRAMES_OUTPUT_FOLDER):
331
+ return {"courses": [], "message": "Frames output folder does not exist yet"}
332
+
333
+ courses = []
334
+ for folder in os.listdir(FRAMES_OUTPUT_FOLDER):
335
+ folder_path = os.path.join(FRAMES_OUTPUT_FOLDER, folder)
336
+ if os.path.isdir(folder_path):
337
+ # Count image files
338
+ image_count = len([f for f in os.listdir(folder_path)
339
+ if f.lower().endswith(('.png', '.jpg', '.jpeg'))])
340
+ courses.append({
341
+ "course_folder": folder,
342
+ "image_count": image_count,
343
+ "images_url": f"/images/{folder}",
344
+ "sample_image_url": f"/images/{folder}/0001.png" if image_count > 0 else None
345
+ })
346
+
347
+ return {
348
+ "total_courses": len(courses),
349
+ "courses": courses
350
+ }
351
+
352
+
353
# Signal handlers to prevent accidental shutdown
def handle_shutdown(signum, frame):
    """Swallow SIGINT/SIGTERM so the server keeps running.

    Deliberately does nothing but print; the process can then only be
    stopped with Ctrl+Break or kill -9 (SIGKILL cannot be caught).
    """
    print(f"\n⚠️ Received signal {signum}. Server will continue running.")
    print("Use Ctrl+Break or kill -9 to force stop.")

# Install the no-op handlers at import time (must run on the main thread).
import signal
signal.signal(signal.SIGINT, handle_shutdown)
signal.signal(signal.SIGTERM, handle_shutdown)
363
+
364
# Server lifecycle events
@app.on_event("shutdown")
async def shutdown_event():
    """Save state on a shutdown attempt, then stall the shutdown forever.

    The infinite sleep loop below is deliberate: it keeps the shutdown
    coroutine from returning, which blocks the framework's shutdown
    sequence ("persistent mode"). Only a forced kill stops the process.
    """
    save_tracking_state()
    print("πŸ’Ύ Saved tracking state")
    print("⚠️ Server shutdown prevented - use Ctrl+Break or kill -9 to force stop")
    # Prevent shutdown by not returning
    while True:
        await asyncio.sleep(1)
374
+
375
+ if __name__ == "__main__":
376
+ # Start the FastAPI server
377
+ print("πŸš€ Starting Video Analysis FastAPI Server (Persistent Mode)...")
378
+ print("API Documentation will be available at: http://localhost:8000/docs")
379
+ print("API Root endpoint: http://localhost:8000/")
380
+ print("⚠️ Server will continue running even after processing completes")
381
+ print("Use Ctrl+Break or kill -9 to force stop")
382
+
383
+ # Ensure the analysis output folder exists
384
+ os.makedirs(FRAMES_OUTPUT_FOLDER, exist_ok=True)
385
+
386
+ # Start processing in thread instead of blocking
387
+ processing_thread = threading.Thread(target=main_processing_loop)
388
+ processing_thread.daemon = False # Make non-daemon so it doesn't exit
389
+ processing_thread.start()
390
+
391
+ # Configure uvicorn for persistent running
392
+ config = uvicorn.Config(
393
+ app=app,
394
+ host="0.0.0.0",
395
+ port=8000,
396
+ log_level="info",
397
+ reload=False,
398
+ workers=1,
399
+ loop="asyncio",
400
+ timeout_keep_alive=600, # Keep connections alive longer
401
+ access_log=True
402
+ )
403
+
404
+ # Run server with persistent config
405
+ server = uvicorn.Server(config)
406
+ server.run()
407
+
index.html ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <meta name="description" content="Advanced web-based dashboard for monitoring and managing video analysis operations. Real-time processing status, file management, and control interface.">
7
+ <meta name="keywords" content="video analysis, vision processing, dashboard, monitoring, API interface">
8
+ <meta name="author" content="Video Analysis Dashboard">
9
+ <title>Video Analysis Dashboard - Real-time Processing Monitor</title>
10
+ <link rel="stylesheet" href="/static/style.css">
11
+ <link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css" rel="stylesheet">
12
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
13
+ </head>
14
+ <body>
15
+ <div class="container">
16
+ <!-- Header -->
17
+ <header class="header">
18
+ <div class="header-content">
19
+ <div class="logo">
20
+ <i class="fas fa-eye"></i>
21
+ <h1>Video Analysis Dashboard</h1>
22
+ </div>
23
+ <div class="header-actions">
24
+ <button id="refreshBtn" class="btn btn-secondary">
25
+ <i class="fas fa-sync-alt"></i>
26
+ Refresh
27
+ </button>
28
+ <div class="theme-toggle">
29
+ <button id="themeToggle" class="btn btn-icon">
30
+ <i class="fas fa-moon"></i>
31
+ </button>
32
+ </div>
33
+ </div>
34
+ </div>
35
+ </header>
36
+
37
+ <!-- Main Content -->
38
+ <main class="main-content">
39
+ <!-- Status Section -->
40
+ <section class="status-section">
41
+ <div class="card">
42
+ <div class="card-header">
43
+ <h2><i class="fas fa-chart-line"></i> Processing Status</h2>
44
+ <div class="status-indicator" id="statusIndicator">
45
+ <span class="status-dot"></span>
46
+ <span class="status-text">Loading...</span>
47
+ </div>
48
+ </div>
49
+ <div class="card-content">
50
+ <div class="stats-grid">
51
+ <div class="stat-item">
52
+ <div class="stat-value" id="totalFiles">-</div>
53
+ <div class="stat-label">Total Files</div>
54
+ </div>
55
+ <div class="stat-item">
56
+ <div class="stat-value" id="processedFiles">-</div>
57
+ <div class="stat-label">Processed</div>
58
+ </div>
59
+ <div class="stat-item">
60
+ <div class="stat-value" id="extractedCourses">-</div>
61
+ <div class="stat-label">Courses</div>
62
+ </div>
63
+ <div class="stat-item">
64
+ <div class="stat-value" id="extractedVideos">-</div>
65
+ <div class="stat-label">Videos</div>
66
+ </div>
67
+ <div class="stat-item">
68
+ <div class="stat-value" id="extractedFrames">-</div>
69
+ <div class="stat-label">Frames</div>
70
+ </div>
71
+ <div class="stat-item">
72
+ <div class="stat-value" id="analyzedFrames">-</div>
73
+ <div class="stat-label">Frames Analyzed</div>
74
+ </div>
75
+ </div>
76
+
77
+ <div class="progress-section">
78
+ <div class="progress-info">
79
+ <span>Current File:</span>
80
+ <span id="currentFile" class="current-file">None</span>
81
+ </div>
82
+ <div class="progress-bar">
83
+ <div class="progress-fill" id="progressFill"></div>
84
+ </div>
85
+ <div class="progress-text" id="progressText">0%</div>
86
+ </div>
87
+ </div>
88
+ </div>
89
+ </section>
90
+
91
+ <!-- Control Section -->
92
+ <section class="control-section">
93
+ <div class="card">
94
+ <div class="card-header">
95
+ <h2><i class="fas fa-cogs"></i> Processing Controls</h2>
96
+ </div>
97
+ <div class="card-content">
98
+ <div class="control-group">
99
+ <div class="input-group">
100
+ <label for="startIndex">Start Index for RAR Fetching:</label>
101
+ <input type="number" id="startIndex" min="0" value="0" class="input">
102
+ <span class="input-help">Specify which index to start processing from</span>
103
+ </div>
104
+ <div class="button-group">
105
+ <button id="startProcessing" class="btn btn-primary">
106
+ <i class="fas fa-play"></i>
107
+ Start Processing
108
+ </button>
109
+ <button id="stopProcessing" class="btn btn-danger">
110
+ <i class="fas fa-stop"></i>
111
+ Stop Processing
112
+ </button>
113
+ </div>
114
+ </div>
115
+ </div>
116
+ </div>
117
+ </section>
118
+
119
+ <!-- Analysis Data Files Section -->
120
+ <section class="files-section">
121
+ <div class="card">
122
+ <div class="card-header">
123
+ <h2><i class="fas fa-file-alt"></i> Analysis Results</h2>
124
+ <div class="file-count" id="fileCount">0 files</div>
125
+ </div>
126
+ <div class="card-content">
127
+ <div class="files-grid" id="filesGrid">
128
+ <!-- Files will be populated here -->
129
+ </div>
130
+ </div>
131
+ </div>
132
+ </section>
133
+
134
+ <!-- Logs Section -->
135
+ <section class="logs-section">
136
+ <div class="card">
137
+ <div class="card-header">
138
+ <h2><i class="fas fa-terminal"></i> Processing Logs</h2>
139
+ <div class="log-controls">
140
+ <button id="clearLogs" class="btn btn-secondary btn-sm">
141
+ <i class="fas fa-trash"></i>
142
+ Clear
143
+ </button>
144
+ <button id="autoScroll" class="btn btn-secondary btn-sm active">
145
+ <i class="fas fa-arrow-down"></i>
146
+ Auto-scroll
147
+ </button>
148
+ </div>
149
+ </div>
150
+ <div class="card-content">
151
+ <div class="logs-container" id="logsContainer">
152
+ <div class="log-entry">
153
+ <span class="log-time">[Loading...]</span>
154
+ <span class="log-message">Initializing dashboard...</span>
155
+ </div>
156
+ </div>
157
+ </div>
158
+ </div>
159
+ </section>
160
+ </main>
161
+ </div>
162
+
163
+ <!-- File Details Modal -->
164
+ <div id="fileModal" class="modal">
165
+ <div class="modal-content">
166
+ <div class="modal-header">
167
+ <h3 id="modalTitle">Analysis Details</h3>
168
+ <button class="modal-close" id="modalClose">
169
+ <i class="fas fa-times"></i>
170
+ </button>
171
+ </div>
172
+ <div class="modal-body" id="modalBody">
173
+ <!-- Analysis details will be populated here -->
174
+ </div>
175
+ <div class="modal-footer">
176
+ <button id="downloadFile" class="btn btn-primary">
177
+ <i class="fas fa-download"></i>
178
+ Download JSON
179
+ </button>
180
+ <button id="viewSummary" class="btn btn-secondary">
181
+ <i class="fas fa-list-alt"></i>
182
+ View Summary
183
+ </button>
184
+ </div>
185
+ </div>
186
+ </div>
187
+
188
+ <!-- Loading Overlay -->
189
+ <div id="loadingOverlay" class="loading-overlay">
190
+ <div class="loading-spinner">
191
+ <i class="fas fa-spinner fa-spin"></i>
192
+ <p>Loading...</p>
193
+ </div>
194
+ </div>
195
+
196
+ <!-- Toast Notifications -->
197
+ <div id="toastContainer" class="toast-container"></div>
198
+
199
+ <script src="/static/script.js"></script>
200
+ </body>
201
+ </html>
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ accelerate
3
+ fastapi
4
+ uvicorn
5
+ opencv-python-headless
6
+ numpy
7
+ pathlib  # NOTE: pathlib is stdlib since Python 3.4; the PyPI "pathlib" backport is obsolete and can shadow it β€” consider removing
8
+ huggingface_hub
9
+ pillow
10
+ rarfile
11
+ python-multipart
vision_analyzer.py ADDED
@@ -0,0 +1,564 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import requests
4
+ import subprocess
5
+ import shutil
6
+ import time
7
+ import re
8
+ import threading
9
+ from typing import Dict, List, Set, Optional
10
+ from huggingface_hub import HfApi, list_repo_files, CommitOperationAdd, hf_hub_download
11
+
12
+ import cv2
13
+ import numpy as np
14
+ from pathlib import Path
15
+ import smtplib
16
+ from email.message import EmailMessage
17
+
18
+ # ==== CONFIGURATION ====
19
+ # NOTE: The user provided a token, but we will rely on the environment variable
20
+ # or the token passed to the HfApi constructor, as per the original script's design.
21
+ HF_TOKEN = os.getenv("HF_TOKEN", "") # Using provided token as fallback
22
+ SOURCE_REPO_ID = os.getenv("SOURCE_REPO", "Fred808/BG1")
23
+ TARGET_REPO_ID = os.getenv("TARGET_REPO", "Fred808/BG3") # New target repo for uploads
24
+
25
+ # Path Configuration
26
+ DOWNLOAD_FOLDER = "downloads"
27
+ EXTRACT_FOLDER = "extracted"
28
+ FRAMES_OUTPUT_FOLDER = "extracted_frames"
29
+ ZIP_OUTPUT_FOLDER = "zipped_frames" # New folder for zip files
30
+ LOCAL_STATE_FOLDER = ".state" # Folder to temporarily store the downloaded state file
31
+
32
+ os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)
33
+ os.makedirs(EXTRACT_FOLDER, exist_ok=True)
34
+ os.makedirs(FRAMES_OUTPUT_FOLDER, exist_ok=True)
35
+ os.makedirs(ZIP_OUTPUT_FOLDER, exist_ok=True) # Create zip output folder
36
+ os.makedirs(LOCAL_STATE_FOLDER, exist_ok=True)
37
+
38
+ # State Files
39
+ # The local state files are now deprecated in favor of the remote HF_STATE_FILE
40
+ # DOWNLOAD_STATE_FILE = "download_progress.json"
41
+ # PROCESS_STATE_FILE = "process_progress.json"
42
+ FAILED_FILES_LOG = "failed_files.log"
43
+ HF_STATE_FILE = "processing_state.json" # New remote state file name
44
+
45
+ # Processing Parameters
46
+ CHUNK_SIZE = 2
47
+ PROCESSING_DELAY = 2
48
+ MAX_RETRIES = 3
49
+ MIN_FREE_SPACE_GB = 2 # Minimum free space in GB before processing
50
+
51
+ # Frame Extraction Parameters
52
+ DEFAULT_FPS = 3 # Default frames per second for extraction
53
+
54
+
55
+ # Initialize HF API
56
+ hf_api = HfApi(token=HF_TOKEN)
57
+
58
+ # Global State
59
+ processing_status = {
60
+ "is_running": False,
61
+ "current_file": None,
62
+ "total_files": 0,
63
+ "processed_files": 0,
64
+ "failed_files": 0,
65
+ "extracted_courses": 0,
66
+ "extracted_videos": 0,
67
+ "last_update": None,
68
+ "logs": []
69
+ }
70
+
71
def log_message(message: str):
    """Print a timestamped line and mirror it into the shared status dict.

    Also stamps processing_status["last_update"]; the log buffer is capped
    at the 100 most recent entries.
    """
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    log_entry = f"[{timestamp}] {message}"
    print(log_entry)
    logs = processing_status["logs"]
    logs.append(log_entry)
    processing_status["last_update"] = timestamp
    if len(logs) > 100:
        processing_status["logs"] = logs[-100:]
80
+
81
def log_failed_file(filename: str, error: str):
    """Append a timestamped failure record for *filename* to FAILED_FILES_LOG.

    Fix: the previous version ignored the *filename* parameter entirely and
    wrote a literal "(unknown)" placeholder into every log line.
    """
    with open(FAILED_FILES_LOG, "a") as f:
        f.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {filename}: {error}\n")
85
+
86
def get_disk_usage(path: str) -> Dict[str, float]:
    """Return disk usage statistics for *path* in GiB.

    Keys: "total", "free" (space available to this process) and
    "used" (total - free, matching the original statvfs-based computation).

    Fix: use shutil.disk_usage instead of os.statvfs — identical numbers on
    POSIX, but also works on Windows where os.statvfs does not exist.
    """
    usage = shutil.disk_usage(path)
    total = usage.total / (1024**3)
    free = usage.free / (1024**3)
    used = total - free
    return {"total": total, "free": free, "used": used}
93
+
94
def check_disk_space(path: str = ".") -> bool:
    """Return True when at least MIN_FREE_SPACE_GB of free space remains at *path*.

    Logs a warning and returns False otherwise.
    """
    disk_info = get_disk_usage(path)
    if disk_info["free"] >= MIN_FREE_SPACE_GB:
        return True
    log_message(f'⚠️ Low disk space: {disk_info["free"]:.2f}GB free, {disk_info["used"]:.2f}GB used')
    return False
101
+
102
def cleanup_temp_files():
    """Best-effort removal of stale archive downloads to reclaim disk space.

    Deletes every .rar/.zip file in DOWNLOAD_FOLDER except the one currently
    being processed. Failures are logged instead of silently swallowed
    (fix: replaced a bare `except: pass` that hid all errors, including
    programming errors like NameError).
    """
    log_message("🧹 Cleaning up temporary files...")

    # Clean old downloads (keep only current processing file)
    current_file = processing_status.get("current_file")
    for file in os.listdir(DOWNLOAD_FOLDER):
        if file == current_file or not file.endswith((".rar", ".zip")):
            continue
        try:
            os.remove(os.path.join(DOWNLOAD_FOLDER, file))
            log_message(f"πŸ—‘οΈ Removed old download: {file}")
        except OSError as e:
            # Only filesystem errors are expected here; log and continue.
            log_message(f"⚠️ Could not remove {file}: {e}")
115
+
116
def load_json_state(file_path: str, default_value):
    """Load state from a JSON file, falling back to *default_value*.

    Returns *default_value* when the file is missing, unreadable, or contains
    invalid JSON. Fix: also catch OSError — previously a file that existed
    but could not be read crashed the caller.
    """
    if os.path.exists(file_path):
        try:
            with open(file_path, "r") as f:
                return json.load(f)
        except (OSError, json.JSONDecodeError):
            log_message(f"⚠️ Corrupted state file: {file_path}")
    return default_value
125
+
126
def save_json_state(file_path: str, data):
    """Serialize *data* as pretty-printed (indent=2) JSON to *file_path*."""
    serialized = json.dumps(data, indent=2)
    with open(file_path, "w") as f:
        f.write(serialized)
130
+
131
def download_hf_state(repo_id: str, filename: str) -> Dict:
    """Download the processing-state JSON from a Hugging Face dataset repo.

    Returns a fresh default state ({"next_download_index": 0,
    "processed_rars": []}) when the file does not exist in the repo or the
    download fails for any reason, so a first run starts cleanly.

    Fix: the "not found" log line printed a literal "(unknown)" instead of
    the state-file name.
    """
    local_path = os.path.join(LOCAL_STATE_FOLDER, filename)
    default_state = {"next_download_index": 0, "processed_rars": []}

    try:
        # Check if the file exists in the repo first, to distinguish
        # "first run" from a genuine download failure.
        files = hf_api.list_repo_files(repo_id=repo_id, repo_type="dataset")
        if filename not in files:
            log_message(f"ℹ️ State file {filename} not found in {repo_id}. Starting from default state.")
            return default_state

        # Download the file into LOCAL_STATE_FOLDER as a real file (no symlink).
        hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            repo_type="dataset",
            local_dir=LOCAL_STATE_FOLDER,
            local_dir_use_symlinks=False
        )

        log_message(f"βœ… Successfully downloaded state file from {repo_id}.")
        return load_json_state(local_path, default_state)

    except Exception as e:
        log_message(f"⚠️ Failed to download state file from Hugging Face: {str(e)}. Starting from default state.")
        return default_state
158
+
159
def upload_hf_state(repo_id: str, filename: str, state: Dict) -> bool:
    """Persist *state* locally, then push it to the Hugging Face dataset repo.

    Returns True when the upload succeeds, False (after logging) otherwise.
    """
    local_path = os.path.join(LOCAL_STATE_FOLDER, filename)

    try:
        # Write the state to disk first so upload_file can read it from a path.
        save_json_state(local_path, state)

        commit_msg = f"Update processing state: next_index={state['next_download_index']}"
        hf_api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=filename,
            repo_id=repo_id,
            repo_type="dataset",
            commit_message=commit_msg,
        )
        log_message(f"βœ… Successfully uploaded updated state file to {repo_id}")
        return True
    except Exception as e:
        log_message(f"❌ Failed to upload state file to Hugging Face: {str(e)}")
        return False
180
+
181
+
182
def download_with_retry(url: str, dest_path: str, max_retries: int = 3) -> bool:
    """Stream *url* to *dest_path* with retries, backoff and disk-space guards.

    Returns True on success; False when disk space is insufficient, the file
    is larger than the remaining space, or all retries fail.

    Fixes: added a request timeout (a stalled connection previously hung the
    pipeline forever), skip empty keep-alive chunks, and delete any partial
    file before retrying so a truncated download is never mistaken for a
    complete one.
    """
    if not check_disk_space():
        cleanup_temp_files()
        if not check_disk_space():
            log_message("❌ Insufficient disk space even after cleanup")
            return False

    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    for attempt in range(max_retries):
        try:
            # (connect, read) timeout so a dead connection raises instead of hanging.
            with requests.get(url, headers=headers, stream=True, timeout=(10, 60)) as r:
                r.raise_for_status()

                # Refuse files that would not fit, when the size is advertised.
                content_length = r.headers.get("content-length")
                if content_length:
                    size_gb = int(content_length) / (1024**3)
                    disk_info = get_disk_usage(".")
                    if size_gb > disk_info["free"] - 0.5:  # Leave 0.5GB buffer
                        log_message(f'❌ File too large: {size_gb:.2f}GB, only {disk_info["free"]:.2f}GB free')
                        return False

                with open(dest_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:  # filter out keep-alive chunks
                            f.write(chunk)
                return True
        except Exception as e:
            # Remove any partial download so the next attempt starts clean.
            try:
                if os.path.exists(dest_path):
                    os.remove(dest_path)
            except OSError:
                pass
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s...
                continue
            log_message(f"❌ Download failed after {max_retries} attempts: {e}")
            return False
    return False
216
+
217
def is_multipart_rar(filename: str) -> bool:
    """Return True for multi-part RAR names, e.g. 'course.part1.rar'.

    Case-insensitive: the name must contain '.part' and end with '.rar'.
    """
    lowered = filename.lower()
    return lowered.endswith(".rar") and ".part" in lowered
220
+
221
def get_rar_part_base(filename: str) -> str:
    """Return the base (course) name for a RAR file name.

    'course.part1.rar' -> 'course'; 'course.rar' -> 'course'; other names are
    returned unchanged.

    Fixes: the '.part' detection was case-insensitive but the split was
    case-sensitive, so 'A.PART1.RAR' was returned unmodified; and
    str.replace(".rar", "") stripped '.rar' anywhere in the name instead of
    only the extension.
    """
    lowered = filename.lower()
    part_idx = lowered.find(".part")
    if part_idx != -1:
        return filename[:part_idx]
    if lowered.endswith(".rar"):
        return filename[:-len(".rar")]
    return filename
226
+
227
def extract_with_retry(rar_path: str, output_dir: str, max_retries: int = 2) -> bool:
    """Extract a RAR archive into *output_dir* via the `unrar` CLI, with retries.

    Each RAR part is processed independently: a "Cannot find volume" error
    from unrar (expected when sibling parts of a multi-part archive are
    absent) is deliberately treated as success so whatever was extractable
    from this part is kept. Returns True on (possibly partial) success.
    """
    # NOTE(review): `filename` is computed but never used below.
    filename = os.path.basename(rar_path)

    # The user requested to process each RAR file independently,
    # regardless of whether it is a multi-part archive or not.
    # We will attempt to extract the current file directly.
    log_message("πŸ“¦ Processing RAR file independently as requested by user.")

    for attempt in range(max_retries):
        try:
            # Extract RAR
            # 'e' extracts without recreating archive paths; '-o+' overwrites
            # existing files; '-kb' keeps broken (partially extracted) files,
            # which might be the video file itself.
            # The 'unrar t' integrity test is skipped because it fails for
            # multi-part archives when parts are missing.
            cmd = ["unrar", "e", "-o+", "-kb", rar_path, output_dir]

            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode == 0:
                log_message(f"βœ… Successfully extracted: {os.path.basename(rar_path)}")
                return True
            else:
                error_msg = result.stderr or result.stdout
                log_message(f"⚠️ Extraction attempt {attempt + 1} failed: {error_msg}")

                # Expected for independently-processed multi-part archives:
                # treat the missing-volume error as a successful partial extract.
                if "Cannot find volume" in error_msg:
                    log_message(f"βœ… Extracted all possible files from independent part (expected volume error).")
                    return True

                if "checksum error" in error_msg.lower() or "CRC failed" in error_msg:
                    log_message(f"⚠️ Data corruption detected, attempt {attempt + 1}")
                elif result.returncode == 10:
                    # Nothing matched/extracted — retrying will not help.
                    log_message(f"⚠️ No files to extract (exit code 10)")
                    return False
                elif result.returncode == 1:
                    log_message(f"⚠️ Non-fatal error (exit code 1)")

        except Exception as e:
            log_message(f"❌ Extraction exception: {str(e)}")
            if attempt == max_retries - 1:
                return False
            time.sleep(1)

    return False
273
+
274
+ # --- Frame Extraction Utilities ---
275
def ensure_dir(path):
    """Create *path* (including parents) if it does not already exist."""
    Path(path).mkdir(parents=True, exist_ok=True)
277
+
278
def extract_frames(video_path, output_dir, fps=DEFAULT_FPS):
    """Extract frames from video at the specified frames per second (fps).

    Frames are written as zero-padded PNGs (0001.png, 0002.png, ...) into
    *output_dir*. Returns the number of frames saved; 0 when the video
    cannot be opened.

    Fix: frame_interval is clamped to at least 1 — previously, a video whose
    own frame rate was below the requested *fps* produced an interval of 0
    and crashed with ZeroDivisionError in the modulo below.
    """
    log_message(f"[INFO] Extracting frames from {video_path} to {output_dir} at {fps} fps...")
    ensure_dir(output_dir)
    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        log_message(f"[ERROR] Failed to open video file: {video_path}")
        return 0
    video_fps = cap.get(cv2.CAP_PROP_FPS)
    if not video_fps or video_fps <= 0:
        video_fps = 30  # fallback if FPS is not available
        log_message(f"[WARN] Using fallback FPS: {video_fps}")
    # Save every Nth frame; clamp to >= 1 so we never divide by zero.
    frame_interval = max(1, int(round(video_fps / fps)))
    frame_idx = 0
    saved_idx = 1
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    log_message(f"[DEBUG] Total frames in video: {total_frames}")
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_idx % frame_interval == 0:
            frame_name = f"{saved_idx:04d}.png"
            cv2.imwrite(str(Path(output_dir) / frame_name), frame)
            saved_idx += 1
        frame_idx += 1
    cap.release()
    frame_count = saved_idx - 1
    log_message(f"Extracted {frame_count} frames from {video_path} to {output_dir}")
    return frame_count
312
+
313
def zip_frames_folder(frames_dir: str, output_folder: str) -> Optional[str]:
    """Create <output_folder>/<basename(frames_dir)>.zip from a frames directory.

    Returns the archive path on success, or None when zipping fails.
    """
    base_dir_name = os.path.basename(frames_dir)
    # Target path for the archive, without the .zip extension
    zip_base_name = os.path.join(output_folder, base_dir_name)

    log_message(f"[INFO] Zipping frames from {frames_dir} to {zip_base_name}.zip...")
    try:
        # Archive the frames directory itself (base_dir), rooted at its
        # parent (root_dir), so the zip contains one top-level folder.
        parent_dir = os.path.dirname(frames_dir)
        archive_path = shutil.make_archive(zip_base_name, 'zip', parent_dir, base_dir_name)
        log_message(f"βœ… Successfully created zip file: {archive_path}")
        return archive_path
    except Exception as e:
        log_message(f"❌ Failed to create zip archive for {frames_dir}: {str(e)}")
        return None
335
+
336
def upload_to_huggingface(file_path: str, repo_id: str, path_in_repo: str) -> bool:
    """Upload a single local file to a Hugging Face dataset repository.

    Returns True on success, False (after logging) on any failure.

    Fix: the computed *filename* was never used — the log lines and the
    commit message all contained a literal "(unknown)" placeholder.
    """
    filename = os.path.basename(file_path)
    log_message(f"[INFO] Uploading {filename} to {repo_id}/{path_in_repo}...")
    try:
        # Use HfApi.upload_file for a single file upload
        hf_api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=path_in_repo,
            repo_id=repo_id,
            repo_type="dataset",
            commit_message=f"Add extracted frames zip: {filename}"
        )
        log_message(f"βœ… Successfully uploaded {filename} to {repo_id}")
        return True
    except Exception as e:
        log_message(f"❌ Failed to upload {filename} to Hugging Face: {str(e)}")
        return False
354
+
355
+
356
def process_rar_file(rar_path: str) -> bool:
    """Process one RAR file: extract it, then for each contained video
    extract frames, zip them, and upload the zip to TARGET_REPO_ID.

    Returns True when the archive was extracted (per-video upload failures
    are logged but do not fail the whole archive); False on extraction
    failure, which is also recorded via log_failed_file.

    Fixes: local zip/frames cleanup now runs only after a successful upload
    (previously it ran unconditionally, deleting the very file the failure
    path promised to keep "for manual inspection"); the processing log line
    now includes the filename instead of a literal "(unknown)".
    """
    filename = os.path.basename(rar_path)
    processing_status["current_file"] = filename

    # Handle multi-part RAR naming (course.part1.rar -> course)
    if is_multipart_rar(filename):
        course_name = get_rar_part_base(filename)
    else:
        course_name = filename.replace(".rar", "")

    extract_dir = os.path.join(EXTRACT_FOLDER, course_name)

    try:
        log_message(f"πŸ”„ Processing: {filename}")

        # An existing extract_dir is intentionally NOT removed, so files
        # already extracted from earlier parts of a multi-part archive survive.
        os.makedirs(extract_dir, exist_ok=True)
        if not extract_with_retry(rar_path, extract_dir):
            raise Exception("RAR extraction failed")

        # Count extracted files and collect video paths
        file_count = 0
        video_files_found = []
        for root, dirs, files in os.walk(extract_dir):
            for file in files:
                file_count += 1
                if file.lower().endswith(('.mp4', '.avi', '.mov', '.mkv', '.webm')):
                    video_files_found.append(os.path.join(root, file))

        processing_status["extracted_courses"] += 1
        log_message(f"βœ… Successfully extracted '{course_name}' ({file_count} files, {len(video_files_found)} videos)")

        # Per-video pipeline: frames -> zip -> upload -> cleanup
        for video_path in video_files_found:
            video_filename = Path(video_path).name
            # Unique output directory for this video's frames
            frames_output_dir_name = f"{course_name}_{video_filename.replace('.', '_')}_frames"
            frames_output_dir = os.path.join(FRAMES_OUTPUT_FOLDER, frames_output_dir_name)
            ensure_dir(frames_output_dir)

            # 1. Extract frames
            frame_count = extract_frames(video_path, frames_output_dir, fps=DEFAULT_FPS)
            processing_status["extracted_videos"] += 1

            if frame_count == 0:
                log_message(f"⚠️ No frames extracted from {video_filename}. Skipping zip/upload.")
                shutil.rmtree(frames_output_dir, ignore_errors=True)  # drop the empty dir
                continue
            log_message(f"βœ… {frame_count} frames extracted from {video_filename}")

            # 2. Zip the extracted frames
            zip_path = zip_frames_folder(frames_output_dir, ZIP_OUTPUT_FOLDER)
            if not zip_path:
                continue

            # 3. Upload the zip file to Hugging Face
            path_in_repo = f"frames/{os.path.basename(zip_path)}"
            if upload_to_huggingface(zip_path, TARGET_REPO_ID, path_in_repo):
                log_message(f"βœ… Upload successful for {os.path.basename(zip_path)}")

                # 4. Clean up local zip and frame files ONLY after a
                # successful upload, so failed uploads remain inspectable.
                try:
                    os.remove(zip_path)
                    log_message(f"πŸ—‘οΈ Cleaned up local zip file: {os.path.basename(zip_path)}")
                    shutil.rmtree(frames_output_dir, ignore_errors=True)
                    log_message(f"πŸ—‘οΈ Cleaned up local frames directory: {frames_output_dir_name}")
                except Exception as e:
                    log_message(f"⚠️ Failed to clean up temporary files: {str(e)}")
            else:
                log_message(f"❌ Upload failed for {os.path.basename(zip_path)}. Keeping file for manual inspection.")
                # Continue with the next video; do not raise.

        return True

    except Exception as e:
        error_msg = str(e)
        log_message(f"❌ Processing failed: {error_msg}")
        log_failed_file(filename, error_msg)
        return False

    finally:
        processing_status["current_file"] = None
447
+
448
+
449
def main_processing_loop(start_index: int = 0):
    """Main processing workflow: download RARs from the source repo, extract
    them, process videos into frame zips, and track a resume cursor.

    Resumes from the remote state stored in TARGET_REPO_ID unless a positive
    *start_index* overrides it. The state (cursor + processed file list) is
    re-uploaded after every file so an interrupted run continues where it
    stopped.

    Fixes: log messages now include the actual filename (they previously
    printed a literal "(unknown)" placeholder); the cursor fallback default
    is 0, matching download_hf_state's default_state (it was 1, an
    unreachable but inconsistent value); the bare `except: pass` on cleanup
    is narrowed to OSError.
    """
    processing_status["is_running"] = True

    try:
        # Load resume state from Hugging Face
        remote_state = download_hf_state(TARGET_REPO_ID, HF_STATE_FILE)
        processed_rars = remote_state.get("processed_rars", [])

        # A positive start_index overrides the remote cursor.
        next_index = start_index if start_index > 0 else remote_state.get("next_download_index", 0)

        log_message(f"πŸ“Š Starting from index {next_index}")
        log_message(f"πŸ“Š Previously processed: {len(processed_rars)} files")

        # Get file list from the source repo
        try:
            files = list(hf_api.list_repo_files(repo_id=SOURCE_REPO_ID, repo_type="dataset"))
            rar_files = sorted([f for f in files if f.endswith(".rar")])

            processing_status["total_files"] = len(rar_files)
            log_message(f"πŸ“ Found {len(rar_files)} RAR files in repository")

            if next_index >= len(rar_files):
                log_message("βœ… All files have been processed!")
                return

        except Exception as e:
            log_message(f"❌ Failed to get file list: {str(e)}")
            return

        # Loop through all files starting from next_index
        for i in range(next_index, len(rar_files)):
            rar_file = rar_files[i]
            filename = os.path.basename(rar_file)

            # 1. Skip files already recorded as processed in the remote state
            if filename in processed_rars:
                log_message(f"⏭️ Skipping already processed: {filename}")
                processing_status["processed_files"] += 1
                next_index = i + 1  # advance the cursor even when skipping
                remote_state["next_download_index"] = next_index
                upload_hf_state(TARGET_REPO_ID, HF_STATE_FILE, remote_state)
                continue

            log_message(f"πŸ“₯ Downloading: {filename}")
            dest_path = os.path.join(DOWNLOAD_FOLDER, filename)

            # 2. Download file
            download_url = f"https://huggingface.co/datasets/{SOURCE_REPO_ID}/resolve/main/{rar_file}"
            if download_with_retry(download_url, dest_path):

                # 3. Process file (extract, frame extract, zip, upload)
                if process_rar_file(dest_path):
                    processed_rars.append(filename)
                    log_message(f"βœ… Successfully processed: {filename}")
                    processing_status["processed_files"] += 1

                    # Immediate cleanup of the downloaded RAR file
                    try:
                        os.remove(dest_path)
                        log_message(f"πŸ—‘οΈ Cleaned up downloaded RAR file: {filename}")
                    except Exception as e:
                        log_message(f"⚠️ Failed to clean up downloaded RAR file {filename}: {str(e)}")

                else:
                    log_message(f"❌ Failed to process: {filename}")
                    processing_status["failed_files"] += 1

                    # Clean up the download so a later retry starts clean
                    try:
                        os.remove(dest_path)
                        log_message(f"πŸ—‘οΈ Cleaned up failed download: {filename}")
                    except OSError:
                        pass
            else:
                log_message(f"❌ Failed to download: {filename}")
                processing_status["failed_files"] += 1

            # 4. Update and upload the cursor state for the next run
            next_index = i + 1
            remote_state["next_download_index"] = next_index
            remote_state["processed_rars"] = processed_rars
            upload_hf_state(TARGET_REPO_ID, HF_STATE_FILE, remote_state)

            # Pause between files to prevent resource exhaustion
            time.sleep(PROCESSING_DELAY)
        # --- END OF LOOP ---

        if next_index >= len(rar_files):
            log_message("πŸŽ‰ All files have been processed!")

        log_message("πŸŽ‰ Processing complete!")
        log_message(f'πŸ“Š Final stats: {processing_status["extracted_courses"]} courses extracted, {processing_status["extracted_videos"]} videos processed, frames extracted')

    except KeyboardInterrupt:
        log_message("⏹️ Processing interrupted by user")
    except Exception as e:
        log_message(f"❌ Fatal error: {str(e)}")
    finally:
        processing_status["is_running"] = False
        cleanup_temp_files()
555
+
556
# Expose necessary functions and variables for download_api.py
# (declares this module's public API explicitly; everything else is internal).
__all__ = [
    "main_processing_loop",
    "processing_status",
    "log_message",
    "extract_frames",
    "DEFAULT_FPS",
    "ensure_dir"
]