compressor / server.py
factorstudios's picture
Update server.py
9144bbc verified
raw
history blame
28.3 kB
#!/usr/bin/env python3
"""
FastAPI Video Compression Server
Scans a Hugging Face dataset for large videos and compresses them: one scan runs
about 30 seconds after startup, and further scans are triggered via POST /scan.
"""
import os
import sys
import json
import asyncio
import subprocess
import tempfile
import threading
import time
from pathlib import Path
from datetime import datetime
from dotenv import load_dotenv
from typing import List, Dict, Optional
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
import uvicorn
try:
from huggingface_hub import list_repo_files, hf_hub_download, upload_file
except ImportError:
print("Missing dependency: huggingface_hub")
sys.exit(1)
# Load environment variables
# load_dotenv() runs before HF_TOKEN is read so that a .env file can supply it.
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    # Abort at import time: every Hub call in this file passes token=HF_TOKEN.
    print("Error: Missing HF_TOKEN in .env")
    sys.exit(1)
# Configuration
HF_DATASET_REPO = "factorstudios/movs"  # dataset repo id used for every Hub call
READY_VIDEOS_FOLDER = "ready_videos"  # repo folder scanned for source .mp4 files
COMPRESSED_FOLDER = "compressed"  # repo folder that receives compressed uploads
SIZE_THRESHOLD_MB = 100 # Compress videos above 100MB
CACHE_DIR = "/tmp/video_compress_cache"  # passed as cache_dir to hf_hub_download
TEMP_DIR = Path("/tmp/video_compression_server")  # local scratch dir for encoder output
# Created at import time; exist_ok=True makes restarts harmless.
TEMP_DIR.mkdir(exist_ok=True)
# Local copy of the progress JSON, and the path it is uploaded to inside the repo.
PROGRESS_FILE = TEMP_DIR / "compression_progress.json"
PROGRESS_FILE_REPO = f"{COMPRESSED_FOLDER}/compression_progress.json"
# Global state
# In-memory, per-process status shared by the scan coroutine and the HTTP
# endpoints. It is separate from the persistent progress JSON stored in the repo.
compression_state = {
    "is_running": False,
    "is_polling": False,  # True while scan_and_compress_videos() is executing
    "total_found": 0,  # unprocessed videos discovered by the latest scan
    "total_compressed": 0,
    "total_uploaded": 0,
    "current_video": None,  # file name of the video being handled
    "last_error": None,  # str() of the most recent recorded exception
    "processed_videos": [],  # reported by / and /status, cleared by /reset
    "failed_videos": [],  # reported by / and /status, cleared by /reset
    "total_space_saved_mb": 0,
    "compression_stats": {}  # /stats averages each value's "compression_ratio"
}
app = FastAPI(title="Video Compression Server")
def get_video_duration(video_path: str) -> float:
    """Get video duration in seconds using ffprobe.

    Args:
        video_path: Path of the media file to probe.

    Returns:
        The container duration in seconds, or 0.0 when it cannot be
        determined (ffprobe missing or not executable, unreadable file,
        or non-numeric output).
    """
    cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        # Bare value only: no [FORMAT] wrapper and no "duration=" key.
        "-of", "default=noprint_wrappers=1:nokey=1",
        video_path
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
        return float(result.stdout.strip())
    except (OSError, ValueError):
        # OSError: ffprobe could not be launched. ValueError: empty or
        # non-numeric output (ffprobe prints nothing to stdout on error).
        return 0.0
def get_video_bitrate(video_path: str) -> float:
    """Get the bitrate of the first video stream in kbps using ffprobe.

    Args:
        video_path: Path of the media file to probe.

    Returns:
        The stream bit rate divided by 1000, or 0.0 when it cannot be
        determined (ffprobe missing or not executable, unreadable file,
        or non-numeric output).
    """
    cmd = [
        "ffprobe", "-v", "error",
        "-select_streams", "v:0",
        "-show_entries", "stream=bit_rate",
        "-of", "default=noprint_wrappers=1:nokey=1",
        video_path
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
        # ffprobe reports bits per second; callers expect kbps.
        return float(result.stdout.strip()) / 1000
    except (OSError, ValueError):
        # OSError: ffprobe could not be launched. ValueError: empty or
        # non-numeric output.
        return 0.0
def compress_video(input_path: str, output_path: str) -> Optional[Dict]:
    """
    Compress a video with H.265 (libx265), aiming for roughly 80MB of output.

    The bitrate is derived from the duration so the result lands near 80MB,
    then clamped to 400-1200 kbps, so very short or very long inputs will not
    hit that size.

    Args:
        input_path: Local path of the source video.
        output_path: Local path the encoded file is written to (overwritten).

    Returns:
        A dict with original_size_mb, compressed_size_mb, saved_mb,
        compression_ratio and duration_seconds on success; None when encoding
        fails, times out, or produces no usable output. On failure any partial
        output file is removed.

    Raises:
        OSError: if input_path cannot be stat'ed (e.g. it does not exist).
    """
    print(f"\n{'='*80}")
    print(f"COMPRESSING: {Path(input_path).name}")
    print(f"{'='*80}")
    # Get video info
    duration = get_video_duration(input_path)
    original_size = os.path.getsize(input_path) / (1024**2)
    print(f"Original: {original_size:.1f} MB | Duration: {duration:.1f}s")
    print("Target: ~80MB with high quality preservation")
    # Calculate target bitrate for ~80MB file. max(duration, 1) guards the
    # division when ffprobe could not report a duration (returns 0).
    target_mb = 80
    target_bitrate_kbps = int((target_mb * 8 * 1024) / max(duration, 1))
    video_bitrate = max(400, min(target_bitrate_kbps, 1200))
    print(f"Calculated bitrate: {video_bitrate}kbps")
    # H.265 encoding with balanced compression targeting ~80MB
    ffmpeg_cmd = [
        "ffmpeg", "-y",
        "-i", input_path,
        "-c:v", "libx265",
        "-preset", "medium",
        "-crf", "26",
        "-b:v", f"{video_bitrate}k",
        "-maxrate", f"{int(video_bitrate * 1.2)}k",
        "-bufsize", f"{int(video_bitrate * 2)}k",
        "-x265-params", "aq-mode=3:log-level=error",
        "-pix_fmt", "yuv420p",
        "-c:a", "aac",
        "-b:a", "128k",
        "-progress", "pipe:1",
        output_path
    ]
    stats = None
    try:
        # 14400s = 4 hours, the limit quoted in the timeout message below.
        result = subprocess.run(ffmpeg_cmd, capture_output=True, text=True, timeout=14400)
        if result.returncode != 0:
            print(f"✗ Encoding failed with code {result.returncode}")
            if result.stderr:
                print(f"Error output: {result.stderr[:500]}")
        elif not os.path.exists(output_path):
            print("✗ Output file not created")
        else:
            compressed_size = os.path.getsize(output_path) / (1024**2)
            if compressed_size <= 0:
                # Checked explicitly so the ratio below cannot divide by zero.
                print("✗ Output file is empty")
            else:
                compression_ratio = original_size / compressed_size
                saved_mb = original_size - compressed_size
                print(f"✓ Compressed: {original_size:.1f}MB → {compressed_size:.1f}MB")
                print(f" Saved: {saved_mb:.1f}MB ({(saved_mb/original_size*100):.1f}%)")
                print(f" Ratio: {compression_ratio:.2f}x")
                stats = {
                    "original_size_mb": round(original_size, 2),
                    "compressed_size_mb": round(compressed_size, 2),
                    "saved_mb": round(saved_mb, 2),
                    "compression_ratio": round(compression_ratio, 2),
                    "duration_seconds": duration
                }
    except subprocess.TimeoutExpired:
        print("✗ Compression timed out (>4 hours)")
    except Exception as e:
        # Best-effort boundary: callers only distinguish "stats" from None.
        print(f"✗ Compression error: {e}")
        stats = None
    if stats is None:
        # Do not leave a truncated or empty encode behind in the temp directory.
        try:
            os.remove(output_path)
        except OSError:
            pass
    return stats
async def load_progress_file() -> Dict:
    """Load compression progress from the JSON file stored in the HF dataset.

    The blocking Hub download runs in the event loop's default executor so
    that awaiting this coroutine does not stall other requests.

    Returns:
        The progress dict, guaranteed to contain "compressed", "failed" and
        "processing" lists. If the file is missing, unreadable, or not a JSON
        object, a fresh empty progress dict is returned instead.
    """
    try:
        print("Attempting to load progress file from dataset...")
        # Try to download existing progress file
        loop = asyncio.get_running_loop()
        progress_path = await loop.run_in_executor(
            None,
            lambda: hf_hub_download(
                repo_id=HF_DATASET_REPO,
                filename=PROGRESS_FILE_REPO,
                repo_type="dataset",
                token=HF_TOKEN,
                cache_dir=CACHE_DIR
            )
        )
        if os.path.islink(progress_path):
            progress_path = os.path.realpath(progress_path)
        with open(progress_path, 'r', encoding='utf-8') as f:
            progress = json.load(f)
        if not isinstance(progress, dict):
            raise ValueError("progress file is not a JSON object")
        # Ensure all required keys exist (for backwards compatibility with old progress files)
        for key in ("processing", "compressed", "failed"):
            progress.setdefault(key, [])
        compressed_count = len(progress["compressed"])
        failed_count = len(progress["failed"])
        processing_count = len(progress["processing"])
        print(f"✓ Loaded progress: {compressed_count} compressed, {failed_count} failed, {processing_count} processing")
        return progress
    except Exception as e:
        # NOTE(review): this also swallows transient network/auth failures, so
        # a later save_progress_file() of the returned dict would upload an
        # empty history over the real one - confirm whether that is acceptable.
        print(f"⊘ No existing progress file or load failed: {str(e)[:100]}")
        return {"compressed": [], "failed": [], "processing": [], "last_updated": datetime.now().isoformat()}
async def save_progress_file(progress: Dict):
    """Save compression progress locally and upload it to the HF dataset.

    Stamps progress["last_updated"], writes the dict to PROGRESS_FILE, then
    uploads that file to PROGRESS_FILE_REPO. The blocking upload runs in the
    event loop's default executor. Failures are reported as a warning and
    never raised, so a failed save cannot abort a scan.

    Args:
        progress: The progress dict; mutated in place ("last_updated").
    """
    try:
        progress["last_updated"] = datetime.now().isoformat()
        # The snapshot is written before the first await, so later mutations
        # of `progress` by the caller cannot leak into this upload.
        with open(PROGRESS_FILE, 'w', encoding='utf-8') as f:
            json.dump(progress, f, indent=2)
        print("Uploading progress file...")
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(
            None,
            lambda: upload_file(
                path_or_fileobj=str(PROGRESS_FILE),
                path_in_repo=PROGRESS_FILE_REPO,
                repo_id=HF_DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN,
                commit_message="Update compression progress"
            )
        )
        print("✓ Progress file uploaded")
    except Exception as e:
        print(f"Warning: Could not save progress file: {e}")
# A "processing" entry older than this is assumed to belong to a crashed run.
# 4 hours matches the ffmpeg timeout used by compress_video().
_STALE_AFTER_SECONDS = 4 * 3600
# A failed file is retried until its retry_count reaches this value.
_MAX_RETRIES = 2


async def _run_blocking(func, *args, **kwargs):
    """Run a blocking callable in the loop's default executor and await it."""
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, lambda: func(*args, **kwargs))


def _banner(title: str) -> None:
    """Print a section title framed by two 80-character rules."""
    print("\n" + "="*80)
    print(title)
    print("="*80)


def _without_path(entries: List[Dict], path: str) -> List[Dict]:
    """Return a copy of entries without those whose "path" equals path."""
    return [entry for entry in entries if entry["path"] != path]


def _movie_name(repo_path: str) -> str:
    """Return the second path component (the movie folder), or "unknown"."""
    parts = repo_path.split("/")
    return parts[1] if len(parts) > 1 else "unknown"


def _recover_stale_processing(progress: Dict) -> int:
    """Move stale "processing" entries into "failed"; return how many moved."""
    now = datetime.now()
    stale_items = []
    for item in progress["processing"]:
        try:
            started_at = datetime.fromisoformat(item["started_at"])
            age_seconds = (now - started_at).total_seconds()
        except (KeyError, TypeError, ValueError):
            # Without a usable timestamp the entry could never age out and
            # would block its file forever, so recover it right away.
            print(f" ⚠ {item['file_name']} (stale, unreadable started_at)")
            stale_items.append(item)
            continue
        if age_seconds > _STALE_AFTER_SECONDS:
            print(f" ⚠ {item['file_name']} (stale, {age_seconds/3600:.1f}h old)")
            stale_items.append(item)
    if not stale_items:
        print("✓ No stale processing files")
        return 0
    # Move stale files back to failed for retry
    print(f"\nRecovering {len(stale_items)} stale processing files...")
    for item in stale_items:
        progress["processing"] = _without_path(progress["processing"], item["path"])
        progress["failed"].append({
            "path": item["path"],
            "file_name": item["file_name"],
            "error": "Server crash detected (stale processing)",
            "retry_count": 0,
            "timestamp": datetime.now().isoformat()
        })
    return len(stale_items)


async def _process_video(video_info: Dict) -> Dict:
    """Download one video, compress it if large enough, and upload the result.

    Returns the entry to append to progress["compressed"] (status "skipped"
    or "compressed_uploaded"). Raises on any download/encode/upload failure.
    """
    video_path = await _run_blocking(
        hf_hub_download,
        repo_id=HF_DATASET_REPO,
        filename=video_info["path"],
        repo_type="dataset",
        token=HF_TOKEN,
        cache_dir=CACHE_DIR
    )
    # NOTE(review): downloaded originals stay in CACHE_DIR; nothing in this
    # file evicts them - confirm the disk budget of the deployment.
    if os.path.islink(video_path):
        video_path = os.path.realpath(video_path)
    file_size_mb = os.path.getsize(video_path) / (1024**2)
    print(f"✓ Downloaded: {file_size_mb:.1f} MB")
    # Check if meets compression threshold (applies to retries as well)
    if file_size_mb < SIZE_THRESHOLD_MB:
        print(f"⊘ Below threshold ({file_size_mb:.1f}MB < {SIZE_THRESHOLD_MB}MB), skipping")
        return {
            "path": video_info["path"],
            "file_name": video_info["file_name"],
            "status": "skipped",
            "reason": f"Below {SIZE_THRESHOLD_MB}MB threshold",
            "timestamp": datetime.now().isoformat()
        }
    print("✓ Starting H.265 compression...")
    upload_filename = f"{Path(video_info['file_name']).stem}_compressed.mp4"
    output_path = TEMP_DIR / upload_filename
    try:
        compression_stats = await _run_blocking(compress_video, video_path, str(output_path))
        if not compression_stats:
            raise Exception("Compression failed")
        compression_state["total_compressed"] += 1
        # Upload compressed video to COMPRESSED folder with same structure
        upload_path = f"{COMPRESSED_FOLDER}/{video_info['movie_name']}/{upload_filename}"
        print(f"Uploading: {upload_path}")
        await _run_blocking(
            upload_file,
            path_or_fileobj=str(output_path),
            path_in_repo=upload_path,
            repo_id=HF_DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message=f"Add H.265 compressed video: {upload_filename}"
        )
        compression_state["total_uploaded"] += 1
        compression_state["total_space_saved_mb"] += compression_stats["saved_mb"]
    finally:
        # Remove the local encode whether or not the upload succeeded.
        try:
            os.remove(output_path)
        except OSError:
            pass
    return {
        "path": video_info["path"],
        "file_name": video_info["file_name"],
        "upload_path": upload_path,
        "status": "compressed_uploaded",
        "stats": compression_stats,
        "timestamp": datetime.now().isoformat()
    }


async def _handle_video(progress: Dict, video_info: Dict,
                        previous_failure: Optional[Dict] = None) -> None:
    """Process one video and record the outcome in progress and compression_state.

    previous_failure is the old "failed" entry when this is a retry, else None.
    The progress file is saved before and after the attempt.
    """
    is_retry = previous_failure is not None
    path = video_info["path"]
    compression_state["current_video"] = video_info["file_name"]
    # Mark as processing
    if is_retry:
        progress["failed"] = _without_path(progress["failed"], path)
    progress["processing"].append({
        "path": path,
        "file_name": video_info["file_name"],
        "status": "retrying" if is_retry else "compressing",
        "started_at": datetime.now().isoformat()
    })
    await save_progress_file(progress)
    try:
        print(f"\nRetrying: {path}" if is_retry else f"\nDownloading: {path}")
        entry = await _process_video(video_info)
    except Exception as e:
        print(f"✗ Retry failed: {e}" if is_retry else f"✗ Error: {e}")
        compression_state["last_error"] = str(e)
        if path not in compression_state["failed_videos"]:
            compression_state["failed_videos"].append(path)
        now = datetime.now().isoformat()
        failure = {
            "path": path,
            "file_name": video_info["file_name"],
            "error": str(e),
            "retry_count": 0,
            "timestamp": now
        }
        if is_retry:
            # Keep the first-failure timestamp and bump the attempt counter.
            failure.update({
                "retry_count": previous_failure.get("retry_count", 0) + 1,
                "last_error": str(e),
                "last_attempt": now,
                "timestamp": previous_failure.get("timestamp", now)
            })
        progress["processing"] = _without_path(progress["processing"], path)
        progress["failed"].append(failure)
    else:
        if is_retry:
            entry["retry_success"] = True
            print("✓ Retry successful!")
        progress["processing"] = _without_path(progress["processing"], path)
        progress["compressed"].append(entry)
        # Feed the in-memory views served by /, /status and /stats.
        compression_state["processed_videos"].append(path)
        if "stats" in entry:
            compression_state["compression_stats"][path] = entry["stats"]
    await save_progress_file(progress)


async def scan_and_compress_videos():
    """Scan ready_videos folder and compress large videos using persistent progress tracking.

    Steps, in order:
      1. Load the progress file and move stale "processing" entries to "failed".
      2. Retry previously failed files that have attempts left.
      3. List the dataset and process every .mp4 under READY_VIDEOS_FOLDER that
         is not already compressed, failed, or being processed.

    Blocking Hub and ffmpeg calls run in the default executor, so the event
    loop stays responsive while a scan is in progress. Errors are recorded in
    compression_state["last_error"] and never raised to the caller.
    """
    if compression_state["is_polling"]:
        print("Already polling, skipping...")
        return
    compression_state["is_polling"] = True
    try:
        _banner("LOADING COMPRESSION PROGRESS")
        # Load progress file
        progress = await load_progress_file()

        # Clean up stale processing files (>4 hours old = likely crashed server)
        _banner("CHECKING FOR STALE PROCESSING FILES")
        if _recover_stale_processing(progress):
            await save_progress_file(progress)
        print(f"Current state: {len(progress['compressed'])} compressed, {len(progress['failed'])} failed, {len(progress['processing'])} processing")

        _banner("RETRYING PREVIOUSLY FAILED FILES")
        # Retry failed files (max 2 attempts per file)
        files_to_retry = []
        for failed_item in progress["failed"]:
            retry_count = failed_item.get("retry_count", 0)
            if retry_count < _MAX_RETRIES:
                files_to_retry.append(failed_item)
                print(f" ⟳ {failed_item['file_name']} (attempt {retry_count + 1}/{_MAX_RETRIES})")
            else:
                print(f" ✗ {failed_item['file_name']} (max retries reached)")
        # Process retry files
        for failed_item in files_to_retry:
            video_info = {
                "path": failed_item["path"],
                "movie_name": _movie_name(failed_item["path"]),
                "file_name": failed_item["file_name"]
            }
            await _handle_video(progress, video_info, previous_failure=failed_item)

        _banner("SCANNING FOR NEW LARGE VIDEOS")
        try:
            # List all files in ready_videos
            print("Connecting to Hugging Face dataset...")
            files = await _run_blocking(
                list_repo_files,
                repo_id=HF_DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN
            )
            print(f"✓ Found {len(files)} total files in dataset")
        except Exception as e:
            print(f"✗ Error listing files: {e}")
            compression_state["last_error"] = str(e)
            return

        # Build the skip sets only now, after stale recovery and retries, so a
        # file handled earlier in this scan is not picked up again as "new".
        compressed_files = {item["path"] for item in progress["compressed"]}
        failed_paths = {item["path"] for item in progress["failed"]}
        processing_paths = {item["path"] for item in progress["processing"]}

        # Find unprocessed video files (excluding those we just retried)
        video_files = []
        for f in files:
            if not (f.startswith(f"{READY_VIDEOS_FOLDER}/") and f.endswith(".mp4")):
                continue
            short_name = f.split('/')[-1]
            # Skip already compressed files
            if "_compressed" in f:
                print(f" ⊘ {short_name} (already compressed file)")
                continue
            # Skip if already compressed
            if f in compressed_files:
                print(f" ⊘ {short_name} (already processed)")
                continue
            # Skip if it was just retried (don't retry again in this cycle)
            if f in failed_paths:
                print(f" ⊘ {short_name} (queued for retry next scan)")
                continue
            # Skip if currently being processed by another server
            if f in processing_paths:
                print(f" ⟳ {short_name} (currently processing)")
                continue
            # Parse file path: ready_videos/moviename/segment-XX.mp4
            parts = f.split("/")
            if len(parts) >= 3:
                video_files.append({
                    "path": f,
                    "movie_name": parts[1],
                    "file_name": parts[2]
                })
        compression_state["total_found"] = len(video_files)
        print(f"\n✓ Found {len(video_files)} unprocessed video files")
        if not video_files:
            print("✓ All videos already processed!")
            return

        # Process each video
        for video_info in video_files:
            await _handle_video(progress, video_info)

        print(f"\n{'='*80}")
        print("COMPRESSION COMPLETE")
        print(f" Compressed: {compression_state['total_compressed']}")
        print(f" Uploaded: {compression_state['total_uploaded']}")
        print(f" Space saved: {compression_state['total_space_saved_mb']:.1f} MB")
        print(f" Total compressed: {len(progress['compressed'])}")
        print(f" Total failed: {len(progress['failed'])}")
        print(f"{'='*80}\n")
    except Exception as e:
        print(f"Critical error: {e}")
        compression_state["last_error"] = str(e)
    finally:
        # Single exit point for the flags, covering the early returns above.
        compression_state["current_video"] = None
        compression_state["is_polling"] = False
@app.on_event("startup")
async def startup_event():
    """On server startup, check for ffmpeg and launch a delayed scan in a daemon thread."""
    rule = "="*80
    print("\n" + rule)
    print("STARTUP EVENT TRIGGERED")
    print(rule)

    # Probe ffmpeg once; every outcome is only reported, never fatal.
    try:
        probe = subprocess.run(["ffmpeg", "-version"], capture_output=True, timeout=5)
    except FileNotFoundError:
        print("✗ WARNING: ffmpeg not found in PATH - compression will fail")
    except Exception as exc:
        print(f"✗ Error checking ffmpeg: {exc}")
    else:
        if probe.returncode == 0:
            print("✓ ffmpeg is available")
        else:
            print("✗ ffmpeg check failed")

    def _scan_after_delay():
        # Runs in its own thread, so it gets its own event loop via asyncio.run.
        print("\nWaiting 30 seconds before starting compression scan...")
        time.sleep(30)
        print("Starting compression scan now...")
        asyncio.run(scan_and_compress_videos())

    # daemon=True: the thread does not keep the process alive at shutdown.
    threading.Thread(target=_scan_after_delay, daemon=True).start()
    print("✓ Background scan thread scheduled")
@app.get("/")
async def health():
    """Liveness probe that also returns a compact snapshot of the scan counters."""
    state = compression_state
    payload = {
        "status": "running",
        "service": "Video Compression Server",
    }
    # These fields are copied through under the same names.
    for key in ("is_polling", "total_found", "total_compressed",
                "total_uploaded", "current_video"):
        payload[key] = state[key]
    payload["space_saved_mb"] = state["total_space_saved_mb"]
    # Only the sizes of the two lists are exposed here; /status has the items.
    payload["processed_count"] = len(state["processed_videos"])
    payload["failed_count"] = len(state["failed_videos"])
    return JSONResponse(payload)
@app.get("/status")
async def get_status():
    """Return the in-memory scan state, including per-video lists and the last error."""
    exposed_keys = (
        "is_polling",
        "total_found",
        "total_compressed",
        "total_uploaded",
        "total_space_saved_mb",
        "current_video",
        "processed_videos",
        "failed_videos",
        "last_error",
    )
    snapshot = {key: compression_state[key] for key in exposed_keys}
    # The saved-space total is reported rounded to two decimals.
    snapshot["total_space_saved_mb"] = round(snapshot["total_space_saved_mb"], 2)
    return JSONResponse(snapshot)
@app.get("/progress")
async def get_progress():
    """Fetch the persisted progress file and return its lists together with their sizes."""
    progress = await load_progress_file()
    sections = {
        name: progress.get(name, [])
        for name in ("compressed", "failed", "processing")
    }
    # Counts first, then the full lists, then the timestamp.
    body = {f"{name}_count": len(items) for name, items in sections.items()}
    body.update(sections)
    body["last_updated"] = progress.get("last_updated")
    return JSONResponse(body)
# Strong references to in-flight scan tasks. The event loop itself keeps only
# a weak reference to a task, so a task nobody else references may be
# garbage-collected before it finishes.
_scan_tasks = set()


@app.post("/scan")
async def trigger_scan():
    """Manually trigger a scan and compression run.

    Returns:
        409 with status "already_running" if a scan is in progress; otherwise
        schedules scan_and_compress_videos() as a background task and returns
        status "started" immediately.
    """
    if compression_state["is_polling"]:
        return JSONResponse({
            "status": "already_running",
            "message": "Compression scan already in progress"
        }, status_code=409)
    task = asyncio.create_task(scan_and_compress_videos())
    _scan_tasks.add(task)
    # Drop the reference once the scan finishes so the set does not grow.
    task.add_done_callback(_scan_tasks.discard)
    return JSONResponse({
        "status": "started",
        "message": "Compression scan started"
    })
@app.get("/stats")
async def get_stats():
    """Return per-video compression stats, total space saved, and the mean compression ratio."""
    per_video = compression_state["compression_stats"]
    if per_video:
        ratios = [entry["compression_ratio"] for entry in per_video.values()]
        average_ratio = round(sum(ratios) / len(ratios), 2)
    else:
        # Nothing recorded yet: report 0 rather than dividing by zero.
        average_ratio = 0
    return JSONResponse({
        "compression_stats": per_video,
        "total_space_saved_mb": round(compression_state["total_space_saved_mb"], 2),
        "average_compression_ratio": average_ratio
    })
@app.post("/reset")
async def reset_state():
    """Zero the in-memory counters and clear the tracking collections (for testing)."""
    # is_polling, current_video and last_error are left untouched.
    compression_state.update({
        "total_found": 0,
        "total_compressed": 0,
        "total_uploaded": 0,
        "processed_videos": [],
        "failed_videos": [],
        "compression_stats": {},
        "total_space_saved_mb": 0,
    })
    return JSONResponse({"status": "reset"})
# Script entry point: serve the FastAPI app with uvicorn when run directly.
if __name__ == "__main__":
    print("Starting Video Compression Server on port 7860...")
    print("Scanning for videos on startup...")
    # Listens on all interfaces; the port matches the message printed above.
    uvicorn.run(app, host="0.0.0.0", port=7860)