compressor / server.py
factorstudios's picture
Update server.py
cb902a4 verified
raw
history blame
19 kB
#!/usr/bin/env python3
"""
FastAPI Video Compression Server

Scans a Hugging Face dataset for large videos and uploads H.265-compressed
copies. A scan runs once shortly after startup and again whenever POST /scan
is called.
"""
import os
import sys
import json
import asyncio
import subprocess
import tempfile
import threading
import time
from pathlib import Path
from datetime import datetime
from dotenv import load_dotenv
from typing import List, Dict, Optional
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
import uvicorn
# The Hugging Face client is mandatory: without it nothing can be listed,
# downloaded or uploaded, so the process exits immediately.
try:
    from huggingface_hub import list_repo_files, hf_hub_download, upload_file
except ImportError:
    print("Missing dependency: huggingface_hub")
    sys.exit(1)

# Pull settings from a .env file (if any) into the process environment, then
# require the access token used for every dataset call below.
load_dotenv()
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    print("Error: Missing HF_TOKEN in .env")
    sys.exit(1)
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Dataset repository and the folders used inside it.
HF_DATASET_REPO = "factorstudios/movs"
READY_VIDEOS_FOLDER = "ready_videos"
COMPRESSED_FOLDER = "compressed"

# Videos smaller than this many megabytes are recorded as skipped.
SIZE_THRESHOLD_MB = 100

# Local working locations: download cache and scratch space for encodes.
CACHE_DIR = "/tmp/video_compress_cache"
TEMP_DIR = Path("/tmp/video_compression_server")
TEMP_DIR.mkdir(exist_ok=True)

# The progress ledger exists both as a local file and as a file in the repo.
PROGRESS_FILE = TEMP_DIR.joinpath("compression_progress.json")
PROGRESS_FILE_REPO = f"{COMPRESSED_FOLDER}/compression_progress.json"
# Global, in-memory state shared by the scan routine and the HTTP endpoints.
# It lives only as long as the process does.
compression_state = {
    # Activity flags.
    "is_running": False,
    "is_polling": False,

    # Counters.
    "total_found": 0,
    "total_compressed": 0,
    "total_uploaded": 0,

    # Most recent activity and error text.
    "current_video": None,
    "last_error": None,

    # Per-video bookkeeping exposed through the status/stats endpoints.
    "processed_videos": [],
    "failed_videos": [],
    "total_space_saved_mb": 0,
    "compression_stats": {},
}
# ASGI application object; the endpoint functions in this module attach to it
# through its decorators, and it is what uvicorn serves.
app = FastAPI(title="Video Compression Server")
def get_video_duration(video_path: str) -> float:
    """Return the duration of *video_path* in seconds, as reported by ffprobe.

    Args:
        video_path: Path of the media file to probe.

    Returns:
        The ``format=duration`` value as a float, or ``0.0`` when ffprobe
        cannot be executed or its output is empty / not a number.
    """
    cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        # Ask for the bare value only: no section wrapper, no "duration=" key.
        "-of", "default=noprint_wrappers=1:nokey=1",
        video_path,
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
        return float(result.stdout.strip())
    except (OSError, ValueError):
        # OSError: the ffprobe executable is missing or cannot be launched.
        # ValueError: ffprobe printed nothing, or something non-numeric.
        return 0.0
def get_video_bitrate(video_path: str) -> float:
    """Return the bitrate of the first video stream of *video_path* in kbps.

    Args:
        video_path: Path of the media file to probe.

    Returns:
        The ``stream=bit_rate`` value of stream ``v:0`` divided by 1000, or
        ``0.0`` when ffprobe cannot be executed or its output is empty / not
        a number.
    """
    cmd = [
        "ffprobe", "-v", "error",
        "-select_streams", "v:0",
        "-show_entries", "stream=bit_rate",
        # Ask for the bare value only: no section wrapper, no "bit_rate=" key.
        "-of", "default=noprint_wrappers=1:nokey=1",
        video_path,
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
        return float(result.stdout.strip()) / 1000
    except (OSError, ValueError):
        # OSError: the ffprobe executable is missing or cannot be launched.
        # ValueError: ffprobe printed nothing, or something non-numeric.
        return 0.0
def compress_video(input_path: str, output_path: str) -> Optional[Dict]:
    """Re-encode *input_path* to H.265 (libx265) at *output_path*.

    The video bitrate is derived from a ~50 MB size target and the probed
    duration, then clamped to the 400-1200 kbps range. Audio is encoded as
    128 kbps AAC. The encode is given at most one hour.

    Args:
        input_path: Path of the source video.
        output_path: Path the compressed video is written to (overwritten if
            it already exists).

    Returns:
        A dict with ``original_size_mb``, ``compressed_size_mb``, ``saved_mb``,
        ``compression_ratio`` and ``duration_seconds`` on success, or ``None``
        when encoding fails, times out, or produces no usable output. On
        failure any partially written output file is removed.

    Raises:
        OSError: If *input_path* cannot be stat'ed (raised before encoding
            starts).
    """
    print(f"\n{'='*80}")
    print(f"COMPRESSING: {Path(input_path).name}")
    print(f"{'='*80}")

    # Gather source information.
    duration = get_video_duration(input_path)
    original_size = os.path.getsize(input_path) / (1024**2)
    print(f"Original: {original_size:.1f} MB | Duration: {duration:.1f}s")
    print("Target: ~50MB with high quality preservation")

    # Bitrate (kbps) that would yield ~50MB over the whole duration; a zero or
    # unknown duration is treated as one second and then clamped below.
    target_mb = 50
    target_bitrate_kbps = int((target_mb * 8 * 1024) / max(duration, 1))
    video_bitrate = max(400, min(target_bitrate_kbps, 1200))
    print(f"Calculated bitrate: {video_bitrate}kbps")

    # NOTE(review): both -crf and -b:v are passed; confirm which rate-control
    # mode libx265 ends up using with this combination.
    ffmpeg_cmd = [
        "ffmpeg", "-y",
        "-i", input_path,
        "-c:v", "libx265",
        "-preset", "slow",
        "-crf", "28",
        "-b:v", f"{video_bitrate}k",
        "-maxrate", f"{int(video_bitrate * 1.2)}k",
        "-bufsize", f"{int(video_bitrate * 2)}k",
        "-x265-params", "aq-mode=3:log-level=error",
        "-pix_fmt", "yuv420p",
        "-c:a", "aac",
        "-b:a", "128k",
        # Progress records go to stdout, which is captured below and not read.
        "-progress", "pipe:1",
        output_path,
    ]

    stats = None
    try:
        result = subprocess.run(
            ffmpeg_cmd,
            capture_output=True,
            text=True,
            # Undecodable bytes in ffmpeg's output must not fail a good encode.
            errors="replace",
            timeout=3600,
        )
        if result.returncode != 0:
            print(f"✗ Encoding failed with code {result.returncode}")
            if result.stderr:
                print(f"Error output: {result.stderr[:500]}")
        elif not os.path.exists(output_path):
            print("✗ Output file not created")
        else:
            compressed_size = os.path.getsize(output_path) / (1024**2)
            if compressed_size <= 0:
                # A zero-byte file is not a usable encode (and would divide by
                # zero in the ratio below).
                print("✗ Output file is empty")
            else:
                compression_ratio = original_size / compressed_size
                saved_mb = original_size - compressed_size
                print(f"✓ Compressed: {original_size:.1f}MB → {compressed_size:.1f}MB")
                print(f" Saved: {saved_mb:.1f}MB ({(saved_mb/original_size*100):.1f}%)")
                print(f" Ratio: {compression_ratio:.2f}x")
                stats = {
                    "original_size_mb": round(original_size, 2),
                    "compressed_size_mb": round(compressed_size, 2),
                    "saved_mb": round(saved_mb, 2),
                    "compression_ratio": round(compression_ratio, 2),
                    "duration_seconds": duration,
                }
    except subprocess.TimeoutExpired:
        print("✗ Compression timed out (>1 hour)")
    except Exception as e:
        print(f"✗ Compression error: {e}")
    finally:
        if stats is None:
            # Best effort: do not leave a truncated encode in the temp dir.
            try:
                os.remove(output_path)
            except OSError:
                pass
    return stats
async def load_progress_file() -> Dict:
    """Load the compression progress ledger from the HF dataset.

    The download is a blocking call, so it is run in the default executor to
    keep the event loop responsive.

    Returns:
        The parsed progress dict, guaranteed to contain the ``compressed`` and
        ``failed`` keys. If the file cannot be downloaded or parsed for any
        reason, a fresh empty ledger is returned instead.
    """
    try:
        print("Attempting to load progress file from dataset...")
        loop = asyncio.get_running_loop()
        progress_path = await loop.run_in_executor(
            None,
            lambda: hf_hub_download(
                repo_id=HF_DATASET_REPO,
                filename=PROGRESS_FILE_REPO,
                repo_type="dataset",
                token=HF_TOKEN,
                cache_dir=CACHE_DIR,
            ),
        )
        # The returned path may be a symlink; resolve it to the real file.
        if os.path.islink(progress_path):
            progress_path = os.path.realpath(progress_path)
        with open(progress_path, "r", encoding="utf-8") as f:
            progress = json.load(f)
        if not isinstance(progress, dict):
            raise ValueError("progress file does not contain a JSON object")
        # Callers append to both lists, so make sure they exist.
        progress.setdefault("compressed", [])
        progress.setdefault("failed", [])
        compressed_count = len(progress["compressed"])
        failed_count = len(progress["failed"])
        print(f"✓ Loaded progress: {compressed_count} compressed, {failed_count} failed")
        return progress
    except Exception as e:
        # NOTE(review): a transient network failure is indistinguishable here
        # from "no ledger yet"; the next save then replaces the remote ledger
        # with a new one. Consider separating the two cases.
        print(f"⊘ No existing progress file or load failed: {str(e)[:100]}")
        return {"compressed": [], "failed": [], "last_updated": datetime.now().isoformat()}
async def save_progress_file(progress: Dict) -> None:
    """Write *progress* to the local ledger file and upload it to the dataset.

    ``progress["last_updated"]` is set to the current time before writing. The
    upload is a blocking call, so it is run in the default executor to keep
    the event loop responsive. Failures are reported as a warning and never
    raised.

    Args:
        progress: The ledger dict to persist; mutated in place.
    """
    try:
        progress["last_updated"] = datetime.now().isoformat()
        with open(PROGRESS_FILE, "w", encoding="utf-8") as f:
            json.dump(progress, f, indent=2)
        print("Uploading progress file...")
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(
            None,
            lambda: upload_file(
                path_or_fileobj=str(PROGRESS_FILE),
                path_in_repo=PROGRESS_FILE_REPO,
                repo_id=HF_DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN,
                commit_message="Update compression progress",
            ),
        )
        print("✓ Progress file uploaded")
    except Exception as e:
        print(f"Warning: Could not save progress file: {e}")
async def _run_blocking(func, *args, **kwargs):
    """Run a blocking callable in the default executor and await its result."""
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, lambda: func(*args, **kwargs))


def _select_pending_videos(files, compressed_files, failed_files) -> List[Dict]:
    """Pick the ready_videos/*.mp4 entries that have not been handled yet."""
    prefix = f"{READY_VIDEOS_FOLDER}/"
    pending = []
    for f in files:
        if not (f.startswith(prefix) and f.endswith(".mp4")):
            continue
        # Skip outputs of a previous compression.
        if "_compressed" in f:
            print(f" ⊘ {f.split('/')[-1]} (already compressed file)")
            continue
        # Skip anything already recorded in the progress ledger.
        if f in compressed_files:
            print(f" ⊘ {f.split('/')[-1]} (already processed)")
            continue
        if f in failed_files:
            print(f" ✗ {f.split('/')[-1]} (previously failed)")
            continue
        # Expected layout: ready_videos/<movie name>/<file>.mp4
        parts = f.split("/")
        if len(parts) >= 3:
            pending.append({
                "path": f,
                "movie_name": parts[1],
                "file_name": parts[2],
            })
    return pending


async def _process_video(video_info: Dict, progress: Dict) -> None:
    """Download, compress and upload one video, recording the outcome."""
    compression_state["current_video"] = video_info["file_name"]
    output_path = TEMP_DIR / f"{Path(video_info['file_name']).stem}_compressed.mp4"
    try:
        # Download
        print(f"\nDownloading: {video_info['path']}")
        video_path = await _run_blocking(
            hf_hub_download,
            repo_id=HF_DATASET_REPO,
            filename=video_info["path"],
            repo_type="dataset",
            token=HF_TOKEN,
            cache_dir=CACHE_DIR,
        )
        if os.path.islink(video_path):
            video_path = os.path.realpath(video_path)
        file_size_mb = os.path.getsize(video_path) / (1024**2)
        print(f"✓ Downloaded: {file_size_mb:.1f} MB")

        # Small files are recorded as skipped so they are not downloaded again.
        if file_size_mb < SIZE_THRESHOLD_MB:
            print(f"⊘ Below threshold ({file_size_mb:.1f}MB < {SIZE_THRESHOLD_MB}MB), skipping")
            progress["compressed"].append({
                "path": video_info["path"],
                "file_name": video_info["file_name"],
                "status": "skipped",
                "reason": f"Below {SIZE_THRESHOLD_MB}MB threshold",
                "timestamp": datetime.now().isoformat(),
            })
            compression_state["processed_videos"].append(video_info["path"])
            await save_progress_file(progress)
            return

        # Compress (can take up to an hour, so it must not run on the loop).
        print("✓ Starting H.265 compression...")
        compression_stats = await _run_blocking(
            compress_video, video_path, str(output_path)
        )
        if not compression_stats:
            raise Exception("Compression failed")
        compression_state["total_compressed"] += 1

        # Upload to the compressed folder, mirroring the movie sub-folder.
        upload_filename = f"{Path(video_info['file_name']).stem}_compressed.mp4"
        upload_path = f"{COMPRESSED_FOLDER}/{video_info['movie_name']}/{upload_filename}"
        print(f"Uploading: {upload_path}")
        await _run_blocking(
            upload_file,
            path_or_fileobj=str(output_path),
            path_in_repo=upload_path,
            repo_id=HF_DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message=f"Add H.265 compressed video: {upload_filename}",
        )
        compression_state["total_uploaded"] += 1
        compression_state["total_space_saved_mb"] += compression_stats["saved_mb"]
        # Feed the in-memory views served by /status and /stats.
        compression_state["compression_stats"][video_info["path"]] = compression_stats
        compression_state["processed_videos"].append(video_info["path"])

        # Track in the progress ledger and persist it.
        progress["compressed"].append({
            "path": video_info["path"],
            "file_name": video_info["file_name"],
            "upload_path": upload_path,
            "status": "compressed_uploaded",
            "stats": compression_stats,
            "timestamp": datetime.now().isoformat(),
        })
        await save_progress_file(progress)
    except Exception as e:
        print(f"✗ Error: {e}")
        compression_state["last_error"] = str(e)
        compression_state["failed_videos"].append(video_info["path"])
        progress["failed"].append({
            "path": video_info["path"],
            "file_name": video_info["file_name"],
            "error": str(e),
            "timestamp": datetime.now().isoformat(),
        })
        await save_progress_file(progress)
    finally:
        # Remove the local encode whether or not the upload succeeded.
        try:
            os.remove(output_path)
        except OSError:
            pass


async def scan_and_compress_videos():
    """Scan the ready_videos folder and compress every unhandled large video.

    Loads the persistent progress ledger, lists the dataset, selects the .mp4
    files not yet recorded as compressed or failed, and processes them one at
    a time. All blocking work (listing, download, encode, upload) is run in
    the default executor so the event loop keeps serving requests.

    Only one scan runs at a time: if ``compression_state["is_polling"]`` is
    already set the call returns immediately. Errors are reported through
    ``compression_state["last_error"]`` rather than raised.
    """
    if compression_state["is_polling"]:
        print("Already polling, skipping...")
        return
    compression_state["is_polling"] = True
    try:
        print("\n" + "="*80)
        print("LOADING COMPRESSION PROGRESS")
        print("="*80)
        progress = await load_progress_file()
        # Entries are appended to both lists below, so they must exist.
        progress.setdefault("compressed", [])
        progress.setdefault("failed", [])
        compressed_files = {item["path"] for item in progress["compressed"]}
        failed_files = {item["path"] for item in progress["failed"]}
        print(f"Current state: {len(compressed_files)} compressed, {len(failed_files)} failed")

        print("\n" + "="*80)
        print("SCANNING FOR LARGE VIDEOS")
        print("="*80)
        try:
            print("Connecting to Hugging Face dataset...")
            files = await _run_blocking(
                list_repo_files,
                repo_id=HF_DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN,
            )
            print(f"✓ Found {len(files)} total files in dataset")
        except Exception as e:
            print(f"✗ Error listing files: {e}")
            compression_state["last_error"] = str(e)
            return

        video_files = _select_pending_videos(files, compressed_files, failed_files)
        compression_state["total_found"] = len(video_files)
        print(f"\n✓ Found {len(video_files)} unprocessed video files")
        if not video_files:
            print("✓ All videos already processed!")
            return

        for video_info in video_files:
            await _process_video(video_info, progress)

        print(f"\n{'='*80}")
        print("COMPRESSION COMPLETE")
        print(f" Compressed: {compression_state['total_compressed']}")
        print(f" Uploaded: {compression_state['total_uploaded']}")
        print(f" Space saved: {compression_state['total_space_saved_mb']:.1f} MB")
        print(f" Total compressed: {len(progress['compressed'])}")
        print(f" Total failed: {len(progress['failed'])}")
        print(f"{'='*80}\n")
    except Exception as e:
        print(f"Critical error: {e}")
        compression_state["last_error"] = str(e)
    finally:
        # Every exit path ends here, so the flags are cleared exactly once.
        compression_state["current_video"] = None
        compression_state["is_polling"] = False
@app.on_event("startup")
async def startup_event():
    """Check the encoder tools and schedule the first scan 30 seconds later.

    Both ``ffmpeg`` (encoding) and ``ffprobe`` (duration probing) are checked
    by running them with ``-version``; problems are only reported, never
    raised. The scan itself runs in a daemon thread with its own event loop.
    """
    print("\n" + "="*80)
    print("STARTUP EVENT TRIGGERED")
    print("="*80)

    # Check that ffmpeg and ffprobe are available.
    for tool in ("ffmpeg", "ffprobe"):
        try:
            result = subprocess.run([tool, "-version"], capture_output=True, timeout=5)
            if result.returncode == 0:
                print(f"✓ {tool} is available")
            else:
                print(f"✗ {tool} check failed")
        except FileNotFoundError:
            print(f"✗ WARNING: {tool} not found in PATH - compression will fail")
        except Exception as e:
            print(f"✗ Error checking {tool}: {e}")

    # Schedule the scan in a background thread (more reliable for deployment).
    def run_delayed_scan():
        print("\nWaiting 30 seconds before starting compression scan...")
        time.sleep(30)
        print("Starting compression scan now...")
        asyncio.run(scan_and_compress_videos())

    scan_thread = threading.Thread(target=run_delayed_scan, daemon=True)
    scan_thread.start()
    print("✓ Background scan thread scheduled")
@app.get("/")
async def health():
    """Liveness endpoint: service identity plus a compact activity summary."""
    state = compression_state
    payload = {
        "status": "running",
        "service": "Video Compression Server",
    }
    # Values copied verbatim from the shared state.
    for key in (
        "is_polling",
        "total_found",
        "total_compressed",
        "total_uploaded",
        "current_video",
    ):
        payload[key] = state[key]
    # Derived / renamed values.
    payload["space_saved_mb"] = state["total_space_saved_mb"]
    payload["processed_count"] = len(state["processed_videos"])
    payload["failed_count"] = len(state["failed_videos"])
    return JSONResponse(payload)
@app.get("/status")
async def get_status():
    """Return the detailed scan state, including per-video lists and last error."""
    state = compression_state
    body = {
        key: state[key]
        for key in ("is_polling", "total_found", "total_compressed", "total_uploaded")
    }
    body["total_space_saved_mb"] = round(state["total_space_saved_mb"], 2)
    for key in ("current_video", "processed_videos", "failed_videos", "last_error"):
        body[key] = state[key]
    return JSONResponse(body)
# Strong references to in-flight scan tasks. The event loop only keeps weak
# references to tasks, so an unreferenced task may be garbage-collected before
# it finishes.
_scan_tasks = set()


@app.post("/scan")
async def trigger_scan():
    """Manually start a scan-and-compress run in the background.

    Returns:
        HTTP 409 with ``status: already_running`` when a scan is in progress,
        otherwise ``status: started`` once the background task is scheduled.
    """
    if compression_state["is_polling"]:
        return JSONResponse({
            "status": "already_running",
            "message": "Compression scan already in progress"
        }, status_code=409)

    task = asyncio.create_task(scan_and_compress_videos())
    _scan_tasks.add(task)
    # Drop the reference once the task completes.
    task.add_done_callback(_scan_tasks.discard)
    return JSONResponse({
        "status": "started",
        "message": "Compression scan started"
    })
@app.get("/stats")
async def get_stats():
    """Return per-video compression stats, total space saved and mean ratio."""
    per_video = compression_state["compression_stats"]
    if per_video:
        ratio_sum = sum(entry["compression_ratio"] for entry in per_video.values())
        average_ratio = round(ratio_sum / max(len(per_video), 1), 2)
    else:
        # No entries recorded yet.
        average_ratio = 0
    return JSONResponse({
        "compression_stats": per_video,
        "total_space_saved_mb": round(compression_state["total_space_saved_mb"], 2),
        "average_compression_ratio": average_ratio,
    })
@app.post("/reset")
async def reset_state():
    """Reset all tracking counters, lists and the last error (for testing).

    The activity flags (``is_running``, ``is_polling``) and ``current_video``
    are left alone so that a scan in progress is not misreported.
    """
    compression_state.update({
        "total_found": 0,
        "total_compressed": 0,
        "total_uploaded": 0,
        "processed_videos": [],
        "failed_videos": [],
        "compression_stats": {},
        "total_space_saved_mb": 0,
        # A stale error would otherwise keep showing up in /status.
        "last_error": None,
    })
    return JSONResponse({"status": "reset"})
if __name__ == "__main__":
    # Direct execution: announce, then serve the app on all interfaces.
    for line in (
        "Starting Video Compression Server on port 7860...",
        "Scanning for videos on startup...",
    ):
        print(line)
    uvicorn.run(app, host="0.0.0.0", port=7860)