SUPIR

Runtime error

App Files Files Community

Fabrice-TIERCELIN committed on May 25, 2025

Commit

0ba18b1

verified ·

1 Parent(s): 1fa1fb9

Delete video_super_resolution

Browse files

Files changed (6) hide show

video_super_resolution/__pycache__/color_fix.cpython-39.pyc +0 -0
video_super_resolution/color_fix.py +0 -122
video_super_resolution/dataset.py +0 -113
video_super_resolution/scripts/__pycache__/inference_sr.cpython-39.pyc +0 -0
video_super_resolution/scripts/inference_sr.py +0 -142
video_super_resolution/scripts/inference_sr.sh +0 -56

video_super_resolution/__pycache__/color_fix.cpython-39.pyc DELETED Viewed

Binary file (4.01 kB)

video_super_resolution/color_fix.py DELETED Viewed

@@ -1,122 +0,0 @@
-'''
-# --------------------------------------------------------------------------------
-#   Color fix script from Li Yi (https://github.com/pkuliyi2015/sd-webui-stablesr/blob/master/srmodule/colorfix.py)
-# --------------------------------------------------------------------------------
-'''
-import torch
-from PIL import Image
-from torch import Tensor
-from torch.nn import functional as F
-from torchvision.transforms import ToTensor, ToPILImage
-from einops import rearrange
-def adain_color_fix(target: Tensor, source: Tensor) -> Tensor:
-    """Match the per-frame color statistics of ``target`` to those of ``source``.
-    Every frame of ``target`` is passed through adaptive instance normalization,
-    using the frame of ``source`` at the same index as the style reference.
-    Args:
-        target (Tensor): Frames laid out as (T, H, W, C). Values are divided by
-            255 before normalization (presumably a 0-255 video; confirm
-            against the caller).
-        source (Tensor): Reference frames indexed as (T, C, H, W). Values are
-            mapped through ``(x + 1) / 2`` (presumably from [-1, 1]; confirm
-            against the caller).
-    Returns:
-        Tensor: Frames laid out as (T, H, W, C), clamped to [0, 1] and then
-        multiplied by 255.
-    """
-    # NOTE: the former ``Image`` annotations named the PIL module; both arguments
-    # are used as tensors here (rearrange, arithmetic, .shape, indexing).
-    target = rearrange(target, 'T H W C -> T C H W') / 255
-    source = (source + 1) / 2
-    # Normalize one frame at a time, each wrapped as a batch of one.
-    result_tensor_list = [
-        adaptive_instance_normalization(target[i].unsqueeze(0), source[i].unsqueeze(0))
-        for i in range(target.shape[0])
-    ]
-    result_tensor = torch.cat(result_tensor_list, dim=0).clamp_(0.0, 1.0)
-    # Back to channel-last layout and the 0-255 scale.
-    return rearrange(result_tensor, "T C H W -> T H W C") * 255
-def wavelet_color_fix(target, source):
-    """Recolor ``target`` frame by frame via wavelet reconstruction against ``source``.
-    ``target`` is rearranged from (T, H, W, C) to (T, C, H, W) and divided by 255;
-    ``source`` is mapped through ``(x + 1) / 2``. Each frame pair is then handed
-    to ``wavelet_reconstruction`` as a batch of one. The concatenated result is
-    clamped to [0, 1], rearranged back to (T, H, W, C) and multiplied by 255.
-    """
-    frames = rearrange(target, 'T H W C -> T C H W') / 255
-    reference = (source + 1) / 2
-    # One reconstruction per frame, paired with the reference frame of the same index.
-    fixed_frames = [
-        wavelet_reconstruction(frame.unsqueeze(0), reference[idx].unsqueeze(0))
-        for idx, frame in enumerate(frames)
-    ]
-    stacked = torch.cat(fixed_frames, dim=0).clamp_(0.0, 1.0)
-    return rearrange(stacked, "T C H W -> T H W C") * 255
-def calc_mean_std(feat: Tensor, eps=1e-5):
-    """Return the mean and standard deviation of a 4D tensor per (batch, channel) pair.
-    The statistics are taken over everything after the first two dimensions.
-    Args:
-        feat (Tensor): 4D tensor; its first two dimensions are kept.
-        eps (float): Added to the variance before the square root so the
-            standard deviation is never zero. Default: 1e-5.
-    Returns:
-        tuple: ``(mean, std)``, each reshaped to (b, c, 1, 1).
-    """
-    shape = feat.size()
-    assert len(shape) == 4, 'The input feature should be 4D tensor.'
-    batch, channels = shape[0], shape[1]
-    # Flatten the trailing dimensions once and reuse the view for both statistics.
-    flat = feat.reshape(batch, channels, -1)
-    mean = flat.mean(dim=2).reshape(batch, channels, 1, 1)
-    std = (flat.var(dim=2) + eps).sqrt().reshape(batch, channels, 1, 1)
-    return mean, std
-def adaptive_instance_normalization(content_feat:Tensor, style_feat:Tensor):
-    """Adaptive instance normalization.
-    Re-scale ``content_feat`` so that its per-channel mean and standard
-    deviation equal those of ``style_feat``.
-    Args:
-        content_feat (Tensor): The reference feature whose statistics are replaced.
-        style_feat (Tensor): The degraded feature that supplies the target
-            mean and standard deviation.
-    """
-    target_shape = content_feat.size()
-    c_mean, c_std = calc_mean_std(content_feat)
-    s_mean, s_std = calc_mean_std(style_feat)
-    # Remove the content statistics ...
-    whitened = (content_feat - c_mean.expand(target_shape)) / c_std.expand(target_shape)
-    # ... then apply the style statistics.
-    recolored = whitened * s_std.expand(target_shape)
-    return recolored + s_mean.expand(target_shape)
-def wavelet_blur(image: Tensor, radius: int):
-    """Blur ``image`` with a fixed 3x3 kernel dilated by ``radius``.
-    The input is replicate-padded by ``radius`` on every side of its last two
-    dimensions, so the output keeps the spatial size of the input. The same
-    kernel is applied to each channel independently (grouped convolution).
-    Args:
-        image (Tensor): 4D tensor (N, C, H, W). Any channel count is accepted;
-            the channel count used to be hard-coded to 3.
-        radius (int): Dilation of the kernel and width of the padding.
-    Returns:
-        Tensor: Blurred tensor with the same shape as ``image``.
-    """
-    channels = image.shape[1]
-    # 3x3 smoothing kernel; its weights sum to 1.
-    kernel_vals = [
-        [0.0625, 0.125, 0.0625],
-        [0.125, 0.25, 0.125],
-        [0.0625, 0.125, 0.0625],
-    ]
-    kernel = torch.tensor(kernel_vals, dtype=image.dtype, device=image.device)
-    # Shape (C, 1, 3, 3): one copy of the kernel per input channel.
-    kernel = kernel[None, None].repeat(channels, 1, 1, 1)
-    image = F.pad(image, (radius, radius, radius, radius), mode='replicate')
-    # groups == channels makes the convolution act on each channel separately.
-    return F.conv2d(image, kernel, groups=channels, dilation=radius)
-def wavelet_decomposition(image: Tensor, levels=5):
-    """Split ``image`` into a high-frequency and a low-frequency part.
-    The image is blurred ``levels`` times with ``wavelet_blur``, doubling the
-    radius each time (1, 2, 4, ...). The detail removed by every blur is
-    accumulated into the high-frequency part; the final blurred image is the
-    low-frequency part.
-    Args:
-        image (Tensor): Input tensor accepted by ``wavelet_blur``.
-        levels (int): Number of blur passes. With 0 the high-frequency part is
-            all zeros and the low-frequency part is ``image`` itself
-            (this case used to raise UnboundLocalError).
-    Returns:
-        tuple: ``(high_freq, low_freq)``.
-    """
-    high_freq = torch.zeros_like(image)
-    # Start from the unblurred image so that levels == 0 is well defined.
-    low_freq = image
-    for i in range(levels):
-        blurred = wavelet_blur(low_freq, 2 ** i)
-        # Whatever this pass removed counts as high-frequency detail.
-        high_freq += (low_freq - blurred)
-        low_freq = blurred
-    return high_freq, low_freq
-def wavelet_reconstruction(content_feat:Tensor, style_feat:Tensor):
-    """
-    Combine the high-frequency part of ``content_feat`` with the low-frequency
-    part of ``style_feat``, both obtained from ``wavelet_decomposition``.
-    """
-    # Keep only the high-frequency component of the content; indexing the
-    # returned pair avoids holding on to the unused low-frequency tensor.
-    content_detail = wavelet_decomposition(content_feat)[0]
-    # Keep only the low-frequency component of the style.
-    style_base = wavelet_decomposition(style_feat)[1]
-    # Content detail placed on top of the style's low-frequency base.
-    return content_detail + style_base

video_super_resolution/dataset.py DELETED Viewed

@@ -1,113 +0,0 @@
-import os
-import random
-import glob
-import torchvision
-from einops import rearrange
-from torch.utils import data as data
-import torch.nn.functional as F
-from torchvision import transforms
-from PIL import Image
-class PairedCaptionVideoDataset(data.Dataset):
-    """Dataset of (low-quality video, ground-truth video, caption) samples.
-    Every root folder is searched for ``lq/*.mp4``, ``gt/*.mp4`` and
-    ``text/*.txt``. Files are paired by position, so each listing is sorted to
-    make the pairing deterministic (``glob.glob`` returns files in arbitrary
-    order). This assumes the three sub-folders use matching file names; only
-    the counts are actually checked.
-    """
-    def __init__(
-            self,
-            root_folders=None,
-            null_text_ratio=0.5,
-            num_frames=16
-    ):
-        """
-        Args:
-            root_folders: Iterable of dataset root directories. ``None`` yields
-                an empty dataset; a single string is treated as one directory.
-            null_text_ratio (float): Probability of replacing the caption with
-                an empty string in ``__getitem__``.
-            num_frames (int): Number of leading frames kept from each video.
-        """
-        super().__init__()
-        self.null_text_ratio = null_text_ratio
-        self.lr_list = []
-        self.gt_list = []
-        self.tag_path_list = []
-        self.num_frames = num_frames
-        if root_folders is None:
-            root_folders = []
-        elif isinstance(root_folders, str):
-            # A bare string would otherwise be iterated character by character.
-            root_folders = [root_folders]
-        for root_folder in root_folders:
-            # Sort per folder so index i refers to the same clip in all three lists.
-            self.lr_list += sorted(glob.glob(os.path.join(root_folder, 'lq', '*.mp4')))
-            self.gt_list += sorted(glob.glob(os.path.join(root_folder, 'gt', '*.mp4')))
-            self.tag_path_list += sorted(glob.glob(os.path.join(root_folder, 'text', '*.txt')))
-        assert len(self.lr_list) == len(self.gt_list), 'lq and gt video counts differ'
-        assert len(self.lr_list) == len(self.tag_path_list), 'video and caption counts differ'
-    def __getitem__(self, index):
-        """Load sample ``index``.
-        Returns:
-            dict: ``"gt"`` and ``"lq"`` are the videos rearranged to
-            (C, T, H, W), rescaled with ``x / 255 * 2 - 1`` and cut to the
-            first ``num_frames`` frames; ``"text"`` is the caption (or ``''``);
-            ``'fps'`` is the ``video_fps`` entry reported for the gt video.
-        """
-        gt_path = self.gt_list[index]
-        vframes_gt, _, info = torchvision.io.read_video(filename=gt_path, pts_unit="sec", output_format="TCHW")
-        fps = info['video_fps']
-        vframes_gt = (rearrange(vframes_gt, "T C H W -> C T H W") / 255) * 2 - 1
-        lq_path = self.lr_list[index]
-        vframes_lq, _, _ = torchvision.io.read_video(filename=lq_path, pts_unit="sec", output_format="TCHW")
-        vframes_lq = (rearrange(vframes_lq, "T C H W -> C T H W") / 255) * 2 - 1
-        # Randomly drop the caption with probability null_text_ratio.
-        if random.random() < self.null_text_ratio:
-            tag = ''
-        else:
-            tag_path = self.tag_path_list[index]
-            with open(tag_path, 'r', encoding='utf-8') as file:
-                tag = file.read()
-        return {"gt": vframes_gt[:, :self.num_frames, :, :], "lq": vframes_lq[:, :self.num_frames, :, :], "text": tag, 'fps': fps}
-    def __len__(self):
-        """Return the number of ground-truth videos."""
-        return len(self.gt_list)
-class PairedCaptionImageDataset(data.Dataset):
-    """Dataset of (low-quality image, ground-truth image) pairs with an empty caption.
-    Images are read from ``<root_folder>/sr_bicubic/*.png`` and
-    ``<root_folder>/gt/*.png``. They are paired by position, so both listings
-    are sorted to make the pairing deterministic (``glob.glob`` returns files
-    in arbitrary order). This assumes both folders use matching file names;
-    only the counts are actually checked.
-    """
-    def __init__(
-            self,
-            root_folder=None,
-    ):
-        """
-        Args:
-            root_folder: Dataset root directory containing ``sr_bicubic`` and ``gt``.
-        """
-        super().__init__()
-        self.tag_path_list = []
-        self.lr_list = sorted(glob.glob(os.path.join(root_folder, 'sr_bicubic', '*.png')))
-        self.gt_list = sorted(glob.glob(os.path.join(root_folder, 'gt', '*.png')))
-        assert len(self.lr_list) == len(self.gt_list), 'lq and gt image counts differ'
-        self.img_preproc = transforms.Compose([
-            transforms.ToTensor(),
-        ])
-        # Center-crop every image to 720x1280 (height, width).
-        crop_size = (720, 1280)
-        self.center_crop = transforms.CenterCrop(crop_size)
-    def _load(self, path):
-        """Open ``path`` as RGB, convert it to a tensor, center-crop it and rescale with ``x * 2 - 1``."""
-        img = Image.open(path).convert('RGB')
-        img = self.center_crop(self.img_preproc(img))
-        # The former squeeze(0) was a no-op: an RGB image always has 3 channels in dim 0.
-        # unsqueeze(1) inserts a singleton dimension after the channel dimension.
-        return (img * 2.0 - 1.0).unsqueeze(1)
-    def __getitem__(self, index):
-        """Return a dict with keys ``"lq"``, ``"gt"`` and ``"text"`` (always ``""``)."""
-        gt_img = self._load(self.gt_list[index])
-        lq_img = self._load(self.lr_list[index])
-        example = dict()
-        example["lq"] = lq_img
-        example["gt"] = gt_img
-        example["text"] = ""
-        return example
-    def __len__(self):
-        """Return the number of ground-truth images."""
-        return len(self.gt_list)

video_super_resolution/scripts/__pycache__/inference_sr.cpython-39.pyc DELETED Viewed

Binary file (3.97 kB)

video_super_resolution/scripts/inference_sr.py DELETED Viewed

@@ -1,142 +0,0 @@
-import os
-import torch
-from argparse import ArgumentParser, Namespace
-import json
-from typing import Any, Dict, List, Mapping, Tuple
-from easydict import EasyDict
-from video_to_video.video_to_video_model import VideoToVideo_sr
-from video_to_video.utils.seed import setup_seed
-from video_to_video.utils.logger import get_logger
-from video_super_resolution.color_fix import adain_color_fix
-from inference_utils import *
-logger = get_logger()
-class STAR_sr():
-    """Run ``VideoToVideo_sr`` on a video file and save the color-fixed result."""
-    def __init__(self,
-                 result_dir='./results/',
-                 file_name='000_video.mp4',
-                 model_path='./pretrained_weight',
-                 solver_mode='fast',
-                 steps=15,
-                 guide_scale=7.5,
-                 upscale=4,
-                 max_chunk_len=32,
-                 variant_info=None,
-                 chunk_size=3,
-                 ):
-        """Store the inference settings, create ``result_dir`` and build the model.
-        Args:
-            result_dir: Directory the output video is saved to (created if missing).
-            file_name: File name of the output video inside ``result_dir``.
-            model_path: Checkpoint path handed to the model config.
-            solver_mode: Passed to ``model.test``; ``'fast'`` forces ``steps`` to 15.
-            steps: Number of steps, used only when ``solver_mode`` is not ``'fast'``.
-            guide_scale: Passed to ``model.test``.
-            upscale: Multiplier applied to the input height and width.
-            max_chunk_len: Passed to ``model.test``.
-            variant_info: Stored on the instance; not used elsewhere in this class.
-            chunk_size: Written into the model config.
-        """
-        self.model_path = model_path
-        logger.info(f'checkpoint_path: {self.model_path}')
-        self.result_dir = result_dir
-        self.file_name = file_name
-        os.makedirs(self.result_dir, exist_ok=True)
-        model_cfg = EasyDict(__name__='model_cfg')
-        model_cfg.model_path = self.model_path
-        model_cfg.chunk_size = chunk_size
-        self.model = VideoToVideo_sr(model_cfg)
-        # The fast solver always runs with 15 steps, whatever was requested.
-        if solver_mode == 'fast':
-            steps = 15
-        self.solver_mode = solver_mode
-        self.steps = steps
-        self.guide_scale = guide_scale
-        self.upscale = upscale
-        self.max_chunk_len = max_chunk_len
-        self.variant_info = variant_info
-    def enhance_a_video(self, video_path, prompt):
-        """Enhance the video at ``video_path`` and return the path of the saved result.
-        The caption is ``prompt`` followed by the model's ``positive_prompt``.
-        The target resolution is the input resolution times ``self.upscale``.
-        """
-        logger.info(f'input video path: {video_path}')
-        logger.info(f'text: {prompt}')
-        caption = prompt + self.model.positive_prompt
-        input_frames, input_fps = load_video(video_path)
-        logger.info(f'input frames length: {len(input_frames)}')
-        logger.info(f'input fps: {input_fps}')
-        video_data = preprocess(input_frames)
-        _, _, h, w = video_data.shape
-        logger.info(f'input resolution: {(h, w)}')
-        target_res = (h * self.upscale, w * self.upscale)
-        logger.info(f'target resolution: {target_res}')
-        pre_data = {'video_data': video_data, 'y': caption, 'target_res': target_res}
-        total_noise_levels = 900
-        # Fixed seed, set right before sampling.
-        setup_seed(666)
-        with torch.no_grad():
-            data_tensor = collate_fn(pre_data, 'cuda:0')
-            output = self.model.test(
-                data_tensor,
-                total_noise_levels,
-                steps=self.steps,
-                solver_mode=self.solver_mode,
-                guide_scale=self.guide_scale,
-                max_chunk_len=self.max_chunk_len,
-            )
-        output = tensor2vid(output)
-        # Color fix against the preprocessed input frames.
-        output = adain_color_fix(output, video_data)
-        save_video(output, self.result_dir, self.file_name, fps=input_fps)
-        return os.path.join(self.result_dir, self.file_name)
-def parse_args(argv=None):
-    """Parse the command-line options of the inference script.
-    Args:
-        argv (list[str] | None): Arguments to parse. ``None`` (the default)
-            makes argparse read ``sys.argv[1:]``, as before.
-    Returns:
-        Namespace: The parsed options. ``file_name`` is ``None`` when the
-        option is not given.
-    """
-    parser = ArgumentParser()
-    parser.add_argument("--input_path", required=True, type=str, help="input video path")
-    parser.add_argument("--save_dir", type=str, default='results', help="save directory")
-    parser.add_argument("--file_name", type=str, help="file name")
-    parser.add_argument("--model_path", type=str, default='./pretrained_weight/I2VGen-XL-based/heavy_deg.pt', help="model path")
-    parser.add_argument("--prompt", type=str, default='a good video', help="prompt")
-    parser.add_argument("--upscale", type=int, default=4, help='up-scale')
-    parser.add_argument("--max_chunk_len", type=int, default=32, help='max_chunk_len')
-    parser.add_argument("--variant_info", type=str, default=None, help='information of inference strategy')
-    parser.add_argument("--cfg", type=float, default=7.5)
-    # Reject unknown solver modes at parse time instead of relying on a later assert.
-    parser.add_argument("--solver_mode", type=str, default='fast', choices=('fast', 'normal'), help='fast | normal')
-    parser.add_argument("--steps", type=int, default=15)
-    return parser.parse_args(argv)
-def main():
-    """Command-line entry point: parse the options and enhance one video.
-    Raises:
-        ValueError: If ``--solver_mode`` is neither ``'fast'`` nor ``'normal'``.
-    """
-    args = parse_args()
-    solver_mode = args.solver_mode
-    # Explicit check instead of ``assert``, which is stripped under ``python -O``.
-    if solver_mode not in ('fast', 'normal'):
-        raise ValueError("solver_mode must be 'fast' or 'normal', got {!r}".format(solver_mode))
-    file_name = args.file_name
-    if file_name is None:
-        # --file_name has no default; passing None on would fail in os.path.join.
-        # Fall back to "<input stem>.mp4", the naming inference_sr.sh uses.
-        file_name = os.path.splitext(os.path.basename(args.input_path))[0] + '.mp4'
-    star_sr = STAR_sr(
-        result_dir=args.save_dir,
-        file_name=file_name,
-        model_path=args.model_path,
-        solver_mode=solver_mode,
-        steps=args.steps,
-        guide_scale=args.cfg,
-        upscale=args.upscale,
-        max_chunk_len=args.max_chunk_len,
-        # Forward the parsed option; it used to be dropped in favor of None.
-        variant_info=args.variant_info,
-    )
-    star_sr.enhance_a_video(args.input_path, args.prompt)
-if __name__ == '__main__':
-    main()

video_super_resolution/scripts/inference_sr.sh DELETED Viewed

@@ -1,56 +0,0 @@
-#!/bin/bash
-# Runs inference_sr.py once per .mp4 under $video_folder_path. The i-th video
-# (sorted by path) is paired with the i-th non-blank line of the prompt file.
-# Folder paths
-video_folder_path='./input/video'
-txt_file_path='./input/text/prompt.txt'
-# Checkpoint path; export MODEL_PATH to override the default.
-model_path="${MODEL_PATH:-/mnt/bn/videodataset/VSR/pretrained_models/STAR/heavy_deg.pt}"
-# Get all .mp4 files in the folder. find's output order is unspecified, so it
-# is sorted to make the video/prompt pairing deterministic.
-mapfile -t mp4_files < <(find "$video_folder_path" -type f -name "*.mp4" | LC_ALL=C sort)
-# Print the list of MP4 files
-echo "MP4 files to be processed:"
-for mp4_file in "${mp4_files[@]}"; do
-    echo "$mp4_file"
-done
-# Read lines from the text file, skipping blank lines
-# ([[:space:]] is the portable spelling of GNU grep's \s).
-mapfile -t lines < <(grep -v '^[[:space:]]*$' "$txt_file_path")
-# Value passed to --max_chunk_len
-frame_length=32
-# Debugging output
-echo "Number of MP4 files: ${#mp4_files[@]}"
-echo "Number of lines in the text file: ${#lines[@]}"
-# Ensure the number of video files matches the number of lines
-if [ "${#mp4_files[@]}" -ne "${#lines[@]}" ]; then
-    echo "Number of MP4 files and lines in the text file do not match."
-    exit 1
-fi
-# Count failed runs so one bad video does not stop the rest, while the final
-# status still reflects the failure.
-failed=0
-# Loop through video files and corresponding lines
-for i in "${!mp4_files[@]}"; do
-    mp4_file="${mp4_files[$i]}"
-    line="${lines[$i]}"
-    # Extract the filename without the extension
-    file_name=$(basename "$mp4_file" .mp4)
-    echo "Processing video file: $mp4_file with prompt: $line"
-    # Run Python script with parameters
-    if ! python \
-        ./video_super_resolution/scripts/inference_sr.py \
-        --solver_mode 'fast' \
-        --steps 15 \
-        --input_path "${mp4_file}" \
-        --model_path "${model_path}" \
-        --prompt "${line}" \
-        --upscale 4 \
-        --max_chunk_len "${frame_length}" \
-        --file_name "${file_name}.mp4" \
-        --save_dir ./results; then
-        echo "Failed to process: $mp4_file" >&2
-        failed=$((failed + 1))
-    fi
-done
-if [ "$failed" -ne 0 ]; then
-    echo "$failed video(s) failed." >&2
-    exit 1
-fi
-echo "All videos processed successfully."