vidfom
/

LTX-itv-Repository

Diffusers

Safetensors

Model card Files Files and versions

xet

Community

vidfom committed on Mar 31, 2025

Commit

9864eac

verified ·

1 Parent(s): 14b57af

Upload LTX_Video_Img_to_Vid.ipynb

Browse files

Files changed (1) hide show

LTX-Video/LTX_Video_Img_to_Vid.ipynb +474 -0

LTX-Video/LTX_Video_Img_to_Vid.ipynb ADDED Viewed

	@@ -0,0 +1,474 @@

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "gpuType": "T4"
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "accelerator": "GPU"
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# **LTX-VIDEO (Image to Video based on ComfyUI nodes library)**\n",
+        "ComfyUI Github Repository: https://github.com/comfyanonymous/ComfyUI"
+      ],
+      "metadata": {
+        "id": "f4p1ysFKMbs_"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "- Note that this Notebook only references the ComfyUI nodes library; it does not display the ComfyUI GUI.\n",
+        "- You can run this on the free T4 GPU, depending on the output video resolution and number of frames. The default settings run without issues, but at 768 by 512 output resolution with 73 frames, the decoding step exhausts the 12.7GB of system RAM and crashes the runtime. For faster generation, or for higher resolutions and more frames, use a more powerful GPU.\n",
+        "- If you want to generate a video with n frames, then set frames to n+1. e.g. To generate a video with 72 frames, set frames to 73.\n",
+        "- You need to use detailed prompts to get decent results.\n",
+        "- Videos are generated at 24fps."
+      ],
+      "metadata": {
+        "id": "EBB00lC6q-DA"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# @title Prepare Environment\n",
+        "%cd /content\n",
+        "Always_Load_Models_for_Inference = False\n",
+        "Use_t5xxl_fp16 = False\n",
+        "\n",
+        "!pip install -q torchsde einops diffusers accelerate xformers\n",
+        "!pip install av\n",
+        "!git clone https://github.com/Isi-dev/ComfyUI\n",
+        "%cd /content/ComfyUI\n",
+        "!apt -y install -qq aria2 ffmpeg\n",
+        "\n",
+        "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Isi99999/LTX-Video/resolve/main/ltx-video-2b-v0.9.5.safetensors -d /content/ComfyUI/models/checkpoints -o ltx-video-2b-v0.9.5.safetensors\n",
+        "if Use_t5xxl_fp16:\n",
+        "    !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Isi99999/LTX-Video/resolve/main/t5xxl_fp16.safetensors -d /content/ComfyUI/models/text_encoders -o t5xxl_fp16.safetensors\n",
+        "else:\n",
+        "    !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Isi99999/LTX-Video/resolve/main/t5xxl_fp8_e4m3fn_scaled.safetensors -d /content/ComfyUI/models/text_encoders -o t5xxl_fp8_e4m3fn_scaled.safetensors\n",
+        "\n",
+        "import torch\n",
+        "import numpy as np\n",
+        "from PIL import Image\n",
+        "import gc\n",
+        "import sys\n",
+        "import random\n",
+        "import os\n",
+        "import imageio\n",
+        "from google.colab import files\n",
+        "from IPython.display import display, HTML\n",
+        "sys.path.insert(0, '/content/ComfyUI')\n",
+        "\n",
+        "from comfy import model_management\n",
+        "\n",
+        "from nodes import (\n",
+        "    CheckpointLoaderSimple,\n",
+        "    CLIPLoader,\n",
+        "    CLIPTextEncode,\n",
+        "    VAEDecode,\n",
+        "    LoadImage,\n",
+        "    SaveImage\n",
+        ")\n",
+        "\n",
+        "from comfy_extras.nodes_custom_sampler import (\n",
+        "    KSamplerSelect,\n",
+        "    SamplerCustom\n",
+        ")\n",
+        "\n",
+        "from comfy_extras.nodes_lt import (\n",
+        "    LTXVPreprocess,\n",
+        "    LTXVImgToVideo,\n",
+        "    LTXVScheduler,\n",
+        "    LTXVConditioning\n",
+        ")\n",
+        "\n",
+        "# One instance of every ComfyUI node used by generate_video().\n",
+        "checkpoint_loader = CheckpointLoaderSimple()\n",
+        "clip_loader = CLIPLoader()\n",
+        "clip_encode_positive = CLIPTextEncode()\n",
+        "clip_encode_negative = CLIPTextEncode()\n",
+        "load_image = LoadImage()\n",
+        "save_node = SaveImage()\n",
+        "preprocess = LTXVPreprocess()\n",
+        "img_to_video = LTXVImgToVideo()\n",
+        "scheduler = LTXVScheduler()\n",
+        "sampler_select = KSamplerSelect()\n",
+        "conditioning = LTXVConditioning()\n",
+        "sampler = SamplerCustom()\n",
+        "vae_decode = VAEDecode()\n",
+        "\n",
+        "# Load the text-encoder file that the download step actually fetched:\n",
+        "# fp16 when Use_t5xxl_fp16 is set, the fp8 scaled file otherwise.\n",
+        "if Use_t5xxl_fp16:\n",
+        "    text_encoder_file = \"t5xxl_fp16.safetensors\"\n",
+        "else:\n",
+        "    text_encoder_file = \"t5xxl_fp8_e4m3fn_scaled.safetensors\"\n",
+        "\n",
+        "# Models are loaded once here and reused by every generate_video() call.\n",
+        "with torch.inference_mode():\n",
+        "    print(\"Loading Model...\")\n",
+        "    model, _, vae = checkpoint_loader.load_checkpoint(\"ltx-video-2b-v0.9.5.safetensors\")\n",
+        "    print(\"Loaded model!\")\n",
+        "    print(\"Loading Text_Encoder...\")\n",
+        "    clip = clip_loader.load_clip(text_encoder_file, \"ltxv\", \"default\")[0]\n",
+        "    print(\"Loaded Text_Encoder!\")\n",
+        "\n",
+        "def clear_gpu_memory():\n",
+        "    \"\"\"Run two garbage-collection / CUDA cache-release passes.\n",
+        "\n",
+        "    The first pass additionally calls torch.cuda.ipc_collect().\n",
+        "    \"\"\"\n",
+        "    import gc\n",
+        "    import torch\n",
+        "\n",
+        "    for is_first_pass in (True, False):\n",
+        "        gc.collect()\n",
+        "        torch.cuda.empty_cache()\n",
+        "        if is_first_pass:\n",
+        "            torch.cuda.ipc_collect()\n",
+        "\n",
+        "\n",
+        "def upload_image():\n",
+        "    \"\"\"Prompt for an image upload in Colab and move it into /content/ComfyUI/input/.\n",
+        "\n",
+        "    Returns:\n",
+        "        The destination path of the first uploaded file, or None when nothing\n",
+        "        was uploaded. Any further files from the same upload are not moved.\n",
+        "    \"\"\"\n",
+        "    from google.colab import files\n",
+        "    import os\n",
+        "    import shutil\n",
+        "\n",
+        "    input_dir = '/content/ComfyUI/input'\n",
+        "    os.makedirs(input_dir, exist_ok=True)\n",
+        "\n",
+        "    uploaded = files.upload()\n",
+        "\n",
+        "    # Only the first uploaded file is used: the loop body returns immediately.\n",
+        "    for filename in uploaded.keys():\n",
+        "        # files.upload() writes into the current working directory, which is not\n",
+        "        # guaranteed to be /content/ComfyUI (other cells %cd elsewhere), so the\n",
+        "        # source path is resolved from the cwd instead of being hard-coded.\n",
+        "        src_path = os.path.abspath(filename)\n",
+        "        dest_path = os.path.join(input_dir, filename)\n",
+        "\n",
+        "        shutil.move(src_path, dest_path)\n",
+        "        print(f\"Image saved to: {dest_path}\")\n",
+        "        return dest_path\n",
+        "\n",
+        "    return None\n",
+        "\n",
+        "\n",
+        "def generate_video(\n",
+        "    image_path: str = None,\n",
+        "    positive_prompt: str = \"A red fox moving gracefully\",\n",
+        "    negative_prompt: str = \"low quality, worst quality\",\n",
+        "    width: int = 768,\n",
+        "    height: int = 512,\n",
+        "    seed: int = 0,\n",
+        "    steps: int = 30,\n",
+        "    cfg_scale: float = 2.05,\n",
+        "    sampler_name: str = \"euler\",\n",
+        "    length: int = 24,  # Number of frames\n",
+        "    fps: int = 24\n",
+        "):\n",
+        "    \"\"\"Generate a video from an image using the LTX-Video model and display it.\n",
+        "\n",
+        "    Relies on the module-level `model`, `vae`, `clip` and ComfyUI node instances\n",
+        "    created earlier in this cell. Frames are saved through the SaveImage node and\n",
+        "    an MP4 is written to /content/output.mp4.\n",
+        "\n",
+        "    Args:\n",
+        "        image_path: Image to animate. When None, an upload prompt is shown.\n",
+        "        positive_prompt: Text describing the desired video.\n",
+        "        negative_prompt: Text describing what to avoid.\n",
+        "        width: Output width in pixels; must be divisible by 32.\n",
+        "        height: Output height in pixels; must be divisible by 32.\n",
+        "        seed: Noise seed; 0 selects a random seed.\n",
+        "        steps: Number of sampling steps.\n",
+        "        cfg_scale: Guidance scale passed to the sampler as `cfg`.\n",
+        "        sampler_name: Sampler name handed to KSamplerSelect.\n",
+        "        length: Frame count passed to LTXVImgToVideo.\n",
+        "        fps: Frame rate of the written MP4.\n",
+        "\n",
+        "    Raises:\n",
+        "        AssertionError: If width or height is not divisible by 32.\n",
+        "        ValueError: If no image path was given and nothing was uploaded.\n",
+        "    \"\"\"\n",
+        "    try:\n",
+        "        assert width % 32 == 0, \"Width must be divisible by 32\"\n",
+        "        assert height % 32 == 0, \"Height must be divisible by 32\"\n",
+        "\n",
+        "        # Resolve the input image before any model work so a missing image fails fast.\n",
+        "        if image_path is None:\n",
+        "            print(\"Please upload an image file:\")\n",
+        "            image_path = upload_image()\n",
+        "        if image_path is None:\n",
+        "            # Previously this only printed and then crashed inside load_image(None).\n",
+        "            raise ValueError(\"No image uploaded!\")\n",
+        "\n",
+        "        positive = clip_encode_positive.encode(clip, positive_prompt)[0]\n",
+        "        negative = clip_encode_negative.encode(clip, negative_prompt)[0]\n",
+        "\n",
+        "        loaded_image = load_image.load_image(image_path)[0]\n",
+        "        processed_image = preprocess.preprocess(loaded_image, 40)[0]\n",
+        "\n",
+        "        video_output = img_to_video.generate(\n",
+        "            positive=positive,\n",
+        "            negative=negative,\n",
+        "            vae=vae,\n",
+        "            image=processed_image,\n",
+        "            width=width,\n",
+        "            height=height,\n",
+        "            length=length,\n",
+        "            batch_size=1\n",
+        "        )\n",
+        "\n",
+        "        # Upstream ComfyUI declares get_sigmas(steps, max_shift, base_shift, stretch, terminal);\n",
+        "        # the guidance scale is not a scheduler input, so it must not be passed as max_shift.\n",
+        "        # 2.05 / 0.95 are the upstream node defaults - TODO confirm against the fork's nodes_lt.py.\n",
+        "        max_shift = 2.05\n",
+        "        base_shift = 0.95\n",
+        "        sigmas = scheduler.get_sigmas(steps, max_shift, base_shift, True, 0.1)[0]\n",
+        "        selected_sampler = sampler_select.get_sampler(sampler_name)[0]\n",
+        "        conditioned = conditioning.append(video_output[0], video_output[1], 25.0)\n",
+        "\n",
+        "        sampled = sampler.sample(\n",
+        "            model=model,\n",
+        "            add_noise=True,\n",
+        "            noise_seed=seed if seed != 0 else random.randint(0, 2**32),\n",
+        "            cfg=cfg_scale,\n",
+        "            positive=conditioned[0],\n",
+        "            negative=conditioned[1],\n",
+        "            sampler=selected_sampler,\n",
+        "            sigmas=sigmas,\n",
+        "            latent_image=video_output[2]\n",
+        "        )[0]\n",
+        "\n",
+        "        with torch.no_grad():\n",
+        "            try:\n",
+        "                decoded = vae_decode.decode(vae, sampled)[0].detach()\n",
+        "            except Exception as e:\n",
+        "                print(f\"Error during decoding: {str(e)}\")\n",
+        "                raise\n",
+        "\n",
+        "        # Keep the individual frames via the SaveImage node as well as the MP4.\n",
+        "        save_node.save_images(decoded, filename_prefix=\"video_frame\")\n",
+        "\n",
+        "        output_path = \"/content/output.mp4\"\n",
+        "        frames_np = (decoded.cpu().numpy() * 255).astype(np.uint8)\n",
+        "        with imageio.get_writer(output_path, fps=fps) as writer:\n",
+        "            for frame in frames_np:\n",
+        "                writer.append_data(frame)\n",
+        "\n",
+        "        print(\"\\nVideo generation complete!\")\n",
+        "        print(f\"Saved {len(decoded)} frames to ComfyUI output directory\")\n",
+        "        print(f\"Video saved to: {output_path}\")\n",
+        "        display_video(output_path)\n",
+        "\n",
+        "    except Exception as e:\n",
+        "        print(f\"Error during video generation: {str(e)}\")\n",
+        "        raise\n",
+        "    finally:\n",
+        "        # Always release cached GPU memory, whether generation succeeded or failed.\n",
+        "        clear_gpu_memory()\n",
+        "\n",
+        "\n",
+        "def display_video(video_path):\n",
+        "    \"\"\"Embed the MP4 at `video_path` in the notebook output as a base64 data URL.\n",
+        "\n",
+        "    Args:\n",
+        "        video_path: Path of the video file to show.\n",
+        "    \"\"\"\n",
+        "    from IPython.display import HTML, display\n",
+        "    from base64 import b64encode\n",
+        "\n",
+        "    # Read through a context manager so the file handle is closed right away.\n",
+        "    with open(video_path, 'rb') as video_file:\n",
+        "        mp4 = video_file.read()\n",
+        "    data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n",
+        "\n",
+        "    display(HTML(f\"\"\"\n",
+        "    <video width=512 controls autoplay loop>\n",
+        "        <source src=\"{data_url}\" type=\"video/mp4\">\n",
+        "    </video>\n",
+        "    \"\"\"))"
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "rrXFIT4fMfyJ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# @title Run Image to Video\n",
+        "positive_prompt = \"A red fox moving gracefully, its russet coat vibrant against the white landscape, leaving perfect star-shaped prints behind as steam rises from its breath in the crisp winter air. The scene is wrapped in snow-muffled silence, broken only by the gentle murmur of water still flowing beneath the ice.\" # @param {\"type\":\"string\"}\n",
+        "negative_prompt = \"low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly\" # @param {\"type\":\"string\"}\n",
+        "# Width and height are pixel counts, so the form fields are typed as integers.\n",
+        "width = 704 # @param {\"type\":\"integer\"}\n",
+        "height = 480 # @param {\"type\":\"integer\"}\n",
+        "seed = 1000 # @param {\"type\":\"integer\"}\n",
+        "steps = 20 # @param {\"type\":\"integer\", \"min\":1, \"max\":100}\n",
+        "cfg_scale = 2.5 # @param {\"type\":\"number\", \"min\":1, \"max\":20}\n",
+        "sampler_name = \"euler\" # @param [\"euler\", \"dpmpp_2m\", \"ddim\", \"lms\"]\n",
+        "frames = 73 # @param {\"type\":\"integer\", \"min\":1, \"max\":120}\n",
+        "\n",
+        "# Run the generation workflow with the form values above.\n",
+        "print(\"Starting video generation workflow...\")\n",
+        "with torch.inference_mode():\n",
+        "    generate_video(\n",
+        "        image_path=None,  # This will trigger upload\n",
+        "        positive_prompt=positive_prompt,\n",
+        "        negative_prompt=negative_prompt,\n",
+        "        width=width,\n",
+        "        height=height,\n",
+        "        seed=seed,\n",
+        "        steps=steps,\n",
+        "        cfg_scale=cfg_scale,\n",
+        "        sampler_name=sampler_name,\n",
+        "        length=frames\n",
+        "    )\n",
+        "# Second cleanup pass once the inference_mode context has exited.\n",
+        "clear_gpu_memory()"
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "roC59_oNNflb"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "********************************************************************************************************************************************************************************************************************************************************************************************************************************"
+      ],
+      "metadata": {
+        "id": "yWSMPSVcbmmn"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "********************************************************************************************************************************************************************************************************************************************************************************************************************************"
+      ],
+      "metadata": {
+        "id": "FGDof1EkbnHv"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# **LTX-VIDEO (Image to Video based on Lightricks LTX-VIDEO Github Repository)**\n",
+        "LTX-Video Github Repository: https://github.com/Lightricks/LTX-Video"
+      ],
+      "metadata": {
+        "id": "6t7--x3NBdE5"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "- You need compute units to run this section.\n",
+        "- Use detailed prompts to improve the generated video.\n",
+        "- If you want to generate a video with n frames, then set NUM_FRAMES to n+1. e.g. To generate a video with 120 frames, set NUM_FRAMES to 121.\n",
+        "- Videos are generated at 24fps.\n"
+      ],
+      "metadata": {
+        "id": "KVykpe_nU7lK"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# @title Prepare Environment\n",
+        "# Install dependencies\n",
+        "!git clone https://github.com/Isi-dev/LTX-Video.git\n",
+        "%cd LTX-Video\n",
+        "\n",
+        "# Install required packages\n",
+        "!pip install -e \".[inference-script]\"\n",
+        "\n",
+        "!pip install \"huggingface_hub[cli]\"\n",
+        "!apt-get install -y aria2\n",
+        "import os\n",
+        "from huggingface_hub import list_repo_files\n",
+        "\n",
+        "repo_id = \"Isi99999/LTX-Video\"\n",
+        "all_files = list_repo_files(repo_id)\n",
+        "base_url = f\"https://huggingface.co/{repo_id}/resolve/main/\"\n",
+        "\n",
+        "# Write an aria2c input file: each URL line is followed by its indented out= option.\n",
+        "expected_paths = []\n",
+        "with open(\"file_list.txt\", \"w\") as f:\n",
+        "    for file_path in all_files:\n",
+        "        full_url = f\"{base_url}{file_path}\"\n",
+        "        save_path = f\"MODEL_DIR/{file_path}\"\n",
+        "        os.makedirs(os.path.dirname(save_path), exist_ok=True)\n",
+        "        f.write(f\"{full_url}\\n out={save_path}\\n\")\n",
+        "        expected_paths.append(save_path)\n",
+        "!aria2c -x 16 -s 16 -i file_list.txt --continue=true --auto-file-renaming=false\n",
+        "\n",
+        "# Only report success when every target exists and aria2c left no <file>.aria2\n",
+        "# control file behind (aria2c keeps that file next to unfinished downloads).\n",
+        "incomplete = [p for p in expected_paths if not os.path.isfile(p) or os.path.exists(p + \".aria2\")]\n",
+        "if incomplete:\n",
+        "    print(f\"❌ {len(incomplete)} file(s) were not downloaded completely:\")\n",
+        "    for p in incomplete:\n",
+        "        print(f\"   {p}\")\n",
+        "else:\n",
+        "    print(\"✅ All models downloaded successfully!\")"
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "S9doZlq9B36X"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# @title Upload Image\n",
+        "import os\n",
+        "from google.colab import files\n",
+        "from PIL import Image\n",
+        "\n",
+        "uploaded = files.upload()\n",
+        "# A cancelled upload yields no entries; fail with a clear message instead of an IndexError.\n",
+        "if not uploaded:\n",
+        "    raise ValueError(\"No image was uploaded; run this cell again and choose a file.\")\n",
+        "# Store an absolute path so the generation cell still finds the file after any later %cd.\n",
+        "image_path = os.path.abspath(list(uploaded.keys())[0])\n",
+        "image = Image.open(image_path)\n",
+        "print(\"✅Image loaded successfully:\", image.size)"
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "QH2FBr4naeK2"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# @title Generate Video\n",
+        "PROMPT =\"A red fox moving gracefully, its russet coat vibrant against the white landscape, leaving perfect star-shaped prints behind as steam rises from its breath in the crisp winter air. The scene is wrapped in snow-muffled silence, broken only by the gentle murmur of water still flowing beneath the ice.\" # @param {type:\"string\"}\n",
+        "STEPS = 20 # @param {\"type\":\"integer\"}\n",
+        "Instruction_1 = \"choose from '720*1280', '1280*720', '480*832', '832*480', '480*704', '704*480'  for width & height, and your input image should be of the same resolution as your selected width & height.\" # @param {\"type\":\"string\"}\n",
+        "WIDTH = 704 # @param {\"type\":\"integer\"}\n",
+        "HEIGHT = 480 # @param {\"type\":\"integer\"}\n",
+        "Instruction_2 = \"The NUM_FRAMES should not exceed 257.\" # @param {\"type\":\"string\"}\n",
+        "NUM_FRAMES = 121 # @param {\"type\":\"integer\"}\n",
+        "SEED = 1000 # @param {\"type\":\"integer\"}\n",
+        "\n",
+        "import glob\n",
+        "import os\n",
+        "import shlex\n",
+        "\n",
+        "import torch\n",
+        "from IPython.display import display as displayVid, Video as outVid\n",
+        "\n",
+        "MIN_VRAM_GB = 18  # below this, inference is not attempted\n",
+        "LOW_VRAM_GB = 30  # below this, inference runs with --low_vram --offload_to_cpu\n",
+        "\n",
+        "total_vram = 0\n",
+        "has_gpu = torch.cuda.is_available()\n",
+        "if has_gpu:\n",
+        "    gpu_id = torch.cuda.current_device()\n",
+        "    total_vram = torch.cuda.get_device_properties(gpu_id).total_memory / 1024**3\n",
+        "\n",
+        "if not has_gpu:\n",
+        "    print(\"No GPU found.\")\n",
+        "elif total_vram < MIN_VRAM_GB:\n",
+        "    print(\"It seems you are using the free T4 GPU which is offered with a RAM of 12.7GB. The text encoder will crash the RAM. Choose a higher runtime type.\")\n",
+        "else:\n",
+        "    memory_flags = \"\"\n",
+        "    if total_vram < LOW_VRAM_GB:\n",
+        "        print(\"Setting low_vram flag to avoid Out of Memory Errors. Inference will be a bit slow.\")\n",
+        "        memory_flags = \"--low_vram --offload_to_cpu\"\n",
+        "\n",
+        "    # Shell-quote the user-supplied values: uploaded file names may contain spaces or\n",
+        "    # parentheses, and prompts may contain quotes or other shell metacharacters.\n",
+        "    image_arg = shlex.quote(image_path)\n",
+        "    prompt_arg = shlex.quote(PROMPT)\n",
+        "    !python inference.py --ckpt_path \"MODEL_DIR/\" --output_path \"outputVidFromImage\" {memory_flags} --conditioning_media_paths {image_arg} --conditioning_start_frames 0 --text_encoder_model_name_or_path \"MODEL_DIR/\"  --prompt {prompt_arg} --prompt_enhancement_words_threshold 0 --height {HEIGHT} --width {WIDTH} --num_frames {NUM_FRAMES} --seed {SEED} --num_inference_steps {STEPS}\n",
+        "\n",
+        "    video_folder = \"outputVidFromImage/\"\n",
+        "\n",
+        "    # Show the most recently created MP4 in the output folder.\n",
+        "    video_files = glob.glob(os.path.join(video_folder, \"*.mp4\"))\n",
+        "\n",
+        "    if video_files:\n",
+        "        latest_video = max(video_files, key=os.path.getctime)\n",
+        "        print(f\"Displaying video: {latest_video}\")\n",
+        "        displayVid(outVid(latest_video, embed=True))\n",
+        "    else:\n",
+        "        print(f\"❌ No video found in {video_folder}\")\n"
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "RHFnir7waoKm"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}