Update app_wip.py
app_wip.py  +43 -50
CHANGED
@@ -21,7 +21,7 @@ from utils.misc import set_seed
 from demo_utils.memory import get_cuda_free_memory_gb, DynamicSwapInstaller

 # -------------------------------------------------------------------
-# Checkpoint download (
+# Checkpoint download (once, at Space startup)
 # -------------------------------------------------------------------
 snapshot_download(
     repo_id="Wan-AI/Wan2.1-T2V-1.3B",
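
Note: for reference, a minimal sketch of what this startup download amounts to with huggingface_hub; the local_dir value is an assumption, the Space may pass different arguments to snapshot_download.

from huggingface_hub import snapshot_download

# Hypothetical target directory; the actual Space layout may differ.
ckpt_dir = snapshot_download(
    repo_id="Wan-AI/Wan2.1-T2V-1.3B",
    local_dir="checkpoints/Wan2.1-T2V-1.3B",
)
print(f"Checkpoints available under: {ckpt_dir}")
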
@@ -62,10 +62,10 @@ def reward_forcing_inference(
     progress: gr.Progress,
 ):
     """
-    Version
+    Inline / simplified version of inference.py:
     - single GPU
     - T2V only
-    - 1
+    - 1 .txt file = n prompts (but only the first video is returned)
     """
     logs = ""

@@ -79,42 +79,45 @@ def reward_forcing_inference(

     torch.set_grad_enabled(False)

-    # ---------------------
-    progress(
-    (… removed init block, old lines 85-107, truncated in the extract …)
+    # --------------------- BAR 1: model / config init ---------------------
+    # 4 steps: config, pipeline, checkpoint, move to device
+    with progress.tqdm(total=4, desc="Initialisation du modèle", unit="step") as pbar:
+        logs += "Chargement de la config...\n"
+        config = OmegaConf.load(CONFIG_PATH)
+        default_config = OmegaConf.load("configs/default_config.yaml")
+        config = OmegaConf.merge(default_config, config)
+        pbar.update(1)
+
+        logs += "Initialisation de la pipeline...\n"
+        if hasattr(config, "denoising_step_list"):
+            pipeline = CausalInferencePipeline(config, device=device)
+        else:
+            pipeline = CausalDiffusionInferencePipeline(config, device=device)
+        pbar.update(1)
+
+        logs += "Chargement des poids du checkpoint...\n"
+        state_dict = torch.load(CHECKPOINT_PATH, map_location="cpu")
+        pipeline.generator.load_state_dict(state_dict)
+        checkpoint_step = os.path.basename(os.path.dirname(CHECKPOINT_PATH))
+        checkpoint_step = checkpoint_step.split("_")[-1]
+        pbar.update(1)
+
+        logs += "Placement du modèle sur le device...\n"
+        pipeline = pipeline.to(dtype=torch.bfloat16)
+        if low_memory:
+            DynamicSwapInstaller.install_model(pipeline.text_encoder, device=device)
+        else:
+            pipeline.text_encoder.to(device=device)
+        pipeline.generator.to(device=device)
+        pipeline.vae.to(device=device)
+        pbar.update(1)

     # --------------------- Dataset / DataLoader ---------------------
     logs += "Préparation du dataset (TextDataset)...\n"
-    progress(0.15, desc="Préparation du dataset")
-
     dataset = TextDataset(prompt_path=prompt_txt_path, extended_prompt_path=None)
     num_prompts = len(dataset)
     logs += f"Number of prompts: {num_prompts}\n"

-    # Only batch_size=1 is supported here
     from torch.utils.data import DataLoader, SequentialSampler

     sampler = SequentialSampler(dataset)
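
Note on BAR 1: it drives a fixed-length bar during model setup. Whether `progress.tqdm(total=4)` works as a context manager depends on the Gradio version; a minimal sketch of the same idea with the plain `gr.Progress` fraction API (function and step names are illustrative, not taken from app_wip.py):

import time
import gradio as gr

def init_model(progress: gr.Progress = gr.Progress()):
    # Four fixed setup steps, mirroring BAR 1 above; the bodies are placeholders.
    steps = ["config", "pipeline", "checkpoint", "device"]
    for done, name in enumerate(steps, start=1):
        time.sleep(0.5)  # stand-in for the real work of this step
        progress(done / len(steps), desc=f"Initialisation du modèle: {name}")
    return "model ready"

with gr.Blocks() as demo:
    status = gr.Textbox(label="status")
    gr.Button("Init").click(init_model, outputs=status)

# demo.launch()
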
@@ -126,10 +129,7 @@ def reward_forcing_inference(
     os.makedirs(output_folder, exist_ok=True)
     logs += f"Dossier de sortie: {output_folder}\n"

-
-
-    # --------------------- Inference loop ---------------------
-    # Track the loop's tqdm with the Gradio Progress
+    # --------------------- BAR 2: inference loop ---------------------
     for i, batch_data in progress.tqdm(
         enumerate(dataloader),
         total=num_prompts,
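
Note on BAR 2: Gradio can track this loop because `progress.tqdm` wraps the iterable being consumed. A self-contained sketch of the same pattern with a toy prompt dataset (ToyTextDataset and generate_all are illustrative stand-ins; the real TextDataset lives elsewhere in the repo):

import gradio as gr
import torch
from torch.utils.data import DataLoader, Dataset, SequentialSampler

class ToyTextDataset(Dataset):
    """Stand-in for TextDataset: one prompt per non-empty line of a .txt file."""
    def __init__(self, prompt_path: str):
        with open(prompt_path, encoding="utf-8") as f:
            self.prompts = [line.strip() for line in f if line.strip()]

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, idx):
        return {"idx": torch.tensor(idx), "prompts": self.prompts[idx]}

def generate_all(prompt_path: str, progress: gr.Progress = gr.Progress()):
    dataset = ToyTextDataset(prompt_path)
    loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=1)
    for i, batch in progress.tqdm(enumerate(loader), total=len(dataset), desc="Génération"):
        prompt = batch["prompts"][0]  # batch_size=1, so take the first element
        ...                           # the real pipeline.inference(...) call would go here
    return f"{len(dataset)} prompts processed"
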
@@ -138,7 +138,7 @@
     ):
         idx = batch_data["idx"].item()

-        #
+        # Unpack batch
         if isinstance(batch_data, dict):
             batch = batch_data
         elif isinstance(batch_data, list):
@@ -148,7 +148,7 @@

         all_video = []

-        # TEXT-TO-VIDEO only (no I2V)
+        # TEXT-TO-VIDEO only (no I2V here)
         prompt = batch["prompts"][0]
         extended_prompt = batch.get("extended_prompts", [None])[0]
         if extended_prompt is not None:
@@ -165,7 +165,6 @@
         )

         logs += f"Génération pour le prompt: {prompt[:80]}...\n"
-        progress(0.4, desc="Sampling latents")

         # Pipeline call
         video, latents = pipeline.inference(
@@ -176,8 +175,6 @@
             low_memory=low_memory,
         )

-        progress(0.7, desc="Décodage et écriture vidéo")
-
         current_video = rearrange(video, "b t c h w -> b t h w c").cpu()
         all_video.append(current_video)

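
The `rearrange(video, "b t c h w -> b t h w c")` call only moves the channel axis last before the frames are written. A tiny standalone check of that pattern, with arbitrary shapes:

import torch
from einops import rearrange

video = torch.rand(1, 16, 3, 64, 64)           # (batch, time, channels, height, width)
frames = rearrange(video, "b t c h w -> b t h w c").cpu()
print(frames.shape)                             # torch.Size([1, 16, 64, 64, 3])
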
@@ -186,19 +183,15 @@
         # Clear VAE cache
         pipeline.vae.model.clear_cache()

-        # Save the video
+        # Save the video (the 1st video is returned)
         if idx < num_prompts:
             model = "regular" if not use_ema else "ema"
-            # truncate the prompt to avoid awkward file names
             safe_name = prompt[:50].replace("/", "_").replace("\\", "_")
             output_path = os.path.join(output_folder, f"{safe_name}.mp4")
             write_video(output_path, video[0], fps=16)
             logs += f"Vidéo enregistrée: {output_path}\n"
-
-            # Return the first video (only one in your case)
             return output_path, logs

-    # If we exit the loop without producing anything (unlikely here)
     logs += "[WARN] Aucune vidéo générée dans la boucle.\n"
     return None, logs

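
The save step derives the file name from the first 50 characters of the prompt and only replaces slashes. If prompts ever contain other characters that are awkward in file names (colons, quotes, newlines), a slightly stricter sanitizer keeps the same idea; `safe_filename` below is an illustrative helper, not part of app_wip.py:

import re

def safe_filename(prompt: str, max_len: int = 50) -> str:
    # Keep letters, digits, dashes, underscores and spaces; replace everything else.
    name = re.sub(r"[^\w\- ]+", "_", prompt[:max_len]).strip()
    return name or "video"

print(safe_filename('A cat playing "piano" / 4K'))  # -> A cat playing _piano_ _ 4K
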
@@ -226,8 +219,7 @@ def gradio_generate(prompt: str, duration: str, use_ema: bool, progress=gr.Progr
     with open(prompt_path, "w", encoding="utf-8") as f:
         f.write(prompt.strip() + "\n")

-
-
+    # Call the inline inference function
     video_path, logs = reward_forcing_inference(
         prompt_txt_path=prompt_path,
         num_output_frames=num_output_frames,
@@ -242,7 +234,6 @@
         "Regarde les logs ci-dessous pour voir ce qui a coincé."
     )

-    progress(1.0, desc="Terminé ✅")
     return video_path, logs


@@ -256,7 +247,9 @@ with gr.Blocks(title="Reward Forcing T2V Demo (inline inference)") as demo:
         # 🎬 Reward Forcing – Text-to-Video (inline)

         Cette version appelle directement la logique d'inférence en Python,
-        ce qui permet à Gradio de suivre
+        ce qui permet à Gradio de suivre les `tqdm` :
+        - Initialisation du modèle
+        - Génération vidéo
         """
     )

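
The Markdown block tells users that Gradio follows the inference `tqdm` bars. A minimal, self-contained illustration of that wiring, using `gr.Progress(track_tqdm=True)` so any tqdm created inside the handler is mirrored in the UI (fake_generate and the component names are illustrative):

import time
import gradio as gr
from tqdm import tqdm

def fake_generate(prompt: str, progress=gr.Progress(track_tqdm=True)):
    # Any tqdm created inside this call is reflected in the Gradio progress bar.
    for _ in tqdm(range(20), desc="Génération vidéo"):
        time.sleep(0.1)
    return f"done: {prompt[:40]}"

with gr.Blocks(title="Reward Forcing T2V Demo (inline inference)") as demo:
    prompt = gr.Textbox(label="Prompt")
    out = gr.Textbox(label="Logs")
    gr.Button("Générer").click(fake_generate, inputs=prompt, outputs=out)

# demo.launch()
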