create-caption

Paused

App Files Files Community

nroggendorff committed on Nov 17

Commit

c79fe94

verified ·

1 Parent(s): 80f71ae

Update train.py

Browse files

Files changed (1) hide show

train.py +136 -70

train.py CHANGED Viewed

@@ -31,7 +31,7 @@ def load_model(model_name, device_id=0):
     return processor, model
-def getTemplate(processor):
     msg = [
         {
             "role": "user",
@@ -44,29 +44,97 @@ def getTemplate(processor):
             ],
         }
     ]
     return processor.apply_chat_template(
         msg, add_generation_prompt=True, tokenize=False
     )
-def caption_batch(batch, processor, model, text):
-    images = batch["image"]
     pil_images = []
-    for image in images:
         if isinstance(image, Image.Image):
             if image.mode != "RGB":
                 image = image.convert("RGB")
             pil_images.append(image)
-    texts = [text] * len(pil_images)
-    inputs = processor(text=texts, images=pil_images, return_tensors="pt", padding=True)
-    inputs = {k: v.pin_memory().to(model.device, non_blocking=True) for k, v in inputs.items()}
-    with torch.no_grad(), torch.amp.autocast('cuda', dtype=torch.bfloat16):
         generated = model.generate(
             **inputs,
             max_new_tokens=128,
@@ -76,91 +144,87 @@ def caption_batch(batch, processor, model, text):
     decoded = processor.batch_decode(generated, skip_special_tokens=False)
     captions = []
-    special_tokens = set(processor.tokenizer.all_special_tokens)
     for d in decoded:
         if "<|im_start|>assistant" in d:
             d = d.split("<|im_start|>assistant")[-1]
-        for token in special_tokens:
             d = d.replace(token, "")
-        d = d.strip()
-        captions.append(d)
-    return {
-        "text": captions,
-    }
def process_shard(gpu_id, start, end, model_name, batch_size, input_dataset, output_file):
    """Caption rows ``[start:end]`` of ``input_dataset`` on one GPU.

    Binds the process to CUDA device ``gpu_id``, loads the model, loads the
    ``train[start:end]`` slice of the dataset, captions it in batches of
    ``batch_size`` and saves the result to ``output_file``.

    Returns:
        ``output_file`` on success.

    Raises:
        Exception: any error is printed with the GPU id and re-raised.
    """
    try:
        torch.cuda.set_device(gpu_id)

        print(f"[GPU {gpu_id}] Loading model...", flush=True)
        processor, model = load_model(model_name, gpu_id)

        print(f"[GPU {gpu_id}] Loading data shard [{start}:{end}]...", flush=True)
        loaded = datasets.load_dataset(input_dataset, split=f"train[{start}:{end}]")
        if isinstance(loaded, datasets.DatasetDict):
            shard = cast(Dataset, loaded["train"])
        else:
            shard = cast(Dataset, loaded)

        print(f"[GPU {gpu_id}] Processing {len(shard)} examples...", flush=True)
        # The prompt depends only on the processor, so build it once instead
        # of once per batch inside the map callback.
        text = getTemplate(processor)
        result = shard.map(
            lambda batch: caption_batch(batch, processor, model, text),
            batched=True,
            batch_size=batch_size,
            # Keep only the image column next to the generated "text".
            remove_columns=[col for col in shard.column_names if col != "image"],
        )

        print(f"[GPU {gpu_id}] Saving results to {output_file}...", flush=True)
        result.save_to_disk(output_file)

        print(f"[GPU {gpu_id}] Done!", flush=True)
        return output_file
    except Exception as e:
        print(f"[GPU {gpu_id}] Error: {e}", flush=True)
        raise
 def main():
-    mp.set_start_method('spawn', force=True)
     input_dataset = "none-yet/anime-captions"
     output_dataset = "nroggendorff/anime-captions"
     model_name = "datalab-to/chandra"
     batch_size = 20
-    print("Loading dataset info...")
-    loaded = datasets.load_dataset(input_dataset, split="train")
-    if isinstance(loaded, datasets.DatasetDict):
-        ds = cast(Dataset, loaded["train"])
-    else:
-        ds = cast(Dataset, loaded)
     num_gpus = torch.cuda.device_count()
-    total_size = len(ds)
-    shard_size = total_size // num_gpus
-    print(f"Dataset size: {total_size}")
     print(f"Using {num_gpus} GPUs")
-    print(f"Shard size: {shard_size}")
     processes = []
     temp_files = []
     for i in range(num_gpus):
-        start = i * shard_size
-        end = start + shard_size if i < num_gpus - 1 else total_size
-        output_file = f"temp_shard_{i}"
-        temp_files.append(output_file)
         p = mp.Process(
             target=process_shard,
-            args=(i, start, end, model_name, batch_size, input_dataset, output_file),
         )
         p.start()
         processes.append(p)
@@ -168,30 +232,32 @@ def main():
     for p in processes:
         p.join()
         if p.exitcode != 0:
-            print(f"\nProcess failed with exit code {p.exitcode}", flush=True)
-            print("Terminating all processes...", flush=True)
-            for proc in processes:
-                if proc.is_alive():
-                    proc.terminate()
-            for proc in processes:
-                proc.join()
-            raise RuntimeError(f"At least one process failed")
-    print("\nAll processes completed. Loading and concatenating results...")
-    shards = [cast(Dataset, datasets.load_from_disk(f)) for f in temp_files]
-    final_ds = datasets.concatenate_datasets(shards)
-    print(f"Final dataset size: {len(final_ds)}")
-    print("Pushing to hub...")
     final_ds.push_to_hub(output_dataset, create_pr=False)
-    print("Cleaning up temporary files...")
     for f in temp_files:
-        if os.path.exists(f):
-            shutil.rmtree(f)
-    print("Done!")
 if __name__ == "__main__":

     return processor, model
+def build_template(processor):
     msg = [
         {
             "role": "user",
             ],
         }
     ]
     return processor.apply_chat_template(
         msg, add_generation_prompt=True, tokenize=False
     )
def iterable_to_map(ds, chunk_size=10000):
    """Yield the items of ``ds`` as consecutive lists of up to ``chunk_size``.

    Args:
        ds: any iterable of examples.
        chunk_size: number of items per yielded list; the last list may be
            shorter.

    Yields:
        Lists of items in iteration order. Every item of ``ds`` appears in
        exactly one list; an empty ``ds`` yields nothing.
    """
    buffer = []
    for ex in ds:
        buffer.append(ex)
        if len(buffer) >= chunk_size:
            yield buffer
            buffer = []
    # Flush the trailing partial chunk; otherwise the last
    # ``len(ds) % chunk_size`` items would be silently dropped.
    if buffer:
        yield buffer
def cpu_preprocess(input_dataset, output_folder, model_name):
    """Prepare the captioning inputs on CPU and save them to ``output_folder``.

    Loads the ``train`` split of ``input_dataset``, converts every PIL image
    that is not already RGB to RGB, attaches the chat-template prompt built
    from ``model_name``'s processor to each row, and writes the resulting
    dataset (columns ``image`` and ``prompt``) to disk.

    Raises:
        ValueError: if the dataset could not be loaded or has no ``train``
            split.
        TypeError: if the loaded object is not a ``datasets.Dataset``.
    """
    print("CPU preprocessing…")
    processor = AutoProcessor.from_pretrained(model_name)
    # The prompt is identical for every row, so it is built once up front.
    template = build_template(processor)

    def _pp(batch):
        out_images = []
        for img in batch["image"]:
            if isinstance(img, Image.Image):
                if img.mode != "RGB":
                    img = img.convert("RGB")
            # Non-PIL entries are passed through unchanged so the batch keeps
            # its length.
            out_images.append(img)
        prompts = [template] * len(out_images)
        return {
            "image": out_images,
            "prompt": prompts,
        }

    ds = datasets.load_dataset(input_dataset, split="train")
    if ds is None:
        raise ValueError(
            f"Failed to load dataset '{input_dataset}' with split 'train'. Check the dataset name or available splits."
        )
    if isinstance(ds, datasets.DatasetDict):
        if "train" in ds:
            ds = ds["train"]
        else:
            raise ValueError(
                f"'{input_dataset}' does not contain a 'train' split. Available splits: {list(ds.keys())}"
            )
    if not isinstance(ds, datasets.Dataset):
        raise TypeError(f"Expected a Dataset instance, got {type(ds)}")
    print(f"Dataset loaded: {len(ds)} examples")

    ds2 = ds.map(
        _pp,
        batched=True,
        remove_columns=[c for c in ds.column_names if c not in ("image",)],
    )

    print("Saving CPU-preprocessed dataset…")
    # Save the mapped Dataset as-is: it is already a map-style Dataset, so
    # there is no need to rebuild it from in-memory Python lists, and no row
    # can be lost to chunking.
    ds2.save_to_disk(output_folder)
    print("CPU preprocessing done.")
+def caption_batch(batch, processor, model):
+    imgs = batch["image"]
+    prompts = batch["prompt"]
     pil_images = []
+    for image in imgs:
         if isinstance(image, Image.Image):
             if image.mode != "RGB":
                 image = image.convert("RGB")
             pil_images.append(image)
+    inputs = processor(
+        text=prompts, images=pil_images, return_tensors="pt", padding=True
+    )
+    inputs = {
+        k: v.pin_memory().to(model.device, non_blocking=True) for k, v in inputs.items()
+    }
+    with torch.no_grad(), torch.amp.autocast("cuda", dtype=torch.bfloat16):  # type: ignore
         generated = model.generate(
             **inputs,
             max_new_tokens=128,
     decoded = processor.batch_decode(generated, skip_special_tokens=False)
     captions = []
+    special = set(processor.tokenizer.all_special_tokens)
     for d in decoded:
         if "<|im_start|>assistant" in d:
             d = d.split("<|im_start|>assistant")[-1]
+        for token in special:
             d = d.replace(token, "")
+        captions.append(d.strip())
+    return {"text": captions}
def process_shard(
    gpu_id, start, end, model_name, batch_size, prepped_folder, output_file
):
    """Caption rows ``[start, end)`` of the preprocessed dataset on one GPU.

    Binds the process to CUDA device ``gpu_id``, loads the model, reads the
    dataset saved at ``prepped_folder``, selects the requested row range,
    captions it in batches of ``batch_size`` and saves the result (the
    ``image`` and ``prompt`` columns are dropped) to ``output_file``.

    Returns:
        ``output_file`` on success.

    Raises:
        Exception: any error is printed with the GPU id and re-raised.
    """
    try:
        torch.cuda.set_device(gpu_id)

        print(f"[GPU {gpu_id}] Loading model…", flush=True)
        processor, model = load_model(model_name, gpu_id)

        print(f"[GPU {gpu_id}] Loading preprocessed shard [{start}:{end}]…", flush=True)
        loaded = datasets.load_from_disk(prepped_folder)
        # A DatasetDict on disk is unwrapped to its "train" split.
        full = loaded["train"] if isinstance(loaded, datasets.DatasetDict) else loaded
        shard = full.select(range(start, end))

        def _caption(batch):
            return caption_batch(batch, processor, model)

        print(f"[GPU {gpu_id}] Captioning {len(shard)} examples…", flush=True)
        result = shard.map(
            _caption,
            batched=True,
            batch_size=batch_size,
            remove_columns=["image", "prompt"],
        )

        print(f"[GPU {gpu_id}] Saving {output_file}…", flush=True)
        result.save_to_disk(output_file)

        print(f"[GPU {gpu_id}] Done.", flush=True)
        return output_file
    except Exception as e:
        print(f"[GPU {gpu_id}] Error: {e}", flush=True)
        raise
 def main():
+    mp.set_start_method("spawn", force=True)
     input_dataset = "none-yet/anime-captions"
+    prepped_folder = "cpu_preprocessed"
     output_dataset = "nroggendorff/anime-captions"
     model_name = "datalab-to/chandra"
     batch_size = 20
+    if not os.path.exists(prepped_folder):
+        cpu_preprocess(input_dataset, prepped_folder, model_name)
+    ds = datasets.load_from_disk(prepped_folder)
+    total = len(ds)
     num_gpus = torch.cuda.device_count()
+    shard = total // num_gpus
+    print(f"Dataset size: {total}")
     print(f"Using {num_gpus} GPUs")
+    print(f"Shard size: {shard}")
     processes = []
     temp_files = []
     for i in range(num_gpus):
+        s = i * shard
+        e = s + shard if i < num_gpus - 1 else total
+        of = f"temp_shard_{i}"
+        temp_files.append(of)
         p = mp.Process(
             target=process_shard,
+            args=(i, s, e, model_name, batch_size, prepped_folder, of),
         )
         p.start()
         processes.append(p)
     for p in processes:
         p.join()
         if p.exitcode != 0:
+            print("A process failed, aborting…")
+            for q in processes:
+                if q.is_alive():
+                    q.terminate()
+            for q in processes:
+                q.join()
+            raise RuntimeError("GPU worker failed.")
+    print("Merging shards…")
+    parts = []
+    for f in temp_files:
+        ds = datasets.load_from_disk(f)
+        if isinstance(ds, datasets.DatasetDict):
+            ds = ds["train"]
+        parts.append(ds)
+    final_ds = datasets.concatenate_datasets(parts)
+    print(f"Pushing final dataset to {output_dataset}…")
     final_ds.push_to_hub(output_dataset, create_pr=False)
+    print("Cleaning up…")
     for f in temp_files:
+        shutil.rmtree(f, ignore_errors=True)
+    print("Done.")
 if __name__ == "__main__":