joy-caption-pre-alpha

Runtime error

App Files Files Community

svjack committed on Jan 18, 2025

Commit

48b0782

verified ·

1 Parent(s): 13f60c5

Update run_caption.py

Browse files

Files changed (1) hide show

run_caption.py +36 -18

run_caption.py CHANGED Viewed

@@ -5,12 +5,15 @@ from torch import nn
 from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM
 from PIL import Image
 import shutil
 CLIP_PATH = "google/siglip-so400m-patch14-384"
 VLM_PROMPT = "A descriptive caption for this image:\n"
 MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"
 CHECKPOINT_PATH = Path("wpkklhc6")
 class ImageAdapter(nn.Module):
     def __init__(self, input_features: int, output_features: int):
         super().__init__()
@@ -24,6 +27,7 @@ class ImageAdapter(nn.Module):
         x = self.linear2(x)
         return x
 def load_models():
     print("Loading CLIP")
     clip_processor = AutoProcessor.from_pretrained(CLIP_PATH)
@@ -49,6 +53,7 @@ def load_models():
     return clip_processor, clip_model, tokenizer, text_model, image_adapter
 @torch.no_grad()
 def generate_caption(input_image: Image.Image, clip_processor, clip_model, tokenizer, text_model, image_adapter):
     torch.cuda.empty_cache()
@@ -97,37 +102,50 @@ def generate_caption(input_image: Image.Image, clip_processor, clip_model, token
     return caption.strip()
 def main():
     parser = argparse.ArgumentParser(description="Generate a caption for an image and save it to a text file.")
-    parser.add_argument("input_image", type=str, help="Path to the input image")
-    parser.add_argument("output_path", type=str, help="Path to save the output image and caption text file")
     args = parser.parse_args()
     # Load models
     clip_processor, clip_model, tokenizer, text_model, image_adapter = load_models()
-    # Open the input image
-    input_image = Image.open(args.input_image)
-    # Generate caption
-    caption = generate_caption(input_image, clip_processor, clip_model, tokenizer, text_model, image_adapter)
-    # Process output path
     output_path = Path(args.output_path)
     output_path.mkdir(parents=True, exist_ok=True)
-    # Copy image to output path
-    image_name = Path(args.input_image).name
-    image_name = image_name.replace(" ", "_")  # Replace spaces with underscores
-    output_image_path = output_path / image_name
-    shutil.copy(args.input_image, output_image_path)
-    # Save caption to txt file
-    txt_file_path = output_path / f"{output_image_path.stem}.txt"
-    with open(txt_file_path, "w") as f:
-        f.write(caption)
-    print(f"Caption saved to {txt_file_path}")
 if __name__ == "__main__":
     main()

 from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM
 from PIL import Image
 import shutil
+from tqdm import tqdm  # 引入 tqdm 用于显示进度条
+# Constants
 CLIP_PATH = "google/siglip-so400m-patch14-384"
 VLM_PROMPT = "A descriptive caption for this image:\n"
 MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"
 CHECKPOINT_PATH = Path("wpkklhc6")
+# Image Adapter
 class ImageAdapter(nn.Module):
     def __init__(self, input_features: int, output_features: int):
         super().__init__()
         x = self.linear2(x)
         return x
+# Load models
 def load_models():
     print("Loading CLIP")
     clip_processor = AutoProcessor.from_pretrained(CLIP_PATH)
     return clip_processor, clip_model, tokenizer, text_model, image_adapter
+# Generate caption
 @torch.no_grad()
 def generate_caption(input_image: Image.Image, clip_processor, clip_model, tokenizer, text_model, image_adapter):
     torch.cuda.empty_cache()
     return caption.strip()
+# Main function
 def main():
     parser = argparse.ArgumentParser(description="Generate a caption for an image and save it to a text file.")
+    parser.add_argument("input_path", type=str, help="Path to the input image or directory containing images")
+    parser.add_argument("output_path", type=str, help="Path to save the output images and captions")
     args = parser.parse_args()
     # Load models
     clip_processor, clip_model, tokenizer, text_model, image_adapter = load_models()
+    # Determine if input is a directory or a single file
+    input_path = Path(args.input_path)
+    if input_path.is_dir():
+        image_paths = list(input_path.glob("*.[pjP][npP][gG]")) + list(input_path.glob("*.[jJ][pP][eE][gG]"))  # 支持 PNG 和 JPEG 格式
+    else:
+        image_paths = [input_path]
+    # Create output directory if it doesn't exist
     output_path = Path(args.output_path)
     output_path.mkdir(parents=True, exist_ok=True)
+    # Process each image
+    for image_path in tqdm(image_paths, desc="Processing images"):
+        try:
+            # Open the input image
+            input_image = Image.open(image_path)
+            # Generate caption
+            caption = generate_caption(input_image, clip_processor, clip_model, tokenizer, text_model, image_adapter)
+            # Copy image to output path
+            image_name = image_path.name.replace(" ", "_")  # Replace spaces with underscores
+            output_image_path = output_path / image_name
+            shutil.copy(image_path, output_image_path)
+            # Save caption to txt file
+            txt_file_path = output_path / f"{output_image_path.stem}.txt"
+            with open(txt_file_path, "w") as f:
+                f.write(caption)
+            print(f"Caption saved to {txt_file_path}")
+        except Exception as e:
+            print(f"Error processing {image_path}: {e}")
 if __name__ == "__main__":
     main()