| | import json |
| | import pandas as pd |
| | from sentence_transformers import SentenceTransformer |
| | from pathlib import Path |
| | from tqdm import tqdm |
| |
|
def extract_caption(text_block: str) -> str:
    """Return the caption from the first line carrying a "CAPTION:" marker.

    The marker is matched case-insensitively. Only the first line that
    contains it is considered; the text following the last marker on that
    line is returned with surrounding whitespace stripped.

    Args:
        text_block: Multi-line string to scan.

    Returns:
        The stripped caption text (which may be empty if nothing follows
        the marker), or "" when no line contains the marker.
    """
    import re  # local import: keeps this block independent of file-level imports

    for line in text_block.splitlines():
        # Split case-insensitively so "Caption:" / "caption:" are stripped as
        # well; a case-sensitive split would return the whole line, marker
        # included, for any spelling other than upper-case.
        pieces = re.split("caption:", line, flags=re.IGNORECASE)
        if len(pieces) > 1:
            return pieces[-1].strip()
    return ""
| |
|
def load_captions_from_files(json_files):
    """Gather (image path, caption) data from a collection of JSON files.

    Each file is parsed as JSON and iterated as a mapping. For every entry
    whose value is non-empty and has a non-empty first element, the item at
    ``value[0][0]`` is passed to ``extract_caption``; entries yielding an
    empty caption are dropped.

    Args:
        json_files: Iterable of paths to JSON files.

    Returns:
        A ``(paths, captions)`` pair of equally long lists, where
        ``captions[i]`` was extracted for the key ``paths[i]``.
    """
    kept_paths = []
    kept_captions = []

    for source in tqdm(json_files, desc="Reading files"):
        with open(source, encoding='utf-8') as handle:
            payload = json.load(handle)

        for image_key, nested in payload.items():
            # Only entries with a populated first inner element are usable.
            if nested and nested[0]:
                found = extract_caption(nested[0][0])
                if found:
                    kept_paths.append(image_key)
                    kept_captions.append(found)

    return kept_paths, kept_captions
| |
|
def compute_and_save_embeddings(json_files, output_csv, model_name='all-MiniLM-L6-v2'):
    """Embed every caption found in ``json_files`` and write them to a CSV.

    Captions are collected with ``load_captions_from_files``, encoded with a
    ``SentenceTransformer`` model, and written with an ``image_path`` column
    first, followed by the columns of the embedding frame.

    Args:
        json_files: Iterable of JSON file paths to read captions from.
        output_csv: Destination path for the CSV file.
        model_name: Name of the SentenceTransformer model to load. Defaults
            to the model this script originally hard-coded.

    Returns:
        None. If no captions are found, a message is printed and nothing
        is written.
    """
    # Materialise once so len() in the summary works for any iterable.
    json_files = list(json_files)
    image_paths, captions = load_captions_from_files(json_files)

    if not captions:
        print("No valid captions found across input files.")
        return

    # Load the model only once there is something to embed; constructing it
    # is the expensive step and is wasted when the input has no captions.
    model = SentenceTransformer(model_name)
    embeddings = model.encode(captions, show_progress_bar=True)

    # Presumably one row per caption and one column per embedding dimension
    # -- TODO confirm against the encode() return shape.
    df = pd.DataFrame(embeddings)
    df.insert(0, "image_path", image_paths)
    df.to_csv(output_csv, index=False)
    print(f"Saved {len(df)} embeddings from {len(json_files)} files to {output_csv}")
| |
|
| | |
if __name__ == "__main__":
    import glob

    # glob gives no ordering guarantee (it depends on the file system), so
    # sort the matches to keep the CSV row order reproducible across runs.
    files = sorted(glob.glob("./MBD_text/*.json"))
    compute_and_save_embeddings(files, "combined_caption_embeddings.csv")
| |
|