Spaces:

griddev
/

project_02_DS

Running

App Files Files Community

griddev committed about 12 hours ago

Commit

bba7394

verified ·

1 Parent(s): 6455831

Deploy Streamlit Space app

Browse files

Files changed (1) hide show

app.py +124 -64

app.py CHANGED Viewed

@@ -165,37 +165,54 @@ DEFAULT_SHAKESPEARE_WEIGHTS = "./shakespeare_transformer.pt"
 WEIGHTS_REPO_ID = os.getenv("WEIGHTS_REPO_ID", "griddev/vlm-caption-weights")
 WEIGHTS_CACHE_DIR = os.getenv("WEIGHTS_CACHE_DIR", "./weights_bundle")
-def _resolve_weight_paths():
     output_root = DEFAULT_OUTPUT_ROOT
     shakespeare_file = DEFAULT_SHAKESPEARE_FILE
     shakespeare_weights = DEFAULT_SHAKESPEARE_WEIGHTS
-    local_ready = (
-        os.path.isdir(output_root)
-        and os.path.exists(shakespeare_file)
-        and os.path.exists(shakespeare_weights)
     )
-    if local_ready:
         return output_root, shakespeare_file, shakespeare_weights
     try:
-        from huggingface_hub import snapshot_download
-        snapshot_download(
-            repo_id=WEIGHTS_REPO_ID,
-            repo_type="model",
-            local_dir=WEIGHTS_CACHE_DIR,
-            local_dir_use_symlinks=False,
-            allow_patterns=[
-                "outputs/*",
-                "outputs/**/*",
-                "input.txt",
-                "shakespeare_transformer.pt",
-            ],
-        )
-        candidate_output_root = os.path.join(WEIGHTS_CACHE_DIR, "outputs")
-        candidate_shakespeare_file = os.path.join(WEIGHTS_CACHE_DIR, "input.txt")
         candidate_shakespeare_weights = os.path.join(
-            WEIGHTS_CACHE_DIR, "shakespeare_transformer.pt"
         )
         if os.path.isdir(candidate_output_root):
             output_root = candidate_output_root
@@ -209,9 +226,6 @@ def _resolve_weight_paths():
     return output_root, shakespeare_file, shakespeare_weights
-OUTPUT_ROOT, SHAKESPEARE_FILE, SHAKESPEARE_WEIGHTS_PATH = _resolve_weight_paths()
 # ─────────────────────────────────────────────────────────────────────────────
 # Device
 # ─────────────────────────────────────────────────────────────────────────────
@@ -228,12 +242,36 @@ def get_device():
 def _has_finetuned(model_dir, subdir):
     """Check if a fine-tuned checkpoint exists for a given model + subdir."""
-    path = os.path.join(OUTPUT_ROOT, model_dir, subdir)
-    return os.path.isdir(path) and len(os.listdir(path)) > 0
-def _ckpt_path(model_dir, subdir):
-    return os.path.join(OUTPUT_ROOT, model_dir, subdir)
 # ─────────────────────────────────────────────────────────────────────────────
@@ -250,7 +288,10 @@ def load_blip(weight_source="base"):
         "Salesforce/blip-image-captioning-base")
     if weight_source != "base":
-        ckpt = _ckpt_path("blip", weight_source)
         if os.path.isdir(ckpt) and os.listdir(ckpt):
             try:
                 loaded = BlipForConditionalGeneration.from_pretrained(ckpt)
@@ -276,7 +317,10 @@ def load_vit_gpt2(weight_source="base"):
     model.config.pad_token_id = tokenizer.pad_token_id
     if weight_source != "base":
-        ckpt = _ckpt_path("vit_gpt2", weight_source)
         if os.path.isdir(ckpt) and os.listdir(ckpt):
             try:
                 loaded = VisionEncoderDecoderModel.from_pretrained(ckpt)
@@ -298,7 +342,10 @@ def load_git(weight_source="base"):
     model = AutoModelForCausalLM.from_pretrained(model_id)
     if weight_source != "base":
-        ckpt = _ckpt_path("git", weight_source)
         if os.path.isdir(ckpt) and os.listdir(ckpt):
             try:
                 loaded = AutoModelForCausalLM.from_pretrained(ckpt)
@@ -317,9 +364,12 @@ def load_custom_vlm(weight_source="base"):
     from config import CFG
     device = get_device()
     cfg = CFG()
-    cfg.output_root = OUTPUT_ROOT
-    cfg.shakespeare_file = SHAKESPEARE_FILE
-    cfg.shakespeare_weights_path = SHAKESPEARE_WEIGHTS_PATH
     if not os.path.exists(cfg.shakespeare_file):
         return None, None, None, None, device
@@ -518,36 +568,27 @@ with st.sidebar:
     st.markdown("### 🔬 VLM Caption Lab")
     st.markdown("---")
     # ── Weight Source ─────────────────────────────────────────────────────────
-    weight_options = {
-        "🔵 Base (Pretrained)": "base",
-        "🟢 Fine-tuned (Best)": "best",
-        "🟡 Fine-tuned (Latest)": "latest",
-    }
     weight_choice = st.radio(
         "**Weight Source**", list(weight_options.keys()), index=0,
         help="Base = HuggingFace pretrained. Best/Latest = your fine-tuned checkpoints."
     )
     weight_source = weight_options[weight_choice]
-    # Show availability indicators
-    ft_status = []
-    for mdl_dir, mdl_name in [("blip", "BLIP"), ("vit_gpt2", "ViT-GPT2"),
-                               ("git", "GIT"), ("custom_vlm", "Custom VLM")]:
-        has_best = _has_finetuned(mdl_dir, "best")
-        has_latest = _has_finetuned(mdl_dir, "latest")
-        if has_best or has_latest:
-            ft_status.append(f"  ✅ {mdl_name}")
-        else:
-            ft_status.append(f"  ⬜ {mdl_name}")
-    if weight_source != "base":
-        st.caption("Fine-tuned checkpoints:\n" + "\n".join(ft_status))
     st.markdown("---")
-    # ── Architecture Selector ─────────────────────────────────────────────────
-    selected_model = st.selectbox("**Architecture**", MODEL_KEYS, index=0)
     if selected_model in ("BLIP (Multimodal Mixture Attention)",
                           "ViT-GPT2 (Standard Cross-Attention)"):
         mode_options = [
@@ -678,14 +719,21 @@ with tab_caption:
     with col_result:
         if uploaded_file and generate_btn:
-            with st.spinner(f"Loading {MODEL_SHORT[selected_model]} ({weight_source}) + generating…"):
                 try:
                     caption = generate_caption(
                         selected_model, selected_mode, image,
                         num_beams=num_beams,
                         max_new_tokens=max_new_tokens,
                         length_penalty=length_penalty,
-                        weight_source=weight_source,
                     )
                 except Exception as e:
                     st.error(f"Generation error: {e}")
@@ -693,7 +741,7 @@ with tab_caption:
             if caption:
                 render_caption_card(
-                    selected_model, caption, weight_source,
                     num_beams, length_penalty, max_new_tokens,
                     container=st,
                 )
@@ -745,7 +793,7 @@ with tab_compare:
     with col_ctrl:
         if compare_file:
             compare_image = Image.open(compare_file).convert("RGB")
-            st.image(compare_image, caption="Comparison Image", width="stretch")
     compare_btn = st.button("🚀 Compare All 4 Models",
                              disabled=(compare_file is None or not is_common_mode),
@@ -754,6 +802,18 @@ with tab_compare:
     if compare_file and compare_btn:
         compare_image = Image.open(compare_file).convert("RGB")
         # Generate captions from all 4 models
         results = {}
         progress = st.progress(0, text="Starting comparison...")
@@ -784,7 +844,7 @@ with tab_compare:
                     num_beams=num_beams,
                     max_new_tokens=max_new_tokens,
                     length_penalty=length_penalty,
-                    weight_source=weight_source,
                 )
                 results[model_key] = cap
             except Exception as e:
@@ -804,7 +864,7 @@ with tab_compare:
             cap = results.get(model_key, "[Not available]")
             with col:
                 render_caption_card(
-                    model_key, cap, weight_source,
                     num_beams, length_penalty, max_new_tokens,
                     container=st,
                     card_class="compare-card",

 WEIGHTS_REPO_ID = os.getenv("WEIGHTS_REPO_ID", "griddev/vlm-caption-weights")
 WEIGHTS_CACHE_DIR = os.getenv("WEIGHTS_CACHE_DIR", "./weights_bundle")
# Maps each architecture label (as used for the model selector) to the
# directory name under which that model's checkpoints are stored.
MODEL_DIR = dict(
    [
        ("BLIP (Multimodal Mixture Attention)", "blip"),
        ("ViT-GPT2 (Standard Cross-Attention)", "vit_gpt2"),
        ("GIT (Zero Cross-Attention)", "git"),
        ("Custom VLM (Shakespeare Prefix)", "custom_vlm"),
    ]
)
+OUTPUT_ROOT = DEFAULT_OUTPUT_ROOT
@st.cache_resource(show_spinner=False)
def _download_weights(need_outputs: bool, need_shakespeare: bool) -> str:
    """Download the requested parts of the weights bundle from the Hub.

    Args:
        need_outputs: Include everything under ``outputs/``.
        need_shakespeare: Include ``input.txt`` and
            ``shakespeare_transformer.pt``.

    Returns:
        ``WEIGHTS_CACHE_DIR`` when neither part is requested; otherwise the
        value returned by ``snapshot_download`` for ``WEIGHTS_REPO_ID``
        downloaded into ``WEIGHTS_CACHE_DIR``.
    """
    # Imported lazily so the dependency is only needed when this is called.
    from huggingface_hub import snapshot_download

    wanted = []
    if need_outputs:
        wanted.extend(("outputs/*", "outputs/**/*"))
    if need_shakespeare:
        wanted.extend(("input.txt", "shakespeare_transformer.pt"))

    # Nothing requested: skip the download call entirely.
    if not wanted:
        return WEIGHTS_CACHE_DIR

    download_kwargs = {
        "repo_id": WEIGHTS_REPO_ID,
        "repo_type": "model",
        "local_dir": WEIGHTS_CACHE_DIR,
        "local_dir_use_symlinks": False,
        "allow_patterns": wanted,
    }
    return snapshot_download(**download_kwargs)
+def _resolve_weight_paths(need_outputs: bool, need_shakespeare: bool):
     output_root = DEFAULT_OUTPUT_ROOT
     shakespeare_file = DEFAULT_SHAKESPEARE_FILE
     shakespeare_weights = DEFAULT_SHAKESPEARE_WEIGHTS
+    have_outputs = os.path.isdir(output_root) and len(os.listdir(output_root)) > 0
+    have_shakespeare = (
+        os.path.exists(shakespeare_file) and os.path.exists(shakespeare_weights)
     )
+    if (not need_outputs or have_outputs) and (not need_shakespeare or have_shakespeare):
         return output_root, shakespeare_file, shakespeare_weights
     try:
+        cache_dir = _download_weights(need_outputs, need_shakespeare)
+        candidate_output_root = os.path.join(cache_dir, "outputs")
+        candidate_shakespeare_file = os.path.join(cache_dir, "input.txt")
         candidate_shakespeare_weights = os.path.join(
+            cache_dir, "shakespeare_transformer.pt"
         )
         if os.path.isdir(candidate_output_root):
             output_root = candidate_output_root
     return output_root, shakespeare_file, shakespeare_weights
 # ─────────────────────────────────────────────────────────────────────────────
 # Device
 # ─────────────────────────────────────────────────────────────────────────────
 def _has_finetuned(model_dir, subdir):
     """Report whether a non-empty checkpoint directory exists for a model.

     Two locations are checked, in order: ``model_dir/subdir`` under
     ``DEFAULT_OUTPUT_ROOT``, then the same relative path under the
     ``outputs`` folder of ``WEIGHTS_CACHE_DIR``. Returns ``True`` as soon
     as one of them is an existing directory with at least one entry.
     """
     default_ckpt = os.path.join(DEFAULT_OUTPUT_ROOT, model_dir, subdir)
     cached_ckpt = os.path.join(WEIGHTS_CACHE_DIR, "outputs", model_dir, subdir)
     return any(
         os.path.isdir(ckpt) and bool(os.listdir(ckpt))
         for ckpt in (default_ckpt, cached_ckpt)
     )
def _ckpt_path(output_root, model_dir, subdir):
    """Build the checkpoint directory path ``output_root/model_dir/subdir``."""
    segments = (output_root, model_dir, subdir)
    return os.path.join(*segments)
def _resolve_weight_source_for_model(model_name, requested_source):
    """Decide which weight source to actually use for ``model_name``.

    Returns a ``(source, warning)`` tuple. ``source`` is ``requested_source``
    when it is ``"base"``, when ``model_name`` has no entry in ``MODEL_DIR``,
    or when a fine-tuned checkpoint for it is found. Otherwise ``source`` is
    ``"base"`` and ``warning`` is a user-facing message explaining the
    fallback. ``warning`` is ``None`` whenever no fallback happened.
    """
    keep_requested = (requested_source, None)

    # Base weights and unmapped models pass through untouched.
    if requested_source == "base":
        return keep_requested
    model_dir = MODEL_DIR.get(model_name)
    if not model_dir:
        return keep_requested

    if not _has_finetuned(model_dir, requested_source):
        # Not found yet: let _resolve_weight_paths try to make the weights
        # available (it may download them), then look again.
        _resolve_weight_paths(
            need_outputs=True,
            need_shakespeare=(model_dir == "custom_vlm"),
        )
        if not _has_finetuned(model_dir, requested_source):
            short_name = MODEL_SHORT.get(model_name, model_name)
            return "base", f"{short_name} has no '{requested_source}' weights. Using base."

    return keep_requested
 # ─────────────────────────────────────────────────────────────────────────────
         "Salesforce/blip-image-captioning-base")
     if weight_source != "base":
+        output_root, _, _ = _resolve_weight_paths(
+            need_outputs=True, need_shakespeare=False
+        )
+        ckpt = _ckpt_path(output_root, "blip", weight_source)
         if os.path.isdir(ckpt) and os.listdir(ckpt):
             try:
                 loaded = BlipForConditionalGeneration.from_pretrained(ckpt)
     model.config.pad_token_id = tokenizer.pad_token_id
     if weight_source != "base":
+        output_root, _, _ = _resolve_weight_paths(
+            need_outputs=True, need_shakespeare=False
+        )
+        ckpt = _ckpt_path(output_root, "vit_gpt2", weight_source)
         if os.path.isdir(ckpt) and os.listdir(ckpt):
             try:
                 loaded = VisionEncoderDecoderModel.from_pretrained(ckpt)
     model = AutoModelForCausalLM.from_pretrained(model_id)
     if weight_source != "base":
+        output_root, _, _ = _resolve_weight_paths(
+            need_outputs=True, need_shakespeare=False
+        )
+        ckpt = _ckpt_path(output_root, "git", weight_source)
         if os.path.isdir(ckpt) and os.listdir(ckpt):
             try:
                 loaded = AutoModelForCausalLM.from_pretrained(ckpt)
     from config import CFG
     device = get_device()
     cfg = CFG()
+    output_root, shakespeare_file, shakespeare_weights = _resolve_weight_paths(
+        need_outputs=(weight_source != "base"), need_shakespeare=True
+    )
+    cfg.output_root = output_root
+    cfg.shakespeare_file = shakespeare_file
+    cfg.shakespeare_weights_path = shakespeare_weights
     if not os.path.exists(cfg.shakespeare_file):
         return None, None, None, None, device
     st.markdown("### 🔬 VLM Caption Lab")
     st.markdown("---")
+    # ── Architecture Selector ─────────────────────────────────────────────────
+    selected_model = st.selectbox("**Architecture**", MODEL_KEYS, index=0)
     # ── Weight Source ─────────────────────────────────────────────────────────
+    model_dir = MODEL_DIR.get(selected_model)
+    weight_options = {"🔵 Base (Pretrained)": "base"}
+    if model_dir and _has_finetuned(model_dir, "best"):
+        weight_options["🟢 Fine-tuned (Best)"] = "best"
+    if model_dir and _has_finetuned(model_dir, "latest"):
+        weight_options["🟡 Fine-tuned (Latest)"] = "latest"
     weight_choice = st.radio(
         "**Weight Source**", list(weight_options.keys()), index=0,
         help="Base = HuggingFace pretrained. Best/Latest = your fine-tuned checkpoints."
     )
     weight_source = weight_options[weight_choice]
+    if len(weight_options) == 1:
+        st.caption("Fine-tuned weights not available for this model.")
     st.markdown("---")
     if selected_model in ("BLIP (Multimodal Mixture Attention)",
                           "ViT-GPT2 (Standard Cross-Attention)"):
         mode_options = [
     with col_result:
         if uploaded_file and generate_btn:
+            resolved_source, warning_msg = _resolve_weight_source_for_model(
+                selected_model, weight_source
+            )
+            if warning_msg:
+                st.warning(warning_msg)
+            with st.spinner(
+                f"Loading {MODEL_SHORT[selected_model]} ({resolved_source}) + generating…"
+            ):
                 try:
                     caption = generate_caption(
                         selected_model, selected_mode, image,
                         num_beams=num_beams,
                         max_new_tokens=max_new_tokens,
                         length_penalty=length_penalty,
+                        weight_source=resolved_source,
                     )
                 except Exception as e:
                     st.error(f"Generation error: {e}")
             if caption:
                 render_caption_card(
+                    selected_model, caption, resolved_source,
                     num_beams, length_penalty, max_new_tokens,
                     container=st,
                 )
     with col_ctrl:
         if compare_file:
             compare_image = Image.open(compare_file).convert("RGB")
+            st.image(compare_image, caption="Comparison Image", use_column_width=True)
     compare_btn = st.button("🚀 Compare All 4 Models",
                              disabled=(compare_file is None or not is_common_mode),
     if compare_file and compare_btn:
         compare_image = Image.open(compare_file).convert("RGB")
+        resolved_sources = {}
+        warnings = []
+        for model_key in MODEL_KEYS:
+            resolved, warning_msg = _resolve_weight_source_for_model(
+                model_key, weight_source
+            )
+            resolved_sources[model_key] = resolved
+            if warning_msg:
+                warnings.append(warning_msg)
+        for msg in sorted(set(warnings)):
+            st.warning(msg)
         # Generate captions from all 4 models
         results = {}
         progress = st.progress(0, text="Starting comparison...")
                     num_beams=num_beams,
                     max_new_tokens=max_new_tokens,
                     length_penalty=length_penalty,
+                    weight_source=resolved_sources.get(model_key, weight_source),
                 )
                 results[model_key] = cap
             except Exception as e:
             cap = results.get(model_key, "[Not available]")
             with col:
                 render_caption_card(
+                    model_key, cap, resolved_sources.get(model_key, weight_source),
                     num_beams, length_penalty, max_new_tokens,
                     container=st,
                     card_class="compare-card",