Upload 3 files
- Dockerfile +19 -0
- app.py +304 -0
- requirements.txt +7 -0
Dockerfile
ADDED
@@ -0,0 +1,19 @@
FROM python:3.13.5-slim

WORKDIR /app

COPY requirements.txt ./

RUN pip install --no-cache-dir -r requirements.txt

COPY . .

ENV GRADIO_SERVER_NAME=0.0.0.0
ENV GRADIO_SERVER_PORT=7860
ENV PORT=7860

EXPOSE 7860

HEALTHCHECK CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:7860')" || exit 1

ENTRYPOINT ["python", "app.py"]
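The HEALTHCHECK line probes the Gradio server over plain HTTP. The same probe can be run by hand when a container reports unhealthy; a minimal sketch, assuming the app is already listening on port 7860 (the port set above):

import urllib.request

# Mirrors the Dockerfile HEALTHCHECK: any successful HTTP response means the UI is reachable.
with urllib.request.urlopen("http://localhost:7860", timeout=5) as response:
    print("healthy", response.status)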
app.py
ADDED
@@ -0,0 +1,304 @@
import functools
import os
from pathlib import Path
from typing import Iterable, List, Tuple

import gradio as gr
import torch
from huggingface_hub import snapshot_download
from PIL import Image
from transformers import AutoProcessor, Blip2ForConditionalGeneration


def is_writable(path: Path) -> bool:
    try:
        path.mkdir(parents=True, exist_ok=True)
        probe = path / ".probe"
        probe.write_text("ok", encoding="utf-8")
        probe.unlink(missing_ok=True)
        return True
    except Exception:
        return False


def pick_writable_base() -> Path:
    for candidate in (
        os.getenv("SPACE_PERSISTENT_DIR"),
        "/data",
        "/app",
        "/tmp",
    ):
        if candidate and is_writable(Path(candidate)):
            return Path(candidate)
    return Path("/tmp")


def set_env_dir(key: str, path: Path) -> None:
    path.mkdir(parents=True, exist_ok=True)
    os.environ[key] = str(path)


BASE_DIR = pick_writable_base()


set_env_dir("HOME", BASE_DIR)
set_env_dir("XDG_CACHE_HOME", BASE_DIR / ".cache")
set_env_dir("HF_HOME", BASE_DIR / ".cache" / "huggingface")
set_env_dir("TRANSFORMERS_CACHE", BASE_DIR / ".cache" / "huggingface" / "transformers")
set_env_dir("HF_HUB_CACHE", BASE_DIR / ".cache" / "huggingface" / "hub")

os.environ["GRADIO_ANALYTICS_ENABLED"] = "false"
os.environ["OMP_NUM_THREADS"] = "2"
os.environ["MKL_NUM_THREADS"] = "2"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

torch.set_num_threads(2)


MODEL_REPO = "meettilavat/imagecaptioning"
SUBFOLDER_PREFIX = "outputs/blip2_full_ft_stage2"
LOCAL_DIR = Path(os.environ["HF_HOME"]) / "models" / "imagecaptioning"
DEFAULT_PROMPT = "Describe the image in detail."


def _allow_patterns() -> Iterable[str]:
    yield f"{SUBFOLDER_PREFIX}/model/config.json"
    yield f"{SUBFOLDER_PREFIX}/model/generation_config.json"
    yield f"{SUBFOLDER_PREFIX}/model/model.safetensors"
    yield f"{SUBFOLDER_PREFIX}/model/model.safetensors.index.json"
    yield f"{SUBFOLDER_PREFIX}/model/model-*.safetensors"
    yield f"{SUBFOLDER_PREFIX}/processor/*"


@functools.lru_cache(maxsize=1)
def prepare_local_snapshot() -> Path:
    root = snapshot_download(
        repo_id=MODEL_REPO,
        local_dir=str(LOCAL_DIR),
        local_dir_use_symlinks=False,
        allow_patterns=list(_allow_patterns()),
    )
    return Path(root)


@functools.lru_cache(maxsize=1)
def load_model() -> Tuple[AutoProcessor, Blip2ForConditionalGeneration, torch.device, torch.dtype]:
    repo_root = prepare_local_snapshot()
    base = repo_root / SUBFOLDER_PREFIX
    processor_dir = base / "processor"
    model_dir = base / "model"

    device = torch.device("cpu")
    dtype: torch.dtype = torch.bfloat16
    processor = AutoProcessor.from_pretrained(processor_dir)
    try:
        model = Blip2ForConditionalGeneration.from_pretrained(
            model_dir,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
        )
    except Exception:
        dtype = torch.float32
        model = Blip2ForConditionalGeneration.from_pretrained(
            model_dir,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
        )
    model = model.to(device).eval()
    return processor, model, device, dtype


def generate_caption(
    processor: AutoProcessor,
    model: Blip2ForConditionalGeneration,
    device: torch.device,
    dtype: torch.dtype,
    image: Image.Image,
    prompt: str,
    max_new_tokens: int,
    num_beams: int,
) -> str:
    inputs = processor(images=image, text=prompt, return_tensors="pt")
    pixel_values = inputs["pixel_values"].to(device=device, dtype=dtype)
    input_ids = inputs.get("input_ids")
    attention_mask = inputs.get("attention_mask")

    if input_ids is not None:
        input_ids = input_ids.to(device)
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)

    with torch.inference_mode():
        generated = model.generate(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            do_sample=False,
        )
    return processor.batch_decode(generated, skip_special_tokens=True)[0].strip()


def batched_predictions(
    processor: AutoProcessor,
    model: Blip2ForConditionalGeneration,
    device: torch.device,
    dtype: torch.dtype,
    image: Image.Image,
    prompt: str,
    max_new_tokens: int,
    beam_options: List[int],
) -> List[Tuple[int, str]]:
    outputs: List[Tuple[int, str]] = []
    for beams in beam_options:
        caption = generate_caption(
            processor,
            model,
            device,
            dtype,
            image,
            prompt,
            max_new_tokens,
            beams,
        )
        outputs.append((beams, caption))
    return outputs


processor, model, device, dtype = load_model()


def run_inference(
    image: Image.Image,
    prompt: str,
    max_new_tokens: int,
    beam_mode: str,
    single_beam: int,
    compare_beams: List[str],
) -> str:
    if image is None:
        raise gr.Error("Please upload an image first.")

    clean_prompt = (prompt or "").strip() or DEFAULT_PROMPT

    if beam_mode == "Single":
        beam_list = [int(single_beam or 4)]
    else:
        default_options = [2, 4, 6]
        if not compare_beams:
            beam_list = default_options
        else:
            deduped = []
            for value in compare_beams:
                beam = int(value)
                if beam not in deduped:
                    deduped.append(beam)
                if len(deduped) == 4:
                    break
            beam_list = deduped or default_options

    results = batched_predictions(
        processor,
        model,
        device,
        dtype,
        image.convert("RGB"),
        clean_prompt,
        max_new_tokens,
        beam_list,
    )

    blocks = []
    for beams, text in results:
        blocks.append(f"**Beam width {beams}**\n{text}")
    return "\n\n".join(blocks)


def update_beam_visibility(choice: str):
    single_visible = choice == "Single"
    compare_visible = choice == "Compare"
    return (
        gr.update(visible=single_visible),  # gr.update(): the per-component .update() classmethod was removed in Gradio 4
        gr.update(visible=compare_visible),
    )

with gr.Blocks(title="BLIP-2 Image Captioning") as demo:
    gr.Markdown("# BLIP-2 Image Captioning (H200 fine-tuned)")
    gr.Markdown(
        "Upload an image, tweak decoding settings, and optionally compare beam widths side by side."
    )

    with gr.Row():
        with gr.Column(scale=6, min_width=320):
            image_input = gr.Image(
                label="Upload an image",
                type="pil",
                image_mode="RGB",
            )
            prompt_input = gr.Textbox(
                label="Prompt",
                value=DEFAULT_PROMPT,
                lines=3,
                placeholder="Describe the instruction for BLIP-2",
            )
            max_tokens_input = gr.Slider(
                label="Max new tokens",
                minimum=16,
                maximum=128,
                step=8,
                value=56,
            )
            beam_mode_input = gr.Radio(
                label="Beam mode",
                choices=["Single", "Compare"],
                value="Single",
                info="Use a single beam width or compare several options simultaneously.",
            )
            single_beam_slider = gr.Slider(
                label="Beam width",
                minimum=1,
                maximum=8,
                step=1,
                value=4,
            )
            compare_beams_group = gr.CheckboxGroup(
                label="Select beam widths",
                choices=[str(i) for i in range(1, 9)],
                value=["2", "4", "6"],
                interactive=True,
                visible=False,
            )
            run_button = gr.Button("Generate caption(s)")

        with gr.Column(scale=9):
            caption_output = gr.Markdown(value="Upload an image to preview captions.")
            gr.Markdown(
                f"Running inference on {device.type.upper()} with dtype {dtype}. "
                "Compare beams to balance diversity vs. precision."
            )

    beam_mode_input.change(
        fn=update_beam_visibility,
        inputs=beam_mode_input,
        outputs=[single_beam_slider, compare_beams_group],
    )

    run_button.click(
        fn=run_inference,
        inputs=[
            image_input,
            prompt_input,
            max_tokens_input,
            beam_mode_input,
            single_beam_slider,
            compare_beams_group,
        ],
        outputs=caption_output,
        api_name="generate",
    )


if __name__ == "__main__":
    demo.launch()
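Because run_button.click registers run_inference under api_name="generate", the captioning path can also be exercised without the browser UI. A minimal smoke test, assuming a local test image at the hypothetical path example.jpg; importing app triggers the snapshot download and model load:

from PIL import Image

from app import run_inference  # loads the BLIP-2 checkpoint at import time

sample = Image.open("example.jpg")  # hypothetical test image; run_inference converts it to RGB
caption = run_inference(sample, "Describe the image in detail.", 56, "Single", 4, [])
print(caption)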
requirements.txt
ADDED
@@ -0,0 +1,7 @@
torch>=2.3,<2.8
transformers>=4.56
huggingface_hub>=0.24
timm>=1.0.19
sentencepiece>=0.2.1
gradio>=4.44
Pillow>=10.4
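A quick sanity check that the versions resolved inside the image actually satisfy these ranges; a minimal sketch:

import gradio, huggingface_hub, PIL, sentencepiece, timm, torch, transformers

# Print the resolved versions for comparison against the pins above.
for module in (torch, transformers, huggingface_hub, timm, sentencepiece, gradio, PIL):
    print(module.__name__, getattr(module, "__version__", "unknown"))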