stephenebert committed
Commit 2164c4f · verified · 1 Parent(s): add6df8

Update tagger.py

Files changed (1)
  1. tagger.py +56 -49
tagger.py CHANGED
@@ -1,5 +1,14 @@
 from __future__ import annotations
 
+"""
+Image captioning + simple tag extraction (no POS/NLTK).
+
+- Caption: Salesforce/blip-image-captioning-base (Transformers)
+- Tags: first unique meaningful words from the caption (stopwords removed)
+- Sidecar: writes ./data/<stem>.json with {"caption","tags","timestamp"}
+"""
+
+import os
 import datetime as _dt
 import json as _json
 import pathlib as _pl
@@ -7,54 +16,47 @@ import re as _re
 import sys as _sys
 from typing import List
 
-import nltk
+import torch
 from PIL import Image
 from transformers import BlipForConditionalGeneration, BlipProcessor
 
-# ─── ensure punkt + perceptron tagger are downloaded ──────────────────────────
-for res, subdir in [
-    ("punkt", "tokenizers"),
-    ("averaged_perceptron_tagger", "taggers"),
-]:
-    try:
-        nltk.data.find(f"{subdir}/{res}")
-    except LookupError:
-        nltk.download(res, quiet=True)
-
-# ─── where we dump the caption+tags JSON sidecars ──────────────────────────────
-CAP_TAG_DIR = _pl.Path.home() / "Desktop" / "image_tags"
-CAP_TAG_DIR.mkdir(exist_ok=True, parents=True)
+# Where to save caption+tags JSON (writable on Hugging Face Spaces)
+CAP_TAG_DIR = _pl.Path(os.environ.get("CAP_TAG_DIR", "./data")).resolve()
+CAP_TAG_DIR.mkdir(parents=True, exist_ok=True)
 
-# ─── load the BLIP model once ──────────────────────────────────────────────────
+# Device + model
+_device = "cuda" if torch.cuda.is_available() else "cpu"
 _processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+_model = BlipForConditionalGeneration.from_pretrained(
+    "Salesforce/blip-image-captioning-base"
+).to(_device)
 
-# ─── allowed POS prefixes ──────────────────────────────────────────────────────
-_POS = {"nouns": ("NN",), "adjs": ("JJ",), "verbs": ("VB",)}
-
-def _caption_to_tags(
-    caption: str,
-    k: int,
-    keep_nouns: bool,
-    keep_adjs: bool,
-    keep_verbs: bool,
-) -> List[str]:
-    from nltk.tokenize import wordpunct_tokenize
-
-    allowed = []
-    if keep_nouns: allowed += _POS["nouns"]
-    if keep_adjs: allowed += _POS["adjs"]
-    if keep_verbs: allowed += _POS["verbs"]
+# Very small stopword list to keep tags clean (no NLTK required)
+_STOP = {
+    "a", "an", "the", "and", "or", "but", "if", "then", "so", "to", "from",
+    "of", "in", "on", "at", "by", "for", "with", "without", "into", "out",
+    "is", "are", "was", "were", "be", "being", "been", "it", "its", "this",
+    "that", "these", "those", "as", "over", "under", "near", "above", "below",
+    "up", "down", "left", "right"
+}
 
+def _caption_to_tags_simple(caption: str, k: int) -> List[str]:
+    """
+    Convert a caption string to up to k simple tags:
+    - lowercase alphanumeric/hyphen tokens
+    - remove short/stopword tokens
+    - keep first unique occurrences (order-preserving)
+    """
+    tokens = _re.findall(r"[a-z0-9-]+", caption.lower())
     seen, out = set(), []
-    for w, pos in nltk.pos_tag(wordpunct_tokenize(caption.lower())):
-        if any(pos.startswith(pref) for pref in allowed):
-            clean = _re.sub(r"[^a-z0-9-]", "", w)
-            if clean and clean not in seen:
-                out.append(clean)
-                seen.add(clean)
-        if len(out) >= k:
-            break
+    for w in tokens:
+        if len(w) <= 2 or w in _STOP:
+            continue
+        if w not in seen:
+            out.append(w)
+            seen.add(w)
+        if len(out) >= k:
+            break
     return out
 
 def tag_pil_image(
@@ -62,22 +64,26 @@ def tag_pil_image(
     stem: str,
     *,
     top_k: int = 5,
-    keep_nouns: bool = True,
-    keep_adjs: bool = True,
-    keep_verbs: bool = True,
+    keep_nouns: bool = True,  # kept for API compatibility; ignored
+    keep_adjs: bool = True,   # kept for API compatibility; ignored
+    keep_verbs: bool = True,  # kept for API compatibility; ignored
 ) -> List[str]:
-    # 1) generate caption
-    ids = _model.generate(**_processor(images=img, return_tensors="pt"), max_length=30)
+    """Generate a caption and simple tags for a PIL image."""
+    inputs = _processor(images=img, return_tensors="pt")
+    if _device == "cuda":
+        inputs = {k: v.to(_device) for k, v in inputs.items()}
+    with torch.inference_mode():
+        ids = _model.generate(**inputs, max_length=30)
     caption = _processor.decode(ids[0], skip_special_tokens=True)
-    # 2) extract tags
-    tags = _caption_to_tags(caption, top_k, keep_nouns, keep_adjs, keep_verbs)
-    # 3) persist side-car JSON for main.py to read back
+
+    tags = _caption_to_tags_simple(caption, top_k)
+
     payload = {
         "caption": caption,
         "tags": tags,
         "timestamp": _dt.datetime.now(_dt.timezone.utc).isoformat(),
     }
-    (_p := CAP_TAG_DIR / f"{stem}.json").write_text(_json.dumps(payload, indent=2))
+    (CAP_TAG_DIR / f"{stem}.json").write_text(_json.dumps(payload, indent=2))
     return tags
 
 if __name__ == "__main__":
@@ -89,3 +95,4 @@ if __name__ == "__main__":
     k = int(_sys.argv[2]) if len(_sys.argv) > 2 else 5
     with Image.open(path).convert("RGB") as im:
         print("tags:", ", ".join(tag_pil_image(im, path.stem, top_k=k)))
+
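
A minimal sketch of how a caller such as main.py might use the updated module and read the sidecar JSON back. The load_tags helper and the example image file are hypothetical, and the path assumes the default CAP_TAG_DIR of ./data:

import json

from PIL import Image

from tagger import CAP_TAG_DIR, tag_pil_image

def load_tags(stem: str) -> dict:
    # Read back the {"caption", "tags", "timestamp"} payload written by tag_pil_image.
    return json.loads((CAP_TAG_DIR / f"{stem}.json").read_text())

with Image.open("example.jpg").convert("RGB") as im:      # hypothetical input image
    tags = tag_pil_image(im, "example", top_k=5)          # also writes <CAP_TAG_DIR>/example.json
print("tags:", tags)
print("caption:", load_tags("example")["caption"])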