Andy-6 committed on
Commit
35b0ab1
·
1 Parent(s): 1694430

initial commit

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ data/images/
README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ # Text-to-Image-Retrieval App
app.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app.py
3
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
4
+ Gradio web app for text-to-image retrieval.
5
+
6
+ How it works:
7
+ 1. At startup: load CLIP (text encoder) + ChromaDB collection (pre-built)
8
+ 2. On query : encode the user's text prompt → cosine search → top-K images
9
+
10
+ Image source (automatic fallback):
11
+ - Local : if data/images/ exists and contains files → serve from disk
12
+ - Remote : otherwise → load images from HuggingFace Flickr8k dataset
13
+
14
+ Run locally:
15
+ python app.py
16
+
17
+ Deploy to HuggingFace Spaces:
18
+ Push this file + requirements.txt + chroma_db/ to your Space.
19
+ (data/images/ is optional — if absent, images are loaded from HuggingFace)
20
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
21
+ """
22
+
23
+ from pathlib import Path
24
+
25
+ import chromadb
26
+ import gradio as gr
27
+ import torch
28
+ from PIL import Image
29
+ from transformers import CLIPModel, CLIPProcessor
30
+
31
+ # ── Config ────────────────────────────────────────────────────────────────────
32
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
33
+ MODEL_NAME = "openai/clip-vit-base-patch16"
34
+ IMAGES_DIR = Path("data/images")
35
+ CHROMA_DIR = Path("chroma_db")
36
+ COLLECTION = "flickr8k"
37
+ DEFAULT_TOPK = 10
38
+ MAX_TOPK = 20
39
+ # ──────────────────────────────────────────────────────────────────────────────
40
+
41
+
42
+ # ── Load CLIP ─────────────────────────────────────────────────────────────────
43
+ print(f"\nStarting up on device: {DEVICE}")
44
+
45
+ print("Loading CLIP model …")
46
+ model = CLIPModel.from_pretrained(MODEL_NAME).to(DEVICE)
47
+ processor = CLIPProcessor.from_pretrained(MODEL_NAME)
48
+ model.eval()
49
+ print(" CLIP ready.\n")
50
+
51
+
52
+ # ── Connect to ChromaDB ───────────────────────────────────────────────────────
53
+ print("Connecting to ChromaDB …")
54
+ if not (CHROMA_DIR / "chroma.sqlite3").exists():
55
+ raise FileNotFoundError(
56
+ f"ChromaDB not found at '{CHROMA_DIR}'. "
57
+ "Run build_index.py first, then re-launch."
58
+ )
59
+ chroma_client = chromadb.PersistentClient(path=str(CHROMA_DIR))
60
+ collection = chroma_client.get_collection(COLLECTION)
61
+ print(f" Collection ready: {collection.count()} images indexed.\n")
62
+
63
+
64
+ # ── Image source: local disk or HuggingFace dataset ──────────────────────────
65
+ USE_LOCAL_IMAGES = IMAGES_DIR.exists() and any(IMAGES_DIR.iterdir())
66
+
67
+ if USE_LOCAL_IMAGES:
68
+ print(f"Image source: local disk ({IMAGES_DIR})\n")
69
+ dataset = None
70
+ else:
71
+ print("Image source: HuggingFace dataset (data/images/ not found locally)")
72
+ print("Loading Flickr8k …")
73
+ from datasets import load_dataset
74
+ dataset = load_dataset("jxie/flickr8k", split="train+validation+test")
75
+ print(f" Dataset ready: {len(dataset)} images.\n")
76
+
77
+
78
+ # ── Helper: load a single image ───────────────────────────────────────────────
79
+ def load_image(meta: dict) -> Image.Image:
80
+ """
81
+ Load an image from local disk or HuggingFace dataset depending on
82
+ what is available at runtime.
83
+ """
84
+ if USE_LOCAL_IMAGES:
85
+ return Image.open(IMAGES_DIR / meta["filename"]).convert("RGB")
86
+ else:
87
+ return dataset[meta["dataset_index"]]["image"].convert("RGB")
88
+
89
+
90
+ # ── Core retrieval function ───────────────────────────────────────────────────
91
+ def retrieve(query: str, top_k: int = DEFAULT_TOPK) -> list[tuple[Image.Image, str]]:
92
+ """
93
+ Encode `query` with CLIP and return the top-k matching (image, score) pairs.
94
+ Returns an empty list when the query is blank.
95
+ """
96
+ query = query.strip()
97
+ if not query:
98
+ return []
99
+
100
+ # Encode text with CLIP
101
+ inputs = processor(text=[query], return_tensors="pt", padding=True).to(DEVICE)
102
+ with torch.no_grad():
103
+ output = model.get_text_features(**inputs)
104
+ # handle both tensor and object outputs across transformers versions
105
+ text_features = output.pooler_output if hasattr(output, "pooler_output") else output
106
+
107
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
108
+ query_vec = text_features.cpu().numpy().tolist()[0]
109
+
110
+ # Vector search in ChromaDB
111
+ results = collection.query(
112
+ query_embeddings=[query_vec],
113
+ n_results=int(top_k),
114
+ include=["metadatas", "distances"],
115
+ )
116
+
117
+ # Build output: (PIL image, score label)
118
+ output = []
119
+ for meta, dist in zip(results["metadatas"][0], results["distances"][0]):
120
+ img = load_image(meta)
121
+ # ChromaDB cosine distance: 0 = identical, 2 = opposite
122
+ # Convert to a 0-100 similarity percentage for display
123
+ similarity = round((1 - dist / 2) * 100, 1)
124
+ output.append((img, f"Score: {similarity}%"))
125
+
126
+ return output
127
+
128
+
129
+ # ── Gradio UI ─────────────────────────────────────────────────────────────────
130
+ _EXAMPLES = [
131
+ ["a dog playing in the snow"],
132
+ ["children playing at a park"],
133
+ ["a man surfing ocean waves"],
134
+ ["a woman reading a book"],
135
+ ["a group of people watching a performance"],
136
+ ["a cat sitting on a windowsill"],
137
+ ["a bike race on a mountain trail"],
138
+ ["fireworks over a city at night"],
139
+ ]
140
+
141
+ with gr.Blocks(
142
+ title="CLIP Text-to-Image Retrieval",
143
+ theme=gr.themes.Soft(),
144
+ ) as demo:
145
+
146
+ gr.Markdown(
147
+ """
148
+ # 🔍 Text-to-Image Retrieval
149
+ Enter a natural language description and find matching images from the **Flickr8k** dataset.
150
+ Built with [CLIP](https://openai.com/research/clip) (ViT-B/16) + [ChromaDB](https://www.trychroma.com/).
151
+ """
152
+ )
153
+
154
+ with gr.Row():
155
+ query_box = gr.Textbox(
156
+ placeholder="e.g. a dog playing in the snow",
157
+ label="Search prompt",
158
+ scale=5,
159
+ )
160
+ topk_slider = gr.Slider(
161
+ minimum=1, maximum=MAX_TOPK, value=DEFAULT_TOPK, step=1,
162
+ label="Results",
163
+ scale=1,
164
+ )
165
+ search_btn = gr.Button("Search 🔎", variant="primary", scale=1)
166
+
167
+ gallery = gr.Gallery(
168
+ label="Top results",
169
+ columns=5,
170
+ rows=2,
171
+ height="auto",
172
+ object_fit="cover",
173
+ show_label=True,
174
+ )
175
+
176
+ gr.Examples(
177
+ examples=_EXAMPLES,
178
+ inputs=query_box,
179
+ label="Try one of these …",
180
+ )
181
+
182
+ # Wire up interactions — both button click and Enter key trigger retrieve()
183
+ search_btn.click(fn=retrieve, inputs=[query_box, topk_slider], outputs=gallery)
184
+ query_box.submit(fn=retrieve, inputs=[query_box, topk_slider], outputs=gallery)
185
+
186
+
187
+ # ── Entry point ───────────────────────────────────────────────────────────────
188
+ if __name__ == "__main__":
189
+ demo.launch(
190
+ server_name="0.0.0.0", # listen on all interfaces (needed for LAN access)
191
+ share=False, # set True for a temporary public gradio.live URL
192
+ )
chroma_db/57e6ab60-34f7-4656-8506-9bb8673dc71a/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:615f5509afa96d52f58663bdd7c0f09db6a17b10172b4949d8e740403637f8d1
3
+ size 15683584
chroma_db/57e6ab60-34f7-4656-8506-9bb8673dc71a/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:247d206a62e4985a7fe1ffd10f57fcb1c4fd569a80f4d39aa2fc20804739750f
3
+ size 100
chroma_db/57e6ab60-34f7-4656-8506-9bb8673dc71a/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6967640dd90753ba50c1b3822cf0d5eaf73e8a98c1b97b6d80de51cd9b849992
3
+ size 198640
chroma_db/57e6ab60-34f7-4656-8506-9bb8673dc71a/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af779dc40371f141b64ab92aefa7b1b377c564f9c6b0334606ec1aa3abd9d216
3
+ size 28672
chroma_db/57e6ab60-34f7-4656-8506-9bb8673dc71a/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14619dec3385eb7542ec27e892a07d4eb9dee71df09b51c0dd1e43474fc3fc33
3
+ size 62740
chroma_db/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:257d0073b2e1a2bc06370c4b667c97072468d2c661b7e0b9572a49b10dbc674c
3
+ size 6504448
requirements.txt ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==24.1.0
2
+ aiohappyeyeballs==2.6.1
3
+ aiohttp==3.13.3
4
+ aiosignal==1.4.0
5
+ annotated-doc==0.0.4
6
+ annotated-types==0.7.0
7
+ anyio==4.12.1
8
+ attrs==26.1.0
9
+ bcrypt==5.0.0
10
+ brotli==1.2.0
11
+ build==1.4.0
12
+ certifi==2026.2.25
13
+ charset-normalizer==3.4.6
14
+ chromadb==1.5.5
15
+ click==8.3.1
16
+ cuda-bindings==12.9.4
17
+ cuda-pathfinder==1.2.2
18
+ datasets==4.8.3
19
+ dill==0.4.1
20
+ durationpy==0.10
21
+ fastapi==0.135.1
22
+ ffmpy==1.0.0
23
+ filelock==3.20.0
24
+ flatbuffers==25.12.19
25
+ frozenlist==1.8.0
26
+ fsspec==2025.12.0
27
+ googleapis-common-protos==1.73.0
28
+ gradio==6.9.0
29
+ gradio_client==2.3.0
30
+ groovy==0.1.2
31
+ grpcio==1.78.0
32
+ h11==0.16.0
33
+ hf-xet==1.4.2
34
+ httpcore==1.0.9
35
+ httptools==0.7.1
36
+ httpx==0.28.1
37
+ huggingface_hub==1.7.2
38
+ idna==3.11
39
+ importlib_metadata==8.7.1
40
+ importlib_resources==6.5.2
41
+ Jinja2==3.1.6
42
+ jsonschema==4.26.0
43
+ jsonschema-specifications==2025.9.1
44
+ kubernetes==35.0.0
45
+ markdown-it-py==4.0.0
46
+ MarkupSafe==3.0.2
47
+ mdurl==0.1.2
48
+ mmh3==5.2.1
49
+ mpmath==1.3.0
50
+ multidict==6.7.1
51
+ multiprocess==0.70.19
52
+ networkx==3.6.1
53
+ numpy==2.3.5
54
+ nvidia-cublas-cu12==12.6.4.1
55
+ nvidia-cuda-cupti-cu12==12.6.80
56
+ nvidia-cuda-nvrtc-cu12==12.6.77
57
+ nvidia-cuda-runtime-cu12==12.6.77
58
+ nvidia-cudnn-cu12==9.10.2.21
59
+ nvidia-cufft-cu12==11.3.0.4
60
+ nvidia-cufile-cu12==1.11.1.6
61
+ nvidia-curand-cu12==10.3.7.77
62
+ nvidia-cusolver-cu12==11.7.1.2
63
+ nvidia-cusparse-cu12==12.5.4.2
64
+ nvidia-cusparselt-cu12==0.7.1
65
+ nvidia-nccl-cu12==2.27.5
66
+ nvidia-nvjitlink-cu12==12.6.85
67
+ nvidia-nvshmem-cu12==3.4.5
68
+ nvidia-nvtx-cu12==12.6.77
69
+ oauthlib==3.3.1
70
+ onnxruntime==1.24.4
71
+ opentelemetry-api==1.40.0
72
+ opentelemetry-exporter-otlp-proto-common==1.40.0
73
+ opentelemetry-exporter-otlp-proto-grpc==1.40.0
74
+ opentelemetry-proto==1.40.0
75
+ opentelemetry-sdk==1.40.0
76
+ opentelemetry-semantic-conventions==0.61b0
77
+ orjson==3.11.7
78
+ overrides==7.7.0
79
+ packaging
80
+ pandas==3.0.1
81
+ pillow==12.0.0
82
+ propcache==0.4.1
83
+ protobuf==6.33.6
84
+ pyarrow==23.0.1
85
+ pybase64==1.4.3
86
+ pydantic==2.12.5
87
+ pydantic-settings==2.13.1
88
+ pydantic_core==2.41.5
89
+ pydub==0.25.1
90
+ Pygments==2.19.2
91
+ PyPika==0.51.1
92
+ pyproject_hooks==1.2.0
93
+ python-dateutil==2.9.0.post0
94
+ python-dotenv==1.2.2
95
+ python-multipart==0.0.22
96
+ pytz==2026.1.post1
97
+ PyYAML==6.0.3
98
+ referencing==0.37.0
99
+ regex==2026.2.28
100
+ requests==2.32.5
101
+ requests-oauthlib==2.0.0
102
+ rich==14.3.3
103
+ rpds-py==0.30.0
104
+ safehttpx==0.1.7
105
+ safetensors==0.7.0
106
+ semantic-version==2.10.0
107
+ setuptools==80.10.2
108
+ shellingham==1.5.4
109
+ six==1.17.0
110
+ starlette==0.52.1
111
+ sympy==1.14.0
112
+ tenacity==9.1.4
113
+ tokenizers==0.22.2
114
+ tomlkit==0.13.3
115
+ torch==2.10.0+cu126
116
+ torchvision==0.25.0+cu126
117
+ tqdm==4.67.3
118
+ transformers==5.3.0
119
+ triton==3.6.0
120
+ typer==0.24.1
121
+ typing-inspection==0.4.2
122
+ typing_extensions==4.15.0
123
+ urllib3==2.6.3
124
+ uvicorn==0.42.0
125
+ uvloop==0.22.1
126
+ watchfiles==1.1.1
127
+ websocket-client==1.9.0
128
+ websockets==16.0
129
+ wheel==0.46.3
130
+ xxhash==3.6.0
131
+ yarl==1.23.0
132
+ zipp==3.23.0