Spaces:

Kabila22
/

Image_search

Running

App Files Community

Kabila22 committed on Feb 27, 2025

Commit

8c65df5

0 Parent(s):

image search is done

Browse files

Files changed (8) hide show

.env +1 -0
README.md +0 -0
app.py +113 -0
data/covert.py +13 -0
data/image.csv +0 -0
embeddings.py +73 -0
pinecone_index.py +34 -0
requirements.txt +9 -0

.env ADDED Viewed

	@@ -0,0 +1 @@


1	+ PINECONE_API_KEY = pcsk_7TwL9Y_D3jYo3CzRCybTN9DYNkf4iR8tGbpgmtMmS2GzxzSyG6NnoYv3ybtB8HkArqQ53M

README.md ADDED Viewed

File without changes

app.py ADDED Viewed

	@@ -0,0 +1,113 @@

+import streamlit as st
+from pinecone import Pinecone
+from dotenv import load_dotenv
+import os
+from PIL import Image
+import requests
+from transformers import AutoProcessor, CLIPModel
+import numpy as np
+# Load environment variables
+load_dotenv()
+# Initialize Pinecone
+pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
+index_name = "image-index-50000"
+unsplash_index = pc.Index(index_name)
+# Load CLIP model and processor
+@st.cache_resource
+def load_clip_model():
+    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+    processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+    return model, processor
+model, processor = load_clip_model()
+# Function to generate embedding from text
+def get_text_embedding(text):
+    inputs = processor(text=[text], return_tensors="pt", padding=True, truncation=True)
+    text_features = model.get_text_features(**inputs)
+    embedding = text_features.detach().cpu().numpy().flatten().tolist()
+    return embedding
+# Function to generate embedding from image
+def get_image_embedding(image):
+    inputs = processor(images=image, return_tensors="pt")
+    image_features = model.get_image_features(**inputs)
+    embedding = image_features.detach().cpu().numpy().flatten().tolist()
+    return embedding
+# Function to query Pinecone and fetch similar images
+def search_similar_images(embedding, top_k=10):
+    results = unsplash_index.query(
+        vector=embedding,
+        top_k=top_k,
+        include_metadata=True,
+        namespace="image-search-dataset"
+    )
+    return results["matches"]
+# Streamlit UI
+st.title("🔍 Image Search App")
+# Sidebar for search controls
+with st.sidebar:
+    st.header("Search Options")
+    # Search type selection
+    search_type = st.radio(
+        "Select search type:",
+        ("Text to Image", "Image to Image")
+    )
+    # Input based on search type
+    if search_type == "Text to Image":
+        search_query = st.text_input("Enter your search query (e.g. Flower)")
+        uploaded_file = None
+    else:  # Image to Image
+        uploaded_file = st.file_uploader("Upload an image to search",
+                                       type=["jpg", "jpeg", "png"])
+        search_query = None
+    # Search button
+    search_button = st.button("Search")
+# Main content area for results
+if search_button:
+    if (search_type == "Text to Image" and search_query) or (search_type == "Image to Image" and uploaded_file):
+        # Generate embedding based on search type
+        with st.spinner("Generating embedding..."):
+            if search_type == "Text to Image":
+                embedding = get_text_embedding(search_query)
+            else:  # Image to Image
+                image = Image.open(uploaded_file).convert("RGB")
+                embedding = get_image_embedding(image)
+                # Display the uploaded image
+                st.image(image, caption="Uploaded Image", use_container_width=True)
+        # Search for similar images
+        with st.spinner("Searching for similar images..."):
+            matches = search_similar_images(embedding, top_k=10)
+        # Display results
+        st.subheader("Top Similar Images")
+        for match in matches:
+            score = match["score"]
+            photo_id = match["id"]
+            url = match["metadata"]["url"]
+            st.write(f"**Photo ID**: {photo_id} | **Similarity Score**: {score:.4f}")
+            try:
+                # Fetch and display the image from the URL
+                response = requests.get(url, stream=True)
+                response.raw.decode_content = True
+                img = Image.open(response.raw)
+                st.image(img, caption=f"Photo ID: {photo_id}", use_container_width=True)
+            except Exception as e:
+                st.error(f"Could not load image from {url}: {e}")
+    else:
+        st.warning("Please provide a search query or upload an image!")
+# Instructions
+st.write("---")
+st.write("Note: This app searches an Unsplash dataset indexed in Pinecone using CLIP embeddings based on your input.")

data/covert.py ADDED Viewed

	@@ -0,0 +1,13 @@

+import pandas as pd
+# Read the TSV file
+tsv_file = '/Users/kabilanravi/Downloads/unsplash-research-dataset-lite-latest/photos.tsv000'  # Replace with your TSV file path
+csv_file = '/Users/kabilanravi/Desktop/image.csv'  # Desired output CSV file path
+# Load TSV into a DataFrame (tab-separated by default)
+df = pd.read_csv(tsv_file, sep='\t')
+# Write DataFrame to CSV
+df.to_csv(csv_file, index=False)  # index=False avoids adding row numbers
+print(f"Converted {tsv_file} to {csv_file}")

data/image.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

embeddings.py ADDED Viewed

	@@ -0,0 +1,73 @@

+import os
+import pandas as pd
+from pinecone import Pinecone, ServerlessSpec
+from dotenv import load_dotenv
+import requests
+from tqdm import tqdm
+from PIL import Image
+from transformers import AutoProcessor, CLIPModel
+import logging
+import time
+# Logging setup
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)
+# Load environment variables
+load_dotenv()
+# Pinecone setup
+pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
+index_name = "image-index"
+if index_name not in pc.list_indexes().names():
+    pc.create_index(
+        name=index_name,
+        metric="cosine",
+        dimension=512,
+        spec=ServerlessSpec(cloud="aws", region="us-east-1")
+    )
+    while not pc.describe_index(index_name).status.get("ready", False):
+        logger.info("Waiting for index to be ready...")
+        time.sleep(1)
+unsplash_index = pc.Index(index_name)
+# CLIP setup (loaded once)
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+# Load dataset
+images_df = pd.read_csv("image.csv")[["photo_id", "photo_image_url"]][:500]
+total_images = len(images_df)
+logger.info(f"Total images to process: {total_images}")
+# Sequential processing function
+def process_image(row):
+    try:
+        url = row["photo_image_url"]
+        photo_id = row["photo_id"]
+        # Download image
+        img = Image.open(requests.get(url, stream=True).raw)
+        # Generate embeddings
+        inputs = processor(images=img, return_tensors="pt")
+        image_features = model.get_image_features(**inputs)
+        embeddings = image_features.detach().cpu().numpy().flatten().tolist()
+        # Upsert to Pinecone
+        unsplash_index.upsert(
+            vectors=[{
+                "id": photo_id,
+                "values": embeddings,
+                "metadata": {"url": url, "photo_id": photo_id}
+            }],
+            namespace="image-search-dataset"
+        )
+        return f"Processed {photo_id}"
+    except Exception as e:
+        logger.error(f"Error processing {photo_id} with URL {url}: {e}")
+        return f"Error {photo_id}"
+# Process images sequentially with tqdm
+for _, row in tqdm(images_df.iterrows(), total=total_images, desc="Indexing images"):
+    result = process_image(row)
+    logger.info(result)
+logger.info("Indexing complete!")

pinecone_index.py ADDED Viewed

	@@ -0,0 +1,34 @@

+# Import the Pinecone library
+from pinecone import Pinecone, ServerlessSpec
+# Initialize Pinecone with your API key
+api_key ="pcsk_7TwL9Y_D3jYo3CzRCybTN9DYNkf4iR8tGbpgmtMmS2GzxzSyG6NnoYv3ybtB8HkArqQ53M"  # Replace with your actual API key
+pc = Pinecone(api_key=api_key)
+# Define the index name and parameters
+index_name = "image-search"
+dimension = 512  # Example dimension (e.g., for OpenAI embeddings like text-embedding-ada-002)
+metric = "cosine"  # Similarity metric: 'cosine', 'euclidean', or 'dotproduct'
+# Create a serverless index
+# Note: Serverless indexes are recommended for most use cases as they scale automatically
+if index_name not in pc.list_indexes().names():  # Check if index doesn't already exist
+    pc.create_index(
+        name=index_name,
+        dimension=dimension,
+        metric=metric,
+        spec=ServerlessSpec(
+            cloud="aws",      # Cloud provider (e.g., 'aws', 'gcp', 'azure')
+            region="us-east-1"  # Region (e.g., 'us-east-1' for AWS)
+        )
+    )
+    print(f"Index '{index_name}' created successfully!")
+else:
+    print(f"Index '{index_name}' already exists.")
+# Connect to the index
+index = pc.Index(index_name)
+# Optional: Verify the index is ready
+index_stats = index.describe_index_stats()
+print(f"Index stats: {index_stats}")

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+streamlit
+pinecone-client
+python-dotenv
+pillow
+requests
+transformers
+numpy
+pandas
+tqdm