Kabila22 committed on
Commit
8c65df5
·
0 Parent(s):

image search is done

Browse files
Files changed (8) hide show
  1. .env +1 -0
  2. README.md +0 -0
  3. app.py +113 -0
  4. data/covert.py +13 -0
  5. data/image.csv +0 -0
  6. embeddings.py +73 -0
  7. pinecone_index.py +34 -0
  8. requirements.txt +9 -0
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ # SECURITY: a live API key was previously committed here — rotate it immediately
+ # and never commit .env files with real secrets.
+ PINECONE_API_KEY = <your-pinecone-api-key>
README.md ADDED
File without changes
app.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from pinecone import Pinecone
3
+ from dotenv import load_dotenv
4
+ import os
5
+ from PIL import Image
6
+ import requests
7
+ from transformers import AutoProcessor, CLIPModel
8
+ import numpy as np
9
+
10
# --- Environment & Pinecone setup ---
# Pull PINECONE_API_KEY from a local .env file into os.environ.
load_dotenv()

# Connect to the pre-built Unsplash image index; this handle is shared by
# the search helper below.
index_name = "image-index-50000"
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
unsplash_index = pc.Index(index_name)
17
+
18
@st.cache_resource
def load_clip_model():
    """Load the CLIP model and its processor exactly once.

    Cached by Streamlit (`st.cache_resource`) so script reruns reuse the
    same objects instead of reloading the weights.

    Returns:
        A `(model, processor)` tuple for the openai/clip-vit-base-patch32
        checkpoint.
    """
    checkpoint = "openai/clip-vit-base-patch32"
    return (
        CLIPModel.from_pretrained(checkpoint),
        AutoProcessor.from_pretrained(checkpoint),
    )

model, processor = load_clip_model()
26
+
27
def get_text_embedding(text):
    """Embed a text query into CLIP's shared text/image vector space.

    Args:
        text: the search phrase to embed.

    Returns:
        A plain list of floats suitable as a Pinecone query vector.
    """
    tokenized = processor(text=[text], return_tensors="pt", padding=True, truncation=True)
    features = model.get_text_features(**tokenized)
    return features.detach().cpu().numpy().flatten().tolist()
33
+
34
def get_image_embedding(image):
    """Embed a PIL image into CLIP's shared text/image vector space.

    Args:
        image: a PIL Image (RGB).

    Returns:
        A plain list of floats suitable as a Pinecone query vector.
    """
    preprocessed = processor(images=image, return_tensors="pt")
    features = model.get_image_features(**preprocessed)
    return features.detach().cpu().numpy().flatten().tolist()
40
+
41
def search_similar_images(embedding, top_k=10):
    """Query Pinecone for the images nearest to `embedding`.

    Args:
        embedding: query vector (list of floats).
        top_k: number of nearest neighbours to return.

    Returns:
        The list of match dicts (id, score, metadata) from the
        "image-search-dataset" namespace of the index.
    """
    response = unsplash_index.query(
        vector=embedding,
        top_k=top_k,
        include_metadata=True,
        namespace="image-search-dataset",
    )
    return response["matches"]
50
+
51
# --- Streamlit UI ---
st.title("🔍 Image Search App")

# Sidebar: pick the search mode and collect the matching input.
with st.sidebar:
    st.header("Search Options")

    search_type = st.radio(
        "Select search type:",
        ("Text to Image", "Image to Image")
    )

    if search_type == "Text to Image":
        search_query = st.text_input("Enter your search query (e.g. Flower)")
        uploaded_file = None
    else:  # Image to Image
        uploaded_file = st.file_uploader("Upload an image to search",
                                         type=["jpg", "jpeg", "png"])
        search_query = None

    search_button = st.button("Search")

# Main content area for results
if search_button:
    if (search_type == "Text to Image" and search_query) or (search_type == "Image to Image" and uploaded_file):
        # Generate the query embedding from whichever input was provided.
        with st.spinner("Generating embedding..."):
            if search_type == "Text to Image":
                embedding = get_text_embedding(search_query)
            else:  # Image to Image
                image = Image.open(uploaded_file).convert("RGB")
                embedding = get_image_embedding(image)
                # Echo the uploaded image back to the user.
                st.image(image, caption="Uploaded Image", use_container_width=True)

        # Search for similar images
        with st.spinner("Searching for similar images..."):
            matches = search_similar_images(embedding, top_k=10)

        # Display results
        st.subheader("Top Similar Images")
        if not matches:
            st.info("No similar images found.")
        for match in matches:
            score = match["score"]
            photo_id = match["id"]
            url = match["metadata"]["url"]
            st.write(f"**Photo ID**: {photo_id} | **Similarity Score**: {score:.4f}")
            try:
                # timeout= keeps a dead URL from hanging the whole app, and
                # raise_for_status() rejects HTTP error pages instead of
                # letting PIL fail on non-image bytes.
                response = requests.get(url, stream=True, timeout=10)
                response.raise_for_status()
                response.raw.decode_content = True
                img = Image.open(response.raw)
                st.image(img, caption=f"Photo ID: {photo_id}", use_container_width=True)
            except Exception as e:
                st.error(f"Could not load image from {url}: {e}")
    else:
        st.warning("Please provide a search query or upload an image!")

# Instructions
st.write("---")
st.write("Note: This app searches an Unsplash dataset indexed in Pinecone using CLIP embeddings based on your input.")
data/covert.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse

import pandas as pd

# Defaults preserve the original hard-coded behavior when run with no args.
DEFAULT_TSV = '/Users/kabilanravi/Downloads/unsplash-research-dataset-lite-latest/photos.tsv000'
DEFAULT_CSV = '/Users/kabilanravi/Desktop/image.csv'


def convert_tsv_to_csv(tsv_file, csv_file):
    """Read a tab-separated file and write it back out as CSV.

    Args:
        tsv_file: path to the input TSV file.
        csv_file: path where the CSV output is written.
    """
    df = pd.read_csv(tsv_file, sep='\t')
    df.to_csv(csv_file, index=False)  # index=False: no row-number column
    print(f"Converted {tsv_file} to {csv_file}")


def main():
    """Parse CLI arguments (original paths as defaults) and convert."""
    parser = argparse.ArgumentParser(description="Convert a TSV file to CSV.")
    parser.add_argument("tsv_file", nargs="?", default=DEFAULT_TSV,
                        help="input TSV path")
    parser.add_argument("csv_file", nargs="?", default=DEFAULT_CSV,
                        help="output CSV path")
    args = parser.parse_args()
    convert_tsv_to_csv(args.tsv_file, args.csv_file)


if __name__ == "__main__":
    main()
data/image.csv ADDED
The diff for this file is too large to render. See raw diff
 
embeddings.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from pinecone import Pinecone, ServerlessSpec
4
+ from dotenv import load_dotenv
5
+ import requests
6
+ from tqdm import tqdm
7
+ from PIL import Image
8
+ from transformers import AutoProcessor, CLIPModel
9
+ import logging
10
+ import time
11
+
12
# --- Logging ---
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# --- Environment ---
load_dotenv()

# --- Pinecone: create the serverless index on first run, then connect ---
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
index_name = "image-index"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        metric="cosine",
        dimension=512,  # CLIP ViT-B/32 embedding size
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Block until the freshly created index is ready to accept upserts.
    while not pc.describe_index(index_name).status.get("ready", False):
        logger.info("Waiting for index to be ready...")
        time.sleep(1)
unsplash_index = pc.Index(index_name)

# --- CLIP model & processor (loaded once at module import) ---
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

# --- Dataset: first 500 (photo_id, photo_image_url) rows ---
images_df = pd.read_csv("image.csv")[["photo_id", "photo_image_url"]][:500]
total_images = len(images_df)
logger.info(f"Total images to process: {total_images}")
42
+
43
# Sequential processing function
def process_image(row):
    """Download one image, embed it with CLIP, and upsert it into Pinecone.

    Args:
        row: DataFrame row with "photo_id" and "photo_image_url" fields.

    Returns:
        A status string: "Processed <id>" on success, "Error <id>" on failure.
    """
    # Extract fields before the try block so the except handler can always
    # reference them (previously a failure on these lookups caused a
    # NameError inside the handler instead of a useful log message).
    url = row["photo_image_url"]
    photo_id = row["photo_id"]
    try:
        # Download the image; timeout= keeps one dead URL from stalling the
        # whole run, raise_for_status() rejects HTTP error pages.
        response = requests.get(url, stream=True, timeout=10)
        response.raise_for_status()
        img = Image.open(response.raw)
        # Generate embeddings
        inputs = processor(images=img, return_tensors="pt")
        image_features = model.get_image_features(**inputs)
        embeddings = image_features.detach().cpu().numpy().flatten().tolist()
        # Upsert to Pinecone with the source URL as metadata.
        unsplash_index.upsert(
            vectors=[{
                "id": photo_id,
                "values": embeddings,
                "metadata": {"url": url, "photo_id": photo_id}
            }],
            namespace="image-search-dataset"
        )
        return f"Processed {photo_id}"
    except Exception as e:
        logger.error(f"Error processing {photo_id} with URL {url}: {e}")
        return f"Error {photo_id}"
67
+
68
# Index the images one at a time, showing a tqdm progress bar.
for _, record in tqdm(images_df.iterrows(), total=total_images, desc="Indexing images"):
    logger.info(process_image(record))

logger.info("Indexing complete!")
pinecone_index.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# Import the Pinecone library
from pinecone import Pinecone, ServerlessSpec

# SECURITY: the API key must come from the environment, never from source.
# (A live key was previously hard-coded here and committed — rotate it.)
api_key = os.environ.get("PINECONE_API_KEY")
if not api_key:
    raise RuntimeError("Set the PINECONE_API_KEY environment variable before running.")
pc = Pinecone(api_key=api_key)

# Define the index name and parameters.
index_name = "image-search"
dimension = 512  # matches the CLIP ViT-B/32 embeddings used by this project
metric = "cosine"  # similarity metric: 'cosine', 'euclidean', or 'dotproduct'

# Create a serverless index if it doesn't already exist.
# Serverless indexes are recommended for most use cases as they scale automatically.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric=metric,
        spec=ServerlessSpec(
            cloud="aws",        # cloud provider ('aws', 'gcp', 'azure')
            region="us-east-1"  # region within the provider
        )
    )
    print(f"Index '{index_name}' created successfully!")
else:
    print(f"Index '{index_name}' already exists.")

# Connect to the index
index = pc.Index(index_name)

# Optional: Verify the index is reachable by fetching its stats.
index_stats = index.describe_index_stats()
print(f"Index stats: {index_stats}")
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
+ pinecone-client
+ python-dotenv
+ pillow
+ requests
+ transformers
+ torch
+ numpy
+ pandas
+ tqdm
+ tqdm