Product_Screener_Embedding

Sleeping

App Files Files Community

Zaious committed on Nov 22, 2024

Commit

33ef8ae

verified ·

1 Parent(s): f4cbb4b

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -97

app.py CHANGED Viewed

@@ -11,115 +11,50 @@ import time
 from sklearn.metrics.pairwise import cosine_similarity
 from huggingface_hub import HfApi, hf_hub_download, upload_file
 from pathlib import Path
 # Initialize the OpenAI client. The API key is read from the OPENAI_API_KEY
 # environment variable; os.environ.get yields None when it is unset.
 client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
-# Hugging Face configuration, read from the environment
-HF_TOKEN = os.environ.get("HF_TOKEN")
-REPO_ID = os.environ.get("REPO_ID")  # format: "username/space-name"
 # File name used for the pickled embeddings, both when downloading from and
 # uploading to the repo identified by REPO_ID.
-EMBEDDING_FILE = "product_embeddings.pkl"
-# Initialize the Hugging Face API client with the token read above
-hf_api = HfApi(token=HF_TOKEN)
 # Load the product table from CSV; this runs at import time.
 df = pd.read_csv("item_new.csv", encoding='utf-8')
def create_product_text(row):
    """Build the text that gets embedded for a single product.

    Args:
        row: Mapping-like record (for example a DataFrame row) that provides
            the ``item_name``, ``description`` and ``tags`` fields.

    Returns:
        The three fields, formatted as strings and joined by single spaces
        in the order ``item_name``, ``description``, ``tags``.
    """
    field_order = ("item_name", "description", "tags")
    return " ".join(f"{row[field]}" for field in field_order)
def get_embedding(text: str, model="text-embedding-3-small"):
    """Request an embedding vector for ``text`` from the OpenAI API.

    Args:
        text: Text to embed; newlines are replaced by spaces before sending.
        model: Name of the embedding model to request.

    Returns:
        The embedding of the first (only) input item, or ``None`` when any
        exception is raised along the way (the error is printed).
    """
    try:
        single_line = text.replace("\n", " ")
        result = client.embeddings.create(model=model, input=[single_line])
        first_item = result.data[0]
        return first_item.embedding
    except Exception as err:
        # Best effort by design: report the problem and let the caller
        # decide what to do with a missing embedding.
        print(f"Error getting embedding: {err}")
    return None
def download_embeddings():
    """Fetch the pickled embeddings from the Hugging Face repo.

    Returns:
        The unpickled object stored in ``EMBEDDING_FILE``, or ``None`` when
        the download or the unpickling raises (the error is printed).
    """
    try:
        cached_file = hf_hub_download(
            token=HF_TOKEN,
            repo_id=REPO_ID,
            filename=EMBEDDING_FILE,
        )
        # NOTE(review): pickle.load executes code embedded in the file, so
        # this is only safe while the repo contents are fully trusted.
        with open(cached_file, 'rb') as handle:
            loaded = pickle.load(handle)
    except Exception as err:
        print(f"Error downloading embeddings: {err}")
        return None
    return loaded
def upload_embeddings(embeddings):
    """Pickle ``embeddings`` and upload the file to the Hugging Face repo.

    The embeddings are first written to a temporary local file, which is
    then uploaded as ``EMBEDDING_FILE`` to ``REPO_ID``. The operation is
    best effort: failures are printed, never raised.

    Args:
        embeddings: Any picklable object holding the embeddings.

    Returns:
        None.
    """
    temp_path = "temp_embeddings.pkl"
    try:
        # Save embeddings locally first; the upload call is given a path.
        with open(temp_path, 'wb') as f:
            pickle.dump(embeddings, f)
        # Upload to Hugging Face
        hf_api.upload_file(
            path_or_fileobj=temp_path,
            path_in_repo=EMBEDDING_FILE,
            repo_id=REPO_ID,
            token=HF_TOKEN,
        )
        print("Successfully uploaded embeddings")
    except Exception as e:
        print(f"Error uploading embeddings: {e}")
    finally:
        # Clean up in every case: previously a failed pickle or upload left
        # the temp file behind because removal sat on the success path only.
        try:
            os.remove(temp_path)
        except FileNotFoundError:
            pass  # open() failed before the file was ever created
        except OSError as e:
            # Stay best effort: a cleanup problem must not crash the caller.
            print(f"Error removing temporary file {temp_path}: {e}")
def initialize_embeddings():
    """Return product embeddings, downloading them or building them anew.

    Previously uploaded embeddings are tried first. When none can be
    downloaded, one embedding is requested per row of ``df``, the result is
    uploaded, and the freshly built list is returned.

    Returns:
        The downloaded embeddings object, or a list with one vector per row
        of ``df``.
    """
    print("Checking for existing embeddings...")
    cached = download_embeddings()
    if cached is not None:
        print("Loaded existing embeddings")
        return cached

    print("Creating new embeddings...")
    placeholder_dim = 1536  # Default embedding dimension
    vectors = []
    for _, row in df.iterrows():
        vector = get_embedding(create_product_text(row))
        # A falsy result (failed request) becomes a zero vector so that the
        # list still holds exactly one entry per row.
        vectors.append(vector if vector else [0] * placeholder_dim)
        time.sleep(0.1)  # Rate limiting for API calls

    # Upload new embeddings
    upload_embeddings(vectors)
    return vectors
-# Load or create embeddings
 # These statements run at import time. The embeddings returned by
 # initialize_embeddings() are also converted to a NumPy array, which is what
 # find_similar_products() passes to cosine_similarity.
 print("Initializing embeddings...")
-product_embeddings = initialize_embeddings()
-product_embeddings_array = np.array(product_embeddings)
 print("Embeddings initialized")
 def find_similar_products(query_embedding, top_k=8):
-    """Find most similar products using cosine similarity"""
-    similarities = cosine_similarity(
-        [query_embedding],
-        product_embeddings_array
-    )[0]
-    top_indices = similarities.argsort()[-top_k:][::-1]
-    return df.iloc[top_indices], similarities[top_indices]
 # Rest of the code remains the same...
 def analyze_query_and_find_products(query: str) -> str:
@@ -209,9 +144,11 @@ def analyze_query_and_find_products(query: str) -> str:
 # Add system status message
 def get_system_status():
     """Get system initialization status"""
     return {
-        "embeddings_loaded": product_embeddings is not None,
-        "embedding_count": len(product_embeddings) if product_embeddings else 0,
         "product_count": len(df)
     }

 from sklearn.metrics.pairwise import cosine_similarity
 from huggingface_hub import HfApi, hf_hub_download, upload_file
 from pathlib import Path
+import faiss
 # Initialize the OpenAI client. The API key is read from the OPENAI_API_KEY
 # environment variable; os.environ.get yields None when it is unset.
 client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
def initialize_embeddings_from_faiss(faiss_path: str):
    """Load a FAISS index from disk and recover the vectors stored in it.

    Args:
        faiss_path: Path to a serialized FAISS index file.

    Returns:
        A ``(index, embeddings)`` tuple: the loaded FAISS index and an array
        of shape ``(index.ntotal, index.d)`` with the vectors it holds.

    Raises:
        FileNotFoundError: If nothing exists at ``faiss_path``.
    """
    if not os.path.exists(faiss_path):
        raise FileNotFoundError(f"FAISS index file not found at {faiss_path}")
    print(f"Loading FAISS index from {faiss_path}...")
    index = faiss.read_index(faiss_path)
    # Extract embeddings through the public reconstruction API. The former
    # `faiss.vector_to_array(index.xb)` relied on an IndexFlat internal that
    # recent FAISS releases no longer expose and that other index types
    # never had. The index type must support reconstruction.
    product_embeddings_array = index.reconstruct_n(0, index.ntotal)
    print(f"FAISS index loaded with {index.ntotal} embeddings.")
    return index, product_embeddings_array
 # Load CSV data
 df = pd.read_csv("item_new.csv", encoding='utf-8')
+# Load embeddings from FAISS
 # This runs at import time; a missing index file raises FileNotFoundError here.
 # NOTE(review): find_similar_products() uses the ids returned by FAISS as row
 # positions in df, so the index presumably was built from this CSV in the
 # same row order. Confirm, since nothing here checks it.
 print("Initializing embeddings...")
+faiss_path = "product_index.faiss"  # Path to FAISS index file
+faiss_index, product_embeddings_array = initialize_embeddings_from_faiss(faiss_path)
 print("Embeddings initialized")
 def find_similar_products(query_embedding, top_k=8):
+    """Find most similar products using FAISS index"""
+    if faiss_index is None:
+        raise ValueError("FAISS index is not loaded.")
+    # FAISS expects float32 type embeddings
+    query_embedding = np.array(query_embedding).astype('float32').reshape(1, -1)
+    # Perform FAISS search
+    distances, indices = faiss_index.search(query_embedding, top_k)
+    # Retrieve matching products
+    matching_products = df.iloc[indices[0]]
+    return matching_products, distances[0]
 # Rest of the code remains the same...
 def analyze_query_and_find_products(query: str) -> str:
 # Add system status message
 def get_system_status():
     """Get system initialization status"""
+    embeddings_loaded = faiss_index is not None
+    embedding_count = faiss_index.ntotal if embeddings_loaded else 0
     return {
+        "embeddings_loaded": embeddings_loaded,
+        "embedding_count": embedding_count,
         "product_count": len(df)
     }