import logging
from typing import List, Dict, Any
import pickle
import nltk
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi
import chromadb
from chromadb.config import Settings
from openai import OpenAI
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
import os
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
class VectorStoreCreator:
    """Create and persist the retrieval indices for dog-food product search.

    Loads an enriched product DataFrame from a pickle file, builds one
    bilingual (EN + ES) description chunk plus a metadata dict per product,
    and materialises two stores:

    * a BM25 lexical index, pickled to disk (``create_bm25_index``), and
    * a persistent ChromaDB collection with OpenAI embeddings
      (``create_chroma_db``).
    """

    def __init__(self, data_path: str) -> None:
        """
        Initialize the VectorStoreCreator.

        Args:
            data_path: Path to the pickle file containing the product data.
        """
        # Pull OPENAI_API_KEY (and related settings) from a local .env when
        # present; OpenAI() below reads the key from the environment.
        load_dotenv()

        # OpenAI client used for embedding generation.
        self.client = OpenAI()

        # word_tokenize needs 'punkt'; NLTK >= 3.8.2 additionally requires
        # the 'punkt_tab' resource, so fetch both (no-ops if cached).
        nltk.download('punkt', quiet=True)
        nltk.download('punkt_tab', quiet=True)

        # NOTE(review): unpickling is arbitrary code execution — only load
        # pickles produced by this project, never untrusted files.
        self.df = pd.read_pickle(data_path)

        # Populated by prepare_data() and the create_* methods.
        self.bm25_model = None
        self.chroma_collection = None
        self.chunks: List[str] = []
        self.metadata: List[Dict[str, Any]] = []

    def prepare_data(self) -> None:
        """Build the text chunks and per-product metadata for both stores."""
        logging.info("Preparing data for vector stores...")

        total_rows = len(self.df)
        logging.info("Total rows in DataFrame: %d", total_rows)

        for _, row in self.df.iterrows():
            # One chunk per product: English + Spanish descriptions combined
            # so a single index serves queries in either language.
            self.chunks.append(
                f"{row['description_en']} {row['description_es']}"
            )
            self.metadata.append({
                "product_name": row["product_name"],
                "brand": row["brand"],
                "dog_type": row["dog_type"],
                "food_type": row["food_type"],
                "weight": float(row["weight"]),
                "price": float(row["price"]),
                # Missing review scores default to 0.0 so the metadata
                # value is always numeric (ChromaDB rejects NaN/None).
                "reviews": float(row["reviews"]) if pd.notna(row["reviews"]) else 0.0,
            })

        logging.info("Total chunks created: %d", len(self.chunks))
        if len(self.chunks) != total_rows:
            logging.warning(
                "Mismatch between DataFrame rows (%d) and chunks created (%d)",
                total_rows, len(self.chunks),
            )
        if self.chunks:
            logging.info("Sample of first chunk: %s...", self.chunks[0][:200])

    def create_bm25_index(self, save_path: str = "bm25_index.pkl") -> None:
        """
        Create and save BM25 index.

        Args:
            save_path: Path to save the BM25 index.
        """
        logging.info("Creating BM25 index...")

        # Lowercase + word-tokenize each chunk; BM25 works on token lists.
        tokenized_chunks = [word_tokenize(chunk.lower()) for chunk in self.chunks]
        self.bm25_model = BM25Okapi(tokenized_chunks)

        # Persist model together with the chunks/metadata it indexes, so a
        # single file is enough to answer queries later.
        with open(save_path, 'wb') as f:
            pickle.dump({
                'model': self.bm25_model,
                'chunks': self.chunks,
                'metadata': self.metadata,
            }, f)
        logging.info("BM25 index saved to %s", save_path)

    def create_chroma_db(self, db_path: str = "chroma_db") -> None:
        """
        Create ChromaDB database.

        Args:
            db_path: Path to save the ChromaDB.
        """
        logging.info("Creating ChromaDB database...")

        client = chromadb.PersistentClient(path=db_path)
        self.chroma_collection = client.get_or_create_collection(
            name="dog_food_descriptions"
        )

        batch_size = 10
        for start in tqdm(range(0, len(self.chunks), batch_size)):
            end = min(start + batch_size, len(self.chunks))
            batch_chunks = self.chunks[start:end]

            # One API call per batch: the embeddings endpoint accepts a list
            # of inputs and returns embeddings in the same order, so this
            # avoids one round-trip per chunk.
            response = self.client.embeddings.create(
                model="text-embedding-ada-002",
                input=batch_chunks,
            )
            embeddings = [item.embedding for item in response.data]

            self.chroma_collection.add(
                embeddings=embeddings,
                metadatas=self.metadata[start:end],
                documents=batch_chunks,
                ids=[str(idx) for idx in range(start, end)],
            )
        logging.info("ChromaDB saved to %s", db_path)
def main(data_path: str = "3rd_clean_comida_dogs_enriched_multilingual_2.pkl") -> None:
    """Build all vector stores from the enriched product pickle.

    Args:
        data_path: Path to the product-data pickle. Defaults to the
            multilingual dataset shipped with the project.

    Raises:
        Exception: Re-raised after logging if any stage fails.
    """
    try:
        creator = VectorStoreCreator(data_path)
        creator.prepare_data()
        creator.create_bm25_index()
        creator.create_chroma_db()
        logging.info("Vector stores created successfully!")
    except Exception:
        # Log the full traceback, then propagate so callers/CI see the failure.
        logging.exception("An error occurred while creating the vector stores")
        raise


if __name__ == "__main__":
    main()