import pandas as pd
from sentence_transformers import SentenceTransformer
import chromadb

# Default locations; values are the same paths that were previously repeated
# inline throughout this module.
DEFAULT_DB_PATH = "D:/Projects/Liminal/AI_Guide/resources/S3_chromadb"
DEFAULT_DATASET_PATH = "D:/Projects/Liminal/AI_Guide/resources/S3Dataset.csv"


class DataHandler:
    """
    Load restaurant rows from CSV and keep the "food_places" ChromaDB
    collection in sync with them.
    """

    def __init__(self, model_name='sentence-transformers/all-MiniLM-L6-v2',
                 db_path=DEFAULT_DB_PATH):
        """
        Open the persistent ChromaDB store and load the embedding model.

        :param model_name: Name passed to SentenceTransformer.
        :param db_path: Directory used by the ChromaDB persistent client.
        """
        # ChromaDB with persistent storage at the specified path
        self.client = chromadb.PersistentClient(path=db_path)
        self.model = SentenceTransformer(model_name)
        self.collection = self.client.get_or_create_collection(name="food_places")

    def load_data(self, file_path):
        """
        Load data from a CSV file.

        :param file_path: Path to the CSV file.
        :return: DataFrame with loaded data.
        """
        return pd.read_csv(file_path)

    def get_unique_cities(self, file_path=None):
        """
        Retrieve unique cities from the restaurant dataset.

        :param file_path: Optional path to the dataset CSV; the default
            dataset location is used when omitted.
        :return: Sorted list of unique cities (missing values are ignored).
        """
        df = self.load_data(file_path or DEFAULT_DATASET_PATH)
        # Drop NaN first: sorting a mix of strings and float NaN raises TypeError.
        return sorted(df['City'].dropna().unique().tolist())

    def _is_unchanged(self, doc_id, text, metadata):
        """
        Tell whether the collection already stores exactly this entry.

        :param doc_id: Collection ID to look up.
        :param text: Document text that would be stored.
        :param metadata: Metadata dict that would be stored.
        :return: True if an entry with this ID exists and both its metadata
            and its document text equal the given ones.
        """
        existing = self.collection.get(
            ids=[doc_id], include=["metadatas", "documents"]
        )
        if not existing['ids']:
            return False

        stored_metadata = existing['metadatas'][0]
        stored_documents = existing.get('documents') or [None]
        # The document text is compared as well because it contains fields
        # (e.g. Cuisine) that are not part of the metadata; comparing the
        # metadata alone would leave a stale embedding when only they change.
        return stored_metadata == metadata and stored_documents[0] == text

    def process_data(self, df):
        """
        Generate a text embedding for each row and store it in the collection.

        Rows whose stored metadata and text are already identical are skipped;
        every other row is inserted or overwritten under its DataFrame index.

        :param df: DataFrame containing restaurant data.
        """
        count = 0
        for idx, row in df.iterrows():
            doc_id = str(idx)  # Use index as ID
            combined_text = f"{row['Cuisine']} {row['Location']} {row['Locality']} {row['Cost']}"

            new_metadata = {
                "restaurant_name": row['Name'],
                "location": row['Location'],
                "locality": row['Locality'],
                "city": row['City'],
                "votes": row['Votes'],
                "cost": row['Cost'],
                "rating": row['Rating'],
                "uuid": row['Unique_ID'],
                "image_path": row['Image_Path'],
            }

            if self._is_unchanged(doc_id, combined_text, new_metadata):
                continue

            # Encode only rows that will actually be written.
            embedding = self.model.encode(combined_text).tolist()

            # upsert (rather than add) so that a changed row replaces the
            # entry already stored under the same ID.
            self.collection.upsert(
                documents=[combined_text],
                metadatas=[new_metadata],
                embeddings=[embedding],
                ids=[doc_id],
            )

            count += 1
            if count % 100 == 0:
                print(f"Inserted/Updated {count} rows into ChromaDB.")

    def get_collection(self):
        """
        Return the ChromaDB collection instance.

        :return: The "food_places" collection created in ``__init__``.
        """
        return self.collection


# If you want to run this file separately to update the vector DB
if __name__ == "__main__":
    data_handler = DataHandler()
    df = data_handler.load_data(DEFAULT_DATASET_PATH)
    data_handler.process_data(df)