File size: 3,387 Bytes
267bd16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import pandas as pd
from sentence_transformers import SentenceTransformer
import chromadb

class DataHandler:
    """Load restaurant data from CSV and index it into a ChromaDB vector store.

    Embeddings are produced with a SentenceTransformer model and stored in the
    persistent ``food_places`` collection rooted at ``db_path``.
    """

    # Default CSV location used when callers do not pass an explicit path.
    DEFAULT_DATASET_PATH = "D:/Projects/Liminal/AI_Guide/resources/S3Dataset.csv"

    def __init__(self, model_name='sentence-transformers/all-MiniLM-L6-v2', db_path="D:/Projects/Liminal/AI_Guide/resources/S3_chromadb"):
        """Initialize the persistent ChromaDB client, embedding model and collection.

        :param model_name: SentenceTransformer model used to embed row text.
        :param db_path: Directory where ChromaDB persists its data.
        """
        self.client = chromadb.PersistentClient(path=db_path)  # ChromaDB with persistent storage at the specified path
        self.model = SentenceTransformer(model_name)
        self.collection = self.client.get_or_create_collection(name="food_places")

    def load_data(self, file_path):
        """Load data from a CSV file.

        :param file_path: Path to the CSV file.
        :return: DataFrame with loaded data.
        """
        return pd.read_csv(file_path)

    def get_unique_cities(self, file_path=DEFAULT_DATASET_PATH):
        """Retrieve the unique cities from the restaurant dataset.

        :param file_path: CSV to read; defaults to the project dataset path.
        :return: Sorted list of unique city names.
        """
        df = self.load_data(file_path)
        return sorted(df['City'].unique().tolist())

    @staticmethod
    def _to_native(value):
        """Convert a numpy scalar (as yielded by DataFrame rows) to a plain
        Python value.

        ChromaDB metadata only accepts str/int/float/bool, and comparing
        stored (native-typed) metadata against numpy scalars is unreliable.
        """
        return value.item() if hasattr(value, 'item') else value

    def process_data(self, df):
        """Embed each row and insert/update it in the ChromaDB collection.

        Rows whose stored metadata is unchanged are skipped; everything else
        is upserted using the DataFrame index (stringified) as the ID.

        :param df: DataFrame containing restaurant data.
        """
        count = 0
        for idx, row in df.iterrows():

            combined_text = f"{row['Cuisine']} {row['Location']} {row['Locality']} {row['Cost']}"
            embedding = self.model.encode(combined_text).tolist()

            # Cast numpy scalars to native Python types: ChromaDB rejects
            # numpy values, and the equality check below against stored
            # metadata would otherwise never match.
            new_metadata = {
                "restaurant_name": self._to_native(row['Name']),
                "location": self._to_native(row['Location']),
                "locality": self._to_native(row['Locality']),
                "city": self._to_native(row['City']),
                "votes": self._to_native(row['Votes']),
                "cost": self._to_native(row['Cost']),
                "rating": self._to_native(row['Rating']),
                "uuid": self._to_native(row['Unique_ID']),
                "image_path": self._to_native(row['Image_Path'])
            }

            # Check if the ID already exists in the collection
            existing_docs = self.collection.get(ids=[str(idx)])  # Fetch existing document by ID

            if existing_docs['ids']:  # If the ID exists
                # Skip the row when its stored metadata is identical
                existing_metadata = existing_docs['metadatas'][0]
                if existing_metadata == new_metadata:
                    continue

            # upsert, not add: `add` does not overwrite an existing ID in
            # ChromaDB, so changed rows would never actually be updated.
            self.collection.upsert(
                documents=[combined_text],  # Add text for reference
                metadatas=[new_metadata],   # Use the new_metadata created before the condition
                embeddings=[embedding],
                ids=[str(idx)]  # Use index as ID
            )
            count += 1
            if count % 100 == 0:
                print(f"Inserted/Updated {count} rows into ChromaDB.")

    def get_collection(self):
        """Return the underlying ChromaDB collection instance."""
        return self.collection

# Run this module directly to (re)build the vector DB from the dataset CSV.
if __name__ == "__main__":
    handler = DataHandler()
    dataset = handler.load_data("D:/Projects/Liminal/AI_Guide/resources/S3Dataset.csv")
    handler.process_data(dataset)