Spaces:

bstraehle
/

advanced-rag

Sleeping

bstraehle committed on Jul 28, 2024

Commit

3271e72

verified ·

1 Parent(s): 838b33c

Update custom_utils.py

Files changed (1) hide show

custom_utils.py CHANGED Viewed

@@ -11,6 +11,26 @@ from pymongo.mongo_client import MongoClient
 DB_NAME = "airbnb_dataset"
 COLLECTION_NAME = "listings_reviews"
 def process_records(data_frame):
     records = data_frame.to_dict(orient='records')
     # Handle potential `NaT` values

 DB_NAME = "airbnb_dataset"
 COLLECTION_NAME = "listings_reviews"
+def rag_ingestion():  # Load the "MongoDB/airbnb_embeddings" dataset and replace the contents of `collection` with it
+    print("111")  # numbered prints appear to be progress markers for debugging
+    dataset = load_dataset("MongoDB/airbnb_embeddings", streaming=True, split="train")
+    #dataset = dataset.take(100)
+    print("222")
+    # Build a pandas DataFrame from the streamed dataset; with the take(100) line above disabled, no row limit is applied
+    dataset_df = pd.DataFrame(dataset)
+    #dataset_df.head(5)
+    #print("Columns:", dataset_df.columns)
+    listings = process_records(dataset_df)
+    print("333")
+    collection.delete_many({})  # empty filter: clears existing documents before inserting; NOTE(review): `collection` is not defined in this function - presumably module-level, confirm
+    collection.insert_many(listings)
+    print("Data ingestion into MongoDB completed")
+    print("555")
+    # The vector search index has to be created manually; that feature is not available in the free tier
 def process_records(data_frame):
     records = data_frame.to_dict(orient='records')
     # Handle potential `NaT` values