Spaces:
Sleeping
Sleeping
Update custom_utils.py
Browse files- custom_utils.py +20 -0
custom_utils.py
CHANGED
|
@@ -11,6 +11,26 @@ from pymongo.mongo_client import MongoClient
|
|
| 11 |
DB_NAME = "airbnb_dataset"
|
| 12 |
COLLECTION_NAME = "listings_reviews"
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
def process_records(data_frame):
|
| 15 |
records = data_frame.to_dict(orient='records')
|
| 16 |
# Handle potential `NaT` values
|
|
|
|
| 11 |
DB_NAME = "airbnb_dataset"
|
| 12 |
COLLECTION_NAME = "listings_reviews"
|
| 13 |
|
| 14 |
+
def rag_ingestion():
    """Reload the Airbnb embeddings dataset into the MongoDB collection.

    Steps, in order:

    1. Open the ``train`` split of the ``MongoDB/airbnb_embeddings`` dataset
       in streaming mode.
    2. Materialize the stream as a pandas DataFrame.
    3. Convert the DataFrame to records via ``process_records``.
    4. Replace the contents of ``collection`` with those records
       (``delete_many({})`` followed by ``insert_many``).

    The short numeric ``print`` calls ("111", "222", ...) are progress
    markers emitted between the steps.

    If ``process_records`` yields no records, the collection is left
    untouched: ``insert_many`` rejects an empty document list, so running
    ``delete_many`` first would wipe the existing data and then fail.

    NOTE(review): ``load_dataset``, ``pd``, ``collection`` and
    ``process_records`` are not defined in this block; they are presumably
    provided at module level -- confirm.
    """
    print("111")
    dataset = load_dataset(
        "MongoDB/airbnb_embeddings", streaming=True, split="train"
    )

    # Left disabled by the author; presumably limits ingestion to the first
    # 100 rows for quick runs -- confirm before enabling.
    # dataset = dataset.take(100)
    print("222")

    # Convert the dataset to a pandas dataframe
    dataset_df = pd.DataFrame(dataset)
    # Debugging aids left disabled by the author:
    # dataset_df.head(5)
    # print("Columns:", dataset_df.columns)

    listings = process_records(dataset_df)
    print("333")

    if not listings:
        # Bail out *before* the destructive delete: insert_many() raises on
        # an empty list, which would otherwise leave the collection empty.
        print("No records to ingest; MongoDB collection left unchanged")
        return

    collection.delete_many({})
    collection.insert_many(listings)
    print("Data ingestion into MongoDB completed")
    print("555")

    # NOTE: the vector search index has to be created manually; the feature
    # is not available in the free tier.
|
| 33 |
+
|
| 34 |
def process_records(data_frame):
|
| 35 |
records = data_frame.to_dict(orient='records')
|
| 36 |
# Handle potential `NaT` values
|