Spaces:
Sleeping
Sleeping
feat: implement project synchronization service with database integration and automated embedding updates
Browse files
src/similarity_model/sync_projects.py
CHANGED
|
@@ -2,6 +2,10 @@ import json
|
|
| 2 |
import logging
|
| 3 |
import sys
|
| 4 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
from Data.database.sql_connector import engine
|
| 7 |
from src.similarity_model.preprocessing import preprocess_dataset
|
|
@@ -45,9 +49,9 @@ def sync_projects():
|
|
| 45 |
logger.info(f"Loaded {len(projects_df)} active projects from database.")
|
| 46 |
|
| 47 |
|
| 48 |
-
logger.info("Fetching existing records from '
|
| 49 |
with engine.connect() as conn:
|
| 50 |
-
existing_df = pd.read_sql("SELECT id FROM
|
| 51 |
|
| 52 |
allowed_ids = set(projects_df["Id"].tolist())
|
| 53 |
processed_ids = set(existing_df["id"].tolist())
|
|
@@ -62,9 +66,9 @@ def sync_projects():
|
|
| 62 |
|
| 63 |
with engine.begin() as conn:
|
| 64 |
conn.exec_driver_sql(
|
| 65 |
-
f"DELETE FROM
|
| 66 |
)
|
| 67 |
-
logger.info(f"Successfully removed {len(ids_to_remove)} projects from '
|
| 68 |
changed = True
|
| 69 |
|
| 70 |
|
|
@@ -118,7 +122,7 @@ def sync_projects():
|
|
| 118 |
logger.info("Uploading preprocessed records to database...")
|
| 119 |
with engine.begin() as conn:
|
| 120 |
processed_df.to_sql(
|
| 121 |
-
"
|
| 122 |
conn,
|
| 123 |
if_exists="append",
|
| 124 |
index=False
|
|
@@ -137,3 +141,6 @@ def sync_projects():
|
|
| 137 |
logger.info("Local embeddings and index updated successfully.")
|
| 138 |
else:
|
| 139 |
logger.info("No database changes detected. Embeddings remain in sync.")
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import logging
|
| 3 |
import sys
|
| 4 |
import pandas as pd
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
# Ensure workspace root is in path
|
| 8 |
+
sys.path.append(str(Path(__file__).resolve().parents[2]))
|
| 9 |
|
| 10 |
from Data.database.sql_connector import engine
|
| 11 |
from src.similarity_model.preprocessing import preprocess_dataset
|
|
|
|
| 49 |
logger.info(f"Loaded {len(projects_df)} active projects from database.")
|
| 50 |
|
| 51 |
|
| 52 |
+
logger.info("Fetching existing records from 'preprocess'...")
|
| 53 |
with engine.connect() as conn:
|
| 54 |
+
existing_df = pd.read_sql("SELECT id FROM preprocess", conn)
|
| 55 |
|
| 56 |
allowed_ids = set(projects_df["Id"].tolist())
|
| 57 |
processed_ids = set(existing_df["id"].tolist())
|
|
|
|
| 66 |
|
| 67 |
with engine.begin() as conn:
|
| 68 |
conn.exec_driver_sql(
|
| 69 |
+
f"DELETE FROM preprocess WHERE id IN ({ids_str})"
|
| 70 |
)
|
| 71 |
+
logger.info(f"Successfully removed {len(ids_to_remove)} projects from 'preprocess'.")
|
| 72 |
changed = True
|
| 73 |
|
| 74 |
|
|
|
|
| 122 |
logger.info("Uploading preprocessed records to database...")
|
| 123 |
with engine.begin() as conn:
|
| 124 |
processed_df.to_sql(
|
| 125 |
+
"preprocess",
|
| 126 |
conn,
|
| 127 |
if_exists="append",
|
| 128 |
index=False
|
|
|
|
| 141 |
logger.info("Local embeddings and index updated successfully.")
|
| 142 |
else:
|
| 143 |
logger.info("No database changes detected. Embeddings remain in sync.")
|
| 144 |
+
|
| 145 |
+
if __name__ == "__main__":
|
| 146 |
+
sync_projects()
|