Spaces:

InnoTrack
/

Graduation_Project-v1.2

Sleeping

App Files Files Community

bat-6 committed on 20 days ago

Commit

ac751b2

1 Parent(s): b4f49d4

feat: implement database synchronization queue with SQL triggers and a background worker for automated FAISS index rebuilding.

Browse files

Files changed (8) hide show

Data/database/__pycache__/sql_connector.cpython-313.pyc +0 -0
Data/database/create_sync_queue.sql +28 -0
Data/database/create_triggers.sql +102 -0
src/similarity_model/__pycache__/__init__.cpython-313.pyc +0 -0
src/similarity_model/__pycache__/semantic_search.cpython-313.pyc +0 -0
src/similarity_model/semantic_search.py +33 -14
src/similarity_model/sync_projects.py +5 -134
src/similarity_model/sync_worker.py +221 -0

Data/database/__pycache__/sql_connector.cpython-313.pyc CHANGED Viewed

Binary files a/Data/database/__pycache__/sql_connector.cpython-313.pyc and b/Data/database/__pycache__/sql_connector.cpython-313.pyc differ

Data/database/create_sync_queue.sql ADDED Viewed

	@@ -0,0 +1,28 @@

+-- SQL Server DDL for SyncQueue table
+-- Path: Data/database/create_sync_queue.sql
+-- Idempotent: the table and its index are created only when no user table
+-- named SyncQueue exists yet; otherwise the script just prints a notice.
+IF OBJECT_ID('SyncQueue', 'U') IS NULL
+BEGIN
+    CREATE TABLE SyncQueue (
+        QueueId INT IDENTITY(1,1) PRIMARY KEY,
+        ProjectId INT NOT NULL,
+        OperationType VARCHAR(10) NOT NULL,  -- 'UPSERT' or 'DELETE' (see CHECK below)
+        Processed BIT NOT NULL DEFAULT 0,    -- 0 = pending, 1 = finished or cancelled
+        CreatedAt DATETIME2 NOT NULL DEFAULT SYSUTCDATETIME(),  -- UTC enqueue time
+        ProcessedAt DATETIME2 NULL,          -- NULL while the row is still pending
+        RetryCount INT NOT NULL DEFAULT 0,
+        ErrorMessage NVARCHAR(MAX) NULL,
+        CONSTRAINT CHK_SyncQueue_OperationType CHECK (OperationType IN ('UPSERT', 'DELETE'))
+    );
+    -- Index to optimize querying of unprocessed items in chronological order
+    CREATE INDEX IX_SyncQueue_Unprocessed
+    ON SyncQueue (Processed, CreatedAt)
+    INCLUDE (ProjectId, OperationType, RetryCount);
+    PRINT 'SyncQueue table and index IX_SyncQueue_Unprocessed created successfully.';
+END
+ELSE
+BEGIN
+    PRINT 'SyncQueue table already exists.';
+END

Data/database/create_triggers.sql ADDED Viewed

	@@ -0,0 +1,102 @@

+-- SQL Server Triggers for Projects Table
+-- Path: Data/database/create_triggers.sql
+-- 1. INSERT Trigger
+-- Queues one UPSERT per newly inserted project whose Status is eligible,
+-- unless an unprocessed UPSERT for that project is already queued.
+IF OBJECT_ID('trg_Projects_Insert', 'TR') IS NOT NULL
+    DROP TRIGGER trg_Projects_Insert;
+GO
+CREATE TRIGGER trg_Projects_Insert
+ON Projects
+AFTER INSERT
+AS
+BEGIN
+    SET NOCOUNT ON;
+    INSERT INTO SyncQueue (ProjectId, OperationType, Processed, CreatedAt, RetryCount)
+    SELECT i.Id, 'UPSERT', 0, SYSUTCDATETIME(), 0
+    FROM inserted AS i
+    WHERE i.Status IN ('Completed', 'UnderReview', 'In_Progress')
+      AND NOT EXISTS (
+          SELECT 1
+          FROM SyncQueue AS q
+          WHERE q.ProjectId = i.Id
+            AND q.OperationType = 'UPSERT'
+            AND q.Processed = 0
+      );
+END;
+GO
+-- 2. DELETE Trigger
+-- Keeps SyncQueue consistent when rows are removed from Projects:
+--   * pending UPSERTs for the removed projects are cancelled, and
+--   * a DELETE is queued for every removed project that was eligible.
+IF OBJECT_ID('trg_Projects_Delete', 'TR') IS NOT NULL
+    DROP TRIGGER trg_Projects_Delete;
+GO
+CREATE TRIGGER trg_Projects_Delete
+ON Projects
+AFTER DELETE
+AS
+BEGIN
+    SET NOCOUNT ON;
+    -- Cancel pending UPSERT operations only. A pending DELETE must stay queued:
+    -- it may have been enqueued when the project became ineligible, and since
+    -- such a project no longer matches the Status filter below, cancelling it
+    -- here would leave its row in 'preprocess' with nothing left to remove it.
+    UPDATE SyncQueue
+    SET Processed = 1,
+        ProcessedAt = SYSUTCDATETIME(),
+        ErrorMessage = 'Superseded by DELETE operation'
+    WHERE ProjectId IN (SELECT Id FROM deleted)
+      AND Processed = 0
+      AND OperationType = 'UPSERT';
+    -- Enqueue DELETE operation for previously eligible deleted projects,
+    -- skipping projects that already have an unprocessed DELETE queued.
+    INSERT INTO SyncQueue (ProjectId, OperationType, Processed, CreatedAt, RetryCount)
+    SELECT d.Id, 'DELETE', 0, SYSUTCDATETIME(), 0
+    FROM deleted d
+    WHERE d.Status IN ('Completed', 'UnderReview', 'In_Progress')
+      AND NOT EXISTS (
+          SELECT 1 FROM SyncQueue q
+          WHERE q.ProjectId = d.Id AND q.Processed = 0 AND q.OperationType = 'DELETE'
+      );
+END;
+GO
+-- 3. UPDATE Trigger
+-- Runs after every UPDATE on Projects, whichever columns were changed.
+-- Statement order matters: pending rows are cancelled before the DELETE is
+-- enqueued, so the NOT EXISTS check at the end sees the cancelled state.
+IF OBJECT_ID('trg_Projects_Update', 'TR') IS NOT NULL
+    DROP TRIGGER trg_Projects_Update;
+GO
+CREATE TRIGGER trg_Projects_Update
+ON Projects
+AFTER UPDATE
+AS
+BEGIN
+    SET NOCOUNT ON;
+    -- Case A: Project remains eligible or becomes eligible (Status in Completed, UnderReview, In_Progress)
+    -- Enqueue an UPSERT operation (if not already pending unprocessed UPSERT)
+    INSERT INTO SyncQueue (ProjectId, OperationType, Processed, CreatedAt, RetryCount)
+    SELECT i.Id, 'UPSERT', 0, SYSUTCDATETIME(), 0
+    FROM inserted i
+    WHERE i.Status IN ('Completed', 'UnderReview', 'In_Progress')
+      AND NOT EXISTS (
+          SELECT 1 FROM SyncQueue q
+          WHERE q.ProjectId = i.Id AND q.Processed = 0 AND q.OperationType = 'UPSERT'
+      );
+    -- Case B: Project transitions from eligible status to ineligible status
+    -- Cancel every pending unprocessed operation for the project. There is no
+    -- OperationType filter, so a pending DELETE is cancelled here as well and
+    -- then replaced by the fresh DELETE enqueued below.
+    UPDATE q
+    SET q.Processed = 1,
+        q.ProcessedAt = SYSUTCDATETIME(),
+        q.ErrorMessage = 'Superseded by transition to Ineligible status'
+    FROM SyncQueue q
+    JOIN inserted i ON q.ProjectId = i.Id
+    JOIN deleted d ON i.Id = d.Id
+    WHERE q.Processed = 0
+      AND d.Status IN ('Completed', 'UnderReview', 'In_Progress')
+      AND i.Status NOT IN ('Completed', 'UnderReview', 'In_Progress');
+    -- Enqueue a DELETE operation to remove it from preprocess/embeddings
+    INSERT INTO SyncQueue (ProjectId, OperationType, Processed, CreatedAt, RetryCount)
+    SELECT i.Id, 'DELETE', 0, SYSUTCDATETIME(), 0
+    FROM inserted i
+    JOIN deleted d ON i.Id = d.Id
+    WHERE d.Status IN ('Completed', 'UnderReview', 'In_Progress')
+      AND i.Status NOT IN ('Completed', 'UnderReview', 'In_Progress')
+      AND NOT EXISTS (
+          SELECT 1 FROM SyncQueue q
+          WHERE q.ProjectId = i.Id AND q.Processed = 0 AND q.OperationType = 'DELETE'
+      );
+END;
+GO

src/similarity_model/__pycache__/__init__.cpython-313.pyc CHANGED Viewed

Binary files a/src/similarity_model/__pycache__/__init__.cpython-313.pyc and b/src/similarity_model/__pycache__/__init__.cpython-313.pyc differ

src/similarity_model/__pycache__/semantic_search.cpython-313.pyc CHANGED Viewed

Binary files a/src/similarity_model/__pycache__/semantic_search.cpython-313.pyc and b/src/similarity_model/__pycache__/semantic_search.cpython-313.pyc differ

src/similarity_model/semantic_search.py CHANGED Viewed

@@ -54,37 +54,56 @@ def tokenize(text: str) -> set:
     """
     return set(normalize_text(text).split())
 @lru_cache(maxsize=1)
 def load_model():
     logger.info(f"Loading model: {DEFAULT_MODEL}")
     return SentenceTransformer(DEFAULT_MODEL)
-@lru_cache(maxsize=1)
 def load_faiss_index():
     if not INDEX_PATH.exists():
         raise FileNotFoundError("FAISS index not found.")
-    logger.info("Loading FAISS index...")
-    return faiss.read_index(str(INDEX_PATH))
-@lru_cache(maxsize=1)
 def load_metadata():
-    logger.info(
-        "Loading metadata from Azure SQL..."
-    )
-    df = load_preprocessed_projects()
-    return df.reset_index(drop=True)
-@lru_cache(maxsize=1)
 def load_embeddings():
     if not EMBED_PATH.exists():
         raise FileNotFoundError("Embeddings not found.")
-    logger.info("Loading embeddings...")
-    return np.load(str(EMBED_PATH))
 def build_results(
     df: pd.DataFrame,

     """
     return set(normalize_text(text).split())
+import os
+# Module-level caches for the loaders below. Each cached object is stored with
+# the file mtime observed when it was loaded; a loader reloads its object when
+# the mtime it reads differs from the stored one.
+_cached_faiss_index = None
+_cached_faiss_index_mtime = None
+_cached_metadata = None
+_cached_metadata_mtime = None
+_cached_embeddings = None
+_cached_embeddings_mtime = None
 @lru_cache(maxsize=1)
 def load_model():
+    """Return the SentenceTransformer for DEFAULT_MODEL; lru_cache makes repeat calls reuse it."""
     logger.info(f"Loading model: {DEFAULT_MODEL}")
     return SentenceTransformer(DEFAULT_MODEL)
 def load_faiss_index():
+    """Return the FAISS index read from INDEX_PATH, reloading it when the file's mtime changes.
+
+    Raises:
+        FileNotFoundError: if INDEX_PATH does not exist.
+    """
+    global _cached_faiss_index, _cached_faiss_index_mtime
     if not INDEX_PATH.exists():
         raise FileNotFoundError("FAISS index not found.")
+    mtime = os.path.getmtime(INDEX_PATH)
+    # Reload on the first call and whenever the file's mtime differs from the cached one.
+    if _cached_faiss_index is None or _cached_faiss_index_mtime != mtime:
+        logger.info(f"Loading FAISS index from {INDEX_PATH} (mtime: {mtime})...")
+        _cached_faiss_index = faiss.read_index(str(INDEX_PATH))
+        _cached_faiss_index_mtime = mtime
+    return _cached_faiss_index
 def load_metadata():
+    """Return the preprocessed-projects DataFrame, re-queried when the FAISS index file changes.
+
+    The cache is keyed on the mtime of INDEX_PATH, not on anything read from
+    the database: the query result is reused until the index file's mtime
+    differs from the one recorded at the last load.
+    NOTE(review): this presumably keeps metadata rows aligned with the index
+    rows - confirm that the index is rebuilt from the same table contents.
+
+    Raises:
+        FileNotFoundError: if INDEX_PATH does not exist.
+    """
+    global _cached_metadata, _cached_metadata_mtime
+    if not INDEX_PATH.exists():
+        raise FileNotFoundError("FAISS index not found for metadata alignment.")
+    mtime = os.path.getmtime(INDEX_PATH)
+    if _cached_metadata is None or _cached_metadata_mtime != mtime:
+        logger.info(f"Loading metadata from Azure SQL (syncing with FAISS index mtime: {mtime})...")
+        df = load_preprocessed_projects()
+        # Positional 0..n-1 index, dropping whatever index the loader returned.
+        _cached_metadata = df.reset_index(drop=True)
+        _cached_metadata_mtime = mtime
+    return _cached_metadata
 def load_embeddings():
+    """Return the array stored at EMBED_PATH, reloading it when the file's mtime changes.
+
+    Raises:
+        FileNotFoundError: if EMBED_PATH does not exist.
+    """
+    global _cached_embeddings, _cached_embeddings_mtime
     if not EMBED_PATH.exists():
         raise FileNotFoundError("Embeddings not found.")
+    mtime = os.path.getmtime(EMBED_PATH)
+    # Reload on the first call and whenever the file's mtime differs from the cached one.
+    if _cached_embeddings is None or _cached_embeddings_mtime != mtime:
+        logger.info(f"Loading embeddings from {EMBED_PATH} (mtime: {mtime})...")
+        _cached_embeddings = np.load(str(EMBED_PATH))
+        _cached_embeddings_mtime = mtime
+    return _cached_embeddings
 def build_results(
     df: pd.DataFrame,

src/similarity_model/sync_projects.py CHANGED Viewed

@@ -1,146 +1,17 @@
-import json
-import logging
 import sys
-import pandas as pd
 from pathlib import Path
 # Ensure workspace root is in path
 sys.path.append(str(Path(__file__).resolve().parents[2]))
-from Data.database.sql_connector import engine
-from src.similarity_model.preprocessing import preprocess_dataset
-from src.similarity_model.embedding_engine import train_embedding_engine
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s | %(levelname)s | %(message)s"
-)
-logger = logging.getLogger(__name__)
 def sync_projects():
-    logger.info("Initializing project synchronization service...")
-    try:
-        with engine.connect() as conn:
-            logger.info("Database connection verified successfully.")
-    except Exception as exc:
-        logger.error(
-            "Unable to connect to the SQL database. Please ensure you are connected "
-            "to the university network / VPN and that your IP is whitelisted. Error: %s",
-            exc
-        )
-        sys.exit(1)
-    projects_query = """
-    SELECT *
-    FROM Projects
-    WHERE Status IN (
-        'Completed',
-        'UnderReview',
-        'In_Progress'
-    )
     """
-    logger.info("Fetching raw active projects from 'Projects' table...")
-    with engine.connect() as conn:
-        projects_df = pd.read_sql(projects_query, conn)
-    logger.info(f"Loaded {len(projects_df)} active projects from database.")
-    logger.info("Fetching existing records from 'preprocess'...")
-    with engine.connect() as conn:
-        existing_df = pd.read_sql("SELECT id FROM preprocess", conn)
-    allowed_ids = set(projects_df["Id"].tolist())
-    processed_ids = set(existing_df["id"].tolist())
-    changed = False
-    ids_to_remove = processed_ids - allowed_ids
-    if ids_to_remove:
-        logger.info(f"Found {len(ids_to_remove)} projects to remove (status changed or deleted).")
-        ids_str = ",".join(map(str, ids_to_remove))
-        with engine.begin() as conn:
-            conn.exec_driver_sql(
-                f"DELETE FROM preprocess WHERE id IN ({ids_str})"
-            )
-        logger.info(f"Successfully removed {len(ids_to_remove)} projects from 'preprocess'.")
-        changed = True
-    new_projects = projects_df[~projects_df["Id"].isin(processed_ids)].copy()
-    if len(new_projects) > 0:
-        logger.info(f"Found {len(new_projects)} new projects to preprocess and insert.")
-        processed_df = preprocess_dataset(new_projects)
-        if len(processed_df) > 0:
-            cols_to_keep = [
-                "id",
-                "submittedat",
-                "project_title",
-                "studentnames",
-                "year",
-                "abstract",
-                "description",
-                "problemstatement",
-                "proposedsolution",
-                "objectives",
-                "full_content",
-                "clean_text",
-                "word_count",
-                "features"
-            ]
-            for col in cols_to_keep:
-                if col not in processed_df.columns:
-                    processed_df[col] = ""
-            processed_df = processed_df[cols_to_keep]
-            processed_df = processed_df.rename(
-                columns={
-                    "submittedat": "submitted_at",
-                    "studentnames": "student_names",
-                    "problemstatement": "problem_statement",
-                    "proposedsolution": "proposed_solution"
-                }
-            )
-            processed_df["features"] = processed_df["features"].apply(json.dumps)
-            logger.info("Uploading preprocessed records to database...")
-            with engine.begin() as conn:
-                processed_df.to_sql(
-                    "preprocess",
-                    conn,
-                    if_exists="append",
-                    index=False
-                )
-            logger.info(f"Successfully processed and inserted {len(processed_df)} projects.")
-            changed = True
-        else:
-            logger.warning("No new projects remained after preprocessing filters.")
-    else:
-        logger.info("No new projects found.")
-    if changed:
-        logger.info("Changes detected. Rebuilding local embeddings and FAISS index...")
-        train_embedding_engine()
-        logger.info("Local embeddings and index updated successfully.")
-    else:
-        logger.info("No database changes detected. Embeddings remain in sync.")
 if __name__ == "__main__":
     sync_projects()

 import sys
 from pathlib import Path
 # Ensure workspace root is in path
 sys.path.append(str(Path(__file__).resolve().parents[2]))
+from src.similarity_model.sync_worker import run_worker
 def sync_projects():
     """
+    Entry point that starts the SyncQueue-based synchronization worker.
+    Delegates to run_worker() instead of performing a synchronization pass in this module.
+    """
+    run_worker()
 if __name__ == "__main__":
     sync_projects()

src/similarity_model/sync_worker.py ADDED Viewed

	@@ -0,0 +1,221 @@

+import os
+import sys
+import json
+import time
+import logging
+import pandas as pd
+from pathlib import Path
+from sqlalchemy import text
+# Ensure workspace root is in path
+sys.path.append(str(Path(__file__).resolve().parents[2]))
+from Data.database.sql_connector import engine
+from src.similarity_model.preprocessing import preprocess_dataset
+from src.similarity_model.embedding_engine import train_embedding_engine
+# Setup logging
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s"
+)
+logger = logging.getLogger("SyncWorker")
+# Settings
+BATCH_SIZE = 10  # maximum number of queue rows fetched per polling query
+MAX_RETRIES = 3  # a queue row is marked processed once its RetryCount reaches this value
+POLL_INTERVAL = 5  # seconds slept when the queue is empty or the worker loop raised
+REBUILD_THRESHOLD = 5  # Rebuild FAISS index after 5 database changes
+REBUILD_COOLDOWN = 60  # Or after 60 seconds if changes exist but threshold not met
+class RebuildManager:
+    """Decide when recorded changes warrant a FAISS index rebuild.
+
+    A rebuild is due when changes are pending and either at least
+    ``rebuild_threshold`` changes have accumulated, or ``rebuild_cooldown``
+    seconds have passed since the last successful rebuild (or construction).
+    After a failed rebuild the next attempt is delayed by ``rebuild_cooldown``
+    seconds, so a persistently failing rebuild is not retried on every poll.
+    """
+    def __init__(self, rebuild_threshold=REBUILD_THRESHOLD, rebuild_cooldown=REBUILD_COOLDOWN):
+        self.rebuild_threshold = rebuild_threshold
+        self.rebuild_cooldown = rebuild_cooldown
+        self.accumulated_changes = 0
+        # Monotonic clock: interval math must not be affected by wall-clock adjustments.
+        self.last_rebuild_time = time.monotonic()
+        self.pending_rebuild = False
+        # Earliest monotonic time for the next attempt after a failure; None = no backoff active.
+        self._retry_not_before = None
+    def record_change(self):
+        """Register one change that the next rebuild has to pick up."""
+        self.accumulated_changes += 1
+        self.pending_rebuild = True
+    def check_and_rebuild(self):
+        """Rebuild the index if one is due.
+
+        Returns:
+            True if a rebuild ran and succeeded; False if nothing was pending,
+            no rebuild was due yet, a failure backoff is active, or the
+            rebuild raised (the error is logged, not propagated).
+        """
+        if not self.pending_rebuild:
+            return False
+        now = time.monotonic()
+        if self._retry_not_before is not None and now < self._retry_not_before:
+            return False
+        time_elapsed = now - self.last_rebuild_time
+        # Trigger rebuild if we hit the change threshold, OR if the cooldown has passed
+        if self.accumulated_changes < self.rebuild_threshold and time_elapsed < self.rebuild_cooldown:
+            return False
+        logger.info(
+            f"Triggering FAISS index rebuild. "
+            f"Accumulated changes: {self.accumulated_changes}, time elapsed: {time_elapsed:.1f}s"
+        )
+        try:
+            train_embedding_engine()
+        except Exception as e:
+            logger.error(f"Failed to rebuild FAISS index: {e}", exc_info=True)
+            # Keep the changes pending, but wait one cooldown before trying again.
+            self._retry_not_before = time.monotonic() + self.rebuild_cooldown
+            return False
+        self.accumulated_changes = 0
+        # Stamp the completion time so a slow rebuild does not consume the next cooldown.
+        self.last_rebuild_time = time.monotonic()
+        self.pending_rebuild = False
+        self._retry_not_before = None
+        logger.info("FAISS index rebuild completed successfully.")
+        return True
+# Project statuses that qualify a project for the 'preprocess' table.
+_ELIGIBLE_STATUSES = ("Completed", "UnderReview", "In_Progress")
+# Columns kept from the preprocessing output before it is written to 'preprocess'.
+_PREPROCESS_COLUMNS = [
+    "id", "submittedat", "project_title", "studentnames", "year",
+    "abstract", "description", "problemstatement", "proposedsolution",
+    "objectives", "full_content", "clean_text", "word_count", "features"
+]
+# Preprocessing output name -> column name used when writing to 'preprocess'.
+_PREPROCESS_RENAMES = {
+    "submittedat": "submitted_at",
+    "studentnames": "student_names",
+    "problemstatement": "problem_statement",
+    "proposedsolution": "proposed_solution"
+}
+def _delete_preprocess_row(conn, project_id) -> bool:
+    """Delete the project's row from 'preprocess'; return False only when no row was removed."""
+    result = conn.execute(text("DELETE FROM preprocess WHERE id = :id"), {"id": project_id})
+    # rowcount is -1 when the driver cannot report it, so only an explicit 0
+    # is treated as "nothing changed".
+    return result.rowcount != 0
+def _is_eligible(project_df) -> bool:
+    """Return True when the fetched project exists and has an eligible status."""
+    if project_df.empty:
+        return False
+    first_row = project_df.iloc[0]
+    # Support case-insensitive key retrieval
+    status = first_row.get("Status") or first_row.get("status")
+    return status in _ELIGIBLE_STATUSES
+def _to_preprocess_frame(processed_df):
+    """Shape the preprocessing output into the columns written to 'preprocess'."""
+    for col in _PREPROCESS_COLUMNS:
+        if col not in processed_df.columns:
+            processed_df[col] = ""
+    frame = processed_df[_PREPROCESS_COLUMNS].rename(columns=_PREPROCESS_RENAMES)
+    frame["features"] = frame["features"].apply(json.dumps)
+    return frame
+def _apply_operation(conn, project_id, operation_type) -> bool:
+    """Apply one queue operation inside the caller's transaction; return True if 'preprocess' changed."""
+    if operation_type == 'DELETE':
+        logger.info(f"Removing Project {project_id} from 'preprocess' table...")
+        return _delete_preprocess_row(conn, project_id)
+    if operation_type != 'UPSERT':
+        logger.warning(f"Unknown operation type {operation_type!r} for Project {project_id}. Nothing applied.")
+        return False
+    # Re-read the project: its state may have changed since the row was queued.
+    project_df = pd.read_sql(
+        text("SELECT * FROM Projects WHERE Id = :project_id"),
+        conn,
+        params={"project_id": project_id}
+    )
+    if not _is_eligible(project_df):
+        logger.info(f"Project {project_id} is ineligible or deleted. Removing from 'preprocess' table.")
+        return _delete_preprocess_row(conn, project_id)
+    logger.info(f"Preprocessing eligible project {project_id}...")
+    processed_df = preprocess_dataset(project_df)
+    if processed_df.empty:
+        logger.info(f"Project {project_id} filtered out by preprocessing. Removing from 'preprocess' table.")
+        return _delete_preprocess_row(conn, project_id)
+    # Upsert behavior: delete existing first, then append
+    frame = _to_preprocess_frame(processed_df)
+    _delete_preprocess_row(conn, project_id)
+    frame.to_sql("preprocess", conn, if_exists="append", index=False)
+    logger.info(f"Successfully preprocessed and inserted Project {project_id} into 'preprocess'.")
+    return True
+def process_single_item(engine, item) -> bool:
+    """Process one SyncQueue row in its own transaction.
+
+    The queue row is re-selected with UPDLOCK/HOLDLOCK, the operation is applied
+    to 'preprocess', and the row is marked processed; all three happen in one
+    transaction. On any exception that transaction is rolled back and, in a
+    separate transaction, RetryCount is incremented and the error text stored;
+    the row is marked processed once RetryCount reaches MAX_RETRIES.
+
+    Args:
+        engine: SQLAlchemy engine used to open the transactions.
+        item: mapping with the keys "QueueId", "ProjectId" and "OperationType".
+
+    Returns:
+        True if the 'preprocess' table was modified. False if the row had
+        already been processed, the operation removed nothing, or an error
+        occurred (errors are logged, not raised).
+    """
+    queue_id = item["QueueId"]
+    project_id = item["ProjectId"]
+    operation_type = item["OperationType"]
+    try:
+        # Start transaction for project processing
+        with engine.begin() as conn:
+            # Re-verify queue item is still unprocessed and lock it
+            row = conn.execute(text("""
+                SELECT QueueId FROM SyncQueue WITH (UPDLOCK, HOLDLOCK)
+                WHERE QueueId = :queue_id AND Processed = 0
+            """), {"queue_id": queue_id}).fetchone()
+            if not row:
+                logger.info(f"Queue item {queue_id} already processed by another worker. Skipping.")
+                return False
+            changed = _apply_operation(conn, project_id, operation_type)
+            # Mark queue item as processed successfully
+            conn.execute(text("""
+                UPDATE SyncQueue
+                SET Processed = 1, ProcessedAt = SYSUTCDATETIME(), ErrorMessage = NULL
+                WHERE QueueId = :queue_id
+            """), {"queue_id": queue_id})
+        return changed
+    except Exception as e:
+        logger.error(f"Error processing queue item {queue_id} (Project {project_id}): {e}", exc_info=True)
+        # Log failure on SyncQueue in a separate transaction to avoid rollback
+        try:
+            with engine.begin() as error_conn:
+                error_conn.execute(text("""
+                    UPDATE SyncQueue
+                    SET RetryCount = RetryCount + 1,
+                        ErrorMessage = :error_msg,
+                        ProcessedAt = CASE WHEN RetryCount + 1 >= :max_retries THEN SYSUTCDATETIME() ELSE NULL END,
+                        Processed = CASE WHEN RetryCount + 1 >= :max_retries THEN 1 ELSE 0 END
+                    WHERE QueueId = :queue_id
+                """), {
+                    "queue_id": queue_id,
+                    "error_msg": str(e)[:4000],
+                    "max_retries": MAX_RETRIES
+                })
+        except Exception as queue_err:
+            logger.error(f"Failed to write error status for queue item {queue_id}: {queue_err}")
+        return False
+def run_worker():
+    """Run the synchronization worker loop; does not return under normal operation.
+
+    Verifies the database connection (exits the process with status 1 if that
+    fails), then repeatedly fetches up to BATCH_SIZE unprocessed SyncQueue rows,
+    processes them one by one and lets the RebuildManager decide whether the
+    FAISS index has to be rebuilt. Sleeps POLL_INTERVAL seconds when the queue
+    is empty or when an iteration raised.
+    """
+    logger.info("Initializing Sync Worker service...")
+    # Verify DB connection
+    try:
+        with engine.connect() as conn:
+            conn.execute(text("SELECT 1"))
+        logger.info("Database connection verified successfully.")
+    except Exception as exc:
+        logger.critical(f"Database connection failed: {exc}")
+        sys.exit(1)
+    rebuild_manager = RebuildManager()
+    logger.info("Sync Worker started successfully and polling...")
+    while True:
+        try:
+            # Fetch batch of unprocessed items.
+            # QueueId breaks CreatedAt ties: operations queued for the same
+            # project within one clock tick must still run in enqueue order,
+            # otherwise a DELETE could be applied after the UPSERT that followed it.
+            with engine.connect() as conn:
+                result = conn.execute(text("""
+                    SELECT TOP (:batch_size) QueueId, ProjectId, OperationType, RetryCount
+                    FROM SyncQueue WITH (UPDLOCK, READPAST)
+                    WHERE Processed = 0 AND RetryCount < :max_retries
+                    ORDER BY CreatedAt ASC, QueueId ASC
+                """), {"batch_size": BATCH_SIZE, "max_retries": MAX_RETRIES})
+                batch = result.mappings().all()
+            if not batch:
+                # Idle period: check if we have any pending delayed rebuilds
+                rebuild_manager.check_and_rebuild()
+                time.sleep(POLL_INTERVAL)
+                continue
+            logger.info(f"Fetched {len(batch)} items from SyncQueue.")
+            for item in batch:
+                changed = process_single_item(engine, item)
+                if changed:
+                    rebuild_manager.record_change()
+            # Post-batch check for rebuilding FAISS index
+            rebuild_manager.check_and_rebuild()
+        except Exception as e:
+            logger.error(f"Error in Sync Worker loop: {e}", exc_info=True)
+            time.sleep(POLL_INTERVAL)
+if __name__ == "__main__":
+    run_worker()