bat-6 commited on
Commit
d2f9ca7
·
1 Parent(s): 692f58b

feat: restore and update project database sync script

Browse files
src/similarity_model/__init__.py CHANGED
@@ -36,4 +36,8 @@ from .embedding_engine import (
36
 
37
  from .similarity_engine import (
38
  find_similar_projects
 
 
 
 
39
  )
 
36
 
37
  from .similarity_engine import (
38
  find_similar_projects
39
+ )
40
+
41
+ from .sync_projects import (
42
+ sync_projects
43
  )
src/similarity_model/sync_projects.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import sys
4
+ import pandas as pd
5
+
6
+ from Data.database.sql_connector import engine
7
+ from src.similarity_model.preprocessing import preprocess_dataset
8
+ from src.similarity_model.embedding_engine import train_embedding_engine
9
+
10
+ # =====================================================
11
+ # Logging
12
+ # =====================================================
13
+ logging.basicConfig(
14
+ level=logging.INFO,
15
+ format="%(asctime)s | %(levelname)s | %(message)s"
16
+ )
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ def sync_projects():
21
+ logger.info("Initializing project synchronization service...")
22
+
23
+ # Verify database connection first
24
+ try:
25
+ # Since engine is a LazyEngine, calling connect() tests connection
26
+ with engine.connect() as conn:
27
+ logger.info("Database connection verified successfully.")
28
+ except Exception as exc:
29
+ logger.error(
30
+ "Unable to connect to the SQL database. Please ensure you are connected "
31
+ "to the university network / VPN and that your IP is whitelisted. Error: %s",
32
+ exc
33
+ )
34
+ sys.exit(1)
35
+
36
+ # 1. Fetch raw active projects
37
+ projects_query = """
38
+ SELECT *
39
+ FROM Projects
40
+ WHERE Status IN (
41
+ 'Completed',
42
+ 'UnderReview',
43
+ 'In_Progress'
44
+ )
45
+ """
46
+ logger.info("Fetching raw active projects from 'Projects' table...")
47
+ projects_df = pd.read_sql(projects_query, engine)
48
+ logger.info(f"Loaded {len(projects_df)} active projects from database.")
49
+
50
+ # 2. Fetch existing preprocessed projects
51
+ logger.info("Fetching existing records from 'PreProcessed_Projects'...")
52
+ existing_df = pd.read_sql("SELECT id FROM PreProcessed_Projects", engine)
53
+
54
+ allowed_ids = set(projects_df["Id"].tolist())
55
+ processed_ids = set(existing_df["id"].tolist())
56
+
57
+ changed = False
58
+
59
+ # 3. Remove projects no longer active or allowed
60
+ ids_to_remove = processed_ids - allowed_ids
61
+ if ids_to_remove:
62
+ logger.info(f"Found {len(ids_to_remove)} projects to remove (status changed or deleted).")
63
+ ids_str = ",".join(map(str, ids_to_remove))
64
+
65
+ with engine.begin() as conn:
66
+ conn.exec_driver_sql(
67
+ f"DELETE FROM PreProcessed_Projects WHERE id IN ({ids_str})"
68
+ )
69
+ logger.info(f"Successfully removed {len(ids_to_remove)} projects from 'PreProcessed_Projects'.")
70
+ changed = True
71
+
72
+ # 4. Filter for new projects
73
+ new_projects = projects_df[~projects_df["Id"].isin(processed_ids)].copy()
74
+
75
+ if len(new_projects) > 0:
76
+ logger.info(f"Found {len(new_projects)} new projects to preprocess and insert.")
77
+
78
+ # Run preprocessing (cleaning, tokenization, feature extraction)
79
+ processed_df = preprocess_dataset(new_projects)
80
+
81
+ if len(processed_df) > 0:
82
+ # Map columns to target schema
83
+ cols_to_keep = [
84
+ "id",
85
+ "submittedat",
86
+ "project_title",
87
+ "studentnames",
88
+ "year",
89
+ "abstract",
90
+ "description",
91
+ "problemstatement",
92
+ "proposedsolution",
93
+ "objectives",
94
+ "full_content",
95
+ "clean_text",
96
+ "word_count",
97
+ "features"
98
+ ]
99
+
100
+ # Ensure columns exist before filtering
101
+ for col in cols_to_keep:
102
+ if col not in processed_df.columns:
103
+ processed_df[col] = ""
104
+
105
+ processed_df = processed_df[cols_to_keep]
106
+
107
+ processed_df = processed_df.rename(
108
+ columns={
109
+ "submittedat": "submitted_at",
110
+ "studentnames": "student_names",
111
+ "problemstatement": "problem_statement",
112
+ "proposedsolution": "proposed_solution"
113
+ }
114
+ )
115
+
116
+ # Stringify feature lists for SQL insertion
117
+ processed_df["features"] = processed_df["features"].apply(json.dumps)
118
+
119
+ # Insert into database
120
+ logger.info("Uploading preprocessed records to database...")
121
+ processed_df.to_sql(
122
+ "PreProcessed_Projects",
123
+ engine,
124
+ if_exists="append",
125
+ index=False
126
+ )
127
+ logger.info(f"Successfully processed and inserted {len(processed_df)} projects.")
128
+ changed = True
129
+ else:
130
+ logger.warning("No new projects remained after preprocessing filters.")
131
+ else:
132
+ logger.info("No new projects found.")
133
+
134
+ # 5. Rebuild local FAISS index & embeddings if anything changed
135
+ if changed:
136
+ logger.info("Changes detected. Rebuilding local embeddings and FAISS index...")
137
+ train_embedding_engine()
138
+ logger.info("Local embeddings and index updated successfully.")
139
+ else:
140
+ logger.info("No database changes detected. Embeddings remain in sync.")
141
+
142
+
143
+ if __name__ == "__main__":
144
+ sync_projects()