File size: 8,963 Bytes
5e95de5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 | import json
import logging
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity
# Ensure workspace root is in path
sys.path.append(str(Path(__file__).resolve().parents[2]))
from Data.database.sql_connector import engine
from src.similarity_model.preprocessing import preprocess_dataset, normalize_text
from src.similarity_model.semantic_search import load_model
from src.similarity_model.feature_similarity import load_feature_model, compute_feature_similarity
from src.similarity_model.hybrid_ranker import compute_hybrid_score, compute_originality
# Configure root logging once for the script: INFO and above, with a
# "timestamp | level | message" layout.
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(message)s",
    level=logging.INFO,
)

# Module-scoped logger shared by the code in this file.
logger = logging.getLogger(__name__)
def _as_feature_list(value):
    """Return *value* when it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def _description_words(abstract, description):
    """Return the set of normalized words in a project's abstract + description."""
    combined = (str(abstract) + " " + str(description)).strip()
    return set(normalize_text(combined).split())


def _build_rich_texts(processed_df):
    """Build one normalized text per project from title, abstract, description and features."""
    rich_texts = []
    columns = zip(
        processed_df["project_title"],
        processed_df["abstract"],
        processed_df["description"],
        processed_df["features"],
    )
    for title, abstract, description, features in columns:
        raw_text = f"{str(title).strip()}. {str(abstract).strip()}. {str(description).strip()}"
        # str() guards against non-string entries inside the feature list.
        feature_text = " ".join(str(feat) for feat in _as_feature_list(features))
        rich_texts.append(normalize_text(f"{raw_text}. {feature_text}"))
    return rich_texts


def _compute_originality_scores(
    processed_df,
    sim_matrix,
    feature_model,
    max_candidates=50,
    copy_overlap_threshold=0.60,
    copy_hybrid_floor=0.95,
):
    """Return one originality score per row of *processed_df*.

    For every project, the ``max_candidates`` most semantically similar
    *other* projects are scored with ``compute_hybrid_score``; the highest
    hybrid score found is passed to ``compute_originality``.

    Args:
        processed_df: DataFrame with ``abstract``, ``description`` and
            ``features`` columns.
        sim_matrix: Square pairwise similarity matrix, row/column order
            matching ``processed_df``.
        feature_model: Model handed to ``compute_feature_similarity``.
        max_candidates: Number of top semantic matches examined per project.
        copy_overlap_threshold: Jaccard word overlap at or above which a pair
            is treated as a direct copy.
        copy_hybrid_floor: Minimum hybrid score assigned to a direct copy.

    Returns:
        A list with whatever ``compute_originality`` returns for each project.
    """
    # Per-project data is computed once instead of once per candidate pair.
    feature_lists = [_as_feature_list(value) for value in processed_df["features"]]
    word_sets = [
        _description_words(abstract, description)
        for abstract, description in zip(
            processed_df["abstract"], processed_df["description"]
        )
    ]

    originality_scores = []
    for i, current_features in enumerate(feature_lists):
        scores = sim_matrix[i]

        # Rank by descending similarity and drop the project itself. Merely
        # pushing the self-score to the bottom is not enough: with
        # max_candidates or fewer projects the self index would still be
        # selected and would always look like a perfect copy.
        ranked = np.argsort(scores)[::-1]
        top_indices = ranked[ranked != i][:max_candidates]

        query_words = word_sets[i]
        max_hybrid_score = 0.0
        best_unique_count = None  # unique-feature count of the best match so far

        for idx in top_indices:
            feat_result = compute_feature_similarity(
                current_features,
                feature_lists[idx],
                model=feature_model,
            )
            unique_query_count = len(feat_result["unique_a"])

            hybrid_score = compute_hybrid_score(
                semantic_score=float(scores[idx]),
                feature_score=feat_result["score"],
                coverage=feat_result["coverage"],
                feature_count=len(current_features),
                unique_query_count=unique_query_count,
            )

            # Direct copy-paste detection via Jaccard overlap of word sets.
            candidate_words = word_sets[idx]
            if query_words and candidate_words:
                overlap = len(query_words & candidate_words) / len(
                    query_words | candidate_words
                )
                if overlap >= copy_overlap_threshold:
                    # Raise to the floor, never lower an already higher score.
                    hybrid_score = max(hybrid_score, copy_hybrid_floor)

            if hybrid_score > max_hybrid_score:
                max_hybrid_score = hybrid_score
                best_unique_count = unique_query_count

        # Originality is based on the worst case (highest-similarity match).
        # The unique-feature count recorded for that match is reused rather
        # than calling compute_feature_similarity again with the same inputs.
        if best_unique_count is not None and current_features:
            orig_score = compute_originality(
                hybrid_score=max_hybrid_score,
                unique_query_features=best_unique_count,
                total_query_features=len(current_features),
            )
        else:
            orig_score = compute_originality(
                hybrid_score=max_hybrid_score,
                unique_query_features=0,
                total_query_features=0,
            )
        originality_scores.append(orig_score)

    return originality_scores


def run_sync_preprocess():
    """Pull active projects, preprocess them, score originality and persist the result.

    Steps:
        1. Verify that the database is reachable.
        2. Load rows from ``Projects`` whose ``Status`` is ``Completed``,
           ``UnderReview`` or ``In_Progress``.
        3. Run ``preprocess_dataset`` and stop early if nothing is left.
        4. Restrict/rename columns to the fixed output layout.
        5. Encode projects, compute pairwise similarity and an originality
           score per project.
        6. Write ``Data/processed/preprocessed.csv`` (relative to the current
           working directory).
        7-8. Serialize ``features`` to JSON and replace the ``preprocess``
           database table.

    Returns:
        None.

    Raises:
        SystemExit: With exit code 1 when the database connection check or
            the final table upload fails.
    """
    logger.info("Initializing Sync and Preprocess Service...")

    # 1. Verify DB connection
    try:
        with engine.connect():
            logger.info("Database connection verified successfully.")
    except Exception as exc:
        logger.error(f"Unable to connect to the SQL database. Error: {exc}")
        sys.exit(1)

    # 2. Fetch raw projects
    logger.info("Pulling active records from 'Projects' table...")
    projects_query = """
    SELECT *
    FROM Projects
    WHERE Status IN (
    'Completed',
    'UnderReview',
    'In_Progress'
    )
    """
    with engine.connect() as conn:
        raw_df = pd.read_sql(projects_query, conn)
    logger.info(f"Loaded {len(raw_df)} active projects from 'Projects' table.")

    # 3. Preprocess dataset
    logger.info("Preprocessing dataset...")
    processed_df = preprocess_dataset(raw_df)
    logger.info(f"Total projects after preprocessing filters: {len(processed_df)}")
    if len(processed_df) == 0:
        logger.warning("No projects left after preprocessing. Exiting.")
        return

    # 4. Standardize columns: keep a fixed set (creating missing ones as
    #    empty strings) and rename to snake_case.
    cols_to_keep = [
        "id",
        "submittedat",
        "project_title",
        "studentnames",
        "year",
        "abstract",
        "description",
        "problemstatement",
        "proposedsolution",
        "objectives",
        "full_content",
        "clean_text",
        "word_count",
        "features",
    ]
    for col in cols_to_keep:
        if col not in processed_df.columns:
            processed_df[col] = ""
    processed_df = processed_df[cols_to_keep]
    processed_df = processed_df.rename(
        columns={
            "submittedat": "submitted_at",
            "studentnames": "student_names",
            "problemstatement": "problem_statement",
            "proposedsolution": "proposed_solution",
        }
    )

    # 5. Calculate originality for each project
    logger.info("Calculating originality score for each project...")
    model = load_model()
    feature_model = load_feature_model()

    rich_texts = _build_rich_texts(processed_df)

    logger.info("Encoding projects to vector space...")
    embeddings = model.encode(
        rich_texts,
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=True,
    ).astype("float32")

    logger.info("Computing semantic similarity matrix...")
    sim_matrix = cosine_similarity(embeddings, embeddings)

    processed_df["originality"] = _compute_originality_scores(
        processed_df, sim_matrix, feature_model
    )
    logger.info("Finished calculating originality scores.")

    # 6. Save locally to preprocessed.csv in Data/processed/
    #    (features stay as readable lists in the CSV).
    local_dir = Path("Data") / "processed"
    local_dir.mkdir(parents=True, exist_ok=True)
    local_path = local_dir / "preprocessed.csv"
    processed_df.to_csv(local_path, index=False)
    logger.info(f"Successfully saved preprocessed projects locally to: {local_path}")

    # 7. Convert features list to JSON string for database push
    db_df = processed_df.copy()
    db_df["features"] = db_df["features"].apply(json.dumps)

    # 8. Push to table 'preprocess' in database
    logger.info("Uploading preprocessed records to database table 'preprocess'...")
    try:
        # engine.begin() wraps the write in a transaction.
        with engine.begin() as conn:
            # Replace the table if it exists so the new structure is written.
            db_df.to_sql(
                "preprocess",
                conn,
                if_exists="replace",
                index=False,
            )
        logger.info("Successfully pushed all preprocessed projects to database table 'preprocess'.")
    except Exception as exc:
        logger.error(f"Failed to push table to database. Error: {exc}")
        sys.exit(1)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run_sync_preprocess()
|