| import json |
| import logging |
| import os |
| import sys |
| import pandas as pd |
| import numpy as np |
| from pathlib import Path |
| from sklearn.metrics.pairwise import cosine_similarity |
|
|
| |
# Extend the import search path with the directory found at parents[2] of this
# file's resolved location, so the absolute ``Data.*`` / ``src.*`` imports that
# follow can be resolved from it (presumably the repository root -- confirm
# against the project layout).
sys.path.append(os.fspath(Path(__file__).resolve().parents[2]))
|
|
| from Data.database.sql_connector import engine |
| from src.similarity_model.preprocessing import preprocess_dataset, normalize_text |
| from src.similarity_model.semantic_search import load_model |
| from src.similarity_model.feature_similarity import load_feature_model, compute_feature_similarity |
| from src.similarity_model.hybrid_ranker import compute_hybrid_score, compute_originality |
|
|
# Configure root logging at INFO with a "timestamp | level | message" layout,
# then create the logger used by everything in this module.
logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)
|
|
# Maximum number of semantically closest projects re-scored per project.
_TOP_K_CANDIDATES = 50
# Word-level Jaccard overlap at or above which a candidate is treated as a
# near-duplicate of the query project.
_JACCARD_DUPLICATE_THRESHOLD = 0.60
# Hybrid score forced onto a candidate flagged by the Jaccard check.
_NEAR_DUPLICATE_HYBRID_SCORE = 0.95

# Columns kept from the preprocessed frame, in output order.
_COLUMNS_TO_KEEP = [
    "id",
    "submittedat",
    "project_title",
    "studentnames",
    "year",
    "abstract",
    "description",
    "problemstatement",
    "proposedsolution",
    "objectives",
    "full_content",
    "clean_text",
    "word_count",
    "features",
]

# Source column name -> name used in the CSV and the 'preprocess' table.
_COLUMN_RENAMES = {
    "submittedat": "submitted_at",
    "studentnames": "student_names",
    "problemstatement": "problem_statement",
    "proposedsolution": "proposed_solution",
}

_PROJECTS_QUERY = """
    SELECT *
    FROM Projects
    WHERE Status IN (
        'Completed',
        'UnderReview',
        'In_Progress'
    )
"""


def _text_or_empty(value):
    """Return ``str(value)``, mapping missing scalars (None/NaN/NA) to ''."""
    # Without this, str(None)/str(nan) would inject the literal words
    # "None"/"nan" into the embedding text and the Jaccard word sets.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value)


def _feature_list(value):
    """Return *value* when it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def _jaccard(words_a, words_b):
    """Return the Jaccard overlap of two sets; 0.0 when either is empty."""
    if not words_a or not words_b:
        return 0.0
    return len(words_a & words_b) / len(words_a | words_b)


def _build_rich_texts(df):
    """Build one normalized "title. abstract. description. features" text per row."""
    texts = []
    for title, abstract, description, features in zip(
        df["project_title"], df["abstract"], df["description"], df["features"]
    ):
        raw_text = (
            f"{_text_or_empty(title).strip()}. "
            f"{_text_or_empty(abstract).strip()}. "
            f"{_text_or_empty(description).strip()}"
        )
        feature_text = " ".join(_feature_list(features))
        texts.append(normalize_text(f"{raw_text}. {feature_text}"))
    return texts


def _compute_originality_scores(df, sim_matrix, feature_model):
    """Return one originality score per row of *df*, in row order.

    For each project, up to ``_TOP_K_CANDIDATES`` other projects with the
    highest semantic similarity are re-scored with the hybrid ranker; the
    highest hybrid score (and the unique-feature count against that
    candidate) is passed to ``compute_originality``.
    """
    feature_lists = [_feature_list(value) for value in df["features"]]
    # Normalize each project's abstract+description once instead of once per
    # (query, candidate) pair.
    word_sets = [
        set(
            normalize_text(
                (_text_or_empty(abstract) + " " + _text_or_empty(description)).strip()
            ).split()
        )
        for abstract, description in zip(df["abstract"], df["description"])
    ]

    originality_scores = []
    for i, current_features in enumerate(feature_lists):
        scores = sim_matrix[i].copy()
        scores[i] = -1.0

        ranked = np.argsort(scores)[::-1]
        # Masking the self-score to -1.0 only pushes the project to the end of
        # the ranking; with <= _TOP_K_CANDIDATES projects it would still be
        # selected and compared against itself (Jaccard 1.0), so drop it
        # explicitly.
        candidates = ranked[ranked != i][:_TOP_K_CANDIDATES]

        max_hybrid_score = 0.0
        best_unique_count = None  # None until some candidate beats 0.0

        for j in candidates:
            feat_result = compute_feature_similarity(
                current_features,
                feature_lists[j],
                model=feature_model,
            )
            unique_query_count = len(feat_result["unique_a"])

            hybrid_score = compute_hybrid_score(
                semantic_score=float(scores[j]),
                feature_score=feat_result["score"],
                coverage=feat_result["coverage"],
                feature_count=len(current_features),
                unique_query_count=unique_query_count,
            )

            if _jaccard(word_sets[i], word_sets[j]) >= _JACCARD_DUPLICATE_THRESHOLD:
                hybrid_score = _NEAR_DUPLICATE_HYBRID_SCORE

            if hybrid_score > max_hybrid_score:
                max_hybrid_score = hybrid_score
                # Reuse this result rather than calling
                # compute_feature_similarity again for the best candidate
                # (assumes that call is deterministic).
                best_unique_count = unique_query_count

        if best_unique_count is not None and current_features:
            orig_score = compute_originality(
                hybrid_score=max_hybrid_score,
                unique_query_features=best_unique_count,
                total_query_features=len(current_features),
            )
        else:
            orig_score = compute_originality(
                hybrid_score=max_hybrid_score,
                unique_query_features=0,
                total_query_features=0,
            )
        originality_scores.append(orig_score)

    return originality_scores


def run_sync_preprocess():
    """Sync active projects from SQL, preprocess them and store the result.

    Steps, in order:
      1. Verify that a database connection can be opened.
      2. Load rows from ``Projects`` whose ``Status`` is 'Completed',
         'UnderReview' or 'In_Progress'.
      3. Run ``preprocess_dataset``; return early if no rows remain.
      4. Keep/rename a fixed set of columns (missing ones are filled with "").
      5. Encode each project, compute pairwise cosine similarity and derive an
         ``originality`` score per project.
      6. Write ``Data/processed/preprocessed.csv`` and replace the database
         table ``preprocess`` (``features`` is JSON-encoded for the database).

    Returns:
        None.

    Raises:
        SystemExit: with code 1 when the initial connection check or the final
            database upload fails.
    """
    logger.info("Initializing Sync and Preprocess Service...")

    try:
        with engine.connect():
            logger.info("Database connection verified successfully.")
    except Exception as exc:
        logger.error("Unable to connect to the SQL database. Error: %s", exc)
        sys.exit(1)

    logger.info("Pulling active records from 'Projects' table...")
    with engine.connect() as conn:
        raw_df = pd.read_sql(_PROJECTS_QUERY, conn)
    logger.info("Loaded %d active projects from 'Projects' table.", len(raw_df))

    logger.info("Preprocessing dataset...")
    processed_df = preprocess_dataset(raw_df)
    logger.info("Total projects after preprocessing filters: %d", len(processed_df))

    if len(processed_df) == 0:
        logger.warning("No projects left after preprocessing. Exiting.")
        return

    # Work on a copy so filling in missing columns never mutates (or warns
    # about) the frame returned by preprocess_dataset.
    processed_df = processed_df.copy()
    for col in _COLUMNS_TO_KEEP:
        if col not in processed_df.columns:
            processed_df[col] = ""
    processed_df = processed_df[_COLUMNS_TO_KEEP].rename(columns=_COLUMN_RENAMES)

    logger.info("Calculating originality score for each project...")
    model = load_model()
    feature_model = load_feature_model()

    rich_texts = _build_rich_texts(processed_df)

    logger.info("Encoding projects to vector space...")
    embeddings = model.encode(
        rich_texts,
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=True,
    ).astype("float32")

    logger.info("Computing semantic similarity matrix...")
    sim_matrix = cosine_similarity(embeddings, embeddings)

    processed_df["originality"] = _compute_originality_scores(
        processed_df, sim_matrix, feature_model
    )
    logger.info("Finished calculating originality scores.")

    local_dir = Path("Data") / "processed"
    local_dir.mkdir(parents=True, exist_ok=True)
    local_path = local_dir / "preprocessed.csv"
    processed_df.to_csv(local_path, index=False)
    logger.info(f"Successfully saved preprocessed projects locally to: {local_path}")

    # The database copy stores the feature list as a JSON string; the CSV
    # above keeps the column as-is.
    db_df = processed_df.assign(features=processed_df["features"].apply(json.dumps))

    logger.info("Uploading preprocessed records to database table 'preprocess'...")
    try:
        # engine.begin() commits on success and rolls back on error.
        with engine.begin() as conn:
            db_df.to_sql(
                "preprocess",
                conn,
                if_exists="replace",
                index=False,
            )
        logger.info("Successfully pushed all preprocessed projects to database table 'preprocess'.")
    except Exception as exc:
        logger.error("Failed to push table to database. Error: %s", exc)
        sys.exit(1)
|
|
# Run the sync/preprocess pipeline only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    run_sync_preprocess()
|
|