File size: 8,963 Bytes
5e95de5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
import json
import logging
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity

# Ensure workspace root is in path: parents[2] is the directory two levels
# above the folder containing this file. It is appended before the `Data.*`
# and `src.*` imports below so they resolve when this file is run as a script.
sys.path.append(str(Path(__file__).resolve().parents[2]))

from Data.database.sql_connector import engine
from src.similarity_model.preprocessing import preprocess_dataset, normalize_text
from src.similarity_model.semantic_search import load_model
from src.similarity_model.feature_similarity import load_feature_model, compute_feature_similarity
from src.similarity_model.hybrid_ranker import compute_hybrid_score, compute_originality

# Root logging setup: timestamp | level | message, INFO and above.
# The module-level logger below is used by every step of the pipeline.
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Number of nearest semantic neighbours re-scored with the hybrid ranker.
_CANDIDATE_POOL_SIZE = 50
# Word-set Jaccard overlap at or above which two descriptions count as a copy.
_COPY_PASTE_JACCARD = 0.60
# Hybrid score forced onto a candidate that is flagged as a direct copy.
_COPY_PASTE_HYBRID_SCORE = 0.95


def _as_feature_list(value):
    """Return *value* unchanged if it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def _description_words(abstract, description):
    """Return the set of normalized words in ``abstract + " " + description``."""
    combined = (str(abstract) + " " + str(description)).strip()
    return set(normalize_text(combined).split())


def _jaccard(words_a, words_b):
    """Return the Jaccard overlap of two word sets (0.0 if either is empty)."""
    if not words_a or not words_b:
        return 0.0
    return len(words_a & words_b) / len(words_a | words_b)


def _build_rich_texts(processed_df, feature_lists):
    """Build one normalized "title. abstract. description. features" text per row."""
    rich_texts = []
    for title, abstract, description, feats in zip(
        processed_df["project_title"],
        processed_df["abstract"],
        processed_df["description"],
        feature_lists,
    ):
        raw_text = (
            f"{str(title).strip()}. {str(abstract).strip()}. "
            f"{str(description).strip()}"
        )
        feature_text = " ".join(feats)
        rich_texts.append(normalize_text(f"{raw_text}. {feature_text}"))
    return rich_texts


def _find_closest_match(i, sim_row, feature_lists, word_sets, feature_model):
    """Find the highest hybrid score between project *i* and its neighbours.

    Args:
        i: Positional index of the query project.
        sim_row: Row *i* of the semantic similarity matrix.
        feature_lists: Per-project feature lists, positionally aligned.
        word_sets: Per-project description word sets, positionally aligned.
        feature_model: Model forwarded to ``compute_feature_similarity``.

    Returns:
        ``(max_hybrid_score, unique_count)`` where ``unique_count`` is the
        number of query features reported as unique against the best match,
        or ``None`` when no candidate scored above 0.0.
    """
    query_features = feature_lists[i]
    query_words = word_sets[i]

    scores = sim_row.copy()
    scores[i] = -1.0  # push the self-match to the bottom of the ranking
    ranked = np.argsort(scores)[::-1]
    # The -1.0 sentinel alone is not enough: with <= _CANDIDATE_POOL_SIZE
    # projects the slice would still contain index i, and a project compared
    # with itself always trips the copy-paste rule. Drop it explicitly.
    candidates = ranked[ranked != i][:_CANDIDATE_POOL_SIZE]

    max_hybrid_score = 0.0
    best_unique_count = None

    for idx in candidates:
        feat_result = compute_feature_similarity(
            query_features,
            feature_lists[idx],
            model=feature_model,
        )
        unique_query_count = len(feat_result["unique_a"])

        hybrid_score = compute_hybrid_score(
            semantic_score=float(scores[idx]),
            feature_score=feat_result["score"],
            coverage=feat_result["coverage"],
            feature_count=len(query_features),
            unique_query_count=unique_query_count,
        )

        # Direct copy-paste detection on abstract + description word sets.
        if _jaccard(query_words, word_sets[idx]) >= _COPY_PASTE_JACCARD:
            hybrid_score = _COPY_PASTE_HYBRID_SCORE

        if hybrid_score > max_hybrid_score:
            max_hybrid_score = hybrid_score
            # Remember the unique-feature count for this pair so it does not
            # have to be recomputed afterwards.
            # NOTE(review): assumes compute_feature_similarity is deterministic.
            best_unique_count = unique_query_count

    return max_hybrid_score, best_unique_count


def run_sync_preprocess():
    """Sync active projects, preprocess them, score originality and publish.

    Steps:
        1. Verify the database connection.
        2. Load rows from ``Projects`` whose ``Status`` is ``Completed``,
           ``UnderReview`` or ``In_Progress``.
        3. Run ``preprocess_dataset`` and return early if nothing is left.
        4. Keep a fixed set of columns (missing ones are created as empty
           strings) and rename four of them to snake_case.
        5. Embed every project, compare each one with its nearest semantic
           neighbours and derive an ``originality`` value from the closest
           (highest hybrid score) match.
        6. Write ``Data/processed/preprocessed.csv`` (relative to the current
           working directory).
        7. Replace the ``preprocess`` database table with the result, storing
           ``features`` as JSON strings.

    Raises:
        SystemExit: With status 1 if the database connection check or the
            final table upload fails.
    """
    logger.info("Initializing Sync and Preprocess Service...")

    # 1. Verify DB Connection
    try:
        with engine.connect():
            logger.info("Database connection verified successfully.")
    except Exception as exc:
        logger.error("Unable to connect to the SQL database. Error: %s", exc)
        sys.exit(1)

    # 2. Fetch raw projects
    logger.info("Pulling active records from 'Projects' table...")
    projects_query = """
    SELECT *
    FROM Projects
    WHERE Status IN (
        'Completed',
        'UnderReview',
        'In_Progress'
    )
    """
    with engine.connect() as conn:
        raw_df = pd.read_sql(projects_query, conn)
    logger.info("Loaded %d active projects from 'Projects' table.", len(raw_df))

    # 3. Preprocess dataset
    logger.info("Preprocessing dataset...")
    processed_df = preprocess_dataset(raw_df)
    logger.info(
        "Total projects after preprocessing filters: %d", len(processed_df)
    )

    if len(processed_df) == 0:
        logger.warning("No projects left after preprocessing. Exiting.")
        return

    # 4. Standardize columns: fixed column set, snake_case names
    cols_to_keep = [
        "id",
        "submittedat",
        "project_title",
        "studentnames",
        "year",
        "abstract",
        "description",
        "problemstatement",
        "proposedsolution",
        "objectives",
        "full_content",
        "clean_text",
        "word_count",
        "features",
    ]

    for col in cols_to_keep:
        if col not in processed_df.columns:
            processed_df[col] = ""

    processed_df = processed_df[cols_to_keep].rename(
        columns={
            "submittedat": "submitted_at",
            "studentnames": "student_names",
            "problemstatement": "problem_statement",
            "proposedsolution": "proposed_solution",
        }
    )

    # 5. Calculate Originality for each project
    logger.info("Calculating originality score for each project...")

    model = load_model()
    feature_model = load_feature_model()

    # Per-project inputs are computed once up front instead of once per
    # (project, candidate) pair inside the ranking loop.
    feature_lists = [_as_feature_list(f) for f in processed_df["features"]]
    word_sets = [
        _description_words(abstract, description)
        for abstract, description in zip(
            processed_df["abstract"], processed_df["description"]
        )
    ]
    rich_texts = _build_rich_texts(processed_df, feature_lists)

    logger.info("Encoding projects to vector space...")
    embeddings = model.encode(
        rich_texts,
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=True,
    ).astype("float32")

    logger.info("Computing semantic similarity matrix...")
    sim_matrix = cosine_similarity(embeddings, embeddings)

    originality_scores = []
    for i, sim_row in enumerate(sim_matrix):
        max_hybrid_score, best_unique_count = _find_closest_match(
            i, sim_row, feature_lists, word_sets, feature_model
        )
        current_features = feature_lists[i]

        # Originality is based on the worst case (highest similarity) match.
        if best_unique_count is not None and current_features:
            orig_score = compute_originality(
                hybrid_score=max_hybrid_score,
                unique_query_features=best_unique_count,
                total_query_features=len(current_features),
            )
        else:
            orig_score = compute_originality(
                hybrid_score=max_hybrid_score,
                unique_query_features=0,
                total_query_features=0,
            )
        originality_scores.append(orig_score)

    processed_df["originality"] = originality_scores
    logger.info("Finished calculating originality scores.")

    # 6. Save locally to preprocessed.csv in Data/processed/
    local_dir = Path("Data") / "processed"
    local_dir.mkdir(parents=True, exist_ok=True)
    local_path = local_dir / "preprocessed.csv"

    # Features stay as readable lists in the CSV file.
    processed_df.to_csv(local_path, index=False)
    logger.info(
        "Successfully saved preprocessed projects locally to: %s", local_path
    )

    # 7. Convert features list to JSON string for database push
    db_df = processed_df.copy()
    db_df["features"] = db_df["features"].apply(json.dumps)

    # 8. Push to table 'preprocess' in database
    logger.info("Uploading preprocessed records to database table 'preprocess'...")
    try:
        with engine.begin() as conn:
            # if_exists="replace" drops and recreates the table so it always
            # reflects the current preprocessed structure.
            db_df.to_sql(
                "preprocess",
                conn,
                if_exists="replace",
                index=False,
            )
        logger.info(
            "Successfully pushed all preprocessed projects to database table 'preprocess'."
        )
    except Exception as exc:
        logger.error("Failed to push table to database. Error: %s", exc)
        sys.exit(1)

# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run_sync_preprocess()