Spaces:

InnoTrack
/

Graduation_Project-v1.2

Running

File size: 8,963 Bytes

5e95de5

import json
import logging
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity

# Ensure workspace root is in path
sys.path.append(str(Path(__file__).resolve().parents[2]))

from Data.database.sql_connector import engine
from src.similarity_model.preprocessing import preprocess_dataset, normalize_text
from src.similarity_model.semantic_search import load_model
from src.similarity_model.feature_similarity import load_feature_model, compute_feature_similarity
from src.similarity_model.hybrid_ranker import compute_hybrid_score, compute_originality

# Configure the root logger at import time: INFO level, with each record
# rendered as "<timestamp> | <level> | <message>".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s"
)
# Module-level logger, named after this module, used by everything below.
logger = logging.getLogger(__name__)

def _as_text(value):
    """Return *value* as a stripped string, mapping None / float NaN to ""."""
    # Without this guard str(None) / str(nan) would inject the literal tokens
    # "None" / "nan" into the embedding text and the word-overlap sets.
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value).strip()


def _as_feature_list(value):
    """Return *value* if it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def _fetch_active_projects():
    """Read all rows of 'Projects' whose Status is one of the active values."""
    projects_query = """
    SELECT *
    FROM Projects
    WHERE Status IN (
        'Completed',
        'UnderReview',
        'In_Progress'
    )
    """
    with engine.connect() as conn:
        return pd.read_sql(projects_query, conn)


def _standardize_columns(processed_df):
    """Keep the export columns (adding missing ones as "") and rename them."""
    cols_to_keep = [
        "id",
        "submittedat",
        "project_title",
        "studentnames",
        "year",
        "abstract",
        "description",
        "problemstatement",
        "proposedsolution",
        "objectives",
        "full_content",
        "clean_text",
        "word_count",
        "features",
    ]
    # reindex builds a new frame, so the frame returned by preprocess_dataset
    # is never mutated; columns that do not exist yet are filled with "".
    standardized = processed_df.reindex(columns=cols_to_keep, fill_value="")
    return standardized.rename(
        columns={
            "submittedat": "submitted_at",
            "studentnames": "student_names",
            "problemstatement": "problem_statement",
            "proposedsolution": "proposed_solution",
        }
    )


def _build_rich_texts(processed_df, feature_lists):
    """Build one normalized "title. abstract. description. features" text per row."""
    rich_texts = []
    for title, abstract, description, feats in zip(
        processed_df["project_title"],
        processed_df["abstract"],
        processed_df["description"],
        feature_lists,
    ):
        raw_text = f"{_as_text(title)}. {_as_text(abstract)}. {_as_text(description)}"
        feature_text = " ".join(feats)
        rich_texts.append(normalize_text(f"{raw_text}. {feature_text}"))
    return rich_texts


def _score_originality(
    processed_df,
    sim_matrix,
    feature_lists,
    feature_model,
    top_k=50,
    copy_threshold=0.60,
    copy_score=0.95,
):
    """Return one originality score per row, based on its most similar peer.

    For every project the ``top_k`` semantically closest *other* projects are
    scored with ``compute_hybrid_score``; the highest hybrid score found (the
    worst case) is passed to ``compute_originality``.

    Args:
        processed_df: Frame providing the "abstract" and "description" columns.
        sim_matrix: Square pairwise similarity matrix, row/column i = project i.
        feature_lists: Feature list per project, aligned with ``processed_df``.
        feature_model: Model forwarded to ``compute_feature_similarity``.
        top_k: Number of nearest candidates examined per project.
        copy_threshold: Word-set Jaccard overlap treated as copy-paste.
        copy_score: Minimum hybrid score assigned to a copy-paste match.
    """
    # Normalized word sets are needed for every (project, candidate) pair, so
    # compute them once per project instead of once per pair.
    word_sets = [
        set(
            normalize_text(
                (_as_text(abstract) + " " + _as_text(description)).strip()
            ).split()
        )
        for abstract, description in zip(
            processed_df["abstract"], processed_df["description"]
        )
    ]

    originality_scores = []
    for i, current_features in enumerate(feature_lists):
        scores = sim_matrix[i]
        ranked = np.argsort(scores)[::-1]
        # Drop the project itself explicitly. Merely giving it a low score is
        # not enough: with top_k or fewer projects it would still be selected,
        # match itself perfectly and be flagged as a copy of itself.
        candidates = ranked[ranked != i][:top_k]

        query_words = word_sets[i]
        query_feature_count = len(current_features)

        max_hybrid_score = 0.0
        best_unique_count = 0
        found_match = False

        for idx in candidates:
            feat_result = compute_feature_similarity(
                current_features,
                feature_lists[idx],
                model=feature_model,
            )
            unique_query_count = len(feat_result["unique_a"])

            hybrid_score = compute_hybrid_score(
                semantic_score=float(scores[idx]),
                feature_score=feat_result["score"],
                coverage=feat_result["coverage"],
                feature_count=query_feature_count,
                unique_query_count=unique_query_count,
            )

            # Direct copy-paste detection on the abstract + description words.
            candidate_words = word_sets[idx]
            if query_words and candidate_words:
                jaccard_overlap = len(query_words & candidate_words) / len(
                    query_words | candidate_words
                )
                if jaccard_overlap >= copy_threshold:
                    # Act as a floor so a hybrid score that is already higher
                    # is never lowered by the copy-paste rule.
                    # NOTE(review): assumes hybrid scores are on a 0-1 scale.
                    hybrid_score = max(hybrid_score, copy_score)

            if hybrid_score > max_hybrid_score:
                max_hybrid_score = hybrid_score
                # Remember the unique-feature count of the best match so the
                # feature similarity does not have to be computed again later.
                best_unique_count = unique_query_count
                found_match = True

        if found_match and current_features:
            orig_score = compute_originality(
                hybrid_score=max_hybrid_score,
                unique_query_features=best_unique_count,
                total_query_features=query_feature_count,
            )
        else:
            orig_score = compute_originality(
                hybrid_score=max_hybrid_score,
                unique_query_features=0,
                total_query_features=0,
            )
        originality_scores.append(orig_score)

    return originality_scores


def _save_outputs(processed_df):
    """Write the frame to Data/processed/preprocessed.csv and to table 'preprocess'."""
    # NOTE: the path is relative to the current working directory.
    local_dir = Path("Data") / "processed"
    local_dir.mkdir(parents=True, exist_ok=True)
    local_path = local_dir / "preprocessed.csv"

    # The CSV keeps the features in their in-memory (list) representation.
    processed_df.to_csv(local_path, index=False)
    logger.info("Successfully saved preprocessed projects locally to: %s", local_path)

    # The database copy stores the features as a JSON string instead.
    db_df = processed_df.assign(features=processed_df["features"].apply(json.dumps))

    logger.info("Uploading preprocessed records to database table 'preprocess'...")
    try:
        with engine.begin() as conn:
            # Replace the table if it exists so it matches the new structure.
            db_df.to_sql(
                "preprocess",
                conn,
                if_exists="replace",
                index=False,
            )
        logger.info("Successfully pushed all preprocessed projects to database table 'preprocess'.")
    except Exception as exc:
        logger.error("Failed to push table to database. Error: %s", exc)
        sys.exit(1)


def run_sync_preprocess():
    """Sync active projects, preprocess them, score originality and persist.

    Steps:
        1. Verify that the database is reachable.
        2. Load the active rows of the 'Projects' table.
        3. Run ``preprocess_dataset`` and stop early when nothing is left.
        4. Standardize the column set and names.
        5. Embed every project, build the pairwise similarity matrix and
           derive an "originality" value per project.
        6. Save the result to ``Data/processed/preprocessed.csv`` and replace
           the 'preprocess' database table with it.

    Returns:
        None.

    Raises:
        SystemExit: With status 1 when the database connection check or the
            final table upload fails.
    """
    logger.info("Initializing Sync and Preprocess Service...")

    # 1. Verify DB connection
    try:
        with engine.connect():
            logger.info("Database connection verified successfully.")
    except Exception as exc:
        logger.error("Unable to connect to the SQL database. Error: %s", exc)
        sys.exit(1)

    # 2. Fetch raw projects
    logger.info("Pulling active records from 'Projects' table...")
    raw_df = _fetch_active_projects()
    logger.info("Loaded %d active projects from 'Projects' table.", len(raw_df))

    # 3. Preprocess dataset
    logger.info("Preprocessing dataset...")
    processed_df = preprocess_dataset(raw_df)
    logger.info("Total projects after preprocessing filters: %d", len(processed_df))

    if len(processed_df) == 0:
        logger.warning("No projects left after preprocessing. Exiting.")
        return

    # 4. Standardize / clean columns
    processed_df = _standardize_columns(processed_df)

    # 5. Calculate originality for each project
    logger.info("Calculating originality score for each project...")
    model = load_model()
    feature_model = load_feature_model()

    feature_lists = [_as_feature_list(value) for value in processed_df["features"]]
    rich_texts = _build_rich_texts(processed_df, feature_lists)

    logger.info("Encoding projects to vector space...")
    embeddings = model.encode(
        rich_texts,
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=True,
    ).astype("float32")

    logger.info("Computing semantic similarity matrix...")
    sim_matrix = cosine_similarity(embeddings, embeddings)

    processed_df["originality"] = _score_originality(
        processed_df,
        sim_matrix,
        feature_lists,
        feature_model,
    )
    logger.info("Finished calculating originality scores.")

    # 6-8. Persist to the local CSV and to the 'preprocess' table
    _save_outputs(processed_df)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run_sync_preprocess()