Spaces:

InnoTrack
/

Graduation_Project-v1.2

Running

App Files Community

bat-6 committed 26 days ago

Commit

e893e13

1 Parent(s): b9ada33

update

Browse files

Files changed (11) hide show

Data/database/__pycache__/sql_connector.cpython-313.pyc +0 -0
src/similarity_model/__pycache__/embedding_engine.cpython-313.pyc +0 -0
src/similarity_model/__pycache__/feature_similarity.cpython-313.pyc +0 -0
src/similarity_model/__pycache__/hybrid_ranker.cpython-313.pyc +0 -0
src/similarity_model/__pycache__/preprocessing.cpython-313.pyc +0 -0
src/similarity_model/__pycache__/semantic_search.cpython-313.pyc +0 -0
src/similarity_model/__pycache__/similarity_engine.cpython-313.pyc +0 -0
src/similarity_model/embedding_engine.py +19 -228
src/similarity_model/feature_similarity.py +61 -263
src/similarity_model/preprocessing.py +55 -284
src/similarity_model/similarity_engine.py +0 -22

Data/database/__pycache__/sql_connector.cpython-313.pyc CHANGED Viewed

Binary files a/Data/database/__pycache__/sql_connector.cpython-313.pyc and b/Data/database/__pycache__/sql_connector.cpython-313.pyc differ

src/similarity_model/__pycache__/embedding_engine.cpython-313.pyc CHANGED Viewed

Binary files a/src/similarity_model/__pycache__/embedding_engine.cpython-313.pyc and b/src/similarity_model/__pycache__/embedding_engine.cpython-313.pyc differ

src/similarity_model/__pycache__/feature_similarity.cpython-313.pyc CHANGED Viewed

Binary files a/src/similarity_model/__pycache__/feature_similarity.cpython-313.pyc and b/src/similarity_model/__pycache__/feature_similarity.cpython-313.pyc differ

src/similarity_model/__pycache__/hybrid_ranker.cpython-313.pyc CHANGED Viewed

Binary files a/src/similarity_model/__pycache__/hybrid_ranker.cpython-313.pyc and b/src/similarity_model/__pycache__/hybrid_ranker.cpython-313.pyc differ

src/similarity_model/__pycache__/preprocessing.cpython-313.pyc CHANGED Viewed

Binary files a/src/similarity_model/__pycache__/preprocessing.cpython-313.pyc and b/src/similarity_model/__pycache__/preprocessing.cpython-313.pyc differ

src/similarity_model/__pycache__/semantic_search.cpython-313.pyc CHANGED Viewed

Binary files a/src/similarity_model/__pycache__/semantic_search.cpython-313.pyc and b/src/similarity_model/__pycache__/semantic_search.cpython-313.pyc differ

src/similarity_model/__pycache__/similarity_engine.cpython-313.pyc CHANGED Viewed

Binary files a/src/similarity_model/__pycache__/similarity_engine.cpython-313.pyc and b/src/similarity_model/__pycache__/similarity_engine.cpython-313.pyc differ

src/similarity_model/embedding_engine.py CHANGED Viewed

@@ -1,97 +1,36 @@
-# src/embedding_engine.py
-import re
 import logging
 from pathlib import Path
 from typing import List
 import pandas as pd
 import numpy as np
 import faiss
 from sentence_transformers import SentenceTransformer
-from Data.database.sql_connector import (
-    load_preprocessed_projects
-)
-# =====================================================
-# Logging
-# =====================================================
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s | %(levelname)s | %(message)s"
-)
 logger = logging.getLogger(__name__)
-# =====================================================
-# Config
-# =====================================================
 DEFAULT_MODEL = "all-MiniLM-L6-v2"
 TEXT_COL = "clean_text"
 TITLE_COL = "project_title"
 TECH_COL = "technologies"
-# Resolve paths relative to the project root (3 levels up from this file:
-# src/similarity_model/embedding_engine.py -> src/similarity_model -> src -> project root)
 _PROJECT_ROOT = Path(__file__).resolve().parents[2]
 MODEL_DIR  = _PROJECT_ROOT / "models"
 INDEX_PATH = MODEL_DIR / "faiss_index.bin"
 META_PATH  = MODEL_DIR / "metadata.parquet"
-TOP_K_DEFAULT = 10
-MIN_SCORE_THRESHOLD = 0.35
-# =====================================================
-# Helpers
-# =====================================================
-def normalize_text(text: str) -> str:
-    """
-    Same cleaning logic used in preprocessing.
-    """
-    if pd.isna(text):
-        return ""
-    text = str(text).strip().lower()
-    text = re.sub(r"http\S+|www\S+|\S+@\S+", " ", text)
-    text = re.sub(r"[^a-z0-9\s\+\#\./\-]", " ", text)
-    text = re.sub(r"\s+", " ", text).strip()
-    return text
-def tokenize(text: str) -> set:
-    """
-    Simple tokenization for keyword boosting.
-    """
-    text = normalize_text(text)
-    return set(text.split())
-# =====================================================
-# Core Engine
-# =====================================================
 class ProjectEmbedder:
     def __init__(self, model_name: str = DEFAULT_MODEL):
         logger.info(f"Loading embedding model: {model_name}")
         self.model = SentenceTransformer(model_name)
         self.index = None
         self.metadata = None
-    # -------------------------------------------------
-    # Embeddings
-    # -------------------------------------------------
-    def generate_embeddings(
-        self,
-        texts: List[str],
-        batch_size: int = 64
-    ) -> np.ndarray:
         logger.info(f"Generating embeddings for {len(texts)} projects...")
         vectors = self.model.encode(
             texts,
             batch_size=batch_size,
@@ -99,23 +38,13 @@ class ProjectEmbedder:
             convert_to_numpy=True,
             normalize_embeddings=True
         )
         return vectors.astype("float32")
-    # -------------------------------------------------
-    # Build Index
-    # -------------------------------------------------
     def build_index(self, df: pd.DataFrame):
-        """
-        Build FAISS cosine index.
-        """
         self.metadata = df.copy()
-        # preserve ids
         self.metadata = self.metadata.reset_index(drop=True)
-        # ensure needed columns exist
         for col in [TITLE_COL, TEXT_COL]:
             if col not in self.metadata.columns:
                 self.metadata[col] = ""
@@ -124,186 +53,48 @@ class ProjectEmbedder:
             self.metadata[TECH_COL] = ""
         FEATURE_COL = "features"
         if FEATURE_COL not in self.metadata.columns:
             self.metadata[FEATURE_COL] = ""
-        feature_text = (
-            self.metadata[FEATURE_COL]
-            .fillna("")
-            .astype(str)
-        )
-        # weighted content:
-        # title repeated twice
         rich_texts = (
             self.metadata[TITLE_COL].fillna("").astype(str)
             + " "
-            + self.metadata[TITLE_COL].fillna("").astype(str)
-            + " "
             + self.metadata[TEXT_COL].fillna("").astype(str)
             + " "
             + feature_text
         ).tolist()
         embeddings = self.generate_embeddings(rich_texts)
         dim = embeddings.shape[1]
         base_index = faiss.IndexFlatIP(dim)
         self.index = faiss.IndexIDMap(base_index)
         ids = np.arange(len(self.metadata)).astype("int64")
         self.index.add_with_ids(embeddings, ids)
-        logger.info(
-            f"FAISS index built successfully with {self.index.ntotal} vectors."
-        )
-    # -------------------------------------------------
-    # Save
-    # -------------------------------------------------
     def save_artifacts(self, folder: str = "models"):
         path = Path(folder)
         path.mkdir(parents=True, exist_ok=True)
-        faiss.write_index(
-            self.index,
-            str(path / "faiss_index.bin")
-        )
-        self.metadata.to_parquet(
-            path / "metadata.parquet",
-            index=False
-        )
         logger.info(f"Artifacts saved to {folder}")
-    # -------------------------------------------------
-    # Load
-    # -------------------------------------------------
     def load_artifacts(self, folder: str = "models"):
         path = Path(folder)
-        self.index = faiss.read_index(
-            str(path / "faiss_index.bin")
-        )
-        self.metadata = pd.read_parquet(
-            path / "metadata.parquet"
-        )
         logger.info("Artifacts loaded successfully.")
-    # -------------------------------------------------
-    # Search
-    # -------------------------------------------------
-    def search(
-        self,
-        query: str,
-        k: int = TOP_K_DEFAULT,
-        threshold: float = MIN_SCORE_THRESHOLD
-    ) -> pd.DataFrame:
-        if self.index is None or self.metadata is None:
-            raise ValueError("Index or metadata not loaded.")
-        # normalize query
-        query_clean = normalize_text(query)
-        query_vec = self.model.encode(
-            [query_clean],
-            convert_to_numpy=True,
-            normalize_embeddings=True
-        ).astype("float32")
-        scores, ids = self.index.search(query_vec, k)
-        query_words = tokenize(query_clean)
-        results = []
-        for idx, score in zip(ids[0], scores[0]):
-            if idx == -1:
-                continue
-            row = self.metadata.loc[idx]
-            final_score = float(score)
-            # keyword boost
-            title_words = tokenize(row[TITLE_COL])
-            tech_words = tokenize(row[TECH_COL])
-            overlap = len(query_words & title_words)
-            overlap += len(query_words & tech_words)
-            if overlap > 0:
-                final_score += 0.02 * overlap
-            # cap score
-            final_score = min(final_score, 1.0)
-            # threshold
-            if final_score < threshold:
-                continue
-            results.append({
-                "project_id": int(idx),
-                "title": row[TITLE_COL],
-                "technologies": row[TECH_COL],
-                "similarity_score": round(final_score, 4)
-            })
-        if not results:
-            return pd.DataFrame([{
-                "message": "No similar projects found."
-            }])
-        return pd.DataFrame(results).sort_values(
-            by="similarity_score",
-            ascending=False
-        ).reset_index(drop=True)
-# =====================================================
-# Full Training Pipeline
-# =====================================================
 def train_embedding_engine():
-    logger.info(
-        "Loading processed dataset from Azure SQL..."
-    )
     df = load_preprocessed_projects()
     engine = ProjectEmbedder()
     engine.build_index(df)
     engine.save_artifacts()
-    logger.info(
-        "Embedding engine completed successfully."
-    )
-    return engine
-# =====================================================
-# Example Run
-# =====================================================
-if __name__ == "__main__":
-    engine = train_embedding_engine()
-    query = "Build a mobile app for expense tracking using flutter and firebase"
-    print(f"\nQuery: {query}\n")
-    results = engine.search(query, k=5)
-    print(results)

 import logging
 from pathlib import Path
 from typing import List
 import pandas as pd
 import numpy as np
 import faiss
 from sentence_transformers import SentenceTransformer
+from Data.database.sql_connector import load_preprocessed_projects
+logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
 logger = logging.getLogger(__name__)
 DEFAULT_MODEL = "all-MiniLM-L6-v2"
 TEXT_COL = "clean_text"
 TITLE_COL = "project_title"
 TECH_COL = "technologies"
+# Resolve paths relative to the project root
 _PROJECT_ROOT = Path(__file__).resolve().parents[2]
 MODEL_DIR  = _PROJECT_ROOT / "models"
 INDEX_PATH = MODEL_DIR / "faiss_index.bin"
 META_PATH  = MODEL_DIR / "metadata.parquet"
 class ProjectEmbedder:
     def __init__(self, model_name: str = DEFAULT_MODEL):
         logger.info(f"Loading embedding model: {model_name}")
         self.model = SentenceTransformer(model_name)
         self.index = None
         self.metadata = None
+    def generate_embeddings(self, texts: List[str], batch_size: int = 64) -> np.ndarray:
         logger.info(f"Generating embeddings for {len(texts)} projects...")
         vectors = self.model.encode(
             texts,
             batch_size=batch_size,
             convert_to_numpy=True,
             normalize_embeddings=True
         )
         return vectors.astype("float32")
     def build_index(self, df: pd.DataFrame):
+        """Build FAISS cosine index."""
         self.metadata = df.copy()
         self.metadata = self.metadata.reset_index(drop=True)
         for col in [TITLE_COL, TEXT_COL]:
             if col not in self.metadata.columns:
                 self.metadata[col] = ""
             self.metadata[TECH_COL] = ""
         FEATURE_COL = "features"
         if FEATURE_COL not in self.metadata.columns:
             self.metadata[FEATURE_COL] = ""
+        feature_text = self.metadata[FEATURE_COL].fillna("").astype(str)
         rich_texts = (
             self.metadata[TITLE_COL].fillna("").astype(str)
             + " "
             + self.metadata[TEXT_COL].fillna("").astype(str)
             + " "
             + feature_text
         ).tolist()
         embeddings = self.generate_embeddings(rich_texts)
         dim = embeddings.shape[1]
         base_index = faiss.IndexFlatIP(dim)
         self.index = faiss.IndexIDMap(base_index)
         ids = np.arange(len(self.metadata)).astype("int64")
         self.index.add_with_ids(embeddings, ids)
+        logger.info(f"FAISS index built successfully with {self.index.ntotal} vectors.")
     def save_artifacts(self, folder: str = "models"):
         path = Path(folder)
         path.mkdir(parents=True, exist_ok=True)
+        faiss.write_index(self.index, str(path / "faiss_index.bin"))
+        self.metadata.to_parquet(path / "metadata.parquet", index=False)
         logger.info(f"Artifacts saved to {folder}")
     def load_artifacts(self, folder: str = "models"):
         path = Path(folder)
+        self.index = faiss.read_index(str(path / "faiss_index.bin"))
+        self.metadata = pd.read_parquet(path / "metadata.parquet")
         logger.info("Artifacts loaded successfully.")
 def train_embedding_engine():
+    logger.info("Loading processed dataset from Azure SQL...")
     df = load_preprocessed_projects()
     engine = ProjectEmbedder()
     engine.build_index(df)
     engine.save_artifacts()
+    logger.info("Embedding engine completed successfully.")
+    return engine

src/similarity_model/feature_similarity.py CHANGED Viewed

@@ -1,115 +1,75 @@
-# src/feature_similarity.py
-import logging
-import ast
-from functools import lru_cache
 from typing import List, Dict, Any
-import numpy as np
 import pandas as pd
 from sentence_transformers import SentenceTransformer
-from sklearn.metrics.pairwise import cosine_similarity
 from scipy.optimize import linear_sum_assignment
-# =====================================================
-# Logging
-# =====================================================
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s | %(levelname)s | %(message)s"
-)
 logger = logging.getLogger(__name__)
-# =====================================================
-# Config
-# =====================================================
 MODEL_NAME = "all-MiniLM-L6-v2"
-DEFAULT_THRESHOLD = 0.80
 SIMILARITY_WEIGHT = 0.70
 COVERAGE_WEIGHT = 0.30
-# =====================================================
-# Model Loader
-# =====================================================
 @lru_cache(maxsize=1)
 def load_feature_model():
-    """
-    Load feature embedding model once.
-    """
     logger.info(f"Loading feature model: {MODEL_NAME}")
     return SentenceTransformer(MODEL_NAME)
-# =====================================================
-# Helpers
-# =====================================================
 def safe_feature_list(features):
     """
     Convert any feature input into clean List[str]
-    Supports:
-    list, tuple, numpy array, string, NaN
     """
     import numpy as np
-    # None
     if features is None:
         return []
-    # numpy nan scalar only
-    if isinstance(features, float):
-        if pd.isna(features):
-            return []
-    # numpy array
     if isinstance(features, np.ndarray):
         features = features.tolist()
-    # tuple
     if isinstance(features, tuple):
         features = list(features)
-    # string
     if isinstance(features, str):
-        features = [features]
-    # list
     if isinstance(features, list):
         cleaned = []
         for item in features:
-            val = str(item).strip().lower()
             if val and val != "nan":
                 cleaned.append(val)
         return list(dict.fromkeys(cleaned))
     return []
 def remove_redundant_features(features):
     cleaned = []
     seen_words = []
     for feat in features:
         feat_words = set(feat.split())
         redundant = False
         for existing in seen_words:
-            overlap = len(
-                feat_words & existing
-            ) / max(len(feat_words), 1)
             if overlap >= 0.60:
                 redundant = True
                 break
@@ -120,13 +80,7 @@ def remove_redundant_features(features):
     return cleaned
-def empty_result(
-    unique_a=None,
-    unique_b=None
-) -> Dict[str, Any]:
     return {
         "score": 0.0,
         "coverage": 0.0,
@@ -136,15 +90,11 @@ def empty_result(
         "unique_b": unique_b or []
     }
 def encode_features(
     features: List[str],
     model
-) -> np.ndarray:
-    """
-    Encode feature phrases into normalized vectors.
-    """
     if not features:
         return np.array([])
@@ -153,167 +103,76 @@ def encode_features(
         convert_to_numpy=True,
         normalize_embeddings=True
     )
     return vectors.astype("float32")
-# =====================================================
-# Core Similarity Engine
-# =====================================================
 def compute_feature_similarity(
     features_a,
     features_b,
     model=None,
     threshold: float = DEFAULT_THRESHOLD
 ) -> Dict[str, Any]:
-    """
-    Compare two feature lists using:
-    1. Sentence embeddings
-    2. Cosine similarity matrix
-    3. Hungarian optimal matching
-    4. Coverage-aware final score
-    """
     if model is None:
         model = load_feature_model()
-    fa = remove_redundant_features(
-    safe_feature_list(features_a)
-    )
-    fb = remove_redundant_features(
-        safe_feature_list(features_b)
-    )
-    # empty cases
     if not fa or not fb:
-        return empty_result(
-            unique_a=fa,
-            unique_b=fb
-        )
-    # -------------------------------------------------
-    # Encode features
-    # -------------------------------------------------
     emb_a = encode_features(fa, model)
     emb_b = encode_features(fb, model)
-    # -------------------------------------------------
-    # Similarity matrix
-    # -------------------------------------------------
-    sim_matrix = cosine_similarity(
-        emb_a,
-        emb_b
-    )
-    # -------------------------------------------------
-    # Hungarian Algorithm
-    # maximize similarity => minimize negative matrix
-    # -------------------------------------------------
-    row_idx, col_idx = linear_sum_assignment(
-        -sim_matrix
-    )
     matches = []
     matched_a = set()
     matched_b = set()
     for i, j in zip(row_idx, col_idx):
         sim = float(sim_matrix[i, j])
         if sim >= threshold:
             matches.append({
                 "feature_a": fa[i],
                 "feature_b": fb[j],
                 "score": round(sim, 3)
             })
             matched_a.add(i)
             matched_b.add(j)
-    # -------------------------------------------------
     # Final Metrics
-    # -------------------------------------------------
-    shared_scores = [
-        m["score"] for m in matches
-    ]
-    mean_similarity = (
-        float(np.mean(shared_scores))
-        if shared_scores else 0.0
-    )
     min_len = min(len(fa), len(fb))
-    coverage = (
-        len(matches) / min_len
-        if min_len > 0 else 0.0
-    )
-    final_score = (
-        (SIMILARITY_WEIGHT * mean_similarity)
-        +
-        (COVERAGE_WEIGHT * coverage)
-    )
     if len(matches) == 0:
         final_score = 0.0
     final_score = min(final_score, 1.0)
-    matched_text_a = " ".join(
-    [
-        m["feature_a"]
-        for m in matches
-    ]
-    ).lower()
-    matched_text_b = " ".join(
-        [
-            m["feature_b"]
-            for m in matches
-        ]
-    ).lower()
-    def is_semantically_redundant(
-        feature,
-        matched_text
-    ):
         words = set(feature.lower().split())
-        overlap = sum(
-            1 for w in words
-            if w in matched_text
-        )
-        ratio = overlap / max(len(words), 1)
-        return ratio >= 0.5
     unique_a = [
-        fa[i]
-        for i in range(len(fa))
-        if i not in matched_a
-        and not is_semantically_redundant(
-            fa[i],
-            matched_text_a
-        )
     ]
     unique_b = [
-        fb[j]
-        for j in range(len(fb))
-        if j not in matched_b
-        and not is_semantically_redundant(
-            fb[j],
-            matched_text_b
-        )
     ]
     return {
@@ -325,96 +184,35 @@ def compute_feature_similarity(
         "unique_b": unique_b
     }
-# =====================================================
-# Compare Two Rows From DataFrame
-# =====================================================
 def compare_projects(
     df: pd.DataFrame,
     idx1: int,
     idx2: int,
     model=None
 ) -> Dict[str, Any]:
-    """
-    Compare two projects from dataset.
-    """
-    if model is None:
-        model = load_feature_model()
     f1 = df.loc[idx1, "features"]
     f2 = df.loc[idx2, "features"]
-    result = compute_feature_similarity(
-        f1,
-        f2,
-        model=model
-    )
-    result["project_a_id"] = int(idx1)
-    result["project_b_id"] = int(idx2)
-    return result
-# =====================================================
-# Compare One Against Many
-# =====================================================
 def compare_project_against_many(
-    query_features,
-    candidate_feature_lists,
-    model=None,
-    threshold: float = DEFAULT_THRESHOLD
-):
-    """
-    Compare one project against many candidates.
-    """
-    if model is None:
-        model = load_feature_model()
-    results = []
-    for idx, candidate in enumerate(
-        candidate_feature_lists
-    ):
-        result = compute_feature_similarity(
-            query_features,
-            candidate,
-            model=model,
-            threshold=threshold
-        )
-        result["candidate_id"] = idx
-        results.append(result)
     return results
-# =====================================================
-# Example Run
-# =====================================================
-if __name__ == "__main__":
-    project_a = [
-        "online reservation",
-        "ai chatbot",
-        "patient records",
-        "doctor dashboard"
-    ]
-    project_b = [
-        "appointment booking",
-        "chatbot assistant",
-        "medical records",
-        "analytics dashboard"
-    ]
-    result = compute_feature_similarity(
-        project_a,
-        project_b
-    )
-    print(result)

 from typing import List, Dict, Any
 import pandas as pd
 from sentence_transformers import SentenceTransformer
 from scipy.optimize import linear_sum_assignment
+from sklearn.metrics.pairwise import cosine_similarity
+import logging
+from functools import lru_cache
 logger = logging.getLogger(__name__)
 MODEL_NAME = "all-MiniLM-L6-v2"
 SIMILARITY_WEIGHT = 0.70
 COVERAGE_WEIGHT = 0.30
+DEFAULT_THRESHOLD = 0.80
 @lru_cache(maxsize=1)
 def load_feature_model():
     logger.info(f"Loading feature model: {MODEL_NAME}")
     return SentenceTransformer(MODEL_NAME)
 def safe_feature_list(features):
     """
     Convert any feature input into clean List[str]
     """
     import numpy as np
     if features is None:
         return []
+    if isinstance(features, float) and pd.isna(features):
+        return []
     if isinstance(features, np.ndarray):
         features = features.tolist()
     if isinstance(features, tuple):
         features = list(features)
     if isinstance(features, str):
+        try:
+            import ast
+            parsed = ast.literal_eval(features)
+            if isinstance(parsed, list):
+                features = parsed
+            else:
+                features = [features]
+        except:
+            features = [features]
     if isinstance(features, list):
         cleaned = []
         for item in features:
+            if isinstance(item, dict) and "feature" in item:
+                val = str(item["feature"]).strip().lower()
+            else:
+                val = str(item).strip().lower()
             if val and val != "nan":
                 cleaned.append(val)
         return list(dict.fromkeys(cleaned))
     return []
 def remove_redundant_features(features):
     cleaned = []
     seen_words = []
     for feat in features:
         feat_words = set(feat.split())
         redundant = False
         for existing in seen_words:
+            overlap = len(feat_words & existing) / max(len(feat_words), 1)
             if overlap >= 0.60:
                 redundant = True
                 break
     return cleaned
+def empty_result(unique_a=None, unique_b=None):
     return {
         "score": 0.0,
         "coverage": 0.0,
         "unique_b": unique_b or []
     }
 def encode_features(
     features: List[str],
     model
+):
+    import numpy as np
     if not features:
         return np.array([])
         convert_to_numpy=True,
         normalize_embeddings=True
     )
     return vectors.astype("float32")
 def compute_feature_similarity(
     features_a,
     features_b,
     model=None,
     threshold: float = DEFAULT_THRESHOLD
 ) -> Dict[str, Any]:
     if model is None:
         model = load_feature_model()
+    fa = remove_redundant_features(safe_feature_list(features_a))
+    fb = remove_redundant_features(safe_feature_list(features_b))
     if not fa or not fb:
+        return empty_result(unique_a=fa, unique_b=fb)
     emb_a = encode_features(fa, model)
     emb_b = encode_features(fb, model)
+    sim_matrix = cosine_similarity(emb_a, emb_b)
+    # Hungarian match
+    row_idx, col_idx = linear_sum_assignment(-sim_matrix)
     matches = []
     matched_a = set()
     matched_b = set()
     for i, j in zip(row_idx, col_idx):
         sim = float(sim_matrix[i, j])
         if sim >= threshold:
             matches.append({
                 "feature_a": fa[i],
                 "feature_b": fb[j],
                 "score": round(sim, 3)
             })
             matched_a.add(i)
             matched_b.add(j)
     # Final Metrics
+    import numpy as np
+    shared_scores = [m["score"] for m in matches]
+    mean_similarity = float(np.mean(shared_scores)) if shared_scores else 0.0
     min_len = min(len(fa), len(fb))
+    coverage = len(matches) / min_len if min_len > 0 else 0.0
+    final_score = (SIMILARITY_WEIGHT * mean_similarity) + (COVERAGE_WEIGHT * coverage)
     if len(matches) == 0:
         final_score = 0.0
     final_score = min(final_score, 1.0)
+    matched_text_a = " ".join([m["feature_a"] for m in matches]).lower()
+    matched_text_b = " ".join([m["feature_b"] for m in matches]).lower()
+    def is_semantically_redundant(feature, matched_text):
         words = set(feature.lower().split())
+        overlap = sum(1 for w in words if w in matched_text)
+        return (overlap / max(len(words), 1)) >= 0.5
     unique_a = [
+        fa[i] for i in range(len(fa))
+        if i not in matched_a and not is_semantically_redundant(fa[i], matched_text_a)
     ]
     unique_b = [
+        fb[j] for j in range(len(fb))
+        if j not in matched_b and not is_semantically_redundant(fb[j], matched_text_b)
     ]
     return {
         "unique_b": unique_b
     }
 def compare_projects(
     df: pd.DataFrame,
     idx1: int,
     idx2: int,
     model=None
 ) -> Dict[str, Any]:
+    if idx1 not in df.index or idx2 not in df.index:
+        return empty_result()
     f1 = df.loc[idx1, "features"]
     f2 = df.loc[idx2, "features"]
+    return compute_feature_similarity(f1, f2, model=model)
 def compare_project_against_many(
+    df: pd.DataFrame,
+    idx1: int,
+    indices: List[int],
+    model=None
+) -> Dict[int, Dict[str, Any]]:
+    if idx1 not in df.index:
+        return {}
+    f1 = df.loc[idx1, 'features']
+    results = {}
+    for idx2 in indices:
+        if idx2 in df.index:
+            f2 = df.loc[idx2, 'features']
+            results[idx2] = compute_feature_similarity(f1, f2, model=model)
     return results

src/similarity_model/preprocessing.py CHANGED Viewed

@@ -1,9 +1,6 @@
-# src/preprocessing.py
-# FINAL POLISHED VERSION
-# Best Practical Feature Extraction for Graduation Project System
 import re
 import logging
 import numpy as np
 from functools import lru_cache
 from pathlib import Path
@@ -11,80 +8,42 @@ import pandas as pd
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
-# =====================================================
-# Logging
-# =====================================================
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s | %(levelname)s | %(message)s"
-)
 logger = logging.getLogger(__name__)
-# =====================================================
-# Models
-# =====================================================
 MODEL_NAME = "all-MiniLM-L6-v2"
 @lru_cache(maxsize=1)
 def _get_embed_model():
-    """Lazy-load the embedding model once on first use."""
     logger.info(f"Loading embed model: {MODEL_NAME}")
     return SentenceTransformer(MODEL_NAME)
-# =====================================================
-# Config
-# =====================================================
 MIN_WORDS = 8
 MAX_WORDS = 4000
-# =====================================================
-# Helpers
-# =====================================================
-def normalize_text(text):
-    """
-    Clean raw text
-    """
     if pd.isna(text):
         return ""
     text = str(text).lower().strip()
-    # remove urls/emails
-    text = re.sub(
-        r"http\S+|www\S+|\S+@\S+",
-        " ",
-        text
-    )
-    # keep useful chars
-    text = re.sub(
-        r"[^a-z0-9\+\#\./\- ]",
-        " ",
-        text
-    )
-    # remove spaces
-    text = re.sub(
-        r"\s+",
-        " ",
-        text
-    )
     return text.strip()
-def semantic_deduplicate(
-    features,
-    model,
-    threshold=0.85
-):
-    """
-    Remove semantically similar features.
-    """
     if len(features) <= 1:
         return features
@@ -95,126 +54,55 @@ def semantic_deduplicate(
     )
     kept = []
     for i, feat in enumerate(features):
         redundant = False
         for existing in kept:
             sim = cosine_similarity(
                 embeddings[i].reshape(1, -1),
                 embeddings[existing].reshape(1, -1)
             )[0][0]
             if sim >= threshold:
                 redundant = True
                 break
         if not redundant:
             kept.append(i)
     return [features[i] for i in kept]
-# =====================================================
-# Local Feature Dictionary (cached)
-# =====================================================
-_PROJECT_ROOT = Path(__file__).resolve().parents[2]
-_METADATA_PATH = _PROJECT_ROOT / "models" / "metadata.parquet"
 @lru_cache(maxsize=1)
-def _load_known_features():
-    """
-    Load all known feature strings from the training metadata.
-    Cached — reads the parquet file only once per process.
-    Sorted longest-first so multi-word features (e.g. 'deep learning')
-    are matched before their sub-words (e.g. 'learning').
-    """
-    if not _METADATA_PATH.exists():
-        logger.warning("metadata.parquet not found; feature extraction will return []")
-        return []
-    df = pd.read_parquet(str(_METADATA_PATH))
-    features_set = set()
-    for f_list in df.get("features", pd.Series(dtype=object)):
-        if isinstance(f_list, (list, np.ndarray, tuple, set)):
-            for f in f_list:
-                val = str(f).strip().lower()
-                if val and val != "nan" and len(val) >= 3:
-                    features_set.add(val)
-    logger.info(f"Loaded {len(features_set)} known features from metadata")
-    # longest first → greedy multi-word match wins
-    return sorted(features_set, key=len, reverse=True)
-# =====================================================
-# Main Feature Extraction  (fully local, no API)
-# =====================================================
 def extract_features(text: str) -> list:
     """
-    Match technical features from text against the known feature
-    dictionary built from training data.
-    No external API required.
     """
-    known_features = _load_known_features()
-    if not known_features:
-        return []
-    text_norm = normalize_text(text)
     matched = []
-    for feat in known_features:
-        pattern = r'\b' + re.escape(feat) + r'\b'
-        if re.search(pattern, text_norm):
-            matched.append(feat)
-            if len(matched) >= 15:   # collect up to 15, dedup will trim
-                break
-    logger.info(f"Local feature extraction matched {len(matched)} features")
-    return semantic_deduplicate(
-        matched,
-        _get_embed_model(),
-        threshold=0.85
-    )
-# =====================================================
-# Main Pipeline
-# =====================================================
 def preprocess_dataset(df):
-    """
-    Full preprocessing pipeline
-    """
-    logger.info(
-        "Starting preprocessing..."
-    )
     df = df.copy()
-    # clean columns
-    df.columns = (
-        df.columns
-        .str.strip()
-        .str.lower()
-        .str.replace(
-            r"\W+",
-            "_",
-            regex=True
-        )
-    )
-    # =============================================
-    # Column Mapping Fix
-    # =============================================
     column_mapping = {
         "title": "project_title",
         "ai_summary": "ai_summary",
@@ -230,143 +118,26 @@ def preprocess_dataset(df):
     df = df.rename(columns=column_mapping)
-    # ensure needed columns
-    for col in [
-        "project_title",
-        "abstract",
-        "description"
-    ]:
         if col not in df.columns:
             df[col] = ""
-        df[col] = (
-            df[col]
-            .fillna("")
-            .astype(str)
-        )
-    # =============================================
-    # Smart weighted merge
-    # =============================================
-    df["full_content"] = (
-        df["project_title"] + ". " +
-        df["project_title"] + ". " +
-        df["abstract"] + ". " +
-        df["description"]
-    )
-    # normalize
-    df["clean_text"] = (
-        df["full_content"]
-        .apply(normalize_text)
-    )
-    # remove duplicates
     before = len(df)
-    df = df.drop_duplicates(
-        subset=[
-            "project_title",
-            "clean_text"
-        ]
-    ).copy()
-    logger.info(
-        f"Removed duplicates: {before-len(df)}"
-    )
-    # word count filter
-    df["word_count"] = (
-        df["clean_text"]
-        .str.split()
-        .str.len()
-    )
-    df = df[
-        df["word_count"].between(
-            MIN_WORDS,
-            MAX_WORDS
-        )
-    ].copy()
-    df.reset_index(
-        drop=True,
-        inplace=True
-    )
-    # =============================================
-    # Feature Extraction
-    # =============================================
-    logger.info(
-        "Extracting features..."
-    )
-    df["features"] = (
-        df["clean_text"]
-        .apply(extract_features)
-    )
-    # remove empty rows
-    df = df[
-        df["features"]
-        .apply(len) > 0
-    ].copy()
-    df.reset_index(
-        drop=True,
-        inplace=True
-    )
-    logger.info(
-        f"Final rows: {len(df)}"
-    )
     return df
-# =====================================================
-# Save
-# =====================================================
-def save_processed_data(
-    df,
-    output_dir="Data/processed"
-):
-    path = Path(output_dir)
-    path.mkdir(
-        parents=True,
-        exist_ok=True
-    )
-    df.to_parquet(
-        path / "projects_clean.parquet",
-        index=False
-    )
-    df.to_csv(
-        path / "projects_clean.csv",
-        index=False
-    )
-    logger.info(
-        f"Saved to {path}"
-    )
-# =====================================================
-# Run
-# =====================================================
-if __name__ == "__main__":
-    file_path = "Data/raw/projects.xlsx"
-    if file_path.endswith(".csv"):
-        raw_df = pd.read_csv(file_path)
-    else:
-        raw_df = pd.read_excel(file_path)
-    clean_df = preprocess_dataset(raw_df)
-    save_processed_data(clean_df)

 import re
 import logging
+import yake
 import numpy as np
 from functools import lru_cache
 from pathlib import Path
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
+logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
 logger = logging.getLogger(__name__)
 MODEL_NAME = "all-MiniLM-L6-v2"
 @lru_cache(maxsize=1)
 def _get_embed_model():
     """Return the SentenceTransformer for MODEL_NAME, building it on first call.

     The zero-argument call is memoized by ``lru_cache(maxsize=1)``, so every
     later call returns the instance created by the first one.
     """
     logger.info(f"Loading embed model: {MODEL_NAME}")
     model = SentenceTransformer(MODEL_NAME)
     return model
 # Word-count bounds for a row's clean_text; preprocess_dataset keeps only rows
 # whose word_count lies between MIN_WORDS and MAX_WORDS.
 MIN_WORDS = 8
 MAX_WORDS = 4000
def normalize_text(text):
    """Lower-case ``text`` and reduce it to a single-spaced string of kept characters.

    Missing values (as judged by ``pd.isna``) become ``""``. Otherwise the
    value is stringified, lower-cased and stripped, then three substitutions
    run in order, each replacing its matches with one space:

    1. URLs (``http...``, ``www...``) and e-mail-like tokens,
    2. any character other than ``a-z``, ``0-9``, ``+``, ``#``, ``.``, ``/``,
       ``-`` and the space,
    3. runs of whitespace.

    Returns:
        The cleaned string with leading/trailing whitespace removed.
    """
    if pd.isna(text):
        return ""

    cleaned = str(text).lower().strip()
    # Order matters: URLs/e-mails must go before their punctuation is split up.
    for pattern in (
        r"http\S+|www\S+|\S+@\S+",
        r"[^a-z0-9\+\#\./\- ]",
        r"\s+",
    ):
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned.strip()
def substring_deduplicate(features):
    """Drop phrases that are fully contained, word for word, in a longer kept phrase.

    Phrases are visited longest first (ties keep their input order). A phrase
    is discarded when its whole word sequence appears as a contiguous run of
    words inside a phrase that was already kept; exact repeats are discarded
    the same way.

    Containment is checked on word boundaries rather than raw characters, so
    "age detection" is NOT treated as part of "image detection", while
    "machine learning" is still absorbed by "machine learning model".

    Args:
        features: Iterable of phrase strings.

    Returns:
        A new list of the surviving phrases (original strings), longest first.
    """
    ordered = sorted(features, key=len, reverse=True)
    kept = []
    kept_padded = []  # space-padded, whitespace-normalized forms of `kept`
    for feat in ordered:
        words = feat.split()
        # Padding with spaces turns a plain substring test into a whole-word
        # test. A phrase with no words pads to " ", which any kept phrase
        # contains, so blank entries are dropped once something is kept.
        padded = f" {' '.join(words)} " if words else " "
        if any(padded in longer for longer in kept_padded):
            continue
        kept.append(feat)
        kept_padded.append(padded)
    return kept
+def semantic_deduplicate(features, model, threshold=0.85):
     if len(features) <= 1:
         return features
     )
     kept = []
     for i, feat in enumerate(features):
         redundant = False
         for existing in kept:
             sim = cosine_similarity(
                 embeddings[i].reshape(1, -1),
                 embeddings[existing].reshape(1, -1)
             )[0][0]
             if sim >= threshold:
                 redundant = True
                 break
         if not redundant:
             kept.append(i)
     return [features[i] for i in kept]
 @lru_cache(maxsize=1)
 def _get_yake_extractor():
     """Build the YAKE keyword extractor on first call and reuse it afterwards.

     The extractor is created with lan="en", n=3, dedupLim=0.9, top=20 and
     features=None; ``lru_cache(maxsize=1)`` memoizes the single instance.
     """
     logger.info("Initializing YAKE NLP feature extractor")
     settings = {"lan": "en", "n": 3, "dedupLim": 0.9, "top": 20, "features": None}
     return yake.KeywordExtractor(**settings)
 def extract_features(text: str) -> list:
     """
     Extract multi-word key phrases from ``text`` using YAKE.

     Each YAKE keyword is stripped and lower-cased; single-word keywords and
     repeats are discarded. The remaining phrases are passed through
     ``substring_deduplicate`` and then ``semantic_deduplicate`` (threshold
     0.85) with the shared embedding model.

     Args:
         text: Text to mine for key phrases.

     Returns:
         A list of phrases. ``[]`` when ``text`` is empty, blank or not a
         string, or when no multi-word phrase was collected (including the
         case where YAKE raised before producing one).
     """
     # Nothing to mine: return before the extractor or embedding model is built.
     if not isinstance(text, str) or not text.strip():
         return []

     matched = []
     seen = set()  # O(1) repeat check; `matched` preserves YAKE's order
     try:
         kw_extractor = _get_yake_extractor()
         for kw, _score in kw_extractor.extract_keywords(text):
             kw_clean = str(kw).strip().lower()
             if len(kw_clean.split()) > 1 and kw_clean not in seen:
                 seen.add(kw_clean)
                 matched.append(kw_clean)
     except Exception as e:
         # Deliberately best-effort: a YAKE failure must not abort the caller.
         # logger.exception keeps the traceback that logger.error discarded.
         logger.exception(f"YAKE extraction failed: {e}")

     if not matched:
         return []

     matched = substring_deduplicate(matched)
     return semantic_deduplicate(matched, _get_embed_model(), threshold=0.85)
 def preprocess_dataset(df):
+    logger.info("Starting preprocessing...")
     df = df.copy()
+    df.columns = df.columns.str.strip().str.lower().str.replace(r"\W+", "_", regex=True)
     column_mapping = {
         "title": "project_title",
         "ai_summary": "ai_summary",
     df = df.rename(columns=column_mapping)
+    for col in ["project_title", "abstract", "description"]:
         if col not in df.columns:
             df[col] = ""
+        df[col] = df[col].fillna("").astype(str)
+    df["full_content"] = df["project_title"] + ". " + df["abstract"] + ". " + df["description"]
+    df["clean_text"] = df["full_content"].apply(normalize_text)
     before = len(df)
+    df = df.drop_duplicates(subset=["project_title", "clean_text"]).copy()
+    logger.info(f"Removed duplicates: {before-len(df)}")
+    df["word_count"] = df["clean_text"].str.split().str.len()
+    df = df[df["word_count"].between(MIN_WORDS, MAX_WORDS)].copy()
+    df.reset_index(drop=True, inplace=True)
+    logger.info("Extracting features...")
+    df["features"] = df["clean_text"].apply(extract_features)
+    df = df[df["features"].apply(len) > 0].copy()
+    df.reset_index(drop=True, inplace=True)
+    logger.info(f"Final rows: {len(df)}")
     return df

src/similarity_model/similarity_engine.py CHANGED Viewed

@@ -417,25 +417,3 @@ def find_similar_projects(
     return final_df
-# =====================================================
-# Example Run
-# =====================================================
-if __name__ == "__main__":
-    results = find_similar_projects(
-        title="Smart Library",
-        abstract="""
-        AI based digital library for students.
-        """,
-        description="""
-        Includes chatbot,
-        recommendation system,
-        qr code scanner,
-        mobile application.
-        """,
-        features=["library"],
-        top_k=5
-    )
-    print(results)


417
418	return final_df
419