File size: 6,087 Bytes
7d235bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5e95de5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7d235bd
 
 
5e95de5
7d235bd
 
 
 
 
 
5e95de5
 
 
7d235bd
 
 
5e95de5
 
 
 
7d235bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import re
import logging
import yake
import numpy as np
from functools import lru_cache
from pathlib import Path
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Configure logging when this module is imported: INFO level, with each record
# rendered as "<timestamp> | <level> | <message>".
# NOTE(review): calling basicConfig at import time touches the root logger of
# whatever application imports this module — confirm that is intended.
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
# Module-scoped logger used by the functions in this file.
logger = logging.getLogger(__name__)

# Identifier handed to SentenceTransformer when the embedding model is built.
MODEL_NAME = "all-mpnet-base-v2"

@lru_cache(maxsize=1)
def _get_embed_model():
    """Return the shared embedding model, constructing it on first use.

    The function takes no arguments and is wrapped in ``lru_cache(maxsize=1)``,
    so the model is instantiated by the first call and the same instance is
    handed back by every later call.
    """
    logger.info(f"Loading embed model: {MODEL_NAME}")
    embed_model = SentenceTransformer(MODEL_NAME)
    return embed_model

# Word-count bounds applied by preprocess_dataset: rows whose cleaned text has
# fewer than MIN_WORDS or more than MAX_WORDS words are dropped.
MIN_WORDS = 8
MAX_WORDS = 4000

def normalize_text(text):
    """Lower-case and sanitise free text before keyword extraction.

    Processing order:
      1. Missing values (``None``, ``NaN``, ``pd.NA``) become ``""``.
      2. The value is stringified, lower-cased and stripped.
      3. URLs and e-mail addresses are replaced by a space.
      4. Every character outside ``a-z 0-9 + # . / -`` and space is replaced
         by a space.
      5. Runs of whitespace collapse to a single space.

    Args:
        text: Any scalar value; typically a string cell from a DataFrame.

    Returns:
        The normalised string (possibly empty).
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower().strip()
    # URLs are recognised only by an explicit scheme ("http://", "https://")
    # or a "www." prefix at a word boundary. A bare "http\S+" / "www\S+"
    # would also delete ordinary tokens such as "https", "httpx" or
    # "httpclient", which are meaningful technology keywords.
    without_links = re.sub(r"\bhttps?://\S+|\bwww\.\S+|\S+@\S+", " ", lowered)
    # Keep '+', '#', '.', '/' and '-' so names like "c++", "c#" and "node.js" survive.
    allowed_only = re.sub(r"[^a-z0-9\+\#\./\- ]", " ", without_links)
    return re.sub(r"\s+", " ", allowed_only).strip()

def substring_deduplicate(features):
    """Drop every phrase that occurs inside a longer (or identical) kept phrase.

    Candidates are visited from longest to shortest (stable for equal
    lengths), so a phrase is only ever compared against phrases at least as
    long as itself. Containment is a plain character-level ``in`` test.

    Args:
        features: Iterable of strings.

    Returns:
        A new list of the surviving strings, ordered longest first.
    """
    survivors = []
    for candidate in sorted(features, key=len, reverse=True):
        if not any(candidate in longer for longer in survivors):
            survivors.append(candidate)
    return survivors

def semantic_deduplicate(features, model, threshold=0.85):
    """Greedily remove phrases that are near-duplicates in embedding space.

    Phrases are visited in input order. A phrase is kept only if its cosine
    similarity to every previously kept phrase is below ``threshold``.

    Args:
        features: Sequence of phrases to filter.
        model: Object exposing ``encode(features, convert_to_numpy=True,
            normalize_embeddings=True)`` and returning one vector per phrase.
        threshold: Cosine similarity at or above which a phrase counts as
            redundant.

    Returns:
        ``features`` itself when it has at most one element, otherwise a new
        list with the kept phrases in their original order.
    """
    if len(features) <= 1:
        return features

    embeddings = np.asarray(
        model.encode(
            features,
            convert_to_numpy=True,
            normalize_embeddings=True,
        )
    )

    # Normalise rows explicitly so the dot product is a true cosine even if
    # the model returned un-normalised vectors; zero vectors keep a divisor
    # of 1 and therefore have similarity 0 to everything.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit_vectors = embeddings / norms

    # One matrix product replaces a separate similarity call for every pair.
    similarity = unit_vectors @ unit_vectors.T

    kept = []
    for i, row in enumerate(similarity):
        if kept and (row[kept] >= threshold).any():
            continue
        kept.append(i)

    return [features[i] for i in kept]

@lru_cache(maxsize=1)
def _get_yake_extractor():
    """Return the shared YAKE keyword extractor, creating it on first use.

    The extractor is configured with ``lan="en"``, ``n=3``, ``dedupLim=0.9``,
    ``top=20`` and ``features=None``. Because the function takes no arguments
    and is wrapped in ``lru_cache(maxsize=1)``, the same extractor instance is
    reused by every call after the first.
    """
    logger.info("Initializing YAKE NLP feature extractor")
    extractor_settings = {
        "lan": "en",
        "n": 3,  # presumably the maximum n-gram size — confirm against the YAKE docs
        "dedupLim": 0.9,
        "top": 20,
        "features": None,
    }
    return yake.KeywordExtractor(**extractor_settings)

import json

# Lazily populated mapping {feature: share of indexed rows containing it};
# None until load_feature_frequencies_cache() has run once.
_feature_db_frequencies = None

def load_feature_frequencies_cache():
    """Return per-feature document frequencies, computing them on first call.

    On the first call the project metadata is loaded via ``load_metadata()``
    and, for each row of its ``"features"`` column, every distinct feature
    (stripped, lower-cased, non-empty) is counted once. Counts are divided by
    the number of rows. A cell may be a list or a JSON-encoded string; a
    string that does not parse counts as having no features, and any other
    cell type is ignored.

    The result is stored in the module-level ``_feature_db_frequencies`` and
    returned unchanged by later calls. This is best effort: if the metadata
    cannot be loaded or processed, a warning is logged and an empty mapping
    is cached instead, so that failure is not retried.

    Returns:
        dict mapping feature string to a float in ``(0, 1]``; possibly empty.
    """
    global _feature_db_frequencies
    if _feature_db_frequencies is not None:
        return _feature_db_frequencies

    try:
        from collections import Counter

        # Imported here rather than at module top, as in the original code
        # (presumably to avoid an import cycle — confirm).
        from src.similarity_model.semantic_search import load_metadata

        df = load_metadata()
        total_docs = len(df)
        counter = Counter()
        if total_docs > 0:
            for feats in df["features"]:
                if isinstance(feats, str):
                    try:
                        feats = json.loads(feats)
                    except (ValueError, RecursionError):
                        # Malformed JSON: treat the row as having no features.
                        feats = []
                if isinstance(feats, list):
                    # Set semantics: a feature counts once per row.
                    distinct = {str(f).strip().lower() for f in feats}
                    distinct.discard("")
                    counter.update(distinct)
        _feature_db_frequencies = {k: v / total_docs for k, v in counter.items()}
    except Exception as exc:
        # Keep the best-effort fallback, but say that it happened.
        logging.getLogger(__name__).warning(
            "Feature frequency cache unavailable; using empty frequencies: %s", exc
        )
        _feature_db_frequencies = {}
    return _feature_db_frequencies

def extract_features(text: str) -> list:
    """Extract de-duplicated multi-word key phrases from ``text`` using YAKE.

    A YAKE keyword is accepted when, after stripping and lower-casing, it has
    more than one word, has not been accepted already, and its document
    frequency in the cached feature index is at most 15%. Accepted phrases
    then go through substring de-duplication followed by semantic
    de-duplication at a 0.85 similarity threshold.

    Any exception raised during extraction is logged; phrases accepted before
    the failure are still processed.

    Returns:
        List of phrases; ``[]`` when nothing was accepted.
    """
    max_df_threshold = 0.15  # Drop keywords present in more than 15% of the database
    candidates = []
    try:
        extractor = _get_yake_extractor()
        scored_keywords = extractor.extract_keywords(text)
        doc_frequencies = load_feature_frequencies_cache()

        for keyword, _score in scored_keywords:
            phrase = str(keyword).strip().lower()
            # Single words and repeats are never accepted.
            if len(phrase.split()) <= 1 or phrase in candidates:
                continue
            # Document-frequency filter: phrases unknown to the index count as 0.0.
            if doc_frequencies.get(phrase, 0.0) <= max_df_threshold:
                candidates.append(phrase)
    except Exception as e:
        logger.error(f"YAKE extraction failed: {e}")

    if candidates:
        unique_phrases = substring_deduplicate(candidates)
        return semantic_deduplicate(unique_phrases, _get_embed_model(), threshold=0.85)
    return []

def preprocess_dataset(df):
    """Clean a project DataFrame and attach extracted key-phrase features.

    Works on a copy of ``df`` and performs, in order:
      1. Column-name normalisation (strip, lower-case, non-word runs -> ``_``)
         and renaming of ``title`` to ``project_title``.
      2. Creation/coercion of ``project_title``, ``abstract`` and
         ``description`` as string columns with missing values as ``""``.
      3. Construction of ``full_content`` and its normalised ``clean_text``.
      4. Removal of rows duplicated on ``(project_title, clean_text)``.
      5. Removal of rows whose ``word_count`` is outside
         ``[MIN_WORDS, MAX_WORDS]``.
      6. Feature extraction into ``features`` and removal of rows with none.

    Args:
        df: pandas DataFrame of project records.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and the added columns
        ``full_content``, ``clean_text``, ``word_count`` and ``features``.
    """
    logger.info("Starting preprocessing...")
    df = df.copy()

    stripped_lower = df.columns.str.strip().str.lower()
    df.columns = stripped_lower.str.replace(r"\W+", "_", regex=True)

    column_mapping = {
        "title": "project_title",
        "ai_summary": "ai_summary",
        "technologies": "technologies",
        "keywords": "keywords",
        "abstract": "abstract",
        "description": "description",
        "problem_statement": "problem_statement",
        "proposed_solution": "proposed_solution",
        "objectives": "objectives",
        "category": "category"
    }
    df = df.rename(columns=column_mapping)

    # The three text columns must exist and hold plain strings before they
    # are concatenated below.
    for col in ("project_title", "abstract", "description"):
        if col not in df.columns:
            df[col] = ""
        df[col] = df[col].fillna("").astype(str)

    title_part = df["project_title"] + ". "
    abstract_part = df["abstract"] + ". "
    df["full_content"] = title_part + abstract_part + df["description"]
    df["clean_text"] = df["full_content"].apply(normalize_text)

    before = len(df)
    df = df.drop_duplicates(subset=["project_title", "clean_text"]).copy()
    logger.info(f"Removed duplicates: {before-len(df)}")

    df["word_count"] = df["clean_text"].str.split().str.len()
    within_bounds = df["word_count"].between(MIN_WORDS, MAX_WORDS)
    df = df[within_bounds].copy().reset_index(drop=True)

    logger.info("Extracting features...")
    df["features"] = df["clean_text"].apply(extract_features)
    has_features = df["features"].apply(len) > 0
    df = df[has_features].copy().reset_index(drop=True)

    logger.info(f"Final rows: {len(df)}")
    return df