bat-6 committed on
Commit
e893e13
·
1 Parent(s): b9ada33
Data/database/__pycache__/sql_connector.cpython-313.pyc CHANGED
Binary files a/Data/database/__pycache__/sql_connector.cpython-313.pyc and b/Data/database/__pycache__/sql_connector.cpython-313.pyc differ
 
src/similarity_model/__pycache__/embedding_engine.cpython-313.pyc CHANGED
Binary files a/src/similarity_model/__pycache__/embedding_engine.cpython-313.pyc and b/src/similarity_model/__pycache__/embedding_engine.cpython-313.pyc differ
 
src/similarity_model/__pycache__/feature_similarity.cpython-313.pyc CHANGED
Binary files a/src/similarity_model/__pycache__/feature_similarity.cpython-313.pyc and b/src/similarity_model/__pycache__/feature_similarity.cpython-313.pyc differ
 
src/similarity_model/__pycache__/hybrid_ranker.cpython-313.pyc CHANGED
Binary files a/src/similarity_model/__pycache__/hybrid_ranker.cpython-313.pyc and b/src/similarity_model/__pycache__/hybrid_ranker.cpython-313.pyc differ
 
src/similarity_model/__pycache__/preprocessing.cpython-313.pyc CHANGED
Binary files a/src/similarity_model/__pycache__/preprocessing.cpython-313.pyc and b/src/similarity_model/__pycache__/preprocessing.cpython-313.pyc differ
 
src/similarity_model/__pycache__/semantic_search.cpython-313.pyc CHANGED
Binary files a/src/similarity_model/__pycache__/semantic_search.cpython-313.pyc and b/src/similarity_model/__pycache__/semantic_search.cpython-313.pyc differ
 
src/similarity_model/__pycache__/similarity_engine.cpython-313.pyc CHANGED
Binary files a/src/similarity_model/__pycache__/similarity_engine.cpython-313.pyc and b/src/similarity_model/__pycache__/similarity_engine.cpython-313.pyc differ
 
src/similarity_model/embedding_engine.py CHANGED
@@ -1,97 +1,36 @@
1
- # src/embedding_engine.py
2
-
3
- import re
4
  import logging
5
  from pathlib import Path
6
  from typing import List
7
-
8
  import pandas as pd
9
  import numpy as np
10
  import faiss
11
  from sentence_transformers import SentenceTransformer
12
- from Data.database.sql_connector import (
13
- load_preprocessed_projects
14
- )
15
 
16
- # =====================================================
17
- # Logging
18
- # =====================================================
19
- logging.basicConfig(
20
- level=logging.INFO,
21
- format="%(asctime)s | %(levelname)s | %(message)s"
22
- )
23
  logger = logging.getLogger(__name__)
24
 
25
- # =====================================================
26
- # Config
27
- # =====================================================
28
  DEFAULT_MODEL = "all-MiniLM-L6-v2"
29
-
30
  TEXT_COL = "clean_text"
31
  TITLE_COL = "project_title"
32
  TECH_COL = "technologies"
33
 
34
- # Resolve paths relative to the project root (3 levels up from this file:
35
- # src/similarity_model/embedding_engine.py -> src/similarity_model -> src -> project root)
36
  _PROJECT_ROOT = Path(__file__).resolve().parents[2]
37
-
38
  MODEL_DIR = _PROJECT_ROOT / "models"
39
  INDEX_PATH = MODEL_DIR / "faiss_index.bin"
40
  META_PATH = MODEL_DIR / "metadata.parquet"
41
 
42
- TOP_K_DEFAULT = 10
43
- MIN_SCORE_THRESHOLD = 0.35
44
-
45
- # =====================================================
46
- # Helpers
47
- # =====================================================
48
- def normalize_text(text: str) -> str:
49
- """
50
- Same cleaning logic used in preprocessing.
51
- """
52
- if pd.isna(text):
53
- return ""
54
-
55
- text = str(text).strip().lower()
56
-
57
- text = re.sub(r"http\S+|www\S+|\S+@\S+", " ", text)
58
- text = re.sub(r"[^a-z0-9\s\+\#\./\-]", " ", text)
59
- text = re.sub(r"\s+", " ", text).strip()
60
-
61
- return text
62
-
63
-
64
- def tokenize(text: str) -> set:
65
- """
66
- Simple tokenization for keyword boosting.
67
- """
68
- text = normalize_text(text)
69
- return set(text.split())
70
-
71
-
72
- # =====================================================
73
- # Core Engine
74
- # =====================================================
75
  class ProjectEmbedder:
76
-
77
  def __init__(self, model_name: str = DEFAULT_MODEL):
78
  logger.info(f"Loading embedding model: {model_name}")
79
-
80
  self.model = SentenceTransformer(model_name)
81
  self.index = None
82
  self.metadata = None
83
 
84
- # -------------------------------------------------
85
- # Embeddings
86
- # -------------------------------------------------
87
- def generate_embeddings(
88
- self,
89
- texts: List[str],
90
- batch_size: int = 64
91
- ) -> np.ndarray:
92
-
93
  logger.info(f"Generating embeddings for {len(texts)} projects...")
94
-
95
  vectors = self.model.encode(
96
  texts,
97
  batch_size=batch_size,
@@ -99,23 +38,13 @@ class ProjectEmbedder:
99
  convert_to_numpy=True,
100
  normalize_embeddings=True
101
  )
102
-
103
  return vectors.astype("float32")
104
 
105
- # -------------------------------------------------
106
- # Build Index
107
- # -------------------------------------------------
108
  def build_index(self, df: pd.DataFrame):
109
- """
110
- Build FAISS cosine index.
111
- """
112
-
113
  self.metadata = df.copy()
114
-
115
- # preserve ids
116
  self.metadata = self.metadata.reset_index(drop=True)
117
 
118
- # ensure needed columns exist
119
  for col in [TITLE_COL, TEXT_COL]:
120
  if col not in self.metadata.columns:
121
  self.metadata[col] = ""
@@ -124,186 +53,48 @@ class ProjectEmbedder:
124
  self.metadata[TECH_COL] = ""
125
 
126
  FEATURE_COL = "features"
127
-
128
  if FEATURE_COL not in self.metadata.columns:
129
  self.metadata[FEATURE_COL] = ""
130
 
131
-
132
- feature_text = (
133
- self.metadata[FEATURE_COL]
134
- .fillna("")
135
- .astype(str)
136
- )
137
- # weighted content:
138
- # title repeated twice
139
  rich_texts = (
140
  self.metadata[TITLE_COL].fillna("").astype(str)
141
  + " "
142
- + self.metadata[TITLE_COL].fillna("").astype(str)
143
- + " "
144
  + self.metadata[TEXT_COL].fillna("").astype(str)
145
  + " "
146
  + feature_text
147
  ).tolist()
148
 
149
  embeddings = self.generate_embeddings(rich_texts)
150
-
151
  dim = embeddings.shape[1]
152
-
153
  base_index = faiss.IndexFlatIP(dim)
154
  self.index = faiss.IndexIDMap(base_index)
155
-
156
  ids = np.arange(len(self.metadata)).astype("int64")
157
-
158
  self.index.add_with_ids(embeddings, ids)
 
159
 
160
- logger.info(
161
- f"FAISS index built successfully with {self.index.ntotal} vectors."
162
- )
163
-
164
- # -------------------------------------------------
165
- # Save
166
- # -------------------------------------------------
167
  def save_artifacts(self, folder: str = "models"):
168
-
169
  path = Path(folder)
170
  path.mkdir(parents=True, exist_ok=True)
171
-
172
- faiss.write_index(
173
- self.index,
174
- str(path / "faiss_index.bin")
175
- )
176
-
177
- self.metadata.to_parquet(
178
- path / "metadata.parquet",
179
- index=False
180
- )
181
-
182
  logger.info(f"Artifacts saved to {folder}")
183
 
184
- # -------------------------------------------------
185
- # Load
186
- # -------------------------------------------------
187
  def load_artifacts(self, folder: str = "models"):
188
-
189
  path = Path(folder)
190
-
191
- self.index = faiss.read_index(
192
- str(path / "faiss_index.bin")
193
- )
194
-
195
- self.metadata = pd.read_parquet(
196
- path / "metadata.parquet"
197
- )
198
-
199
  logger.info("Artifacts loaded successfully.")
200
 
201
- # -------------------------------------------------
202
- # Search
203
- # -------------------------------------------------
204
- def search(
205
- self,
206
- query: str,
207
- k: int = TOP_K_DEFAULT,
208
- threshold: float = MIN_SCORE_THRESHOLD
209
- ) -> pd.DataFrame:
210
-
211
- if self.index is None or self.metadata is None:
212
- raise ValueError("Index or metadata not loaded.")
213
-
214
- # normalize query
215
- query_clean = normalize_text(query)
216
-
217
- query_vec = self.model.encode(
218
- [query_clean],
219
- convert_to_numpy=True,
220
- normalize_embeddings=True
221
- ).astype("float32")
222
-
223
- scores, ids = self.index.search(query_vec, k)
224
-
225
- query_words = tokenize(query_clean)
226
-
227
- results = []
228
-
229
- for idx, score in zip(ids[0], scores[0]):
230
-
231
- if idx == -1:
232
- continue
233
-
234
- row = self.metadata.loc[idx]
235
-
236
- final_score = float(score)
237
-
238
- # keyword boost
239
- title_words = tokenize(row[TITLE_COL])
240
- tech_words = tokenize(row[TECH_COL])
241
-
242
- overlap = len(query_words & title_words)
243
- overlap += len(query_words & tech_words)
244
-
245
- if overlap > 0:
246
- final_score += 0.02 * overlap
247
-
248
- # cap score
249
- final_score = min(final_score, 1.0)
250
-
251
- # threshold
252
- if final_score < threshold:
253
- continue
254
-
255
- results.append({
256
- "project_id": int(idx),
257
- "title": row[TITLE_COL],
258
- "technologies": row[TECH_COL],
259
- "similarity_score": round(final_score, 4)
260
- })
261
-
262
- if not results:
263
- return pd.DataFrame([{
264
- "message": "No similar projects found."
265
- }])
266
-
267
- return pd.DataFrame(results).sort_values(
268
- by="similarity_score",
269
- ascending=False
270
- ).reset_index(drop=True)
271
-
272
- # =====================================================
273
- # Full Training Pipeline
274
- # =====================================================
275
  def train_embedding_engine():
276
-
277
- logger.info(
278
- "Loading processed dataset from Azure SQL..."
279
- )
280
-
281
  df = load_preprocessed_projects()
282
-
283
  engine = ProjectEmbedder()
284
-
285
  engine.build_index(df)
286
-
287
  engine.save_artifacts()
288
-
289
- logger.info(
290
- "Embedding engine completed successfully."
291
- )
292
-
293
- return engine
294
-
295
-
296
- # =====================================================
297
- # Example Run
298
- # =====================================================
299
- if __name__ == "__main__":
300
-
301
- engine = train_embedding_engine()
302
-
303
- query = "Build a mobile app for expense tracking using flutter and firebase"
304
-
305
- print(f"\nQuery: {query}\n")
306
-
307
- results = engine.search(query, k=5)
308
-
309
- print(results)
 
 
 
 
1
  import logging
2
  from pathlib import Path
3
  from typing import List
 
4
  import pandas as pd
5
  import numpy as np
6
  import faiss
7
  from sentence_transformers import SentenceTransformer
 
 
 
8
 
9
+ from Data.database.sql_connector import load_preprocessed_projects
10
+
11
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
 
 
 
 
12
  logger = logging.getLogger(__name__)
13
 
 
 
 
14
  DEFAULT_MODEL = "all-MiniLM-L6-v2"
 
15
  TEXT_COL = "clean_text"
16
  TITLE_COL = "project_title"
17
  TECH_COL = "technologies"
18
 
19
+ # Resolve paths relative to the project root
 
20
  _PROJECT_ROOT = Path(__file__).resolve().parents[2]
 
21
  MODEL_DIR = _PROJECT_ROOT / "models"
22
  INDEX_PATH = MODEL_DIR / "faiss_index.bin"
23
  META_PATH = MODEL_DIR / "metadata.parquet"
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  class ProjectEmbedder:
 
26
  def __init__(self, model_name: str = DEFAULT_MODEL):
27
  logger.info(f"Loading embedding model: {model_name}")
 
28
  self.model = SentenceTransformer(model_name)
29
  self.index = None
30
  self.metadata = None
31
 
32
+ def generate_embeddings(self, texts: List[str], batch_size: int = 64) -> np.ndarray:
 
 
 
 
 
 
 
 
33
  logger.info(f"Generating embeddings for {len(texts)} projects...")
 
34
  vectors = self.model.encode(
35
  texts,
36
  batch_size=batch_size,
 
38
  convert_to_numpy=True,
39
  normalize_embeddings=True
40
  )
 
41
  return vectors.astype("float32")
42
 
 
 
 
43
  def build_index(self, df: pd.DataFrame):
44
+ """Build FAISS cosine index."""
 
 
 
45
  self.metadata = df.copy()
 
 
46
  self.metadata = self.metadata.reset_index(drop=True)
47
 
 
48
  for col in [TITLE_COL, TEXT_COL]:
49
  if col not in self.metadata.columns:
50
  self.metadata[col] = ""
 
53
  self.metadata[TECH_COL] = ""
54
 
55
  FEATURE_COL = "features"
 
56
  if FEATURE_COL not in self.metadata.columns:
57
  self.metadata[FEATURE_COL] = ""
58
 
59
+ feature_text = self.metadata[FEATURE_COL].fillna("").astype(str)
 
 
 
 
 
 
 
60
  rich_texts = (
61
  self.metadata[TITLE_COL].fillna("").astype(str)
62
  + " "
 
 
63
  + self.metadata[TEXT_COL].fillna("").astype(str)
64
  + " "
65
  + feature_text
66
  ).tolist()
67
 
68
  embeddings = self.generate_embeddings(rich_texts)
 
69
  dim = embeddings.shape[1]
70
+
71
  base_index = faiss.IndexFlatIP(dim)
72
  self.index = faiss.IndexIDMap(base_index)
 
73
  ids = np.arange(len(self.metadata)).astype("int64")
74
+
75
  self.index.add_with_ids(embeddings, ids)
76
+ logger.info(f"FAISS index built successfully with {self.index.ntotal} vectors.")
77
 
 
 
 
 
 
 
 
78
  def save_artifacts(self, folder: str = "models"):
 
79
  path = Path(folder)
80
  path.mkdir(parents=True, exist_ok=True)
81
+ faiss.write_index(self.index, str(path / "faiss_index.bin"))
82
+ self.metadata.to_parquet(path / "metadata.parquet", index=False)
 
 
 
 
 
 
 
 
 
83
  logger.info(f"Artifacts saved to {folder}")
84
 
 
 
 
85
  def load_artifacts(self, folder: str = "models"):
 
86
  path = Path(folder)
87
+ self.index = faiss.read_index(str(path / "faiss_index.bin"))
88
+ self.metadata = pd.read_parquet(path / "metadata.parquet")
 
 
 
 
 
 
 
89
  logger.info("Artifacts loaded successfully.")
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  def train_embedding_engine():
92
+ logger.info("Loading processed dataset from Azure SQL...")
 
 
 
 
93
  df = load_preprocessed_projects()
94
+
95
  engine = ProjectEmbedder()
 
96
  engine.build_index(df)
 
97
  engine.save_artifacts()
98
+
99
+ logger.info("Embedding engine completed successfully.")
100
+ return engine
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/similarity_model/feature_similarity.py CHANGED
@@ -1,115 +1,75 @@
1
- # src/feature_similarity.py
2
-
3
- import logging
4
- import ast
5
- from functools import lru_cache
6
  from typing import List, Dict, Any
7
-
8
- import numpy as np
9
  import pandas as pd
10
-
11
  from sentence_transformers import SentenceTransformer
12
- from sklearn.metrics.pairwise import cosine_similarity
13
  from scipy.optimize import linear_sum_assignment
14
-
15
- # =====================================================
16
- # Logging
17
- # =====================================================
18
- logging.basicConfig(
19
- level=logging.INFO,
20
- format="%(asctime)s | %(levelname)s | %(message)s"
21
- )
22
 
23
  logger = logging.getLogger(__name__)
24
 
25
- # =====================================================
26
- # Config
27
- # =====================================================
28
  MODEL_NAME = "all-MiniLM-L6-v2"
29
-
30
- DEFAULT_THRESHOLD = 0.80
31
-
32
  SIMILARITY_WEIGHT = 0.70
33
  COVERAGE_WEIGHT = 0.30
 
34
 
35
- # =====================================================
36
- # Model Loader
37
- # =====================================================
38
  @lru_cache(maxsize=1)
39
  def load_feature_model():
40
- """
41
- Load feature embedding model once.
42
- """
43
  logger.info(f"Loading feature model: {MODEL_NAME}")
44
  return SentenceTransformer(MODEL_NAME)
45
 
46
-
47
- # =====================================================
48
- # Helpers
49
- # =====================================================
50
  def safe_feature_list(features):
51
  """
52
  Convert any feature input into clean List[str]
53
- Supports:
54
- list, tuple, numpy array, string, NaN
55
  """
56
-
57
  import numpy as np
58
 
59
- # None
60
  if features is None:
61
  return []
62
 
63
- # numpy nan scalar only
64
- if isinstance(features, float):
65
- if pd.isna(features):
66
- return []
67
 
68
- # numpy array
69
  if isinstance(features, np.ndarray):
70
  features = features.tolist()
71
 
72
- # tuple
73
  if isinstance(features, tuple):
74
  features = list(features)
75
 
76
- # string
77
  if isinstance(features, str):
78
- features = [features]
 
 
 
 
 
 
 
 
79
 
80
- # list
81
  if isinstance(features, list):
82
-
83
  cleaned = []
84
-
85
  for item in features:
86
- val = str(item).strip().lower()
87
-
 
 
88
  if val and val != "nan":
89
  cleaned.append(val)
90
-
91
  return list(dict.fromkeys(cleaned))
92
 
93
  return []
94
 
95
-
96
  def remove_redundant_features(features):
97
-
98
  cleaned = []
99
  seen_words = []
100
 
101
  for feat in features:
102
-
103
  feat_words = set(feat.split())
104
-
105
  redundant = False
106
 
107
  for existing in seen_words:
108
-
109
- overlap = len(
110
- feat_words & existing
111
- ) / max(len(feat_words), 1)
112
-
113
  if overlap >= 0.60:
114
  redundant = True
115
  break
@@ -120,13 +80,7 @@ def remove_redundant_features(features):
120
 
121
  return cleaned
122
 
123
-
124
-
125
- def empty_result(
126
- unique_a=None,
127
- unique_b=None
128
- ) -> Dict[str, Any]:
129
-
130
  return {
131
  "score": 0.0,
132
  "coverage": 0.0,
@@ -136,15 +90,11 @@ def empty_result(
136
  "unique_b": unique_b or []
137
  }
138
 
139
-
140
  def encode_features(
141
  features: List[str],
142
  model
143
- ) -> np.ndarray:
144
- """
145
- Encode feature phrases into normalized vectors.
146
- """
147
-
148
  if not features:
149
  return np.array([])
150
 
@@ -153,167 +103,76 @@ def encode_features(
153
  convert_to_numpy=True,
154
  normalize_embeddings=True
155
  )
156
-
157
  return vectors.astype("float32")
158
 
159
-
160
- # =====================================================
161
- # Core Similarity Engine
162
- # =====================================================
163
  def compute_feature_similarity(
164
  features_a,
165
  features_b,
166
  model=None,
167
  threshold: float = DEFAULT_THRESHOLD
168
  ) -> Dict[str, Any]:
169
- """
170
- Compare two feature lists using:
171
-
172
- 1. Sentence embeddings
173
- 2. Cosine similarity matrix
174
- 3. Hungarian optimal matching
175
- 4. Coverage-aware final score
176
- """
177
-
178
  if model is None:
179
  model = load_feature_model()
180
 
181
- fa = remove_redundant_features(
182
- safe_feature_list(features_a)
183
- )
184
 
185
- fb = remove_redundant_features(
186
- safe_feature_list(features_b)
187
- )
188
-
189
- # empty cases
190
  if not fa or not fb:
191
- return empty_result(
192
- unique_a=fa,
193
- unique_b=fb
194
- )
195
-
196
- # -------------------------------------------------
197
- # Encode features
198
- # -------------------------------------------------
199
  emb_a = encode_features(fa, model)
200
  emb_b = encode_features(fb, model)
201
 
202
- # -------------------------------------------------
203
- # Similarity matrix
204
- # -------------------------------------------------
205
- sim_matrix = cosine_similarity(
206
- emb_a,
207
- emb_b
208
- )
209
-
210
- # -------------------------------------------------
211
- # Hungarian Algorithm
212
- # maximize similarity => minimize negative matrix
213
- # -------------------------------------------------
214
- row_idx, col_idx = linear_sum_assignment(
215
- -sim_matrix
216
- )
217
 
218
  matches = []
219
-
220
  matched_a = set()
221
  matched_b = set()
222
 
223
  for i, j in zip(row_idx, col_idx):
224
-
225
  sim = float(sim_matrix[i, j])
226
-
227
  if sim >= threshold:
228
-
229
  matches.append({
230
  "feature_a": fa[i],
231
  "feature_b": fb[j],
232
  "score": round(sim, 3)
233
  })
234
-
235
  matched_a.add(i)
236
  matched_b.add(j)
237
 
238
- # -------------------------------------------------
239
  # Final Metrics
240
- # -------------------------------------------------
241
- shared_scores = [
242
- m["score"] for m in matches
243
- ]
244
-
245
- mean_similarity = (
246
- float(np.mean(shared_scores))
247
- if shared_scores else 0.0
248
- )
249
 
250
  min_len = min(len(fa), len(fb))
 
251
 
252
- coverage = (
253
- len(matches) / min_len
254
- if min_len > 0 else 0.0
255
- )
256
-
257
- final_score = (
258
- (SIMILARITY_WEIGHT * mean_similarity)
259
- +
260
- (COVERAGE_WEIGHT * coverage)
261
- )
262
-
263
  if len(matches) == 0:
264
  final_score = 0.0
265
 
266
  final_score = min(final_score, 1.0)
267
 
268
- matched_text_a = " ".join(
269
- [
270
- m["feature_a"]
271
- for m in matches
272
- ]
273
- ).lower()
274
-
275
- matched_text_b = " ".join(
276
- [
277
- m["feature_b"]
278
- for m in matches
279
- ]
280
- ).lower()
281
-
282
 
283
- def is_semantically_redundant(
284
- feature,
285
- matched_text
286
- ):
287
  words = set(feature.lower().split())
288
-
289
- overlap = sum(
290
- 1 for w in words
291
- if w in matched_text
292
- )
293
-
294
- ratio = overlap / max(len(words), 1)
295
-
296
- return ratio >= 0.5
297
-
298
 
299
  unique_a = [
300
- fa[i]
301
- for i in range(len(fa))
302
- if i not in matched_a
303
- and not is_semantically_redundant(
304
- fa[i],
305
- matched_text_a
306
- )
307
  ]
308
 
309
  unique_b = [
310
- fb[j]
311
- for j in range(len(fb))
312
- if j not in matched_b
313
- and not is_semantically_redundant(
314
- fb[j],
315
- matched_text_b
316
- )
317
  ]
318
 
319
  return {
@@ -325,96 +184,35 @@ def compute_feature_similarity(
325
  "unique_b": unique_b
326
  }
327
 
328
-
329
- # =====================================================
330
- # Compare Two Rows From DataFrame
331
- # =====================================================
332
  def compare_projects(
333
  df: pd.DataFrame,
334
  idx1: int,
335
  idx2: int,
336
  model=None
337
  ) -> Dict[str, Any]:
338
- """
339
- Compare two projects from dataset.
340
- """
341
-
342
- if model is None:
343
- model = load_feature_model()
344
 
345
  f1 = df.loc[idx1, "features"]
346
  f2 = df.loc[idx2, "features"]
347
 
348
- result = compute_feature_similarity(
349
- f1,
350
- f2,
351
- model=model
352
- )
353
-
354
- result["project_a_id"] = int(idx1)
355
- result["project_b_id"] = int(idx2)
356
-
357
- return result
358
 
359
-
360
- # =====================================================
361
- # Compare One Against Many
362
- # =====================================================
363
  def compare_project_against_many(
364
- query_features,
365
- candidate_feature_lists,
366
- model=None,
367
- threshold: float = DEFAULT_THRESHOLD
368
- ):
369
- """
370
- Compare one project against many candidates.
371
- """
372
-
373
- if model is None:
374
- model = load_feature_model()
375
-
376
- results = []
377
-
378
- for idx, candidate in enumerate(
379
- candidate_feature_lists
380
- ):
381
-
382
- result = compute_feature_similarity(
383
- query_features,
384
- candidate,
385
- model=model,
386
- threshold=threshold
387
- )
388
 
389
- result["candidate_id"] = idx
 
390
 
391
- results.append(result)
 
 
 
392
 
393
  return results
394
-
395
-
396
- # =====================================================
397
- # Example Run
398
- # =====================================================
399
- if __name__ == "__main__":
400
-
401
- project_a = [
402
- "online reservation",
403
- "ai chatbot",
404
- "patient records",
405
- "doctor dashboard"
406
- ]
407
-
408
- project_b = [
409
- "appointment booking",
410
- "chatbot assistant",
411
- "medical records",
412
- "analytics dashboard"
413
- ]
414
-
415
- result = compute_feature_similarity(
416
- project_a,
417
- project_b
418
- )
419
-
420
- print(result)
 
 
 
 
 
 
1
  from typing import List, Dict, Any
 
 
2
  import pandas as pd
 
3
  from sentence_transformers import SentenceTransformer
 
4
  from scipy.optimize import linear_sum_assignment
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ import logging
7
+ from functools import lru_cache
 
 
 
 
 
8
 
9
  logger = logging.getLogger(__name__)
10
 
 
 
 
11
  MODEL_NAME = "all-MiniLM-L6-v2"
 
 
 
12
  SIMILARITY_WEIGHT = 0.70
13
  COVERAGE_WEIGHT = 0.30
14
+ DEFAULT_THRESHOLD = 0.80
15
 
 
 
 
16
  @lru_cache(maxsize=1)
17
  def load_feature_model():
 
 
 
18
  logger.info(f"Loading feature model: {MODEL_NAME}")
19
  return SentenceTransformer(MODEL_NAME)
20
 
 
 
 
 
21
  def safe_feature_list(features):
22
  """
23
  Convert any feature input into clean List[str]
 
 
24
  """
 
25
  import numpy as np
26
 
 
27
  if features is None:
28
  return []
29
 
30
+ if isinstance(features, float) and pd.isna(features):
31
+ return []
 
 
32
 
 
33
  if isinstance(features, np.ndarray):
34
  features = features.tolist()
35
 
 
36
  if isinstance(features, tuple):
37
  features = list(features)
38
 
 
39
  if isinstance(features, str):
40
+ try:
41
+ import ast
42
+ parsed = ast.literal_eval(features)
43
+ if isinstance(parsed, list):
44
+ features = parsed
45
+ else:
46
+ features = [features]
47
+ except:
48
+ features = [features]
49
 
 
50
  if isinstance(features, list):
 
51
  cleaned = []
 
52
  for item in features:
53
+ if isinstance(item, dict) and "feature" in item:
54
+ val = str(item["feature"]).strip().lower()
55
+ else:
56
+ val = str(item).strip().lower()
57
  if val and val != "nan":
58
  cleaned.append(val)
 
59
  return list(dict.fromkeys(cleaned))
60
 
61
  return []
62
 
 
63
  def remove_redundant_features(features):
 
64
  cleaned = []
65
  seen_words = []
66
 
67
  for feat in features:
 
68
  feat_words = set(feat.split())
 
69
  redundant = False
70
 
71
  for existing in seen_words:
72
+ overlap = len(feat_words & existing) / max(len(feat_words), 1)
 
 
 
 
73
  if overlap >= 0.60:
74
  redundant = True
75
  break
 
80
 
81
  return cleaned
82
 
83
+ def empty_result(unique_a=None, unique_b=None):
 
 
 
 
 
 
84
  return {
85
  "score": 0.0,
86
  "coverage": 0.0,
 
90
  "unique_b": unique_b or []
91
  }
92
 
 
93
  def encode_features(
94
  features: List[str],
95
  model
96
+ ):
97
+ import numpy as np
 
 
 
98
  if not features:
99
  return np.array([])
100
 
 
103
  convert_to_numpy=True,
104
  normalize_embeddings=True
105
  )
 
106
  return vectors.astype("float32")
107
 
 
 
 
 
108
  def compute_feature_similarity(
109
  features_a,
110
  features_b,
111
  model=None,
112
  threshold: float = DEFAULT_THRESHOLD
113
  ) -> Dict[str, Any]:
 
 
 
 
 
 
 
 
 
114
  if model is None:
115
  model = load_feature_model()
116
 
117
+ fa = remove_redundant_features(safe_feature_list(features_a))
118
+ fb = remove_redundant_features(safe_feature_list(features_b))
 
119
 
 
 
 
 
 
120
  if not fa or not fb:
121
+ return empty_result(unique_a=fa, unique_b=fb)
122
+
 
 
 
 
 
 
123
  emb_a = encode_features(fa, model)
124
  emb_b = encode_features(fb, model)
125
 
126
+ sim_matrix = cosine_similarity(emb_a, emb_b)
127
+
128
+ # Hungarian match
129
+ row_idx, col_idx = linear_sum_assignment(-sim_matrix)
 
 
 
 
 
 
 
 
 
 
 
130
 
131
  matches = []
 
132
  matched_a = set()
133
  matched_b = set()
134
 
135
  for i, j in zip(row_idx, col_idx):
 
136
  sim = float(sim_matrix[i, j])
 
137
  if sim >= threshold:
 
138
  matches.append({
139
  "feature_a": fa[i],
140
  "feature_b": fb[j],
141
  "score": round(sim, 3)
142
  })
 
143
  matched_a.add(i)
144
  matched_b.add(j)
145
 
 
146
  # Final Metrics
147
+ import numpy as np
148
+ shared_scores = [m["score"] for m in matches]
149
+ mean_similarity = float(np.mean(shared_scores)) if shared_scores else 0.0
 
 
 
 
 
 
150
 
151
  min_len = min(len(fa), len(fb))
152
+ coverage = len(matches) / min_len if min_len > 0 else 0.0
153
 
154
+ final_score = (SIMILARITY_WEIGHT * mean_similarity) + (COVERAGE_WEIGHT * coverage)
 
 
 
 
 
 
 
 
 
 
155
  if len(matches) == 0:
156
  final_score = 0.0
157
 
158
  final_score = min(final_score, 1.0)
159
 
160
+ matched_text_a = " ".join([m["feature_a"] for m in matches]).lower()
161
+ matched_text_b = " ".join([m["feature_b"] for m in matches]).lower()
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
+ def is_semantically_redundant(feature, matched_text):
 
 
 
164
  words = set(feature.lower().split())
165
+ overlap = sum(1 for w in words if w in matched_text)
166
+ return (overlap / max(len(words), 1)) >= 0.5
 
 
 
 
 
 
 
 
167
 
168
  unique_a = [
169
+ fa[i] for i in range(len(fa))
170
+ if i not in matched_a and not is_semantically_redundant(fa[i], matched_text_a)
 
 
 
 
 
171
  ]
172
 
173
  unique_b = [
174
+ fb[j] for j in range(len(fb))
175
+ if j not in matched_b and not is_semantically_redundant(fb[j], matched_text_b)
 
 
 
 
 
176
  ]
177
 
178
  return {
 
184
  "unique_b": unique_b
185
  }
186
 
 
 
 
 
187
  def compare_projects(
188
  df: pd.DataFrame,
189
  idx1: int,
190
  idx2: int,
191
  model=None
192
  ) -> Dict[str, Any]:
193
+ if idx1 not in df.index or idx2 not in df.index:
194
+ return empty_result()
 
 
 
 
195
 
196
  f1 = df.loc[idx1, "features"]
197
  f2 = df.loc[idx2, "features"]
198
 
199
+ return compute_feature_similarity(f1, f2, model=model)
 
 
 
 
 
 
 
 
 
200
 
 
 
 
 
201
  def compare_project_against_many(
202
+ df: pd.DataFrame,
203
+ idx1: int,
204
+ indices: List[int],
205
+ model=None
206
+ ) -> Dict[int, Dict[str, Any]]:
207
+ if idx1 not in df.index:
208
+ return {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
210
+ f1 = df.loc[idx1, 'features']
211
+ results = {}
212
 
213
+ for idx2 in indices:
214
+ if idx2 in df.index:
215
+ f2 = df.loc[idx2, 'features']
216
+ results[idx2] = compute_feature_similarity(f1, f2, model=model)
217
 
218
  return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/similarity_model/preprocessing.py CHANGED
@@ -1,9 +1,6 @@
1
- # src/preprocessing.py
2
- # FINAL POLISHED VERSION
3
- # Best Practical Feature Extraction for Graduation Project System
4
-
5
  import re
6
  import logging
 
7
  import numpy as np
8
  from functools import lru_cache
9
  from pathlib import Path
@@ -11,80 +8,42 @@ import pandas as pd
11
  from sentence_transformers import SentenceTransformer
12
  from sklearn.metrics.pairwise import cosine_similarity
13
 
14
-
15
-
16
- # =====================================================
17
- # Logging
18
- # =====================================================
19
- logging.basicConfig(
20
- level=logging.INFO,
21
- format="%(asctime)s | %(levelname)s | %(message)s"
22
- )
23
  logger = logging.getLogger(__name__)
24
 
25
- # =====================================================
26
- # Models
27
- # =====================================================
28
  MODEL_NAME = "all-MiniLM-L6-v2"
29
 
30
  @lru_cache(maxsize=1)
31
  def _get_embed_model():
32
- """Lazy-load the embedding model once on first use."""
33
  logger.info(f"Loading embed model: {MODEL_NAME}")
34
  return SentenceTransformer(MODEL_NAME)
35
 
36
-
37
- # =====================================================
38
- # Config
39
- # =====================================================
40
  MIN_WORDS = 8
41
  MAX_WORDS = 4000
42
- # =====================================================
43
- # Helpers
44
- # =====================================================
45
- def normalize_text(text):
46
- """
47
- Clean raw text
48
- """
49
 
 
50
  if pd.isna(text):
51
  return ""
52
-
53
  text = str(text).lower().strip()
54
-
55
- # remove urls/emails
56
- text = re.sub(
57
- r"http\S+|www\S+|\S+@\S+",
58
- " ",
59
- text
60
- )
61
-
62
- # keep useful chars
63
- text = re.sub(
64
- r"[^a-z0-9\+\#\./\- ]",
65
- " ",
66
- text
67
- )
68
-
69
- # remove spaces
70
- text = re.sub(
71
- r"\s+",
72
- " ",
73
- text
74
- )
75
-
76
  return text.strip()
77
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
- def semantic_deduplicate(
80
- features,
81
- model,
82
- threshold=0.85
83
- ):
84
- """
85
- Remove semantically similar features.
86
- """
87
-
88
  if len(features) <= 1:
89
  return features
90
 
@@ -95,126 +54,55 @@ def semantic_deduplicate(
95
  )
96
 
97
  kept = []
98
-
99
  for i, feat in enumerate(features):
100
-
101
  redundant = False
102
-
103
  for existing in kept:
104
-
105
  sim = cosine_similarity(
106
  embeddings[i].reshape(1, -1),
107
  embeddings[existing].reshape(1, -1)
108
  )[0][0]
109
-
110
  if sim >= threshold:
111
  redundant = True
112
  break
113
-
114
  if not redundant:
115
  kept.append(i)
116
 
117
  return [features[i] for i in kept]
118
 
119
-
120
- # =====================================================
121
- # Local Feature Dictionary (cached)
122
- # =====================================================
123
- _PROJECT_ROOT = Path(__file__).resolve().parents[2]
124
- _METADATA_PATH = _PROJECT_ROOT / "models" / "metadata.parquet"
125
-
126
-
127
  @lru_cache(maxsize=1)
128
- def _load_known_features():
129
- """
130
- Load all known feature strings from the training metadata.
131
- Cached — reads the parquet file only once per process.
132
- Sorted longest-first so multi-word features (e.g. 'deep learning')
133
- are matched before their sub-words (e.g. 'learning').
134
- """
135
- if not _METADATA_PATH.exists():
136
- logger.warning("metadata.parquet not found; feature extraction will return []")
137
- return []
138
-
139
- df = pd.read_parquet(str(_METADATA_PATH))
140
-
141
- features_set = set()
142
- for f_list in df.get("features", pd.Series(dtype=object)):
143
- if isinstance(f_list, (list, np.ndarray, tuple, set)):
144
- for f in f_list:
145
- val = str(f).strip().lower()
146
- if val and val != "nan" and len(val) >= 3:
147
- features_set.add(val)
148
 
149
- logger.info(f"Loaded {len(features_set)} known features from metadata")
150
-
151
- # longest first → greedy multi-word match wins
152
- return sorted(features_set, key=len, reverse=True)
153
-
154
-
155
- # =====================================================
156
- # Main Feature Extraction (fully local, no API)
157
- # =====================================================
158
  def extract_features(text: str) -> list:
159
  """
160
- Match technical features from text against the known feature
161
- dictionary built from training data.
162
- No external API required.
163
  """
164
- known_features = _load_known_features()
165
-
166
- if not known_features:
167
- return []
168
-
169
- text_norm = normalize_text(text)
170
  matched = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
- for feat in known_features:
173
- pattern = r'\b' + re.escape(feat) + r'\b'
174
- if re.search(pattern, text_norm):
175
- matched.append(feat)
176
- if len(matched) >= 15: # collect up to 15, dedup will trim
177
- break
178
-
179
- logger.info(f"Local feature extraction matched {len(matched)} features")
180
-
181
- return semantic_deduplicate(
182
- matched,
183
- _get_embed_model(),
184
- threshold=0.85
185
- )
186
-
187
-
188
 
189
- # =====================================================
190
- # Main Pipeline
191
- # =====================================================
192
  def preprocess_dataset(df):
193
- """
194
- Full preprocessing pipeline
195
- """
196
-
197
- logger.info(
198
- "Starting preprocessing..."
199
- )
200
-
201
  df = df.copy()
202
 
203
- # clean columns
204
- df.columns = (
205
- df.columns
206
- .str.strip()
207
- .str.lower()
208
- .str.replace(
209
- r"\W+",
210
- "_",
211
- regex=True
212
- )
213
- )
214
 
215
- # =============================================
216
- # Column Mapping Fix
217
- # =============================================
218
  column_mapping = {
219
  "title": "project_title",
220
  "ai_summary": "ai_summary",
@@ -230,143 +118,26 @@ def preprocess_dataset(df):
230
 
231
  df = df.rename(columns=column_mapping)
232
 
233
- # ensure needed columns
234
- for col in [
235
- "project_title",
236
- "abstract",
237
- "description"
238
- ]:
239
-
240
  if col not in df.columns:
241
  df[col] = ""
 
242
 
243
- df[col] = (
244
- df[col]
245
- .fillna("")
246
- .astype(str)
247
- )
248
 
249
- # =============================================
250
- # Smart weighted merge
251
- # =============================================
252
- df["full_content"] = (
253
- df["project_title"] + ". " +
254
- df["project_title"] + ". " +
255
- df["abstract"] + ". " +
256
- df["description"]
257
- )
258
-
259
- # normalize
260
- df["clean_text"] = (
261
- df["full_content"]
262
- .apply(normalize_text)
263
- )
264
-
265
- # remove duplicates
266
  before = len(df)
 
 
267
 
268
- df = df.drop_duplicates(
269
- subset=[
270
- "project_title",
271
- "clean_text"
272
- ]
273
- ).copy()
274
-
275
- logger.info(
276
- f"Removed duplicates: {before-len(df)}"
277
- )
278
-
279
- # word count filter
280
- df["word_count"] = (
281
- df["clean_text"]
282
- .str.split()
283
- .str.len()
284
- )
285
-
286
- df = df[
287
- df["word_count"].between(
288
- MIN_WORDS,
289
- MAX_WORDS
290
- )
291
- ].copy()
292
 
293
- df.reset_index(
294
- drop=True,
295
- inplace=True
296
- )
297
-
298
- # =============================================
299
- # Feature Extraction
300
- # =============================================
301
- logger.info(
302
- "Extracting features..."
303
- )
304
-
305
- df["features"] = (
306
- df["clean_text"]
307
- .apply(extract_features)
308
- )
309
-
310
- # remove empty rows
311
- df = df[
312
- df["features"]
313
- .apply(len) > 0
314
- ].copy()
315
-
316
- df.reset_index(
317
- drop=True,
318
- inplace=True
319
- )
320
-
321
- logger.info(
322
- f"Final rows: {len(df)}"
323
- )
324
 
 
325
  return df
326
-
327
-
328
- # =====================================================
329
- # Save
330
- # =====================================================
331
- def save_processed_data(
332
- df,
333
- output_dir="Data/processed"
334
- ):
335
-
336
- path = Path(output_dir)
337
-
338
- path.mkdir(
339
- parents=True,
340
- exist_ok=True
341
- )
342
-
343
- df.to_parquet(
344
- path / "projects_clean.parquet",
345
- index=False
346
- )
347
-
348
- df.to_csv(
349
- path / "projects_clean.csv",
350
- index=False
351
- )
352
-
353
- logger.info(
354
- f"Saved to {path}"
355
- )
356
-
357
-
358
- # =====================================================
359
- # Run
360
- # =====================================================
361
- if __name__ == "__main__":
362
-
363
- file_path = "Data/raw/projects.xlsx"
364
-
365
- if file_path.endswith(".csv"):
366
- raw_df = pd.read_csv(file_path)
367
- else:
368
- raw_df = pd.read_excel(file_path)
369
-
370
- clean_df = preprocess_dataset(raw_df)
371
-
372
- save_processed_data(clean_df)
 
 
 
 
 
1
  import re
2
  import logging
3
+ import yake
4
  import numpy as np
5
  from functools import lru_cache
6
  from pathlib import Path
 
8
  from sentence_transformers import SentenceTransformer
9
  from sklearn.metrics.pairwise import cosine_similarity
10
 
11
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
 
 
 
 
 
 
 
 
12
  logger = logging.getLogger(__name__)
13
 
 
 
 
14
  MODEL_NAME = "all-MiniLM-L6-v2"
15
 
16
  @lru_cache(maxsize=1)
17
  def _get_embed_model():
 
18
  logger.info(f"Loading embed model: {MODEL_NAME}")
19
  return SentenceTransformer(MODEL_NAME)
20
 
 
 
 
 
21
  MIN_WORDS = 8
22
  MAX_WORDS = 4000
 
 
 
 
 
 
 
23
 
24
def normalize_text(text):
    """Lower-case and scrub a raw text value.

    Missing values (anything ``pd.isna`` reports as NA) become ``""``.
    Otherwise the value is stringified, lower-cased and stripped, then three
    substitutions are applied in order, each replacing matches with a single
    space; the result is stripped once more before being returned.
    """
    if pd.isna(text):
        return ""

    cleaned = str(text).lower().strip()

    # Order matters: URLs/e-mails are removed before the character filter
    # would break them apart, and whitespace is collapsed last.
    substitutions = (
        r"http\S+|www\S+|\S+@\S+",  # URLs and e-mail addresses
        r"[^a-z0-9\+\#\./\- ]",     # anything outside the allowed charset
        r"\s+",                     # runs of whitespace
    )
    for pattern in substitutions:
        cleaned = re.sub(pattern, " ", cleaned)

    return cleaned.strip()
32
 
33
def substring_deduplicate(features):
    """Drop every feature that is contained in a longer (or equal) kept one.

    Features are visited longest-first (ties keep their input order because
    the sort is stable); a feature is kept only if it is not a plain
    substring of any feature already kept. Exact duplicates are therefore
    collapsed as well. The returned list is in that longest-first order.
    """
    survivors = []
    for candidate in sorted(features, key=len, reverse=True):
        if not any(candidate in longer for longer in survivors):
            survivors.append(candidate)
    return survivors
45
 
46
+ def semantic_deduplicate(features, model, threshold=0.85):
 
 
 
 
 
 
 
 
47
  if len(features) <= 1:
48
  return features
49
 
 
54
  )
55
 
56
  kept = []
 
57
  for i, feat in enumerate(features):
 
58
  redundant = False
 
59
  for existing in kept:
 
60
  sim = cosine_similarity(
61
  embeddings[i].reshape(1, -1),
62
  embeddings[existing].reshape(1, -1)
63
  )[0][0]
 
64
  if sim >= threshold:
65
  redundant = True
66
  break
 
67
  if not redundant:
68
  kept.append(i)
69
 
70
  return [features[i] for i in kept]
71
 
 
 
 
 
 
 
 
 
72
  @lru_cache(maxsize=1)
73
+ def _get_yake_extractor():
74
+ logger.info("Initializing YAKE NLP feature extractor")
75
+ return yake.KeywordExtractor(lan="en", n=3, dedupLim=0.9, top=20, features=None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
 
 
 
 
 
 
 
 
 
77
  def extract_features(text: str) -> list:
78
  """
79
+ Extracts detailed, multi-word phrases generated purely by YAKE.
 
 
80
  """
 
 
 
 
 
 
81
  matched = []
82
+ try:
83
+ kw_extractor = _get_yake_extractor()
84
+ yake_results = kw_extractor.extract_keywords(text)
85
+
86
+ for kw, score in yake_results:
87
+ kw_clean = str(kw).strip().lower()
88
+ if len(kw_clean.split()) > 1 and kw_clean not in matched:
89
+ matched.append(kw_clean)
90
+
91
+ except Exception as e:
92
+ logger.error(f"YAKE extraction failed: {e}")
93
+
94
+ if not matched:
95
+ return []
96
 
97
+ matched = substring_deduplicate(matched)
98
+ return semantic_deduplicate(matched, _get_embed_model(), threshold=0.85)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
 
 
 
100
  def preprocess_dataset(df):
101
+ logger.info("Starting preprocessing...")
 
 
 
 
 
 
 
102
  df = df.copy()
103
 
104
+ df.columns = df.columns.str.strip().str.lower().str.replace(r"\W+", "_", regex=True)
 
 
 
 
 
 
 
 
 
 
105
 
 
 
 
106
  column_mapping = {
107
  "title": "project_title",
108
  "ai_summary": "ai_summary",
 
118
 
119
  df = df.rename(columns=column_mapping)
120
 
121
+ for col in ["project_title", "abstract", "description"]:
 
 
 
 
 
 
122
  if col not in df.columns:
123
  df[col] = ""
124
+ df[col] = df[col].fillna("").astype(str)
125
 
126
+ df["full_content"] = df["project_title"] + ". " + df["abstract"] + ". " + df["description"]
127
+ df["clean_text"] = df["full_content"].apply(normalize_text)
 
 
 
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  before = len(df)
130
+ df = df.drop_duplicates(subset=["project_title", "clean_text"]).copy()
131
+ logger.info(f"Removed duplicates: {before-len(df)}")
132
 
133
+ df["word_count"] = df["clean_text"].str.split().str.len()
134
+ df = df[df["word_count"].between(MIN_WORDS, MAX_WORDS)].copy()
135
+ df.reset_index(drop=True, inplace=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
+ logger.info("Extracting features...")
138
+ df["features"] = df["clean_text"].apply(extract_features)
139
+ df = df[df["features"].apply(len) > 0].copy()
140
+ df.reset_index(drop=True, inplace=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
+ logger.info(f"Final rows: {len(df)}")
143
  return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/similarity_model/similarity_engine.py CHANGED
@@ -417,25 +417,3 @@ def find_similar_projects(
417
 
418
  return final_df
419
 
420
-
421
- # =====================================================
422
- # Example Run
423
- # =====================================================
424
- if __name__ == "__main__":
425
-
426
- results = find_similar_projects(
427
- title="Smart Library",
428
- abstract="""
429
- AI based digital library for students.
430
- """,
431
- description="""
432
- Includes chatbot,
433
- recommendation system,
434
- qr code scanner,
435
- mobile application.
436
- """,
437
- features=["library"],
438
- top_k=5
439
- )
440
-
441
- print(results)
 
417
 
418
  return final_df
419