bat-6 committed on
Commit
7d235bd
·
verified ·
1 Parent(s): 12b83fb

Update: preprocessing.py

Browse files
Files changed (1) hide show
  1. src/similarity_model/preprocessing.py +143 -143
src/similarity_model/preprocessing.py CHANGED
@@ -1,143 +1,143 @@
1
- import re
2
- import logging
3
- import yake
4
- import numpy as np
5
- from functools import lru_cache
6
- from pathlib import Path
7
- import pandas as pd
8
- from sentence_transformers import SentenceTransformer
9
- from sklearn.metrics.pairwise import cosine_similarity
10
-
11
- logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
12
- logger = logging.getLogger(__name__)
13
-
14
- MODEL_NAME = "all-mpnet-base-v2"
15
-
16
- @lru_cache(maxsize=1)
17
- def _get_embed_model():
18
- logger.info(f"Loading embed model: {MODEL_NAME}")
19
- return SentenceTransformer(MODEL_NAME)
20
-
21
- MIN_WORDS = 8
22
- MAX_WORDS = 4000
23
-
24
- def normalize_text(text):
25
- if pd.isna(text):
26
- return ""
27
- text = str(text).lower().strip()
28
- text = re.sub(r"http\S+|www\S+|\S+@\S+", " ", text)
29
- text = re.sub(r"[^a-z0-9\+\#\./\- ]", " ", text)
30
- text = re.sub(r"\s+", " ", text)
31
- return text.strip()
32
-
33
- def substring_deduplicate(features):
34
- features = sorted(features, key=len, reverse=True)
35
- kept = []
36
- for feat in features:
37
- is_substring = False
38
- for longer_feat in kept:
39
- if feat in longer_feat:
40
- is_substring = True
41
- break
42
- if not is_substring:
43
- kept.append(feat)
44
- return kept
45
-
46
- def semantic_deduplicate(features, model, threshold=0.85):
47
- if len(features) <= 1:
48
- return features
49
-
50
- embeddings = model.encode(
51
- features,
52
- convert_to_numpy=True,
53
- normalize_embeddings=True
54
- )
55
-
56
- kept = []
57
- for i, feat in enumerate(features):
58
- redundant = False
59
- for existing in kept:
60
- sim = cosine_similarity(
61
- embeddings[i].reshape(1, -1),
62
- embeddings[existing].reshape(1, -1)
63
- )[0][0]
64
- if sim >= threshold:
65
- redundant = True
66
- break
67
- if not redundant:
68
- kept.append(i)
69
-
70
- return [features[i] for i in kept]
71
-
72
- @lru_cache(maxsize=1)
73
- def _get_yake_extractor():
74
- logger.info("Initializing YAKE NLP feature extractor")
75
- return yake.KeywordExtractor(lan="en", n=3, dedupLim=0.9, top=20, features=None)
76
-
77
- def extract_features(text: str) -> list:
78
- """
79
- Extracts detailed, multi-word phrases generated purely by YAKE.
80
- """
81
- matched = []
82
- try:
83
- kw_extractor = _get_yake_extractor()
84
- yake_results = kw_extractor.extract_keywords(text)
85
-
86
- for kw, score in yake_results:
87
- kw_clean = str(kw).strip().lower()
88
- if len(kw_clean.split()) > 1 and kw_clean not in matched:
89
- matched.append(kw_clean)
90
-
91
- except Exception as e:
92
- logger.error(f"YAKE extraction failed: {e}")
93
-
94
- if not matched:
95
- return []
96
-
97
- matched = substring_deduplicate(matched)
98
- return semantic_deduplicate(matched, _get_embed_model(), threshold=0.85)
99
-
100
- def preprocess_dataset(df):
101
- logger.info("Starting preprocessing...")
102
- df = df.copy()
103
-
104
- df.columns = df.columns.str.strip().str.lower().str.replace(r"\W+", "_", regex=True)
105
-
106
- column_mapping = {
107
- "title": "project_title",
108
- "ai_summary": "ai_summary",
109
- "technologies": "technologies",
110
- "keywords": "keywords",
111
- "abstract": "abstract",
112
- "description": "description",
113
- "problem_statement": "problem_statement",
114
- "proposed_solution": "proposed_solution",
115
- "objectives": "objectives",
116
- "category": "category"
117
- }
118
-
119
- df = df.rename(columns=column_mapping)
120
-
121
- for col in ["project_title", "abstract", "description"]:
122
- if col not in df.columns:
123
- df[col] = ""
124
- df[col] = df[col].fillna("").astype(str)
125
-
126
- df["full_content"] = df["project_title"] + ". " + df["abstract"] + ". " + df["description"]
127
- df["clean_text"] = df["full_content"].apply(normalize_text)
128
-
129
- before = len(df)
130
- df = df.drop_duplicates(subset=["project_title", "clean_text"]).copy()
131
- logger.info(f"Removed duplicates: {before-len(df)}")
132
-
133
- df["word_count"] = df["clean_text"].str.split().str.len()
134
- df = df[df["word_count"].between(MIN_WORDS, MAX_WORDS)].copy()
135
- df.reset_index(drop=True, inplace=True)
136
-
137
- logger.info("Extracting features...")
138
- df["features"] = df["clean_text"].apply(extract_features)
139
- df = df[df["features"].apply(len) > 0].copy()
140
- df.reset_index(drop=True, inplace=True)
141
-
142
- logger.info(f"Final rows: {len(df)}")
143
- return df
 
1
+ import re
2
+ import logging
3
+ import yake
4
+ import numpy as np
5
+ from functools import lru_cache
6
+ from pathlib import Path
7
+ import pandas as pd
8
+ from sentence_transformers import SentenceTransformer
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
+
11
# Configure logging at import time: INFO level, "timestamp | level | message" lines.
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
# Module-level logger used by every function in this file.
logger = logging.getLogger(__name__)

# Model identifier handed to SentenceTransformer by _get_embed_model().
MODEL_NAME = "all-mpnet-base-v2"
15
+
16
@lru_cache(maxsize=1)
def _get_embed_model():
    """Return the SentenceTransformer built from ``MODEL_NAME``.

    The single-slot ``lru_cache`` means the model is constructed (and the
    log line emitted) on the first call only; later calls receive the
    cached instance.
    """
    logger.info(f"Loading embed model: {MODEL_NAME}")
    embed_model = SentenceTransformer(MODEL_NAME)
    return embed_model
20
+
21
# Inclusive bounds on the word count of ``clean_text``; preprocess_dataset()
# drops rows whose count falls outside [MIN_WORDS, MAX_WORDS].
MIN_WORDS = 8
MAX_WORDS = 4000
23
+
24
def normalize_text(text):
    """Lower-case and scrub a raw text value for keyword extraction.

    Missing values (anything ``pd.isna`` reports as missing) become ``""``.
    Otherwise the value is stringified, lower-cased, and passed through
    three substitutions in order, each replacing its match with a space.

    Returns:
        The cleaned string with leading/trailing whitespace removed.
    """
    if pd.isna(text):
        return ""

    cleaned = str(text).lower().strip()

    # Order matters: links/e-mails must go before punctuation is stripped,
    # and whitespace is collapsed last to absorb the spaces added above.
    scrub_patterns = (
        r"http\S+|www\S+|\S+@\S+",  # URLs and e-mail-like tokens
        r"[^a-z0-9\+\#\./\- ]",  # everything outside the allowed character set
        r"\s+",  # runs of whitespace
    )
    for pattern in scrub_patterns:
        cleaned = re.sub(pattern, " ", cleaned)

    return cleaned.strip()
32
+
33
def substring_deduplicate(features):
    """Drop every phrase that occurs inside a longer (or equal) kept phrase.

    Phrases are visited longest first, so by the time a phrase is examined
    every phrase that could contain it has already been considered. The
    containment test is a plain character-level ``in`` check.

    Returns:
        The surviving phrases, ordered longest first (ties keep input order).
    """
    survivors = []
    for candidate in sorted(features, key=len, reverse=True):
        contained = any(candidate in longer for longer in survivors)
        if not contained:
            survivors.append(candidate)
    return survivors
45
+
46
def semantic_deduplicate(features, model, threshold=0.85):
    """Greedily remove phrases that are near-duplicates of an earlier phrase.

    Phrases are embedded once with ``model.encode``. Walking the list in
    order, a phrase is kept only if its cosine similarity to every phrase
    kept so far is below ``threshold``.

    Args:
        features: Sequence of phrases (strings) to de-duplicate.
        model: Object exposing ``encode(features, convert_to_numpy=True,
            normalize_embeddings=True)`` and returning one vector per phrase.
        threshold: Similarity at or above which a phrase counts as redundant.

    Returns:
        The kept phrases in their original relative order. Inputs with zero
        or one element are returned unchanged (same object).
    """
    if len(features) <= 1:
        return features

    embeddings = np.asarray(
        model.encode(
            features,
            convert_to_numpy=True,
            normalize_embeddings=True,
        ),
        dtype=float,
    )

    # Normalise here as well, so the dot products below are true cosine
    # similarities even if the model ignores normalize_embeddings. Zero
    # vectors keep a norm of 1 so they yield similarity 0 rather than NaN.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms == 0.0] = 1.0
    unit_vectors = embeddings / norms

    # One matrix product replaces a library call per candidate pair.
    similarity = unit_vectors @ unit_vectors.T

    kept = []
    for i in range(similarity.shape[0]):
        # Compare only against phrases already kept (greedy selection).
        if kept and np.any(similarity[i, kept] >= threshold):
            continue
        kept.append(i)

    return [features[i] for i in kept]
71
+
72
@lru_cache(maxsize=1)
def _get_yake_extractor():
    """Return the shared YAKE ``KeywordExtractor``.

    Built on first use with ``lan="en"``, ``n=3``, ``dedupLim=0.9``,
    ``top=20`` and ``features=None``; the single-slot ``lru_cache`` hands
    the same instance back on every later call.
    """
    logger.info("Initializing YAKE NLP feature extractor")
    extractor_options = {
        "lan": "en",
        "n": 3,
        "dedupLim": 0.9,
        "top": 20,
        "features": None,
    }
    return yake.KeywordExtractor(**extractor_options)
76
+
77
def extract_features(text: str) -> list:
    """Extract de-duplicated multi-word key phrases from ``text`` using YAKE.

    Keywords returned by the shared YAKE extractor are stripped and
    lower-cased; single-word keywords are discarded. The remaining phrases
    go through ``substring_deduplicate`` and then ``semantic_deduplicate``
    (threshold 0.85) using the shared embedding model.

    Args:
        text: Text to extract phrases from.

    Returns:
        A list of phrase strings. Empty when YAKE fails or yields no
        multi-word phrase; extraction failures are logged, not raised.
    """
    try:
        yake_results = _get_yake_extractor().extract_keywords(text)
    except Exception as e:
        # Best-effort: one bad document must not abort a whole dataset run,
        # but keep the traceback so the failure can be diagnosed.
        logger.exception(f"YAKE extraction failed: {e}")
        return []

    cleaned = (str(kw).strip().lower() for kw, _score in yake_results)
    # dict.fromkeys removes exact repeats while preserving YAKE's ranking order.
    matched = list(dict.fromkeys(kw for kw in cleaned if len(kw.split()) > 1))

    if not matched:
        # Skip loading the embedding model when there is nothing to compare.
        return []

    matched = substring_deduplicate(matched)
    return semantic_deduplicate(matched, _get_embed_model(), threshold=0.85)
99
+
100
def preprocess_dataset(df):
    """Clean a project table and attach extracted key-phrase features.

    Works on a copy of ``df`` and, in order:

    1. normalises column headers (trimmed, lower-case, non-word runs -> "_")
       and renames ``title`` to ``project_title``;
    2. makes sure ``project_title``, ``abstract`` and ``description`` exist
       as string columns with no missing values;
    3. builds ``full_content`` and its normalised form ``clean_text``;
    4. drops rows duplicated on (``project_title``, ``clean_text``);
    5. keeps rows whose ``word_count`` lies in [MIN_WORDS, MAX_WORDS];
    6. computes ``features`` per row and drops rows with none.

    Returns:
        The processed DataFrame with a fresh 0..n-1 index.
    """
    logger.info("Starting preprocessing...")
    df = df.copy()

    normalized_headers = df.columns.str.strip().str.lower()
    df.columns = normalized_headers.str.replace(r"\W+", "_", regex=True)

    column_mapping = {
        "title": "project_title",
        "ai_summary": "ai_summary",
        "technologies": "technologies",
        "keywords": "keywords",
        "abstract": "abstract",
        "description": "description",
        "problem_statement": "problem_statement",
        "proposed_solution": "proposed_solution",
        "objectives": "objectives",
        "category": "category",
    }
    df = df.rename(columns=column_mapping)

    # The three text sources must exist and be plain strings before joining.
    text_columns = ["project_title", "abstract", "description"]
    for name in text_columns:
        if name not in df.columns:
            df[name] = ""
        df[name] = df[name].fillna("").astype(str)

    df["full_content"] = df["project_title"] + ". " + df["abstract"] + ". " + df["description"]
    df["clean_text"] = df["full_content"].apply(normalize_text)

    rows_before = len(df)
    df = df.drop_duplicates(subset=["project_title", "clean_text"]).copy()
    removed = rows_before - len(df)
    logger.info(f"Removed duplicates: {removed}")

    df["word_count"] = df["clean_text"].str.split().str.len()
    within_limits = df["word_count"].between(MIN_WORDS, MAX_WORDS)
    df = df[within_limits].copy().reset_index(drop=True)

    logger.info("Extracting features...")
    df["features"] = df["clean_text"].apply(extract_features)
    has_features = df["features"].apply(len) > 0
    df = df[has_features].copy().reset_index(drop=True)

    logger.info(f"Final rows: {len(df)}")
    return df