pratham0011 commited on
Commit
5e03342
·
verified ·
1 Parent(s): b4edeae

Upload recommender.py

Browse files
Files changed (1) hide show
  1. recommender.py +267 -0
recommender.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sentence_transformers import SentenceTransformer
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+ import requests
6
+ from bs4 import BeautifulSoup
7
+ import torch
8
+ import gc
9
+ import time
10
+ from transformers import AutoTokenizer, AutoModelForCausalLM
11
+
12
class SHLRecommender:
    """Recommends SHL assessments for a job description or URL.

    Retrieval uses sentence embeddings over the catalogue; short test
    descriptions are produced by a small causal language model.
    """

    # Class-level description cache shared by all instances (FIFO eviction).
    _cache = {}
    _cache_size = 20

    def __init__(self, data_path='utils/data.csv'):
        """Load the assessment catalogue plus embedding and generation models.

        Args:
            data_path: CSV file with the SHL catalogue. Expected columns
                include 'Test Name', 'Test Type', 'Remote Testing (Yes/No)',
                'Adaptive/IRT (Yes/No)', 'Duration' and 'Link'.

        Raises:
            FileNotFoundError: if data_path does not exist.
            ValueError: if no language model could be loaded at all.
        """
        import os

        try:
            self.df = pd.read_csv(data_path)
        except FileNotFoundError:
            raise FileNotFoundError(f"Data file not found at {data_path}. Please check the path.")

        # CSV headers may carry stray whitespace; normalise once up front.
        self.df.columns = [col.strip() for col in self.df.columns]

        # BUGFIX: cache_dir was previously assigned inside the try block
        # below, so the except fallback paths (which reference cache_dir)
        # could raise NameError if directory setup failed. Define it
        # unconditionally before any model loading.
        cache_dir = os.path.join(os.getcwd(), 'model_cache')
        os.makedirs(cache_dir, exist_ok=True)
        print(f"Using cache directory: {cache_dir}")

        try:
            self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2', cache_folder=cache_dir)
            print("Successfully loaded all-MiniLM-L6-v2 model")
        except Exception as e:
            print(f"Error loading primary model: {str(e)}")
            try:
                # Try a different model as fallback
                print("Trying fallback model: paraphrase-MiniLM-L3-v2")
                self.embedding_model = SentenceTransformer('paraphrase-MiniLM-L3-v2', cache_folder=cache_dir)
                print("Successfully loaded fallback model")
            except Exception as e2:
                print(f"Error loading fallback model: {str(e2)}")
                # Last resort: assemble a plain BERT + mean-pooling encoder.
                from sentence_transformers import models
                print("Creating basic embedding model from scratch")
                word_embedding_model = models.Transformer('bert-base-uncased', cache_dir=cache_dir)
                pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
                self.embedding_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
                print("Created basic embedding model")

        model_id = "Qwen/Qwen2.5-0.5B-Instruct"

        self.tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            trust_remote_code=True,
            use_fast=True,
            model_max_length=512,
        )

        try:
            print(f"Loading Qwen model: {model_id}")
            self.model = AutoModelForCausalLM.from_pretrained(
                model_id,
                trust_remote_code=True,
                torch_dtype=torch.float32,
                device_map="auto",
                low_cpu_mem_usage=True,
                cache_dir=cache_dir,
                local_files_only=False,
                revision="main"
            )
            print("Successfully loaded Qwen model")
        # NOTE: only ValueError (typically raised for device_map problems)
        # is handled here; other load failures propagate, as before.
        except ValueError as e:
            print(f"Error with device_map: {str(e)}")
            try:
                print("Trying without device_map")
                self.model = AutoModelForCausalLM.from_pretrained(
                    model_id,
                    trust_remote_code=True,
                    torch_dtype=torch.float32,
                    low_cpu_mem_usage=True,
                    cache_dir=cache_dir
                )
                print("Successfully loaded Qwen model without device_map")
            except Exception as e2:
                print(f"Error loading Qwen model: {str(e2)}")
                try:
                    # Smallest fallback; note the tokenizer is swapped too so
                    # tokenizer and model stay consistent.
                    print("Trying fallback to smaller model: distilgpt2")
                    self.model = AutoModelForCausalLM.from_pretrained(
                        "distilgpt2",
                        cache_dir=cache_dir
                    )
                    self.tokenizer = AutoTokenizer.from_pretrained(
                        "distilgpt2",
                        cache_dir=cache_dir
                    )
                    print("Successfully loaded fallback model")
                except Exception as e3:
                    print(f"All model loading attempts failed: {str(e3)}")
                    raise ValueError("Could not load any language model. Please check your environment and permissions.")

        # Precompute catalogue embeddings once per instance.
        self.create_embeddings()
100
+
101
+ def create_embeddings(self):
102
+ texts = []
103
+ for _, row in self.df.iterrows():
104
+ text = f"{row['Test Name']} {row['Test Type']}"
105
+ texts.append(text)
106
+
107
+ self.product_embeddings = self.embedding_model.encode(texts)
108
+
109
+ def extract_text_from_url(self, url):
110
+ try:
111
+ response = requests.get(url)
112
+ response.raise_for_status()
113
+
114
+ soup = BeautifulSoup(response.content, 'html.parser')
115
+
116
+ for script in soup(["script", "style"]):
117
+ script.extract()
118
+
119
+ text = soup.get_text()
120
+
121
+ lines = (line.strip() for line in text.splitlines())
122
+ chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
123
+ text = '\n'.join(chunk for chunk in chunks if chunk)
124
+
125
+ return text
126
+ except Exception as e:
127
+ return f"Error extracting text from URL: {str(e)}"
128
+
129
+ def optimize_memory(self):
130
+
131
+ if torch.cuda.is_available():
132
+ torch.cuda.empty_cache()
133
+
134
+ self._cache.clear()
135
+
136
+ gc.collect()
137
+
138
+ return {"status": "Memory optimized"}
139
+
140
+ def generate_test_description(self, test_name, test_type):
141
+ try:
142
+ cache_key = f"{test_name}_{test_type}"
143
+ if cache_key in self._cache:
144
+ return self._cache[cache_key]
145
+
146
+ prompt = f"Write a short, factual description of '{test_name}', a {test_type} assessment, in 1-2 sentences."
147
+
148
+ inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=128, padding=True)
149
+
150
+ with torch.no_grad():
151
+ outputs = self.model.generate(
152
+ inputs.input_ids,
153
+ attention_mask=inputs.attention_mask,
154
+ max_new_tokens=40,
155
+ temperature=0.2,
156
+ top_p=0.95,
157
+ do_sample=False,
158
+ no_repeat_ngram_size=3
159
+ )
160
+
161
+ full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
162
+
163
+ generated_text = full_response.replace(prompt, "").strip()
164
+
165
+ if len(generated_text) < 20 or "write" in generated_text.lower() or "description" in generated_text.lower():
166
+ if test_type.lower() in ["cognitive ability", "cognitive", "reasoning"]:
167
+ description = f"The {test_name} measures cognitive abilities and problem-solving skills."
168
+ elif "numerical" in test_name.lower() or "numerical" in test_type.lower():
169
+ description = f"The {test_name} assesses numerical reasoning and data analysis abilities."
170
+ elif "verbal" in test_name.lower() or "verbal" in test_type.lower():
171
+ description = f"The {test_name} evaluates verbal reasoning and language comprehension skills."
172
+ elif "personality" in test_type.lower() or "behavioral" in test_type.lower():
173
+ description = f"The {test_name} assesses behavioral tendencies and personality traits in workplace contexts."
174
+ elif "technical" in test_type.lower() or any(tech in test_name.lower() for tech in ["java", "python", ".net", "sql", "coding"]):
175
+ description = f"The {test_name} evaluates technical knowledge and programming skills."
176
+ else:
177
+ description = f"The {test_name} assesses candidate suitability through standardized methods."
178
+ else:
179
+ description = generated_text
180
+
181
+ if len(self._cache) >= self._cache_size:
182
+ self._cache.pop(next(iter(self._cache)))
183
+ self._cache[cache_key] = description
184
+
185
+ return description
186
+
187
+ except Exception:
188
+ if test_type.lower() in ["cognitive ability", "cognitive", "reasoning"]:
189
+ return f"The {test_name} measures cognitive abilities through structured problem-solving tasks."
190
+ elif test_type.lower() in ["personality", "behavioral"]:
191
+ return f"The {test_name} assesses behavioral tendencies and personality traits."
192
+ elif "technical" in test_type.lower():
193
+ return f"The {test_name} evaluates technical knowledge and skills."
194
+ else:
195
+ return f"The {test_name} assesses {test_type.lower()} capabilities."
196
+
197
+ def check_health(self):
198
+ try:
199
+ test_prompt = "This is a test prompt to check model health."
200
+
201
+ start_time = time.time()
202
+ inputs = self.tokenizer(
203
+ test_prompt,
204
+ return_tensors="pt",
205
+ truncation=True,
206
+ max_length=32,
207
+ padding=True
208
+ )
209
+ tokenization_time = time.time() - start_time
210
+
211
+ start_time = time.time()
212
+ with torch.no_grad():
213
+ _ = self.model.generate(
214
+ inputs.input_ids,
215
+ attention_mask=inputs.attention_mask,
216
+ max_new_tokens=20,
217
+ do_sample=True
218
+ )
219
+ inference_time = time.time() - start_time
220
+
221
+ start_time = time.time()
222
+ self.embedding_model.encode(["Test embedding"])
223
+ embedding_time = time.time() - start_time
224
+
225
+ return {
226
+ "status": "healthy",
227
+ "tokenization_time_ms": round(tokenization_time * 1000, 2),
228
+ "inference_time_ms": round(inference_time * 1000, 2),
229
+ "embedding_time_ms": round(embedding_time * 1000, 2),
230
+ "cache_size": len(self._cache)
231
+ }
232
+ except Exception as e:
233
+ return {"status": "unhealthy", "error": str(e)}
234
+
235
+ def get_recommendations(self, query, is_url=False, max_recommendations=10):
236
+ self._cache.clear()
237
+
238
+ if is_url:
239
+ text = self.extract_text_from_url(query)
240
+ else:
241
+ text = query
242
+
243
+ max_text_length = 2000
244
+ if len(text) > max_text_length:
245
+ text = text[:max_text_length] + "..."
246
+
247
+ query_embedding = self.embedding_model.encode(text[:1000])
248
+
249
+ similarity_scores = cosine_similarity(
250
+ [query_embedding],
251
+ self.product_embeddings
252
+ )[0]
253
+
254
+ top_indices = np.argsort(similarity_scores)[::-1][:max_recommendations]
255
+
256
+ recommendations = []
257
+ for idx in top_indices:
258
+ recommendations.append({
259
+ 'Test Name': self.df.iloc[idx]['Test Name'],
260
+ 'Test Type': self.df.iloc[idx]['Test Type'],
261
+ 'Remote Testing': self.df.iloc[idx]['Remote Testing (Yes/No)'],
262
+ 'Adaptive/IRT': self.df.iloc[idx]['Adaptive/IRT (Yes/No)'],
263
+ 'Duration': self.df.iloc[idx]['Duration'],
264
+ 'Link': self.df.iloc[idx]['Link']
265
+ })
266
+
267
+ return recommendations