Upload recommender.py
recommender.py ADDED (+267 -0)
@@ -0,0 +1,267 @@
import os
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import requests
from bs4 import BeautifulSoup
import torch
import gc
import time
from transformers import AutoTokenizer, AutoModelForCausalLM

class SHLRecommender:
    _cache = {}
    _cache_size = 20
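    # NOTE: _cache and _cache_size are class attributes, so every
    # SHLRecommender instance shares one description cache; entries are
    # evicted oldest-first once _cache_size is reached (see
    # generate_test_description below).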

    def __init__(self, data_path='utils/data.csv'):
        try:
            self.df = pd.read_csv(data_path)
        except FileNotFoundError:
            raise FileNotFoundError(f"Data file not found at {data_path}. Please check the path.")

        # Normalize column names so lookups are not broken by stray whitespace
        self.df.columns = [col.strip() for col in self.df.columns]

        # Set up the model cache directory before any download attempt so it
        # is available to every fallback path below
        cache_dir = os.path.join(os.getcwd(), 'model_cache')
        os.makedirs(cache_dir, exist_ok=True)
        print(f"Using cache directory: {cache_dir}")

        try:
            self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2', cache_folder=cache_dir)
            print("Successfully loaded all-MiniLM-L6-v2 model")
        except Exception as e:
            print(f"Error loading primary model: {str(e)}")
            try:
                # Try a smaller model as a fallback
                print("Trying fallback model: paraphrase-MiniLM-L3-v2")
                self.embedding_model = SentenceTransformer('paraphrase-MiniLM-L3-v2', cache_folder=cache_dir)
                print("Successfully loaded fallback model")
            except Exception as e2:
                print(f"Error loading fallback model: {str(e2)}")
                # Build a basic mean-pooled BERT encoder as a last resort
                from sentence_transformers import models
                print("Creating basic embedding model from scratch")
                word_embedding_model = models.Transformer('bert-base-uncased', cache_dir=cache_dir)
                pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
                self.embedding_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
                print("Created basic embedding model")

        model_id = "Qwen/Qwen2.5-0.5B-Instruct"

        self.tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            trust_remote_code=True,
            use_fast=True,
            model_max_length=512,
        )

        try:
            print(f"Loading Qwen model: {model_id}")
            self.model = AutoModelForCausalLM.from_pretrained(
                model_id,
                trust_remote_code=True,
                torch_dtype=torch.float32,
                device_map="auto",
                low_cpu_mem_usage=True,
                cache_dir=cache_dir,
                local_files_only=False,
                revision="main"
            )
            print("Successfully loaded Qwen model")
        except ValueError as e:
            print(f"Error with device_map: {str(e)}")
            try:
                print("Trying without device_map")
                self.model = AutoModelForCausalLM.from_pretrained(
                    model_id,
                    trust_remote_code=True,
                    torch_dtype=torch.float32,
                    low_cpu_mem_usage=True,
                    cache_dir=cache_dir
                )
                print("Successfully loaded Qwen model without device_map")
            except Exception as e2:
                print(f"Error loading Qwen model: {str(e2)}")
                try:
                    print("Trying fallback to smaller model: distilgpt2")
                    self.model = AutoModelForCausalLM.from_pretrained(
                        "distilgpt2",
                        cache_dir=cache_dir
                    )
                    self.tokenizer = AutoTokenizer.from_pretrained(
                        "distilgpt2",
                        cache_dir=cache_dir
                    )
                    print("Successfully loaded fallback model")
                except Exception as e3:
                    print(f"All model loading attempts failed: {str(e3)}")
                    raise ValueError("Could not load any language model. Please check your environment and permissions.")

        # GPT-2-family tokenizers (e.g. the distilgpt2 fallback) ship without
        # a pad token; reuse EOS so calls that pass padding=True do not fail
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.create_embeddings()

    def create_embeddings(self):
        texts = []
        for _, row in self.df.iterrows():
            text = f"{row['Test Name']} {row['Test Type']}"
            texts.append(text)

        self.product_embeddings = self.embedding_model.encode(texts)
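        # product_embeddings is a 2-D numpy array with one row per catalog
        # entry (384 columns when all-MiniLM-L6-v2 is the active encoder),
        # which is the shape cosine_similarity expects in get_recommendations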

    def extract_text_from_url(self, url):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Drop non-visible content before extracting text
            for script in soup(["script", "style"]):
                script.extract()

            text = soup.get_text()

            # Collapse whitespace: strip each line, split on double spaces,
            # and keep only the non-empty fragments
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = '\n'.join(chunk for chunk in chunks if chunk)

            return text
        except Exception as e:
            return f"Error extracting text from URL: {str(e)}"
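    # Note: fetch failures are reported in-band as an "Error extracting
    # text..." string rather than raised, so callers that embed the result
    # (e.g. get_recommendations with is_url=True) will embed the error
    # message itself.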

    def optimize_memory(self):
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        self._cache.clear()
        gc.collect()

        return {"status": "Memory optimized"}
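    # optimize_memory is safe to call between requests: it only drops cached
    # descriptions, clears unused CUDA cache blocks, and triggers a garbage
    # collection pass.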

    def generate_test_description(self, test_name, test_type):
        try:
            cache_key = f"{test_name}_{test_type}"
            if cache_key in self._cache:
                return self._cache[cache_key]

            prompt = f"Write a short, factual description of '{test_name}', a {test_type} assessment, in 1-2 sentences."

            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=128, padding=True)

            with torch.no_grad():
                # Greedy decoding; sampling parameters such as temperature and
                # top_p are omitted because they have no effect when
                # do_sample=False
                outputs = self.model.generate(
                    inputs.input_ids,
                    attention_mask=inputs.attention_mask,
                    max_new_tokens=40,
                    do_sample=False,
                    no_repeat_ngram_size=3
                )

            full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            generated_text = full_response.replace(prompt, "").strip()

            # Fall back to a templated description when the model echoes the
            # instruction or produces something too short to be useful
            if len(generated_text) < 20 or "write" in generated_text.lower() or "description" in generated_text.lower():
                if test_type.lower() in ["cognitive ability", "cognitive", "reasoning"]:
                    description = f"The {test_name} measures cognitive abilities and problem-solving skills."
                elif "numerical" in test_name.lower() or "numerical" in test_type.lower():
                    description = f"The {test_name} assesses numerical reasoning and data analysis abilities."
                elif "verbal" in test_name.lower() or "verbal" in test_type.lower():
                    description = f"The {test_name} evaluates verbal reasoning and language comprehension skills."
                elif "personality" in test_type.lower() or "behavioral" in test_type.lower():
                    description = f"The {test_name} assesses behavioral tendencies and personality traits in workplace contexts."
                elif "technical" in test_type.lower() or any(tech in test_name.lower() for tech in ["java", "python", ".net", "sql", "coding"]):
                    description = f"The {test_name} evaluates technical knowledge and programming skills."
                else:
                    description = f"The {test_name} assesses candidate suitability through standardized methods."
            else:
                description = generated_text

            # Evict the oldest entry once the cache is full (dicts preserve
            # insertion order, so this is FIFO eviction)
            if len(self._cache) >= self._cache_size:
                self._cache.pop(next(iter(self._cache)))
            self._cache[cache_key] = description

            return description

        except Exception:
            if test_type.lower() in ["cognitive ability", "cognitive", "reasoning"]:
                return f"The {test_name} measures cognitive abilities through structured problem-solving tasks."
            elif test_type.lower() in ["personality", "behavioral"]:
                return f"The {test_name} assesses behavioral tendencies and personality traits."
            elif "technical" in test_type.lower():
                return f"The {test_name} evaluates technical knowledge and skills."
            else:
                return f"The {test_name} assesses {test_type.lower()} capabilities."
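    # Illustrative call (hypothetical inputs):
    # generate_test_description("Java Coding Assessment", "Technical")
    # returns the model's sentence or, if the output merely echoes the
    # prompt, the templated technical description above.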

    def check_health(self):
        try:
            test_prompt = "This is a test prompt to check model health."

            start_time = time.time()
            inputs = self.tokenizer(
                test_prompt,
                return_tensors="pt",
                truncation=True,
                max_length=32,
                padding=True
            )
            tokenization_time = time.time() - start_time

            start_time = time.time()
            with torch.no_grad():
                _ = self.model.generate(
                    inputs.input_ids,
                    attention_mask=inputs.attention_mask,
                    max_new_tokens=20,
                    do_sample=True
                )
            inference_time = time.time() - start_time

            start_time = time.time()
            self.embedding_model.encode(["Test embedding"])
            embedding_time = time.time() - start_time

            return {
                "status": "healthy",
                "tokenization_time_ms": round(tokenization_time * 1000, 2),
                "inference_time_ms": round(inference_time * 1000, 2),
                "embedding_time_ms": round(embedding_time * 1000, 2),
                "cache_size": len(self._cache)
            }
        except Exception as e:
            return {"status": "unhealthy", "error": str(e)}
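    # check_health exercises all three components (tokenizer, causal LM,
    # embedding model) and reports wall-clock latencies in milliseconds,
    # which makes it suitable as a lightweight readiness probe.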

    def get_recommendations(self, query, is_url=False, max_recommendations=10):
        # Start each request with an empty description cache to bound memory
        self._cache.clear()

        if is_url:
            text = self.extract_text_from_url(query)
        else:
            text = query

        max_text_length = 2000
        if len(text) > max_text_length:
            text = text[:max_text_length] + "..."

        # Only the first 1000 characters of the query text are embedded
        query_embedding = self.embedding_model.encode(text[:1000])

        similarity_scores = cosine_similarity(
            [query_embedding],
            self.product_embeddings
        )[0]

        # Indices of the highest-scoring tests, best first
        top_indices = np.argsort(similarity_scores)[::-1][:max_recommendations]

        recommendations = []
        for idx in top_indices:
            recommendations.append({
                'Test Name': self.df.iloc[idx]['Test Name'],
                'Test Type': self.df.iloc[idx]['Test Type'],
                'Remote Testing': self.df.iloc[idx]['Remote Testing (Yes/No)'],
                'Adaptive/IRT': self.df.iloc[idx]['Adaptive/IRT (Yes/No)'],
                'Duration': self.df.iloc[idx]['Duration'],
                'Link': self.df.iloc[idx]['Link']
            })

        return recommendations
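
A minimal usage sketch (assuming a utils/data.csv whose columns match those referenced above: Test Name, Test Type, Remote Testing (Yes/No), Adaptive/IRT (Yes/No), Duration, Link; the query string is illustrative):

    from recommender import SHLRecommender

    recommender = SHLRecommender(data_path='utils/data.csv')
    print(recommender.check_health())

    results = recommender.get_recommendations(
        "Hiring Java developers who can collaborate with business teams",
        max_recommendations=5,
    )
    for rec in results:
        print(f"{rec['Test Name']} ({rec['Test Type']}) - {rec['Duration']}")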