SHL / query_processing.py
joker7094's picture
added frontend
8a2672b
import requests
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import json
import logging
import re
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
import os
# Import the scraper module
from scraper import scrape_job_description, is_url
# Set up cache directories in user's home directory
import os
# Define a container-writable cache directory in /tmp
cache_dir = os.path.join("/tmp", "shl_cache")
os.makedirs(cache_dir, exist_ok=True)
os.environ["TRANSFORMERS_CACHE"] = cache_dir
os.environ["HF_HOME"] = cache_dir
os.environ["SENTENCE_TRANSFORMERS_HOME"] = cache_dir
# Initialize models and caches as module-level singletons
_sentence_transformer = None
_llm = None
_llm_chain = None
_embedding_cache = {}
def get_sentence_transformer():
global _sentence_transformer
if _sentence_transformer is None:
try:
logging.info("Initializing SentenceTransformer model")
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
_sentence_transformer = SentenceTransformer(model_name, cache_folder=os.environ["SENTENCE_TRANSFORMERS_HOME"])
# Configure the model with required attributes
if not hasattr(_sentence_transformer, 'config'):
from transformers import AutoConfig
config = AutoConfig.from_pretrained('bert-base-uncased', cache_dir=os.environ["TRANSFORMERS_CACHE"])
config.model_type = 'bert'
_sentence_transformer.config = config
except Exception as e:
logging.error(f"Error initializing SentenceTransformer: {str(e)}")
raise
return _sentence_transformer
def get_llm():
global _llm, _llm_chain
if _llm is None:
try:
logging.info("Initializing Gemma model")
# Load API key from environment with explicit path to .env file
from dotenv import load_dotenv
load_dotenv(os.path.join(os.path.dirname(__file__), '.env'))
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
raise ValueError("GOOGLE_API_KEY not found in environment variables")
_llm = ChatGoogleGenerativeAI(model="gemma-3-27b-it", google_api_key=api_key)
prompt_template = ChatPromptTemplate.from_template(
"""
You are a helpful assistant designed to analyze job descriptions and extract key information.
Reply like you are the website and guiding users like a first person perspective.
Based *only* on the following text content, please provide:
1. A concise description of the particular assessment (2-4 sentences).
2. Key features, benefits, or what it measures (up to 5 bullet points).
Scraped Content:
{context}
Analysis:
"""
)
output_parser = StrOutputParser()
_llm_chain = ({"context": RunnablePassthrough()}
| prompt_template
| _llm
| output_parser)
except Exception as e:
logging.error(f"Error initializing Gemini API: {str(e)}")
# More robust fallback that maintains chain structure
def fallback_processor(text):
return f"Analysis unavailable (API error). Original text: {text[:500]}"
_llm_chain = ({"context": RunnablePassthrough()}
| (lambda x: {"context": x["context"], "result": fallback_processor(x["context"])})
| (lambda x: x["result"]))
return _llm_chain
def generate_embedding(text):
# Use cache to avoid regenerating embeddings for identical text
cache_key = hash(text)
if cache_key in _embedding_cache:
return _embedding_cache[cache_key]
# Generate new embedding
model = get_sentence_transformer()
embedding = model.encode([text])
# Cache the result
_embedding_cache[cache_key] = embedding
return embedding
# Function to process query and generate embedding
def process_query(input_data):
try:
# Check if input is a URL
if is_url(input_data):
# Scrape job description from URL
text = scrape_job_description(input_data)
# Check if scraping returned an error message
if text.startswith("Unable to access") or text.startswith("No job description"):
logging.warning(f"Scraping failed for URL: {input_data}")
# Still try to process the error message to avoid breaking the flow
processed_text = f"Query: {input_data}\n\nNote: {text}"
else:
try:
# Process the scraped content with Gemma to understand job requirements
llm_chain = get_llm()
job_analysis = llm_chain.invoke(text)
# Combine the original text with the analysis for better embedding
processed_text = f"Job Description: {text}\n\nAnalysis: {job_analysis}"
except Exception as e:
logging.error(f"Error analyzing job description with LLM: {str(e)}")
# Fallback to just using the scraped text
processed_text = f"Job Description: {text}"
else:
# If not a URL, use the input text directly
processed_text = input_data
# Generate embedding from the processed text
embedding = generate_embedding(processed_text)
return embedding
except Exception as e:
logging.error(f"Error in process_query: {str(e)}")
# Return a default embedding for the error message to avoid breaking the flow
error_text = f"Error processing query: {str(e)}"
return generate_embedding(error_text)
# Function to perform vector search
def vector_search(query_embedding):
try:
# Load the vector index
index = faiss.read_index('shl_vector_index.idx')
# Perform similarity search
distances, indices = index.search(query_embedding, k=10)
return distances, indices
except Exception as e:
logging.error(f"Error in vector search: {str(e)}")
# Return empty results that won't break the flow
# Create empty arrays with the right shape
empty_indices = np.zeros((1, 10), dtype=np.int64)
empty_distances = np.ones((1, 10), dtype=np.float32) * 999 # Large distance = low similarity
return empty_distances, empty_indices
# Function to extract attributes from top results using Gemma
def extract_attributes(distances, indices):
try:
# Load and cache the processed data
if not hasattr(extract_attributes, 'processed_data'):
try:
with open('shl_processed_analysis_specific.json', 'r', encoding='utf-8') as f:
extract_attributes.processed_data = json.load(f)
except Exception as e:
logging.error(f"Error loading processed data: {str(e)}")
# Return empty results if data can't be loaded
return [{
'Assessment Name': 'Error',
'URL': 'N/A',
'description': f"Error loading assessment data: {str(e)}",
'Key Features': [],
'Duration': '',
'Remote Testing': False,
'Raw Analysis': '',
'Similarity Score': 0
}]
processed_data = extract_attributes.processed_data
results = []
for i, idx in enumerate(indices[0]):
try:
# Handle index out of bounds
if idx >= len(processed_data):
logging.warning(f"Index {idx} out of bounds for processed_data with length {len(processed_data)}")
continue
item = processed_data[idx]
similarity_score = 1 / (0.5 + distances[0][i]) # Adjusted formula to boost similarity scores
# Filter to only include assessment-specific URLs containing '/view/'
if '/view/' not in item.get('url', ''):
continue
extracted_text = item.get('extracted_text', '')
if not extracted_text:
logging.warning(f"Empty extracted text for index {idx}")
continue
try:
# Use Gemma to analyze the assessment details with a structured prompt
llm_chain = get_llm()
assessment_content = extracted_text.split('\n\n')[0]
analysis = llm_chain.invoke(
f"""Assessment Data:
{assessment_content}
Please analyze this specific assessment and provide a focused, assessment-specific output with these exact section headers. Avoid general company information.
## description:
[Provide a 1 sentence description that specifically describes what this assessment measures, its primary purpose, and its target audience. Focus only on this specific assessment's unique characteristics.]
## Key Features:
- [List 3-5 specific features or capabilities of this assessment]
- [Focus on what skills/abilities it measures]
- [Include technical aspects like adaptive testing if applicable]
## Duration:
[Specify exact duration in minutes if available, or provide estimated time range]
## Remote Testing:
[Yes/No - Include any specific remote proctoring details if available]
## Target Role/Level:
[Specify the job roles, levels, or industries this assessment is designed for]
"""
)
except Exception as e:
logging.error(f"Error analyzing assessment with LLM: {str(e)}")
# Use a placeholder analysis if LLM fails
analysis = f"Assessment information. Unable to analyze details: {str(e)}"
# Process the structured analysis output
analysis_lines = analysis.split('\n')
description = ''
features = []
assessment_name = item.get('title', '') or 'SHL Assessment'
duration = ''
remote_testing = False
# Parse the structured response sections
current_section = ''
for line in analysis_lines:
line = line.strip()
if line.startswith('##'):
current_section = line.replace('#', '').strip().lower()
elif line and current_section == 'description:':
description = line.strip('[]')
elif line.startswith('-') and current_section == 'key features:':
feature = line.strip('- []')
if feature:
features.append(feature)
elif current_section == 'duration:':
if line and not line.startswith('['):
duration = line.strip('[]')
elif current_section == 'remote testing:':
remote_testing = 'yes' in line.lower() or 'available' in line.lower() or 'supported' in line.lower()
# Parse the structured response sections
current_section = ''
for line in analysis_lines:
line = line.strip()
if line.startswith('##'):
current_section = line.replace('#', '').strip().lower()
elif line and current_section == 'description:':
# Extract clean description without brackets
if '[' in line and ']' in line:
description = line[line.find('[')+1:line.find(']')]
else:
description = line
elif line.startswith('-') and current_section == 'key features:':
feature = line.strip('- []')
if feature:
features.append(feature)
elif current_section == 'duration:':
if line and not line.startswith('['):
duration = line.strip('[]')
elif current_section == 'remote testing:':
remote_testing = 'yes' in line.lower() or 'available' in line.lower() or 'supported' in line.lower()
# Clean up and validate the description
if not description or len(description.strip()) < 10:
# Fallback to a basic description if the LLM output is insufficient
description = f"Assessment measuring key competencies and skills for {assessment_name}."
# Ensure features list is not empty
if not features:
features = ["Measures relevant job competencies", "Provides standardized assessment"]
# Clean up duration string
if duration:
# Extract numbers from duration string
duration_numbers = re.findall(r'\d+', duration)
if duration_numbers:
duration = duration_numbers[0] # Take the first number found
# Fallback duration extraction if not found in analysis
if not duration and 'approximate completion time' in extracted_text.lower():
time_match = re.search(r'Approximate Completion Time in minutes = (\d+)', extracted_text, re.IGNORECASE)
if time_match:
duration = f"{time_match.group(1)} minutes"
url=item.get('url', 'N/A')
result = {
'Assessment_Name': assessment_name,
'URL': url,
'description': description,
'Key_Features': features,
'Duration': duration,
'Remote_Testing': remote_testing,
'Raw_Analysis': analysis,
'Similarity_Score': similarity_score
}
results.append(result)
except Exception as e:
logging.error(f"Error processing result at index {i}: {str(e)}")
# Add an error result instead of failing completely
results.append({
'Assessment_Name': 'Error',
'URL': 'N/A',
'description': f"Error processing assessment: {str(e)}",
'Key_Features': [],
'Duration': '',
'Remote_Testing': False,
'Raw_Analysis': '',
'Similarity_Score': 0
})
# If no results were found or all processing failed, return a helpful message
if not results:
results.append({
'Assessment_Name': 'No Results',
'URL': 'N/A',
'description': "No matching assessments found for your query.",
'Key_Features': ["Try a different search term", "Be more specific about the job role or skills"],
'Duration': '',
'Remote_Testing': False,
'Raw_Analysis': '',
'Similarity_Score': 0
})
return results
except Exception as e:
logging.error(f"Unexpected error in extract_attributes: {str(e)}")
# Return a single error result
return [{
'Assessment Name': 'Error',
'URL': 'N/A',
'description': f"An unexpected error occurred: {str(e)}",
'Key Features': ["Please try again later"],
'Duration': '',
'Remote Testing': False,
'Raw Analysis': '',
'Similarity Score': 0
}]
# Example usage
def calculate_metrics(results, relevant_assessments, k=3):
"""Calculate Mean Recall@K and MAP@K metrics.
Args:
results: List of retrieved assessment results
relevant_assessments: List of relevant assessment IDs/names
k: Number of top results to consider (default: 3)
Returns:
tuple: (recall@k, map@k)
"""
if not results or not relevant_assessments:
return 0.0, 0.0
# Get top K results
top_k = results[:k]
retrieved_assessments = [r['Assessment_Name'] for r in top_k]
# Calculate Recall@K
relevant_retrieved = sum(1 for r in retrieved_assessments if r in relevant_assessments)
recall_k = relevant_retrieved / len(relevant_assessments) if relevant_assessments else 0.0
# Calculate MAP@K
precision_sum = 0.0
relevant_count = 0
for i, assessment in enumerate(retrieved_assessments, 1):
if assessment in relevant_assessments:
relevant_count += 1
precision_at_i = relevant_count / i
precision_sum += precision_at_i
map_k = precision_sum / min(k, len(relevant_assessments)) if relevant_assessments else 0.0
return recall_k, map_k
def main():
try:
input_query = "Your input query or URL here"
query_embedding = process_query(input_query)
distances, indices = vector_search(query_embedding)
# Reshape indices and distances to match expected format
if len(indices.shape) == 1:
indices = indices.reshape(1, -1)
distances = distances.reshape(1, -1)
results = extract_attributes(distances=distances, indices=indices)
# Example usage of metrics calculation
# In a real scenario, relevant_assessments would come from ground truth data
relevant_assessments = ["Example Assessment 1", "Example Assessment 2"]
recall_k, map_k = calculate_metrics(results, relevant_assessments)
logging.info(f"Mean Recall@3: {recall_k:.3f}")
logging.info(f"MAP@3: {map_k:.3f}")
return results
except Exception as e:
logging.error(f"Error in main function: {str(e)}")
return [{
'Assessment Name': 'Error',
'URL': 'N/A',
'description': f"An error occurred while processing your query: {str(e)}",
'Key Features': ["Please try again later"],
'Duration': '',
'Remote Testing': False,
'Raw Analysis': '',
'Similarity Score': 0
}]
if __name__ == "__main__":
results = main()
print(results)