Spaces:

akshit7093
/

SHL

Sleeping

SHL

File size: 19,659 Bytes

import requests
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import json
import logging
import re
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
import os
# Import the scraper module
from scraper import scrape_job_description, is_url

# Set up cache directories in user's home directory
import os

# Define a container-writable cache directory in /tmp
cache_dir = os.path.join("/tmp", "shl_cache")
os.makedirs(cache_dir, exist_ok=True)

os.environ["TRANSFORMERS_CACHE"] = cache_dir
os.environ["HF_HOME"] = cache_dir
os.environ["SENTENCE_TRANSFORMERS_HOME"] = cache_dir

# Initialize models and caches as module-level singletons
_sentence_transformer = None
_llm = None
_llm_chain = None
_embedding_cache = {}

def get_sentence_transformer():
    global _sentence_transformer
    if _sentence_transformer is None:
        try:
            logging.info("Initializing SentenceTransformer model")
            model_name = 'sentence-transformers/all-MiniLM-L6-v2'
            _sentence_transformer = SentenceTransformer(model_name, cache_folder=os.environ["SENTENCE_TRANSFORMERS_HOME"])
            # Configure the model with required attributes
            if not hasattr(_sentence_transformer, 'config'):
                from transformers import AutoConfig
                config = AutoConfig.from_pretrained('bert-base-uncased', cache_dir=os.environ["TRANSFORMERS_CACHE"])
                config.model_type = 'bert'
                _sentence_transformer.config = config
        except Exception as e:
            logging.error(f"Error initializing SentenceTransformer: {str(e)}")
            raise
    return _sentence_transformer

def get_llm():
    global _llm, _llm_chain
    if _llm is None:
        try:
            logging.info("Initializing Gemma model")
            # Load API key from environment with explicit path to .env file
            from dotenv import load_dotenv
            load_dotenv(os.path.join(os.path.dirname(__file__), '.env'))
            
            api_key = os.getenv("GOOGLE_API_KEY")
            if not api_key:
                raise ValueError("GOOGLE_API_KEY not found in environment variables")
                
            _llm = ChatGoogleGenerativeAI(model="gemma-3-27b-it", google_api_key=api_key)
            prompt_template = ChatPromptTemplate.from_template(
                """
                You are a helpful assistant designed to analyze job descriptions and extract key information.
                Reply like you are the website and guiding users like a first person perspective.
                Based *only* on the following text content, please provide:
                1. A concise description of the particular assessment (2-4 sentences).
                2. Key features, benefits, or what it measures (up to 5 bullet points).
                Scraped Content:
                {context}
                Analysis:
                """
            )
            output_parser = StrOutputParser()
            _llm_chain = ({"context": RunnablePassthrough()} 
                         | prompt_template 
                         | _llm 
                         | output_parser)
        except Exception as e:
            logging.error(f"Error initializing Gemini API: {str(e)}")
            # More robust fallback that maintains chain structure
            def fallback_processor(text):
                return f"Analysis unavailable (API error). Original text: {text[:500]}"
            
            _llm_chain = ({"context": RunnablePassthrough()} 
                         | (lambda x: {"context": x["context"], "result": fallback_processor(x["context"])}) 
                         | (lambda x: x["result"]))
    return _llm_chain

def generate_embedding(text):
    # Use cache to avoid regenerating embeddings for identical text
    cache_key = hash(text)
    if cache_key in _embedding_cache:
        return _embedding_cache[cache_key]
    
    # Generate new embedding
    model = get_sentence_transformer()
    embedding = model.encode([text])
    
    # Cache the result
    _embedding_cache[cache_key] = embedding
    return embedding

# Function to process query and generate embedding
def process_query(input_data):
    try:
        # Check if input is a URL
        if is_url(input_data):
            # Scrape job description from URL
            text = scrape_job_description(input_data)
            
            # Check if scraping returned an error message
            if text.startswith("Unable to access") or text.startswith("No job description"):
                logging.warning(f"Scraping failed for URL: {input_data}")
                # Still try to process the error message to avoid breaking the flow
                processed_text = f"Query: {input_data}\n\nNote: {text}"
            else:
                try:
                    # Process the scraped content with Gemma to understand job requirements
                    llm_chain = get_llm()
                    job_analysis = llm_chain.invoke(text)
                    # Combine the original text with the analysis for better embedding
                    processed_text = f"Job Description: {text}\n\nAnalysis: {job_analysis}"
                except Exception as e:
                    logging.error(f"Error analyzing job description with LLM: {str(e)}")
                    # Fallback to just using the scraped text
                    processed_text = f"Job Description: {text}"
        else:
            # If not a URL, use the input text directly
            processed_text = input_data
            
        # Generate embedding from the processed text
        embedding = generate_embedding(processed_text)
        return embedding
    except Exception as e:
        logging.error(f"Error in process_query: {str(e)}")
        # Return a default embedding for the error message to avoid breaking the flow
        error_text = f"Error processing query: {str(e)}"
        return generate_embedding(error_text)

# Function to perform vector search
def vector_search(query_embedding):
    try:
        # Load the vector index
        index = faiss.read_index('shl_vector_index.idx')
        # Perform similarity search
        distances, indices = index.search(query_embedding, k=10)
        return distances, indices
    except Exception as e:
        logging.error(f"Error in vector search: {str(e)}")
        # Return empty results that won't break the flow
        # Create empty arrays with the right shape
        empty_indices = np.zeros((1, 10), dtype=np.int64)
        empty_distances = np.ones((1, 10), dtype=np.float32) * 999  # Large distance = low similarity
        return empty_distances, empty_indices

# Function to extract attributes from top results using Gemma
def extract_attributes(distances, indices):
    try:
        # Load and cache the processed data
        if not hasattr(extract_attributes, 'processed_data'):
            try:
                with open('shl_processed_analysis_specific.json', 'r', encoding='utf-8') as f:
                    extract_attributes.processed_data = json.load(f)
            except Exception as e:
                logging.error(f"Error loading processed data: {str(e)}")
                # Return empty results if data can't be loaded
                return [{
                    'Assessment Name': 'Error',
                    'URL': 'N/A',
                    'description': f"Error loading assessment data: {str(e)}",
                    'Key Features': [],
                    'Duration': '',
                    'Remote Testing': False,
                    'Raw Analysis': '',
                    'Similarity Score': 0
                }]
        processed_data = extract_attributes.processed_data
        results = []
        
        for i, idx in enumerate(indices[0]):
            try:
                # Handle index out of bounds
                if idx >= len(processed_data):
                    logging.warning(f"Index {idx} out of bounds for processed_data with length {len(processed_data)}")
                    continue
                    
                item = processed_data[idx]
                similarity_score = 1 / (0.5 + distances[0][i])  # Adjusted formula to boost similarity scores
                
                # Filter to only include assessment-specific URLs containing '/view/'
                if '/view/' not in item.get('url', ''):
                    continue
                extracted_text = item.get('extracted_text', '')
                
                if not extracted_text:
                    logging.warning(f"Empty extracted text for index {idx}")
                    continue
                
                try:
                    # Use Gemma to analyze the assessment details with a structured prompt
                    llm_chain = get_llm()
                    assessment_content = extracted_text.split('\n\n')[0] 
                    analysis = llm_chain.invoke(
                        f"""Assessment Data:
                        {assessment_content}
                        
                        Please analyze this specific assessment and provide a focused, assessment-specific output with these exact section headers. Avoid general company information.
                        
                        ## description:
                        [Provide a 1 sentence description that specifically describes what this assessment measures, its primary purpose, and its target audience. Focus only on this specific assessment's unique characteristics.]
                        
                        ## Key Features:
                        - [List 3-5 specific features or capabilities of this assessment]
                        - [Focus on what skills/abilities it measures]
                        - [Include technical aspects like adaptive testing if applicable]
                        
                        ## Duration:
                        [Specify exact duration in minutes if available, or provide estimated time range]
                        
                        ## Remote Testing:
                        [Yes/No - Include any specific remote proctoring details if available]
                        
                        ## Target Role/Level:
                        [Specify the job roles, levels, or industries this assessment is designed for]
                        """
                    )
                except Exception as e:
                    logging.error(f"Error analyzing assessment with LLM: {str(e)}")
                    # Use a placeholder analysis if LLM fails
                    analysis = f"Assessment information. Unable to analyze details: {str(e)}"
                
                # Process the structured analysis output
                analysis_lines = analysis.split('\n')
                description = ''
                features = []
                assessment_name = item.get('title', '') or 'SHL Assessment'
                duration = ''
                remote_testing = False
                
                # Parse the structured response sections
                current_section = ''
                for line in analysis_lines:
                    line = line.strip()
                    if line.startswith('##'):
                        current_section = line.replace('#', '').strip().lower()
                    elif line and current_section == 'description:':
                        description = line.strip('[]')
                    elif line.startswith('-') and current_section == 'key features:':
                        feature = line.strip('- []')
                        if feature:
                            features.append(feature)
                    elif current_section == 'duration:':
                        if line and not line.startswith('['):
                            duration = line.strip('[]')
                    elif current_section == 'remote testing:':
                        remote_testing = 'yes' in line.lower() or 'available' in line.lower() or 'supported' in line.lower()
                # Parse the structured response sections
                current_section = ''
                for line in analysis_lines:
                    line = line.strip()
                    if line.startswith('##'):
                        current_section = line.replace('#', '').strip().lower()
                    elif line and current_section == 'description:':
                        # Extract clean description without brackets
                        if '[' in line and ']' in line:
                            description = line[line.find('[')+1:line.find(']')]
                        else:
                            description = line
                    elif line.startswith('-') and current_section == 'key features:':
                        feature = line.strip('- []')
                        if feature:
                            features.append(feature)
                    elif current_section == 'duration:':
                        if line and not line.startswith('['):
                            duration = line.strip('[]')
                    elif current_section == 'remote testing:':
                        remote_testing = 'yes' in line.lower() or 'available' in line.lower() or 'supported' in line.lower()
                
                # Clean up and validate the description
                if not description or len(description.strip()) < 10:
                    # Fallback to a basic description if the LLM output is insufficient
                    description = f"Assessment measuring key competencies and skills for {assessment_name}."
                
                # Ensure features list is not empty
                if not features:
                    features = ["Measures relevant job competencies", "Provides standardized assessment"]
                
                # Clean up duration string
                if duration:
                    # Extract numbers from duration string
                    duration_numbers = re.findall(r'\d+', duration)
                    if duration_numbers:
                        duration = duration_numbers[0]  # Take the first number found
                        
                # Fallback duration extraction if not found in analysis
                if not duration and 'approximate completion time' in extracted_text.lower():
                    time_match = re.search(r'Approximate Completion Time in minutes = (\d+)', extracted_text, re.IGNORECASE)
                    if time_match:
                        duration = f"{time_match.group(1)} minutes"
                url=item.get('url', 'N/A')
                result = {
                    'Assessment_Name': assessment_name,
                    'URL': url,
                    'description': description,
                    'Key_Features': features,
                    'Duration': duration,
                    'Remote_Testing': remote_testing,
                    'Raw_Analysis': analysis,
                    'Similarity_Score': similarity_score
                }
                results.append(result)
            
            except Exception as e:
                logging.error(f"Error processing result at index {i}: {str(e)}")
                # Add an error result instead of failing completely
                results.append({
                    'Assessment_Name': 'Error',
                    'URL': 'N/A',
                    'description': f"Error processing assessment: {str(e)}",
                    'Key_Features': [],
                    'Duration': '',
                    'Remote_Testing': False,
                    'Raw_Analysis': '',
                    'Similarity_Score': 0
                })
        
        # If no results were found or all processing failed, return a helpful message
        if not results:
            results.append({
                'Assessment_Name': 'No Results',
                'URL': 'N/A',
                'description': "No matching assessments found for your query.",
                'Key_Features': ["Try a different search term", "Be more specific about the job role or skills"],
                'Duration': '',
                'Remote_Testing': False,
                'Raw_Analysis': '',
                'Similarity_Score': 0
            })
            
        return results
    except Exception as e:
        logging.error(f"Unexpected error in extract_attributes: {str(e)}")
        # Return a single error result
        return [{
            'Assessment Name': 'Error',
            'URL': 'N/A',
            'description': f"An unexpected error occurred: {str(e)}",
            'Key Features': ["Please try again later"],
            'Duration': '',
            'Remote Testing': False,
            'Raw Analysis': '',
            'Similarity Score': 0
        }]

# Example usage
def calculate_metrics(results, relevant_assessments, k=3):
    """Calculate Mean Recall@K and MAP@K metrics.
    
    Args:
        results: List of retrieved assessment results
        relevant_assessments: List of relevant assessment IDs/names
        k: Number of top results to consider (default: 3)
    
    Returns:
        tuple: (recall@k, map@k)
    """
    if not results or not relevant_assessments:
        return 0.0, 0.0
    
    # Get top K results
    top_k = results[:k]
    retrieved_assessments = [r['Assessment_Name'] for r in top_k]
    
    # Calculate Recall@K
    relevant_retrieved = sum(1 for r in retrieved_assessments if r in relevant_assessments)
    recall_k = relevant_retrieved / len(relevant_assessments) if relevant_assessments else 0.0
    
    # Calculate MAP@K
    precision_sum = 0.0
    relevant_count = 0
    
    for i, assessment in enumerate(retrieved_assessments, 1):
        if assessment in relevant_assessments:
            relevant_count += 1
            precision_at_i = relevant_count / i
            precision_sum += precision_at_i
    
    map_k = precision_sum / min(k, len(relevant_assessments)) if relevant_assessments else 0.0
    
    return recall_k, map_k

def main():
    try:
        input_query = "Your input query or URL here"
        query_embedding = process_query(input_query)
        distances, indices = vector_search(query_embedding)
        # Reshape indices and distances to match expected format
        if len(indices.shape) == 1:
            indices = indices.reshape(1, -1)
            distances = distances.reshape(1, -1)
        results = extract_attributes(distances=distances, indices=indices)
        
        # Example usage of metrics calculation
        # In a real scenario, relevant_assessments would come from ground truth data
        relevant_assessments = ["Example Assessment 1", "Example Assessment 2"]
        recall_k, map_k = calculate_metrics(results, relevant_assessments)
        logging.info(f"Mean Recall@3: {recall_k:.3f}")
        logging.info(f"MAP@3: {map_k:.3f}")
        
        return results
    except Exception as e:
        logging.error(f"Error in main function: {str(e)}")
        return [{
            'Assessment Name': 'Error',
            'URL': 'N/A',
            'description': f"An error occurred while processing your query: {str(e)}",
            'Key Features': ["Please try again later"],
            'Duration': '',
            'Remote Testing': False,
            'Raw Analysis': '',
            'Similarity Score': 0
        }]

if __name__ == "__main__":
    results = main()
    print(results)