Spaces:

GS123
/

shl_test_recommendation

Sleeping

File size: 12,745 Bytes
import streamlit as st
import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer
import numpy as np
import json
import math
import re

# Page layout is configured before any other Streamlit call in this script.
# NOTE(review): Streamlit's docs ask for set_page_config to be the first
# Streamlit command on a page - keep it ahead of the UI code further down.
st.set_page_config(layout="wide")
# --- Configuration ---
# Data file read with pd.read_csv when the ChromaDB collection is empty.
CSV_FILE = "shl_data.csv"
# Default name of the ChromaDB collection created/retrieved at startup.
COLLECTION_NAME = "shl_assessments"
# Sentence-Transformers model used for semantic search; this is the default
# model_name of both load_embedding_model and setup_chroma_collection.
MODEL_NAME = 'msmarco-distilbert-base-v4' # Or 'all-MiniLM-L6-v2'


# --- Caching Functions ---

# Cache the embedding model loading
@st.cache_resource
def load_embedding_model(model_name=MODEL_NAME):
    """Instantiate the Sentence Transformer identified by ``model_name``.

    Returns the loaded ``SentenceTransformer``; if construction raises, the
    error is shown in the Streamlit UI and ``None`` is returned instead.
    """
    print("Loading embedding model...")
    try:
        embedder = SentenceTransformer(model_name)
    except Exception as exc:
        st.error(f"Error loading embedding model '{model_name}': {exc}")
        return None
    else:
        print("Embedding model loaded.")
        return embedder

# Single-letter test-type codes found in the CSV and the names they expand to.
_TEST_TYPE_NAMES = {
    'A': 'Ability', 'B': 'Behavior', 'C': 'Cognitive', 'P': 'Personality',
    'S': 'Simulation', 'K': 'Knowledge & Skills', 'D': 'Development', 'E': 'Exercise',
}


def _expand_test_types(raw_value):
    """Split a comma-separated test-type cell into unique display names, keeping first-seen order."""
    codes = (part.strip() for part in raw_value.split(','))
    # dict.fromkeys de-duplicates while preserving order; list(set(...)) would
    # reorder the names between runs because string hashing is randomised.
    return list(dict.fromkeys(_TEST_TYPE_NAMES.get(code, code) for code in codes if code))


def _clean_assessment_frame(raw_df):
    """Rename and normalise the raw CSV columns; raise ValueError if 'url' or 'name' is absent."""
    df = raw_df.rename(columns={
        'Link': 'url', 'Assessment Name': 'name', 'Remote Testing': 'remote_support',
        'Adaptive/IRT': 'adaptive_support', 'Assessment Length': 'duration',
        'Test Type': 'test_type_raw', 'Description': 'description'
    })

    missing = [col for col in ('url', 'name') if col not in df.columns]
    if missing:
        raise ValueError(f"CSV is missing required column(s): {', '.join(missing)}")

    # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form is deprecated in pandas 2.x and does not modify
    # the frame when copy-on-write is enabled.
    if 'description' in df.columns:
        df['description'] = df['description'].fillna('No description available.')
    else:
        df['description'] = 'No description available.'
    df['name'] = df['name'].fillna('Unnamed Assessment')

    # Anything other than a (case-insensitive) "yes" is treated as "No".
    for col in ('remote_support', 'adaptive_support'):
        if col in df.columns:
            df[col] = df[col].astype(str).str.strip().str.lower().apply(
                lambda x: 'Yes' if x == 'yes' else 'No'
            )
        else:
            df[col] = 'No'

    # Non-numeric or missing durations become 0.
    if 'duration' in df.columns:
        df['duration'] = pd.to_numeric(df['duration'], errors='coerce').fillna(0).astype(int)
    else:
        df['duration'] = 0

    if 'test_type_raw' in df.columns:
        df['test_type_list'] = df['test_type_raw'].fillna('').astype(str).apply(_expand_test_types)
    else:
        df['test_type_list'] = [[] for _ in range(len(df))]

    df = df.dropna(subset=['url', 'name'])
    # astype(str) keeps the filter working even if the column holds non-string values.
    return df[df['url'].astype(str).str.startswith('http')]


def _build_chroma_records(df):
    """Convert the cleaned frame into parallel (ids, documents, metadatas) lists for collection.add."""
    ids, documents, metadatas = [], [], []
    for index, row in df.iterrows():
        # The embedded document is "name: description" with whitespace collapsed.
        doc_text = f"{row['name']}: {row['description']}"
        documents.append(re.sub(r'\s+', ' ', doc_text).strip())

        test_types = row['test_type_list']
        metadatas.append({
            'url': str(row['url']),
            'adaptive_support': str(row['adaptive_support']),
            'description': str(row['description']),
            'duration': int(row['duration']),
            'remote_support': str(row['remote_support']),
            'name': str(row['name']),
            # Store the list as a JSON string; the query side decodes it again.
            'test_type_json': json.dumps(test_types if isinstance(test_types, list) else []),
        })
        ids.append(f"shl_assessment_{index}")  # IDs must be strings
    return ids, documents, metadatas


# Cache the ChromaDB client and collection setup
@st.cache_resource
def setup_chroma_collection(collection_name=COLLECTION_NAME, model_name=MODEL_NAME):
    """Initialise the ChromaDB client and collection, loading the CSV if the collection is empty.

    Args:
        collection_name: Name of the collection to get or create.
        model_name: Sentence-Transformers model used by the collection's
            embedding function.

    Returns:
        The collection (possibly empty when the CSV holds no valid rows), or
        ``None`` when the CSV cannot be read or any setup step raises. Errors
        are reported through ``st.error`` rather than propagated.

    NOTE(review): the function is wrapped in ``st.cache_resource``, so a
    ``None`` result presumably stays cached until the cache is cleared or the
    app restarts - confirm this is acceptable for transient failures.
    """
    print("Setting up ChromaDB collection...")
    try:
        # Import the submodule explicitly rather than relying on it being
        # reachable as an attribute of the top-level `chromadb` package.
        from chromadb.utils import embedding_functions

        # Using an in-memory client suitable for Streamlit sharing / HF Spaces
        client = chromadb.Client()

        # The embedding function lets Chroma embed documents and queries itself.
        embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=model_name)

        collection = client.get_or_create_collection(
            name=collection_name,
            embedding_function=embedding_function
            # metadata={"hnsw:space": "cosine"} # Optional: ensure cosine distance
        )
        print(f"ChromaDB collection '{collection_name}' retrieved/created.")

        # Load and preprocess data only if collection is empty
        if collection.count() == 0:
            print("Collection is empty. Loading data from CSV...")
            try:
                raw_df = pd.read_csv(CSV_FILE)
            except FileNotFoundError:
                st.error(f"Error: Data file '{CSV_FILE}' not found. Make sure it's in the same directory as app.py.")
                return None
            except Exception as e:
                st.error(f"Error reading CSV file: {e}")
                return None

            ids, documents, metadatas = _build_chroma_records(_clean_assessment_frame(raw_df))

            if not ids:
                st.warning("No valid data found in the CSV to add to the database.")
                return collection  # Return empty collection

            print(f"Adding {len(ids)} items to the collection...")
            # Add data in batches if necessary (though for this size, one go is fine)
            batch_size = 100
            for start in range(0, len(ids), batch_size):
                stop = start + batch_size
                collection.add(
                    ids=ids[start:stop],
                    documents=documents[start:stop],
                    metadatas=metadatas[start:stop]
                )
            print("Data added successfully.")

        print(f"ChromaDB setup complete. Collection size: {collection.count()}")
        return collection

    except Exception as e:
        st.error(f"Error setting up ChromaDB: {e}")
        print(f"!!! Error setting up ChromaDB: {e}")  # Also print to console
        return None


# --- Query Function ---
def _decode_test_types(raw_json, item_id=None):
    """Decode the JSON test-type list stored in metadata; return [] for anything malformed."""
    try:
        decoded = json.loads(raw_json)
    except (TypeError, ValueError):
        # TypeError covers non-string values such as None; JSONDecodeError is a ValueError.
        print(f"Warning: Could not parse test_type_json for ID {item_id}: {raw_json}")
        return []
    return decoded if isinstance(decoded, list) else []


def _format_assessment(meta, item_id=None):
    """Shape one metadata dict into the API-spec result dict (plus 'name' for display)."""
    meta = meta or {}
    try:
        duration = int(meta.get('duration') or 0)
    except (TypeError, ValueError, OverflowError):
        duration = 0
    return {
        "url": meta.get('url', ''),
        "adaptive_support": meta.get('adaptive_support', 'No'),
        "description": meta.get('description', 'No description available.'),
        "duration": duration,
        "remote_support": meta.get('remote_support', 'No'),
        "test_type": _decode_test_types(meta.get('test_type_json', '[]'), item_id),
        # Include name for display purposes in Streamlit
        "name": meta.get('name', 'Unknown Assessment'),
    }


def get_recommendations_from_chroma(query_text, collection, top_n=10):
    """Query the ChromaDB collection and format the hits according to the API spec.

    Args:
        query_text: Free-text query or job description to embed and search.
        collection: ChromaDB collection (or ``None`` if setup failed).
        top_n: Maximum number of recommendations to return.

    Returns:
        ``{"recommended_assessments": [...]}`` where each entry has the keys
        ``url``, ``adaptive_support``, ``description``, ``duration``,
        ``remote_support``, ``test_type`` and ``name``. The list is empty when
        the collection is missing/empty, ``top_n`` is not positive, or the
        query itself fails. If the query yields no usable hit, the first item
        returned by ``collection.peek`` is used as a fallback.
    """
    if collection is None:
        print("Collection is not available or empty.")
        return {"recommended_assessments": []}

    total = collection.count()
    if total == 0:
        print("Collection is not available or empty.")
        return {"recommended_assessments": []}

    # A non-positive n_results is not a valid query, so answer directly.
    if top_n <= 0:
        return {"recommended_assessments": []}

    try:
        results = collection.query(
            query_texts=[query_text],
            n_results=min(top_n * 2, total),  # Retrieve more initially for potential filtering
            include=['metadatas', 'distances']
        )
    except Exception as e:
        st.error(f"Error querying ChromaDB: {e}")
        print(f"!!! Error querying ChromaDB: {e}")
        return {"recommended_assessments": []}

    results = results or {}
    # Query results are nested one list per query text; only one query was sent.
    hit_ids = (results.get('ids') or [[]])[0] or []
    hit_metas = (results.get('metadatas') or [[]])[0] or []

    recommendations = []
    seen_urls = set()  # Avoid duplicates if any slipped through
    for item_id, meta in zip(hit_ids, hit_metas):
        if len(recommendations) >= top_n:  # Stop once we have enough
            break
        url = (meta or {}).get('url', '')
        if not url or url in seen_urls:
            continue
        seen_urls.add(url)
        recommendations.append(_format_assessment(meta, item_id))

    # Ensure minimum 1 result if possible
    if not recommendations:
        print("Query returned no results, attempting fallback peek...")
        try:
            fallback = collection.peek(limit=1) or {}  # Get the 'first' item
            fallback_ids = fallback.get('ids') or []
            fallback_metas = fallback.get('metadatas') or []
            if fallback_ids and fallback_metas:
                recommendations.append(_format_assessment(fallback_metas[0], fallback_ids[0]))
        except Exception as fb_e:
            print(f"Error during fallback peek: {fb_e}")

    return {"recommended_assessments": recommendations}


# --- Streamlit App UI ---


st.title("🚀 SHL Assessment Recommendation System")
st.markdown("Enter a natural language query or job description text to find relevant SHL assessments.")

# Load the collection (cached). The embedding model is used implicitly through
# the collection's embedding function, so it is not loaded separately here.
collection = setup_chroma_collection()

# User Input
query = st.text_area("Enter your query or job description:", height=150)

# Search Button
search_button = st.button("Find Assessments")

if search_button:
    # Whitespace-only input counts as empty so it is never sent to the embedder.
    cleaned_query = query.strip() if query else ""

    if not cleaned_query:
        st.warning("Please enter a query.")
    elif collection is None:
        st.error("Database collection could not be loaded. Please check logs.")
    else:
        with st.spinner("Searching for relevant assessments..."):
            results_data = get_recommendations_from_chroma(cleaned_query, collection, top_n=10)
            recommendations = results_data.get("recommended_assessments", [])

        if recommendations:
            # The heading is only shown when there is something to list, so an
            # empty result no longer renders "Top 0 Recommendations:".
            st.subheader(f"Top {len(recommendations)} Recommendations:")

            for position, rec in enumerate(recommendations, start=1):
                st.markdown("---")
                st.markdown(f"**{position}. {rec.get('name', 'N/A')}**")
                st.markdown(f"**URL:** [{rec.get('url')}]({rec.get('url')})")
                st.markdown(f"**Description:** {rec.get('description')}")
                col1, col2, col3 = st.columns(3)
                with col1:
                    st.markdown(f"**Duration:** {rec.get('duration', 'N/A')} min")
                with col2:
                    st.markdown(f"**Remote Support:** {rec.get('remote_support', 'N/A')}")
                with col3:
                    st.markdown(f"**Adaptive/IRT:** {rec.get('adaptive_support', 'N/A')}")

                # Display test types as a comma-separated string; str() guards
                # against non-string entries in the decoded JSON list.
                test_types_str = ", ".join(str(t) for t in (rec.get('test_type') or []))
                st.markdown(f"**Test Type(s):** {test_types_str if test_types_str else 'N/A'}")
        else:
            st.warning("No relevant assessments found for your query.")