# cv_match.py — Krepselis: "Update cv_match.py" (commit 784ffdd, verified)
import numpy as np
import time
import os
import traceback
import chromadb # Ensure this is chromadb-client if that's what you installed
from dotenv import load_dotenv
from chromadb import HttpClient, Settings as ChromaSettings
import requests
import urllib3
import json
# Load config from a .env file into the process environment.
load_dotenv()

# Constants
# Default number of job matches returned when the caller does not pass top_n.
TOP_N_RESULTS_DEFAULT = int(os.getenv('TOP_N_RESULTS', '20'))
CHROMA_HOST = os.getenv('CHROMA_HOST')
CHROMA_PORT_STR = os.getenv('CHROMA_PORT')
COLLECTION_NAME = os.getenv('CHROMA_COLLECTION')
# Optional remote embedding endpoint; when empty, a local SentenceTransformer is used instead.
EMBEDDING_API_URL = os.getenv('EMBEDDING_API_URL', '')
VERIFY_SSL_STR = os.getenv('VERIFY_SSL', 'true')
# How many top contributing CV skills to report in match explanations.
EXPLAIN_TOP_N_CONTRIBUTING_SKILLS = int(os.getenv('EXPLAIN_TOP_N_CONTRIBUTING_SKILLS', 3))
MODEL_NAME_FOR_EMBEDDING = os.getenv('MODEL_NAME', 'paraphrase-multilingual-mpnet-base-v2')

# Parse the ChromaDB port, falling back to 8000 on a missing or malformed value.
try:
    CHROMA_PORT = int(CHROMA_PORT_STR) if CHROMA_PORT_STR else 8000
except ValueError:
    print(f"Warning: Invalid CHROMA_PORT '{CHROMA_PORT_STR}', defaulting to 8000.")
    CHROMA_PORT = 8000

VERIFY_SSL = VERIFY_SSL_STR.lower() == 'true'
# Silence urllib3's insecure-request warnings only when SSL verification is deliberately off.
if not VERIFY_SSL and EMBEDDING_API_URL:
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Lazily-populated cache for the local model; the string "error" marks a failed load.
_embedding_model_cache = None
def get_embedding_model():
    """Lazily load and cache the local SentenceTransformer model.

    Returns:
        The cached model instance, or None when the sentence_transformers
        library is missing or the model failed to load. A failure is cached
        (as the sentinel string "error") so loading is never re-attempted.
    """
    global _embedding_model_cache
    if _embedding_model_cache is not None:
        # Already attempted: hand back the model, or None if the attempt failed.
        return None if _embedding_model_cache == "error" else _embedding_model_cache
    try:
        from sentence_transformers import SentenceTransformer
        print(f"Loading local sentence transformer model: {MODEL_NAME_FOR_EMBEDDING}")
        _embedding_model_cache = SentenceTransformer(MODEL_NAME_FOR_EMBEDDING)
        print("Local embedding model loaded.")
        return _embedding_model_cache
    except ImportError:
        print("Warning: sentence_transformers library not found. Local embedding will not work.")
        _embedding_model_cache = "error"  # sentinel: do not retry
    except Exception as e:
        print(f"Error loading local SentenceTransformer model: {e}")
        _embedding_model_cache = "error"  # sentinel: do not retry
    return None
def get_remote_embedding_batch(texts_batch):
    """Embed a batch of texts through the remote embedding API.

    Args:
        texts_batch: list of strings to embed.

    Returns:
        A list aligned with texts_batch containing np.ndarray embeddings
        (None for entries the API returned as null), or a list of Nones of
        the same length when the API is unconfigured, returns a mismatched
        count, or the request fails.
    """
    if not EMBEDDING_API_URL:
        print("Error: EMBEDDING_API_URL not configured for remote embedding.")
        return [None] * len(texts_batch)
    if not texts_batch:
        return []
    all_failed = [None] * len(texts_batch)
    try:
        print(f"Calling remote embedding API for {len(texts_batch)} texts...")
        response = requests.post(EMBEDDING_API_URL, json={"texts": texts_batch}, verify=VERIFY_SSL, timeout=60)
        response.raise_for_status()
        embeddings_data = response.json().get("embeddings", [])
        if len(embeddings_data) != len(texts_batch):
            print(f"Warning: Mismatch in remote embeddings count. Expected {len(texts_batch)}, got {len(embeddings_data if embeddings_data is not None else [])}.")
            return all_failed
        return [np.array(emb) if emb is not None else None for emb in embeddings_data]
    except Exception as e:
        print(f"Error calling remote embedding API: {e}")
        traceback.print_exc()
        return all_failed
def generate_embedding_for_skills(skills_list):  # Generates ONE combined embedding for a list of skills
    """Build a single combined embedding for a list of skill strings.

    Individual skill embeddings (remote API if configured, otherwise the
    local model) are mean-pooled and L2-normalized.

    Args:
        skills_list: list of skill strings; non-strings and blanks are dropped.

    Returns:
        The combined embedding as a plain Python list of floats, or None
        when no usable skill embedding could be produced.
    """
    if not skills_list or not isinstance(skills_list, list):
        print("Warning: No skills provided or not a list, returning None for combined embedding.")
        return None
    valid_skills = [s for s in skills_list if isinstance(s, str) and s.strip()]
    if not valid_skills:
        print("Warning: No valid skills after stripping, returning None for combined embedding.")
        return None

    skill_vectors = []
    if EMBEDDING_API_URL:
        print(f"Attempting remote embedding for {len(valid_skills)} skills to generate combined embedding.")
        # Remote path: batch everything; drop entries the API failed on.
        skill_vectors = [vec for vec in get_remote_embedding_batch(valid_skills) if vec is not None]
    else:
        print("EMBEDDING_API_URL not set. Attempting local embedding for combined embedding.")
        model = get_embedding_model()
        if not model:
            print("Local embedding model not available. Cannot generate combined skill embedding.")
            return None
        try:
            encoded = model.encode(valid_skills, show_progress_bar=False)
            skill_vectors = [np.array(vec) for vec in encoded]
        except Exception as e:
            print(f"Error during local batch skill embedding for combined vector: {e}")

    if not skill_vectors:
        print("No valid skill embeddings were generated for combined vector.")
        return None
    pooled = np.mean(skill_vectors, axis=0)
    norm = np.linalg.norm(pooled)
    # Normalize to unit length unless the mean vector is all zeros.
    return (pooled / norm if norm > 0 else pooled).tolist()
def cosine_similarity_np(vec1, vec2):
    """Cosine similarity of two array-likes.

    Returns 0.0 when either input is None, shapes differ, or either vector
    is all zeros (avoids division by a zero norm).
    """
    if vec1 is None or vec2 is None:
        return 0.0
    a = np.array(vec1)
    b = np.array(vec2)
    if a.shape != b.shape:  # basic shape sanity check
        print(f"Warning: Cosine similarity shape mismatch: {a.shape} vs {b.shape}")
        return 0.0
    if np.all(a == 0) or np.all(b == 0):
        return 0.0  # zero vectors have no meaningful direction
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return np.dot(a, b) / (norm_a * norm_b)
# --- NEW FUNCTION FROM trifon-cv_match.py ---
def calculate_enhanced_job_match_score(cv_skills, job_skills, cv_skill_embeddings_dict, job_embedding):
    """Blend textual and semantic skill matching into a 0-100 job score.

    Combines three components: exact/partial string matches between CV and
    job skills (weight 0.5), mean similarity of the top CV skill embeddings
    to the job embedding (0.3), and per-job-skill coverage by the CV (0.2).

    Args:
        cv_skills: list of CV skill strings (original casing).
        job_skills: list of job skill strings from the job's metadata.
        cv_skill_embeddings_dict: {skill_text: np.ndarray or None} of
            pre-computed embeddings for the CV skills.
        job_embedding: the job's stored embedding (array-like) or None.

    Returns:
        (score, details): score in [0.0, 100.0] and a list of match-detail
        dicts ({'cv_skill', 'job_skill', 'match_type', 'score'}) sorted by
        score then match type, descending.
    """
    if not cv_skills or not job_skills or not cv_skill_embeddings_dict or job_embedding is None:
        return 0.0, []

    cv_skills_lower = [s.lower().strip() for s in cv_skills if isinstance(s, str)]
    job_skills_lower = [s.lower().strip() for s in job_skills if isinstance(s, str)]

    # 1) Exact and partial (substring) textual matches, keeping original casing
    #    for the reported details.
    exact_matches = []
    for cv_skill_orig, cv_skill_lower in zip(cv_skills, cv_skills_lower):
        if not cv_skill_lower:
            continue
        for job_skill_orig, job_skill_lower in zip(job_skills, job_skills_lower):
            if not job_skill_lower:
                continue
            if cv_skill_lower == job_skill_lower:
                exact_matches.append((cv_skill_orig, job_skill_orig, 1.0))
            elif cv_skill_lower in job_skill_lower or job_skill_lower in cv_skill_lower:
                # Count substring overlap only when the lengths are close enough
                # that the shorter term plausibly names the same skill.
                len_ratio = min(len(cv_skill_lower), len(job_skill_lower)) / max(len(cv_skill_lower), len(job_skill_lower))
                if len_ratio >= 0.7:
                    exact_matches.append((cv_skill_orig, job_skill_orig, 0.9 * len_ratio))

    # 2) Semantic similarity of each embedded CV skill to the job embedding.
    job_emb_np = np.array(job_embedding)
    skill_similarities = []
    for skill_text, skill_vec_np in cv_skill_embeddings_dict.items():
        if skill_vec_np is not None:
            skill_similarities.append((skill_text, cosine_similarity_np(skill_vec_np, job_emb_np)))

    # 3) Per-job-skill coverage: the best alignment any CV skill offers.
    #    Textual overlap scores a fixed 0.85; otherwise the CV skill's
    #    similarity to the overall job embedding (halved) serves as a proxy.
    #    NOTE(review): a stronger variant would embed each job skill and
    #    compare it to the CV skill embeddings directly.
    job_skill_alignment_scores = []
    for job_skill_text in job_skills:
        if not isinstance(job_skill_text, str) or not job_skill_text.strip():
            continue
        best_alignment = 0
        for cv_skill_text, cv_emb_np in cv_skill_embeddings_dict.items():
            if cv_emb_np is None:
                continue
            if job_skill_text.lower() in cv_skill_text.lower() or cv_skill_text.lower() in job_skill_text.lower():
                alignment = 0.85  # textual overlap implies direct relevance
            else:
                alignment = cosine_similarity_np(cv_emb_np, job_emb_np) * 0.5
            best_alignment = max(best_alignment, alignment)
        job_skill_alignment_scores.append((job_skill_text, best_alignment))

    # Component scores (each in roughly [0, 1]).
    num_job_skills_val = max(1, len(job_skills_lower))
    exact_match_score_val = sum(score for _, _, score in exact_matches) / num_job_skills_val
    if skill_similarities:
        skill_similarities.sort(key=lambda x: x[1], reverse=True)
        top_count = max(1, min(5, len(skill_similarities)))
        semantic_score_val = sum(sim for _, sim in skill_similarities[:top_count]) / top_count
    else:
        semantic_score_val = 0
    if job_skill_alignment_scores:
        job_alignment_score_val = sum(score for _, score in job_skill_alignment_scores) / len(job_skill_alignment_scores)
    else:
        job_alignment_score_val = 0

    # Weighted blend, scaled and capped to 0-100.
    enhanced_score_val = (exact_match_score_val * 0.5) + (semantic_score_val * 0.3) + (job_alignment_score_val * 0.2)
    final_score_val = min(100.0, enhanced_score_val * 100.0)

    # Human-readable match details: textual matches first, then the top
    # semantic contributors not already covered by a textual match.
    match_details_list = []
    for cv_s, job_s, m_score in exact_matches:
        match_details_list.append({'cv_skill': cv_s, 'job_skill': job_s, 'match_type': 'exact' if m_score == 1.0 else 'partial', 'score': round(m_score, 3)})
    if skill_similarities:
        for s_text, s_sim in skill_similarities[:EXPLAIN_TOP_N_CONTRIBUTING_SKILLS]:
            if not any(d['cv_skill'] == s_text and d['match_type'] != 'alignment' for d in match_details_list):
                match_details_list.append({'cv_skill': s_text, 'job_skill': None, 'match_type': 'semantic_to_job', 'score': round(s_sim, 3)})
    # Sort details for consistency: by score, then by match type.
    match_details_list.sort(key=lambda x: (x.get('score', 0), x.get('match_type', '')), reverse=True)
    return final_score_val, match_details_list
def explain_job(cv_skills, job_skills_from_meta_unused, cv_skill_embeddings_dict, job_embedding_from_chroma):
    """Rank CV skills by similarity to the job's overall embedding.

    Args:
        cv_skills: list of CV skill strings to explain.
        job_skills_from_meta_unused: kept for interface compatibility; not used.
        cv_skill_embeddings_dict: {skill_text: np.ndarray} pre-computed embeddings.
        job_embedding_from_chroma: the job's stored embedding (sized sequence) or None.

    Returns:
        Up to EXPLAIN_TOP_N_CONTRIBUTING_SKILLS (skill, similarity) tuples,
        highest similarity first; [] when inputs are missing or no CV skill
        has a pre-computed embedding.
    """
    has_job_embedding = (
        job_embedding_from_chroma is not None
        and hasattr(job_embedding_from_chroma, '__len__')
        and len(job_embedding_from_chroma) > 0
    )
    if not cv_skills or not has_job_embedding or not cv_skill_embeddings_dict:
        return []

    job_emb_np = np.array(job_embedding_from_chroma)
    contributions = []
    for skill in cv_skills:
        vector = cv_skill_embeddings_dict.get(skill)
        if vector is not None:
            contributions.append((skill, cosine_similarity_np(vector, job_emb_np)))
    if not contributions:
        # No CV skill had a usable pre-computed embedding.
        return []
    contributions.sort(key=lambda item: item[1], reverse=True)
    return contributions[:EXPLAIN_TOP_N_CONTRIBUTING_SKILLS]
def find_similar_jobs(cv_skills, cv_embedding, top_n=None, active_only=True, use_enhanced_scoring=True):
    """Query ChromaDB for jobs similar to the CV and score/annotate each hit.

    Args:
        cv_skills: list of CV skill strings.
        cv_embedding: combined CV embedding (list or array-like).
        top_n: number of results to request; defaults to TOP_N_RESULTS_DEFAULT.
        active_only: when True, restrict the query to metadata Status == "active".
        use_enhanced_scoring: when True, blend the vector-distance score with
            calculate_enhanced_job_match_score (70% enhanced / 30% distance).

    Returns:
        (matches, status_message): matches is a list of dicts (all metadata
        keys lower-cased, plus chroma_id/score/document/job_skills/
        contributing_skills/match_details and fallback title/company/etc.);
        status_message describes the search mode or why the search failed.
    """
    if top_n is None: top_n = TOP_N_RESULTS_DEFAULT
    if cv_embedding is None or not cv_skills:
        print("Error: CV embedding or CV skills list is missing.")
        return [], "CV Embedding or Skills missing."
    # --- Pre-calculate individual CV skill embeddings ONCE ---
    cv_individual_skill_embeddings_dict = {}
    valid_cv_skills_for_embedding = [s for s in cv_skills if isinstance(s, str) and s.strip()]
    if valid_cv_skills_for_embedding:
        print(f"Pre-calculating embeddings for {len(valid_cv_skills_for_embedding)} CV skills...")
        batch_cv_skill_vectors = []
        if EMBEDDING_API_URL:
            batch_cv_skill_vectors = get_remote_embedding_batch(valid_cv_skills_for_embedding)
        else:
            model = get_embedding_model()
            if model:
                try:
                    raw_embeddings = model.encode(valid_cv_skills_for_embedding, show_progress_bar=False)
                    batch_cv_skill_vectors = [np.array(e) if e is not None else None for e in raw_embeddings]
                except Exception as e_emb:
                    print(f"Error embedding individual CV skills locally: {e_emb}")
                    batch_cv_skill_vectors = [None] * len(valid_cv_skills_for_embedding)
            else: # No local model available
                batch_cv_skill_vectors = [None] * len(valid_cv_skills_for_embedding)
        # Keep only the skills whose embedding actually succeeded.
        for i, skill_text in enumerate(valid_cv_skills_for_embedding):
            if i < len(batch_cv_skill_vectors) and batch_cv_skill_vectors[i] is not None:
                cv_individual_skill_embeddings_dict[skill_text] = batch_cv_skill_vectors[i]
        print(f"Successfully pre-calculated {len(cv_individual_skill_embeddings_dict)} individual CV skill embeddings.")
    # --- End Pre-calculation ---
    if not all([CHROMA_HOST, CHROMA_PORT_STR, COLLECTION_NAME]):
        return [], "ChromaDB connection details (host, port, collection) are not fully configured."
    try:
        chroma_client = HttpClient(host=CHROMA_HOST, port=CHROMA_PORT, settings=ChromaSettings(anonymized_telemetry=False))
        collection = chroma_client.get_collection(COLLECTION_NAME)
        print(f"Connected to ChromaDB collection: {COLLECTION_NAME}")
    except Exception as conn_err:
        return [], f"Failed to get ChromaDB collection '{COLLECTION_NAME}': {conn_err}"
    where_clause = {"Status": "active"} if active_only else None
    if active_only: print("Filtering for active jobs only in ChromaDB.")
    query_embedding_list = cv_embedding if isinstance(cv_embedding, list) else np.array(cv_embedding).tolist()
    print(f"Querying ChromaDB with top_n={top_n} using combined CV embedding...")
    results = collection.query(
        query_embeddings=[query_embedding_list], n_results=top_n,
        include=["metadatas", "distances", "documents", "embeddings"], where=where_clause
    )
    matches = []
    # Chroma returns parallel per-query lists; we sent one query, so index [0].
    if not (results and results.get('ids') and results['ids'] and isinstance(results['ids'][0], list)):
        print("No results or unexpected format from ChromaDB query. 'results.ids[0]' might be empty or not a list.")
        return [], "No results from ChromaDB or unexpected format."
    num_results = len(results['ids'][0])
    print(f"ChromaDB returned {num_results} raw results based on combined CV embedding.")
    # Pull the first query's result lists, padding with defaults when a field is absent.
    distances_list = results.get('distances', [[]])[0] if results.get('distances') and results['distances'] else [None] * num_results
    metadatas_list = results.get('metadatas', [[]])[0] if results.get('metadatas') and results['metadatas'] else [{}] * num_results
    documents_list = results.get('documents', [[]])[0] if results.get('documents') and results['documents'] else [""] * num_results
    job_embeddings_list = results.get('embeddings', [[]])[0] if results.get('embeddings') and results['embeddings'] else [None] * num_results
    for i in range(num_results):
        chroma_db_id = results['ids'][0][i]
        distance = distances_list[i] if i < len(distances_list) and distances_list[i] is not None else 1.0 # Default to max distance
        metadata_item = metadatas_list[i] if i < len(metadatas_list) and isinstance(metadatas_list[i], dict) else {}
        document_text_item = documents_list[i] if i < len(documents_list) and isinstance(documents_list[i], str) else ""
        job_embedding_item_raw = job_embeddings_list[i] if i < len(job_embeddings_list) and job_embeddings_list[i] is not None else None
        job_embedding_for_match = np.array(job_embedding_item_raw) if job_embedding_item_raw is not None else None
        job_embedding_valid_for_match = (job_embedding_for_match is not None and job_embedding_for_match.size > 0)
        # Initial score based on vector distance
        clamped_distance = min(max(float(distance), 0.0), 2.0) # Chroma cosine is 1-sim, so distance can be > 1
        original_similarity_score = (1.0 - (clamped_distance / 2.0)) * 100.0 # Normalize to 0-100
        current_score = original_similarity_score
        match_details_for_job = []
        # Job skills are stored as a JSON-encoded string in metadata.
        job_skills_from_meta_str = metadata_item.get('Skills_Json_Str', '[]')
        try:
            job_skills_list_from_meta = json.loads(job_skills_from_meta_str) if isinstance(job_skills_from_meta_str, str) else []
        except json.JSONDecodeError: job_skills_list_from_meta = []
        if use_enhanced_scoring and job_embedding_valid_for_match and job_skills_list_from_meta and cv_individual_skill_embeddings_dict:
            enhanced_score_val, details_from_enh_score = calculate_enhanced_job_match_score(
                cv_skills, job_skills_list_from_meta, cv_individual_skill_embeddings_dict, job_embedding_for_match
            )
            # Blending: more weight to enhanced if it's substantial
            blend_weight_enhanced = 0.7
            current_score = (enhanced_score_val * blend_weight_enhanced) + (original_similarity_score * (1.0 - blend_weight_enhanced))
            current_score = max(0.0, min(100.0, current_score)) # Ensure score is within 0-100
            match_details_for_job.extend(details_from_enh_score)
        contributing_cv_skills_list = []
        if job_embedding_valid_for_match and cv_individual_skill_embeddings_dict:
            contributing_cv_skills_list = explain_job(cv_skills, job_skills_list_from_meta, cv_individual_skill_embeddings_dict, job_embedding_for_match)
        # --- Construct the final match dictionary with lowercase keys from metadata ---
        match_data = {key.lower(): value for key, value in metadata_item.items()} # Convert all metadata keys to lowercase
        match_data.update({
            "chroma_id": chroma_db_id, # Keep this as chroma_id
            "score": current_score,
            "document": document_text_item, # Standardize to 'document'
            "job_skills": job_skills_list_from_meta,
            "contributing_skills": contributing_cv_skills_list,
            "match_details": match_details_for_job, # Add the new details
            # Ensure essential fields are present, falling back to 'N/A' or sensible defaults
            "title": metadata_item.get('Title', 'N/A'),
            "company": metadata_item.get('Company', 'N/A'),
            "area": metadata_item.get('Area', 'N/A'),
            "category": metadata_item.get('Category', 'N/A'),
            "url": metadata_item.get('Application_URL', metadata_item.get('URL', '#')), # Prefer Application_URL
            "status": metadata_item.get('Status', 'unknown'),
            # Make sure keys like 'job_id' are also included if they exist in metadata_item
            "job_id": metadata_item.get('job_id', chroma_db_id) # Fallback to chroma_id if job_id not in meta
        })
        matches.append(match_data)
    print(f"Processed {len(matches)} matches from ChromaDB results.")
    return matches, "ChromaDB Vector Search with Enhanced Scoring" if use_enhanced_scoring else "ChromaDB Vector Search"
if __name__ == "__main__":
    # Smoke test for direct execution; requires the full .env configuration.
    print("cv_match.py (alex version with trifon's enhancements) loaded for direct execution test.")
    if not all([EMBEDDING_API_URL, CHROMA_HOST, CHROMA_PORT_STR, COLLECTION_NAME]):
        print("Skipping test: Missing one or more required .env variables for full test.")
    else:
        sample_cv_skills_list = ["Python", "machine learning", "data analysis", "communication", "projektledelse", "SQL", "Django", "AWS"]
        print(f"Test CV Skills: {sample_cv_skills_list}")
        # Build the combined CV embedding, then run the end-to-end search.
        test_cv_skill_embedding = generate_embedding_for_skills(sample_cv_skills_list)
        if test_cv_skill_embedding:
            print(f"Generated test CV combined embedding (first 5 dims): {np.array(test_cv_skill_embedding[:5])}")
            job_matches_found, message_status = find_similar_jobs(
                cv_skills=sample_cv_skills_list,
                cv_embedding=test_cv_skill_embedding,
                top_n=5,
                use_enhanced_scoring=True # Test with enhanced scoring
            )
            print(f"Search Status: {message_status}")
            if job_matches_found:
                # Pretty-print each match with its score and explanation details.
                print(f"Found {len(job_matches_found)} job matches:")
                for i_match, match_item in enumerate(job_matches_found):
                    print(f"\n {i_match+1}. Title: {match_item.get('title', 'N/A')}")
                    print(f" Score: {match_item.get('score', 0.0):.2f}% (Chroma ID: {match_item.get('chroma_id')})")
                    print(f" Company: {match_item.get('company', 'N/A')}, Area: {match_item.get('area', 'N/A')}")
                    print(f" Job Skills: {match_item.get('job_skills', [])[:5]}...")
                    print(f" Contributing CV Skills (Top): {[(s_item[0], round(s_item[1], 3)) for s_item in match_item.get('contributing_skills', [])]}")
                    if match_item.get('match_details'):
                        print(f" Match Details (Top 3):")
                        for detail_idx, detail in enumerate(match_item['match_details'][:3]):
                            print(f" - Type: {detail.get('match_type')}, CV: {detail.get('cv_skill')}, Job: {detail.get('job_skill')}, Score: {detail.get('score')}")
            else: print("No job matches found for the test CV skills.")
        else: print("Failed to generate combined embedding for test CV skills.")