"""cv_match: match CV skills against job postings stored in ChromaDB.

Builds a combined embedding for a CV's skills (remote API or local
sentence-transformers model), queries ChromaDB for similar jobs, and
optionally blends in an enhanced, skill-level match score.
"""
import numpy as np
import os
import traceback
import json

import requests
import urllib3
from dotenv import load_dotenv
from chromadb import HttpClient, Settings as ChromaSettings

load_dotenv()

# Configuration (all values come from the environment, with fallbacks).
TOP_N_RESULTS_DEFAULT = int(os.getenv('TOP_N_RESULTS', '20'))
CHROMA_HOST = os.getenv('CHROMA_HOST')
CHROMA_PORT_STR = os.getenv('CHROMA_PORT')
COLLECTION_NAME = os.getenv('CHROMA_COLLECTION')
EMBEDDING_API_URL = os.getenv('EMBEDDING_API_URL', '')
VERIFY_SSL_STR = os.getenv('VERIFY_SSL', 'true')
EXPLAIN_TOP_N_CONTRIBUTING_SKILLS = int(os.getenv('EXPLAIN_TOP_N_CONTRIBUTING_SKILLS', '3'))
MODEL_NAME_FOR_EMBEDDING = os.getenv('MODEL_NAME', 'paraphrase-multilingual-mpnet-base-v2')
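
# Example .env (illustrative values only; adapt to your deployment):
#   CHROMA_HOST=localhost
#   CHROMA_PORT=8000
#   CHROMA_COLLECTION=jobs
#   EMBEDDING_API_URL=https://embeddings.example.com/embed
#   VERIFY_SSL=false
#   TOP_N_RESULTS=20
#   MODEL_NAME=paraphrase-multilingual-mpnet-base-v2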

try:
    CHROMA_PORT = int(CHROMA_PORT_STR) if CHROMA_PORT_STR else 8000
except ValueError:
    print(f"Warning: Invalid CHROMA_PORT '{CHROMA_PORT_STR}', defaulting to 8000.")
    CHROMA_PORT = 8000

VERIFY_SSL = VERIFY_SSL_STR.lower() == 'true'
if not VERIFY_SSL and EMBEDDING_API_URL:
    # Suppress noisy warnings when the user has explicitly opted out of SSL verification.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

_embedding_model_cache = None


def get_embedding_model():
    """Lazily load and cache the local SentenceTransformer model.

    Returns the model instance, or None if the library is missing or loading failed.
    """
    global _embedding_model_cache
    if _embedding_model_cache is None:
        try:
            from sentence_transformers import SentenceTransformer
            print(f"Loading local sentence transformer model: {MODEL_NAME_FOR_EMBEDDING}")
            _embedding_model_cache = SentenceTransformer(MODEL_NAME_FOR_EMBEDDING)
            print("Local embedding model loaded.")
        except ImportError:
            print("Warning: sentence_transformers library not found. Local embedding will not work.")
            _embedding_model_cache = "error"  # sentinel so we do not retry the import on every call
        except Exception as e:
            print(f"Error loading local SentenceTransformer model: {e}")
            _embedding_model_cache = "error"
    return _embedding_model_cache if _embedding_model_cache != "error" else None

def get_remote_embedding_batch(texts_batch):
    """Embed a batch of texts via the remote embedding API.

    Returns a list aligned with texts_batch; items that failed are None.
    """
    if not EMBEDDING_API_URL:
        print("Error: EMBEDDING_API_URL not configured for remote embedding.")
        return [None] * len(texts_batch)
    if not texts_batch:
        return []
    try:
        print(f"Calling remote embedding API for {len(texts_batch)} texts...")
        response = requests.post(EMBEDDING_API_URL, json={"texts": texts_batch}, verify=VERIFY_SSL, timeout=60)
        response.raise_for_status()
        # Guard against a null or missing "embeddings" field so len() below cannot crash.
        embeddings_data = response.json().get("embeddings") or []
        if len(embeddings_data) == len(texts_batch):
            return [np.array(emb) if emb is not None else None for emb in embeddings_data]
        print(f"Warning: Mismatch in remote embeddings count. Expected {len(texts_batch)}, got {len(embeddings_data)}.")
        return [None] * len(texts_batch)
    except Exception as e:
        print(f"Error calling remote embedding API: {e}")
        traceback.print_exc()
        return [None] * len(texts_batch)
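
# Wire format assumed by get_remote_embedding_batch (inferred from the calls
# above; your embedding service may differ):
#   request:  POST EMBEDDING_API_URL  {"texts": ["Python", "SQL"]}
#   response: {"embeddings": [[0.01, ...], [0.02, ...]]}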

def generate_embedding_for_skills(skills_list):
    """Embed each skill string and return a single L2-normalized mean vector.

    Uses the remote embedding API when configured, otherwise the local model.
    Returns a list[float], or None if no skill could be embedded.
    """
    if not skills_list or not isinstance(skills_list, list):
        print("Warning: No skills provided or not a list, returning None for combined embedding.")
        return None

    valid_skills = [s for s in skills_list if isinstance(s, str) and s.strip()]
    if not valid_skills:
        print("Warning: No valid skills after stripping, returning None for combined embedding.")
        return None

    skill_embeddings = []
    if EMBEDDING_API_URL:
        print(f"Attempting remote embedding for {len(valid_skills)} skills to generate combined embedding.")
        temp_embeddings = get_remote_embedding_batch(valid_skills)
        skill_embeddings = [emb for emb in temp_embeddings if emb is not None]
    else:
        print("EMBEDDING_API_URL not set. Attempting local embedding for combined embedding.")
        model = get_embedding_model()
        if model:
            try:
                raw_embeddings = model.encode(valid_skills, show_progress_bar=False)
                skill_embeddings = [np.array(emb) for emb in raw_embeddings]
            except Exception as e:
                print(f"Error during local batch skill embedding for combined vector: {e}")
        else:
            print("Local embedding model not available. Cannot generate combined skill embedding.")
            return None

    if not skill_embeddings:
        print("No valid skill embeddings were generated for combined vector.")
        return None

    # Average the per-skill vectors, then normalize so cosine comparisons are well-behaved.
    average_embedding = np.mean(skill_embeddings, axis=0)
    norm = np.linalg.norm(average_embedding)
    return (average_embedding / norm if norm > 0 else average_embedding).tolist()
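
# Example usage (illustrative; requires a reachable embedding backend):
#   combined = generate_embedding_for_skills(["Python", "SQL", "Django"])
#   # -> unit-length list[float], or None if no skill could be embedded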

def cosine_similarity_np(vec1, vec2):
    """Cosine similarity between two vectors; returns 0.0 on any degenerate input."""
    if vec1 is None or vec2 is None:
        return 0.0
    vec1_np, vec2_np = np.array(vec1), np.array(vec2)
    if vec1_np.shape != vec2_np.shape:
        print(f"Warning: Cosine similarity shape mismatch: {vec1_np.shape} vs {vec2_np.shape}")
        return 0.0
    norm_vec1, norm_vec2 = np.linalg.norm(vec1_np), np.linalg.norm(vec2_np)
    if norm_vec1 == 0 or norm_vec2 == 0:
        # Covers all-zero vectors, so no separate np.all(vec == 0) check is needed.
        return 0.0
    return float(np.dot(vec1_np, vec2_np) / (norm_vec1 * norm_vec2))
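
# Quick sanity checks (exact values, by definition of cosine similarity):
#   cosine_similarity_np([1, 0], [1, 0])  == 1.0
#   cosine_similarity_np([1, 0], [0, 1])  == 0.0
#   cosine_similarity_np([1, 0], [-1, 0]) == -1.0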

def calculate_enhanced_job_match_score(cv_skills, job_skills, cv_skill_embeddings_dict, job_embedding):
    """Blend exact/partial string matches, semantic similarity, and per-job-skill
    alignment into a 0-100 score, plus a list of match-detail dicts."""
    if not cv_skills or not job_skills or not cv_skill_embeddings_dict or job_embedding is None:
        return 0.0, []

    # Pair each original skill with its normalized form so the lists cannot fall
    # out of alignment when non-string entries are filtered out (zipping two
    # independently filtered lists, as before, could mispair originals).
    cv_pairs = [(s, s.lower().strip()) for s in cv_skills if isinstance(s, str)]
    job_pairs = [(s, s.lower().strip()) for s in job_skills if isinstance(s, str)]

    # 1) Exact and partial (substring) string matches.
    exact_matches = []
    for cv_skill_orig, cv_skill_lower in cv_pairs:
        if not cv_skill_lower:
            continue
        for job_skill_orig, job_skill_lower in job_pairs:
            if not job_skill_lower:
                continue
            if cv_skill_lower == job_skill_lower:
                exact_matches.append((cv_skill_orig, job_skill_orig, 1.0))
            elif cv_skill_lower in job_skill_lower or job_skill_lower in cv_skill_lower:
                len_ratio = min(len(cv_skill_lower), len(job_skill_lower)) / max(len(cv_skill_lower), len(job_skill_lower))
                if len_ratio >= 0.7:
                    exact_matches.append((cv_skill_orig, job_skill_orig, 0.9 * len_ratio))

    # 2) Semantic similarity of each CV skill to the whole job embedding.
    skill_similarities = []
    job_emb_np = np.array(job_embedding)
    for skill_text, skill_vec_np in cv_skill_embeddings_dict.items():
        if skill_vec_np is not None:
            skill_similarities.append((skill_text, cosine_similarity_np(skill_vec_np, job_emb_np)))

    # 3) For each job skill, how well the best CV skill aligns with it.
    #    Per-job-skill embeddings are not available here, so the fallback compares
    #    the CV skill against the whole job embedding, damped by 0.5.
    job_skill_alignment_scores = []
    for job_skill_text, job_skill_lower in job_pairs:
        if not job_skill_lower:
            continue
        best_alignment = 0.0
        for cv_skill_text, cv_emb_np in cv_skill_embeddings_dict.items():
            if cv_emb_np is None:
                continue
            if job_skill_lower in cv_skill_text.lower() or cv_skill_text.lower() in job_skill_lower:
                alignment = 0.85
            else:
                alignment = cosine_similarity_np(cv_emb_np, job_emb_np) * 0.5
            best_alignment = max(best_alignment, alignment)
        job_skill_alignment_scores.append((job_skill_text, best_alignment))

    # Component scores.
    num_job_skills_val = max(1, len(job_pairs))
    exact_match_score_val = sum(score for _, _, score in exact_matches) / num_job_skills_val

    if skill_similarities:
        skill_similarities.sort(key=lambda x: x[1], reverse=True)
        top_count = max(1, min(5, len(skill_similarities)))
        semantic_score_val = sum(sim for _, sim in skill_similarities[:top_count]) / top_count
    else:
        semantic_score_val = 0.0

    if job_skill_alignment_scores:
        job_alignment_score_val = sum(score for _, score in job_skill_alignment_scores) / len(job_skill_alignment_scores)
    else:
        job_alignment_score_val = 0.0

    # Weighted blend: exact matches dominate, then semantics, then alignment.
    enhanced_score_val = (exact_match_score_val * 0.5) + (semantic_score_val * 0.3) + (job_alignment_score_val * 0.2)
    final_score_val = min(100.0, enhanced_score_val * 100.0)

    # Build human-readable match details: string matches first, then the top
    # semantic contributors not already covered by a string match.
    match_details_list = []
    for cv_s, job_s, m_score in exact_matches:
        match_details_list.append({'cv_skill': cv_s, 'job_skill': job_s, 'match_type': 'exact' if m_score == 1.0 else 'partial', 'score': round(m_score, 3)})
    for s_text, s_sim in skill_similarities[:EXPLAIN_TOP_N_CONTRIBUTING_SKILLS]:
        if not any(d['cv_skill'] == s_text and d['match_type'] != 'alignment' for d in match_details_list):
            match_details_list.append({'cv_skill': s_text, 'job_skill': None, 'match_type': 'semantic_to_job', 'score': round(s_sim, 3)})

    match_details_list.sort(key=lambda x: (x.get('score', 0), x.get('match_type', '')), reverse=True)
    return final_score_val, match_details_list
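
# Worked example of the blend above (hypothetical component scores):
#   exact=0.60, semantic=0.40, alignment=0.50
#   enhanced = 0.60*0.5 + 0.40*0.3 + 0.50*0.2 = 0.52  -> final score 52.0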

def explain_job(cv_skills, job_skills_from_meta_unused, cv_skill_embeddings_dict, job_embedding_from_chroma):
    """Rank CV skills by cosine similarity to the job embedding and return the top N."""
    job_embedding_valid = (job_embedding_from_chroma is not None and hasattr(job_embedding_from_chroma, '__len__') and len(job_embedding_from_chroma) > 0)
    if not cv_skills or not job_embedding_valid or not cv_skill_embeddings_dict:
        return []

    job_emb_np = np.array(job_embedding_from_chroma)
    skill_contributions = []
    for skill_text in cv_skills:
        skill_vector_np = cv_skill_embeddings_dict.get(skill_text)
        if skill_vector_np is not None:
            skill_contributions.append((skill_text, cosine_similarity_np(skill_vector_np, job_emb_np)))

    if not skill_contributions:
        return []

    skill_contributions.sort(key=lambda x: x[1], reverse=True)
    return skill_contributions[:EXPLAIN_TOP_N_CONTRIBUTING_SKILLS]
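
# Example return value (illustrative):
#   [("machine learning", 0.78), ("Python", 0.71), ("SQL", 0.65)]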

def find_similar_jobs(cv_skills, cv_embedding, top_n=None, active_only=True, use_enhanced_scoring=True):
    """Query ChromaDB with the combined CV embedding and return (matches, status_message)."""
    if top_n is None:
        top_n = TOP_N_RESULTS_DEFAULT
    if cv_embedding is None or not cv_skills:
        print("Error: CV embedding or CV skills list is missing.")
        return [], "CV Embedding or Skills missing."

    # Pre-compute one embedding per CV skill; these drive enhanced scoring and explanations.
    cv_individual_skill_embeddings_dict = {}
    valid_cv_skills_for_embedding = [s for s in cv_skills if isinstance(s, str) and s.strip()]
    if valid_cv_skills_for_embedding:
        print(f"Pre-calculating embeddings for {len(valid_cv_skills_for_embedding)} CV skills...")
        batch_cv_skill_vectors = []
        if EMBEDDING_API_URL:
            batch_cv_skill_vectors = get_remote_embedding_batch(valid_cv_skills_for_embedding)
        else:
            model = get_embedding_model()
            if model:
                try:
                    raw_embeddings = model.encode(valid_cv_skills_for_embedding, show_progress_bar=False)
                    batch_cv_skill_vectors = [np.array(e) if e is not None else None for e in raw_embeddings]
                except Exception as e_emb:
                    print(f"Error embedding individual CV skills locally: {e_emb}")
                    batch_cv_skill_vectors = [None] * len(valid_cv_skills_for_embedding)
            else:
                batch_cv_skill_vectors = [None] * len(valid_cv_skills_for_embedding)

        for i, skill_text in enumerate(valid_cv_skills_for_embedding):
            if i < len(batch_cv_skill_vectors) and batch_cv_skill_vectors[i] is not None:
                cv_individual_skill_embeddings_dict[skill_text] = batch_cv_skill_vectors[i]
        print(f"Successfully pre-calculated {len(cv_individual_skill_embeddings_dict)} individual CV skill embeddings.")

    # Connect to ChromaDB. Host and collection are required; the port already
    # falls back to 8000 above, so it is not re-checked here.
    if not all([CHROMA_HOST, COLLECTION_NAME]):
        return [], "ChromaDB connection details (host, collection) are not fully configured."
    try:
        chroma_client = HttpClient(host=CHROMA_HOST, port=CHROMA_PORT, settings=ChromaSettings(anonymized_telemetry=False))
        collection = chroma_client.get_collection(COLLECTION_NAME)
        print(f"Connected to ChromaDB collection: {COLLECTION_NAME}")
    except Exception as conn_err:
        return [], f"Failed to get ChromaDB collection '{COLLECTION_NAME}': {conn_err}"

    where_clause = {"Status": "active"} if active_only else None
    if active_only:
        print("Filtering for active jobs only in ChromaDB.")
    query_embedding_list = cv_embedding if isinstance(cv_embedding, list) else np.array(cv_embedding).tolist()

    print(f"Querying ChromaDB with top_n={top_n} using combined CV embedding...")
    results = collection.query(
        query_embeddings=[query_embedding_list], n_results=top_n,
        include=["metadatas", "distances", "documents", "embeddings"], where=where_clause
    )
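    # "results" is a dict of parallel lists, one inner list per query embedding,
    # e.g. (illustrative): {"ids": [["job_1", ...]], "distances": [[0.41, ...]],
    # "metadatas": [[{...}, ...]], "documents": [["..."]], "embeddings": [[[...]]]}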

    matches = []
    ids_res = results.get('ids') if results else None
    if not ids_res or not isinstance(ids_res[0], list):
        print("No results or unexpected format from ChromaDB query. 'results.ids[0]' might be empty or not a list.")
        return [], "No results from ChromaDB or unexpected format."

    num_results = len(ids_res[0])
    print(f"ChromaDB returned {num_results} raw results based on combined CV embedding.")
    # Use explicit None/length checks here: recent Chroma versions may return numpy
    # arrays, whose truthiness is ambiguous and would raise in a plain "if" test.
    distances_res = results.get('distances')
    distances_list = distances_res[0] if distances_res is not None and len(distances_res) > 0 else [None] * num_results
    metadatas_res = results.get('metadatas')
    metadatas_list = metadatas_res[0] if metadatas_res is not None and len(metadatas_res) > 0 else [{}] * num_results
    documents_res = results.get('documents')
    documents_list = documents_res[0] if documents_res is not None and len(documents_res) > 0 else [""] * num_results
    embeddings_res = results.get('embeddings')
    job_embeddings_list = embeddings_res[0] if embeddings_res is not None and len(embeddings_res) > 0 else [None] * num_results

    for i in range(num_results):
        chroma_db_id = ids_res[0][i]
        distance = distances_list[i] if i < len(distances_list) and distances_list[i] is not None else 1.0
        metadata_item = metadatas_list[i] if i < len(metadatas_list) and isinstance(metadatas_list[i], dict) else {}
        document_text_item = documents_list[i] if i < len(documents_list) and isinstance(documents_list[i], str) else ""
        job_embedding_item_raw = job_embeddings_list[i] if i < len(job_embeddings_list) and job_embeddings_list[i] is not None else None

        job_embedding_for_match = np.array(job_embedding_item_raw) if job_embedding_item_raw is not None else None
        job_embedding_valid_for_match = (job_embedding_for_match is not None and job_embedding_for_match.size > 0)

        # Map cosine distance (0..2) onto a 0..100 similarity score.
        clamped_distance = min(max(float(distance), 0.0), 2.0)
        original_similarity_score = (1.0 - (clamped_distance / 2.0)) * 100.0

        current_score = original_similarity_score
        match_details_for_job = []

        job_skills_from_meta_str = metadata_item.get('Skills_Json_Str', '[]')
        try:
            job_skills_list_from_meta = json.loads(job_skills_from_meta_str) if isinstance(job_skills_from_meta_str, str) else []
        except json.JSONDecodeError:
            job_skills_list_from_meta = []

        if use_enhanced_scoring and job_embedding_valid_for_match and job_skills_list_from_meta and cv_individual_skill_embeddings_dict:
            enhanced_score_val, details_from_enh_score = calculate_enhanced_job_match_score(
                cv_skills, job_skills_list_from_meta, cv_individual_skill_embeddings_dict, job_embedding_for_match
            )
            # Blend the enhanced score with the raw vector similarity, favouring the former.
            blend_weight_enhanced = 0.7
            current_score = (enhanced_score_val * blend_weight_enhanced) + (original_similarity_score * (1.0 - blend_weight_enhanced))
            current_score = max(0.0, min(100.0, current_score))
            match_details_for_job.extend(details_from_enh_score)
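            # Illustrative blend: enhanced=80, vector similarity=60 -> 0.7*80 + 0.3*60 = 74.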

        # Rank the CV skills that contributed most to this match (for explanations).
        contributing_cv_skills_list = []
        if job_embedding_valid_for_match and cv_individual_skill_embeddings_dict:
            contributing_cv_skills_list = explain_job(cv_skills, job_skills_list_from_meta, cv_individual_skill_embeddings_dict, job_embedding_for_match)

        # Expose all metadata under lowercase keys, then add the computed fields.
        match_data = {key.lower(): value for key, value in metadata_item.items()}
        match_data.update({
            "chroma_id": chroma_db_id,
            "score": current_score,
            "document": document_text_item,
            "job_skills": job_skills_list_from_meta,
            "contributing_skills": contributing_cv_skills_list,
            "match_details": match_details_for_job,
            # Canonical fields with fallbacks, read from the original metadata casing.
            "title": metadata_item.get('Title', 'N/A'),
            "company": metadata_item.get('Company', 'N/A'),
            "area": metadata_item.get('Area', 'N/A'),
            "category": metadata_item.get('Category', 'N/A'),
            "url": metadata_item.get('Application_URL', metadata_item.get('URL', '#')),
            "status": metadata_item.get('Status', 'unknown'),
            "job_id": metadata_item.get('job_id', chroma_db_id)
        })
        matches.append(match_data)

    print(f"Processed {len(matches)} matches from ChromaDB results.")
    return matches, "ChromaDB Vector Search with Enhanced Scoring" if use_enhanced_scoring else "ChromaDB Vector Search"
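
# Each returned match dict contains chroma_id, score, document, job_skills,
# contributing_skills, match_details, title, company, area, category, url,
# status, job_id, plus every metadata field under a lowercase key.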

if __name__ == "__main__":
    print("cv_match.py (alex version with trifon's enhancements) loaded for direct execution test.")
    if not all([EMBEDDING_API_URL, CHROMA_HOST, CHROMA_PORT_STR, COLLECTION_NAME]):
        print("Skipping test: Missing one or more required .env variables for full test.")
    else:
        sample_cv_skills_list = ["Python", "machine learning", "data analysis", "communication", "projektledelse", "SQL", "Django", "AWS"]
        print(f"Test CV Skills: {sample_cv_skills_list}")
        test_cv_skill_embedding = generate_embedding_for_skills(sample_cv_skills_list)
        if test_cv_skill_embedding:
            print(f"Generated test CV combined embedding (first 5 dims): {np.array(test_cv_skill_embedding[:5])}")
            job_matches_found, message_status = find_similar_jobs(
                cv_skills=sample_cv_skills_list,
                cv_embedding=test_cv_skill_embedding,
                top_n=5,
                use_enhanced_scoring=True
            )
            print(f"Search Status: {message_status}")
            if job_matches_found:
                print(f"Found {len(job_matches_found)} job matches:")
                for i_match, match_item in enumerate(job_matches_found):
                    print(f"\n  {i_match+1}. Title: {match_item.get('title', 'N/A')}")
                    print(f"     Score: {match_item.get('score', 0.0):.2f}% (Chroma ID: {match_item.get('chroma_id')})")
                    print(f"     Company: {match_item.get('company', 'N/A')}, Area: {match_item.get('area', 'N/A')}")
                    print(f"     Job Skills: {match_item.get('job_skills', [])[:5]}...")
                    print(f"     Contributing CV Skills (Top): {[(s_item[0], round(s_item[1], 3)) for s_item in match_item.get('contributing_skills', [])]}")
                    if match_item.get('match_details'):
                        print("     Match Details (Top 3):")
                        for detail in match_item['match_details'][:3]:
                            print(f"       - Type: {detail.get('match_type')}, CV: {detail.get('cv_skill')}, Job: {detail.get('job_skill')}, Score: {detail.get('score')}")
            else:
                print("No job matches found for the test CV skills.")
        else:
            print("Failed to generate combined embedding for test CV skills.")