Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import numpy as np | |
| import pandas as pd | |
| import asyncio | |
| import base64 | |
| import json | |
| import os | |
| import time | |
| import logging | |
| from typing import List, Dict, Optional | |
| from openai import OpenAI | |
| from dotenv import load_dotenv | |
| import hashlib | |
| import pickle | |
| from pathlib import Path | |
| # Import configuration | |
| from config import OPENAI_API_KEY, CACHE_FILE, validate_config | |
# Configure logging
# Root logging is set to INFO; every function below reports through `logger`.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Validate configuration
# validate_config() comes from the project's config module. A ValueError is
# logged and then re-raised, so a rejected configuration aborts the import.
try:
    validate_config()
except ValueError as e:
    logger.error(f"Configuration error: {e}")
    raise

# Initialize OpenAI client
# Shared client used by get_embedding() for every embedding request.
client = OpenAI(api_key=OPENAI_API_KEY)

# Candidate embedding vectors. Rebound by load_cached_embeddings() or
# generate_candidate_embeddings(); search_candidates() looks entries up by the
# candidate's DataFrame index label.
candEmbed = []
# Candidate table; stays None until load_data() has read the CSV.
ds2 = None
def load_data():
    """Load and validate the candidate CSV into the module-level ``ds2``.

    Reads ``Candidates.csv`` (relative to the working directory), checks that
    the columns this module indexes without a fallback are present, and adds
    a ``Match`` column initialised to 0.0 when the file does not have one.

    Returns:
        bool: True when the table was loaded and validated, False otherwise.
        Failures are logged instead of raised, and ``ds2`` keeps its previous
        value when loading fails.
    """
    global ds2
    # "Resume" is read when generating embeddings; the others are selected
    # unconditionally when search results are built. Without this check a
    # missing column only shows up later as an empty search result.
    required_columns = ("Resume", "Experience", "Title", "Link")
    try:
        frame = pd.read_csv("Candidates.csv")
        missing = [column for column in required_columns if column not in frame.columns]
        if missing:
            raise ValueError(
                f"Candidates.csv is missing required columns: {', '.join(missing)}"
            )
        # Add Match column if it doesn't exist
        if 'Match' not in frame.columns:
            frame['Match'] = 0.0
        # Publish only a fully validated table.
        ds2 = frame
        logger.info(f"Loaded {len(ds2)} candidates")
        return True
    except Exception as e:
        logger.error(f"Error loading data: {e}")
        return False
def get_embedding(text: str) -> np.ndarray:
    """Fetch the embedding vector for ``text`` from the OpenAI API.

    The text is stripped of surrounding whitespace and sent as a single-item
    batch to the ``text-embedding-ada-002`` model.

    Returns:
        np.ndarray: the embedding as a float32 array.

    Raises:
        ValueError: when ``text`` is empty or whitespace only.
        Exception: any error raised by the API call; every failure is logged
            and then re-raised to the caller.
    """
    try:
        cleaned = text.strip() if text else ""
        if not cleaned:
            raise ValueError("Empty text provided")
        api_result = client.embeddings.create(
            model="text-embedding-ada-002",
            input=[cleaned],
        )
        vector = api_result.data[0].embedding
        return np.array(vector, dtype=np.float32)
    except Exception as e:
        logger.error(f"Error getting embedding: {e}")
        raise
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of two vectors as a builtin ``float``.

    Args:
        a: first vector (any array-like; converted to float32).
        b: second vector, same length as ``a``.

    Returns:
        float: ``dot(a, b) / (|a| * |b|)``, or 0.0 when either vector has
        zero norm. Any failure during the computation (for example vectors
        of different lengths) is logged and also yields 0.0.
    """
    try:
        vec_a = np.asarray(a, dtype=np.float32)
        vec_b = np.asarray(b, dtype=np.float32)
        # Normalize vectors
        norm_a = np.linalg.norm(vec_a)
        norm_b = np.linalg.norm(vec_b)
        if norm_a == 0 or norm_b == 0:
            return 0.0
        # np.float32 is not a Python float; convert so the return value
        # matches the annotation and serialises like any other number.
        return float(np.dot(vec_a, vec_b) / (norm_a * norm_b))
    except Exception as e:
        logger.error(f"Error calculating similarity: {e}")
        return 0.0
def load_cached_embeddings() -> bool:
    """Load candidate embeddings from ``CACHE_FILE`` into ``candEmbed``.

    The cache is accepted only when it holds exactly one entry per row of the
    loaded candidate table (``ds2``); a cache written for a different version
    of the data would otherwise pair candidates with the wrong vectors or
    fail on lookup. The length check is skipped when ``ds2`` has not been
    loaded yet.

    Returns:
        bool: True when a usable cache was loaded, False when the file is
        missing, does not match the data, or could not be read (read errors
        are logged). ``candEmbed`` is only replaced on success.
    """
    global candEmbed
    try:
        cache_path = Path(CACHE_FILE)
        if not cache_path.exists():
            return False
        # SECURITY: unpickling can execute arbitrary code. CACHE_FILE must
        # only ever point at a file written by save_cached_embeddings().
        with open(cache_path, 'rb') as f:
            cached = pickle.load(f)
        if ds2 is not None and len(cached) != len(ds2):
            logger.warning(
                f"Ignoring cached embeddings: {len(cached)} entries for {len(ds2)} candidates"
            )
            return False
        candEmbed = cached
        logger.info(f"Loaded {len(candEmbed)} cached embeddings")
        return True
    except Exception as e:
        logger.error(f"Error loading cached embeddings: {e}")
    return False
def save_cached_embeddings():
    """Pickle the current ``candEmbed`` list to ``CACHE_FILE``.

    Best effort: any failure is logged and swallowed, nothing is returned.
    """
    try:
        with Path(CACHE_FILE).open('wb') as cache_handle:
            pickle.dump(candEmbed, cache_handle)
        logger.info("Saved embeddings to cache")
    except Exception as e:
        logger.error(f"Error saving embeddings: {e}")
def generate_candidate_embeddings():
    """Rebuild ``candEmbed`` with one vector per row of ``ds2["Resume"]``.

    Each resume is embedded through get_embedding(). A missing or blank
    resume, or any error while processing a row, contributes a 1536-element
    float32 zero vector instead (presumably the width of the embedding
    model's output - confirm if the model changes), so list positions stay
    aligned with the rows. The finished list is written to the cache.
    """
    global candEmbed
    candEmbed = []
    logger.info("Generating candidate embeddings...")

    for row_number, resume_text in enumerate(ds2["Resume"]):
        try:
            is_blank = pd.isna(resume_text) or not resume_text.strip()
            if is_blank:
                logger.warning(f"Empty resume at index {row_number}")
                vector = np.zeros(1536, dtype=np.float32)
            else:
                vector = get_embedding(resume_text)
            candEmbed.append(vector)
            # Progress logging (only after a successfully embedded resume)
            if not is_blank and (row_number + 1) % 10 == 0:
                logger.info(f"Processed {row_number + 1}/{len(ds2)} candidates")
        except Exception as e:
            logger.error(f"Error processing candidate {row_number}: {e}")
            candEmbed.append(np.zeros(1536, dtype=np.float32))

    # Save to cache
    save_cached_embeddings()
    logger.info("Completed generating candidate embeddings")
# Column layout of every frame search_candidates() returns, in the same order
# as the headers declared for the results table in create_interface().
_RESULT_COLUMNS = ['Title', 'Experience', 'Match %', 'Link', 'Skills', 'Keywords Found']

# Resume phrases that count as evidence of content-writing work.
_CONTENT_WRITING_TERMS = ['content writer', 'content writing', 'copywriter', 'blog', 'article', 'copywriting']

# Words that put a bare "content" mention into a writing (not technical) context.
_WRITING_CONTEXTS = ['writing', 'blog', 'article', 'copy', 'seo content']

# Skills searched for in the resume text when the Skills column is mostly empty.
_COMMON_SKILLS = [
    'aws', 'azure', 'gcp', 'docker', 'kubernetes', 'jenkins', 'git', 'terraform',
    'ansible', 'python', 'java', 'linux', 'windows', 'mysql', 'postgresql',
    'mongodb', 'nginx', 'apache', 'prometheus', 'grafana', 'elk', 'vmware',
    'devops', 'ci/cd', 'microservices', 'cloud', 'monitoring', 'automation',
]

# Candidates scoring below this cosine similarity are dropped. The value is
# low because keyword pre-filtering has already removed unrelated candidates.
_MIN_SIMILARITY = 0.05


def _empty_results() -> pd.DataFrame:
    """Return an empty frame that carries the standard result columns."""
    return pd.DataFrame(columns=_RESULT_COLUMNS)


def _parse_skills(skills: str) -> list:
    """Split a comma-separated skills string into lowercase, non-empty names."""
    return [part.strip().lower() for part in skills.split(',') if part.strip()]


def _score_candidate(candidate, job_title_lower: str, required_skills: list, search_lower: str):
    """Score one candidate row; return (relevance, found keywords, has all skills)."""
    resume_text = str(candidate.get('Resume', '')).lower()
    candidate_title = str(candidate.get('Title', '')).lower()
    # Skills are looked up in the resume plus the Skills column, so scoring
    # and the all-skills requirement use one definition of "has the skill".
    skills_text = resume_text + ' ' + str(candidate.get('Skills', '')).lower()
    keywords = []

    # An empty title is a substring of every title, so a search without a
    # job title does not exclude anyone on title grounds.
    title_match = 1 if job_title_lower in candidate_title else 0
    if title_match and job_title_lower:
        keywords.append(job_title_lower)

    # AND logic: skills only add to the score when every one of them is found.
    matched_skills = [skill for skill in required_skills if skill in skills_text]
    keywords.extend(matched_skills)
    has_all_skills = len(matched_skills) == len(required_skills)
    skills_match = len(required_skills) if required_skills and has_all_skills else 0

    # "content" searches look for writing work, not any use of the word.
    content_match = 0
    if 'content' in search_lower:
        writing_terms = [term for term in _CONTENT_WRITING_TERMS if term in resume_text]
        if writing_terms:
            content_match = 1
            keywords.extend(writing_terms)
        elif 'content' in resume_text and any(ctx in resume_text for ctx in _WRITING_CONTEXTS):
            content_match = 1
            keywords.append('content')

    relevance = (title_match * 5) + (skills_match * 3) + (content_match * 2)
    return relevance, keywords, has_all_skills


def _no_match_message(skills: str, search_lower: str) -> str:
    """Build the log message explaining why keyword filtering left nobody."""
    requested = [part.strip() for part in skills.split(',') if part.strip()]
    if len(requested) > 1:
        return f"⚠️ No candidates found with ALL required skills: {', '.join(requested)}. Try reducing the number of required skills or search for candidates with individual skills."
    if len(requested) == 1:
        return f"⚠️ No candidates found with the required skill: {requested[0]}"
    if 'content' in search_lower or 'writer' in search_lower:
        return "⚠️ No content writers found in the dataset. The current dataset contains mostly DevOps and technical roles. Try searching for technical positions like 'DevOps Engineer', 'Cloud Engineer', or 'System Administrator'."
    return "⚠️ No candidates found matching your criteria. Try adjusting the filters or search for different skills."


def _display_skills(top: pd.DataFrame) -> pd.Series:
    """Return the Skills text to show for each row of ``top`` ('N/A' if unknown)."""
    if 'Skills' not in top.columns:
        return pd.Series('N/A', index=top.index)

    listed = top['Skills'].apply(
        lambda value: 'N/A' if pd.isna(value) or str(value).strip() == '' else str(value).strip()
    )
    # When more than 80% of the rows have no listed skills, fall back to
    # spotting common technical skills in the resume text.
    if (listed == 'N/A').sum() > len(listed) * 0.8:
        extracted = []
        for resume in top['Resume']:
            resume_text = str(resume).lower()
            found = [skill for skill in _COMMON_SKILLS if skill in resume_text]
            extracted.append(', '.join(found[:5]) if found else 'N/A')  # at most 5 skills
        return pd.Series(extracted, index=top.index)
    return listed


def search_candidates(exp: float, skills: str, job_desc: str, job_title: str, top_percent: float) -> pd.DataFrame:
    """Search the loaded candidates and return the best matches.

    Pipeline:
      1. Keep candidates whose ``Experience`` is at least ``exp``.
      2. Score each by keyword relevance (title match, required skills,
         content-writing terms) and drop those with zero relevance or
         missing any required skill. Matching is plain lowercase substring
         search.
      3. Rank the rest by cosine similarity between the query embedding
         and the candidate's stored embedding; drop very low similarities.
      4. Return the top ``top_percent`` percent (always at least one row).

    Args:
        exp: minimum years of experience.
        skills: comma-separated required skills; all must be present.
        job_desc: free-text job description.
        job_title: job title; may be empty.
        top_percent: percentage of ranked matches to return.

    Returns:
        pd.DataFrame: columns ``Title``, ``Experience``, ``Match %``,
        ``Link``, ``Skills``, ``Keywords Found``. The frame is empty (same
        columns) when nothing matches or when any error occurs; errors are
        logged, never raised.
    """
    try:
        if not job_desc.strip() and not skills.strip():
            raise ValueError("Please provide either job description or skills")

        # Combine all search terms into the text that gets embedded.
        title_clean = job_title.strip()
        search_text = " ".join(
            term for term in (title_clean, job_desc.strip(), skills.strip()) if term
        )
        search_lower = search_text.lower()
        job_title_lower = title_clean.lower()
        required_skills = _parse_skills(skills)

        # Pre-filter on experience, then on keyword relevance.
        candidates = ds2[ds2["Experience"] >= exp].copy()
        scored = [
            _score_candidate(row, job_title_lower, required_skills, search_lower)
            for _, row in candidates.iterrows()
        ]
        candidates['Relevance'] = [relevance for relevance, _, _ in scored]
        candidates['Found_Keywords'] = [keywords for _, keywords, _ in scored]
        # A boolean array (not a list) so an empty mask is still a row filter.
        keep = np.array(
            [relevance > 0 and has_all for relevance, _, has_all in scored],
            dtype=bool,
        )
        candidates = candidates[keep].copy()

        if candidates.empty:
            logger.warning(_no_match_message(skills, search_lower))
            return _empty_results()

        # Semantic ranking, computed only for the pre-filtered candidates.
        # Embeddings are looked up by the candidate's original index label.
        search_embedding = get_embedding(search_text)
        candidates['Match'] = [
            cosine_similarity(search_embedding, candEmbed[idx]) for idx in candidates.index
        ]

        matched = candidates[candidates['Match'] >= _MIN_SIMILARITY]
        if matched.empty:
            return _empty_results()

        matched = matched.sort_values("Match", ascending=False)
        top_count = max(1, int(len(matched) * (top_percent / 100)))
        top = matched.head(top_count)

        result_df = top[['Title', 'Experience', 'Link']].copy()
        result_df['Match %'] = (top['Match'] * 100).round(2)
        result_df['Skills'] = _display_skills(top)
        result_df['Keywords Found'] = top['Found_Keywords'].apply(
            lambda found: ', '.join(found) if found else 'None'
        )
        return result_df[_RESULT_COLUMNS]
    except Exception as e:
        logger.error(f"Error in search_candidates: {e}")
        return _empty_results()
def validate_inputs(exp: float, skills: str, job_desc: str, job_title: str, top_percent: float) -> str:
    """Check the search form values and describe every problem found.

    Returns:
        str: the failed checks' messages joined with "; ", or an empty
        string when all checks pass. ``job_title`` is accepted as-is.
    """
    checks = (
        (exp < 0, "Experience cannot be negative"),
        (
            not skills.strip() and not job_desc.strip(),
            "Please provide either skills or job description",
        ),
        (
            top_percent <= 0 or top_percent > 100,
            "Top percentage must be between 1 and 100",
        ),
    )
    return "; ".join(problem for failed, problem in checks if failed)
def search_candidates_wrapper(exp: float, skills: str, job_desc: str, job_title: str, top_percent: float) -> tuple:
    """Validate the form values, run the search and build the status text.

    Returns:
        tuple: ``(results DataFrame, status message)``. Invalid input or an
        unexpected error yields an empty DataFrame with an explanatory
        message; errors are logged, never raised.
    """
    try:
        # Validate inputs
        problems = validate_inputs(exp, skills, job_desc, job_title, top_percent)
        if problems:
            return pd.DataFrame(), f"β {problems}"

        # Perform AI search
        matches = search_candidates(exp, skills, job_desc, job_title, top_percent)
        status = f"β Found {len(matches)} candidates using AI matching"

        # Add debug information for the first few results
        if len(matches) > 0:
            debug_lines = ["\n\nπ **Debug Info (Top 3 matches):**"]
            for rank, (_, row) in enumerate(matches.head(3).iterrows(), start=1):
                debug_lines.append(f"\n{rank}. {row['Title']}: {row['Match %']}%")
            status += "".join(debug_lines)

        if matches.empty:
            # Replace the summary with feedback tailored to the search terms
            wants_content = 'content' in job_title.lower() or 'content' in job_desc.lower()
            if wants_content:
                status = "β οΈ No content writers found in the current dataset. The dataset contains mostly DevOps, Cloud, and technical roles. Try searching for technical positions like 'DevOps Engineer', 'Cloud Engineer', or 'System Administrator'."
            elif 'seo' in skills.lower():
                status = "β οΈ No candidates with SEO experience found in the current dataset. The dataset contains mostly technical roles. Try searching for technical skills like 'AWS', 'Docker', 'Kubernetes', or 'Python'."
            else:
                status = "β οΈ No candidates found matching your criteria. Try adjusting the filters or search for different skills."

        return matches, status
    except Exception as e:
        logger.error(f"Error in search wrapper: {e}")
        return pd.DataFrame(), f"β Error: {str(e)}"
| # Initialize data and embeddings | |
def initialize_app():
    """Load the candidate data and make sure embeddings are available.

    Embeddings are taken from the cache when it can be loaded and generated
    otherwise.

    Returns:
        str: a status message saying whether initialisation succeeded.
    """
    data_ready = load_data()
    if data_ready:
        cache_loaded = load_cached_embeddings()
        if not cache_loaded:
            generate_candidate_embeddings()
        return "β Application initialized successfully"
    return "β Failed to load data files"
| # Create Gradio interface | |
def create_interface() -> gr.Blocks:
    """Build the Gradio Blocks UI for the resume search app.

    Layout: an intro header, a collapsed "Features and Tips" accordion, then
    a row with the search form (left column) and the results table (right
    column), followed by usage notes. The search button is wired to
    search_candidates_wrapper, and initialize_app is registered on the
    Blocks load event with its return value shown in the status box.

    Returns:
        The assembled ``gr.Blocks`` instance; it is not launched here.
    """
    with gr.Blocks(title="AI Resume Search", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
# π AI-Powered Resume Search System
This application helps you find the best candidates by matching job descriptions and skills with candidate resumes using AI.
""")
        with gr.Accordion("π Features and Tips", open=False):
            gr.Markdown(
                """
**Features:**
- π€ AI-powered semantic matching using OpenAI embeddings
- π Experience-based filtering
- π― Skills and job description matching
- π Match percentage scoring
- π Direct links to candidate profiles
- π Keywords found in each resume
**π Available Candidate Types:**
- DevOps Engineers
- Cloud Engineers (AWS, Azure, GCP)
- System Administrators
- Software Engineers
- IT Infrastructure Specialists
**π‘ Search Tips:**
- Try technical skills like: AWS, Docker, Kubernetes, Python, Linux
- Use job titles like: DevOps Engineer, Cloud Engineer, System Administrator
- The dataset contains mostly technical/IT roles
- Check the "Keywords Found" column to see which search terms matched in each resume
- **Skills Logic**: Multiple skills use AND logic (all skills must be present)
---
"""
            )
        with gr.Row():
            # Left column: the search form and the status line.
            with gr.Column(scale=1):
                gr.Markdown("### π Search Criteria")
                job_title = gr.Text(
                    label="Job Title",
                    placeholder="e.g., DevOps Engineer, Software Developer",
                    info="Enter the job title for context"
                )
                job_desc = gr.Text(
                    label="Job Description",
                    lines=4,
                    placeholder="Enter detailed job description for better AI matching...",
                    info="Detailed description improves match accuracy"
                )
                skills = gr.Text(
                    label="Required Skills",
                    placeholder="e.g., Python, AWS, Docker, Kubernetes",
                    info="List key skills separated by commas"
                )
                exp = gr.Slider(
                    0, 25, value=0, step=0.5,
                    label="Minimum Experience (Years)",
                    info="Filter candidates by minimum experience"
                )
                top_percent = gr.Slider(
                    1, 100, value=20, step=1,
                    label="Top Results (%)",
                    info="Percentage of top matching candidates to display"
                )
                search_btn = gr.Button(
                    "π Search Candidates",
                    variant="primary",
                    size="lg"
                )
                # Read-only box that receives the messages returned by
                # initialize_app and search_candidates_wrapper.
                status_msg = gr.Textbox(
                    label="Status",
                    interactive=False,
                    value="Ready to search..."
                )
            # Right column: the results table, three times as wide as the form.
            with gr.Column(scale=3):
                gr.Markdown("### π₯ Search Results")
                results_df = gr.DataFrame(
                    headers=["Title", "Experience", "Match %", "Link", "Skills", "Keywords Found"],
                    datatype=["markdown", "markdown", "markdown", "markdown", "markdown", "markdown"],
                    label="Matching Candidates",
                )
        # NOTE(review): whether these usage notes sit below the row or inside
        # the results column should be confirmed against the rendered layout.
        gr.Markdown(
            """
**How to use:**
1. Enter job title, description, and required skills
2. Set minimum experience requirement
3. Choose percentage of top results to display
4. Click "Search Candidates" to find matches
**Tips for better results:**
- Provide detailed job descriptions
- List specific technical skills (use AND logic for multiple skills)
- Adjust experience filter based on role requirements
- Check the "Keywords Found" column to see which terms matched
"""
        )
        # Event handlers
        # The inputs list follows search_candidates_wrapper's parameter order;
        # its (DataFrame, message) return value fills the table and status box.
        search_btn.click(
            fn=search_candidates_wrapper,
            inputs=[exp, skills, job_desc, job_title, top_percent],
            outputs=[results_df, status_msg]
        )
        # Initialize app
        # NOTE(review): the load event presumably fires on each page load, so
        # data and embeddings would be re-initialised every time - confirm
        # that this is intended.
        demo.load(initialize_app, outputs=[status_msg])
    return demo
| # Create and launch the interface | |
if __name__ == "__main__":
    # Serve on all interfaces, port 7860, without a public share link and
    # with Gradio's debug mode switched on.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": True,
    }
    demo = create_interface()
    demo.launch(**launch_options)