# Source: Hugging Face Space by michaelozon β€” "Update app.py" (1281728 verified)
# app.py
"""
Hugging Face Space (Gradio) - Resume ↔ Job Matching System
==========================================================
This app implements the assignment's "Input β†’ Output Pipeline":
User job input β†’ embed (query) β†’ cosine similarity vs precomputed resume embeddings β†’ Top-K ranked candidates.
It uses:
- pipeline.py (init_pipeline + rank_candidates_for_new_job)
- precomputed embeddings uploaded to the Space repo (./embeddings/*)
- resumes dataset from HF (michaelozon/candidate-matching-synthetic)
NEW FEATURES:
- Send top candidate directly to Make.com webhook
- AI-generated interview invitation letter using Groq API
"""
import os
import re
import tempfile
import requests
import random
from typing import List, Tuple, Optional
import pandas as pd
import gradio as gr
from pipeline import init_pipeline, rank_candidates_for_new_job
# -------------------------
# Config
# -------------------------
# NOTE(review): the webhook URL is hardcoded in source; consider moving it to
# a Space secret (like GROQ_API_KEY below) so it can be rotated without a code change.
WEBHOOK_URL = "https://hook.eu2.make.com/st4h0t3ycjud9llfgnebjyofvg35z8sz"
# Read from Space secrets; None when unset (checked before every Groq call).
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
# -------------------------
# Helpers
# -------------------------
def _parse_list(text: str) -> List[str]:
"""Parse comma/newline-separated text into a clean list."""
if text is None:
return []
text = str(text).strip()
if not text:
return []
parts = re.split(r"[,;\n]+", text)
out = [p.strip() for p in parts if p.strip()]
# de-dup while preserving order
seen = set()
dedup = []
for x in out:
key = x.lower()
if key not in seen:
seen.add(key)
dedup.append(x)
return dedup
def clean_text(value) -> str:
    """Coerce any value to a tidy string: strip ends and collapse internal whitespace."""
    if value is None:
        return ""
    if isinstance(value, (int, float)):
        # Numbers are stringified directly (no whitespace to normalize).
        return str(value)
    stripped = str(value).strip()
    return re.sub(r'\s+', ' ', stripped)
def ensure_list(value) -> List[str]:
    """Coerce a value into a list of cleaned strings."""
    if value is None:
        return []
    if isinstance(value, list):
        return [clean_text(entry) for entry in value]
    if isinstance(value, str) and ',' in value:
        # A comma-separated string is treated as multiple entries.
        return [clean_text(part) for part in value.split(',')]
    # Anything else (plain string, number, ...) becomes a single-element list.
    return [clean_text(value)]
def _format_stats(df: pd.DataFrame) -> str:
"""Format statistics from results DataFrame"""
if df is None or len(df) == 0 or "similarity_score" not in df.columns:
return "No results to summarize."
mn = float(df["similarity_score"].min())
mx = float(df["similarity_score"].max())
avg = float(df["similarity_score"].mean())
return (
f"**Score range:** [{mn:.4f}, {mx:.4f}] \n"
f"**Average score:** {avg:.4f} \n"
f"**Returned candidates:** {len(df)}"
)
def _make_csv(df: pd.DataFrame) -> Optional[str]:
"""Create CSV file from DataFrame for download"""
if df is None or len(df) == 0:
return None
tmpdir = tempfile.mkdtemp()
path = os.path.join(tmpdir, "top_candidates.csv")
df.to_csv(path, index=False, encoding="utf-8")
return path
# -------------------------
# AI Interview Letter Generation (Groq API)
# -------------------------
def generate_interview_invitation(df: pd.DataFrame, job_title: str) -> str:
    """
    Generate a personalized interview invitation letter for the top-ranked
    candidate via the Groq chat-completions API.

    Args:
        df: DataFrame with ranked candidates (rank #1 in the first row).
        job_title: The job title from the search form.

    Returns:
        Markdown containing the generated letter, or a user-facing error
        message. All failure modes return a string rather than raising.
    """
    # --- Input validation: every guard returns early with a friendly message ---
    if df is None or len(df) == 0:
        return "❌ **No candidates available.** Please run a search first."
    if not GROQ_API_KEY:
        return "❌ **Error:** Groq API key not found. Please add GROQ_API_KEY to Space secrets."
    if not job_title or job_title.strip() == "":
        return "❌ **Error:** Job title is required to generate invitation letter."
    # --- Extract top-candidate fields (defensive: the row round-trips through the UI table) ---
    try:
        top_candidate = df.iloc[0]
        candidate_role = clean_text(top_candidate.get('role', 'the position'))
        candidate_seniority = clean_text(top_candidate.get('seniority', ''))
        candidate_industry = clean_text(top_candidate.get('industry', ''))
        candidate_skills = ensure_list(top_candidate.get('skills', []))
        years_exp = int(top_candidate.get('years_experience', 0))
        match_score = float(top_candidate.get('similarity_score', 0))
        # Create skill summary (top 5 skills at most)
        skill_text = ", ".join(candidate_skills[:5]) if candidate_skills else "relevant skills"
    except Exception as e:
        return f"❌ **Error extracting candidate data:** {str(e)}"
    # Randomize the tone so repeated generations produce varied letters.
    tone_variations = [
        "warm and enthusiastic",
        "professional but friendly",
        "encouraging and positive",
        "concise and welcoming",
        "engaging and personable"
    ]
    selected_tone = random.choice(tone_variations)
    # Build the prompt
    user_prompt = f"""Write a {selected_tone} interview invitation letter for a job candidate.
**Position:** {job_title}
**Candidate Background:** {candidate_seniority} {candidate_role} with {years_exp} years of experience in {candidate_industry}
**Key Skills:** {skill_text}
**Match Score:** {match_score:.1%}
**Candidate ID:** {top_candidate.get('resume_id', 'N/A')}
**Requirements:**
- Start with a warm greeting
- Express enthusiasm about their profile
- Mention we found them to be an excellent match for the {job_title} role
- Highlight 1-2 specific skills that stood out ({skill_text})
- Invite them to schedule an interview
- End with a warm closing
- Keep it 150-200 words
- Write in a {selected_tone} tone
- Do NOT use placeholder names like [Candidate Name] or [Your Name] - use the actual ID and name provided
Write ONLY the letter body, no subject line."""
    # --- Call the Groq chat-completions endpoint ---
    try:
        headers = {
            "Authorization": f"Bearer {GROQ_API_KEY}",
            "Content-Type": "application/json"
        }
        payload = {
            "model": "llama-3.3-70b-versatile",
            "messages": [
                {
                    "role": "system",
                    "content": "You are a professional HR recruiter writing interview invitation letters. Write clear, warm, and professional letters."
                },
                {
                    "role": "user",
                    "content": user_prompt
                }
            ],
            # High temperature + top_p keep letters varied between runs.
            "temperature": 0.8,
            "max_tokens": 400,
            "top_p": 0.9
        }
        response = requests.post(
            GROQ_API_URL,
            json=payload,
            headers=headers,
            timeout=15
        )
        if response.status_code != 200:
            # FIX: an error body is not guaranteed to be JSON (e.g. an HTML
            # gateway page) β€” fall back to raw text instead of raising.
            try:
                error_detail = response.json().get('error', {}).get('message', response.text[:200])
            except ValueError:
                error_detail = response.text[:200]
            return f"❌ **API Error ({response.status_code}):** {error_detail}"
        result = response.json()
        letter = result.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        if not letter or len(letter) < 50:
            return "❌ **Error:** Generated letter is too short or empty. Please try again."
        # Format the output.
        # FIX: footer now names the model actually used (Llama 3.3 70B),
        # matching the "llama-3.3-70b-versatile" payload above.
        output = f"""## ✍️ AI-Generated Interview Invitation
**For:** {candidate_seniority} {candidate_role} | **Match Score:** {match_score:.2%}
---
{letter}
---
πŸ’‘ *Generated by Llama 3.3 70B (via Groq) β€’ Tone: {selected_tone}*
"""
        return output
    except requests.exceptions.Timeout:
        return "❌ **Request timed out.** The API is taking too long. Please try again."
    except requests.exceptions.ConnectionError:
        return "❌ **Connection error.** Unable to reach Groq API. Please check your internet connection."
    except KeyError as e:
        return f"❌ **Response parsing error:** Missing expected field in API response: {str(e)}"
    except Exception as e:
        # Map common failure strings onto friendlier messages.
        error_msg = str(e)
        if "api key" in error_msg.lower() or "unauthorized" in error_msg.lower():
            return "❌ **Authentication error.** Please check your Groq API key in Space secrets."
        elif "rate limit" in error_msg.lower():
            return "❌ **Rate limit exceeded.** Please wait a moment and try again."
        else:
            return f"❌ **Unexpected error:** {error_msg[:200]}"
# -------------------------
# Webhook Integration
# -------------------------
def send_top_candidate_to_webhook(df: pd.DataFrame) -> str:
    """
    Send the top candidate (rank #1) to the Make.com webhook.

    Args:
        df: DataFrame with ranked candidates (rank #1 in the first row).

    Returns:
        Markdown status message for the UI. All failure modes return a
        string rather than raising.
    """
    # Validate input
    if df is None or len(df) == 0:
        return "❌ No candidates to send. Please run a search first."
    # Get top candidate (rank #1)
    top_candidate = df.iloc[0]
    # Build a JSON-serializable payload. Defensive casts because the row
    # round-trips through the Gradio Dataframe component.
    try:
        payload = {
            "resume_id": str(top_candidate.get('resume_id', '')),
            "role": clean_text(top_candidate.get('role', '')),
            "industry": clean_text(top_candidate.get('industry', '')),
            "seniority": clean_text(top_candidate.get('seniority', '')),
            "years_experience": int(top_candidate.get('years_experience', 0)),
            "education": clean_text(top_candidate.get('education', '')),
            "skills": ensure_list(top_candidate.get('skills', [])),
            "summary": clean_text(top_candidate.get('summary', '')),
            "experience_bullets": ensure_list(top_candidate.get('experience_bullets', []))
        }
        # Add similarity score and rank for context (present only after a search).
        if 'similarity_score' in top_candidate:
            payload['similarity_score'] = float(top_candidate['similarity_score'])
        if 'rank' in top_candidate:
            payload['rank'] = int(top_candidate['rank'])
    except Exception as e:
        return f"❌ Error preparing data: {str(e)}"
    # Send to webhook
    try:
        response = requests.post(
            WEBHOOK_URL,
            json=payload,
            headers={'Content-Type': 'application/json'},
            timeout=10
        )
        # FIX: accept any 2xx response β€” webhooks may legitimately reply
        # 201/202/204, which the old `== 200` check reported as a failure.
        if 200 <= response.status_code < 300:
            return (
                f"βœ… **Successfully sent to Michael!**\n\n"
                f"**Candidate:** {payload['role']} ({payload['seniority']})\n"
                f"**Resume ID:** {payload['resume_id']}\n"
                f"**Industry:** {payload['industry']}\n"
                f"**Match Score:** {payload.get('similarity_score', 'N/A')}"
            )
        else:
            return (
                f"⚠️ Webhook responded with status {response.status_code}\n"
                f"Response: {response.text[:200]}"
            )
    except requests.exceptions.Timeout:
        return "❌ Request timed out. The webhook might be slow or unavailable."
    except requests.exceptions.ConnectionError:
        return "❌ Connection error. Please check the webhook URL or your internet connection."
    except Exception as e:
        return f"❌ Error sending to webhook: {str(e)}"
# -------------------------
# Core handler
# -------------------------
def run_matching(
    job_title: str,
    seniority: str,
    industry: str,
    must_have_skills_text: str,
    nice_to_have_skills_text: str,
    description: str,
    responsibilities_text: str,
    requirements_text: str,
    top_k: int,
    filter_by_role: bool,
    filter_by_industry: bool,
) -> Tuple[pd.DataFrame, str, Optional[str]]:
    """
    Gradio handler: parse the form fields, rank candidates, and package outputs.

    Returns:
        - DataFrame with ranked results
        - Statistics markdown
        - CSV file path for download (None when there are no rows)
    """
    # Free-text fields become lists; empty optional lists are passed as None.
    must_have = _parse_list(must_have_skills_text)
    nice_to_have = _parse_list(nice_to_have_skills_text)
    responsibilities = _parse_list(responsibilities_text)
    requirements = _parse_list(requirements_text)
    # Run the embedding + cosine-similarity pipeline.
    df = rank_candidates_for_new_job(
        job_title=job_title,
        seniority=seniority,
        industry=industry,
        must_have_skills=must_have,
        nice_to_have_skills=nice_to_have or None,
        description=description or "",
        responsibilities=responsibilities or None,
        requirements=requirements or None,
        top_k=int(top_k),
        filter_by_role=bool(filter_by_role),
        filter_by_industry=bool(filter_by_industry),
    )
    # Put the most useful columns first; keep any extras at the end.
    preferred = [
        "rank",
        "similarity_score",
        "resume_id",
        "role",
        "seniority",
        "industry",
        "years_experience",
        "education",
        "skills",
        "summary",
    ]
    ordered = [c for c in preferred if c in df.columns]
    ordered += [c for c in df.columns if c not in preferred]
    if len(df):
        df = df[ordered]
    return df, _format_stats(df), _make_csv(df)
# -------------------------
# App initialization
# -------------------------
# Title/subtitle rendered in the page header and the browser tab.
APP_TITLE = "Resume ↔ Job Matching System"
APP_SUBTITLE = "Input β†’ Output Pipeline (Embeddings + Cosine Similarity) β€’ HuggingFace Space Demo"
def _startup_message() -> str:
    """
    Build the initial status markdown shown in the Results panel.
    If embeddings are missing, init_pipeline() will raise and the Space logs
    will show why.
    """
    groq_status = "βœ… Configured" if GROQ_API_KEY else "❌ Not configured"
    lines = [
        "βœ… Pipeline initialized successfully.\n",
        "This Space loads:",
        "- Resumes dataset from HuggingFace",
        "- Precomputed resume embeddings from this Space repo",
        "- Embedding model for query (intfloat/e5-small-v2)",
        f"- Groq API for letter generation: {groq_status}",
    ]
    return "\n".join(lines) + "\n"
# Pre-load everything once so first user request is fast.
# (If something is wrong with files, it will fail early and be visible in logs.)
try:
    init_pipeline(force_reload=False)
    print("βœ… Pipeline loaded successfully at startup")
except Exception as e:
    # Deliberately non-fatal: the pipeline re-initializes on first request.
    print(f"⚠️ Warning: Pipeline initialization failed: {e}")
    print("The app will try to initialize on first request.")
# -------------------------
# Gradio UI
# -------------------------
# Seniority choices for the dropdown (free text is also allowed via
# allow_custom_value). NOTE(review): both "Mid" and "Mid-Level" appear β€”
# confirm which label the dataset uses, or normalize before filtering.
SENIORITY_OPTIONS = ["Junior", "Mid", "Mid-Level", "Senior", "Lead", "Manager"]
# Industry suggestions (user can also type custom)
# NOTE(review): not wired to any component in this file β€” the industry field
# is a plain Textbox. Kept for a future dropdown; verify before removing.
INDUSTRY_SUGGESTIONS = [
    "FinTech",
    "E-commerce",
    "SaaS",
    "Technology",
    "Healthcare",
    "Retail",
    "EdTech",
    "Cloud Services",
    "Design",
    "Gaming",
    "Cybersecurity",
]
# Examples for "Quick Starters" (3 examples as required).
# Each row follows the gr.Examples inputs order:
# [job_title, seniority, industry, must_have, nice_to_have, description,
#  responsibilities, requirements, top_k, filter_by_role, filter_by_industry]
EXAMPLES = [
    [
        "Senior Data Scientist",
        "Senior",
        "FinTech",
        "Python, SQL, Machine Learning",
        "NLP, AWS",
        "Build ML models, run experiments, and support product decisions with data.",
        "Modeling, Experimentation, Stakeholder communication",
        "3+ years DS, Strong Python, Statistics",
        10,
        False,
        False,
    ],
    [
        "UX Designer",
        "Mid-Level",
        "Design",
        "Figma, User Research, Prototyping",
        "",
        "Design user flows and high-fidelity prototypes for a product team.",
        "Wireframes, User interviews, Prototyping",
        "Portfolio, Collaboration, Communication",
        8,
        True,
        False,
    ],
    [
        "Product Manager",
        "Mid-Level",
        "E-commerce",
        "Product Strategy, Roadmapping, SQL",
        "A/B Testing, Analytics",
        "Lead product development and work with cross-functional teams.",
        "Roadmap, Stakeholder management, Prioritization",
        "2+ years PM, Strong communication",
        10,
        False,
        True,
    ],
]
# Build the Gradio app. The custom CSS below only styles the two gradient
# action buttons (webhook send + AI letter).
with gr.Blocks(theme=gr.themes.Soft(), title=APP_TITLE, css="""
.send-button {
background: linear-gradient(90deg, #667eea 0%, #764ba2 100%) !important;
border: none !important;
color: white !important;
font-weight: 600 !important;
padding: 12px 24px !important;
border-radius: 8px !important;
transition: all 0.3s ease !important;
}
.send-button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4) !important;
}
.letter-button {
background: linear-gradient(90deg, #f093fb 0%, #f5576c 100%) !important;
border: none !important;
color: white !important;
font-weight: 600 !important;
padding: 12px 24px !important;
border-radius: 8px !important;
transition: all 0.3s ease !important;
}
.letter-button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 4px 12px rgba(240, 147, 251, 0.4) !important;
}
""") as demo:
    # Page header.
    gr.Markdown(
        f"""
# {APP_TITLE}
**{APP_SUBTITLE}**
This app demonstrates a complete matching pipeline:
1) **User enters job details**
2) We embed the job using **intfloat/e5-small-v2**
3) We compute **cosine similarity** against **precomputed resume embeddings**
4) We return **Top-K ranked candidates** with metadata
> Tip: Use the examples below to see how the pipeline behaves.
"""
    )
    with gr.Tabs():
        with gr.TabItem("Match Candidates (Single Job)"):
            with gr.Row():
                # Left column: the job-description input form.
                with gr.Column(scale=1):
                    gr.Markdown("## Job Input")
                    job_title = gr.Textbox(
                        label="Job Title",
                        placeholder="e.g., Senior Data Scientist"
                    )
                    seniority = gr.Dropdown(
                        choices=SENIORITY_OPTIONS,
                        value="Senior",
                        label="Seniority",
                        allow_custom_value=True,
                    )
                    industry = gr.Textbox(
                        label="Industry",
                        placeholder="e.g., FinTech",
                        value="FinTech",
                    )
                    must_have = gr.Textbox(
                        label="Must-have Skills (comma or new line separated)",
                        placeholder="e.g., Python, SQL, Machine Learning",
                        lines=2,
                    )
                    nice_to_have = gr.Textbox(
                        label="Nice-to-have Skills (optional)",
                        placeholder="e.g., NLP, AWS",
                        lines=2,
                    )
                    description = gr.Textbox(
                        label="Job Description (optional)",
                        placeholder="Short role description...",
                        lines=3,
                    )
                    responsibilities = gr.Textbox(
                        label="Responsibilities (optional) β€” comma/newline separated",
                        placeholder="e.g., Modeling, Experimentation, Stakeholder communication",
                        lines=2,
                    )
                    requirements = gr.Textbox(
                        label="Requirements (optional) β€” comma/newline separated",
                        placeholder="e.g., 3+ years experience, Strong Python",
                        lines=2,
                    )
                    with gr.Row():
                        top_k = gr.Slider(
                            minimum=1,
                            maximum=30,
                            value=10,
                            step=1,
                            label="Top-K results",
                        )
                    with gr.Row():
                        filter_by_role = gr.Checkbox(
                            value=False,
                            label="Post-filter by role keywords (job title words must appear in candidate role)",
                        )
                        filter_by_industry = gr.Checkbox(
                            value=False,
                            label="Post-filter by exact industry match",
                        )
                    run_btn = gr.Button("Run Matching", variant="primary")
                    gr.Markdown(
                        """
### What the filters do
- **Role filter** helps avoid cases where the embedding similarity is high but the role label differs.
- **Industry filter** enforces an exact match on the dataset industry field.
"""
                    )
                # Right column: results table, CSV download, and the two actions.
                with gr.Column(scale=1):
                    gr.Markdown("## Results")
                    stats = gr.Markdown(value=_startup_message())
                    results_table = gr.Dataframe(
                        label="Top Candidates",
                        interactive=False,
                        wrap=True,
                        row_count=10,
                    )
                    # Download CSV button
                    download_csv = gr.File(label="Download CSV (Top Candidates)")
                    # Send to Michael button
                    with gr.Row():
                        send_webhook_btn = gr.Button(
                            "πŸ“€ Send Top Candidate to Michael",
                            variant="secondary",
                            elem_classes=["send-button"],
                            size="lg"
                        )
                    # Webhook status message
                    webhook_status = gr.Markdown(value="", visible=True)
                    # NEW: Generate Interview Letter button
                    with gr.Row():
                        generate_letter_btn = gr.Button(
                            "✍️ Generate Interview Invitation Letter (AI)",
                            variant="secondary",
                            elem_classes=["letter-button"],
                            size="lg"
                        )
                    # Letter output
                    letter_output = gr.Markdown(value="", visible=True)
            # 3 Quick Starters (as required by Part 5).
            # NOTE(review): cache_examples=True runs run_matching for every
            # example when the app builds, so the pipeline must load at startup.
            gr.Examples(
                label="🎯 Quick Starters (1-click examples)",
                examples=EXAMPLES,
                inputs=[
                    job_title,
                    seniority,
                    industry,
                    must_have,
                    nice_to_have,
                    description,
                    responsibilities,
                    requirements,
                    top_k,
                    filter_by_role,
                    filter_by_industry,
                ],
                outputs=[results_table, stats, download_csv],
                fn=run_matching,
                cache_examples=True,
            )
            # Connect Run Matching button
            run_btn.click(
                fn=run_matching,
                inputs=[
                    job_title,
                    seniority,
                    industry,
                    must_have,
                    nice_to_have,
                    description,
                    responsibilities,
                    requirements,
                    top_k,
                    filter_by_role,
                    filter_by_industry,
                ],
                outputs=[results_table, stats, download_csv],
            )
            # Connect Send to Michael button. The displayed table is passed
            # back into the handler as its DataFrame input.
            send_webhook_btn.click(
                fn=send_top_candidate_to_webhook,
                inputs=[results_table],
                outputs=[webhook_status],
            )
            # Connect Generate Letter button
            generate_letter_btn.click(
                fn=generate_interview_invitation,
                inputs=[results_table, job_title],
                outputs=[letter_output],
            )
        # Static documentation tab.
        # NOTE(review): the text below says "Llama 3.1 70B" while the API call
        # uses "llama-3.3-70b-versatile" β€” confirm which model is intended.
        with gr.TabItem("About / How it works"):
            gr.Markdown(
                """
## Pipeline Overview (Assignment Alignment)
**Part 3 produced:**
- Precomputed **resume embeddings** (saved as `.npy`)
- Matching **resume IDs** (saved as `.json`)
- The chosen embedding model: **intfloat/e5-small-v2**
**Part 4 (this Space) does:**
- Loads resumes from the dataset repo (**michaelozon/candidate-matching-synthetic**)
- Loads embeddings + IDs from the Space repository (`./embeddings/...`)
- Accepts **user job input**, builds text in the **same format as Part 3**
- Embeds the job query and computes **cosine similarity**
- Returns **Top-K** candidates with fields (role, skills, seniority, etc.)
### Files expected inside the Space repo
- `embeddings/intfloat__e5-small-v2_resumes.npy`
- `embeddings/intfloat__e5-small-v2_resume_ids.json`
### Notes on scoring
Because the dataset is synthetic and structured, similarity scores are often high (0.8-0.95).
For better filtering, the app includes **optional post-filters** by role and industry.
### Key Features
βœ… Uses precomputed embeddings (no recalculation)
βœ… Same text format as Part 3 (ensures consistency)
βœ… Cosine similarity via normalized embeddings
βœ… Optional post-filtering by role/industry
βœ… CSV export for results
βœ… 3 Quick Starter examples
βœ… **NEW:** Send top candidate directly to Make.com webhook
βœ… **NEW:** AI-generated interview invitation letters (Groq API - Llama 3.1 70B)
### AI Letter Generation
The app uses **Groq API** with **Llama 3.1 70B** to generate personalized interview invitation letters. Each letter is unique thanks to:
- Random tone variations (warm, professional, encouraging, etc.)
- High temperature (0.8) for creativity
- Top-p sampling (0.9) for diverse word choices
- Fast response times (~1 second)
**Setup:** Add your free Groq API key to Space secrets as `GROQ_API_KEY`
"""
            )
    # Footer rendered under the tabs.
    gr.Markdown(
        """
---
**Built for:** Data Science Final Project - Part 4 & 5
**Model:** intfloat/e5-small-v2
**Dataset:** michaelozon/candidate-matching-synthetic
**Integrations:** Make.com webhook β€’ Groq API (Llama 3.1 70B)
"""
    )
# Local launch entry point (HF Spaces also imports `demo` directly).
if __name__ == "__main__":
    demo.launch()