# Source: Hugging Face Space by michaelozon β€” "Update app.py" (1281728 verified)
# app.py
"""
Hugging Face Space (Gradio) - Resume ↔ Job Matching System
==========================================================
This app implements the assignment's "Input β†’ Output Pipeline":
User job input β†’ embed (query) β†’ cosine similarity vs precomputed resume embeddings β†’ Top-K ranked candidates.
It uses:
- pipeline.py (init_pipeline + rank_candidates_for_new_job)
- precomputed embeddings uploaded to the Space repo (./embeddings/*)
- resumes dataset from HF (michaelozon/candidate-matching-synthetic)
NEW FEATURES:
- Send top candidate directly to Make.com webhook
- AI-generated interview invitation letter using Groq API
"""
import os
import re
import tempfile
import requests
import random
from typing import List, Tuple, Optional
import pandas as pd
import gradio as gr
from pipeline import init_pipeline, rank_candidates_for_new_job
# -------------------------
# Config
# -------------------------
# NOTE(review): the webhook URL is hardcoded in source; consider moving it to
# a Space secret (like GROQ_API_KEY below) so it can be rotated without a code change.
WEBHOOK_URL = "https://hook.eu2.make.com/st4h0t3ycjud9llfgnebjyofvg35z8sz"
# Read from Space secrets; None when unset (checked before every Groq call).
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
# -------------------------
# Helpers
# -------------------------
def _parse_list(text: str) -> List[str]:
"""Parse comma/newline-separated text into a clean list."""
if text is None:
return []
text = str(text).strip()
if not text:
return []
parts = re.split(r"[,;\n]+", text)
out = [p.strip() for p in parts if p.strip()]
# de-dup while preserving order
seen = set()
dedup = []
for x in out:
key = x.lower()
if key not in seen:
seen.add(key)
dedup.append(x)
return dedup
def clean_text(value) -> str:
    """Coerce any value to a tidy string: strip ends and collapse internal whitespace."""
    if value is None:
        return ""
    if isinstance(value, (int, float)):
        # Numbers are stringified directly (no whitespace to normalize).
        return str(value)
    stripped = str(value).strip()
    return re.sub(r'\s+', ' ', stripped)
def ensure_list(value) -> List[str]:
    """Coerce a value into a list of cleaned strings."""
    if value is None:
        return []
    if isinstance(value, list):
        return [clean_text(entry) for entry in value]
    if isinstance(value, str) and ',' in value:
        # A comma-separated string is treated as multiple entries.
        return [clean_text(part) for part in value.split(',')]
    # Anything else (plain string, number, ...) becomes a single-element list.
    return [clean_text(value)]
def _format_stats(df: pd.DataFrame) -> str:
"""Format statistics from results DataFrame"""
if df is None or len(df) == 0 or "similarity_score" not in df.columns:
return "No results to summarize."
mn = float(df["similarity_score"].min())
mx = float(df["similarity_score"].max())
avg = float(df["similarity_score"].mean())
return (
f"**Score range:** [{mn:.4f}, {mx:.4f}] \n"
f"**Average score:** {avg:.4f} \n"
f"**Returned candidates:** {len(df)}"
)
def _make_csv(df: pd.DataFrame) -> Optional[str]:
"""Create CSV file from DataFrame for download"""
if df is None or len(df) == 0:
return None
tmpdir = tempfile.mkdtemp()
path = os.path.join(tmpdir, "top_candidates.csv")
df.to_csv(path, index=False, encoding="utf-8")
return path
# -------------------------
# AI Interview Letter Generation (Groq API)
# -------------------------
def generate_interview_invitation(df: pd.DataFrame, job_title: str) -> str:
    """
    Generate a personalized interview invitation letter for the top-ranked
    candidate via the Groq chat-completions API.

    Args:
        df: DataFrame with ranked candidates (rank #1 in the first row).
        job_title: The job title from the search form.

    Returns:
        Markdown containing the generated letter, or a user-facing error
        message. All failure modes return a string rather than raising.
    """
    # --- Input validation: every guard returns early with a friendly message ---
    if df is None or len(df) == 0:
        return "❌ **No candidates available.** Please run a search first."
    if not GROQ_API_KEY:
        return "❌ **Error:** Groq API key not found. Please add GROQ_API_KEY to Space secrets."
    if not job_title or job_title.strip() == "":
        return "❌ **Error:** Job title is required to generate invitation letter."
    # --- Extract top-candidate fields (defensive: the row round-trips through the UI table) ---
    try:
        top_candidate = df.iloc[0]
        candidate_role = clean_text(top_candidate.get('role', 'the position'))
        candidate_seniority = clean_text(top_candidate.get('seniority', ''))
        candidate_industry = clean_text(top_candidate.get('industry', ''))
        candidate_skills = ensure_list(top_candidate.get('skills', []))
        years_exp = int(top_candidate.get('years_experience', 0))
        match_score = float(top_candidate.get('similarity_score', 0))
        # Create skill summary (top 5 skills at most)
        skill_text = ", ".join(candidate_skills[:5]) if candidate_skills else "relevant skills"
    except Exception as e:
        return f"❌ **Error extracting candidate data:** {str(e)}"
    # Randomize the tone so repeated generations produce varied letters.
    tone_variations = [
        "warm and enthusiastic",
        "professional but friendly",
        "encouraging and positive",
        "concise and welcoming",
        "engaging and personable"
    ]
    selected_tone = random.choice(tone_variations)
    # Build the prompt
    user_prompt = f"""Write a {selected_tone} interview invitation letter for a job candidate.
**Position:** {job_title}
**Candidate Background:** {candidate_seniority} {candidate_role} with {years_exp} years of experience in {candidate_industry}
**Key Skills:** {skill_text}
**Match Score:** {match_score:.1%}
**Candidate ID:** {top_candidate.get('resume_id', 'N/A')}
**Requirements:**
- Start with a warm greeting
- Express enthusiasm about their profile
- Mention we found them to be an excellent match for the {job_title} role
- Highlight 1-2 specific skills that stood out ({skill_text})
- Invite them to schedule an interview
- End with a warm closing
- Keep it 150-200 words
- Write in a {selected_tone} tone
- Do NOT use placeholder names like [Candidate Name] or [Your Name] - use the actual ID and name provided
Write ONLY the letter body, no subject line."""
    # --- Call the Groq chat-completions endpoint ---
    try:
        headers = {
            "Authorization": f"Bearer {GROQ_API_KEY}",
            "Content-Type": "application/json"
        }
        payload = {
            "model": "llama-3.3-70b-versatile",
            "messages": [
                {
                    "role": "system",
                    "content": "You are a professional HR recruiter writing interview invitation letters. Write clear, warm, and professional letters."
                },
                {
                    "role": "user",
                    "content": user_prompt
                }
            ],
            # High temperature + top_p keep letters varied between runs.
            "temperature": 0.8,
            "max_tokens": 400,
            "top_p": 0.9
        }
        response = requests.post(
            GROQ_API_URL,
            json=payload,
            headers=headers,
            timeout=15
        )
        if response.status_code != 200:
            # FIX: an error body is not guaranteed to be JSON (e.g. an HTML
            # gateway page) β€” fall back to raw text instead of raising.
            try:
                error_detail = response.json().get('error', {}).get('message', response.text[:200])
            except ValueError:
                error_detail = response.text[:200]
            return f"❌ **API Error ({response.status_code}):** {error_detail}"
        result = response.json()
        letter = result.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
        if not letter or len(letter) < 50:
            return "❌ **Error:** Generated letter is too short or empty. Please try again."
        # Format the output.
        # FIX: footer now names the model actually used (Llama 3.3 70B),
        # matching the "llama-3.3-70b-versatile" payload above.
        output = f"""## ✍️ AI-Generated Interview Invitation
**For:** {candidate_seniority} {candidate_role} | **Match Score:** {match_score:.2%}
---
{letter}
---
πŸ’‘ *Generated by Llama 3.3 70B (via Groq) β€’ Tone: {selected_tone}*
"""
        return output
    except requests.exceptions.Timeout:
        return "❌ **Request timed out.** The API is taking too long. Please try again."
    except requests.exceptions.ConnectionError:
        return "❌ **Connection error.** Unable to reach Groq API. Please check your internet connection."
    except KeyError as e:
        return f"❌ **Response parsing error:** Missing expected field in API response: {str(e)}"
    except Exception as e:
        # Map common failure strings onto friendlier messages.
        error_msg = str(e)
        if "api key" in error_msg.lower() or "unauthorized" in error_msg.lower():
            return "❌ **Authentication error.** Please check your Groq API key in Space secrets."
        elif "rate limit" in error_msg.lower():
            return "❌ **Rate limit exceeded.** Please wait a moment and try again."
        else:
            return f"❌ **Unexpected error:** {error_msg[:200]}"
# -------------------------
# Webhook Integration
# -------------------------
def send_top_candidate_to_webhook(df: pd.DataFrame) -> str:
    """
    Send the top candidate (rank #1) to the Make.com webhook.

    Args:
        df: DataFrame with ranked candidates (rank #1 in the first row).

    Returns:
        Markdown status message for the UI. All failure modes return a
        string rather than raising.
    """
    # Validate input
    if df is None or len(df) == 0:
        return "❌ No candidates to send. Please run a search first."
    # Get top candidate (rank #1)
    top_candidate = df.iloc[0]
    # Build a JSON-serializable payload. Defensive casts because the row
    # round-trips through the Gradio Dataframe component.
    try:
        payload = {
            "resume_id": str(top_candidate.get('resume_id', '')),
            "role": clean_text(top_candidate.get('role', '')),
            "industry": clean_text(top_candidate.get('industry', '')),
            "seniority": clean_text(top_candidate.get('seniority', '')),
            "years_experience": int(top_candidate.get('years_experience', 0)),
            "education": clean_text(top_candidate.get('education', '')),
            "skills": ensure_list(top_candidate.get('skills', [])),
            "summary": clean_text(top_candidate.get('summary', '')),
            "experience_bullets": ensure_list(top_candidate.get('experience_bullets', []))
        }
        # Add similarity score and rank for context (present only after a search).
        if 'similarity_score' in top_candidate:
            payload['similarity_score'] = float(top_candidate['similarity_score'])
        if 'rank' in top_candidate:
            payload['rank'] = int(top_candidate['rank'])
    except Exception as e:
        return f"❌ Error preparing data: {str(e)}"
    # Send to webhook
    try:
        response = requests.post(
            WEBHOOK_URL,
            json=payload,
            headers={'Content-Type': 'application/json'},
            timeout=10
        )
        # FIX: accept any 2xx response β€” webhooks may legitimately reply
        # 201/202/204, which the old `== 200` check reported as a failure.
        if 200 <= response.status_code < 300:
            return (
                f"βœ… **Successfully sent to Michael!**\n\n"
                f"**Candidate:** {payload['role']} ({payload['seniority']})\n"
                f"**Resume ID:** {payload['resume_id']}\n"
                f"**Industry:** {payload['industry']}\n"
                f"**Match Score:** {payload.get('similarity_score', 'N/A')}"
            )
        else:
            return (
                f"⚠️ Webhook responded with status {response.status_code}\n"
                f"Response: {response.text[:200]}"
            )
    except requests.exceptions.Timeout:
        return "❌ Request timed out. The webhook might be slow or unavailable."
    except requests.exceptions.ConnectionError:
        return "❌ Connection error. Please check the webhook URL or your internet connection."
    except Exception as e:
        return f"❌ Error sending to webhook: {str(e)}"
# -------------------------
# Core handler
# -------------------------
def run_matching(
    job_title: str,
    seniority: str,
    industry: str,
    must_have_skills_text: str,
    nice_to_have_skills_text: str,
    description: str,
    responsibilities_text: str,
    requirements_text: str,
    top_k: int,
    filter_by_role: bool,
    filter_by_industry: bool,
) -> Tuple[pd.DataFrame, str, Optional[str]]:
    """
    Gradio handler: parse the form fields, rank candidates, and package outputs.

    Returns:
        - DataFrame with ranked results
        - Statistics markdown
        - CSV file path for download (None when there are no rows)
    """
    # Free-text fields become lists; empty optional lists are passed as None.
    must_have = _parse_list(must_have_skills_text)
    nice_to_have = _parse_list(nice_to_have_skills_text)
    responsibilities = _parse_list(responsibilities_text)
    requirements = _parse_list(requirements_text)
    # Run the embedding + cosine-similarity pipeline.
    df = rank_candidates_for_new_job(
        job_title=job_title,
        seniority=seniority,
        industry=industry,
        must_have_skills=must_have,
        nice_to_have_skills=nice_to_have or None,
        description=description or "",
        responsibilities=responsibilities or None,
        requirements=requirements or None,
        top_k=int(top_k),
        filter_by_role=bool(filter_by_role),
        filter_by_industry=bool(filter_by_industry),
    )
    # Put the most useful columns first; keep any extras at the end.
    preferred = [
        "rank",
        "similarity_score",
        "resume_id",
        "role",
        "seniority",
        "industry",
        "years_experience",
        "education",
        "skills",
        "summary",
    ]
    ordered = [c for c in preferred if c in df.columns]
    ordered += [c for c in df.columns if c not in preferred]
    if len(df):
        df = df[ordered]
    return df, _format_stats(df), _make_csv(df)
# -------------------------
# App initialization
# -------------------------
# Title/subtitle rendered in the page header and the browser tab.
APP_TITLE = "Resume ↔ Job Matching System"
APP_SUBTITLE = "Input β†’ Output Pipeline (Embeddings + Cosine Similarity) β€’ HuggingFace Space Demo"
def _startup_message() -> str:
    """
    Build the initial status markdown shown in the Results panel.
    If embeddings are missing, init_pipeline() will raise and the Space logs
    will show why.
    """
    groq_status = "βœ… Configured" if GROQ_API_KEY else "❌ Not configured"
    lines = [
        "βœ… Pipeline initialized successfully.\n",
        "This Space loads:",
        "- Resumes dataset from HuggingFace",
        "- Precomputed resume embeddings from this Space repo",
        "- Embedding model for query (intfloat/e5-small-v2)",
        f"- Groq API for letter generation: {groq_status}",
    ]
    return "\n".join(lines) + "\n"
# Pre-load everything once so first user request is fast.
# (If something is wrong with files, it will fail early and be visible in logs.)
try:
    init_pipeline(force_reload=False)
    print("βœ… Pipeline loaded successfully at startup")
except Exception as e:
    # Deliberately non-fatal: the pipeline re-initializes on first request.
    print(f"⚠️ Warning: Pipeline initialization failed: {e}")
    print("The app will try to initialize on first request.")
# -------------------------
# Gradio UI
# -------------------------
# Seniority choices for the dropdown (free text is also allowed via
# allow_custom_value). NOTE(review): both "Mid" and "Mid-Level" appear β€”
# confirm which label the dataset uses, or normalize before filtering.
SENIORITY_OPTIONS = ["Junior", "Mid", "Mid-Level", "Senior", "Lead", "Manager"]
# Industry suggestions (user can also type custom)
# NOTE(review): not wired to any component in this file β€” the industry field
# is a plain Textbox. Kept for a future dropdown; verify before removing.
INDUSTRY_SUGGESTIONS = [
    "FinTech",
    "E-commerce",
    "SaaS",
    "Technology",
    "Healthcare",
    "Retail",
    "EdTech",
    "Cloud Services",
    "Design",
    "Gaming",
    "Cybersecurity",
]
# Examples for "Quick Starters" (3 examples as required).
# Each row follows the gr.Examples inputs order:
# [job_title, seniority, industry, must_have, nice_to_have, description,
#  responsibilities, requirements, top_k, filter_by_role, filter_by_industry]
EXAMPLES = [
    [
        "Senior Data Scientist",
        "Senior",
        "FinTech",
        "Python, SQL, Machine Learning",
        "NLP, AWS",
        "Build ML models, run experiments, and support product decisions with data.",
        "Modeling, Experimentation, Stakeholder communication",
        "3+ years DS, Strong Python, Statistics",
        10,
        False,
        False,
    ],
    [
        "UX Designer",
        "Mid-Level",
        "Design",
        "Figma, User Research, Prototyping",
        "",
        "Design user flows and high-fidelity prototypes for a product team.",
        "Wireframes, User interviews, Prototyping",
        "Portfolio, Collaboration, Communication",
        8,
        True,
        False,
    ],
    [
        "Product Manager",
        "Mid-Level",
        "E-commerce",
        "Product Strategy, Roadmapping, SQL",
        "A/B Testing, Analytics",
        "Lead product development and work with cross-functional teams.",
        "Roadmap, Stakeholder management, Prioritization",
        "2+ years PM, Strong communication",
        10,
        False,
        True,
    ],
]
# Build the Gradio app. The custom CSS below only styles the two gradient
# action buttons (webhook send + AI letter).
with gr.Blocks(theme=gr.themes.Soft(), title=APP_TITLE, css="""
.send-button {
background: linear-gradient(90deg, #667eea 0%, #764ba2 100%) !important;
border: none !important;
color: white !important;
font-weight: 600 !important;
padding: 12px 24px !important;
border-radius: 8px !important;
transition: all 0.3s ease !important;
}
.send-button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4) !important;
}
.letter-button {
background: linear-gradient(90deg, #f093fb 0%, #f5576c 100%) !important;
border: none !important;
color: white !important;
font-weight: 600 !important;
padding: 12px 24px !important;
border-radius: 8px !important;
transition: all 0.3s ease !important;
}
.letter-button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 4px 12px rgba(240, 147, 251, 0.4) !important;
}
""") as demo:
    # Page header.
    gr.Markdown(
        f"""
# {APP_TITLE}
**{APP_SUBTITLE}**
This app demonstrates a complete matching pipeline:
1) **User enters job details**
2) We embed the job using **intfloat/e5-small-v2**
3) We compute **cosine similarity** against **precomputed resume embeddings**
4) We return **Top-K ranked candidates** with metadata
> Tip: Use the examples below to see how the pipeline behaves.
"""
    )
    with gr.Tabs():
        with gr.TabItem("Match Candidates (Single Job)"):
            with gr.Row():
                # Left column: the job-description input form.
                with gr.Column(scale=1):
                    gr.Markdown("## Job Input")
                    job_title = gr.Textbox(
                        label="Job Title",
                        placeholder="e.g., Senior Data Scientist"
                    )
                    seniority = gr.Dropdown(
                        choices=SENIORITY_OPTIONS,
                        value="Senior",
                        label="Seniority",
                        allow_custom_value=True,
                    )
                    industry = gr.Textbox(
                        label="Industry",
                        placeholder="e.g., FinTech",
                        value="FinTech",
                    )
                    must_have = gr.Textbox(
                        label="Must-have Skills (comma or new line separated)",
                        placeholder="e.g., Python, SQL, Machine Learning",
                        lines=2,
                    )
                    nice_to_have = gr.Textbox(
                        label="Nice-to-have Skills (optional)",
                        placeholder="e.g., NLP, AWS",
                        lines=2,
                    )
                    description = gr.Textbox(
                        label="Job Description (optional)",
                        placeholder="Short role description...",
                        lines=3,
                    )
                    responsibilities = gr.Textbox(
                        label="Responsibilities (optional) β€” comma/newline separated",
                        placeholder="e.g., Modeling, Experimentation, Stakeholder communication",
                        lines=2,
                    )
                    requirements = gr.Textbox(
                        label="Requirements (optional) β€” comma/newline separated",
                        placeholder="e.g., 3+ years experience, Strong Python",
                        lines=2,
                    )
                    with gr.Row():
                        top_k = gr.Slider(
                            minimum=1,
                            maximum=30,
                            value=10,
                            step=1,
                            label="Top-K results",
                        )
                    with gr.Row():
                        filter_by_role = gr.Checkbox(
                            value=False,
                            label="Post-filter by role keywords (job title words must appear in candidate role)",
                        )
                        filter_by_industry = gr.Checkbox(
                            value=False,
                            label="Post-filter by exact industry match",
                        )
                    run_btn = gr.Button("Run Matching", variant="primary")
                    gr.Markdown(
                        """
### What the filters do
- **Role filter** helps avoid cases where the embedding similarity is high but the role label differs.
- **Industry filter** enforces an exact match on the dataset industry field.
"""
                    )
                # Right column: results table, CSV download, and the two actions.
                with gr.Column(scale=1):
                    gr.Markdown("## Results")
                    stats = gr.Markdown(value=_startup_message())
                    results_table = gr.Dataframe(
                        label="Top Candidates",
                        interactive=False,
                        wrap=True,
                        row_count=10,
                    )
                    # Download CSV button
                    download_csv = gr.File(label="Download CSV (Top Candidates)")
                    # Send to Michael button
                    with gr.Row():
                        send_webhook_btn = gr.Button(
                            "πŸ“€ Send Top Candidate to Michael",
                            variant="secondary",
                            elem_classes=["send-button"],
                            size="lg"
                        )
                    # Webhook status message
                    webhook_status = gr.Markdown(value="", visible=True)
                    # NEW: Generate Interview Letter button
                    with gr.Row():
                        generate_letter_btn = gr.Button(
                            "✍️ Generate Interview Invitation Letter (AI)",
                            variant="secondary",
                            elem_classes=["letter-button"],
                            size="lg"
                        )
                    # Letter output
                    letter_output = gr.Markdown(value="", visible=True)
            # 3 Quick Starters (as required by Part 5).
            # NOTE(review): cache_examples=True runs run_matching for every
            # example when the app builds, so the pipeline must load at startup.
            gr.Examples(
                label="🎯 Quick Starters (1-click examples)",
                examples=EXAMPLES,
                inputs=[
                    job_title,
                    seniority,
                    industry,
                    must_have,
                    nice_to_have,
                    description,
                    responsibilities,
                    requirements,
                    top_k,
                    filter_by_role,
                    filter_by_industry,
                ],
                outputs=[results_table, stats, download_csv],
                fn=run_matching,
                cache_examples=True,
            )
            # Connect Run Matching button
            run_btn.click(
                fn=run_matching,
                inputs=[
                    job_title,
                    seniority,
                    industry,
                    must_have,
                    nice_to_have,
                    description,
                    responsibilities,
                    requirements,
                    top_k,
                    filter_by_role,
                    filter_by_industry,
                ],
                outputs=[results_table, stats, download_csv],
            )
            # Connect Send to Michael button. The displayed table is passed
            # back into the handler as its DataFrame input.
            send_webhook_btn.click(
                fn=send_top_candidate_to_webhook,
                inputs=[results_table],
                outputs=[webhook_status],
            )
            # Connect Generate Letter button
            generate_letter_btn.click(
                fn=generate_interview_invitation,
                inputs=[results_table, job_title],
                outputs=[letter_output],
            )
        # Static documentation tab.
        # NOTE(review): the text below says "Llama 3.1 70B" while the API call
        # uses "llama-3.3-70b-versatile" β€” confirm which model is intended.
        with gr.TabItem("About / How it works"):
            gr.Markdown(
                """
## Pipeline Overview (Assignment Alignment)
**Part 3 produced:**
- Precomputed **resume embeddings** (saved as `.npy`)
- Matching **resume IDs** (saved as `.json`)
- The chosen embedding model: **intfloat/e5-small-v2**
**Part 4 (this Space) does:**
- Loads resumes from the dataset repo (**michaelozon/candidate-matching-synthetic**)
- Loads embeddings + IDs from the Space repository (`./embeddings/...`)
- Accepts **user job input**, builds text in the **same format as Part 3**
- Embeds the job query and computes **cosine similarity**
- Returns **Top-K** candidates with fields (role, skills, seniority, etc.)
### Files expected inside the Space repo
- `embeddings/intfloat__e5-small-v2_resumes.npy`
- `embeddings/intfloat__e5-small-v2_resume_ids.json`
### Notes on scoring
Because the dataset is synthetic and structured, similarity scores are often high (0.8-0.95).
For better filtering, the app includes **optional post-filters** by role and industry.
### Key Features
βœ… Uses precomputed embeddings (no recalculation)
βœ… Same text format as Part 3 (ensures consistency)
βœ… Cosine similarity via normalized embeddings
βœ… Optional post-filtering by role/industry
βœ… CSV export for results
βœ… 3 Quick Starter examples
βœ… **NEW:** Send top candidate directly to Make.com webhook
βœ… **NEW:** AI-generated interview invitation letters (Groq API - Llama 3.1 70B)
### AI Letter Generation
The app uses **Groq API** with **Llama 3.1 70B** to generate personalized interview invitation letters. Each letter is unique thanks to:
- Random tone variations (warm, professional, encouraging, etc.)
- High temperature (0.8) for creativity
- Top-p sampling (0.9) for diverse word choices
- Fast response times (~1 second)
**Setup:** Add your free Groq API key to Space secrets as `GROQ_API_KEY`
"""
            )
    # Footer rendered under the tabs.
    gr.Markdown(
        """
---
**Built for:** Data Science Final Project - Part 4 & 5
**Model:** intfloat/e5-small-v2
**Dataset:** michaelozon/candidate-matching-synthetic
**Integrations:** Make.com webhook β€’ Groq API (Llama 3.1 70B)
"""
    )
# Local launch entry point (HF Spaces also imports `demo` directly).
if __name__ == "__main__":
    demo.launch()