| |
| """ |
| Hugging Face Space (Gradio) - Resume β Job Matching System |
| ========================================================== |
| |
| This app implements the assignment's "Input β Output Pipeline": |
| User job input β embed (query) β cosine similarity vs precomputed resume embeddings β Top-K ranked candidates. |
| |
| It uses: |
| - pipeline.py (init_pipeline + rank_candidates_for_new_job) |
| - precomputed embeddings uploaded to the Space repo (./embeddings/*) |
| - resumes dataset from HF (michaelozon/candidate-matching-synthetic) |
| |
| NEW FEATURES: |
| - Send top candidate directly to Make.com webhook |
| - AI-generated interview invitation letter using Groq API |
| """ |
|
|
| import os |
| import re |
| import tempfile |
| import requests |
| import random |
| from typing import List, Tuple, Optional |
|
|
| import pandas as pd |
| import gradio as gr |
|
|
| from pipeline import init_pipeline, rank_candidates_for_new_job |
|
|
|
|
| |
| |
| |
| WEBHOOK_URL = "https://hook.eu2.make.com/st4h0t3ycjud9llfgnebjyofvg35z8sz" |
| GROQ_API_KEY = os.getenv("GROQ_API_KEY") |
| GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions" |
|
|
|
|
| |
| |
| |
| def _parse_list(text: str) -> List[str]: |
| """Parse comma/newline-separated text into a clean list.""" |
| if text is None: |
| return [] |
| text = str(text).strip() |
| if not text: |
| return [] |
| parts = re.split(r"[,;\n]+", text) |
| out = [p.strip() for p in parts if p.strip()] |
| |
| seen = set() |
| dedup = [] |
| for x in out: |
| key = x.lower() |
| if key not in seen: |
| seen.add(key) |
| dedup.append(x) |
| return dedup |
|
|
|
|
def clean_text(value) -> str:
    """Coerce *value* to a stripped string with internal whitespace collapsed.

    None becomes "", numbers are stringified as-is, and any run of
    whitespace (including newlines) in text collapses to a single space.
    """
    if value is None:
        return ""
    if isinstance(value, (int, float)):
        return str(value)
    return re.sub(r"\s+", " ", str(value).strip())
|
|
|
|
def ensure_list(value) -> List[str]:
    """Normalize *value* into a list of cleaned strings.

    Lists are cleaned element-wise; comma-separated strings are split;
    everything else (including None → []) becomes a single-item list.
    """
    if value is None:
        return []
    if isinstance(value, list):
        return [clean_text(item) for item in value]
    if isinstance(value, str) and "," in value:
        return [clean_text(part) for part in value.split(",")]
    return [clean_text(value)]
|
|
|
|
| def _format_stats(df: pd.DataFrame) -> str: |
| """Format statistics from results DataFrame""" |
| if df is None or len(df) == 0 or "similarity_score" not in df.columns: |
| return "No results to summarize." |
| mn = float(df["similarity_score"].min()) |
| mx = float(df["similarity_score"].max()) |
| avg = float(df["similarity_score"].mean()) |
| return ( |
| f"**Score range:** [{mn:.4f}, {mx:.4f}] \n" |
| f"**Average score:** {avg:.4f} \n" |
| f"**Returned candidates:** {len(df)}" |
| ) |
|
|
|
|
| def _make_csv(df: pd.DataFrame) -> Optional[str]: |
| """Create CSV file from DataFrame for download""" |
| if df is None or len(df) == 0: |
| return None |
| tmpdir = tempfile.mkdtemp() |
| path = os.path.join(tmpdir, "top_candidates.csv") |
| df.to_csv(path, index=False, encoding="utf-8") |
| return path |
|
|
|
|
| |
| |
| |
def generate_interview_invitation(df: pd.DataFrame, job_title: str) -> str:
    """
    Generate a personalized interview invitation letter for the top-ranked
    candidate using the Groq chat-completions API.

    Args:
        df: DataFrame with ranked candidates (row 0 is the top match).
        job_title: The job title from the search; required for the letter.

    Returns:
        Markdown containing the generated letter, or a user-facing error
        message (every error string starts with "❌").
    """
    # --- Guard clauses -----------------------------------------------------
    if df is None or len(df) == 0:
        return "❌ **No candidates available.** Please run a search first."

    if not GROQ_API_KEY:
        return "❌ **Error:** Groq API key not found. Please add GROQ_API_KEY to Space secrets."

    if not job_title or job_title.strip() == "":
        return "❌ **Error:** Job title is required to generate invitation letter."

    # --- Extract the top candidate's profile -------------------------------
    try:
        top_candidate = df.iloc[0]

        candidate_role = clean_text(top_candidate.get('role', 'the position'))
        candidate_seniority = clean_text(top_candidate.get('seniority', ''))
        candidate_industry = clean_text(top_candidate.get('industry', ''))
        candidate_skills = ensure_list(top_candidate.get('skills', []))
        years_exp = int(top_candidate.get('years_experience', 0))
        match_score = float(top_candidate.get('similarity_score', 0))

        # At most five skills keeps the prompt focused.
        skill_text = ", ".join(candidate_skills[:5]) if candidate_skills else "relevant skills"

    except Exception as e:
        return f"❌ **Error extracting candidate data:** {str(e)}"

    # A random tone keeps repeated generations from sounding identical.
    tone_variations = [
        "warm and enthusiastic",
        "professional but friendly",
        "encouraging and positive",
        "concise and welcoming",
        "engaging and personable",
    ]
    selected_tone = random.choice(tone_variations)

    user_prompt = f"""Write a {selected_tone} interview invitation letter for a job candidate.

**Position:** {job_title}
**Candidate Background:** {candidate_seniority} {candidate_role} with {years_exp} years of experience in {candidate_industry}
**Key Skills:** {skill_text}
**Match Score:** {match_score:.1%}
**Candidate ID:** {top_candidate.get('resume_id', 'N/A')}


**Requirements:**
- Start with a warm greeting
- Express enthusiasm about their profile
- Mention we found them to be an excellent match for the {job_title} role
- Highlight 1-2 specific skills that stood out ({skill_text})
- Invite them to schedule an interview
- End with a warm closing
- Keep it 150-200 words
- Write in a {selected_tone} tone
- Do NOT use placeholder names like [Candidate Name] or [Your Name] - use the actual ID and name provided


Write ONLY the letter body, no subject line."""

    # --- Call Groq (OpenAI-compatible chat completions endpoint) ------------
    try:
        headers = {
            "Authorization": f"Bearer {GROQ_API_KEY}",
            "Content-Type": "application/json"
        }

        payload = {
            "model": "llama-3.3-70b-versatile",
            "messages": [
                {
                    "role": "system",
                    "content": "You are a professional HR recruiter writing interview invitation letters. Write clear, warm, and professional letters."
                },
                {
                    "role": "user",
                    "content": user_prompt
                }
            ],
            # High temperature + top-p sampling for varied, creative letters.
            "temperature": 0.8,
            "max_tokens": 400,
            "top_p": 0.9
        }

        response = requests.post(
            GROQ_API_URL,
            json=payload,
            headers=headers,
            timeout=15
        )

        if response.status_code != 200:
            error_detail = response.json().get('error', {}).get('message', response.text[:200])
            return f"❌ **API Error ({response.status_code}):** {error_detail}"

        result = response.json()
        letter = result.get("choices", [{}])[0].get("message", {}).get("content", "").strip()

        # Very short output almost certainly means a truncated/failed generation.
        if not letter or len(letter) < 50:
            return "❌ **Error:** Generated letter is too short or empty. Please try again."

        # NOTE: the footer names the model actually requested in `payload` above.
        output = f"""## ✉️ AI-Generated Interview Invitation

**For:** {candidate_seniority} {candidate_role} | **Match Score:** {match_score:.2%}

---

{letter}

---

💡 *Generated by Llama 3.3 70B (via Groq) • Tone: {selected_tone}*
"""

        return output

    except requests.exceptions.Timeout:
        return "❌ **Request timed out.** The API is taking too long. Please try again."

    except requests.exceptions.ConnectionError:
        return "❌ **Connection error.** Unable to reach Groq API. Please check your internet connection."

    except KeyError as e:
        return f"❌ **Response parsing error:** Missing expected field in API response: {str(e)}"

    except Exception as e:
        error_msg = str(e)
        if "api key" in error_msg.lower() or "unauthorized" in error_msg.lower():
            return "❌ **Authentication error.** Please check your Groq API key in Space secrets."
        elif "rate limit" in error_msg.lower():
            return "❌ **Rate limit exceeded.** Please wait a moment and try again."
        else:
            return f"❌ **Unexpected error:** {error_msg[:200]}"
|
|
|
|
| |
| |
| |
def send_top_candidate_to_webhook(df: pd.DataFrame) -> str:
    """
    Send the top candidate (rank #1) to the Make.com webhook.

    Args:
        df: DataFrame with ranked candidates (row 0 is sent).

    Returns:
        Status message for the UI, prefixed with ✅ / ⚠️ / ❌.
    """
    if df is None or len(df) == 0:
        return "❌ No candidates to send. Please run a search first."

    top_candidate = df.iloc[0]

    # --- Build a JSON-safe payload from the candidate row --------------------
    try:
        payload = {
            "resume_id": str(top_candidate.get('resume_id', '')),
            "role": clean_text(top_candidate.get('role', '')),
            "industry": clean_text(top_candidate.get('industry', '')),
            "seniority": clean_text(top_candidate.get('seniority', '')),
            "years_experience": int(top_candidate.get('years_experience', 0)),
            "education": clean_text(top_candidate.get('education', '')),
            "skills": ensure_list(top_candidate.get('skills', [])),
            "summary": clean_text(top_candidate.get('summary', '')),
            "experience_bullets": ensure_list(top_candidate.get('experience_bullets', []))
        }

        # Ranking metadata is optional (only present after a search ran).
        if 'similarity_score' in top_candidate:
            payload['similarity_score'] = float(top_candidate['similarity_score'])
        if 'rank' in top_candidate:
            payload['rank'] = int(top_candidate['rank'])

    except Exception as e:
        return f"❌ Error preparing data: {str(e)}"

    # --- POST to the webhook -------------------------------------------------
    try:
        response = requests.post(
            WEBHOOK_URL,
            json=payload,
            headers={'Content-Type': 'application/json'},
            timeout=10
        )

        if response.status_code == 200:
            return (
                f"✅ **Successfully sent to Michael!**\n\n"
                f"**Candidate:** {payload['role']} ({payload['seniority']})\n"
                f"**Resume ID:** {payload['resume_id']}\n"
                f"**Industry:** {payload['industry']}\n"
                f"**Match Score:** {payload.get('similarity_score', 'N/A')}"
            )
        else:
            return (
                f"⚠️ Webhook responded with status {response.status_code}\n"
                f"Response: {response.text[:200]}"
            )

    except requests.exceptions.Timeout:
        return "❌ Request timed out. The webhook might be slow or unavailable."
    except requests.exceptions.ConnectionError:
        return "❌ Connection error. Please check the webhook URL or your internet connection."
    except Exception as e:
        return f"❌ Error sending to webhook: {str(e)}"
|
|
|
|
| |
| |
| |
def run_matching(
    job_title: str,
    seniority: str,
    industry: str,
    must_have_skills_text: str,
    nice_to_have_skills_text: str,
    description: str,
    responsibilities_text: str,
    requirements_text: str,
    top_k: int,
    filter_by_role: bool,
    filter_by_industry: bool,
) -> Tuple[pd.DataFrame, str, Optional[str]]:
    """Gradio entry point: rank candidates for the job described by the inputs.

    Returns:
        A tuple of (results DataFrame, statistics Markdown, CSV path or None).
    """
    # Free-text fields arrive as comma/newline-separated strings.
    must_have = _parse_list(must_have_skills_text)
    nice_to_have = _parse_list(nice_to_have_skills_text)
    responsibilities = _parse_list(responsibilities_text)
    requirements = _parse_list(requirements_text)

    df = rank_candidates_for_new_job(
        job_title=job_title,
        seniority=seniority,
        industry=industry,
        must_have_skills=must_have,
        nice_to_have_skills=nice_to_have or None,
        description=description or "",
        responsibilities=responsibilities or None,
        requirements=requirements or None,
        top_k=int(top_k),
        filter_by_role=bool(filter_by_role),
        filter_by_industry=bool(filter_by_industry),
    )

    # Put the most useful columns first; any extras follow in original order.
    preferred_cols = [
        "rank",
        "similarity_score",
        "resume_id",
        "role",
        "seniority",
        "industry",
        "years_experience",
        "education",
        "skills",
        "summary",
    ]
    ordered = [c for c in preferred_cols if c in df.columns]
    ordered.extend(c for c in df.columns if c not in preferred_cols)
    if len(df):
        df = df[ordered]

    return df, _format_stats(df), _make_csv(df)
|
|
|
|
| |
| |
| |
| APP_TITLE = "Resume β Job Matching System" |
| APP_SUBTITLE = "Input β Output Pipeline (Embeddings + Cosine Similarity) β’ HuggingFace Space Demo" |
|
|
|
|
def _startup_message() -> str:
    """
    Lightweight startup status for the UI.
    If embeddings are missing, init_pipeline() will raise and Space logs will show why.
    """
    # Groq is optional: matching works without it, letter generation does not.
    groq_status = "✅ Configured" if GROQ_API_KEY else "❌ Not configured"
    return (
        "✅ Pipeline initialized successfully.\n\n"
        "This Space loads:\n"
        "- Resumes dataset from HuggingFace\n"
        "- Precomputed resume embeddings from this Space repo\n"
        "- Embedding model for query (intfloat/e5-small-v2)\n"
        f"- Groq API for letter generation: {groq_status}\n"
    )
|
|
|
|
| |
| |
# Warm the pipeline eagerly at import time so the first user request is fast.
# Failure is non-fatal: pipeline.py retries initialization on first use.
try:
    init_pipeline(force_reload=False)
    print("✅ Pipeline loaded successfully at startup")
except Exception as e:
    print(f"⚠️ Warning: Pipeline initialization failed: {e}")
    print("The app will try to initialize on first request.")
|
|
|
|
| |
| |
| |
# Seniority levels offered in the dropdown (custom values are also allowed
# because the Dropdown is created with allow_custom_value=True).
SENIORITY_OPTIONS = ["Junior", "Mid", "Mid-Level", "Senior", "Lead", "Manager"]


# Industry names surfaced as suggestions for the free-text industry field.
INDUSTRY_SUGGESTIONS = [
    "FinTech",
    "E-commerce",
    "SaaS",
    "Technology",
    "Healthcare",
    "Retail",
    "EdTech",
    "Cloud Services",
    "Design",
    "Gaming",
    "Cybersecurity",
]


# One-click example rows for gr.Examples. Column order must match the
# `inputs` list wired to run_matching: job_title, seniority, industry,
# must-have skills, nice-to-have skills, description, responsibilities,
# requirements, top_k, filter_by_role, filter_by_industry.
EXAMPLES = [
    [
        "Senior Data Scientist",
        "Senior",
        "FinTech",
        "Python, SQL, Machine Learning",
        "NLP, AWS",
        "Build ML models, run experiments, and support product decisions with data.",
        "Modeling, Experimentation, Stakeholder communication",
        "3+ years DS, Strong Python, Statistics",
        10,
        False,
        False,
    ],
    [
        "UX Designer",
        "Mid-Level",
        "Design",
        "Figma, User Research, Prototyping",
        "",
        "Design user flows and high-fidelity prototypes for a product team.",
        "Wireframes, User interviews, Prototyping",
        "Portfolio, Collaboration, Communication",
        8,
        True,
        False,
    ],
    [
        "Product Manager",
        "Mid-Level",
        "E-commerce",
        "Product Strategy, Roadmapping, SQL",
        "A/B Testing, Analytics",
        "Lead product development and work with cross-functional teams.",
        "Roadmap, Stakeholder management, Prioritization",
        "2+ years PM, Strong communication",
        10,
        False,
        True,
    ],
]
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI: a matching tab plus an "About" tab. The custom CSS styles the
# two gradient action buttons (webhook send + AI letter generation).
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft(), title=APP_TITLE, css="""
.send-button {
    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%) !important;
    border: none !important;
    color: white !important;
    font-weight: 600 !important;
    padding: 12px 24px !important;
    border-radius: 8px !important;
    transition: all 0.3s ease !important;
}
.send-button:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4) !important;
}
.letter-button {
    background: linear-gradient(90deg, #f093fb 0%, #f5576c 100%) !important;
    border: none !important;
    color: white !important;
    font-weight: 600 !important;
    padding: 12px 24px !important;
    border-radius: 8px !important;
    transition: all 0.3s ease !important;
}
.letter-button:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 4px 12px rgba(240, 147, 251, 0.4) !important;
}
""") as demo:
    gr.Markdown(
        f"""
        # {APP_TITLE}
        **{APP_SUBTITLE}**

        This app demonstrates a complete matching pipeline:
        1) **User enters job details**
        2) We embed the job using **intfloat/e5-small-v2**
        3) We compute **cosine similarity** against **precomputed resume embeddings**
        4) We return **Top-K ranked candidates** with metadata

        > Tip: Use the examples below to see how the pipeline behaves.
        """
    )

    with gr.Tabs():
        with gr.TabItem("Match Candidates (Single Job)"):
            with gr.Row():
                # Left column: all job-description inputs.
                with gr.Column(scale=1):
                    gr.Markdown("## Job Input")

                    job_title = gr.Textbox(
                        label="Job Title",
                        placeholder="e.g., Senior Data Scientist"
                    )
                    seniority = gr.Dropdown(
                        choices=SENIORITY_OPTIONS,
                        value="Senior",
                        label="Seniority",
                        allow_custom_value=True,
                    )
                    industry = gr.Textbox(
                        label="Industry",
                        placeholder="e.g., FinTech",
                        value="FinTech",
                    )

                    must_have = gr.Textbox(
                        label="Must-have Skills (comma or new line separated)",
                        placeholder="e.g., Python, SQL, Machine Learning",
                        lines=2,
                    )
                    nice_to_have = gr.Textbox(
                        label="Nice-to-have Skills (optional)",
                        placeholder="e.g., NLP, AWS",
                        lines=2,
                    )

                    description = gr.Textbox(
                        label="Job Description (optional)",
                        placeholder="Short role description...",
                        lines=3,
                    )
                    responsibilities = gr.Textbox(
                        label="Responsibilities (optional) — comma/newline separated",
                        placeholder="e.g., Modeling, Experimentation, Stakeholder communication",
                        lines=2,
                    )
                    requirements = gr.Textbox(
                        label="Requirements (optional) — comma/newline separated",
                        placeholder="e.g., 3+ years experience, Strong Python",
                        lines=2,
                    )

                    with gr.Row():
                        top_k = gr.Slider(
                            minimum=1,
                            maximum=30,
                            value=10,
                            step=1,
                            label="Top-K results",
                        )

                    with gr.Row():
                        filter_by_role = gr.Checkbox(
                            value=False,
                            label="Post-filter by role keywords (job title words must appear in candidate role)",
                        )
                        filter_by_industry = gr.Checkbox(
                            value=False,
                            label="Post-filter by exact industry match",
                        )

                    run_btn = gr.Button("Run Matching", variant="primary")

                    gr.Markdown(
                        """
                        ### What the filters do
                        - **Role filter** helps avoid cases where the embedding similarity is high but the role label differs.
                        - **Industry filter** enforces an exact match on the dataset industry field.
                        """
                    )

                # Right column: results table, stats, export, and integrations.
                with gr.Column(scale=1):
                    gr.Markdown("## Results")

                    stats = gr.Markdown(value=_startup_message())
                    results_table = gr.Dataframe(
                        label="Top Candidates",
                        interactive=False,
                        wrap=True,
                        row_count=10,
                    )

                    download_csv = gr.File(label="Download CSV (Top Candidates)")

                    with gr.Row():
                        send_webhook_btn = gr.Button(
                            "📤 Send Top Candidate to Michael",
                            variant="secondary",
                            elem_classes=["send-button"],
                            size="lg"
                        )

                    webhook_status = gr.Markdown(value="", visible=True)

                    with gr.Row():
                        generate_letter_btn = gr.Button(
                            "✉️ Generate Interview Invitation Letter (AI)",
                            variant="secondary",
                            elem_classes=["letter-button"],
                            size="lg"
                        )

                    letter_output = gr.Markdown(value="", visible=True)

            # One-click example rows; cached so they render instantly.
            gr.Examples(
                label="🎯 Quick Starters (1-click examples)",
                examples=EXAMPLES,
                inputs=[
                    job_title,
                    seniority,
                    industry,
                    must_have,
                    nice_to_have,
                    description,
                    responsibilities,
                    requirements,
                    top_k,
                    filter_by_role,
                    filter_by_industry,
                ],
                outputs=[results_table, stats, download_csv],
                fn=run_matching,
                cache_examples=True,
            )

            # Event wiring: search, webhook send, and AI letter generation.
            run_btn.click(
                fn=run_matching,
                inputs=[
                    job_title,
                    seniority,
                    industry,
                    must_have,
                    nice_to_have,
                    description,
                    responsibilities,
                    requirements,
                    top_k,
                    filter_by_role,
                    filter_by_industry,
                ],
                outputs=[results_table, stats, download_csv],
            )

            send_webhook_btn.click(
                fn=send_top_candidate_to_webhook,
                inputs=[results_table],
                outputs=[webhook_status],
            )

            generate_letter_btn.click(
                fn=generate_interview_invitation,
                inputs=[results_table, job_title],
                outputs=[letter_output],
            )

        with gr.TabItem("About / How it works"):
            gr.Markdown(
                """
                ## Pipeline Overview (Assignment Alignment)

                **Part 3 produced:**
                - Precomputed **resume embeddings** (saved as `.npy`)
                - Matching **resume IDs** (saved as `.json`)
                - The chosen embedding model: **intfloat/e5-small-v2**

                **Part 4 (this Space) does:**
                - Loads resumes from the dataset repo (**michaelozon/candidate-matching-synthetic**)
                - Loads embeddings + IDs from the Space repository (`./embeddings/...`)
                - Accepts **user job input**, builds text in the **same format as Part 3**
                - Embeds the job query and computes **cosine similarity**
                - Returns **Top-K** candidates with fields (role, skills, seniority, etc.)

                ### Files expected inside the Space repo
                - `embeddings/intfloat__e5-small-v2_resumes.npy`
                - `embeddings/intfloat__e5-small-v2_resume_ids.json`

                ### Notes on scoring
                Because the dataset is synthetic and structured, similarity scores are often high (0.8-0.95).
                For better filtering, the app includes **optional post-filters** by role and industry.

                ### Key Features
                ✅ Uses precomputed embeddings (no recalculation)
                ✅ Same text format as Part 3 (ensures consistency)
                ✅ Cosine similarity via normalized embeddings
                ✅ Optional post-filtering by role/industry
                ✅ CSV export for results
                ✅ 3 Quick Starter examples
                ✅ **NEW:** Send top candidate directly to Make.com webhook
                ✅ **NEW:** AI-generated interview invitation letters (Groq API - Llama 3.3 70B)

                ### AI Letter Generation
                The app uses **Groq API** with **Llama 3.3 70B** to generate personalized interview invitation letters. Each letter is unique thanks to:
                - Random tone variations (warm, professional, encouraging, etc.)
                - High temperature (0.8) for creativity
                - Top-p sampling (0.9) for diverse word choices
                - Fast response times (~1 second)

                **Setup:** Add your free Groq API key to Space secrets as `GROQ_API_KEY`
                """
            )

    gr.Markdown(
        """
        ---
        **Built for:** Data Science Final Project - Part 4 & 5
        **Model:** intfloat/e5-small-v2
        **Dataset:** michaelozon/candidate-matching-synthetic
        **Integrations:** Make.com webhook • Groq API (Llama 3.3 70B)
        """
    )


if __name__ == "__main__":
    demo.launch()