"""Gradio app that shortlists candidates by past job roles and ranks them with an LLM.

Workflow:
  1. ``filter_by_roles`` keeps candidates from ``JSON_FILE`` who have held at
     least one role listed for the chosen category and caches them in
     ``FILTERED_CSV``.
  2. ``llm_recommendations`` asks the hosted model for a 1-10 suitability
     score per candidate and reports the top five (highest score first,
     lowest expected salary as tie-breaker).
"""
import json
import math
import os
import re
from functools import lru_cache

import gradio as gr
import pandas as pd
import requests

# ----------------------------
# CONFIG
# ----------------------------
JSON_FILE = "form-submissions-1.json"
MODEL_ID = "google/flan-t5-small"
# NOTE: HF_API_TOKEN MUST be set in your environment variables/Space secrets.
HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
FILTERED_CSV = "/tmp/filtered_candidates.csv"
OUTPUT_FILE = "/tmp/outputs.csv"
BATCH_SIZE = 50

if not HF_API_TOKEN:
    # Surface the misconfiguration at startup instead of silently ignoring it.
    print("Warning: HF_API_TOKEN is not set. LLM scoring will return 0 for every candidate.")

CATEGORIES = {
    "AI": [
        "AI/ML Ops Engineer", "Senior Machine Learning Engineer", "Principal Data Scientist",
        "Senior Data Scientist", "Machine Learning Research Scientist", "Senior AI/ML Engineer",
        "AI/ML Engineer", "Big Data Engineer", "AI Research Scientist", "AI Research Analyst Consultant",
        "AI Analyst", "Senior Data Analyst", "Automation Engineer", "Senior Data Engineer",
        "Machine Learning Engineer", "Data Engineer", "Data Scientist", "Data Analyst",
    ],
    "Marketing": ["Marketing Specialist", "Sales Agent", "Salesman", "Sales Associate"],
    "CTO": ["Chief Technology Officer", "CTO"],
    "Legal": ["Legal Specialist", "Attorney", "Legal Intern", "Lawyer"],
    "Finance": ["Financial Analyst", "Financial Advisor"],
}

# First run of digits in the model's reply is taken as the score.
_SCORE_PATTERN = re.compile(r"\d+")

# Message returned when a callback fires before a category has been chosen.
_NO_CATEGORY_MSG = "Please select a category first."


# ----------------------------
# LLM Call for Scoring (Focus: Role Experience ONLY)
# ----------------------------
@lru_cache(maxsize=512)
def _request_score(candidate_str, category_name, job_titles_tuple):
    """Query the hosted model once and return the parsed score.

    Only successful calls end up in the cache: any HTTP, network or
    response-shape error propagates as an exception, and ``lru_cache`` does
    not store raised exceptions, so a later retry hits the API again.

    Returns:
        The score clamped to 1..10, or 0 when the reply contains no digits.
    """
    # Wording kept identical to the original prompt; only split for line length.
    prompt = (
        " You are an HR assistant. Your task is to rate a candidate's suitability"
        " based ONLY on their previous job roles."
        " Rate the suitability of the following candidate on a scale of"
        " 1 (Lowest) to 10 (Highest)."
        " The score must reflect how closely the candidate's 'Roles' align with"
        " the target job titles."
        f" The target roles for the '{category_name}' category are:"
        f" {list(job_titles_tuple)}"
        f" Candidate JSON: {candidate_str}"
        " **Task**: Respond ONLY with the rating number"
        " (an integer from 1 to 10). \n"
    )
    headers = {
        "Authorization": f"Bearer {HF_API_TOKEN}",
        "Content-Type": "application/json",
    }
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 5,
            "return_full_text": False,
            "temperature": 0.1,
        },
    }

    response = requests.post(
        f"https://api-inference.huggingface.co/models/{MODEL_ID}",
        headers=headers,
        data=json.dumps(payload),
        timeout=60,
    )
    response.raise_for_status()
    result = response.json()
    generated_text = result[0].get("generated_text", "0").strip()

    match = _SCORE_PATTERN.search(generated_text)
    if match is None:
        return 0
    # Clamp so an out-of-range reply (e.g. "0" or "15") still fits the scale.
    return max(1, min(10, int(match.group(0))))


def score_candidate(candidate_str, category_name, job_titles_tuple):
    """Return the LLM suitability score for one candidate.

    Args:
        candidate_str: JSON string describing the candidate.
        category_name: Name of the target category (used in the prompt).
        job_titles_tuple: Target job titles as a tuple (must be hashable,
            because it is part of the cache key).

    Returns:
        An integer 1..10 on success; 0 when the token is missing, the API
        call fails, or the reply contains no number.
    """
    if not HF_API_TOKEN:
        print("API Token is missing. Returning score 0.")
        return 0
    try:
        return _request_score(candidate_str, category_name, job_titles_tuple)
    except Exception as e:  # best-effort boundary around a remote call
        print(f"LLM scoring call failed for candidate (API/Network Error): {e}")
        return 0


# ----------------------------
# Step 1: Filter by roles
# ----------------------------
def _non_fullstack_roles(work_experiences):
    """Return the role names from *work_experiences*, minus full-stack developer roles."""
    roles = []
    for exp in work_experiences:
        role = exp.get("roleName") if isinstance(exp, dict) else None
        if isinstance(role, str) and role and "full stack developer" not in role.lower():
            roles.append(role)
    return roles


def filter_by_roles(category_name):
    """Select candidates who have held at least one role from the category.

    Reads ``JSON_FILE``, ignores "full stack developer" roles, and keeps every
    person with at least one remaining role that exactly matches a title in
    ``CATEGORIES[category_name]``. A non-empty result is also written to
    ``FILTERED_CSV``.

    Args:
        category_name: Key of ``CATEGORIES``; ``None`` or an unknown key
            yields an empty result rather than raising.

    Returns:
        ``(DataFrame, status_message)``. The DataFrame is empty whenever the
        message reports an error or no match.
    """
    job_titles = CATEGORIES.get(category_name)
    if job_titles is None:
        return pd.DataFrame(), _NO_CATEGORY_MSG

    try:
        with open(JSON_FILE, encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        return pd.DataFrame(), f"Error: JSON file '{JSON_FILE}' not found. The LLM can't proceed."
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and decoding errors.
        return pd.DataFrame(), f"Error: failed to load JSON file '{JSON_FILE}': {e}"

    wanted_titles = set(job_titles)
    filtered = []
    for person in data:
        # ``or`` guards against explicit nulls in the JSON, not just missing keys.
        roles = _non_fullstack_roles(person.get("work_experiences") or [])
        if not any(role in wanted_titles for role in roles):
            continue
        salary_info = person.get("annual_salary_expectation") or {}
        filtered.append({
            "Name": person.get("name"),
            "Email": person.get("email"),
            "Phone": person.get("phone"),
            "Location": person.get("location"),
            "Roles": ", ".join(roles),
            "Skills": ", ".join(map(str, person.get("skills") or [])),
            "Salary": salary_info.get("full-time", "N/A"),
            "Category": category_name,
        })

    if not filtered:
        return (
            pd.DataFrame(),
            f"No candidates found matching roles for category '{category_name}'. The LLM can't proceed.",
        )

    df = pd.DataFrame(filtered)
    df.to_csv(FILTERED_CSV, index=False)
    return df, f"{len(df)} candidates filtered by role for category '{category_name}'. Ready for LLM scoring."


# ----------------------------
# Step 2: LLM recommendations (Scoring, Sorting, and Output)
# ----------------------------
def _parse_salary(value):
    """Convert a salary such as "$120,000" to a float; unparseable values sort last."""
    try:
        amount = float(str(value).replace("$", "").replace(",", "").strip())
    except ValueError:
        return float("inf")
    # A missing salary can come back from the CSV as NaN; treat it like "N/A".
    return float("inf") if math.isnan(amount) else amount


def _load_filtered_candidates(category_name):
    """Return ``(DataFrame, message)`` with the filtered candidates for the category.

    Uses the rows cached in ``FILTERED_CSV`` when it holds any for this
    category; otherwise (file missing, unreadable, or written for another
    category) runs ``filter_by_roles`` so the result does not depend on
    leftover state.
    """
    if os.path.exists(FILTERED_CSV):
        try:
            cached = pd.read_csv(FILTERED_CSV)
        except (OSError, ValueError):
            # pandas' EmptyDataError / ParserError are ValueError subclasses.
            cached = pd.DataFrame()
        if "Category" in cached.columns:
            cached = cached[cached["Category"] == category_name]
            if not cached.empty:
                return cached.copy(), ""
    return filter_by_roles(category_name)


def llm_recommendations(category_name):
    """Score the filtered candidates with the LLM and summarise the top five.

    Args:
        category_name: Key of ``CATEGORIES`` selected in the UI.

    Returns:
        A human-readable string: either the ranked list or an explanation of
        why no recommendation could be produced.
    """
    job_titles = CATEGORIES.get(category_name)
    if job_titles is None:
        return _NO_CATEGORY_MSG

    df_filtered, msg = _load_filtered_candidates(category_name)
    if df_filtered.empty:
        return msg

    # Prepare for scoring: NaN is not valid JSON, so blank cells become "N/A".
    job_titles_key = tuple(job_titles)
    records = df_filtered.fillna("N/A").to_dict(orient="records")
    scores = [
        score_candidate(
            json.dumps({
                "Name": person.get("Name"),
                "Roles": person.get("Roles"),
                "Skills": person.get("Skills"),
            }),
            category_name,
            job_titles_key,
        )
        for person in records
    ]

    # Work on a copy so the new column never lands on a slice of another frame.
    df_scored = df_filtered.copy()
    df_scored["LLM_Score"] = scores

    df_recommended = df_scored[df_scored["LLM_Score"] > 0].copy()
    if df_recommended.empty:
        if not HF_API_TOKEN:
            return "❌ LLM failed: The HF_API_TOKEN is not set or is invalid. Set the token and try again."
        return f"LLM scored all candidates 0. The candidates' roles are deemed irrelevant by the LLM for '{category_name}'."

    df_recommended["Salary_sort"] = df_recommended["Salary"].apply(_parse_salary)
    df_top5 = df_recommended.sort_values(
        by=["LLM_Score", "Salary_sort"],
        ascending=[False, True],
    ).head(5)

    names = df_top5["Name"].tolist()
    top_scores = df_top5["LLM_Score"].tolist()
    header = f"Top {len(names)} Recommended Candidates for the '{category_name}' Category:\n\n"
    ranking = "".join(
        f"{rank}. {name} (Suitability Score: {score}/10)\n"
        for rank, (name, score) in enumerate(zip(names, top_scores), start=1)
    )
    footer = (
        "\nThese candidates were ranked by the LLM based **only on the alignment of their "
        "previous job roles** with the target roles, using expected salary as a tie-breaker."
    )
    return header + ranking + footer


# ----------------------------
# Show first 5 raw JSON candidates
# ----------------------------
def show_first_candidates():
    """Return the first five raw entries of ``JSON_FILE`` as a DataFrame.

    On any failure a one-row DataFrame with an ``Error`` column is returned
    instead, so the UI preview can always render something.
    """
    try:
        with open(JSON_FILE, encoding="utf-8") as f:
            data = json.load(f)
        return pd.DataFrame(data[:5])
    except FileNotFoundError:
        return pd.DataFrame({"Error": [f"JSON file '{JSON_FILE}' not found. Please ensure it is present."]})
    except Exception as e:
        return pd.DataFrame({"Error": [f"Failed to load JSON: {e}"]})


# ----------------------------
# Gradio interface
# ----------------------------
with gr.Blocks() as app:
    gr.Markdown("# 🤖 Candidate Selection (Role-Based Scoring)")

    gr.Markdown("#### 🔍 Raw JSON Preview: First 5 Candidates")
    gr.Dataframe(show_first_candidates(), label="First 5 JSON Entries")
    gr.Markdown("---")

    category_dropdown = gr.Dropdown(list(CATEGORIES.keys()), label="1. Select Category")

    # Step 1: Filter by roles
    filter_button = gr.Button("2. Filter Candidates by Roles")
    filtered_df = gr.Dataframe(label="Filtered Candidates (Preview)")
    filter_status = gr.Textbox(
        label="Filter Status",
        placeholder="Click 'Filter Candidates by Roles' to start.",
    )
    filter_button.click(
        filter_by_roles,
        inputs=[category_dropdown],
        outputs=[filtered_df, filter_status],
    )
    gr.Markdown("---")

    # Step 2: LLM Recommendations
    llm_button = gr.Button("3. Get LLM Recommendations (Role Experience Ranking)")
    llm_output_text = gr.Textbox(
        label="Top Candidate Recommendations Summary",
        lines=10,
        placeholder="Click 'Get LLM Recommendations' after Step 2 completes.",
    )
    llm_button.click(
        llm_recommendations,
        inputs=[category_dropdown],
        outputs=[llm_output_text],
    )

if __name__ == "__main__":
    # share=True asks Gradio to generate a public link.
    app.launch(share=True)