# Hugging Face Space: IET-DEV / HR-Bot-V1 (status: Sleeping)
# File size: 12,085 Bytes · revision 2f174fb

import os
import json
import re
import gdown
import shutil
import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from typing import TypedDict, List
from pydantic import BaseModel, Field

# Mistral & LangGraph Imports
from langchain_mistralai import ChatMistralAI 
from langgraph.graph import StateGraph, START, END

# =================================================================
# 1. SETUP & UI STYLING
# =================================================================
st.set_page_config(page_title="HR AI Agent", layout="wide", page_icon="👤")
load_dotenv()

# Resolve the Mistral key: environment variables (local / .env) take priority,
# Streamlit secrets (cloud deployments) are the fallback.
api_key = os.environ.get("MISTRAL_API_KEY")
if not api_key:
    try:
        api_key = st.secrets.get("MISTRAL_API_KEY")
    except Exception:
        # Accessing st.secrets raises when no secrets file is configured; treat
        # that as "no key" so the friendly message below is shown instead of a
        # traceback.
        api_key = None

if not api_key:
    st.error("🔑 Mistral API Key not found. Please set it in your environment variables or secrets.")
    st.stop()

# =================================================================
# 2. DATA SCHEMAS
# =================================================================
# Schema for one scored candidate. The ranker node passes this model to
# `llm.with_structured_output(...)` and stores `model_dump()` of each result.
class ScoredCandidate(BaseModel):
    # Candidate name as reported back by the model.
    name: str
    # Numeric score; the description asks the model for a value in 0-100,
    # but no range constraint is enforced here.
    score: float = Field(..., description="Objective score 0.00-100.00.")
    # Short justification; later shown in the report and fed to the chatbot.
    review: str = Field(..., description="Exactly 2 lines of review comment.")

class AgentState(TypedDict):
    """Shared state passed between the parser, ranker and reporter graph nodes."""

    # Google Drive folder URL the parser node downloads resumes from.
    gdrive_link: str
    # Job description text embedded in the ranking prompt.
    job_description: str
    # How many top-scoring candidates the ranker keeps in the shortlist.
    num_to_hire: int
    # One dict per parsed resume, produced by the parser node.
    raw_candidates: List[dict]
    # Written by the ranker node with the keys "all_evaluated_candidates"
    # and "top_n_hired_list".
    evaluated_results: dict 
    # Markdown report built by the reporter node from the shortlist.
    final_report: str

# =================================================================
# 3. HELPER FUNCTIONS
# =================================================================
def download_from_gdrive(url):
    """Download a shared Google Drive folder into a fresh local directory.

    Args:
        url: Link to a Google Drive folder shared as "Anyone with the link".

    Returns:
        The path of the local directory holding the downloaded files, or
        ``None`` when the download failed (an error is shown in the UI and
        the partially filled directory is removed).
    """
    temp_dir = "temp_resumes"

    try:
        # Start from an empty directory so resumes left over from an earlier
        # run are never picked up again.
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
        os.makedirs(temp_dir)

        # Note: GDrive folders must be "Anyone with the link"
        downloaded = gdown.download_folder(
            url, output=temp_dir, quiet=True, remaining_ok=True, use_cookies=False
        )
    except Exception as e:
        st.error(f"Error downloading from Google Drive: {e}")
        shutil.rmtree(temp_dir, ignore_errors=True)
        return None

    # gdown can signal failure by returning None instead of raising.
    if downloaded is None:
        st.error(
            "Error downloading from Google Drive: the folder could not be retrieved. "
            "Check that the link is a folder shared as 'Anyone with the link'."
        )
        shutil.rmtree(temp_dir, ignore_errors=True)
        return None

    return temp_dir

def process_pdfs_to_json(folder_path):
    """Extract structured candidate data from every PDF under ``folder_path``.

    Each PDF's text is sent to the Mistral model, the first JSON object in
    the reply is parsed, and the full resume text is attached under the
    ``"resume_text"`` key.

    Args:
        folder_path: Directory to search recursively for ``.pdf`` files.

    Returns:
        A list with one dict per successfully parsed resume. Files that have
        too little extractable text, yield no JSON, or raise while being
        processed are reported with a warning and left out.
    """
    llm = ChatMistralAI(model="mistral-large-latest", api_key=api_key, temperature=0)
    all_candidates_json = []

    # Get all PDFs, including those in subfolders created by gdown
    files = []
    for root, _dirs, filenames in os.walk(folder_path):
        for f in filenames:
            if f.lower().endswith(".pdf"):
                files.append(os.path.join(root, f))
    # os.walk order depends on the filesystem; sort so runs are repeatable.
    files.sort()

    if not files:
        st.warning("No PDF files found in the folder.")
        return []

    progress_bar = st.progress(0)
    status_text = st.empty()

    for i, path in enumerate(files, start=1):
        filename = os.path.basename(path)
        status_text.text(f"🔍 Analyzing: {filename}")
        try:
            reader = PdfReader(path)
            raw_text = "".join(page.extract_text() or "" for page in reader.pages)

            if len(raw_text.strip()) < 50:
                # Empty or scanned PDFs without OCR cannot be analysed
                st.warning(f"⚠️ Skipped {filename}: no extractable text found.")
                continue

            prompt = f"Extract details from this resume into JSON (name, email, phone, skills, experience_years):\n{raw_text[:7000]}"
            response = llm.invoke(prompt)
            json_match = re.search(r"\{.*\}", response.content, re.DOTALL)
            if json_match:
                candidate_data = json.loads(json_match.group())
                candidate_data["resume_text"] = raw_text
                all_candidates_json.append(candidate_data)
            else:
                st.warning(f"⚠️ Skipped {filename}: the model returned no JSON.")
        except Exception as e:
            # Best effort: one bad file must not abort the batch, but the
            # user should know it was dropped.
            st.warning(f"⚠️ Could not process {filename}: {e}")
        finally:
            # Runs for skipped and failed files too, so the bar reaches 100%.
            progress_bar.progress(i / len(files))

    status_text.empty()
    progress_bar.empty()
    return all_candidates_json

# =================================================================
# 4. AGENT NODES
# =================================================================
def extract_resumes_node(state: AgentState):
    """Graph node: download the resume folder and parse it into candidates.

    Args:
        state: Current graph state; only ``gdrive_link`` is read.

    Returns:
        A partial state update ``{"raw_candidates": [...]}``; the list is
        empty when the download failed.
    """
    st.write("---")
    st.info("⚡ **Phase 1:** Fetching resumes from Google Drive...")
    temp_path = download_from_gdrive(state['gdrive_link'])
    if not temp_path:
        return {"raw_candidates": []}

    try:
        candidates = process_pdfs_to_json(temp_path)
    finally:
        # Always remove the downloaded resumes, even if parsing raised.
        shutil.rmtree(temp_path, ignore_errors=True)
    return {"raw_candidates": candidates}



def rank_candidates_node(state: AgentState):
    """Graph node: score each parsed candidate against the job description.

    Every candidate is evaluated in its own LLM call (temperature 0, weighted
    rubric) using structured output, so each result has a name, a score and
    a review. A candidate whose evaluation fails is kept with a score of 0.0
    and the error text as its review.

    Args:
        state: Current graph state; reads ``raw_candidates``,
            ``job_description`` and ``num_to_hire``.

    Returns:
        A partial state update whose ``evaluated_results`` holds
        ``all_evaluated_candidates`` (in input order) and
        ``top_n_hired_list`` (the ``num_to_hire`` highest scores, best first).
    """
    # Announce the phase in the UI, matching the parser and reporter nodes;
    # the print() calls below only reach the server console.
    st.info("⚡ **Phase 2:** Scoring candidates against the job description...")

    print("\n" + "="*50)
    print("🚀 STEP 2: DETERMINISTIC SCORING ENGINE")
    print("="*50)

    # Initialize LLM with Temperature 0 for consistency
    llm = ChatMistralAI(model="mistral-large-latest", api_key=api_key, temperature=0)
    structured_llm = llm.with_structured_output(ScoredCandidate)
    
    scored_list = []

    for cand in state['raw_candidates']:
        name = cand.get('name', 'Unknown Candidate')
        print(f"🧠 Analyzing: {name}...")

        # OPTIMIZED PROMPT: Using a Point-Based Rubric
        prompt = f"""

        YOU ARE AN EXPERT RECRUITER. Evaluate the candidate against the Job Description (JD).

        

        ### JOB DESCRIPTION:

        {state['job_description']}



        ### CANDIDATE DATA:

        {json.dumps(cand)}



        ### SCORING RUBRIC (Strict 100-Point Scale):

        1. Technical Skill Match (40 pts): Compare 'skills' in candidate data to JD requirements.

        2. Experience Level (30 pts): Rate years of experience and seniority fit.

        3. Industry Fit (20 pts): Does their previous experience align with this JD's industry?

        4. Education/Certifications (10 pts): Does the candidate meet the degree requirements?



        ### RULES:

        - You must be OBJECTIVE. If a skill is not explicitly mentioned, do not award points for it.

        - Temperature is set to 0; provide the most logical mathematical score.

        - The 'review' must explain exactly why points were deducted.

        - You must not make tie between candidates.

        """

        try:
            # Mistral performs the evaluation based on the rubric above
            result = structured_llm.invoke(prompt)
            
            if result:
                scored_list.append(result.model_dump())
                print(f"✅ Scored {name}: {result.score}/100")
            else:
                scored_list.append({"name": name, "score": 0.0, "review": "Parsing error in AI output."})
        
        except Exception as e:
            # Keep the candidate in the results so the failure stays visible
            print(f"⚠️ Error scoring {name}: {e}")
            scored_list.append({"name": name, "score": 0.0, "review": f"Processing Error: {e}"})

    # SORTING: highest score first; sorted() is stable, so ties keep input order
    sorted_all = sorted(scored_list, key=lambda x: x['score'], reverse=True)
    
    # OUTPUT: Returns the updated state to the LangGraph
    return {
        "evaluated_results": {
            "all_evaluated_candidates": scored_list,
            "top_n_hired_list": sorted_all[:state['num_to_hire']]
        }
    }


def report_node(state: AgentState):
    """Graph node: render the shortlist as a Markdown report.

    Reads ``evaluated_results["top_n_hired_list"]`` from ``state`` and
    returns ``{"final_report": <markdown>}`` with one entry per shortlisted
    candidate (name, score, review).
    """
    st.info("⚡ **Phase 3:** Compiling final report...")
    shortlisted = state['evaluated_results']['top_n_hired_list']

    entries = []
    for cand in shortlisted:
        entries.append(f"🏆 **{cand['name']}** (Score: {cand['score']})\n{cand['review']}\n")

    return {"final_report": "\n".join(entries)}

# =================================================================
# 5. GRAPH ORCHESTRATION
# =================================================================
workflow = StateGraph(AgentState)

# Register every stage under the name the edges below refer to.
for node_name, node_fn in (
    ("parser", extract_resumes_node),
    ("ranker", rank_candidates_node),
    ("reporter", report_node),
):
    workflow.add_node(node_name, node_fn)

# Strictly linear pipeline: START -> parser -> ranker -> reporter -> END.
pipeline = (START, "parser", "ranker", "reporter", END)
for src, dst in zip(pipeline, pipeline[1:]):
    workflow.add_edge(src, dst)

app = workflow.compile()

# =================================================================
# 6. UI LAYOUT
# =================================================================
st.title("🌟 AI HR Agent: Google Drive Edition")

col1, col2 = st.columns([2, 1])

with col1:
    jd_input = st.text_area("📋 Job Description", placeholder="Paste the job requirements here...", height=200)

with col2:
    gdrive_link = st.text_input("🔗 Public GDrive Folder Link")
    hire_count = st.number_input("Selection Count (Top N)", min_value=1, max_value=20, value=3)
    analyze_btn = st.button("🚀 Run Analysis", type="primary", use_container_width=True)

if analyze_btn:
    # Whitespace-only input counts as missing.
    if not jd_input.strip() or not gdrive_link.strip():
        st.warning("Please provide both a Job Description and a Google Drive Link.")
    else:
        inputs = {
            "gdrive_link": gdrive_link.strip(),
            "job_description": jd_input,
            "num_to_hire": int(hire_count),
            "raw_candidates": []
        }

        final_state = None
        with st.status("AI Agent is working...", expanded=True) as status:
            try:
                final_state = app.invoke(inputs)
            except Exception as exc:
                # Mark the run as failed instead of leaving the spinner running.
                status.update(label="Analysis failed", state="error")
                st.error(f"Analysis failed: {exc}")
            else:
                status.update(label="Analysis Complete!", state="complete")

        if final_state is not None:
            st.session_state.result_state = final_state
            st.session_state.jd = jd_input
            # A new analysis invalidates the previous conversation.
            st.session_state.messages = []

# Render from session state, not only in the button branch: Streamlit reruns
# the script on every chat message and the report must survive those reruns.
# After a failed run, the last successful report (if any) stays visible.
if "result_state" in st.session_state:
    st.success("### 📋 Shortlisted Candidates")
    st.markdown(st.session_state.result_state["final_report"])

# =================================================================
# 7. CHATBOT (FIXED: ACCESS TO ALL CANDIDATES)
# =================================================================
# Follow-up chat about the last analysis. It is only shown once a result is
# stored in the session, and it sees scores and reviews for ALL candidates.
if "result_state" in st.session_state:
    st.divider()
    st.subheader("💬 Deep-Dive: Ask the HR Agent")
    
    # Initialize chat history
    if "messages" not in st.session_state: 
        st.session_state.messages = []

    # Display chat history
    for msg in st.session_state.messages:
        with st.chat_message(msg["role"]): 
            st.markdown(msg["content"])

    if prompt := st.chat_input("Ex: Why was John selected but Sarah wasn't?"):
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"): 
            st.markdown(prompt)
        
        # 1. PREPARE LEAN DATA (only name/score/status/review per candidate)
        evaluated = st.session_state.result_state['evaluated_results']
        all_evals = evaluated['all_evaluated_candidates']
        top_hired = {c['name'] for c in evaluated['top_n_hired_list']}
        
        # Build a summarized list of EVERY candidate
        knowledge_base = []
        for eval_item in all_evals:
            status = "SELECTED/TOP-TIER" if eval_item['name'] in top_hired else "DESELECTED/LOWER-RANKED"
            knowledge_base.append({
                "name": eval_item['name'],
                "score": eval_item['score'],
                "status": status,
                "reasoning": eval_item['review']
            })
        
        # 2. SYSTEM INSTRUCTIONS FOR THE AI
        chat_llm = ChatMistralAI(model="mistral-large-latest", api_key=api_key)
        
        context_message = f"""

        You are an HR Analytics Bot. You have full access to the scoring results for ALL candidates.

        

        JOB DESCRIPTION:

        {st.session_state.jd}

        

        CANDIDATE DATA (Scores and Status):

        {json.dumps(knowledge_base, indent=2)}

        

        INSTRUCTIONS:

        1. Answer questions about specific candidates using the 'reasoning' and 'score' provided.

        2. If asked why someone was deselected, compare their score/reasoning to the higher-scoring candidates.

        3. Use Markdown tables if asked to compare multiple people.

        """
        
        # System message plus the whole transcript (which already ends with
        # the new question), so follow-up questions keep their context.
        conversation = [("system", context_message)]
        conversation.extend((m["role"], m["content"]) for m in st.session_state.messages)

        with st.chat_message("assistant"):
            try:
                response = chat_llm.invoke(conversation)
            except Exception as exc:
                st.error(f"The HR Agent could not answer: {exc}")
                # Drop the unanswered question so the stored transcript keeps
                # alternating user/assistant turns.
                st.session_state.messages.pop()
            else:
                st.markdown(response.content)
                st.session_state.messages.append({"role": "assistant", "content": response.content})