{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "e02e1b00",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import cloudinary\n",
"import cloudinary.uploader\n",
"import requests\n",
"from io import BytesIO\n",
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv()\n",
"\n",
"# Explicitly configure using your 3 credentials\n",
"cloudinary.config( \n",
" cloud_name = os.getenv('CLOUDINARY_CLOUD_NAME'), \n",
" api_key = os.getenv('CLOUDINARY_API_KEY'), \n",
" api_secret = os.getenv('CLOUDINARY_API_SECRET'),\n",
" secure = True\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c11377c5",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import cloudinary\n",
"import cloudinary.uploader\n",
"from dotenv import load_dotenv\n",
"\n",
"# 1. Load credentials from your .env file\n",
"load_dotenv()\n",
"\n",
"cloudinary.config( \n",
" cloud_name = os.getenv('CLOUDINARY_CLOUD_NAME'), \n",
" api_key = os.getenv('CLOUDINARY_API_KEY'), \n",
" api_secret = os.getenv('CLOUDINARY_API_SECRET'),\n",
" secure = True\n",
")\n",
"\n",
"# 2. Set your variables\n",
"resume_path = r\"c:\\Users\\ATHARVA\\Downloads\\my codes\\python\\machine_learning\\Learning_Files\\ChirayuResume.pdf\"\n",
"thread_id = \"trial_thread_001\"\n",
"file_name = \"ChirayuResume\"\n",
"\n",
"# 3. Perform the upload\n",
"try:\n",
"    # Upload the local PDF into the per-thread folder under a fixed public_id.\n",
"    response = cloudinary.uploader.upload(\n",
"        resume_path,\n",
"        folder = f\"threads/{thread_id}\",\n",
"        public_id = file_name,\n",
"        resource_type = \"image\" # Use \"image\" for PDFs to get previews in UI\n",
"    )\n",
"\n",
"    # 4. Create the URL from the response\n",
"    pdf_url = response.get(\"secure_url\")\n",
"\n",
"    # Status messages are plain ASCII: the emoji prefixes that used to be here were\n",
"    # mis-encoded and carried a stray control character that split the string literal.\n",
"    print(\"Upload Successful!\")\n",
"    print(f\"Folder: threads/{thread_id}\")\n",
"    print(f\"URL to push: {pdf_url}\")\n",
"\n",
"except Exception as e:\n",
"    # Scratch cell: report the failure instead of propagating it.\n",
"    print(f\"Upload failed: {e}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f986ff8f",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from io import BytesIO\n",
"\n",
"def get_pdf_for_ai(url, timeout=30):\n",
"    \"\"\"Download a PDF over HTTP and return it as an in-memory file-like object.\n",
"\n",
"    Args:\n",
"        url: Address of the PDF to download.\n",
"        timeout: Seconds to wait on the server before giving up, so a stalled\n",
"            connection cannot hang the notebook indefinitely.\n",
"\n",
"    Returns:\n",
"        A BytesIO wrapping the response body, or None when the server answers\n",
"        with any status other than 200.\n",
"\n",
"    Raises:\n",
"        requests.exceptions.RequestException: On connection errors or timeouts.\n",
"    \"\"\"\n",
"    # 1. Reach out to the URL\n",
"    response = requests.get(url, timeout=timeout)\n",
"\n",
"    if response.status_code != 200:\n",
"        print(f\"Failed to fetch PDF. Status: {response.status_code}\")\n",
"        return None\n",
"\n",
"    # 2. Convert the web response into a \"file-like\" object\n",
"    pdf_stream = BytesIO(response.content)\n",
"    print(\"PDF loaded into memory for processing!\")\n",
"    return pdf_stream\n",
"\n",
"# --- USE YOUR ACTUAL URL ---\n",
"resume_url = \"https://res.cloudinary.com/dvxnazx8e/image/upload/v1774166452/threads/trial_thread_001/ChirayuResume.pdf\"\n",
"pdf_data = get_pdf_for_ai(resume_url)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "938186bf",
"metadata": {},
"outputs": [],
"source": [
"import cloudinary\n",
"from cloudinary import Search\n",
"\n",
"\n",
"def get_resume_url(thread_id: str) -> str:\n",
"    \"\"\"Return the secure URL of the resume stored for a thread on Cloudinary.\n",
"\n",
"    Runs a Cloudinary search restricted to the thread's folder expression,\n",
"    sorted by public_id descending and capped at one result.\n",
"\n",
"    Args:\n",
"        thread_id: Identifier interpolated into the folder expression.\n",
"\n",
"    Returns:\n",
"        The 'secure_url' of the first matching resource.\n",
"\n",
"    Raises:\n",
"        FileNotFoundError: If the search returns no resources.\n",
"    \"\"\"\n",
"    query = (\n",
"        Search()\n",
"        .expression(f'folder:\"threads/{thread_id}/*\"')\n",
"        .sort_by('public_id', 'desc')\n",
"        .max_results(1)\n",
"    )\n",
"    matches = query.execute().get(\"resources\", [])\n",
"\n",
"    if matches:\n",
"        secure_url = matches[0][\"secure_url\"]\n",
"        print(f\"Found resume: {secure_url}\")\n",
"        return secure_url\n",
"\n",
"    raise FileNotFoundError(f\"No resume found for thread_id: {thread_id}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f4340cbb",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import tempfile\n",
"import os\n",
"from langchain_community.document_loaders import PyMuPDFLoader\n",
"\n",
"thread_id = \"trial_thread_001\"\n",
"\n",
"# Tracked outside the try so the finally clause can delete the temp file on every path.\n",
"tmp_path = None\n",
"\n",
"try:\n",
"    # Step 1 - Get URL from Cloudinary\n",
"    url = get_resume_url(thread_id)\n",
"    print(f\"URL: {url}\")\n",
"\n",
"    # Step 2 - Fetch PDF bytes (bounded wait so a stalled connection cannot hang the cell)\n",
"    response = requests.get(url, timeout=30)\n",
"    response.raise_for_status()\n",
"\n",
"    # Step 3 - Write to temp file. delete=False keeps the file on disk after the\n",
"    # handle is closed so the loader can reopen it by path.\n",
"    with tempfile.NamedTemporaryFile(delete=False, suffix=\".pdf\") as tmp:\n",
"        tmp_path = tmp.name\n",
"        tmp.write(response.content)\n",
"\n",
"    # Step 4 - Load with PyMuPDF\n",
"    loader = PyMuPDFLoader(tmp_path)\n",
"    docs = loader.load()\n",
"    resume_text = \"\\n\".join([doc.page_content for doc in docs])\n",
"\n",
"    print(f\"Pages loaded: {len(docs)}\")\n",
"    print(f\"Preview:\\n{resume_text[:500]}\")\n",
"\n",
"except FileNotFoundError as e:\n",
"    print(f\"Not found: {e}\")\n",
"except Exception as e:\n",
"    print(f\"Error: {e}\")\n",
"finally:\n",
"    # Step 5 - Cleanup, also when the download or the PDF parsing failed\n",
"    if tmp_path is not None and os.path.exists(tmp_path):\n",
"        os.remove(tmp_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b010e49b",
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders import PyMuPDFLoader"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7164448e",
"metadata": {},
"outputs": [],
"source": [
"resumepath=r\"c:\\Users\\ATHARVA\\Downloads\\my codes\\python\\machine_learning\\Learning_Files\\ChirayuResume.pdf\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6d1029c0",
"metadata": {},
"outputs": [],
"source": [
"import cloudinary.uploader\n",
"import os\n",
"\n",
"# Your resume path (using 'r' for raw string to handle backslashes correctly)\n",
"resume_path = r\"c:\\Users\\ATHARVA\\Downloads\\my codes\\python\\machine_learning\\Learning_Files\\ChirayuResume.pdf\"\n",
"\n",
"# Extract filename without extension for the public_id.\n",
"# splitext removes only the final extension, so a dotted name such as\n",
"# \"resume.v2.pdf\" keeps its full stem instead of being cut at the first dot.\n",
"file_name = os.path.splitext(os.path.basename(resume_path))[0]\n",
"thread_id = \"trial_thread_001\"\n",
"\n",
"# Upload directly using the file path\n",
"upload_result = cloudinary.uploader.upload(\n",
"    resume_path,\n",
"    folder=f\"threads/{thread_id}\",\n",
"    public_id=file_name,\n",
"    resource_type=\"auto\" # Handles the PDF correctly\n",
")\n",
"\n",
"print(f\"Upload Successful! URL: {upload_result['secure_url']}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "928b7237",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import requests\n",
"import cloudinary\n",
"from io import BytesIO\n",
"\n",
"# --- 1. CONFIGURATION (Do this once) ---\n",
"# CLOUDINARY_URL tells the library your API Key/Secret. It must come from the\n",
"# environment (e.g. your .env file) and must never be written into the notebook.\n",
"# SECURITY: a literal cloudinary:// URL containing a real API secret used to be\n",
"# assigned here. That secret has been exposed and should be rotated.\n",
"if not os.getenv(\"CLOUDINARY_URL\"):\n",
"    print(\"Warning: CLOUDINARY_URL is not set; define it in your environment or .env file.\")\n",
"\n",
"def get_pdf_content(file_url, timeout=30):\n",
"    \"\"\"\n",
"    This function expects an HTTPS url, NOT the cloudinary:// credentials.\n",
"\n",
"    Args:\n",
"        file_url: Web address of the file; must begin with http:// or https://.\n",
"        timeout: Seconds to wait on the server before giving up.\n",
"\n",
"    Returns:\n",
"        A BytesIO wrapping the downloaded bytes, or None when the server\n",
"        answers with any status other than 200.\n",
"\n",
"    Raises:\n",
"        ValueError: If file_url does not start with an http(s) scheme.\n",
"    \"\"\"\n",
"    # Ensure the URL is a real web link. The full scheme prefix is checked so that\n",
"    # strings which merely begin with the letters \"http\" are rejected as well.\n",
"    if not file_url.startswith((\"http://\", \"https://\")):\n",
"        raise ValueError(\"The URL must start with http or https!\")\n",
"\n",
"    response = requests.get(file_url, timeout=timeout)\n",
"    if response.status_code != 200:\n",
"        print(f\"Error: Could not download file. Status: {response.status_code}\")\n",
"        return None\n",
"    return BytesIO(response.content)\n",
"\n",
"# --- 2. TESTING ---\n",
"# This is what the MERN devs will send you:\n",
"test_resume_url = \"https://res.cloudinary.com\"\n",
"\n",
"# This will now work!\n",
"pdf_file = get_pdf_content(test_resume_url)\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "b75a238a",
"metadata": {},
"outputs": [],
"source": [
"from typing import Any, Dict, List, Optional, Tuple,TypedDict,Literal\n",
"from typing import Annotated, Sequence\n",
"import os\n",
"from pydantic import BaseModel, Field\n",
"from langchain_groq import ChatGroq\n",
"from langchain_core.messages import SystemMessage, HumanMessage,ToolMessage,AIMessage\n",
"from langchain_core.tools import Tool\n",
"from langgraph.graph import StateGraph,END,START\n",
"from langgraph.types import interrupt \n",
"from langchain_core.prompts import ChatPromptTemplate,MessagesPlaceholder\n",
"from langchain_community.document_loaders import PyMuPDFLoader\n",
"import json\n",
"from pydantic import BaseModel, Field\n",
"from typing import List, Optional\n",
"from pprint import pprint\n",
"import os\n",
"from dotenv import load_dotenv\n",
"import json\n",
"from langchain_core.documents import Document\n",
"from langchain_huggingface import HuggingFaceEmbeddings\n",
"import os\n",
"from pinecone import Pinecone, ServerlessSpec\n",
"from pinecone_text.sparse import BM25Encoder\n",
"from langchain_community.embeddings import HuggingFaceEmbeddings\n",
"from langchain_community.retrievers import PineconeHybridSearchRetriever\n",
"import json\n",
"from langchain_core.documents import Document\n",
"from langchain_core.messages import BaseMessage\n",
"from langgraph.graph import add_messages\n",
"from langgraph.prebuilt import ToolNode ,tools_condition\n",
"import torch\n",
"from langgraph.checkpoint.memory import MemorySaver\n"
]
},
{
"cell_type": "markdown",
"id": "c7058b37",
"metadata": {},
"source": [
"Pydantic model of resume data extraction"
]
},
{
"cell_type": "markdown",
"id": "69094b87",
"metadata": {},
"source": [
"**Defining the pydantic models to be used**"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "7da5b1c6",
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"\n",
"class Skill(BaseModel):\n",
" name: str = Field(..., description=\"Skill name e.g. Python, Docker\")\n",
" category: Optional[str] = Field(\n",
" None, description=\"Category: Backend | ML | DevOps | Frontend | Other\"\n",
" )\n",
"\n",
"\n",
"class ExperienceItem(BaseModel):\n",
" job_title: str = Field(\n",
" ...,\n",
" description=\"Role title of the candidate. Example: 'Backend Intern', 'Software Engineer'\"\n",
" )\n",
"\n",
" experience_type: Optional[Literal['internship', 'full_time', 'contract', 'freelance']] = Field(\n",
" None,\n",
" description=\"Type of experience: internship, full_time, contract, or freelance\"\n",
" )\n",
"\n",
"\n",
"\n",
" technologies: Optional[List[str]] = Field(\n",
" default_factory=list,\n",
" description=\"Technologies, tools, or frameworks used in this role\"\n",
" )\n",
"\n",
" responsibilities: Optional[List[str]] = Field(\n",
" default_factory=list,\n",
" description=\"Key responsibilities, tasks, or learnings in concise bullet points keep it summarised detail *not* required\"\n",
" )\n",
"\n",
"class ProjectItem(BaseModel):\n",
" name: str = Field(..., description=\"Project name\")\n",
" technologies: List[str] = Field(\n",
" default_factory=list,\n",
" description=\"Technologies used in this project hence learned during the project.\"\n",
" )\n",
" \n",
"\n",
"\n",
"class CertificationItem(BaseModel):\n",
" name: str = Field(..., description=\"Certification name\")\n",
" \n",
" topics_covered: List[str] = Field(\n",
" default_factory=list,\n",
" description=\"Key topics or skills the certification covers\"\n",
" )\n",
"\n",
"\n",
"\n",
"class ResumeExtract(BaseModel):\n",
"    # Structured-output schema used by the resume extraction agent. The field\n",
"    # descriptions travel with the schema, so they double as extraction instructions.\n",
"    # No class docstring on purpose: it would be added to the generated schema text.\n",
"\n",
"    # Declared without a default value.\n",
"    candidate_name:Optional[str]\n",
"\n",
"    job_title: Optional[str] = Field(\n",
"        None,\n",
"        description=(\n",
"            \"Primary job title or role of the candidate. \"\n",
"            \"Examples: 'AI Engineer', 'Data Scientist', \"\n",
"            \"'Construction Project Manager', 'Healthcare Representative'. \"\n",
"            \"Should reflect the most recent or current role.\"\n",
"        )\n",
"    )\n",
"\n",
"    skills: List[Skill] = Field(\n",
"        default_factory=list,\n",
"        description=\"Skills explicitly listed by the candidate\"\n",
"    )\n",
"    experience: List[ExperienceItem] = Field(\n",
"        default_factory=list,\n",
"        description=(\n",
"            \"Each role as a separate entry. \"\n",
"            \"No company name needed - focus on what was done and learned.\"\n",
"        )\n",
"    )\n",
"    projects: List[ProjectItem] = Field(\n",
"        default_factory=list,\n",
"        description=\"Projects with technologies used and what was built\"\n",
"    )\n",
"    certifications: Optional[List[CertificationItem]] = Field(\n",
"        None,\n",
"        description=\"Certifications with topics they cover. None if not present.\"\n",
"    )\n",
"\n",
"    # The description previously ended mid-sentence (\"hence is_.\"); it now states the\n",
"    # rule in full so the model receives a complete instruction.\n",
"    is_fresher: bool = Field(\n",
"        ...,\n",
"        description=(\n",
"            \"Set to True if the candidate lacks full-time professional employment. \"\n",
"            \"Academic projects, certifications, and internships are considered \"\n",
"            \"part of the learning phase and do not qualify a candidate as 'non-fresher', \"\n",
"            \"hence is_fresher stays True for such candidates.\"\n",
"        )\n",
"    )\n"
]
},
{
"cell_type": "markdown",
"id": "99ac1086",
"metadata": {},
"source": [
" \"skills\": {\"__all__\": {\"category\"}}, # Drops 'category' from every skill\n",
" \"experience\": {\"__all__\": {\"responsibilities\"}}, # Drops bullet points\n",
" \"projects\": {\"__all__\": {\"what_was_built\"}}, # Drops project descriptions\n",
" \"certifications\": {\"__all__\": {\"issuer\"}} # Drops the issuer"
]
},
{
"cell_type": "markdown",
"id": "5b0756e0",
"metadata": {},
"source": [
"Pydantic model for job description"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "4b2441cd",
"metadata": {},
"outputs": [],
"source": [
"from pydantic import BaseModel, Field\n",
"from typing import List, Optional\n",
"\n",
"\n",
"class SkillRequirement(BaseModel):\n",
" name: str = Field(\n",
" ...,\n",
" description=\"Skill or technology required for the job (e.g., Python, SQL, React)\"\n",
" )\n",
" level: Optional[str] = Field(\n",
" None,\n",
" description=\"Expected proficiency level: beginner | intermediate | strong\"\n",
" )\n",
"\n",
"\n",
"class ResponsibilityItem(BaseModel):\n",
" description: str = Field(\n",
" ...,\n",
" description=\"Key responsibility or task expected from the candidate\"\n",
" )\n",
"\n",
"\n",
"class RequirementItem(BaseModel):\n",
" description: str = Field(\n",
" ...,\n",
" description=\"Qualification or requirement such as education, availability, etc.\"\n",
" )\n",
"\n",
"\n",
"class ConstraintItem(BaseModel):\n",
" type: str = Field(\n",
" ...,\n",
" description=\"Constraint type such as location, duration, eligibility\"\n",
" )\n",
" value: str = Field(\n",
" ...,\n",
" description=\"Constraint value (e.g., 'Pune only', '6 months', 'Fresher')\"\n",
" )\n",
"\n",
"\n",
"\n",
"class JobDescriptionExtract(BaseModel):\n",
" job_title: Optional[str] = Field(\n",
" None,\n",
" description=\"Job role/title (e.g., AI/ML Intern, Web Developer)\"\n",
" )\n",
"\n",
" company_name: Optional[str] = Field(\n",
" None,\n",
" description=\"Company offering the job\"\n",
" )\n",
"\n",
" location: Optional[str] = Field(\n",
" None,\n",
" description=\"Job location if specified\"\n",
" )\n",
"\n",
" employment_type: Optional[str] = Field(\n",
" None,\n",
" description=\"Type of job: internship, full-time, contract\"\n",
" )\n",
"\n",
" duration_months: Optional[int] = Field(\n",
" None,\n",
" description=\"Duration of role in months (for internships/contracts)\"\n",
" )\n",
"\n",
" is_fresher_allowed: Optional[bool] = Field(\n",
" None,\n",
" description=\"Whether freshers are eligible for this role\"\n",
" )\n",
"\n",
" skills_required: Optional[List[SkillRequirement]] = Field(\n",
" None,\n",
" description=\"List of required skills and expected levels\"\n",
" )\n",
"\n",
" tools_technologies: Optional[List[str]] = Field(\n",
" None,\n",
" description=\"Specific tools/frameworks mentioned (e.g., Pandas, WordPress)\"\n",
" )\n",
"\n",
" responsibilities: Optional[List[ResponsibilityItem]] = Field(\n",
" None,\n",
" description=\"Key job responsibilities\"\n",
" )\n",
"\n",
" requirements: Optional[List[RequirementItem]] = Field(\n",
" None,\n",
" description=\"General requirements like availability, qualifications\"\n",
" )\n",
"\n",
" constraints: Optional[List[ConstraintItem]] = Field(\n",
" None,\n",
" description=\"Special constraints like location restriction, duration, etc.\"\n",
" )"
]
},
{
"cell_type": "markdown",
"id": "4b12a3bc",
"metadata": {},
"source": [
"**Pydantic model for skill gap analysis**"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "4f1341e0",
"metadata": {},
"outputs": [],
"source": [
"class SkillGap(BaseModel):\n",
" skill_name: str = Field(\n",
" ..., \n",
" description=\"The specific technology or tool missing or requiring an upgrade (e.g., 'PostgreSQL')\"\n",
" )\n",
" \n",
" gap_type: Literal[\"missing_foundation\", \"needs_advanced_upgrade\"] = Field(\n",
" ...,\n",
" description=(\n",
" \"missing_foundation: Candidate has no recorded experience in this core requirement. \"\n",
" \"needs_advanced_upgrade: Candidate knows the basics but needs role-specific advanced training.\"\n",
" )\n",
" )\n",
" \n",
" priority: Literal[\"high\", \"medium\", \"low\"] = Field(\n",
" ...,\n",
" description=\"How critical this skill is for the target job role.\"\n",
" )\n",
" \n",
" reasoning: str = Field(\n",
" ...,\n",
" description=(\n",
" \"The 'Reasoning Trace'. This MUST be provided for every skill gap identified. \"\n",
" \"Explain exactly WHY this gap was flagged based on the resume vs JD comparison. \"\n",
" \"Example: 'JD requires FastAPI; candidate has Python experience but no record of using FastAPI framework.'\"\n",
" )\n",
" )\n",
" \n",
" target_competency: str = Field(\n",
" ...,\n",
" description=\"The specific outcome the candidate needs to reach (e.g., 'Build asynchronous database endpoints')\"\n",
" )\n",
"\n",
"class SkillGapAnalysis(BaseModel):\n",
" job_title: str = Field(..., description=\"The target role from the JD\")\n",
" candidate_name: Optional[str] = Field(None, description=\"Extracted name from resume\")\n",
" \n",
" analyzed_gaps: List[SkillGap] = Field(\n",
" default_factory=list,\n",
" description=\"List of specific technical gaps found between Resume and JD\"\n",
" )\n",
" \n",
" is_fresher_adaptation_needed: bool = Field(\n",
" default=False,\n",
" description=\"True if foundational corporate/soft-skill modules should be added to the path.\"\n",
" )\n",
" \n",
" executive_summary: str = Field(\n",
" ...,\n",
" description=\"A 2-3 sentence overview of the candidate's readiness and the primary focus of the onboarding.\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "18663bb3",
"metadata": {},
"outputs": [],
"source": [
"class RoadmapStep(BaseModel):\n",
" course_id: str\n",
" title: str\n",
" reasoning: str = Field(..., description=\"Why this specific course was chosen for this user\")\n",
" is_foundation: bool\n",
" sequence_order: int = Field(..., description=\"The order in which the course should be taken\")\n",
"\n",
"class LearningRoadmap(BaseModel):\n",
" candidate_name: str\n",
" target_role: str\n",
" roadmap: List[RoadmapStep]\n",
" onboarding_summary: str"
]
},
{
"cell_type": "markdown",
"id": "604e9728",
"metadata": {},
"source": [
"**Defining the agents to be used**"
]
},
{
"cell_type": "markdown",
"id": "9036d57e",
"metadata": {},
"source": [
"Resume data extraction agent"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "14dab004",
"metadata": {},
"outputs": [],
"source": [
"resume_agent=ChatGroq(\n",
" model=\"moonshotai/kimi-k2-instruct-0905\",\n",
" temperature=0.2,\n",
")\n",
"\n",
"\n",
"resume_agent=resume_agent.with_structured_output(\n",
"\n",
" schema=ResumeExtract,\n",
" method=\"json_schema\",\n",
" include_raw=True,\n",
" strict=True\n",
")\n"
]
},
{
"cell_type": "markdown",
"id": "7683eb69",
"metadata": {},
"source": [
"Job description data extraction agent"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "472dae2f",
"metadata": {},
"outputs": [],
"source": [
"jd_agent=ChatGroq(\n",
" model=\"meta-llama/llama-4-scout-17b-16e-instruct\",\n",
" temperature=0.2,\n",
")\n",
"\n",
"\n",
"jd_agent=jd_agent.with_structured_output(\n",
"\n",
" schema=JobDescriptionExtract,\n",
" method=\"json_schema\",\n",
" include_raw=True,\n",
" strict=True\n",
")\n"
]
},
{
"cell_type": "markdown",
"id": "d14736d5",
"metadata": {},
"source": [
"defining the gap analysis agent"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "0d5e3b17",
"metadata": {},
"outputs": [],
"source": [
"gap_analysis_agent=ChatGroq(\n",
" model=\"openai/gpt-oss-120b\",\n",
" temperature=0.2,\n",
")\n",
"\n",
"\n",
"gap_analysis_agent=gap_analysis_agent.with_structured_output(\n",
" schema=SkillGapAnalysis,\n",
" method=\"json_schema\",\n",
" include_raw=True,\n",
" strict=True\n",
")\n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"id": "28bc58ad",
"metadata": {},
"source": [
"defining the roadmap planner agent"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "0ccc026b",
"metadata": {},
"outputs": [],
"source": [
"roadmap_planner_agent=ChatGroq(\n",
" model=\"moonshotai/kimi-k2-instruct-0905\",\n",
" temperature=0.2,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "2bd41131",
"metadata": {},
"source": [
"**Tools**"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "c8827093",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index ready: {'_response_info': {'raw_headers': {'connection': 'keep-alive',\n",
" 'content-length': '187',\n",
" 'content-type': 'application/json',\n",
" 'date': 'Mon, 23 Mar 2026 20:11:40 GMT',\n",
" 'grpc-status': '0',\n",
" 'server': 'envoy',\n",
" 'x-envoy-upstream-service-time': '62',\n",
" 'x-pinecone-request-latency-ms': '61',\n",
" 'x-pinecone-response-duration-ms': '64'}},\n",
" 'dimension': 384,\n",
" 'index_fullness': 0.0,\n",
" 'memoryFullness': 0.0,\n",
" 'metric': 'dotproduct',\n",
" 'namespaces': {'__default__': {'vector_count': 47}},\n",
" 'storageFullness': 0.0,\n",
" 'total_vector_count': 47,\n",
" 'vector_type': 'dense'}\n"
]
}
],
"source": [
"\n",
"\n",
"PINECONE_API_KEY = os.getenv(\"PINECONE_API_KEY\")\n",
"pc = Pinecone(api_key=PINECONE_API_KEY)\n",
"\n",
"index_name = \"final-catalog-index\"\n",
"\n",
"\n",
"# Create index if not exists\n",
"if index_name not in pc.list_indexes().names():\n",
" pc.create_index(\n",
" name=index_name,\n",
" dimension=384,\n",
" metric=\"dotproduct\",\n",
" spec=ServerlessSpec(\n",
" cloud=\"aws\",\n",
" region=\"us-east-1\"\n",
" )\n",
" )\n",
" print(\"Index created.\")\n",
"\n",
"index = pc.Index(index_name)\n",
"print(\"Index ready:\", index.describe_index_stats())\n"
]
},
{
"cell_type": "markdown",
"id": "44180d94",
"metadata": {},
"source": [
"Opening the docs for BM25 retriever"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "7561b3a1",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"from langchain_core.documents import Document\n",
"\n",
"\n",
"doc_path=r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\app\\utils\\langchain_formatted.json\"\n",
"\n",
"\n",
"# Load the transformed catalog. The file is decoded as UTF-8 explicitly so the\n",
"# result does not depend on the platform's locale encoding.\n",
"with open(doc_path, \"r\", encoding=\"utf-8\") as f:\n",
"    data = json.load(f)\n",
"\n",
"# Create a LangChain Document object for each entry\n",
"documents = [\n",
"    Document(page_content=doc[\"page_content\"], metadata=doc[\"metadata\"])\n",
"    for doc in data\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "f0845a99",
"metadata": {},
"outputs": [],
"source": [
"device=torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "c8e6d2a5",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\ATHARVA\\AppData\\Local\\Temp\\ipykernel_30068\\2526755923.py:1: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the `langchain-huggingface package and should be used instead. To use it run `pip install -U `langchain-huggingface` and import as `from `langchain_huggingface import HuggingFaceEmbeddings``.\n",
" embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\", model_kwargs={\"device\": device})\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3af10b64d4584d53952822157482186f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading weights: 0%| | 0/103 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2\n",
"Key | Status | | \n",
"------------------------+------------+--+-\n",
"embeddings.position_ids | UNEXPECTED | | \n",
"\n",
"Notes:\n",
"- UNEXPECTED\t:can be ignored when loading from different task/architecture; not ok if you expect identical arch.\n"
]
}
],
"source": [
"embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\", model_kwargs={\"device\": device})"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6bc7292f",
"metadata": {},
"outputs": [
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mnotebook controller is DISPOSED. \n",
"\u001b[1;31mView Jupyter log for further details."
]
}
],
"source": [
"bm25_encoder = BM25Encoder()\n",
"\n",
"bm25_encoder.fit([doc.page_content for doc in documents])\n",
"\n",
"retriever = PineconeHybridSearchRetriever(\n",
" embeddings=embeddings,\n",
" sparse_encoder=bm25_encoder,\n",
" index=index,\n",
" alpha=0.5\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "03c755a1",
"metadata": {},
"outputs": [
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mnotebook controller is DISPOSED. \n",
"\u001b[1;31mView Jupyter log for further details."
]
}
],
"source": [
"from langchain_core.tools import tool\n",
"from typing import Optional\n",
"\n",
"@tool\n",
"def search_courses(query: str):\n",
" \"\"\"\n",
" Search the course catalog for relevant modules based on a skill query \n",
" \n",
" \n",
" Args:\n",
" \n",
" query:the skill to find with semantic terms (e.g., 'FastAPI', 'PostgreSQL', 'Docker','Enterprise VMS Strategy','Utilization Management').\n",
" \n",
" \"\"\"\n",
" \n",
" results = retriever.invoke(\n",
" query\n",
" )\n",
"\n",
" if not results:\n",
" return f\"No courses found for '{query}'.\"\n",
"\n",
" formatted_output = []\n",
" for doc in results:\n",
" course_id = doc.metadata.get('course_id', 'N/A')\n",
" \n",
" # We include the ID for roadmap generation, followed by the full context\n",
" # created during the transformation stage (Title, Desc, Outcomes, Prereqs).\n",
" course_block = (\n",
" f\"ID: {course_id}\\n\"\n",
" f\"{doc.page_content}\\n\"\n",
" \"---\"\n",
" )\n",
" formatted_output.append(course_block)\n",
"\n",
" return \"\\n\".join(formatted_output)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "9db28710",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"from typing import Optional, Dict, Any\n",
"from langchain_core.tools import tool\n",
"\n",
"class CourseLookup:\n",
"    \"\"\"In-memory index of the course catalog, keyed by course_id.\n",
"\n",
"    The catalog file is read once at construction; later lookups are dict reads.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, catalog_path: str = \"course_catalog.json\"):\n",
"        \"\"\"Store the catalog location and load it immediately.\n",
"\n",
"        Args:\n",
"            catalog_path: Path to a JSON file containing an iterable of course\n",
"                objects, each carrying a 'course_id' key.\n",
"        \"\"\"\n",
"        self.catalog_path = catalog_path\n",
"        self.courses_map = {}\n",
"        self._load_catalog()\n",
"\n",
"    def _load_catalog(self):\n",
"        \"\"\"Loads the catalog into a dictionary for O(1) lookup speed.\n",
"\n",
"        A missing or undecodable file is reported on stdout and leaves\n",
"        courses_map empty instead of raising.\n",
"        \"\"\"\n",
"        try:\n",
"            # Decode as UTF-8 explicitly so the result does not depend on the\n",
"            # platform's locale encoding.\n",
"            with open(self.catalog_path, 'r', encoding='utf-8') as f:\n",
"                catalog = json.load(f)\n",
"            # Key the dictionary by course_id for instant retrieval\n",
"            self.courses_map = {course['course_id']: course for course in catalog}\n",
"        except FileNotFoundError:\n",
"            print(f\"Error: {self.catalog_path} not found.\")\n",
"        except (json.JSONDecodeError, UnicodeDecodeError):\n",
"            # Bytes that are not valid UTF-8 are reported like any other decode failure.\n",
"            print(f\"Error: Failed to decode {self.catalog_path}.\")\n",
"\n",
"    def get_course_details(self, course_id: str) -> Optional[Dict[str, Any]]:\n",
"        \"\"\"Retrieves full details of a course by its ID, or None if the ID is unknown.\"\"\"\n",
"        return self.courses_map.get(course_id)\n",
"\n",
"\n",
"lookup_service = CourseLookup(r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\app\\tools\\Catalog.json\")\n",
"\n",
"@tool\n",
"def get_course_by_id(course_id: str) -> str:\n",
" \"\"\"\n",
" Retrieves full details for a specific course using its unique course_id.\n",
" Use this tool when you find a prerequisite ID in another course and \n",
" need to fetch its title, description, and duration to add to the roadmap.\n",
" \"\"\"\n",
" details = lookup_service.get_course_details(course_id)\n",
" if not details:\n",
" return f\"Error: Course with ID {course_id} not found in catalog.\"\n",
" \n",
" # Return a clean string for the agent to process\n",
" return json.dumps(details, indent=2)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "09d238ab",
"metadata": {},
"outputs": [],
"source": [
"\n",
"@tool(args_schema=LearningRoadmap)\n",
"def submit_final_roadmap(candidate_name, target_role, roadmap, onboarding_summary):\n",
" \"\"\"\n",
" STRICTLY call this tool to submit the final structured learning roadmap.\n",
" This saves the data to the global system and the graph state.\n",
" \"\"\"\n",
" \n",
" \n",
" # Construct the structured JSON\n",
" result = {\n",
" \"candidate_name\": candidate_name,\n",
" \"target_role\": target_role,\n",
" \"onboarding_summary\": onboarding_summary,\n",
" \"roadmap\": [\n",
" step.model_dump() if hasattr(step, \"model_dump\") else step \n",
" for step in roadmap\n",
" ]\n",
" }\n",
" \n",
" \n",
" \n",
" # Return to LangGraph (will be stored in state via a post-processing node)\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6ad04bc6",
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"@tool\n",
"def submit_mermaid_visualization(mermaid_code: str):\n",
"    \"\"\"\n",
"    STRICTLY call this tool to save the Mermaid.js visualization of the roadmap.\n",
"    \"\"\"\n",
"    # 1. Declare the module-level name. Without this declaration the assignment\n",
"    #    below only binds a function-local variable that is discarded on return,\n",
"    #    so nothing is actually saved.\n",
"    global mermaid_roadmap_code\n",
"\n",
"    # 2. Now this assignment updates the global variable\n",
"    mermaid_roadmap_code = mermaid_code\n",
"\n",
"    return \"Mermaid visualization saved successfully.\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "285f74bb",
"metadata": {},
"outputs": [],
"source": [
"roadmap_planner_agent_tools=[search_courses, get_course_by_id,submit_final_roadmap,submit_mermaid_visualization]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "47564782",
"metadata": {},
"outputs": [],
"source": [
"roadmap_planner_agent=roadmap_planner_agent.bind_tools(roadmap_planner_agent_tools)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c311f642",
"metadata": {},
"outputs": [],
"source": [
"# Replace 'roadmap_planner_agent' with your bound model variable\n",
"print(roadmap_planner_agent.kwargs.get(\"tools\"))\n"
]
},
{
"cell_type": "markdown",
"id": "2da3f43b",
"metadata": {},
"source": [
"**Trial resume path**"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7cfbfc3a",
"metadata": {},
"outputs": [],
"source": [
"resumepath=r\"c:\\Users\\ATHARVA\\Downloads\\my codes\\python\\machine_learning\\Learning_Files\\ChirayuResume.pdf\""
]
},
{
"cell_type": "markdown",
"id": "14f4946c",
"metadata": {},
"source": [
"**Langgraph agent state**"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5deda2bb",
"metadata": {},
"outputs": [],
"source": [
"class OnboardingState(TypedDict):\n",
" candidate_name: Optional[str]\n",
" resume_text: str \n",
" file_path: str \n",
" job_description: str \n",
" messages: Annotated[Sequence[BaseMessage], add_messages]\n",
" \n",
" # Analysis & Extraction Data\n",
" skill_gap_analysis_data: Optional[SkillGapAnalysis]\n",
" resume_data: Optional[ResumeExtract] \n",
" extraction_error: Optional[str] \n",
" JobDescriptionExtract_data: Optional[JobDescriptionExtract]\n",
" \n",
" # --- NEW KEYS FOR OUTPUT ---\n",
" mermaid_code: Optional[str] # Stores the Mermaid visualization string\n",
" final_roadmap: Optional[Dict] # Stores the final structured JSON roadmap"
]
},
{
"cell_type": "markdown",
"id": "e54bac6a",
"metadata": {},
"source": [
"**Prompts**"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8df9934",
"metadata": {},
"outputs": [],
"source": [
"from langchain_groq import ChatGroq\n",
"from langchain_core.prompts import ChatPromptTemplate\n",
"\n",
"\n",
"# System prompt for the resume extraction agent. The field names below mirror the\n",
"# ResumeExtract schema (candidate_name, job_title, skills, experience, projects,\n",
"# certifications, is_fresher), so the prompt never asks for keys the schema lacks.\n",
"resume_agent_prompt = \"\"\"\n",
"\n",
"You are a precise resume parser. Your only job is to extract structured information from a raw resume text.\n",
"\n",
"\n",
"\n",
"- Extract ONLY what is explicitly present in the resume. Do NOT infer or hallucinate missing fields.\n",
"- candidate_name: the candidate's name as written on the resume. Set null if it is not present.\n",
"- job_title: the job title stated at the top of the resume or most recent role. If the candidate is a student with no job, set it to \"Student\".\n",
"- is_fresher: set True if the candidate has no full-time professional employment. Having internships, projects or certifications does NOT make someone non-fresher.\n",
"- skills: extract from the explicit skills section only. Do NOT pull skills from project descriptions here.\n",
"- experience: each role is a SEPARATE entry. Ignore company name. Focus on job_title, experience_type, technologies used, and what they did or learned.\n",
"- projects: extract each project separately. Capture the project name and the technologies used.\n",
"- certifications: extract ONLY if present. Set null if none found. Include topics the certification covers.\n",
"\n",
"\n",
"\n",
"\n",
"Return a single valid JSON object matching the schema. No extra text, no markdown, no explanation.\n",
"\n",
"\n",
"\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "608efafd",
"metadata": {},
"outputs": [],
"source": [
"jd_agent_prompt =\"\"\" \n",
"\n",
"You are a precise job description parser.\n",
"Extract structured information from the given job description.\n",
"\n",
"\n",
"\n",
"- Extract ONLY explicitly mentioned information. Do NOT infer or hallucinate.\n",
"\n",
"- Follow the provided schema strictly.\n",
"\n",
"- If a field is not present, return null (not empty list unless schema default applies).\n",
"\n",
"- Keep skills atomic (e.g., Python, SQL, React).\n",
"\n",
"- Do NOT mix fields:\n",
" - skills = only required skills\n",
" - responsibilities = what the candidate will do\n",
" - constraints = restrictions like location, duration, eligibility\n",
"\n",
"- Convert durations like \"6 months\" into integer months.\n",
"\n",
"- is_fresher_allowed:\n",
" - True only if explicitly allowed\n",
" - False only if explicitly restricted\n",
" \n",
"\n",
"\n",
"\n",
"Return a valid JSON object only.\n",
" \"\"\"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a6c1483",
"metadata": {},
"outputs": [],
"source": [
"gap_analysis_agent_prompt=\"\"\"\n",
"\n",
"You are an expert technical assessor and the core intelligence of an AI-driven, adaptive onboarding engine. \n",
"Your objective is to parse a new hire's current capabilities against a target job description and identify precise skill gaps to reach role-specific competency.\n",
"\n",
"\n",
"\n",
"Current corporate onboarding utilizes static, \"one-size-fits-all\" curricula, resulting in significant inefficiencies. \n",
"Your ultimate goal is to solve this: you must ensure experienced hires do NOT waste time on known concepts, while ensuring beginners are NOT overwhelmed by advanced modules.\n",
"\n",
"\n",
"\n",
"- Cross-reference the JD's `skills_required` and `tools_technologies` against the candidate's `skills_list`, `experience.technologies`, and `projects.technologies`.\n",
"- Identify Hard Gaps: Technologies explicitly required by the JD that are completely absent from the candidate's profile.\n",
"- Apply Adaptive Logic (Proficiency Gaps):\n",
" - For Experienced Hires: If they possess the skill, DO NOT flag it for basic training. Only flag a gap if they need an advanced, role-specific upgrade based on low duration of use.\n",
" - For Beginners/Freshers: Flag foundational gaps and prerequisites heavily to ensure they are prepared before tackling complex JD requirements.\n",
"- Keep skills atomic and highly specific (e.g., output \"FastAPI\" or \"PostgreSQL\", do NOT output vague terms like \"Backend Frameworks\").\n",
"- Do NOT hallucinate requirements that are not explicitly stated in the JD data.\n",
"- Do NOT attempt to build the curriculum or suggest courses yet. Your sole focus is diagnosing the gaps.\n",
"- Provide a concise `reasoning` string for each identified gap. This reasoning MUST justify why the gap exists based on the user's experience level to prove the adaptive logic.\n",
"\n",
"\n",
"Return a valid JSON object only.\n",
"\n",
"\n",
"\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "059e5f86",
"metadata": {},
"outputs": [],
"source": [
"roadmap_planner_agent_prompt = \"\"\"\n",
"\n",
"You are an expert technical onboarding architect.\n",
"Transform a Skill Gap Analysis into a minimal, logically sequenced learning roadmap.\n",
"\n",
"\n",
"\n",
"STEP 1 ā SEARCH\n",
"For every gap ā call search_courses.\n",
"Use ONLY course IDs returned by the tool. Never guess IDs.\n",
"\n",
"STEP 2 ā RESOLVE PREREQUISITES\n",
"For each retrieved course inspect its prerequisites list.\n",
"If candidate's resume does NOT prove mastery ā call get_course_by_id for each missing prerequisite.\n",
"Skip courses the candidate already demonstrates via projects or experience.\n",
"\n",
"STEP 3 ā SEQUENCE\n",
"Prerequisites always before target modules.\n",
"sequence_order must be 1, 2, 3... strictly.\n",
"If is_fresher_adaptation_needed is True ā add a professional fundamentals module first.\n",
"\n",
"STEP 4 ā SUBMIT (TERMINAL STEP)\n",
"Call submit_final_roadmap ONCE with the complete roadmap.\n",
"Call submit_mermaid_visualization ONCE with the Mermaid string.\n",
"After both return ā STOP. Do not call any tool again.\n",
"\n",
"\n",
"\n",
"- gap courses ā :::gap\n",
"- known prerequisites ā :::known\n",
"- start node ā :::start\n",
"- end node ā :::done\n",
"- group by week using subgraph\n",
"\n",
"\n",
"\n",
"flowchart TD\n",
" A([Start ā Candidate's current skills]):::start\n",
" subgraph W1[\"Week 1 ā Core gaps\"]\n",
" B[CS-DOCKER-101\\nDocker & Containerization]:::gap\n",
" C[CS-PY-101\\nPython Fundamentals]:::known\n",
" end\n",
" subgraph W2[\"Week 2 ā Role readiness\"]\n",
" D[CS-CICD-201\\nCI/CD with GitHub Actions]:::gap\n",
" end\n",
" Z([Role-ready ā DevOps Engineer]):::done\n",
" A --> B & C\n",
" B --> D\n",
" D --> Z\n",
" classDef gap fill:#EEEDFE,stroke:#534AB7,color:#26215C\n",
" classDef known fill:#E1F5EE,stroke:#0F6E56,color:#085041\n",
" classDef start fill:#1D9E75,stroke:#0F6E56,color:#E1F5EE\n",
" classDef done fill:#534AB7,stroke:#3C3489,color:#EEEDFE\n",
"\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c4dea1e",
"metadata": {},
"outputs": [],
"source": [
"def input_node(state: OnboardingState):\n",
"    \"\"\"Load the resume PDF named by the state's file_path and return its text.\n",
"\n",
"    Returns a partial state update:\n",
"      - resume_text plus extraction_error=None when the PDF was read;\n",
"      - extraction_error only, when file_path is missing;\n",
"      - resume_text=None plus extraction_error when the loader raised.\n",
"    \"\"\"\n",
"    file_path = state.get(\"file_path\")\n",
"\n",
"    print(f\"📂 File path received: {file_path}\")\n",
"    print(f\"📁 File exists: {os.path.exists(file_path) if file_path else 'NO PATH'}\")\n",
"\n",
"    if not file_path:\n",
"        return {\"extraction_error\": \"Missing file_path in state\"}\n",
"\n",
"    try:\n",
"        loader = PyMuPDFLoader(file_path)\n",
"        docs = loader.load()\n",
"\n",
"        print(f\"📄 Pages loaded: {len(docs)}\")\n",
"\n",
"        # One document per page; join them into a single text blob.\n",
"        resume_text = \"\\n\".join([doc.page_content for doc in docs])\n",
"\n",
"        print(f\"📝 Text length: {len(resume_text)}\")\n",
"\n",
"        return {\n",
"            \"resume_text\": resume_text,\n",
"            \"extraction_error\": None\n",
"        }\n",
"\n",
"    except Exception as e:\n",
"        # Record the failure in state instead of aborting the graph here.\n",
"        print(f\"❌ PyMuPDF failed: {str(e)}\")\n",
"        return {\n",
"            \"resume_text\": None,\n",
"            \"extraction_error\": f\"Failed to load resume: {str(e)}\"\n",
"        }"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eb13ffc0",
"metadata": {},
"outputs": [],
"source": [
"def extractResumeDataNode(state: OnboardingState):\n",
"    \"\"\"Run the resume-parsing agent over resume_text.\n",
"\n",
"    Returns a partial state update containing resume_data (the parsed object,\n",
"    or None) and, on any failure, a readable extraction_error.\n",
"    \"\"\"\n",
"    resume_text = state.get(\"resume_text\")\n",
"\n",
"    # Guard 1 - empty text\n",
"    if not resume_text or len(resume_text.strip()) < 10:\n",
"        print(\"❌ RESUME TEXT EMPTY OR TOO SHORT\")\n",
"        # Keep the loader's own error (if any) rather than masking the root cause.\n",
"        upstream_error = state.get(\"extraction_error\")\n",
"        return {\"resume_data\": None, \"extraction_error\": upstream_error or \"Resume text is empty\"}\n",
"\n",
"    print(f\"📄 Resume text length: {len(resume_text)} chars\")\n",
"\n",
"    messages = [\n",
"        SystemMessage(content=resume_agent_prompt),\n",
"        HumanMessage(content=f\"{resume_text}\")\n",
"    ]\n",
"\n",
"    # Guard 2 - the agent call itself failed (handled the same way as the JD node)\n",
"    try:\n",
"        result = resume_agent.invoke(messages)\n",
"    except Exception as e:\n",
"        print(f\"❌ RESUME AGENT INVOKE FAILED: {e}\")\n",
"        return {\"resume_data\": None, \"extraction_error\": f\"Resume agent failed: {e}\"}\n",
"\n",
"    # Guard 3 - parsing failed\n",
"    if result.get(\"parsing_error\"):\n",
"        print(f\"❌ PARSING ERROR: {result['parsing_error']}\")\n",
"        return {\"resume_data\": None, \"extraction_error\": str(result[\"parsing_error\"])}\n",
"\n",
"    # Guard 4 - parsed is None\n",
"    parsed = result.get(\"parsed\")\n",
"    if parsed is None:\n",
"        print(f\"❌ PARSED IS NONE. RAW OUTPUT: {result.get('raw')}\")\n",
"        return {\"resume_data\": None, \"extraction_error\": \"LLM returned null schema\"}\n",
"\n",
"    # The prompt asks for `current_role`; getattr keeps a successful extraction\n",
"    # from crashing on a log line if the schema has no such attribute.\n",
"    role = getattr(parsed, \"current_role\", None) or getattr(parsed, \"job_title\", None)\n",
"    print(f\"✅ Resume extracted: {role}\")\n",
"    return {\"resume_data\": parsed}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "330acef6",
"metadata": {},
"outputs": [],
"source": [
"def extractJDDataNode(state: OnboardingState):\n",
"    \"\"\"Run the JD-parsing agent over job_description.\n",
"\n",
"    Always returns {\"JobDescriptionExtract_data\": ...}; an empty\n",
"    JobDescriptionExtract() is used when the text is missing, the agent call\n",
"    raises, or no parsed object comes back.\n",
"    \"\"\"\n",
"    # 1. Safety Check: Is the text even in the state?\n",
"    jd_text = state.get(\"job_description\", \"\")\n",
"\n",
"    if not jd_text or len(jd_text.strip()) < 5:\n",
"        print(\"DEBUGGER ERROR: job_description text is MISSING from state!\")\n",
"        return {\"JobDescriptionExtract_data\": JobDescriptionExtract()}\n",
"\n",
"    print(f\"DEBUGGER: Sending {len(jd_text)} characters to JD Agent...\")\n",
"\n",
"    messages = [\n",
"        SystemMessage(content=jd_agent_prompt),\n",
"        HumanMessage(content=f\"EXTRACT FROM THIS TEXT:\\n\\n{jd_text}\")\n",
"    ]\n",
"\n",
"    try:\n",
"        # 2. Invoke the agent\n",
"        result = jd_agent.invoke(messages)\n",
"\n",
"        # 3. The agent may return the parsed object directly or a dict with 'parsed'.\n",
"        parsed_data = result.get(\"parsed\") if isinstance(result, dict) else result\n",
"\n",
"        # 4. No parsed object: report it explicitly instead of relying on an\n",
"        #    AttributeError on None being swallowed by the except below.\n",
"        if parsed_data is None:\n",
"            parsing_error = result.get(\"parsing_error\") if isinstance(result, dict) else None\n",
"            print(f\"DEBUGGER ERROR: JD agent returned no parsed output: {parsing_error}\")\n",
"            return {\"JobDescriptionExtract_data\": JobDescriptionExtract()}\n",
"\n",
"        # 5. Critical Check: Did it actually find anything?\n",
"        job_title = getattr(parsed_data, \"job_title\", None)\n",
"        if job_title is None and getattr(parsed_data, \"tools_technologies\", None) is None:\n",
"            print(\"DEBUGGER WARNING: LLM returned empty schema! Checking prompt...\")\n",
"        else:\n",
"            print(f\"DEBUGGER SUCCESS: Extracted {job_title}\")\n",
"\n",
"        return {\"JobDescriptionExtract_data\": parsed_data}\n",
"\n",
"    except Exception as e:\n",
"        print(f\"DEBUGGER CRITICAL: Invoke failed: {str(e)}\")\n",
"        return {\"JobDescriptionExtract_data\": JobDescriptionExtract()}"
]
},
{
"cell_type": "markdown",
"id": "795e2446",
"metadata": {},
"source": [
"removing this ->\"skills\": {\"__all__\": {\"category\"}}, # Drops 'category' from every skill\n",
" \"experience\": {\"__all__\": {\"responsibilities\"}}, # Drops bullet points\n",
" \"projects\": {\"__all__\": {\"what_was_built\"}}, # Drops project descriptions\n",
" \"certifications\": {\"__all__\": {\"issuer\"}} # Drops the issuer"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7352181c",
"metadata": {},
"outputs": [],
"source": [
"def skill_gap_node(state: OnboardingState):\n",
"    \"\"\"Send the trimmed resume and JD to the gap-analysis agent.\n",
"\n",
"    Returns:\n",
"        {\"skill_gap_analysis_data\": <the agent's parsed result>}\n",
"\n",
"    Raises:\n",
"        ValueError: if resume_data or JobDescriptionExtract_data is missing, so the\n",
"            run stops with the real cause instead of an AttributeError on None.\n",
"    \"\"\"\n",
"    resume_data = state.get(\"resume_data\")\n",
"    raw_jd = state.get(\"JobDescriptionExtract_data\")\n",
"\n",
"    if resume_data is None:\n",
"        reason = state.get(\"extraction_error\") or \"no resume_data in state\"\n",
"        raise ValueError(f\"Cannot run skill gap analysis: resume extraction failed ({reason})\")\n",
"    if raw_jd is None:\n",
"        raise ValueError(\"Cannot run skill gap analysis: JobDescriptionExtract_data is missing from state\")\n",
"\n",
"    # To remove noise and reduce size of the prompt: drop every None/null field.\n",
"    lean_resume_dict = resume_data.model_dump(exclude_none=True)\n",
"\n",
"    # Strip the HR noise and text bloat from the JD.\n",
"    lean_jd_dict = raw_jd.model_dump(\n",
"        exclude={\n",
"            \"responsibilities\": True,  # Dropping verbose bullet points\n",
"            \"requirements\": True,\n",
"            \"constraints\": True\n",
"        },\n",
"        exclude_none=True  # Drops any null fields\n",
"    )\n",
"\n",
"    # The prompt is plain text, so both dicts go in as JSON strings.\n",
"    lean_resume_json = json.dumps(lean_resume_dict, indent=2)\n",
"    lean_jd_json = json.dumps(lean_jd_dict, indent=2)\n",
"\n",
"    messages = [\n",
"        SystemMessage(content=gap_analysis_agent_prompt),\n",
"        HumanMessage(content=f\"Users Resume:{lean_resume_json} Job Description:{lean_jd_json}\"),\n",
"    ]\n",
"\n",
"    result = gap_analysis_agent.invoke(messages)\n",
"\n",
"    return {\"skill_gap_analysis_data\": result[\"parsed\"]}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1fb2f0d1",
"metadata": {},
"outputs": [],
"source": [
"def roadmap_planning_node(state: OnboardingState):\n",
"    \"\"\"\n",
"    The agent's 'thinking' node. It looks at the Skill Gaps and\n",
"    decides which tool to call next.\n",
"\n",
"    Returns {\"messages\": [response]} so the planner's reply is added to the\n",
"    message scratchpad.\n",
"\n",
"    Raises:\n",
"        ValueError: if skill_gap_analysis_data is missing from state.\n",
"    \"\"\"\n",
"    skill_gap_data = state.get(\"skill_gap_analysis_data\")\n",
"    if skill_gap_data is None:\n",
"        raise ValueError(\"Cannot plan roadmap: skill_gap_analysis_data is missing from state\")\n",
"\n",
"    # Send JSON rather than a Python dict repr (single quotes, None/True);\n",
"    # default=str covers values json cannot encode natively.\n",
"    skill_gap_json = json.dumps(skill_gap_data.model_dump(), indent=2, default=str)\n",
"\n",
"    system_prompt = SystemMessage(content=roadmap_planner_agent_prompt)\n",
"    input_msg = HumanMessage(content=skill_gap_json)\n",
"\n",
"    # Earlier planner/tool messages follow the fixed prompt; list() because the\n",
"    # state field is typed as a Sequence, which need not support list concatenation.\n",
"    history = list(state.get(\"messages\") or [])\n",
"    response = roadmap_planner_agent.invoke([system_prompt, input_msg] + history)\n",
"\n",
"    return {\"messages\": [response]}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cea90664",
"metadata": {},
"outputs": [],
"source": [
"def finalize_state_node(state: OnboardingState):\n",
"    \"\"\"\n",
"    Final node that extracts structured data from the message scratchpad\n",
"    and populates the main state keys. No global variables needed!\n",
"\n",
"    Returns final_roadmap (the args of the latest submit_final_roadmap call)\n",
"    and mermaid_code (the mermaid_code arg of the latest\n",
"    submit_mermaid_visualization call); either is None when never submitted.\n",
"    \"\"\"\n",
"    final_roadmap = None\n",
"    mermaid_code = None\n",
"\n",
"    # Walk newest-first and keep only the FIRST hit per tool, so an older\n",
"    # submission can never overwrite the latest one.\n",
"    for msg in reversed(state.get(\"messages\") or []):\n",
"        tool_calls = getattr(msg, \"tool_calls\", None) or []\n",
"\n",
"        # Within a single message the last call is the newest, hence reversed().\n",
"        for tool_call in reversed(tool_calls):\n",
"            # 1. Extract the Roadmap JSON\n",
"            if tool_call[\"name\"] == \"submit_final_roadmap\":\n",
"                if final_roadmap is None:\n",
"                    final_roadmap = tool_call[\"args\"]\n",
"\n",
"            # 2. Extract the Mermaid String\n",
"            elif tool_call[\"name\"] == \"submit_mermaid_visualization\":\n",
"                if mermaid_code is None:\n",
"                    mermaid_code = tool_call[\"args\"].get(\"mermaid_code\")\n",
"\n",
"        # Once we have both, we can stop searching (an empty dict still counts).\n",
"        if final_roadmap is not None and mermaid_code is not None:\n",
"            break\n",
"\n",
"    return {\n",
"        \"final_roadmap\": final_roadmap,\n",
"        \"mermaid_code\": mermaid_code\n",
"    }"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba9f22e1",
"metadata": {},
"outputs": [],
"source": [
"tool_node = ToolNode(roadmap_planner_agent_tools)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5cfe4c3",
"metadata": {},
"outputs": [],
"source": [
"builder = StateGraph(OnboardingState)\n",
"\n",
"# Define Nodes\n",
"builder.add_node(\"input_node\", input_node)\n",
"builder.add_node(\"resume_data_extraction\", extractResumeDataNode)\n",
"builder.add_node(\"jd_data_extraction\", extractJDDataNode)\n",
"builder.add_node(\"skill_gap_analysis\", skill_gap_node)\n",
"builder.add_node(\"roadmap_planning_agent\", roadmap_planning_node)\n",
"builder.add_node(\"tools\", tool_node)  # Named 'tools' for tools_condition compatibility\n",
"builder.add_node(\"finalize_state\", finalize_state_node)\n",
"\n",
"# Define Entry Point and initial Extraction Parallelism\n",
"builder.set_entry_point(\"input_node\")\n",
"builder.add_edge(\"input_node\", \"resume_data_extraction\")\n",
"builder.add_edge(\"input_node\", \"jd_data_extraction\")\n",
"\n",
"# Join Extractions into Gap Analysis\n",
"builder.add_edge(\"resume_data_extraction\", \"skill_gap_analysis\")\n",
"builder.add_edge(\"jd_data_extraction\", \"skill_gap_analysis\")\n",
"\n",
"# Transition from Analysis to Planning Agent\n",
"builder.add_edge(\"skill_gap_analysis\", \"roadmap_planning_agent\")\n",
"\n",
"# Agentic ReAct Loop (Planning Agent <-> Tools)\n",
"builder.add_conditional_edges(\n",
"    \"roadmap_planning_agent\",\n",
"    tools_condition,\n",
"    {\n",
"        \"tools\": \"tools\",            # If tool_calls exist, go to tools\n",
"        \"__end__\": \"finalize_state\"  # If finished, go to finalize_state\n",
"    }\n",
")\n",
"\n",
"# Loop back to agent after tools\n",
"builder.add_edge(\"tools\", \"roadmap_planning_agent\")\n",
"\n",
"# finalize_state is the last node; give it an explicit terminal edge\n",
"# (\"__end__\" is the same sentinel string already used in the mapping above).\n",
"builder.add_edge(\"finalize_state\", \"__end__\")\n",
"\n",
"# Compile the Graph\n",
"graph = builder.compile()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "53588a77",
"metadata": {},
"outputs": [],
"source": [
"display(graph)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0edf8d59",
"metadata": {},
"outputs": [],
"source": [
"jd_text=\"\"\"Job Title: Backend Developer\n",
"\n",
"Company name: CodeForge\n",
"We are hiring a Backend Developer to build scalable APIs and backend systems.\n",
"\n",
"Responsibilities:\n",
"- Develop REST APIs using FastAPI\n",
"- Design and manage PostgreSQL databases\n",
"- Implement authentication and authorization systems\n",
"- Optimize performance and scalability\n",
"\n",
"Requirements:\n",
"- Strong knowledge of Python\n",
"- Experience with FastAPI or Django\n",
"- Good understanding of SQL and database design\n",
"- Familiarity with Docker\n",
"\n",
"Constraints:\n",
"- Location: Pune only\n",
"- Full-time role \"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "da3df5a4",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# Define the keys your React frontend actually needs\n",
"REQUIRED_KEYS = [\"candidate_name\", \"skill_gap_analysis_data\", \"mermaid_code\", \"final_roadmap\"]\n",
"\n",
"def export_ui_payload(state, filename=\"hook_output.json\"):\n",
"    \"\"\"\n",
"    Extracts the REQUIRED_KEYS from the graph state, writes them to `filename`\n",
"    as JSON and returns the same payload dict.\n",
"\n",
"    Keys whose value is None are left out. Values exposing .model_dump()\n",
"    (Pydantic objects) are dumped to dicts for JSON compatibility.\n",
"    \"\"\"\n",
"    ui_data = {}\n",
"\n",
"    for key in REQUIRED_KEYS:\n",
"        val = state.get(key)\n",
"\n",
"        if val is None:\n",
"            continue\n",
"\n",
"        # Pydantic object -> dict; dicts (final_roadmap) and strings (mermaid_code) pass through.\n",
"        ui_data[key] = val.model_dump() if hasattr(val, \"model_dump\") else val\n",
"\n",
"    # Save to the local file. ensure_ascii=False keeps non-ASCII text readable in\n",
"    # the UTF-8 file; default=str stringifies values json cannot encode natively.\n",
"    with open(filename, \"w\", encoding=\"utf-8\") as f:\n",
"        json.dump(ui_data, f, indent=2, ensure_ascii=False, default=str)\n",
"\n",
"    print(f\"✅ UI Payload successfully exported to {filename}\")\n",
"    return ui_data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a95b4db7",
"metadata": {},
"outputs": [],
"source": [
"initial_input = {\n",
" \"candidate_name\": \"Chirayu Jain\",\n",
" \"resume_text\": None,\n",
" \"job_description\": jd_text,\n",
" \"file_path\": r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\Testresume\\ChirayuResume.pdf\",\n",
" \"resume_data\": None,\n",
" \"extraction_error\": None,\n",
" \"JobDescriptionExtract_data\": None,\n",
" \"skill_gap_analysis_data\": None\n",
" \n",
" \n",
"}\n",
"import uuid\n",
"\n",
"\n",
"checkpointer = MemorySaver() \n",
"graph = builder.compile(checkpointer=checkpointer)\n",
"\n",
"THREAD_ID = str(uuid.uuid4())\n",
"\n",
"\n",
"\n",
"config = {\"configurable\": {\"thread_id\": THREAD_ID,\"langgraph_user_id\": \"Chirayu Jain\"}}\n",
"\n",
"final_state = graph.invoke(initial_input, config=config)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "093bdd6e",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"def run_graph_with_stream(graph, initial_input, config):\n",
"    \"\"\"\n",
"    Executes the graph in streaming mode to visualize the 'under the hood'\n",
"    process of node transitions and data updates.\n",
"\n",
"    Prints one section per node update, then returns the values of\n",
"    graph.get_state(config) once the stream is exhausted.\n",
"    \"\"\"\n",
"    print(\"🚀 Starting Graph Stream...\\n\")\n",
"\n",
"    # Using stream_mode=\"updates\" to see exactly what each node returns\n",
"    for event in graph.stream(initial_input, config, stream_mode=\"updates\"):\n",
"        for node_name, node_update in event.items():\n",
"            print(f\"--- 📍 Node: {node_name} ---\")\n",
"\n",
"            # Anything that is not a dict update has no keys to inspect.\n",
"            if not isinstance(node_update, dict):\n",
"                node_update = {}\n",
"\n",
"            # 1. Check for Tool Calls (The 'ReAct' thinking process)\n",
"            if node_update.get(\"messages\"):\n",
"                last_msg = node_update[\"messages\"][-1]\n",
"                tool_calls = getattr(last_msg, \"tool_calls\", None)\n",
"                content = getattr(last_msg, \"content\", None)\n",
"                if tool_calls:\n",
"                    for tool in tool_calls:\n",
"                        print(f\"🛠️ AGENT CALLING TOOL: {tool['name']}\")\n",
"                        print(f\"📋 ARGS: {json.dumps(tool['args'], indent=2, default=str)}\")\n",
"                elif content:\n",
"                    # Show a snippet of the AI's internal reasoning; str() because\n",
"                    # content is not guaranteed to be a plain string.\n",
"                    content_snippet = str(content)[:150].replace('\\n', ' ')\n",
"                    print(f\"🧠 AI THOUGHT: {content_snippet}...\")\n",
"\n",
"            # 2. Check for Data Extraction (JD/Resume results)\n",
"            if \"JobDescriptionExtract_data\" in node_update:\n",
"                jd = node_update[\"JobDescriptionExtract_data\"]\n",
"                print(f\"✅ Extracted JD: {getattr(jd, 'job_title', 'Unknown')}\")\n",
"\n",
"            if \"resume_data\" in node_update:\n",
"                res = node_update[\"resume_data\"]\n",
"                print(f\"✅ Extracted Resume for: {getattr(res, 'candidate_name', 'Unknown')}\")\n",
"\n",
"            # 3. Check for the final output keys (only announce real values, not None)\n",
"            if node_update.get(\"skill_gap_analysis_data\") is not None:\n",
"                print(\"🎯 Skill Gap Analysis Completed.\")\n",
"\n",
"            if node_update.get(\"learning_roadmap\") is not None or node_update.get(\"final_roadmap\") is not None:\n",
"                print(\"🏁 Final Roadmap Constructed.\")\n",
"\n",
"            print(\"\\n\" + \"=\"*50 + \"\\n\")\n",
"\n",
"    # Access the final state after the stream ends\n",
"    final_state = graph.get_state(config)\n",
"    print(\"✨ Stream Finished. Final state captured.\")\n",
"    return final_state.values\n",
"\n",
"# --- Example Usage ---\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a36ecb1",
"metadata": {},
"outputs": [],
"source": [
"config = {\"configurable\": {\"thread_id\": \"debug_123\"}}\n",
"final_result = run_graph_with_stream(graph, initial_input, config)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4e502949",
"metadata": {},
"outputs": [],
"source": [
"final_result"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "53ba21aa",
"metadata": {},
"outputs": [],
"source": [
"///break"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5afbce5b",
"metadata": {},
"outputs": [],
"source": [
"final_state"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "25a6b5b4",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# Define the keys your React frontend actually needs\n",
"REQUIRED_KEYS = [\"candidate_name\", \"skill_gap_analysis_data\", \"mermaid_code\", \"final_roadmap\"]\n",
"\n",
"def export_ui_payload(state, filename=\"ai_output.json\"):\n",
"    \"\"\"\n",
"    Extracts the REQUIRED_KEYS from the graph state, writes them to `filename`\n",
"    as JSON and returns the same payload dict.\n",
"\n",
"    Keys whose value is None are left out. Values exposing .model_dump()\n",
"    (Pydantic objects) are dumped to dicts for JSON compatibility.\n",
"    \"\"\"\n",
"    ui_data = {}\n",
"\n",
"    for key in REQUIRED_KEYS:\n",
"        val = state.get(key)\n",
"\n",
"        if val is None:\n",
"            continue\n",
"\n",
"        # Pydantic object -> dict; dicts (final_roadmap) and strings (mermaid_code) pass through.\n",
"        ui_data[key] = val.model_dump() if hasattr(val, \"model_dump\") else val\n",
"\n",
"    # Save to the local file. ensure_ascii=False keeps non-ASCII text readable in\n",
"    # the UTF-8 file; default=str stringifies values json cannot encode natively.\n",
"    with open(filename, \"w\", encoding=\"utf-8\") as f:\n",
"        json.dump(ui_data, f, indent=2, ensure_ascii=False, default=str)\n",
"\n",
"    print(f\"✅ UI Payload successfully exported to {filename}\")\n",
"    return ui_data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "26c10157",
"metadata": {},
"outputs": [],
"source": [
"export_ui_payload(final_state)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "478f19dd",
"metadata": {},
"outputs": [],
"source": [
"test_data=['atgdata.json','buisnessdata.json','chefdata.json','casemanager.json']\n",
"test_resumes=['ATGPDF.pdf','Business.pdf','CHEF.pdf','casemanager.pdf']\n",
"\n",
"\n",
"test_resume_path=r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\Testresume\\{test_resumes}\"\n",
"test_data_path=r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\data\\{test_data}\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "83f3bd72",
"metadata": {},
"outputs": [],
"source": [
"store_state=[]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5b29b7ea",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import uuid\n",
"import os\n",
"from langgraph.checkpoint.memory import MemorySaver\n",
"\n",
"# --- Configuration & Paths ---\n",
"\n",
"test_map = [\n",
" {\"resume\": \"ATGPDF.pdf\", \"data\": \"atgdata.json\", \"name\": \"Atharva_Gaykar\"},\n",
" \n",
"]\n",
"\n",
"RESUME_DIR = r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\Testresume\"\n",
"DATA_DIR = r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\data\"\n",
"\n",
"# Windows-safe absolute output path\n",
"OUTPUT_DIR = os.path.join(os.getcwd(), \"predictions\")\n",
"\n",
"if not os.path.exists(OUTPUT_DIR):\n",
" os.makedirs(OUTPUT_DIR)\n",
"\n",
"\n",
"# --- Helper Functions ---\n",
"\n",
"def get_job_description_string(data_filename: str) -> str | None:\n",
"    \"\"\"\n",
"    Extracts the Job Description from the test data JSON in DATA_DIR and\n",
"    formats it as a clean string for the extraction node.\n",
"\n",
"    Looks under 'job_description', then 'job_description_requirements'.\n",
"    Returns None (after printing the reason) when the file is missing,\n",
"    has no JD entry, or cannot be read.\n",
"    \"\"\"\n",
"    path = os.path.join(DATA_DIR, data_filename)\n",
"\n",
"    if not os.path.exists(path):\n",
"        print(f\"⚠️ Data file not found: {path}\")\n",
"        return None\n",
"\n",
"    try:\n",
"        with open(path, \"r\", encoding=\"utf-8\") as f:\n",
"            suite = json.load(f)\n",
"\n",
"        jd_obj = suite.get(\"job_description\") or suite.get(\"job_description_requirements\")\n",
"\n",
"        if not jd_obj:\n",
"            print(f\"⚠️ No JD key found in {data_filename}\")\n",
"            return None\n",
"\n",
"        # A JD stored as plain text is already in the shape the node expects.\n",
"        if isinstance(jd_obj, str):\n",
"            return jd_obj\n",
"\n",
"        # `or` chain so an explicit null title still falls back to \"N/A\".\n",
"        title = jd_obj.get(\"title\") or jd_obj.get(\"job_title\") or \"N/A\"\n",
"        desc = jd_obj.get(\"description\", \"\")\n",
"        reqs = jd_obj.get(\"requirements\") or []\n",
"        if isinstance(reqs, str):\n",
"            # A single string would otherwise be bulleted character by character.\n",
"            reqs = [reqs]\n",
"\n",
"        jd_string = f\"JOB TITLE: {title}\\n\\n\"\n",
"        if desc:\n",
"            jd_string += f\"OVERVIEW: {desc}\\n\\n\"\n",
"        jd_string += \"REQUIREMENTS:\\n\" + \"\\n\".join([f\"- {r}\" for r in reqs])\n",
"\n",
"        return jd_string\n",
"\n",
"    except Exception as e:\n",
"        print(f\"❌ Error loading JD from {data_filename}: {e}\")\n",
"        return None\n",
"\n",
"\n",
"def export_ui_payload(state: dict) -> dict:\n",
"    \"\"\"\n",
"    Build the UI payload from a graph state.\n",
"\n",
"    Only the four UI keys are considered; a key whose value is None is\n",
"    omitted, and a value with .model_dump() is replaced by its dumped dict.\n",
"    \"\"\"\n",
"    wanted = (\n",
"        \"candidate_name\",\n",
"        \"skill_gap_analysis_data\",\n",
"        \"mermaid_code\",\n",
"        \"final_roadmap\",\n",
"    )\n",
"    payload = {}\n",
"\n",
"    for field in wanted:\n",
"        value = state.get(field)\n",
"        if value is not None:\n",
"            # Pydantic objects become plain dicts; everything else is kept as-is.\n",
"            payload[field] = value.model_dump() if hasattr(value, \"model_dump\") else value\n",
"\n",
"    return payload\n",
"\n",
"\n",
"# --- Execution Loop ---\n",
"\n",
"def run_evaluation_suite(graph_instance):\n",
"    \"\"\"\n",
"    Runs the graph for every resume in test_map.\n",
"    Saves UI-ready payloads as predicted_{name}.json in OUTPUT_DIR.\n",
"\n",
"    Cases with a missing resume or JD are skipped. A failure in one case is\n",
"    printed and does not stop the remaining cases. Each final state is also\n",
"    appended to the module-level store_state list.\n",
"    \"\"\"\n",
"    # Make sure the target folder exists even if the setup lines were not re-run.\n",
"    os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
"    print(f\"\\n📁 Output directory: {OUTPUT_DIR}\\n\")\n",
"\n",
"    for case in test_map:\n",
"        print(f\"🚀 Processing: {case['resume']}...\")\n",
"\n",
"        # 1. Validate resume file exists\n",
"        resume_path = os.path.join(RESUME_DIR, case[\"resume\"])\n",
"        if not os.path.exists(resume_path):\n",
"            print(f\"⚠️ Resume not found, skipping: {resume_path}\")\n",
"            continue\n",
"\n",
"        # 2. Load JD string\n",
"        jd_content = get_job_description_string(case[\"data\"])\n",
"        if not jd_content:\n",
"            print(f\"⚠️ Skipping {case['resume']}: JD not found in {case['data']}\")\n",
"            continue\n",
"\n",
"        # 3. Build initial state\n",
"        initial_input = {\n",
"            \"candidate_name\": case[\"name\"].replace(\"_\", \" \"),\n",
"            \"file_path\": resume_path,\n",
"            \"job_description\": jd_content,\n",
"            \"resume_text\": None,\n",
"            \"resume_data\": None,\n",
"            \"extraction_error\": None,\n",
"            \"JobDescriptionExtract_data\": None,\n",
"            \"skill_gap_analysis_data\": None,\n",
"            \"messages\": [],\n",
"            \"mermaid_code\": None,\n",
"            \"final_roadmap\": None,\n",
"        }\n",
"\n",
"        # 4. Invoke graph on a fresh thread so runs do not share history\n",
"        config = {\"configurable\": {\"thread_id\": str(uuid.uuid4())}}\n",
"\n",
"        try:\n",
"            final_state = graph_instance.invoke(initial_input, config=config)\n",
"\n",
"            store_state.append(final_state)\n",
"\n",
"            print(f\"✅ Graph execution successful for {case['resume']}\\n\")\n",
"\n",
"            # 5. Export payload\n",
"            prediction = export_ui_payload(final_state)\n",
"            output_file = f\"predicted_{case['name'].lower()}.json\"\n",
"            output_path = os.path.join(OUTPUT_DIR, output_file)\n",
"\n",
"            # default=str keeps one non-JSON value from losing the whole prediction.\n",
"            with open(output_path, \"w\", encoding=\"utf-8\") as f:\n",
"                json.dump(prediction, f, indent=2, ensure_ascii=False, default=str)\n",
"\n",
"            print(f\"✅ Saved: {output_path}\\n\")\n",
"\n",
"        except Exception as e:\n",
"            print(f\"❌ Error during graph execution for {case['resume']}: {e}\\n\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1638a3f",
"metadata": {},
"outputs": [],
"source": [
"if __name__ == \"__main__\":\n",
" # Assuming your graph is already compiled and named 'graph'\n",
" run_evaluation_suite(graph)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0910b325",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import uuid\n",
"import os\n",
"from langgraph.checkpoint.memory import MemorySaver\n",
"\n",
"# --- Configuration & Paths ---\n",
"# Mapping resumes to the JSON files containing the Ground Truth data we created\n",
"test_map = [\n",
" \n",
" {\"resume\": \"casemanager.pdf\", \"data\": \"casemanagerdata.json\", \"name\": \"Case_Manager\"}\n",
"]\n",
"\n",
"# Update these to your actual local paths where the files are stored\n",
"RESUME_DIR = r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\Testresume\"\n",
"DATA_DIR = r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\data\"\n",
"OUTPUT_DIR = \"./predictions\" \n",
"\n",
"if not os.path.exists(OUTPUT_DIR):\n",
" os.makedirs(OUTPUT_DIR)\n",
"\n",
"# --- Helper Functions ---\n",
"\n",
"def get_job_description_string(data_filename):\n",
"    \"\"\"\n",
"    Extracts the Job Description from the test data in DATA_DIR and formats it\n",
"    as a clean string for the Extraction Node. Handles both 'job_description'\n",
"    and 'job_description_requirements' keys.\n",
"\n",
"    Returns None when no JD entry exists or the file cannot be loaded.\n",
"    \"\"\"\n",
"    path = os.path.join(DATA_DIR, data_filename)\n",
"    try:\n",
"        with open(path, 'r', encoding='utf-8') as f:\n",
"            suite = json.load(f)\n",
"\n",
"        # Extract from 'job_description' or 'job_description_requirements'\n",
"        jd_obj = suite.get(\"job_description\") or suite.get(\"job_description_requirements\")\n",
"\n",
"        if not jd_obj:\n",
"            return None\n",
"\n",
"        # A JD stored as plain text is already in the shape the node expects.\n",
"        if isinstance(jd_obj, str):\n",
"            return jd_obj\n",
"\n",
"        # `or` chain so an explicit null title still falls back to \"N/A\".\n",
"        title = jd_obj.get(\"title\") or jd_obj.get(\"job_title\") or \"N/A\"\n",
"        desc = jd_obj.get(\"description\", \"\")\n",
"        reqs = jd_obj.get(\"requirements\") or []\n",
"        if isinstance(reqs, str):\n",
"            # A single string would otherwise be bulleted character by character.\n",
"            reqs = [reqs]\n",
"\n",
"        # Format as a clean string for the LLM to analyze\n",
"        jd_string = f\"JOB TITLE: {title}\\n\\n\"\n",
"        if desc:\n",
"            jd_string += f\"OVERVIEW: {desc}\\n\\n\"\n",
"        jd_string += \"REQUIREMENTS:\\n\" + \"\\n\".join([f\"- {r}\" for r in reqs])\n",
"\n",
"        return jd_string\n",
"    except Exception as e:\n",
"        print(f\"Error loading JD from {data_filename}: {e}\")\n",
"        return None\n",
"\n",
"def export_ui_payload(state):\n",
"    \"\"\"\n",
"    Collect the UI-facing keys from a graph state into a plain dict.\n",
"\n",
"    None values are dropped; objects offering .model_dump() are converted\n",
"    to dicts, anything else (dict, list, string) is copied through unchanged.\n",
"    \"\"\"\n",
"    wanted = (\"candidate_name\", \"skill_gap_analysis_data\", \"mermaid_code\", \"final_roadmap\")\n",
"    payload = {}\n",
"\n",
"    for field in wanted:\n",
"        value = state.get(field)\n",
"        if value is not None:\n",
"            payload[field] = value.model_dump() if hasattr(value, \"model_dump\") else value\n",
"\n",
"    return payload\n",
"\n",
"# --- Execution Loop ---\n",
"\n",
"def run_evaluation_suite_re(graph_instance):\n",
"    \"\"\"\n",
"    Automates the graph execution for every resume in the test suite.\n",
"    Saves the final UI-ready payloads as 'predicted_{name}.json' in OUTPUT_DIR.\n",
"\n",
"    Cases with a missing resume or JD are skipped; a failure in one case is\n",
"    printed and does not stop the remaining cases.\n",
"    \"\"\"\n",
"    for case in test_map:\n",
"        print(f\"🚀 Processing: {case['resume']}...\")\n",
"\n",
"        # 1. Prepare Inputs\n",
"        resume_path = os.path.join(RESUME_DIR, case['resume'])\n",
"        if not os.path.exists(resume_path):\n",
"            print(f\"⚠️ Resume not found, skipping: {resume_path}\")\n",
"            continue\n",
"\n",
"        jd_content = get_job_description_string(case['data'])\n",
"\n",
"        if not jd_content:\n",
"            print(f\"⚠️ Skipping {case['resume']}: JD not found in {case['data']}\")\n",
"            continue\n",
"\n",
"        # input_node reads the PDF location from 'file_path' (not 'resume_path');\n",
"        # with the wrong key every run ends in \"Missing file_path in state\".\n",
"        initial_input = {\n",
"            \"candidate_name\": case['name'].replace(\"_\", \" \"),\n",
"            \"file_path\": resume_path,\n",
"            \"job_description\": jd_content,\n",
"            \"resume_text\": None,  # filled in by input_node\n",
"            \"resume_data\": None,\n",
"            \"extraction_error\": None,\n",
"            \"JobDescriptionExtract_data\": None,\n",
"            \"skill_gap_analysis_data\": None,\n",
"            \"messages\": [],\n",
"            \"mermaid_code\": None,\n",
"            \"final_roadmap\": None,\n",
"        }\n",
"\n",
"        # 2. Invoke Graph with a unique thread\n",
"        thread_id = str(uuid.uuid4())\n",
"        config = {\"configurable\": {\"thread_id\": thread_id}}\n",
"\n",
"        try:\n",
"            # Execution\n",
"            final_state = graph_instance.invoke(initial_input, config=config)\n",
"\n",
"            # 3. Process and Save UI Payload\n",
"            prediction = export_ui_payload(final_state)\n",
"            output_file = f\"predicted_{case['name'].lower()}.json\"\n",
"            output_path = os.path.join(OUTPUT_DIR, output_file)\n",
"\n",
"            with open(output_path, \"w\", encoding=\"utf-8\") as f:\n",
"                json.dump(prediction, f, indent=2)\n",
"\n",
"            print(f\"✅ Success! Prediction saved to: {output_path}\")\n",
"\n",
"        except Exception as e:\n",
"            print(f\"❌ Error during graph execution for {case['resume']}: {e}\")\n",
"\n",
"# --- Example of Triggering ---\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a8ef7f0b",
"metadata": {},
"outputs": [],
"source": [
"if __name__ == \"__main__\":\n",
" # Assuming your graph is already compiled and named 'graph'\n",
" run_evaluation_suite_re(graph)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e37e4370",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import os\n",
"import re\n",
"from rouge_score import rouge_scorer\n",
"\n",
"def normalize(text):\n",
" if not text: return \"\"\n",
" return re.sub(r'\\W+', ' ', str(text).lower()).strip()\n",
"\n",
"def calculate_f1(target_set, predicted_set):\n",
" if not target_set and not predicted_set: return 1.0\n",
" if not target_set or not predicted_set: return 0.0\n",
" intersection = target_set.intersection(predicted_set)\n",
" precision = len(intersection) / len(predicted_set)\n",
" recall = len(intersection) / len(target_set)\n",
" if (precision + recall) == 0:\n",
" return 0.0\n",
" return 2 * (precision * recall) / (precision + recall)\n",
"\n",
"# ā replaces your manual calculate_lcs + get_rouge_l\n",
"scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)\n",
"\n",
"def get_rouge_l(reference, candidate):\n",
" if not reference or not candidate:\n",
" return 0.0\n",
" scores = scorer.score(reference, candidate)\n",
" return scores['rougeL'].fmeasure # F1 score directly\n",
"\n",
"\n",
"def run_benchmarking_report(target_data_dir, predicted_data_dir, mapping):\n",
" print(\"\\nš --- AI ENGINE PERFORMANCE REPORT ---\")\n",
" print(f\"{'Test Case':<20} | {'Skill F1':<10} | {'Reasoning (RG-L)':<15} | {'Retrieval (Hit)':<15}\")\n",
" print(\"-\" * 75)\n",
"\n",
" final_report = {}\n",
"\n",
" for case in mapping:\n",
" name = case['name']\n",
" target_path = os.path.join(target_data_dir, case['data'])\n",
" pred_path = os.path.join(predicted_data_dir, f\"predicted_{name.lower()}.json\")\n",
"\n",
" if not os.path.exists(target_path) or not os.path.exists(pred_path):\n",
" print(f\"ā ļø Skipping {name} ā file not found\")\n",
" continue\n",
"\n",
" try:\n",
" with open(target_path, 'r', encoding='utf-8') as f:\n",
" target_json = json.load(f)[\"ai_target\"]\n",
" with open(pred_path, 'r', encoding='utf-8') as f:\n",
" pred_json = json.load(f)\n",
"\n",
" # 1. Skill Extraction F1\n",
" target_skills = {normalize(g['skill_name']) for g in target_json['skill_gap_analysis_data']['analyzed_gaps']}\n",
" pred_skills = {normalize(g['skill_name']) for g in pred_json['skill_gap_analysis_data']['analyzed_gaps']}\n",
" skill_f1 = calculate_f1(target_skills, pred_skills)\n",
"\n",
" # 2. Reasoning Quality ā ROUGE-L via library\n",
" reasoning_rouge = get_rouge_l(\n",
" target_json['skill_gap_analysis_data']['executive_summary'],\n",
" pred_json['skill_gap_analysis_data']['executive_summary']\n",
" )\n",
"\n",
" # 3. Retrieval Hit Rate\n",
" target_ids = {normalize(c['course_id']) for c in target_json['final_roadmap']['roadmap']}\n",
" pred_ids = {normalize(c['course_id']) for c in pred_json['final_roadmap']['roadmap']}\n",
" hit_count = len(target_ids.intersection(pred_ids))\n",
" hit_rate = hit_count / len(target_ids) if target_ids else 0.0\n",
"\n",
" print(f\"{name:<20} | {skill_f1:>8.2%} | {reasoning_rouge:>15.2%} | {hit_rate:>15.2%}\")\n",
"\n",
" final_report[name] = {\n",
" \"skill_extraction_f1\": round(skill_f1, 4),\n",
" \"reasoning_rouge_l\": round(reasoning_rouge, 4),\n",
" \"retrieval_hit_rate\": round(hit_rate, 4),\n",
" }\n",
"\n",
" except Exception as e:\n",
" print(f\"ā ļø Error processing {name}: {e}\")\n",
"\n",
" # Average across all test cases\n",
" if final_report:\n",
" avg_f1 = sum(v[\"skill_extraction_f1\"] for v in final_report.values()) / len(final_report)\n",
" avg_rouge = sum(v[\"reasoning_rouge_l\"] for v in final_report.values()) / len(final_report)\n",
" avg_hit = sum(v[\"retrieval_hit_rate\"] for v in final_report.values()) / len(final_report)\n",
" print(\"-\" * 75)\n",
" print(f\"{'AVERAGE':<20} | {avg_f1:>8.2%} | {avg_rouge:>15.2%} | {avg_hit:>15.2%}\")\n",
"\n",
" return final_report\n",
"\n",
"\n",
"# --- Mapping & Paths ---\n",
"mapping = [\n",
" {\"data\": \"atgdata.json\", \"name\": \"Atharva_Gaykar\"},\n",
" \n",
"]\n",
"\n",
"DATA_DIR = r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\data\"\n",
"PRED_DIR = \"./predictions\"\n",
"\n",
"report = run_benchmarking_report(DATA_DIR, PRED_DIR, mapping)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "70ced174",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import os\n",
"import re\n",
"\n",
"def normalize(text):\n",
" \"\"\"Clean and normalize text for comparison.\"\"\"\n",
" if not text: return \"\"\n",
" return re.sub(r'\\W+', ' ', str(text).lower()).strip()\n",
"\n",
"def calculate_lcs(X, Y):\n",
" \"\"\"Calculates the length of the Longest Common Subsequence.\"\"\"\n",
" m, n = len(X), len(Y)\n",
" L = [[0] * (n + 1) for _ in range(m + 1)]\n",
" for i in range(m + 1):\n",
" for j in range(n + 1):\n",
" if i == 0 or j == 0:\n",
" L[i][j] = 0\n",
" elif X[i-1] == Y[j-1]:\n",
" L[i][j] = L[i-1][j-1] + 1\n",
" else:\n",
" L[i][j] = max(L[i-1][j], L[i][j-1])\n",
" return L[m][n]\n",
"\n",
"def get_rouge_l(reference, candidate):\n",
" \"\"\"Calculates ROUGE-L F1 score using LCS.\"\"\"\n",
" if not reference or not candidate:\n",
" return 0.0\n",
" ref_tokens = normalize(reference).split()\n",
" cand_tokens = normalize(candidate).split()\n",
" if not ref_tokens or not cand_tokens:\n",
" return 0.0\n",
" lcs_count = calculate_lcs(ref_tokens, cand_tokens)\n",
" recall = lcs_count / len(ref_tokens)\n",
" precision = lcs_count / len(cand_tokens)\n",
" if (recall + precision) == 0:\n",
" return 0.0\n",
" f1 = (2 * recall * precision) / (recall + precision)\n",
" return f1\n",
"\n",
"def calculate_f1(target_set, predicted_set):\n",
" \"\"\"Calculates Precision, Recall, and F1 for sets of entities (Skills/IDs).\"\"\"\n",
" if not target_set and not predicted_set: return 1.0\n",
" if not target_set or not predicted_set: return 0.0\n",
" intersection = target_set.intersection(predicted_set)\n",
" precision = len(intersection) / len(predicted_set)\n",
" recall = len(intersection) / len(target_set)\n",
" if (precision + recall) == 0:\n",
" return 0.0\n",
" return 2 * (precision * recall) / (precision + recall)\n",
"\n",
"def run_benchmarking_report(target_data_dir, predicted_data_dir, mapping):\n",
" \"\"\"\n",
" Main evaluation loop benchmarking predictions against ground truth.\n",
" Handles missing keys and files gracefully.\n",
" \"\"\"\n",
" print(\"\\nš --- AI ENGINE PERFORMANCE REPORT ---\")\n",
" print(f\"{'Test Case':<20} | {'Skill F1':<10} | {'Reasoning (RG-L)':<15} | {'Retrieval (Hit)':<15}\")\n",
" print(\"-\" * 75)\n",
"\n",
" final_report = {}\n",
"\n",
" for case in mapping:\n",
" name = case['name']\n",
" target_path = os.path.join(target_data_dir, case['data'])\n",
" # Look for the predicted file (lowercase name mapper)\n",
" pred_filename = f\"predicted_{name.lower()}.json\"\n",
" pred_path = os.path.join(predicted_data_dir, pred_filename)\n",
" \n",
" # 1. Check for File Existence\n",
" if not os.path.exists(target_path):\n",
" print(f\"ā ļø Skipping {name} ā Target file '{case['data']}' not found.\")\n",
" continue\n",
" if not os.path.exists(pred_path):\n",
" print(f\"ā ļø Skipping {name} ā Prediction file '{pred_filename}' not found.\")\n",
" continue\n",
" \n",
" try:\n",
" with open(target_path, 'r', encoding='utf-8') as f:\n",
" target_json = json.load(f).get(\"ai_target\", {})\n",
" with open(pred_path, 'r', encoding='utf-8') as f:\n",
" pred_json = json.load(f)\n",
" \n",
" # --- Metric 1: Skill Extraction Accuracy (F1) ---\n",
" target_gap_data = target_json.get('skill_gap_analysis_data', {})\n",
" pred_gap_data = pred_json.get('skill_gap_analysis_data', {})\n",
" \n",
" target_skills = {normalize(g.get('skill_name')) for g in target_gap_data.get('analyzed_gaps', [])}\n",
" pred_skills = {normalize(g.get('skill_name')) for g in pred_gap_data.get('analyzed_gaps', [])}\n",
" skill_f1 = calculate_f1(target_skills, pred_skills)\n",
" \n",
" # --- Metric 2: Reasoning Quality (ROUGE-L) ---\n",
" target_summary = target_gap_data.get('executive_summary', \"\")\n",
" pred_summary = pred_gap_data.get('executive_summary', \"\")\n",
" reasoning_rouge = get_rouge_l(target_summary, pred_summary)\n",
" \n",
" # --- Metric 3: Retrieval Precision (Top-1 Hit Rate) ---\n",
" # Extract expected IDs from target\n",
" target_roadmap = target_json.get('final_roadmap', {}).get('roadmap', [])\n",
" target_ids = {normalize(c.get('course_id')) for c in target_roadmap}\n",
" \n",
" # Extract predicted IDs from prediction (Checking common possible keys)\n",
" pred_roadmap_obj = pred_json.get('final_roadmap', {})\n",
" # If final_roadmap is a list directly in some versions\n",
" if isinstance(pred_roadmap_obj, list):\n",
" pred_roadmap = pred_roadmap_obj\n",
" else:\n",
" pred_roadmap = pred_roadmap_obj.get('roadmap', [])\n",
" \n",
" pred_ids = {normalize(c.get('course_id')) for c in pred_roadmap}\n",
" \n",
" if target_ids:\n",
" hit_count = len(target_ids.intersection(pred_ids))\n",
" hit_rate = hit_count / len(target_ids)\n",
" else:\n",
" hit_rate = 0.0\n",
" \n",
" # Print status row\n",
" print(f\"{name:<20} | {skill_f1:>8.2%} | {reasoning_rouge:>15.2%} | {hit_rate:>15.2%}\")\n",
" \n",
" final_report[name] = {\n",
" \"skill_extraction_f1\": skill_f1,\n",
" \"reasoning_rouge_l\": reasoning_rouge,\n",
" \"retrieval_hit_rate\": hit_rate\n",
" }\n",
" except Exception as e:\n",
" print(f\"ā ļø Error processing {name}: {str(e)}\")\n",
"\n",
" return final_report\n",
"\n",
"# --- Mapping & Paths ---\n",
"mapping = [\n",
" {\"data\": \"atgdata.json\", \"name\": \"Atharva_Gaykar\"},\n",
" {\"data\": \"buisnessdata.json\", \"name\": \"Business_Manager\"},\n",
" {\"data\": \"chefdata.json\", \"name\": \"Executive_Chef\"},\n",
" {\"data\": \"casemanagerdata.json\", \"name\": \"Case_Manager\"}\n",
"]\n",
"\n",
"# Note: Ensure these paths are correct for your local environment\n",
"DATA_DIR = r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\data\"\n",
"PRED_DIR = \"./predictions\"\n",
"\n",
"\n",
"if __name__ == \"__main__\":\n",
" report = run_benchmarking_report(DATA_DIR, PRED_DIR, mapping)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "73a45cfd",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import os\n",
"import re\n",
"from rouge_score import rouge_scorer\n",
"\n",
"# -----------------------------\n",
"# Scorer instance (created once)\n",
"# -----------------------------\n",
"scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)\n",
"\n",
"\n",
"def normalize(text):\n",
" if not text: return \"\"\n",
" return re.sub(r'\\W+', ' ', str(text).lower()).strip()\n",
"\n",
"\n",
"def get_rouge_l(reference, candidate):\n",
" if not reference or not candidate:\n",
" return 0.0\n",
" return scorer.score(reference, candidate)['rougeL'].fmeasure\n",
"\n",
"\n",
"def calculate_skill_f1(target_skills, pred_skills):\n",
" if not target_skills and not pred_skills: return 1.0\n",
" if not target_skills or not pred_skills: return 0.0\n",
"\n",
" tp = 0\n",
" matched_targets = set()\n",
"\n",
" for p in pred_skills:\n",
" for t in target_skills:\n",
" if t in matched_targets: continue\n",
" if p in t or t in p:\n",
" tp += 1\n",
" matched_targets.add(t)\n",
" break\n",
"\n",
" precision = tp / len(pred_skills)\n",
" recall = tp / len(target_skills)\n",
"\n",
" if (precision + recall) == 0:\n",
" return 0.0\n",
" return 2 * (precision * recall) / (precision + recall)\n",
"\n",
"\n",
"def calculate_retrieval_hit(target_ids, pred_ids):\n",
" if not target_ids: return 0.0\n",
" if not pred_ids: return 0.0\n",
" return len(target_ids.intersection(pred_ids)) / len(target_ids)\n",
"\n",
"\n",
"def run_benchmarking_report(target_data_dir, predicted_data_dir, mapping):\n",
" print(\"\\nš --- AI ENGINE PERFORMANCE REPORT ---\")\n",
" print(f\"{'Test Case':<20} | {'Skill F1':<10} | {'Reasoning (RG-L)':<15} | {'Retrieval (Hit)':<15}\")\n",
" print(\"-\" * 75)\n",
"\n",
" final_report = {}\n",
"\n",
" for case in mapping:\n",
" name = case['name']\n",
" target_path = os.path.join(target_data_dir, case['data'])\n",
" pred_path = os.path.join(predicted_data_dir, f\"predicted_{name.lower()}.json\")\n",
"\n",
" if not os.path.exists(target_path) or not os.path.exists(pred_path):\n",
" if not os.path.exists(pred_path):\n",
" print(f\"{name:<20} | SKIPPED ā prediction file not found\")\n",
" continue\n",
"\n",
" try:\n",
" with open(target_path, 'r', encoding='utf-8') as f:\n",
" target_json = json.load(f).get(\"ai_target\", {})\n",
" with open(pred_path, 'r', encoding='utf-8') as f:\n",
" pred_json = json.load(f)\n",
"\n",
" # --- Metric 1: Skill F1 ---\n",
" target_gap = target_json.get('skill_gap_analysis_data', {})\n",
" pred_gap = pred_json.get('skill_gap_analysis_data', {})\n",
"\n",
" target_skills = {normalize(g.get('skill_name')) for g in target_gap.get('analyzed_gaps', [])}\n",
" pred_skills = {normalize(g.get('skill_name')) for g in pred_gap.get('analyzed_gaps', [])}\n",
" skill_f1 = calculate_skill_f1(target_skills, pred_skills)\n",
"\n",
" # --- Metric 2: ROUGE-L ---\n",
" reasoning_rouge = get_rouge_l(\n",
" target_gap.get('executive_summary', \"\"),\n",
" pred_gap.get('executive_summary', \"\")\n",
" )\n",
"\n",
" # --- Metric 3: Retrieval Hit Rate ---\n",
" target_ids = {normalize(c.get('course_id')) for c in target_json.get('final_roadmap', {}).get('roadmap', [])}\n",
"\n",
" pred_roadmap_obj = pred_json.get('final_roadmap', {})\n",
" pred_roadmap = pred_roadmap_obj if isinstance(pred_roadmap_obj, list) else pred_roadmap_obj.get('roadmap', [])\n",
" pred_ids = {normalize(c.get('course_id')) for c in pred_roadmap}\n",
"\n",
" hit_rate = calculate_retrieval_hit(target_ids, pred_ids)\n",
"\n",
" print(f\"{name:<20} | {skill_f1:>8.2%} | {reasoning_rouge:>15.2%} | {hit_rate:>15.2%}\")\n",
"\n",
" final_report[name] = {\n",
" \"skill_extraction_f1\": round(skill_f1, 4),\n",
" \"reasoning_rouge_l\": round(reasoning_rouge, 4),\n",
" \"retrieval_hit_rate\": round(hit_rate, 4),\n",
" }\n",
"\n",
" except Exception as e:\n",
" print(f\"ā ļø Error processing {name}: {str(e)}\")\n",
"\n",
" # --- Average Row ---\n",
" if final_report:\n",
" avg_f1 = sum(v[\"skill_extraction_f1\"] for v in final_report.values()) / len(final_report)\n",
" avg_rouge = sum(v[\"reasoning_rouge_l\"] for v in final_report.values()) / len(final_report)\n",
" avg_hit = sum(v[\"retrieval_hit_rate\"] for v in final_report.values()) / len(final_report)\n",
" print(\"-\" * 75)\n",
" print(f\"{'AVERAGE':<20} | {avg_f1:>8.2%} | {avg_rouge:>15.2%} | {avg_hit:>15.2%}\")\n",
"\n",
" return final_report\n",
"\n",
"\n",
"# --- Mapping & Paths ---\n",
"mapping = [\n",
" {\"data\": \"atgdata.json\", \"name\": \"Atharva_Gaykar\"},\n",
" {\"data\": \"buisnessdata.json\", \"name\": \"Business_Manager\"},\n",
" {\"data\": \"chefdata.json\", \"name\": \"Executive_Chef\"},\n",
" {\"data\": \"casemanager.json\", \"name\": \"Case_Manager\"}\n",
"]\n",
"\n",
"DATA_DIR = r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\data\"\n",
"PRED_DIR = \"./predictions\"\n",
"\n",
"if __name__ == \"__main__\":\n",
" run_benchmarking_report(DATA_DIR, PRED_DIR, mapping)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}