{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "e02e1b00", "metadata": {}, "outputs": [], "source": [ "import os\n", "import cloudinary\n", "import cloudinary.uploader\n", "import requests\n", "from io import BytesIO\n", "from dotenv import load_dotenv\n", "\n", "load_dotenv()\n", "\n", "# Explicitly configure using your 3 credentials\n", "cloudinary.config( \n", " cloud_name = os.getenv('CLOUDINARY_CLOUD_NAME'), \n", " api_key = os.getenv('CLOUDINARY_API_KEY'), \n", " api_secret = os.getenv('CLOUDINARY_API_SECRET'),\n", " secure = True\n", ")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "c11377c5", "metadata": {}, "outputs": [], "source": [ "import os\n", "import cloudinary\n", "import cloudinary.uploader\n", "from dotenv import load_dotenv\n", "\n", "# 1. Load credentials from your .env file\n", "load_dotenv()\n", "\n", "cloudinary.config( \n", " cloud_name = os.getenv('CLOUDINARY_CLOUD_NAME'), \n", " api_key = os.getenv('CLOUDINARY_API_KEY'), \n", " api_secret = os.getenv('CLOUDINARY_API_SECRET'),\n", " secure = True\n", ")\n", "\n", "# 2. Set your variables\n", "resume_path = r\"c:\\Users\\ATHARVA\\Downloads\\my codes\\python\\machine_learning\\Learning_Files\\ChirayuResume.pdf\"\n", "thread_id = \"trial_thread_001\"\n", "file_name = \"ChirayuResume\"\n", "\n", "# 3. Perform the upload\n", "try:\n", " response = cloudinary.uploader.upload(\n", " resume_path,\n", " folder = f\"threads/{thread_id}\",\n", " public_id = file_name,\n", " resource_type = \"image\" # Use \"image\" for PDFs to get previews in UI\n", " )\n", "\n", " # 4. 
Create the URL from the response\n", " pdf_url = response.get(\"secure_url\")\n", " \n", " print(f\"āœ… Upload Successful!\")\n", " print(f\"šŸ“‚ Folder: threads/{thread_id}\")\n", " print(f\"šŸ”— URL to push: {pdf_url}\")\n", "\n", "except Exception as e:\n", " print(f\"āŒ Upload failed: {e}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "f986ff8f", "metadata": {}, "outputs": [], "source": [ "import requests\n", "from io import BytesIO\n", "\n", "def get_pdf_for_ai(url):\n", " # 1. Reach out to the URL\n", " response = requests.get(url)\n", " \n", " if response.status_code == 200:\n", " # 2. Convert the web response into a \"file-like\" object\n", " pdf_stream = BytesIO(response.content)\n", " print(\"āœ… PDF loaded into memory for processing!\")\n", " return pdf_stream\n", " else:\n", " print(f\"āŒ Failed to fetch PDF. Status: {response.status_code}\")\n", " return None\n", "\n", "# --- USE YOUR ACTUAL URL ---\n", "resume_url = \"https://res.cloudinary.com/dvxnazx8e/image/upload/v1774166452/threads/trial_thread_001/ChirayuResume.pdf\"\n", "pdf_data = get_pdf_for_ai(resume_url)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "938186bf", "metadata": {}, "outputs": [], "source": [ "import cloudinary\n", "from cloudinary import Search\n", "\n", "\n", "def get_resume_url(thread_id: str) -> str:\n", " \"\"\"\n", " Searches Cloudinary for the resume PDF in the thread's folder\n", " and returns the secure URL.\n", " \"\"\"\n", " result = Search() \\\n", " .expression(f'folder:\"threads/{thread_id}/*\"') \\\n", " .sort_by('public_id', 'desc') \\\n", " .max_results(1) \\\n", " .execute()\n", "\n", " resources = result.get(\"resources\", [])\n", "\n", " if not resources:\n", " raise FileNotFoundError(f\"No resume found for thread_id: {thread_id}\")\n", "\n", " pdf_url = resources[0][\"secure_url\"]\n", " print(f\"Found resume: {pdf_url}\")\n", " return pdf_url" ] }, { "cell_type": "code", "execution_count": null, "id": "f4340cbb", 
"metadata": {}, "outputs": [], "source": [ "import requests\n", "import tempfile\n", "import os\n", "from langchain_community.document_loaders import PyMuPDFLoader\n", "\n", "thread_id = \"trial_thread_001\"\n", "\n", "try:\n", " # Step 1 — Get URL from Cloudinary\n", " url = get_resume_url(thread_id)\n", " print(f\"URL: {url}\")\n", "\n", " # Step 2 — Fetch PDF bytes\n", " response = requests.get(url)\n", " response.raise_for_status()\n", "\n", " # Step 3 — Write to temp file\n", " with tempfile.NamedTemporaryFile(delete=False, suffix=\".pdf\") as tmp:\n", " tmp.write(response.content)\n", " tmp_path = tmp.name\n", "\n", " # Step 4 — Load with PyMuPDF\n", " loader = PyMuPDFLoader(tmp_path)\n", " docs = loader.load()\n", " resume_text = \"\\n\".join([doc.page_content for doc in docs])\n", "\n", " # Step 5 — Cleanup\n", " os.remove(tmp_path)\n", "\n", " print(f\"Pages loaded: {len(docs)}\")\n", " print(f\"Preview:\\n{resume_text[:500]}\")\n", "\n", "except FileNotFoundError as e:\n", " print(f\"Not found: {e}\")\n", "except Exception as e:\n", " print(f\"Error: {e}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "b010e49b", "metadata": {}, "outputs": [], "source": [ "from langchain_community.document_loaders import PyMuPDFLoader" ] }, { "cell_type": "code", "execution_count": null, "id": "7164448e", "metadata": {}, "outputs": [], "source": [ "resumepath=r\"c:\\Users\\ATHARVA\\Downloads\\my codes\\python\\machine_learning\\Learning_Files\\ChirayuResume.pdf\"" ] }, { "cell_type": "code", "execution_count": null, "id": "6d1029c0", "metadata": {}, "outputs": [], "source": [ "import cloudinary.uploader\n", "import os\n", "\n", "# Your resume path (using 'r' for raw string to handle backslashes correctly)\n", "resume_path = r\"c:\\Users\\ATHARVA\\Downloads\\my codes\\python\\machine_learning\\Learning_Files\\ChirayuResume.pdf\"\n", "\n", "# Extract filename without extension for the public_id\n", "file_name = os.path.basename(resume_path).split('.')[0] \n", 
"thread_id = \"trial_thread_001\"\n", "\n", "# Upload directly using the file path\n", "upload_result = cloudinary.uploader.upload(\n", " resume_path, \n", " folder=f\"threads/{thread_id}\",\n", " public_id=file_name,\n", " resource_type=\"auto\" # Handles the PDF correctly\n", ")\n", "\n", "print(f\"Upload Successful! URL: {upload_result['secure_url']}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "928b7237", "metadata": {}, "outputs": [], "source": [ "import os\n", "import requests\n", "import cloudinary\n", "from io import BytesIO\n", "\n", "# --- 1. CONFIGURATION (Do this once) ---\n", "# This tells the library your API Key/Secret. \n", "# In a real app, put this in your .env file!\n", "os.environ[\"CLOUDINARY_URL\"] = \"cloudinary://866996699612973:9Tp3hGjI9npawSIrN4Mu4hFRwLQ@dtscmobmv\"\n", "\n", "def get_pdf_content(file_url):\n", " \"\"\"\n", " This function expects an HTTPS url, NOT the cloudinary:// credentials.\n", " \"\"\"\n", " # Ensure the URL is a real web link\n", " if not file_url.startswith(\"http\"):\n", " raise ValueError(\"The URL must start with http or https!\")\n", "\n", " response = requests.get(file_url)\n", " if response.status_code == 200:\n", " return BytesIO(response.content)\n", " else:\n", " print(f\"Error: Could not download file. Status: {response.status_code}\")\n", " return None\n", "\n", "# --- 2. 
TESTING ---\n", "# This is what the MERN devs will send you:\n", "test_resume_url = \"https://res.cloudinary.com\"\n", "\n", "# This will now work!\n", "pdf_file = get_pdf_content(test_resume_url)\n" ] }, { "cell_type": "code", "execution_count": 1, "id": "b75a238a", "metadata": {}, "outputs": [], "source": [ "from typing import Any, Dict, List, Optional, Tuple,TypedDict,Literal\n", "from typing import Annotated, Sequence\n", "import os\n", "from pydantic import BaseModel, Field\n", "from langchain_groq import ChatGroq\n", "from langchain_core.messages import SystemMessage, HumanMessage,ToolMessage,AIMessage\n", "from langchain_core.tools import Tool\n", "from langgraph.graph import StateGraph,END,START\n", "from langgraph.types import interrupt \n", "from langchain_core.prompts import ChatPromptTemplate,MessagesPlaceholder\n", "from langchain_community.document_loaders import PyMuPDFLoader\n", "import json\n", "from pydantic import BaseModel, Field\n", "from typing import List, Optional\n", "from pprint import pprint\n", "import os\n", "from dotenv import load_dotenv\n", "import json\n", "from langchain_core.documents import Document\n", "from langchain_huggingface import HuggingFaceEmbeddings\n", "import os\n", "from pinecone import Pinecone, ServerlessSpec\n", "from pinecone_text.sparse import BM25Encoder\n", "from langchain_community.embeddings import HuggingFaceEmbeddings\n", "from langchain_community.retrievers import PineconeHybridSearchRetriever\n", "import json\n", "from langchain_core.documents import Document\n", "from langchain_core.messages import BaseMessage\n", "from langgraph.graph import add_messages\n", "from langgraph.prebuilt import ToolNode ,tools_condition\n", "import torch\n", "from langgraph.checkpoint.memory import MemorySaver\n" ] }, { "cell_type": "markdown", "id": "c7058b37", "metadata": {}, "source": [ "Pydantic model of resume data extraction" ] }, { "cell_type": "markdown", "id": "69094b87", "metadata": {}, "source": [ "**Defining the 
pydantic models to be used**" ] }, { "cell_type": "code", "execution_count": 2, "id": "7da5b1c6", "metadata": {}, "outputs": [], "source": [ "\n", "\n", "\n", "class Skill(BaseModel):\n", " name: str = Field(..., description=\"Skill name e.g. Python, Docker\")\n", " category: Optional[str] = Field(\n", " None, description=\"Category: Backend | ML | DevOps | Frontend | Other\"\n", " )\n", "\n", "\n", "class ExperienceItem(BaseModel):\n", " job_title: str = Field(\n", " ...,\n", " description=\"Role title of the candidate. Example: 'Backend Intern', 'Software Engineer'\"\n", " )\n", "\n", " experience_type: Optional[Literal['internship', 'full_time', 'contract', 'freelance']] = Field(\n", " None,\n", " description=\"Type of experience: internship, full_time, contract, or freelance\"\n", " )\n", "\n", "\n", "\n", " technologies: Optional[List[str]] = Field(\n", " default_factory=list,\n", " description=\"Technologies, tools, or frameworks used in this role\"\n", " )\n", "\n", " responsibilities: Optional[List[str]] = Field(\n", " default_factory=list,\n", " description=\"Key responsibilities, tasks, or learnings in concise bullet points keep it summarised detail *not* required\"\n", " )\n", "\n", "class ProjectItem(BaseModel):\n", " name: str = Field(..., description=\"Project name\")\n", " technologies: List[str] = Field(\n", " default_factory=list,\n", " description=\"Technologies used in this project hence learned during the project.\"\n", " )\n", " \n", "\n", "\n", "class CertificationItem(BaseModel):\n", " name: str = Field(..., description=\"Certification name\")\n", " \n", " topics_covered: List[str] = Field(\n", " default_factory=list,\n", " description=\"Key topics or skills the certification covers\"\n", " )\n", "\n", "\n", "\n", "class ResumeExtract(BaseModel):\n", "\n", "\n", " candidate_name:Optional[str]\n", "\n", " \n", " job_title: Optional[str] = Field(\n", " None,\n", " description=(\n", " \"Primary job title or role of the candidate. 
\"\n", " \"Examples: 'AI Engineer', 'Data Scientist', \"\n", " \"'Construction Project Manager', 'Healthcare Representative'. \"\n", " \"Should reflect the most recent or current role.\"\n", " )\n", " )\n", "\n", " \n", "\n", " \n", " skills: List[Skill] = Field(\n", " default_factory=list,\n", " description=\"Skills explicitly listed by the candidate\"\n", " )\n", " experience: List[ExperienceItem] = Field(\n", " default_factory=list,\n", " description=(\n", " \"Each role as a separate entry. \"\n", " \"No company name needed — focus on what was done and learned.\"\n", " )\n", " )\n", " projects: List[ProjectItem] = Field(\n", " default_factory=list,\n", " description=\"Projects with technologies used and what was built\"\n", " )\n", " certifications: Optional[List[CertificationItem]] = Field(\n", " None,\n", " description=\"Certifications with topics they cover. None if not present.\"\n", " )\n", " \n", "\n", "\n", " is_fresher: bool = Field(\n", " ...,\n", " description=(\n", " \"Set to True if the candidate lacks full-time professional employment. 
\"\n", " \"Academic projects, certifications, and internships are considered \"\n", " \"part of the learning phase and do not qualify a candidate as 'non-fresher' hence is_.\"\n", " )\n", ")\n" ] }, { "cell_type": "markdown", "id": "99ac1086", "metadata": {}, "source": [ " \"skills\": {\"__all__\": {\"category\"}}, # Drops 'category' from every skill\n", " \"experience\": {\"__all__\": {\"responsibilities\"}}, # Drops bullet points\n", " \"projects\": {\"__all__\": {\"what_was_built\"}}, # Drops project descriptions\n", " \"certifications\": {\"__all__\": {\"issuer\"}} # Drops the issuer" ] }, { "cell_type": "markdown", "id": "5b0756e0", "metadata": {}, "source": [ "Pydantic model for job description" ] }, { "cell_type": "code", "execution_count": 3, "id": "4b2441cd", "metadata": {}, "outputs": [], "source": [ "from pydantic import BaseModel, Field\n", "from typing import List, Optional\n", "\n", "\n", "class SkillRequirement(BaseModel):\n", " name: str = Field(\n", " ...,\n", " description=\"Skill or technology required for the job (e.g., Python, SQL, React)\"\n", " )\n", " level: Optional[str] = Field(\n", " None,\n", " description=\"Expected proficiency level: beginner | intermediate | strong\"\n", " )\n", "\n", "\n", "class ResponsibilityItem(BaseModel):\n", " description: str = Field(\n", " ...,\n", " description=\"Key responsibility or task expected from the candidate\"\n", " )\n", "\n", "\n", "class RequirementItem(BaseModel):\n", " description: str = Field(\n", " ...,\n", " description=\"Qualification or requirement such as education, availability, etc.\"\n", " )\n", "\n", "\n", "class ConstraintItem(BaseModel):\n", " type: str = Field(\n", " ...,\n", " description=\"Constraint type such as location, duration, eligibility\"\n", " )\n", " value: str = Field(\n", " ...,\n", " description=\"Constraint value (e.g., 'Pune only', '6 months', 'Fresher')\"\n", " )\n", "\n", "\n", "\n", "class JobDescriptionExtract(BaseModel):\n", " job_title: Optional[str] = 
Field(\n", " None,\n", " description=\"Job role/title (e.g., AI/ML Intern, Web Developer)\"\n", " )\n", "\n", " company_name: Optional[str] = Field(\n", " None,\n", " description=\"Company offering the job\"\n", " )\n", "\n", " location: Optional[str] = Field(\n", " None,\n", " description=\"Job location if specified\"\n", " )\n", "\n", " employment_type: Optional[str] = Field(\n", " None,\n", " description=\"Type of job: internship, full-time, contract\"\n", " )\n", "\n", " duration_months: Optional[int] = Field(\n", " None,\n", " description=\"Duration of role in months (for internships/contracts)\"\n", " )\n", "\n", " is_fresher_allowed: Optional[bool] = Field(\n", " None,\n", " description=\"Whether freshers are eligible for this role\"\n", " )\n", "\n", " skills_required: Optional[List[SkillRequirement]] = Field(\n", " None,\n", " description=\"List of required skills and expected levels\"\n", " )\n", "\n", " tools_technologies: Optional[List[str]] = Field(\n", " None,\n", " description=\"Specific tools/frameworks mentioned (e.g., Pandas, WordPress)\"\n", " )\n", "\n", " responsibilities: Optional[List[ResponsibilityItem]] = Field(\n", " None,\n", " description=\"Key job responsibilities\"\n", " )\n", "\n", " requirements: Optional[List[RequirementItem]] = Field(\n", " None,\n", " description=\"General requirements like availability, qualifications\"\n", " )\n", "\n", " constraints: Optional[List[ConstraintItem]] = Field(\n", " None,\n", " description=\"Special constraints like location restriction, duration, etc.\"\n", " )" ] }, { "cell_type": "markdown", "id": "4b12a3bc", "metadata": {}, "source": [ "**Pydantic model for skill gap analysis**" ] }, { "cell_type": "code", "execution_count": 4, "id": "4f1341e0", "metadata": {}, "outputs": [], "source": [ "class SkillGap(BaseModel):\n", " skill_name: str = Field(\n", " ..., \n", " description=\"The specific technology or tool missing or requiring an upgrade (e.g., 'PostgreSQL')\"\n", " )\n", " \n", " gap_type: 
Literal[\"missing_foundation\", \"needs_advanced_upgrade\"] = Field(\n", " ...,\n", " description=(\n", " \"missing_foundation: Candidate has no recorded experience in this core requirement. \"\n", " \"needs_advanced_upgrade: Candidate knows the basics but needs role-specific advanced training.\"\n", " )\n", " )\n", " \n", " priority: Literal[\"high\", \"medium\", \"low\"] = Field(\n", " ...,\n", " description=\"How critical this skill is for the target job role.\"\n", " )\n", " \n", " reasoning: str = Field(\n", " ...,\n", " description=(\n", " \"The 'Reasoning Trace'. This MUST be provided for every skill gap identified. \"\n", " \"Explain exactly WHY this gap was flagged based on the resume vs JD comparison. \"\n", " \"Example: 'JD requires FastAPI; candidate has Python experience but no record of using FastAPI framework.'\"\n", " )\n", " )\n", " \n", " target_competency: str = Field(\n", " ...,\n", " description=\"The specific outcome the candidate needs to reach (e.g., 'Build asynchronous database endpoints')\"\n", " )\n", "\n", "class SkillGapAnalysis(BaseModel):\n", " job_title: str = Field(..., description=\"The target role from the JD\")\n", " candidate_name: Optional[str] = Field(None, description=\"Extracted name from resume\")\n", " \n", " analyzed_gaps: List[SkillGap] = Field(\n", " default_factory=list,\n", " description=\"List of specific technical gaps found between Resume and JD\"\n", " )\n", " \n", " is_fresher_adaptation_needed: bool = Field(\n", " default=False,\n", " description=\"True if foundational corporate/soft-skill modules should be added to the path.\"\n", " )\n", " \n", " executive_summary: str = Field(\n", " ...,\n", " description=\"A 2-3 sentence overview of the candidate's readiness and the primary focus of the onboarding.\"\n", " )" ] }, { "cell_type": "code", "execution_count": 5, "id": "18663bb3", "metadata": {}, "outputs": [], "source": [ "class RoadmapStep(BaseModel):\n", " course_id: str\n", " title: str\n", " reasoning: str = 
Field(..., description=\"Why this specific course was chosen for this user\")\n", " is_foundation: bool\n", " sequence_order: int = Field(..., description=\"The order in which the course should be taken\")\n", "\n", "class LearningRoadmap(BaseModel):\n", " candidate_name: str\n", " target_role: str\n", " roadmap: List[RoadmapStep]\n", " onboarding_summary: str" ] }, { "cell_type": "markdown", "id": "604e9728", "metadata": {}, "source": [ "**Defining the agents to be used**" ] }, { "cell_type": "markdown", "id": "9036d57e", "metadata": {}, "source": [ "Resume data extraction agent" ] }, { "cell_type": "code", "execution_count": 6, "id": "14dab004", "metadata": {}, "outputs": [], "source": [ "resume_agent=ChatGroq(\n", " model=\"moonshotai/kimi-k2-instruct-0905\",\n", " temperature=0.2,\n", ")\n", "\n", "\n", "resume_agent=resume_agent.with_structured_output(\n", "\n", " schema=ResumeExtract,\n", " method=\"json_schema\",\n", " include_raw=True,\n", " strict=True\n", ")\n" ] }, { "cell_type": "markdown", "id": "7683eb69", "metadata": {}, "source": [ "Job description data extraction agent" ] }, { "cell_type": "code", "execution_count": 7, "id": "472dae2f", "metadata": {}, "outputs": [], "source": [ "jd_agent=ChatGroq(\n", " model=\"meta-llama/llama-4-scout-17b-16e-instruct\",\n", " temperature=0.2,\n", ")\n", "\n", "\n", "jd_agent=jd_agent.with_structured_output(\n", "\n", " schema=JobDescriptionExtract,\n", " method=\"json_schema\",\n", " include_raw=True,\n", " strict=True\n", ")\n" ] }, { "cell_type": "markdown", "id": "d14736d5", "metadata": {}, "source": [ "defining the gap analysis agent" ] }, { "cell_type": "code", "execution_count": 8, "id": "0d5e3b17", "metadata": {}, "outputs": [], "source": [ "gap_analysis_agent=ChatGroq(\n", " model=\"openai/gpt-oss-120b\",\n", " temperature=0.2,\n", ")\n", "\n", "\n", "gap_analysis_agent=gap_analysis_agent.with_structured_output(\n", " schema=SkillGapAnalysis,\n", " method=\"json_schema\",\n", " include_raw=True,\n", " 
strict=True\n", ")\n", "\n", "\n", "\n" ] }, { "cell_type": "markdown", "id": "28bc58ad", "metadata": {}, "source": [ "defining the roadmap planner agent" ] }, { "cell_type": "code", "execution_count": 9, "id": "0ccc026b", "metadata": {}, "outputs": [], "source": [ "roadmap_planner_agent=ChatGroq(\n", " model=\"moonshotai/kimi-k2-instruct-0905\",\n", " temperature=0.2,\n", ")" ] }, { "cell_type": "markdown", "id": "2bd41131", "metadata": {}, "source": [ "**Tools**" ] }, { "cell_type": "code", "execution_count": 10, "id": "c8827093", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index ready: {'_response_info': {'raw_headers': {'connection': 'keep-alive',\n", " 'content-length': '187',\n", " 'content-type': 'application/json',\n", " 'date': 'Mon, 23 Mar 2026 20:11:40 GMT',\n", " 'grpc-status': '0',\n", " 'server': 'envoy',\n", " 'x-envoy-upstream-service-time': '62',\n", " 'x-pinecone-request-latency-ms': '61',\n", " 'x-pinecone-response-duration-ms': '64'}},\n", " 'dimension': 384,\n", " 'index_fullness': 0.0,\n", " 'memoryFullness': 0.0,\n", " 'metric': 'dotproduct',\n", " 'namespaces': {'__default__': {'vector_count': 47}},\n", " 'storageFullness': 0.0,\n", " 'total_vector_count': 47,\n", " 'vector_type': 'dense'}\n" ] } ], "source": [ "\n", "\n", "PINECONE_API_KEY = os.getenv(\"PINECONE_API_KEY\")\n", "pc = Pinecone(api_key=PINECONE_API_KEY)\n", "\n", "index_name = \"final-catalog-index\"\n", "\n", "\n", "# Create index if not exists\n", "if index_name not in pc.list_indexes().names():\n", " pc.create_index(\n", " name=index_name,\n", " dimension=384,\n", " metric=\"dotproduct\",\n", " spec=ServerlessSpec(\n", " cloud=\"aws\",\n", " region=\"us-east-1\"\n", " )\n", " )\n", " print(\"Index created.\")\n", "\n", "index = pc.Index(index_name)\n", "print(\"Index ready:\", index.describe_index_stats())\n" ] }, { "cell_type": "markdown", "id": "44180d94", "metadata": {}, "source": [ "Opening the docs for BM25 retriver" ] }, { 
"cell_type": "code", "execution_count": 12, "id": "7561b3a1", "metadata": {}, "outputs": [], "source": [ "import json\n", "from langchain_core.documents import Document\n", "\n", "\n", "doc_path=r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\app\\utils\\langchain_formatted.json\"\n", "\n", "\n", "documents = []\n", "\n", "# Load the transformed catalog\n", "with open(doc_path, \"r\") as f:\n", " data = json.load(f)\n", " for doc in data:\n", " # Create a LangChain Document object for each entry\n", " documents.append(\n", " Document(\n", " page_content=doc[\"page_content\"], \n", " metadata=doc[\"metadata\"]\n", " )\n", " )" ] }, { "cell_type": "code", "execution_count": 13, "id": "f0845a99", "metadata": {}, "outputs": [], "source": [ "device=torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n" ] }, { "cell_type": "code", "execution_count": 14, "id": "c8e6d2a5", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\ATHARVA\\AppData\\Local\\Temp\\ipykernel_30068\\2526755923.py:1: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the `langchain-huggingface package and should be used instead. To use it run `pip install -U `langchain-huggingface` and import as `from `langchain_huggingface import HuggingFaceEmbeddings``.\n", " embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\", model_kwargs={\"device\": device})\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "3af10b64d4584d53952822157482186f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading weights: 0%| | 0/103 [00:00log for further details." 
] } ], "source": [ "bm25_encoder = BM25Encoder()\n", "\n", "bm25_encoder.fit([doc.page_content for doc in documents])\n", "\n", "retriever = PineconeHybridSearchRetriever(\n", "    embeddings=embeddings,\n", "    sparse_encoder=bm25_encoder,\n", "    index=index,\n", "    alpha=0.5\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "03c755a1", "metadata": {}, "outputs": [ { "ename": "", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31mnotebook controller is DISPOSED. \n", "\u001b[1;31mView Jupyter log for further details." ] } ], "source": [ "from langchain_core.tools import tool\n", "from typing import Optional\n", "\n", "@tool\n", "def search_courses(query: str):\n", "    \"\"\"\n", "    Search the course catalog for relevant modules based on a skill query \n", "    \n", "    \n", "    Args:\n", "    \n", "        query:the skill to find with semantic terms (e.g., 'FastAPI', 'PostgreSQL', 'Docker','Enterprise VMS Strategy','Utilization Management').\n", "    \n", "    \"\"\"\n", "    \n", "    results = retriever.invoke(\n", "        query\n", "    )\n", "\n", "    if not results:\n", "        return f\"No courses found for '{query}'.\"\n", "\n", "    formatted_output = []\n", "    for doc in results:\n", "        course_id = doc.metadata.get('course_id', 'N/A')\n", "        \n", "        # We include the ID for roadmap generation, followed by the full context\n", "        # created during the transformation stage (Title, Desc, Outcomes, Prereqs).\n", "        course_block = (\n", "            f\"ID: {course_id}\\n\"\n", "            f\"{doc.page_content}\\n\"\n", "            \"---\"\n", "        )\n", "        formatted_output.append(course_block)\n", "\n", "    return \"\\n\".join(formatted_output)" ] }, { "cell_type": "code", "execution_count": 16, "id": "9db28710", "metadata": {}, "outputs": [], "source": [ "import json\n", "from typing import Optional, Dict, Any\n", "from langchain_core.tools import tool\n", "\n", "class CourseLookup:\n", "    def __init__(self, catalog_path: str = \"course_catalog.json\"):\n", "        self.catalog_path = catalog_path\n", "        self.courses_map = {}\n", "        
self._load_catalog()\n", "\n", "    def _load_catalog(self):\n", "        \"\"\"Load the catalog file into ``self.courses_map``, keyed by course_id.\n", "\n", "        Every handled failure is reported with print() and leaves\n", "        ``courses_map`` as the empty dict set in ``__init__``, so lookups\n", "        return None instead of raising.\n", "        \"\"\"\n", "        try:\n", "            # Explicit UTF-8: open() otherwise uses the platform's locale\n", "            # encoding, which is not guaranteed to match the catalog file.\n", "            with open(self.catalog_path, 'r', encoding='utf-8') as f:\n", "                catalog = json.load(f)\n", "            # Key the dictionary by course_id for instant retrieval\n", "            self.courses_map = {course['course_id']: course for course in catalog}\n", "        except FileNotFoundError:\n", "            print(f\"Error: {self.catalog_path} not found.\")\n", "        except json.JSONDecodeError:\n", "            print(f\"Error: Failed to decode {self.catalog_path}.\")\n", "        except (KeyError, TypeError) as exc:\n", "            # Same best-effort policy as above for entries that are not\n", "            # dicts or have no 'course_id' key.\n", "            print(f\"Error: {self.catalog_path} has an entry without a 'course_id' ({exc!r}).\")\n", "\n", "    def get_course_details(self, course_id: str) -> Optional[Dict[str, Any]]:\n", "        \"\"\"Retrieves full details of a course by its ID.\"\"\"\n", "        return self.courses_map.get(course_id)\n", "\n", "\n", "lookup_service = CourseLookup(r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\app\\tools\\Catalog.json\")\n", "\n", "@tool\n", "def get_course_by_id(course_id: str) -> str:\n", "    \"\"\"\n", "    Retrieves full details for a specific course using its unique course_id.\n", "    Use this tool when you find a prerequisite ID in another course and \n", "    need to fetch its title, description, and duration to add to the roadmap.\n", "    \"\"\"\n", "    details = lookup_service.get_course_details(course_id)\n", "    if not details:\n", "        return f\"Error: Course with ID {course_id} not found in catalog.\"\n", "    \n", "    # Return a clean string for the agent to process\n", "    return json.dumps(details, indent=2)" ] }, { "cell_type": "code", "execution_count": 18, "id": "09d238ab", "metadata": {}, "outputs": [], "source": [ "\n", "@tool(args_schema=LearningRoadmap)\n", "def submit_final_roadmap(candidate_name, target_role, roadmap, onboarding_summary):\n", " \"\"\"\n", " STRICTLY call this tool to submit the final structured learning roadmap.\n", " This saves the data to the global system and the graph state.\n", " \"\"\"\n", " \n", " \n", " # Construct the structured JSON\n", " result = {\n", " \"candidate_name\": 
candidate_name,\n", " \"target_role\": target_role,\n", " \"onboarding_summary\": onboarding_summary,\n", " \"roadmap\": [\n", " step.model_dump() if hasattr(step, \"model_dump\") else step \n", " for step in roadmap\n", " ]\n", " }\n", " \n", " \n", " \n", " # Return to LangGraph (will be stored in state via a post-processing node)\n", " return result" ] }, { "cell_type": "code", "execution_count": null, "id": "6ad04bc6", "metadata": {}, "outputs": [], "source": [ "\n", "\n", "# Module-level slot written by submit_mermaid_visualization; stays None until\n", "# the agent submits a diagram. Re-running this cell resets it to None.\n", "mermaid_roadmap_code = None\n", "\n", "\n", "@tool\n", "def submit_mermaid_visualization(mermaid_code: str):\n", "    \"\"\"\n", "    STRICTLY call this tool to save the Mermaid.js visualization of the roadmap.\n", "    \"\"\"\n", "    # 1. Declare the module-level name. Without this, the assignment below\n", "    #    only binds a function-local variable that is thrown away on return.\n", "    global mermaid_roadmap_code\n", "\n", "    # 2. Now this assignment updates the global variable\n", "    mermaid_roadmap_code = mermaid_code\n", "\n", "    return \"Mermaid visualization saved successfully.\"" ] }, { "cell_type": "code", "execution_count": null, "id": "285f74bb", "metadata": {}, "outputs": [], "source": [ "roadmap_planner_agent_tools=[search_courses, get_course_by_id,submit_final_roadmap,submit_mermaid_visualization]" ] }, { "cell_type": "code", "execution_count": null, "id": "47564782", "metadata": {}, "outputs": [], "source": [ "roadmap_planner_agent=roadmap_planner_agent.bind_tools(roadmap_planner_agent_tools)" ] }, { "cell_type": "code", "execution_count": null, "id": "c311f642", "metadata": {}, "outputs": [], "source": [ "# Replace 'roadmap_planner_agent' with your bound model variable\n", "print(roadmap_planner_agent.kwargs.get(\"tools\"))\n" ] }, { "cell_type": "markdown", "id": "2da3f43b", "metadata": {}, "source": [ "**Trail resume path**" ] }, { "cell_type": "code", "execution_count": null, "id": "7cfbfc3a", "metadata": {}, "outputs": [], "source": [ "resumepath=r\"c:\\Users\\ATHARVA\\Downloads\\my codes\\python\\machine_learning\\Learning_Files\\ChirayuResume.pdf\"" ] }, { "cell_type": "markdown", "id": "14f4946c", "metadata": {}, "source": [ "**Langgraph agent state**" ] }, { "cell_type": "code", "execution_count": null, 
"id": "5deda2bb", "metadata": {}, "outputs": [], "source": [ "class OnboardingState(TypedDict):\n", " candidate_name: Optional[str]\n", " resume_text: str \n", " file_path: str \n", " job_description: str \n", " messages: Annotated[Sequence[BaseMessage], add_messages]\n", " \n", " # Analysis & Extraction Data\n", " skill_gap_analysis_data: Optional[SkillGapAnalysis]\n", " resume_data: Optional[ResumeExtract] \n", " extraction_error: Optional[str] \n", " JobDescriptionExtract_data: Optional[JobDescriptionExtract]\n", " \n", " # --- NEW KEYS FOR OUTPUT ---\n", " mermaid_code: Optional[str] # Stores the Mermaid visualization string\n", " final_roadmap: Optional[Dict] # Stores the final structured JSON roadmap" ] }, { "cell_type": "markdown", "id": "e54bac6a", "metadata": {}, "source": [ "**Prompts**" ] }, { "cell_type": "code", "execution_count": null, "id": "c8df9934", "metadata": {}, "outputs": [], "source": [ "from langchain_groq import ChatGroq\n", "from langchain_core.prompts import ChatPromptTemplate\n", "\n", "\n", "resume_agent_prompt = \"\"\"\n", "\n", "You are a precise resume parser. Your only job is to extract structured information from a raw resume text.\n", "\n", "\n", "\n", "- Extract ONLY what is explicitly present in the resume. Do NOT infer or hallucinate missing fields.\n", "- current_role: the job title stated at the top of the resume or most recent role. If the candidate is a student with no job, set it to \"Student\".\n", "- is_fresher: set True ONLY if the candidate has zero professional work experience. Having projects or certifications does NOT make someone non-fresher.\n", "- total_experience_years: total years of professional work only. Set 0.0 for freshers.\n", "- skills: extract from the explicit skills section only. Do NOT pull skills from project descriptions here.\n", "- experience: each role is a SEPARATE entry. Ignore company name. 
Focus on job_title, technologies used, and what they did or learned.\n", "- projects: extract each project separately. Capture technologies and one line on what was built.\n", "- certifications: extract ONLY if present. Set null if none found. Include topics the certification covers.\n", "- achievements: extract ONLY if present. Set null if none found. Include the domain (e.g. Hackathon, Quiz, Competitive Programming).\n", "\n", "\n", "\n", "\n", "Return a single valid JSON object matching the schema. No extra text, no markdown, no explanation.\n", "\n", "\n", "\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": null, "id": "608efafd", "metadata": {}, "outputs": [], "source": [ "jd_agent_prompt =\"\"\" \n", "\n", "You are a precise job description parser.\n", "Extract structured information from the given job description.\n", "\n", "\n", "\n", "- Extract ONLY explicitly mentioned information. Do NOT infer or hallucinate.\n", "\n", "- Follow the provided schema strictly.\n", "\n", "- If a field is not present, return null (not empty list unless schema default applies).\n", "\n", "- Keep skills atomic (e.g., Python, SQL, React).\n", "\n", "- Do NOT mix fields:\n", " - skills = only required skills\n", " - responsibilities = what the candidate will do\n", " - constraints = restrictions like location, duration, eligibility\n", "\n", "- Convert durations like \"6 months\" into integer months.\n", "\n", "- is_fresher_allowed:\n", " - True only if explicitly allowed\n", " - False only if explicitly restricted\n", " \n", "\n", "\n", "\n", "Return a valid JSON object only.\n", " \"\"\"\n" ] }, { "cell_type": "code", "execution_count": null, "id": "8a6c1483", "metadata": {}, "outputs": [], "source": [ "gap_analysis_agent_prompt=\"\"\"\n", "\n", "You are an expert technical assessor and the core intelligence of an AI-driven, adaptive onboarding engine[cite: 5]. 
\n", "Your objective is to parse a new hire's current capabilities against a target job description and identify precise skill gaps to reach role-specific competency[cite: 5].\n", "\n", "\n", "\n", "Current corporate onboarding utilizes static, \"one-size-fits-all\" curricula, resulting in significant inefficiencies[cite: 3]. \n", "Your ultimate goal is to solve this: you must ensure experienced hires do NOT waste time on known concepts, while ensuring beginners are NOT overwhelmed by advanced modules[cite: 3, 4].\n", "\n", "\n", "\n", "- Cross-reference the JD's `skills_required` and `tools_technologies` against the candidate's `skills_list`, `experience.technologies`, and `projects.technologies`.\n", "- Identify Hard Gaps: Technologies explicitly required by the JD that are completely absent from the candidate's profile.\n", "- Apply Adaptive Logic (Proficiency Gaps):\n", " - For Experienced Hires: If they possess the skill, DO NOT flag it for basic training. Only flag a gap if they need an advanced, role-specific upgrade based on low duration of use.\n", " - For Beginners/Freshers: Flag foundational gaps and prerequisites heavily to ensure they are prepared before tackling complex JD requirements.\n", "- Keep skills atomic and highly specific (e.g., output \"FastAPI\" or \"PostgreSQL\", do NOT output vague terms like \"Backend Frameworks\").\n", "- Do NOT hallucinate requirements that are not explicitly stated in the JD data.\n", "- Do NOT attempt to build the curriculum or suggest courses yet. Your sole focus is diagnosing the gaps.\n", "- Provide a concise `reasoning` string for each identified gap. 
This reasoning MUST justify why the gap exists based on the user's experience level to prove the adaptive logic.\n", "\n", "\n", "Return a valid JSON object only.\n", "\n", "\n", "\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": null, "id": "059e5f86", "metadata": {}, "outputs": [], "source": [ "roadmap_planner_agent_prompt = \"\"\"\n", "\n", "You are an expert technical onboarding architect.\n", "Transform a Skill Gap Analysis into a minimal, logically sequenced learning roadmap.\n", "\n", "\n", "\n", "STEP 1 — SEARCH\n", "For every gap → call search_courses.\n", "Use ONLY course IDs returned by the tool. Never guess IDs.\n", "\n", "STEP 2 — RESOLVE PREREQUISITES\n", "For each retrieved course inspect its prerequisites list.\n", "If candidate's resume does NOT prove mastery → call get_course_by_id for each missing prerequisite.\n", "Skip courses the candidate already demonstrates via projects or experience.\n", "\n", "STEP 3 — SEQUENCE\n", "Prerequisites always before target modules.\n", "sequence_order must be 1, 2, 3... strictly.\n", "If is_fresher_adaptation_needed is True → add a professional fundamentals module first.\n", "\n", "STEP 4 — SUBMIT (TERMINAL STEP)\n", "Call submit_final_roadmap ONCE with the complete roadmap.\n", "Call submit_mermaid_visualization ONCE with the Mermaid string.\n", "After both return → STOP. 
def input_node(state: OnboardingState):
    """Load the resume PDF referenced by ``state["file_path"]`` into plain text.

    Returns a partial state update:
      * ``{"extraction_error": ...}`` when no file path was supplied,
      * ``{"resume_text": <text>, "extraction_error": None}`` on success,
      * ``{"resume_text": None, "extraction_error": ...}`` when loading fails.
    """
    pdf_path = state.get("file_path")

    # Diagnostics are printed before the guard so a missing path is visible too.
    exists_label = os.path.exists(pdf_path) if pdf_path else 'NO PATH'
    print(f"šŸ“‚ File path received: {pdf_path}")
    print(f"šŸ“‚ File exists: {exists_label}")

    if not pdf_path:
        return {"extraction_error": "Missing file_path in state"}

    try:
        pages = PyMuPDFLoader(pdf_path).load()
        print(f"šŸ“„ Pages loaded: {len(pages)}")

        # One text blob for the whole document, pages separated by newlines.
        full_text = "\n".join(page.page_content for page in pages)
        print(f"šŸ“„ Text length: {len(full_text)}")

        return {"resume_text": full_text, "extraction_error": None}
    except Exception as load_error:
        # Any loader failure is reported through the state instead of raising.
        reason = str(load_error)
        print(f"āŒ PyMuPDF failed: {reason}")
        return {
            "resume_text": None,
            "extraction_error": f"Failed to load resume: {reason}",
        }
def extractResumeDataNode(state: OnboardingState):
    """Run the resume agent on the loaded resume text and store the parsed result.

    Args:
        state: Graph state. Reads ``resume_text`` and, when that text is
            unusable, the ``extraction_error`` left by an earlier node.

    Returns:
        ``{"resume_data": <parsed object>}`` on success, otherwise
        ``{"resume_data": None, "extraction_error": <reason>}``. A failure of
        the agent call itself is reported through ``extraction_error`` rather
        than raised, matching how the JD extraction node behaves.
    """
    resume_text = state.get("resume_text")

    # Guard 1 — empty text
    if not resume_text or len(resume_text.strip()) < 10:
        print("āŒ RESUME TEXT EMPTY OR TOO SHORT")
        # Keep the real cause (e.g. the PDF failed to load) instead of masking
        # it with the generic message.
        upstream_error = state.get("extraction_error")
        return {
            "resume_data": None,
            "extraction_error": upstream_error or "Resume text is empty",
        }

    print(f"šŸ“„ Resume text length: {len(resume_text)} chars")

    messages = [
        SystemMessage(content=resume_agent_prompt),
        HumanMessage(content=f"{resume_text}")
    ]

    try:
        result = resume_agent.invoke(messages)
    except Exception as e:
        # Network/provider errors should not take the whole graph down.
        print(f"RESUME AGENT INVOKE FAILED: {e}")
        return {
            "resume_data": None,
            "extraction_error": f"Resume agent invocation failed: {e}",
        }

    # The agent may return either a dict with 'parsed'/'raw'/'parsing_error'
    # keys or the parsed object directly, depending on how it was configured.
    if isinstance(result, dict):
        parsing_error = result.get("parsing_error")
        parsed = result.get("parsed")
        raw = result.get("raw")
    else:
        parsing_error, parsed, raw = None, result, None

    # Guard 2 — parsing failed
    if parsing_error:
        print(f"āŒ PARSING ERROR: {parsing_error}")
        return {"resume_data": None, "extraction_error": str(parsing_error)}

    # Guard 3 — parsed is None
    if parsed is None:
        print(f"āŒ PARSED IS NONE. RAW OUTPUT: {raw}")
        return {"resume_data": None, "extraction_error": "LLM returned null schema"}

    # The resume prompt defines `current_role` as the top-level title field;
    # `job_title` is kept as a fallback in case the schema exposes it.
    role_label = (
        getattr(parsed, "current_role", None)
        or getattr(parsed, "job_title", None)
        or "N/A"
    )
    print(f"āœ… Resume extracted: {role_label}")
    return {"resume_data": parsed}
Invoke the agent\n", " result = jd_agent.invoke(messages)\n", " \n", " # 3. Handle the 'parsed' key (ensure your chain is configured correctly)\n", " # If result is already the Pydantic object, use it directly.\n", " # If result is a dict with 'parsed', use result['parsed'].\n", " parsed_data = result.get(\"parsed\") if isinstance(result, dict) else result\n", "\n", " # 4. Critical Check: Did it actually find anything?\n", " if parsed_data.job_title is None and parsed_data.tools_technologies is None:\n", " print(\"DEBUGGER WARNING: LLM returned empty schema! Checking prompt...\")\n", " else:\n", " print(f\"DEBUGGER SUCCESS: Extracted {parsed_data.job_title}\")\n", "\n", " return {\"JobDescriptionExtract_data\": parsed_data}\n", " \n", " except Exception as e:\n", " print(f\"DEBUGGER CRITICAL: Invoke failed: {str(e)}\")\n", " return {\"JobDescriptionExtract_data\": JobDescriptionExtract()}" ] }, { "cell_type": "markdown", "id": "795e2446", "metadata": {}, "source": [ "removing this ->\"skills\": {\"__all__\": {\"category\"}}, # Drops 'category' from every skill\n", " \"experience\": {\"__all__\": {\"responsibilities\"}}, # Drops bullet points\n", " \"projects\": {\"__all__\": {\"what_was_built\"}}, # Drops project descriptions\n", " \"certifications\": {\"__all__\": {\"issuer\"}} # Drops the issuer" ] }, { "cell_type": "code", "execution_count": null, "id": "7352181c", "metadata": {}, "outputs": [], "source": [ "def skill_gap_node(state: OnboardingState):\n", " \n", " resume_data = state[\"resume_data\"] \n", " candidate_name = state[\"candidate_name\"]\n", " \n", " # To remove noise and reduce size of the prompt.\n", " lean_resume_dict = resume_data.model_dump(\n", "\n", " exclude_none=True # Bonus: Automatically drops any fields that are None/null!\n", " )\n", "\n", " raw_jd = state[\"JobDescriptionExtract_data\"]\n", " \n", " # Strip the HR noise and text bloat\n", " lean_jd_dict = raw_jd.model_dump(\n", " exclude={\n", " \n", " \n", " \n", " \n", " 
def finalize_state_node(state: OnboardingState):
    """Copy the agent's submitted roadmap and Mermaid diagram into the state.

    Scans the message scratchpad from newest to oldest and takes the most
    recent ``submit_final_roadmap`` and ``submit_mermaid_visualization`` tool
    calls. An older call never overrides a newer one.

    Args:
        state: Graph state; only ``messages`` is read.

    Returns:
        ``{"final_roadmap": <tool args or None>, "mermaid_code": <str or None>}``.
    """
    final_roadmap = None
    mermaid_code = None

    # Newest message first, so the first match for each tool is the latest.
    for msg in reversed(state.get("messages") or []):
        # Only AI messages carry tool calls; everything else is skipped.
        tool_calls = getattr(msg, "tool_calls", None) or []

        # Within one message the last call is the most recent as well.
        for tool_call in reversed(tool_calls):
            name = tool_call["name"]

            # 1. Extract the Roadmap JSON (first hit wins)
            if name == "submit_final_roadmap" and final_roadmap is None:
                final_roadmap = tool_call["args"]

            # 2. Extract the Mermaid String (first hit wins)
            elif name == "submit_mermaid_visualization" and mermaid_code is None:
                mermaid_code = tool_call["args"].get("mermaid_code")

        # Compare against None so an empty-but-valid payload still counts.
        if final_roadmap is not None and mermaid_code is not None:
            break

    return {
        "final_roadmap": final_roadmap,
        "mermaid_code": mermaid_code
    }
import json

# Define the keys your React frontend actually needs
REQUIRED_KEYS = ["candidate_name", "skill_gap_analysis_data", "mermaid_code", "final_roadmap"]

def export_ui_payload(state, filename="hook_output.json"):
    """Write the UI-relevant part of the graph state to a JSON file.

    Only the keys in ``REQUIRED_KEYS`` are exported, and keys whose value is
    ``None`` are left out. Values exposing ``model_dump()`` (Pydantic objects)
    are converted to plain dicts so the payload is JSON serialisable.
    """
    # Pair every wanted key with its current value, looked up once per key.
    selected = ((key, state.get(key)) for key in REQUIRED_KEYS)

    payload = {
        key: (value.model_dump() if hasattr(value, "model_dump") else value)
        for key, value in selected
        if value is not None
    }

    # Save to the local file
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.write(json.dumps(payload, indent=2))

    print(f"āœ… UI Payload successfully exported to {filename}")
def run_graph_with_stream(graph, initial_input, config):
    """Run the graph in streaming mode and print what every node produced.

    Args:
        graph: Compiled graph exposing ``stream(...)`` and ``get_state(...)``.
        initial_input: Initial state passed to the graph.
        config: Run configuration (carries the thread id for the checkpointer).

    Returns:
        The ``values`` of the state snapshot fetched after the stream ends.

    The per-node reporting is best-effort: updates that are ``None`` or not a
    dict, empty message lists and non-string message content are tolerated so
    that the debugging output can never abort the run.
    """
    print("šŸš€ Starting Graph Stream...\n")

    separator = "\n" + "="*50 + "\n"

    # Using stream_mode="updates" to see exactly what each node returns
    for event in graph.stream(initial_input, config, stream_mode="updates"):
        for node_name, node_update in event.items():
            print(f"--- šŸ“ Node: {node_name} ---")

            # A node that returns nothing shows up as a None update.
            if not isinstance(node_update, dict):
                print(separator)
                continue

            # 1. Check for Tool Calls (The 'ReAct' thinking process)
            messages = node_update.get("messages") or []
            if not isinstance(messages, (list, tuple)):
                messages = [messages]
            if messages:
                last_msg = messages[-1]
                tool_calls = getattr(last_msg, "tool_calls", None)
                content = getattr(last_msg, "content", None)
                if tool_calls:
                    for tool in tool_calls:
                        print(f"šŸ› ļø AGENT CALLING TOOL: {tool['name']}")
                        # default=str keeps odd argument values printable.
                        print(f"šŸ“ ARGS: {json.dumps(tool['args'], indent=2, default=str)}")
                elif content:
                    # Show a snippet of the AI's internal reasoning; content
                    # may be a list of parts, so stringify before slicing.
                    content_snippet = str(content)[:150].replace('\n', ' ')
                    print(f"🧠 AI THOUGHT: {content_snippet}...")

            # 2. Check for Data Extraction (JD/Resume results)
            if "JobDescriptionExtract_data" in node_update:
                jd = node_update["JobDescriptionExtract_data"]
                print(f"āœ… Extracted JD: {getattr(jd, 'job_title', 'Unknown')}")

            if "resume_data" in node_update:
                res = node_update["resume_data"]
                print(f"āœ… Extracted Resume for: {getattr(res, 'candidate_name', 'Unknown')}")

            # 3. Check for the final output keys
            if "skill_gap_analysis_data" in node_update:
                print("šŸŽÆ Skill Gap Analysis Completed.")

            if "learning_roadmap" in node_update or "final_roadmap" in node_update:
                print("šŸ Final Roadmap Constructed.")

            print(separator)

    # Access the final state after the stream ends
    final_state = graph.get_state(config)
    print("✨ Stream Finished. Final state captured.")
    return final_state.values
a string\" issue\n", " if hasattr(val, \"model_dump\"):\n", " ui_data[key] = val.model_dump()\n", " else:\n", " # If it's already a dict (final_roadmap) or string (mermaid_code)\n", " ui_data[key] = val\n", "\n", " # Save to the local file\n", " with open(filename, \"w\", encoding=\"utf-8\") as f:\n", " json.dump(ui_data, f, indent=2)\n", " \n", " print(f\"āœ… UI Payload successfully exported to {filename}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "26c10157", "metadata": {}, "outputs": [], "source": [ "export_ui_payload(final_state)" ] }, { "cell_type": "code", "execution_count": null, "id": "478f19dd", "metadata": {}, "outputs": [], "source": [ "test_data=['atgdata.json','buisnessdata.json','chefdata.json','casemanager.json']\n", "test_resumes=['ATGPDF.pdf','Business.pdf','CHEF.pdf','casemanager.pdf']\n", "\n", "\n", "test_resume_path=r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\Testresume\\{test_resumes}\"\n", "test_data_path=r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\data\\{test_data}\"" ] }, { "cell_type": "code", "execution_count": null, "id": "83f3bd72", "metadata": {}, "outputs": [], "source": [ "store_state=[]" ] }, { "cell_type": "code", "execution_count": null, "id": "5b29b7ea", "metadata": {}, "outputs": [], "source": [ "import json\n", "import uuid\n", "import os\n", "from langgraph.checkpoint.memory import MemorySaver\n", "\n", "# --- Configuration & Paths ---\n", "\n", "test_map = [\n", " {\"resume\": \"ATGPDF.pdf\", \"data\": \"atgdata.json\", \"name\": \"Atharva_Gaykar\"},\n", " \n", "]\n", "\n", "RESUME_DIR = r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\Testresume\"\n", "DATA_DIR = r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\data\"\n", "\n", "# Windows-safe absolute output path\n", 
def get_job_description_string(data_filename: str, data_dir: str | None = None) -> str | None:
    """Build the job-description text for one test case.

    Reads ``data_filename`` from ``data_dir`` (``DATA_DIR`` when omitted),
    takes the ``job_description`` or ``job_description_requirements`` entry and
    renders it as the plain string expected by the extraction node.

    Args:
        data_filename: Name of the test-data JSON file.
        data_dir: Optional directory to read from; defaults to ``DATA_DIR``.

    Returns:
        The formatted job description, or ``None`` when the file is missing,
        has no job-description entry, or cannot be parsed.
    """
    base_dir = DATA_DIR if data_dir is None else data_dir
    path = os.path.join(base_dir, data_filename)

    if not os.path.exists(path):
        print(f"āš ļø Data file not found: {path}")
        return None

    try:
        with open(path, "r", encoding="utf-8") as f:
            suite = json.load(f)

        jd_obj = suite.get("job_description") or suite.get("job_description_requirements")

        if not jd_obj:
            print(f"āš ļø No JD key found in {data_filename}")
            return None

        # A job description stored as plain text is already in its final form.
        if isinstance(jd_obj, str):
            return jd_obj

        # `or` chains (instead of .get defaults) also cover explicit nulls.
        title = jd_obj.get("title") or jd_obj.get("job_title") or "N/A"
        desc = jd_obj.get("description") or ""
        reqs = jd_obj.get("requirements") or []

        # A single requirement given as a string must stay one bullet, not be
        # split into one bullet per character.
        if isinstance(reqs, str):
            reqs = [reqs]

        jd_string = f"JOB TITLE: {title}\n\n"
        if desc:
            jd_string += f"OVERVIEW: {desc}\n\n"
        jd_string += "REQUIREMENTS:\n" + "\n".join(f"- {r}" for r in reqs)

        return jd_string

    except Exception as e:
        print(f"āŒ Error loading JD from {data_filename}: {e}")
        return None
def run_evaluation_suite(graph_instance):
    """Execute the graph once per entry of ``test_map`` and persist the results.

    For every case the resume path and job description are validated first;
    cases failing either check are skipped. The final state of each successful
    run is appended to ``store_state`` and its UI payload is written to
    ``predicted_{name}.json`` inside ``OUTPUT_DIR``. Errors raised while running
    the graph or saving the payload are printed and do not stop the loop.
    """
    print(f"\nšŸ“ Output directory: {OUTPUT_DIR}\n")

    # State keys that start out empty and are filled in by the graph nodes.
    unset_keys = (
        "resume_text",
        "resume_data",
        "extraction_error",
        "JobDescriptionExtract_data",
        "skill_gap_analysis_data",
    )

    for case in test_map:
        resume_name = case["resume"]
        print(f"šŸš€ Processing: {resume_name}...")

        # 1. Validate resume file exists
        resume_path = os.path.join(RESUME_DIR, resume_name)
        if not os.path.exists(resume_path):
            print(f"āš ļø Resume not found, skipping: {resume_path}")
            continue

        # 2. Load JD string
        data_name = case["data"]
        jd_content = get_job_description_string(data_name)
        if not jd_content:
            print(f"āš ļø Skipping {resume_name}: JD not found in {data_name}")
            continue

        # 3. Build initial state
        initial_input = {
            "candidate_name": case["name"].replace("_", " "),
            "file_path": resume_path,
            "job_description": jd_content,
        }
        for key in unset_keys:
            initial_input[key] = None
        initial_input["messages"] = []
        initial_input.update(mermaid_code=None, final_roadmap=None)

        # 4. Invoke graph (fresh thread id so runs never share a checkpoint)
        run_config = {"configurable": {"thread_id": str(uuid.uuid4())}}

        try:
            final_state = graph_instance.invoke(initial_input, config=run_config)
            store_state.append(final_state)
            print(f"āœ… Graph execution successful for {resume_name}\n")

            # 5. Export payload
            prediction = export_ui_payload(final_state)
            output_path = os.path.join(
                OUTPUT_DIR, f"predicted_{case['name'].lower()}.json"
            )
            with open(output_path, "w", encoding="utf-8") as out_file:
                out_file.write(json.dumps(prediction, indent=2, ensure_ascii=False))

            print(f"āœ… Saved: {output_path}\n")

        except Exception as run_error:
            print(f"āŒ Error during graph execution for {resume_name}: {run_error}\n")
def run_evaluation_suite_re(graph_instance):
    """Run the graph for every case in ``test_map`` and save the UI payloads.

    Each successful run is written to ``predicted_{name}.json`` in
    ``OUTPUT_DIR``. Cases whose resume file or job description is missing are
    skipped, and errors raised during a run are printed without stopping the
    remaining cases.

    Args:
        graph_instance: Compiled graph exposing ``invoke(input, config=...)``.
    """
    for case in test_map:
        print(f"šŸš€ Processing: {case['resume']}...")

        # 1. Prepare Inputs
        resume_path = os.path.join(RESUME_DIR, case['resume'])
        if not os.path.exists(resume_path):
            # Fail early with a clear message instead of inside the graph.
            print(f"Resume not found, skipping: {resume_path}")
            continue

        jd_content = get_job_description_string(case['data'])

        if not jd_content:
            print(f"āš ļø Skipping {case['resume']}: JD not found in {case['data']}")
            continue

        # The PDF location must be passed as 'file_path': that is the key
        # input_node reads. Under any other key the node reports
        # "Missing file_path in state" and no resume is ever loaded.
        initial_input = {
            "candidate_name": case['name'].replace("_", " "),
            "file_path": resume_path,
            "job_description": jd_content,
            "resume_text": None  # Filled in by input_node from the PDF
        }

        # 2. Invoke Graph with a unique thread
        thread_id = str(uuid.uuid4())
        config = {"configurable": {"thread_id": thread_id}}

        try:
            # Execution
            final_state = graph_instance.invoke(initial_input, config=config)

            # 3. Process and Save UI Payload
            prediction = export_ui_payload(final_state)
            output_file = f"predicted_{case['name'].lower()}.json"
            output_path = os.path.join(OUTPUT_DIR, output_file)

            with open(output_path, "w", encoding="utf-8") as f:
                # ensure_ascii=False keeps non-ASCII text readable, matching
                # what run_evaluation_suite writes.
                json.dump(prediction, f, indent=2, ensure_ascii=False)

            print(f"āœ… Success! Prediction saved to: {output_path}")

        except Exception as e:
            print(f"āŒ Error during graph execution for {case['resume']}: {e}")
case['data'])\n", " pred_path = os.path.join(predicted_data_dir, f\"predicted_{name.lower()}.json\")\n", "\n", " if not os.path.exists(target_path) or not os.path.exists(pred_path):\n", " print(f\"āš ļø Skipping {name} — file not found\")\n", " continue\n", "\n", " try:\n", " with open(target_path, 'r', encoding='utf-8') as f:\n", " target_json = json.load(f)[\"ai_target\"]\n", " with open(pred_path, 'r', encoding='utf-8') as f:\n", " pred_json = json.load(f)\n", "\n", " # 1. Skill Extraction F1\n", " target_skills = {normalize(g['skill_name']) for g in target_json['skill_gap_analysis_data']['analyzed_gaps']}\n", " pred_skills = {normalize(g['skill_name']) for g in pred_json['skill_gap_analysis_data']['analyzed_gaps']}\n", " skill_f1 = calculate_f1(target_skills, pred_skills)\n", "\n", " # 2. Reasoning Quality — ROUGE-L via library\n", " reasoning_rouge = get_rouge_l(\n", " target_json['skill_gap_analysis_data']['executive_summary'],\n", " pred_json['skill_gap_analysis_data']['executive_summary']\n", " )\n", "\n", " # 3. 
Retrieval Hit Rate\n", " target_ids = {normalize(c['course_id']) for c in target_json['final_roadmap']['roadmap']}\n", " pred_ids = {normalize(c['course_id']) for c in pred_json['final_roadmap']['roadmap']}\n", " hit_count = len(target_ids.intersection(pred_ids))\n", " hit_rate = hit_count / len(target_ids) if target_ids else 0.0\n", "\n", " print(f\"{name:<20} | {skill_f1:>8.2%} | {reasoning_rouge:>15.2%} | {hit_rate:>15.2%}\")\n", "\n", " final_report[name] = {\n", " \"skill_extraction_f1\": round(skill_f1, 4),\n", " \"reasoning_rouge_l\": round(reasoning_rouge, 4),\n", " \"retrieval_hit_rate\": round(hit_rate, 4),\n", " }\n", "\n", " except Exception as e:\n", " print(f\"āš ļø Error processing {name}: {e}\")\n", "\n", " # Average across all test cases\n", " if final_report:\n", " avg_f1 = sum(v[\"skill_extraction_f1\"] for v in final_report.values()) / len(final_report)\n", " avg_rouge = sum(v[\"reasoning_rouge_l\"] for v in final_report.values()) / len(final_report)\n", " avg_hit = sum(v[\"retrieval_hit_rate\"] for v in final_report.values()) / len(final_report)\n", " print(\"-\" * 75)\n", " print(f\"{'AVERAGE':<20} | {avg_f1:>8.2%} | {avg_rouge:>15.2%} | {avg_hit:>15.2%}\")\n", "\n", " return final_report\n", "\n", "\n", "# --- Mapping & Paths ---\n", "mapping = [\n", " {\"data\": \"atgdata.json\", \"name\": \"Atharva_Gaykar\"},\n", " \n", "]\n", "\n", "DATA_DIR = r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\data\"\n", "PRED_DIR = \"./predictions\"\n", "\n", "report = run_benchmarking_report(DATA_DIR, PRED_DIR, mapping)" ] }, { "cell_type": "code", "execution_count": null, "id": "70ced174", "metadata": {}, "outputs": [], "source": [ "import json\n", "import os\n", "import re\n", "\n", "def normalize(text):\n", " \"\"\"Clean and normalize text for comparison.\"\"\"\n", " if not text: return \"\"\n", " return re.sub(r'\\W+', ' ', str(text).lower()).strip()\n", "\n", "def calculate_lcs(X, Y):\n", " 
import json
import os
import re


def normalize(text):
    """Clean and normalize text for comparison.

    Lower-cases the text and collapses each run of non-word characters into a
    single space. Falsy input (``None``, ``""``) yields ``""``.
    """
    if not text:
        return ""
    return re.sub(r'\W+', ' ', str(text).lower()).strip()


def calculate_lcs(X, Y):
    """Return the length of the Longest Common Subsequence of sequences X and Y.

    Classic dynamic programme, but only the previous and the current row of
    the table are kept, so memory is proportional to ``len(Y)`` rather than
    ``len(X) * len(Y)``.
    """
    previous = [0] * (len(Y) + 1)
    for x_item in X:
        current = [0]
        for j, y_item in enumerate(Y, start=1):
            if x_item == y_item:
                current.append(previous[j - 1] + 1)
            else:
                current.append(max(previous[j], current[j - 1]))
        previous = current
    return previous[-1]


def get_rouge_l(reference, candidate):
    """Return the ROUGE-L F1 score of ``candidate`` against ``reference``.

    Both texts are normalized and split on whitespace; the score is the
    harmonic mean of LCS/len(reference tokens) and LCS/len(candidate tokens).
    Returns 0.0 when either text is empty or has no tokens after normalizing.
    """
    if not reference or not candidate:
        return 0.0
    ref_tokens = normalize(reference).split()
    cand_tokens = normalize(candidate).split()
    if not ref_tokens or not cand_tokens:
        return 0.0
    lcs_count = calculate_lcs(ref_tokens, cand_tokens)
    recall = lcs_count / len(ref_tokens)
    precision = lcs_count / len(cand_tokens)
    if (recall + precision) == 0:
        return 0.0
    return (2 * recall * precision) / (recall + precision)


def calculate_f1(target_set, predicted_set):
    """Return the F1 score (only) for two sets of entities (skills / IDs).

    Two empty sets count as perfect agreement (1.0); exactly one empty set,
    or no overlap, scores 0.0.
    """
    if not target_set and not predicted_set:
        return 1.0
    if not target_set or not predicted_set:
        return 0.0
    overlap = len(target_set.intersection(predicted_set))
    precision = overlap / len(predicted_set)
    recall = overlap / len(target_set)
    if (precision + recall) == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)


def _normalized_values(items, key):
    """Return the set of non-blank normalized ``key`` values of the dicts in ``items``."""
    values = {normalize(item.get(key)) for item in (items or [])}
    # Entries without the field normalize to ""; two such entries would
    # otherwise be counted as a genuine match between target and prediction.
    values.discard("")
    return values


def _roadmap_items(final_roadmap):
    """Return the course list from ``{"roadmap": [...]}`` or from a bare list."""
    if isinstance(final_roadmap, list):
        return final_roadmap
    return (final_roadmap or {}).get('roadmap') or []


def run_benchmarking_report(target_data_dir, predicted_data_dir, mapping):
    """Benchmark prediction files against ground-truth files and print a table.

    Args:
        target_data_dir: directory holding the ground-truth JSON files (the
            expected values are read from their ``"ai_target"`` object).
        predicted_data_dir: directory holding ``predicted_<name.lower()>.json``.
        mapping: iterable of dicts with a ``"data"`` (target file name) and a
            ``"name"`` (test-case name) entry.

    Returns:
        dict mapping each scored case name to its unrounded
        ``skill_extraction_f1``, ``reasoning_rouge_l`` and
        ``retrieval_hit_rate``. Cases with a missing file, or whose processing
        raises, are reported on stdout and left out.

    Missing keys and JSON ``null`` sections are treated as empty.
    """
    print("\n📊 --- AI ENGINE PERFORMANCE REPORT ---")
    print(f"{'Test Case':<20} | {'Skill F1':<10} | {'Reasoning (RG-L)':<15} | {'Retrieval (Hit)':<15}")
    print("-" * 75)

    final_report = {}

    for case in mapping:
        name = case['name']
        target_path = os.path.join(target_data_dir, case['data'])
        # Prediction files are named after the lower-cased case name.
        pred_filename = f"predicted_{name.lower()}.json"
        pred_path = os.path.join(predicted_data_dir, pred_filename)

        # 1. Check for File Existence
        if not os.path.exists(target_path):
            print(f"⚠️ Skipping {name} — Target file '{case['data']}' not found.")
            continue
        if not os.path.exists(pred_path):
            print(f"⚠️ Skipping {name} — Prediction file '{pred_filename}' not found.")
            continue

        try:
            with open(target_path, 'r', encoding='utf-8') as f:
                target_json = json.load(f).get("ai_target") or {}
            with open(pred_path, 'r', encoding='utf-8') as f:
                pred_json = json.load(f)

            # --- Metric 1: Skill Extraction Accuracy (F1) ---
            target_gap_data = target_json.get('skill_gap_analysis_data') or {}
            pred_gap_data = pred_json.get('skill_gap_analysis_data') or {}

            target_skills = _normalized_values(target_gap_data.get('analyzed_gaps'), 'skill_name')
            pred_skills = _normalized_values(pred_gap_data.get('analyzed_gaps'), 'skill_name')
            skill_f1 = calculate_f1(target_skills, pred_skills)

            # --- Metric 2: Reasoning Quality (ROUGE-L) ---
            reasoning_rouge = get_rouge_l(
                target_gap_data.get('executive_summary', ""),
                pred_gap_data.get('executive_summary', ""),
            )

            # --- Metric 3: Retrieval hit rate ---
            # Share of the expected course ids that appear in the prediction
            # (a recall over target ids, not a top-1 precision).
            target_ids = _normalized_values(
                _roadmap_items(target_json.get('final_roadmap')), 'course_id')
            # The prediction may store the roadmap as a dict or as a bare list.
            pred_ids = _normalized_values(
                _roadmap_items(pred_json.get('final_roadmap')), 'course_id')

            if target_ids:
                hit_rate = len(target_ids.intersection(pred_ids)) / len(target_ids)
            else:
                hit_rate = 0.0

            # Print status row
            print(f"{name:<20} | {skill_f1:>8.2%} | {reasoning_rouge:>15.2%} | {hit_rate:>15.2%}")

            final_report[name] = {
                "skill_extraction_f1": skill_f1,
                "reasoning_rouge_l": reasoning_rouge,
                "retrieval_hit_rate": hit_rate,
            }
        except Exception as e:
            # Best effort: one broken case must not stop the remaining ones.
            print(f"⚠️ Error processing {name}: {e}")

    return final_report


# --- Mapping & Paths ---
mapping = [
    {"data": "atgdata.json", "name": "Atharva_Gaykar"},
    {"data": "buisnessdata.json", "name": "Business_Manager"},
    {"data": "chefdata.json", "name": "Executive_Chef"},
    {"data": "casemanagerdata.json", "name": "Case_Manager"},
]

# Note: Ensure these paths are correct for your local environment
DATA_DIR = r"C:\Users\ATHARVA\Downloads\my codes\web\AdaptiveEngineService\AI_Engine_Evaluation\Testcases\data"
PRED_DIR = "./predictions"


if __name__ == "__main__":
    report = run_benchmarking_report(DATA_DIR, PRED_DIR, mapping)
import json
import os
import re

try:
    from rouge_score import rouge_scorer
except ImportError:  # only ROUGE-L needs this package; the other metrics do not
    rouge_scorer = None

# -----------------------------
# Scorer instance (created once)
# -----------------------------
# Left as None when rouge_score is not installed.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True) if rouge_scorer else None


def normalize(text):
    """Lower-case ``text`` and collapse each run of non-word characters to one space.

    Falsy input (``None``, ``""``) yields ``""``.
    """
    if not text:
        return ""
    return re.sub(r'\W+', ' ', str(text).lower()).strip()


def get_rouge_l(reference, candidate):
    """Return the ROUGE-L F-measure of ``candidate`` against ``reference``.

    Returns 0.0 when either text is falsy (no scorer needed for that case).

    Raises:
        ImportError: a score is required but ``rouge_score`` is not installed.
    """
    if not reference or not candidate:
        return 0.0
    if scorer is None:
        raise ImportError(
            "rouge_score is required to compute ROUGE-L (pip install rouge-score)"
        )
    return scorer.score(reference, candidate)['rougeL'].fmeasure


def calculate_skill_f1(target_skills, pred_skills):
    """Return a fuzzy F1 between target and predicted skill names.

    A predicted skill matches a target skill when one string contains the
    other; every target can be matched at most once. Blank names are ignored:
    ``""`` is a substring of every string and would otherwise match anything.

    Exact matches are paired first and the remaining names are walked in
    sorted order, so the result no longer depends on set iteration order
    (which varies between interpreter runs for strings).

    Two empty inputs score 1.0; exactly one empty input scores 0.0.
    """
    targets = {skill for skill in (target_skills or ()) if skill}
    preds = {skill for skill in (pred_skills or ()) if skill}

    if not targets and not preds:
        return 1.0
    if not targets or not preds:
        return 0.0

    exact = targets & preds
    tp = len(exact)

    # Greedy containment matching over what is left, in a fixed order.
    open_targets = sorted(targets - exact)
    for pred in sorted(preds - exact):
        for index, target in enumerate(open_targets):
            if pred in target or target in pred:
                tp += 1
                del open_targets[index]  # each target is consumed once
                break

    precision = tp / len(preds)
    recall = tp / len(targets)

    if (precision + recall) == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)


def calculate_retrieval_hit(target_ids, pred_ids):
    """Return the fraction of ``target_ids`` present in ``pred_ids`` (0.0 if either is empty)."""
    if not target_ids:
        return 0.0
    if not pred_ids:
        return 0.0
    return len(target_ids.intersection(pred_ids)) / len(target_ids)


def _normalized_values(items, key):
    """Return the set of non-blank normalized ``key`` values of the dicts in ``items``."""
    values = {normalize(item.get(key)) for item in (items or [])}
    values.discard("")
    return values


def _roadmap_items(final_roadmap):
    """Return the course list from ``{"roadmap": [...]}`` or from a bare list."""
    if isinstance(final_roadmap, list):
        return final_roadmap
    return (final_roadmap or {}).get('roadmap') or []


def run_benchmarking_report(target_data_dir, predicted_data_dir, mapping):
    """Score every mapped prediction file against its target file and print a table.

    Args:
        target_data_dir: directory holding the ground-truth JSON files (the
            expected values are read from their ``"ai_target"`` object).
        predicted_data_dir: directory holding ``predicted_<name.lower()>.json``.
        mapping: iterable of dicts with a ``"data"`` (target file name) and a
            ``"name"`` (test-case name) entry.

    Returns:
        dict mapping each scored case name to ``skill_extraction_f1``,
        ``reasoning_rouge_l`` and ``retrieval_hit_rate``, rounded to 4
        decimals. Cases with a missing file, or whose processing raises, are
        reported on stdout and left out.
    """
    print("\n📊 --- AI ENGINE PERFORMANCE REPORT ---")
    print(f"{'Test Case':<20} | {'Skill F1':<10} | {'Reasoning (RG-L)':<15} | {'Retrieval (Hit)':<15}")
    print("-" * 75)

    final_report = {}
    raw_scores = []  # unrounded (f1, rouge, hit) per case, used for the averages

    for case in mapping:
        name = case['name']
        target_path = os.path.join(target_data_dir, case['data'])
        pred_path = os.path.join(predicted_data_dir, f"predicted_{name.lower()}.json")

        # Report BOTH kinds of missing file; a silently skipped case makes a
        # typo in the mapping look like a successful (but shorter) run.
        if not os.path.exists(target_path):
            print(f"{name:<20} | SKIPPED — target file '{case['data']}' not found")
            continue
        if not os.path.exists(pred_path):
            print(f"{name:<20} | SKIPPED — prediction file not found")
            continue

        try:
            with open(target_path, 'r', encoding='utf-8') as f:
                target_json = json.load(f).get("ai_target") or {}
            with open(pred_path, 'r', encoding='utf-8') as f:
                pred_json = json.load(f)

            # --- Metric 1: Skill F1 ---
            target_gap = target_json.get('skill_gap_analysis_data') or {}
            pred_gap = pred_json.get('skill_gap_analysis_data') or {}

            target_skills = _normalized_values(target_gap.get('analyzed_gaps'), 'skill_name')
            pred_skills = _normalized_values(pred_gap.get('analyzed_gaps'), 'skill_name')
            skill_f1 = calculate_skill_f1(target_skills, pred_skills)

            # --- Metric 2: ROUGE-L ---
            reasoning_rouge = get_rouge_l(
                target_gap.get('executive_summary', ""),
                pred_gap.get('executive_summary', ""),
            )

            # --- Metric 3: Retrieval Hit Rate ---
            target_ids = _normalized_values(
                _roadmap_items(target_json.get('final_roadmap')), 'course_id')
            # The prediction may store the roadmap as a dict or as a bare list.
            pred_ids = _normalized_values(
                _roadmap_items(pred_json.get('final_roadmap')), 'course_id')

            hit_rate = calculate_retrieval_hit(target_ids, pred_ids)

            print(f"{name:<20} | {skill_f1:>8.2%} | {reasoning_rouge:>15.2%} | {hit_rate:>15.2%}")

            final_report[name] = {
                "skill_extraction_f1": round(skill_f1, 4),
                "reasoning_rouge_l": round(reasoning_rouge, 4),
                "retrieval_hit_rate": round(hit_rate, 4),
            }
            raw_scores.append((skill_f1, reasoning_rouge, hit_rate))

        except Exception as e:
            # Best effort: one broken case must not stop the remaining ones.
            print(f"⚠️ Error processing {name}: {e}")

    # --- Average Row (from unrounded values) ---
    if raw_scores:
        count = len(raw_scores)
        avg_f1, avg_rouge, avg_hit = (sum(column) / count for column in zip(*raw_scores))
        print("-" * 75)
        print(f"{'AVERAGE':<20} | {avg_f1:>8.2%} | {avg_rouge:>15.2%} | {avg_hit:>15.2%}")

    return final_report


# --- Mapping & Paths ---
mapping = [
    {"data": "atgdata.json", "name": "Atharva_Gaykar"},
    {"data": "buisnessdata.json", "name": "Business_Manager"},
    {"data": "chefdata.json", "name": "Executive_Chef"},
    # NOTE(review): an earlier cell of this notebook maps Case_Manager to
    # "casemanagerdata.json" — confirm which file name is the correct one.
    {"data": "casemanager.json", "name": "Case_Manager"},
]

# NOTE(review): machine-specific absolute path; adjust for your environment.
DATA_DIR = r"C:\Users\ATHARVA\Downloads\my codes\web\AdaptiveEngineService\AI_Engine_Evaluation\Testcases\data"
PRED_DIR = "./predictions"

if __name__ == "__main__":
    run_benchmarking_report(DATA_DIR, PRED_DIR, mapping)