Spaces:

Gaykar
/

AdaptiveEngineService

Sleeping

File size: 100,103 Bytes

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e02e1b00",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import cloudinary\n",
    "import cloudinary.uploader\n",
    "import requests\n",
    "from io import BytesIO\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "load_dotenv()\n",
    "\n",
    "# Explicitly configure using your 3 credentials\n",
    "cloudinary.config( \n",
    "  cloud_name = os.getenv('CLOUDINARY_CLOUD_NAME'), \n",
    "  api_key = os.getenv('CLOUDINARY_API_KEY'), \n",
    "  api_secret = os.getenv('CLOUDINARY_API_SECRET'),\n",
    "  secure = True\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c11377c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import cloudinary\n",
    "import cloudinary.uploader\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "# 1. Load credentials from your .env file\n",
    "load_dotenv()\n",
    "\n",
    "cloudinary.config( \n",
    "  cloud_name = os.getenv('CLOUDINARY_CLOUD_NAME'), \n",
    "  api_key = os.getenv('CLOUDINARY_API_KEY'), \n",
    "  api_secret = os.getenv('CLOUDINARY_API_SECRET'),\n",
    "  secure = True\n",
    ")\n",
    "\n",
    "# 2. Set your variables\n",
    "resume_path = r\"c:\\Users\\ATHARVA\\Downloads\\my codes\\python\\machine_learning\\Learning_Files\\ChirayuResume.pdf\"\n",
    "thread_id = \"trial_thread_001\"\n",
    "file_name = \"ChirayuResume\"\n",
    "\n",
    "# 3. Perform the upload\n",
    "try:\n",
    "    response = cloudinary.uploader.upload(\n",
    "        resume_path,\n",
    "        folder = f\"threads/{thread_id}\",\n",
    "        public_id = file_name,\n",
    "        resource_type = \"image\"  # Use \"image\" for PDFs to get previews in UI\n",
    "    )\n",
    "\n",
    "    # 4. Create the URL from the response\n",
    "    pdf_url = response.get(\"secure_url\")\n",
    "    \n",
    "    print(f\"✅ Upload Successful!\")\n",
    "    print(f\"📂 Folder: threads/{thread_id}\")\n",
    "    print(f\"🔗 URL to push: {pdf_url}\")\n",
    "\n",
    "except Exception as e:\n",
    "    print(f\"❌ Upload failed: {e}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f986ff8f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from io import BytesIO\n",
    "\n",
    "def get_pdf_for_ai(url):\n",
    "    # 1. Reach out to the URL\n",
    "    response = requests.get(url)\n",
    "    \n",
    "    if response.status_code == 200:\n",
    "        # 2. Convert the web response into a \"file-like\" object\n",
    "        pdf_stream = BytesIO(response.content)\n",
    "        print(\"✅ PDF loaded into memory for processing!\")\n",
    "        return pdf_stream\n",
    "    else:\n",
    "        print(f\"❌ Failed to fetch PDF. Status: {response.status_code}\")\n",
    "        return None\n",
    "\n",
    "# --- USE YOUR ACTUAL URL ---\n",
    "resume_url = \"https://res.cloudinary.com/dvxnazx8e/image/upload/v1774166452/threads/trial_thread_001/ChirayuResume.pdf\"\n",
    "pdf_data = get_pdf_for_ai(resume_url)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "938186bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "import cloudinary\n",
    "from cloudinary import Search\n",
    "\n",
    "\n",
    "def get_resume_url(thread_id: str) -> str:\n",
    "    \"\"\"\n",
    "    Searches Cloudinary for the resume PDF in the thread's folder\n",
    "    and returns the secure URL.\n",
    "    \"\"\"\n",
    "    result = Search() \\\n",
    "        .expression(f'folder:\"threads/{thread_id}/*\"') \\\n",
    "        .sort_by('public_id', 'desc') \\\n",
    "        .max_results(1) \\\n",
    "        .execute()\n",
    "\n",
    "    resources = result.get(\"resources\", [])\n",
    "\n",
    "    if not resources:\n",
    "        raise FileNotFoundError(f\"No resume found for thread_id: {thread_id}\")\n",
    "\n",
    "    pdf_url = resources[0][\"secure_url\"]\n",
    "    print(f\"Found resume: {pdf_url}\")\n",
    "    return pdf_url"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f4340cbb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import tempfile\n",
    "import os\n",
    "from langchain_community.document_loaders import PyMuPDFLoader\n",
    "\n",
    "thread_id = \"trial_thread_001\"\n",
    "\n",
    "try:\n",
    "    # Step 1 — Get URL from Cloudinary\n",
    "    url = get_resume_url(thread_id)\n",
    "    print(f\"URL: {url}\")\n",
    "\n",
    "    # Step 2 — Fetch PDF bytes\n",
    "    response = requests.get(url)\n",
    "    response.raise_for_status()\n",
    "\n",
    "    # Step 3 — Write to temp file\n",
    "    with tempfile.NamedTemporaryFile(delete=False, suffix=\".pdf\") as tmp:\n",
    "        tmp.write(response.content)\n",
    "        tmp_path = tmp.name\n",
    "\n",
    "    # Step 4 — Load with PyMuPDF\n",
    "    loader = PyMuPDFLoader(tmp_path)\n",
    "    docs = loader.load()\n",
    "    resume_text = \"\\n\".join([doc.page_content for doc in docs])\n",
    "\n",
    "    # Step 5 — Cleanup\n",
    "    os.remove(tmp_path)\n",
    "\n",
    "    print(f\"Pages loaded: {len(docs)}\")\n",
    "    print(f\"Preview:\\n{resume_text[:500]}\")\n",
    "\n",
    "except FileNotFoundError as e:\n",
    "    print(f\"Not found: {e}\")\n",
    "except Exception as e:\n",
    "    print(f\"Error: {e}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b010e49b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_community.document_loaders import  PyMuPDFLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7164448e",
   "metadata": {},
   "outputs": [],
   "source": [
    "resumepath=r\"c:\\Users\\ATHARVA\\Downloads\\my codes\\python\\machine_learning\\Learning_Files\\ChirayuResume.pdf\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d1029c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import cloudinary.uploader\n",
    "import os\n",
    "\n",
    "# Your resume path (using 'r' for raw string to handle backslashes correctly)\n",
    "resume_path = r\"c:\\Users\\ATHARVA\\Downloads\\my codes\\python\\machine_learning\\Learning_Files\\ChirayuResume.pdf\"\n",
    "\n",
    "# Extract filename without extension for the public_id\n",
    "file_name = os.path.basename(resume_path).split('.')[0] \n",
    "thread_id = \"trial_thread_001\"\n",
    "\n",
    "# Upload directly using the file path\n",
    "upload_result = cloudinary.uploader.upload(\n",
    "    resume_path, \n",
    "    folder=f\"threads/{thread_id}\",\n",
    "    public_id=file_name,\n",
    "    resource_type=\"auto\" # Handles the PDF correctly\n",
    ")\n",
    "\n",
    "print(f\"Upload Successful! URL: {upload_result['secure_url']}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "928b7237",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import requests\n",
    "import cloudinary\n",
    "from io import BytesIO\n",
    "\n",
    "# --- 1. CONFIGURATION (Do this once) ---\n",
    "# This tells the library your API Key/Secret. \n",
    "# In a real app, put this in your .env file!\n",
    "os.environ[\"CLOUDINARY_URL\"] = \"cloudinary://866996699612973:9Tp3hGjI9npawSIrN4Mu4hFRwLQ@dtscmobmv\"\n",
    "\n",
    "def get_pdf_content(file_url):\n",
    "    \"\"\"\n",
    "    This function expects an HTTPS url, NOT the cloudinary:// credentials.\n",
    "    \"\"\"\n",
    "    # Ensure the URL is a real web link\n",
    "    if not file_url.startswith(\"http\"):\n",
    "        raise ValueError(\"The URL must start with http or https!\")\n",
    "\n",
    "    response = requests.get(file_url)\n",
    "    if response.status_code == 200:\n",
    "        return BytesIO(response.content)\n",
    "    else:\n",
    "        print(f\"Error: Could not download file. Status: {response.status_code}\")\n",
    "        return None\n",
    "\n",
    "# --- 2. TESTING ---\n",
    "# This is what the MERN devs will send you:\n",
    "test_resume_url = \"https://res.cloudinary.com\"\n",
    "\n",
    "# This will now work!\n",
    "pdf_file = get_pdf_content(test_resume_url)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b75a238a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Any, Dict, List, Optional, Tuple,TypedDict,Literal\n",
    "from typing import Annotated, Sequence\n",
    "import os\n",
    "from pydantic import BaseModel, Field\n",
    "from langchain_groq import ChatGroq\n",
    "from langchain_core.messages import SystemMessage, HumanMessage,ToolMessage,AIMessage\n",
    "from langchain_core.tools import Tool\n",
    "from langgraph.graph import StateGraph,END,START\n",
    "from langgraph.types import interrupt  \n",
    "from langchain_core.prompts import ChatPromptTemplate,MessagesPlaceholder\n",
    "from langchain_community.document_loaders import  PyMuPDFLoader\n",
    "import json\n",
    "from pydantic import BaseModel, Field\n",
    "from typing import List, Optional\n",
    "from pprint import pprint\n",
    "import os\n",
    "from dotenv import load_dotenv\n",
    "import json\n",
    "from langchain_core.documents import Document\n",
    "from langchain_huggingface import HuggingFaceEmbeddings\n",
    "import os\n",
    "from pinecone import Pinecone, ServerlessSpec\n",
    "from pinecone_text.sparse import BM25Encoder\n",
    "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
    "from langchain_community.retrievers import PineconeHybridSearchRetriever\n",
    "import json\n",
    "from langchain_core.documents import Document\n",
    "from langchain_core.messages import BaseMessage\n",
    "from langgraph.graph import add_messages\n",
    "from langgraph.prebuilt import ToolNode ,tools_condition\n",
    "import torch\n",
    "from langgraph.checkpoint.memory import MemorySaver\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c7058b37",
   "metadata": {},
   "source": [
    "Pydantic model of resume data extraction"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "69094b87",
   "metadata": {},
   "source": [
    "**Defining the pydantic models to be used**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "7da5b1c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "\n",
    "class Skill(BaseModel):\n",
    "    name: str = Field(..., description=\"Skill name e.g. Python, Docker\")\n",
    "    category: Optional[str] = Field(\n",
    "        None, description=\"Category: Backend | ML | DevOps | Frontend | Other\"\n",
    "    )\n",
    "\n",
    "\n",
    "class ExperienceItem(BaseModel):\n",
    "    job_title: str = Field(\n",
    "        ...,\n",
    "        description=\"Role title of the candidate. Example: 'Backend Intern', 'Software Engineer'\"\n",
    "    )\n",
    "\n",
    "    experience_type: Optional[Literal['internship', 'full_time', 'contract', 'freelance']] = Field(\n",
    "        None,\n",
    "        description=\"Type of experience: internship, full_time, contract, or freelance\"\n",
    "    )\n",
    "\n",
    "\n",
    "\n",
    "    technologies: Optional[List[str]] = Field(\n",
    "        default_factory=list,\n",
    "        description=\"Technologies, tools, or frameworks used in this role\"\n",
    "    )\n",
    "\n",
    "    responsibilities: Optional[List[str]] = Field(\n",
    "        default_factory=list,\n",
    "        description=\"Key responsibilities, tasks, or learnings in concise bullet points keep it summarised detail *not* required\"\n",
    "    )\n",
    "\n",
    "class ProjectItem(BaseModel):\n",
    "    name: str = Field(..., description=\"Project name\")\n",
    "    technologies: List[str] = Field(\n",
    "        default_factory=list,\n",
    "        description=\"Technologies used in this project hence learned during the project.\"\n",
    "    )\n",
    "   \n",
    "\n",
    "\n",
    "class CertificationItem(BaseModel):\n",
    "    name: str = Field(..., description=\"Certification name\")\n",
    "    \n",
    "    topics_covered: List[str] = Field(\n",
    "        default_factory=list,\n",
    "        description=\"Key topics or skills the certification covers\"\n",
    "    )\n",
    "\n",
    "\n",
    "\n",
    "class ResumeExtract(BaseModel):\n",
    "\n",
    "\n",
    "    candidate_name:Optional[str]\n",
    "\n",
    "    \n",
    "    job_title: Optional[str] = Field(\n",
    "    None,\n",
    "    description=(\n",
    "        \"Primary job title or role of the candidate. \"\n",
    "        \"Examples: 'AI Engineer', 'Data Scientist', \"\n",
    "        \"'Construction Project Manager', 'Healthcare Representative'. \"\n",
    "        \"Should reflect the most recent or current role.\"\n",
    "       )\n",
    "    )\n",
    "\n",
    "  \n",
    "\n",
    "   \n",
    "    skills: List[Skill] = Field(\n",
    "        default_factory=list,\n",
    "        description=\"Skills explicitly listed by the candidate\"\n",
    "    )\n",
    "    experience: List[ExperienceItem] = Field(\n",
    "        default_factory=list,\n",
    "        description=(\n",
    "            \"Each role as a separate entry. \"\n",
    "            \"No company name needed — focus on what was done and learned.\"\n",
    "        )\n",
    "    )\n",
    "    projects: List[ProjectItem] = Field(\n",
    "        default_factory=list,\n",
    "        description=\"Projects with technologies used and what was built\"\n",
    "    )\n",
    "    certifications: Optional[List[CertificationItem]] = Field(\n",
    "        None,\n",
    "        description=\"Certifications with topics they cover. None if not present.\"\n",
    "    )\n",
    "    \n",
    "\n",
    "\n",
    "    is_fresher: bool = Field(\n",
    "    ...,\n",
    "    description=(\n",
    "        \"Set to True if the candidate lacks full-time professional employment. \"\n",
    "        \"Academic projects, certifications, and internships are considered \"\n",
    "        \"part of the learning phase and do not qualify a candidate as 'non-fresher' hence is_.\"\n",
    "    )\n",
    ")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "99ac1086",
   "metadata": {},
   "source": [
    "  \"skills\": {\"__all__\": {\"category\"}}, # Drops 'category' from every skill\n",
    "            \"experience\": {\"__all__\": {\"responsibilities\"}}, # Drops bullet points\n",
    "            \"projects\": {\"__all__\": {\"what_was_built\"}}, # Drops project descriptions\n",
    "            \"certifications\": {\"__all__\": {\"issuer\"}} # Drops the issuer"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5b0756e0",
   "metadata": {},
   "source": [
    "Pydantic model for job description"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "4b2441cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pydantic import BaseModel, Field\n",
    "from typing import List, Optional\n",
    "\n",
    "\n",
    "class SkillRequirement(BaseModel):\n",
    "    name: str = Field(\n",
    "        ...,\n",
    "        description=\"Skill or technology required for the job (e.g., Python, SQL, React)\"\n",
    "    )\n",
    "    level: Optional[str] = Field(\n",
    "        None,\n",
    "        description=\"Expected proficiency level: beginner | intermediate | strong\"\n",
    "    )\n",
    "\n",
    "\n",
    "class ResponsibilityItem(BaseModel):\n",
    "    description: str = Field(\n",
    "        ...,\n",
    "        description=\"Key responsibility or task expected from the candidate\"\n",
    "    )\n",
    "\n",
    "\n",
    "class RequirementItem(BaseModel):\n",
    "    description: str = Field(\n",
    "        ...,\n",
    "        description=\"Qualification or requirement such as education, availability, etc.\"\n",
    "    )\n",
    "\n",
    "\n",
    "class ConstraintItem(BaseModel):\n",
    "    type: str = Field(\n",
    "        ...,\n",
    "        description=\"Constraint type such as location, duration, eligibility\"\n",
    "    )\n",
    "    value: str = Field(\n",
    "        ...,\n",
    "        description=\"Constraint value (e.g., 'Pune only', '6 months', 'Fresher')\"\n",
    "    )\n",
    "\n",
    "\n",
    "\n",
    "class JobDescriptionExtract(BaseModel):\n",
    "    job_title: Optional[str] = Field(\n",
    "        None,\n",
    "        description=\"Job role/title (e.g., AI/ML Intern, Web Developer)\"\n",
    "    )\n",
    "\n",
    "    company_name: Optional[str] = Field(\n",
    "        None,\n",
    "        description=\"Company offering the job\"\n",
    "    )\n",
    "\n",
    "    location: Optional[str] = Field(\n",
    "        None,\n",
    "        description=\"Job location if specified\"\n",
    "    )\n",
    "\n",
    "    employment_type: Optional[str] = Field(\n",
    "        None,\n",
    "        description=\"Type of job: internship, full-time, contract\"\n",
    "    )\n",
    "\n",
    "    duration_months: Optional[int] = Field(\n",
    "        None,\n",
    "        description=\"Duration of role in months (for internships/contracts)\"\n",
    "    )\n",
    "\n",
    "    is_fresher_allowed: Optional[bool] = Field(\n",
    "        None,\n",
    "        description=\"Whether freshers are eligible for this role\"\n",
    "    )\n",
    "\n",
    "    skills_required: Optional[List[SkillRequirement]] = Field(\n",
    "        None,\n",
    "        description=\"List of required skills and expected levels\"\n",
    "    )\n",
    "\n",
    "    tools_technologies: Optional[List[str]] = Field(\n",
    "        None,\n",
    "        description=\"Specific tools/frameworks mentioned (e.g., Pandas, WordPress)\"\n",
    "    )\n",
    "\n",
    "    responsibilities: Optional[List[ResponsibilityItem]] = Field(\n",
    "        None,\n",
    "        description=\"Key job responsibilities\"\n",
    "    )\n",
    "\n",
    "    requirements: Optional[List[RequirementItem]] = Field(\n",
    "        None,\n",
    "        description=\"General requirements like availability, qualifications\"\n",
    "    )\n",
    "\n",
    "    constraints: Optional[List[ConstraintItem]] = Field(\n",
    "        None,\n",
    "        description=\"Special constraints like location restriction, duration, etc.\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4b12a3bc",
   "metadata": {},
   "source": [
    "**Pydantic model for skill gap analysis**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "4f1341e0",
   "metadata": {},
   "outputs": [],
   "source": [
    "class SkillGap(BaseModel):\n",
    "    skill_name: str = Field(\n",
    "        ..., \n",
    "        description=\"The specific technology or tool missing or requiring an upgrade (e.g., 'PostgreSQL')\"\n",
    "    )\n",
    "    \n",
    "    gap_type: Literal[\"missing_foundation\", \"needs_advanced_upgrade\"] = Field(\n",
    "        ...,\n",
    "        description=(\n",
    "            \"missing_foundation: Candidate has no recorded experience in this core requirement. \"\n",
    "            \"needs_advanced_upgrade: Candidate knows the basics but needs role-specific advanced training.\"\n",
    "        )\n",
    "    )\n",
    "    \n",
    "    priority: Literal[\"high\", \"medium\", \"low\"] = Field(\n",
    "        ...,\n",
    "        description=\"How critical this skill is for the target job role.\"\n",
    "    )\n",
    "    \n",
    "    reasoning: str = Field(\n",
    "        ...,\n",
    "        description=(\n",
    "            \"The 'Reasoning Trace'. This MUST be provided for every skill gap identified. \"\n",
    "            \"Explain exactly WHY this gap was flagged based on the resume vs JD comparison. \"\n",
    "            \"Example: 'JD requires FastAPI; candidate has Python experience but no record of using FastAPI framework.'\"\n",
    "        )\n",
    "    )\n",
    "    \n",
    "    target_competency: str = Field(\n",
    "        ...,\n",
    "        description=\"The specific outcome the candidate needs to reach (e.g., 'Build asynchronous database endpoints')\"\n",
    "    )\n",
    "\n",
    "class SkillGapAnalysis(BaseModel):\n",
    "    job_title: str = Field(..., description=\"The target role from the JD\")\n",
    "    candidate_name: Optional[str] = Field(None, description=\"Extracted name from resume\")\n",
    "    \n",
    "    analyzed_gaps: List[SkillGap] = Field(\n",
    "        default_factory=list,\n",
    "        description=\"List of specific technical gaps found between Resume and JD\"\n",
    "    )\n",
    "    \n",
    "    is_fresher_adaptation_needed: bool = Field(\n",
    "        default=False,\n",
    "        description=\"True if foundational corporate/soft-skill modules should be added to the path.\"\n",
    "    )\n",
    "    \n",
    "    executive_summary: str = Field(\n",
    "        ...,\n",
    "        description=\"A 2-3 sentence overview of the candidate's readiness and the primary focus of the onboarding.\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "18663bb3",
   "metadata": {},
   "outputs": [],
   "source": [
    "class RoadmapStep(BaseModel):\n",
    "    course_id: str\n",
    "    title: str\n",
    "    reasoning: str = Field(..., description=\"Why this specific course was chosen for this user\")\n",
    "    is_foundation: bool\n",
    "    sequence_order: int = Field(..., description=\"The order in which the course should be taken\")\n",
    "\n",
    "class LearningRoadmap(BaseModel):\n",
    "    candidate_name: str\n",
    "    target_role: str\n",
    "    roadmap: List[RoadmapStep]\n",
    "    onboarding_summary: str"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "604e9728",
   "metadata": {},
   "source": [
    "**Defining  the agents to be used**"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9036d57e",
   "metadata": {},
   "source": [
    "Resume data extraction agent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "14dab004",
   "metadata": {},
   "outputs": [],
   "source": [
    "resume_agent=ChatGroq(\n",
    "    model=\"moonshotai/kimi-k2-instruct-0905\",\n",
    "    temperature=0.2,\n",
    ")\n",
    "\n",
    "\n",
    "resume_agent=resume_agent.with_structured_output(\n",
    "\n",
    "    schema=ResumeExtract,\n",
    "    method=\"json_schema\",\n",
    "    include_raw=True,\n",
    "    strict=True\n",
    ")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7683eb69",
   "metadata": {},
   "source": [
    "Job description data extraction agent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "472dae2f",
   "metadata": {},
   "outputs": [],
   "source": [
    "jd_agent=ChatGroq(\n",
    "    model=\"meta-llama/llama-4-scout-17b-16e-instruct\",\n",
    "    temperature=0.2,\n",
    ")\n",
    "\n",
    "\n",
    "jd_agent=jd_agent.with_structured_output(\n",
    "\n",
    "    schema=JobDescriptionExtract,\n",
    "    method=\"json_schema\",\n",
    "    include_raw=True,\n",
    "    strict=True\n",
    ")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d14736d5",
   "metadata": {},
   "source": [
    "defining the gap analysis agent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "0d5e3b17",
   "metadata": {},
   "outputs": [],
   "source": [
    "gap_analysis_agent=ChatGroq(\n",
    "    model=\"openai/gpt-oss-120b\",\n",
    "    temperature=0.2,\n",
    ")\n",
    "\n",
    "\n",
    "gap_analysis_agent=gap_analysis_agent.with_structured_output(\n",
    "    schema=SkillGapAnalysis,\n",
    "    method=\"json_schema\",\n",
    "    include_raw=True,\n",
    "    strict=True\n",
    ")\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "28bc58ad",
   "metadata": {},
   "source": [
    "defining the roadmap planner agent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "0ccc026b",
   "metadata": {},
   "outputs": [],
   "source": [
    "roadmap_planner_agent=ChatGroq(\n",
    "    model=\"moonshotai/kimi-k2-instruct-0905\",\n",
    "    temperature=0.2,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2bd41131",
   "metadata": {},
   "source": [
    "**Tools**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "c8827093",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index ready: {'_response_info': {'raw_headers': {'connection': 'keep-alive',\n",
      "                                    'content-length': '187',\n",
      "                                    'content-type': 'application/json',\n",
      "                                    'date': 'Mon, 23 Mar 2026 20:11:40 GMT',\n",
      "                                    'grpc-status': '0',\n",
      "                                    'server': 'envoy',\n",
      "                                    'x-envoy-upstream-service-time': '62',\n",
      "                                    'x-pinecone-request-latency-ms': '61',\n",
      "                                    'x-pinecone-response-duration-ms': '64'}},\n",
      " 'dimension': 384,\n",
      " 'index_fullness': 0.0,\n",
      " 'memoryFullness': 0.0,\n",
      " 'metric': 'dotproduct',\n",
      " 'namespaces': {'__default__': {'vector_count': 47}},\n",
      " 'storageFullness': 0.0,\n",
      " 'total_vector_count': 47,\n",
      " 'vector_type': 'dense'}\n"
     ]
    }
   ],
   "source": [
    "\n",
    "\n",
    "PINECONE_API_KEY = os.getenv(\"PINECONE_API_KEY\")\n",
    "pc = Pinecone(api_key=PINECONE_API_KEY)\n",
    "\n",
    "index_name = \"final-catalog-index\"\n",
    "\n",
    "\n",
    "# Create index if not exists\n",
    "if index_name not in pc.list_indexes().names():\n",
    "    pc.create_index(\n",
    "        name=index_name,\n",
    "        dimension=384,\n",
    "        metric=\"dotproduct\",\n",
    "        spec=ServerlessSpec(\n",
    "            cloud=\"aws\",\n",
    "            region=\"us-east-1\"\n",
    "        )\n",
    "    )\n",
    "    print(\"Index created.\")\n",
    "\n",
    "index = pc.Index(index_name)\n",
    "print(\"Index ready:\", index.describe_index_stats())\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "44180d94",
   "metadata": {},
   "source": [
    "Opening the docs for BM25 retriver"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "7561b3a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from langchain_core.documents import Document\n",
    "\n",
    "\n",
    "doc_path=r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\app\\utils\\langchain_formatted.json\"\n",
    "\n",
    "\n",
    "documents = []\n",
    "\n",
    "# Load the transformed catalog\n",
    "with open(doc_path, \"r\") as f:\n",
    "    data = json.load(f)\n",
    "    for doc in data:\n",
    "        # Create a LangChain Document object for each entry\n",
    "        documents.append(\n",
    "            Document(\n",
    "                page_content=doc[\"page_content\"], \n",
    "                metadata=doc[\"metadata\"]\n",
    "            )\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "f0845a99",
   "metadata": {},
   "outputs": [],
   "source": [
    "device=torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "c8e6d2a5",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\ATHARVA\\AppData\\Local\\Temp\\ipykernel_30068\\2526755923.py:1: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the `langchain-huggingface package and should be used instead. To use it run `pip install -U `langchain-huggingface` and import as `from `langchain_huggingface import HuggingFaceEmbeddings``.\n",
      "  embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\", model_kwargs={\"device\": device})\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3af10b64d4584d53952822157482186f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2\n",
      "Key                     | Status     |  | \n",
      "------------------------+------------+--+-\n",
      "embeddings.position_ids | UNEXPECTED |  | \n",
      "\n",
      "Notes:\n",
      "- UNEXPECTED\t:can be ignored when loading from different task/architecture; not ok if you expect identical arch.\n"
     ]
    }
   ],
   "source": [
    "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\", model_kwargs={\"device\": device})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6bc7292f",
   "metadata": {},
   "outputs": [
    {
     "ename": "",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31mnotebook controller is DISPOSED. \n",
      "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
     ]
    }
   ],
   "source": [
    "bm25_encoder = BM25Encoder()\n",
    "\n",
    "bm25_encoder.fit([doc.page_content for doc in documents])\n",
    "\n",
    "retriever = PineconeHybridSearchRetriever(\n",
    "    embeddings=embeddings,\n",
    "    sparse_encoder=bm25_encoder,\n",
    "    index=index,\n",
    "    alpha=0.5\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "03c755a1",
   "metadata": {},
   "outputs": [
    {
     "ename": "",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31mnotebook controller is DISPOSED. \n",
      "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
     ]
    }
   ],
   "source": [
    "from langchain_core.tools import tool\n",
    "from typing import Optional\n",
    "\n",
    "@tool\n",
    "def search_courses(query: str):\n",
    "    \"\"\"\n",
    "    Search the course catalog for relevant modules based on a skill query \n",
    "    \n",
    "    \n",
    "    Args:\n",
    "       \n",
    "      query:the skill to find with  semantic terms (e.g., 'FastAPI', 'PostgreSQL', 'Docker','Enterprise VMS Strategy','Utilization Management').\n",
    "       \n",
    "    \"\"\"\n",
    "    # NOTE(review): the docstring above is presumably what @tool exposes to the model as the\n",
    "    # tool description, so it is kept verbatim — confirm before rewording it.\n",
    "    matches = retriever.invoke(query)\n",
    "    if not matches:\n",
    "        return f\"No courses found  for '{query}'.\"\n",
    "\n",
    "    # One text block per hit: the course ID (needed later for the roadmap), the stored\n",
    "    # page content, then a '---' separator line. Hits without a course_id show 'N/A'.\n",
    "    return \"\\n\".join(\n",
    "        f\"ID: {hit.metadata.get('course_id', 'N/A')}\\n{hit.page_content}\\n---\"\n",
    "        for hit in matches\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "9db28710",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from typing import Optional, Dict, Any\n",
    "from langchain_core.tools import tool\n",
    "\n",
    "class CourseLookup:\n",
    "    \"\"\"In-memory index of the course catalog, keyed by ``course_id``.\n",
    "\n",
    "    The catalog file is read once at construction time. A missing, undecodable\n",
    "    or wrongly shaped file is reported with ``print`` and leaves ``courses_map``\n",
    "    empty, so every lookup then returns ``None``.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, catalog_path: str = \"course_catalog.json\"):\n",
    "        \"\"\"Remember ``catalog_path`` and load the catalog from it immediately.\"\"\"\n",
    "        self.catalog_path = catalog_path\n",
    "        self.courses_map = {}\n",
    "        self._load_catalog()\n",
    "\n",
    "    def _load_catalog(self):\n",
    "        \"\"\"Read the JSON catalog and index its entries by ``course_id``.\"\"\"\n",
    "        try:\n",
    "            # Explicit UTF-8: the platform default encoding is not guaranteed to\n",
    "            # decode non-ASCII course titles or descriptions.\n",
    "            with open(self.catalog_path, 'r', encoding='utf-8') as f:\n",
    "                catalog = json.load(f)\n",
    "        except FileNotFoundError:\n",
    "            print(f\"Error: {self.catalog_path} not found.\")\n",
    "            return\n",
    "        except json.JSONDecodeError:\n",
    "            print(f\"Error: Failed to decode {self.catalog_path}.\")\n",
    "            return\n",
    "\n",
    "        if not isinstance(catalog, list):\n",
    "            print(f\"Error: {self.catalog_path} must contain a list of courses.\")\n",
    "            return\n",
    "\n",
    "        indexed = {}\n",
    "        for course in catalog:\n",
    "            # Skip malformed entries instead of aborting the whole load with a KeyError.\n",
    "            if not isinstance(course, dict) or 'course_id' not in course:\n",
    "                print(f\"Warning: skipping catalog entry without course_id: {course!r}\")\n",
    "                continue\n",
    "            indexed[course['course_id']] = course\n",
    "        self.courses_map = indexed\n",
    "\n",
    "    def get_course_details(self, course_id: str) -> Optional[Dict[str, Any]]:\n",
    "        \"\"\"Return the catalog entry for ``course_id``, or ``None`` if it is unknown.\"\"\"\n",
    "        return self.courses_map.get(course_id)\n",
    "\n",
    "\n",
    "lookup_service = CourseLookup(r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\app\\tools\\Catalog.json\")\n",
    "\n",
    "@tool\n",
    "def get_course_by_id(course_id: str) -> str:\n",
    "    \"\"\"\n",
    "    Retrieves full details for a specific course using its unique course_id.\n",
    "    Use this tool when you find a prerequisite ID in another course and \n",
    "    need to fetch its title, description, and duration to add to the roadmap.\n",
    "    \"\"\"\n",
    "    course = lookup_service.get_course_details(course_id)\n",
    "\n",
    "    # A found entry goes back to the agent as pretty-printed JSON text;\n",
    "    # a missing (or empty) entry yields a plain error string instead.\n",
    "    if course:\n",
    "        return json.dumps(course, indent=2)\n",
    "    return f\"Error: Course with ID {course_id} not found in catalog.\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "09d238ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "@tool(args_schema=LearningRoadmap)\n",
    "def submit_final_roadmap(candidate_name, target_role, roadmap, onboarding_summary):\n",
    "    \"\"\"\n",
    "    STRICTLY call this tool to submit the final structured learning roadmap.\n",
    "    This saves the data to the global system and the graph state.\n",
    "    \"\"\"\n",
    "    def _as_plain(step):\n",
    "        # Objects exposing model_dump() are converted to dicts; anything else passes through untouched.\n",
    "        if hasattr(step, \"model_dump\"):\n",
    "            return step.model_dump()\n",
    "        return step\n",
    "\n",
    "    # NOTE(review): nothing is persisted here — the structured payload is only returned;\n",
    "    # copying it into the graph state is left to a later node.\n",
    "    return {\n",
    "        \"candidate_name\": candidate_name,\n",
    "        \"target_role\": target_role,\n",
    "        \"onboarding_summary\": onboarding_summary,\n",
    "        \"roadmap\": list(map(_as_plain, roadmap)),\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ad04bc6",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "@tool\n",
    "def submit_mermaid_visualization(mermaid_code: str):\n",
    "    \"\"\"\n",
    "    STRICTLY call this tool to save the Mermaid.js visualization of the roadmap.\n",
    "    \"\"\"\n",
    "    # This tool deliberately keeps no state. The earlier body assigned `mermaid_code`\n",
    "    # to a name without a `global` declaration, so it only created an unused local\n",
    "    # while the comment claimed a global was updated. The diagram is recovered from\n",
    "    # this call's arguments by `finalize_state_node`, so only the acknowledgement\n",
    "    # string is returned here.\n",
    "    return \"Mermaid visualization saved successfully.\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "285f74bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "roadmap_planner_agent_tools=[search_courses, get_course_by_id,submit_final_roadmap,submit_mermaid_visualization]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "47564782",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): this rebinds `roadmap_planner_agent` to its own tool-bound wrapper, so the cell is not idempotent:\n",
    "# re-running it without re-creating the base model would call bind_tools on the already-bound object. TODO confirm\n",
    "# and consider binding into a separate name.\n",
    "roadmap_planner_agent=roadmap_planner_agent.bind_tools(roadmap_planner_agent_tools)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c311f642",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace 'roadmap_planner_agent' with your bound model variable\n",
    "print(roadmap_planner_agent.kwargs.get(\"tools\"))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2da3f43b",
   "metadata": {},
   "source": [
    "**Trial resume path**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7cfbfc3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "resumepath=r\"c:\\Users\\ATHARVA\\Downloads\\my codes\\python\\machine_learning\\Learning_Files\\ChirayuResume.pdf\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "14f4946c",
   "metadata": {},
   "source": [
    "**LangGraph agent state**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5deda2bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "class OnboardingState(TypedDict):\n",
    "    \"\"\"State shared by every node of the onboarding graph.\"\"\"\n",
    "    # --- Inputs and raw text ---\n",
    "    candidate_name: Optional[str]\n",
    "    resume_text: str  # written by input_node; set to None there when the PDF cannot be loaded\n",
    "    file_path: str  # path of the resume PDF that input_node loads\n",
    "    job_description: str  # raw JD text consumed by extractJDDataNode\n",
    "    # Conversation scratchpad of the planner/tool loop; updates are combined via add_messages\n",
    "    messages: Annotated[Sequence[BaseMessage], add_messages]\n",
    "    \n",
    "    # Analysis & Extraction Data\n",
    "    skill_gap_analysis_data: Optional[SkillGapAnalysis]  # written by skill_gap_node\n",
    "    resume_data: Optional[ResumeExtract]  # written by extractResumeDataNode (None on failure)\n",
    "    extraction_error: Optional[str]  # failure reason recorded by input_node / extractResumeDataNode\n",
    "    JobDescriptionExtract_data: Optional[JobDescriptionExtract]  # written by extractJDDataNode\n",
    "    \n",
    "    # --- Output keys, filled by finalize_state_node ---\n",
    "    mermaid_code: Optional[str]        # Stores the Mermaid visualization string\n",
    "    final_roadmap: Optional[Dict]      # Stores the final structured JSON roadmap"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e54bac6a",
   "metadata": {},
   "source": [
    "**Prompts**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c8df9934",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_groq import ChatGroq\n",
    "from langchain_core.prompts import ChatPromptTemplate\n",
    "\n",
    "\n",
    "resume_agent_prompt = \"\"\"\n",
    "<role>\n",
    "You are a precise resume parser. Your only job is to extract structured information from a raw resume text.\n",
    "</role>\n",
    "\n",
    "<rules>\n",
    "- Extract ONLY what is explicitly present in the resume. Do NOT infer or hallucinate missing fields.\n",
    "- current_role: the job title stated at the top of the resume or most recent role. If the candidate is a student with no job, set it to \"Student\".\n",
    "- is_fresher: set True ONLY if the candidate has zero professional work experience. Having projects or certifications does NOT make someone non-fresher.\n",
    "- total_experience_years: total years of professional work only. Set 0.0 for freshers.\n",
    "- skills: extract from the explicit skills section only. Do NOT pull skills from project descriptions here.\n",
    "- experience: each role is a SEPARATE entry. Ignore company name. Focus on job_title, technologies used, and what they did or learned.\n",
    "- projects: extract each project separately. Capture technologies and one line on what was built.\n",
    "- certifications: extract ONLY if present. Set null if none found. Include topics the certification covers.\n",
    "- achievements: extract ONLY if present. Set null if none found. Include the domain (e.g. Hackathon, Quiz, Competitive Programming).\n",
    "\n",
    "</rules>\n",
    "\n",
    "<output_format>\n",
    "Return a single valid JSON object matching the schema. No extra text, no markdown, no explanation.\n",
    "</output_format>\n",
    "\n",
    "\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "608efafd",
   "metadata": {},
   "outputs": [],
   "source": [
    "jd_agent_prompt =\"\"\" \n",
    "<role>\n",
    "You are a precise job description parser.\n",
    "Extract structured information from the given job description.\n",
    "</role>\n",
    "\n",
    "<rules>\n",
    "- Extract ONLY explicitly mentioned information. Do NOT infer or hallucinate.\n",
    "\n",
    "- Follow the provided schema strictly.\n",
    "\n",
    "- If a field is not present, return null (not empty list unless schema default applies).\n",
    "\n",
    "- Keep skills atomic (e.g., Python, SQL, React).\n",
    "\n",
    "- Do NOT mix fields:\n",
    "  - skills = only required skills\n",
    "  - responsibilities = what the candidate will do\n",
    "  - constraints = restrictions like location, duration, eligibility\n",
    "\n",
    "- Convert durations like \"6 months\" into integer months.\n",
    "\n",
    "- is_fresher_allowed:\n",
    "  - True only if explicitly allowed\n",
    "  - False only if explicitly restricted\n",
    " \n",
    "</rules>\n",
    "\n",
    "<output_format>\n",
    "Return a valid JSON object only.\n",
    "</output_format> \"\"\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a6c1483",
   "metadata": {},
   "outputs": [],
   "source": [
    "# System prompt for the gap-analysis agent (sent as the SystemMessage in skill_gap_node).\n",
    "# Stray \"[cite: N]\" markers, left over from a copy/paste, were removed so they no longer reach the model.\n",
    "gap_analysis_agent_prompt=\"\"\"\n",
    "<role>\n",
    "You are an expert technical assessor and the core intelligence of an AI-driven, adaptive onboarding engine.\n",
    "Your objective is to parse a new hire's current capabilities against a target job description and identify precise skill gaps to reach role-specific competency.\n",
    "</role>\n",
    "\n",
    "<context>\n",
    "Current corporate onboarding utilizes static, \"one-size-fits-all\" curricula, resulting in significant inefficiencies.\n",
    "Your ultimate goal is to solve this: you must ensure experienced hires do NOT waste time on known concepts, while ensuring beginners are NOT overwhelmed by advanced modules.\n",
    "</context>\n",
    "\n",
    "<rules>\n",
    "- Cross-reference the JD's `skills_required` and `tools_technologies` against the candidate's `skills_list`, `experience.technologies`, and `projects.technologies`.\n",
    "- Identify Hard Gaps: Technologies explicitly required by the JD that are completely absent from the candidate's profile.\n",
    "- Apply Adaptive Logic (Proficiency Gaps):\n",
    "  - For Experienced Hires: If they possess the skill, DO NOT flag it for basic training. Only flag a gap if they need an advanced, role-specific upgrade based on low duration of use.\n",
    "  - For Beginners/Freshers: Flag foundational gaps and prerequisites heavily to ensure they are prepared before tackling complex JD requirements.\n",
    "- Keep skills atomic and highly specific (e.g., output \"FastAPI\" or \"PostgreSQL\", do NOT output vague terms like \"Backend Frameworks\").\n",
    "- Do NOT hallucinate requirements that are not explicitly stated in the JD data.\n",
    "- Do NOT attempt to build the curriculum or suggest courses yet. Your sole focus is diagnosing the gaps.\n",
    "- Provide a concise `reasoning` string for each identified gap. This reasoning MUST justify why the gap exists based on the user's experience level to prove the adaptive logic.\n",
    "</rules>\n",
    "<output_format>\n",
    "Return a valid JSON object only.\n",
    "</output_format>\n",
    "\n",
    "\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "059e5f86",
   "metadata": {},
   "outputs": [],
   "source": [
    "roadmap_planner_agent_prompt = \"\"\"\n",
    "<role>\n",
    "You are an expert technical onboarding architect.\n",
    "Transform a Skill Gap Analysis into a minimal, logically sequenced learning roadmap.\n",
    "</role>\n",
    "\n",
    "<strict_workflow>\n",
    "STEP 1 — SEARCH\n",
    "For every  gap → call search_courses.\n",
    "Use ONLY course IDs returned by the tool. Never guess IDs.\n",
    "\n",
    "STEP 2 — RESOLVE PREREQUISITES\n",
    "For each retrieved course inspect its prerequisites list.\n",
    "If candidate's resume does NOT prove mastery → call get_course_by_id for each missing prerequisite.\n",
    "Skip courses the candidate already demonstrates via projects or experience.\n",
    "\n",
    "STEP 3 — SEQUENCE\n",
    "Prerequisites always before target modules.\n",
    "sequence_order must be 1, 2, 3... strictly.\n",
    "If is_fresher_adaptation_needed is True → add a professional fundamentals module first.\n",
    "\n",
    "STEP 4 — SUBMIT (TERMINAL STEP)\n",
    "Call submit_final_roadmap ONCE with the complete roadmap.\n",
    "Call submit_mermaid_visualization ONCE with the Mermaid string.\n",
    "After both return → STOP. Do not call any tool again.\n",
    "</strict_workflow>\n",
    "\n",
    "<mermaid_rules>\n",
    "- gap courses → :::gap\n",
    "- known prerequisites → :::known\n",
    "- start node → :::start\n",
    "- end node → :::done\n",
    "- group by week using subgraph\n",
    "</mermaid_rules>\n",
    "\n",
    "<example_mermaid>\n",
    "flowchart TD\n",
    "    A([Start — Candidate's current skills]):::start\n",
    "    subgraph W1[\"Week 1 — Core gaps\"]\n",
    "      B[CS-DOCKER-101\\nDocker & Containerization]:::gap\n",
    "      C[CS-PY-101\\nPython Fundamentals]:::known\n",
    "    end\n",
    "    subgraph W2[\"Week 2 — Role readiness\"]\n",
    "      D[CS-CICD-201\\nCI/CD with GitHub Actions]:::gap\n",
    "    end\n",
    "    Z([Role-ready — DevOps Engineer]):::done\n",
    "    A --> B & C\n",
    "    B --> D\n",
    "    D --> Z\n",
    "    classDef gap   fill:#EEEDFE,stroke:#534AB7,color:#26215C\n",
    "    classDef known fill:#E1F5EE,stroke:#0F6E56,color:#085041\n",
    "    classDef start fill:#1D9E75,stroke:#0F6E56,color:#E1F5EE\n",
    "    classDef done  fill:#534AB7,stroke:#3C3489,color:#EEEDFE\n",
    "</example_mermaid>\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c4dea1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def input_node(state: OnboardingState):\n",
    "    \"\"\"Load the resume PDF referenced by ``state['file_path']`` into plain text.\n",
    "\n",
    "    Returns a partial state update: ``resume_text`` with a cleared\n",
    "    ``extraction_error`` on success, or an ``extraction_error`` message when the\n",
    "    path is missing or loading raises.\n",
    "    \"\"\"\n",
    "    pdf_path = state.get(\"file_path\")\n",
    "\n",
    "    print(f\"📂 File path received: {pdf_path}\")\n",
    "    exists_report = os.path.exists(pdf_path) if pdf_path else 'NO PATH'\n",
    "    print(f\"📂 File exists: {exists_report}\")\n",
    "\n",
    "    # Nothing to load without a path.\n",
    "    if not pdf_path:\n",
    "        return {\"extraction_error\": \"Missing file_path in state\"}\n",
    "\n",
    "    try:\n",
    "        pages = PyMuPDFLoader(pdf_path).load()\n",
    "        print(f\"📄 Pages loaded: {len(pages)}\")\n",
    "\n",
    "        # Concatenate the text of all pages, one page per line break.\n",
    "        full_text = \"\\n\".join(page.page_content for page in pages)\n",
    "        print(f\"📄 Text length: {len(full_text)}\")\n",
    "    except Exception as exc:\n",
    "        print(f\"❌ PyMuPDF failed: {str(exc)}\")\n",
    "        return {\n",
    "            \"resume_text\": None,\n",
    "            \"extraction_error\": f\"Failed to load resume: {str(exc)}\"\n",
    "        }\n",
    "\n",
    "    return {\"resume_text\": full_text, \"extraction_error\": None}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb13ffc0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extractResumeDataNode(state: OnboardingState):\n",
    "    \"\"\"Run the resume-parsing agent on ``state['resume_text']``.\n",
    "\n",
    "    Returns a partial state update. On success ``resume_data`` holds the parsed\n",
    "    object; on any failure it is ``None`` and ``extraction_error`` says why.\n",
    "    \"\"\"\n",
    "    resume_text = state.get(\"resume_text\")\n",
    "\n",
    "    # Guard 1 — empty text\n",
    "    if not resume_text or len(resume_text.strip()) < 10:\n",
    "        print(\"❌ RESUME TEXT EMPTY OR TOO SHORT\")\n",
    "        return {\"resume_data\": None, \"extraction_error\": \"Resume text is empty\"}\n",
    "\n",
    "    print(f\"📄 Resume text length: {len(resume_text)} chars\")\n",
    "\n",
    "    messages = [\n",
    "        SystemMessage(content=resume_agent_prompt),\n",
    "        HumanMessage(content=f\"<resume_text>{resume_text}</resume_text>\")\n",
    "    ]\n",
    "\n",
    "    result = resume_agent.invoke(messages)\n",
    "\n",
    "    # Guard 2 — parsing failed\n",
    "    if result.get(\"parsing_error\"):\n",
    "        print(f\"❌ PARSING ERROR: {result['parsing_error']}\")\n",
    "        return {\"resume_data\": None, \"extraction_error\": str(result[\"parsing_error\"])}\n",
    "\n",
    "    # Guard 3 — parsed is None\n",
    "    parsed = result.get(\"parsed\")\n",
    "    if parsed is None:\n",
    "        print(f\"❌ PARSED IS NONE. RAW OUTPUT: {result.get('raw')}\")\n",
    "        return {\"resume_data\": None, \"extraction_error\": \"LLM returned null schema\"}\n",
    "\n",
    "    # The success log must never discard a good extraction. The resume prompt names the\n",
    "    # top-level field `current_role` (job_title belongs to experience entries), so read the\n",
    "    # label defensively instead of assuming `parsed.job_title` exists.\n",
    "    # NOTE(review): confirm the field name against the ResumeExtract schema.\n",
    "    role_label = getattr(parsed, \"current_role\", None) or getattr(parsed, \"job_title\", None) or \"Unknown\"\n",
    "    print(f\"✅ Resume extracted: {role_label}\")\n",
    "    return {\"resume_data\": parsed}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "330acef6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extractJDDataNode(state: OnboardingState):\n",
    "    \"\"\"Run the JD-parsing agent on ``state['job_description']``.\n",
    "\n",
    "    Always returns ``{\"JobDescriptionExtract_data\": ...}``. When the text is\n",
    "    missing, the agent call raises, or nothing could be parsed, the value is an\n",
    "    empty ``JobDescriptionExtract()``.\n",
    "    \"\"\"\n",
    "    # 1. Safety Check: Is the text even in the state?\n",
    "    jd_text = state.get(\"job_description\", \"\")\n",
    "    \n",
    "    if not jd_text or len(jd_text.strip()) < 5:\n",
    "        print(\"DEBUGGER ERROR: job_description text is MISSING from state!\")\n",
    "        return {\"JobDescriptionExtract_data\": JobDescriptionExtract()}\n",
    "\n",
    "    print(f\"DEBUGGER: Sending {len(jd_text)} characters to JD Agent...\")\n",
    "\n",
    "    messages = [\n",
    "        SystemMessage(content=jd_agent_prompt),\n",
    "        HumanMessage(content=f\"EXTRACT FROM THIS TEXT:\\n\\n{jd_text}\")\n",
    "    ]\n",
    "\n",
    "    try:\n",
    "        # 2. Invoke the agent\n",
    "        result = jd_agent.invoke(messages)\n",
    "        \n",
    "        # 3. The chain may return the parsed object directly or a dict carrying it under 'parsed'.\n",
    "        parsed_data = result.get(\"parsed\") if isinstance(result, dict) else result\n",
    "\n",
    "        # 4. Nothing parsed at all: handle it explicitly. Previously this path only worked\n",
    "        #    through an AttributeError that was caught below and mislabelled as \"Invoke failed\".\n",
    "        if parsed_data is None:\n",
    "            print(\"DEBUGGER WARNING: JD agent returned no parsed data; falling back to an empty schema.\")\n",
    "            return {\"JobDescriptionExtract_data\": JobDescriptionExtract()}\n",
    "\n",
    "        # 5. Critical Check: Did it actually find anything?\n",
    "        if parsed_data.job_title is None and parsed_data.tools_technologies is None:\n",
    "            print(\"DEBUGGER WARNING: LLM returned empty schema! Checking prompt...\")\n",
    "        else:\n",
    "            print(f\"DEBUGGER SUCCESS: Extracted {parsed_data.job_title}\")\n",
    "\n",
    "        return {\"JobDescriptionExtract_data\": parsed_data}\n",
    "        \n",
    "    except Exception as e:\n",
    "        print(f\"DEBUGGER CRITICAL: Invoke failed: {str(e)}\")\n",
    "        return {\"JobDescriptionExtract_data\": JobDescriptionExtract()}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "795e2446",
   "metadata": {},
   "source": [
    "removing this  ->\"skills\": {\"__all__\": {\"category\"}}, # Drops 'category' from every skill\n",
    "                \"experience\": {\"__all__\": {\"responsibilities\"}}, # Drops bullet points\n",
    "               \"projects\": {\"__all__\": {\"what_was_built\"}}, # Drops project descriptions\n",
    "              \"certifications\": {\"__all__\": {\"issuer\"}} # Drops the issuer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7352181c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def skill_gap_node(state: OnboardingState):\n",
    "    \"\"\"Compare the parsed resume with the parsed JD and store the skill-gap analysis.\n",
    "\n",
    "    Returns:\n",
    "        ``{\"skill_gap_analysis_data\": <parsed result of gap_analysis_agent>}``.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if either upstream extraction left its state key empty.\n",
    "    \"\"\"\n",
    "    resume_data = state.get(\"resume_data\")\n",
    "    raw_jd = state.get(\"JobDescriptionExtract_data\")\n",
    "\n",
    "    # extractResumeDataNode stores None on failure. Stop here with the recorded reason\n",
    "    # instead of an opaque AttributeError from None.model_dump().\n",
    "    if resume_data is None:\n",
    "        raise ValueError(\n",
    "            f\"Cannot run skill gap analysis: resume extraction failed ({state.get('extraction_error')})\"\n",
    "        )\n",
    "    if raw_jd is None:\n",
    "        raise ValueError(\"Cannot run skill gap analysis: job description extract is missing\")\n",
    "\n",
    "    # To remove noise and reduce the size of the prompt: drop every None/null field.\n",
    "    lean_resume_dict = resume_data.model_dump(exclude_none=True)\n",
    "\n",
    "    # Strip the HR noise and text bloat from the JD as well.\n",
    "    lean_jd_dict = raw_jd.model_dump(\n",
    "        exclude={\n",
    "            \"responsibilities\": True, # Dropping verbose bullet points\n",
    "            \"requirements\": True,\n",
    "            \"constraints\": True\n",
    "        },\n",
    "        exclude_none=True # Drops any null fields\n",
    "    )\n",
    "\n",
    "    # The prompt embeds both structures as JSON text.\n",
    "    lean_resume_json = json.dumps(lean_resume_dict, indent=2)\n",
    "    lean_jd_json = json.dumps(lean_jd_dict, indent=2)\n",
    "\n",
    "    messages = [\n",
    "        SystemMessage(content=gap_analysis_agent_prompt),\n",
    "        HumanMessage(content=f\"Users Resume:<lean_resume_json>{lean_resume_json}</lean_resume_json> Job Description:<lean_jd_json>{lean_jd_json}</lean_jd_json>\"),\n",
    "    ]\n",
    "\n",
    "    result = gap_analysis_agent.invoke(messages)\n",
    "\n",
    "    return {\"skill_gap_analysis_data\": result[\"parsed\"]}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1fb2f0d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "def roadmap_planning_node(state: OnboardingState):\n",
    "    \"\"\"\n",
    "    Planner step of the agent loop: sends the system prompt, the skill-gap\n",
    "    payload and the accumulated conversation to the tool-bound model, then\n",
    "    returns its reply as a ``messages`` update.\n",
    "    \"\"\"\n",
    "    gaps = state[\"skill_gap_analysis_data\"].model_dump()\n",
    "\n",
    "    # Fixed preamble (instructions + gap data) followed by whatever has been said so far.\n",
    "    preamble = [\n",
    "        SystemMessage(content=roadmap_planner_agent_prompt),\n",
    "        HumanMessage(content=f\"<skill_gap_data> {gaps} </skill_gap_data>\"),\n",
    "    ]\n",
    "    reply = roadmap_planner_agent.invoke(preamble + state[\"messages\"])\n",
    "\n",
    "    return {\"messages\": [reply]}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cea90664",
   "metadata": {},
   "outputs": [],
   "source": [
    "def finalize_state_node(state: OnboardingState):\n",
    "    \"\"\"\n",
    "    Final node that extracts structured data from the message scratchpad\n",
    "    and populates the main state keys. No global variables needed!\n",
    "\n",
    "    Scans ``state['messages']`` from newest to oldest and keeps the arguments\n",
    "    of the most recent non-empty ``submit_final_roadmap`` and\n",
    "    ``submit_mermaid_visualization`` tool calls.\n",
    "\n",
    "    Returns:\n",
    "        ``{\"final_roadmap\": <args dict or None>, \"mermaid_code\": <str or None>}``.\n",
    "    \"\"\"\n",
    "    final_roadmap = None\n",
    "    mermaid_code = None\n",
    "\n",
    "    # We search the messages in reverse to find the LATEST tool calls\n",
    "    for msg in reversed(state[\"messages\"]):\n",
    "        # Only messages carrying tool calls are relevant\n",
    "        tool_calls = getattr(msg, \"tool_calls\", None)\n",
    "        if not tool_calls:\n",
    "            continue\n",
    "\n",
    "        # Newest call first inside the message too. A value that is already set is never\n",
    "        # overwritten: previously an older call replaced the newest one whenever the scan\n",
    "        # had to continue looking for the other key.\n",
    "        for tool_call in reversed(tool_calls):\n",
    "            name = tool_call[\"name\"]\n",
    "            if name == \"submit_final_roadmap\" and not final_roadmap:\n",
    "                final_roadmap = tool_call[\"args\"]\n",
    "            elif name == \"submit_mermaid_visualization\" and not mermaid_code:\n",
    "                mermaid_code = tool_call[\"args\"].get(\"mermaid_code\")\n",
    "\n",
    "        # Once we have both, we can stop searching\n",
    "        if final_roadmap and mermaid_code:\n",
    "            break\n",
    "\n",
    "    return {\n",
    "        \"final_roadmap\": final_roadmap,\n",
    "        \"mermaid_code\": mermaid_code\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba9f22e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "tool_node = ToolNode(roadmap_planner_agent_tools)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5cfe4c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assemble the onboarding pipeline:\n",
    "# input_node -> (resume extraction, JD extraction) -> skill gap analysis -> planner/tool loop -> finalize_state\n",
    "builder = StateGraph(OnboardingState)\n",
    "\n",
    "# Define Nodes\n",
    "builder.add_node(\"input_node\", input_node)\n",
    "builder.add_node(\"resume_data_extraction\", extractResumeDataNode)\n",
    "builder.add_node(\"jd_data_extraction\", extractJDDataNode)\n",
    "builder.add_node(\"skill_gap_analysis\", skill_gap_node)\n",
    "builder.add_node(\"roadmap_planning_agent\", roadmap_planning_node)\n",
    "builder.add_node(\"tools\", tool_node) # Named 'tools' for tools_condition compatibility\n",
    "builder.add_node(\"finalize_state\", finalize_state_node)\n",
    "\n",
    "# Define Entry Point and initial Extraction Parallelism\n",
    "builder.set_entry_point(\"input_node\")\n",
    "builder.add_edge(\"input_node\", \"resume_data_extraction\")\n",
    "builder.add_edge(\"input_node\", \"jd_data_extraction\")\n",
    "\n",
    "# Join Extractions into Gap Analysis\n",
    "# NOTE(review): the join relies on both extraction nodes finishing before skill_gap_analysis runs — confirm.\n",
    "builder.add_edge(\"resume_data_extraction\", \"skill_gap_analysis\")\n",
    "builder.add_edge(\"jd_data_extraction\", \"skill_gap_analysis\")\n",
    "\n",
    "# Transition from Analysis to Planning Agent\n",
    "builder.add_edge(\"skill_gap_analysis\", \"roadmap_planning_agent\")\n",
    "\n",
    "# Agentic ReAct Loop (Planning Agent <-> Tools)\n",
    "# The \"__end__\" outcome is remapped to finalize_state so that node runs once the planner stops calling tools.\n",
    "builder.add_conditional_edges(\n",
    "    \"roadmap_planning_agent\",\n",
    "    tools_condition,\n",
    "    {\n",
    "        \"tools\": \"tools\",            # If tool_calls exist, go to tools\n",
    "        \"__end__\": \"finalize_state\"  # If finished, go to finalize_state\n",
    "    }\n",
    ")\n",
    "\n",
    "# Loop back to agent after tools\n",
    "builder.add_edge(\"tools\", \"roadmap_planning_agent\")\n",
    "\n",
    "\n",
    "\n",
    "# Compile the Graph\n",
    "# NOTE(review): no edge leaves finalize_state; presumably the run simply ends there — confirm.\n",
    "graph = builder.compile()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "53588a77",
   "metadata": {},
   "outputs": [],
   "source": [
    "display(graph)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0edf8d59",
   "metadata": {},
   "outputs": [],
   "source": [
    "jd_text=\"\"\"Job Title: Backend Developer\n",
    "\n",
    "Company name: CodeForge\n",
    "We are hiring a Backend Developer to build scalable APIs and backend systems.\n",
    "\n",
    "Responsibilities:\n",
    "- Develop REST APIs using FastAPI\n",
    "- Design and manage PostgreSQL databases\n",
    "- Implement authentication and authorization systems\n",
    "- Optimize performance and scalability\n",
    "\n",
    "Requirements:\n",
    "- Strong knowledge of Python\n",
    "- Experience with FastAPI or Django\n",
    "- Good understanding of SQL and database design\n",
    "- Familiarity with Docker\n",
    "\n",
    "Constraints:\n",
    "- Location: Pune only\n",
    "- Full-time role \"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "da3df5a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Keys of the graph state that the React frontend consumes\n",
    "REQUIRED_KEYS = [\"candidate_name\", \"skill_gap_analysis_data\", \"mermaid_code\", \"final_roadmap\"]\n",
    "\n",
    "def export_ui_payload(state, filename=\"hook_output.json\"):\n",
    "    \"\"\"\n",
    "    Write the frontend-relevant part of the graph state to ``filename`` as JSON.\n",
    "\n",
    "    Only the keys in REQUIRED_KEYS are exported; keys whose value is None are\n",
    "    left out, and values exposing ``model_dump()`` are converted to dicts first.\n",
    "    NOTE: a later cell defines export_ui_payload again with a different default\n",
    "    filename; whichever definition ran last is the one in effect.\n",
    "    \"\"\"\n",
    "    def _jsonable(value):\n",
    "        # Pydantic-style objects become plain dicts; dicts and strings pass through as they are.\n",
    "        return value.model_dump() if hasattr(value, \"model_dump\") else value\n",
    "\n",
    "    selected = ((key, state.get(key)) for key in REQUIRED_KEYS)\n",
    "    ui_data = {key: _jsonable(value) for key, value in selected if value is not None}\n",
    "\n",
    "    # Save to the local file\n",
    "    with open(filename, \"w\", encoding=\"utf-8\") as f:\n",
    "        json.dump(ui_data, f, indent=2)\n",
    "\n",
    "    print(f\"✅ UI Payload successfully exported to {filename}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a95b4db7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initial state for a manual end-to-end run. Keys not listed here (messages, mermaid_code, final_roadmap)\n",
    "# are simply not provided.\n",
    "# NOTE(review): `file_path` is a machine-specific absolute path — adjust before running elsewhere.\n",
    "initial_input = {\n",
    "    \"candidate_name\": \"Chirayu Jain\",\n",
    "    \"resume_text\": None,\n",
    "    \"job_description\": jd_text,\n",
    "    \"file_path\": r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\Testresume\\ChirayuResume.pdf\",\n",
    "    \"resume_data\": None,\n",
    "    \"extraction_error\": None,\n",
    "    \"JobDescriptionExtract_data\": None,\n",
    "    \"skill_gap_analysis_data\": None\n",
    "    \n",
    "    \n",
    "}\n",
    "import uuid\n",
    "\n",
    "\n",
    "# Re-compile the graph with a checkpointer; this replaces the `graph` compiled in the earlier cell.\n",
    "checkpointer = MemorySaver()  \n",
    "graph = builder.compile(checkpointer=checkpointer)\n",
    "\n",
    "# A fresh random thread id is generated every time this cell runs.\n",
    "THREAD_ID = str(uuid.uuid4())\n",
    "\n",
    "\n",
    "\n",
    "config = {\"configurable\": {\"thread_id\": THREAD_ID,\"langgraph_user_id\": \"Chirayu Jain\"}}\n",
    "\n",
    "# Blocking run of the whole pipeline; the resulting state is kept in `final_state`.\n",
    "final_state = graph.invoke(initial_input, config=config)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "093bdd6e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "def run_graph_with_stream(graph, initial_input, config):\n",
    "    \"\"\"\n",
    "    Executes the graph in streaming mode to visualize the 'under the hood' \n",
    "    process of node transitions and data updates.\n",
    "\n",
    "    Args:\n",
    "        graph: object exposing ``stream(input, config, stream_mode=...)`` and ``get_state(config)``.\n",
    "        initial_input: initial state passed to ``graph.stream``.\n",
    "        config: run configuration, also used to read the final state back.\n",
    "\n",
    "    Returns:\n",
    "        The ``values`` attribute of ``graph.get_state(config)`` after the stream ends.\n",
    "    \"\"\"\n",
    "    print(\"🚀 Starting Graph Stream...\\n\")\n",
    "    \n",
    "    # Using stream_mode=\"updates\" to see exactly what each node returns\n",
    "    for event in graph.stream(initial_input, config, stream_mode=\"updates\"):\n",
    "        for node_name, node_update in event.items():\n",
    "            print(f\"--- 📍 Node: {node_name} ---\")\n",
    "\n",
    "            # An update is not always a dict (e.g. None when a node returns nothing);\n",
    "            # there is nothing to inspect then, so only the separator is printed.\n",
    "            if not isinstance(node_update, dict):\n",
    "                print(\"\\n\" + \"=\"*50 + \"\\n\")\n",
    "                continue\n",
    "            \n",
    "            # 1. Check for Tool Calls (The 'ReAct' thinking process)\n",
    "            new_messages = node_update.get(\"messages\")\n",
    "            if new_messages:  # an empty list has no last message to look at\n",
    "                last_msg = new_messages[-1]\n",
    "                tool_calls = getattr(last_msg, \"tool_calls\", None)\n",
    "                content = getattr(last_msg, \"content\", None)\n",
    "                if tool_calls:\n",
    "                    for tool in tool_calls:\n",
    "                        print(f\"🛠️  AGENT CALLING TOOL: {tool['name']}\")\n",
    "                        print(f\"📝 ARGS: {json.dumps(tool['args'], indent=2)}\")\n",
    "                elif content:\n",
    "                    # Show a snippet of the AI's internal reasoning; str() keeps the slicing\n",
    "                    # safe if the content is not a plain string.\n",
    "                    content_snippet = str(content)[:150].replace('\\n', ' ')\n",
    "                    print(f\"🧠 AI THOUGHT: {content_snippet}...\")\n",
    "\n",
    "            # 2. Check for Data Extraction (JD/Resume results)\n",
    "            if \"JobDescriptionExtract_data\" in node_update:\n",
    "                jd = node_update[\"JobDescriptionExtract_data\"]\n",
    "                print(f\"✅ Extracted JD: {getattr(jd, 'job_title', 'Unknown')}\")\n",
    "            \n",
    "            if \"resume_data\" in node_update:\n",
    "                res = node_update[\"resume_data\"]\n",
    "                print(f\"✅ Extracted Resume for: {getattr(res, 'candidate_name', 'Unknown')}\")\n",
    "\n",
    "            # 3. Check for the final output keys\n",
    "            if \"skill_gap_analysis_data\" in node_update:\n",
    "                print(\"🎯 Skill Gap Analysis Completed.\")\n",
    "                \n",
    "            if \"learning_roadmap\" in node_update or \"final_roadmap\" in node_update:\n",
    "                print(\"🏁 Final Roadmap Constructed.\")\n",
    "\n",
    "            print(\"\\n\" + \"=\"*50 + \"\\n\")\n",
    "\n",
    "    # Access the final state after the stream ends\n",
    "    final_state = graph.get_state(config)\n",
    "    print(\"✨ Stream Finished. Final state captured.\")\n",
    "    return final_state.values\n",
    "\n",
    "# --- Example Usage ---\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a36ecb1",
   "metadata": {},
   "outputs": [],
   "source": [
    "config = {\"configurable\": {\"thread_id\": \"debug_123\"}}\n",
    "final_result = run_graph_with_stream(graph, initial_input, config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4e502949",
   "metadata": {},
   "outputs": [],
   "source": [
    "final_result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "53ba21aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "///break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5afbce5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "final_state"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25a6b5b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Define the keys your React frontend actually needs\n",
    "REQUIRED_KEYS = [\"candidate_name\", \"skill_gap_analysis_data\", \"mermaid_code\", \"final_roadmap\"]\n",
    "\n",
    "def export_ui_payload(state, filename=\"ai_output.json\"):\n",
    "    \"\"\"\n",
    "    Extracts specific keys from the graph state and ensures \n",
    "    Pydantic objects are dumped to dicts for JSON compatibility.\n",
    "    \"\"\"\n",
    "    ui_data = {}\n",
    "\n",
    "    for key in REQUIRED_KEYS:\n",
    "        # Get the value from the state\n",
    "        val = state.get(key)\n",
    "        \n",
    "        if val is None:\n",
    "            continue\n",
    "\n",
    "        # Check if the value is a Pydantic object (has .model_dump())\n",
    "        # This fixes the \"skill_gap_analysis_data as a string\" issue\n",
    "        if hasattr(val, \"model_dump\"):\n",
    "            ui_data[key] = val.model_dump()\n",
    "        else:\n",
    "            # If it's already a dict (final_roadmap) or string (mermaid_code)\n",
    "            ui_data[key] = val\n",
    "\n",
    "    # Save to the local file\n",
    "    with open(filename, \"w\", encoding=\"utf-8\") as f:\n",
    "        json.dump(ui_data, f, indent=2)\n",
    "    \n",
    "    print(f\"✅ UI Payload successfully exported to {filename}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26c10157",
   "metadata": {},
   "outputs": [],
   "source": [
    "export_ui_payload(final_state)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "478f19dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_data=['atgdata.json','buisnessdata.json','chefdata.json','casemanager.json']\n",
    "test_resumes=['ATGPDF.pdf','Business.pdf','CHEF.pdf','casemanager.pdf']\n",
    "\n",
    "\n",
    "test_resume_path=r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\Testresume\\{test_resumes}\"\n",
    "test_data_path=r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\data\\{test_data}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "83f3bd72",
   "metadata": {},
   "outputs": [],
   "source": [
    "store_state=[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b29b7ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import uuid\n",
    "import os\n",
    "from langgraph.checkpoint.memory import MemorySaver\n",
    "\n",
    "# --- Configuration & Paths ---\n",
    "\n",
    "test_map = [\n",
    "    {\"resume\": \"ATGPDF.pdf\",         \"data\": \"atgdata.json\",      \"name\": \"Atharva_Gaykar\"},\n",
    "  \n",
    "]\n",
    "\n",
    "RESUME_DIR = r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\Testresume\"\n",
    "DATA_DIR   = r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\data\"\n",
    "\n",
    "# Windows-safe absolute output path\n",
    "OUTPUT_DIR = os.path.join(os.getcwd(), \"predictions\")\n",
    "\n",
    "if not os.path.exists(OUTPUT_DIR):\n",
    "    os.makedirs(OUTPUT_DIR)\n",
    "\n",
    "\n",
    "# --- Helper Functions ---\n",
    "\n",
    "def get_job_description_string(data_filename: str) -> str | None:\n",
    "    \"\"\"\n",
    "    Extracts the Job Description from the test data JSON and\n",
    "    formats it as a clean string for the extraction node.\n",
    "    \"\"\"\n",
    "    path = os.path.join(DATA_DIR, data_filename)\n",
    "\n",
    "    if not os.path.exists(path):\n",
    "        print(f\"⚠️  Data file not found: {path}\")\n",
    "        return None\n",
    "\n",
    "    try:\n",
    "        with open(path, \"r\", encoding=\"utf-8\") as f:\n",
    "            suite = json.load(f)\n",
    "\n",
    "        jd_obj = suite.get(\"job_description\") or suite.get(\"job_description_requirements\")\n",
    "\n",
    "        if not jd_obj:\n",
    "            print(f\"⚠️  No JD key found in {data_filename}\")\n",
    "            return None\n",
    "\n",
    "        title = jd_obj.get(\"title\") or jd_obj.get(\"job_title\", \"N/A\")\n",
    "        desc  = jd_obj.get(\"description\", \"\")\n",
    "        reqs  = jd_obj.get(\"requirements\", [])\n",
    "\n",
    "        jd_string = f\"JOB TITLE: {title}\\n\\n\"\n",
    "        if desc:\n",
    "            jd_string += f\"OVERVIEW: {desc}\\n\\n\"\n",
    "        jd_string += \"REQUIREMENTS:\\n\" + \"\\n\".join([f\"- {r}\" for r in reqs])\n",
    "\n",
    "        return jd_string\n",
    "\n",
    "    except Exception as e:\n",
    "        print(f\"❌ Error loading JD from {data_filename}: {e}\")\n",
    "        return None\n",
    "\n",
    "\n",
    "def export_ui_payload(state: dict) -> dict:\n",
    "    \"\"\"\n",
    "    Extracts required keys from graph state.\n",
    "    Converts Pydantic objects to dicts via .model_dump().\n",
    "    \"\"\"\n",
    "    REQUIRED_KEYS = [\n",
    "        \"candidate_name\",\n",
    "        \"skill_gap_analysis_data\",\n",
    "        \"mermaid_code\",\n",
    "        \"final_roadmap\",\n",
    "    ]\n",
    "    ui_data = {}\n",
    "\n",
    "    for key in REQUIRED_KEYS:\n",
    "        val = state.get(key)\n",
    "        if val is None:\n",
    "            continue\n",
    "        if hasattr(val, \"model_dump\"):\n",
    "            ui_data[key] = val.model_dump()\n",
    "        else:\n",
    "            ui_data[key] = val\n",
    "\n",
    "    return ui_data\n",
    "\n",
    "\n",
    "# --- Execution Loop ---\n",
    "\n",
    "def run_evaluation_suite(graph_instance):\n",
    "    \"\"\"\n",
    "    Runs the graph for every resume in test_map.\n",
    "    Saves UI-ready payloads as predicted_{name}.json in OUTPUT_DIR.\n",
    "    \"\"\"\n",
    "    print(f\"\\n📁 Output directory: {OUTPUT_DIR}\\n\")\n",
    "\n",
    "    for case in test_map:\n",
    "        print(f\"🚀 Processing: {case['resume']}...\")\n",
    "\n",
    "        # 1. Validate resume file exists\n",
    "        resume_path = os.path.join(RESUME_DIR, case[\"resume\"])\n",
    "        if not os.path.exists(resume_path):\n",
    "            print(f\"⚠️  Resume not found, skipping: {resume_path}\")\n",
    "            continue\n",
    "\n",
    "        # 2. Load JD string\n",
    "        jd_content = get_job_description_string(case[\"data\"])\n",
    "        if not jd_content:\n",
    "            print(f\"⚠️  Skipping {case['resume']}: JD not found in {case['data']}\")\n",
    "            continue\n",
    "\n",
    "        # 3. Build initial state\n",
    "        initial_input = {\n",
    "            \"candidate_name\":         case[\"name\"].replace(\"_\", \" \"),\n",
    "            \"file_path\":              resume_path,\n",
    "            \"job_description\":        jd_content,\n",
    "            \"resume_text\":            None,\n",
    "            \"resume_data\":            None,\n",
    "            \"extraction_error\":       None,\n",
    "            \"JobDescriptionExtract_data\": None,\n",
    "            \"skill_gap_analysis_data\": None,\n",
    "            \"messages\":               [],\n",
    "            \"mermaid_code\":           None,\n",
    "            \"final_roadmap\":          None,\n",
    "        }\n",
    "\n",
    "        # 4. Invoke graph\n",
    "        config = {\"configurable\": {\"thread_id\": str(uuid.uuid4())}}\n",
    "\n",
    "        try:\n",
    "            final_state = graph_instance.invoke(initial_input, config=config)\n",
    "\n",
    "            store_state.append(final_state)\n",
    "\n",
    "            print(f\"✅ Graph execution successful for {case['resume']}\\n\")\n",
    "\n",
    "            # 5. Export payload\n",
    "            prediction  = export_ui_payload(final_state)\n",
    "            output_file = f\"predicted_{case['name'].lower()}.json\"\n",
    "            output_path = os.path.join(OUTPUT_DIR, output_file)\n",
    "\n",
    "            with open(output_path, \"w\", encoding=\"utf-8\") as f:\n",
    "                json.dump(prediction, f, indent=2, ensure_ascii=False)\n",
    "\n",
    "            print(f\"✅ Saved: {output_path}\\n\")\n",
    "\n",
    "        except Exception as e:\n",
    "            print(f\"❌ Error during graph execution for {case['resume']}: {e}\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c1638a3f",
   "metadata": {},
   "outputs": [],
   "source": [
    "if __name__ == \"__main__\":\n",
    "    # Assuming your graph is already compiled and named 'graph'\n",
    "    run_evaluation_suite(graph)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0910b325",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import uuid\n",
    "import os\n",
    "from langgraph.checkpoint.memory import MemorySaver\n",
    "\n",
    "# --- Configuration & Paths ---\n",
    "# Mapping resumes to the JSON files containing the Ground Truth data we created\n",
    "test_map = [\n",
    "   \n",
    "    {\"resume\": \"casemanager.pdf\", \"data\": \"casemanagerdata.json\", \"name\": \"Case_Manager\"}\n",
    "]\n",
    "\n",
    "# Update these to your actual local paths where the files are stored\n",
    "RESUME_DIR = r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\Testresume\"\n",
    "DATA_DIR = r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\data\"\n",
    "OUTPUT_DIR = \"./predictions\" \n",
    "\n",
    "if not os.path.exists(OUTPUT_DIR):\n",
    "    os.makedirs(OUTPUT_DIR)\n",
    "\n",
    "# --- Helper Functions ---\n",
    "\n",
    "def get_job_description_string(data_filename):\n",
    "    \"\"\"\n",
    "    Extracts the Job Description from the test data and formats it as a clean \n",
    "    string for the Extraction Node. Handles both 'job_description' and \n",
    "    'job_description_requirements' keys.\n",
    "    \"\"\"\n",
    "    path = os.path.join(DATA_DIR, data_filename)\n",
    "    try:\n",
    "        with open(path, 'r', encoding='utf-8') as f:\n",
    "            suite = json.load(f)\n",
    "            \n",
    "            # Extract from 'job_description' or 'job_description_requirements'\n",
    "            jd_obj = suite.get(\"job_description\") or suite.get(\"job_description_requirements\")\n",
    "            \n",
    "            if not jd_obj:\n",
    "                return None\n",
    "                \n",
    "            title = jd_obj.get(\"title\") or jd_obj.get(\"job_title\", \"N/A\")\n",
    "            desc = jd_obj.get(\"description\", \"\")\n",
    "            reqs = jd_obj.get(\"requirements\", [])\n",
    "            \n",
    "            # Format as a clean string for the LLM to analyze\n",
    "            jd_string = f\"JOB TITLE: {title}\\n\\n\"\n",
    "            if desc:\n",
    "                jd_string += f\"OVERVIEW: {desc}\\n\\n\"\n",
    "            jd_string += \"REQUIREMENTS:\\n\" + \"\\n\".join([f\"- {r}\" for r in reqs])\n",
    "            \n",
    "            return jd_string\n",
    "    except Exception as e:\n",
    "        print(f\"Error loading JD from {data_filename}: {e}\")\n",
    "        return None\n",
    "\n",
    "def export_ui_payload(state):\n",
    "    \"\"\"\n",
    "    Extracts and formats state data for the UI payload.\n",
    "    Ensures Pydantic objects are converted to dicts using .model_dump().\n",
    "    \"\"\"\n",
    "    REQUIRED_KEYS = [\"candidate_name\", \"skill_gap_analysis_data\", \"mermaid_code\", \"final_roadmap\"]\n",
    "    ui_data = {}\n",
    "\n",
    "    for key in REQUIRED_KEYS:\n",
    "        val = state.get(key)\n",
    "        if val is None:\n",
    "            continue\n",
    "            \n",
    "        # If it's a Pydantic object, dump it to a dict\n",
    "        if hasattr(val, \"model_dump\"):\n",
    "            ui_data[key] = val.model_dump()\n",
    "        else:\n",
    "            # If it's already a dict, list, or string (like mermaid_code)\n",
    "            ui_data[key] = val\n",
    "    return ui_data\n",
    "\n",
    "# --- Execution Loop ---\n",
    "\n",
    "def run_evaluation_suite_re(graph_instance):\n",
    "    \"\"\"\n",
    "    Automates the graph execution for every resume in the test suite.\n",
    "    Saves the final UI-ready payloads as 'predicted_{name}.json'.\n",
    "    \"\"\"\n",
    "    for case in test_map:\n",
    "        print(f\"🚀 Processing: {case['resume']}...\")\n",
    "        \n",
    "        # 1. Prepare Inputs\n",
    "        jd_content = get_job_description_string(case['data'])\n",
    "        \n",
    "        if not jd_content:\n",
    "            print(f\"⚠️ Skipping {case['resume']}: JD not found in {case['data']}\")\n",
    "            continue\n",
    "\n",
    "        # The 'job_description' key must match your extraction node's expectation\n",
    "        initial_input = {\n",
    "            \"candidate_name\": case['name'].replace(\"_\", \" \"),\n",
    "            \"resume_path\": os.path.join(RESUME_DIR, case['resume']),\n",
    "            \"job_description\": jd_content, \n",
    "            \"resume_text\": None # Assuming input_node or extraction node loads the PDF\n",
    "        }\n",
    "\n",
    "        # 2. Invoke Graph with a unique thread\n",
    "        thread_id = str(uuid.uuid4())\n",
    "        config = {\"configurable\": {\"thread_id\": thread_id}}\n",
    "        \n",
    "        try:\n",
    "            # Execution\n",
    "            final_state = graph_instance.invoke(initial_input, config=config)\n",
    "            \n",
    "            # 3. Process and Save UI Payload\n",
    "            prediction = export_ui_payload(final_state)\n",
    "            output_file = f\"predicted_{case['name'].lower()}.json\"\n",
    "            output_path = os.path.join(OUTPUT_DIR, output_file)\n",
    "            \n",
    "            with open(output_path, \"w\", encoding=\"utf-8\") as f:\n",
    "                json.dump(prediction, f, indent=2)\n",
    "                \n",
    "            print(f\"✅ Success! Prediction saved to: {output_path}\")\n",
    "            \n",
    "        except Exception as e:\n",
    "            print(f\"❌ Error during graph execution for {case['resume']}: {e}\")\n",
    "\n",
    "# --- Example of Triggering ---\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a8ef7f0b",
   "metadata": {},
   "outputs": [],
   "source": [
    "if __name__ == \"__main__\":\n",
    "    # Assuming your graph is already compiled and named 'graph'\n",
    "    run_evaluation_suite_re(graph)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e37e4370",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import re\n",
    "from rouge_score import rouge_scorer\n",
    "\n",
    "def normalize(text):\n",
    "    if not text: return \"\"\n",
    "    return re.sub(r'\\W+', ' ', str(text).lower()).strip()\n",
    "\n",
    "def calculate_f1(target_set, predicted_set):\n",
    "    if not target_set and not predicted_set: return 1.0\n",
    "    if not target_set or not predicted_set: return 0.0\n",
    "    intersection = target_set.intersection(predicted_set)\n",
    "    precision = len(intersection) / len(predicted_set)\n",
    "    recall = len(intersection) / len(target_set)\n",
    "    if (precision + recall) == 0:\n",
    "        return 0.0\n",
    "    return 2 * (precision * recall) / (precision + recall)\n",
    "\n",
    "# ← replaces your manual calculate_lcs + get_rouge_l\n",
    "scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)\n",
    "\n",
    "def get_rouge_l(reference, candidate):\n",
    "    if not reference or not candidate:\n",
    "        return 0.0\n",
    "    scores = scorer.score(reference, candidate)\n",
    "    return scores['rougeL'].fmeasure     # F1 score directly\n",
    "\n",
    "\n",
    "def run_benchmarking_report(target_data_dir, predicted_data_dir, mapping):\n",
    "    print(\"\\n📊 --- AI ENGINE PERFORMANCE REPORT ---\")\n",
    "    print(f\"{'Test Case':<20} | {'Skill F1':<10} | {'Reasoning (RG-L)':<15} | {'Retrieval (Hit)':<15}\")\n",
    "    print(\"-\" * 75)\n",
    "\n",
    "    final_report = {}\n",
    "\n",
    "    for case in mapping:\n",
    "        name = case['name']\n",
    "        target_path = os.path.join(target_data_dir, case['data'])\n",
    "        pred_path = os.path.join(predicted_data_dir, f\"predicted_{name.lower()}.json\")\n",
    "\n",
    "        if not os.path.exists(target_path) or not os.path.exists(pred_path):\n",
    "            print(f\"⚠️  Skipping {name} — file not found\")\n",
    "            continue\n",
    "\n",
    "        try:\n",
    "            with open(target_path, 'r', encoding='utf-8') as f:\n",
    "                target_json = json.load(f)[\"ai_target\"]\n",
    "            with open(pred_path, 'r', encoding='utf-8') as f:\n",
    "                pred_json = json.load(f)\n",
    "\n",
    "            # 1. Skill Extraction F1\n",
    "            target_skills = {normalize(g['skill_name']) for g in target_json['skill_gap_analysis_data']['analyzed_gaps']}\n",
    "            pred_skills   = {normalize(g['skill_name']) for g in pred_json['skill_gap_analysis_data']['analyzed_gaps']}\n",
    "            skill_f1 = calculate_f1(target_skills, pred_skills)\n",
    "\n",
    "            # 2. Reasoning Quality — ROUGE-L via library\n",
    "            reasoning_rouge = get_rouge_l(\n",
    "                target_json['skill_gap_analysis_data']['executive_summary'],\n",
    "                pred_json['skill_gap_analysis_data']['executive_summary']\n",
    "            )\n",
    "\n",
    "            # 3. Retrieval Hit Rate\n",
    "            target_ids = {normalize(c['course_id']) for c in target_json['final_roadmap']['roadmap']}\n",
    "            pred_ids   = {normalize(c['course_id']) for c in pred_json['final_roadmap']['roadmap']}\n",
    "            hit_count  = len(target_ids.intersection(pred_ids))\n",
    "            hit_rate   = hit_count / len(target_ids) if target_ids else 0.0\n",
    "\n",
    "            print(f\"{name:<20} | {skill_f1:>8.2%} | {reasoning_rouge:>15.2%} | {hit_rate:>15.2%}\")\n",
    "\n",
    "            final_report[name] = {\n",
    "                \"skill_extraction_f1\": round(skill_f1, 4),\n",
    "                \"reasoning_rouge_l\":   round(reasoning_rouge, 4),\n",
    "                \"retrieval_hit_rate\":  round(hit_rate, 4),\n",
    "            }\n",
    "\n",
    "        except Exception as e:\n",
    "            print(f\"⚠️  Error processing {name}: {e}\")\n",
    "\n",
    "    # Average across all test cases\n",
    "    if final_report:\n",
    "        avg_f1     = sum(v[\"skill_extraction_f1\"] for v in final_report.values()) / len(final_report)\n",
    "        avg_rouge  = sum(v[\"reasoning_rouge_l\"]   for v in final_report.values()) / len(final_report)\n",
    "        avg_hit    = sum(v[\"retrieval_hit_rate\"]   for v in final_report.values()) / len(final_report)\n",
    "        print(\"-\" * 75)\n",
    "        print(f\"{'AVERAGE':<20} | {avg_f1:>8.2%} | {avg_rouge:>15.2%} | {avg_hit:>15.2%}\")\n",
    "\n",
    "    return final_report\n",
    "\n",
    "\n",
    "# --- Mapping & Paths ---\n",
    "mapping = [\n",
    "    {\"data\": \"atgdata.json\",      \"name\": \"Atharva_Gaykar\"},\n",
    "    \n",
    "]\n",
    "\n",
    "DATA_DIR = r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\data\"\n",
    "PRED_DIR = \"./predictions\"\n",
    "\n",
    "report = run_benchmarking_report(DATA_DIR, PRED_DIR, mapping)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70ced174",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import re\n",
    "\n",
    "def normalize(text):\n",
    "    \"\"\"Clean and normalize text for comparison.\"\"\"\n",
    "    if not text: return \"\"\n",
    "    return re.sub(r'\\W+', ' ', str(text).lower()).strip()\n",
    "\n",
    "def calculate_lcs(X, Y):\n",
    "    \"\"\"Calculates the length of the Longest Common Subsequence.\"\"\"\n",
    "    m, n = len(X), len(Y)\n",
    "    L = [[0] * (n + 1) for _ in range(m + 1)]\n",
    "    for i in range(m + 1):\n",
    "        for j in range(n + 1):\n",
    "            if i == 0 or j == 0:\n",
    "                L[i][j] = 0\n",
    "            elif X[i-1] == Y[j-1]:\n",
    "                L[i][j] = L[i-1][j-1] + 1\n",
    "            else:\n",
    "                L[i][j] = max(L[i-1][j], L[i][j-1])\n",
    "    return L[m][n]\n",
    "\n",
    "def get_rouge_l(reference, candidate):\n",
    "    \"\"\"Calculates ROUGE-L F1 score using LCS.\"\"\"\n",
    "    if not reference or not candidate:\n",
    "        return 0.0\n",
    "    ref_tokens = normalize(reference).split()\n",
    "    cand_tokens = normalize(candidate).split()\n",
    "    if not ref_tokens or not cand_tokens:\n",
    "        return 0.0\n",
    "    lcs_count = calculate_lcs(ref_tokens, cand_tokens)\n",
    "    recall = lcs_count / len(ref_tokens)\n",
    "    precision = lcs_count / len(cand_tokens)\n",
    "    if (recall + precision) == 0:\n",
    "        return 0.0\n",
    "    f1 = (2 * recall * precision) / (recall + precision)\n",
    "    return f1\n",
    "\n",
    "def calculate_f1(target_set, predicted_set):\n",
    "    \"\"\"Calculates Precision, Recall, and F1 for sets of entities (Skills/IDs).\"\"\"\n",
    "    if not target_set and not predicted_set: return 1.0\n",
    "    if not target_set or not predicted_set: return 0.0\n",
    "    intersection = target_set.intersection(predicted_set)\n",
    "    precision = len(intersection) / len(predicted_set)\n",
    "    recall = len(intersection) / len(target_set)\n",
    "    if (precision + recall) == 0:\n",
    "        return 0.0\n",
    "    return 2 * (precision * recall) / (precision + recall)\n",
    "\n",
    "def run_benchmarking_report(target_data_dir, predicted_data_dir, mapping):\n",
    "    \"\"\"\n",
    "    Main evaluation loop benchmarking predictions against ground truth.\n",
    "    Handles missing keys and files gracefully.\n",
    "    \"\"\"\n",
    "    print(\"\\n📊 --- AI ENGINE PERFORMANCE REPORT ---\")\n",
    "    print(f\"{'Test Case':<20} | {'Skill F1':<10} | {'Reasoning (RG-L)':<15} | {'Retrieval (Hit)':<15}\")\n",
    "    print(\"-\" * 75)\n",
    "\n",
    "    final_report = {}\n",
    "\n",
    "    for case in mapping:\n",
    "        name = case['name']\n",
    "        target_path = os.path.join(target_data_dir, case['data'])\n",
    "        # Look for the predicted file (lowercase name mapper)\n",
    "        pred_filename = f\"predicted_{name.lower()}.json\"\n",
    "        pred_path = os.path.join(predicted_data_dir, pred_filename)\n",
    "        \n",
    "        # 1. Check for File Existence\n",
    "        if not os.path.exists(target_path):\n",
    "            print(f\"⚠️  Skipping {name} — Target file '{case['data']}' not found.\")\n",
    "            continue\n",
    "        if not os.path.exists(pred_path):\n",
    "            print(f\"⚠️  Skipping {name} — Prediction file '{pred_filename}' not found.\")\n",
    "            continue\n",
    "            \n",
    "        try:\n",
    "            with open(target_path, 'r', encoding='utf-8') as f:\n",
    "                target_json = json.load(f).get(\"ai_target\", {})\n",
    "            with open(pred_path, 'r', encoding='utf-8') as f:\n",
    "                pred_json = json.load(f)\n",
    "                \n",
    "            # --- Metric 1: Skill Extraction Accuracy (F1) ---\n",
    "            target_gap_data = target_json.get('skill_gap_analysis_data', {})\n",
    "            pred_gap_data = pred_json.get('skill_gap_analysis_data', {})\n",
    "            \n",
    "            target_skills = {normalize(g.get('skill_name')) for g in target_gap_data.get('analyzed_gaps', [])}\n",
    "            pred_skills = {normalize(g.get('skill_name')) for g in pred_gap_data.get('analyzed_gaps', [])}\n",
    "            skill_f1 = calculate_f1(target_skills, pred_skills)\n",
    "            \n",
    "            # --- Metric 2: Reasoning Quality (ROUGE-L) ---\n",
    "            target_summary = target_gap_data.get('executive_summary', \"\")\n",
    "            pred_summary = pred_gap_data.get('executive_summary', \"\")\n",
    "            reasoning_rouge = get_rouge_l(target_summary, pred_summary)\n",
    "            \n",
    "            # --- Metric 3: Retrieval Precision (Top-1 Hit Rate) ---\n",
    "            # Extract expected IDs from target\n",
    "            target_roadmap = target_json.get('final_roadmap', {}).get('roadmap', [])\n",
    "            target_ids = {normalize(c.get('course_id')) for c in target_roadmap}\n",
    "            \n",
    "            # Extract predicted IDs from prediction (Checking common possible keys)\n",
    "            pred_roadmap_obj = pred_json.get('final_roadmap', {})\n",
    "            # If final_roadmap is a list directly in some versions\n",
    "            if isinstance(pred_roadmap_obj, list):\n",
    "                pred_roadmap = pred_roadmap_obj\n",
    "            else:\n",
    "                pred_roadmap = pred_roadmap_obj.get('roadmap', [])\n",
    "            \n",
    "            pred_ids = {normalize(c.get('course_id')) for c in pred_roadmap}\n",
    "            \n",
    "            if target_ids:\n",
    "                hit_count = len(target_ids.intersection(pred_ids))\n",
    "                hit_rate = hit_count / len(target_ids)\n",
    "            else:\n",
    "                hit_rate = 0.0\n",
    "            \n",
    "            # Print status row\n",
    "            print(f\"{name:<20} | {skill_f1:>8.2%} | {reasoning_rouge:>15.2%} | {hit_rate:>15.2%}\")\n",
    "            \n",
    "            final_report[name] = {\n",
    "                \"skill_extraction_f1\": skill_f1,\n",
    "                \"reasoning_rouge_l\": reasoning_rouge,\n",
    "                \"retrieval_hit_rate\": hit_rate\n",
    "            }\n",
    "        except Exception as e:\n",
    "            print(f\"⚠️  Error processing {name}: {str(e)}\")\n",
    "\n",
    "    return final_report\n",
    "\n",
    "# --- Mapping & Paths ---\n",
    "mapping = [\n",
    "    {\"data\": \"atgdata.json\", \"name\": \"Atharva_Gaykar\"},\n",
    "    {\"data\": \"buisnessdata.json\", \"name\": \"Business_Manager\"},\n",
    "    {\"data\": \"chefdata.json\", \"name\": \"Executive_Chef\"},\n",
    "    {\"data\": \"casemanagerdata.json\", \"name\": \"Case_Manager\"}\n",
    "]\n",
    "\n",
    "# Note: Ensure these paths are correct for your local environment\n",
    "DATA_DIR = r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\data\"\n",
    "PRED_DIR = \"./predictions\"\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    report = run_benchmarking_report(DATA_DIR, PRED_DIR, mapping)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "73a45cfd",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import re\n",
    "from rouge_score import rouge_scorer\n",
    "\n",
    "# -----------------------------\n",
    "# Scorer instance (created once)\n",
    "# -----------------------------\n",
    "scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)\n",
    "\n",
    "\n",
    "def normalize(text):\n",
    "    if not text: return \"\"\n",
    "    return re.sub(r'\\W+', ' ', str(text).lower()).strip()\n",
    "\n",
    "\n",
    "def get_rouge_l(reference, candidate):\n",
    "    if not reference or not candidate:\n",
    "        return 0.0\n",
    "    return scorer.score(reference, candidate)['rougeL'].fmeasure\n",
    "\n",
    "\n",
    "def calculate_skill_f1(target_skills, pred_skills):\n",
    "    if not target_skills and not pred_skills: return 1.0\n",
    "    if not target_skills or not pred_skills: return 0.0\n",
    "\n",
    "    tp = 0\n",
    "    matched_targets = set()\n",
    "\n",
    "    for p in pred_skills:\n",
    "        for t in target_skills:\n",
    "            if t in matched_targets: continue\n",
    "            if p in t or t in p:\n",
    "                tp += 1\n",
    "                matched_targets.add(t)\n",
    "                break\n",
    "\n",
    "    precision = tp / len(pred_skills)\n",
    "    recall = tp / len(target_skills)\n",
    "\n",
    "    if (precision + recall) == 0:\n",
    "        return 0.0\n",
    "    return 2 * (precision * recall) / (precision + recall)\n",
    "\n",
    "\n",
    "def calculate_retrieval_hit(target_ids, pred_ids):\n",
    "    if not target_ids: return 0.0\n",
    "    if not pred_ids: return 0.0\n",
    "    return len(target_ids.intersection(pred_ids)) / len(target_ids)\n",
    "\n",
    "\n",
    "def run_benchmarking_report(target_data_dir, predicted_data_dir, mapping):\n",
    "    \"\"\"Print a per-test-case metrics table and return the scores.\n",
    "\n",
    "    For every entry in mapping, the target file (case['data'] inside\n",
    "    target_data_dir) and the prediction file\n",
    "    (predicted_<name lowercased>.json inside predicted_data_dir) are loaded\n",
    "    and compared on three metrics: skill F1, ROUGE-L of the executive\n",
    "    summaries, and the retrieval hit rate of the roadmap course ids.\n",
    "\n",
    "    Args:\n",
    "        target_data_dir: Directory holding the target JSON files.\n",
    "        predicted_data_dir: Directory holding the prediction JSON files.\n",
    "        mapping: Iterable of dicts with the keys 'name' and 'data'.\n",
    "\n",
    "    Returns:\n",
    "        Dict mapping each successfully scored case name to its three\n",
    "        metrics, rounded to 4 decimals. Cases with a missing file or a\n",
    "        processing error are reported on stdout and left out.\n",
    "    \"\"\"\n",
    "    # Header and data rows share the same column widths so the table lines up.\n",
    "    header = f\"{'Test Case':<20} | {'Skill F1':<10} | {'Reasoning (RG-L)':<16} | {'Retrieval (Hit)':<15}\"\n",
    "    rule = \"-\" * len(header)\n",
    "\n",
    "    print(\"\\n📊 --- AI ENGINE PERFORMANCE REPORT ---\")\n",
    "    print(header)\n",
    "    print(rule)\n",
    "\n",
    "    final_report = {}\n",
    "\n",
    "    for case in mapping:\n",
    "        name = case['name']\n",
    "        target_path = os.path.join(target_data_dir, case['data'])\n",
    "        pred_path = os.path.join(predicted_data_dir, f\"predicted_{name.lower()}.json\")\n",
    "\n",
    "        # Report every skipped case, whichever of the two files is missing.\n",
    "        missing = [\n",
    "            label\n",
    "            for label, path in ((\"target\", target_path), (\"prediction\", pred_path))\n",
    "            if not os.path.exists(path)\n",
    "        ]\n",
    "        if missing:\n",
    "            print(f\"{name:<20} | SKIPPED — {' and '.join(missing)} file not found\")\n",
    "            continue\n",
    "\n",
    "        try:\n",
    "            with open(target_path, 'r', encoding='utf-8') as f:\n",
    "                target_json = json.load(f).get(\"ai_target\", {})\n",
    "            with open(pred_path, 'r', encoding='utf-8') as f:\n",
    "                pred_json = json.load(f)\n",
    "\n",
    "            # --- Metric 1: Skill F1 ---\n",
    "            target_gap = target_json.get('skill_gap_analysis_data', {})\n",
    "            pred_gap   = pred_json.get('skill_gap_analysis_data', {})\n",
    "\n",
    "            target_skills = {normalize(g.get('skill_name')) for g in target_gap.get('analyzed_gaps', [])}\n",
    "            pred_skills   = {normalize(g.get('skill_name')) for g in pred_gap.get('analyzed_gaps', [])}\n",
    "            skill_f1      = calculate_skill_f1(target_skills, pred_skills)\n",
    "\n",
    "            # --- Metric 2: ROUGE-L ---\n",
    "            reasoning_rouge = get_rouge_l(\n",
    "                target_gap.get('executive_summary', \"\"),\n",
    "                pred_gap.get('executive_summary', \"\")\n",
    "            )\n",
    "\n",
    "            # --- Metric 3: Retrieval Hit Rate ---\n",
    "            target_ids = {normalize(c.get('course_id')) for c in target_json.get('final_roadmap', {}).get('roadmap', [])}\n",
    "\n",
    "            # The predicted roadmap may be a bare list or a dict wrapping one under 'roadmap'.\n",
    "            pred_roadmap_obj = pred_json.get('final_roadmap', {})\n",
    "            pred_roadmap     = pred_roadmap_obj if isinstance(pred_roadmap_obj, list) else pred_roadmap_obj.get('roadmap', [])\n",
    "            pred_ids         = {normalize(c.get('course_id')) for c in pred_roadmap}\n",
    "\n",
    "            hit_rate = calculate_retrieval_hit(target_ids, pred_ids)\n",
    "\n",
    "            print(f\"{name:<20} | {skill_f1:>10.2%} | {reasoning_rouge:>16.2%} | {hit_rate:>15.2%}\")\n",
    "\n",
    "            final_report[name] = {\n",
    "                \"skill_extraction_f1\": round(skill_f1, 4),\n",
    "                \"reasoning_rouge_l\":   round(reasoning_rouge, 4),\n",
    "                \"retrieval_hit_rate\":  round(hit_rate, 4),\n",
    "            }\n",
    "\n",
    "        except Exception as e:\n",
    "            # Best effort: a broken case is reported and left out of the report.\n",
    "            print(f\"⚠️  Error processing {name}: {str(e)}\")\n",
    "\n",
    "    # --- Average Row ---\n",
    "    if final_report:\n",
    "        avg_f1    = sum(v[\"skill_extraction_f1\"] for v in final_report.values()) / len(final_report)\n",
    "        avg_rouge = sum(v[\"reasoning_rouge_l\"]   for v in final_report.values()) / len(final_report)\n",
    "        avg_hit   = sum(v[\"retrieval_hit_rate\"]   for v in final_report.values()) / len(final_report)\n",
    "        print(rule)\n",
    "        print(f\"{'AVERAGE':<20} | {avg_f1:>10.2%} | {avg_rouge:>16.2%} | {avg_hit:>15.2%}\")\n",
    "\n",
    "    return final_report\n",
    "\n",
    "\n",
    "# --- Mapping & Paths ---\n",
    "# Each entry pairs a target file name inside DATA_DIR with the case name that\n",
    "# run_benchmarking_report uses to locate predicted_<name lowercased>.json in PRED_DIR.\n",
    "mapping = [\n",
    "    {\"data\": \"atgdata.json\",      \"name\": \"Atharva_Gaykar\"},\n",
    "    {\"data\": \"buisnessdata.json\", \"name\": \"Business_Manager\"},\n",
    "    {\"data\": \"chefdata.json\",     \"name\": \"Executive_Chef\"},\n",
    "    {\"data\": \"casemanager.json\",  \"name\": \"Case_Manager\"}\n",
    "]\n",
    "\n",
    "# Both directories can be overridden through environment variables so the\n",
    "# notebook is not tied to one machine; the defaults are the original locations.\n",
    "DATA_DIR = os.getenv(\n",
    "    \"AI_EVAL_DATA_DIR\",\n",
    "    r\"C:\\Users\\ATHARVA\\Downloads\\my codes\\web\\AdaptiveEngineService\\AI_Engine_Evaluation\\Testcases\\data\",\n",
    ")\n",
    "PRED_DIR = os.getenv(\"AI_EVAL_PRED_DIR\", \"./predictions\")\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    run_benchmarking_report(DATA_DIR, PRED_DIR, mapping)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}