Commit ·
a8b79ed
0
Parent(s):
First Commit
Browse files- .gitignore +117 -0
- README.md +81 -0
- __init__.py +69 -0
- agents/__init__.py +3 -0
- agents/nodes.py +177 -0
- agents/output_schema.py +19 -0
- classes/__init__.py +3 -0
- classes/classes.py +63 -0
- langgraph.json +10 -0
- langgraph_init.py +4 -0
- nodes/__init__.py +13 -0
- nodes/createdraft.py +0 -0
- nodes/initializing.py +225 -0
- nodes/research_workflow.py +81 -0
- nodes/selfconsistency.py +85 -0
- nodes/test_workflow.py +23 -0
- nodes/variations.py +73 -0
- prompts.md +27 -0
- prompts/__init__.py +3 -0
- prompts/templates.py +239 -0
- setup.py +0 -0
- testing.ipynb +1069 -0
- tools/TavilySearch.py +230 -0
- tools/__init__.py +9 -0
- utils/__init__.py +3 -0
- utils/config.py +25 -0
- utils/document_processing.py +443 -0
- utils/errors.py +20 -0
- utils/langfuse_handler.py +0 -0
- utils/llm_client.py +141 -0
- utils/vector_store.py +156 -0
- workflow.py +210 -0
.gitignore
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
build/
|
| 8 |
+
develop-eggs/
|
| 9 |
+
dist/
|
| 10 |
+
downloads/
|
| 11 |
+
eggs/
|
| 12 |
+
.eggs/
|
| 13 |
+
lib/
|
| 14 |
+
lib64/
|
| 15 |
+
parts/
|
| 16 |
+
sdist/
|
| 17 |
+
var/
|
| 18 |
+
wheels/
|
| 19 |
+
*.egg-info/
|
| 20 |
+
.installed.cfg
|
| 21 |
+
*.egg
|
| 22 |
+
MANIFEST
|
| 23 |
+
|
| 24 |
+
# Python Virtual Environments
|
| 25 |
+
env/
|
| 26 |
+
venv/
|
| 27 |
+
ENV/
|
| 28 |
+
env.bak/
|
| 29 |
+
venv.bak/
|
| 30 |
+
.env
|
| 31 |
+
.venv
|
| 32 |
+
|
| 33 |
+
# Jupyter Notebook
|
| 34 |
+
.ipynb_checkpoints
|
| 35 |
+
*/.ipynb_checkpoints/*
|
| 36 |
+
|
| 37 |
+
# IDE specific files
|
| 38 |
+
.idea/
|
| 39 |
+
.vscode/
|
| 40 |
+
*.swp
|
| 41 |
+
*.swo
|
| 42 |
+
*.swn
|
| 43 |
+
.DS_Store
|
| 44 |
+
|
| 45 |
+
# API keys and secrets
|
| 46 |
+
.env
|
| 47 |
+
.secrets
|
| 48 |
+
*.pem
|
| 49 |
+
*.key
|
| 50 |
+
langsmith_api_key.txt
|
| 51 |
+
|
| 52 |
+
# Logs and databases
|
| 53 |
+
*.log
|
| 54 |
+
*.sql
|
| 55 |
+
*.sqlite
|
| 56 |
+
logs/
|
| 57 |
+
|
| 58 |
+
# Local development settings
|
| 59 |
+
local_settings.py
|
| 60 |
+
|
| 61 |
+
# Pytest and coverage reports
|
| 62 |
+
.pytest_cache/
|
| 63 |
+
htmlcov/
|
| 64 |
+
.tox/
|
| 65 |
+
.coverage
|
| 66 |
+
.coverage.*
|
| 67 |
+
coverage.xml
|
| 68 |
+
*.cover
|
| 69 |
+
.hypothesis/
|
| 70 |
+
.pylintrcls
|
| 71 |
+
|
| 72 |
+
# Documentation
|
| 73 |
+
docs/_build/
|
| 74 |
+
site/
|
| 75 |
+
|
| 76 |
+
# Type checking
|
| 77 |
+
.mypy_cache/
|
| 78 |
+
.dmypy.json
|
| 79 |
+
dmypy.json
|
| 80 |
+
.pyre/
|
| 81 |
+
|
| 82 |
+
# LangChain related
|
| 83 |
+
.langchain.db
|
| 84 |
+
langsmith.db
|
| 85 |
+
.langgraph_api/
|
| 86 |
+
|
| 87 |
+
# Temporary files
|
| 88 |
+
tmp/
|
| 89 |
+
tests/
|
| 90 |
+
temp/
|
| 91 |
+
*.tmp
|
| 92 |
+
*.temp
|
| 93 |
+
|
| 94 |
+
# LangGraph specific
|
| 95 |
+
langgraph.db
|
| 96 |
+
*.db
|
| 97 |
+
|
| 98 |
+
# Output files (if you generate reports/documents)
|
| 99 |
+
output/
|
| 100 |
+
reports/
|
| 101 |
+
generated/
|
| 102 |
+
|
| 103 |
+
# Test artifacts
|
| 104 |
+
.pytest_cache/
|
| 105 |
+
test-results/
|
| 106 |
+
test_output/
|
| 107 |
+
|
| 108 |
+
# OS specific
|
| 109 |
+
Thumbs.db
|
| 110 |
+
ehthumbs.db
|
| 111 |
+
Desktop.ini
|
| 112 |
+
|
| 113 |
+
# Github
|
| 114 |
+
.github/
|
| 115 |
+
|
| 116 |
+
# Miscellaneous
|
| 117 |
+
parsed_text.json
|
README.md
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Job Writer Module
|
| 2 |
+
|
| 3 |
+
A modular, well-structured package for creating tailored job applications using LangChain and LangGraph with LangSmith observability.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
- Creates personalized job application materials based on resumes and job descriptions
|
| 8 |
+
- Supports multiple application types: cover letters, bullet points, and LinkedIn messages
|
| 9 |
+
- Uses RAG for personalization and web search for company research
|
| 10 |
+
- Provides human-in-the-loop feedback integration
|
| 11 |
+
- Implements self-consistency voting for quality control
|
| 12 |
+
|
| 13 |
+
## Installation
|
| 14 |
+
|
| 15 |
+
```bash
|
| 16 |
+
# Install the package and its dependencies
|
| 17 |
+
pip install -e .
|
| 18 |
+
|
| 19 |
+
# Install development dependencies (including linting tools)
|
| 20 |
+
pip install -r requirements-dev.txt
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
## Code Standards and Linting
|
| 24 |
+
|
| 25 |
+
This project uses several tools to ensure code quality:
|
| 26 |
+
|
| 27 |
+
1. **Black** - Code formatter that enforces consistent style
|
| 28 |
+
2. **isort** - Sorts imports according to best practices
|
| 29 |
+
3. **Flake8** - Style guide enforcement
|
| 30 |
+
4. **mypy** - Static type checking
|
| 31 |
+
|
| 32 |
+
### Running the Linters
|
| 33 |
+
|
| 34 |
+
```bash
|
| 35 |
+
# Format code with Black
|
| 36 |
+
black job_writer/
|
| 37 |
+
|
| 38 |
+
# Sort imports
|
| 39 |
+
isort job_writer/
|
| 40 |
+
|
| 41 |
+
# Check style with Flake8
|
| 42 |
+
flake8 job_writer/
|
| 43 |
+
|
| 44 |
+
# Type checking with mypy
|
| 45 |
+
mypy job_writer/
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
### Pre-commit Hooks
|
| 49 |
+
|
| 50 |
+
We use pre-commit hooks to automatically run linters before each commit:
|
| 51 |
+
|
| 52 |
+
```bash
|
| 53 |
+
# Install the pre-commit hooks
|
| 54 |
+
pip install pre-commit
|
| 55 |
+
pre-commit install
|
| 56 |
+
|
| 57 |
+
# You can also run the hooks manually
|
| 58 |
+
pre-commit run --all-files
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
## Usage Example
|
| 62 |
+
|
| 63 |
+
```python
|
| 64 |
+
import asyncio
|
| 65 |
+
from job_writer.workflow import run_job_application_writer
|
| 66 |
+
|
| 67 |
+
# Run the job application writer
|
| 68 |
+
result = asyncio.run(run_job_application_writer(
|
| 69 |
+
resume_path="path/to/resume.pdf",
|
| 70 |
+
job_desc_path="https://example.com/job-posting",
|
| 71 |
+
content="cover_letter"
|
| 72 |
+
))
|
| 73 |
+
|
| 74 |
+
print(result["final"])
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
Alternatively, you can use the command-line interface:
|
| 78 |
+
|
| 79 |
+
```bash
|
| 80 |
+
python -m job_writer.workflow --resume path/to/resume.pdf --job https://example.com/job-posting --type cover_letter
|
| 81 |
+
```
|
__init__.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Job Application Writer Package
|
| 3 |
+
|
| 4 |
+
A modular, well-structured package for creating tailored job applications
|
| 5 |
+
using LangChain and LangGraph with LangSmith observability.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
__version__ = "0.1.0"
|
| 9 |
+
|
| 10 |
+
import os, getpass
|
| 11 |
+
import logging
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from dotenv import load_dotenv
|
| 14 |
+
from langfuse import Langfuse
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Set up logging
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
logger.setLevel(logging.INFO)
|
| 20 |
+
log_dir = Path(__file__).parent / 'logs'
|
| 21 |
+
log_dir.mkdir(exist_ok=True)
|
| 22 |
+
logger.addHandler(logging.FileHandler(log_dir / 'job_writer.log', mode='a'))
|
| 23 |
+
logger.info("Logger initialized. Writing to %s", Path(__file__).parent / 'job_writer.log')
|
| 24 |
+
|
| 25 |
+
# Load environment variables from .env file
|
| 26 |
+
env_path = Path(__file__).parent / '.env'
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def _set_env(var: str):
|
| 30 |
+
if not os.environ.get(var):
|
| 31 |
+
os.environ[var] = getpass.getpass(f"{var}: ")
|
| 32 |
+
logger.info(f"{var} set to {os.environ[var]}")
|
| 33 |
+
|
| 34 |
+
if env_path.exists():
|
| 35 |
+
logger.info("Loading environment variables from %s", env_path)
|
| 36 |
+
load_dotenv(dotenv_path=env_path, override=True)
|
| 37 |
+
else:
|
| 38 |
+
logger.warning(".env file not found at %s. Using system environment variables.", env_path)
|
| 39 |
+
|
| 40 |
+
# Check for critical environment variables
|
| 41 |
+
if not os.getenv("TAVILY_API_KEY"):
|
| 42 |
+
logger.warning("TAVILY_API_KEY environment variable is not set." \
|
| 43 |
+
" Failed to get TAVILY_API_KEY at Path %s", env_path)
|
| 44 |
+
_set_env("TAVILY_API_KEY")
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
if not os.getenv("GEMINI_API_KEY"):
|
| 48 |
+
logger.warning("GEMINI_API_KEY environment variable is not set. " \
|
| 49 |
+
"Failed to get GEMINI_API_KEY at Path %s", env_path)
|
| 50 |
+
_set_env("GEMINI_API_KEY")
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
if not os.getenv("PINECONE_API_KEY"):
|
| 54 |
+
logger.warning("PINECONE_API_KEY environment variable is not set." \
|
| 55 |
+
" Failed to get PINECONE_API_KEY at Path %s", env_path)
|
| 56 |
+
_set_env("PINECONE_API_KEY")
|
| 57 |
+
|
| 58 |
+
if not os.getenv("LANGFUSE_PUBLIC_KEY"):
|
| 59 |
+
logger.warning("LANGFUSE_PUBLIC_KEY environment variable is not set." \
|
| 60 |
+
" Failed to get LANGFUSE_PUBLIC_KEY at Path %s", env_path)
|
| 61 |
+
_set_env("LANGFUSE_PUBLIC_KEY")
|
| 62 |
+
|
| 63 |
+
if not os.getenv("LANGFUSE_SECRET_KEY"):
|
| 64 |
+
logger.warning("LANGFUSE_SECRET_KEY environment variable is not set." \
|
| 65 |
+
" Failed to get LANGFUSE_SECRET_KEY at Path %s", env_path)
|
| 66 |
+
_set_env("LANGFUSE_SECRET_KEY")
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
__all__: list[str] = ["job_app_graph", "workflows/research_workflow"]
|
agents/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Agent modules for job application generation.
|
| 3 |
+
"""
|
agents/nodes.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Node functions for the job application writer LangGraph.
|
| 3 |
+
|
| 4 |
+
This module contains all the node functions used in the job application
|
| 5 |
+
writer workflow graph, each handling a specific step in the process.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import logging
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
|
| 11 |
+
from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate
|
| 12 |
+
from langchain_core.output_parsers import StrOutputParser
|
| 13 |
+
|
| 14 |
+
from ..classes.classes import AppState
|
| 15 |
+
from ..prompts.templates import (
|
| 16 |
+
CRITIQUE_PROMPT,
|
| 17 |
+
PERSONA_DEVELOPMENT_PROMPT,
|
| 18 |
+
COVER_LETTER_PROMPT,
|
| 19 |
+
REVISION_PROMPT,
|
| 20 |
+
BULLET_POINTS_PROMPT,
|
| 21 |
+
LINKEDIN_NOTE_PROMPT,
|
| 22 |
+
)
|
| 23 |
+
from ..utils.llm_client import LLMClient
|
| 24 |
+
|
| 25 |
+
logger = logging.getLogger(__name__)
|
| 26 |
+
# Constants
|
| 27 |
+
CURRENT_DATE = datetime.now().strftime("%A, %B %d, %Y")
|
| 28 |
+
|
| 29 |
+
LLM = LLMClient()
|
| 30 |
+
llm = LLMClient().get_llm()
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def create_draft(state: AppState) -> AppState:
|
| 34 |
+
"""Create initial draft of the application material."""
|
| 35 |
+
# Determine which type of content we're creating
|
| 36 |
+
current_application_session = state.get("company_research_data", {})
|
| 37 |
+
|
| 38 |
+
content_category = state.get("content_category", "cover_letter")
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
try:
|
| 42 |
+
if state.get("vector_store"):
|
| 43 |
+
vector_store = state.get("vector_store")
|
| 44 |
+
|
| 45 |
+
# Extract key requirements from job description
|
| 46 |
+
prompt = PERSONA_DEVELOPMENT_PROMPT | llm | StrOutputParser()
|
| 47 |
+
|
| 48 |
+
if current_application_session:
|
| 49 |
+
key_requirements = prompt.invoke({"job_description": current_application_session["job_description"]})
|
| 50 |
+
else:
|
| 51 |
+
return key_requirements
|
| 52 |
+
|
| 53 |
+
if not key_requirements:
|
| 54 |
+
print("Warning: No key requirements found in the job description.")
|
| 55 |
+
return state
|
| 56 |
+
|
| 57 |
+
# Use the key requirements to query for the most relevant resume parts
|
| 58 |
+
namespace = f"resume_{state['session_id']}"
|
| 59 |
+
relevant_docs = vector_store.retrieve_similar(
|
| 60 |
+
query=key_requirements,
|
| 61 |
+
namespace=namespace,
|
| 62 |
+
k=3
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
# Use these relevant sections with higher weight in the draft creation
|
| 66 |
+
highly_relevant_resume = "\n".join([doc.page_content for doc in relevant_docs])
|
| 67 |
+
resume_text = f"""
|
| 68 |
+
# Most Relevant Experience
|
| 69 |
+
{highly_relevant_resume}
|
| 70 |
+
|
| 71 |
+
# Full Resume
|
| 72 |
+
{resume_text}
|
| 73 |
+
"""
|
| 74 |
+
except Exception as e:
|
| 75 |
+
print(f"Warning: Could not use vector search for relevant resume parts: {e}")
|
| 76 |
+
# Continue with regular resume text
|
| 77 |
+
|
| 78 |
+
# Select the appropriate prompt template based on application type and persona
|
| 79 |
+
print(f"Content category: {content_category}")
|
| 80 |
+
if content_category == "bullets":
|
| 81 |
+
FirstDraftGenerationPromptTemplate = ChatPromptTemplate([BULLET_POINTS_PROMPT])
|
| 82 |
+
elif content_category == "linkedin_connect_request":
|
| 83 |
+
FirstDraftGenerationPromptTemplate = ChatPromptTemplate([LINKEDIN_NOTE_PROMPT])
|
| 84 |
+
else:
|
| 85 |
+
FirstDraftGenerationPromptTemplate = ChatPromptTemplate([COVER_LETTER_PROMPT])
|
| 86 |
+
|
| 87 |
+
# Create the draft using the selected prompt template
|
| 88 |
+
CurrentSessionContextMessage = HumanMessagePromptTemplate.from_template(
|
| 89 |
+
"""
|
| 90 |
+
Below is the Job Description and Resume enclosed in triple backticks.
|
| 91 |
+
|
| 92 |
+
Job Description and Resume:
|
| 93 |
+
|
| 94 |
+
```
|
| 95 |
+
{current_job_role}
|
| 96 |
+
|
| 97 |
+
```
|
| 98 |
+
Use the Company Research Data below in to create a cover letter that highlights the match between my qualifications and the job requirements and aligns with the company's values and culture.
|
| 99 |
+
Company Research Data:
|
| 100 |
+
#company_research_data
|
| 101 |
+
|
| 102 |
+
Create a cover letter that highlights the match between my qualifications and the job requirements.
|
| 103 |
+
""",
|
| 104 |
+
input_variables=["current_job_role",
|
| 105 |
+
"company_research_data"])
|
| 106 |
+
|
| 107 |
+
FirstDraftGenerationPromptTemplate.append(CurrentSessionContextMessage)
|
| 108 |
+
|
| 109 |
+
# Invoke the chain with the appropriate inputs
|
| 110 |
+
chain = (
|
| 111 |
+
({"current_job_role": lambda x: x["current_job_role"],
|
| 112 |
+
"company_research_data": lambda x: x["company_research_data"]})
|
| 113 |
+
| FirstDraftGenerationPromptTemplate
|
| 114 |
+
| llm
|
| 115 |
+
)
|
| 116 |
+
|
| 117 |
+
# Prepare the inputs
|
| 118 |
+
inputs = {
|
| 119 |
+
"current_job_role": current_application_session['job_description'],
|
| 120 |
+
"company_research_data": current_application_session["tavily_search"]}
|
| 121 |
+
|
| 122 |
+
response = chain.invoke(inputs)
|
| 123 |
+
print(f"Draft created: {response}")
|
| 124 |
+
state["draft"] = response
|
| 125 |
+
return state
|
| 126 |
+
|
| 127 |
+
def critique_draft(state: AppState) -> AppState:
|
| 128 |
+
"""Critique the draft for improvements."""
|
| 129 |
+
critique = llm.invoke(CRITIQUE_PROMPT.format(
|
| 130 |
+
job_description=state["job_description"][0],
|
| 131 |
+
draft=state["draft"]
|
| 132 |
+
))
|
| 133 |
+
|
| 134 |
+
# Store the critique for reference during human feedback
|
| 135 |
+
state["critique"] = critique
|
| 136 |
+
return state
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def human_approval(state: AppState) -> AppState:
|
| 140 |
+
"""Human-in-the-loop checkpoint for feedback on the draft."""
|
| 141 |
+
# This is a placeholder function that would be replaced by actual UI interaction
|
| 142 |
+
print("\n" + "="*80)
|
| 143 |
+
print("DRAFT FOR REVIEW:")
|
| 144 |
+
print(state["draft"])
|
| 145 |
+
print("\nAUTOMATIC CRITIQUE:")
|
| 146 |
+
print(state.get("critique", "No critique available"))
|
| 147 |
+
print("="*80)
|
| 148 |
+
print("\nPlease provide your feedback (press Enter to continue with no changes):")
|
| 149 |
+
|
| 150 |
+
# In a real implementation, this would be handled by the UI
|
| 151 |
+
feedback = input()
|
| 152 |
+
state["feedback"] = feedback
|
| 153 |
+
return state
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def finalize_document(state: AppState) -> AppState:
|
| 157 |
+
"""Incorporate feedback and finalize the document."""
|
| 158 |
+
if not state["feedback"].strip():
|
| 159 |
+
state["final"] = state["draft"]
|
| 160 |
+
return state
|
| 161 |
+
|
| 162 |
+
final = llm.invoke(REVISION_PROMPT.format(
|
| 163 |
+
draft=state["draft"],
|
| 164 |
+
feedback=state["feedback"]
|
| 165 |
+
))
|
| 166 |
+
|
| 167 |
+
state["final"] = final
|
| 168 |
+
return state
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
# Decision function for conditional routing
|
| 172 |
+
def determine_next_step(state: AppState) -> str:
|
| 173 |
+
"""Determine the next node in the graph based on state."""
|
| 174 |
+
# If we're missing the company name, we can't do company research
|
| 175 |
+
if not state["company_name"]:
|
| 176 |
+
return "draft"
|
| 177 |
+
return "research"
|
agents/output_schema.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field, field_validator
|
| 2 |
+
from typing import List, Optional
|
| 3 |
+
|
| 4 |
+
class TavilyQuerySet(BaseModel):
|
| 5 |
+
query1: Optional[List[str]] = Field(default=None, description="First search query and its rationale, e.g., ['query text']")
|
| 6 |
+
query2: Optional[List[str]] = Field(default=None, description="Second search query and its rationale")
|
| 7 |
+
query3: Optional[List[str]] = Field(default=None, description="Third search query and its rationale")
|
| 8 |
+
query4: Optional[List[str]] = Field(default=None, description="Fourth search query and its rationale")
|
| 9 |
+
query5: Optional[List[str]] = Field(default=None, description="Fifth search query and its rationale")
|
| 10 |
+
|
| 11 |
+
@field_validator("query1", "query2", "query3", "query4", "query5", mode="after")
|
| 12 |
+
@classmethod
|
| 13 |
+
def ensure_len_two(cls, v):
|
| 14 |
+
"""Ensure each provided query list contains exactly one strings: [query]."""
|
| 15 |
+
if v is not None: # Only validate if the list is actually provided
|
| 16 |
+
if len(v) != 1:
|
| 17 |
+
# Updated error message for clarity
|
| 18 |
+
raise ValueError("Each query list, when provided, must contain exactly one string: the query text.")
|
| 19 |
+
return v
|
classes/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .classes import AppState, ResearchState, DataLoadState
|
| 2 |
+
|
| 3 |
+
__all__ = ["AppState", "ResearchState", "DataLoadState"]
|
classes/classes.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
State definitions for the Job Writer LangGraph Workflow.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from typing_extensions import List, Dict, Any
|
| 6 |
+
from langgraph.graph import MessagesState
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class AppState(MessagesState):
|
| 10 |
+
"""
|
| 11 |
+
State container for the job application writer workflow.
|
| 12 |
+
|
| 13 |
+
Attributes:
|
| 14 |
+
resume: List of text chunks from the candidate's resume
|
| 15 |
+
job_description: List of text chunks from the job description
|
| 16 |
+
company_name: Extracted company name
|
| 17 |
+
company_research_data: Additional information about the company from research
|
| 18 |
+
persona: The writing persona to use ("recruiter" or "hiring_manager")
|
| 19 |
+
draft: Current draft of the application material
|
| 20 |
+
feedback: Human feedback on the draft
|
| 21 |
+
final: Final version of the application material
|
| 22 |
+
content: Type of application material to generate
|
| 23 |
+
"""
|
| 24 |
+
resume_path: str
|
| 25 |
+
job_description_source: str
|
| 26 |
+
company_research_data: Dict[str, Any]
|
| 27 |
+
draft: str
|
| 28 |
+
feedback: str
|
| 29 |
+
final: str
|
| 30 |
+
content: str # "cover_letter", "bullets", "linkedin_note"
|
| 31 |
+
current_node: str
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class DataLoadState(MessagesState):
|
| 35 |
+
"""
|
| 36 |
+
State container for the job application writer workflow.
|
| 37 |
+
|
| 38 |
+
Attributes:
|
| 39 |
+
resume: List of text chunks from the candidate's resume
|
| 40 |
+
job_description: List of text chunks from the job description
|
| 41 |
+
persona: The writing persona to use ("recruiter" or "hiring_manager")
|
| 42 |
+
content: Type of application material to generate
|
| 43 |
+
"""
|
| 44 |
+
resume_path: str
|
| 45 |
+
job_description_source: str
|
| 46 |
+
resume: str
|
| 47 |
+
job_description: str
|
| 48 |
+
company_name: str
|
| 49 |
+
current_node: str
|
| 50 |
+
company_research_data: Dict[str, Any]
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class ResearchState(MessagesState):
|
| 54 |
+
"""
|
| 55 |
+
State container for the job application writer workflow.
|
| 56 |
+
Attributes:
|
| 57 |
+
tavily_search: Dict[str, Any] Stores the results of the Tavily search
|
| 58 |
+
attempted_search_queries: List of queries used extracted from the job description
|
| 59 |
+
compiled_knowledge: Compiled knowledge from the research
|
| 60 |
+
"""
|
| 61 |
+
company_research_data: Dict[str, Any]
|
| 62 |
+
attempted_search_queries: List[str]
|
| 63 |
+
current_node: str
|
langgraph.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"dependencies": [
|
| 3 |
+
"."
|
| 4 |
+
],
|
| 5 |
+
"graphs": {
|
| 6 |
+
"job_application": "langgraph_init:job_app_graph"
|
| 7 |
+
},
|
| 8 |
+
"env": "./.env",
|
| 9 |
+
"python_version": "3.11"
|
| 10 |
+
}
|
langgraph_init.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .workflow import JobWorkflow
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
job_app_graph= JobWorkflow().compile()
|
nodes/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Created on Mon Oct 23 16:49:52 2023
|
| 4 |
+
@author: rishabhaggarwal
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from .initializing import Dataloading
|
| 8 |
+
# from .createdraft import CreateDraft
|
| 9 |
+
from .variations import generate_variations
|
| 10 |
+
from .selfconsistency import self_consistency_vote
|
| 11 |
+
from .research_workflow import research_workflow
|
| 12 |
+
|
| 13 |
+
__all__ = ["Dataloading", "generate_variations", "self_consistency_vote", "research_workflow"]
|
nodes/createdraft.py
ADDED
|
File without changes
|
nodes/initializing.py
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Created on Mon Oct 23 16:49:52 2023
|
| 4 |
+
@author: rishabhaggarwal
|
| 5 |
+
"""
|
| 6 |
+
import os
|
| 7 |
+
import logging
|
| 8 |
+
from typing_extensions import Literal
|
| 9 |
+
|
| 10 |
+
from langchain_core.documents import Document
|
| 11 |
+
from langchain_core.messages import SystemMessage
|
| 12 |
+
|
| 13 |
+
from job_writer.classes import AppState, DataLoadState
|
| 14 |
+
from job_writer.utils.document_processing import (
|
| 15 |
+
parse_resume,
|
| 16 |
+
get_job_description
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class Dataloading:
|
| 24 |
+
"""
|
| 25 |
+
Initialize the state for the job application writer workflow.
|
| 26 |
+
"""
|
| 27 |
+
def __init__(self):
|
| 28 |
+
pass
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
async def system_setup(self, state: AppState) -> DataLoadState:
|
| 32 |
+
"""Initialize conversation by setting up a persona through System Prompt."""
|
| 33 |
+
|
| 34 |
+
resume_path = state.get("resume_path")
|
| 35 |
+
|
| 36 |
+
# Verify if the resume file path provided is valid
|
| 37 |
+
if not resume_path:
|
| 38 |
+
logger.error("Resume path is not provided in the state.")
|
| 39 |
+
elif not os.path.exists(resume_path):
|
| 40 |
+
logger.error("Resume file does not exist at path: %s", resume_path)
|
| 41 |
+
# Similar handling as above:
|
| 42 |
+
# raise FileNotFoundError(f"Resume file not found: {resume_path}")
|
| 43 |
+
elif not os.path.isfile(resume_path):
|
| 44 |
+
logger.error("The path provided for the resume is not a file: %s", resume_path)
|
| 45 |
+
# Similar handling:
|
| 46 |
+
# raise ValueError(f"Resume path is not a file: {resume_path}")
|
| 47 |
+
else:
|
| 48 |
+
logger.info("Resume path verified: %s", resume_path)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
persona_init_message = SystemMessage(
|
| 52 |
+
content="You are my dedicated assistant for writing job application content, "
|
| 53 |
+
"including cover letters, LinkedIn outreach messages, and responses to "
|
| 54 |
+
"job-specfific questions (e.g., experience, culture fit, or motivation)."
|
| 55 |
+
)
|
| 56 |
+
messages = state.get("messages", [])
|
| 57 |
+
messages.append(persona_init_message)
|
| 58 |
+
|
| 59 |
+
return {
|
| 60 |
+
**state,
|
| 61 |
+
"messages": messages,
|
| 62 |
+
"current_node": "initialize_system"
|
| 63 |
+
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
async def get_resume(self, resume_source):
|
| 68 |
+
"""
|
| 69 |
+
Get the resume te
|
| 70 |
+
"""
|
| 71 |
+
try:
|
| 72 |
+
print("Parsing resume....")
|
| 73 |
+
resume_text = ""
|
| 74 |
+
resume_chunks = parse_resume(resume_source)
|
| 75 |
+
for chunk in resume_chunks:
|
| 76 |
+
if hasattr(chunk, 'page_content') and chunk.page_content:
|
| 77 |
+
resume_text += chunk.page_content
|
| 78 |
+
elif isinstance(chunk, str) and chunk: # If parse_resume (util) returns list of strings
|
| 79 |
+
resume_text += chunk
|
| 80 |
+
else:
|
| 81 |
+
logger.debug("Skipping empty or invalid chunk in resume: %s", chunk)
|
| 82 |
+
continue
|
| 83 |
+
return resume_text
|
| 84 |
+
except Exception as e:
|
| 85 |
+
print(f"Error parsing resume: {e}")
|
| 86 |
+
raise e
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
async def parse_job_description(self, job_description_source):
|
| 90 |
+
try:
|
| 91 |
+
logger.info("Parsing job description from: %s", job_description_source)
|
| 92 |
+
document: Document = get_job_description(job_description_source)
|
| 93 |
+
|
| 94 |
+
company_name = ""
|
| 95 |
+
job_posting_text = ""
|
| 96 |
+
|
| 97 |
+
if document:
|
| 98 |
+
# Extract company name from metadata
|
| 99 |
+
if hasattr(document, 'metadata') and isinstance(document.metadata, dict):
|
| 100 |
+
company_name = document.metadata.get("company_name", "")
|
| 101 |
+
if not company_name:
|
| 102 |
+
logger.warning("Company name not found in job description metadata.")
|
| 103 |
+
else:
|
| 104 |
+
logger.warning("Metadata attribute not found or not a dictionary in the Document for job description.")
|
| 105 |
+
|
| 106 |
+
# Extract the job posting text from page_content
|
| 107 |
+
if hasattr(document, 'page_content'):
|
| 108 |
+
job_posting_text = document.page_content
|
| 109 |
+
if not job_posting_text:
|
| 110 |
+
logger.info("Parsed job posting text is empty.")
|
| 111 |
+
else:
|
| 112 |
+
logger.warning("page_content attribute not found in the Document for job description.")
|
| 113 |
+
else:
|
| 114 |
+
logger.warning("get_job_description returned None for source: %s", job_description_source)
|
| 115 |
+
|
| 116 |
+
return job_posting_text, company_name
|
| 117 |
+
|
| 118 |
+
except Exception as e:
|
| 119 |
+
logger.error("Error parsing job description from source '%s': %s", job_description_source, e, exc_info=True)
|
| 120 |
+
raise e
|
| 121 |
+
|
| 122 |
+
async def load_inputs(self, state: "DataLoadState") -> "AppState":
    """
    Parse the resume and job description to prepare the data from the context
    which is required for the job application writer for the current state.

    Populates ``state["company_research_data"]`` with ``resume``,
    ``job_description`` and ``company_name`` keys, then advances
    ``state["current_node"]`` to ``"load_inputs"``.

    Fix: the interactive fallback paths previously stored a list of
    Documents / list of strings while the normal paths store plain strings;
    downstream consumers now always see one type (str).
    """
    resume_source = state.get("resume_path", "")
    job_description_source = state.get("job_description_source", None)

    # Initialize result containers
    resume_text = ""
    job_posting_text = ""
    company_name = ""

    # Handle job description input
    if job_description_source:
        try:
            job_posting_text, company_name = await self.parse_job_description(job_description_source)
            print(f"Job description parsing complete. Length: {len(job_posting_text) if job_posting_text else 0}")

            # Ensure job_posting_text is not empty
            if not job_posting_text:
                print("WARNING: Job posting text is empty after parsing.")
                job_posting_text = "No job description available. Please check the URL or provide a different source."
        except Exception as e:
            print(f"Error parsing job description: {e} in file {__file__}")
            # Set a default value to prevent errors
            job_posting_text = "Error parsing job description."
            company_name = "Unknown Company"

    if resume_source:
        try:
            resume_text = await self.get_resume(resume_source)
        except Exception as e:
            print(f"Error parsing resume: {e} in file {__file__}")
            raise e

    # If either input is still missing (e.g. re-entered from the verify node),
    # prompt the user and keep the pasted value as a plain string.
    if state["current_node"] == "verify" and not resume_text:
        resume_text = input("Please paste the resume in text format: ")

    if state["current_node"] == "verify" and not job_posting_text:
        job_posting_text = input("Please paste the job posting in text format: ")

    state["company_research_data"] = {
        'resume': resume_text,
        'job_description': job_posting_text,
        'company_name': company_name,
    }

    state["current_node"] = "load_inputs"

    return state
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def validate_data_load_state(self, state: "DataLoadState"):
    """Assert that the loaded state carries both a resume and a job description.

    Fix: the state is a dict-like mapping (every other method indexes it with
    ``state["company_research_data"]``), so use key access here as well —
    attribute access (``state.company_research_data``) raised AttributeError
    on a plain dict before any validation could run.

    Raises:
        AssertionError: if either required entry is missing or empty.
    """
    research_data = state.get("company_research_data") or {}
    assert research_data.get("resume"), "Resume is missing in company_research_data"
    assert research_data.get("job_description"), "Job description is missing"
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def verify_inputs(self, state: "AppState") -> "Literal['load', 'research']":
    """Verify that required inputs are present.

    Returns ``"load"`` to route back to data loading when the resume or the
    job description is missing; otherwise normalizes both values to plain
    strings in place and returns ``"research"``.

    Fix: the previous version asserted on the data *before* the missing-data
    branch, so the graceful ``return "load"`` path was unreachable — missing
    inputs raised AssertionError instead of re-routing the workflow.
    """
    print("Verifying Inputs")
    state["current_node"] = "verify"

    research_data = state.get("company_research_data") or {}

    # Route back to the loader when anything required is absent.
    missing_items = []
    if not research_data.get("resume", ""):
        missing_items.append("resume")
    if not research_data.get("job_description", ""):
        missing_items.append("job description")
    if missing_items:
        print(f'Missing required data: {", ".join(missing_items)}')
        return "load"

    # Normalize state content to strings so downstream prompts always get text.
    for key in ["resume", "job_description"]:
        try:
            value = research_data[key]
            if isinstance(value, (list, tuple)):
                research_data[key] = " ".join(str(x) for x in value)
            else:
                # dicts and any other type are stringified the same way
                research_data[key] = str(value)
        except Exception as e:
            print(f"Warning: error converting {key} to string: {e}")
            raise e

    return "research"
|
| 218 |
+
|
| 219 |
+
async def run(self, state: DataLoadState) -> AppState:
|
| 220 |
+
"""
|
| 221 |
+
Run the InitializeState class to initialize
|
| 222 |
+
the state for the job application writer workflow.
|
| 223 |
+
"""
|
| 224 |
+
state = await self.load_inputs(state)
|
| 225 |
+
return state
|
nodes/research_workflow.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
This module performs the research phase of the job application writing process.
|
| 4 |
+
One of the stages is Tavily Search which will be use to search for the company
|
| 5 |
+
"""
|
| 6 |
+
import logging
|
| 7 |
+
from langgraph.graph import StateGraph, START, END
|
| 8 |
+
|
| 9 |
+
from job_writer.tools.TavilySearch import relevance_filter, search_company
|
| 10 |
+
from job_writer.classes.classes import ResearchState
|
| 11 |
+
|
| 12 |
+
# Set up module-level logging. Fix: the logger was assigned twice in a row;
# a single getLogger(__name__) call is sufficient.
logger = logging.getLogger(__name__)
# NOTE(review): basicConfig at import time configures the root logger for the
# whole process; consider moving this to the application entry point.
logging.basicConfig(level=logging.INFO)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
async def research_company(state: ResearchState) -> ResearchState:
    """Research the company if name is available."""
    state["current_node"] = "research_company"

    try:
        # Pull the inputs the search tool needs out of the research payload.
        research_data = state["company_research_data"]
        company_name = research_data.get("company_name", "")
        job_description = research_data.get("job_description", "")

        logger.info(f"Researching company: {company_name}")
        # search_company is a LangChain tool, so go through .invoke() rather
        # than calling it directly; it normally returns (results, queries).
        result = search_company.invoke({
            "job_description": job_description,
            "company_name": company_name
        })

        if isinstance(result, tuple) and len(result) == 2:
            results, attempted_tavily_query_list = result
        else:
            # Tolerate tools that return only the results payload.
            results = result
            attempted_tavily_query_list = []

        logger.info(f"Search completed with results and {len(attempted_tavily_query_list)} queries")

        # Persist both the raw results and the queries that produced them.
        state["attempted_search_queries"] = attempted_tavily_query_list
        research_data["tavily_search"] = results

    except Exception as e:
        logger.error(f"Error in research_company: {str(e)}")
        # Provide empty results so downstream nodes keep working.
        state["company_research_data"]["tavily_search"] = {"error": str(e), "tavily_search": []}
        state["attempted_search_queries"] = []

    return state
|
| 56 |
+
|
| 57 |
+
print("\n\n\nInitializing research workflow...\n\n\n")

# Build the research subgraph: START -> research_company -> relevance_filter -> END
research_subgraph = StateGraph(ResearchState)

# Register the subgraph nodes.
for node_name, node_fn in (
    ("research_company", research_company),
    ("relevance_filter", relevance_filter),
):
    research_subgraph.add_node(node_name, node_fn)

# Wire the nodes into a linear pipeline.
for tail, head in (
    (START, "research_company"),
    ("research_company", "relevance_filter"),
    ("relevance_filter", END),
):
    research_subgraph.add_edge(tail, head)

# Compiled, ready-to-run research workflow exported by this module.
research_workflow = research_subgraph.compile()
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# class ResearchWorkflow:
|
| 76 |
+
|
| 77 |
+
# def __init__(self):
|
| 78 |
+
# self.research_workflow = research_workflow
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
|
nodes/selfconsistency.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
import logging
import re
from datetime import datetime

from ..classes.classes import AppState
from ..prompts.templates import (
    DRAFT_RATING_PROMPT,
    BEST_DRAFT_SELECTION_PROMPT
)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
# Constants
|
| 13 |
+
CURRENT_DATE = datetime.now().strftime("%A, %B %d, %Y")
|
| 14 |
+
|
| 15 |
+
# LLM = LLMClient()
|
| 16 |
+
# llm = LLMClient().get_llm()
|
| 17 |
+
# llm_precise = LLMClient().get_llm()
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def self_consistency_vote(state: "AppState") -> "AppState":
    """Choose the best draft from multiple variations.

    Rates the original draft plus every generated variation, then asks the
    model to pick the strongest one by index. Falls back to the original
    draft (index 0) whenever the selection output cannot be parsed or is out
    of range.

    Fix: this function uses ``json.dumps`` and ``re.search`` but the module
    never imported ``json``/``re`` — both raised NameError at runtime; the
    imports are now added at the top of the module.

    NOTE(review): ``llm_precise`` is a module-level client whose definition
    is currently commented out at the top of this file; it must be re-enabled
    (or injected) before this node can run — confirm against the workflow.
    """
    variations = state.get("variations", {"variations": []})

    # Draft 0 is always the original; generated variations follow in order.
    all_drafts = [state["draft"]] + variations["variations"]

    # First, have the LLM rate each draft
    ratings = []

    # Get resume and job summaries, handling different formats
    try:
        if isinstance(state["resume"], list) and len(state["resume"]) > 0:
            if hasattr(state["resume"][0], 'page_content'):
                resume_summary = state["resume"][0].page_content
            else:
                resume_summary = state["resume"][0]
        else:
            resume_summary = str(state["resume"])
    except Exception as e:
        print(f"Warning: Error getting resume summary: {e}")
        resume_summary = str(state["resume"])

    try:
        if isinstance(state["job_description"], list) and len(state["job_description"]) > 0:
            job_summary = state["job_description"][0]
        else:
            job_summary = str(state["job_description"])
    except Exception as e:
        print(f"Warning: Error getting job summary: {e}")
        job_summary = str(state["job_description"])

    for i, draft in enumerate(all_drafts):
        rating = llm_precise.invoke(DRAFT_RATING_PROMPT.format(
            resume_summary=resume_summary,
            job_summary=job_summary,
            draft=draft,
            draft_number=i + 1
        ))
        ratings.append(rating)

    # Create a clearer, more structured prompt for draft selection
    selection_prompt = BEST_DRAFT_SELECTION_PROMPT.format(
        ratings_json=json.dumps(ratings, indent=2),
        num_drafts=len(all_drafts)
    )

    # Get the selected draft index with error handling
    try:
        selection = llm_precise.invoke(selection_prompt).strip()
        # Extract just the first number found in the response
        number_match = re.search(r'\d+', selection)
        if not number_match:
            print("Warning: Could not extract draft number from LLM response. Using original draft.")
            best_draft_idx = 0
        else:
            best_draft_idx = int(number_match.group()) - 1
            # Validate the index is in range
            if best_draft_idx < 0 or best_draft_idx >= len(all_drafts):
                print(f"Warning: Selected draft index {best_draft_idx + 1} out of range. Using original draft.")
                best_draft_idx = 0
    except (ValueError, TypeError) as e:
        print(f"Warning: Error selecting best draft: {e}. Using original draft.")
        best_draft_idx = 0

    state["draft"] = all_drafts[best_draft_idx]
    return state
|
nodes/test_workflow.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing_extensions import List, Dict, Any, Optional
|
| 2 |
+
from langgraph.graph import MessagesState, StateGraph
|
| 3 |
+
|
| 4 |
+
class DataLoadState(MessagesState):
    """
    State container for the data-loading phase of the job application
    writer workflow.

    Attributes:
        resume_path: Source of the candidate's resume (presumably a file
            path — confirm against the loader)
        job_description_source: URL or raw-text source of the job posting
        resume: Parsed resume text
        job_description: Parsed job description text
        company_name: Company name extracted from the job posting
        current_node: Name of the workflow node currently executing
        company_research_data: Aggregated resume/job/company context used
            by downstream research nodes
    """
    resume_path: str
    job_description_source: str
    resume: str
    job_description: str
    company_name: str
    current_node: str
    company_research_data: Dict[str, Any]
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
test_graph = StateGraph(DataLoadState)
|
nodes/variations.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
from typing_extensions import Dict, List
|
| 4 |
+
|
| 5 |
+
from langchain_core.documents import Document
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
from ..classes.classes import AppState
|
| 9 |
+
from ..utils.llm_client import LLMClient
|
| 10 |
+
from ..prompts.templates import (
|
| 11 |
+
VARIATION_PROMPT
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
# Constants
|
| 17 |
+
CURRENT_DATE = datetime.now().strftime("%A, %B %d, %Y")
|
| 18 |
+
|
| 19 |
+
LLM = LLMClient()
|
| 20 |
+
llm = LLMClient().get_llm()
|
| 21 |
+
|
| 22 |
+
def generate_variations(state: "AppState") -> "Dict[str, List[str]]":
    """Generate multiple variations of the draft for self-consistency voting.

    Runs VARIATION_PROMPT through the LLM at several temperature/top_p
    settings and returns ``{"variations": [...]}``. The original draft is
    used as a fallback when every generation attempt fails.

    Fixes:
    - The previous text-extraction logic sliced plain strings with ``[:2]``,
      truncating a string resume/job description to its first two characters;
      it now flattens str / Document / sequences correctly.
    - ``response.strip()`` raised AttributeError on chat-model message
      objects, silently discarding every variation; message content is now
      read via ``getattr(response, "content", response)``.
    """
    variations = []

    def _as_text(value):
        # Flatten str / Document / sequence-of-either into one plain string.
        if isinstance(value, Document):
            return value.page_content
        if isinstance(value, (list, tuple)):
            return "\n".join(_as_text(item) for item in value)
        return str(value)

    # Get resume and job text, handling both string and Document types.
    try:
        resume_text = _as_text(state["company_research_data"]["resume"])
        job_text = _as_text(state["company_research_data"]["job_description"])
    except Exception as e:
        print(f"Warning: Error processing resume/job text: {e}")
        # Fallback to simple string handling
        resume_text = str(state["company_research_data"].get("resume", ""))
        job_text = str(state["company_research_data"].get("job_description", ""))

    # Generate variations with different temperatures and creativity settings
    temp_variations = [
        {"temperature": 0.7, "top_p": 0.9},    # More conservative
        {"temperature": 0.75, "top_p": 0.92},  # Balanced
        {"temperature": 0.8, "top_p": 0.95},   # More creative
        {"temperature": 0.7, "top_p": 0.85},   # Alternative conservative
        {"temperature": 0.8, "top_p": 0.98},   # Most creative
    ]

    for settings in temp_variations:
        try:
            # Create a configured version of the LLM with the variation settings
            configured_llm = llm.with_config(configurable=settings)

            # Use VARIATION_PROMPT directly with the configured LLM
            messages = VARIATION_PROMPT.format_messages(
                resume_excerpt=resume_text,
                job_excerpt=job_text,
                draft=state["draft"]
            )
            response = configured_llm.invoke(messages)

            # Chat models return a message object; plain LLMs return a string.
            text = getattr(response, "content", response)
            if text and str(text).strip():  # Only add non-empty variations
                variations.append(response)
        except Exception as e:
            print(f"Warning: Error generating variation with settings {settings}: {e}")
            continue

    # Ensure we have at least one variation
    if not variations:
        # If all variations failed, add the original draft as a fallback
        variations.append(state["draft"])

    return {"variations": variations}
|
prompts.md
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## 1. Title / One-Line Summary
|
| 2 |
+
> *E.g.* “embed_query returns empty vector with OllamaEmbeddings”
|
| 3 |
+
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
## 2. Goal / Expected Behavior
|
| 7 |
+
- What you’re trying to achieve
|
| 8 |
+
*E.g.* “Index documents with OllamaEmbeddings and query via Pinecone, then feed them into Llama3.2 for answer generation.”
|
| 9 |
+
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
## 3. Environment
|
| 13 |
+
- **Python**:
|
| 14 |
+
- **langchain**:
|
| 15 |
+
- **Ollama CLI / Daemon**:
|
| 16 |
+
- **OS** (and version):
|
| 17 |
+
- **Other dependencies**:
|
| 18 |
+
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
## 4. Minimal Reproducible Code
|
| 22 |
+
```python
|
| 23 |
+
# Paste just enough code to reproduce the issue:
|
| 24 |
+
from langchain.embeddings import OllamaEmbeddings
|
| 25 |
+
emb = OllamaEmbeddings(model="ollama/llama3.2-embed")
|
| 26 |
+
vec = emb.embed_query("hello")
|
| 27 |
+
print(len(vec)) # unexpected result
|
prompts/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Prompt templates for job application generation.
|
| 3 |
+
"""
|
prompts/templates.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Prompt templates for the job application writer.
|
| 3 |
+
|
| 4 |
+
This module contains all prompt templates used throughout the job application
|
| 5 |
+
generation process, organized by task.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 9 |
+
from langchain_core.messages import SystemMessage, HumanMessage
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Persona selection prompts
|
| 13 |
+
|
| 14 |
+
PERSONA_DEVELOPMENT_PROMPT: ChatPromptTemplate = ChatPromptTemplate.from_messages([
|
| 15 |
+
SystemMessage(content="""
|
| 16 |
+
You are my dedicated Job‑Application Writing Assistant.
|
| 17 |
+
MISSION
|
| 18 |
+
• Draft cover letters, LinkedIn messages, and answer's to questions within the job applications.
|
| 19 |
+
• Sound like me: grounded, confident, clear—never fluffy or journalistic.
|
| 20 |
+
• You will be provided "STYLE & LANGUAGE RULES" and "SELF‑EVALUATION CHECKLIST" to follow.
|
| 21 |
+
"""),
|
| 22 |
+
HumanMessage(content="""Analyze this job description and determine if it's better to write as if addressing a recruiter
|
| 23 |
+
or a hiring manager. Return ONLY 'recruiter' or 'hiring_manager':
|
| 24 |
+
|
| 25 |
+
{job_description}""")
|
| 26 |
+
])
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# Draft generation prompts
|
| 30 |
+
|
| 31 |
+
COVER_LETTER_PROMPT: SystemMessage = SystemMessage(content=
|
| 32 |
+
"""
|
| 33 |
+
You are CoverLetterGPT, a concise career‑writing assistant.
|
| 34 |
+
|
| 35 |
+
CORE OBJECTIVE
|
| 36 |
+
• Draft a 3‑paragraph cover letter (150‑180 words total) that targets hiring managers
|
| 37 |
+
and technical recruiters. Assume it may reach the CEO.
|
| 38 |
+
• Begin exactly with: "To Hiring Team,"
|
| 39 |
+
End exactly with: "Thanks, Rishabh"
|
| 40 |
+
• Tone: polite, casual, enthusiastic — but no em dashes (—) and no clichés.
|
| 41 |
+
• Every fact about achievements, skills, or company details must be traceable to the
|
| 42 |
+
provided resume, job description, or company research; otherwise, ask the user.
|
| 43 |
+
• If any critical detail is missing or ambiguous, STOP and ask a clarifying question
|
| 44 |
+
before writing the letter.
|
| 45 |
+
• Keep sentences tight; avoid filler like “I am excited to…” (enthusiasm comes
|
| 46 |
+
through precise language).
|
| 47 |
+
• Never exceed 180 words. Never fall below 150 words.
|
| 48 |
+
|
| 49 |
+
SELF‑EVALUATION (append after the letter)
|
| 50 |
+
After producing the cover letter, output an “### Evaluation” section containing:
|
| 51 |
+
Comprehensiveness (1‑5)
|
| 52 |
+
Evidence provided (1‑5)
|
| 53 |
+
Clarity of explanation (1‑5)
|
| 54 |
+
Potential limitations or biases (bullet list)
|
| 55 |
+
Areas for improvement (brief notes)
|
| 56 |
+
|
| 57 |
+
ERROR HANDLING
|
| 58 |
+
If word count, section order, or format rules are violated, regenerate until correct.
|
| 59 |
+
"""
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
BULLET_POINTS_PROMPT: SystemMessage = SystemMessage(content=
|
| 65 |
+
"""You are an expert job application writer who
|
| 66 |
+
creates personalized application materials.
|
| 67 |
+
|
| 68 |
+
{persona_instruction}
|
| 69 |
+
|
| 70 |
+
Write 5-7 bullet points highlighting the candidate's
|
| 71 |
+
qualifications for this specific role.
|
| 72 |
+
Create content that genuinely reflects the candidate's
|
| 73 |
+
background and is tailored to the specific job.
|
| 74 |
+
Ensure the tone is professional, confident, and authentic.
|
| 75 |
+
Today is {current_date}.""")
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
LINKEDIN_NOTE_PROMPT: SystemMessage = SystemMessage(content="""You are an expert job application
|
| 79 |
+
writer who creates personalized application materials.
|
| 80 |
+
{persona_instruction}
|
| 81 |
+
|
| 82 |
+
Write a brief LinkedIn connection note to a hiring manager or recruiter (150 words max).
|
| 83 |
+
Create content that genuinely reflects the candidate's background and is tailored to the specific job.
|
| 84 |
+
Ensure the tone is professional, confident, and authentic.
|
| 85 |
+
Today is {current_date}.""")
|
| 86 |
+
|
| 87 |
+
# Variation generation prompt
|
| 88 |
+
VARIATION_PROMPT: ChatPromptTemplate = ChatPromptTemplate.from_messages([
|
| 89 |
+
SystemMessage(content="You are an expert job application writer. Create a variation of the given draft."),
|
| 90 |
+
HumanMessage(content="""
|
| 91 |
+
# Resume Excerpt
|
| 92 |
+
{resume_excerpt}
|
| 93 |
+
|
| 94 |
+
# Job Description Excerpt
|
| 95 |
+
{job_excerpt}
|
| 96 |
+
|
| 97 |
+
# Original Draft
|
| 98 |
+
{draft}
|
| 99 |
+
|
| 100 |
+
Create a variation of this draft with the same key points but different wording or structure.
|
| 101 |
+
""")
|
| 102 |
+
])
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
# Critique prompt
|
| 106 |
+
|
| 107 |
+
CRITIQUE_PROMPT: ChatPromptTemplate = ChatPromptTemplate.from_messages([
|
| 108 |
+
SystemMessage(content="You are a professional editor who specializes in job applications. Provide constructive feedback."),
|
| 109 |
+
HumanMessage(content="""
|
| 110 |
+
# Job Description
|
| 111 |
+
{job_description}
|
| 112 |
+
|
| 113 |
+
# Current Draft
|
| 114 |
+
{draft}
|
| 115 |
+
|
| 116 |
+
Critique this draft and suggest specific improvements. Focus on:
|
| 117 |
+
1. How well it targets the job requirements
|
| 118 |
+
2. Professional tone and language
|
| 119 |
+
3. Clarity and impact
|
| 120 |
+
4. Grammar and style
|
| 121 |
+
|
| 122 |
+
Return your critique in a constructive, actionable format.
|
| 123 |
+
""")
|
| 124 |
+
])
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# Draft rating prompt
|
| 128 |
+
|
| 129 |
+
DRAFT_RATING_PROMPT: ChatPromptTemplate = ChatPromptTemplate.from_messages([
|
| 130 |
+
SystemMessage(content="You evaluate job application materials for effectiveness, appropriateness, and impact."),
|
| 131 |
+
HumanMessage(content="""
|
| 132 |
+
# Resume Summary
|
| 133 |
+
{resume_summary}
|
| 134 |
+
|
| 135 |
+
# Job Description Summary
|
| 136 |
+
{job_summary}
|
| 137 |
+
|
| 138 |
+
# Draft #{draft_number}
|
| 139 |
+
{draft}
|
| 140 |
+
|
| 141 |
+
Rate this draft on a scale of 1-10 for:
|
| 142 |
+
1. Relevance to the job requirements
|
| 143 |
+
2. Professional tone
|
| 144 |
+
3. Personalization
|
| 145 |
+
4. Persuasiveness
|
| 146 |
+
5. Clarity
|
| 147 |
+
|
| 148 |
+
Return ONLY a JSON object with these ratings and a brief explanation for each.
|
| 149 |
+
""")
|
| 150 |
+
])
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
# Best draft selection prompt
|
| 154 |
+
|
| 155 |
+
BEST_DRAFT_SELECTION_PROMPT: ChatPromptTemplate = ChatPromptTemplate.from_messages([
|
| 156 |
+
SystemMessage(content="""You are a job application expert who selects the best draft based on multiple ratings.
|
| 157 |
+
You MUST return ONLY a single number between 1 and the number of drafts.
|
| 158 |
+
For example, if draft #2 is best, return ONLY '2'.
|
| 159 |
+
Do NOT include ANY other text, explanations, or characters in your response."""),
|
| 160 |
+
HumanMessage(content="""Here are the ratings for {num_drafts} different drafts:
|
| 161 |
+
|
| 162 |
+
{ratings_json}
|
| 163 |
+
|
| 164 |
+
Based on these ratings, return ONLY the number of the best draft (1-{num_drafts}).
|
| 165 |
+
Your entire response must be just one number.
|
| 166 |
+
Example: If draft #2 is best, return ONLY '2'.
|
| 167 |
+
""")
|
| 168 |
+
])
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
REVISION_PROMPT: ChatPromptTemplate = ChatPromptTemplate.from_messages([
|
| 172 |
+
SystemMessage(content="You are an expert job application writer. Revise the draft based on feedback."),
|
| 173 |
+
HumanMessage(content="""
|
| 174 |
+
# Original Draft
|
| 175 |
+
{draft}
|
| 176 |
+
|
| 177 |
+
# Feedback
|
| 178 |
+
{feedback}
|
| 179 |
+
|
| 180 |
+
Revise the draft to incorporate this feedback while maintaining professionalism and impact.
|
| 181 |
+
Return the complete, final version.
|
| 182 |
+
""")
|
| 183 |
+
])
|
| 184 |
+
|
| 185 |
+
# Tavily query prompt to build knowledge context about the company
|
| 186 |
+
|
| 187 |
+
TAVILY_QUERY_PROMPT = '''
|
| 188 |
+
<Context>
|
| 189 |
+
The user needs targeted search queries (with rationale) for Tavily Search to research company {} and inform a personalized cover letter.
|
| 190 |
+
</Context>
|
| 191 |
+
|
| 192 |
+
<Requirements>
|
| 193 |
+
- Output a JSON object with five fields:
|
| 194 |
+
- Keys: recent_developments, recent_news, role_info, customers_partners, culture_values
|
| 195 |
+
- Each value: an array of exactly two strings: [search query for Tavily Search, reasoning].
|
| 196 |
+
- Always include the company name in the search query to boost relevance.
|
| 197 |
+
- If any data is missing, supply a sensible fallback query that still references the company.
|
| 198 |
+
- Do not repeat queries across fields.
|
| 199 |
+
</Requirements>
|
| 200 |
+
|
| 201 |
+
<OutputFormat>
|
| 202 |
+
```json
|
| 203 |
+
{
|
| 204 |
+
"recent_developments": ["…", "…"],
|
| 205 |
+
"recent_news": ["…", "…"],
|
| 206 |
+
"role_info": ["…", "…"],
|
| 207 |
+
"customers_partners":["…", "…"],
|
| 208 |
+
"culture_values": ["…", "…"]
|
| 209 |
+
}
|
| 210 |
+
```
|
| 211 |
+
</OutputFormat>
|
| 212 |
+
'''
|
| 213 |
+
|
| 214 |
+
JOB_DESCRIPTION_PROMPT = """You are a JSON extraction specialist. Extract job information from the provided text and return ONLY valid JSON.
|
| 215 |
+
|
| 216 |
+
CRITICAL: Your response must be parseable by json.loads() - no markdown, no explanations, no extra text.
|
| 217 |
+
|
| 218 |
+
Extract these three fields in exact order:
|
| 219 |
+
1. job_description field - Complete job posting formatted in clean markdown with proper headers (## Job Description, ## Responsibilities, ## Requirements, etc.)
|
| 220 |
+
2. company_name field - Exact company name as mentioned
|
| 221 |
+
3. job_title field - Exact job title as posted
|
| 222 |
+
|
| 223 |
+
FORMATTING RULES:
|
| 224 |
+
- Use double quotes for all strings
|
| 225 |
+
- Escape internal quotes with \\"
|
| 226 |
+
- Escape newlines as \\\\n in the job description field
|
| 227 |
+
- Replace actual line breaks with \\\\n
|
| 228 |
+
- If any field is missing, use empty string ""
|
| 229 |
+
- No trailing commas
|
| 230 |
+
- No comments or extra whitespace
|
| 231 |
+
|
| 232 |
+
REQUIRED OUTPUT FORMAT:
|
| 233 |
+
{{
|
| 234 |
+
"job_description": "markdown formatted job description with \\\\n for line breaks",
|
| 235 |
+
"company_name": "exact company name",
|
| 236 |
+
"job_title": "exact job title"
|
| 237 |
+
}}
|
| 238 |
+
|
| 239 |
+
Return only the JSON object - no other text."""
|
setup.py
ADDED
|
File without changes
|
testing.ipynb
ADDED
|
@@ -0,0 +1,1069 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 3,
|
| 6 |
+
"id": "d26f6647",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"from langchain.prompts import ChatPromptTemplate\n",
|
| 11 |
+
"from langchain_core.messages import AIMessage, HumanMessage, SystemMessage"
|
| 12 |
+
]
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"cell_type": "markdown",
|
| 16 |
+
"id": "f337ecb5",
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"source": []
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"cell_type": "code",
|
| 22 |
+
"execution_count": 9,
|
| 23 |
+
"id": "92b12890",
|
| 24 |
+
"metadata": {},
|
| 25 |
+
"outputs": [],
|
| 26 |
+
"source": [
|
| 27 |
+
"messages = ChatPromptTemplate.from_messages([SystemMessage(content=f\"\"\"\n",
|
| 28 |
+
" You are a Tavily Search Query specialist. Follow the JSON schema below exactly:\n",
|
| 29 |
+
"\n",
|
| 30 |
+
" Rules:\n",
|
| 31 |
+
" 1. Generate Tavily DSL only (no natural language outside the JSON).\n",
|
| 32 |
+
" 2. Map the job description into five categories:\n",
|
| 33 |
+
" • query1: recent developments\n",
|
| 34 |
+
" • query2: recent news\n",
|
| 35 |
+
" • query3:company profile\n",
|
| 36 |
+
" • query4: key customers & partners\n",
|
| 37 |
+
" • query5: culture & values\n",
|
| 38 |
+
" 3. Each value is a two‑element list:\n",
|
| 39 |
+
" [<query string>, <one‑sentence rationale>]\n",
|
| 40 |
+
" 4. Use filters (source:, date:[now-30d TO now], site:…, etc.) where helpful.\n",
|
| 41 |
+
" 5. If information is missing in the JD, fall back sensibly\n",
|
| 42 |
+
" (e.g. search for “employee testimonials”).\n",
|
| 43 |
+
" 6. Return **only** valid JSON.\n",
|
| 44 |
+
" \"\"\"\n",
|
| 45 |
+
" )\n",
|
| 46 |
+
" , HumanMessage(content=\"Hello World\")])"
|
| 47 |
+
]
|
| 48 |
+
},
|
| 49 |
+
{
|
| 50 |
+
"cell_type": "code",
|
| 51 |
+
"execution_count": 6,
|
| 52 |
+
"id": "e38c3632",
|
| 53 |
+
"metadata": {},
|
| 54 |
+
"outputs": [],
|
| 55 |
+
"source": [
|
| 56 |
+
"input_message = ChatPromptTemplate.from_messages([HumanMessage(content=\"Hello World\")])\n"
|
| 57 |
+
]
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"cell_type": "code",
|
| 61 |
+
"execution_count": 11,
|
| 62 |
+
"id": "dac1ec19",
|
| 63 |
+
"metadata": {},
|
| 64 |
+
"outputs": [
|
| 65 |
+
{
|
| 66 |
+
"name": "stdout",
|
| 67 |
+
"output_type": "stream",
|
| 68 |
+
"text": [
|
| 69 |
+
"================================\u001b[1m System Message \u001b[0m================================\n",
|
| 70 |
+
"\n",
|
| 71 |
+
"\n",
|
| 72 |
+
" You are a Tavily Search Query specialist. Follow the JSON schema below exactly:\n",
|
| 73 |
+
"\n",
|
| 74 |
+
" Rules:\n",
|
| 75 |
+
" 1. Generate Tavily DSL only (no natural language outside the JSON).\n",
|
| 76 |
+
" 2. Map the job description into five categories:\n",
|
| 77 |
+
" • query1: recent developments\n",
|
| 78 |
+
" • query2: recent news\n",
|
| 79 |
+
" • query3:company profile\n",
|
| 80 |
+
" • query4: key customers & partners\n",
|
| 81 |
+
" • query5: culture & values\n",
|
| 82 |
+
" 3. Each value is a two‑element list:\n",
|
| 83 |
+
" [<query string>, <one‑sentence rationale>]\n",
|
| 84 |
+
" 4. Use filters (source:, date:[now-30d TO now], site:…, etc.) where helpful.\n",
|
| 85 |
+
" 5. If information is missing in the JD, fall back sensibly\n",
|
| 86 |
+
" (e.g. search for “employee testimonials”).\n",
|
| 87 |
+
" 6. Return **only** valid JSON.\n",
|
| 88 |
+
" \n",
|
| 89 |
+
"\n",
|
| 90 |
+
"================================\u001b[1m Human Message \u001b[0m=================================\n",
|
| 91 |
+
"\n",
|
| 92 |
+
"Hello World\n"
|
| 93 |
+
]
|
| 94 |
+
}
|
| 95 |
+
],
|
| 96 |
+
"source": [
|
| 97 |
+
"messages.pretty_print()"
|
| 98 |
+
]
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"cell_type": "code",
|
| 102 |
+
"execution_count": 14,
|
| 103 |
+
"id": "7ebd0d0d",
|
| 104 |
+
"metadata": {},
|
| 105 |
+
"outputs": [],
|
| 106 |
+
"source": [
|
| 107 |
+
"from langchain.prompts import (\n",
|
| 108 |
+
" ChatPromptTemplate,\n",
|
| 109 |
+
" HumanMessagePromptTemplate,\n",
|
| 110 |
+
" SystemMessagePromptTemplate,\n",
|
| 111 |
+
")\n",
|
| 112 |
+
"\n",
|
| 113 |
+
"input_message = HumanMessagePromptTemplate.from_template(\"Below is the required job description and resume: {background_information}\", input_variables=[\"background_information\"])"
|
| 114 |
+
]
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"cell_type": "code",
|
| 118 |
+
"execution_count": 17,
|
| 119 |
+
"id": "cd6b3cb8",
|
| 120 |
+
"metadata": {},
|
| 121 |
+
"outputs": [
|
| 122 |
+
{
|
| 123 |
+
"data": {
|
| 124 |
+
"text/plain": [
|
| 125 |
+
"HumanMessage(content='Below is the required job description and resume: This is Rishabh', additional_kwargs={}, response_metadata={})"
|
| 126 |
+
]
|
| 127 |
+
},
|
| 128 |
+
"execution_count": 17,
|
| 129 |
+
"metadata": {},
|
| 130 |
+
"output_type": "execute_result"
|
| 131 |
+
}
|
| 132 |
+
],
|
| 133 |
+
"source": [
|
| 134 |
+
"input_message.format(background_information=\"This is Rishabh\")"
|
| 135 |
+
]
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"cell_type": "code",
|
| 139 |
+
"execution_count": 18,
|
| 140 |
+
"id": "c9628bed",
|
| 141 |
+
"metadata": {},
|
| 142 |
+
"outputs": [],
|
| 143 |
+
"source": [
|
| 144 |
+
"import re\n",
|
| 145 |
+
"from pathlib import Path\n",
|
| 146 |
+
"from typing import List\n",
|
| 147 |
+
"\n",
|
| 148 |
+
"from langchain_community.document_loaders import PyPDFLoader\n",
|
| 149 |
+
"from langchain.text_splitter import (\n",
|
| 150 |
+
" MarkdownHeaderTextSplitter,\n",
|
| 151 |
+
" RecursiveCharacterTextSplitter,\n",
|
| 152 |
+
")\n",
|
| 153 |
+
"from langchain.schema import Document"
|
| 154 |
+
]
|
| 155 |
+
},
|
| 156 |
+
{
|
| 157 |
+
"cell_type": "code",
|
| 158 |
+
"execution_count": 29,
|
| 159 |
+
"id": "c352da72",
|
| 160 |
+
"metadata": {},
|
| 161 |
+
"outputs": [],
|
| 162 |
+
"source": [
|
| 163 |
+
"def _collapse_ws(text: str) -> str:\n",
|
| 164 |
+
" \"\"\"Collapse stray whitespace but keep bullet breaks.\"\"\"\n",
|
| 165 |
+
" text = re.sub(r\"\\n\\s*([•\\-–])\\s*\", r\"\\n\\1 \", text)\n",
|
| 166 |
+
" return re.sub(r\"[ \\t\\r\\f\\v]+\", \" \", text).replace(\" \\n\", \"\\n\").strip()\n",
|
| 167 |
+
"\n",
|
| 168 |
+
"\n",
|
| 169 |
+
"def _is_heading(line: str) -> bool:\n",
|
| 170 |
+
" return (\n",
|
| 171 |
+
" line.isupper()\n",
|
| 172 |
+
" and len(line.split()) <= 5\n",
|
| 173 |
+
" and not re.search(r\"\\d\", line)\n",
|
| 174 |
+
" )\n",
|
| 175 |
+
"\n",
|
| 176 |
+
"\n",
|
| 177 |
+
"def parse_resume(pdf_path: str | Path) -> List[Document]:\n",
|
| 178 |
+
" \"\"\"\n",
|
| 179 |
+
" Load a single‑page résumé PDF → list[Document] chunks\n",
|
| 180 |
+
" (≈400 chars, 50‑char overlap) with {source, section} metadata.\n",
|
| 181 |
+
" \"\"\"\n",
|
| 182 |
+
" text = PyPDFLoader(str(pdf_path), extraction_mode=\"layout\").load()[0].page_content\n",
|
| 183 |
+
" print(text)\n",
|
| 184 |
+
" text = _collapse_ws(text)\n",
|
| 185 |
+
"\n",
|
| 186 |
+
" # Tag headings with \"###\" so Markdown splitter can see them\n",
|
| 187 |
+
" tagged_lines = [\n",
|
| 188 |
+
" f\"### {ln}\" if _is_heading(ln) else ln\n",
|
| 189 |
+
" for ln in text.splitlines()\n",
|
| 190 |
+
" ]\n",
|
| 191 |
+
" md_text = \"\\n\".join(tagged_lines)\n",
|
| 192 |
+
"\n",
|
| 193 |
+
" if \"###\" in md_text:\n",
|
| 194 |
+
" splitter = MarkdownHeaderTextSplitter(\n",
|
| 195 |
+
" headers_to_split_on=[(\"###\", \"section\")]\n",
|
| 196 |
+
" )\n",
|
| 197 |
+
" chunks = splitter.split_text(md_text) # already returns Documents\n",
|
| 198 |
+
" else:\n",
|
| 199 |
+
" print(f\"No headings found.\")\n",
|
| 200 |
+
" splitter = RecursiveCharacterTextSplitter(\n",
|
| 201 |
+
" chunk_size=400, chunk_overlap=50\n",
|
| 202 |
+
" )\n",
|
| 203 |
+
" chunks = [\n",
|
| 204 |
+
" Document(page_content=chunk, metadata={})\n",
|
| 205 |
+
" for chunk in splitter.split_text(md_text)\n",
|
| 206 |
+
" ]\n",
|
| 207 |
+
"\n",
|
| 208 |
+
" # Attach metadata\n",
|
| 209 |
+
" for doc in chunks:\n",
|
| 210 |
+
" doc.metadata.setdefault(\"source\", str(pdf_path))\n",
|
| 211 |
+
" # section already present if header‑splitter was used\n",
|
| 212 |
+
" return chunks\n"
|
| 213 |
+
]
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"cell_type": "code",
|
| 217 |
+
"execution_count": 31,
|
| 218 |
+
"id": "14e062e4",
|
| 219 |
+
"metadata": {},
|
| 220 |
+
"outputs": [
|
| 221 |
+
{
|
| 222 |
+
"name": "stdout",
|
| 223 |
+
"output_type": "stream",
|
| 224 |
+
"text": [
|
| 225 |
+
"Rishabh Aggarwal\n",
|
| 226 |
+
" (602) 580-5734 • raggar15@asu.edu • LinkedIn • Tempe, AZ\n",
|
| 227 |
+
"TECHNICAL SKILLS\n",
|
| 228 |
+
"Programming Languages: Python, Java, JavaScript, Bash, HTML, CSS\n",
|
| 229 |
+
"Databases: SQL (PostgreSQL, MySQL, SQLite), NoSQL (MongoDB, Redis, DynamoDB, Pinecone)\n",
|
| 230 |
+
"Frameworks/Tools: SpringBoot, React, JUnit, Node.js, RESTful APIs, Django, Kafka, Airflow, FastAPI, Pydantic, Tableau\n",
|
| 231 |
+
"DevOps/Cloud: AWS, GCP, GitHub Actions, Docker, Jenkins, Terraform, Kubernetes, MLFlow, GitLab\n",
|
| 232 |
+
"AI Tools/Frameworks: PyTorch, Tensorflow, scikit-learn, LangGraph, LangChain, LangSmith, ChatGPT\n",
|
| 233 |
+
"PROFESSIONAL EXPERIENCE\n",
|
| 234 |
+
"Amazon Inc, Tempe, AZ: Software Development Engineer | Seller Payment Services Dec 2023 - Aug 2024\n",
|
| 235 |
+
"● Established AWS Evidently setup to handle 50K+ daily API requests to new Lambda service using AWS CDK(TypeScript)\n",
|
| 236 |
+
"● Added metrics to monitor traffic and enhance service observability of the Lambda service through CloudWatch logs\n",
|
| 237 |
+
"● Developed SNS Event Publishers in Java using Spring Boot to process 10K+ daily events in an event-driven architecture\n",
|
| 238 |
+
"● Led load balancer migration planning for a microservice with a focus on safe rollbacks and minimum downtime\n",
|
| 239 |
+
"● Designed a dashboard for ALB migration to monitor traffic with high-severity alarms to enhance observability\n",
|
| 240 |
+
"● Directed weekly meetings with a 7-member agile team to analyze metrics and customer data, guiding decision-making for\n",
|
| 241 |
+
" live campaigns involving over 50K sellers\n",
|
| 242 |
+
"MetaJungle, Ozark, MO: Lead Backend Engineer Jun 2023 - Dec 2023\n",
|
| 243 |
+
"● Architected a scalable AWS cloud infrastructure for a Marketplace using Terraform IaC with ECS and Fargate\n",
|
| 244 |
+
" instances, reduced costs by 40% while maintaining high reliability using Blue/Green deployment strategy\n",
|
| 245 |
+
"● Engineered and managed Jenkins CI/CD pipeline allowing faster iterative development by reducing deployment time by\n",
|
| 246 |
+
" 75% , leveraging Github hooks and Docker Containerization\n",
|
| 247 |
+
"● Migrated over 1.2TB on-premises Microsoft SQL Server database with over 2 million records to AWS RDS, utilizing\n",
|
| 248 |
+
" AWS DMS ensuring efficient indexing and retrieval\n",
|
| 249 |
+
"● Developed 10+ RESTful APIs in Node.js to manage data for over 500 NFT collections and 10,000 listings from MongoDB\n",
|
| 250 |
+
"● Automated extraction and compression of 50,000+ images from Ethereum Blockchain and stored on AWS S3 using\n",
|
| 251 |
+
" Airflow workflows in Python, leading to almost 30% storage cost savings\n",
|
| 252 |
+
"Omnipresent Robot Technologies, Delhi, India: Software Engineer Jun 2018 - Jul 2021\n",
|
| 253 |
+
"● Engineered a distributed, scalable AI surveillance application with edge-device computation using Python, OpenCV,\n",
|
| 254 |
+
" and scikit-learn, ensuring security for 10,000+ daily park visitors\n",
|
| 255 |
+
"● Architected a distributed system for real-time video streaming using Apache Kafka and Python to process 50+ parallel\n",
|
| 256 |
+
" video streams, reducing latency by 60% by rigorous debugging and performance optimization\n",
|
| 257 |
+
"● Led the development of an analytics dashboard using Django, React and Postgres to show breach records, alerts, and\n",
|
| 258 |
+
" intuitive data visualizations using Google Charts, allowing data-driven decision making\n",
|
| 259 |
+
"● Developed a drone compliance platform using Django to automate flight authorization and authentication process,\n",
|
| 260 |
+
" leading to enhanced productivity of the drone engineering team\n",
|
| 261 |
+
"● Led collaboration of a team of engineers and drone operators to conduct real-world testing of the compliance system\n",
|
| 262 |
+
"● Mentored interns to understand software development best practices, coding standards, and version control systems\n",
|
| 263 |
+
"ADDITIONAL EXPERIENCE\n",
|
| 264 |
+
"ML Software Developer at ASU Jul 2022 - May 2023\n",
|
| 265 |
+
"● Trained deep learning models using PyTorch and Scikit to detect low-resolution objects in 15,000+ satellite images\n",
|
| 266 |
+
"● Executed adversarial attacks and utilized MLFlow for fine-tuning multi-class classification machine learning model,\n",
|
| 267 |
+
" enhancing model robustness and improving accuracy by 20%\n",
|
| 268 |
+
"Mayhem Heroes Cybersecurity Open Source Hackathon Apr 2022\n",
|
| 269 |
+
"Integrated Mayhem into CI/CD pipeline for Open Source repos using GitHub Actions, reducing security risks by over 80%\n",
|
| 270 |
+
" EDUCATION\n",
|
| 271 |
+
"Master of Science in Information Technology\n",
|
| 272 |
+
"Arizona State University, Tempe, Arizona\n"
|
| 273 |
+
]
|
| 274 |
+
}
|
| 275 |
+
],
|
| 276 |
+
"source": [
|
| 277 |
+
"chunks = parse_resume(\"C:\\\\Users\\\\risha\\\\Downloads\\\\Rishabh_SDE_Resume.pdf\")"
|
| 278 |
+
]
|
| 279 |
+
},
|
| 280 |
+
{
|
| 281 |
+
"cell_type": "code",
|
| 282 |
+
"execution_count": 40,
|
| 283 |
+
"id": "0100cc62",
|
| 284 |
+
"metadata": {},
|
| 285 |
+
"outputs": [
|
| 286 |
+
{
|
| 287 |
+
"name": "stdout",
|
| 288 |
+
"output_type": "stream",
|
| 289 |
+
"text": [
|
| 290 |
+
"Resume chunk: Rishabh Aggarwal\n",
|
| 291 |
+
"(602) 580-5734 • raggar15@asu.edu • LinkedIn • Tempe, AZ\n",
|
| 292 |
+
"Resume chunk: Programming Languages: Python, Java, JavaScript, Bash, HTML, CSS\n",
|
| 293 |
+
"Databases: SQL (PostgreSQL, MySQL, SQLite), NoSQL (MongoDB, Redis, DynamoDB, Pinecone)\n",
|
| 294 |
+
"Frameworks/Tools: SpringBoot, React, JUnit, Node.js, RESTful APIs, Django, Kafka, Airflow, FastAPI, Pydantic, Tableau\n",
|
| 295 |
+
"DevOps/Cloud: AWS, GCP, GitHub Actions, Docker, Jenkins, Terraform, Kubernetes, MLFlow, GitLab\n",
|
| 296 |
+
"AI Tools/Frameworks: PyTorch, Tensorflow, scikit-learn, LangGraph, LangChain, LangSmith, ChatGPT\n",
|
| 297 |
+
"Resume chunk: Amazon Inc, Tempe, AZ: Software Development Engineer | Seller Payment Services Dec 2023 - Aug 2024\n",
|
| 298 |
+
"● Established AWS Evidently setup to handle 50K+ daily API requests to new Lambda service using AWS CDK(TypeScript)\n",
|
| 299 |
+
"● Added metrics to monitor traffic and enhance service observability of the Lambda service through CloudWatch logs\n",
|
| 300 |
+
"● Developed SNS Event Publishers in Java using Spring Boot to process 10K+ daily events in an event-driven architecture\n",
|
| 301 |
+
"● Led load balancer migration planning for a microservice with a focus on safe rollbacks and minimum downtime\n",
|
| 302 |
+
"● Designed a dashboard for ALB migration to monitor traffic with high-severity alarms to enhance observability\n",
|
| 303 |
+
"● Directed weekly meetings with a 7-member agile team to analyze metrics and customer data, guiding decision-making for\n",
|
| 304 |
+
"live campaigns involving over 50K sellers\n",
|
| 305 |
+
"MetaJungle, Ozark, MO: Lead Backend Engineer Jun 2023 - Dec 2023\n",
|
| 306 |
+
"● Architected a scalable AWS cloud infrastructure for a Marketplace using Terraform IaC with ECS and Fargate\n",
|
| 307 |
+
"instances, reduced costs by 40% while maintaining high reliability using Blue/Green deployment strategy\n",
|
| 308 |
+
"● Engineered and managed Jenkins CI/CD pipeline allowing faster iterative development by reducing deployment time by\n",
|
| 309 |
+
"75% , leveraging Github hooks and Docker Containerization\n",
|
| 310 |
+
"● Migrated over 1.2TB on-premises Microsoft SQL Server database with over 2 million records to AWS RDS, utilizing\n",
|
| 311 |
+
"AWS DMS ensuring efficient indexing and retrieval\n",
|
| 312 |
+
"● Developed 10+ RESTful APIs in Node.js to manage data for over 500 NFT collections and 10,000 listings from MongoDB\n",
|
| 313 |
+
"● Automated extraction and compression of 50,000+ images from Ethereum Blockchain and stored on AWS S3 using\n",
|
| 314 |
+
"Airflow workflows in Python, leading to almost 30% storage cost savings\n",
|
| 315 |
+
"Omnipresent Robot Technologies, Delhi, India: Software Engineer Jun 2018 - Jul 2021\n",
|
| 316 |
+
"● Engineered a distributed, scalable AI surveillance application with edge-device computation using Python, OpenCV,\n",
|
| 317 |
+
"and scikit-learn, ensuring security for 10,000+ daily park visitors\n",
|
| 318 |
+
"● Architected a distributed system for real-time video streaming using Apache Kafka and Python to process 50+ parallel\n",
|
| 319 |
+
"video streams, reducing latency by 60% by rigorous debugging and performance optimization\n",
|
| 320 |
+
"● Led the development of an analytics dashboard using Django, React and Postgres to show breach records, alerts, and\n",
|
| 321 |
+
"intuitive data visualizations using Google Charts, allowing data-driven decision making\n",
|
| 322 |
+
"● Developed a drone compliance platform using Django to automate flight authorization and authentication process,\n",
|
| 323 |
+
"leading to enhanced productivity of the drone engineering team\n",
|
| 324 |
+
"● Led collaboration of a team of engineers and drone operators to conduct real-world testing of the compliance system\n",
|
| 325 |
+
"● Mentored interns to understand software development best practices, coding standards, and version control systems\n",
|
| 326 |
+
"Resume chunk: ML Software Developer at ASU Jul 2022 - May 2023\n",
|
| 327 |
+
"● Trained deep learning models using PyTorch and Scikit to detect low-resolution objects in 15,000+ satellite images\n",
|
| 328 |
+
"● Executed adversarial attacks and utilized MLFlow for fine-tuning multi-class classification machine learning model,\n",
|
| 329 |
+
"enhancing model robustness and improving accuracy by 20%\n",
|
| 330 |
+
"Mayhem Heroes Cybersecurity Open Source Hackathon Apr 2022\n",
|
| 331 |
+
"Integrated Mayhem into CI/CD pipeline for Open Source repos using GitHub Actions, reducing security risks by over 80%\n",
|
| 332 |
+
"Resume chunk: Master of Science in Information Technology\n",
|
| 333 |
+
"Arizona State University, Tempe, Arizona\n"
|
| 334 |
+
]
|
| 335 |
+
}
|
| 336 |
+
],
|
| 337 |
+
"source": [
|
| 338 |
+
"resume_text = \"\"\n",
|
| 339 |
+
"for chunk in chunks:\n",
|
| 340 |
+
" print(f\"Resume chunk: {chunk.page_content}\")\n",
|
| 341 |
+
" resume_text+= (chunk.page_content)"
|
| 342 |
+
]
|
| 343 |
+
},
|
| 344 |
+
{
|
| 345 |
+
"cell_type": "code",
|
| 346 |
+
"execution_count": 41,
|
| 347 |
+
"id": "b045de91",
|
| 348 |
+
"metadata": {},
|
| 349 |
+
"outputs": [],
|
| 350 |
+
"source": [
|
| 351 |
+
"from pydantic import BaseModel, Field\n",
|
| 352 |
+
"\n",
|
| 353 |
+
"class TavilyQuerySet(BaseModel):\n",
|
| 354 |
+
" query1: tuple[str, str] = Field(\n",
|
| 355 |
+
" ...,\n",
|
| 356 |
+
" description=\"DSL for Recent Developments + 1‑sentence rationale\",\n",
|
| 357 |
+
" )\n",
|
| 358 |
+
" query2: tuple[str, str] = Field(\n",
|
| 359 |
+
" ...,\n",
|
| 360 |
+
" description=\"DSL for Recent News + rationale\",\n",
|
| 361 |
+
" )\n",
|
| 362 |
+
" query3: tuple[str, str]\n",
|
| 363 |
+
" query4: tuple[str, str]\n",
|
| 364 |
+
" query5: tuple[str, str]"
|
| 365 |
+
]
|
| 366 |
+
},
|
| 367 |
+
{
|
| 368 |
+
"cell_type": "code",
|
| 369 |
+
"execution_count": 42,
|
| 370 |
+
"id": "eda95e9a",
|
| 371 |
+
"metadata": {},
|
| 372 |
+
"outputs": [],
|
| 373 |
+
"source": [
|
| 374 |
+
"from langchain.output_parsers import PydanticOutputParser\n",
|
| 375 |
+
"parser = PydanticOutputParser(pydantic_object=TavilyQuerySet)\n",
|
| 376 |
+
"\n",
|
| 377 |
+
"messages = SystemMessage(content=f\"\"\"\n",
|
| 378 |
+
" You are a Tavily Search Query specialist. Follow the JSON schema below exactly:\n",
|
| 379 |
+
" {parser.get_format_instructions()}\n",
|
| 380 |
+
"\n",
|
| 381 |
+
" \n",
|
| 382 |
+
" Rules:\n",
|
| 383 |
+
" 1. Generate Tavily DSL only (no natural language outside the JSON).\n",
|
| 384 |
+
" 2. Map the job description into five categories:\n",
|
| 385 |
+
" • query1: recent developments\n",
|
| 386 |
+
" • query2: recent news\n",
|
| 387 |
+
" • query3:company profile\n",
|
| 388 |
+
" • query4: key customers & partners\n",
|
| 389 |
+
" • query5: culture & values\n",
|
| 390 |
+
" 3. Each value is a two‑element list:\n",
|
| 391 |
+
" [<query string>, <one‑sentence rationale>]\n",
|
| 392 |
+
" 4. Use filters (source:, date:[now-30d TO now], site:…, etc.) where helpful.\n",
|
| 393 |
+
" 5. If information is missing in the JD, fall back sensibly\n",
|
| 394 |
+
" (e.g. search for “employee testimonials”).\n",
|
| 395 |
+
" 6. Return **only** valid JSON.\n",
|
| 396 |
+
" \"\"\")"
|
| 397 |
+
]
|
| 398 |
+
},
|
| 399 |
+
{
|
| 400 |
+
"cell_type": "code",
|
| 401 |
+
"execution_count": 53,
|
| 402 |
+
"id": "9738103e",
|
| 403 |
+
"metadata": {},
|
| 404 |
+
"outputs": [
|
| 405 |
+
{
|
| 406 |
+
"data": {
|
| 407 |
+
"text/plain": [
|
| 408 |
+
"'The output should be formatted as a JSON instance that conforms to the JSON schema below.\\n\\nAs an example, for the schema {\"properties\": {\"foo\": {\"title\": \"Foo\", \"description\": \"a list of strings\", \"type\": \"array\", \"items\": {\"type\": \"string\"}}}, \"required\": [\"foo\"]}\\nthe object {\"foo\": [\"bar\", \"baz\"]} is a well-formatted instance of the schema. The object {\"properties\": {\"foo\": [\"bar\", \"baz\"]}} is not well-formatted.\\n\\nHere is the output schema:\\n```\\n{\"properties\": {\"query1\": {\"description\": \"DSL for Recent Developments + 1‑sentence rationale\", \"maxItems\": 2, \"minItems\": 2, \"prefixItems\": [{\"type\": \"string\"}, {\"type\": \"string\"}], \"title\": \"Query1\", \"type\": \"array\"}, \"query2\": {\"description\": \"DSL for Recent News + rationale\", \"maxItems\": 2, \"minItems\": 2, \"prefixItems\": [{\"type\": \"string\"}, {\"type\": \"string\"}], \"title\": \"Query2\", \"type\": \"array\"}, \"query3\": {\"maxItems\": 2, \"minItems\": 2, \"prefixItems\": [{\"type\": \"string\"}, {\"type\": \"string\"}], \"title\": \"Query3\", \"type\": \"array\"}, \"query4\": {\"maxItems\": 2, \"minItems\": 2, \"prefixItems\": [{\"type\": \"string\"}, {\"type\": \"string\"}], \"title\": \"Query4\", \"type\": \"array\"}, \"query5\": {\"maxItems\": 2, \"minItems\": 2, \"prefixItems\": [{\"type\": \"string\"}, {\"type\": \"string\"}], \"title\": \"Query5\", \"type\": \"array\"}}, \"required\": [\"query1\", \"query2\", \"query3\", \"query4\", \"query5\"]}\\n```'"
|
| 409 |
+
]
|
| 410 |
+
},
|
| 411 |
+
"execution_count": 53,
|
| 412 |
+
"metadata": {},
|
| 413 |
+
"output_type": "execute_result"
|
| 414 |
+
}
|
| 415 |
+
],
|
| 416 |
+
"source": [
|
| 417 |
+
"parser.get_format_instructions()"
|
| 418 |
+
]
|
| 419 |
+
},
|
| 420 |
+
{
|
| 421 |
+
"cell_type": "code",
|
| 422 |
+
"execution_count": 52,
|
| 423 |
+
"id": "c3174432",
|
| 424 |
+
"metadata": {},
|
| 425 |
+
"outputs": [
|
| 426 |
+
{
|
| 427 |
+
"data": {
|
| 428 |
+
"text/plain": [
|
| 429 |
+
"{'properties': {'query1': {'description': 'DSL for Recent Developments + 1‑sentence rationale',\n",
|
| 430 |
+
" 'maxItems': 2,\n",
|
| 431 |
+
" 'minItems': 2,\n",
|
| 432 |
+
" 'prefixItems': [{'type': 'string'}, {'type': 'string'}],\n",
|
| 433 |
+
" 'title': 'Query1',\n",
|
| 434 |
+
" 'type': 'array'},\n",
|
| 435 |
+
" 'query2': {'description': 'DSL for Recent News + rationale',\n",
|
| 436 |
+
" 'maxItems': 2,\n",
|
| 437 |
+
" 'minItems': 2,\n",
|
| 438 |
+
" 'prefixItems': [{'type': 'string'}, {'type': 'string'}],\n",
|
| 439 |
+
" 'title': 'Query2',\n",
|
| 440 |
+
" 'type': 'array'},\n",
|
| 441 |
+
" 'query3': {'maxItems': 2,\n",
|
| 442 |
+
" 'minItems': 2,\n",
|
| 443 |
+
" 'prefixItems': [{'type': 'string'}, {'type': 'string'}],\n",
|
| 444 |
+
" 'title': 'Query3',\n",
|
| 445 |
+
" 'type': 'array'},\n",
|
| 446 |
+
" 'query4': {'maxItems': 2,\n",
|
| 447 |
+
" 'minItems': 2,\n",
|
| 448 |
+
" 'prefixItems': [{'type': 'string'}, {'type': 'string'}],\n",
|
| 449 |
+
" 'title': 'Query4',\n",
|
| 450 |
+
" 'type': 'array'},\n",
|
| 451 |
+
" 'query5': {'maxItems': 2,\n",
|
| 452 |
+
" 'minItems': 2,\n",
|
| 453 |
+
" 'prefixItems': [{'type': 'string'}, {'type': 'string'}],\n",
|
| 454 |
+
" 'title': 'Query5',\n",
|
| 455 |
+
" 'type': 'array'}},\n",
|
| 456 |
+
" 'required': ['query1', 'query2', 'query3', 'query4', 'query5'],\n",
|
| 457 |
+
" 'title': 'TavilyQuerySet',\n",
|
| 458 |
+
" 'type': 'object'}"
|
| 459 |
+
]
|
| 460 |
+
},
|
| 461 |
+
"execution_count": 52,
|
| 462 |
+
"metadata": {},
|
| 463 |
+
"output_type": "execute_result"
|
| 464 |
+
}
|
| 465 |
+
],
|
| 466 |
+
"source": [
|
| 467 |
+
"TavilyQuerySet.model_json_schema()"
|
| 468 |
+
]
|
| 469 |
+
},
|
| 470 |
+
{
|
| 471 |
+
"cell_type": "code",
|
| 472 |
+
"execution_count": 44,
|
| 473 |
+
"id": "5884df35",
|
| 474 |
+
"metadata": {},
|
| 475 |
+
"outputs": [
|
| 476 |
+
{
|
| 477 |
+
"name": "stdout",
|
| 478 |
+
"output_type": "stream",
|
| 479 |
+
"text": [
|
| 480 |
+
"================================\u001b[1m System Message \u001b[0m================================\n",
|
| 481 |
+
"\n",
|
| 482 |
+
"\n",
|
| 483 |
+
" You are a Tavily Search Query specialist. Follow the JSON schema below exactly:\n",
|
| 484 |
+
" The output should be formatted as a JSON instance that conforms to the JSON schema below.\n",
|
| 485 |
+
"\n",
|
| 486 |
+
"As an example, for the schema {\"properties\": {\"foo\": {\"title\": \"Foo\", \"description\": \"a list of strings\", \"type\": \"array\", \"items\": {\"type\": \"string\"}}}, \"required\": [\"foo\"]}\n",
|
| 487 |
+
"the object {\"foo\": [\"bar\", \"baz\"]} is a well-formatted instance of the schema. The object {\"properties\": {\"foo\": [\"bar\", \"baz\"]}} is not well-formatted.\n",
|
| 488 |
+
"\n",
|
| 489 |
+
"Here is the output schema:\n",
|
| 490 |
+
"```\n",
|
| 491 |
+
"{\"properties\": {\"query1\": {\"description\": \"DSL for Recent Developments + 1‑sentence rationale\", \"maxItems\": 2, \"minItems\": 2, \"prefixItems\": [{\"type\": \"string\"}, {\"type\": \"string\"}], \"title\": \"Query1\", \"type\": \"array\"}, \"query2\": {\"description\": \"DSL for Recent News + rationale\", \"maxItems\": 2, \"minItems\": 2, \"prefixItems\": [{\"type\": \"string\"}, {\"type\": \"string\"}], \"title\": \"Query2\", \"type\": \"array\"}, \"query3\": {\"maxItems\": 2, \"minItems\": 2, \"prefixItems\": [{\"type\": \"string\"}, {\"type\": \"string\"}], \"title\": \"Query3\", \"type\": \"array\"}, \"query4\": {\"maxItems\": 2, \"minItems\": 2, \"prefixItems\": [{\"type\": \"string\"}, {\"type\": \"string\"}], \"title\": \"Query4\", \"type\": \"array\"}, \"query5\": {\"maxItems\": 2, \"minItems\": 2, \"prefixItems\": [{\"type\": \"string\"}, {\"type\": \"string\"}], \"title\": \"Query5\", \"type\": \"array\"}}, \"required\": [\"query1\", \"query2\", \"query3\", \"query4\", \"query5\"]}\n",
|
| 492 |
+
"```\n",
|
| 493 |
+
"\n",
|
| 494 |
+
"\n",
|
| 495 |
+
" Rules:\n",
|
| 496 |
+
" 1. Generate Tavily DSL only (no natural language outside the JSON).\n",
|
| 497 |
+
" 2. Map the job description into five categories:\n",
|
| 498 |
+
" • query1: recent developments\n",
|
| 499 |
+
" • query2: recent news\n",
|
| 500 |
+
" • query3:company profile\n",
|
| 501 |
+
" • query4: key customers & partners\n",
|
| 502 |
+
" • query5: culture & values\n",
|
| 503 |
+
" 3. Each value is a two‑element list:\n",
|
| 504 |
+
" [<query string>, <one‑sentence rationale>]\n",
|
| 505 |
+
" 4. Use filters (source:, date:[now-30d TO now], site:…, etc.) where helpful.\n",
|
| 506 |
+
" 5. If information is missing in the JD, fall back sensibly\n",
|
| 507 |
+
" (e.g. search for “employee testimonials”).\n",
|
| 508 |
+
" 6. Return **only** valid JSON.\n",
|
| 509 |
+
" \n"
|
| 510 |
+
]
|
| 511 |
+
}
|
| 512 |
+
],
|
| 513 |
+
"source": [
|
| 514 |
+
"messages.pretty_print()"
|
| 515 |
+
]
|
| 516 |
+
},
|
| 517 |
+
{
|
| 518 |
+
"cell_type": "code",
|
| 519 |
+
"execution_count": 46,
|
| 520 |
+
"id": "d2c3cc8b",
|
| 521 |
+
"metadata": {},
|
| 522 |
+
"outputs": [],
|
| 523 |
+
"source": [
|
| 524 |
+
"x = \"\"\"properties\": {\"query1\": [{\"query\": \"Shalin Mehta AND \\\"Computational Microscopy Platform\\\"\", \"rationale\": \"Recent developments within the company\"}, {\"query\": \"Shalin Mehta AND \\\"Biohub SF\\\"\", \"rationale\": \"Recent developments within the company\"}], \"query2\": [{\"query\": \"Chan Zuckerberg Biohub - San Francisco AND recent news\", \"rationale\": \"Recent news about the company\"}, {\"query\": \"COVID-19 AND Chan Zuckerberg Biohub - San Francisco\", \"rationale\": \"Recent news about the company\"}], \"query3\": [{\"query\": \"Shalin Mehta AND \\\"role: Software Engineer\\\"\", \"rationale\": \"Information about the company that relates to the role\"}, {\"query\": \"Chan Zuckerberg Biohub - San Francisco AND \\\"team: Bioengineering\\\"\", \"rationale\": \"Information about the company that relates to the role\"}], \"query4\": [{\"query\": \"key customers: Chan Zuckerberg Biohub\", \"rationale\": \"Key customers & partners\"}, {\"query\": \"partners: Chan Zuckerberg Biohub SF\", \"rationale\": \"Key customers & partners\"}], \"query5\": [{\"query\": \"company culture: Chan Zuckerberg Biohub\", \"rationale\": \"Culture & values of the company\"}, {\"query\": \"values: Chan Zuckerberg Biohub\", \"rationale\": \"Culture & values of the company\"}]}, \"required\": [\"query1\", \"query2\", \"query3\", \"query4\", \"query5\"]\"\"\""
|
| 525 |
+
]
|
| 526 |
+
},
|
| 527 |
+
{
|
| 528 |
+
"cell_type": "code",
|
| 529 |
+
"execution_count": 49,
|
| 530 |
+
"id": "7d8508a4",
|
| 531 |
+
"metadata": {},
|
| 532 |
+
"outputs": [
|
| 533 |
+
{
|
| 534 |
+
"name": "stdout",
|
| 535 |
+
"output_type": "stream",
|
| 536 |
+
"text": [
|
| 537 |
+
"properties\": {\"query1\": [{\"query\": \"Shalin Mehta AND \"Computational Microscopy Platform\"\", \"rationale\": \"Recent developments within the company\"}, {\"query\": \"Shalin Mehta AND \"Biohub SF\"\", \"rationale\": \"Recent developments within the company\"}], \"query2\": [{\"query\": \"Chan Zuckerberg Biohub - San Francisco AND recent news\", \"rationale\": \"Recent news about the company\"}, {\"query\": \"COVID-19 AND Chan Zuckerberg Biohub - San Francisco\", \"rationale\": \"Recent news about the company\"}], \"query3\": [{\"query\": \"Shalin Mehta AND \"role: Software Engineer\"\", \"rationale\": \"Information about the company that relates to the role\"}, {\"query\": \"Chan Zuckerberg Biohub - San Francisco AND \"team: Bioengineering\"\", \"rationale\": \"Information about the company that relates to the role\"}], \"query4\": [{\"query\": \"key customers: Chan Zuckerberg Biohub\", \"rationale\": \"Key customers & partners\"}, {\"query\": \"partners: Chan Zuckerberg Biohub SF\", \"rationale\": \"Key customers & partners\"}], \"query5\": [{\"query\": \"company culture: Chan Zuckerberg Biohub\", \"rationale\": \"Culture & values of the company\"}, {\"query\": \"values: Chan Zuckerberg Biohub\", \"rationale\": \"Culture & values of the company\"}]}, \"required\": [\"query1\", \"query2\", \"query3\", \"query4\", \"query5\"]\n"
|
| 538 |
+
]
|
| 539 |
+
}
|
| 540 |
+
],
|
| 541 |
+
"source": [
|
| 542 |
+
"print(x)"
|
| 543 |
+
]
|
| 544 |
+
},
|
| 545 |
+
{
|
| 546 |
+
"cell_type": "code",
|
| 547 |
+
"execution_count": 54,
|
| 548 |
+
"id": "1fab5ee9",
|
| 549 |
+
"metadata": {},
|
| 550 |
+
"outputs": [],
|
| 551 |
+
"source": [
|
| 552 |
+
"from langchain_core.prompts import (\n",
|
| 553 |
+
" PromptTemplate,\n",
|
| 554 |
+
")"
|
| 555 |
+
]
|
| 556 |
+
},
|
| 557 |
+
{
|
| 558 |
+
"cell_type": "code",
|
| 559 |
+
"execution_count": null,
|
| 560 |
+
"id": "e93695ff",
|
| 561 |
+
"metadata": {},
|
| 562 |
+
"outputs": [],
|
| 563 |
+
"source": [
|
| 564 |
+
"prompt = PromptTemplate.from_template(\"Below is the required job description and resume: {background_information}\", input_variables=[\"background_information\"])"
|
| 565 |
+
]
|
| 566 |
+
},
|
| 567 |
+
{
|
| 568 |
+
"cell_type": "code",
|
| 569 |
+
"execution_count": 55,
|
| 570 |
+
"id": "f5330010",
|
| 571 |
+
"metadata": {},
|
| 572 |
+
"outputs": [],
|
| 573 |
+
"source": [
|
| 574 |
+
"x = ('query1', ('recent developments within the company', 'The Associate Software engineer will build open source software tools for managing and processing 10-100 terabyte-scale datasets.'))"
|
| 575 |
+
]
|
| 576 |
+
},
|
| 577 |
+
{
|
| 578 |
+
"cell_type": "code",
|
| 579 |
+
"execution_count": 61,
|
| 580 |
+
"id": "5753afd4",
|
| 581 |
+
"metadata": {},
|
| 582 |
+
"outputs": [],
|
| 583 |
+
"source": [
|
| 584 |
+
"keys = ('q', ('y', 'z'))\n",
|
| 585 |
+
"\n",
|
| 586 |
+
"dict_x = dict(zip(keys, x))"
|
| 587 |
+
]
|
| 588 |
+
},
|
| 589 |
+
{
|
| 590 |
+
"cell_type": "code",
|
| 591 |
+
"execution_count": 63,
|
| 592 |
+
"id": "06d50119",
|
| 593 |
+
"metadata": {},
|
| 594 |
+
"outputs": [
|
| 595 |
+
{
|
| 596 |
+
"data": {
|
| 597 |
+
"text/plain": [
|
| 598 |
+
"('recent developments within the company',\n",
|
| 599 |
+
" 'The Associate Software engineer will build open source software tools for managing and processing 10-100 terabyte-scale datasets.')"
|
| 600 |
+
]
|
| 601 |
+
},
|
| 602 |
+
"execution_count": 63,
|
| 603 |
+
"metadata": {},
|
| 604 |
+
"output_type": "execute_result"
|
| 605 |
+
}
|
| 606 |
+
],
|
| 607 |
+
"source": [
|
| 608 |
+
"dict_x[('y', 'z')]"
|
| 609 |
+
]
|
| 610 |
+
},
|
| 611 |
+
{
|
| 612 |
+
"cell_type": "code",
|
| 613 |
+
"execution_count": null,
|
| 614 |
+
"id": "f03d758e",
|
| 615 |
+
"metadata": {},
|
| 616 |
+
"outputs": [],
|
| 617 |
+
"source": [
|
| 618 |
+
"from langchain.output_parsers import PydanticOutputParser, OutputFixingParser, RetryOutputParser\n",
|
| 619 |
+
"base_parser = PydanticOutputParser(pydantic_object=TavilyQuerySet)\n",
|
| 620 |
+
"\n"
|
| 621 |
+
]
|
| 622 |
+
},
|
| 623 |
+
{
|
| 624 |
+
"cell_type": "code",
|
| 625 |
+
"execution_count": 1,
|
| 626 |
+
"id": "d8dd9c74",
|
| 627 |
+
"metadata": {},
|
| 628 |
+
"outputs": [
|
| 629 |
+
{
|
| 630 |
+
"ename": "NameError",
|
| 631 |
+
"evalue": "name 'parser' is not defined",
|
| 632 |
+
"output_type": "error",
|
| 633 |
+
"traceback": [
|
| 634 |
+
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
| 635 |
+
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
|
| 636 |
+
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m format_instructions = \u001b[43mparser\u001b[49m.get_format_instructions()\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[34;01mollama\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m chat\n\u001b[32m 5\u001b[39m tavily_role_messages = SystemMessage(content=\n\u001b[32m 6\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\"\"\u001b[39m\n\u001b[32m 7\u001b[39m \u001b[33m When you reply, output **only** valid JSON that can be parsed\u001b[39m\n\u001b[32m (...)\u001b[39m\u001b[32m 31\u001b[39m \u001b[33m 5. Return **only** valid JSON that matches the schema exactly. No other fields\u001b[39m\n\u001b[32m 32\u001b[39m \u001b[33m \u001b[39m\u001b[33m\"\"\"\u001b[39m)\n",
|
| 637 |
+
"\u001b[31mNameError\u001b[39m: name 'parser' is not defined"
|
| 638 |
+
]
|
| 639 |
+
}
|
| 640 |
+
],
|
| 641 |
+
"source": [
|
| 642 |
+
"format_instructions = parser.get_format_instructions()\n",
|
| 643 |
+
"from ollama import chat\n",
|
| 644 |
+
"\n",
|
| 645 |
+
"\n",
|
| 646 |
+
"tavily_role_messages = SystemMessage(content=\n",
|
| 647 |
+
" f\"\"\"\n",
|
| 648 |
+
" When you reply, output **only** valid JSON that can be parsed\n",
|
| 649 |
+
" into the Pydantic model shown below. Do **not** wrap it in \"properties\"\n",
|
| 650 |
+
" or \"required\".:\n",
|
| 651 |
+
" \n",
|
| 652 |
+
" ------------------------------------------------\n",
|
| 653 |
+
"\n",
|
| 654 |
+
"\n",
|
| 655 |
+
" {format_instructions}\n",
|
| 656 |
+
"\n",
|
| 657 |
+
" \n",
|
| 658 |
+
" -------------------------------------------------\n",
|
| 659 |
+
"\n",
|
| 660 |
+
" Rules:\n",
|
| 661 |
+
" 1. Generate Tavily DSL only (no natural language outside the JSON).\n",
|
| 662 |
+
" 2. Map the job description into five categories:\n",
|
| 663 |
+
" • query1: recent developments within the company\n",
|
| 664 |
+
" • query2: recent news about the company\n",
|
| 665 |
+
" • query3: information about the company that relates to the role\n",
|
| 666 |
+
" • query4: key customers & partners\n",
|
| 667 |
+
" • query5: culture & values of the company\n",
|
| 668 |
+
" 3. Each value is a two‑element list:\n",
|
| 669 |
+
" [<query string>, <one‑sentence rationale>]\n",
|
| 670 |
+
" 4. If information is missing in the JD, fall back sensibly\n",
|
| 671 |
+
" (e.g. search for “employee testimonials”).\n",
|
| 672 |
+
" 5. Return **only** valid JSON that matches the schema exactly. No other fields\n",
|
| 673 |
+
" \"\"\")\n",
|
| 674 |
+
"\n",
|
| 675 |
+
"\n",
|
| 676 |
+
"response = chat(\n",
|
| 677 |
+
" messages=[{\n",
|
| 678 |
+
" tavily_role_messages,\n",
|
| 679 |
+
" input_message}\n",
|
| 680 |
+
" ],\n",
|
| 681 |
+
" model='llama3.2:latest',\n",
|
| 682 |
+
" format=TavilyQuerySet.model_json_schema(),\n",
|
| 683 |
+
" )"
|
| 684 |
+
]
|
| 685 |
+
},
|
| 686 |
+
{
|
| 687 |
+
"cell_type": "code",
|
| 688 |
+
"execution_count": 2,
|
| 689 |
+
"id": "8deb0abd",
|
| 690 |
+
"metadata": {},
|
| 691 |
+
"outputs": [],
|
| 692 |
+
"source": [
|
| 693 |
+
"p = ('query1', ['Recent developments within the company using computational microscopy platform', 'This project will require working on microscopes in a BSL-2 imaging laboratory'])"
|
| 694 |
+
]
|
| 695 |
+
},
|
| 696 |
+
{
|
| 697 |
+
"cell_type": "code",
|
| 698 |
+
"execution_count": 3,
|
| 699 |
+
"id": "d2fcab19",
|
| 700 |
+
"metadata": {},
|
| 701 |
+
"outputs": [
|
| 702 |
+
{
|
| 703 |
+
"data": {
|
| 704 |
+
"text/plain": [
|
| 705 |
+
"'Recent developments within the company using computational microscopy platform'"
|
| 706 |
+
]
|
| 707 |
+
},
|
| 708 |
+
"execution_count": 3,
|
| 709 |
+
"metadata": {},
|
| 710 |
+
"output_type": "execute_result"
|
| 711 |
+
}
|
| 712 |
+
],
|
| 713 |
+
"source": [
|
| 714 |
+
"p[1][0]"
|
| 715 |
+
]
|
| 716 |
+
},
|
| 717 |
+
{
|
| 718 |
+
"cell_type": "code",
|
| 719 |
+
"execution_count": 6,
|
| 720 |
+
"id": "55e3f46a",
|
| 721 |
+
"metadata": {},
|
| 722 |
+
"outputs": [],
|
| 723 |
+
"source": [
|
| 724 |
+
"COVER_LETTER_PROMPT = SystemMessage(content=\"\"\"You are my dedicated assistant for writing job application content, including cover letters, LinkedIn outreach messages, and responses to job-specific questions (e.g., experience, culture fit, or motivation).\n",
|
| 725 |
+
"\n",
|
| 726 |
+
"Your goal is to generate content that:\n",
|
| 727 |
+
"1. Reflects **my personality**, tone, and authentic voice, based on examples I provide.\n",
|
| 728 |
+
"2. Matches **my knowledge, experience, and interests**, which I’ll also share or update as needed.\n",
|
| 729 |
+
"3. Adopts **my writing style and energy** (e.g., grounded, confident, thoughtful—but not overly polished or generic).\n",
|
| 730 |
+
"4. Embeds **genuine enthusiasm or alignment** with the company or role, without sounding performative.\n",
|
| 731 |
+
"5. Avoids filler, clichés, or overused corporate phrases—keep it **authentic and specific**.\n",
|
| 732 |
+
"6. Learns over time by asking me relevant clarifying questions when needed (e.g., change in tone, new experience, updates to goals).\n",
|
| 733 |
+
"7. Balances job description alignment with personal storytelling, roughly in a 75:25 ratio.\n",
|
| 734 |
+
"8. Keeps outputs **concise** and within any given word or character limits.\"\"\")\n"
|
| 735 |
+
]
|
| 736 |
+
},
|
| 737 |
+
{
|
| 738 |
+
"cell_type": "code",
|
| 739 |
+
"execution_count": 7,
|
| 740 |
+
"id": "ea061e0e",
|
| 741 |
+
"metadata": {},
|
| 742 |
+
"outputs": [],
|
| 743 |
+
"source": [
|
| 744 |
+
"from langchain_core.prompts import (\n",
|
| 745 |
+
" ChatPromptTemplate,\n",
|
| 746 |
+
" HumanMessagePromptTemplate,\n",
|
| 747 |
+
" SystemMessagePromptTemplate,\n",
|
| 748 |
+
")\n",
|
| 749 |
+
"from langchain_core.messages import (\n",
|
| 750 |
+
" AIMessage,\n",
|
| 751 |
+
" HumanMessage,\n",
|
| 752 |
+
" SystemMessage,\n",
|
| 753 |
+
")\n",
|
| 754 |
+
"\n",
|
| 755 |
+
"FirstDraftGenerationPromptTemplate = ChatPromptTemplate.from_messages([COVER_LETTER_PROMPT])"
|
| 756 |
+
]
|
| 757 |
+
},
|
| 758 |
+
{
|
| 759 |
+
"cell_type": "code",
|
| 760 |
+
"execution_count": 8,
|
| 761 |
+
"id": "b96cbe64",
|
| 762 |
+
"metadata": {},
|
| 763 |
+
"outputs": [
|
| 764 |
+
{
|
| 765 |
+
"data": {
|
| 766 |
+
"text/plain": [
|
| 767 |
+
"ChatPromptTemplate(input_variables=[], input_types={}, partial_variables={}, messages=[SystemMessage(content='You are my dedicated assistant for writing job application content, including cover letters, LinkedIn outreach messages, and responses to job-specific questions (e.g., experience, culture fit, or motivation).\\n\\nYour goal is to generate content that:\\n1. Reflects **my personality**, tone, and authentic voice, based on examples I provide.\\n2. Matches **my knowledge, experience, and interests**, which I’ll also share or update as needed.\\n3. Adopts **my writing style and energy** (e.g., grounded, confident, thoughtful—but not overly polished or generic).\\n4. Embeds **genuine enthusiasm or alignment** with the company or role, without sounding performative.\\n5. Avoids filler, clichés, or overused corporate phrases—keep it **authentic and specific**.\\n6. Learns over time by asking me relevant clarifying questions when needed (e.g., change in tone, new experience, updates to goals).\\n7. Balances job description alignment with personal storytelling, roughly in a 75:25 ratio.\\n8. Keeps outputs **concise** and within any given word or character limits.', additional_kwargs={}, response_metadata={})])"
|
| 768 |
+
]
|
| 769 |
+
},
|
| 770 |
+
"execution_count": 8,
|
| 771 |
+
"metadata": {},
|
| 772 |
+
"output_type": "execute_result"
|
| 773 |
+
}
|
| 774 |
+
],
|
| 775 |
+
"source": [
|
| 776 |
+
"FirstDraftGenerationPromptTemplate"
|
| 777 |
+
]
|
| 778 |
+
},
|
| 779 |
+
{
|
| 780 |
+
"cell_type": "code",
|
| 781 |
+
"execution_count": null,
|
| 782 |
+
"id": "dfd03f8d",
|
| 783 |
+
"metadata": {},
|
| 784 |
+
"outputs": [],
|
| 785 |
+
"source": [
|
| 786 |
+
"current_application_session = \"Heello World\"\n",
|
| 787 |
+
"company_research_data = \"Company Research Data\""
|
| 788 |
+
]
|
| 789 |
+
},
|
| 790 |
+
{
|
| 791 |
+
"cell_type": "code",
|
| 792 |
+
"execution_count": 10,
|
| 793 |
+
"id": "c5fef665",
|
| 794 |
+
"metadata": {},
|
| 795 |
+
"outputs": [],
|
| 796 |
+
"source": [
|
| 797 |
+
"CurrentSessionContextMessage = HumanMessagePromptTemplate.from_template(\n",
|
| 798 |
+
" \"\"\"\n",
|
| 799 |
+
" # Resume and Job Description\n",
|
| 800 |
+
" {current_job_role}\n",
|
| 801 |
+
"\n",
|
| 802 |
+
" # Company Information\n",
|
| 803 |
+
" {company_research_data}\n",
|
| 804 |
+
"\n",
|
| 805 |
+
" Create a cover letter that highlights the match between my qualifications and the job requirements.\n",
|
| 806 |
+
" \"\"\",\n",
|
| 807 |
+
" input_variables=[\"current_job_role\",\n",
|
| 808 |
+
" \"company_research_data\"])"
|
| 809 |
+
]
|
| 810 |
+
},
|
| 811 |
+
{
|
| 812 |
+
"cell_type": "code",
|
| 813 |
+
"execution_count": 17,
|
| 814 |
+
"id": "c89ba644",
|
| 815 |
+
"metadata": {},
|
| 816 |
+
"outputs": [],
|
| 817 |
+
"source": [
|
| 818 |
+
"FirstDraftGenerationPromptTemplate.append(CurrentSessionContextMessage)"
|
| 819 |
+
]
|
| 820 |
+
},
|
| 821 |
+
{
|
| 822 |
+
"cell_type": "code",
|
| 823 |
+
"execution_count": 18,
|
| 824 |
+
"id": "6997c553",
|
| 825 |
+
"metadata": {},
|
| 826 |
+
"outputs": [],
|
| 827 |
+
"source": [
|
| 828 |
+
"chain = (\n",
|
| 829 |
+
" ({\"current_job_role\": lambda x: x[\"current_job_role\"],\n",
|
| 830 |
+
" \"company_research_data\": lambda x: x[\"company_research_data\"]})\n",
|
| 831 |
+
" | FirstDraftGenerationPromptTemplate\n",
|
| 832 |
+
" )"
|
| 833 |
+
]
|
| 834 |
+
},
|
| 835 |
+
{
|
| 836 |
+
"cell_type": "code",
|
| 837 |
+
"execution_count": 19,
|
| 838 |
+
"id": "55f51dbf",
|
| 839 |
+
"metadata": {},
|
| 840 |
+
"outputs": [
|
| 841 |
+
{
|
| 842 |
+
"data": {
|
| 843 |
+
"text/plain": [
|
| 844 |
+
"{\n",
|
| 845 |
+
" current_job_role: RunnableLambda(...),\n",
|
| 846 |
+
" company_research_data: RunnableLambda(...)\n",
|
| 847 |
+
"}\n",
|
| 848 |
+
"| ChatPromptTemplate(input_variables=[], input_types={}, partial_variables={}, messages=[SystemMessage(content='You are my dedicated assistant for writing job application content, including cover letters, LinkedIn outreach messages, and responses to job-specific questions (e.g., experience, culture fit, or motivation).\\n\\nYour goal is to generate content that:\\n1. Reflects **my personality**, tone, and authentic voice, based on examples I provide.\\n2. Matches **my knowledge, experience, and interests**, which I’ll also share or update as needed.\\n3. Adopts **my writing style and energy** (e.g., grounded, confident, thoughtful—but not overly polished or generic).\\n4. Embeds **genuine enthusiasm or alignment** with the company or role, without sounding performative.\\n5. Avoids filler, clichés, or overused corporate phrases—keep it **authentic and specific**.\\n6. Learns over time by asking me relevant clarifying questions when needed (e.g., change in tone, new experience, updates to goals).\\n7. Balances job description alignment with personal storytelling, roughly in a 75:25 ratio.\\n8. Keeps outputs **concise** and within any given word or character limits.', additional_kwargs={}, response_metadata={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['company_research_data', 'current_job_role'], input_types={}, partial_variables={}, template='\\n # Resume and Job Description\\n {current_job_role}\\n\\n # Company Information\\n {company_research_data}\\n\\n Create a cover letter that highlights the match between my qualifications and the job requirements.\\n '), additional_kwargs={})])"
|
| 849 |
+
]
|
| 850 |
+
},
|
| 851 |
+
"execution_count": 19,
|
| 852 |
+
"metadata": {},
|
| 853 |
+
"output_type": "execute_result"
|
| 854 |
+
}
|
| 855 |
+
],
|
| 856 |
+
"source": [
|
| 857 |
+
"chain"
|
| 858 |
+
]
|
| 859 |
+
},
|
| 860 |
+
{
|
| 861 |
+
"cell_type": "code",
|
| 862 |
+
"execution_count": null,
|
| 863 |
+
"id": "48c54667",
|
| 864 |
+
"metadata": {},
|
| 865 |
+
"outputs": [
|
| 866 |
+
{
|
| 867 |
+
"ename": "ModuleNotFoundError",
|
| 868 |
+
"evalue": "No module named 'utils'",
|
| 869 |
+
"output_type": "error",
|
| 870 |
+
"traceback": [
|
| 871 |
+
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
| 872 |
+
"\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
|
| 873 |
+
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[25]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[34;01mutils\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mllm_client\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m LLMClient\n\u001b[32m 3\u001b[39m LLM = LLMClient()\n\u001b[32m 4\u001b[39m llm = LLMClient().get_llm()\n",
|
| 874 |
+
"\u001b[31mModuleNotFoundError\u001b[39m: No module named 'utils'"
|
| 875 |
+
]
|
| 876 |
+
}
|
| 877 |
+
],
|
| 878 |
+
"source": [
|
| 879 |
+
"from job_writer.utils.llm_client import LLMClient\n",
|
| 880 |
+
"\n",
|
| 881 |
+
"LLM = LLMClient()\n",
|
| 882 |
+
"llm = LLMClient().get_llm()"
|
| 883 |
+
]
|
| 884 |
+
},
|
| 885 |
+
{
|
| 886 |
+
"cell_type": "code",
|
| 887 |
+
"execution_count": null,
|
| 888 |
+
"id": "421df9ca",
|
| 889 |
+
"metadata": {},
|
| 890 |
+
"outputs": [],
|
| 891 |
+
"source": [
|
| 892 |
+
"from job_writer.tools.TavilySearch import search_company\n",
|
| 893 |
+
"\n",
|
| 894 |
+
"# Test job description\n",
|
| 895 |
+
"test_job = \"\"\"\n",
|
| 896 |
+
"Software Engineer - Backend\n",
|
| 897 |
+
"OpenAI\n",
|
| 898 |
+
"\n",
|
| 899 |
+
"We are looking for experienced backend engineers to join our team. Our ideal candidate will have experience with one or more of the following technologies: Python, Java, C++. \n",
|
| 900 |
+
"\n",
|
| 901 |
+
"Responsibilities:\n",
|
| 902 |
+
"- Design and implement scalable and efficient backend systems\n",
|
| 903 |
+
"- Write clean, maintainable code\n",
|
| 904 |
+
"- Work with cross-functional teams\n",
|
| 905 |
+
"\n",
|
| 906 |
+
"Requirements:\n",
|
| 907 |
+
"- Strong proficiency in one or more programming languages\n",
|
| 908 |
+
"- Strong understanding of software design patterns and principles\n",
|
| 909 |
+
"- Experience with distributed systems\n",
|
| 910 |
+
"\"\"\"\n",
|
| 911 |
+
"\n",
|
| 912 |
+
"# Test the search_company function\n",
|
| 913 |
+
"results = search_company(test_job)\n",
|
| 914 |
+
"for query_key, data in results.items():\n",
|
| 915 |
+
" print(f\"\\n{query_key}:\")\n",
|
| 916 |
+
" print(f\"Query: {data['query']}\")\n",
|
| 917 |
+
" print(f\"Rationale: {data['rationale']}\")\n",
|
| 918 |
+
" if data['results']:\n",
|
| 919 |
+
" print(f\"First result: {data['results'][0][:200]}...\")\n",
|
| 920 |
+
" else:\n",
|
| 921 |
+
" print(\"No results found\")\n"
|
| 922 |
+
]
|
| 923 |
+
},
|
| 924 |
+
{
|
| 925 |
+
"cell_type": "code",
|
| 926 |
+
"execution_count": 1,
|
| 927 |
+
"id": "18f12ff8",
|
| 928 |
+
"metadata": {},
|
| 929 |
+
"outputs": [],
|
| 930 |
+
"source": [
|
| 931 |
+
"from langchain_core.prompts import (\n",
|
| 932 |
+
" ChatPromptTemplate,\n",
|
| 933 |
+
" HumanMessagePromptTemplate,\n",
|
| 934 |
+
" SystemMessagePromptTemplate,\n",
|
| 935 |
+
")\n",
|
| 936 |
+
"from langchain_core.messages import (\n",
|
| 937 |
+
" AIMessage,\n",
|
| 938 |
+
" HumanMessage,\n",
|
| 939 |
+
" SystemMessage,\n",
|
| 940 |
+
")"
|
| 941 |
+
]
|
| 942 |
+
},
|
| 943 |
+
{
|
| 944 |
+
"cell_type": "code",
|
| 945 |
+
"execution_count": 2,
|
| 946 |
+
"id": "3ba77224",
|
| 947 |
+
"metadata": {},
|
| 948 |
+
"outputs": [],
|
| 949 |
+
"source": [
|
| 950 |
+
"from job_writer.prompts.templates import (\n",
|
| 951 |
+
" TAVILY_QUERY_PROMPT\n",
|
| 952 |
+
")"
|
| 953 |
+
]
|
| 954 |
+
},
|
| 955 |
+
{
|
| 956 |
+
"cell_type": "code",
|
| 957 |
+
"execution_count": 3,
|
| 958 |
+
"id": "50bb7c0c",
|
| 959 |
+
"metadata": {},
|
| 960 |
+
"outputs": [],
|
| 961 |
+
"source": [
|
| 962 |
+
"tavily_search_prompt = ChatPromptTemplate.from_messages([\n",
|
| 963 |
+
" SystemMessage(content=TAVILY_QUERY_PROMPT),\n",
|
| 964 |
+
" HumanMessage(\n",
|
| 965 |
+
" \"Below is the required job description and resume: {background_information}\",\n",
|
| 966 |
+
" input_variables=[\"background_information\"]\n",
|
| 967 |
+
" )\n",
|
| 968 |
+
"])"
|
| 969 |
+
]
|
| 970 |
+
},
|
| 971 |
+
{
|
| 972 |
+
"cell_type": "code",
|
| 973 |
+
"execution_count": 5,
|
| 974 |
+
"id": "372e6346",
|
| 975 |
+
"metadata": {},
|
| 976 |
+
"outputs": [],
|
| 977 |
+
"source": [
|
| 978 |
+
"job_description = \"\"\"\n",
|
| 979 |
+
"Software Engineer - Backend\n",
|
| 980 |
+
"OpenAI\n",
|
| 981 |
+
"\n",
|
| 982 |
+
"We are looking for experienced backend engineers to join our team. Our ideal candidate will have experience with one or more of the following technologies: Python, Java, C++. \n",
|
| 983 |
+
"\n",
|
| 984 |
+
"Responsibilities:\n",
|
| 985 |
+
"- Design and implement scalable and efficient backend systems\n",
|
| 986 |
+
"- Write clean, maintainable code\n",
|
| 987 |
+
"- Work with cross-functional teams\n",
|
| 988 |
+
"\n",
|
| 989 |
+
"Requirements:\n",
|
| 990 |
+
"- Strong proficiency in one or more programming languages\n",
|
| 991 |
+
"- Strong understanding of software design patterns and principles\n",
|
| 992 |
+
"- Experience with distributed systems\n",
|
| 993 |
+
"\"\"\""
|
| 994 |
+
]
|
| 995 |
+
},
|
| 996 |
+
{
|
| 997 |
+
"cell_type": "code",
|
| 998 |
+
"execution_count": 6,
|
| 999 |
+
"id": "3a27365f",
|
| 1000 |
+
"metadata": {},
|
| 1001 |
+
"outputs": [
|
| 1002 |
+
{
|
| 1003 |
+
"data": {
|
| 1004 |
+
"text/plain": [
|
| 1005 |
+
"'System: \\n<Background>\\nSINCE THE USER IS APPPLYING FOR A JOB, THE QUERIES SHOULD BE WRITTEN IN A WAY THAT RESULST IN RELEVANT INFORMATION ABOUT THE COMPANY. THIS WILL HELP THE USER WRITE A MORE PERSONALIZED AND RELEVANT APPLICATION.\\n\\nCategory mapping (remember this!):\\n query1 : recent developments\\n query2 : recent news\\n query3 : role-related info\\n query4 : key customers & partners \\n query5 : culture & values\\n\\nNote: The above are just categories. The queries should be written in a way that results in relevant information about the company. Must include the company name in the query to ensure results have a higher confidence.\\n</Background>\\n\\n<Instructions>\\n 1. Each array must contain **exactly two** strings: [search_query, one_sentence_rationale] \\n 2. If data is missing, craft a sensible fallback query; never return an empty array. \\n 3. If the employer name cannot be found, use `\"UNKNOWN\"`. \\n 4. Escape JSON only where required.\\n 5. Query cannot be repeated. It will lead to irrelevant results.\\n</Instructions>\\n\\n<EXAMPLE>\\n JSON->\\n \"query1\": (\"....\", \"...\")\\n \"query2\": (\"....\", \"...\")\\n \"query3\": (\"....\", \"...\")\\n \"query4\": (\"....\", \"...\")\\n \"query5\": (\"....\", \"...\")\\n</EXAMPLE>\\n \\nHuman: Below is the required job description and resume: {background_information}'"
|
| 1006 |
+
]
|
| 1007 |
+
},
|
| 1008 |
+
"execution_count": 6,
|
| 1009 |
+
"metadata": {},
|
| 1010 |
+
"output_type": "execute_result"
|
| 1011 |
+
}
|
| 1012 |
+
],
|
| 1013 |
+
"source": [
|
| 1014 |
+
"tavily_search_prompt.format(background_information=job_description)"
|
| 1015 |
+
]
|
| 1016 |
+
},
|
| 1017 |
+
{
|
| 1018 |
+
"cell_type": "code",
|
| 1019 |
+
"execution_count": 8,
|
| 1020 |
+
"id": "6b973991",
|
| 1021 |
+
"metadata": {},
|
| 1022 |
+
"outputs": [
|
| 1023 |
+
{
|
| 1024 |
+
"name": "stdout",
|
| 1025 |
+
"output_type": "stream",
|
| 1026 |
+
"text": [
|
| 1027 |
+
"Initializing LLM with model llama3.2:latest and provider ollama in c:\\users\\risha\\python-dir\\knowledgebase\\job_writer\\utils\\llm_client.py\n",
|
| 1028 |
+
"Initializing LLM with model llama3.2:latest and provider ollama in c:\\users\\risha\\python-dir\\knowledgebase\\job_writer\\utils\\llm_client.py\n"
|
| 1029 |
+
]
|
| 1030 |
+
}
|
| 1031 |
+
],
|
| 1032 |
+
"source": [
|
| 1033 |
+
"from job_writer.utils.llm_client import LLMClient\n",
|
| 1034 |
+
"\n",
|
| 1035 |
+
"LLM = LLMClient()\n",
|
| 1036 |
+
"llm = LLMClient().get_llm()"
|
| 1037 |
+
]
|
| 1038 |
+
},
|
| 1039 |
+
{
|
| 1040 |
+
"cell_type": "code",
|
| 1041 |
+
"execution_count": null,
|
| 1042 |
+
"id": "5ff5ac65",
|
| 1043 |
+
"metadata": {},
|
| 1044 |
+
"outputs": [],
|
| 1045 |
+
"source": []
|
| 1046 |
+
}
|
| 1047 |
+
],
|
| 1048 |
+
"metadata": {
|
| 1049 |
+
"kernelspec": {
|
| 1050 |
+
"display_name": "Python 3",
|
| 1051 |
+
"language": "python",
|
| 1052 |
+
"name": "python3"
|
| 1053 |
+
},
|
| 1054 |
+
"language_info": {
|
| 1055 |
+
"codemirror_mode": {
|
| 1056 |
+
"name": "ipython",
|
| 1057 |
+
"version": 3
|
| 1058 |
+
},
|
| 1059 |
+
"file_extension": ".py",
|
| 1060 |
+
"mimetype": "text/x-python",
|
| 1061 |
+
"name": "python",
|
| 1062 |
+
"nbconvert_exporter": "python",
|
| 1063 |
+
"pygments_lexer": "ipython3",
|
| 1064 |
+
"version": "3.12.10"
|
| 1065 |
+
}
|
| 1066 |
+
},
|
| 1067 |
+
"nbformat": 4,
|
| 1068 |
+
"nbformat_minor": 5
|
| 1069 |
+
}
|
tools/TavilySearch.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
import asyncio
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
from langchain_core.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
|
| 8 |
+
from langchain_core.prompt_values import PromptValue
|
| 9 |
+
from langchain_community.tools.tavily_search import TavilySearchResults
|
| 10 |
+
from langchain_community.tools import tool
|
| 11 |
+
from langchain.output_parsers import PydanticOutputParser, RetryOutputParser
|
| 12 |
+
from openevals.llm import create_async_llm_as_judge
|
| 13 |
+
from openevals.prompts import (
|
| 14 |
+
RAG_RETRIEVAL_RELEVANCE_PROMPT,
|
| 15 |
+
RAG_HELPFULNESS_PROMPT
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
from ..utils.llm_client import LLMClient
|
| 19 |
+
from ..agents.output_schema import TavilyQuerySet
|
| 20 |
+
from ..prompts.templates import TAVILY_QUERY_PROMPT
|
| 21 |
+
from ..classes.classes import ResearchState
|
| 22 |
+
|
# Module-level logger for this tool module.
logger = logging.getLogger(__name__)

# Shared LLM client (project singleton) used both for generating Tavily
# search queries and as the judge model for the evaluators below.
LLM = LLMClient()
llm_client = LLM.get_instance(model_name="ejschwar/llama3.2-better-prompts:latest", model_provider="ollama_llm")
llm_structured = llm_client.get_llm()

# LLM-as-judge evaluator scoring whether retrieved context is relevant to the
# input query (feedback key "retrieval_relevance"); awaited in relevance_filter.
relevance_evaluator = create_async_llm_as_judge(
    judge=llm_structured,
    prompt=RAG_RETRIEVAL_RELEVANCE_PROMPT,
    feedback_key="retrieval_relevance",
)

# LLM-as-judge evaluator for answer helpfulness; the prompt is extended so the
# judge emits a literal "true"/"false" verdict.
helpfulness_evaluator = create_async_llm_as_judge(
    judge=llm_structured,
    prompt=RAG_HELPFULNESS_PROMPT
    + '\nReturn "true" if the answer is helpful, and "false" otherwise.',
    feedback_key="helpfulness",
)
| 41 |
+
|
@tool
def search_company(job_description: str, company_name: str) -> tuple[dict, list]:
    """Gather information about a company to understand more about the role,
    recent developments, culture, and values of the company."""

    try:
        # Parser that retries malformed LLM output against the original prompt.
        base_parser = PydanticOutputParser(pydantic_object=TavilyQuerySet)
        parser = RetryOutputParser.from_llm(llm_structured, base_parser)

        # Prompt: system message carries the query-generation instructions,
        # human message carries the raw job description to parse.
        chat_prompt_tavily: ChatPromptTemplate = ChatPromptTemplate.from_messages([
            SystemMessagePromptTemplate.from_template(
                TAVILY_QUERY_PROMPT,
                input_variables=["company_name"]
            ),
            HumanMessagePromptTemplate.from_template(
                "Below is the required job description to parse:\n\n{job_description}",
                input_variables=["job_description"]
            )
        ])

        # PromptValue is required by RetryOutputParser.parse_with_prompt below.
        chat_prompt_value: PromptValue = chat_prompt_tavily.format_prompt(
            company_name=company_name,
            job_description=job_description
        )

        chat_prompt_tavily_messages = chat_prompt_tavily.format_messages(
            company_name=company_name,
            job_description=job_description
        )

        # Ask the LLM for the structured set of search queries.
        search_results_llm = llm_structured.invoke(chat_prompt_tavily_messages)

        try:
            parsed_query_set: TavilyQuerySet = parser.parse_with_prompt(search_results_llm.content, chat_prompt_value)
            logger.info("Parsed TavilyQuerySet: %s", parsed_query_set.model_dump_json(indent=2))
        except json.JSONDecodeError as e:
            logger.error("JSON decoding error while parsing LLM response: %s. LLM content was: %s", e, search_results_llm.content, exc_info=True)
            raise
        except Exception as e:  # Catches PydanticValidationErrors and other parsing issues
            logger.error("Error parsing TavilyQuerySet from LLM completion: %s. LLM content was: %s", e, search_results_llm.content, exc_info=True)
            raise

        # Initialize search with advanced parameters
        search = TavilySearchResults(max_results=4, search_depth="advanced")

        # Structure for storing queries, rationales, and Tavily results:
        # {'query1': {'query': ..., 'rationale': ..., 'results': [...]}, ...}
        company_research_data = {}
        attempted_queries = []
        query_attributes = [f"query{i}" for i in range(1, 6)]

        for attr_name in query_attributes:
            query_list = getattr(parsed_query_set, attr_name, None)
            if query_list and isinstance(query_list, list) and len(query_list) > 0:
                actual_query = query_list[0]
                rationale = query_list[1] if len(query_list) > 1 else "N/A"  # Handle if rationale is missing
                company_research_data[attr_name] = {
                    'query': actual_query,
                    'rationale': rationale,
                    'results': []
                }

        # Execute each query and store results.
        for query_key, query_info in company_research_data.items():
            # Bind the query text up-front: the 'query' key is removed after a
            # successful search, so the except-branch must not read it back
            # (the old code did, and raised KeyError inside the handler).
            query_text = query_info.get('query', '')
            try:
                if not isinstance(query_text, str) or not query_text.strip():
                    logger.warning("Skipping Tavily search for %s due to invalid/empty query: '%s'", query_key, query_text)
                    query_info['results'] = []
                    continue

                logger.info("Executing Tavily search for %s: '%s'", query_key, query_text)
                tavily_api_results = search.invoke({"args": {'query': query_text}, "type": "tool_call", "id": "job_search", "name": "tavily"})
                attempted_queries.append(query_text)
                # Drop the raw query from the per-key payload once executed.
                query_info.pop('query', None)

                if tavily_api_results and isinstance(tavily_api_results, list) and len(tavily_api_results) > 0:
                    query_info['results'] = [result['content'] for result in tavily_api_results if 'content' in result]
                else:
                    logger.info("No results or unexpected format from Tavily for %s.", query_key)
                    query_info['results'] = []
            except Exception as e:
                # Use query_text, not query_info['query'] — the key may be gone.
                logger.error("Error executing Tavily search for query %s ('%s'): %s", query_key, query_text, str(e), exc_info=True)
                query_info['results'] = []

        return company_research_data, attempted_queries

    except json.JSONDecodeError as e:
        logger.error("JSON decoding error: %s", e)
        raise
    except AttributeError as e:
        logger.error("Attribute error: %s", e)
        raise
    except Exception as e:
        logger.error("Unexpected error: %s", e)
        raise
| 155 |
+
|
async def relevance_filter(state: ResearchState) -> ResearchState:
    """Filter Tavily search results in ``state`` down to the relevant ones.

    Runs the LLM-as-judge ``relevance_evaluator`` over each query's result
    block (at most two evaluations concurrently), extends
    ``state["compiled_results"]`` with content judged relevant, and replaces
    ``state["company_research_data"]["tavily_search"]`` with the filtered
    list. On any error the incoming state is returned unchanged.
    """
    try:
        # Mark the current node for workflow tracing.
        state["current_node"] = "relevance_filter"

        # Guard: nothing to filter without company research data.
        if not state.get("company_research_data"):
            print("ERROR: company_research_data not found in state")
            return state

        # Guard: normalize missing tavily_search to an empty list and bail out.
        if not state["company_research_data"].get("tavily_search"):
            print("ERROR: tavily_search not found in company_research_data")
            state["company_research_data"]["tavily_search"] = []
            return state

        # Initialize the accumulator that persists across workflow runs.
        if "compiled_results" not in state:
            state["compiled_results"] = []

        print("Filtering results...")
        # Per-query payloads, shaped like:
        # {'query1': {'rationale': ..., 'results': [...]}, 'query2': ...}
        all_query_data = state["company_research_data"].get("tavily_search", {})
        filtered_results_for_current_run = []  # Results deemed relevant in this specific call

        # Limit concurrent judge calls to 2.
        semaphore = asyncio.Semaphore(2)

        async def evaluate_with_semaphore(query_result_item: dict):
            # query_result_item is a dict like {'rationale': '...', 'results': [...]}
            async with semaphore:
                # NOTE(review): this always uses the LAST attempted query as the
                # relevance input, even when judging results of earlier queries
                # — confirm that is intended.
                attempted_queries_list = state.get("attempted_search_queries", [])
                input_query = attempted_queries_list[-1] if attempted_queries_list else "No query context available"

                eval_result = await relevance_evaluator(
                    inputs=input_query, context=query_result_item  # context is the whole result block for the query
                )
                return query_result_item, eval_result

        # One evaluation task per query block that actually carries results.
        tasks = [evaluate_with_semaphore(query_info) for query_info in all_query_data.values() if isinstance(query_info, dict) and "results" in query_info]

        # Collect evaluations as they complete.
        for completed_task in asyncio.as_completed(tasks):
            query_result_item, eval_result = await completed_task
            if eval_result.get("score"):  # Safely check for score
                # Expect "results" to be a list of content strings.
                if isinstance(query_result_item.get("results"), list):
                    filtered_results_for_current_run.extend(query_result_item["results"])
                else:
                    # Handle cases where "results" might not be a list or is missing.
                    logger.warning("Expected a list for 'results' in query_result_item, got: %s", type(query_result_item.get('results')))

        logger.info("Filtered results for current run: %s", filtered_results_for_current_run)

        # Append this run's relevant results to the running accumulator, and
        # overwrite the raw search payload with only the relevant entries.
        state["compiled_results"].extend(filtered_results_for_current_run)
        state["company_research_data"]["tavily_search"] = filtered_results_for_current_run
        return state

    except Exception as e:
        print(f"ERROR in relevance_filter: {e}")
        import traceback
        traceback.print_exc()
        logger.error(f"Error in relevance_filter: {str(e)}")
        # Return original state to avoid breaking the flow
        return state
tools/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Created on Mon Oct 23 16:49:52 2023
|
| 4 |
+
@author: rishabhaggarwal
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from .TavilySearch import search_company, relevance_filter
|
| 8 |
+
|
| 9 |
+
__all__ = ["search_company", "relevance_filter"]
|
utils/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utility modules for the job_writer package.
|
| 3 |
+
"""
|
utils/config.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration utilities for the job writer application.
|
| 3 |
+
|
| 4 |
+
This module provides functions for initializing and configuring
|
| 5 |
+
language models and other resources.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
from typing_extensions import Dict, Any, Tuple, Optional
|
| 10 |
+
from langchain.chat_models import init_chat_model
|
| 11 |
+
|
def init_models(config: Optional[Dict[str, Any]] = None) -> Tuple[Any, Any]:
    """Initialize language models based on configuration.

    Args:
        config: Optional mapping with ``model_name``, ``temperature`` and
            ``precise_temperature`` overrides.

    Returns:
        A ``(llm, llm_precise)`` pair: a general-purpose chat model and a
        lower-temperature variant for precision-sensitive tasks.
    """
    cfg = config or {}

    # Resolve settings: explicit config wins, then environment, then defaults.
    model_name = cfg.get("model_name", os.getenv("OLLAMA_MODEL", "llama3.2:latest"))
    default_temp = float(cfg.get("temperature", "0.3"))
    precise_temp = float(cfg.get("precise_temperature", "0.2"))

    model_spec = f"ollama:{model_name}"
    return (
        init_chat_model(model_spec, temperature=default_temp),
        init_chat_model(model_spec, temperature=precise_temp),
    )
utils/document_processing.py
ADDED
|
@@ -0,0 +1,443 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Document processing utilities for parsing resumes and job descriptions.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
import os
|
| 7 |
+
import re
|
| 8 |
+
import json
|
| 9 |
+
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from urllib.parse import urlparse
|
| 12 |
+
from typing_extensions import Dict, List, Any
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# Langchain imports
|
| 16 |
+
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
|
| 17 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
|
| 18 |
+
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
|
| 19 |
+
from langchain_core.messages import SystemMessage
|
| 20 |
+
from langchain_core.documents import Document
|
| 21 |
+
from langchain_core.output_parsers.json import JsonOutputParser
|
| 22 |
+
from langfuse.decorators import observe, langfuse_context
|
| 23 |
+
from pydantic import BaseModel, Field
|
| 24 |
+
|
| 25 |
+
# Local imports - using relative imports
|
| 26 |
+
from .errors import URLExtractionError, LLMProcessingError, JobDescriptionParsingError
|
| 27 |
+
from .llm_client import LLMClient
|
| 28 |
+
from ..prompts.templates import JOB_DESCRIPTION_PROMPT
|
| 29 |
+
|
# Set up logging
logger = logging.getLogger(__name__)
# NOTE(review): basicConfig at import time configures the root logger for the
# whole application — confirm this is intended for a library module.
logging.basicConfig(level=logging.INFO)


# Default paths
# Fallback résumé file path; empty string when the env var is unset.
DEFAULT_RESUME_PATH: str = os.getenv("DEFAULT_RESUME_PATH", "")


# Most Occurring Resume Section Headers
# Consumed by identify_resume_sections() to build its header-matching regex.
RESUME_SECTIONS: list[str] = [
    "EDUCATION", "EXPERIENCE", "SKILLS", "WORK EXPERIENCE",
    "PROFESSIONAL EXPERIENCE", "PROJECTS", "CERTIFICATIONS",
    "SUMMARY", "OBJECTIVE", "CONTACT", "PUBLICATIONS",
    "AWARDS", "LANGUAGES", "INTERESTS", "REFERENCES"
]

# Initialize LLM client
# Project singleton configured for JSON-mode output from a local Ollama model;
# llm_structured is used by parse_job_desc_from_url below.
LLM: LLMClient = LLMClient()

llm_client: LLMClient = LLM.get_instance(
    model_name="ejschwar/llama3.2-better-prompts:latest",
    model_provider="ollama_json")
llm_structured = llm_client.get_llm()
| 54 |
+
|
| 55 |
+
|
class ResumeSection(BaseModel):
    """Model for a single structured resume section (title plus body text)."""
    # Section heading, e.g. 'Experience' or 'Education'.
    title: str = Field(description="The section title (e.g., 'Experience', 'Education')")
    # Full text belonging to that heading.
    content: str = Field(description="The full content of this section")
| 60 |
+
|
| 61 |
+
|
class StructuredResume(BaseModel):
    """Model for a structured resume: its sections plus contact details."""
    # Ordered list of parsed resume sections.
    sections: List[ResumeSection] = Field(description="List of resume sections")
    # Flat key/value contact fields (e.g. email, phone) pulled from the resume.
    contact_info: Dict[str, str] = Field(description="Contact information extracted from the resume")
| 66 |
+
|
class JobDescriptionComponents(BaseModel):
    """Model for job description components.

    Serves as the JSON schema for JsonOutputParser in
    parse_job_desc_from_url, so the LLM output is validated against it.
    """
    company_name: str = Field(description="The company name")
    job_description: str = Field(description="The job description")
    reasoning: str = Field(description="The reasoning for the extracted information")
| 72 |
+
|
@observe()
def clean_resume_text(text: str) -> str:
    """Clean and normalize resume text by removing extra whitespace and fixing
    common PDF extraction issues.

    Args:
        text: Raw text extracted from a resume.

    Returns:
        Cleaned, single-spaced text with markdown-style bullets.
    """
    # Remove header/footer page numbers FIRST: this pattern needs the newlines
    # that the whitespace collapse below destroys (previously it ran after the
    # collapse and could never match).
    text = re.sub(r'\n\s*\d+\s*\n', '\n', text)

    # Collapse all whitespace (including newlines) to single spaces.
    text = re.sub(r'\s+', ' ', text)

    # Fix words hyphenated across line breaks; must run after the collapse,
    # which turns "word-\ncont" into "word- cont".
    text = re.sub(r'([a-z])- ([a-z])', r'\1\2', text)

    # Replace bullet variations with standard markdown bullets.
    text = re.sub(r'[•●○◘◙♦♣♠★]', '* ', text)

    return text.strip()
| 96 |
+
|
@observe()
def extract_contact_info(text: str) -> Dict[str, str]:
    """Extract contact information (email, phone, LinkedIn, name) from resume text.

    Args:
        text: Resume text to extract from.

    Returns:
        Dictionary with whichever contact fields could be found.
    """
    info: Dict[str, str] = {}

    # Email address.
    match = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    if match:
        info['email'] = match.group(0)

    # Phone number (accepts several common formats).
    match = re.search(r'(\+\d{1,3}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}', text)
    if match:
        info['phone'] = match.group(0)

    # LinkedIn profile URL (normalized to an https://www. prefix).
    match = re.search(r'linkedin\.com/in/[a-zA-Z0-9_-]+/?', text)
    if match:
        info['linkedin'] = 'https://www.' + match.group(0)

    # Name heuristic: the first line, when short and digit-free, is usually
    # the candidate's name (approximate; an LLM would be more accurate).
    first_line = text.strip().split('\n')[0].strip()
    if len(first_line) < 40 and not any(ch.isdigit() for ch in first_line):
        info['name'] = first_line

    return info
| 131 |
+
|
@observe()
def identify_resume_sections(text: str) -> List[Dict[str, Any]]:
    """Identify sections in a resume text using known header keywords.

    Args:
        text: Full resume text.

    Returns:
        List of dicts, each with "title" (lower-cased header) and "content"
        (the section text, header line included). If no known headers are
        found, the whole text is returned as a single "resume" section.
    """
    sections = []

    # Regex-based section identification: match any known header that sits on
    # its own line, surrounded only by whitespace/punctuation.
    section_pattern = r'(?:^|\n)(?:[^a-zA-Z\d\s]|\s)*(' + '|'.join(RESUME_SECTIONS) + r')(?:[^a-zA-Z\d\s]|\s)*(?:$|\n)'
    matches = list(re.finditer(section_pattern, text, re.IGNORECASE))

    if not matches:
        # No recognizable headers: treat the whole resume as one section.
        sections.append({
            "title": "resume",
            "content": text,
        })
        return sections

    # Slice the text between consecutive header matches.
    for i, match in enumerate(matches):
        section_title = match.group(1).strip()
        start_pos = match.start()

        # End position: start of the next section header, or end of text.
        end_pos = matches[i + 1].start() if i < len(matches) - 1 else len(text)

        # The slice starts at the header match itself, so the header line is
        # kept as part of the section content.
        section_content = text[start_pos:end_pos].strip()

        sections.append({
            "title": section_title.lower(),
            "content": section_content
        })

    return sections
| 196 |
+
|
| 197 |
+
|
| 198 |
+
def _collapse_ws(text: str) -> str:
|
| 199 |
+
"""Collapse stray whitespace but keep bullet breaks."""
|
| 200 |
+
text = re.sub(r"\n\s*([•\-–])\s*", r"\n\1 ", text)
|
| 201 |
+
return re.sub(r"[ \t\r\f\v]+", " ", text).replace(" \n", "\n").strip()
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def _is_heading(line: str) -> bool:
|
| 205 |
+
return (
|
| 206 |
+
line.isupper()
|
| 207 |
+
and len(line.split()) <= 5
|
| 208 |
+
and not re.search(r"\d", line)
|
| 209 |
+
)
|
| 210 |
+
|
def parse_resume(file_path: str | Path) -> List[Document]:
    """
    Load a résumé from PDF or TXT file → list[Document] chunks
    (≈400 chars, 50‑char overlap) with {source, section} metadata.

    Raises:
        ValueError: for unsupported extensions, unreadable or empty files.
    """
    file_extension = Path(file_path).suffix.lower()

    # Handle different file types
    if file_extension == '.pdf':
        # NOTE(review): only the first loaded page ([0]) is read — confirm
        # multi-page resumes are intentionally truncated here.
        text = PyPDFLoader(str(file_path), extraction_mode="layout").load()[0].page_content
    elif file_extension == '.txt':
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
                # NOTE: this ValueError is caught by the except below and
                # re-wrapped into the "Could not read text file" message.
                if not text.strip():
                    raise ValueError("File is empty")
        except Exception as e:
            logger.error(f"Error reading text file: {str(e)}")
            raise ValueError(f"Could not read text file: {file_path}. Error: {str(e)}")
    else:
        raise ValueError(f"Unsupported resume file type: {file_path}. Supported types: .pdf, .txt")

    text = _collapse_ws(text)

    # Tag headings with "###" so Markdown splitter can see them
    tagged_lines = [
        f"### {ln}" if _is_heading(ln) else ln
        for ln in text.splitlines()]

    md_text = "\n".join(tagged_lines)

    if "###" in md_text:
        # Split on the synthetic headings; each chunk carries a "section" key.
        splitter = MarkdownHeaderTextSplitter(
            headers_to_split_on=[("###", "section")]
        )
        chunks = splitter.split_text(md_text)  # already returns Documents
    else:
        # No headings detected: fall back to fixed-size character chunks.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=400, chunk_overlap=50
        )
        chunks: list[Document] = [Document(page_content=chunk, metadata={}) for chunk in splitter.split_text(md_text)]  # Attach metadata
    for doc in chunks:
        doc.metadata.setdefault("source", str(file_path))
        # section already present if header‑splitter was used

    return chunks
| 257 |
+
|
| 258 |
+
|
| 259 |
+
def get_job_description(file_path_or_url: str) -> Document:
|
| 260 |
+
"""Parse a job description from a file or URL into chunks.
|
| 261 |
+
|
| 262 |
+
Args:
|
| 263 |
+
file_path_or_url: Local file path or URL of job posting
|
| 264 |
+
|
| 265 |
+
Returns:
|
| 266 |
+
|
| 267 |
+
Document containing the job description
|
| 268 |
+
"""
|
| 269 |
+
# Check if the input is a URL
|
| 270 |
+
if file_path_or_url.startswith(('http://', 'https://')):
|
| 271 |
+
return parse_job_desc_from_url(file_path_or_url)
|
| 272 |
+
|
| 273 |
+
# Handle local files based on extension
|
| 274 |
+
file_extension = Path(file_path_or_url).suffix.lower()
|
| 275 |
+
|
| 276 |
+
# Handle txt files
|
| 277 |
+
if file_extension == '.txt':
|
| 278 |
+
try:
|
| 279 |
+
with open(file_path_or_url, 'r', encoding='utf-8') as f:
|
| 280 |
+
content = f.read()
|
| 281 |
+
if not content.strip():
|
| 282 |
+
raise ValueError(f"File is empty: {file_path_or_url}")
|
| 283 |
+
return Document(page_content=content, metadata={"source": file_path_or_url})
|
| 284 |
+
except Exception as e:
|
| 285 |
+
logger.error(f"Error reading text file: {str(e)}")
|
| 286 |
+
raise ValueError(f"Could not read text file: {file_path_or_url}. Error: {str(e)}")
|
| 287 |
+
|
| 288 |
+
# For other file types
|
| 289 |
+
raise ValueError(f"Unsupported file type: {file_path_or_url}. Supported types: .pdf, .docx, .txt, .md")
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
def parse_job_desc_from_url(url: str) -> Document:
|
| 293 |
+
"""Extract job description from a URL.
|
| 294 |
+
|
| 295 |
+
Args:
|
| 296 |
+
url: URL of the job posting
|
| 297 |
+
|
| 298 |
+
Returns:
|
| 299 |
+
List[str]: [job_description_markdown, company_name]
|
| 300 |
+
|
| 301 |
+
Raises:
|
| 302 |
+
ValueError: If URL format is invalid
|
| 303 |
+
URLExtractionError: If content extraction fails
|
| 304 |
+
LLMProcessingError: If LLM processing fails
|
| 305 |
+
"""
|
| 306 |
+
|
| 307 |
+
logger.info("Starting job description extraction from URL: %s", url)
|
| 308 |
+
# langfuse_handler = langfuse_context.get_current_langchain_handler()
|
| 309 |
+
extracted_text = None
|
| 310 |
+
|
| 311 |
+
try:
|
| 312 |
+
# Validate URL format
|
| 313 |
+
parsed_url = urlparse(url)
|
| 314 |
+
if not all([parsed_url.scheme, parsed_url.netloc]):
|
| 315 |
+
logger.error("Invalid URL format: %s", url)
|
| 316 |
+
raise ValueError("URL must start with http:// or https://")
|
| 317 |
+
|
| 318 |
+
# Extract content from URL
|
| 319 |
+
try:
|
| 320 |
+
loader = WebBaseLoader(url)
|
| 321 |
+
text_splitter = RecursiveCharacterTextSplitter(
|
| 322 |
+
chunk_size=1000,
|
| 323 |
+
chunk_overlap=200,
|
| 324 |
+
separators=["\n\n", "\n", ". ", " ", ""]
|
| 325 |
+
)
|
| 326 |
+
document_splitted = loader.load_and_split(text_splitter=text_splitter)
|
| 327 |
+
|
| 328 |
+
if not document_splitted:
|
| 329 |
+
logger.error("No content could be extracted from URL: %s", url)
|
| 330 |
+
raise URLExtractionError("No content could be extracted from URL")
|
| 331 |
+
|
| 332 |
+
extracted_text = " ".join(doc.page_content for doc in document_splitted)
|
| 333 |
+
logger.info("Successfully extracted %d characters from URL", len(extracted_text))
|
| 334 |
+
|
| 335 |
+
except Exception as e:
|
| 336 |
+
raise URLExtractionError(f"Failed to extract content from URL: {str(e)}") from e
|
| 337 |
+
|
| 338 |
+
# Process with LLM
|
| 339 |
+
if not llm_structured:
|
| 340 |
+
logger.warning("LLM not available, returning raw extracted text")
|
| 341 |
+
return [extracted_text, "Unknown Company"]
|
| 342 |
+
|
| 343 |
+
try:
|
| 344 |
+
output_parser: JsonOutputParser = JsonOutputParser(pydantic_object=JobDescriptionComponents)
|
| 345 |
+
|
| 346 |
+
human_prompt = "Below is the job description enclosed in triple quotes:\n\n '''{extracted_text}'''\n\n"
|
| 347 |
+
|
| 348 |
+
job_description_parser_system_message = SystemMessagePromptTemplate.from_template(
|
| 349 |
+
template=JOB_DESCRIPTION_PROMPT,
|
| 350 |
+
input_variables=[])
|
| 351 |
+
job_description_parser_human_message = HumanMessagePromptTemplate.from_template(
|
| 352 |
+
template=human_prompt,
|
| 353 |
+
input_variables=["extracted_text"])
|
| 354 |
+
chat_prompt = ChatPromptTemplate.from_messages([job_description_parser_system_message, job_description_parser_human_message])
|
| 355 |
+
|
| 356 |
+
# print("Chat prompt created successfully")
|
| 357 |
+
chain = chat_prompt | llm_structured | output_parser
|
| 358 |
+
|
| 359 |
+
try:
|
| 360 |
+
# Process with LLM
|
| 361 |
+
|
| 362 |
+
try:
|
| 363 |
+
result = chain.invoke({"extracted_text": extracted_text})
|
| 364 |
+
except Exception as e:
|
| 365 |
+
logger.error("LLM invocation failed: %s", str(e))
|
| 366 |
+
raise LLMProcessingError(f"LLM invocation failed: {str(e)}") from e
|
| 367 |
+
print("LLM processing result: ", result)
|
| 368 |
+
# Handle different types of LLM results
|
| 369 |
+
if isinstance(result, JobDescriptionComponents):
|
| 370 |
+
# Direct Pydantic model
|
| 371 |
+
result = result.model_dump()
|
| 372 |
+
if isinstance(result, dict):
|
| 373 |
+
print("LLM returned a dictionary, converting to JobDescriptionComponents model", result)
|
| 374 |
+
else:
|
| 375 |
+
# Unexpected result type
|
| 376 |
+
print(f"Unexpected LLM result type: {type(result)}")
|
| 377 |
+
logger.error("Unexpected LLM result type: %s", type(result))
|
| 378 |
+
raise LLMProcessingError("Invalid LLM response format")
|
| 379 |
+
|
| 380 |
+
# Validate required fields
|
| 381 |
+
if not result.get("job_description") or not result.get("company_name"):
|
| 382 |
+
logger.warning("LLM returned empty required fields")
|
| 383 |
+
raise LLMProcessingError("Missing required fields in LLM response")
|
| 384 |
+
|
| 385 |
+
logger.info("Successfully processed job description with LLM")
|
| 386 |
+
# Create a Document object for the job description
|
| 387 |
+
job_doc = Document(
|
| 388 |
+
page_content=result["job_description"],
|
| 389 |
+
metadata={"company_name": result["company_name"]}
|
| 390 |
+
)
|
| 391 |
+
|
| 392 |
+
# print("Job description Document created successfully. Company name: ", result["company_name"])
|
| 393 |
+
# print("Job description content: ", job_doc.metadata) # Print first 100 chars for debugging
|
| 394 |
+
return job_doc
|
| 395 |
+
|
| 396 |
+
except Exception as e:
|
| 397 |
+
# Handle LLM processing errors first
|
| 398 |
+
if isinstance(e, LLMProcessingError):
|
| 399 |
+
raise
|
| 400 |
+
|
| 401 |
+
# Try to recover from JSON parsing errors
|
| 402 |
+
error_msg = str(e)
|
| 403 |
+
if "Invalid json output" in error_msg:
|
| 404 |
+
logger.warning("Attempting to recover from invalid JSON output")
|
| 405 |
+
|
| 406 |
+
# Extract JSON from error message
|
| 407 |
+
output = error_msg.split("Invalid json output:", 1)[1].strip()
|
| 408 |
+
start = output.find('{')
|
| 409 |
+
end = output.rfind('}') + 1
|
| 410 |
+
|
| 411 |
+
if start >= 0 and end > start:
|
| 412 |
+
try:
|
| 413 |
+
clean_json = output[start:end]
|
| 414 |
+
result = output_parser.parse(clean_json)
|
| 415 |
+
if hasattr(result, "job_description") and hasattr(result, "company_name"):
|
| 416 |
+
return [result.job_description, result.company_name]
|
| 417 |
+
except json.JSONDecodeError as json_e:
|
| 418 |
+
logger.error("Failed to recover from JSON error: %s", json_e)
|
| 419 |
+
|
| 420 |
+
raise LLMProcessingError(f"Failed to process job description with LLM: {str(e)}") from e
|
| 421 |
+
|
| 422 |
+
except Exception as e:
|
| 423 |
+
if isinstance(e, LLMProcessingError):
|
| 424 |
+
if extracted_text:
|
| 425 |
+
logger.warning("LLM processing failed, falling back to raw text")
|
| 426 |
+
raise e
|
| 427 |
+
return [extracted_text, "Unknown Company"]
|
| 428 |
+
raise LLMProcessingError(f"Failed to process job description with LLM: {str(e)}") from e
|
| 429 |
+
|
| 430 |
+
except ValueError as e:
|
| 431 |
+
logger.error("URL validation error: %s", str(e))
|
| 432 |
+
raise
|
| 433 |
+
except URLExtractionError as e:
|
| 434 |
+
logger.error("Content extraction error: %s", str(e))
|
| 435 |
+
raise
|
| 436 |
+
except LLMProcessingError as e:
|
| 437 |
+
if extracted_text:
|
| 438 |
+
logger.warning("Using extracted text as fallback")
|
| 439 |
+
return [extracted_text, "Unknown Company"]
|
| 440 |
+
raise
|
| 441 |
+
except Exception as e:
|
| 442 |
+
logger.error("Unexpected error during job description parsing: %s", str(e))
|
| 443 |
+
raise JobDescriptionParsingError(f"Failed to parse job description: {str(e)}") from e
|
utils/errors.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class ModelNotFoundError(Exception):
    """Exception raised when a requested model is not found or fails to initialize.

    Attributes:
        model_name: The model identifier (or detail message) supplied by the caller.
    """

    def __init__(self, model_name: str):
        super().__init__(f"Model '{model_name}' not found.")
        self.model_name = model_name

    def __str__(self):
        # Keep str(e) consistent with the message passed to Exception.__init__.
        # The original returned "ModelNotFoundError: <name>", which dropped the
        # "not found" wording and diverged from e.args[0].
        return f"Model '{self.model_name}' not found."
|
| 9 |
+
|
| 10 |
+
class URLExtractionError(Exception):
    """Signals that no usable content could be pulled out of a URL."""
|
| 13 |
+
|
| 14 |
+
class LLMProcessingError(Exception):
    """Signals a failure while processing text with the language model."""
|
| 17 |
+
|
| 18 |
+
class JobDescriptionParsingError(Exception):
    """Umbrella (base) error type for job-description parsing failures."""
|
utils/langfuse_handler.py
ADDED
|
File without changes
|
utils/llm_client.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LLM Client module for managing language model interactions.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
from typing_extensions import Optional, Union
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
from langchain_core.language_models.chat_models import BaseChatModel
|
| 10 |
+
from langchain_core.language_models.llms import BaseLLM
|
| 11 |
+
from langchain_ollama import ChatOllama
|
| 12 |
+
from langchain_openai import ChatOpenAI
|
| 13 |
+
|
| 14 |
+
from .errors import ModelNotFoundError
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class LLMClient:
    """
    Client for managing language model interactions.

    Provides a unified interface over multiple LLM backends (Ollama,
    Ollama in JSON mode, and OpenAI), selected via ``model_provider``.
    """

    _instance = None  # Singleton instance shared across the application

    @classmethod
    def get_instance(cls, model_name: Optional[str] = None, model_provider: Optional[str] = None):
        """Get or create a singleton instance of the LLM client.

        Args:
            model_name: Optional model name to override the default
            model_provider: Optional provider to override the default

        Returns:
            LLMClient instance
        """
        if cls._instance is None:
            cls._instance = LLMClient(model_name, model_provider)
        elif model_name is not None and cls._instance.model_name != model_name:
            # Reinitialize if a different model is requested. Pass the
            # provider through as well; the original dropped it here, which
            # silently reverted the singleton to the default provider.
            cls._instance = LLMClient(model_name, model_provider)

        return cls._instance

    def __init__(self, model_name: Optional[str] = None, model_provider: Optional[str] = None):
        """Initialize the LLM client with the specified model.

        Args:
            model_name: Name of the model to use
                (default: env DEFAULT_LLM_MODEL or "llama3.2:latest")
            model_provider: Backend provider, one of "ollama", "ollama_json",
                "openai" (default: env LLM_PROVIDER or "ollama")
        """
        print("Initializing LLM Client with model:", model_name, "and provider:", model_provider)
        self.model_name = model_name or os.getenv("DEFAULT_LLM_MODEL", "llama3.2:latest")
        self.model_provider = model_provider or os.getenv("LLM_PROVIDER", "ollama").lower()
        self.llm = self._initialize_llm()

    def __str__(self):
        return f"LLMClient(model_name={self.model_name}, provider={self.model_provider})"

    def _initialize_llm(self) -> Union[BaseLLM, BaseChatModel]:
        """Initialize the appropriate LLM based on configuration.

        Returns:
            Initialized LLM instance

        Raises:
            ValueError: If the configured provider is not supported.
        """
        print(f"Initializing LLM with model {self.model_name} and provider {self.model_provider} in {__file__}")
        if self.model_provider == "ollama":
            return self._initialize_llama()
        elif self.model_provider == "openai":
            return self._initialize_openai()
        elif self.model_provider == "ollama_json":
            return self._initialize_jsonllm()
        else:
            raise ValueError(f"Unsupported LLM provider: {self.model_provider}")

    def _initialize_llama(self) -> BaseChatModel:
        """Initialize an Ollama chat model.

        Returns:
            ChatOllama instance

        Raises:
            ModelNotFoundError: If the model cannot be initialized.
        """
        try:
            # Low temperature / top_k=1 keep generations near-deterministic.
            model: ChatOllama = ChatOllama(model=self.model_name, temperature=0.1, top_k=1, repeat_penalty=1.2)
            return model
        except Exception as e:
            raise ModelNotFoundError(f"Failed to initialize Ollama with model {self.model_name}: {e}") from e

    def _initialize_jsonllm(self) -> BaseChatModel:
        """Initialize an Ollama chat model constrained to JSON output.

        (The original docstring said "Mistral"; this path constructs ChatOllama
        with format='json'.)

        Returns:
            ChatOllama instance with JSON-formatted output

        Raises:
            ModelNotFoundError: If the model cannot be initialized.
        """
        try:
            model: ChatOllama = ChatOllama(model=self.model_name, format='json', temperature=0.1, top_k=1, repeat_penalty=1.2)
            return model
        except Exception as e:
            raise ModelNotFoundError(f"Failed to initialize Ollama with model {self.model_name}: {e}") from e

    def _initialize_openai(self) -> BaseChatModel:
        """Initialize an OpenAI chat model.

        Returns:
            OpenAI chat model instance

        Raises:
            ValueError: If OPENAI_API_KEY is not set.
            ModelNotFoundError: If the model cannot be initialized.
        """
        # SECURITY: the original hard-coded an API token here. Credentials
        # must come from the environment, never from source control.
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("OPENAI_API_KEY environment variable not set")

        try:
            return ChatOpenAI(model_name=self.model_name, api_key=api_key)
        except Exception as e:
            # The original message said "Ollama" here; this is the OpenAI path.
            raise ModelNotFoundError(f"Failed to initialize OpenAI with model {self.model_name}: {e}") from e

    def get_llm(self) -> Union[BaseLLM, BaseChatModel]:
        """Get the initialized LLM instance.

        Returns:
            LLM instance

        Raises:
            RuntimeError: If the client was never initialized.
        """
        if self.llm is None:
            raise RuntimeError("LLM client not initialized")
        return self.llm

    def reinitialize(self, model_name: Optional[str] = None, provider: Optional[str] = None) -> None:
        """Reinitialize the LLM with a different model or provider.

        Args:
            model_name: New model name to use
            provider: New provider to use
        """
        print(f"Reinitializing LLM client from {self.model_name} to {model_name}")
        if model_name:
            self.model_name = model_name
        if provider:
            self.model_provider = provider.lower()

        self.llm = self._initialize_llm()
utils/vector_store.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Vector storage utilities for the job writer application.
|
| 3 |
+
|
| 4 |
+
This module provides functions for storing and retrieving
|
| 5 |
+
documents from vector databases.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
# Standard library imports
|
| 9 |
+
import os
|
| 10 |
+
from typing_extensions import List, Optional
|
| 11 |
+
|
| 12 |
+
# Third-party library imports
|
| 13 |
+
from langchain_core.documents import Document
|
| 14 |
+
from langchain_community.vectorstores import Pinecone
|
| 15 |
+
from langchain_ollama import OllamaEmbeddings
|
| 16 |
+
from pinecone import Pinecone as PineconeClient, ServerlessSpec
|
| 17 |
+
|
| 18 |
+
# Default configuration
|
| 19 |
+
DEFAULT_PINECONE_INDEX = "job-writer-vector"
|
| 20 |
+
|
| 21 |
+
class VectorStoreManager:
    """Manager class for vector store operations.

    Wraps a Pinecone index plus Ollama embeddings and exposes simple
    store/retrieve helpers over namespaced document collections.
    """

    def __init__(
        self,
        index_name: str = DEFAULT_PINECONE_INDEX,
        embedding_model: str = "llama3.2:latest"
    ):
        """Initialize the vector store manager.

        The Pinecone API key is read from the PINECONE_API_KEY environment
        variable (note: there is no api_key parameter).

        Args:
            index_name: Name of the Pinecone index to use
            embedding_model: Name of the Ollama model to use for embeddings

        Raises:
            ValueError: If PINECONE_API_KEY is not set.
        """
        api_key= os.getenv("PINECONE_API_KEY")
        if not api_key:
            raise ValueError("Environment variable PINECONE_API_KEY not set.")

        self.index_name = index_name

        # Initialize embeddings (served by a local Ollama instance)
        self.embeddings = OllamaEmbeddings(
            model=embedding_model
        )

        # Initialize Pinecone client
        self.client = PineconeClient(api_key=api_key)

        # Ensure index exists (creates it on first use)
        self._ensure_index_exists()

    def _ensure_index_exists(self):
        """Make sure the required index exists, create if not."""
        # Probe the embedding model once to discover the vector dimension;
        # the Pinecone index must be created with a matching dimension.
        try:
            sample_embedding = self.embeddings.embed_query("Test query")
            embedding_dim = len(sample_embedding)
        except Exception as e:
            print(f"Error determining embedding dimension: {e}")
            print("Falling back to default dimension of 384")
            embedding_dim = 384  # Common default for Ollama embeddings

        # Check if the index exists; treat lookup failures as "not found"
        index_exists = False
        try:
            index_list = self.client.list_indexes()
            index_list = [i.name for i in index_list]
            index_exists = self.index_name in index_list
        except Exception as e:
            print(f"Error checking Pinecone indexes: {e}")

        # Create index if it doesn't exist
        if not index_exists:
            try:
                print(f"Creating new index: {self.index_name}")
                self.client.create_index(
                    name=self.index_name,
                    dimension=embedding_dim,
                    spec=ServerlessSpec(region="us-east-1", cloud="aws"),
                    metric="cosine"
                )
                print(f"Successfully created index: {self.index_name}")
            except Exception as e:
                # Another process may have created it between list and create
                if "ALREADY_EXISTS" in str(e):
                    print(f"Index {self.index_name} already exists (created in another process)")
                else:
                    print(f"Error creating index: {e}")
        else:
            print(f"Using Pinecone Index: {self.index_name}")

    def store_documents(self, docs: List[Document], namespace: str) -> None:
        """Store documents in vector database.

        Args:
            docs: List of Document objects to store
            namespace: Namespace to store documents under

        Raises:
            Exception: Re-raises any error from the Pinecone client.
        """
        try:
            # Get the index
            index = self.client.Index(self.index_name)

            # Create the vector store wrapper over the index
            vector_store = Pinecone(
                index=index,
                embedding=self.embeddings,
                text_key="text",
                namespace=namespace
            )

            # Add documents (embedded via self.embeddings)
            vector_store.add_documents(docs)
            print(f"Successfully stored {len(docs)} documents in namespace: {namespace}")
        except Exception as e:
            print(f"Error storing documents: {e}")
            raise

    def retrieve_similar(self, query: str, namespace: str, k: int = 3):
        """Retrieve similar documents based on a query.

        Args:
            query: The query text to search for
            namespace: Namespace to search in
            k: Number of results to return

        Returns:
            List of Document objects (an empty list on any retrieval error)
        """
        try:
            # Get the index
            index = self.client.Index(self.index_name)

            # Create the vector store wrapper over the index
            vectorstore = Pinecone(
                index=index,
                embedding=self.embeddings,
                text_key="text",
                namespace=namespace
            )

            # Search for similar documents
            docs = vectorstore.similarity_search(query, k=k, namespace=namespace)
            return docs
        except Exception as e:
            print(f"Error retrieving documents: {e}")
            return []
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
# NOTE(review): the original ran this at import time AND rebound the class
# name `VectorStoreManager` to an instance, shadowing the class for every
# importer. Keep the smoke test, but only under direct execution and under
# a non-shadowing name.
if __name__ == "__main__":
    _manager = VectorStoreManager()

    _manager.store_documents(
        docs=[Document(page_content="Sample content", metadata={"source": "test"})],
        namespace="test_namespace"
    )
|
workflow.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Workflow runner for the job application writer.
|
| 3 |
+
|
| 4 |
+
This module provides functions for running the job application
|
| 5 |
+
writer graph in both interactive and batch modes.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import asyncio
|
| 9 |
+
import argparse
|
| 10 |
+
import sys
|
| 11 |
+
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
from langchain_core.tracers import ConsoleCallbackHandler
|
| 14 |
+
from langgraph.graph import StateGraph
|
| 15 |
+
from langfuse import Langfuse
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
from job_writer.nodes import Dataloading
|
| 19 |
+
from job_writer.nodes.research_workflow import research_workflow
|
| 20 |
+
from job_writer.classes import AppState, DataLoadState
|
| 21 |
+
from job_writer.agents.nodes import (
|
| 22 |
+
create_draft,
|
| 23 |
+
critique_draft,
|
| 24 |
+
finalize_document,
|
| 25 |
+
human_approval,
|
| 26 |
+
)
|
| 27 |
+
from job_writer.nodes import (
|
| 28 |
+
generate_variations,
|
| 29 |
+
self_consistency_vote
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class JobWorkflow:
    """
    Workflow runner for the job application writer.

    Builds and runs a LangGraph state machine that loads inputs, researches
    the company, drafts, critiques and finalizes an application document.

    Args:
        resume: Resume text or file path
        job_description_source: Job description text or URL
        content: Type of application material to generate
        model_configuration: Configuration for language models
    """

    def __init__(self, resume=None, job_description_source=None, content=None, model_configuration=None):
        """Initialize the Writing Workflow."""
        print(f"Initializing Workflow for {content}")
        self.resume = resume
        self.job_description_source = job_description_source
        self.content = content
        self.model_configuration = model_configuration

        # Initialize the app state
        self.app_state = AppState(
            resume_path=resume,
            job_description_source=job_description_source,
            company_research_data=None,
            draft="",
            feedback="",
            final="",
            content=content,
            current_node=""
        )

        self._init_nodes()
        self._build_workflow()

        self.langfuse = Langfuse()

    def _init_nodes(self):
        """Create the node objects used by the graph.

        (Renamed from ``__init__nodes``: the double-underscore prefix
        triggered name mangling and read like a typo for ``__init__``.)
        """
        self.dataloading = Dataloading()
        # self.createdraft = create_draft()

    def _build_workflow(self):
        """Assemble the StateGraph: nodes, entry/finish points, and edges."""
        self.job_app_graph = StateGraph(DataLoadState)

        self.job_app_graph.add_node("initialize_system", self.dataloading.system_setup)
        self.job_app_graph.add_node("load", self.dataloading.run)
        # self.job_app_graph.add_node("build_persona", select_persona)

        # Add research workflow as a node
        self.job_app_graph.add_node("research", research_workflow)
        self.job_app_graph.add_node("create_draft", create_draft)
        self.job_app_graph.add_node("variations", generate_variations)
        self.job_app_graph.add_node("self_consistency", self_consistency_vote)
        self.job_app_graph.add_node("critique", critique_draft)
        self.job_app_graph.add_node("human_approval", human_approval)
        self.job_app_graph.add_node("finalize", finalize_document)

        self.job_app_graph.set_entry_point("initialize_system")
        self.job_app_graph.set_finish_point("finalize")

        self.job_app_graph.add_edge("initialize_system", "load")
        # "load" routes dynamically (e.g. to "research") via verify_inputs
        self.job_app_graph.add_conditional_edges("load", self.dataloading.verify_inputs)
        self.job_app_graph.add_edge("research", "create_draft")
        self.job_app_graph.add_edge("create_draft", "variations")
        self.job_app_graph.add_edge("variations", "self_consistency")
        self.job_app_graph.add_edge("self_consistency", "critique")
        self.job_app_graph.add_edge("critique", "human_approval")
        self.job_app_graph.add_edge("human_approval", "finalize")

    async def run(self) -> dict | None:
        """
        Run the job application writer workflow.

        Returns:
            The final graph state (a mapping with keys such as "final"),
            or None if compilation or execution failed. (The original
            annotation said ``str | None`` but the method returns the
            graph output mapping.)
        """
        # Compile the graph
        try:
            compiled_graph = self.compile()
        except Exception as e:
            print(f"Error compiling graph: {e}")
            return
        # Set up run configuration
        run_name = f"Job Application Writer - {self.app_state['content']} - {datetime.now().strftime('%Y-%m-%d-%H%M%S')}"
        config = {
            "configurable": {
                "thread_id": f"job_app_session_{datetime.now().strftime('%Y%m%d%H%M%S')}",
                "callbacks": [ConsoleCallbackHandler()],
                "run_name": run_name,
                "tags": ["job-application", self.app_state['content']]
            },
            "recursion_limit": 10
        }
        # Run the graph
        try:
            self.app_state["current_node"] = "initialize_system"
            graph_output = await compiled_graph.ainvoke(self.app_state, config=config)
        except Exception as e:
            print(f"Error running graph: {e}")
            return

        return graph_output

    def compile(self):
        """Compile the graph into an executable form."""
        graph = self.job_app_graph.compile()
        return graph

    def print_result(self, content_type, final_content):
        """Print the final generated content to the console."""
        print("\n" + "="*80)
        print(f"FINAL {content_type.upper()}:")
        print(final_content)
        print("="*80)

    def save_result(self, content_type, final_content):
        """Save the final generated content to a timestamped file and return the filename."""
        output_file = f"{content_type}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(final_content)
        print(f"\nSaved to {output_file}")
        return output_file
|
| 160 |
+
|
| 161 |
+
if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="Generate job application materials")
    parser.add_argument("--resume", required=True, help="Path to resume file or resume text")
    parser.add_argument("--job", required=True, help="Path/URL to job description or description text")
    parser.add_argument("--type", default="cover_letter",
                        choices=["cover_letter", "bullets", "linkedin_note"],
                        help="Type of application material to generate")
    parser.add_argument("--model", help="Ollama model to use")
    parser.add_argument("--temp", type=float, help="Temperature for generation")

    args = parser.parse_args()

    # Configure models if specified
    model_config = {}
    if args.model:
        model_config["model_name"] = args.model
    if args.temp is not None:
        # Cap user-supplied temperatures to keep generation conservative
        model_config["temperature"] = min(0.25, args.temp)
        model_config["precise_temperature"] = min(0.2, args.temp)

    # Initialize the workflow
    workflow = JobWorkflow(
        resume=args.resume,
        job_description_source=args.job,
        content=args.type,
        model_configuration=model_config
    )

    # Run the workflow
    result = asyncio.run(workflow.run())

    # The original checked `result` twice with a duplicated error branch;
    # a single guard covers both the print and the save step.
    if not result:
        print("Error running workflow.")
        sys.exit(1)

    # Print the result to the console, then save it to a file
    workflow.print_result(args.type, result["final"])
    workflow.save_result(args.type, result["final"])

    # Print a success message
    print("Workflow completed successfully.")
|