Commit ·
a8b79ed
0
Parent(s):
First Commit
Browse files- .gitignore +117 -0
- README.md +81 -0
- __init__.py +69 -0
- agents/__init__.py +3 -0
- agents/nodes.py +177 -0
- agents/output_schema.py +19 -0
- classes/__init__.py +3 -0
- classes/classes.py +63 -0
- langgraph.json +10 -0
- langgraph_init.py +4 -0
- nodes/__init__.py +13 -0
- nodes/createdraft.py +0 -0
- nodes/initializing.py +225 -0
- nodes/research_workflow.py +81 -0
- nodes/selfconsistency.py +85 -0
- nodes/test_workflow.py +23 -0
- nodes/variations.py +73 -0
- prompts.md +27 -0
- prompts/__init__.py +3 -0
- prompts/templates.py +239 -0
- setup.py +0 -0
- testing.ipynb +1069 -0
- tools/TavilySearch.py +230 -0
- tools/__init__.py +9 -0
- utils/__init__.py +3 -0
- utils/config.py +25 -0
- utils/document_processing.py +443 -0
- utils/errors.py +20 -0
- utils/langfuse_handler.py +0 -0
- utils/llm_client.py +141 -0
- utils/vector_store.py +156 -0
- workflow.py +210 -0
.gitignore
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
build/
|
| 8 |
+
develop-eggs/
|
| 9 |
+
dist/
|
| 10 |
+
downloads/
|
| 11 |
+
eggs/
|
| 12 |
+
.eggs/
|
| 13 |
+
lib/
|
| 14 |
+
lib64/
|
| 15 |
+
parts/
|
| 16 |
+
sdist/
|
| 17 |
+
var/
|
| 18 |
+
wheels/
|
| 19 |
+
*.egg-info/
|
| 20 |
+
.installed.cfg
|
| 21 |
+
*.egg
|
| 22 |
+
MANIFEST
|
| 23 |
+
|
| 24 |
+
# Python Virtual Environments
|
| 25 |
+
env/
|
| 26 |
+
venv/
|
| 27 |
+
ENV/
|
| 28 |
+
env.bak/
|
| 29 |
+
venv.bak/
|
| 30 |
+
.env
|
| 31 |
+
.venv
|
| 32 |
+
|
| 33 |
+
# Jupyter Notebook
|
| 34 |
+
.ipynb_checkpoints
|
| 35 |
+
*/.ipynb_checkpoints/*
|
| 36 |
+
|
| 37 |
+
# IDE specific files
|
| 38 |
+
.idea/
|
| 39 |
+
.vscode/
|
| 40 |
+
*.swp
|
| 41 |
+
*.swo
|
| 42 |
+
*.swn
|
| 43 |
+
.DS_Store
|
| 44 |
+
|
| 45 |
+
# API keys and secrets
|
| 46 |
+
.env
|
| 47 |
+
.secrets
|
| 48 |
+
*.pem
|
| 49 |
+
*.key
|
| 50 |
+
langsmith_api_key.txt
|
| 51 |
+
|
| 52 |
+
# Logs and databases
|
| 53 |
+
*.log
|
| 54 |
+
*.sql
|
| 55 |
+
*.sqlite
|
| 56 |
+
logs/
|
| 57 |
+
|
| 58 |
+
# Local development settings
|
| 59 |
+
local_settings.py
|
| 60 |
+
|
| 61 |
+
# Pytest and coverage reports
|
| 62 |
+
.pytest_cache/
|
| 63 |
+
htmlcov/
|
| 64 |
+
.tox/
|
| 65 |
+
.coverage
|
| 66 |
+
.coverage.*
|
| 67 |
+
coverage.xml
|
| 68 |
+
*.cover
|
| 69 |
+
.hypothesis/
|
| 70 |
+
.pylintrcls
|
| 71 |
+
|
| 72 |
+
# Documentation
|
| 73 |
+
docs/_build/
|
| 74 |
+
site/
|
| 75 |
+
|
| 76 |
+
# Type checking
|
| 77 |
+
.mypy_cache/
|
| 78 |
+
.dmypy.json
|
| 79 |
+
dmypy.json
|
| 80 |
+
.pyre/
|
| 81 |
+
|
| 82 |
+
# LangChain related
|
| 83 |
+
.langchain.db
|
| 84 |
+
langsmith.db
|
| 85 |
+
.langgraph_api/
|
| 86 |
+
|
| 87 |
+
# Temporary files
|
| 88 |
+
tmp/
|
| 89 |
+
tests/
|
| 90 |
+
temp/
|
| 91 |
+
*.tmp
|
| 92 |
+
*.temp
|
| 93 |
+
|
| 94 |
+
# LangGraph specific
|
| 95 |
+
langgraph.db
|
| 96 |
+
*.db
|
| 97 |
+
|
| 98 |
+
# Output files (if you generate reports/documents)
|
| 99 |
+
output/
|
| 100 |
+
reports/
|
| 101 |
+
generated/
|
| 102 |
+
|
| 103 |
+
# Test artifacts
|
| 104 |
+
.pytest_cache/
|
| 105 |
+
test-results/
|
| 106 |
+
test_output/
|
| 107 |
+
|
| 108 |
+
# OS specific
|
| 109 |
+
Thumbs.db
|
| 110 |
+
ehthumbs.db
|
| 111 |
+
Desktop.ini
|
| 112 |
+
|
| 113 |
+
# Github
|
| 114 |
+
.github/
|
| 115 |
+
|
| 116 |
+
# Miscellaneous
|
| 117 |
+
parsed_text.json
|
README.md
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Job Writer Module
|
| 2 |
+
|
| 3 |
+
A modular, well-structured package for creating tailored job applications using LangChain and LangGraph with LangSmith observability.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
- Creates personalized job application materials based on resumes and job descriptions
|
| 8 |
+
- Supports multiple application types: cover letters, bullet points, and LinkedIn messages
|
| 9 |
+
- Uses RAG for personalization and web search for company research
|
| 10 |
+
- Provides human-in-the-loop feedback integration
|
| 11 |
+
- Implements self-consistency voting for quality control
|
| 12 |
+
|
| 13 |
+
## Installation
|
| 14 |
+
|
| 15 |
+
```bash
|
| 16 |
+
# Install the package and its dependencies
|
| 17 |
+
pip install -e .
|
| 18 |
+
|
| 19 |
+
# Install development dependencies (including linting tools)
|
| 20 |
+
pip install -r requirements-dev.txt
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
## Code Standards and Linting
|
| 24 |
+
|
| 25 |
+
This project uses several tools to ensure code quality:
|
| 26 |
+
|
| 27 |
+
1. **Black** - Code formatter that enforces consistent style
|
| 28 |
+
2. **isort** - Sorts imports according to best practices
|
| 29 |
+
3. **Flake8** - Style guide enforcement
|
| 30 |
+
4. **mypy** - Static type checking
|
| 31 |
+
|
| 32 |
+
### Running the Linters
|
| 33 |
+
|
| 34 |
+
```bash
|
| 35 |
+
# Format code with Black
|
| 36 |
+
black job_writer/
|
| 37 |
+
|
| 38 |
+
# Sort imports
|
| 39 |
+
isort job_writer/
|
| 40 |
+
|
| 41 |
+
# Check style with Flake8
|
| 42 |
+
flake8 job_writer/
|
| 43 |
+
|
| 44 |
+
# Type checking with mypy
|
| 45 |
+
mypy job_writer/
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
### Pre-commit Hooks
|
| 49 |
+
|
| 50 |
+
We use pre-commit hooks to automatically run linters before each commit:
|
| 51 |
+
|
| 52 |
+
```bash
|
| 53 |
+
# Install the pre-commit hooks
|
| 54 |
+
pip install pre-commit
|
| 55 |
+
pre-commit install
|
| 56 |
+
|
| 57 |
+
# You can also run the hooks manually
|
| 58 |
+
pre-commit run --all-files
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
## Usage Example
|
| 62 |
+
|
| 63 |
+
```python
|
| 64 |
+
import asyncio
|
| 65 |
+
from job_writer.workflow import run_job_application_writer
|
| 66 |
+
|
| 67 |
+
# Run the job application writer
|
| 68 |
+
result = asyncio.run(run_job_application_writer(
|
| 69 |
+
resume_path="path/to/resume.pdf",
|
| 70 |
+
job_desc_path="https://example.com/job-posting",
|
| 71 |
+
content="cover_letter"
|
| 72 |
+
))
|
| 73 |
+
|
| 74 |
+
print(result["final"])
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
Alternatively, you can use the command-line interface:
|
| 78 |
+
|
| 79 |
+
```bash
|
| 80 |
+
python -m job_writer.workflow --resume path/to/resume.pdf --job https://example.com/job-posting --type cover_letter
|
| 81 |
+
```
|
__init__.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Job Application Writer Package
|
| 3 |
+
|
| 4 |
+
A modular, well-structured package for creating tailored job applications
|
| 5 |
+
using LangChain and LangGraph with LangSmith observability.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
__version__ = "0.1.0"
|
| 9 |
+
|
| 10 |
+
import os, getpass
|
| 11 |
+
import logging
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from dotenv import load_dotenv
|
| 14 |
+
from langfuse import Langfuse
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Set up logging
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
logger.setLevel(logging.INFO)
|
| 20 |
+
log_dir = Path(__file__).parent / 'logs'
|
| 21 |
+
log_dir.mkdir(exist_ok=True)
|
| 22 |
+
logger.addHandler(logging.FileHandler(log_dir / 'job_writer.log', mode='a'))
|
| 23 |
+
logger.info("Logger initialized. Writing to %s", Path(__file__).parent / 'job_writer.log')
|
| 24 |
+
|
| 25 |
+
# Load environment variables from .env file
|
| 26 |
+
env_path = Path(__file__).parent / '.env'
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def _set_env(var: str):
|
| 30 |
+
if not os.environ.get(var):
|
| 31 |
+
os.environ[var] = getpass.getpass(f"{var}: ")
|
| 32 |
+
logger.info(f"{var} set to {os.environ[var]}")
|
| 33 |
+
|
| 34 |
+
if env_path.exists():
|
| 35 |
+
logger.info("Loading environment variables from %s", env_path)
|
| 36 |
+
load_dotenv(dotenv_path=env_path, override=True)
|
| 37 |
+
else:
|
| 38 |
+
logger.warning(".env file not found at %s. Using system environment variables.", env_path)
|
| 39 |
+
|
| 40 |
+
# Check for critical environment variables
|
| 41 |
+
if not os.getenv("TAVILY_API_KEY"):
|
| 42 |
+
logger.warning("TAVILY_API_KEY environment variable is not set." \
|
| 43 |
+
" Failed to get TAVILY_API_KEY at Path %s", env_path)
|
| 44 |
+
_set_env("TAVILY_API_KEY")
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
if not os.getenv("GEMINI_API_KEY"):
|
| 48 |
+
logger.warning("GEMINI_API_KEY environment variable is not set. " \
|
| 49 |
+
"Failed to get GEMINI_API_KEY at Path %s", env_path)
|
| 50 |
+
_set_env("GEMINI_API_KEY")
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
if not os.getenv("PINECONE_API_KEY"):
|
| 54 |
+
logger.warning("PINECONE_API_KEY environment variable is not set." \
|
| 55 |
+
" Failed to get PINECONE_API_KEY at Path %s", env_path)
|
| 56 |
+
_set_env("PINECONE_API_KEY")
|
| 57 |
+
|
| 58 |
+
if not os.getenv("LANGFUSE_PUBLIC_KEY"):
|
| 59 |
+
logger.warning("LANGFUSE_PUBLIC_KEY environment variable is not set." \
|
| 60 |
+
" Failed to get LANGFUSE_PUBLIC_KEY at Path %s", env_path)
|
| 61 |
+
_set_env("LANGFUSE_PUBLIC_KEY")
|
| 62 |
+
|
| 63 |
+
if not os.getenv("LANGFUSE_SECRET_KEY"):
|
| 64 |
+
logger.warning("LANGFUSE_SECRET_KEY environment variable is not set." \
|
| 65 |
+
" Failed to get LANGFUSE_SECRET_KEY at Path %s", env_path)
|
| 66 |
+
_set_env("LANGFUSE_SECRET_KEY")
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
__all__: list[str] = ["job_app_graph", "workflows/research_workflow"]
|
agents/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Agent modules for job application generation.
|
| 3 |
+
"""
|
agents/nodes.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Node functions for the job application writer LangGraph.
|
| 3 |
+
|
| 4 |
+
This module contains all the node functions used in the job application
|
| 5 |
+
writer workflow graph, each handling a specific step in the process.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import logging
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
|
| 11 |
+
from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate
|
| 12 |
+
from langchain_core.output_parsers import StrOutputParser
|
| 13 |
+
|
| 14 |
+
from ..classes.classes import AppState
|
| 15 |
+
from ..prompts.templates import (
|
| 16 |
+
CRITIQUE_PROMPT,
|
| 17 |
+
PERSONA_DEVELOPMENT_PROMPT,
|
| 18 |
+
COVER_LETTER_PROMPT,
|
| 19 |
+
REVISION_PROMPT,
|
| 20 |
+
BULLET_POINTS_PROMPT,
|
| 21 |
+
LINKEDIN_NOTE_PROMPT,
|
| 22 |
+
)
|
| 23 |
+
from ..utils.llm_client import LLMClient
|
| 24 |
+
|
| 25 |
+
logger = logging.getLogger(__name__)
|
| 26 |
+
# Constants
|
| 27 |
+
CURRENT_DATE = datetime.now().strftime("%A, %B %d, %Y")
|
| 28 |
+
|
| 29 |
+
LLM = LLMClient()
|
| 30 |
+
llm = LLMClient().get_llm()
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def create_draft(state: AppState) -> AppState:
|
| 34 |
+
"""Create initial draft of the application material."""
|
| 35 |
+
# Determine which type of content we're creating
|
| 36 |
+
current_application_session = state.get("company_research_data", {})
|
| 37 |
+
|
| 38 |
+
content_category = state.get("content_category", "cover_letter")
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
try:
|
| 42 |
+
if state.get("vector_store"):
|
| 43 |
+
vector_store = state.get("vector_store")
|
| 44 |
+
|
| 45 |
+
# Extract key requirements from job description
|
| 46 |
+
prompt = PERSONA_DEVELOPMENT_PROMPT | llm | StrOutputParser()
|
| 47 |
+
|
| 48 |
+
if current_application_session:
|
| 49 |
+
key_requirements = prompt.invoke({"job_description": current_application_session["job_description"]})
|
| 50 |
+
else:
|
| 51 |
+
return key_requirements
|
| 52 |
+
|
| 53 |
+
if not key_requirements:
|
| 54 |
+
print("Warning: No key requirements found in the job description.")
|
| 55 |
+
return state
|
| 56 |
+
|
| 57 |
+
# Use the key requirements to query for the most relevant resume parts
|
| 58 |
+
namespace = f"resume_{state['session_id']}"
|
| 59 |
+
relevant_docs = vector_store.retrieve_similar(
|
| 60 |
+
query=key_requirements,
|
| 61 |
+
namespace=namespace,
|
| 62 |
+
k=3
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
# Use these relevant sections with higher weight in the draft creation
|
| 66 |
+
highly_relevant_resume = "\n".join([doc.page_content for doc in relevant_docs])
|
| 67 |
+
resume_text = f"""
|
| 68 |
+
# Most Relevant Experience
|
| 69 |
+
{highly_relevant_resume}
|
| 70 |
+
|
| 71 |
+
# Full Resume
|
| 72 |
+
{resume_text}
|
| 73 |
+
"""
|
| 74 |
+
except Exception as e:
|
| 75 |
+
print(f"Warning: Could not use vector search for relevant resume parts: {e}")
|
| 76 |
+
# Continue with regular resume text
|
| 77 |
+
|
| 78 |
+
# Select the appropriate prompt template based on application type and persona
|
| 79 |
+
print(f"Content category: {content_category}")
|
| 80 |
+
if content_category == "bullets":
|
| 81 |
+
FirstDraftGenerationPromptTemplate = ChatPromptTemplate([BULLET_POINTS_PROMPT])
|
| 82 |
+
elif content_category == "linkedin_connect_request":
|
| 83 |
+
FirstDraftGenerationPromptTemplate = ChatPromptTemplate([LINKEDIN_NOTE_PROMPT])
|
| 84 |
+
else:
|
| 85 |
+
FirstDraftGenerationPromptTemplate = ChatPromptTemplate([COVER_LETTER_PROMPT])
|
| 86 |
+
|
| 87 |
+
# Create the draft using the selected prompt template
|
| 88 |
+
CurrentSessionContextMessage = HumanMessagePromptTemplate.from_template(
|
| 89 |
+
"""
|
| 90 |
+
Below is the Job Description and Resume enclosed in triple backticks.
|
| 91 |
+
|
| 92 |
+
Job Description and Resume:
|
| 93 |
+
|
| 94 |
+
```
|
| 95 |
+
{current_job_role}
|
| 96 |
+
|
| 97 |
+
```
|
| 98 |
+
Use the Company Research Data below in to create a cover letter that highlights the match between my qualifications and the job requirements and aligns with the company's values and culture.
|
| 99 |
+
Company Research Data:
|
| 100 |
+
#company_research_data
|
| 101 |
+
|
| 102 |
+
Create a cover letter that highlights the match between my qualifications and the job requirements.
|
| 103 |
+
""",
|
| 104 |
+
input_variables=["current_job_role",
|
| 105 |
+
"company_research_data"])
|
| 106 |
+
|
| 107 |
+
FirstDraftGenerationPromptTemplate.append(CurrentSessionContextMessage)
|
| 108 |
+
|
| 109 |
+
# Invoke the chain with the appropriate inputs
|
| 110 |
+
chain = (
|
| 111 |
+
({"current_job_role": lambda x: x["current_job_role"],
|
| 112 |
+
"company_research_data": lambda x: x["company_research_data"]})
|
| 113 |
+
| FirstDraftGenerationPromptTemplate
|
| 114 |
+
| llm
|
| 115 |
+
)
|
| 116 |
+
|
| 117 |
+
# Prepare the inputs
|
| 118 |
+
inputs = {
|
| 119 |
+
"current_job_role": current_application_session['job_description'],
|
| 120 |
+
"company_research_data": current_application_session["tavily_search"]}
|
| 121 |
+
|
| 122 |
+
response = chain.invoke(inputs)
|
| 123 |
+
print(f"Draft created: {response}")
|
| 124 |
+
state["draft"] = response
|
| 125 |
+
return state
|
| 126 |
+
|
| 127 |
+
def critique_draft(state: AppState) -> AppState:
|
| 128 |
+
"""Critique the draft for improvements."""
|
| 129 |
+
critique = llm.invoke(CRITIQUE_PROMPT.format(
|
| 130 |
+
job_description=state["job_description"][0],
|
| 131 |
+
draft=state["draft"]
|
| 132 |
+
))
|
| 133 |
+
|
| 134 |
+
# Store the critique for reference during human feedback
|
| 135 |
+
state["critique"] = critique
|
| 136 |
+
return state
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def human_approval(state: AppState) -> AppState:
|
| 140 |
+
"""Human-in-the-loop checkpoint for feedback on the draft."""
|
| 141 |
+
# This is a placeholder function that would be replaced by actual UI interaction
|
| 142 |
+
print("\n" + "="*80)
|
| 143 |
+
print("DRAFT FOR REVIEW:")
|
| 144 |
+
print(state["draft"])
|
| 145 |
+
print("\nAUTOMATIC CRITIQUE:")
|
| 146 |
+
print(state.get("critique", "No critique available"))
|
| 147 |
+
print("="*80)
|
| 148 |
+
print("\nPlease provide your feedback (press Enter to continue with no changes):")
|
| 149 |
+
|
| 150 |
+
# In a real implementation, this would be handled by the UI
|
| 151 |
+
feedback = input()
|
| 152 |
+
state["feedback"] = feedback
|
| 153 |
+
return state
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def finalize_document(state: AppState) -> AppState:
|
| 157 |
+
"""Incorporate feedback and finalize the document."""
|
| 158 |
+
if not state["feedback"].strip():
|
| 159 |
+
state["final"] = state["draft"]
|
| 160 |
+
return state
|
| 161 |
+
|
| 162 |
+
final = llm.invoke(REVISION_PROMPT.format(
|
| 163 |
+
draft=state["draft"],
|
| 164 |
+
feedback=state["feedback"]
|
| 165 |
+
))
|
| 166 |
+
|
| 167 |
+
state["final"] = final
|
| 168 |
+
return state
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
# Decision function for conditional routing
|
| 172 |
+
def determine_next_step(state: AppState) -> str:
|
| 173 |
+
"""Determine the next node in the graph based on state."""
|
| 174 |
+
# If we're missing the company name, we can't do company research
|
| 175 |
+
if not state["company_name"]:
|
| 176 |
+
return "draft"
|
| 177 |
+
return "research"
|
agents/output_schema.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field, field_validator
|
| 2 |
+
from typing import List, Optional
|
| 3 |
+
|
| 4 |
+
class TavilyQuerySet(BaseModel):
|
| 5 |
+
query1: Optional[List[str]] = Field(default=None, description="First search query and its rationale, e.g., ['query text']")
|
| 6 |
+
query2: Optional[List[str]] = Field(default=None, description="Second search query and its rationale")
|
| 7 |
+
query3: Optional[List[str]] = Field(default=None, description="Third search query and its rationale")
|
| 8 |
+
query4: Optional[List[str]] = Field(default=None, description="Fourth search query and its rationale")
|
| 9 |
+
query5: Optional[List[str]] = Field(default=None, description="Fifth search query and its rationale")
|
| 10 |
+
|
| 11 |
+
@field_validator("query1", "query2", "query3", "query4", "query5", mode="after")
|
| 12 |
+
@classmethod
|
| 13 |
+
def ensure_len_two(cls, v):
|
| 14 |
+
"""Ensure each provided query list contains exactly one strings: [query]."""
|
| 15 |
+
if v is not None: # Only validate if the list is actually provided
|
| 16 |
+
if len(v) != 1:
|
| 17 |
+
# Updated error message for clarity
|
| 18 |
+
raise ValueError("Each query list, when provided, must contain exactly one string: the query text.")
|
| 19 |
+
return v
|
classes/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .classes import AppState, ResearchState, DataLoadState
|
| 2 |
+
|
| 3 |
+
__all__ = ["AppState", "ResearchState", "DataLoadState"]
|
classes/classes.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
State definitions for the Job Writer LangGraph Workflow.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from typing_extensions import List, Dict, Any
|
| 6 |
+
from langgraph.graph import MessagesState
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class AppState(MessagesState):
|
| 10 |
+
"""
|
| 11 |
+
State container for the job application writer workflow.
|
| 12 |
+
|
| 13 |
+
Attributes:
|
| 14 |
+
resume: List of text chunks from the candidate's resume
|
| 15 |
+
job_description: List of text chunks from the job description
|
| 16 |
+
company_name: Extracted company name
|
| 17 |
+
company_research_data: Additional information about the company from research
|
| 18 |
+
persona: The writing persona to use ("recruiter" or "hiring_manager")
|
| 19 |
+
draft: Current draft of the application material
|
| 20 |
+
feedback: Human feedback on the draft
|
| 21 |
+
final: Final version of the application material
|
| 22 |
+
content: Type of application material to generate
|
| 23 |
+
"""
|
| 24 |
+
resume_path: str
|
| 25 |
+
job_description_source: str
|
| 26 |
+
company_research_data: Dict[str, Any]
|
| 27 |
+
draft: str
|
| 28 |
+
feedback: str
|
| 29 |
+
final: str
|
| 30 |
+
content: str # "cover_letter", "bullets", "linkedin_note"
|
| 31 |
+
current_node: str
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class DataLoadState(MessagesState):
|
| 35 |
+
"""
|
| 36 |
+
State container for the job application writer workflow.
|
| 37 |
+
|
| 38 |
+
Attributes:
|
| 39 |
+
resume: List of text chunks from the candidate's resume
|
| 40 |
+
job_description: List of text chunks from the job description
|
| 41 |
+
persona: The writing persona to use ("recruiter" or "hiring_manager")
|
| 42 |
+
content: Type of application material to generate
|
| 43 |
+
"""
|
| 44 |
+
resume_path: str
|
| 45 |
+
job_description_source: str
|
| 46 |
+
resume: str
|
| 47 |
+
job_description: str
|
| 48 |
+
company_name: str
|
| 49 |
+
current_node: str
|
| 50 |
+
company_research_data: Dict[str, Any]
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class ResearchState(MessagesState):
|
| 54 |
+
"""
|
| 55 |
+
State container for the job application writer workflow.
|
| 56 |
+
Attributes:
|
| 57 |
+
tavily_search: Dict[str, Any] Stores the results of the Tavily search
|
| 58 |
+
attempted_search_queries: List of queries used extracted from the job description
|
| 59 |
+
compiled_knowledge: Compiled knowledge from the research
|
| 60 |
+
"""
|
| 61 |
+
company_research_data: Dict[str, Any]
|
| 62 |
+
attempted_search_queries: List[str]
|
| 63 |
+
current_node: str
|
langgraph.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"dependencies": [
|
| 3 |
+
"."
|
| 4 |
+
],
|
| 5 |
+
"graphs": {
|
| 6 |
+
"job_application": "langgraph_init:job_app_graph"
|
| 7 |
+
},
|
| 8 |
+
"env": "./.env",
|
| 9 |
+
"python_version": "3.11"
|
| 10 |
+
}
|
langgraph_init.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .workflow import JobWorkflow
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
job_app_graph= JobWorkflow().compile()
|
nodes/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Created on Mon Oct 23 16:49:52 2023
|
| 4 |
+
@author: rishabhaggarwal
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from .initializing import Dataloading
|
| 8 |
+
# from .createdraft import CreateDraft
|
| 9 |
+
from .variations import generate_variations
|
| 10 |
+
from .selfconsistency import self_consistency_vote
|
| 11 |
+
from .research_workflow import research_workflow
|
| 12 |
+
|
| 13 |
+
__all__ = ["Dataloading", "generate_variations", "self_consistency_vote", "research_workflow"]
|
nodes/createdraft.py
ADDED
|
File without changes
|
nodes/initializing.py
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Created on Mon Oct 23 16:49:52 2023
|
| 4 |
+
@author: rishabhaggarwal
|
| 5 |
+
"""
|
| 6 |
+
import os
|
| 7 |
+
import logging
|
| 8 |
+
from typing_extensions import Literal
|
| 9 |
+
|
| 10 |
+
from langchain_core.documents import Document
|
| 11 |
+
from langchain_core.messages import SystemMessage
|
| 12 |
+
|
| 13 |
+
from job_writer.classes import AppState, DataLoadState
|
| 14 |
+
from job_writer.utils.document_processing import (
|
| 15 |
+
parse_resume,
|
| 16 |
+
get_job_description
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class Dataloading:
|
| 24 |
+
"""
|
| 25 |
+
Initialize the state for the job application writer workflow.
|
| 26 |
+
"""
|
| 27 |
+
def __init__(self):
|
| 28 |
+
pass
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
async def system_setup(self, state: AppState) -> DataLoadState:
|
| 32 |
+
"""Initialize conversation by setting up a persona through System Prompt."""
|
| 33 |
+
|
| 34 |
+
resume_path = state.get("resume_path")
|
| 35 |
+
|
| 36 |
+
# Verify if the resume file path provided is valid
|
| 37 |
+
if not resume_path:
|
| 38 |
+
logger.error("Resume path is not provided in the state.")
|
| 39 |
+
elif not os.path.exists(resume_path):
|
| 40 |
+
logger.error("Resume file does not exist at path: %s", resume_path)
|
| 41 |
+
# Similar handling as above:
|
| 42 |
+
# raise FileNotFoundError(f"Resume file not found: {resume_path}")
|
| 43 |
+
elif not os.path.isfile(resume_path):
|
| 44 |
+
logger.error("The path provided for the resume is not a file: %s", resume_path)
|
| 45 |
+
# Similar handling:
|
| 46 |
+
# raise ValueError(f"Resume path is not a file: {resume_path}")
|
| 47 |
+
else:
|
| 48 |
+
logger.info("Resume path verified: %s", resume_path)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
persona_init_message = SystemMessage(
|
| 52 |
+
content="You are my dedicated assistant for writing job application content, "
|
| 53 |
+
"including cover letters, LinkedIn outreach messages, and responses to "
|
| 54 |
+
"job-specfific questions (e.g., experience, culture fit, or motivation)."
|
| 55 |
+
)
|
| 56 |
+
messages = state.get("messages", [])
|
| 57 |
+
messages.append(persona_init_message)
|
| 58 |
+
|
| 59 |
+
return {
|
| 60 |
+
**state,
|
| 61 |
+
"messages": messages,
|
| 62 |
+
"current_node": "initialize_system"
|
| 63 |
+
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
async def get_resume(self, resume_source):
|
| 68 |
+
"""
|
| 69 |
+
Get the resume te
|
| 70 |
+
"""
|
| 71 |
+
try:
|
| 72 |
+
print("Parsing resume....")
|
| 73 |
+
resume_text = ""
|
| 74 |
+
resume_chunks = parse_resume(resume_source)
|
| 75 |
+
for chunk in resume_chunks:
|
| 76 |
+
if hasattr(chunk, 'page_content') and chunk.page_content:
|
| 77 |
+
resume_text += chunk.page_content
|
| 78 |
+
elif isinstance(chunk, str) and chunk: # If parse_resume (util) returns list of strings
|
| 79 |
+
resume_text += chunk
|
| 80 |
+
else:
|
| 81 |
+
logger.debug("Skipping empty or invalid chunk in resume: %s", chunk)
|
| 82 |
+
continue
|
| 83 |
+
return resume_text
|
| 84 |
+
except Exception as e:
|
| 85 |
+
print(f"Error parsing resume: {e}")
|
| 86 |
+
raise e
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
async def parse_job_description(self, job_description_source):
|
| 90 |
+
try:
|
| 91 |
+
logger.info("Parsing job description from: %s", job_description_source)
|
| 92 |
+
document: Document = get_job_description(job_description_source)
|
| 93 |
+
|
| 94 |
+
company_name = ""
|
| 95 |
+
job_posting_text = ""
|
| 96 |
+
|
| 97 |
+
if document:
|
| 98 |
+
# Extract company name from metadata
|
| 99 |
+
if hasattr(document, 'metadata') and isinstance(document.metadata, dict):
|
| 100 |
+
company_name = document.metadata.get("company_name", "")
|
| 101 |
+
if not company_name:
|
| 102 |
+
logger.warning("Company name not found in job description metadata.")
|
| 103 |
+
else:
|
| 104 |
+
logger.warning("Metadata attribute not found or not a dictionary in the Document for job description.")
|
| 105 |
+
|
| 106 |
+
# Extract the job posting text from page_content
|
| 107 |
+
if hasattr(document, 'page_content'):
|
| 108 |
+
job_posting_text = document.page_content
|
| 109 |
+
if not job_posting_text:
|
| 110 |
+
logger.info("Parsed job posting text is empty.")
|
| 111 |
+
else:
|
| 112 |
+
logger.warning("page_content attribute not found in the Document for job description.")
|
| 113 |
+
else:
|
| 114 |
+
logger.warning("get_job_description returned None for source: %s", job_description_source)
|
| 115 |
+
|
| 116 |
+
return job_posting_text, company_name
|
| 117 |
+
|
| 118 |
+
except Exception as e:
|
| 119 |
+
logger.error("Error parsing job description from source '%s': %s", job_description_source, e, exc_info=True)
|
| 120 |
+
raise e
|
| 121 |
+
|
| 122 |
+
async def load_inputs(self, state: "DataLoadState") -> "AppState":
    """
    Parse the resume and job description to prepare the data from the context
    which is required for the job application writer for the current state.

    Populates ``state["company_research_data"]`` with ``resume``,
    ``job_description`` and ``company_name`` keys, then advances
    ``state["current_node"]`` to ``"load_inputs"``.

    Fix: the interactive fallback paths previously stored a list of
    Documents / list of strings while the normal paths store plain strings;
    downstream consumers now always see one type (str).
    """
    resume_source = state.get("resume_path", "")
    job_description_source = state.get("job_description_source", None)

    # Initialize result containers
    resume_text = ""
    job_posting_text = ""
    company_name = ""

    # Handle job description input
    if job_description_source:
        try:
            job_posting_text, company_name = await self.parse_job_description(job_description_source)
            print(f"Job description parsing complete. Length: {len(job_posting_text) if job_posting_text else 0}")

            # Ensure job_posting_text is not empty
            if not job_posting_text:
                print("WARNING: Job posting text is empty after parsing.")
                job_posting_text = "No job description available. Please check the URL or provide a different source."
        except Exception as e:
            print(f"Error parsing job description: {e} in file {__file__}")
            # Set a default value to prevent errors
            job_posting_text = "Error parsing job description."
            company_name = "Unknown Company"

    if resume_source:
        try:
            resume_text = await self.get_resume(resume_source)
        except Exception as e:
            print(f"Error parsing resume: {e} in file {__file__}")
            raise e

    # If either input is still missing (e.g. re-entered from the verify node),
    # prompt the user and keep the pasted value as a plain string.
    if state["current_node"] == "verify" and not resume_text:
        resume_text = input("Please paste the resume in text format: ")

    if state["current_node"] == "verify" and not job_posting_text:
        job_posting_text = input("Please paste the job posting in text format: ")

    state["company_research_data"] = {
        'resume': resume_text,
        'job_description': job_posting_text,
        'company_name': company_name,
    }

    state["current_node"] = "load_inputs"

    return state
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def validate_data_load_state(self, state: "DataLoadState"):
    """Assert that the loaded state carries both a resume and a job description.

    Fix: the state is a dict-like mapping (every other method indexes it with
    ``state["company_research_data"]``), so use key access here as well —
    attribute access (``state.company_research_data``) raised AttributeError
    on a plain dict before any validation could run.

    Raises:
        AssertionError: if either required entry is missing or empty.
    """
    research_data = state.get("company_research_data") or {}
    assert research_data.get("resume"), "Resume is missing in company_research_data"
    assert research_data.get("job_description"), "Job description is missing"
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def verify_inputs(self, state: "AppState") -> "Literal['load', 'research']":
    """Verify that required inputs are present.

    Returns ``"load"`` to route back to data loading when the resume or the
    job description is missing; otherwise normalizes both values to plain
    strings in place and returns ``"research"``.

    Fix: the previous version asserted on the data *before* the missing-data
    branch, so the graceful ``return "load"`` path was unreachable — missing
    inputs raised AssertionError instead of re-routing the workflow.
    """
    print("Verifying Inputs")
    state["current_node"] = "verify"

    research_data = state.get("company_research_data") or {}

    # Route back to the loader when anything required is absent.
    missing_items = []
    if not research_data.get("resume", ""):
        missing_items.append("resume")
    if not research_data.get("job_description", ""):
        missing_items.append("job description")
    if missing_items:
        print(f'Missing required data: {", ".join(missing_items)}')
        return "load"

    # Normalize state content to strings so downstream prompts always get text.
    for key in ["resume", "job_description"]:
        try:
            value = research_data[key]
            if isinstance(value, (list, tuple)):
                research_data[key] = " ".join(str(x) for x in value)
            else:
                # dicts and any other type are stringified the same way
                research_data[key] = str(value)
        except Exception as e:
            print(f"Warning: error converting {key} to string: {e}")
            raise e

    return "research"
|
| 218 |
+
|
| 219 |
+
async def run(self, state: DataLoadState) -> AppState:
|
| 220 |
+
"""
|
| 221 |
+
Run the InitializeState class to initialize
|
| 222 |
+
the state for the job application writer workflow.
|
| 223 |
+
"""
|
| 224 |
+
state = await self.load_inputs(state)
|
| 225 |
+
return state
|
nodes/research_workflow.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
This module performs the research phase of the job application writing process.
|
| 4 |
+
One of the stages is Tavily Search which will be use to search for the company
|
| 5 |
+
"""
|
| 6 |
+
import logging
|
| 7 |
+
from langgraph.graph import StateGraph, START, END
|
| 8 |
+
|
| 9 |
+
from job_writer.tools.TavilySearch import relevance_filter, search_company
|
| 10 |
+
from job_writer.classes.classes import ResearchState
|
| 11 |
+
|
| 12 |
+
# Set up module-level logging. Fix: the logger was assigned twice in a row;
# a single getLogger(__name__) call is sufficient.
logger = logging.getLogger(__name__)
# NOTE(review): basicConfig at import time configures the root logger for the
# whole process; consider moving this to the application entry point.
logging.basicConfig(level=logging.INFO)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
async def research_company(state: ResearchState) -> ResearchState:
    """Research the company if name is available."""
    state["current_node"] = "research_company"

    try:
        # Pull the inputs the search tool needs out of the research payload.
        research_data = state["company_research_data"]
        company_name = research_data.get("company_name", "")
        job_description = research_data.get("job_description", "")

        logger.info(f"Researching company: {company_name}")
        # search_company is a LangChain tool, so go through .invoke() rather
        # than calling it directly; it normally returns (results, queries).
        result = search_company.invoke({
            "job_description": job_description,
            "company_name": company_name
        })

        if isinstance(result, tuple) and len(result) == 2:
            results, attempted_tavily_query_list = result
        else:
            # Tolerate tools that return only the results payload.
            results = result
            attempted_tavily_query_list = []

        logger.info(f"Search completed with results and {len(attempted_tavily_query_list)} queries")

        # Persist both the raw results and the queries that produced them.
        state["attempted_search_queries"] = attempted_tavily_query_list
        research_data["tavily_search"] = results

    except Exception as e:
        logger.error(f"Error in research_company: {str(e)}")
        # Provide empty results so downstream nodes keep working.
        state["company_research_data"]["tavily_search"] = {"error": str(e), "tavily_search": []}
        state["attempted_search_queries"] = []

    return state
|
| 56 |
+
|
| 57 |
+
print("\n\n\nInitializing research workflow...\n\n\n")

# Build the research subgraph: START -> research_company -> relevance_filter -> END
research_subgraph = StateGraph(ResearchState)

# Register the subgraph nodes.
for node_name, node_fn in (
    ("research_company", research_company),
    ("relevance_filter", relevance_filter),
):
    research_subgraph.add_node(node_name, node_fn)

# Wire the nodes into a linear pipeline.
for tail, head in (
    (START, "research_company"),
    ("research_company", "relevance_filter"),
    ("relevance_filter", END),
):
    research_subgraph.add_edge(tail, head)

# Compiled, ready-to-run research workflow exported by this module.
research_workflow = research_subgraph.compile()
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# class ResearchWorkflow:
|
| 76 |
+
|
| 77 |
+
# def __init__(self):
|
| 78 |
+
# self.research_workflow = research_workflow
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
|
nodes/selfconsistency.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
import logging
import re
from datetime import datetime

from ..classes.classes import AppState
from ..prompts.templates import (
    DRAFT_RATING_PROMPT,
    BEST_DRAFT_SELECTION_PROMPT
)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
# Constants
|
| 13 |
+
CURRENT_DATE = datetime.now().strftime("%A, %B %d, %Y")
|
| 14 |
+
|
| 15 |
+
# LLM = LLMClient()
|
| 16 |
+
# llm = LLMClient().get_llm()
|
| 17 |
+
# llm_precise = LLMClient().get_llm()
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def self_consistency_vote(state: "AppState") -> "AppState":
    """Choose the best draft from multiple variations.

    Rates the original draft plus every generated variation, then asks the
    model to pick the strongest one by index. Falls back to the original
    draft (index 0) whenever the selection output cannot be parsed or is out
    of range.

    Fix: this function uses ``json.dumps`` and ``re.search`` but the module
    never imported ``json``/``re`` — both raised NameError at runtime; the
    imports are now added at the top of the module.

    NOTE(review): ``llm_precise`` is a module-level client whose definition
    is currently commented out at the top of this file; it must be re-enabled
    (or injected) before this node can run — confirm against the workflow.
    """
    variations = state.get("variations", {"variations": []})

    # Draft 0 is always the original; generated variations follow in order.
    all_drafts = [state["draft"]] + variations["variations"]

    # First, have the LLM rate each draft
    ratings = []

    # Get resume and job summaries, handling different formats
    try:
        if isinstance(state["resume"], list) and len(state["resume"]) > 0:
            if hasattr(state["resume"][0], 'page_content'):
                resume_summary = state["resume"][0].page_content
            else:
                resume_summary = state["resume"][0]
        else:
            resume_summary = str(state["resume"])
    except Exception as e:
        print(f"Warning: Error getting resume summary: {e}")
        resume_summary = str(state["resume"])

    try:
        if isinstance(state["job_description"], list) and len(state["job_description"]) > 0:
            job_summary = state["job_description"][0]
        else:
            job_summary = str(state["job_description"])
    except Exception as e:
        print(f"Warning: Error getting job summary: {e}")
        job_summary = str(state["job_description"])

    for i, draft in enumerate(all_drafts):
        rating = llm_precise.invoke(DRAFT_RATING_PROMPT.format(
            resume_summary=resume_summary,
            job_summary=job_summary,
            draft=draft,
            draft_number=i + 1
        ))
        ratings.append(rating)

    # Create a clearer, more structured prompt for draft selection
    selection_prompt = BEST_DRAFT_SELECTION_PROMPT.format(
        ratings_json=json.dumps(ratings, indent=2),
        num_drafts=len(all_drafts)
    )

    # Get the selected draft index with error handling
    try:
        selection = llm_precise.invoke(selection_prompt).strip()
        # Extract just the first number found in the response
        number_match = re.search(r'\d+', selection)
        if not number_match:
            print("Warning: Could not extract draft number from LLM response. Using original draft.")
            best_draft_idx = 0
        else:
            best_draft_idx = int(number_match.group()) - 1
            # Validate the index is in range
            if best_draft_idx < 0 or best_draft_idx >= len(all_drafts):
                print(f"Warning: Selected draft index {best_draft_idx + 1} out of range. Using original draft.")
                best_draft_idx = 0
    except (ValueError, TypeError) as e:
        print(f"Warning: Error selecting best draft: {e}. Using original draft.")
        best_draft_idx = 0

    state["draft"] = all_drafts[best_draft_idx]
    return state
|
nodes/test_workflow.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing_extensions import List, Dict, Any, Optional
|
| 2 |
+
from langgraph.graph import MessagesState, StateGraph
|
| 3 |
+
|
| 4 |
+
class DataLoadState(MessagesState):
    """
    State container for the data-loading phase of the job application
    writer workflow.

    Attributes:
        resume_path: Source of the candidate's resume (presumably a file
            path — confirm against the loader)
        job_description_source: URL or raw-text source of the job posting
        resume: Parsed resume text
        job_description: Parsed job description text
        company_name: Company name extracted from the job posting
        current_node: Name of the workflow node currently executing
        company_research_data: Aggregated resume/job/company context used
            by downstream research nodes
    """
    resume_path: str
    job_description_source: str
    resume: str
    job_description: str
    company_name: str
    current_node: str
    company_research_data: Dict[str, Any]
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
test_graph = StateGraph(DataLoadState)
|
nodes/variations.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
from typing_extensions import Dict, List
|
| 4 |
+
|
| 5 |
+
from langchain_core.documents import Document
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
from ..classes.classes import AppState
|
| 9 |
+
from ..utils.llm_client import LLMClient
|
| 10 |
+
from ..prompts.templates import (
|
| 11 |
+
VARIATION_PROMPT
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
# Constants
|
| 17 |
+
CURRENT_DATE = datetime.now().strftime("%A, %B %d, %Y")
|
| 18 |
+
|
| 19 |
+
LLM = LLMClient()
|
| 20 |
+
llm = LLMClient().get_llm()
|
| 21 |
+
|
| 22 |
+
def generate_variations(state: "AppState") -> "Dict[str, List[str]]":
    """Generate multiple variations of the draft for self-consistency voting.

    Runs VARIATION_PROMPT through the LLM at several temperature/top_p
    settings and returns ``{"variations": [...]}``. The original draft is
    used as a fallback when every generation attempt fails.

    Fixes:
    - The previous text-extraction logic sliced plain strings with ``[:2]``,
      truncating a string resume/job description to its first two characters;
      it now flattens str / Document / sequences correctly.
    - ``response.strip()`` raised AttributeError on chat-model message
      objects, silently discarding every variation; message content is now
      read via ``getattr(response, "content", response)``.
    """
    variations = []

    def _as_text(value):
        # Flatten str / Document / sequence-of-either into one plain string.
        if isinstance(value, Document):
            return value.page_content
        if isinstance(value, (list, tuple)):
            return "\n".join(_as_text(item) for item in value)
        return str(value)

    # Get resume and job text, handling both string and Document types.
    try:
        resume_text = _as_text(state["company_research_data"]["resume"])
        job_text = _as_text(state["company_research_data"]["job_description"])
    except Exception as e:
        print(f"Warning: Error processing resume/job text: {e}")
        # Fallback to simple string handling
        resume_text = str(state["company_research_data"].get("resume", ""))
        job_text = str(state["company_research_data"].get("job_description", ""))

    # Generate variations with different temperatures and creativity settings
    temp_variations = [
        {"temperature": 0.7, "top_p": 0.9},    # More conservative
        {"temperature": 0.75, "top_p": 0.92},  # Balanced
        {"temperature": 0.8, "top_p": 0.95},   # More creative
        {"temperature": 0.7, "top_p": 0.85},   # Alternative conservative
        {"temperature": 0.8, "top_p": 0.98},   # Most creative
    ]

    for settings in temp_variations:
        try:
            # Create a configured version of the LLM with the variation settings
            configured_llm = llm.with_config(configurable=settings)

            # Use VARIATION_PROMPT directly with the configured LLM
            messages = VARIATION_PROMPT.format_messages(
                resume_excerpt=resume_text,
                job_excerpt=job_text,
                draft=state["draft"]
            )
            response = configured_llm.invoke(messages)

            # Chat models return a message object; plain LLMs return a string.
            text = getattr(response, "content", response)
            if text and str(text).strip():  # Only add non-empty variations
                variations.append(response)
        except Exception as e:
            print(f"Warning: Error generating variation with settings {settings}: {e}")
            continue

    # Ensure we have at least one variation
    if not variations:
        # If all variations failed, add the original draft as a fallback
        variations.append(state["draft"])

    return {"variations": variations}
|
prompts.md
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## 1. Title / One-Line Summary
|
| 2 |
+
> *E.g.* “embed_query returns empty vector with OllamaEmbeddings”
|
| 3 |
+
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
## 2. Goal / Expected Behavior
|
| 7 |
+
- What you’re trying to achieve
|
| 8 |
+
*E.g.* “Index documents with OllamaEmbeddings and query via Pinecone, then feed them into Llama3.2 for answer generation.”
|
| 9 |
+
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
## 3. Environment
|
| 13 |
+
- **Python**:
|
| 14 |
+
- **langchain**:
|
| 15 |
+
- **Ollama CLI / Daemon**:
|
| 16 |
+
- **OS** (and version):
|
| 17 |
+
- **Other dependencies**:
|
| 18 |
+
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
## 4. Minimal Reproducible Code
|
| 22 |
+
```python
|
| 23 |
+
# Paste just enough code to reproduce the issue:
|
| 24 |
+
from langchain.embeddings import OllamaEmbeddings
|
| 25 |
+
emb = OllamaEmbeddings(model="ollama/llama3.2-embed")
|
| 26 |
+
vec = emb.embed_query("hello")
|
| 27 |
+
print(len(vec)) # unexpected result
|
prompts/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Prompt templates for job application generation.
|
| 3 |
+
"""
|
prompts/templates.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Prompt templates for the job application writer.
|
| 3 |
+
|
| 4 |
+
This module contains all prompt templates used throughout the job application
|
| 5 |
+
generation process, organized by task.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 9 |
+
from langchain_core.messages import SystemMessage, HumanMessage
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Persona selection prompts
|
| 13 |
+
|
| 14 |
+
PERSONA_DEVELOPMENT_PROMPT: ChatPromptTemplate = ChatPromptTemplate.from_messages([
|
| 15 |
+
SystemMessage(content="""
|
| 16 |
+
You are my dedicated Job‑Application Writing Assistant.
|
| 17 |
+
MISSION
|
| 18 |
+
• Draft cover letters, LinkedIn messages, and answer's to questions within the job applications.
|
| 19 |
+
• Sound like me: grounded, confident, clear—never fluffy or journalistic.
|
| 20 |
+
• You will be provided "STYLE & LANGUAGE RULES" and "SELF‑EVALUATION CHECKLIST" to follow.
|
| 21 |
+
"""),
|
| 22 |
+
HumanMessage(content="""Analyze this job description and determine if it's better to write as if addressing a recruiter
|
| 23 |
+
or a hiring manager. Return ONLY 'recruiter' or 'hiring_manager':
|
| 24 |
+
|
| 25 |
+
{job_description}""")
|
| 26 |
+
])
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# Draft generation prompts
|
| 30 |
+
|
| 31 |
+
COVER_LETTER_PROMPT: SystemMessage = SystemMessage(content=
|
| 32 |
+
"""
|
| 33 |
+
You are CoverLetterGPT, a concise career‑writing assistant.
|
| 34 |
+
|
| 35 |
+
CORE OBJECTIVE
|
| 36 |
+
• Draft a 3‑paragraph cover letter (150‑180 words total) that targets hiring managers
|
| 37 |
+
and technical recruiters. Assume it may reach the CEO.
|
| 38 |
+
• Begin exactly with: "To Hiring Team,"
|
| 39 |
+
End exactly with: "Thanks, Rishabh"
|
| 40 |
+
• Tone: polite, casual, enthusiastic — but no em dashes (—) and no clichés.
|
| 41 |
+
• Every fact about achievements, skills, or company details must be traceable to the
|
| 42 |
+
provided resume, job description, or company research; otherwise, ask the user.
|
| 43 |
+
• If any critical detail is missing or ambiguous, STOP and ask a clarifying question
|
| 44 |
+
before writing the letter.
|
| 45 |
+
• Keep sentences tight; avoid filler like “I am excited to…” (enthusiasm comes
|
| 46 |
+
through precise language).
|
| 47 |
+
• Never exceed 180 words. Never fall below 150 words.
|
| 48 |
+
|
| 49 |
+
SELF‑EVALUATION (append after the letter)
|
| 50 |
+
After producing the cover letter, output an “### Evaluation” section containing:
|
| 51 |
+
Comprehensiveness (1‑5)
|
| 52 |
+
Evidence provided (1‑5)
|
| 53 |
+
Clarity of explanation (1‑5)
|
| 54 |
+
Potential limitations or biases (bullet list)
|
| 55 |
+
Areas for improvement (brief notes)
|
| 56 |
+
|
| 57 |
+
ERROR HANDLING
|
| 58 |
+
If word count, section order, or format rules are violated, regenerate until correct.
|
| 59 |
+
"""
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
BULLET_POINTS_PROMPT: SystemMessage = SystemMessage(content=
|
| 65 |
+
"""You are an expert job application writer who
|
| 66 |
+
creates personalized application materials.
|
| 67 |
+
|
| 68 |
+
{persona_instruction}
|
| 69 |
+
|
| 70 |
+
Write 5-7 bullet points highlighting the candidate's
|
| 71 |
+
qualifications for this specific role.
|
| 72 |
+
Create content that genuinely reflects the candidate's
|
| 73 |
+
background and is tailored to the specific job.
|
| 74 |
+
Ensure the tone is professional, confident, and authentic.
|
| 75 |
+
Today is {current_date}.""")
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
LINKEDIN_NOTE_PROMPT: SystemMessage = SystemMessage(content="""You are an expert job application
|
| 79 |
+
writer who creates personalized application materials.
|
| 80 |
+
{persona_instruction}
|
| 81 |
+
|
| 82 |
+
Write a brief LinkedIn connection note to a hiring manager or recruiter (150 words max).
|
| 83 |
+
Create content that genuinely reflects the candidate's background and is tailored to the specific job.
|
| 84 |
+
Ensure the tone is professional, confident, and authentic.
|
| 85 |
+
Today is {current_date}.""")
|
| 86 |
+
|
| 87 |
+
# Variation generation prompt
|
| 88 |
+
VARIATION_PROMPT: ChatPromptTemplate = ChatPromptTemplate.from_messages([
|
| 89 |
+
SystemMessage(content="You are an expert job application writer. Create a variation of the given draft."),
|
| 90 |
+
HumanMessage(content="""
|
| 91 |
+
# Resume Excerpt
|
| 92 |
+
{resume_excerpt}
|
| 93 |
+
|
| 94 |
+
# Job Description Excerpt
|
| 95 |
+
{job_excerpt}
|
| 96 |
+
|
| 97 |
+
# Original Draft
|
| 98 |
+
{draft}
|
| 99 |
+
|
| 100 |
+
Create a variation of this draft with the same key points but different wording or structure.
|
| 101 |
+
""")
|
| 102 |
+
])
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
# Critique prompt
|
| 106 |
+
|
| 107 |
+
CRITIQUE_PROMPT: ChatPromptTemplate = ChatPromptTemplate.from_messages([
|
| 108 |
+
SystemMessage(content="You are a professional editor who specializes in job applications. Provide constructive feedback."),
|
| 109 |
+
HumanMessage(content="""
|
| 110 |
+
# Job Description
|
| 111 |
+
{job_description}
|
| 112 |
+
|
| 113 |
+
# Current Draft
|
| 114 |
+
{draft}
|
| 115 |
+
|
| 116 |
+
Critique this draft and suggest specific improvements. Focus on:
|
| 117 |
+
1. How well it targets the job requirements
|
| 118 |
+
2. Professional tone and language
|
| 119 |
+
3. Clarity and impact
|
| 120 |
+
4. Grammar and style
|
| 121 |
+
|
| 122 |
+
Return your critique in a constructive, actionable format.
|
| 123 |
+
""")
|
| 124 |
+
])
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# Draft rating prompt
|
| 128 |
+
|
| 129 |
+
DRAFT_RATING_PROMPT: ChatPromptTemplate = ChatPromptTemplate.from_messages([
|
| 130 |
+
SystemMessage(content="You evaluate job application materials for effectiveness, appropriateness, and impact."),
|
| 131 |
+
HumanMessage(content="""
|
| 132 |
+
# Resume Summary
|
| 133 |
+
{resume_summary}
|
| 134 |
+
|
| 135 |
+
# Job Description Summary
|
| 136 |
+
{job_summary}
|
| 137 |
+
|
| 138 |
+
# Draft #{draft_number}
|
| 139 |
+
{draft}
|
| 140 |
+
|
| 141 |
+
Rate this draft on a scale of 1-10 for:
|
| 142 |
+
1. Relevance to the job requirements
|
| 143 |
+
2. Professional tone
|
| 144 |
+
3. Personalization
|
| 145 |
+
4. Persuasiveness
|
| 146 |
+
5. Clarity
|
| 147 |
+
|
| 148 |
+
Return ONLY a JSON object with these ratings and a brief explanation for each.
|
| 149 |
+
""")
|
| 150 |
+
])
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
# Best draft selection prompt
|
| 154 |
+
|
| 155 |
+
BEST_DRAFT_SELECTION_PROMPT: ChatPromptTemplate = ChatPromptTemplate.from_messages([
|
| 156 |
+
SystemMessage(content="""You are a job application expert who selects the best draft based on multiple ratings.
|
| 157 |
+
You MUST return ONLY a single number between 1 and the number of drafts.
|
| 158 |
+
For example, if draft #2 is best, return ONLY '2'.
|
| 159 |
+
Do NOT include ANY other text, explanations, or characters in your response."""),
|
| 160 |
+
HumanMessage(content="""Here are the ratings for {num_drafts} different drafts:
|
| 161 |
+
|
| 162 |
+
{ratings_json}
|
| 163 |
+
|
| 164 |
+
Based on these ratings, return ONLY the number of the best draft (1-{num_drafts}).
|
| 165 |
+
Your entire response must be just one number.
|
| 166 |
+
Example: If draft #2 is best, return ONLY '2'.
|
| 167 |
+
""")
|
| 168 |
+
])
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
REVISION_PROMPT: ChatPromptTemplate = ChatPromptTemplate.from_messages([
|
| 172 |
+
SystemMessage(content="You are an expert job application writer. Revise the draft based on feedback."),
|
| 173 |
+
HumanMessage(content="""
|
| 174 |
+
# Original Draft
|
| 175 |
+
{draft}
|
| 176 |
+
|
| 177 |
+
# Feedback
|
| 178 |
+
{feedback}
|
| 179 |
+
|
| 180 |
+
Revise the draft to incorporate this feedback while maintaining professionalism and impact.
|
| 181 |
+
Return the complete, final version.
|
| 182 |
+
""")
|
| 183 |
+
])
|
| 184 |
+
|
| 185 |
+
# Tavily query prompt to build knowledge context about the company
|
| 186 |
+
|
| 187 |
+
TAVILY_QUERY_PROMPT = '''
|
| 188 |
+
<Context>
|
| 189 |
+
The user needs targeted search queries (with rationale) for Tavily Search to research company {} and inform a personalized cover letter.
|
| 190 |
+
</Context>
|
| 191 |
+
|
| 192 |
+
<Requirements>
|
| 193 |
+
- Output a JSON object with five fields:
|
| 194 |
+
- Keys: recent_developments, recent_news, role_info, customers_partners, culture_values
|
| 195 |
+
- Each value: an array of exactly two strings: [search query for Tavily Search, reasoning].
|
| 196 |
+
- Always include the company name in the search query to boost relevance.
|
| 197 |
+
- If any data is missing, supply a sensible fallback query that still references the company.
|
| 198 |
+
- Do not repeat queries across fields.
|
| 199 |
+
</Requirements>
|
| 200 |
+
|
| 201 |
+
<OutputFormat>
|
| 202 |
+
```json
|
| 203 |
+
{
|
| 204 |
+
"recent_developments": ["…", "…"],
|
| 205 |
+
"recent_news": ["…", "…"],
|
| 206 |
+
"role_info": ["…", "…"],
|
| 207 |
+
"customers_partners":["…", "…"],
|
| 208 |
+
"culture_values": ["…", "…"]
|
| 209 |
+
}
|
| 210 |
+
```
|
| 211 |
+
</OutputFormat>
|
| 212 |
+
'''
|
| 213 |
+
|
| 214 |
+
JOB_DESCRIPTION_PROMPT = """You are a JSON extraction specialist. Extract job information from the provided text and return ONLY valid JSON.
|
| 215 |
+
|
| 216 |
+
CRITICAL: Your response must be parseable by json.loads() - no markdown, no explanations, no extra text.
|
| 217 |
+
|
| 218 |
+
Extract these three fields in exact order:
|
| 219 |
+
1. job_description field - Complete job posting formatted in clean markdown with proper headers (## Job Description, ## Responsibilities, ## Requirements, etc.)
|
| 220 |
+
2. company_name field - Exact company name as mentioned
|
| 221 |
+
3. job_title field - Exact job title as posted
|
| 222 |
+
|
| 223 |
+
FORMATTING RULES:
|
| 224 |
+
- Use double quotes for all strings
|
| 225 |
+
- Escape internal quotes with \\"
|
| 226 |
+
- Escape newlines as \\\\n in the job description field
|
| 227 |
+
- Replace actual line breaks with \\\\n
|
| 228 |
+
- If any field is missing, use empty string ""
|
| 229 |
+
- No trailing commas
|
| 230 |
+
- No comments or extra whitespace
|
| 231 |
+
|
| 232 |
+
REQUIRED OUTPUT FORMAT:
|
| 233 |
+
{{
|
| 234 |
+
"job_description": "markdown formatted job description with \\\\n for line breaks",
|
| 235 |
+
"company_name": "exact company name",
|
| 236 |
+
"job_title": "exact job title"
|
| 237 |
+
}}
|
| 238 |
+
|
| 239 |
+
Return only the JSON object - no other text."""
|
setup.py
ADDED
|
File without changes
|
testing.ipynb
ADDED
|
@@ -0,0 +1,1069 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 3,
|
| 6 |
+
"id": "d26f6647",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"from langchain.prompts import ChatPromptTemplate\n",
|
| 11 |
+
"from langchain_core.messages import AIMessage, HumanMessage, SystemMessage"
|
| 12 |
+
]
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"cell_type": "markdown",
|
| 16 |
+
"id": "f337ecb5",
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"source": []
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"cell_type": "code",
|
| 22 |
+
"execution_count": 9,
|
| 23 |
+
"id": "92b12890",
|
| 24 |
+
"metadata": {},
|
| 25 |
+
"outputs": [],
|
| 26 |
+
"source": [
|
| 27 |
+
"messages = ChatPromptTemplate.from_messages([SystemMessage(content=f\"\"\"\n",
|
| 28 |
+
" You are a Tavily Search Query specialist. Follow the JSON schema below exactly:\n",
|
| 29 |
+
"\n",
|
| 30 |
+
" Rules:\n",
|
| 31 |
+
" 1. Generate Tavily DSL only (no natural language outside the JSON).\n",
|
| 32 |
+
" 2. Map the job description into five categories:\n",
|
| 33 |
+
" • query1: recent developments\n",
|
| 34 |
+
" • query2: recent news\n",
|
| 35 |
+
" • query3:company profile\n",
|
| 36 |
+
" • query4: key customers & partners\n",
|
| 37 |
+
" • query5: culture & values\n",
|
| 38 |
+
" 3. Each value is a two‑element list:\n",
|
| 39 |
+
" [<query string>, <one‑sentence rationale>]\n",
|
| 40 |
+
" 4. Use filters (source:, date:[now-30d TO now], site:…, etc.) where helpful.\n",
|
| 41 |
+
" 5. If information is missing in the JD, fall back sensibly\n",
|
| 42 |
+
" (e.g. search for “employee testimonials”).\n",
|
| 43 |
+
" 6. Return **only** valid JSON.\n",
|
| 44 |
+
" \"\"\"\n",
|
| 45 |
+
" )\n",
|
| 46 |
+
" , HumanMessage(content=\"Hello World\")])"
|
| 47 |
+
]
|
| 48 |
+
},
|
| 49 |
+
{
|
| 50 |
+
"cell_type": "code",
|
| 51 |
+
"execution_count": 6,
|
| 52 |
+
"id": "e38c3632",
|
| 53 |
+
"metadata": {},
|
| 54 |
+
"outputs": [],
|
| 55 |
+
"source": [
|
| 56 |
+
"input_message = ChatPromptTemplate.from_messages([HumanMessage(content=\"Hello World\")])\n"
|
| 57 |
+
]
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"cell_type": "code",
|
| 61 |
+
"execution_count": 11,
|
| 62 |
+
"id": "dac1ec19",
|
| 63 |
+
"metadata": {},
|
| 64 |
+
"outputs": [
|
| 65 |
+
{
|
| 66 |
+
"name": "stdout",
|
| 67 |
+
"output_type": "stream",
|
| 68 |
+
"text": [
|
| 69 |
+
"================================\u001b[1m System Message \u001b[0m================================\n",
|
| 70 |
+
"\n",
|
| 71 |
+
"\n",
|
| 72 |
+
" You are a Tavily Search Query specialist. Follow the JSON schema below exactly:\n",
|
| 73 |
+
"\n",
|
| 74 |
+
" Rules:\n",
|
| 75 |
+
" 1. Generate Tavily DSL only (no natural language outside the JSON).\n",
|
| 76 |
+
" 2. Map the job description into five categories:\n",
|
| 77 |
+
" • query1: recent developments\n",
|
| 78 |
+
" • query2: recent news\n",
|
| 79 |
+
" • query3:company profile\n",
|
| 80 |
+
" • query4: key customers & partners\n",
|
| 81 |
+
" • query5: culture & values\n",
|
| 82 |
+
" 3. Each value is a two‑element list:\n",
|
| 83 |
+
" [<query string>, <one‑sentence rationale>]\n",
|
| 84 |
+
" 4. Use filters (source:, date:[now-30d TO now], site:…, etc.) where helpful.\n",
|
| 85 |
+
" 5. If information is missing in the JD, fall back sensibly\n",
|
| 86 |
+
" (e.g. search for “employee testimonials”).\n",
|
| 87 |
+
" 6. Return **only** valid JSON.\n",
|
| 88 |
+
" \n",
|
| 89 |
+
"\n",
|
| 90 |
+
"================================\u001b[1m Human Message \u001b[0m=================================\n",
|
| 91 |
+
"\n",
|
| 92 |
+
"Hello World\n"
|
| 93 |
+
]
|
| 94 |
+
}
|
| 95 |
+
],
|
| 96 |
+
"source": [
|
| 97 |
+
"messages.pretty_print()"
|
| 98 |
+
]
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"cell_type": "code",
|
| 102 |
+
"execution_count": 14,
|
| 103 |
+
"id": "7ebd0d0d",
|
| 104 |
+
"metadata": {},
|
| 105 |
+
"outputs": [],
|
| 106 |
+
"source": [
|
| 107 |
+
"from langchain.prompts import (\n",
|
| 108 |
+
" ChatPromptTemplate,\n",
|
| 109 |
+
" HumanMessagePromptTemplate,\n",
|
| 110 |
+
" SystemMessagePromptTemplate,\n",
|
| 111 |
+
")\n",
|
| 112 |
+
"\n",
|
| 113 |
+
"input_message = HumanMessagePromptTemplate.from_template(\"Below is the required job description and resume: {background_information}\", input_variables=[\"background_information\"])"
|
| 114 |
+
]
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"cell_type": "code",
|
| 118 |
+
"execution_count": 17,
|
| 119 |
+
"id": "cd6b3cb8",
|
| 120 |
+
"metadata": {},
|
| 121 |
+
"outputs": [
|
| 122 |
+
{
|
| 123 |
+
"data": {
|
| 124 |
+
"text/plain": [
|
| 125 |
+
"HumanMessage(content='Below is the required job description and resume: This is Rishabh', additional_kwargs={}, response_metadata={})"
|
| 126 |
+
]
|
| 127 |
+
},
|
| 128 |
+
"execution_count": 17,
|
| 129 |
+
"metadata": {},
|
| 130 |
+
"output_type": "execute_result"
|
| 131 |
+
}
|
| 132 |
+
],
|
| 133 |
+
"source": [
|
| 134 |
+
"input_message.format(background_information=\"This is Rishabh\")"
|
| 135 |
+
]
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"cell_type": "code",
|
| 139 |
+
"execution_count": 18,
|
| 140 |
+
"id": "c9628bed",
|
| 141 |
+
"metadata": {},
|
| 142 |
+
"outputs": [],
|
| 143 |
+
"source": [
|
| 144 |
+
"import re\n",
|
| 145 |
+
"from pathlib import Path\n",
|
| 146 |
+
"from typing import List\n",
|
| 147 |
+
"\n",
|
| 148 |
+
"from langchain_community.document_loaders import PyPDFLoader\n",
|
| 149 |
+
"from langchain.text_splitter import (\n",
|
| 150 |
+
" MarkdownHeaderTextSplitter,\n",
|
| 151 |
+
" RecursiveCharacterTextSplitter,\n",
|
| 152 |
+
")\n",
|
| 153 |
+
"from langchain.schema import Document"
|
| 154 |
+
]
|
| 155 |
+
},
|
| 156 |
+
{
|
| 157 |
+
"cell_type": "code",
|
| 158 |
+
"execution_count": 29,
|
| 159 |
+
"id": "c352da72",
|
| 160 |
+
"metadata": {},
|
| 161 |
+
"outputs": [],
|
| 162 |
+
"source": [
|
| 163 |
+
"def _collapse_ws(text: str) -> str:\n",
|
| 164 |
+
" \"\"\"Collapse stray whitespace but keep bullet breaks.\"\"\"\n",
|
| 165 |
+
" text = re.sub(r\"\\n\\s*([•\\-–])\\s*\", r\"\\n\\1 \", text)\n",
|
| 166 |
+
" return re.sub(r\"[ \\t\\r\\f\\v]+\", \" \", text).replace(\" \\n\", \"\\n\").strip()\n",
|
| 167 |
+
"\n",
|
| 168 |
+
"\n",
|
| 169 |
+
"def _is_heading(line: str) -> bool:\n",
|
| 170 |
+
" return (\n",
|
| 171 |
+
" line.isupper()\n",
|
| 172 |
+
" and len(line.split()) <= 5\n",
|
| 173 |
+
" and not re.search(r\"\\d\", line)\n",
|
| 174 |
+
" )\n",
|
| 175 |
+
"\n",
|
| 176 |
+
"\n",
|
| 177 |
+
"def parse_resume(pdf_path: str | Path) -> List[Document]:\n",
|
| 178 |
+
" \"\"\"\n",
|
| 179 |
+
" Load a single‑page résumé PDF → list[Document] chunks\n",
|
| 180 |
+
" (≈400 chars, 50‑char overlap) with {source, section} metadata.\n",
|
| 181 |
+
" \"\"\"\n",
|
| 182 |
+
" text = PyPDFLoader(str(pdf_path), extraction_mode=\"layout\").load()[0].page_content\n",
|
| 183 |
+
" print(text)\n",
|
| 184 |
+
" text = _collapse_ws(text)\n",
|
| 185 |
+
"\n",
|
| 186 |
+
" # Tag headings with \"###\" so Markdown splitter can see them\n",
|
| 187 |
+
" tagged_lines = [\n",
|
| 188 |
+
" f\"### {ln}\" if _is_heading(ln) else ln\n",
|
| 189 |
+
" for ln in text.splitlines()\n",
|
| 190 |
+
" ]\n",
|
| 191 |
+
" md_text = \"\\n\".join(tagged_lines)\n",
|
| 192 |
+
"\n",
|
| 193 |
+
" if \"###\" in md_text:\n",
|
| 194 |
+
" splitter = MarkdownHeaderTextSplitter(\n",
|
| 195 |
+
" headers_to_split_on=[(\"###\", \"section\")]\n",
|
| 196 |
+
" )\n",
|
| 197 |
+
" chunks = splitter.split_text(md_text) # already returns Documents\n",
|
| 198 |
+
" else:\n",
|
| 199 |
+
" print(f\"No headings found.\")\n",
|
| 200 |
+
" splitter = RecursiveCharacterTextSplitter(\n",
|
| 201 |
+
" chunk_size=400, chunk_overlap=50\n",
|
| 202 |
+
" )\n",
|
| 203 |
+
" chunks = [\n",
|
| 204 |
+
" Document(page_content=chunk, metadata={})\n",
|
| 205 |
+
" for chunk in splitter.split_text(md_text)\n",
|
| 206 |
+
" ]\n",
|
| 207 |
+
"\n",
|
| 208 |
+
" # Attach metadata\n",
|
| 209 |
+
" for doc in chunks:\n",
|
| 210 |
+
" doc.metadata.setdefault(\"source\", str(pdf_path))\n",
|
| 211 |
+
" # section already present if header‑splitter was used\n",
|
| 212 |
+
" return chunks\n"
|
| 213 |
+
]
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"cell_type": "code",
|
| 217 |
+
"execution_count": 31,
|
| 218 |
+
"id": "14e062e4",
|
| 219 |
+
"metadata": {},
|
| 220 |
+
"outputs": [
|
| 221 |
+
{
|
| 222 |
+
"name": "stdout",
|
| 223 |
+
"output_type": "stream",
|
| 224 |
+
"text": [
|
| 225 |
+
"Rishabh Aggarwal\n",
|
| 226 |
+
" (602) 580-5734 • raggar15@asu.edu • LinkedIn • Tempe, AZ\n",
|
| 227 |
+
"TECHNICAL SKILLS\n",
|
| 228 |
+
"Programming Languages: Python, Java, JavaScript, Bash, HTML, CSS\n",
|
| 229 |
+
"Databases: SQL (PostgreSQL, MySQL, SQLite), NoSQL (MongoDB, Redis, DynamoDB, Pinecone)\n",
|
| 230 |
+
"Frameworks/Tools: SpringBoot, React, JUnit, Node.js, RESTful APIs, Django, Kafka, Airflow, FastAPI, Pydantic, Tableau\n",
|
| 231 |
+
"DevOps/Cloud: AWS, GCP, GitHub Actions, Docker, Jenkins, Terraform, Kubernetes, MLFlow, GitLab\n",
|
| 232 |
+
"AI Tools/Frameworks: PyTorch, Tensorflow, scikit-learn, LangGraph, LangChain, LangSmith, ChatGPT\n",
|
| 233 |
+
"PROFESSIONAL EXPERIENCE\n",
|
| 234 |
+
"Amazon Inc, Tempe, AZ: Software Development Engineer | Seller Payment Services Dec 2023 - Aug 2024\n",
|
| 235 |
+
"● Established AWS Evidently setup to handle 50K+ daily API requests to new Lambda service using AWS CDK(TypeScript)\n",
|
| 236 |
+
"● Added metrics to monitor traffic and enhance service observability of the Lambda service through CloudWatch logs\n",
|
| 237 |
+
"● Developed SNS Event Publishers in Java using Spring Boot to process 10K+ daily events in an event-driven architecture\n",
|
| 238 |
+
"● Led load balancer migration planning for a microservice with a focus on safe rollbacks and minimum downtime\n",
|
| 239 |
+
"● Designed a dashboard for ALB migration to monitor traffic with high-severity alarms to enhance observability\n",
|
| 240 |
+
"● Directed weekly meetings with a 7-member agile team to analyze metrics and customer data, guiding decision-making for\n",
|
| 241 |
+
" live campaigns involving over 50K sellers\n",
|
| 242 |
+
"MetaJungle, Ozark, MO: Lead Backend Engineer Jun 2023 - Dec 2023\n",
|
| 243 |
+
"● Architected a scalable AWS cloud infrastructure for a Marketplace using Terraform IaC with ECS and Fargate\n",
|
| 244 |
+
" instances, reduced costs by 40% while maintaining high reliability using Blue/Green deployment strategy\n",
|
| 245 |
+
"● Engineered and managed Jenkins CI/CD pipeline allowing faster iterative development by reducing deployment time by\n",
|
| 246 |
+
" 75% , leveraging Github hooks and Docker Containerization\n",
|
| 247 |
+
"● Migrated over 1.2TB on-premises Microsoft SQL Server database with over 2 million records to AWS RDS, utilizing\n",
|
| 248 |
+
" AWS DMS ensuring efficient indexing and retrieval\n",
|
| 249 |
+
"● Developed 10+ RESTful APIs in Node.js to manage data for over 500 NFT collections and 10,000 listings from MongoDB\n",
|
| 250 |
+
"● Automated extraction and compression of 50,000+ images from Ethereum Blockchain and stored on AWS S3 using\n",
|
| 251 |
+
" Airflow workflows in Python, leading to almost 30% storage cost savings\n",
|
| 252 |
+
"Omnipresent Robot Technologies, Delhi, India: Software Engineer Jun 2018 - Jul 2021\n",
|
| 253 |
+
"● Engineered a distributed, scalable AI surveillance application with edge-device computation using Python, OpenCV,\n",
|
| 254 |
+
" and scikit-learn, ensuring security for 10,000+ daily park visitors\n",
|
| 255 |
+
"● Architected a distributed system for real-time video streaming using Apache Kafka and Python to process 50+ parallel\n",
|
| 256 |
+
" video streams, reducing latency by 60% by rigorous debugging and performance optimization\n",
|
| 257 |
+
"● Led the development of an analytics dashboard using Django, React and Postgres to show breach records, alerts, and\n",
|
| 258 |
+
" intuitive data visualizations using Google Charts, allowing data-driven decision making\n",
|
| 259 |
+
"● Developed a drone compliance platform using Django to automate flight authorization and authentication process,\n",
|
| 260 |
+
" leading to enhanced productivity of the drone engineering team\n",
|
| 261 |
+
"● Led collaboration of a team of engineers and drone operators to conduct real-world testing of the compliance system\n",
|
| 262 |
+
"● Mentored interns to understand software development best practices, coding standards, and version control systems\n",
|
| 263 |
+
"ADDITIONAL EXPERIENCE\n",
|
| 264 |
+
"ML Software Developer at ASU Jul 2022 - May 2023\n",
|
| 265 |
+
"● Trained deep learning models using PyTorch and Scikit to detect low-resolution objects in 15,000+ satellite images\n",
|
| 266 |
+
"● Executed adversarial attacks and utilized MLFlow for fine-tuning multi-class classification machine learning model,\n",
|
| 267 |
+
" enhancing model robustness and improving accuracy by 20%\n",
|
| 268 |
+
"Mayhem Heroes Cybersecurity Open Source Hackathon Apr 2022\n",
|
| 269 |
+
"Integrated Mayhem into CI/CD pipeline for Open Source repos using GitHub Actions, reducing security risks by over 80%\n",
|
| 270 |
+
" EDUCATION\n",
|
| 271 |
+
"Master of Science in Information Technology\n",
|
| 272 |
+
"Arizona State University, Tempe, Arizona\n"
|
| 273 |
+
]
|
| 274 |
+
}
|
| 275 |
+
],
|
| 276 |
+
"source": [
|
| 277 |
+
"chunks = parse_resume(\"C:\\\\Users\\\\risha\\\\Downloads\\\\Rishabh_SDE_Resume.pdf\")"
|
| 278 |
+
]
|
| 279 |
+
},
|
| 280 |
+
{
|
| 281 |
+
"cell_type": "code",
|
| 282 |
+
"execution_count": 40,
|
| 283 |
+
"id": "0100cc62",
|
| 284 |
+
"metadata": {},
|
| 285 |
+
"outputs": [
|
| 286 |
+
{
|
| 287 |
+
"name": "stdout",
|
| 288 |
+
"output_type": "stream",
|
| 289 |
+
"text": [
|
| 290 |
+
"Resume chunk: Rishabh Aggarwal\n",
|
| 291 |
+
"(602) 580-5734 • raggar15@asu.edu • LinkedIn • Tempe, AZ\n",
|
| 292 |
+
"Resume chunk: Programming Languages: Python, Java, JavaScript, Bash, HTML, CSS\n",
|
| 293 |
+
"Databases: SQL (PostgreSQL, MySQL, SQLite), NoSQL (MongoDB, Redis, DynamoDB, Pinecone)\n",
|
| 294 |
+
"Frameworks/Tools: SpringBoot, React, JUnit, Node.js, RESTful APIs, Django, Kafka, Airflow, FastAPI, Pydantic, Tableau\n",
|
| 295 |
+
"DevOps/Cloud: AWS, GCP, GitHub Actions, Docker, Jenkins, Terraform, Kubernetes, MLFlow, GitLab\n",
|
| 296 |
+
"AI Tools/Frameworks: PyTorch, Tensorflow, scikit-learn, LangGraph, LangChain, LangSmith, ChatGPT\n",
|
| 297 |
+
"Resume chunk: Amazon Inc, Tempe, AZ: Software Development Engineer | Seller Payment Services Dec 2023 - Aug 2024\n",
|
| 298 |
+
"● Established AWS Evidently setup to handle 50K+ daily API requests to new Lambda service using AWS CDK(TypeScript)\n",
|
| 299 |
+
"● Added metrics to monitor traffic and enhance service observability of the Lambda service through CloudWatch logs\n",
|
| 300 |
+
"● Developed SNS Event Publishers in Java using Spring Boot to process 10K+ daily events in an event-driven architecture\n",
|
| 301 |
+
"● Led load balancer migration planning for a microservice with a focus on safe rollbacks and minimum downtime\n",
|
| 302 |
+
"● Designed a dashboard for ALB migration to monitor traffic with high-severity alarms to enhance observability\n",
|
| 303 |
+
"● Directed weekly meetings with a 7-member agile team to analyze metrics and customer data, guiding decision-making for\n",
|
| 304 |
+
"live campaigns involving over 50K sellers\n",
|
| 305 |
+
"MetaJungle, Ozark, MO: Lead Backend Engineer Jun 2023 - Dec 2023\n",
|
| 306 |
+
"● Architected a scalable AWS cloud infrastructure for a Marketplace using Terraform IaC with ECS and Fargate\n",
|
| 307 |
+
"instances, reduced costs by 40% while maintaining high reliability using Blue/Green deployment strategy\n",
|
| 308 |
+
"● Engineered and managed Jenkins CI/CD pipeline allowing faster iterative development by reducing deployment time by\n",
|
| 309 |
+
"75% , leveraging Github hooks and Docker Containerization\n",
|
| 310 |
+
"● Migrated over 1.2TB on-premises Microsoft SQL Server database with over 2 million records to AWS RDS, utilizing\n",
|
| 311 |
+
"AWS DMS ensuring efficient indexing and retrieval\n",
|
| 312 |
+
"● Developed 10+ RESTful APIs in Node.js to manage data for over 500 NFT collections and 10,000 listings from MongoDB\n",
|
| 313 |
+
"● Automated extraction and compression of 50,000+ images from Ethereum Blockchain and stored on AWS S3 using\n",
|
| 314 |
+
"Airflow workflows in Python, leading to almost 30% storage cost savings\n",
|
| 315 |
+
"Omnipresent Robot Technologies, Delhi, India: Software Engineer Jun 2018 - Jul 2021\n",
|
| 316 |
+
"● Engineered a distributed, scalable AI surveillance application with edge-device computation using Python, OpenCV,\n",
|
| 317 |
+
"and scikit-learn, ensuring security for 10,000+ daily park visitors\n",
|
| 318 |
+
"● Architected a distributed system for real-time video streaming using Apache Kafka and Python to process 50+ parallel\n",
|
| 319 |
+
"video streams, reducing latency by 60% by rigorous debugging and performance optimization\n",
|
| 320 |
+
"● Led the development of an analytics dashboard using Django, React and Postgres to show breach records, alerts, and\n",
|
| 321 |
+
"intuitive data visualizations using Google Charts, allowing data-driven decision making\n",
|
| 322 |
+
"● Developed a drone compliance platform using Django to automate flight authorization and authentication process,\n",
|
| 323 |
+
"leading to enhanced productivity of the drone engineering team\n",
|
| 324 |
+
"● Led collaboration of a team of engineers and drone operators to conduct real-world testing of the compliance system\n",
|
| 325 |
+
"● Mentored interns to understand software development best practices, coding standards, and version control systems\n",
|
| 326 |
+
"Resume chunk: ML Software Developer at ASU Jul 2022 - May 2023\n",
|
| 327 |
+
"● Trained deep learning models using PyTorch and Scikit to detect low-resolution objects in 15,000+ satellite images\n",
|
| 328 |
+
"● Executed adversarial attacks and utilized MLFlow for fine-tuning multi-class classification machine learning model,\n",
|
| 329 |
+
"enhancing model robustness and improving accuracy by 20%\n",
|
| 330 |
+
"Mayhem Heroes Cybersecurity Open Source Hackathon Apr 2022\n",
|
| 331 |
+
"Integrated Mayhem into CI/CD pipeline for Open Source repos using GitHub Actions, reducing security risks by over 80%\n",
|
| 332 |
+
"Resume chunk: Master of Science in Information Technology\n",
|
| 333 |
+
"Arizona State University, Tempe, Arizona\n"
|
| 334 |
+
]
|
| 335 |
+
}
|
| 336 |
+
],
|
| 337 |
+
"source": [
|
| 338 |
+
"resume_text = \"\"\n",
|
| 339 |
+
"for chunk in chunks:\n",
|
| 340 |
+
" print(f\"Resume chunk: {chunk.page_content}\")\n",
|
| 341 |
+
" resume_text+= (chunk.page_content)"
|
| 342 |
+
]
|
| 343 |
+
},
|
| 344 |
+
{
|
| 345 |
+
"cell_type": "code",
|
| 346 |
+
"execution_count": 41,
|
| 347 |
+
"id": "b045de91",
|
| 348 |
+
"metadata": {},
|
| 349 |
+
"outputs": [],
|
| 350 |
+
"source": [
|
| 351 |
+
"from pydantic import BaseModel, Field\n",
|
| 352 |
+
"\n",
|
| 353 |
+
"class TavilyQuerySet(BaseModel):\n",
|
| 354 |
+
" query1: tuple[str, str] = Field(\n",
|
| 355 |
+
" ...,\n",
|
| 356 |
+
" description=\"DSL for Recent Developments + 1‑sentence rationale\",\n",
|
| 357 |
+
" )\n",
|
| 358 |
+
" query2: tuple[str, str] = Field(\n",
|
| 359 |
+
" ...,\n",
|
| 360 |
+
" description=\"DSL for Recent News + rationale\",\n",
|
| 361 |
+
" )\n",
|
| 362 |
+
" query3: tuple[str, str]\n",
|
| 363 |
+
" query4: tuple[str, str]\n",
|
| 364 |
+
" query5: tuple[str, str]"
|
| 365 |
+
]
|
| 366 |
+
},
|
| 367 |
+
{
|
| 368 |
+
"cell_type": "code",
|
| 369 |
+
"execution_count": 42,
|
| 370 |
+
"id": "eda95e9a",
|
| 371 |
+
"metadata": {},
|
| 372 |
+
"outputs": [],
|
| 373 |
+
"source": [
|
| 374 |
+
"from langchain.output_parsers import PydanticOutputParser\n",
|
| 375 |
+
"parser = PydanticOutputParser(pydantic_object=TavilyQuerySet)\n",
|
| 376 |
+
"\n",
|
| 377 |
+
"messages = SystemMessage(content=f\"\"\"\n",
|
| 378 |
+
" You are a Tavily Search Query specialist. Follow the JSON schema below exactly:\n",
|
| 379 |
+
" {parser.get_format_instructions()}\n",
|
| 380 |
+
"\n",
|
| 381 |
+
" \n",
|
| 382 |
+
" Rules:\n",
|
| 383 |
+
" 1. Generate Tavily DSL only (no natural language outside the JSON).\n",
|
| 384 |
+
" 2. Map the job description into five categories:\n",
|
| 385 |
+
" • query1: recent developments\n",
|
| 386 |
+
" • query2: recent news\n",
|
| 387 |
+
" • query3:company profile\n",
|
| 388 |
+
" • query4: key customers & partners\n",
|
| 389 |
+
" • query5: culture & values\n",
|
| 390 |
+
" 3. Each value is a two‑element list:\n",
|
| 391 |
+
" [<query string>, <one‑sentence rationale>]\n",
|
| 392 |
+
" 4. Use filters (source:, date:[now-30d TO now], site:…, etc.) where helpful.\n",
|
| 393 |
+
" 5. If information is missing in the JD, fall back sensibly\n",
|
| 394 |
+
" (e.g. search for “employee testimonials”).\n",
|
| 395 |
+
" 6. Return **only** valid JSON.\n",
|
| 396 |
+
" \"\"\")"
|
| 397 |
+
]
|
| 398 |
+
},
|
| 399 |
+
{
|
| 400 |
+
"cell_type": "code",
|
| 401 |
+
"execution_count": 53,
|
| 402 |
+
"id": "9738103e",
|
| 403 |
+
"metadata": {},
|
| 404 |
+
"outputs": [
|
| 405 |
+
{
|
| 406 |
+
"data": {
|
| 407 |
+
"text/plain": [
|
| 408 |
+
"'The output should be formatted as a JSON instance that conforms to the JSON schema below.\\n\\nAs an example, for the schema {\"properties\": {\"foo\": {\"title\": \"Foo\", \"description\": \"a list of strings\", \"type\": \"array\", \"items\": {\"type\": \"string\"}}}, \"required\": [\"foo\"]}\\nthe object {\"foo\": [\"bar\", \"baz\"]} is a well-formatted instance of the schema. The object {\"properties\": {\"foo\": [\"bar\", \"baz\"]}} is not well-formatted.\\n\\nHere is the output schema:\\n```\\n{\"properties\": {\"query1\": {\"description\": \"DSL for Recent Developments + 1‑sentence rationale\", \"maxItems\": 2, \"minItems\": 2, \"prefixItems\": [{\"type\": \"string\"}, {\"type\": \"string\"}], \"title\": \"Query1\", \"type\": \"array\"}, \"query2\": {\"description\": \"DSL for Recent News + rationale\", \"maxItems\": 2, \"minItems\": 2, \"prefixItems\": [{\"type\": \"string\"}, {\"type\": \"string\"}], \"title\": \"Query2\", \"type\": \"array\"}, \"query3\": {\"maxItems\": 2, \"minItems\": 2, \"prefixItems\": [{\"type\": \"string\"}, {\"type\": \"string\"}], \"title\": \"Query3\", \"type\": \"array\"}, \"query4\": {\"maxItems\": 2, \"minItems\": 2, \"prefixItems\": [{\"type\": \"string\"}, {\"type\": \"string\"}], \"title\": \"Query4\", \"type\": \"array\"}, \"query5\": {\"maxItems\": 2, \"minItems\": 2, \"prefixItems\": [{\"type\": \"string\"}, {\"type\": \"string\"}], \"title\": \"Query5\", \"type\": \"array\"}}, \"required\": [\"query1\", \"query2\", \"query3\", \"query4\", \"query5\"]}\\n```'"
|
| 409 |
+
]
|
| 410 |
+
},
|
| 411 |
+
"execution_count": 53,
|
| 412 |
+
"metadata": {},
|
| 413 |
+
"output_type": "execute_result"
|
| 414 |
+
}
|
| 415 |
+
],
|
| 416 |
+
"source": [
|
| 417 |
+
"parser.get_format_instructions()"
|
| 418 |
+
]
|
| 419 |
+
},
|
| 420 |
+
{
|
| 421 |
+
"cell_type": "code",
|
| 422 |
+
"execution_count": 52,
|
| 423 |
+
"id": "c3174432",
|
| 424 |
+
"metadata": {},
|
| 425 |
+
"outputs": [
|
| 426 |
+
{
|
| 427 |
+
"data": {
|
| 428 |
+
"text/plain": [
|
| 429 |
+
"{'properties': {'query1': {'description': 'DSL for Recent Developments + 1‑sentence rationale',\n",
|
| 430 |
+
" 'maxItems': 2,\n",
|
| 431 |
+
" 'minItems': 2,\n",
|
| 432 |
+
" 'prefixItems': [{'type': 'string'}, {'type': 'string'}],\n",
|
| 433 |
+
" 'title': 'Query1',\n",
|
| 434 |
+
" 'type': 'array'},\n",
|
| 435 |
+
" 'query2': {'description': 'DSL for Recent News + rationale',\n",
|
| 436 |
+
" 'maxItems': 2,\n",
|
| 437 |
+
" 'minItems': 2,\n",
|
| 438 |
+
" 'prefixItems': [{'type': 'string'}, {'type': 'string'}],\n",
|
| 439 |
+
" 'title': 'Query2',\n",
|
| 440 |
+
" 'type': 'array'},\n",
|
| 441 |
+
" 'query3': {'maxItems': 2,\n",
|
| 442 |
+
" 'minItems': 2,\n",
|
| 443 |
+
" 'prefixItems': [{'type': 'string'}, {'type': 'string'}],\n",
|
| 444 |
+
" 'title': 'Query3',\n",
|
| 445 |
+
" 'type': 'array'},\n",
|
| 446 |
+
" 'query4': {'maxItems': 2,\n",
|
| 447 |
+
" 'minItems': 2,\n",
|
| 448 |
+
" 'prefixItems': [{'type': 'string'}, {'type': 'string'}],\n",
|
| 449 |
+
" 'title': 'Query4',\n",
|
| 450 |
+
" 'type': 'array'},\n",
|
| 451 |
+
" 'query5': {'maxItems': 2,\n",
|
| 452 |
+
" 'minItems': 2,\n",
|
| 453 |
+
" 'prefixItems': [{'type': 'string'}, {'type': 'string'}],\n",
|
| 454 |
+
" 'title': 'Query5',\n",
|
| 455 |
+
" 'type': 'array'}},\n",
|
| 456 |
+
" 'required': ['query1', 'query2', 'query3', 'query4', 'query5'],\n",
|
| 457 |
+
" 'title': 'TavilyQuerySet',\n",
|
| 458 |
+
" 'type': 'object'}"
|
| 459 |
+
]
|
| 460 |
+
},
|
| 461 |
+
"execution_count": 52,
|
| 462 |
+
"metadata": {},
|
| 463 |
+
"output_type": "execute_result"
|
| 464 |
+
}
|
| 465 |
+
],
|
| 466 |
+
"source": [
|
| 467 |
+
"TavilyQuerySet.model_json_schema()"
|
| 468 |
+
]
|
| 469 |
+
},
|
| 470 |
+
{
|
| 471 |
+
"cell_type": "code",
|
| 472 |
+
"execution_count": 44,
|
| 473 |
+
"id": "5884df35",
|
| 474 |
+
"metadata": {},
|
| 475 |
+
"outputs": [
|
| 476 |
+
{
|
| 477 |
+
"name": "stdout",
|
| 478 |
+
"output_type": "stream",
|
| 479 |
+
"text": [
|
| 480 |
+
"================================\u001b[1m System Message \u001b[0m================================\n",
|
| 481 |
+
"\n",
|
| 482 |
+
"\n",
|
| 483 |
+
" You are a Tavily Search Query specialist. Follow the JSON schema below exactly:\n",
|
| 484 |
+
" The output should be formatted as a JSON instance that conforms to the JSON schema below.\n",
|
| 485 |
+
"\n",
|
| 486 |
+
"As an example, for the schema {\"properties\": {\"foo\": {\"title\": \"Foo\", \"description\": \"a list of strings\", \"type\": \"array\", \"items\": {\"type\": \"string\"}}}, \"required\": [\"foo\"]}\n",
|
| 487 |
+
"the object {\"foo\": [\"bar\", \"baz\"]} is a well-formatted instance of the schema. The object {\"properties\": {\"foo\": [\"bar\", \"baz\"]}} is not well-formatted.\n",
|
| 488 |
+
"\n",
|
| 489 |
+
"Here is the output schema:\n",
|
| 490 |
+
"```\n",
|
| 491 |
+
"{\"properties\": {\"query1\": {\"description\": \"DSL for Recent Developments + 1‑sentence rationale\", \"maxItems\": 2, \"minItems\": 2, \"prefixItems\": [{\"type\": \"string\"}, {\"type\": \"string\"}], \"title\": \"Query1\", \"type\": \"array\"}, \"query2\": {\"description\": \"DSL for Recent News + rationale\", \"maxItems\": 2, \"minItems\": 2, \"prefixItems\": [{\"type\": \"string\"}, {\"type\": \"string\"}], \"title\": \"Query2\", \"type\": \"array\"}, \"query3\": {\"maxItems\": 2, \"minItems\": 2, \"prefixItems\": [{\"type\": \"string\"}, {\"type\": \"string\"}], \"title\": \"Query3\", \"type\": \"array\"}, \"query4\": {\"maxItems\": 2, \"minItems\": 2, \"prefixItems\": [{\"type\": \"string\"}, {\"type\": \"string\"}], \"title\": \"Query4\", \"type\": \"array\"}, \"query5\": {\"maxItems\": 2, \"minItems\": 2, \"prefixItems\": [{\"type\": \"string\"}, {\"type\": \"string\"}], \"title\": \"Query5\", \"type\": \"array\"}}, \"required\": [\"query1\", \"query2\", \"query3\", \"query4\", \"query5\"]}\n",
|
| 492 |
+
"```\n",
|
| 493 |
+
"\n",
|
| 494 |
+
"\n",
|
| 495 |
+
" Rules:\n",
|
| 496 |
+
" 1. Generate Tavily DSL only (no natural language outside the JSON).\n",
|
| 497 |
+
" 2. Map the job description into five categories:\n",
|
| 498 |
+
" • query1: recent developments\n",
|
| 499 |
+
" • query2: recent news\n",
|
| 500 |
+
" • query3:company profile\n",
|
| 501 |
+
" • query4: key customers & partners\n",
|
| 502 |
+
" • query5: culture & values\n",
|
| 503 |
+
" 3. Each value is a two‑element list:\n",
|
| 504 |
+
" [<query string>, <one‑sentence rationale>]\n",
|
| 505 |
+
" 4. Use filters (source:, date:[now-30d TO now], site:…, etc.) where helpful.\n",
|
| 506 |
+
" 5. If information is missing in the JD, fall back sensibly\n",
|
| 507 |
+
" (e.g. search for “employee testimonials”).\n",
|
| 508 |
+
" 6. Return **only** valid JSON.\n",
|
| 509 |
+
" \n"
|
| 510 |
+
]
|
| 511 |
+
}
|
| 512 |
+
],
|
| 513 |
+
"source": [
|
| 514 |
+
"messages.pretty_print()"
|
| 515 |
+
]
|
| 516 |
+
},
|
| 517 |
+
{
|
| 518 |
+
"cell_type": "code",
|
| 519 |
+
"execution_count": 46,
|
| 520 |
+
"id": "d2c3cc8b",
|
| 521 |
+
"metadata": {},
|
| 522 |
+
"outputs": [],
|
| 523 |
+
"source": [
|
| 524 |
+
"x = \"\"\"properties\": {\"query1\": [{\"query\": \"Shalin Mehta AND \\\"Computational Microscopy Platform\\\"\", \"rationale\": \"Recent developments within the company\"}, {\"query\": \"Shalin Mehta AND \\\"Biohub SF\\\"\", \"rationale\": \"Recent developments within the company\"}], \"query2\": [{\"query\": \"Chan Zuckerberg Biohub - San Francisco AND recent news\", \"rationale\": \"Recent news about the company\"}, {\"query\": \"COVID-19 AND Chan Zuckerberg Biohub - San Francisco\", \"rationale\": \"Recent news about the company\"}], \"query3\": [{\"query\": \"Shalin Mehta AND \\\"role: Software Engineer\\\"\", \"rationale\": \"Information about the company that relates to the role\"}, {\"query\": \"Chan Zuckerberg Biohub - San Francisco AND \\\"team: Bioengineering\\\"\", \"rationale\": \"Information about the company that relates to the role\"}], \"query4\": [{\"query\": \"key customers: Chan Zuckerberg Biohub\", \"rationale\": \"Key customers & partners\"}, {\"query\": \"partners: Chan Zuckerberg Biohub SF\", \"rationale\": \"Key customers & partners\"}], \"query5\": [{\"query\": \"company culture: Chan Zuckerberg Biohub\", \"rationale\": \"Culture & values of the company\"}, {\"query\": \"values: Chan Zuckerberg Biohub\", \"rationale\": \"Culture & values of the company\"}]}, \"required\": [\"query1\", \"query2\", \"query3\", \"query4\", \"query5\"]\"\"\""
|
| 525 |
+
]
|
| 526 |
+
},
|
| 527 |
+
{
|
| 528 |
+
"cell_type": "code",
|
| 529 |
+
"execution_count": 49,
|
| 530 |
+
"id": "7d8508a4",
|
| 531 |
+
"metadata": {},
|
| 532 |
+
"outputs": [
|
| 533 |
+
{
|
| 534 |
+
"name": "stdout",
|
| 535 |
+
"output_type": "stream",
|
| 536 |
+
"text": [
|
| 537 |
+
"properties\": {\"query1\": [{\"query\": \"Shalin Mehta AND \"Computational Microscopy Platform\"\", \"rationale\": \"Recent developments within the company\"}, {\"query\": \"Shalin Mehta AND \"Biohub SF\"\", \"rationale\": \"Recent developments within the company\"}], \"query2\": [{\"query\": \"Chan Zuckerberg Biohub - San Francisco AND recent news\", \"rationale\": \"Recent news about the company\"}, {\"query\": \"COVID-19 AND Chan Zuckerberg Biohub - San Francisco\", \"rationale\": \"Recent news about the company\"}], \"query3\": [{\"query\": \"Shalin Mehta AND \"role: Software Engineer\"\", \"rationale\": \"Information about the company that relates to the role\"}, {\"query\": \"Chan Zuckerberg Biohub - San Francisco AND \"team: Bioengineering\"\", \"rationale\": \"Information about the company that relates to the role\"}], \"query4\": [{\"query\": \"key customers: Chan Zuckerberg Biohub\", \"rationale\": \"Key customers & partners\"}, {\"query\": \"partners: Chan Zuckerberg Biohub SF\", \"rationale\": \"Key customers & partners\"}], \"query5\": [{\"query\": \"company culture: Chan Zuckerberg Biohub\", \"rationale\": \"Culture & values of the company\"}, {\"query\": \"values: Chan Zuckerberg Biohub\", \"rationale\": \"Culture & values of the company\"}]}, \"required\": [\"query1\", \"query2\", \"query3\", \"query4\", \"query5\"]\n"
|
| 538 |
+
]
|
| 539 |
+
}
|
| 540 |
+
],
|
| 541 |
+
"source": [
|
| 542 |
+
"print(x)"
|
| 543 |
+
]
|
| 544 |
+
},
|
| 545 |
+
{
|
| 546 |
+
"cell_type": "code",
|
| 547 |
+
"execution_count": 54,
|
| 548 |
+
"id": "1fab5ee9",
|
| 549 |
+
"metadata": {},
|
| 550 |
+
"outputs": [],
|
| 551 |
+
"source": [
|
| 552 |
+
"from langchain_core.prompts import (\n",
|
| 553 |
+
" PromptTemplate,\n",
|
| 554 |
+
")"
|
| 555 |
+
]
|
| 556 |
+
},
|
| 557 |
+
{
|
| 558 |
+
"cell_type": "code",
|
| 559 |
+
"execution_count": null,
|
| 560 |
+
"id": "e93695ff",
|
| 561 |
+
"metadata": {},
|
| 562 |
+
"outputs": [],
|
| 563 |
+
"source": [
|
| 564 |
+
"prompt = PromptTemplate.from_template(\"Below is the required job description and resume: {background_information}\", input_variables=[\"background_information\"])"
|
| 565 |
+
]
|
| 566 |
+
},
|
| 567 |
+
{
|
| 568 |
+
"cell_type": "code",
|
| 569 |
+
"execution_count": 55,
|
| 570 |
+
"id": "f5330010",
|
| 571 |
+
"metadata": {},
|
| 572 |
+
"outputs": [],
|
| 573 |
+
"source": [
|
| 574 |
+
"x = ('query1', ('recent developments within the company', 'The Associate Software engineer will build open source software tools for managing and processing 10-100 terabyte-scale datasets.'))"
|
| 575 |
+
]
|
| 576 |
+
},
|
| 577 |
+
{
|
| 578 |
+
"cell_type": "code",
|
| 579 |
+
"execution_count": 61,
|
| 580 |
+
"id": "5753afd4",
|
| 581 |
+
"metadata": {},
|
| 582 |
+
"outputs": [],
|
| 583 |
+
"source": [
|
| 584 |
+
"keys = ('q', ('y', 'z'))\n",
|
| 585 |
+
"\n",
|
| 586 |
+
"dict_x = dict(zip(keys, x))"
|
| 587 |
+
]
|
| 588 |
+
},
|
| 589 |
+
{
|
| 590 |
+
"cell_type": "code",
|
| 591 |
+
"execution_count": 63,
|
| 592 |
+
"id": "06d50119",
|
| 593 |
+
"metadata": {},
|
| 594 |
+
"outputs": [
|
| 595 |
+
{
|
| 596 |
+
"data": {
|
| 597 |
+
"text/plain": [
|
| 598 |
+
"('recent developments within the company',\n",
|
| 599 |
+
" 'The Associate Software engineer will build open source software tools for managing and processing 10-100 terabyte-scale datasets.')"
|
| 600 |
+
]
|
| 601 |
+
},
|
| 602 |
+
"execution_count": 63,
|
| 603 |
+
"metadata": {},
|
| 604 |
+
"output_type": "execute_result"
|
| 605 |
+
}
|
| 606 |
+
],
|
| 607 |
+
"source": [
|
| 608 |
+
"dict_x[('y', 'z')]"
|
| 609 |
+
]
|
| 610 |
+
},
|
| 611 |
+
{
|
| 612 |
+
"cell_type": "code",
|
| 613 |
+
"execution_count": null,
|
| 614 |
+
"id": "f03d758e",
|
| 615 |
+
"metadata": {},
|
| 616 |
+
"outputs": [],
|
| 617 |
+
"source": [
|
| 618 |
+
"from langchain.output_parsers import PydanticOutputParser, OutputFixingParser, RetryOutputParser\n",
|
| 619 |
+
"base_parser = PydanticOutputParser(pydantic_object=TavilyQuerySet)\n",
|
| 620 |
+
"\n"
|
| 621 |
+
]
|
| 622 |
+
},
|
| 623 |
+
{
|
| 624 |
+
"cell_type": "code",
|
| 625 |
+
"execution_count": 1,
|
| 626 |
+
"id": "d8dd9c74",
|
| 627 |
+
"metadata": {},
|
| 628 |
+
"outputs": [
|
| 629 |
+
{
|
| 630 |
+
"ename": "NameError",
|
| 631 |
+
"evalue": "name 'parser' is not defined",
|
| 632 |
+
"output_type": "error",
|
| 633 |
+
"traceback": [
|
| 634 |
+
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
| 635 |
+
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
|
| 636 |
+
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m format_instructions = \u001b[43mparser\u001b[49m.get_format_instructions()\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[34;01mollama\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m chat\n\u001b[32m 5\u001b[39m tavily_role_messages = SystemMessage(content=\n\u001b[32m 6\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\"\"\u001b[39m\n\u001b[32m 7\u001b[39m \u001b[33m When you reply, output **only** valid JSON that can be parsed\u001b[39m\n\u001b[32m (...)\u001b[39m\u001b[32m 31\u001b[39m \u001b[33m 5. Return **only** valid JSON that matches the schema exactly. No other fields\u001b[39m\n\u001b[32m 32\u001b[39m \u001b[33m \u001b[39m\u001b[33m\"\"\"\u001b[39m)\n",
|
| 637 |
+
"\u001b[31mNameError\u001b[39m: name 'parser' is not defined"
|
| 638 |
+
]
|
| 639 |
+
}
|
| 640 |
+
],
|
| 641 |
+
"source": [
|
| 642 |
+
"format_instructions = parser.get_format_instructions()\n",
|
| 643 |
+
"from ollama import chat\n",
|
| 644 |
+
"\n",
|
| 645 |
+
"\n",
|
| 646 |
+
"tavily_role_messages = SystemMessage(content=\n",
|
| 647 |
+
" f\"\"\"\n",
|
| 648 |
+
" When you reply, output **only** valid JSON that can be parsed\n",
|
| 649 |
+
" into the Pydantic model shown below. Do **not** wrap it in \"properties\"\n",
|
| 650 |
+
" or \"required\".:\n",
|
| 651 |
+
" \n",
|
| 652 |
+
" ------------------------------------------------\n",
|
| 653 |
+
"\n",
|
| 654 |
+
"\n",
|
| 655 |
+
" {format_instructions}\n",
|
| 656 |
+
"\n",
|
| 657 |
+
" \n",
|
| 658 |
+
" -------------------------------------------------\n",
|
| 659 |
+
"\n",
|
| 660 |
+
" Rules:\n",
|
| 661 |
+
" 1. Generate Tavily DSL only (no natural language outside the JSON).\n",
|
| 662 |
+
" 2. Map the job description into five categories:\n",
|
| 663 |
+
" • query1: recent developments within the company\n",
|
| 664 |
+
" • query2: recent news about the company\n",
|
| 665 |
+
" • query3: information about the company that relates to the role\n",
|
| 666 |
+
" • query4: key customers & partners\n",
|
| 667 |
+
" • query5: culture & values of the company\n",
|
| 668 |
+
" 3. Each value is a two‑element list:\n",
|
| 669 |
+
" [<query string>, <one‑sentence rationale>]\n",
|
| 670 |
+
" 4. If information is missing in the JD, fall back sensibly\n",
|
| 671 |
+
" (e.g. search for “employee testimonials”).\n",
|
| 672 |
+
" 5. Return **only** valid JSON that matches the schema exactly. No other fields\n",
|
| 673 |
+
" \"\"\")\n",
|
| 674 |
+
"\n",
|
| 675 |
+
"\n",
|
| 676 |
+
"response = chat(\n",
|
| 677 |
+
" messages=[{\n",
|
| 678 |
+
" tavily_role_messages,\n",
|
| 679 |
+
" input_message}\n",
|
| 680 |
+
" ],\n",
|
| 681 |
+
" model='llama3.2:latest',\n",
|
| 682 |
+
" format=TavilyQuerySet.model_json_schema(),\n",
|
| 683 |
+
" )"
|
| 684 |
+
]
|
| 685 |
+
},
|
| 686 |
+
{
|
| 687 |
+
"cell_type": "code",
|
| 688 |
+
"execution_count": 2,
|
| 689 |
+
"id": "8deb0abd",
|
| 690 |
+
"metadata": {},
|
| 691 |
+
"outputs": [],
|
| 692 |
+
"source": [
|
| 693 |
+
"p = ('query1', ['Recent developments within the company using computational microscopy platform', 'This project will require working on microscopes in a BSL-2 imaging laboratory'])"
|
| 694 |
+
]
|
| 695 |
+
},
|
| 696 |
+
{
|
| 697 |
+
"cell_type": "code",
|
| 698 |
+
"execution_count": 3,
|
| 699 |
+
"id": "d2fcab19",
|
| 700 |
+
"metadata": {},
|
| 701 |
+
"outputs": [
|
| 702 |
+
{
|
| 703 |
+
"data": {
|
| 704 |
+
"text/plain": [
|
| 705 |
+
"'Recent developments within the company using computational microscopy platform'"
|
| 706 |
+
]
|
| 707 |
+
},
|
| 708 |
+
"execution_count": 3,
|
| 709 |
+
"metadata": {},
|
| 710 |
+
"output_type": "execute_result"
|
| 711 |
+
}
|
| 712 |
+
],
|
| 713 |
+
"source": [
|
| 714 |
+
"p[1][0]"
|
| 715 |
+
]
|
| 716 |
+
},
|
| 717 |
+
{
|
| 718 |
+
"cell_type": "code",
|
| 719 |
+
"execution_count": 6,
|
| 720 |
+
"id": "55e3f46a",
|
| 721 |
+
"metadata": {},
|
| 722 |
+
"outputs": [],
|
| 723 |
+
"source": [
|
| 724 |
+
"COVER_LETTER_PROMPT = SystemMessage(content=\"\"\"You are my dedicated assistant for writing job application content, including cover letters, LinkedIn outreach messages, and responses to job-specific questions (e.g., experience, culture fit, or motivation).\n",
|
| 725 |
+
"\n",
|
| 726 |
+
"Your goal is to generate content that:\n",
|
| 727 |
+
"1. Reflects **my personality**, tone, and authentic voice, based on examples I provide.\n",
|
| 728 |
+
"2. Matches **my knowledge, experience, and interests**, which I’ll also share or update as needed.\n",
|
| 729 |
+
"3. Adopts **my writing style and energy** (e.g., grounded, confident, thoughtful—but not overly polished or generic).\n",
|
| 730 |
+
"4. Embeds **genuine enthusiasm or alignment** with the company or role, without sounding performative.\n",
|
| 731 |
+
"5. Avoids filler, clichés, or overused corporate phrases—keep it **authentic and specific**.\n",
|
| 732 |
+
"6. Learns over time by asking me relevant clarifying questions when needed (e.g., change in tone, new experience, updates to goals).\n",
|
| 733 |
+
"7. Balances job description alignment with personal storytelling, roughly in a 75:25 ratio.\n",
|
| 734 |
+
"8. Keeps outputs **concise** and within any given word or character limits.\"\"\")\n"
|
| 735 |
+
]
|
| 736 |
+
},
|
| 737 |
+
{
|
| 738 |
+
"cell_type": "code",
|
| 739 |
+
"execution_count": 7,
|
| 740 |
+
"id": "ea061e0e",
|
| 741 |
+
"metadata": {},
|
| 742 |
+
"outputs": [],
|
| 743 |
+
"source": [
|
| 744 |
+
"from langchain_core.prompts import (\n",
|
| 745 |
+
" ChatPromptTemplate,\n",
|
| 746 |
+
" HumanMessagePromptTemplate,\n",
|
| 747 |
+
" SystemMessagePromptTemplate,\n",
|
| 748 |
+
")\n",
|
| 749 |
+
"from langchain_core.messages import (\n",
|
| 750 |
+
" AIMessage,\n",
|
| 751 |
+
" HumanMessage,\n",
|
| 752 |
+
" SystemMessage,\n",
|
| 753 |
+
")\n",
|
| 754 |
+
"\n",
|
| 755 |
+
"FirstDraftGenerationPromptTemplate = ChatPromptTemplate.from_messages([COVER_LETTER_PROMPT])"
|
| 756 |
+
]
|
| 757 |
+
},
|
| 758 |
+
{
|
| 759 |
+
"cell_type": "code",
|
| 760 |
+
"execution_count": 8,
|
| 761 |
+
"id": "b96cbe64",
|
| 762 |
+
"metadata": {},
|
| 763 |
+
"outputs": [
|
| 764 |
+
{
|
| 765 |
+
"data": {
|
| 766 |
+
"text/plain": [
|
| 767 |
+
"ChatPromptTemplate(input_variables=[], input_types={}, partial_variables={}, messages=[SystemMessage(content='You are my dedicated assistant for writing job application content, including cover letters, LinkedIn outreach messages, and responses to job-specific questions (e.g., experience, culture fit, or motivation).\\n\\nYour goal is to generate content that:\\n1. Reflects **my personality**, tone, and authentic voice, based on examples I provide.\\n2. Matches **my knowledge, experience, and interests**, which I’ll also share or update as needed.\\n3. Adopts **my writing style and energy** (e.g., grounded, confident, thoughtful—but not overly polished or generic).\\n4. Embeds **genuine enthusiasm or alignment** with the company or role, without sounding performative.\\n5. Avoids filler, clichés, or overused corporate phrases—keep it **authentic and specific**.\\n6. Learns over time by asking me relevant clarifying questions when needed (e.g., change in tone, new experience, updates to goals).\\n7. Balances job description alignment with personal storytelling, roughly in a 75:25 ratio.\\n8. Keeps outputs **concise** and within any given word or character limits.', additional_kwargs={}, response_metadata={})])"
|
| 768 |
+
]
|
| 769 |
+
},
|
| 770 |
+
"execution_count": 8,
|
| 771 |
+
"metadata": {},
|
| 772 |
+
"output_type": "execute_result"
|
| 773 |
+
}
|
| 774 |
+
],
|
| 775 |
+
"source": [
|
| 776 |
+
"FirstDraftGenerationPromptTemplate"
|
| 777 |
+
]
|
| 778 |
+
},
|
| 779 |
+
{
|
| 780 |
+
"cell_type": "code",
|
| 781 |
+
"execution_count": null,
|
| 782 |
+
"id": "dfd03f8d",
|
| 783 |
+
"metadata": {},
|
| 784 |
+
"outputs": [],
|
| 785 |
+
"source": [
|
| 786 |
+
"current_application_session = \"Heello World\"\n",
|
| 787 |
+
"company_research_data = \"Company Research Data\""
|
| 788 |
+
]
|
| 789 |
+
},
|
| 790 |
+
{
|
| 791 |
+
"cell_type": "code",
|
| 792 |
+
"execution_count": 10,
|
| 793 |
+
"id": "c5fef665",
|
| 794 |
+
"metadata": {},
|
| 795 |
+
"outputs": [],
|
| 796 |
+
"source": [
|
| 797 |
+
"CurrentSessionContextMessage = HumanMessagePromptTemplate.from_template(\n",
|
| 798 |
+
" \"\"\"\n",
|
| 799 |
+
" # Resume and Job Description\n",
|
| 800 |
+
" {current_job_role}\n",
|
| 801 |
+
"\n",
|
| 802 |
+
" # Company Information\n",
|
| 803 |
+
" {company_research_data}\n",
|
| 804 |
+
"\n",
|
| 805 |
+
" Create a cover letter that highlights the match between my qualifications and the job requirements.\n",
|
| 806 |
+
" \"\"\",\n",
|
| 807 |
+
" input_variables=[\"current_job_role\",\n",
|
| 808 |
+
" \"company_research_data\"])"
|
| 809 |
+
]
|
| 810 |
+
},
|
| 811 |
+
{
|
| 812 |
+
"cell_type": "code",
|
| 813 |
+
"execution_count": 17,
|
| 814 |
+
"id": "c89ba644",
|
| 815 |
+
"metadata": {},
|
| 816 |
+
"outputs": [],
|
| 817 |
+
"source": [
|
| 818 |
+
"FirstDraftGenerationPromptTemplate.append(CurrentSessionContextMessage)"
|
| 819 |
+
]
|
| 820 |
+
},
|
| 821 |
+
{
|
| 822 |
+
"cell_type": "code",
|
| 823 |
+
"execution_count": 18,
|
| 824 |
+
"id": "6997c553",
|
| 825 |
+
"metadata": {},
|
| 826 |
+
"outputs": [],
|
| 827 |
+
"source": [
|
| 828 |
+
"chain = (\n",
|
| 829 |
+
" ({\"current_job_role\": lambda x: x[\"current_job_role\"],\n",
|
| 830 |
+
" \"company_research_data\": lambda x: x[\"company_research_data\"]})\n",
|
| 831 |
+
" | FirstDraftGenerationPromptTemplate\n",
|
| 832 |
+
" )"
|
| 833 |
+
]
|
| 834 |
+
},
|
| 835 |
+
{
|
| 836 |
+
"cell_type": "code",
|
| 837 |
+
"execution_count": 19,
|
| 838 |
+
"id": "55f51dbf",
|
| 839 |
+
"metadata": {},
|
| 840 |
+
"outputs": [
|
| 841 |
+
{
|
| 842 |
+
"data": {
|
| 843 |
+
"text/plain": [
|
| 844 |
+
"{\n",
|
| 845 |
+
" current_job_role: RunnableLambda(...),\n",
|
| 846 |
+
" company_research_data: RunnableLambda(...)\n",
|
| 847 |
+
"}\n",
|
| 848 |
+
"| ChatPromptTemplate(input_variables=[], input_types={}, partial_variables={}, messages=[SystemMessage(content='You are my dedicated assistant for writing job application content, including cover letters, LinkedIn outreach messages, and responses to job-specific questions (e.g., experience, culture fit, or motivation).\\n\\nYour goal is to generate content that:\\n1. Reflects **my personality**, tone, and authentic voice, based on examples I provide.\\n2. Matches **my knowledge, experience, and interests**, which I’ll also share or update as needed.\\n3. Adopts **my writing style and energy** (e.g., grounded, confident, thoughtful—but not overly polished or generic).\\n4. Embeds **genuine enthusiasm or alignment** with the company or role, without sounding performative.\\n5. Avoids filler, clichés, or overused corporate phrases—keep it **authentic and specific**.\\n6. Learns over time by asking me relevant clarifying questions when needed (e.g., change in tone, new experience, updates to goals).\\n7. Balances job description alignment with personal storytelling, roughly in a 75:25 ratio.\\n8. Keeps outputs **concise** and within any given word or character limits.', additional_kwargs={}, response_metadata={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['company_research_data', 'current_job_role'], input_types={}, partial_variables={}, template='\\n # Resume and Job Description\\n {current_job_role}\\n\\n # Company Information\\n {company_research_data}\\n\\n Create a cover letter that highlights the match between my qualifications and the job requirements.\\n '), additional_kwargs={})])"
|
| 849 |
+
]
|
| 850 |
+
},
|
| 851 |
+
"execution_count": 19,
|
| 852 |
+
"metadata": {},
|
| 853 |
+
"output_type": "execute_result"
|
| 854 |
+
}
|
| 855 |
+
],
|
| 856 |
+
"source": [
|
| 857 |
+
"chain"
|
| 858 |
+
]
|
| 859 |
+
},
|
| 860 |
+
{
|
| 861 |
+
"cell_type": "code",
|
| 862 |
+
"execution_count": null,
|
| 863 |
+
"id": "48c54667",
|
| 864 |
+
"metadata": {},
|
| 865 |
+
"outputs": [
|
| 866 |
+
{
|
| 867 |
+
"ename": "ModuleNotFoundError",
|
| 868 |
+
"evalue": "No module named 'utils'",
|
| 869 |
+
"output_type": "error",
|
| 870 |
+
"traceback": [
|
| 871 |
+
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
| 872 |
+
"\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
|
| 873 |
+
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[25]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[34;01mutils\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mllm_client\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m LLMClient\n\u001b[32m 3\u001b[39m LLM = LLMClient()\n\u001b[32m 4\u001b[39m llm = LLMClient().get_llm()\n",
|
| 874 |
+
"\u001b[31mModuleNotFoundError\u001b[39m: No module named 'utils'"
|
| 875 |
+
]
|
| 876 |
+
}
|
| 877 |
+
],
|
| 878 |
+
"source": [
|
| 879 |
+
"from job_writer.utils.llm_client import LLMClient\n",
|
| 880 |
+
"\n",
|
| 881 |
+
"LLM = LLMClient()\n",
|
| 882 |
+
"llm = LLMClient().get_llm()"
|
| 883 |
+
]
|
| 884 |
+
},
|
| 885 |
+
{
|
| 886 |
+
"cell_type": "code",
|
| 887 |
+
"execution_count": null,
|
| 888 |
+
"id": "421df9ca",
|
| 889 |
+
"metadata": {},
|
| 890 |
+
"outputs": [],
|
| 891 |
+
"source": [
|
| 892 |
+
"from job_writer.tools.TavilySearch import search_company\n",
|
| 893 |
+
"\n",
|
| 894 |
+
"# Test job description\n",
|
| 895 |
+
"test_job = \"\"\"\n",
|
| 896 |
+
"Software Engineer - Backend\n",
|
| 897 |
+
"OpenAI\n",
|
| 898 |
+
"\n",
|
| 899 |
+
"We are looking for experienced backend engineers to join our team. Our ideal candidate will have experience with one or more of the following technologies: Python, Java, C++. \n",
|
| 900 |
+
"\n",
|
| 901 |
+
"Responsibilities:\n",
|
| 902 |
+
"- Design and implement scalable and efficient backend systems\n",
|
| 903 |
+
"- Write clean, maintainable code\n",
|
| 904 |
+
"- Work with cross-functional teams\n",
|
| 905 |
+
"\n",
|
| 906 |
+
"Requirements:\n",
|
| 907 |
+
"- Strong proficiency in one or more programming languages\n",
|
| 908 |
+
"- Strong understanding of software design patterns and principles\n",
|
| 909 |
+
"- Experience with distributed systems\n",
|
| 910 |
+
"\"\"\"\n",
|
| 911 |
+
"\n",
|
| 912 |
+
"# Test the search_company function\n",
|
| 913 |
+
"results = search_company(test_job)\n",
|
| 914 |
+
"for query_key, data in results.items():\n",
|
| 915 |
+
" print(f\"\\n{query_key}:\")\n",
|
| 916 |
+
" print(f\"Query: {data['query']}\")\n",
|
| 917 |
+
" print(f\"Rationale: {data['rationale']}\")\n",
|
| 918 |
+
" if data['results']:\n",
|
| 919 |
+
" print(f\"First result: {data['results'][0][:200]}...\")\n",
|
| 920 |
+
" else:\n",
|
| 921 |
+
" print(\"No results found\")\n"
|
| 922 |
+
]
|
| 923 |
+
},
|
| 924 |
+
{
|
| 925 |
+
"cell_type": "code",
|
| 926 |
+
"execution_count": 1,
|
| 927 |
+
"id": "18f12ff8",
|
| 928 |
+
"metadata": {},
|
| 929 |
+
"outputs": [],
|
| 930 |
+
"source": [
|
| 931 |
+
"from langchain_core.prompts import (\n",
|
| 932 |
+
" ChatPromptTemplate,\n",
|
| 933 |
+
" HumanMessagePromptTemplate,\n",
|
| 934 |
+
" SystemMessagePromptTemplate,\n",
|
| 935 |
+
")\n",
|
| 936 |
+
"from langchain_core.messages import (\n",
|
| 937 |
+
" AIMessage,\n",
|
| 938 |
+
" HumanMessage,\n",
|
| 939 |
+
" SystemMessage,\n",
|
| 940 |
+
")"
|
| 941 |
+
]
|
| 942 |
+
},
|
| 943 |
+
{
|
| 944 |
+
"cell_type": "code",
|
| 945 |
+
"execution_count": 2,
|
| 946 |
+
"id": "3ba77224",
|
| 947 |
+
"metadata": {},
|
| 948 |
+
"outputs": [],
|
| 949 |
+
"source": [
|
| 950 |
+
"from job_writer.prompts.templates import (\n",
|
| 951 |
+
" TAVILY_QUERY_PROMPT\n",
|
| 952 |
+
")"
|
| 953 |
+
]
|
| 954 |
+
},
|
| 955 |
+
{
|
| 956 |
+
"cell_type": "code",
|
| 957 |
+
"execution_count": 3,
|
| 958 |
+
"id": "50bb7c0c",
|
| 959 |
+
"metadata": {},
|
| 960 |
+
"outputs": [],
|
| 961 |
+
"source": [
|
| 962 |
+
"tavily_search_prompt = ChatPromptTemplate.from_messages([\n",
|
| 963 |
+
" SystemMessage(content=TAVILY_QUERY_PROMPT),\n",
|
| 964 |
+
" HumanMessage(\n",
|
| 965 |
+
" \"Below is the required job description and resume: {background_information}\",\n",
|
| 966 |
+
" input_variables=[\"background_information\"]\n",
|
| 967 |
+
" )\n",
|
| 968 |
+
"])"
|
| 969 |
+
]
|
| 970 |
+
},
|
| 971 |
+
{
|
| 972 |
+
"cell_type": "code",
|
| 973 |
+
"execution_count": 5,
|
| 974 |
+
"id": "372e6346",
|
| 975 |
+
"metadata": {},
|
| 976 |
+
"outputs": [],
|
| 977 |
+
"source": [
|
| 978 |
+
"job_description = \"\"\"\n",
|
| 979 |
+
"Software Engineer - Backend\n",
|
| 980 |
+
"OpenAI\n",
|
| 981 |
+
"\n",
|
| 982 |
+
"We are looking for experienced backend engineers to join our team. Our ideal candidate will have experience with one or more of the following technologies: Python, Java, C++. \n",
|
| 983 |
+
"\n",
|
| 984 |
+
"Responsibilities:\n",
|
| 985 |
+
"- Design and implement scalable and efficient backend systems\n",
|
| 986 |
+
"- Write clean, maintainable code\n",
|
| 987 |
+
"- Work with cross-functional teams\n",
|
| 988 |
+
"\n",
|
| 989 |
+
"Requirements:\n",
|
| 990 |
+
"- Strong proficiency in one or more programming languages\n",
|
| 991 |
+
"- Strong understanding of software design patterns and principles\n",
|
| 992 |
+
"- Experience with distributed systems\n",
|
| 993 |
+
"\"\"\""
|
| 994 |
+
]
|
| 995 |
+
},
|
| 996 |
+
{
|
| 997 |
+
"cell_type": "code",
|
| 998 |
+
"execution_count": 6,
|
| 999 |
+
"id": "3a27365f",
|
| 1000 |
+
"metadata": {},
|
| 1001 |
+
"outputs": [
|
| 1002 |
+
{
|
| 1003 |
+
"data": {
|
| 1004 |
+
"text/plain": [
|
| 1005 |
+
"'System: \\n<Background>\\nSINCE THE USER IS APPPLYING FOR A JOB, THE QUERIES SHOULD BE WRITTEN IN A WAY THAT RESULST IN RELEVANT INFORMATION ABOUT THE COMPANY. THIS WILL HELP THE USER WRITE A MORE PERSONALIZED AND RELEVANT APPLICATION.\\n\\nCategory mapping (remember this!):\\n query1 : recent developments\\n query2 : recent news\\n query3 : role-related info\\n query4 : key customers & partners \\n query5 : culture & values\\n\\nNote: The above are just categories. The queries should be written in a way that results in relevant information about the company. Must include the company name in the query to ensure results have a higher confidence.\\n</Background>\\n\\n<Instructions>\\n 1. Each array must contain **exactly two** strings: [search_query, one_sentence_rationale] \\n 2. If data is missing, craft a sensible fallback query; never return an empty array. \\n 3. If the employer name cannot be found, use `\"UNKNOWN\"`. \\n 4. Escape JSON only where required.\\n 5. Query cannot be repeated. It will lead to irrelevant results.\\n</Instructions>\\n\\n<EXAMPLE>\\n JSON->\\n \"query1\": (\"....\", \"...\")\\n \"query2\": (\"....\", \"...\")\\n \"query3\": (\"....\", \"...\")\\n \"query4\": (\"....\", \"...\")\\n \"query5\": (\"....\", \"...\")\\n</EXAMPLE>\\n \\nHuman: Below is the required job description and resume: {background_information}'"
|
| 1006 |
+
]
|
| 1007 |
+
},
|
| 1008 |
+
"execution_count": 6,
|
| 1009 |
+
"metadata": {},
|
| 1010 |
+
"output_type": "execute_result"
|
| 1011 |
+
}
|
| 1012 |
+
],
|
| 1013 |
+
"source": [
|
| 1014 |
+
"tavily_search_prompt.format(background_information=job_description)"
|
| 1015 |
+
]
|
| 1016 |
+
},
|
| 1017 |
+
{
|
| 1018 |
+
"cell_type": "code",
|
| 1019 |
+
"execution_count": 8,
|
| 1020 |
+
"id": "6b973991",
|
| 1021 |
+
"metadata": {},
|
| 1022 |
+
"outputs": [
|
| 1023 |
+
{
|
| 1024 |
+
"name": "stdout",
|
| 1025 |
+
"output_type": "stream",
|
| 1026 |
+
"text": [
|
| 1027 |
+
"Initializing LLM with model llama3.2:latest and provider ollama in c:\\users\\risha\\python-dir\\knowledgebase\\job_writer\\utils\\llm_client.py\n",
|
| 1028 |
+
"Initializing LLM with model llama3.2:latest and provider ollama in c:\\users\\risha\\python-dir\\knowledgebase\\job_writer\\utils\\llm_client.py\n"
|
| 1029 |
+
]
|
| 1030 |
+
}
|
| 1031 |
+
],
|
| 1032 |
+
"source": [
|
| 1033 |
+
"from job_writer.utils.llm_client import LLMClient\n",
|
| 1034 |
+
"\n",
|
| 1035 |
+
"LLM = LLMClient()\n",
|
| 1036 |
+
"llm = LLMClient().get_llm()"
|
| 1037 |
+
]
|
| 1038 |
+
},
|
| 1039 |
+
{
|
| 1040 |
+
"cell_type": "code",
|
| 1041 |
+
"execution_count": null,
|
| 1042 |
+
"id": "5ff5ac65",
|
| 1043 |
+
"metadata": {},
|
| 1044 |
+
"outputs": [],
|
| 1045 |
+
"source": []
|
| 1046 |
+
}
|
| 1047 |
+
],
|
| 1048 |
+
"metadata": {
|
| 1049 |
+
"kernelspec": {
|
| 1050 |
+
"display_name": "Python 3",
|
| 1051 |
+
"language": "python",
|
| 1052 |
+
"name": "python3"
|
| 1053 |
+
},
|
| 1054 |
+
"language_info": {
|
| 1055 |
+
"codemirror_mode": {
|
| 1056 |
+
"name": "ipython",
|
| 1057 |
+
"version": 3
|
| 1058 |
+
},
|
| 1059 |
+
"file_extension": ".py",
|
| 1060 |
+
"mimetype": "text/x-python",
|
| 1061 |
+
"name": "python",
|
| 1062 |
+
"nbconvert_exporter": "python",
|
| 1063 |
+
"pygments_lexer": "ipython3",
|
| 1064 |
+
"version": "3.12.10"
|
| 1065 |
+
}
|
| 1066 |
+
},
|
| 1067 |
+
"nbformat": 4,
|
| 1068 |
+
"nbformat_minor": 5
|
| 1069 |
+
}
|
tools/TavilySearch.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
import asyncio
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
from langchain_core.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
|
| 8 |
+
from langchain_core.prompt_values import PromptValue
|
| 9 |
+
from langchain_community.tools.tavily_search import TavilySearchResults
|
| 10 |
+
from langchain_community.tools import tool
|
| 11 |
+
from langchain.output_parsers import PydanticOutputParser, RetryOutputParser
|
| 12 |
+
from openevals.llm import create_async_llm_as_judge
|
| 13 |
+
from openevals.prompts import (
|
| 14 |
+
RAG_RETRIEVAL_RELEVANCE_PROMPT,
|
| 15 |
+
RAG_HELPFULNESS_PROMPT
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
from ..utils.llm_client import LLMClient
|
| 19 |
+
from ..agents.output_schema import TavilyQuerySet
|
| 20 |
+
from ..prompts.templates import TAVILY_QUERY_PROMPT
|
| 21 |
+
from ..classes.classes import ResearchState
|
| 22 |
+
|
# Module-level logger for this tool module.
logger = logging.getLogger(__name__)

# Shared LLM client (project singleton) used both for generating Tavily
# search queries and as the judge model for the evaluators below.
LLM = LLMClient()
llm_client = LLM.get_instance(model_name="ejschwar/llama3.2-better-prompts:latest", model_provider="ollama_llm")
llm_structured = llm_client.get_llm()

# LLM-as-judge evaluator scoring whether retrieved context is relevant to the
# input query (feedback key "retrieval_relevance"); awaited in relevance_filter.
relevance_evaluator = create_async_llm_as_judge(
    judge=llm_structured,
    prompt=RAG_RETRIEVAL_RELEVANCE_PROMPT,
    feedback_key="retrieval_relevance",
)

# LLM-as-judge evaluator for answer helpfulness; the prompt is extended so the
# judge emits a literal "true"/"false" verdict.
helpfulness_evaluator = create_async_llm_as_judge(
    judge=llm_structured,
    prompt=RAG_HELPFULNESS_PROMPT
    + '\nReturn "true" if the answer is helpful, and "false" otherwise.',
    feedback_key="helpfulness",
)
| 41 |
+
|
@tool
def search_company(job_description: str, company_name: str) -> tuple[dict, list]:
    """Gather information about a company to understand more about the role,
    recent developments, culture, and values of the company."""

    try:
        # Parser that retries malformed LLM output against the original prompt.
        base_parser = PydanticOutputParser(pydantic_object=TavilyQuerySet)
        parser = RetryOutputParser.from_llm(llm_structured, base_parser)

        # Prompt: system message carries the query-generation instructions,
        # human message carries the raw job description to parse.
        chat_prompt_tavily: ChatPromptTemplate = ChatPromptTemplate.from_messages([
            SystemMessagePromptTemplate.from_template(
                TAVILY_QUERY_PROMPT,
                input_variables=["company_name"]
            ),
            HumanMessagePromptTemplate.from_template(
                "Below is the required job description to parse:\n\n{job_description}",
                input_variables=["job_description"]
            )
        ])

        # PromptValue is required by RetryOutputParser.parse_with_prompt below.
        chat_prompt_value: PromptValue = chat_prompt_tavily.format_prompt(
            company_name=company_name,
            job_description=job_description
        )

        chat_prompt_tavily_messages = chat_prompt_tavily.format_messages(
            company_name=company_name,
            job_description=job_description
        )

        # Ask the LLM for the structured set of search queries.
        search_results_llm = llm_structured.invoke(chat_prompt_tavily_messages)

        try:
            parsed_query_set: TavilyQuerySet = parser.parse_with_prompt(search_results_llm.content, chat_prompt_value)
            logger.info("Parsed TavilyQuerySet: %s", parsed_query_set.model_dump_json(indent=2))
        except json.JSONDecodeError as e:
            logger.error("JSON decoding error while parsing LLM response: %s. LLM content was: %s", e, search_results_llm.content, exc_info=True)
            raise
        except Exception as e:  # Catches PydanticValidationErrors and other parsing issues
            logger.error("Error parsing TavilyQuerySet from LLM completion: %s. LLM content was: %s", e, search_results_llm.content, exc_info=True)
            raise

        # Initialize search with advanced parameters
        search = TavilySearchResults(max_results=4, search_depth="advanced")

        # Structure for storing queries, rationales, and Tavily results:
        # {'query1': {'query': ..., 'rationale': ..., 'results': [...]}, ...}
        company_research_data = {}
        attempted_queries = []
        query_attributes = [f"query{i}" for i in range(1, 6)]

        for attr_name in query_attributes:
            query_list = getattr(parsed_query_set, attr_name, None)
            if query_list and isinstance(query_list, list) and len(query_list) > 0:
                actual_query = query_list[0]
                rationale = query_list[1] if len(query_list) > 1 else "N/A"  # Handle if rationale is missing
                company_research_data[attr_name] = {
                    'query': actual_query,
                    'rationale': rationale,
                    'results': []
                }

        # Execute each query and store results.
        for query_key, query_info in company_research_data.items():
            # Bind the query text up-front: the 'query' key is removed after a
            # successful search, so the except-branch must not read it back
            # (the old code did, and raised KeyError inside the handler).
            query_text = query_info.get('query', '')
            try:
                if not isinstance(query_text, str) or not query_text.strip():
                    logger.warning("Skipping Tavily search for %s due to invalid/empty query: '%s'", query_key, query_text)
                    query_info['results'] = []
                    continue

                logger.info("Executing Tavily search for %s: '%s'", query_key, query_text)
                tavily_api_results = search.invoke({"args": {'query': query_text}, "type": "tool_call", "id": "job_search", "name": "tavily"})
                attempted_queries.append(query_text)
                # Drop the raw query from the per-key payload once executed.
                query_info.pop('query', None)

                if tavily_api_results and isinstance(tavily_api_results, list) and len(tavily_api_results) > 0:
                    query_info['results'] = [result['content'] for result in tavily_api_results if 'content' in result]
                else:
                    logger.info("No results or unexpected format from Tavily for %s.", query_key)
                    query_info['results'] = []
            except Exception as e:
                # Use query_text, not query_info['query'] — the key may be gone.
                logger.error("Error executing Tavily search for query %s ('%s'): %s", query_key, query_text, str(e), exc_info=True)
                query_info['results'] = []

        return company_research_data, attempted_queries

    except json.JSONDecodeError as e:
        logger.error("JSON decoding error: %s", e)
        raise
    except AttributeError as e:
        logger.error("Attribute error: %s", e)
        raise
    except Exception as e:
        logger.error("Unexpected error: %s", e)
        raise
| 155 |
+
|
async def relevance_filter(state: ResearchState) -> ResearchState:
    """Filter Tavily search results in ``state`` down to the relevant ones.

    Runs the LLM-as-judge ``relevance_evaluator`` over each query's result
    block (at most two evaluations concurrently), extends
    ``state["compiled_results"]`` with content judged relevant, and replaces
    ``state["company_research_data"]["tavily_search"]`` with the filtered
    list. On any error the incoming state is returned unchanged.
    """
    try:
        # Mark the current node for workflow tracing.
        state["current_node"] = "relevance_filter"

        # Guard: nothing to filter without company research data.
        if not state.get("company_research_data"):
            print("ERROR: company_research_data not found in state")
            return state

        # Guard: normalize missing tavily_search to an empty list and bail out.
        if not state["company_research_data"].get("tavily_search"):
            print("ERROR: tavily_search not found in company_research_data")
            state["company_research_data"]["tavily_search"] = []
            return state

        # Initialize the accumulator that persists across workflow runs.
        if "compiled_results" not in state:
            state["compiled_results"] = []

        print("Filtering results...")
        # Per-query payloads, shaped like:
        # {'query1': {'rationale': ..., 'results': [...]}, 'query2': ...}
        all_query_data = state["company_research_data"].get("tavily_search", {})
        filtered_results_for_current_run = []  # Results deemed relevant in this specific call

        # Limit concurrent judge calls to 2.
        semaphore = asyncio.Semaphore(2)

        async def evaluate_with_semaphore(query_result_item: dict):
            # query_result_item is a dict like {'rationale': '...', 'results': [...]}
            async with semaphore:
                # NOTE(review): this always uses the LAST attempted query as the
                # relevance input, even when judging results of earlier queries
                # — confirm that is intended.
                attempted_queries_list = state.get("attempted_search_queries", [])
                input_query = attempted_queries_list[-1] if attempted_queries_list else "No query context available"

                eval_result = await relevance_evaluator(
                    inputs=input_query, context=query_result_item  # context is the whole result block for the query
                )
                return query_result_item, eval_result

        # One evaluation task per query block that actually carries results.
        tasks = [evaluate_with_semaphore(query_info) for query_info in all_query_data.values() if isinstance(query_info, dict) and "results" in query_info]

        # Collect evaluations as they complete.
        for completed_task in asyncio.as_completed(tasks):
            query_result_item, eval_result = await completed_task
            if eval_result.get("score"):  # Safely check for score
                # Expect "results" to be a list of content strings.
                if isinstance(query_result_item.get("results"), list):
                    filtered_results_for_current_run.extend(query_result_item["results"])
                else:
                    # Handle cases where "results" might not be a list or is missing.
                    logger.warning("Expected a list for 'results' in query_result_item, got: %s", type(query_result_item.get('results')))

        logger.info("Filtered results for current run: %s", filtered_results_for_current_run)

        # Append this run's relevant results to the running accumulator, and
        # overwrite the raw search payload with only the relevant entries.
        state["compiled_results"].extend(filtered_results_for_current_run)
        state["company_research_data"]["tavily_search"] = filtered_results_for_current_run
        return state

    except Exception as e:
        print(f"ERROR in relevance_filter: {e}")
        import traceback
        traceback.print_exc()
        logger.error(f"Error in relevance_filter: {str(e)}")
        # Return original state to avoid breaking the flow
        return state
tools/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Created on Mon Oct 23 16:49:52 2023
|
| 4 |
+
@author: rishabhaggarwal
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from .TavilySearch import search_company, relevance_filter
|
| 8 |
+
|
| 9 |
+
__all__ = ["search_company", "relevance_filter"]
|
utils/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utility modules for the job_writer package.
|
| 3 |
+
"""
|
utils/config.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration utilities for the job writer application.
|
| 3 |
+
|
| 4 |
+
This module provides functions for initializing and configuring
|
| 5 |
+
language models and other resources.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
from typing_extensions import Dict, Any, Tuple, Optional
|
| 10 |
+
from langchain.chat_models import init_chat_model
|
| 11 |
+
|
def init_models(config: Optional[Dict[str, Any]] = None) -> Tuple[Any, Any]:
    """Initialize language models based on configuration.

    Args:
        config: Optional mapping with ``model_name``, ``temperature`` and
            ``precise_temperature`` overrides.

    Returns:
        A ``(llm, llm_precise)`` pair: a general-purpose chat model and a
        lower-temperature variant for precision-sensitive tasks.
    """
    cfg = config or {}

    # Resolve settings: explicit config wins, then environment, then defaults.
    model_name = cfg.get("model_name", os.getenv("OLLAMA_MODEL", "llama3.2:latest"))
    default_temp = float(cfg.get("temperature", "0.3"))
    precise_temp = float(cfg.get("precise_temperature", "0.2"))

    model_spec = f"ollama:{model_name}"
    return (
        init_chat_model(model_spec, temperature=default_temp),
        init_chat_model(model_spec, temperature=precise_temp),
    )
utils/document_processing.py
ADDED
|
@@ -0,0 +1,443 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Document processing utilities for parsing resumes and job descriptions.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
import os
|
| 7 |
+
import re
|
| 8 |
+
import json
|
| 9 |
+
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from urllib.parse import urlparse
|
| 12 |
+
from typing_extensions import Dict, List, Any
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# Langchain imports
|
| 16 |
+
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
|
| 17 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
|
| 18 |
+
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
|
| 19 |
+
from langchain_core.messages import SystemMessage
|
| 20 |
+
from langchain_core.documents import Document
|
| 21 |
+
from langchain_core.output_parsers.json import JsonOutputParser
|
| 22 |
+
from langfuse.decorators import observe, langfuse_context
|
| 23 |
+
from pydantic import BaseModel, Field
|
| 24 |
+
|
| 25 |
+
# Local imports - using relative imports
|
| 26 |
+
from .errors import URLExtractionError, LLMProcessingError, JobDescriptionParsingError
|
| 27 |
+
from .llm_client import LLMClient
|
| 28 |
+
from ..prompts.templates import JOB_DESCRIPTION_PROMPT
|
| 29 |
+
|
# Set up logging
logger = logging.getLogger(__name__)
# NOTE(review): basicConfig at import time configures the root logger for the
# whole application — confirm this is intended for a library module.
logging.basicConfig(level=logging.INFO)


# Default paths
# Fallback résumé file path; empty string when the env var is unset.
DEFAULT_RESUME_PATH: str = os.getenv("DEFAULT_RESUME_PATH", "")


# Most Occurring Resume Section Headers
# Consumed by identify_resume_sections() to build its header-matching regex.
RESUME_SECTIONS: list[str] = [
    "EDUCATION", "EXPERIENCE", "SKILLS", "WORK EXPERIENCE",
    "PROFESSIONAL EXPERIENCE", "PROJECTS", "CERTIFICATIONS",
    "SUMMARY", "OBJECTIVE", "CONTACT", "PUBLICATIONS",
    "AWARDS", "LANGUAGES", "INTERESTS", "REFERENCES"
]

# Initialize LLM client
# Project singleton configured for JSON-mode output from a local Ollama model;
# llm_structured is used by parse_job_desc_from_url below.
LLM: LLMClient = LLMClient()

llm_client: LLMClient = LLM.get_instance(
    model_name="ejschwar/llama3.2-better-prompts:latest",
    model_provider="ollama_json")
llm_structured = llm_client.get_llm()
| 54 |
+
|
| 55 |
+
|
class ResumeSection(BaseModel):
    """Model for a single structured resume section (title plus body text)."""
    # Section heading, e.g. 'Experience' or 'Education'.
    title: str = Field(description="The section title (e.g., 'Experience', 'Education')")
    # Full text belonging to that heading.
    content: str = Field(description="The full content of this section")
| 60 |
+
|
| 61 |
+
|
class StructuredResume(BaseModel):
    """Model for a structured resume: its sections plus contact details."""
    # Ordered list of parsed resume sections.
    sections: List[ResumeSection] = Field(description="List of resume sections")
    # Flat key/value contact fields (e.g. email, phone) pulled from the resume.
    contact_info: Dict[str, str] = Field(description="Contact information extracted from the resume")
| 66 |
+
|
class JobDescriptionComponents(BaseModel):
    """Model for job description components.

    Serves as the JSON schema for JsonOutputParser in
    parse_job_desc_from_url, so the LLM output is validated against it.
    """
    company_name: str = Field(description="The company name")
    job_description: str = Field(description="The job description")
    reasoning: str = Field(description="The reasoning for the extracted information")
| 72 |
+
|
@observe()
def clean_resume_text(text: str) -> str:
    """Clean and normalize resume text by removing extra whitespace and fixing
    common PDF extraction issues.

    Args:
        text: Raw text extracted from a resume.

    Returns:
        Cleaned, single-spaced text with markdown-style bullets.
    """
    # Remove header/footer page numbers FIRST: this pattern needs the newlines
    # that the whitespace collapse below destroys (previously it ran after the
    # collapse and could never match).
    text = re.sub(r'\n\s*\d+\s*\n', '\n', text)

    # Collapse all whitespace (including newlines) to single spaces.
    text = re.sub(r'\s+', ' ', text)

    # Fix words hyphenated across line breaks; must run after the collapse,
    # which turns "word-\ncont" into "word- cont".
    text = re.sub(r'([a-z])- ([a-z])', r'\1\2', text)

    # Replace bullet variations with standard markdown bullets.
    text = re.sub(r'[•●○◘◙♦♣♠★]', '* ', text)

    return text.strip()
| 96 |
+
|
@observe()
def extract_contact_info(text: str) -> Dict[str, str]:
    """Extract contact information (email, phone, LinkedIn, name) from resume text.

    Args:
        text: Resume text to extract from.

    Returns:
        Dictionary with whichever contact fields could be found.
    """
    info: Dict[str, str] = {}

    # Email address.
    match = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    if match:
        info['email'] = match.group(0)

    # Phone number (accepts several common formats).
    match = re.search(r'(\+\d{1,3}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}', text)
    if match:
        info['phone'] = match.group(0)

    # LinkedIn profile URL (normalized to an https://www. prefix).
    match = re.search(r'linkedin\.com/in/[a-zA-Z0-9_-]+/?', text)
    if match:
        info['linkedin'] = 'https://www.' + match.group(0)

    # Name heuristic: the first line, when short and digit-free, is usually
    # the candidate's name (approximate; an LLM would be more accurate).
    first_line = text.strip().split('\n')[0].strip()
    if len(first_line) < 40 and not any(ch.isdigit() for ch in first_line):
        info['name'] = first_line

    return info
| 131 |
+
|
@observe()
def identify_resume_sections(text: str) -> List[Dict[str, Any]]:
    """Identify sections in a resume text using known header keywords.

    Args:
        text: Full resume text.

    Returns:
        List of dicts, each with "title" (lower-cased header) and "content"
        (the section text, header line included). If no known headers are
        found, the whole text is returned as a single "resume" section.
    """
    sections = []

    # Regex-based section identification: match any known header that sits on
    # its own line, surrounded only by whitespace/punctuation.
    section_pattern = r'(?:^|\n)(?:[^a-zA-Z\d\s]|\s)*(' + '|'.join(RESUME_SECTIONS) + r')(?:[^a-zA-Z\d\s]|\s)*(?:$|\n)'
    matches = list(re.finditer(section_pattern, text, re.IGNORECASE))

    if not matches:
        # No recognizable headers: treat the whole resume as one section.
        sections.append({
            "title": "resume",
            "content": text,
        })
        return sections

    # Slice the text between consecutive header matches.
    for i, match in enumerate(matches):
        section_title = match.group(1).strip()
        start_pos = match.start()

        # End position: start of the next section header, or end of text.
        end_pos = matches[i + 1].start() if i < len(matches) - 1 else len(text)

        # The slice starts at the header match itself, so the header line is
        # kept as part of the section content.
        section_content = text[start_pos:end_pos].strip()

        sections.append({
            "title": section_title.lower(),
            "content": section_content
        })

    return sections
| 196 |
+
|
| 197 |
+
|
| 198 |
+
def _collapse_ws(text: str) -> str:
|
| 199 |
+
"""Collapse stray whitespace but keep bullet breaks."""
|
| 200 |
+
text = re.sub(r"\n\s*([•\-–])\s*", r"\n\1 ", text)
|
| 201 |
+
return re.sub(r"[ \t\r\f\v]+", " ", text).replace(" \n", "\n").strip()
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def _is_heading(line: str) -> bool:
|
| 205 |
+
return (
|
| 206 |
+
line.isupper()
|
| 207 |
+
and len(line.split()) <= 5
|
| 208 |
+
and not re.search(r"\d", line)
|
| 209 |
+
)
|
| 210 |
+
|
def parse_resume(file_path: str | Path) -> List[Document]:
    """
    Load a résumé from PDF or TXT file → list[Document] chunks
    (≈400 chars, 50‑char overlap) with {source, section} metadata.

    Raises:
        ValueError: for unsupported extensions, unreadable or empty files.
    """
    file_extension = Path(file_path).suffix.lower()

    # Handle different file types
    if file_extension == '.pdf':
        # NOTE(review): only the first loaded page ([0]) is read — confirm
        # multi-page resumes are intentionally truncated here.
        text = PyPDFLoader(str(file_path), extraction_mode="layout").load()[0].page_content
    elif file_extension == '.txt':
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
                # NOTE: this ValueError is caught by the except below and
                # re-wrapped into the "Could not read text file" message.
                if not text.strip():
                    raise ValueError("File is empty")
        except Exception as e:
            logger.error(f"Error reading text file: {str(e)}")
            raise ValueError(f"Could not read text file: {file_path}. Error: {str(e)}")
    else:
        raise ValueError(f"Unsupported resume file type: {file_path}. Supported types: .pdf, .txt")

    text = _collapse_ws(text)

    # Tag headings with "###" so Markdown splitter can see them
    tagged_lines = [
        f"### {ln}" if _is_heading(ln) else ln
        for ln in text.splitlines()]

    md_text = "\n".join(tagged_lines)

    if "###" in md_text:
        # Split on the synthetic headings; each chunk carries a "section" key.
        splitter = MarkdownHeaderTextSplitter(
            headers_to_split_on=[("###", "section")]
        )
        chunks = splitter.split_text(md_text)  # already returns Documents
    else:
        # No headings detected: fall back to fixed-size character chunks.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=400, chunk_overlap=50
        )
        chunks: list[Document] = [Document(page_content=chunk, metadata={}) for chunk in splitter.split_text(md_text)]  # Attach metadata
    for doc in chunks:
        doc.metadata.setdefault("source", str(file_path))
        # section already present if header‑splitter was used

    return chunks
| 257 |
+
|
| 258 |
+
|
| 259 |
+
def get_job_description(file_path_or_url: str) -> Document:
|
| 260 |
+
"""Parse a job description from a file or URL into chunks.
|
| 261 |
+
|
| 262 |
+
Args:
|
| 263 |
+
file_path_or_url: Local file path or URL of job posting
|
| 264 |
+
|
| 265 |
+
Returns:
|
| 266 |
+
|
| 267 |
+
Document containing the job description
|
| 268 |
+
"""
|
| 269 |
+
# Check if the input is a URL
|
| 270 |
+
if file_path_or_url.startswith(('http://', 'https://')):
|
| 271 |
+
return parse_job_desc_from_url(file_path_or_url)
|
| 272 |
+
|
| 273 |
+
# Handle local files based on extension
|
| 274 |
+
file_extension = Path(file_path_or_url).suffix.lower()
|
| 275 |
+
|
| 276 |
+
# Handle txt files
|
| 277 |
+
if file_extension == '.txt':
|
| 278 |
+
try:
|
| 279 |
+
with open(file_path_or_url, 'r', encoding='utf-8') as f:
|
| 280 |
+
content = f.read()
|
| 281 |
+
if not content.strip():
|
| 282 |
+
raise ValueError(f"File is empty: {file_path_or_url}")
|
| 283 |
+
return Document(page_content=content, metadata={"source": file_path_or_url})
|
| 284 |
+
except Exception as e:
|
| 285 |
+
logger.error(f"Error reading text file: {str(e)}")
|
| 286 |
+
raise ValueError(f"Could not read text file: {file_path_or_url}. Error: {str(e)}")
|
| 287 |
+
|
| 288 |
+
# For other file types
|
| 289 |
+
raise ValueError(f"Unsupported file type: {file_path_or_url}. Supported types: .pdf, .docx, .txt, .md")
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
def parse_job_desc_from_url(url: str) -> Document:
|
| 293 |
+
"""Extract job description from a URL.
|
| 294 |
+
|
| 295 |
+
Args:
|
| 296 |
+
url: URL of the job posting
|
| 297 |
+
|
| 298 |
+
Returns:
|
| 299 |
+
List[str]: [job_description_markdown, company_name]
|
| 300 |
+
|
| 301 |
+
Raises:
|
| 302 |
+
ValueError: If URL format is invalid
|
| 303 |
+
URLExtractionError: If content extraction fails
|
| 304 |
+
LLMProcessingError: If LLM processing fails
|
| 305 |
+
"""
|
| 306 |
+
|
| 307 |
+
logger.info("Starting job description extraction from URL: %s", url)
|
| 308 |
+
# langfuse_handler = langfuse_context.get_current_langchain_handler()
|
| 309 |
+
extracted_text = None
|
| 310 |
+
|
| 311 |
+
try:
|
| 312 |
+
# Validate URL format
|
| 313 |
+
parsed_url = urlparse(url)
|
| 314 |
+
if not all([parsed_url.scheme, parsed_url.netloc]):
|
| 315 |
+
logger.error("Invalid URL format: %s", url)
|
| 316 |
+
raise ValueError("URL must start with http:// or https://")
|
| 317 |
+
|
| 318 |
+
# Extract content from URL
|
| 319 |
+
try:
|
| 320 |
+
loader = WebBaseLoader(url)
|
| 321 |
+
text_splitter = RecursiveCharacterTextSplitter(
|
| 322 |
+
chunk_size=1000,
|
| 323 |
+
chunk_overlap=200,
|
| 324 |
+
separators=["\n\n", "\n", ". ", " ", ""]
|
| 325 |
+
)
|
| 326 |
+
document_splitted = loader.load_and_split(text_splitter=text_splitter)
|
| 327 |
+
|
| 328 |
+
if not document_splitted:
|
| 329 |
+
logger.error("No content could be extracted from URL: %s", url)
|
| 330 |
+
raise URLExtractionError("No content could be extracted from URL")
|
| 331 |
+
|
| 332 |
+
extracted_text = " ".join(doc.page_content for doc in document_splitted)
|
| 333 |
+
logger.info("Successfully extracted %d characters from URL", len(extracted_text))
|
| 334 |
+
|
| 335 |
+
except Exception as e:
|
| 336 |
+
raise URLExtractionError(f"Failed to extract content from URL: {str(e)}") from e
|
| 337 |
+
|
| 338 |
+
# Process with LLM
|
| 339 |
+
if not llm_structured:
|
| 340 |
+
logger.warning("LLM not available, returning raw extracted text")
|
| 341 |
+
return [extracted_text, "Unknown Company"]
|
| 342 |
+
|
| 343 |
+
try:
|
| 344 |
+
output_parser: JsonOutputParser = JsonOutputParser(pydantic_object=JobDescriptionComponents)
|
| 345 |
+
|
| 346 |
+
human_prompt = "Below is the job description enclosed in triple quotes:\n\n '''{extracted_text}'''\n\n"
|
| 347 |
+
|
| 348 |
+
job_description_parser_system_message = SystemMessagePromptTemplate.from_template(
|
| 349 |
+
template=JOB_DESCRIPTION_PROMPT,
|
| 350 |
+
input_variables=[])
|
| 351 |
+
job_description_parser_human_message = HumanMessagePromptTemplate.from_template(
|
| 352 |
+
template=human_prompt,
|
| 353 |
+
input_variables=["extracted_text"])
|
| 354 |
+
chat_prompt = ChatPromptTemplate.from_messages([job_description_parser_system_message, job_description_parser_human_message])
|
| 355 |
+
|
| 356 |
+
# print("Chat prompt created successfully")
|
| 357 |
+
chain = chat_prompt | llm_structured | output_parser
|
| 358 |
+
|
| 359 |
+
try:
|
| 360 |
+
# Process with LLM
|
| 361 |
+
|
| 362 |
+
try:
|
| 363 |
+
result = chain.invoke({"extracted_text": extracted_text})
|
| 364 |
+
except Exception as e:
|
| 365 |
+
logger.error("LLM invocation failed: %s", str(e))
|
| 366 |
+
raise LLMProcessingError(f"LLM invocation failed: {str(e)}") from e
|
| 367 |
+
print("LLM processing result: ", result)
|
| 368 |
+
# Handle different types of LLM results
|
| 369 |
+
if isinstance(result, JobDescriptionComponents):
|
| 370 |
+
# Direct Pydantic model
|
| 371 |
+
result = result.model_dump()
|
| 372 |
+
if isinstance(result, dict):
|
| 373 |
+
print("LLM returned a dictionary, converting to JobDescriptionComponents model", result)
|
| 374 |
+
else:
|
| 375 |
+
# Unexpected result type
|
| 376 |
+
print(f"Unexpected LLM result type: {type(result)}")
|
| 377 |
+
logger.error("Unexpected LLM result type: %s", type(result))
|
| 378 |
+
raise LLMProcessingError("Invalid LLM response format")
|
| 379 |
+
|
| 380 |
+
# Validate required fields
|
| 381 |
+
if not result.get("job_description") or not result.get("company_name"):
|
| 382 |
+
logger.warning("LLM returned empty required fields")
|
| 383 |
+
raise LLMProcessingError("Missing required fields in LLM response")
|
| 384 |
+
|
| 385 |
+
logger.info("Successfully processed job description with LLM")
|
| 386 |
+
# Create a Document object for the job description
|
| 387 |
+
job_doc = Document(
|
| 388 |
+
page_content=result["job_description"],
|
| 389 |
+
metadata={"company_name": result["company_name"]}
|
| 390 |
+
)
|
| 391 |
+
|
| 392 |
+
# print("Job description Document created successfully. Company name: ", result["company_name"])
|
| 393 |
+
# print("Job description content: ", job_doc.metadata) # Print first 100 chars for debugging
|
| 394 |
+
return job_doc
|
| 395 |
+
|
| 396 |
+
except Exception as e:
|
| 397 |
+
# Handle LLM processing errors first
|
| 398 |
+
if isinstance(e, LLMProcessingError):
|
| 399 |
+
raise
|
| 400 |
+
|
| 401 |
+
# Try to recover from JSON parsing errors
|
| 402 |
+
error_msg = str(e)
|
| 403 |
+
if "Invalid json output" in error_msg:
|
| 404 |
+
logger.warning("Attempting to recover from invalid JSON output")
|
| 405 |
+
|
| 406 |
+
# Extract JSON from error message
|
| 407 |
+
output = error_msg.split("Invalid json output:", 1)[1].strip()
|
| 408 |
+
start = output.find('{')
|
| 409 |
+
end = output.rfind('}') + 1
|
| 410 |
+
|
| 411 |
+
if start >= 0 and end > start:
|
| 412 |
+
try:
|
| 413 |
+
clean_json = output[start:end]
|
| 414 |
+
result = output_parser.parse(clean_json)
|
| 415 |
+
if hasattr(result, "job_description") and hasattr(result, "company_name"):
|
| 416 |
+
return [result.job_description, result.company_name]
|
| 417 |
+
except json.JSONDecodeError as json_e:
|
| 418 |
+
logger.error("Failed to recover from JSON error: %s", json_e)
|
| 419 |
+
|
| 420 |
+
raise LLMProcessingError(f"Failed to process job description with LLM: {str(e)}") from e
|
| 421 |
+
|
| 422 |
+
except Exception as e:
|
| 423 |
+
if isinstance(e, LLMProcessingError):
|
| 424 |
+
if extracted_text:
|
| 425 |
+
logger.warning("LLM processing failed, falling back to raw text")
|
| 426 |
+
raise e
|
| 427 |
+
return [extracted_text, "Unknown Company"]
|
| 428 |
+
raise LLMProcessingError(f"Failed to process job description with LLM: {str(e)}") from e
|
| 429 |
+
|
| 430 |
+
except ValueError as e:
|
| 431 |
+
logger.error("URL validation error: %s", str(e))
|
| 432 |
+
raise
|
| 433 |
+
except URLExtractionError as e:
|
| 434 |
+
logger.error("Content extraction error: %s", str(e))
|
| 435 |
+
raise
|
| 436 |
+
except LLMProcessingError as e:
|
| 437 |
+
if extracted_text:
|
| 438 |
+
logger.warning("Using extracted text as fallback")
|
| 439 |
+
return [extracted_text, "Unknown Company"]
|
| 440 |
+
raise
|
| 441 |
+
except Exception as e:
|
| 442 |
+
logger.error("Unexpected error during job description parsing: %s", str(e))
|
| 443 |
+
raise JobDescriptionParsingError(f"Failed to parse job description: {str(e)}") from e
|
utils/errors.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class ModelNotFoundError(Exception):
    """Exception raised when a requested model is not found or fails to initialize.

    Attributes:
        model_name: The model identifier (or detail message) supplied by the caller.
    """

    def __init__(self, model_name: str):
        super().__init__(f"Model '{model_name}' not found.")
        self.model_name = model_name

    def __str__(self):
        # Keep str(e) consistent with the message passed to Exception.__init__.
        # The original returned "ModelNotFoundError: <name>", which dropped the
        # "not found" wording and diverged from e.args[0].
        return f"Model '{self.model_name}' not found."
|
| 9 |
+
|
| 10 |
+
class URLExtractionError(Exception):
    """Signals that no usable content could be pulled out of a URL."""
|
| 13 |
+
|
| 14 |
+
class LLMProcessingError(Exception):
    """Signals a failure while processing text with the language model."""
|
| 17 |
+
|
| 18 |
+
class JobDescriptionParsingError(Exception):
    """Umbrella (base) error type for job-description parsing failures."""
|
utils/langfuse_handler.py
ADDED
|
File without changes
|
utils/llm_client.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LLM Client module for managing language model interactions.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
from typing_extensions import Optional, Union
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
from langchain_core.language_models.chat_models import BaseChatModel
|
| 10 |
+
from langchain_core.language_models.llms import BaseLLM
|
| 11 |
+
from langchain_ollama import ChatOllama
|
| 12 |
+
from langchain_openai import ChatOpenAI
|
| 13 |
+
|
| 14 |
+
from .errors import ModelNotFoundError
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class LLMClient:
    """
    Client for managing language model interactions.

    Provides a unified interface over multiple LLM backends (Ollama,
    Ollama in JSON mode, and OpenAI), selected via ``model_provider``.
    """

    _instance = None  # Singleton instance shared across the application

    @classmethod
    def get_instance(cls, model_name: Optional[str] = None, model_provider: Optional[str] = None):
        """Get or create a singleton instance of the LLM client.

        Args:
            model_name: Optional model name to override the default
            model_provider: Optional provider to override the default

        Returns:
            LLMClient instance
        """
        if cls._instance is None:
            cls._instance = LLMClient(model_name, model_provider)
        elif model_name is not None and cls._instance.model_name != model_name:
            # Reinitialize if a different model is requested. Pass the
            # provider through as well; the original dropped it here, which
            # silently reverted the singleton to the default provider.
            cls._instance = LLMClient(model_name, model_provider)

        return cls._instance

    def __init__(self, model_name: Optional[str] = None, model_provider: Optional[str] = None):
        """Initialize the LLM client with the specified model.

        Args:
            model_name: Name of the model to use
                (default: env DEFAULT_LLM_MODEL or "llama3.2:latest")
            model_provider: Backend provider, one of "ollama", "ollama_json",
                "openai" (default: env LLM_PROVIDER or "ollama")
        """
        print("Initializing LLM Client with model:", model_name, "and provider:", model_provider)
        self.model_name = model_name or os.getenv("DEFAULT_LLM_MODEL", "llama3.2:latest")
        self.model_provider = model_provider or os.getenv("LLM_PROVIDER", "ollama").lower()
        self.llm = self._initialize_llm()

    def __str__(self):
        return f"LLMClient(model_name={self.model_name}, provider={self.model_provider})"

    def _initialize_llm(self) -> Union[BaseLLM, BaseChatModel]:
        """Initialize the appropriate LLM based on configuration.

        Returns:
            Initialized LLM instance

        Raises:
            ValueError: If the configured provider is not supported.
        """
        print(f"Initializing LLM with model {self.model_name} and provider {self.model_provider} in {__file__}")
        if self.model_provider == "ollama":
            return self._initialize_llama()
        elif self.model_provider == "openai":
            return self._initialize_openai()
        elif self.model_provider == "ollama_json":
            return self._initialize_jsonllm()
        else:
            raise ValueError(f"Unsupported LLM provider: {self.model_provider}")

    def _initialize_llama(self) -> BaseChatModel:
        """Initialize an Ollama chat model.

        Returns:
            ChatOllama instance

        Raises:
            ModelNotFoundError: If the model cannot be initialized.
        """
        try:
            # Low temperature / top_k=1 keep generations near-deterministic.
            model: ChatOllama = ChatOllama(model=self.model_name, temperature=0.1, top_k=1, repeat_penalty=1.2)
            return model
        except Exception as e:
            raise ModelNotFoundError(f"Failed to initialize Ollama with model {self.model_name}: {e}") from e

    def _initialize_jsonllm(self) -> BaseChatModel:
        """Initialize an Ollama chat model constrained to JSON output.

        (The original docstring said "Mistral"; this path constructs ChatOllama
        with format='json'.)

        Returns:
            ChatOllama instance with JSON-formatted output

        Raises:
            ModelNotFoundError: If the model cannot be initialized.
        """
        try:
            model: ChatOllama = ChatOllama(model=self.model_name, format='json', temperature=0.1, top_k=1, repeat_penalty=1.2)
            return model
        except Exception as e:
            raise ModelNotFoundError(f"Failed to initialize Ollama with model {self.model_name}: {e}") from e

    def _initialize_openai(self) -> BaseChatModel:
        """Initialize an OpenAI chat model.

        Returns:
            OpenAI chat model instance

        Raises:
            ValueError: If OPENAI_API_KEY is not set.
            ModelNotFoundError: If the model cannot be initialized.
        """
        # SECURITY: the original hard-coded an API token here. Credentials
        # must come from the environment, never from source control.
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("OPENAI_API_KEY environment variable not set")

        try:
            return ChatOpenAI(model_name=self.model_name, api_key=api_key)
        except Exception as e:
            # The original message said "Ollama" here; this is the OpenAI path.
            raise ModelNotFoundError(f"Failed to initialize OpenAI with model {self.model_name}: {e}") from e

    def get_llm(self) -> Union[BaseLLM, BaseChatModel]:
        """Get the initialized LLM instance.

        Returns:
            LLM instance

        Raises:
            RuntimeError: If the client was never initialized.
        """
        if self.llm is None:
            raise RuntimeError("LLM client not initialized")
        return self.llm

    def reinitialize(self, model_name: Optional[str] = None, provider: Optional[str] = None) -> None:
        """Reinitialize the LLM with a different model or provider.

        Args:
            model_name: New model name to use
            provider: New provider to use
        """
        print(f"Reinitializing LLM client from {self.model_name} to {model_name}")
        if model_name:
            self.model_name = model_name
        if provider:
            self.model_provider = provider.lower()

        self.llm = self._initialize_llm()
utils/vector_store.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Vector storage utilities for the job writer application.
|
| 3 |
+
|
| 4 |
+
This module provides functions for storing and retrieving
|
| 5 |
+
documents from vector databases.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
# Standard library imports
|
| 9 |
+
import os
|
| 10 |
+
from typing_extensions import List, Optional
|
| 11 |
+
|
| 12 |
+
# Third-party library imports
|
| 13 |
+
from langchain_core.documents import Document
|
| 14 |
+
from langchain_community.vectorstores import Pinecone
|
| 15 |
+
from langchain_ollama import OllamaEmbeddings
|
| 16 |
+
from pinecone import Pinecone as PineconeClient, ServerlessSpec
|
| 17 |
+
|
| 18 |
+
# Default configuration
|
| 19 |
+
DEFAULT_PINECONE_INDEX = "job-writer-vector"
|
| 20 |
+
|
| 21 |
+
class VectorStoreManager:
    """Manager class for vector store operations.

    Wraps a Pinecone index plus Ollama embeddings and exposes simple
    store/retrieve helpers over namespaced document collections.
    """

    def __init__(
        self,
        index_name: str = DEFAULT_PINECONE_INDEX,
        embedding_model: str = "llama3.2:latest"
    ):
        """Initialize the vector store manager.

        The Pinecone API key is read from the PINECONE_API_KEY environment
        variable (note: there is no api_key parameter).

        Args:
            index_name: Name of the Pinecone index to use
            embedding_model: Name of the Ollama model to use for embeddings

        Raises:
            ValueError: If PINECONE_API_KEY is not set.
        """
        api_key= os.getenv("PINECONE_API_KEY")
        if not api_key:
            raise ValueError("Environment variable PINECONE_API_KEY not set.")

        self.index_name = index_name

        # Initialize embeddings (served by a local Ollama instance)
        self.embeddings = OllamaEmbeddings(
            model=embedding_model
        )

        # Initialize Pinecone client
        self.client = PineconeClient(api_key=api_key)

        # Ensure index exists (creates it on first use)
        self._ensure_index_exists()

    def _ensure_index_exists(self):
        """Make sure the required index exists, create if not."""
        # Probe the embedding model once to discover the vector dimension;
        # the Pinecone index must be created with a matching dimension.
        try:
            sample_embedding = self.embeddings.embed_query("Test query")
            embedding_dim = len(sample_embedding)
        except Exception as e:
            print(f"Error determining embedding dimension: {e}")
            print("Falling back to default dimension of 384")
            embedding_dim = 384  # Common default for Ollama embeddings

        # Check if the index exists; treat lookup failures as "not found"
        index_exists = False
        try:
            index_list = self.client.list_indexes()
            index_list = [i.name for i in index_list]
            index_exists = self.index_name in index_list
        except Exception as e:
            print(f"Error checking Pinecone indexes: {e}")

        # Create index if it doesn't exist
        if not index_exists:
            try:
                print(f"Creating new index: {self.index_name}")
                self.client.create_index(
                    name=self.index_name,
                    dimension=embedding_dim,
                    spec=ServerlessSpec(region="us-east-1", cloud="aws"),
                    metric="cosine"
                )
                print(f"Successfully created index: {self.index_name}")
            except Exception as e:
                # Another process may have created it between list and create
                if "ALREADY_EXISTS" in str(e):
                    print(f"Index {self.index_name} already exists (created in another process)")
                else:
                    print(f"Error creating index: {e}")
        else:
            print(f"Using Pinecone Index: {self.index_name}")

    def store_documents(self, docs: List[Document], namespace: str) -> None:
        """Store documents in vector database.

        Args:
            docs: List of Document objects to store
            namespace: Namespace to store documents under

        Raises:
            Exception: Re-raises any error from the Pinecone client.
        """
        try:
            # Get the index
            index = self.client.Index(self.index_name)

            # Create the vector store wrapper over the index
            vector_store = Pinecone(
                index=index,
                embedding=self.embeddings,
                text_key="text",
                namespace=namespace
            )

            # Add documents (embedded via self.embeddings)
            vector_store.add_documents(docs)
            print(f"Successfully stored {len(docs)} documents in namespace: {namespace}")
        except Exception as e:
            print(f"Error storing documents: {e}")
            raise

    def retrieve_similar(self, query: str, namespace: str, k: int = 3):
        """Retrieve similar documents based on a query.

        Args:
            query: The query text to search for
            namespace: Namespace to search in
            k: Number of results to return

        Returns:
            List of Document objects (an empty list on any retrieval error)
        """
        try:
            # Get the index
            index = self.client.Index(self.index_name)

            # Create the vector store wrapper over the index
            vectorstore = Pinecone(
                index=index,
                embedding=self.embeddings,
                text_key="text",
                namespace=namespace
            )

            # Search for similar documents
            docs = vectorstore.similarity_search(query, k=k, namespace=namespace)
            return docs
        except Exception as e:
            print(f"Error retrieving documents: {e}")
            return []
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
# NOTE(review): the original ran this at import time AND rebound the class
# name `VectorStoreManager` to an instance, shadowing the class for every
# importer. Keep the smoke test, but only under direct execution and under
# a non-shadowing name.
if __name__ == "__main__":
    _manager = VectorStoreManager()

    _manager.store_documents(
        docs=[Document(page_content="Sample content", metadata={"source": "test"})],
        namespace="test_namespace"
    )
|
workflow.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Workflow runner for the job application writer.
|
| 3 |
+
|
| 4 |
+
This module provides functions for running the job application
|
| 5 |
+
writer graph in both interactive and batch modes.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import asyncio
|
| 9 |
+
import argparse
|
| 10 |
+
import sys
|
| 11 |
+
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
from langchain_core.tracers import ConsoleCallbackHandler
|
| 14 |
+
from langgraph.graph import StateGraph
|
| 15 |
+
from langfuse import Langfuse
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
from job_writer.nodes import Dataloading
|
| 19 |
+
from job_writer.nodes.research_workflow import research_workflow
|
| 20 |
+
from job_writer.classes import AppState, DataLoadState
|
| 21 |
+
from job_writer.agents.nodes import (
|
| 22 |
+
create_draft,
|
| 23 |
+
critique_draft,
|
| 24 |
+
finalize_document,
|
| 25 |
+
human_approval,
|
| 26 |
+
)
|
| 27 |
+
from job_writer.nodes import (
|
| 28 |
+
generate_variations,
|
| 29 |
+
self_consistency_vote
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class JobWorkflow:
    """
    Workflow runner for the job application writer.

    Builds and runs a LangGraph state machine that loads inputs, researches
    the company, drafts, critiques and finalizes an application document.

    Args:
        resume: Resume text or file path
        job_description_source: Job description text or URL
        content: Type of application material to generate
        model_configuration: Configuration for language models
    """

    def __init__(self, resume=None, job_description_source=None, content=None, model_configuration=None):
        """Initialize the Writing Workflow."""
        print(f"Initializing Workflow for {content}")
        self.resume = resume
        self.job_description_source = job_description_source
        self.content = content
        self.model_configuration = model_configuration

        # Initialize the app state
        self.app_state = AppState(
            resume_path=resume,
            job_description_source=job_description_source,
            company_research_data=None,
            draft="",
            feedback="",
            final="",
            content=content,
            current_node=""
        )

        self._init_nodes()
        self._build_workflow()

        self.langfuse = Langfuse()

    def _init_nodes(self):
        """Create the node objects used by the graph.

        (Renamed from ``__init__nodes``: the double-underscore prefix
        triggered name mangling and read like a typo for ``__init__``.)
        """
        self.dataloading = Dataloading()
        # self.createdraft = create_draft()

    def _build_workflow(self):
        """Assemble the StateGraph: nodes, entry/finish points, and edges."""
        self.job_app_graph = StateGraph(DataLoadState)

        self.job_app_graph.add_node("initialize_system", self.dataloading.system_setup)
        self.job_app_graph.add_node("load", self.dataloading.run)
        # self.job_app_graph.add_node("build_persona", select_persona)

        # Add research workflow as a node
        self.job_app_graph.add_node("research", research_workflow)
        self.job_app_graph.add_node("create_draft", create_draft)
        self.job_app_graph.add_node("variations", generate_variations)
        self.job_app_graph.add_node("self_consistency", self_consistency_vote)
        self.job_app_graph.add_node("critique", critique_draft)
        self.job_app_graph.add_node("human_approval", human_approval)
        self.job_app_graph.add_node("finalize", finalize_document)

        self.job_app_graph.set_entry_point("initialize_system")
        self.job_app_graph.set_finish_point("finalize")

        self.job_app_graph.add_edge("initialize_system", "load")
        # "load" routes dynamically (e.g. to "research") via verify_inputs
        self.job_app_graph.add_conditional_edges("load", self.dataloading.verify_inputs)
        self.job_app_graph.add_edge("research", "create_draft")
        self.job_app_graph.add_edge("create_draft", "variations")
        self.job_app_graph.add_edge("variations", "self_consistency")
        self.job_app_graph.add_edge("self_consistency", "critique")
        self.job_app_graph.add_edge("critique", "human_approval")
        self.job_app_graph.add_edge("human_approval", "finalize")

    async def run(self) -> dict | None:
        """
        Run the job application writer workflow.

        Returns:
            The final graph state (a mapping with keys such as "final"),
            or None if compilation or execution failed. (The original
            annotation said ``str | None`` but the method returns the
            graph output mapping.)
        """
        # Compile the graph
        try:
            compiled_graph = self.compile()
        except Exception as e:
            print(f"Error compiling graph: {e}")
            return
        # Set up run configuration
        run_name = f"Job Application Writer - {self.app_state['content']} - {datetime.now().strftime('%Y-%m-%d-%H%M%S')}"
        config = {
            "configurable": {
                "thread_id": f"job_app_session_{datetime.now().strftime('%Y%m%d%H%M%S')}",
                "callbacks": [ConsoleCallbackHandler()],
                "run_name": run_name,
                "tags": ["job-application", self.app_state['content']]
            },
            "recursion_limit": 10
        }
        # Run the graph
        try:
            self.app_state["current_node"] = "initialize_system"
            graph_output = await compiled_graph.ainvoke(self.app_state, config=config)
        except Exception as e:
            print(f"Error running graph: {e}")
            return

        return graph_output

    def compile(self):
        """Compile the graph into an executable form."""
        graph = self.job_app_graph.compile()
        return graph

    def print_result(self, content_type, final_content):
        """Print the final generated content to the console."""
        print("\n" + "="*80)
        print(f"FINAL {content_type.upper()}:")
        print(final_content)
        print("="*80)

    def save_result(self, content_type, final_content):
        """Save the final generated content to a timestamped file and return the filename."""
        output_file = f"{content_type}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(final_content)
        print(f"\nSaved to {output_file}")
        return output_file
|
| 160 |
+
|
| 161 |
+
if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="Generate job application materials")
    parser.add_argument("--resume", required=True, help="Path to resume file or resume text")
    parser.add_argument("--job", required=True, help="Path/URL to job description or description text")
    parser.add_argument("--type", default="cover_letter",
                        choices=["cover_letter", "bullets", "linkedin_note"],
                        help="Type of application material to generate")
    parser.add_argument("--model", help="Ollama model to use")
    parser.add_argument("--temp", type=float, help="Temperature for generation")

    args = parser.parse_args()

    # Configure models if specified
    model_config = {}
    if args.model:
        model_config["model_name"] = args.model
    if args.temp is not None:
        # Cap user-supplied temperatures to keep generation conservative
        model_config["temperature"] = min(0.25, args.temp)
        model_config["precise_temperature"] = min(0.2, args.temp)

    # Initialize the workflow
    workflow = JobWorkflow(
        resume=args.resume,
        job_description_source=args.job,
        content=args.type,
        model_configuration=model_config
    )

    # Run the workflow
    result = asyncio.run(workflow.run())

    # The original checked `result` twice with a duplicated error branch;
    # a single guard covers both the print and the save step.
    if not result:
        print("Error running workflow.")
        sys.exit(1)

    # Print the result to the console, then save it to a file
    workflow.print_result(args.type, result["final"])
    workflow.save_result(args.type, result["final"])

    # Print a success message
    print("Workflow completed successfully.")
|