Commit
·
046508a
1
Parent(s):
6e2bf85
Code Refactoring and Central Logging
Browse files- .dockerignore +38 -0
- .gitignore +2 -0
- .vscode/settings.json +8 -1
- DEPLOYMENT_GUIDE.md +303 -0
- DOCKERFILE_EXPLANATION.md +147 -0
- Dockerfile +41 -0
- docker-compose.override.example.yml +21 -0
- docker-compose.yml +59 -0
- langgraph.json +5 -3
- pyproject.toml +17 -15
- pyrightconfig.json +18 -0
- src/job_writing_agent/agents/nodes.py +220 -53
- src/job_writing_agent/agents/output_schema.py +48 -11
- src/job_writing_agent/classes/__init__.py +2 -2
- src/job_writing_agent/classes/classes.py +60 -7
- src/job_writing_agent/logs/job_writer.log +0 -0
- src/job_writing_agent/nodes/initializing.py +408 -205
- src/job_writing_agent/nodes/job_description_loader.py +192 -0
- src/job_writing_agent/nodes/research_workflow.py +260 -53
- src/job_writing_agent/nodes/resume_loader.py +140 -0
- src/job_writing_agent/nodes/selfconsistency.py +28 -20
- src/job_writing_agent/nodes/variations.py +10 -8
- src/job_writing_agent/prompts/templates.py +22 -11
- src/job_writing_agent/prompts/test_prompts.py +38 -0
- src/job_writing_agent/tools/SearchTool.py +203 -79
- src/job_writing_agent/tools/__init__.py +2 -2
- src/job_writing_agent/utils/application_cli_interface.py +2 -2
- src/job_writing_agent/utils/document_processing.py +129 -87
- src/job_writing_agent/utils/llm_client.py +143 -127
- src/job_writing_agent/utils/llm_provider_factory.py +3 -0
- src/job_writing_agent/utils/logging/logging_config.py +132 -0
- src/job_writing_agent/utils/logging/logging_decorators.py +103 -0
- src/job_writing_agent/workflow.py +221 -42
- uv.lock +0 -0
.dockerignore
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
app_env/
|
| 8 |
+
venv/
|
| 9 |
+
env/
|
| 10 |
+
ENV/
|
| 11 |
+
|
| 12 |
+
# IDE
|
| 13 |
+
.vscode/
|
| 14 |
+
.idea/
|
| 15 |
+
*.swp
|
| 16 |
+
*.swo
|
| 17 |
+
|
| 18 |
+
# Logs
|
| 19 |
+
*.log
|
| 20 |
+
logs/
|
| 21 |
+
|
| 22 |
+
# OS
|
| 23 |
+
.DS_Store
|
| 24 |
+
Thumbs.db
|
| 25 |
+
|
| 26 |
+
# Project specific
|
| 27 |
+
*.pdf
|
| 28 |
+
cover_letter_*.txt
|
| 29 |
+
uv.lock
|
| 30 |
+
|
| 31 |
+
# Git
|
| 32 |
+
.git/
|
| 33 |
+
.gitignore
|
| 34 |
+
|
| 35 |
+
# Documentation
|
| 36 |
+
*.md
|
| 37 |
+
!README.md
|
| 38 |
+
|
.gitignore
CHANGED
|
@@ -4,6 +4,8 @@
|
|
| 4 |
# Environment / secret files
|
| 5 |
job_writing_agent/.env
|
| 6 |
job_writing_agent/.env.*
|
|
|
|
|
|
|
| 7 |
|
| 8 |
# Jupyter notebooks
|
| 9 |
job_writing_agent/*.ipynb
|
|
|
|
| 4 |
# Environment / secret files
|
| 5 |
job_writing_agent/.env
|
| 6 |
job_writing_agent/.env.*
|
| 7 |
+
src/job_writing_agent/.env
|
| 8 |
+
src/job_writing_agent/.env.*
|
| 9 |
|
| 10 |
# Jupyter notebooks
|
| 11 |
job_writing_agent/*.ipynb
|
.vscode/settings.json
CHANGED
|
@@ -1,3 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"python.defaultInterpreterPath": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"python.defaultInterpreterPath": "C:\\Users\\risha\\python-dir\\job_application_agent\\job_writer\\app_env\\Scripts\\python.exe",
|
| 3 |
+
"python.formatting.provider": "black",
|
| 4 |
+
"editor.formatOnSave": true,
|
| 5 |
+
"python.formatting.blackArgs": ["--line-length", "88"],
|
| 6 |
+
"python.linting.enabled": true,
|
| 7 |
+
"python.linting.pylintEnabled": true,
|
| 8 |
+
"python.linting.lintOnSave": true,
|
| 9 |
+
"python.linting.mypyEnabled": true
|
| 10 |
}
|
DEPLOYMENT_GUIDE.md
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Deployment Guide for Job Application Agent
|
| 2 |
+
|
| 3 |
+
## Option 1: LangGraph Cloud (Easiest & Recommended)
|
| 4 |
+
|
| 5 |
+
### Prerequisites
|
| 6 |
+
- LangGraph CLI installed (`langgraph-cli` in requirements.txt)
|
| 7 |
+
- `langgraph.json` already configured ✅
|
| 8 |
+
|
| 9 |
+
### Steps
|
| 10 |
+
|
| 11 |
+
1. **Install LangGraph CLI** (if not already):
|
| 12 |
+
```powershell
|
| 13 |
+
pip install langgraph-cli
|
| 14 |
+
```
|
| 15 |
+
|
| 16 |
+
2. **Login to LangGraph Cloud**:
|
| 17 |
+
```powershell
|
| 18 |
+
langgraph login
|
| 19 |
+
```
|
| 20 |
+
|
| 21 |
+
3. **Deploy your agent**:
|
| 22 |
+
```powershell
|
| 23 |
+
langgraph deploy
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
4. **Get your API endpoint** - LangGraph Cloud provides a REST API automatically
|
| 27 |
+
|
| 28 |
+
### Cost
|
| 29 |
+
- **Free tier**: Limited requests/month
|
| 30 |
+
- **Paid**: Pay-per-use pricing
|
| 31 |
+
|
| 32 |
+
### Pros
|
| 33 |
+
- ✅ Zero infrastructure management
|
| 34 |
+
- ✅ Built-in state persistence
|
| 35 |
+
- ✅ Automatic API generation
|
| 36 |
+
- ✅ LangSmith integration
|
| 37 |
+
- ✅ Perfect for LangGraph apps
|
| 38 |
+
|
| 39 |
+
### Cons
|
| 40 |
+
- ⚠️ Vendor lock-in
|
| 41 |
+
- ⚠️ Limited customization
|
| 42 |
+
|
| 43 |
+
---
|
| 44 |
+
|
| 45 |
+
## Option 2: Railway.app (Simple & Cheap)
|
| 46 |
+
|
| 47 |
+
### Steps
|
| 48 |
+
|
| 49 |
+
1. **Create a FastAPI wrapper** (create `api.py`):
|
| 50 |
+
```python
|
| 51 |
+
from fastapi import FastAPI, File, UploadFile
|
| 52 |
+
from job_writing_agent.workflow import JobWorkflow
|
| 53 |
+
import tempfile
|
| 54 |
+
import os
|
| 55 |
+
|
| 56 |
+
app = FastAPI()
|
| 57 |
+
|
| 58 |
+
@app.post("/generate")
|
| 59 |
+
async def generate_application(
|
| 60 |
+
resume: UploadFile = File(...),
|
| 61 |
+
job_description: str,
|
| 62 |
+
content_type: str = "cover_letter"
|
| 63 |
+
):
|
| 64 |
+
# Save resume temporarily
|
| 65 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
|
| 66 |
+
tmp.write(await resume.read())
|
| 67 |
+
resume_path = tmp.name
|
| 68 |
+
|
| 69 |
+
try:
|
| 70 |
+
workflow = JobWorkflow(
|
| 71 |
+
resume=resume_path,
|
| 72 |
+
job_description_source=job_description,
|
| 73 |
+
content=content_type
|
| 74 |
+
)
|
| 75 |
+
result = await workflow.run()
|
| 76 |
+
return {"result": result}
|
| 77 |
+
finally:
|
| 78 |
+
os.unlink(resume_path)
|
| 79 |
+
```
|
| 80 |
+
|
| 81 |
+
2. **Create `Procfile`**:
|
| 82 |
+
```
|
| 83 |
+
web: uvicorn api:app --host 0.0.0.0 --port $PORT
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
3. **Deploy to Railway**:
|
| 87 |
+
- Sign up at [railway.app](https://railway.app)
|
| 88 |
+
- Connect GitHub repo
|
| 89 |
+
- Railway auto-detects Python and runs `Procfile`
|
| 90 |
+
|
| 91 |
+
### Cost
|
| 92 |
+
- **Free tier**: $5 credit/month
|
| 93 |
+
- **Hobby**: $5/month for 512MB RAM
|
| 94 |
+
- **Pro**: $20/month for 2GB RAM
|
| 95 |
+
|
| 96 |
+
### Pros
|
| 97 |
+
- ✅ Very simple deployment
|
| 98 |
+
- ✅ Auto-scaling
|
| 99 |
+
- ✅ Free tier available
|
| 100 |
+
- ✅ Automatic HTTPS
|
| 101 |
+
|
| 102 |
+
### Cons
|
| 103 |
+
- ⚠️ Need to add FastAPI wrapper
|
| 104 |
+
- ⚠️ State management needs Redis/Postgres
|
| 105 |
+
|
| 106 |
+
---
|
| 107 |
+
|
| 108 |
+
## Option 3: Render.com (Similar to Railway)
|
| 109 |
+
|
| 110 |
+
### Steps
|
| 111 |
+
|
| 112 |
+
1. **Create `render.yaml`**:
|
| 113 |
+
```yaml
|
| 114 |
+
services:
|
| 115 |
+
- type: web
|
| 116 |
+
name: job-writer-api
|
| 117 |
+
env: python
|
| 118 |
+
buildCommand: pip install -r requirements.txt
|
| 119 |
+
startCommand: uvicorn api:app --host 0.0.0.0 --port $PORT
|
| 120 |
+
envVars:
|
| 121 |
+
- key: OPENROUTER_API_KEY
|
| 122 |
+
sync: false
|
| 123 |
+
- key: TAVILY_API_KEY
|
| 124 |
+
sync: false
|
| 125 |
+
```
|
| 126 |
+
|
| 127 |
+
2. **Deploy**:
|
| 128 |
+
- Connect GitHub repo to Render
|
| 129 |
+
- Render auto-detects `render.yaml`
|
| 130 |
+
|
| 131 |
+
### Cost
|
| 132 |
+
- **Free tier**: 750 hours/month (sleeps after 15min inactivity)
|
| 133 |
+
- **Starter**: $7/month (always on)
|
| 134 |
+
|
| 135 |
+
### Pros
|
| 136 |
+
- ✅ Free tier for testing
|
| 137 |
+
- ✅ Simple YAML config
|
| 138 |
+
- ✅ Auto-deploy from Git
|
| 139 |
+
|
| 140 |
+
### Cons
|
| 141 |
+
- ⚠️ Free tier sleeps (cold starts)
|
| 142 |
+
- ⚠️ Need FastAPI wrapper
|
| 143 |
+
|
| 144 |
+
---
|
| 145 |
+
|
| 146 |
+
## Option 4: Fly.io (Good Free Tier)
|
| 147 |
+
|
| 148 |
+
### Steps
|
| 149 |
+
|
| 150 |
+
1. **Install Fly CLI**:
|
| 151 |
+
```powershell
|
| 152 |
+
iwr https://fly.io/install.ps1 -useb | iex
|
| 153 |
+
```
|
| 154 |
+
|
| 155 |
+
2. **Create `Dockerfile`**:
|
| 156 |
+
```dockerfile
|
| 157 |
+
FROM python:3.12-slim
|
| 158 |
+
|
| 159 |
+
WORKDIR /app
|
| 160 |
+
COPY requirements.txt .
|
| 161 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 162 |
+
|
| 163 |
+
COPY . .
|
| 164 |
+
|
| 165 |
+
CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "8080"]
|
| 166 |
+
```
|
| 167 |
+
|
| 168 |
+
3. **Deploy**:
|
| 169 |
+
```powershell
|
| 170 |
+
fly launch
|
| 171 |
+
fly deploy
|
| 172 |
+
```
|
| 173 |
+
|
| 174 |
+
### Cost
|
| 175 |
+
- **Free tier**: 3 shared-cpu VMs, 3GB storage
|
| 176 |
+
- **Paid**: $1.94/month per VM
|
| 177 |
+
|
| 178 |
+
### Pros
|
| 179 |
+
- ✅ Generous free tier
|
| 180 |
+
- ✅ Global edge deployment
|
| 181 |
+
- ✅ Docker-based (flexible)
|
| 182 |
+
|
| 183 |
+
### Cons
|
| 184 |
+
- ⚠️ Need Docker knowledge
|
| 185 |
+
- ⚠️ Need FastAPI wrapper
|
| 186 |
+
|
| 187 |
+
---
|
| 188 |
+
|
| 189 |
+
## Option 5: AWS Lambda (Serverless - Pay Per Use)
|
| 190 |
+
|
| 191 |
+
### Steps
|
| 192 |
+
|
| 193 |
+
1. **Create Lambda handler** (`lambda_handler.py`):
|
| 194 |
+
```python
|
| 195 |
+
import json
|
| 196 |
+
from job_writing_agent.workflow import JobWorkflow
|
| 197 |
+
|
| 198 |
+
def lambda_handler(event, context):
|
| 199 |
+
# Parse event
|
| 200 |
+
body = json.loads(event['body'])
|
| 201 |
+
|
| 202 |
+
workflow = JobWorkflow(
|
| 203 |
+
resume=body['resume_path'],
|
| 204 |
+
job_description_source=body['job_description'],
|
| 205 |
+
content=body.get('content_type', 'cover_letter')
|
| 206 |
+
)
|
| 207 |
+
|
| 208 |
+
result = workflow.run()
|
| 209 |
+
|
| 210 |
+
return {
|
| 211 |
+
'statusCode': 200,
|
| 212 |
+
'body': json.dumps({'result': result})
|
| 213 |
+
}
|
| 214 |
+
```
|
| 215 |
+
|
| 216 |
+
2. **Package and deploy** using AWS SAM or Serverless Framework
|
| 217 |
+
|
| 218 |
+
### Cost
|
| 219 |
+
- **Free tier**: 1M requests/month
|
| 220 |
+
- **Paid**: $0.20 per 1M requests + compute time
|
| 221 |
+
|
| 222 |
+
### Pros
|
| 223 |
+
- ✅ Pay only for usage
|
| 224 |
+
- ✅ Auto-scaling
|
| 225 |
+
- ✅ Very cheap for low traffic
|
| 226 |
+
|
| 227 |
+
### Cons
|
| 228 |
+
- ⚠️ 15min timeout limit
|
| 229 |
+
- ⚠️ Cold starts
|
| 230 |
+
- ⚠️ Complex setup
|
| 231 |
+
- ⚠️ Need to handle state externally
|
| 232 |
+
|
| 233 |
+
---
|
| 234 |
+
|
| 235 |
+
## Recommendation
|
| 236 |
+
|
| 237 |
+
**For your use case, I recommend:**
|
| 238 |
+
|
| 239 |
+
1. **Start with LangGraph Cloud** - Easiest, built for your stack
|
| 240 |
+
2. **If you need more control → Railway** - Simple, good free tier
|
| 241 |
+
3. **If you need serverless → AWS Lambda** - Cheapest for low traffic
|
| 242 |
+
|
| 243 |
+
---
|
| 244 |
+
|
| 245 |
+
## Quick Start: FastAPI Wrapper (for Railway/Render/Fly.io)
|
| 246 |
+
|
| 247 |
+
Create `api.py` in your project root:
|
| 248 |
+
|
| 249 |
+
```python
|
| 250 |
+
from fastapi import FastAPI, File, UploadFile, HTTPException
|
| 251 |
+
from fastapi.responses import JSONResponse
|
| 252 |
+
from job_writing_agent.workflow import JobWorkflow
|
| 253 |
+
import tempfile
|
| 254 |
+
import os
|
| 255 |
+
import asyncio
|
| 256 |
+
|
| 257 |
+
app = FastAPI(title="Job Application Writer API")
|
| 258 |
+
|
| 259 |
+
@app.get("/")
|
| 260 |
+
def health():
|
| 261 |
+
return {"status": "ok"}
|
| 262 |
+
|
| 263 |
+
@app.post("/generate")
|
| 264 |
+
async def generate_application(
|
| 265 |
+
resume: UploadFile = File(...),
|
| 266 |
+
job_description: str,
|
| 267 |
+
content_type: str = "cover_letter"
|
| 268 |
+
):
|
| 269 |
+
"""Generate job application material."""
|
| 270 |
+
# Save resume temporarily
|
| 271 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
|
| 272 |
+
content = await resume.read()
|
| 273 |
+
tmp.write(content)
|
| 274 |
+
resume_path = tmp.name
|
| 275 |
+
|
| 276 |
+
try:
|
| 277 |
+
workflow = JobWorkflow(
|
| 278 |
+
resume=resume_path,
|
| 279 |
+
job_description_source=job_description,
|
| 280 |
+
content=content_type
|
| 281 |
+
)
|
| 282 |
+
|
| 283 |
+
# Run workflow (assuming it's async or can be wrapped)
|
| 284 |
+
result = await asyncio.to_thread(workflow.run)
|
| 285 |
+
|
| 286 |
+
return JSONResponse({
|
| 287 |
+
"status": "success",
|
| 288 |
+
"result": result
|
| 289 |
+
})
|
| 290 |
+
except Exception as e:
|
| 291 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 292 |
+
finally:
|
| 293 |
+
# Cleanup
|
| 294 |
+
if os.path.exists(resume_path):
|
| 295 |
+
os.unlink(resume_path)
|
| 296 |
+
|
| 297 |
+
if __name__ == "__main__":
|
| 298 |
+
import uvicorn
|
| 299 |
+
uvicorn.run(app, host="0.0.0.0", port=8000)
|
| 300 |
+
```
|
| 301 |
+
|
| 302 |
+
Then update `requirements.txt` to ensure FastAPI and uvicorn are included (they already are ✅).
|
| 303 |
+
|
DOCKERFILE_EXPLANATION.md
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Dockerfile Explanation
|
| 2 |
+
|
| 3 |
+
This Dockerfile is specifically designed for **LangGraph Cloud/LangServe deployment**. It uses the official LangGraph API base image and configures your agent graphs to be served as REST APIs.
|
| 4 |
+
|
| 5 |
+
## Line-by-Line Breakdown
|
| 6 |
+
|
| 7 |
+
### 1. Base Image (Line 1)
|
| 8 |
+
```dockerfile
|
| 9 |
+
FROM langchain/langgraph-api:3.12
|
| 10 |
+
```
|
| 11 |
+
- **Purpose**: Uses the official LangGraph API base image with Python 3.12
|
| 12 |
+
- **What it includes**: Pre-configured LangGraph runtime, LangServe server, and all LangGraph dependencies
|
| 13 |
+
- **Why**: This image already has everything needed to serve LangGraph workflows as REST APIs
|
| 14 |
+
|
| 15 |
+
---
|
| 16 |
+
|
| 17 |
+
### 2. Install Node Dependencies (Line 9)
|
| 18 |
+
```dockerfile
|
| 19 |
+
RUN PYTHONDONTWRITEBYTECODE=1 uv pip install --system --no-cache-dir -c /api/constraints.txt nodes
|
| 20 |
+
```
|
| 21 |
+
- **Purpose**: Installs the `nodes` package (likely a dependency from your `langgraph.json`)
|
| 22 |
+
- **`PYTHONDONTWRITEBYTECODE=1`**: Prevents creating `.pyc` files (smaller image)
|
| 23 |
+
- **`uv pip`**: Uses `uv` (fast Python package installer) instead of regular `pip`
|
| 24 |
+
- **`--system`**: Installs to system Python (not virtual env)
|
| 25 |
+
- **`--no-cache-dir`**: Doesn't cache pip downloads (smaller image)
|
| 26 |
+
- **`-c /api/constraints.txt`**: Uses constraint file from base image (ensures compatible versions)
|
| 27 |
+
|
| 28 |
+
---
|
| 29 |
+
|
| 30 |
+
### 3. Copy Your Code (Line 14)
|
| 31 |
+
```dockerfile
|
| 32 |
+
ADD . /deps/job_writer
|
| 33 |
+
```
|
| 34 |
+
- **Purpose**: Copies your entire project into `/deps/job_writer` in the container
|
| 35 |
+
- **Why `/deps/`**: LangGraph API expects dependencies in this directory
|
| 36 |
+
- **What gets copied**: All your source code, `pyproject.toml`, `requirements.txt`, etc.
|
| 37 |
+
|
| 38 |
+
---
|
| 39 |
+
|
| 40 |
+
### 4. Install Your Package (Lines 19-21)
|
| 41 |
+
```dockerfile
|
| 42 |
+
RUN for dep in /deps/*; do
|
| 43 |
+
echo "Installing $dep";
|
| 44 |
+
if [ -d "$dep" ]; then
|
| 45 |
+
echo "Installing $dep";
|
| 46 |
+
(cd "$dep" && PYTHONDONTWRITEBYTECODE=1 uv pip install --system --no-cache-dir -c /api/constraints.txt -e .);
|
| 47 |
+
fi;
|
| 48 |
+
done
|
| 49 |
+
```
|
| 50 |
+
- **Purpose**: Installs your `job_writer` package in editable mode (`-e`)
|
| 51 |
+
- **How it works**:
|
| 52 |
+
- Loops through all directories in `/deps/`
|
| 53 |
+
- For each directory, changes into it and runs `pip install -e .`
|
| 54 |
+
- The `-e` flag installs in "editable" mode (changes to code are reflected)
|
| 55 |
+
- **Why**: Makes your package importable as `job_writing_agent` inside the container
|
| 56 |
+
|
| 57 |
+
---
|
| 58 |
+
|
| 59 |
+
### 5. Register Your Graphs (Line 25)
|
| 60 |
+
```dockerfile
|
| 61 |
+
ENV LANGSERVE_GRAPHS='{"job_app_graph": "/deps/job_writer/src/job_writing_agent/workflow.py:job_app_graph", ...}'
|
| 62 |
+
```
|
| 63 |
+
- **Purpose**: Tells LangServe which graphs to expose as REST APIs
|
| 64 |
+
- **Format**: JSON mapping of `graph_name` → `module_path:attribute_name`
|
| 65 |
+
- **What it does**:
|
| 66 |
+
- `job_app_graph` → Exposes `JobWorkflow.job_app_graph` property as an API endpoint
|
| 67 |
+
- `research_workflow` → Exposes the research subgraph
|
| 68 |
+
- `data_loading_workflow` → Exposes the data loading subgraph
|
| 69 |
+
- **Result**: Each graph becomes a REST API endpoint like `/invoke/job_app_graph`
|
| 70 |
+
|
| 71 |
+
---
|
| 72 |
+
|
| 73 |
+
### 6. Protect LangGraph API (Lines 33-35)
|
| 74 |
+
```dockerfile
|
| 75 |
+
RUN mkdir -p /api/langgraph_api /api/langgraph_runtime /api/langgraph_license && \
|
| 76 |
+
touch /api/langgraph_api/__init__.py /api/langgraph_runtime/__init__.py /api/langgraph_license/__init__.py
|
| 77 |
+
RUN PYTHONDONTWRITEBYTECODE=1 uv pip install --system --no-cache-dir --no-deps -e /api
|
| 78 |
+
```
|
| 79 |
+
- **Purpose**: Prevents your dependencies from accidentally overwriting LangGraph API packages
|
| 80 |
+
- **How**:
|
| 81 |
+
1. Creates placeholder `__init__.py` files for LangGraph packages
|
| 82 |
+
2. Reinstalls LangGraph API (without dependencies) to ensure it's not overwritten
|
| 83 |
+
- **Why**: If your `requirements.txt` has conflicting versions, this ensures LangGraph API stays intact
|
| 84 |
+
|
| 85 |
+
---
|
| 86 |
+
|
| 87 |
+
### 7. Cleanup Build Tools (Lines 37-41)
|
| 88 |
+
```dockerfile
|
| 89 |
+
RUN pip uninstall -y pip setuptools wheel
|
| 90 |
+
RUN rm -rf /usr/local/lib/python*/site-packages/pip* ...
|
| 91 |
+
RUN uv pip uninstall --system pip setuptools wheel && rm /usr/bin/uv /usr/bin/uvx
|
| 92 |
+
```
|
| 93 |
+
- **Purpose**: Removes all build tools to make the image smaller and more secure
|
| 94 |
+
- **What gets removed**:
|
| 95 |
+
- `pip`, `setuptools`, `wheel` (Python build tools)
|
| 96 |
+
- `uv` and `uvx` (package installers)
|
| 97 |
+
- **Why**: These tools aren't needed at runtime, only during build
|
| 98 |
+
- **Security**: Smaller attack surface (can't install malicious packages at runtime)
|
| 99 |
+
|
| 100 |
+
---
|
| 101 |
+
|
| 102 |
+
### 8. Set Working Directory (Line 45)
|
| 103 |
+
```dockerfile
|
| 104 |
+
WORKDIR /deps/job_writer
|
| 105 |
+
```
|
| 106 |
+
- **Purpose**: Sets the default directory when the container starts
|
| 107 |
+
- **Why**: Makes it easier to reference files relative to your project root
|
| 108 |
+
|
| 109 |
+
---
|
| 110 |
+
|
| 111 |
+
## How It Works at Runtime
|
| 112 |
+
|
| 113 |
+
When this container runs:
|
| 114 |
+
|
| 115 |
+
1. **LangServe starts automatically** (from base image)
|
| 116 |
+
2. **Reads `LANGSERVE_GRAPHS`** environment variable
|
| 117 |
+
3. **Imports your graphs** from the specified paths
|
| 118 |
+
4. **Exposes REST API endpoints**:
|
| 119 |
+
- `POST /invoke/job_app_graph` - Main workflow
|
| 120 |
+
- `POST /invoke/research_workflow` - Research subgraph
|
| 121 |
+
- `POST /invoke/data_loading_workflow` - Data loading subgraph
|
| 122 |
+
5. **Handles state management** automatically (checkpointing, persistence)
|
| 123 |
+
|
| 124 |
+
## Example API Usage
|
| 125 |
+
|
| 126 |
+
Once deployed, you can call your agent like this:
|
| 127 |
+
|
| 128 |
+
```bash
|
| 129 |
+
curl -X POST http://your-deployment/invoke/job_app_graph \
|
| 130 |
+
-H "Content-Type: application/json" \
|
| 131 |
+
-d '{
|
| 132 |
+
"resume_path": "...",
|
| 133 |
+
"job_description_source": "...",
|
| 134 |
+
"content": "cover_letter"
|
| 135 |
+
}'
|
| 136 |
+
```
|
| 137 |
+
|
| 138 |
+
## Key Points
|
| 139 |
+
|
| 140 |
+
✅ **Optimized for LangGraph Cloud** - Uses official base image
|
| 141 |
+
✅ **Automatic API generation** - No need to write FastAPI code
|
| 142 |
+
✅ **State management** - Built-in checkpointing and persistence
|
| 143 |
+
✅ **Security** - Removes build tools from final image
|
| 144 |
+
✅ **Small image** - No-cache installs, no bytecode files
|
| 145 |
+
|
| 146 |
+
This is the **easiest deployment option** for LangGraph apps - just build and push this Docker image!
|
| 147 |
+
|
Dockerfile
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM langchain/langgraph-api:3.12
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# -- Adding local package . --
|
| 10 |
+
ADD . /deps/job_writer
|
| 11 |
+
# -- End of local package . --
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# -- Installing all local dependencies --
|
| 16 |
+
|
| 17 |
+
RUN for dep in /deps/*; do echo "Installing $dep"; if [ -d "$dep" ]; then echo "Installing $dep"; (cd "$dep" && PYTHONDONTWRITEBYTECODE=1 uv pip install --system --no-cache-dir -c /api/constraints.txt -e .); fi; done
|
| 18 |
+
|
| 19 |
+
# -- End of local dependencies install --
|
| 20 |
+
|
| 21 |
+
ENV LANGSERVE_GRAPHS='{"job_app_graph": "/deps/job_writer/src/job_writing_agent/workflow.py:job_app_graph", "research_workflow": "/deps/job_writer/src/job_writing_agent/nodes/research_workflow.py:research_workflow", "data_loading_workflow": "/deps/job_writer/src/job_writing_agent/nodes/initializing.py:data_loading_workflow"}'
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# -- Ensure user deps didn't inadvertently overwrite langgraph-api
|
| 30 |
+
RUN mkdir -p /api/langgraph_api /api/langgraph_runtime /api/langgraph_license && touch /api/langgraph_api/__init__.py /api/langgraph_runtime/__init__.py /api/langgraph_license/__init__.py
|
| 31 |
+
RUN PYTHONDONTWRITEBYTECODE=1 uv pip install --system --no-cache-dir --no-deps -e /api
|
| 32 |
+
# -- End of ensuring user deps didn't inadvertently overwrite langgraph-api --
|
| 33 |
+
# -- Removing build deps from the final image ~<:===~~~ --
|
| 34 |
+
RUN pip uninstall -y pip setuptools wheel
|
| 35 |
+
RUN rm -rf /usr/local/lib/python*/site-packages/pip* /usr/local/lib/python*/site-packages/setuptools* /usr/local/lib/python*/site-packages/wheel* && find /usr/local/bin -name "pip*" -delete || true
|
| 36 |
+
RUN rm -rf /usr/lib/python*/site-packages/pip* /usr/lib/python*/site-packages/setuptools* /usr/lib/python*/site-packages/wheel* && find /usr/bin -name "pip*" -delete || true
|
| 37 |
+
RUN uv pip uninstall --system pip setuptools wheel && rm /usr/bin/uv /usr/bin/uvx
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
WORKDIR /deps/job_writer
|
docker-compose.override.example.yml
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Example override file for local development
|
| 2 |
+
# Copy this to docker-compose.override.yml to customize settings
|
| 3 |
+
# docker-compose automatically loads override files
|
| 4 |
+
|
| 5 |
+
version: "3.9"
|
| 6 |
+
services:
|
| 7 |
+
redis:
|
| 8 |
+
# Override Redis port for local development
|
| 9 |
+
ports:
|
| 10 |
+
- "6380:6379" # Use different port if 6379 is already in use
|
| 11 |
+
|
| 12 |
+
postgres:
|
| 13 |
+
# Override Postgres port for local development
|
| 14 |
+
ports:
|
| 15 |
+
- "5433:5432" # Use different port if 5432 is already in use
|
| 16 |
+
environment:
|
| 17 |
+
# Override credentials for local dev
|
| 18 |
+
- POSTGRES_USER=dev_user
|
| 19 |
+
- POSTGRES_PASSWORD=dev_password
|
| 20 |
+
- POSTGRES_DB=job_app_dev
|
| 21 |
+
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
services:
|
| 2 |
+
redis:
|
| 3 |
+
image: redis:6-alpine
|
| 4 |
+
container_name: job-app-redis
|
| 5 |
+
ports:
|
| 6 |
+
- "6379:6379"
|
| 7 |
+
healthcheck:
|
| 8 |
+
test: ["CMD", "redis-cli", "ping"]
|
| 9 |
+
interval: 5s
|
| 10 |
+
timeout: 3s
|
| 11 |
+
retries: 5
|
| 12 |
+
networks:
|
| 13 |
+
- job-app-network
|
| 14 |
+
|
| 15 |
+
postgres:
|
| 16 |
+
image: postgres:16-alpine
|
| 17 |
+
container_name: job-app-postgres
|
| 18 |
+
environment:
|
| 19 |
+
- POSTGRES_USER=postgres
|
| 20 |
+
- POSTGRES_PASSWORD=postgres
|
| 21 |
+
- POSTGRES_DB=postgres
|
| 22 |
+
ports:
|
| 23 |
+
- "5432:5432"
|
| 24 |
+
healthcheck:
|
| 25 |
+
test: ["CMD-SHELL", "pg_isready -U postgres"]
|
| 26 |
+
interval: 5s
|
| 27 |
+
timeout: 5s
|
| 28 |
+
retries: 5
|
| 29 |
+
volumes:
|
| 30 |
+
- pg_data_local:/var/lib/postgresql/data
|
| 31 |
+
networks:
|
| 32 |
+
- job-app-network
|
| 33 |
+
|
| 34 |
+
# Optional: Uncomment to run your agent container alongside Redis/Postgres
|
| 35 |
+
agent:
|
| 36 |
+
build:
|
| 37 |
+
context: .
|
| 38 |
+
dockerfile: Dockerfile
|
| 39 |
+
image: job-app-workflow:latest
|
| 40 |
+
container_name: job-app-agent
|
| 41 |
+
ports:
|
| 42 |
+
- "8000:8000"
|
| 43 |
+
environment:
|
| 44 |
+
- REDIS_URL=redis://redis:6379
|
| 45 |
+
- POSTGRES_URL=postgresql://postgres:postgres@postgres:5432/postgres
|
| 46 |
+
depends_on:
|
| 47 |
+
redis:
|
| 48 |
+
condition: service_healthy
|
| 49 |
+
postgres:
|
| 50 |
+
condition: service_healthy
|
| 51 |
+
networks:
|
| 52 |
+
- job-app-network
|
| 53 |
+
|
| 54 |
+
networks:
|
| 55 |
+
job-app-network:
|
| 56 |
+
driver: bridge
|
| 57 |
+
|
| 58 |
+
volumes:
|
| 59 |
+
pg_data_local:
|
langgraph.json
CHANGED
|
@@ -1,10 +1,12 @@
|
|
| 1 |
{
|
| 2 |
"dependencies": [
|
| 3 |
-
"
|
| 4 |
],
|
| 5 |
"graphs": {
|
| 6 |
-
"
|
|
|
|
|
|
|
| 7 |
},
|
| 8 |
-
"env": "./
|
| 9 |
"python_version": "3.12"
|
| 10 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"dependencies": [
|
| 3 |
+
"."
|
| 4 |
],
|
| 5 |
"graphs": {
|
| 6 |
+
"job_app_graph": "./src/job_writing_agent/workflow.py:job_app_graph",
|
| 7 |
+
"research_workflow": "./src/job_writing_agent/nodes/research_workflow.py:research_workflow",
|
| 8 |
+
"data_loading_workflow": "./src/job_writing_agent/nodes/initializing.py:data_loading_workflow"
|
| 9 |
},
|
| 10 |
+
"env": "./app_env",
|
| 11 |
"python_version": "3.12"
|
| 12 |
}
|
pyproject.toml
CHANGED
|
@@ -23,6 +23,7 @@ dependencies = [
|
|
| 23 |
"babel==2.17.0",
|
| 24 |
"backoff==2.2.1",
|
| 25 |
"beautifulsoup4==4.14.2",
|
|
|
|
| 26 |
"blinker==1.9.0",
|
| 27 |
"blockbuster==1.5.25",
|
| 28 |
"bs4==0.0.2",
|
|
@@ -103,20 +104,19 @@ dependencies = [
|
|
| 103 |
"jsonschema-specifications==2025.9.1",
|
| 104 |
"justext==3.0.2",
|
| 105 |
"kiwisolver==1.4.9",
|
| 106 |
-
"langchain
|
| 107 |
-
"langchain-cerebras
|
| 108 |
-
"langchain-community
|
| 109 |
-
"langchain-core
|
| 110 |
-
"langchain-ollama
|
| 111 |
-
"langchain-openai
|
| 112 |
-
"langchain-tavily
|
| 113 |
-
"langchain-text-splitters
|
| 114 |
"langfuse==3.6.1",
|
| 115 |
-
"langgraph
|
| 116 |
-
"langgraph-api
|
| 117 |
-
"langgraph-
|
| 118 |
-
"langgraph-
|
| 119 |
-
"langgraph-prebuilt==0.6.4",
|
| 120 |
"langgraph-runtime-inmem==0.14.1",
|
| 121 |
"langgraph-sdk==0.2.9",
|
| 122 |
"langsmith==0.4.32",
|
|
@@ -168,8 +168,8 @@ dependencies = [
|
|
| 168 |
"opentelemetry-sdk==1.37.0",
|
| 169 |
"opentelemetry-semantic-conventions==0.58b0",
|
| 170 |
"optuna==4.5.0",
|
| 171 |
-
"orjson
|
| 172 |
-
"ormsgpack
|
| 173 |
"packaging==25.0",
|
| 174 |
"pandas==2.3.3",
|
| 175 |
"parse==1.20.2",
|
|
@@ -212,6 +212,7 @@ dependencies = [
|
|
| 212 |
"rich-rst==1.3.1",
|
| 213 |
"rpds-py==0.27.1",
|
| 214 |
"rsa==4.9.1",
|
|
|
|
| 215 |
"scikit-learn==1.7.2",
|
| 216 |
"scipy==1.16.2",
|
| 217 |
"setuptools==80.9.0",
|
|
@@ -258,3 +259,4 @@ dependencies = [
|
|
| 258 |
|
| 259 |
[tool.setuptools.packages.find]
|
| 260 |
where = ["src"]
|
|
|
|
|
|
| 23 |
"babel==2.17.0",
|
| 24 |
"backoff==2.2.1",
|
| 25 |
"beautifulsoup4==4.14.2",
|
| 26 |
+
"black>=25.12.0",
|
| 27 |
"blinker==1.9.0",
|
| 28 |
"blockbuster==1.5.25",
|
| 29 |
"bs4==0.0.2",
|
|
|
|
| 104 |
"jsonschema-specifications==2025.9.1",
|
| 105 |
"justext==3.0.2",
|
| 106 |
"kiwisolver==1.4.9",
|
| 107 |
+
"langchain",
|
| 108 |
+
"langchain-cerebras",
|
| 109 |
+
"langchain-community",
|
| 110 |
+
"langchain-core>=1.0.0",
|
| 111 |
+
"langchain-ollama",
|
| 112 |
+
"langchain-openai",
|
| 113 |
+
"langchain-tavily",
|
| 114 |
+
"langchain-text-splitters",
|
| 115 |
"langfuse==3.6.1",
|
| 116 |
+
"langgraph",
|
| 117 |
+
"langgraph-api",
|
| 118 |
+
"langgraph-cli",
|
| 119 |
+
"langgraph-prebuilt",
|
|
|
|
| 120 |
"langgraph-runtime-inmem==0.14.1",
|
| 121 |
"langgraph-sdk==0.2.9",
|
| 122 |
"langsmith==0.4.32",
|
|
|
|
| 168 |
"opentelemetry-sdk==1.37.0",
|
| 169 |
"opentelemetry-semantic-conventions==0.58b0",
|
| 170 |
"optuna==4.5.0",
|
| 171 |
+
"orjson>=3.9.7,<3.10.17",
|
| 172 |
+
"ormsgpack>=1.12.0",
|
| 173 |
"packaging==25.0",
|
| 174 |
"pandas==2.3.3",
|
| 175 |
"parse==1.20.2",
|
|
|
|
| 212 |
"rich-rst==1.3.1",
|
| 213 |
"rpds-py==0.27.1",
|
| 214 |
"rsa==4.9.1",
|
| 215 |
+
"ruff>=0.14.10",
|
| 216 |
"scikit-learn==1.7.2",
|
| 217 |
"scipy==1.16.2",
|
| 218 |
"setuptools==80.9.0",
|
|
|
|
| 259 |
|
| 260 |
[tool.setuptools.packages.find]
|
| 261 |
where = ["src"]
|
| 262 |
+
|
pyrightconfig.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"include": [
|
| 3 |
+
"src"
|
| 4 |
+
],
|
| 5 |
+
"exclude": [
|
| 6 |
+
"**/__pycache__",
|
| 7 |
+
"**/.*",
|
| 8 |
+
"app_env",
|
| 9 |
+
"node_modules"
|
| 10 |
+
],
|
| 11 |
+
"extraPaths": [
|
| 12 |
+
"src"
|
| 13 |
+
],
|
| 14 |
+
"pythonVersion": "3.12",
|
| 15 |
+
"typeCheckingMode": "basic",
|
| 16 |
+
"reportMissingImports": true,
|
| 17 |
+
"reportMissingTypeStubs": false
|
| 18 |
+
}
|
src/job_writing_agent/agents/nodes.py
CHANGED
|
@@ -10,8 +10,9 @@ from datetime import datetime
|
|
| 10 |
|
| 11 |
from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate
|
| 12 |
from langchain_core.output_parsers import StrOutputParser
|
|
|
|
| 13 |
|
| 14 |
-
from ..classes.classes import AppState, ResearchState
|
| 15 |
from ..prompts.templates import (
|
| 16 |
CRITIQUE_PROMPT,
|
| 17 |
PERSONA_DEVELOPMENT_PROMPT,
|
|
@@ -26,30 +27,38 @@ logger = logging.getLogger(__name__)
|
|
| 26 |
# Constants
|
| 27 |
CURRENT_DATE = datetime.now().strftime("%A, %B %d, %Y")
|
| 28 |
|
| 29 |
-
llm_provider = LLMFactory()
|
| 30 |
|
| 31 |
-
|
| 32 |
-
"qwen/qwen3-4b:free", provider="openrouter", temperature=0.3
|
| 33 |
-
)
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
def create_draft(state: ResearchState) -> AppState:
|
| 37 |
"""Create initial draft of the application material."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
# Determine which type of content we're creating
|
| 39 |
-
|
| 40 |
|
| 41 |
content_category = state.get("content_category", "cover_letter")
|
| 42 |
|
|
|
|
|
|
|
|
|
|
| 43 |
try:
|
|
|
|
| 44 |
if state.get("vector_store"):
|
| 45 |
vector_store = state.get("vector_store")
|
| 46 |
|
| 47 |
# Extract key requirements from job description
|
| 48 |
prompt = PERSONA_DEVELOPMENT_PROMPT | llm | StrOutputParser()
|
| 49 |
|
| 50 |
-
if
|
| 51 |
key_requirements = prompt.invoke(
|
| 52 |
-
{
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
)
|
| 54 |
else:
|
| 55 |
return key_requirements
|
|
@@ -68,13 +77,16 @@ def create_draft(state: ResearchState) -> AppState:
|
|
| 68 |
highly_relevant_resume = "\n".join(
|
| 69 |
[doc.page_content for doc in relevant_docs]
|
| 70 |
)
|
|
|
|
| 71 |
resume_text = f"""
|
| 72 |
# Most Relevant Experience
|
| 73 |
{highly_relevant_resume}
|
| 74 |
|
| 75 |
# Full Resume
|
| 76 |
-
{
|
| 77 |
"""
|
|
|
|
|
|
|
| 78 |
except Exception as e:
|
| 79 |
logger.warning(f"Could not use vector search for relevant resume parts: {e}")
|
| 80 |
# Continue with regular resume text
|
|
@@ -91,31 +103,42 @@ def create_draft(state: ResearchState) -> AppState:
|
|
| 91 |
# Create the draft using the selected prompt template
|
| 92 |
CurrentSessionContextMessage = HumanMessagePromptTemplate.from_template(
|
| 93 |
"""
|
| 94 |
-
Below is the Job Description
|
| 95 |
|
| 96 |
-
Job Description
|
| 97 |
|
| 98 |
-
|
| 99 |
{current_job_role}
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
""",
|
| 108 |
-
input_variables=[
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
)
|
| 110 |
|
| 111 |
FirstDraftGenerationPromptTemplate.append(CurrentSessionContextMessage)
|
| 112 |
|
| 113 |
# Invoke the chain with the appropriate inputs
|
| 114 |
-
|
| 115 |
(
|
| 116 |
{
|
| 117 |
"current_job_role": lambda x: x["current_job_role"],
|
| 118 |
"company_research_data": lambda x: x["company_research_data"],
|
|
|
|
| 119 |
}
|
| 120 |
)
|
| 121 |
| FirstDraftGenerationPromptTemplate
|
|
@@ -123,59 +146,203 @@ def create_draft(state: ResearchState) -> AppState:
|
|
| 123 |
)
|
| 124 |
|
| 125 |
# Prepare the inputs
|
| 126 |
-
|
| 127 |
-
"current_job_role":
|
| 128 |
-
"company_research_data":
|
|
|
|
|
|
|
|
|
|
| 129 |
}
|
| 130 |
|
| 131 |
-
response =
|
| 132 |
-
logger.info(f"Draft has been created: {response}")
|
| 133 |
-
|
| 134 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
|
| 137 |
-
def critique_draft(state:
|
| 138 |
-
"""
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
)
|
| 143 |
-
)
|
| 144 |
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
|
|
|
|
| 149 |
|
| 150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
"""Human-in-the-loop checkpoint for feedback on the draft."""
|
| 152 |
# This is a placeholder function that would be replaced by actual UI interaction
|
| 153 |
print("\n" + "=" * 80)
|
| 154 |
print("DRAFT FOR REVIEW:")
|
| 155 |
print(state["draft"])
|
| 156 |
print("\nAUTOMATIC CRITIQUE:")
|
| 157 |
-
print(state.get("
|
| 158 |
print("=" * 80)
|
| 159 |
print("\nPlease provide your feedback (press Enter to continue with no changes):")
|
| 160 |
|
| 161 |
# In a real implementation, this would be handled by the UI
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
|
| 166 |
|
| 167 |
-
def finalize_document(state:
|
| 168 |
"""Incorporate feedback and finalize the document."""
|
| 169 |
-
if not state["feedback"].strip():
|
| 170 |
-
state["final"] = state["draft"]
|
| 171 |
-
return state
|
| 172 |
|
| 173 |
-
|
| 174 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
)
|
| 176 |
|
| 177 |
-
|
| 178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
|
| 180 |
|
| 181 |
"""
|
|
|
|
| 10 |
|
| 11 |
from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate
|
| 12 |
from langchain_core.output_parsers import StrOutputParser
|
| 13 |
+
from langchain_core.messages import SystemMessage
|
| 14 |
|
| 15 |
+
from ..classes.classes import AppState, ResearchState, ResultState, DataLoadState
|
| 16 |
from ..prompts.templates import (
|
| 17 |
CRITIQUE_PROMPT,
|
| 18 |
PERSONA_DEVELOPMENT_PROMPT,
|
|
|
|
| 27 |
# Constants
|
| 28 |
CURRENT_DATE = datetime.now().strftime("%A, %B %d, %Y")
|
| 29 |
|
|
|
|
| 30 |
|
| 31 |
+
def create_draft(state: ResearchState) -> ResultState:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
"""Create initial draft of the application material."""
|
| 33 |
+
# Create LLM inside function (lazy initialization)
|
| 34 |
+
llm_provider = LLMFactory()
|
| 35 |
+
llm = llm_provider.create_langchain(
|
| 36 |
+
"mistralai/mistral-7b-instruct:free", provider="openrouter", temperature=0.3
|
| 37 |
+
)
|
| 38 |
+
|
| 39 |
# Determine which type of content we're creating
|
| 40 |
+
company_background_information = state.get("company_research_data", {})
|
| 41 |
|
| 42 |
content_category = state.get("content_category", "cover_letter")
|
| 43 |
|
| 44 |
+
# Get the original resume text from state (used later if vector search is available)
|
| 45 |
+
original_resume_text = company_background_information.get("resume", "")
|
| 46 |
+
|
| 47 |
try:
|
| 48 |
+
# Not yet implemented
|
| 49 |
if state.get("vector_store"):
|
| 50 |
vector_store = state.get("vector_store")
|
| 51 |
|
| 52 |
# Extract key requirements from job description
|
| 53 |
prompt = PERSONA_DEVELOPMENT_PROMPT | llm | StrOutputParser()
|
| 54 |
|
| 55 |
+
if company_background_information:
|
| 56 |
key_requirements = prompt.invoke(
|
| 57 |
+
{
|
| 58 |
+
"job_description": company_background_information[
|
| 59 |
+
"job_description"
|
| 60 |
+
]
|
| 61 |
+
}
|
| 62 |
)
|
| 63 |
else:
|
| 64 |
return key_requirements
|
|
|
|
| 77 |
highly_relevant_resume = "\n".join(
|
| 78 |
[doc.page_content for doc in relevant_docs]
|
| 79 |
)
|
| 80 |
+
# Combine highly relevant parts with full resume text
|
| 81 |
resume_text = f"""
|
| 82 |
# Most Relevant Experience
|
| 83 |
{highly_relevant_resume}
|
| 84 |
|
| 85 |
# Full Resume
|
| 86 |
+
{original_resume_text}
|
| 87 |
"""
|
| 88 |
+
# Update the company_background_information with the enhanced resume
|
| 89 |
+
company_background_information["resume"] = resume_text
|
| 90 |
except Exception as e:
|
| 91 |
logger.warning(f"Could not use vector search for relevant resume parts: {e}")
|
| 92 |
# Continue with regular resume text
|
|
|
|
| 103 |
# Create the draft using the selected prompt template
|
| 104 |
CurrentSessionContextMessage = HumanMessagePromptTemplate.from_template(
|
| 105 |
"""
|
| 106 |
+
Below is the Job Description, Candidate Resume, and Company Research Data enclosed in triple backticks.
|
| 107 |
|
| 108 |
+
**Job Description:**
|
| 109 |
|
| 110 |
+
'''
|
| 111 |
{current_job_role}
|
| 112 |
+
'''
|
| 113 |
+
|
| 114 |
+
**Candidate Resume:**
|
| 115 |
+
|
| 116 |
+
'''
|
| 117 |
+
{candidate_resume}
|
| 118 |
+
'''
|
| 119 |
+
|
| 120 |
+
**Company Research Data:**
|
| 121 |
+
|
| 122 |
+
'''
|
| 123 |
+
{company_research_data}
|
| 124 |
+
'''
|
| 125 |
""",
|
| 126 |
+
input_variables=[
|
| 127 |
+
"current_job_role",
|
| 128 |
+
"company_research_data",
|
| 129 |
+
"candidate_resume",
|
| 130 |
+
],
|
| 131 |
)
|
| 132 |
|
| 133 |
FirstDraftGenerationPromptTemplate.append(CurrentSessionContextMessage)
|
| 134 |
|
| 135 |
# Invoke the chain with the appropriate inputs
|
| 136 |
+
draft_generation_chain = (
|
| 137 |
(
|
| 138 |
{
|
| 139 |
"current_job_role": lambda x: x["current_job_role"],
|
| 140 |
"company_research_data": lambda x: x["company_research_data"],
|
| 141 |
+
"candidate_resume": lambda x: x["candidate_resume"],
|
| 142 |
}
|
| 143 |
)
|
| 144 |
| FirstDraftGenerationPromptTemplate
|
|
|
|
| 146 |
)
|
| 147 |
|
| 148 |
# Prepare the inputs
|
| 149 |
+
application_background_data = {
|
| 150 |
+
"current_job_role": company_background_information["job_description"],
|
| 151 |
+
"company_research_data": company_background_information[
|
| 152 |
+
"company_research_data_summary"
|
| 153 |
+
],
|
| 154 |
+
"candidate_resume": company_background_information["resume"],
|
| 155 |
}
|
| 156 |
|
| 157 |
+
response = draft_generation_chain.invoke(application_background_data)
|
| 158 |
+
logger.info(f"Draft has been created: {response.content}")
|
| 159 |
+
app_state = ResultState(
|
| 160 |
+
draft=response.content,
|
| 161 |
+
feedback="",
|
| 162 |
+
critique_feedback="",
|
| 163 |
+
current_node="create_draft",
|
| 164 |
+
company_research_data=company_background_information,
|
| 165 |
+
output_data={},
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
return app_state
|
| 169 |
|
| 170 |
|
| 171 |
+
def critique_draft(state: ResultState) -> ResultState:
|
| 172 |
+
"""
|
| 173 |
+
Critique the draft for improvements.
|
| 174 |
+
Provides external evaluation focusing on job requirements, tone, clarity, and style.
|
| 175 |
+
"""
|
| 176 |
+
try:
|
| 177 |
+
logger.info("Critiquing draft...")
|
| 178 |
+
|
| 179 |
+
# Create LLM inside function (lazy initialization)
|
| 180 |
+
llm_provider = LLMFactory()
|
| 181 |
+
llm = llm_provider.create_langchain(
|
| 182 |
+
"mistralai/mistral-7b-instruct:free", provider="openrouter", temperature=0.3
|
| 183 |
)
|
|
|
|
| 184 |
|
| 185 |
+
job_description = str(state["company_research_data"].get("job_description", ""))
|
| 186 |
+
draft = str(state.get("draft", ""))
|
| 187 |
+
|
| 188 |
+
# Debug logging to verify values
|
| 189 |
+
logger.debug(f"Job description length: {len(job_description)}")
|
| 190 |
+
logger.debug(f"Draft length: {len(draft)}")
|
| 191 |
+
|
| 192 |
+
if not job_description or not draft:
|
| 193 |
+
logger.warning("Missing job_description or draft in state")
|
| 194 |
+
# Return state with empty feedback
|
| 195 |
+
return ResultState(
|
| 196 |
+
draft=draft,
|
| 197 |
+
feedback="",
|
| 198 |
+
critique_feedback="",
|
| 199 |
+
current_node="critique",
|
| 200 |
+
company_research_data=state["company_research_data"],
|
| 201 |
+
output_data=state["output_data"],
|
| 202 |
+
)
|
| 203 |
+
|
| 204 |
+
# Use the same pattern as create_draft:
|
| 205 |
+
# 1. Create ChatPromptTemplate from SystemMessage
|
| 206 |
+
# 2. Append HumanMessagePromptTemplate with variables
|
| 207 |
+
# 3. Create chain and invoke
|
| 208 |
+
|
| 209 |
+
# Extract SystemMessage from CRITIQUE_PROMPT
|
| 210 |
+
|
| 211 |
+
critique_system_message = SystemMessage(
|
| 212 |
+
content="You are a professional editor who specializes in job applications. Provide constructive feedback."
|
| 213 |
+
)
|
| 214 |
+
|
| 215 |
+
# Create ChatPromptTemplate from SystemMessage (like line 90-94 in create_draft)
|
| 216 |
+
CritiquePromptTemplate = ChatPromptTemplate([critique_system_message])
|
| 217 |
+
|
| 218 |
+
# Append HumanMessagePromptTemplate with variables (like line 97-124 in create_draft)
|
| 219 |
+
CritiqueContextMessage = HumanMessagePromptTemplate.from_template(
|
| 220 |
+
"""
|
| 221 |
+
# Job Description
|
| 222 |
+
{job_description}
|
| 223 |
+
|
| 224 |
+
# Current Draft
|
| 225 |
+
{draft}
|
| 226 |
+
|
| 227 |
+
Critique this draft and suggest specific improvements. Focus on:
|
| 228 |
+
1. How well it targets the job requirements
|
| 229 |
+
2. Professional tone and language
|
| 230 |
+
3. Clarity and impact
|
| 231 |
+
4. Grammar and style
|
| 232 |
+
|
| 233 |
+
Return your critique in a constructive, actionable format.
|
| 234 |
+
""",
|
| 235 |
+
input_variables=["job_description", "draft"],
|
| 236 |
+
)
|
| 237 |
|
| 238 |
+
CritiquePromptTemplate.append(CritiqueContextMessage)
|
| 239 |
|
| 240 |
+
# Create chain (like line 129-139 in create_draft)
|
| 241 |
+
critique_chain = (
|
| 242 |
+
{
|
| 243 |
+
"job_description": lambda x: x["job_description"],
|
| 244 |
+
"draft": lambda x: x["draft"],
|
| 245 |
+
}
|
| 246 |
+
| CritiquePromptTemplate
|
| 247 |
+
| llm
|
| 248 |
+
)
|
| 249 |
+
|
| 250 |
+
# Invoke with input variables (like line 150 in create_draft)
|
| 251 |
+
critique = critique_chain.invoke(
|
| 252 |
+
{
|
| 253 |
+
"job_description": job_description,
|
| 254 |
+
"draft": draft,
|
| 255 |
+
}
|
| 256 |
+
)
|
| 257 |
+
|
| 258 |
+
critique_content = (
|
| 259 |
+
critique.content if hasattr(critique, "content") else str(critique)
|
| 260 |
+
)
|
| 261 |
+
logger.info("Draft critique completed")
|
| 262 |
+
|
| 263 |
+
# Store the critique for reference during revision
|
| 264 |
+
app_state = ResultState(
|
| 265 |
+
draft=state["draft"],
|
| 266 |
+
feedback=state["feedback"],
|
| 267 |
+
critique_feedback=critique_content,
|
| 268 |
+
current_node="critique",
|
| 269 |
+
company_research_data=state["company_research_data"],
|
| 270 |
+
output_data=state["output_data"],
|
| 271 |
+
)
|
| 272 |
+
return app_state
|
| 273 |
+
|
| 274 |
+
except Exception as e:
|
| 275 |
+
logger.error(f"Error in critique_draft: {e}", exc_info=True)
|
| 276 |
+
# Return state unchanged on error
|
| 277 |
+
return state
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
def human_approval(state: ResultState) -> ResultState:
|
| 281 |
"""Human-in-the-loop checkpoint for feedback on the draft."""
|
| 282 |
# This is a placeholder function that would be replaced by actual UI interaction
|
| 283 |
print("\n" + "=" * 80)
|
| 284 |
print("DRAFT FOR REVIEW:")
|
| 285 |
print(state["draft"])
|
| 286 |
print("\nAUTOMATIC CRITIQUE:")
|
| 287 |
+
print(state.get("critique_feedback", "No critique available"))
|
| 288 |
print("=" * 80)
|
| 289 |
print("\nPlease provide your feedback (press Enter to continue with no changes):")
|
| 290 |
|
| 291 |
# In a real implementation, this would be handled by the UI
|
| 292 |
+
human_feedback = input()
|
| 293 |
+
result_state = ResultState(
|
| 294 |
+
draft=state["draft"],
|
| 295 |
+
feedback=human_feedback,
|
| 296 |
+
critique_feedback=state["critique_feedback"],
|
| 297 |
+
current_node="human_approval",
|
| 298 |
+
company_research_data=state["company_research_data"],
|
| 299 |
+
output_data=state["output_data"],
|
| 300 |
+
)
|
| 301 |
+
return result_state
|
| 302 |
|
| 303 |
|
| 304 |
+
def finalize_document(state: ResultState) -> DataLoadState:
|
| 305 |
"""Incorporate feedback and finalize the document."""
|
|
|
|
|
|
|
|
|
|
| 306 |
|
| 307 |
+
# Create LLM inside function (lazy initialization)
|
| 308 |
+
llm_provider = LLMFactory()
|
| 309 |
+
llm = llm_provider.create_langchain(
|
| 310 |
+
"mistralai/mistral-7b-instruct:free", provider="openrouter", temperature=0.3
|
| 311 |
+
)
|
| 312 |
+
|
| 313 |
+
# Create chain like in critique_draft (line 229-236)
|
| 314 |
+
revision_chain = (
|
| 315 |
+
{
|
| 316 |
+
"draft": lambda x: x["draft"],
|
| 317 |
+
"feedback": lambda x: x["feedback"],
|
| 318 |
+
"critique_feedback": lambda x: x["critique_feedback"],
|
| 319 |
+
}
|
| 320 |
+
| REVISION_PROMPT
|
| 321 |
+
| llm
|
| 322 |
+
)
|
| 323 |
+
|
| 324 |
+
print(f"revision_chain: {revision_chain}")
|
| 325 |
+
|
| 326 |
+
# Invoke with input variables (like line 239 in critique_draft)
|
| 327 |
+
final_content = revision_chain.invoke(
|
| 328 |
+
{
|
| 329 |
+
"draft": state["draft"],
|
| 330 |
+
"feedback": state["feedback"],
|
| 331 |
+
"critique_feedback": state["critique_feedback"],
|
| 332 |
+
}
|
| 333 |
)
|
| 334 |
|
| 335 |
+
app_state = DataLoadState(
|
| 336 |
+
draft=state["draft"],
|
| 337 |
+
feedback=state["feedback"],
|
| 338 |
+
critique_feedback=state["critique_feedback"],
|
| 339 |
+
company_research_data=state["company_research_data"],
|
| 340 |
+
current_node="finalize",
|
| 341 |
+
output_data=final_content.content
|
| 342 |
+
if hasattr(final_content, "content")
|
| 343 |
+
else str(final_content),
|
| 344 |
+
)
|
| 345 |
+
return app_state
|
| 346 |
|
| 347 |
|
| 348 |
"""
|
src/job_writing_agent/agents/output_schema.py
CHANGED
|
@@ -2,12 +2,24 @@ from pydantic import BaseModel, Field, field_validator
|
|
| 2 |
from typing import List, Optional
|
| 3 |
import dspy
|
| 4 |
|
|
|
|
| 5 |
class TavilyQuerySet(BaseModel):
|
| 6 |
-
query1: Optional[List[str]] = Field(
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
@field_validator("query1", "query2", "query3", "query4", "query5", mode="after")
|
| 13 |
@classmethod
|
|
@@ -16,13 +28,38 @@ class TavilyQuerySet(BaseModel):
|
|
| 16 |
if v is not None: # Only validate if the list is actually provided
|
| 17 |
if len(v) != 1:
|
| 18 |
# Updated error message for clarity
|
| 19 |
-
raise ValueError(
|
|
|
|
|
|
|
| 20 |
return v
|
| 21 |
|
|
|
|
| 22 |
class TavilySearchQueries(dspy.Signature):
|
| 23 |
-
"""Use the job description and company name
|
| 24 |
to create exactly 5 search queries for the tavily search tool in JSON Format"""
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from typing import List, Optional
|
| 3 |
import dspy
|
| 4 |
|
| 5 |
+
|
| 6 |
class TavilyQuerySet(BaseModel):
|
| 7 |
+
query1: Optional[List[str]] = Field(
|
| 8 |
+
default=None,
|
| 9 |
+
description="First search query and its rationale, e.g., ['query text']",
|
| 10 |
+
)
|
| 11 |
+
query2: Optional[List[str]] = Field(
|
| 12 |
+
default=None, description="Second search query and its rationale"
|
| 13 |
+
)
|
| 14 |
+
query3: Optional[List[str]] = Field(
|
| 15 |
+
default=None, description="Third search query and its rationale"
|
| 16 |
+
)
|
| 17 |
+
query4: Optional[List[str]] = Field(
|
| 18 |
+
default=None, description="Fourth search query and its rationale"
|
| 19 |
+
)
|
| 20 |
+
query5: Optional[List[str]] = Field(
|
| 21 |
+
default=None, description="Fifth search query and its rationale"
|
| 22 |
+
)
|
| 23 |
|
| 24 |
@field_validator("query1", "query2", "query3", "query4", "query5", mode="after")
|
| 25 |
@classmethod
|
|
|
|
| 28 |
if v is not None: # Only validate if the list is actually provided
|
| 29 |
if len(v) != 1:
|
| 30 |
# Updated error message for clarity
|
| 31 |
+
raise ValueError(
|
| 32 |
+
"Each query list, when provided, must contain exactly one string: the query text."
|
| 33 |
+
)
|
| 34 |
return v
|
| 35 |
|
| 36 |
+
|
| 37 |
class TavilySearchQueries(dspy.Signature):
|
| 38 |
+
"""Use the job description and company name
|
| 39 |
to create exactly 5 search queries for the tavily search tool in JSON Format"""
|
| 40 |
+
|
| 41 |
+
job_description = dspy.InputField(
|
| 42 |
+
desc="Job description of the role that candidate is applying for."
|
| 43 |
+
)
|
| 44 |
+
company_name = dspy.InputField(
|
| 45 |
+
desc="Name of the company the candidate is applying for."
|
| 46 |
+
)
|
| 47 |
+
search_queries = dspy.OutputField(
|
| 48 |
+
desc="Dictionary of tavily search queries which will gather understanding of the company and it's culture",
|
| 49 |
+
json=True,
|
| 50 |
+
)
|
| 51 |
+
search_query_relevance = dspy.OutputField(
|
| 52 |
+
desc="Dictionary of relevance for each tavily search query that is generated",
|
| 53 |
+
json=True,
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class CompanyResearchDataSummarizationSchema(dspy.Signature):
|
| 58 |
+
"""This schema is used to summarize the company research data into a concise summary to provide a clear understanding of the company."""
|
| 59 |
+
|
| 60 |
+
company_research_data = dspy.InputField(
|
| 61 |
+
desc="These are the results of the tavily search queries that were generated. They have been filtered for relevance and are now ready to be summarized."
|
| 62 |
+
)
|
| 63 |
+
company_research_data_summary = dspy.OutputField(
|
| 64 |
+
desc="This is summary of the company research data that will be used by a job application writer to assist the candidate in writing content supporting the job application. The summary should be relevant to the job application and the company.",
|
| 65 |
+
)
|
src/job_writing_agent/classes/__init__.py
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
-
from .classes import AppState, ResearchState, DataLoadState
|
| 2 |
|
| 3 |
-
__all__ = ["AppState", "ResearchState", "DataLoadState"]
|
|
|
|
| 1 |
+
from .classes import AppState, ResearchState, DataLoadState, ResultState
|
| 2 |
|
| 3 |
+
__all__ = ["AppState", "ResearchState", "DataLoadState", "ResultState"]
|
src/job_writing_agent/classes/classes.py
CHANGED
|
@@ -2,11 +2,36 @@
|
|
| 2 |
State definitions for the Job Writer LangGraph Workflow.
|
| 3 |
"""
|
| 4 |
|
| 5 |
-
from
|
| 6 |
from typing_extensions import List, Dict, Any
|
| 7 |
from langgraph.graph import MessagesState
|
| 8 |
from dataclasses import dataclass
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
@dataclass
|
| 11 |
class AppState(MessagesState):
|
| 12 |
"""
|
|
@@ -23,33 +48,45 @@ class AppState(MessagesState):
|
|
| 23 |
final: Final version of the application material
|
| 24 |
content: Type of application material to generate
|
| 25 |
"""
|
|
|
|
| 26 |
resume_path: str
|
| 27 |
job_description_source: str
|
| 28 |
-
company_research_data: Dict[str, Any]
|
| 29 |
-
draft: str
|
| 30 |
-
feedback: str
|
| 31 |
-
final_version: str
|
| 32 |
content: str # "cover_letter", "bullets", "linkedin_note"
|
| 33 |
current_node: str
|
| 34 |
|
| 35 |
|
| 36 |
-
class DataLoadState(MessagesState):
|
| 37 |
"""
|
| 38 |
State container for the job application writer workflow.
|
|
|
|
| 39 |
|
| 40 |
Attributes:
|
| 41 |
resume: List of text chunks from the candidate's resume
|
| 42 |
job_description: List of text chunks from the job description
|
| 43 |
persona: The writing persona to use ("recruiter" or "hiring_manager")
|
| 44 |
content: Type of application material to generate
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
"""
|
|
|
|
| 46 |
resume_path: str
|
| 47 |
job_description_source: str
|
|
|
|
| 48 |
resume: str
|
| 49 |
job_description: str
|
| 50 |
company_name: str
|
| 51 |
current_node: str
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
|
| 55 |
class ResearchState(MessagesState):
|
|
@@ -60,6 +97,22 @@ class ResearchState(MessagesState):
|
|
| 60 |
attempted_search_queries: List of queries used extracted from the job description
|
| 61 |
compiled_knowledge: Compiled knowledge from the research
|
| 62 |
"""
|
|
|
|
| 63 |
company_research_data: Dict[str, Any]
|
| 64 |
attempted_search_queries: List[str]
|
| 65 |
current_node: str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
State definitions for the Job Writer LangGraph Workflow.
|
| 3 |
"""
|
| 4 |
|
| 5 |
+
from typing import Annotated
|
| 6 |
from typing_extensions import List, Dict, Any
|
| 7 |
from langgraph.graph import MessagesState
|
| 8 |
from dataclasses import dataclass
|
| 9 |
|
| 10 |
+
|
| 11 |
+
def merge_dict_reducer(
|
| 12 |
+
x: Dict[str, Any] | None, y: Dict[str, Any] | None
|
| 13 |
+
) -> Dict[str, Any]:
|
| 14 |
+
"""
|
| 15 |
+
Reducer function to merge two dictionaries.
|
| 16 |
+
Used for company_research_data to allow parallel nodes to update it.
|
| 17 |
+
|
| 18 |
+
Args:
|
| 19 |
+
x: First dictionary (existing state or None)
|
| 20 |
+
y: Second dictionary (new update or None)
|
| 21 |
+
|
| 22 |
+
Returns:
|
| 23 |
+
Merged dictionary with y taking precedence for overlapping keys
|
| 24 |
+
"""
|
| 25 |
+
# Handle None cases - treat as empty dict
|
| 26 |
+
if x is None:
|
| 27 |
+
x = {}
|
| 28 |
+
if y is None:
|
| 29 |
+
y = {}
|
| 30 |
+
|
| 31 |
+
# Merge dictionaries, with y taking precedence for overlapping keys
|
| 32 |
+
return {**x, **y}
|
| 33 |
+
|
| 34 |
+
|
| 35 |
@dataclass
|
| 36 |
class AppState(MessagesState):
|
| 37 |
"""
|
|
|
|
| 48 |
final: Final version of the application material
|
| 49 |
content: Type of application material to generate
|
| 50 |
"""
|
| 51 |
+
|
| 52 |
resume_path: str
|
| 53 |
job_description_source: str
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
content: str # "cover_letter", "bullets", "linkedin_note"
|
| 55 |
current_node: str
|
| 56 |
|
| 57 |
|
| 58 |
+
class DataLoadState(MessagesState, total=False):
|
| 59 |
"""
|
| 60 |
State container for the job application writer workflow.
|
| 61 |
+
Includes all fields needed throughout the entire workflow.
|
| 62 |
|
| 63 |
Attributes:
|
| 64 |
resume: List of text chunks from the candidate's resume
|
| 65 |
job_description: List of text chunks from the job description
|
| 66 |
persona: The writing persona to use ("recruiter" or "hiring_manager")
|
| 67 |
content: Type of application material to generate
|
| 68 |
+
draft: Current draft of the application material
|
| 69 |
+
feedback: Human feedback on the draft
|
| 70 |
+
critique_feedback: Automated critique feedback
|
| 71 |
+
output_data: Final output data
|
| 72 |
+
next_node: Next node to route to after data loading subgraph
|
| 73 |
"""
|
| 74 |
+
|
| 75 |
resume_path: str
|
| 76 |
job_description_source: str
|
| 77 |
+
content: str # "cover_letter", "bullets", "linkedin_note"
|
| 78 |
resume: str
|
| 79 |
job_description: str
|
| 80 |
company_name: str
|
| 81 |
current_node: str
|
| 82 |
+
next_node: str # For routing after data loading subgraph
|
| 83 |
+
# Use Annotated with reducer to allow parallel nodes to merge dictionary updates
|
| 84 |
+
company_research_data: Annotated[Dict[str, Any], merge_dict_reducer]
|
| 85 |
+
# Result fields (added for final output - optional, populated later)
|
| 86 |
+
draft: str
|
| 87 |
+
feedback: str
|
| 88 |
+
critique_feedback: str
|
| 89 |
+
output_data: str
|
| 90 |
|
| 91 |
|
| 92 |
class ResearchState(MessagesState):
|
|
|
|
| 97 |
attempted_search_queries: List of queries used extracted from the job description
|
| 98 |
compiled_knowledge: Compiled knowledge from the research
|
| 99 |
"""
|
| 100 |
+
|
| 101 |
company_research_data: Dict[str, Any]
|
| 102 |
attempted_search_queries: List[str]
|
| 103 |
current_node: str
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
class ResultState(MessagesState):
|
| 107 |
+
"""
|
| 108 |
+
State container for the job application writer workflow.
|
| 109 |
+
Attributes:
|
| 110 |
+
final_result: The final generated application material
|
| 111 |
+
"""
|
| 112 |
+
|
| 113 |
+
draft: str
|
| 114 |
+
feedback: str
|
| 115 |
+
critique_feedback: str
|
| 116 |
+
current_node: str
|
| 117 |
+
company_research_data: Dict[str, Any]
|
| 118 |
+
output_data: str
|
src/job_writing_agent/logs/job_writer.log
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/job_writing_agent/nodes/initializing.py
CHANGED
|
@@ -8,74 +8,80 @@ job descriptions, managing missing inputs, and populating application state.
|
|
| 8 |
|
| 9 |
The module includes utilities for:
|
| 10 |
- Parsing resume files and extracting text content
|
| 11 |
-
- Parsing job descriptions and extracting company information
|
| 12 |
- Orchestrating input loading with validation
|
| 13 |
- Providing user prompts for missing information during verification
|
| 14 |
"""
|
| 15 |
|
| 16 |
import logging
|
| 17 |
-
from typing import Tuple
|
| 18 |
-
from typing_extensions import Literal
|
| 19 |
|
| 20 |
from langchain_core.documents import Document
|
| 21 |
from langchain_core.messages import SystemMessage
|
|
|
|
| 22 |
|
| 23 |
-
from job_writing_agent.classes import
|
| 24 |
-
from job_writing_agent.utils.document_processing import
|
|
|
|
|
|
|
|
|
|
| 25 |
from job_writing_agent.prompts.templates import agent_system_prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
logger = logging.getLogger(__name__)
|
| 28 |
|
| 29 |
|
| 30 |
-
#
|
| 31 |
-
# Helper decorator to log exceptions for async methods
|
| 32 |
-
# ---------------------------------------------------------------------------
|
| 33 |
-
def log_exceptions(func):
|
| 34 |
-
"""Decorator to log exceptions in async functions."""
|
| 35 |
-
async def wrapper(*args, **kwargs):
|
| 36 |
-
try:
|
| 37 |
-
return await func(*args, **kwargs)
|
| 38 |
-
except Exception as exc:
|
| 39 |
-
logger.error(
|
| 40 |
-
"Exception in %s: %s", func.__name__, exc, exc_info=True
|
| 41 |
-
)
|
| 42 |
-
raise
|
| 43 |
-
|
| 44 |
-
return wrapper
|
| 45 |
|
| 46 |
|
| 47 |
class Dataloading:
|
| 48 |
"""
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
Methods
|
| 52 |
-------
|
| 53 |
-
set_agent_system_message(state:
|
| 54 |
Adds the system prompt to the conversation state.
|
| 55 |
get_resume(resume_source) -> str
|
| 56 |
Parses a resume file and returns its plain‑text content.
|
| 57 |
parse_job_description(job_description_source) -> Tuple[str, str]
|
| 58 |
Parses a job description and returns its text and company name.
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
"""
|
|
|
|
| 69 |
def __init__(self):
|
|
|
|
| 70 |
pass
|
| 71 |
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
-
|
|
|
|
| 74 |
"""Add the system prompt to the conversation state.
|
| 75 |
|
| 76 |
Parameters
|
| 77 |
----------
|
| 78 |
-
state:
|
| 79 |
Current workflow state.
|
| 80 |
|
| 81 |
Returns
|
|
@@ -83,9 +89,7 @@ class Dataloading:
|
|
| 83 |
DataLoadState
|
| 84 |
Updated state with the system message and the next node identifier.
|
| 85 |
"""
|
| 86 |
-
agent_initialization_system_message = SystemMessage(
|
| 87 |
-
content=agent_system_prompt
|
| 88 |
-
)
|
| 89 |
messages = state.get("messages", [])
|
| 90 |
messages.append(agent_initialization_system_message)
|
| 91 |
return {
|
|
@@ -94,217 +98,416 @@ class Dataloading:
|
|
| 94 |
"current_node": "initialize_system",
|
| 95 |
}
|
| 96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
async def get_resume(self, resume_source):
|
| 98 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
Parameters
|
| 101 |
----------
|
| 102 |
resume_source: Any
|
| 103 |
Path or file‑like object accepted by ``parse_resume``.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
"""
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
"Skipping empty or invalid chunk in resume: %s", chunk
|
| 118 |
-
)
|
| 119 |
-
return resume_text
|
| 120 |
-
except Exception as e:
|
| 121 |
-
logger.error("Error parsing resume: %s", e)
|
| 122 |
-
raise
|
| 123 |
|
|
|
|
|
|
|
| 124 |
async def parse_job_description(self, job_description_source):
|
| 125 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
Parameters
|
| 128 |
----------
|
| 129 |
job_description_source: Any
|
| 130 |
-
Source accepted by ``get_job_description
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
"""
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
)
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
)
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
else:
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
logger.error(
|
| 176 |
-
"Error parsing job description from source '%s': %s",
|
| 177 |
-
job_description_source,
|
| 178 |
-
e,
|
| 179 |
-
exc_info=True,
|
| 180 |
-
)
|
| 181 |
-
raise
|
| 182 |
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
# -----------------------------------------------------------------------
|
| 187 |
-
@log_exceptions
|
| 188 |
async def _load_resume(self, resume_source) -> str:
|
| 189 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
if not resume_source:
|
| 191 |
raise ValueError("resume_source is required")
|
| 192 |
return await self.get_resume(resume_source)
|
| 193 |
|
| 194 |
-
|
| 195 |
-
@
|
| 196 |
async def _load_job_description(self, jd_source) -> Tuple[str, str]:
|
| 197 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
if not jd_source:
|
| 199 |
raise ValueError("job_description_source is required")
|
| 200 |
return await self.parse_job_description(jd_source)
|
| 201 |
|
| 202 |
-
|
| 203 |
-
@
|
| 204 |
async def _prompt_user(self, prompt_msg: str) -> str:
|
| 205 |
-
"""
|
| 206 |
-
|
| 207 |
-
return input(prompt_msg)
|
| 208 |
|
|
|
|
|
|
|
| 209 |
|
| 210 |
-
|
| 211 |
-
|
|
|
|
|
|
|
| 212 |
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
|
|
|
| 216 |
"""
|
| 217 |
-
|
| 218 |
-
|
| 219 |
|
| 220 |
-
# -------------------------------------------------------------------
|
| 221 |
-
# Load job description (or prompt if missing during verification)
|
| 222 |
-
# -------------------------------------------------------------------
|
| 223 |
-
job_text = ""
|
| 224 |
-
company_name = ""
|
| 225 |
-
if jd_src:
|
| 226 |
-
job_text, company_name = await self._load_job_description(jd_src)
|
| 227 |
-
elif state.get("current_node") == "verify":
|
| 228 |
-
job_text = await self._prompt_user(
|
| 229 |
-
"Please paste the job posting in text format: "
|
| 230 |
-
)
|
| 231 |
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
resume_text = ""
|
| 236 |
-
if resume_src:
|
| 237 |
-
resume_text = await self._load_resume(resume_src)
|
| 238 |
-
elif state.get("current_node") == "verify":
|
| 239 |
-
raw = await self._prompt_user(
|
| 240 |
-
"Please paste the resume in text format: "
|
| 241 |
-
)
|
| 242 |
-
resume_text = raw
|
| 243 |
|
| 244 |
-
# Populate state
|
| 245 |
-
state["company_research_data"] = {
|
| 246 |
-
"resume": resume_text,
|
| 247 |
-
"job_description": job_text,
|
| 248 |
-
"company_name": company_name,
|
| 249 |
-
}
|
| 250 |
-
state["current_node"] = "load_inputs"
|
| 251 |
-
return state
|
| 252 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
|
| 264 |
-
def verify_inputs(self, state: AppState) -> Literal["load", "research"]:
|
| 265 |
-
"""Validate inputs and decide the next workflow node.
|
| 266 |
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
"""
|
| 272 |
-
print("Verifying Inputs")
|
| 273 |
-
state["current_node"] = "verify"
|
| 274 |
-
logger.info("Verifying loaded inputs!")
|
| 275 |
-
assert state["company_research_data"].get(
|
| 276 |
-
"resume"
|
| 277 |
-
), "Resume is missing in company_research_data"
|
| 278 |
-
assert state["company_research_data"].get(
|
| 279 |
-
"job_description"
|
| 280 |
-
), "Job description is missing"
|
| 281 |
-
if not state.get("company_research_data"):
|
| 282 |
-
missing_items = []
|
| 283 |
-
if not state["company_research_data"].get("resume", ""):
|
| 284 |
-
missing_items.append("resume")
|
| 285 |
-
if not state["company_research_data"].get("job_description", ""):
|
| 286 |
-
missing_items.append("job description")
|
| 287 |
-
logger.error("Missing required data: %s", ", ".join(missing_items))
|
| 288 |
-
return "load"
|
| 289 |
-
# Normalise values to strings
|
| 290 |
-
for key in ["resume", "job_description"]:
|
| 291 |
-
try:
|
| 292 |
-
value = state["company_research_data"][key]
|
| 293 |
-
if isinstance(value, (list, tuple)):
|
| 294 |
-
state["company_research_data"][key] = " ".join(
|
| 295 |
-
str(x) for x in value
|
| 296 |
-
)
|
| 297 |
-
elif isinstance(value, dict):
|
| 298 |
-
state["company_research_data"][key] = str(value)
|
| 299 |
-
else:
|
| 300 |
-
state["company_research_data"][key] = str(value)
|
| 301 |
-
except Exception as e:
|
| 302 |
-
logger.warning("Error converting %s to string: %s", key, e)
|
| 303 |
-
raise
|
| 304 |
-
return "research"
|
| 305 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
The module includes utilities for:
|
| 10 |
- Parsing resume files and extracting text content
|
| 11 |
+
- Parsing job descriptions and extracting company information
|
| 12 |
- Orchestrating input loading with validation
|
| 13 |
- Providing user prompts for missing information during verification
|
| 14 |
"""
|
| 15 |
|
| 16 |
import logging
|
| 17 |
+
from typing import Tuple, Optional
|
|
|
|
| 18 |
|
| 19 |
from langchain_core.documents import Document
|
| 20 |
from langchain_core.messages import SystemMessage
|
| 21 |
+
from langgraph.graph import StateGraph, END, START
|
| 22 |
|
| 23 |
+
from job_writing_agent.classes import DataLoadState
|
| 24 |
+
from job_writing_agent.utils.document_processing import (
|
| 25 |
+
parse_resume,
|
| 26 |
+
get_job_description,
|
| 27 |
+
)
|
| 28 |
from job_writing_agent.prompts.templates import agent_system_prompt
|
| 29 |
+
from job_writing_agent.utils.logging.logging_decorators import (
|
| 30 |
+
log_async,
|
| 31 |
+
log_execution,
|
| 32 |
+
log_errors,
|
| 33 |
+
)
|
| 34 |
|
| 35 |
logger = logging.getLogger(__name__)
|
| 36 |
|
| 37 |
|
| 38 |
+
# Note: Using centralized logging decorators from utils.logging.logging_decorators
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
|
| 41 |
class Dataloading:
|
| 42 |
"""
|
| 43 |
+
Helper class providing utility methods for loading and parsing data.
|
| 44 |
+
|
| 45 |
+
This class provides helper methods used by the data loading subgraph nodes.
|
| 46 |
+
The actual workflow orchestration is handled by the data_loading_workflow subgraph.
|
| 47 |
|
| 48 |
Methods
|
| 49 |
-------
|
| 50 |
+
set_agent_system_message(state: DataLoadState) -> DataLoadState
|
| 51 |
Adds the system prompt to the conversation state.
|
| 52 |
get_resume(resume_source) -> str
|
| 53 |
Parses a resume file and returns its plain‑text content.
|
| 54 |
parse_job_description(job_description_source) -> Tuple[str, str]
|
| 55 |
Parses a job description and returns its text and company name.
|
| 56 |
+
verify_inputs(state: DataLoadState) -> DataLoadState
|
| 57 |
+
Validates inputs and sets next_node for routing.
|
| 58 |
+
|
| 59 |
+
Private Methods (used by subgraph nodes)
|
| 60 |
+
-----------------------------------------
|
| 61 |
+
_load_resume(resume_source) -> str
|
| 62 |
+
Load resume content, raising if the source is missing.
|
| 63 |
+
_load_job_description(jd_source) -> Tuple[str, str]
|
| 64 |
+
Load job description text and company name, raising if missing.
|
| 65 |
+
_prompt_user(prompt_msg: str) -> str
|
| 66 |
+
Prompt the user for input (synchronous input wrapped for async use).
|
| 67 |
|
| 68 |
"""
|
| 69 |
+
|
| 70 |
def __init__(self):
|
| 71 |
+
"""Initialize Dataloading helper class."""
|
| 72 |
pass
|
| 73 |
|
| 74 |
+
# =======================================================================
|
| 75 |
+
# System/Initialization Methods
|
| 76 |
+
# =======================================================================
|
| 77 |
|
| 78 |
+
@log_async
|
| 79 |
+
async def set_agent_system_message(self, state: DataLoadState) -> DataLoadState:
|
| 80 |
"""Add the system prompt to the conversation state.
|
| 81 |
|
| 82 |
Parameters
|
| 83 |
----------
|
| 84 |
+
state: DataLoadState
|
| 85 |
Current workflow state.
|
| 86 |
|
| 87 |
Returns
|
|
|
|
| 89 |
DataLoadState
|
| 90 |
Updated state with the system message and the next node identifier.
|
| 91 |
"""
|
| 92 |
+
agent_initialization_system_message = SystemMessage(content=agent_system_prompt)
|
|
|
|
|
|
|
| 93 |
messages = state.get("messages", [])
|
| 94 |
messages.append(agent_initialization_system_message)
|
| 95 |
return {
|
|
|
|
| 98 |
"current_node": "initialize_system",
|
| 99 |
}
|
| 100 |
|
| 101 |
+
# =======================================================================
|
| 102 |
+
# Public Parsing Methods
|
| 103 |
+
# =======================================================================
|
| 104 |
+
|
| 105 |
+
@log_async
|
| 106 |
+
@log_errors
|
| 107 |
async def get_resume(self, resume_source):
|
| 108 |
+
"""
|
| 109 |
+
Parse a resume file and return its plain‑text content.
|
| 110 |
+
|
| 111 |
+
This method extracts text from resume chunks, handling both Document
|
| 112 |
+
objects and plain strings. Empty or invalid chunks are skipped.
|
| 113 |
|
| 114 |
Parameters
|
| 115 |
----------
|
| 116 |
resume_source: Any
|
| 117 |
Path or file‑like object accepted by ``parse_resume``.
|
| 118 |
+
|
| 119 |
+
Returns
|
| 120 |
+
-------
|
| 121 |
+
str
|
| 122 |
+
Plain text content of the resume.
|
| 123 |
+
|
| 124 |
+
Raises
|
| 125 |
+
------
|
| 126 |
+
AssertionError
|
| 127 |
+
If resume_source is None.
|
| 128 |
+
Exception
|
| 129 |
+
If parsing fails.
|
| 130 |
"""
|
| 131 |
+
logger.info("Parsing resume...")
|
| 132 |
+
resume_text = ""
|
| 133 |
+
assert resume_source is not None
|
| 134 |
+
resume_chunks = parse_resume(resume_source)
|
| 135 |
+
for chunk in resume_chunks:
|
| 136 |
+
if hasattr(chunk, "page_content") and chunk.page_content:
|
| 137 |
+
resume_text += chunk.page_content
|
| 138 |
+
elif isinstance(chunk, str) and chunk:
|
| 139 |
+
resume_text += chunk
|
| 140 |
+
else:
|
| 141 |
+
logger.debug("Skipping empty or invalid chunk in resume: %s", chunk)
|
| 142 |
+
return resume_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
|
| 144 |
+
@log_async
|
| 145 |
+
@log_errors
|
| 146 |
async def parse_job_description(self, job_description_source):
|
| 147 |
+
"""
|
| 148 |
+
Parse a job description and return its text and company name.
|
| 149 |
+
|
| 150 |
+
Extracts both the job posting text and company name from the document.
|
| 151 |
+
Company name is extracted from document metadata if available.
|
| 152 |
|
| 153 |
Parameters
|
| 154 |
----------
|
| 155 |
job_description_source: Any
|
| 156 |
+
Source accepted by ``get_job_description`` (URL, file path, etc.).
|
| 157 |
+
|
| 158 |
+
Returns
|
| 159 |
+
-------
|
| 160 |
+
Tuple[str, str]
|
| 161 |
+
A tuple of (job_posting_text, company_name).
|
| 162 |
+
|
| 163 |
+
Raises
|
| 164 |
+
------
|
| 165 |
+
AssertionError
|
| 166 |
+
If job_description_source is None.
|
| 167 |
+
Exception
|
| 168 |
+
If parsing fails.
|
| 169 |
"""
|
| 170 |
+
company_name = ""
|
| 171 |
+
job_posting_text = ""
|
| 172 |
+
|
| 173 |
+
logger.info("Parsing job description from: %s", job_description_source)
|
| 174 |
+
assert job_description_source is not None, (
|
| 175 |
+
"Job description source cannot be None"
|
| 176 |
+
)
|
| 177 |
+
|
| 178 |
+
job_description_document: Optional[Document] = await get_job_description(
|
| 179 |
+
job_description_source
|
| 180 |
+
)
|
| 181 |
+
|
| 182 |
+
# Extract company name from metadata
|
| 183 |
+
if hasattr(job_description_document, "metadata") and isinstance(
|
| 184 |
+
job_description_document.metadata, dict
|
| 185 |
+
):
|
| 186 |
+
company_name = job_description_document.metadata.get("company_name", "")
|
| 187 |
+
if not company_name:
|
| 188 |
+
logger.warning("Company name not found in job description metadata.")
|
| 189 |
+
else:
|
| 190 |
+
logger.warning(
|
| 191 |
+
"Metadata attribute missing or not a dict in job description document."
|
| 192 |
)
|
| 193 |
+
|
| 194 |
+
# Extract job posting text
|
| 195 |
+
if hasattr(job_description_document, "page_content"):
|
| 196 |
+
job_posting_text = job_description_document.page_content or ""
|
| 197 |
+
if not job_posting_text:
|
| 198 |
+
logger.info("Parsed job posting text is empty.")
|
| 199 |
+
else:
|
| 200 |
+
logger.warning(
|
| 201 |
+
"page_content attribute missing in job description document."
|
| 202 |
)
|
| 203 |
+
|
| 204 |
+
return job_posting_text, company_name
|
| 205 |
+
|
| 206 |
+
@log_async
|
| 207 |
+
async def get_application_form_details(self, job_description_source):
|
| 208 |
+
"""
|
| 209 |
+
Placeholder for future method to get application form details.
|
| 210 |
+
|
| 211 |
+
This method will be implemented to extract form fields and requirements
|
| 212 |
+
from job application forms.
|
| 213 |
+
|
| 214 |
+
Parameters
|
| 215 |
+
----------
|
| 216 |
+
job_description_source: Any
|
| 217 |
+
Source of the job description or application form.
|
| 218 |
+
"""
|
| 219 |
+
# TODO: Implement form field extraction
|
| 220 |
+
pass
|
| 221 |
+
|
| 222 |
+
# =======================================================================
|
| 223 |
+
# Validation Methods
|
| 224 |
+
# =======================================================================
|
| 225 |
+
|
| 226 |
+
@log_execution
|
| 227 |
+
@log_errors
|
| 228 |
+
def verify_inputs(self, state: DataLoadState) -> DataLoadState:
|
| 229 |
+
"""
|
| 230 |
+
Validate inputs and set next_node for routing.
|
| 231 |
+
|
| 232 |
+
This method validates that both resume and job description are present
|
| 233 |
+
in the state, normalizes their values to strings, and sets the next_node
|
| 234 |
+
field for conditional routing in the main workflow.
|
| 235 |
+
|
| 236 |
+
Parameters
|
| 237 |
+
----------
|
| 238 |
+
state: DataLoadState
|
| 239 |
+
Current workflow state containing company_research_data.
|
| 240 |
+
|
| 241 |
+
Returns
|
| 242 |
+
-------
|
| 243 |
+
DataLoadState
|
| 244 |
+
Updated state with next_node set to "load" (if validation fails)
|
| 245 |
+
or "research" (if validation passes).
|
| 246 |
+
|
| 247 |
+
Raises
|
| 248 |
+
------
|
| 249 |
+
Exception
|
| 250 |
+
If normalization fails for any field.
|
| 251 |
+
"""
|
| 252 |
+
logger.info("Verifying loaded inputs!")
|
| 253 |
+
state["current_node"] = "verify"
|
| 254 |
+
|
| 255 |
+
# Validate required fields
|
| 256 |
+
company_research_data = state.get("company_research_data", {})
|
| 257 |
+
|
| 258 |
+
if not company_research_data.get("resume"):
|
| 259 |
+
logger.error("Resume is missing in company_research_data")
|
| 260 |
+
state["next_node"] = "load" # Loop back to load subgraph
|
| 261 |
+
return state
|
| 262 |
+
|
| 263 |
+
if not company_research_data.get("job_description"):
|
| 264 |
+
logger.error("Job description is missing in company_research_data")
|
| 265 |
+
state["next_node"] = "load" # Loop back to load subgraph
|
| 266 |
+
return state
|
| 267 |
+
|
| 268 |
+
# Normalize values to strings
|
| 269 |
+
for key in ["resume", "job_description"]:
|
| 270 |
+
try:
|
| 271 |
+
value = company_research_data[key]
|
| 272 |
+
if isinstance(value, (list, tuple)):
|
| 273 |
+
company_research_data[key] = " ".join(str(x) for x in value)
|
| 274 |
+
elif isinstance(value, dict):
|
| 275 |
+
company_research_data[key] = str(value)
|
| 276 |
else:
|
| 277 |
+
company_research_data[key] = str(value)
|
| 278 |
+
except Exception as e:
|
| 279 |
+
logger.warning("Error converting %s to string: %s", key, e)
|
| 280 |
+
state["next_node"] = "load"
|
| 281 |
+
return state
|
| 282 |
+
|
| 283 |
+
# All validations passed
|
| 284 |
+
state["next_node"] = "research"
|
| 285 |
+
logger.info("Inputs verified successfully, proceeding to research")
|
| 286 |
+
return state
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 287 |
|
| 288 |
+
# =======================================================================
|
| 289 |
+
# Private Helper Methods (used by subgraph nodes)
|
| 290 |
+
# =======================================================================
|
| 291 |
|
| 292 |
+
@log_async
|
| 293 |
+
@log_errors
|
|
|
|
|
|
|
| 294 |
async def _load_resume(self, resume_source) -> str:
|
| 295 |
+
"""
|
| 296 |
+
Load resume content, raising if the source is missing.
|
| 297 |
+
|
| 298 |
+
This is a wrapper around get_resume() that validates the source first.
|
| 299 |
+
Used by subgraph nodes for consistent error handling.
|
| 300 |
+
|
| 301 |
+
Parameters
|
| 302 |
+
----------
|
| 303 |
+
resume_source: Any
|
| 304 |
+
Path or file-like object for the resume.
|
| 305 |
+
|
| 306 |
+
Returns
|
| 307 |
+
-------
|
| 308 |
+
str
|
| 309 |
+
Plain text content of the resume.
|
| 310 |
+
|
| 311 |
+
Raises
|
| 312 |
+
------
|
| 313 |
+
ValueError
|
| 314 |
+
If resume_source is None or empty.
|
| 315 |
+
"""
|
| 316 |
if not resume_source:
|
| 317 |
raise ValueError("resume_source is required")
|
| 318 |
return await self.get_resume(resume_source)
|
| 319 |
|
| 320 |
+
@log_async
|
| 321 |
+
@log_errors
|
| 322 |
async def _load_job_description(self, jd_source) -> Tuple[str, str]:
|
| 323 |
+
"""
|
| 324 |
+
Load job description text and company name, raising if missing.
|
| 325 |
+
|
| 326 |
+
This is a wrapper around parse_job_description() that validates the source first.
|
| 327 |
+
Used by subgraph nodes for consistent error handling.
|
| 328 |
+
|
| 329 |
+
Parameters
|
| 330 |
+
----------
|
| 331 |
+
jd_source: Any
|
| 332 |
+
Source for the job description (URL, file path, etc.).
|
| 333 |
+
|
| 334 |
+
Returns
|
| 335 |
+
-------
|
| 336 |
+
Tuple[str, str]
|
| 337 |
+
A tuple of (job_posting_text, company_name).
|
| 338 |
+
|
| 339 |
+
Raises
|
| 340 |
+
------
|
| 341 |
+
ValueError
|
| 342 |
+
If jd_source is None or empty.
|
| 343 |
+
"""
|
| 344 |
if not jd_source:
|
| 345 |
raise ValueError("job_description_source is required")
|
| 346 |
return await self.parse_job_description(jd_source)
|
| 347 |
|
| 348 |
+
@log_async
|
| 349 |
+
@log_errors
|
| 350 |
async def _prompt_user(self, prompt_msg: str) -> str:
|
| 351 |
+
"""
|
| 352 |
+
Prompt the user for input (synchronous input wrapped for async use).
|
|
|
|
| 353 |
|
| 354 |
+
This method wraps the synchronous input() function to be used in async contexts.
|
| 355 |
+
In a production async UI, this would be replaced with an async input mechanism.
|
| 356 |
|
| 357 |
+
Parameters
|
| 358 |
+
----------
|
| 359 |
+
prompt_msg: str
|
| 360 |
+
Message to display to the user.
|
| 361 |
|
| 362 |
+
Returns
|
| 363 |
+
-------
|
| 364 |
+
str
|
| 365 |
+
User input string.
|
| 366 |
"""
|
| 367 |
+
# In a real async UI replace input with an async call.
|
| 368 |
+
return input(prompt_msg)
|
| 369 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
|
| 371 |
+
# ============================================================================
|
| 372 |
+
# Data Loading Subgraph Nodes
|
| 373 |
+
# ============================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 375 |
|
| 376 |
+
@log_async
|
| 377 |
+
async def parse_resume_node(state: DataLoadState) -> DataLoadState:
|
| 378 |
+
"""
|
| 379 |
+
Node to parse resume in parallel with job description parsing.
|
| 380 |
|
| 381 |
+
Extracts resume parsing logic from load_inputs for parallel execution.
|
| 382 |
+
Returns only the resume data - reducer will merge with job description data.
|
| 383 |
+
"""
|
| 384 |
+
dataloading = Dataloading()
|
| 385 |
+
resume_src = state.get("resume_path")
|
| 386 |
+
|
| 387 |
+
resume_text = ""
|
| 388 |
+
if resume_src:
|
| 389 |
+
resume_text = await dataloading._load_resume(resume_src)
|
| 390 |
+
elif state.get("current_node") == "verify":
|
| 391 |
+
resume_text = await dataloading._prompt_user(
|
| 392 |
+
"Please paste the resume in text format: "
|
| 393 |
+
)
|
| 394 |
|
| 395 |
+
# Return only the resume data - reducer will merge this with job description data
|
| 396 |
+
logger.info(f"Resume parsed: {len(resume_text)} characters")
|
| 397 |
+
# Return partial state update - LangGraph will merge this with other parallel updates
|
| 398 |
+
return {
|
| 399 |
+
"company_research_data": {"resume": resume_text},
|
| 400 |
+
}
|
| 401 |
|
|
|
|
|
|
|
| 402 |
|
| 403 |
+
@log_async
|
| 404 |
+
async def parse_job_description_node(state: DataLoadState) -> DataLoadState:
|
| 405 |
+
"""
|
| 406 |
+
Node to parse job description in parallel with resume parsing.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
|
| 408 |
+
Extracts job description parsing logic from load_inputs for parallel execution.
|
| 409 |
+
Returns only the job description data - reducer will merge with resume data.
|
| 410 |
+
"""
|
| 411 |
+
dataloading = Dataloading()
|
| 412 |
+
jd_src = state.get("job_description_source")
|
| 413 |
+
|
| 414 |
+
job_text = ""
|
| 415 |
+
company_name = ""
|
| 416 |
+
if jd_src:
|
| 417 |
+
job_text, company_name = await dataloading._load_job_description(jd_src)
|
| 418 |
+
elif state.get("current_node") == "verify":
|
| 419 |
+
job_text = await dataloading._prompt_user(
|
| 420 |
+
"Please paste the job posting in text format: "
|
| 421 |
+
)
|
| 422 |
|
| 423 |
+
# Return only the job description data - reducer will merge this with resume data
|
| 424 |
+
logger.info(
|
| 425 |
+
f"Job description parsed: {len(job_text)} characters, company: {company_name}"
|
| 426 |
+
)
|
| 427 |
+
# Return partial state update - LangGraph will merge this with other parallel updates
|
| 428 |
+
return {
|
| 429 |
+
"company_research_data": {
|
| 430 |
+
"job_description": job_text,
|
| 431 |
+
"company_name": company_name,
|
| 432 |
+
},
|
| 433 |
+
}
|
| 434 |
+
|
| 435 |
+
|
| 436 |
+
@log_execution
|
| 437 |
+
def aggregate_data_loading_results(state: DataLoadState) -> DataLoadState:
|
| 438 |
+
"""
|
| 439 |
+
Aggregate results from parallel resume and job description parsing nodes.
|
| 440 |
+
|
| 441 |
+
This node runs after both parse_resume_node and parse_job_description_node
|
| 442 |
+
complete. It ensures both results are present and normalizes the state.
|
| 443 |
+
"""
|
| 444 |
+
# Ensure company_research_data exists
|
| 445 |
+
if "company_research_data" not in state:
|
| 446 |
+
state["company_research_data"] = {}
|
| 447 |
+
|
| 448 |
+
# Get results from parallel nodes
|
| 449 |
+
resume_text = state["company_research_data"].get("resume", "")
|
| 450 |
+
job_text = state["company_research_data"].get("job_description", "")
|
| 451 |
+
company_name = state["company_research_data"].get("company_name", "")
|
| 452 |
+
|
| 453 |
+
# Validate both are present
|
| 454 |
+
if not resume_text:
|
| 455 |
+
logger.warning("Resume text is empty after parsing")
|
| 456 |
+
if not job_text:
|
| 457 |
+
logger.warning("Job description text is empty after parsing")
|
| 458 |
+
|
| 459 |
+
# Ensure final structure is correct
|
| 460 |
+
state["company_research_data"] = {
|
| 461 |
+
"resume": resume_text,
|
| 462 |
+
"job_description": job_text,
|
| 463 |
+
"company_name": company_name,
|
| 464 |
+
}
|
| 465 |
+
state["current_node"] = "aggregate_results"
|
| 466 |
+
|
| 467 |
+
logger.info("Data loading results aggregated successfully")
|
| 468 |
+
return state
|
| 469 |
+
|
| 470 |
+
|
| 471 |
+
@log_execution
|
| 472 |
+
def verify_inputs_node(state: DataLoadState) -> DataLoadState:
|
| 473 |
+
"""
|
| 474 |
+
Verify that required inputs are present and set next_node for routing.
|
| 475 |
+
|
| 476 |
+
Modified from verify_inputs to return state with next_node instead of string.
|
| 477 |
+
"""
|
| 478 |
+
dataloading = Dataloading()
|
| 479 |
+
return dataloading.verify_inputs(state)
|
| 480 |
+
|
| 481 |
+
|
| 482 |
+
# ============================================================================
|
| 483 |
+
# Data Loading Subgraph
|
| 484 |
+
# ============================================================================
|
| 485 |
+
|
| 486 |
+
# Create data loading subgraph
|
| 487 |
+
data_loading_subgraph = StateGraph(DataLoadState)
|
| 488 |
+
|
| 489 |
+
# Add subgraph nodes
|
| 490 |
+
dataloading_instance = Dataloading()
|
| 491 |
+
data_loading_subgraph.add_node(
|
| 492 |
+
"set_agent_system_message", dataloading_instance.set_agent_system_message
|
| 493 |
+
)
|
| 494 |
+
data_loading_subgraph.add_node("parse_resume", parse_resume_node)
|
| 495 |
+
data_loading_subgraph.add_node("parse_job_description", parse_job_description_node)
|
| 496 |
+
data_loading_subgraph.add_node("aggregate_results", aggregate_data_loading_results)
|
| 497 |
+
data_loading_subgraph.add_node("verify_inputs", verify_inputs_node)
|
| 498 |
+
|
| 499 |
+
# Add subgraph edges
|
| 500 |
+
data_loading_subgraph.add_edge(START, "set_agent_system_message")
|
| 501 |
+
# Parallel execution: both nodes start after set_agent_system_message
|
| 502 |
+
data_loading_subgraph.add_edge("set_agent_system_message", "parse_resume")
|
| 503 |
+
data_loading_subgraph.add_edge("set_agent_system_message", "parse_job_description")
|
| 504 |
+
# Both parallel nodes feed into aggregate (LangGraph waits for both)
|
| 505 |
+
data_loading_subgraph.add_edge("parse_resume", "aggregate_results")
|
| 506 |
+
data_loading_subgraph.add_edge("parse_job_description", "aggregate_results")
|
| 507 |
+
# Aggregate feeds into verification
|
| 508 |
+
data_loading_subgraph.add_edge("aggregate_results", "verify_inputs")
|
| 509 |
+
# Verification ends the subgraph
|
| 510 |
+
data_loading_subgraph.add_edge("verify_inputs", END)
|
| 511 |
+
|
| 512 |
+
# Compile data loading subgraph
|
| 513 |
+
data_loading_workflow = data_loading_subgraph.compile()
|
src/job_writing_agent/nodes/job_description_loader.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Job Description Loader Module
|
| 4 |
+
|
| 5 |
+
This module provides the JobDescriptionLoader class responsible for loading and parsing
|
| 6 |
+
job description files and URLs, extracting both the job posting text and company name.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
from typing import Callable, Any, Optional, Tuple, Awaitable
|
| 11 |
+
|
| 12 |
+
from langchain_core.documents import Document
|
| 13 |
+
|
| 14 |
+
from job_writing_agent.utils.document_processing import get_job_description
|
| 15 |
+
from job_writing_agent.utils.logging.logging_decorators import (
|
| 16 |
+
log_async,
|
| 17 |
+
log_errors,
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class JobDescriptionLoader:
    """
    Responsible for loading and parsing job description documents.

    This class follows SOLID principles:
    - Single Responsibility: Only handles job description parsing
    - Dependency Inversion: Parser is injected for testability
    - Open/Closed: Can extend with different parsers without modification
    - Interface Segregation: Focused interface (only job description methods)

    Example:
        >>> loader = JobDescriptionLoader()
        >>> job_text, company = await loader.parse_job_description("https://example.com/job")
        >>>
        >>> # With custom parser for testing
        >>> async def mock_parser(source):
        ...     return Document(page_content="test", metadata={"company_name": "TestCo"})
        >>> loader = JobDescriptionLoader(parser=mock_parser)
    """

    def __init__(self, parser: Optional[Callable[[Any], Awaitable[Document]]] = None):
        """
        Initialize JobDescriptionLoader with optional parser dependency injection.

        Parameters
        ----------
        parser: Optional[Callable[[Any], Awaitable[Document]]]
            Async function to parse job description documents. Defaults to
            `get_job_description` from document_processing. Can be injected
            for testing or custom parsing.

            The parser should:
            - Take one argument (source: str) - URL or file path
            - Return an awaitable that resolves to a Document object
            - Document should have page_content (str) and metadata (dict)
        """
        self._parser = parser or get_job_description

    @log_async
    @log_errors
    async def parse_job_description(
        self, job_description_source: Any
    ) -> Tuple[str, str]:
        """
        Parse a job description and return its text and company name.

        Extracts both the job posting text and company name from the document.
        Company name is extracted from document metadata if available.

        Parameters
        ----------
        job_description_source: Any
            Source accepted by the parser function (URL, file path, etc.).
            Can be a URL starting with http:// or https://, or a local file path.

        Returns
        -------
        Tuple[str, str]
            A tuple of (job_posting_text, company_name).
            If company name is not found in metadata, returns empty string.

        Raises
        ------
        ValueError
            If job_description_source is None.
        Exception
            If parsing fails.
        """
        company_name = ""
        job_posting_text = ""

        logger.info("Parsing job description from: %s", job_description_source)
        # Raise (not assert) so the validation survives `python -O`; this also
        # matches _load_job_description, which raises ValueError for the same case.
        if job_description_source is None:
            raise ValueError("Job description source cannot be None")

        job_description_document: Document = await self._parser(job_description_source)

        # Extract company name from metadata
        if hasattr(job_description_document, "metadata") and isinstance(
            job_description_document.metadata, dict
        ):
            company_name = job_description_document.metadata.get("company_name", "")
            if not company_name:
                logger.warning("Company name not found in job description metadata.")
        else:
            logger.warning(
                "Metadata attribute missing or not a dict in job description document."
            )

        # Extract job posting text
        if hasattr(job_description_document, "page_content"):
            job_posting_text = job_description_document.page_content or ""
            if not job_posting_text:
                logger.info("Parsed job posting text is empty.")
        else:
            logger.warning(
                "page_content attribute missing in job description document."
            )

        return job_posting_text, company_name

    @log_async
    @log_errors
    async def _load_job_description(self, jd_source: Any) -> Tuple[str, str]:
        """
        Load job description text and company name, raising if missing.

        This is a wrapper around parse_job_description() that validates the
        source first. Used by subgraph nodes for consistent error handling.

        Parameters
        ----------
        jd_source: Any
            Source for the job description (URL, file path, etc.).

        Returns
        -------
        Tuple[str, str]
            A tuple of (job_posting_text, company_name).

        Raises
        ------
        ValueError
            If jd_source is None or empty.
        """
        if not jd_source:
            raise ValueError("job_description_source is required")
        return await self.parse_job_description(jd_source)

    @log_async
    async def get_application_form_details(self, job_description_source: Any):
        """
        Placeholder for future method to get application form details.

        This method will be implemented to extract form fields and requirements
        from job application forms.

        Parameters
        ----------
        job_description_source: Any
            Source of the job description or application form.
        """
        # TODO: Implement form field extraction
        pass

    async def _prompt_user(self) -> str:
        """
        Prompt the user for input (synchronous input wrapped for async use).

        This method wraps the synchronous input() function to be used in async
        contexts. In a production async UI, this would be replaced with an
        async input mechanism (the blocking input() call stalls the event loop).

        Note: This is a shared utility method. In a future refactoring, this
        could be extracted to a separate UserInputHelper class following the
        Interface Segregation Principle.

        Returns
        -------
        str
            User input string.
        """
        # In a real async UI replace input with an async call.
        return input("Please paste the job description in text format: ")
|
src/job_writing_agent/nodes/research_workflow.py
CHANGED
|
@@ -1,97 +1,304 @@
|
|
| 1 |
-
#
|
| 2 |
-
"""
|
| 3 |
-
This module performs the research phase of the job application writing process.
|
| 4 |
-
One of the stages is Tavily Search which will be use to search for the company
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
import logging
|
| 8 |
import json
|
| 9 |
-
|
|
|
|
| 10 |
|
|
|
|
|
|
|
| 11 |
from job_writing_agent.tools.SearchTool import TavilyResearchTool
|
| 12 |
from job_writing_agent.classes.classes import ResearchState
|
| 13 |
-
from job_writing_agent.tools.SearchTool import
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
logger = logging.getLogger(__name__)
|
| 17 |
|
| 18 |
-
#
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
| 21 |
|
| 22 |
|
| 23 |
-
|
| 24 |
-
"""
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
try:
|
| 28 |
-
#
|
| 29 |
-
|
| 30 |
-
|
| 31 |
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
tavily_search = TavilyResearchTool(
|
| 42 |
-
job_description=job_description, company_name=company_name
|
| 43 |
-
)
|
| 44 |
|
| 45 |
-
|
|
|
|
|
|
|
| 46 |
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
)
|
| 50 |
|
| 51 |
-
logger.info(list(tavily_search_queries_json.values()))
|
| 52 |
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
| 59 |
)
|
| 60 |
-
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
)
|
| 63 |
-
assert len(tavily_search_queries_json) > 0, "No search queries were attempted"
|
| 64 |
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
)
|
| 68 |
|
| 69 |
-
|
| 70 |
-
state["attempted_search_queries"] = list(tavily_search_queries_json.values())
|
| 71 |
-
state["company_research_data"]["tavily_search"] = tavily_search_results
|
| 72 |
|
| 73 |
except Exception as e:
|
| 74 |
-
logger.error(f"Error in
|
| 75 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
state["company_research_data"]["tavily_search"] = []
|
| 77 |
state["attempted_search_queries"] = []
|
| 78 |
-
finally:
|
| 79 |
return state
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
-
print("\n\n\nInitializing research workflow...\n\n\n")
|
| 83 |
# Create research subgraph
|
| 84 |
research_subgraph = StateGraph(ResearchState)
|
| 85 |
|
| 86 |
# Add research subgraph nodes
|
| 87 |
research_subgraph.add_node("research_company", research_company)
|
| 88 |
-
research_subgraph.add_node("relevance_filter",
|
| 89 |
-
|
|
|
|
|
|
|
| 90 |
|
| 91 |
# Add research subgraph edges
|
| 92 |
research_subgraph.add_edge(START, "research_company")
|
| 93 |
research_subgraph.add_edge("research_company", "relevance_filter")
|
| 94 |
-
research_subgraph.add_edge("relevance_filter",
|
|
|
|
| 95 |
|
| 96 |
# Compile research subgraph
|
| 97 |
research_workflow = research_subgraph.compile()
|
|
|
|
| 1 |
+
# research_workflow.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import logging
|
| 3 |
import json
|
| 4 |
+
import asyncio
|
| 5 |
+
from typing import Dict, Any, cast
|
| 6 |
|
| 7 |
+
from langgraph.graph import StateGraph, END, START
|
| 8 |
+
import dspy
|
| 9 |
from job_writing_agent.tools.SearchTool import TavilyResearchTool
|
| 10 |
from job_writing_agent.classes.classes import ResearchState
|
| 11 |
+
from job_writing_agent.tools.SearchTool import filter_research_results_by_relevance
|
| 12 |
+
from job_writing_agent.agents.output_schema import (
|
| 13 |
+
CompanyResearchDataSummarizationSchema,
|
| 14 |
+
)
|
| 15 |
+
from job_writing_agent.utils.llm_provider_factory import LLMFactory
|
| 16 |
|
| 17 |
logger = logging.getLogger(__name__)
|
| 18 |
|
| 19 |
+
# Configuration
|
| 20 |
+
MAX_RETRIES = 3
|
| 21 |
+
RETRY_DELAY = 2 # seconds
|
| 22 |
+
QUERY_TIMEOUT = 30 # seconds
|
| 23 |
+
EVAL_TIMEOUT = 15 # seconds per evaluation
|
| 24 |
|
| 25 |
|
| 26 |
+
def validate_research_inputs(state: ResearchState) -> tuple[bool, str, str]:
    """
    Check that the research phase has everything it needs.

    Returns
    -------
    tuple[bool, str, str]
        (is_valid, company_name, job_description); on any failure the
        name/description slots are empty strings.
    """
    try:
        research_data = state["company_research_data"]
        fields = {
            "Company name": research_data.get("company_name", ""),
            "Job description": research_data.get("job_description", ""),
        }

        # Reject if either required field is absent or whitespace-only.
        for label, value in fields.items():
            if not value or not value.strip():
                logger.error(f"{label} is missing or empty")
                return False, "", ""

        company, description = fields.values()
        return True, company.strip(), description.strip()

    except (KeyError, TypeError, AttributeError) as exc:
        logger.error(f"Invalid state structure: {exc}")
        return False, "", ""
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def parse_dspy_queries_with_fallback(
    raw_queries: Dict[str, Any], company_name: str
) -> Dict[str, str]:
    """
    Turn raw DSPy output into a mapping of query_id -> query_string.

    Falls back to generic company queries whenever the raw payload is
    missing, malformed JSON, or contains no usable query strings.
    """
    try:
        payload = (
            raw_queries.get("search_queries")
            if isinstance(raw_queries, dict)
            else None
        )

        # The model sometimes returns the queries serialized as JSON text.
        if isinstance(payload, str):
            try:
                payload = json.loads(payload)
            except json.JSONDecodeError as exc:
                logger.warning(f"JSON decode failed: {exc}. Using fallback queries.")
                return get_fallback_queries(company_name)

        if isinstance(payload, dict):
            # Keep string values as-is; for non-empty lists take the first item.
            extracted = {
                key: (value if isinstance(value, str) else str(value[0]))
                for key, value in payload.items()
                if isinstance(value, str)
                or (isinstance(value, list) and len(value) > 0)
            }
            if extracted:
                return extracted

        # Nothing usable was recovered from the payload.
        logger.warning("Could not parse DSPy queries. Using fallback.")
        return get_fallback_queries(company_name)

    except Exception as exc:
        logger.error(f"Error parsing DSPy queries: {exc}. Using fallback.")
        return get_fallback_queries(company_name)
|
| 89 |
|
|
|
|
| 90 |
|
| 91 |
+
def get_fallback_queries(company_name: str) -> Dict[str, str]:
    """Build three generic company-research queries used when DSPy fails."""
    topics = (
        "company culture and values",
        "recent news and achievements",
        "mission statement and goals",
    )
    return {
        f"query{index}": f"{company_name} {topic}"
        for index, topic in enumerate(topics, start=1)
    }
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def company_research_data_summary(state: ResearchState) -> ResearchState:
    """
    Summarize the filtered research data into a concise summary.

    Runs a DSPy ChainOfThought over ``state["company_research_data"]`` and
    stores the model's output (a JSON string) under
    ``company_research_data["company_research_data_summary"]``. The raw
    ``tavily_search`` results are left in place. Summarization is
    best-effort: on any error the state is returned unchanged.

    Parameters
    ----------
    state: ResearchState
        Workflow state; expected to contain ``company_research_data``.

    Returns
    -------
    ResearchState
        The same state object, possibly with the summary field added.
    """
    try:
        state["current_node"] = "company_research_data_summary"

        # Extract the current research data
        company_research_data = state.get("company_research_data", {})
        tavily_search_data = company_research_data.get("tavily_search", [])

        # If no research data, skip summarization
        if not tavily_search_data or len(tavily_search_data) == 0:
            logger.warning("No research data to summarize. Skipping summarization.")
            return state

        logger.info(f"Summarizing {len(tavily_search_data)} research result sets...")

        # Create DSPy summarization chain
        company_research_data_summarization = dspy.ChainOfThought(
            CompanyResearchDataSummarizationSchema
        )

        # Initialize LLM provider (low temperature for stable summaries)

        llm_provider = LLMFactory()
        llm = llm_provider.create_dspy(
            model="mistralai/mistral-7b-instruct:free",
            provider="openrouter",
            temperature=0.3,
        )

        # Generate summary using DSPy
        with dspy.context(lm=llm, adapter=dspy.JSONAdapter()):
            response = company_research_data_summarization(
                company_research_data=company_research_data
            )
        # Extract the summary from the response
        # The response should have a 'company_research_data_summary' field (JSON string)
        if hasattr(response, "company_research_data_summary"):
            summary_json_str = response.company_research_data_summary
        elif isinstance(response, dict) and "company_research_data_summary" in response:
            summary_json_str = response["company_research_data_summary"]
        else:
            logger.error(
                f"Unexpected response format from summarization: {type(response)}"
            )
            return state

        # Store the summary as-is (still a JSON string; consumers parse it later)
        state["company_research_data"]["company_research_data_summary"] = (
            summary_json_str
        )

        return state

    except Exception as e:
        logger.error(f"Error in company_research_data_summary: {e}", exc_info=True)
        # Return state unchanged on error
        return state
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
async def research_company_with_retry(state: ResearchState) -> ResearchState:
    """
    Research company with retry logic and timeouts.

    Generates Tavily search queries via the research tool, runs the searches,
    and stores the results in ``state``. Each attempt is bounded by
    ``QUERY_TIMEOUT`` (searches scale with the number of queries); up to
    ``MAX_RETRIES`` attempts are made with increasing backoff. On total
    failure the result fields are set to empty lists rather than raising.

    Parameters
    ----------
    state: ResearchState
        Workflow state; must carry company_name / job_description inside
        ``company_research_data``.

    Returns
    -------
    ResearchState
        The updated state (``attempted_search_queries`` and
        ``company_research_data["tavily_search"]`` populated or emptied).
    """
    state["current_node"] = "research_company"

    # Validate inputs
    is_valid, company_name, job_description = validate_research_inputs(state)

    if not is_valid:
        logger.error("Invalid inputs for research. Skipping research phase.")
        state["company_research_data"]["tavily_search"] = []
        state["attempted_search_queries"] = []
        return state

    logger.info(f"Researching company: {company_name}")

    # Try with retries
    for attempt in range(MAX_RETRIES):
        try:
            # Create tool instance
            tavily_search = TavilyResearchTool(
                job_description=job_description, company_name=company_name
            )

            # Generate queries with timeout; the sync tool call is pushed to a
            # worker thread so the event loop stays responsive.
            queries_task = asyncio.create_task(
                asyncio.to_thread(tavily_search.create_tavily_queries)
            )

            try:
                raw_queries = await asyncio.wait_for(
                    queries_task, timeout=QUERY_TIMEOUT
                )
            except asyncio.TimeoutError:
                logger.warning(
                    f"Query generation timed out (attempt {attempt + 1}/{MAX_RETRIES})"
                )
                if attempt < MAX_RETRIES - 1:
                    await asyncio.sleep(RETRY_DELAY)
                    continue
                else:
                    # Last attempt: re-raise so the outer handler below runs
                    # its exhausted-retries branch.
                    raise

            # Parse queries with fallback
            # Convert DSPy Prediction to dict if needed
            if hasattr(raw_queries, "dict"):
                raw_queries_dict = cast(Dict[str, Any], raw_queries.dict())
            elif hasattr(raw_queries, "__dict__"):
                raw_queries_dict = cast(Dict[str, Any], raw_queries.__dict__)
            elif isinstance(raw_queries, dict):
                raw_queries_dict = cast(Dict[str, Any], raw_queries)
            else:
                raw_queries_dict = cast(Dict[str, Any], dict(raw_queries))

            queries = parse_dspy_queries_with_fallback(raw_queries_dict, company_name)

            if not queries:
                logger.warning("No valid queries generated")
                queries = get_fallback_queries(company_name)

            logger.info(
                f"Generated {len(queries)} search queries: {list(queries.keys())}"
            )

            # Perform searches with timeout (budget scales with query count)
            search_task = asyncio.create_task(
                asyncio.to_thread(tavily_search.tavily_search_company, queries)
            )

            try:
                search_results = await asyncio.wait_for(
                    search_task, timeout=QUERY_TIMEOUT * len(queries)
                )
            except asyncio.TimeoutError:
                logger.warning(
                    f"Search timed out (attempt {attempt + 1}/{MAX_RETRIES})"
                )
                if attempt < MAX_RETRIES - 1:
                    await asyncio.sleep(RETRY_DELAY)
                    continue
                else:
                    raise

            # Validate results
            if not isinstance(search_results, list):
                logger.warning(f"Invalid search results type: {type(search_results)}")
                search_results = []

            if len(search_results) == 0:
                logger.warning("No search results returned")

            # Store results
            state["attempted_search_queries"] = list(queries.values())
            state["company_research_data"]["tavily_search"] = search_results

            logger.info(
                f"Research completed successfully with {len(search_results)} result sets"
            )
            return state

        except Exception as e:
            logger.error(
                f"Error in research_company (attempt {attempt + 1}/{MAX_RETRIES}): {e}",
                exc_info=True,
            )

            if attempt < MAX_RETRIES - 1:
                await asyncio.sleep(RETRY_DELAY * (attempt + 1))  # Exponential backoff
            else:
                logger.error("All retry attempts exhausted. Using empty results.")
                state["company_research_data"]["tavily_search"] = []
                state["attempted_search_queries"] = []

    return state
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
async def research_company(state: ResearchState) -> ResearchState:
    """Graph node entry point; delegates to the retrying implementation."""
    updated_state = await research_company_with_retry(state)
    return updated_state
|
| 285 |
+
|
| 286 |
|
|
|
|
| 287 |
# Create research subgraph over the ResearchState schema
research_subgraph = StateGraph(ResearchState)

# Add research subgraph nodes
research_subgraph.add_node("research_company", research_company)
research_subgraph.add_node("relevance_filter", filter_research_results_by_relevance)
research_subgraph.add_node(
    "company_research_data_summary", company_research_data_summary
)

# Add research subgraph edges — a strictly linear pipeline:
# START -> research_company -> relevance_filter -> summary -> END
research_subgraph.add_edge(START, "research_company")
research_subgraph.add_edge("research_company", "relevance_filter")
research_subgraph.add_edge("relevance_filter", "company_research_data_summary")
research_subgraph.add_edge("company_research_data_summary", END)

# Compile research subgraph (exported for use by the parent workflow)
research_workflow = research_subgraph.compile()
|
src/job_writing_agent/nodes/resume_loader.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Resume Loader Module
|
| 4 |
+
|
| 5 |
+
This module provides the ResumeLoader class responsible for loading and parsing
|
| 6 |
+
the resume file and returning the resume in the required format.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
from typing import Callable, Any, Optional
|
| 11 |
+
|
| 12 |
+
from job_writing_agent.utils.document_processing import parse_resume
|
| 13 |
+
from job_writing_agent.utils.logging.logging_decorators import (
|
| 14 |
+
log_async,
|
| 15 |
+
log_errors,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class ResumeLoader:
    """
    Responsible for loading and parsing resume documents.

    Example:
        >>> loader = ResumeLoader()
        >>> resume_text = await loader.get_resume("path/to/resume.pdf")
        >>>
        >>> # With custom parser for testing
        >>> mock_parser = lambda x: [Document(page_content="test")]
        >>> loader = ResumeLoader(parser=mock_parser)
    """

    def __init__(self, parser: Optional[Callable[[Any], Any]] = None):
        """
        Initialize ResumeLoader with optional parser dependency injection.

        Parameters
        ----------
        parser: Optional[Callable[[Any], Any]]
            Function to parse resume documents. Defaults to `parse_resume` from
            document_processing. Can be injected for testing or custom parsing.
        """
        self._parser = parser or parse_resume

    @log_async
    @log_errors
    async def get_resume(self, resume_source: Any) -> str:
        """
        Parse a resume file and return its plain-text content.

        This method extracts text from resume chunks, handling both Document
        objects and plain strings. Empty or invalid chunks are skipped.

        Parameters
        ----------
        resume_source: Any
            Path or file-like object accepted by the parser function.
            Can be a file path, URL, or file-like object.

        Returns
        -------
        str
            Plain text content of the resume.

        Raises
        ------
        ValueError
            If resume_source is None.
        Exception
            If parsing fails.
        """
        logger.info("Parsing resume...")
        # Raise (not assert) so the validation survives `python -O`; matches
        # _load_resume, which raises ValueError for the same condition.
        if resume_source is None:
            raise ValueError("resume_source cannot be None")

        resume_chunks = self._parser(resume_source)

        # Collect text pieces and join once at the end — avoids the quadratic
        # cost of repeated `str +=` on large resumes.
        pieces = []
        for chunk in resume_chunks:
            if hasattr(chunk, "page_content") and chunk.page_content:
                pieces.append(chunk.page_content)
            elif isinstance(chunk, str) and chunk:
                pieces.append(chunk)
            else:
                logger.debug("Skipping empty or invalid chunk in resume: %s", chunk)

        return "".join(pieces)

    @log_async
    @log_errors
    async def _load_resume(self, resume_source: Any) -> str:
        """
        Load resume content, raising if the source is missing.

        This is a wrapper around get_resume() that validates the source first.
        Used by subgraph nodes for consistent error handling.

        Parameters
        ----------
        resume_source: Any
            Path or file-like object for the resume.

        Returns
        -------
        str
            Plain text content of the resume.

        Raises
        ------
        ValueError
            If resume_source is None or empty.
        """
        if not resume_source:
            raise ValueError("resume_source is required")
        return await self.get_resume(resume_source)

    async def _prompt_user_for_resume(self) -> str:
        """
        Prompt the user for input (synchronous input wrapped for async use).

        This method wraps the synchronous input() function to be used in async
        contexts. In a production async UI, this would be replaced with an
        async input mechanism (the blocking input() call stalls the event loop).

        Note: This is a shared utility method. In a future refactoring, this
        could be extracted to a separate UserInputHelper class following the
        Interface Segregation Principle.

        Returns
        -------
        str
            User input string.
        """
        # In a real async UI replace input with an async call.
        return input("Please paste the resume in text format: ")
|
src/job_writing_agent/nodes/selfconsistency.py
CHANGED
|
@@ -4,23 +4,23 @@ import json
|
|
| 4 |
import re
|
| 5 |
|
| 6 |
from ..classes.classes import AppState
|
| 7 |
-
from ..prompts.templates import
|
| 8 |
-
DRAFT_RATING_PROMPT,
|
| 9 |
-
BEST_DRAFT_SELECTION_PROMPT
|
| 10 |
-
)
|
| 11 |
from ..utils.llm_provider_factory import LLMFactory
|
| 12 |
|
| 13 |
|
| 14 |
-
|
| 15 |
logger = logging.getLogger(__name__)
|
| 16 |
# Constants
|
| 17 |
CURRENT_DATE = datetime.now().strftime("%A, %B %d, %Y")
|
| 18 |
|
| 19 |
-
llm_factory = LLMFactory()
|
| 20 |
-
llm_precise = llm_factory.create_langchain(model="qwen/qwen3-4b:free", provider="openrouter", temperature=0.1)
|
| 21 |
|
| 22 |
def self_consistency_vote(state: AppState) -> AppState:
|
| 23 |
"""Choose the best draft from multiple variations."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
variations = state.get("variations", {"variations": []})
|
| 25 |
|
| 26 |
all_drafts = [state["draft"]] + variations["variations"]
|
|
@@ -31,7 +31,7 @@ def self_consistency_vote(state: AppState) -> AppState:
|
|
| 31 |
# Get resume and job summaries, handling different formats
|
| 32 |
try:
|
| 33 |
if isinstance(state["resume_path"], list) and len(state["resume_path"]) > 0:
|
| 34 |
-
if hasattr(state["resume_path"][0],
|
| 35 |
resume_summary = state["resume_path"][0].page_content
|
| 36 |
else:
|
| 37 |
resume_summary = state["resume_path"][0]
|
|
@@ -42,7 +42,10 @@ def self_consistency_vote(state: AppState) -> AppState:
|
|
| 42 |
resume_summary = str(state["resume_path"])
|
| 43 |
|
| 44 |
try:
|
| 45 |
-
if
|
|
|
|
|
|
|
|
|
|
| 46 |
job_summary = state["job_description_source"][0]
|
| 47 |
else:
|
| 48 |
job_summary = str(state["job_description_source"])
|
|
@@ -51,33 +54,38 @@ def self_consistency_vote(state: AppState) -> AppState:
|
|
| 51 |
job_summary = str(state["job_description_source"])
|
| 52 |
|
| 53 |
for i, draft in enumerate(all_drafts):
|
| 54 |
-
rating = llm_precise.invoke(
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
| 60 |
ratings.append(rating)
|
| 61 |
|
| 62 |
# Create a clearer, more structured prompt for draft selection
|
| 63 |
selection_prompt = BEST_DRAFT_SELECTION_PROMPT.format(
|
| 64 |
-
ratings_json=json.dumps(ratings, indent=2),
|
| 65 |
-
num_drafts=len(all_drafts)
|
| 66 |
)
|
| 67 |
|
| 68 |
# Get the selected draft index with error handling
|
| 69 |
try:
|
| 70 |
selection = llm_precise.invoke(selection_prompt).strip()
|
| 71 |
# Extract just the first number found in the response
|
| 72 |
-
number_match = re.search(r
|
| 73 |
if not number_match:
|
| 74 |
-
print(
|
|
|
|
|
|
|
| 75 |
best_draft_idx = 0
|
| 76 |
else:
|
| 77 |
best_draft_idx = int(number_match.group()) - 1
|
| 78 |
# Validate the index is in range
|
| 79 |
if best_draft_idx < 0 or best_draft_idx >= len(all_drafts):
|
| 80 |
-
print(
|
|
|
|
|
|
|
| 81 |
best_draft_idx = 0
|
| 82 |
except (ValueError, TypeError) as e:
|
| 83 |
print(f"Warning: Error selecting best draft: {e}. Using original draft.")
|
|
|
|
| 4 |
import re
|
| 5 |
|
| 6 |
from ..classes.classes import AppState
|
| 7 |
+
from ..prompts.templates import DRAFT_RATING_PROMPT, BEST_DRAFT_SELECTION_PROMPT
|
|
|
|
|
|
|
|
|
|
| 8 |
from ..utils.llm_provider_factory import LLMFactory
|
| 9 |
|
| 10 |
|
|
|
|
| 11 |
logger = logging.getLogger(__name__)
|
| 12 |
# Constants
|
| 13 |
CURRENT_DATE = datetime.now().strftime("%A, %B %d, %Y")
|
| 14 |
|
|
|
|
|
|
|
| 15 |
|
| 16 |
def self_consistency_vote(state: AppState) -> AppState:
|
| 17 |
"""Choose the best draft from multiple variations."""
|
| 18 |
+
# Create LLM inside function (lazy initialization)
|
| 19 |
+
llm_factory = LLMFactory()
|
| 20 |
+
llm_precise = llm_factory.create_langchain(
|
| 21 |
+
model="google/gemma-3-12b-it:free", provider="openrouter", temperature=0.1
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
variations = state.get("variations", {"variations": []})
|
| 25 |
|
| 26 |
all_drafts = [state["draft"]] + variations["variations"]
|
|
|
|
| 31 |
# Get resume and job summaries, handling different formats
|
| 32 |
try:
|
| 33 |
if isinstance(state["resume_path"], list) and len(state["resume_path"]) > 0:
|
| 34 |
+
if hasattr(state["resume_path"][0], "page_content"):
|
| 35 |
resume_summary = state["resume_path"][0].page_content
|
| 36 |
else:
|
| 37 |
resume_summary = state["resume_path"][0]
|
|
|
|
| 42 |
resume_summary = str(state["resume_path"])
|
| 43 |
|
| 44 |
try:
|
| 45 |
+
if (
|
| 46 |
+
isinstance(state["job_description_source"], list)
|
| 47 |
+
and len(state["job_description_source"]) > 0
|
| 48 |
+
):
|
| 49 |
job_summary = state["job_description_source"][0]
|
| 50 |
else:
|
| 51 |
job_summary = str(state["job_description_source"])
|
|
|
|
| 54 |
job_summary = str(state["job_description_source"])
|
| 55 |
|
| 56 |
for i, draft in enumerate(all_drafts):
|
| 57 |
+
rating = llm_precise.invoke(
|
| 58 |
+
DRAFT_RATING_PROMPT.format(
|
| 59 |
+
resume_summary=resume_summary,
|
| 60 |
+
job_summary=job_summary,
|
| 61 |
+
draft=draft,
|
| 62 |
+
draft_number=i + 1,
|
| 63 |
+
)
|
| 64 |
+
)
|
| 65 |
ratings.append(rating)
|
| 66 |
|
| 67 |
# Create a clearer, more structured prompt for draft selection
|
| 68 |
selection_prompt = BEST_DRAFT_SELECTION_PROMPT.format(
|
| 69 |
+
ratings_json=json.dumps(ratings, indent=2), num_drafts=len(all_drafts)
|
|
|
|
| 70 |
)
|
| 71 |
|
| 72 |
# Get the selected draft index with error handling
|
| 73 |
try:
|
| 74 |
selection = llm_precise.invoke(selection_prompt).strip()
|
| 75 |
# Extract just the first number found in the response
|
| 76 |
+
number_match = re.search(r"\d+", selection)
|
| 77 |
if not number_match:
|
| 78 |
+
print(
|
| 79 |
+
"Warning: Could not extract draft number from LLM response. Using original draft."
|
| 80 |
+
)
|
| 81 |
best_draft_idx = 0
|
| 82 |
else:
|
| 83 |
best_draft_idx = int(number_match.group()) - 1
|
| 84 |
# Validate the index is in range
|
| 85 |
if best_draft_idx < 0 or best_draft_idx >= len(all_drafts):
|
| 86 |
+
print(
|
| 87 |
+
f"Warning: Selected draft index {best_draft_idx + 1} out of range. Using original draft."
|
| 88 |
+
)
|
| 89 |
best_draft_idx = 0
|
| 90 |
except (ValueError, TypeError) as e:
|
| 91 |
print(f"Warning: Error selecting best draft: {e}. Using original draft.")
|
src/job_writing_agent/nodes/variations.py
CHANGED
|
@@ -5,7 +5,7 @@ from typing_extensions import Dict, List
|
|
| 5 |
from langchain_core.documents import Document
|
| 6 |
|
| 7 |
|
| 8 |
-
from ..classes.classes import
|
| 9 |
from ..utils.llm_provider_factory import LLMFactory
|
| 10 |
from ..prompts.templates import VARIATION_PROMPT
|
| 11 |
|
|
@@ -14,15 +14,15 @@ logger = logging.getLogger(__name__)
|
|
| 14 |
# Constants
|
| 15 |
CURRENT_DATE = datetime.now().strftime("%A, %B %d, %Y")
|
| 16 |
|
| 17 |
-
llm_provider = LLMFactory()
|
| 18 |
|
| 19 |
-
|
| 20 |
-
"qwen/qwen3-4b:free", provider="openrouter", temperature=0.3
|
| 21 |
-
)
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
def generate_variations(state: AppState) -> Dict[str, List[str]]:
|
| 25 |
"""Generate multiple variations of the draft for self-consistency voting."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
variations = []
|
| 27 |
|
| 28 |
# Get resume and job text, handling both string and Document types
|
|
@@ -70,6 +70,8 @@ def generate_variations(state: AppState) -> Dict[str, List[str]]:
|
|
| 70 |
|
| 71 |
response = configured_llm.invoke(variation)
|
| 72 |
|
|
|
|
|
|
|
| 73 |
if response and response.strip(): # Only add non-empty variations
|
| 74 |
variations.append(response)
|
| 75 |
except Exception as e:
|
|
|
|
| 5 |
from langchain_core.documents import Document
|
| 6 |
|
| 7 |
|
| 8 |
+
from ..classes.classes import ResultState
|
| 9 |
from ..utils.llm_provider_factory import LLMFactory
|
| 10 |
from ..prompts.templates import VARIATION_PROMPT
|
| 11 |
|
|
|
|
| 14 |
# Constants
|
| 15 |
CURRENT_DATE = datetime.now().strftime("%A, %B %d, %Y")
|
| 16 |
|
|
|
|
| 17 |
|
| 18 |
+
def generate_variations(state: ResultState) -> Dict[str, List[str]]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
"""Generate multiple variations of the draft for self-consistency voting."""
|
| 20 |
+
# Create LLM inside function (lazy initialization)
|
| 21 |
+
llm_provider = LLMFactory()
|
| 22 |
+
llm = llm_provider.create_langchain(
|
| 23 |
+
"google/gemma-3-27b-it:free", provider="openrouter", temperature=0.3
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
variations = []
|
| 27 |
|
| 28 |
# Get resume and job text, handling both string and Document types
|
|
|
|
| 70 |
|
| 71 |
response = configured_llm.invoke(variation)
|
| 72 |
|
| 73 |
+
print(f"Response for setting: {variation} has a response: {response}")
|
| 74 |
+
|
| 75 |
if response and response.strip(): # Only add non-empty variations
|
| 76 |
variations.append(response)
|
| 77 |
except Exception as e:
|
src/job_writing_agent/prompts/templates.py
CHANGED
|
@@ -5,7 +5,11 @@ This module contains all prompt templates used throughout the job application
|
|
| 5 |
generation process, organized by task.
|
| 6 |
"""
|
| 7 |
|
| 8 |
-
from langchain_core.prompts import
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
from langchain_core.messages import SystemMessage, HumanMessage
|
| 10 |
|
| 11 |
# Persona selection prompts
|
|
@@ -201,19 +205,26 @@ Example: If draft #2 is best, return ONLY '2'.
|
|
| 201 |
|
| 202 |
REVISION_PROMPT: ChatPromptTemplate = ChatPromptTemplate.from_messages(
|
| 203 |
[
|
| 204 |
-
|
| 205 |
-
|
| 206 |
),
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
{draft}
|
| 211 |
-
|
| 212 |
-
|
|
|
|
| 213 |
{feedback}
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
"""
|
| 218 |
),
|
| 219 |
]
|
|
|
|
| 5 |
generation process, organized by task.
|
| 6 |
"""
|
| 7 |
|
| 8 |
+
from langchain_core.prompts import (
|
| 9 |
+
ChatPromptTemplate,
|
| 10 |
+
SystemMessagePromptTemplate,
|
| 11 |
+
HumanMessagePromptTemplate,
|
| 12 |
+
)
|
| 13 |
from langchain_core.messages import SystemMessage, HumanMessage
|
| 14 |
|
| 15 |
# Persona selection prompts
|
|
|
|
| 205 |
|
| 206 |
REVISION_PROMPT: ChatPromptTemplate = ChatPromptTemplate.from_messages(
|
| 207 |
[
|
| 208 |
+
SystemMessagePromptTemplate.from_template(
|
| 209 |
+
"You are an expert job application writer. Revise the draft based on BOTH the self-evaluation and external feedback provided."
|
| 210 |
),
|
| 211 |
+
HumanMessagePromptTemplate.from_template(
|
| 212 |
+
"""
|
| 213 |
+
--------------------------------Original Draft--------------------------------
|
| 214 |
{draft}
|
| 215 |
+
----------------------------------------------------------------------------------------
|
| 216 |
+
|
| 217 |
+
--------------------------------Candidate Feedback--------------------------------
|
| 218 |
{feedback}
|
| 219 |
+
----------------------------------------------------------------------------------------
|
| 220 |
+
|
| 221 |
+
--------------------------------Critique Feedback--------------------------------
|
| 222 |
+
{critique_feedback}
|
| 223 |
+
----------------------------------------------------------------------------------------
|
| 224 |
+
|
| 225 |
+
Based on the self evaluation in the Original Draft, Critique Feedback and the Candidates' Feedback, revise the content taking essence of the self evaluation, Critique Feedback and the Candidates' Feedback into account. Do not repeat the same content from the Original Draft, Critique Feedback and the Candidates' Feedback.
|
| 226 |
+
|
| 227 |
+
Return the content of the revised draft. Make sure the output is only the content that is the revised content and nothing else.
|
| 228 |
"""
|
| 229 |
),
|
| 230 |
]
|
src/job_writing_agent/prompts/test_prompts.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_core.prompts import (
|
| 2 |
+
ChatPromptTemplate,
|
| 3 |
+
HumanMessagePromptTemplate,
|
| 4 |
+
SystemMessagePromptTemplate,
|
| 5 |
+
)
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
REVISION_PROMPT: ChatPromptTemplate = ChatPromptTemplate.from_messages(
|
| 9 |
+
[
|
| 10 |
+
SystemMessagePromptTemplate.from_template(
|
| 11 |
+
"You are an expert job application writer. Revise the draft based on BOTH the self-evaluation and external feedback provided."
|
| 12 |
+
),
|
| 13 |
+
HumanMessagePromptTemplate.from_template(
|
| 14 |
+
"""
|
| 15 |
+
# Original Draft Content with Evaluation Section at the end
|
| 16 |
+
{draft}
|
| 17 |
+
|
| 18 |
+
# Candidates' Feedback (Human Feedback)
|
| 19 |
+
{feedback}
|
| 20 |
+
|
| 21 |
+
# Critique Feedback (AI Feedback)
|
| 22 |
+
{critique_feedback}
|
| 23 |
+
|
| 24 |
+
Based on the self evaluation in the Original Draft, Critique Feedback and the Candidates' Feedback, revise the content taking essence of the self evaluation, Critique Feedback and the Candidates' Feedback into account. Do not repeat the same content from the Original Draft, Critique Feedback and the Candidates' Feedback.
|
| 25 |
+
|
| 26 |
+
Return the content of the revised draft. Make sure the output is only the content that is the revised content and nothing else.
|
| 27 |
+
"""
|
| 28 |
+
),
|
| 29 |
+
]
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
print(
|
| 33 |
+
REVISION_PROMPT.format_messages(
|
| 34 |
+
draft="Hello, how are you?",
|
| 35 |
+
feedback="I like your draft.",
|
| 36 |
+
critique_feedback="Your draft is good.",
|
| 37 |
+
)
|
| 38 |
+
)
|
src/job_writing_agent/tools/SearchTool.py
CHANGED
|
@@ -6,37 +6,40 @@ from pathlib import Path
|
|
| 6 |
|
| 7 |
from langchain_tavily import TavilySearch
|
| 8 |
from openevals.llm import create_async_llm_as_judge
|
| 9 |
-
from openevals.prompts import
|
| 10 |
-
RAG_RETRIEVAL_RELEVANCE_PROMPT,
|
| 11 |
-
RAG_HELPFULNESS_PROMPT
|
| 12 |
-
)
|
| 13 |
import dspy
|
| 14 |
|
| 15 |
from ..agents.output_schema import TavilySearchQueries
|
| 16 |
from ..classes.classes import ResearchState
|
| 17 |
from ..utils.llm_provider_factory import LLMFactory
|
| 18 |
|
|
|
|
| 19 |
logger = logging.getLogger(__name__)
|
| 20 |
|
| 21 |
|
| 22 |
-
env_path = Path(__file__).parent /
|
| 23 |
load_dotenv(dotenv_path=env_path, override=True)
|
| 24 |
|
| 25 |
|
| 26 |
openrouter_api_key = os.environ["OPENROUTER_API_KEY"]
|
| 27 |
|
| 28 |
-
llm_provider = LLMFactory()
|
| 29 |
-
|
| 30 |
|
| 31 |
class TavilyResearchTool:
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
self.job_description = job_description
|
| 38 |
self.company_name = company_name
|
| 39 |
-
self.tavily_searchtool
|
| 40 |
|
| 41 |
def create_tavily_queries(self):
|
| 42 |
"""
|
|
@@ -46,101 +49,222 @@ class TavilyResearchTool:
|
|
| 46 |
"""
|
| 47 |
tavily_query_generator = dspy.ChainOfThought(TavilySearchQueries)
|
| 48 |
with dspy.context(lm=self.dspy_llm, adapter=dspy.JSONAdapter()):
|
| 49 |
-
response = tavily_query_generator(
|
|
|
|
|
|
|
| 50 |
return response
|
| 51 |
|
| 52 |
-
|
| 53 |
def tavily_search_company(self, queries):
|
| 54 |
-
|
| 55 |
query_results: list[list[str]] = []
|
| 56 |
for query in queries:
|
| 57 |
try:
|
| 58 |
-
search_query_response = self.tavily_searchtool.invoke(
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
# print(f"Tavily Search Tool Response for query '{search_query_response['query']}': {query_results_map[search_query_response['query']]}")
|
| 61 |
except Exception as e:
|
| 62 |
-
logger.error(
|
|
|
|
|
|
|
| 63 |
continue
|
| 64 |
|
| 65 |
return query_results
|
| 66 |
|
| 67 |
-
llm_structured = llm_provider.create_langchain("llama3.1-8b",
|
| 68 |
-
provider="cerebras",
|
| 69 |
-
temperature=0.3)
|
| 70 |
|
| 71 |
def get_relevance_evaluator():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
return create_async_llm_as_judge(
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
|
| 78 |
|
| 79 |
def get_helpfulness_evaluator():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
return create_async_llm_as_judge(
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
async def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
try:
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
-
|
| 106 |
-
semaphore = asyncio.Semaphore(2)
|
| 107 |
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
relevance_evaluator = get_relevance_evaluator()
|
| 112 |
-
eval_result = await relevance_evaluator(
|
| 113 |
-
inputs=input_query, context=query_result_item # context is the whole result block for the query
|
| 114 |
-
)
|
| 115 |
-
return query_result_item, eval_result
|
| 116 |
|
| 117 |
-
|
| 118 |
-
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
# Process tasks as they complete
|
| 123 |
-
for completed_task in asyncio.as_completed(tasks):
|
| 124 |
-
query_result_item, eval_result = await completed_task
|
| 125 |
-
# logger.info(f"Evaluated query result for '{query_result_item}': {eval_result}")
|
| 126 |
-
if eval_result.get("score"): # Safely check for score
|
| 127 |
-
if isinstance(query_result_item, list):
|
| 128 |
-
filtered_search_results.extend(query_result_item)
|
| 129 |
-
else:
|
| 130 |
-
# Handle cases where "results" might not be a list or is missing
|
| 131 |
-
logger.warning("Expected a list in query_result_item, got: %s", type(query_result_item))
|
| 132 |
|
| 133 |
-
#
|
| 134 |
-
|
|
|
|
|
|
|
| 135 |
|
| 136 |
-
logger.info(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
|
| 138 |
return state
|
| 139 |
|
| 140 |
except Exception as e:
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
traceback.print_exc()
|
| 144 |
-
logger.error(f"Error in relevance_filter: {str(e)}")
|
| 145 |
-
# Return original state to avoid breaking the flow
|
| 146 |
return state
|
|
|
|
| 6 |
|
| 7 |
from langchain_tavily import TavilySearch
|
| 8 |
from openevals.llm import create_async_llm_as_judge
|
| 9 |
+
from openevals.prompts import RAG_RETRIEVAL_RELEVANCE_PROMPT, RAG_HELPFULNESS_PROMPT
|
|
|
|
|
|
|
|
|
|
| 10 |
import dspy
|
| 11 |
|
| 12 |
from ..agents.output_schema import TavilySearchQueries
|
| 13 |
from ..classes.classes import ResearchState
|
| 14 |
from ..utils.llm_provider_factory import LLMFactory
|
| 15 |
|
| 16 |
+
|
| 17 |
logger = logging.getLogger(__name__)
|
| 18 |
|
| 19 |
|
| 20 |
+
env_path = Path(__file__).parent / ".env"
|
| 21 |
load_dotenv(dotenv_path=env_path, override=True)
|
| 22 |
|
| 23 |
|
| 24 |
openrouter_api_key = os.environ["OPENROUTER_API_KEY"]
|
| 25 |
|
|
|
|
|
|
|
| 26 |
|
| 27 |
class TavilyResearchTool:
|
| 28 |
+
def __init__(
|
| 29 |
+
self,
|
| 30 |
+
job_description,
|
| 31 |
+
company_name,
|
| 32 |
+
max_results=5,
|
| 33 |
+
model_name="mistralai/mistral-7b-instruct:free",
|
| 34 |
+
):
|
| 35 |
+
# Create LLM inside __init__ (lazy initialization)
|
| 36 |
+
llm_provider = LLMFactory()
|
| 37 |
+
self.dspy_llm = llm_provider.create_dspy(
|
| 38 |
+
model=model_name, provider="openrouter", temperature=0.3
|
| 39 |
+
)
|
| 40 |
self.job_description = job_description
|
| 41 |
self.company_name = company_name
|
| 42 |
+
self.tavily_searchtool = TavilySearch(max_results=max_results)
|
| 43 |
|
| 44 |
def create_tavily_queries(self):
|
| 45 |
"""
|
|
|
|
| 49 |
"""
|
| 50 |
tavily_query_generator = dspy.ChainOfThought(TavilySearchQueries)
|
| 51 |
with dspy.context(lm=self.dspy_llm, adapter=dspy.JSONAdapter()):
|
| 52 |
+
response = tavily_query_generator(
|
| 53 |
+
job_description=self.job_description, company_name=self.company_name
|
| 54 |
+
)
|
| 55 |
return response
|
| 56 |
|
|
|
|
| 57 |
def tavily_search_company(self, queries):
|
|
|
|
| 58 |
query_results: list[list[str]] = []
|
| 59 |
for query in queries:
|
| 60 |
try:
|
| 61 |
+
search_query_response = self.tavily_searchtool.invoke(
|
| 62 |
+
{"query": queries[query]}
|
| 63 |
+
)
|
| 64 |
+
query_results.append(
|
| 65 |
+
[res["content"] for res in search_query_response["results"]]
|
| 66 |
+
)
|
| 67 |
# print(f"Tavily Search Tool Response for query '{search_query_response['query']}': {query_results_map[search_query_response['query']]}")
|
| 68 |
except Exception as e:
|
| 69 |
+
logger.error(
|
| 70 |
+
f"Failed to perform company research using TavilySearchTool. Error : {e}"
|
| 71 |
+
)
|
| 72 |
continue
|
| 73 |
|
| 74 |
return query_results
|
| 75 |
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
def get_relevance_evaluator():
|
| 78 |
+
"""
|
| 79 |
+
Create an LLM-as-judge evaluator for relevance filtering.
|
| 80 |
+
|
| 81 |
+
Creates the LLM on-demand (lazy initialization) to avoid startup delays.
|
| 82 |
+
"""
|
| 83 |
+
# Create LLM inside function (lazy initialization)
|
| 84 |
+
llm_provider = LLMFactory()
|
| 85 |
+
llm_structured = llm_provider.create_langchain(
|
| 86 |
+
"llama3.1-8b", provider="cerebras", temperature=0.3
|
| 87 |
+
)
|
| 88 |
return create_async_llm_as_judge(
|
| 89 |
+
judge=llm_structured,
|
| 90 |
+
prompt=RAG_RETRIEVAL_RELEVANCE_PROMPT,
|
| 91 |
+
feedback_key="retrieval_relevance",
|
| 92 |
+
)
|
| 93 |
|
| 94 |
|
| 95 |
def get_helpfulness_evaluator():
|
| 96 |
+
"""
|
| 97 |
+
Create an LLM-as-judge evaluator for helpfulness filtering.
|
| 98 |
+
|
| 99 |
+
Creates the LLM on-demand (lazy initialization) to avoid startup delays.
|
| 100 |
+
"""
|
| 101 |
+
# Create LLM inside function (lazy initialization)
|
| 102 |
+
llm_provider = LLMFactory()
|
| 103 |
+
llm_structured = llm_provider.create_langchain(
|
| 104 |
+
"llama3.1-8b", provider="cerebras", temperature=0.3
|
| 105 |
+
)
|
| 106 |
return create_async_llm_as_judge(
|
| 107 |
+
judge=llm_structured,
|
| 108 |
+
prompt=RAG_HELPFULNESS_PROMPT
|
| 109 |
+
+ '\nReturn "true" if the answer is helpful, and "false" otherwise.',
|
| 110 |
+
feedback_key="helpfulness",
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
async def filter_research_results_by_relevance(state: ResearchState) -> ResearchState:
|
| 115 |
+
"""
|
| 116 |
+
Filter search results to keep only relevant company information.
|
| 117 |
+
Uses LLM-as-judge to evaluate if each result set is relevant to its query.
|
| 118 |
+
Irrelevant results are REMOVED from the final output.
|
| 119 |
+
"""
|
| 120 |
try:
|
| 121 |
+
state["current_node"] = "filter_research_results_by_relevance"
|
| 122 |
+
|
| 123 |
+
# Extract search data from state
|
| 124 |
+
raw_search_results = state.get("company_research_data", {}).get(
|
| 125 |
+
"tavily_search", []
|
| 126 |
+
)
|
| 127 |
+
search_queries_used = state.get("attempted_search_queries", [])
|
| 128 |
+
|
| 129 |
+
# Validate data types
|
| 130 |
+
if not isinstance(raw_search_results, list):
|
| 131 |
+
logger.warning(f"Invalid search results type: {type(raw_search_results)}")
|
| 132 |
+
return state
|
| 133 |
+
|
| 134 |
+
if not isinstance(search_queries_used, list):
|
| 135 |
+
logger.warning(f"Invalid queries type: {type(search_queries_used)}")
|
| 136 |
+
search_queries_used = []
|
| 137 |
+
|
| 138 |
+
# Early exit if no results
|
| 139 |
+
if len(raw_search_results) == 0:
|
| 140 |
+
logger.info("No search results to filter.")
|
| 141 |
+
state["company_research_data"]["tavily_search"] = []
|
| 142 |
+
return state
|
| 143 |
+
|
| 144 |
+
logger.info(
|
| 145 |
+
f"Starting relevance filtering for {len(raw_search_results)} result sets..."
|
| 146 |
+
)
|
| 147 |
+
|
| 148 |
+
# Track filtering statistics
|
| 149 |
+
results_kept = []
|
| 150 |
+
results_removed_count = 0
|
| 151 |
+
evaluation_errors_count = 0
|
| 152 |
+
|
| 153 |
+
# Limit concurrent evaluations to prevent rate limiting
|
| 154 |
+
concurrency_limiter = asyncio.Semaphore(2)
|
| 155 |
+
|
| 156 |
+
async def evaluate_result_set_relevance(
|
| 157 |
+
search_result_content, original_query: str
|
| 158 |
+
):
|
| 159 |
+
"""
|
| 160 |
+
Evaluate if a search result set is relevant to its query.
|
| 161 |
+
|
| 162 |
+
Returns:
|
| 163 |
+
tuple: (search_result_content, is_relevant: bool, error: str|None)
|
| 164 |
+
"""
|
| 165 |
+
async with concurrency_limiter:
|
| 166 |
+
try:
|
| 167 |
+
# Skip empty result sets
|
| 168 |
+
if not search_result_content:
|
| 169 |
+
logger.debug(
|
| 170 |
+
f"Skipping empty result set for query: {original_query[:50]}..."
|
| 171 |
+
)
|
| 172 |
+
return (None, False, "empty")
|
| 173 |
+
|
| 174 |
+
# Create relevance evaluator
|
| 175 |
+
llm_relevance_judge = get_relevance_evaluator()
|
| 176 |
+
|
| 177 |
+
# Evaluate with timeout protection
|
| 178 |
+
evaluation_task = llm_relevance_judge(
|
| 179 |
+
inputs=original_query, context=search_result_content
|
| 180 |
+
)
|
| 181 |
+
|
| 182 |
+
evaluation_result = await asyncio.wait_for(
|
| 183 |
+
evaluation_task, timeout=15
|
| 184 |
+
)
|
| 185 |
+
|
| 186 |
+
# Extract relevance score (True = relevant, False = not relevant)
|
| 187 |
+
is_result_relevant = bool(evaluation_result.get("score", False))
|
| 188 |
+
|
| 189 |
+
if is_result_relevant:
|
| 190 |
+
logger.debug(
|
| 191 |
+
f"KEPT: Result relevant for query: {original_query[:60]}..."
|
| 192 |
+
)
|
| 193 |
+
return (search_result_content, True, None)
|
| 194 |
+
else:
|
| 195 |
+
logger.debug(
|
| 196 |
+
f"REMOVED: Result not relevant for query: {original_query[:60]}..."
|
| 197 |
+
)
|
| 198 |
+
return (None, False, None)
|
| 199 |
+
|
| 200 |
+
except asyncio.TimeoutError:
|
| 201 |
+
logger.warning(
|
| 202 |
+
f"Evaluation timed out for query: {original_query[:60]}... (KEEPING result)"
|
| 203 |
+
)
|
| 204 |
+
return (search_result_content, True, "timeout")
|
| 205 |
+
|
| 206 |
+
except Exception as e:
|
| 207 |
+
logger.error(
|
| 208 |
+
f"Evaluation failed for query: {original_query[:60]}... - {e} (KEEPING result)"
|
| 209 |
+
)
|
| 210 |
+
return (search_result_content, True, f"error:{str(e)}")
|
| 211 |
+
|
| 212 |
+
# Create evaluation tasks for all result sets
|
| 213 |
+
evaluation_tasks = []
|
| 214 |
+
for result_set, query in zip(raw_search_results, search_queries_used):
|
| 215 |
+
task = evaluate_result_set_relevance(result_set, query)
|
| 216 |
+
evaluation_tasks.append(task)
|
| 217 |
+
|
| 218 |
+
# Execute all evaluations concurrently
|
| 219 |
+
all_evaluation_results = await asyncio.gather(
|
| 220 |
+
*evaluation_tasks, return_exceptions=True
|
| 221 |
+
)
|
| 222 |
+
|
| 223 |
+
# Process evaluation results and separate kept vs removed
|
| 224 |
+
for eval_result in all_evaluation_results:
|
| 225 |
+
# Handle exceptions from gather
|
| 226 |
+
if isinstance(eval_result, Exception):
|
| 227 |
+
logger.error(f"Evaluation task failed with exception: {eval_result}")
|
| 228 |
+
evaluation_errors_count += 1
|
| 229 |
+
continue
|
| 230 |
|
| 231 |
+
# Type guard: eval_result is now guaranteed to be a tuple
|
| 232 |
+
if not isinstance(eval_result, tuple) or len(eval_result) != 3:
|
| 233 |
+
logger.error(
|
| 234 |
+
f"Unexpected evaluation result format: {type(eval_result)}"
|
| 235 |
+
)
|
| 236 |
+
evaluation_errors_count += 1
|
| 237 |
+
continue
|
| 238 |
|
| 239 |
+
result_content, is_relevant, error = eval_result
|
|
|
|
| 240 |
|
| 241 |
+
# Track errors
|
| 242 |
+
if error:
|
| 243 |
+
evaluation_errors_count += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
|
| 245 |
+
# Keep relevant results, discard irrelevant ones
|
| 246 |
+
if result_content is not None and is_relevant:
|
| 247 |
+
results_kept.append(result_content)
|
| 248 |
+
else:
|
| 249 |
+
results_removed_count += 1
|
| 250 |
|
| 251 |
+
# Update state with ONLY the relevant results
|
| 252 |
+
state["company_research_data"]["tavily_search"] = results_kept
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
|
| 254 |
+
# Log filtering summary
|
| 255 |
+
total_evaluated = len(raw_search_results)
|
| 256 |
+
kept_count = len(results_kept)
|
| 257 |
+
removed_count = results_removed_count
|
| 258 |
|
| 259 |
+
logger.info(
|
| 260 |
+
f"Relevance filtering complete: "
|
| 261 |
+
f"KEPT {kept_count} | REMOVED {removed_count} | TOTAL {total_evaluated} "
|
| 262 |
+
f"({evaluation_errors_count} evaluation errors)"
|
| 263 |
+
)
|
| 264 |
|
| 265 |
return state
|
| 266 |
|
| 267 |
except Exception as e:
|
| 268 |
+
logger.error(f"Critical error in relevance filtering: {e}", exc_info=True)
|
| 269 |
+
# On critical error, return original state unchanged
|
|
|
|
|
|
|
|
|
|
| 270 |
return state
|
src/job_writing_agent/tools/__init__.py
CHANGED
|
@@ -4,6 +4,6 @@ Created on Mon Oct 23 16:49:52 2023
|
|
| 4 |
@author: rishabhaggarwal
|
| 5 |
"""
|
| 6 |
|
| 7 |
-
from .SearchTool import
|
| 8 |
|
| 9 |
-
__all__ = ["
|
|
|
|
| 4 |
@author: rishabhaggarwal
|
| 5 |
"""
|
| 6 |
|
| 7 |
+
from .SearchTool import filter_research_results_by_relevance
|
| 8 |
|
| 9 |
+
__all__ = ["filter_research_results_by_relevance"]
|
src/job_writing_agent/utils/application_cli_interface.py
CHANGED
|
@@ -1,12 +1,12 @@
|
|
| 1 |
import argparse
|
| 2 |
import os
|
| 3 |
-
from typing import
|
| 4 |
|
| 5 |
import requests
|
| 6 |
from requests.exceptions import RequestException
|
| 7 |
|
| 8 |
|
| 9 |
-
DEFAULT_MODEL = "
|
| 10 |
DEFAULT_CONTENT_TYPE = "cover_letter"
|
| 11 |
|
| 12 |
|
|
|
|
| 1 |
import argparse
|
| 2 |
import os
|
| 3 |
+
from typing import Iterable
|
| 4 |
|
| 5 |
import requests
|
| 6 |
from requests.exceptions import RequestException
|
| 7 |
|
| 8 |
|
| 9 |
+
DEFAULT_MODEL = "mistralai/mistral-7b-instruct:free"
|
| 10 |
DEFAULT_CONTENT_TYPE = "cover_letter"
|
| 11 |
|
| 12 |
|
src/job_writing_agent/utils/document_processing.py
CHANGED
|
@@ -13,54 +13,66 @@ from typing_extensions import Dict, List, Any
|
|
| 13 |
import dspy
|
| 14 |
from langchain_community.document_loaders import PyPDFLoader, AsyncChromiumLoader
|
| 15 |
from langchain_community.document_transformers import Html2TextTransformer
|
| 16 |
-
from langchain_text_splitters import
|
| 17 |
-
|
|
|
|
|
|
|
| 18 |
from langchain_core.documents import Document
|
| 19 |
from langfuse import observe
|
| 20 |
from pydantic import BaseModel, Field
|
| 21 |
|
| 22 |
# Local imports - using relative imports
|
| 23 |
from .errors import URLExtractionError, LLMProcessingError, JobDescriptionParsingError
|
| 24 |
-
from .llm_provider_factory import LLMFactory
|
| 25 |
|
| 26 |
# Set up logging
|
| 27 |
logger = logging.getLogger(__name__)
|
| 28 |
logging.basicConfig(level=logging.INFO)
|
| 29 |
|
| 30 |
-
llm_provider = LLMFactory()
|
| 31 |
-
|
| 32 |
-
llm = llm_provider.create_langchain("qwen-3-32b",
|
| 33 |
-
provider="cerebras",
|
| 34 |
-
temperature=0.3,
|
| 35 |
-
)
|
| 36 |
-
|
| 37 |
# Default paths
|
| 38 |
DEFAULT_RESUME_PATH: str = os.getenv("DEFAULT_RESUME_PATH", "")
|
| 39 |
|
| 40 |
|
| 41 |
# Most Occurring Resume Section Headers
|
| 42 |
RESUME_SECTIONS: list[str] = [
|
| 43 |
-
"EDUCATION",
|
| 44 |
-
"
|
| 45 |
-
"
|
| 46 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
]
|
| 48 |
|
| 49 |
|
| 50 |
class ResumeSection(BaseModel):
|
| 51 |
"""Model for a structured resume section."""
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
| 53 |
content: str = Field(description="The full content of this section")
|
| 54 |
|
| 55 |
|
| 56 |
class StructuredResume(BaseModel):
|
| 57 |
"""Model for a structured resume with sections."""
|
|
|
|
| 58 |
sections: List[ResumeSection] = Field(description="List of resume sections")
|
| 59 |
-
contact_info: Dict[str, str] = Field(
|
|
|
|
|
|
|
| 60 |
|
| 61 |
|
| 62 |
class JobDescriptionComponents(BaseModel):
|
| 63 |
"""Model for job description components."""
|
|
|
|
| 64 |
company_name: str = Field(description="The company name")
|
| 65 |
job_description: str = Field(description="The job description")
|
| 66 |
reasoning: str = Field(description="The reasoning for the extracted information")
|
|
@@ -72,8 +84,13 @@ class ExtractJobDescription(dspy.Signature):
|
|
| 72 |
Role Introduction,Qualifications and Requirements, Prefrred Qualifications, Salary, Location.
|
| 73 |
Do not alter the content of the job description.
|
| 74 |
"""
|
| 75 |
-
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
job_role = dspy.OutputField(desc="The job role in the posting.")
|
| 78 |
company_name = dspy.OutputField(desc="Company Name of the Job listing.")
|
| 79 |
location = dspy.OutputField(desc="The location for the provided job posting.")
|
|
@@ -90,19 +107,20 @@ def clean_resume_text(text: str) -> str:
|
|
| 90 |
Cleaned text
|
| 91 |
"""
|
| 92 |
# Remove excessive whitespace
|
| 93 |
-
text = re.sub(r
|
| 94 |
|
| 95 |
# Fix common PDF extraction issues
|
| 96 |
-
text = re.sub(r
|
| 97 |
|
| 98 |
# Remove header/footer page numbers
|
| 99 |
-
text = re.sub(r
|
| 100 |
|
| 101 |
# Replace bullet variations with standard markdown bullets
|
| 102 |
-
text = re.sub(r
|
| 103 |
|
| 104 |
return text.strip()
|
| 105 |
|
|
|
|
| 106 |
@observe()
|
| 107 |
def extract_contact_info(text: str) -> Dict[str, str]:
|
| 108 |
"""Extract contact information from resume text.
|
|
@@ -116,28 +134,33 @@ def extract_contact_info(text: str) -> Dict[str, str]:
|
|
| 116 |
contact_info = {}
|
| 117 |
|
| 118 |
# Extract email
|
| 119 |
-
email_match = re.search(
|
|
|
|
|
|
|
| 120 |
if email_match:
|
| 121 |
-
contact_info[
|
| 122 |
|
| 123 |
# Extract phone (various formats)
|
| 124 |
-
phone_match = re.search(
|
|
|
|
|
|
|
| 125 |
if phone_match:
|
| 126 |
-
contact_info[
|
| 127 |
|
| 128 |
# Extract LinkedIn URL
|
| 129 |
-
linkedin_match = re.search(r
|
| 130 |
if linkedin_match:
|
| 131 |
-
contact_info[
|
| 132 |
|
| 133 |
# Try to extract name (this is approximate and might need LLM for better accuracy)
|
| 134 |
# Typically name appears at the top of the resume
|
| 135 |
-
first_line = text.strip().split(
|
| 136 |
if len(first_line) < 40 and not any(char.isdigit() for char in first_line):
|
| 137 |
-
contact_info[
|
| 138 |
|
| 139 |
return contact_info
|
| 140 |
|
|
|
|
| 141 |
@observe()
|
| 142 |
def identify_resume_sections(text: str) -> List[Dict[str, Any]]:
|
| 143 |
"""Identify sections in a resume text.
|
|
@@ -174,15 +197,21 @@ def identify_resume_sections(text: str) -> List[Dict[str, Any]]:
|
|
| 174 |
|
| 175 |
# Regex-based section identification
|
| 176 |
# Create a pattern that matches common section headers
|
| 177 |
-
section_pattern =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
matches = list(re.finditer(section_pattern, text, re.IGNORECASE))
|
| 179 |
|
| 180 |
if not matches:
|
| 181 |
# If no sections found, treat the whole resume as one section
|
| 182 |
-
sections.append(
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
|
|
|
|
|
|
| 186 |
return sections
|
| 187 |
|
| 188 |
# Process each section
|
|
@@ -191,15 +220,12 @@ def identify_resume_sections(text: str) -> List[Dict[str, Any]]:
|
|
| 191 |
start_pos = match.start()
|
| 192 |
|
| 193 |
# Find the end position (start of next section or end of text)
|
| 194 |
-
end_pos = matches[i+1].start() if i < len(matches) - 1 else len(text)
|
| 195 |
|
| 196 |
# Extract section content (excluding the header)
|
| 197 |
section_content = text[start_pos:end_pos].strip()
|
| 198 |
|
| 199 |
-
sections.append({
|
| 200 |
-
"title": section_title.lower(),
|
| 201 |
-
"content": section_content
|
| 202 |
-
})
|
| 203 |
|
| 204 |
return sections
|
| 205 |
|
|
@@ -211,11 +237,8 @@ def _collapse_ws(text: str) -> str:
|
|
| 211 |
|
| 212 |
|
| 213 |
def _is_heading(line: str) -> bool:
|
| 214 |
-
return (
|
| 215 |
-
|
| 216 |
-
and len(line.split()) <= 5
|
| 217 |
-
and not re.search(r"\d", line)
|
| 218 |
-
)
|
| 219 |
|
| 220 |
def parse_resume(file_path: str | Path) -> List[Document]:
|
| 221 |
"""
|
|
@@ -225,11 +248,13 @@ def parse_resume(file_path: str | Path) -> List[Document]:
|
|
| 225 |
file_extension = Path(file_path).suffix.lower()
|
| 226 |
|
| 227 |
# Handle different file types
|
| 228 |
-
if file_extension ==
|
| 229 |
-
text =
|
| 230 |
-
|
|
|
|
|
|
|
| 231 |
try:
|
| 232 |
-
with open(file_path,
|
| 233 |
text = f.read()
|
| 234 |
if not text.strip():
|
| 235 |
raise ValueError("File is empty")
|
|
@@ -237,27 +262,26 @@ def parse_resume(file_path: str | Path) -> List[Document]:
|
|
| 237 |
logger.error(f"Error reading text file: {str(e)}")
|
| 238 |
raise ValueError(f"Could not read text file: {file_path}. Error: {str(e)}")
|
| 239 |
else:
|
| 240 |
-
raise ValueError(
|
|
|
|
|
|
|
| 241 |
|
| 242 |
text = _collapse_ws(text)
|
| 243 |
|
| 244 |
# Tag headings with "###" so Markdown splitter can see them
|
| 245 |
-
tagged_lines = [
|
| 246 |
-
f"### {ln}" if _is_heading(ln) else ln
|
| 247 |
-
for ln in text.splitlines()]
|
| 248 |
|
| 249 |
md_text = "\n".join(tagged_lines)
|
| 250 |
|
| 251 |
if "###" in md_text:
|
| 252 |
-
splitter = MarkdownHeaderTextSplitter(
|
| 253 |
-
headers_to_split_on=[("###", "section")]
|
| 254 |
-
)
|
| 255 |
chunks = splitter.split_text(md_text) # already returns Documents
|
| 256 |
else:
|
| 257 |
-
splitter = RecursiveCharacterTextSplitter(
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
|
|
|
| 261 |
for doc in chunks:
|
| 262 |
doc.metadata.setdefault("source", str(file_path))
|
| 263 |
# section already present if header‑splitter was used
|
|
@@ -274,26 +298,32 @@ async def get_job_description(file_path_or_url: str) -> Document:
|
|
| 274 |
Document containing the job description
|
| 275 |
"""
|
| 276 |
# Check if the input is a URL
|
| 277 |
-
if file_path_or_url.startswith((
|
| 278 |
return await parse_job_description_from_url(file_path_or_url)
|
| 279 |
|
| 280 |
# Handle local files based on extension
|
| 281 |
file_extension = Path(file_path_or_url).suffix.lower()
|
| 282 |
|
| 283 |
# Handle txt files
|
| 284 |
-
if file_extension ==
|
| 285 |
try:
|
| 286 |
-
with open(file_path_or_url,
|
| 287 |
content = f.read()
|
| 288 |
if not content.strip():
|
| 289 |
raise ValueError(f"File is empty: {file_path_or_url}")
|
| 290 |
-
return Document(
|
|
|
|
|
|
|
| 291 |
except Exception as e:
|
| 292 |
logger.error(f"Error reading text file: {str(e)}")
|
| 293 |
-
raise ValueError(
|
|
|
|
|
|
|
| 294 |
|
| 295 |
# For other file types
|
| 296 |
-
raise ValueError(
|
|
|
|
|
|
|
| 297 |
|
| 298 |
|
| 299 |
async def scrape_job_description_from_web(urls: List[str]):
|
|
@@ -304,7 +334,9 @@ async def scrape_job_description_from_web(urls: List[str]):
|
|
| 304 |
scraped_data_documents = await loader.aload()
|
| 305 |
|
| 306 |
html2text = Html2TextTransformer()
|
| 307 |
-
markdown_scraped_data_documents = html2text.transform_documents(
|
|
|
|
|
|
|
| 308 |
|
| 309 |
# Grab the first 1000 tokens of the site
|
| 310 |
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
|
|
@@ -312,14 +344,14 @@ async def scrape_job_description_from_web(urls: List[str]):
|
|
| 312 |
)
|
| 313 |
|
| 314 |
extracted_content = splitter.split_documents(markdown_scraped_data_documents)
|
| 315 |
-
|
| 316 |
return ".".join(doc.page_content for doc in extracted_content)
|
| 317 |
|
| 318 |
|
| 319 |
async def parse_job_description_from_url(url: str) -> Document:
|
| 320 |
"""Extracts and structures a job description from a URL using an LLM.
|
| 321 |
|
| 322 |
-
This function fetches content from a URL, uses a DSPy
|
| 323 |
and returns a structured LangChain Document. If the LLM processing fails, it falls
|
| 324 |
back to returning the raw extracted text.
|
| 325 |
|
|
@@ -334,8 +366,8 @@ async def parse_job_description_from_url(url: str) -> Document:
|
|
| 334 |
JobDescriptionParsingError: For any unexpected errors during the process.
|
| 335 |
"""
|
| 336 |
logger.info("Starting job description extraction from URL: %s", url)
|
| 337 |
-
|
| 338 |
-
# 1. Validate URL
|
| 339 |
parsed_url = urlparse(url)
|
| 340 |
if not all([parsed_url.scheme, parsed_url.netloc]):
|
| 341 |
logger.error("Invalid URL format: %s", url)
|
|
@@ -348,27 +380,33 @@ async def parse_job_description_from_url(url: str) -> Document:
|
|
| 348 |
logger.info("Fetching content from URL...")
|
| 349 |
raw_content = await scrape_job_description_from_web([url])
|
| 350 |
if not raw_content or not raw_content.strip():
|
| 351 |
-
raise URLExtractionError(
|
|
|
|
|
|
|
| 352 |
logger.info("Successfully fetched raw content from URL.")
|
| 353 |
except Exception as e:
|
| 354 |
# Wrap any fetching error into our custom exception
|
| 355 |
-
raise URLExtractionError(
|
|
|
|
|
|
|
| 356 |
|
| 357 |
# 3. Process content with the LLM
|
| 358 |
try:
|
| 359 |
logger.info("Processing content with DSPy LLM...")
|
| 360 |
# Configure DSPy LM (it's good practice to do this here if it can change)
|
| 361 |
-
dspy.configure(
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
|
|
|
|
|
|
| 368 |
job_extract_fn = dspy.Predict(ExtractJobDescription)
|
| 369 |
result = job_extract_fn(job_description_html_content=raw_content)
|
| 370 |
logger.info("Successfully processed job description with LLM.")
|
| 371 |
-
|
| 372 |
# 4. Create the final Document with structured data
|
| 373 |
job_doc = Document(
|
| 374 |
page_content=result.job_description,
|
|
@@ -376,8 +414,8 @@ async def parse_job_description_from_url(url: str) -> Document:
|
|
| 376 |
"company_name": result.company_name,
|
| 377 |
"source": url,
|
| 378 |
"job_role": result.job_role,
|
| 379 |
-
"location": result.location
|
| 380 |
-
}
|
| 381 |
)
|
| 382 |
return job_doc
|
| 383 |
|
|
@@ -392,11 +430,13 @@ async def parse_job_description_from_url(url: str) -> Document:
|
|
| 392 |
if raw_content:
|
| 393 |
return Document(
|
| 394 |
page_content=raw_content,
|
| 395 |
-
metadata={"company_name": "Unknown", "source": url, "error": str(e)}
|
| 396 |
)
|
| 397 |
# If raw_content is also None, then the failure was catastrophic.
|
| 398 |
-
raise LLMProcessingError(
|
| 399 |
-
|
|
|
|
|
|
|
| 400 |
except URLExtractionError as e:
|
| 401 |
logger.error(f"Could not extract content from URL: {e}")
|
| 402 |
raise URLExtractionError("Failed to extract content from the URL.") from e
|
|
@@ -404,4 +444,6 @@ async def parse_job_description_from_url(url: str) -> Document:
|
|
| 404 |
# 6. Catch any other unexpected errors
|
| 405 |
except Exception as e:
|
| 406 |
logger.error(f"An unexpected error occurred: {e}", exc_info=True)
|
| 407 |
-
raise JobDescriptionParsingError(
|
|
|
|
|
|
|
|
|
| 13 |
import dspy
|
| 14 |
from langchain_community.document_loaders import PyPDFLoader, AsyncChromiumLoader
|
| 15 |
from langchain_community.document_transformers import Html2TextTransformer
|
| 16 |
+
from langchain_text_splitters import (
|
| 17 |
+
RecursiveCharacterTextSplitter,
|
| 18 |
+
MarkdownHeaderTextSplitter,
|
| 19 |
+
)
|
| 20 |
from langchain_core.documents import Document
|
| 21 |
from langfuse import observe
|
| 22 |
from pydantic import BaseModel, Field
|
| 23 |
|
| 24 |
# Local imports - using relative imports
|
| 25 |
from .errors import URLExtractionError, LLMProcessingError, JobDescriptionParsingError
|
|
|
|
| 26 |
|
| 27 |
# Set up logging
|
| 28 |
logger = logging.getLogger(__name__)
|
| 29 |
logging.basicConfig(level=logging.INFO)
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
# Default paths
|
| 32 |
DEFAULT_RESUME_PATH: str = os.getenv("DEFAULT_RESUME_PATH", "")
|
| 33 |
|
| 34 |
|
| 35 |
# Most Occurring Resume Section Headers
|
| 36 |
RESUME_SECTIONS: list[str] = [
|
| 37 |
+
"EDUCATION",
|
| 38 |
+
"EXPERIENCE",
|
| 39 |
+
"SKILLS",
|
| 40 |
+
"WORK EXPERIENCE",
|
| 41 |
+
"PROFESSIONAL EXPERIENCE",
|
| 42 |
+
"PROJECTS",
|
| 43 |
+
"CERTIFICATIONS",
|
| 44 |
+
"SUMMARY",
|
| 45 |
+
"OBJECTIVE",
|
| 46 |
+
"CONTACT",
|
| 47 |
+
"PUBLICATIONS",
|
| 48 |
+
"AWARDS",
|
| 49 |
+
"LANGUAGES",
|
| 50 |
+
"INTERESTS",
|
| 51 |
+
"REFERENCES",
|
| 52 |
]
|
| 53 |
|
| 54 |
|
| 55 |
class ResumeSection(BaseModel):
|
| 56 |
"""Model for a structured resume section."""
|
| 57 |
+
|
| 58 |
+
title: str = Field(
|
| 59 |
+
description="The section title (e.g., 'Experience', 'Education')"
|
| 60 |
+
)
|
| 61 |
content: str = Field(description="The full content of this section")
|
| 62 |
|
| 63 |
|
| 64 |
class StructuredResume(BaseModel):
|
| 65 |
"""Model for a structured resume with sections."""
|
| 66 |
+
|
| 67 |
sections: List[ResumeSection] = Field(description="List of resume sections")
|
| 68 |
+
contact_info: Dict[str, str] = Field(
|
| 69 |
+
description="Contact information extracted from the resume"
|
| 70 |
+
)
|
| 71 |
|
| 72 |
|
| 73 |
class JobDescriptionComponents(BaseModel):
|
| 74 |
"""Model for job description components."""
|
| 75 |
+
|
| 76 |
company_name: str = Field(description="The company name")
|
| 77 |
job_description: str = Field(description="The job description")
|
| 78 |
reasoning: str = Field(description="The reasoning for the extracted information")
|
|
|
|
| 84 |
Role Introduction,Qualifications and Requirements, Prefrred Qualifications, Salary, Location.
|
| 85 |
Do not alter the content of the job description.
|
| 86 |
"""
|
| 87 |
+
|
| 88 |
+
job_description_html_content = dspy.InputField(
|
| 89 |
+
desc="HTML content of the job posting."
|
| 90 |
+
)
|
| 91 |
+
job_description = dspy.OutputField(
|
| 92 |
+
desc="Clean job description which is free of HTML tags and irrelevant information."
|
| 93 |
+
)
|
| 94 |
job_role = dspy.OutputField(desc="The job role in the posting.")
|
| 95 |
company_name = dspy.OutputField(desc="Company Name of the Job listing.")
|
| 96 |
location = dspy.OutputField(desc="The location for the provided job posting.")
|
|
|
|
| 107 |
Cleaned text
|
| 108 |
"""
|
| 109 |
# Remove excessive whitespace
|
| 110 |
+
text = re.sub(r"\s+", " ", text)
|
| 111 |
|
| 112 |
# Fix common PDF extraction issues
|
| 113 |
+
text = re.sub(r"([a-z])- ([a-z])", r"\1\2", text) # Fix hyphenated words
|
| 114 |
|
| 115 |
# Remove header/footer page numbers
|
| 116 |
+
text = re.sub(r"\n\s*\d+\s*\n", "\n", text)
|
| 117 |
|
| 118 |
# Replace bullet variations with standard markdown bullets
|
| 119 |
+
text = re.sub(r"[•●○◘◙♦♣♠★]", "* ", text)
|
| 120 |
|
| 121 |
return text.strip()
|
| 122 |
|
| 123 |
+
|
| 124 |
@observe()
|
| 125 |
def extract_contact_info(text: str) -> Dict[str, str]:
|
| 126 |
"""Extract contact information from resume text.
|
|
|
|
| 134 |
contact_info = {}
|
| 135 |
|
| 136 |
# Extract email
|
| 137 |
+
email_match = re.search(
|
| 138 |
+
r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", text
|
| 139 |
+
)
|
| 140 |
if email_match:
|
| 141 |
+
contact_info["email"] = email_match.group(0)
|
| 142 |
|
| 143 |
# Extract phone (various formats)
|
| 144 |
+
phone_match = re.search(
|
| 145 |
+
r"(\+\d{1,3}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}", text
|
| 146 |
+
)
|
| 147 |
if phone_match:
|
| 148 |
+
contact_info["phone"] = phone_match.group(0)
|
| 149 |
|
| 150 |
# Extract LinkedIn URL
|
| 151 |
+
linkedin_match = re.search(r"linkedin\.com/in/[a-zA-Z0-9_-]+/?", text)
|
| 152 |
if linkedin_match:
|
| 153 |
+
contact_info["linkedin"] = "https://www." + linkedin_match.group(0)
|
| 154 |
|
| 155 |
# Try to extract name (this is approximate and might need LLM for better accuracy)
|
| 156 |
# Typically name appears at the top of the resume
|
| 157 |
+
first_line = text.strip().split("\n")[0].strip()
|
| 158 |
if len(first_line) < 40 and not any(char.isdigit() for char in first_line):
|
| 159 |
+
contact_info["name"] = first_line
|
| 160 |
|
| 161 |
return contact_info
|
| 162 |
|
| 163 |
+
|
| 164 |
@observe()
|
| 165 |
def identify_resume_sections(text: str) -> List[Dict[str, Any]]:
|
| 166 |
"""Identify sections in a resume text.
|
|
|
|
| 197 |
|
| 198 |
# Regex-based section identification
|
| 199 |
# Create a pattern that matches common section headers
|
| 200 |
+
section_pattern = (
|
| 201 |
+
r"(?:^|\n)(?:[^a-zA-Z\d\s]|\s)*("
|
| 202 |
+
+ "|".join(RESUME_SECTIONS)
|
| 203 |
+
+ r")(?:[^a-zA-Z\d\s]|\s)*(?:$|\n)"
|
| 204 |
+
)
|
| 205 |
matches = list(re.finditer(section_pattern, text, re.IGNORECASE))
|
| 206 |
|
| 207 |
if not matches:
|
| 208 |
# If no sections found, treat the whole resume as one section
|
| 209 |
+
sections.append(
|
| 210 |
+
{
|
| 211 |
+
"title": "resume",
|
| 212 |
+
"content": text,
|
| 213 |
+
}
|
| 214 |
+
)
|
| 215 |
return sections
|
| 216 |
|
| 217 |
# Process each section
|
|
|
|
| 220 |
start_pos = match.start()
|
| 221 |
|
| 222 |
# Find the end position (start of next section or end of text)
|
| 223 |
+
end_pos = matches[i + 1].start() if i < len(matches) - 1 else len(text)
|
| 224 |
|
| 225 |
# Extract section content (excluding the header)
|
| 226 |
section_content = text[start_pos:end_pos].strip()
|
| 227 |
|
| 228 |
+
sections.append({"title": section_title.lower(), "content": section_content})
|
|
|
|
|
|
|
|
|
|
| 229 |
|
| 230 |
return sections
|
| 231 |
|
|
|
|
| 237 |
|
| 238 |
|
| 239 |
def _is_heading(line: str) -> bool:
|
| 240 |
+
return line.isupper() and len(line.split()) <= 5 and not re.search(r"\d", line)
|
| 241 |
+
|
|
|
|
|
|
|
|
|
|
| 242 |
|
| 243 |
def parse_resume(file_path: str | Path) -> List[Document]:
|
| 244 |
"""
|
|
|
|
| 248 |
file_extension = Path(file_path).suffix.lower()
|
| 249 |
|
| 250 |
# Handle different file types
|
| 251 |
+
if file_extension == ".pdf":
|
| 252 |
+
text = (
|
| 253 |
+
PyPDFLoader(str(file_path), extraction_mode="layout").load()[0].page_content
|
| 254 |
+
)
|
| 255 |
+
elif file_extension == ".txt":
|
| 256 |
try:
|
| 257 |
+
with open(file_path, "r", encoding="utf-8") as f:
|
| 258 |
text = f.read()
|
| 259 |
if not text.strip():
|
| 260 |
raise ValueError("File is empty")
|
|
|
|
| 262 |
logger.error(f"Error reading text file: {str(e)}")
|
| 263 |
raise ValueError(f"Could not read text file: {file_path}. Error: {str(e)}")
|
| 264 |
else:
|
| 265 |
+
raise ValueError(
|
| 266 |
+
f"Unsupported resume file type: {file_path}. Supported types: .pdf, .txt"
|
| 267 |
+
)
|
| 268 |
|
| 269 |
text = _collapse_ws(text)
|
| 270 |
|
| 271 |
# Tag headings with "###" so Markdown splitter can see them
|
| 272 |
+
tagged_lines = [f"### {ln}" if _is_heading(ln) else ln for ln in text.splitlines()]
|
|
|
|
|
|
|
| 273 |
|
| 274 |
md_text = "\n".join(tagged_lines)
|
| 275 |
|
| 276 |
if "###" in md_text:
|
| 277 |
+
splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[("###", "section")])
|
|
|
|
|
|
|
| 278 |
chunks = splitter.split_text(md_text) # already returns Documents
|
| 279 |
else:
|
| 280 |
+
splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50)
|
| 281 |
+
chunks: list[Document] = [
|
| 282 |
+
Document(page_content=chunk, metadata={})
|
| 283 |
+
for chunk in splitter.split_text(md_text)
|
| 284 |
+
] # Attach metadata
|
| 285 |
for doc in chunks:
|
| 286 |
doc.metadata.setdefault("source", str(file_path))
|
| 287 |
# section already present if header‑splitter was used
|
|
|
|
| 298 |
Document containing the job description
|
| 299 |
"""
|
| 300 |
# Check if the input is a URL
|
| 301 |
+
if file_path_or_url.startswith(("http://", "https://")):
|
| 302 |
return await parse_job_description_from_url(file_path_or_url)
|
| 303 |
|
| 304 |
# Handle local files based on extension
|
| 305 |
file_extension = Path(file_path_or_url).suffix.lower()
|
| 306 |
|
| 307 |
# Handle txt files
|
| 308 |
+
if file_extension == ".txt":
|
| 309 |
try:
|
| 310 |
+
with open(file_path_or_url, "r", encoding="utf-8") as f:
|
| 311 |
content = f.read()
|
| 312 |
if not content.strip():
|
| 313 |
raise ValueError(f"File is empty: {file_path_or_url}")
|
| 314 |
+
return Document(
|
| 315 |
+
page_content=content, metadata={"source": file_path_or_url}
|
| 316 |
+
)
|
| 317 |
except Exception as e:
|
| 318 |
logger.error(f"Error reading text file: {str(e)}")
|
| 319 |
+
raise ValueError(
|
| 320 |
+
f"Could not read text file: {file_path_or_url}. Error: {str(e)}"
|
| 321 |
+
)
|
| 322 |
|
| 323 |
# For other file types
|
| 324 |
+
raise ValueError(
|
| 325 |
+
f"Unsupported file type: {file_path_or_url}. Supported types: .pdf, .docx, .txt, .md"
|
| 326 |
+
)
|
| 327 |
|
| 328 |
|
| 329 |
async def scrape_job_description_from_web(urls: List[str]):
|
|
|
|
| 334 |
scraped_data_documents = await loader.aload()
|
| 335 |
|
| 336 |
html2text = Html2TextTransformer()
|
| 337 |
+
markdown_scraped_data_documents = html2text.transform_documents(
|
| 338 |
+
scraped_data_documents
|
| 339 |
+
)
|
| 340 |
|
| 341 |
# Grab the first 1000 tokens of the site
|
| 342 |
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
|
|
|
|
| 344 |
)
|
| 345 |
|
| 346 |
extracted_content = splitter.split_documents(markdown_scraped_data_documents)
|
| 347 |
+
|
| 348 |
return ".".join(doc.page_content for doc in extracted_content)
|
| 349 |
|
| 350 |
|
| 351 |
async def parse_job_description_from_url(url: str) -> Document:
|
| 352 |
"""Extracts and structures a job description from a URL using an LLM.
|
| 353 |
|
| 354 |
+
This function fetches content from a URL, uses a DSPy to extract key details,
|
| 355 |
and returns a structured LangChain Document. If the LLM processing fails, it falls
|
| 356 |
back to returning the raw extracted text.
|
| 357 |
|
|
|
|
| 366 |
JobDescriptionParsingError: For any unexpected errors during the process.
|
| 367 |
"""
|
| 368 |
logger.info("Starting job description extraction from URL: %s", url)
|
| 369 |
+
|
| 370 |
+
# 1. Validate URL
|
| 371 |
parsed_url = urlparse(url)
|
| 372 |
if not all([parsed_url.scheme, parsed_url.netloc]):
|
| 373 |
logger.error("Invalid URL format: %s", url)
|
|
|
|
| 380 |
logger.info("Fetching content from URL...")
|
| 381 |
raw_content = await scrape_job_description_from_web([url])
|
| 382 |
if not raw_content or not raw_content.strip():
|
| 383 |
+
raise URLExtractionError(
|
| 384 |
+
"Failed to extract any meaningful content from the URL."
|
| 385 |
+
)
|
| 386 |
logger.info("Successfully fetched raw content from URL.")
|
| 387 |
except Exception as e:
|
| 388 |
# Wrap any fetching error into our custom exception
|
| 389 |
+
raise URLExtractionError(
|
| 390 |
+
f"Failed to download or read content from {url}: {e}"
|
| 391 |
+
) from e
|
| 392 |
|
| 393 |
# 3. Process content with the LLM
|
| 394 |
try:
|
| 395 |
logger.info("Processing content with DSPy LLM...")
|
| 396 |
# Configure DSPy LM (it's good practice to do this here if it can change)
|
| 397 |
+
dspy.configure(
|
| 398 |
+
lm=dspy.LM(
|
| 399 |
+
"cerebras/qwen-3-32b",
|
| 400 |
+
api_key=os.environ.get("CEREBRAS_API_KEY"),
|
| 401 |
+
temperature=0.1,
|
| 402 |
+
max_tokens=60000, # Note: This max_tokens is unusually high
|
| 403 |
+
)
|
| 404 |
+
)
|
| 405 |
+
|
| 406 |
job_extract_fn = dspy.Predict(ExtractJobDescription)
|
| 407 |
result = job_extract_fn(job_description_html_content=raw_content)
|
| 408 |
logger.info("Successfully processed job description with LLM.")
|
| 409 |
+
|
| 410 |
# 4. Create the final Document with structured data
|
| 411 |
job_doc = Document(
|
| 412 |
page_content=result.job_description,
|
|
|
|
| 414 |
"company_name": result.company_name,
|
| 415 |
"source": url,
|
| 416 |
"job_role": result.job_role,
|
| 417 |
+
"location": result.location,
|
| 418 |
+
},
|
| 419 |
)
|
| 420 |
return job_doc
|
| 421 |
|
|
|
|
| 430 |
if raw_content:
|
| 431 |
return Document(
|
| 432 |
page_content=raw_content,
|
| 433 |
+
metadata={"company_name": "Unknown", "source": url, "error": str(e)},
|
| 434 |
)
|
| 435 |
# If raw_content is also None, then the failure was catastrophic.
|
| 436 |
+
raise LLMProcessingError(
|
| 437 |
+
"LLM processing failed and no raw content was available for fallback."
|
| 438 |
+
) from e
|
| 439 |
+
|
| 440 |
except URLExtractionError as e:
|
| 441 |
logger.error(f"Could not extract content from URL: {e}")
|
| 442 |
raise URLExtractionError("Failed to extract content from the URL.") from e
|
|
|
|
| 444 |
# 6. Catch any other unexpected errors
|
| 445 |
except Exception as e:
|
| 446 |
logger.error(f"An unexpected error occurred: {e}", exc_info=True)
|
| 447 |
+
raise JobDescriptionParsingError(
|
| 448 |
+
f"An unexpected error occurred while parsing the job description: {e}"
|
| 449 |
+
) from e
|
src/job_writing_agent/utils/llm_client.py
CHANGED
|
@@ -14,85 +14,88 @@ import dspy
|
|
| 14 |
logger = logging.getLogger(__name__)
|
| 15 |
|
| 16 |
__all__ = [
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
]
|
| 21 |
|
|
|
|
| 22 |
class LLMProvider(ABC):
|
| 23 |
"""Base class for LLM provider strategies."""
|
| 24 |
-
|
| 25 |
@abstractmethod
|
| 26 |
def get_default_config(self) -> Dict[str, Any]:
|
| 27 |
pass
|
| 28 |
-
|
| 29 |
@abstractmethod
|
| 30 |
def get_langchain_params(self) -> set[str]:
|
| 31 |
pass
|
| 32 |
-
|
| 33 |
@abstractmethod
|
| 34 |
def get_dspy_params(self) -> set[str]:
|
| 35 |
pass
|
| 36 |
-
|
| 37 |
@abstractmethod
|
| 38 |
def format_model_name_for_provider(self, model: str) -> str:
|
| 39 |
"""Convert model name to DSPy format.
|
| 40 |
-
|
| 41 |
Different providers require different prefixes in DSPy.
|
| 42 |
-
|
| 43 |
Args:
|
| 44 |
model: Model name as used in LangChain
|
| 45 |
-
|
| 46 |
Returns:
|
| 47 |
Model name formatted for DSPy
|
| 48 |
"""
|
| 49 |
pass
|
| 50 |
-
|
| 51 |
@abstractmethod
|
| 52 |
def validate_config(self, **config) -> Dict[str, Any]:
|
| 53 |
pass
|
| 54 |
-
|
| 55 |
def create_llm_instance(
|
| 56 |
-
self,
|
| 57 |
-
model: str,
|
| 58 |
-
framework: Literal[
|
| 59 |
-
**config
|
| 60 |
) -> BaseChatModel | dspy.LM:
|
| 61 |
"""Create LLM instance for specified framework."""
|
| 62 |
defaults = self.get_default_config()
|
| 63 |
-
|
| 64 |
# Get framework-specific supported params
|
| 65 |
-
if framework ==
|
| 66 |
supported = self.get_langchain_params()
|
| 67 |
else:
|
| 68 |
supported = self.get_dspy_params()
|
| 69 |
-
|
| 70 |
# Filter unsupported params
|
| 71 |
filtered_config = {k: v for k, v in config.items() if k in supported}
|
| 72 |
-
|
| 73 |
# Warn about ignored params
|
| 74 |
ignored = set(config.keys()) - supported
|
| 75 |
if ignored:
|
| 76 |
-
logger.warning(
|
| 77 |
-
|
|
|
|
|
|
|
| 78 |
# Merge configs
|
| 79 |
merged_config = {**defaults, **filtered_config}
|
| 80 |
-
|
| 81 |
# Validate
|
| 82 |
validated_config = self.validate_config(**merged_config)
|
| 83 |
-
|
| 84 |
# Create instance based on framework
|
| 85 |
-
if framework ==
|
| 86 |
return self._create_langchain_instance(model, **validated_config)
|
| 87 |
-
elif framework ==
|
| 88 |
return self._create_dspy_instance(model, **validated_config)
|
| 89 |
else:
|
| 90 |
raise ValueError(f"Unsupported framework: {framework}")
|
| 91 |
-
|
| 92 |
@abstractmethod
|
| 93 |
def _create_langchain_instance(self, model: str, **config) -> BaseChatModel:
|
| 94 |
pass
|
| 95 |
-
|
| 96 |
@abstractmethod
|
| 97 |
def _create_dspy_instance(self, model: str, **config) -> dspy.LM:
|
| 98 |
pass
|
|
@@ -100,224 +103,237 @@ class LLMProvider(ABC):
|
|
| 100 |
|
| 101 |
class OpenRouterChatProvider(LLMProvider):
|
| 102 |
"""Provider for OpenRouter.
|
| 103 |
-
|
| 104 |
Model format:
|
| 105 |
- LangChain: "openai/gpt-4", "anthropic/claude-3-opus"
|
| 106 |
- DSPy: Same - "openai/gpt-4", "anthropic/claude-3-opus"
|
| 107 |
-
|
| 108 |
Docs: https://openrouter.ai/docs
|
| 109 |
"""
|
| 110 |
-
|
| 111 |
OPENROUTER_API_URL = "https://openrouter.ai/api/v1"
|
| 112 |
-
|
| 113 |
def get_default_config(self) -> Dict[str, Any]:
|
| 114 |
-
return {
|
| 115 |
-
|
| 116 |
def get_langchain_params(self) -> set[str]:
|
| 117 |
return {
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
}
|
| 122 |
-
|
| 123 |
def get_dspy_params(self) -> set[str]:
|
| 124 |
-
return {
|
| 125 |
-
|
| 126 |
def format_model_name_for_provider(self, model: str) -> str:
|
| 127 |
"""OpenRouter models are used as-is in DSPy.
|
| 128 |
-
|
| 129 |
Examples:
|
| 130 |
"openai/gpt-4" -> "openai/gpt-4"
|
| 131 |
"anthropic/claude-3-opus" -> "anthropic/claude-3-opus"
|
| 132 |
"""
|
| 133 |
return f"{model}" # ✅ Use as-is - already has provider/model format
|
| 134 |
-
|
| 135 |
def validate_config(self, **config) -> Dict[str, Any]:
|
| 136 |
-
if
|
| 137 |
-
temp = config[
|
| 138 |
if not 0 <= temp <= 2:
|
| 139 |
logger.warning(f"Temperature must be 0-2, got {temp}")
|
| 140 |
-
|
| 141 |
-
if
|
| 142 |
-
api_key = os.getenv(
|
| 143 |
if not api_key:
|
| 144 |
raise ValueError("OPENROUTER_API_KEY not set")
|
| 145 |
-
config[
|
| 146 |
-
|
| 147 |
return config
|
| 148 |
-
|
| 149 |
def _create_langchain_instance(self, model: str, **config) -> ChatOpenAI:
|
| 150 |
"""Create LangChain instance.
|
| 151 |
-
|
| 152 |
Example model: "openai/gpt-4"
|
| 153 |
"""
|
| 154 |
-
api_key = config.pop(
|
| 155 |
-
|
| 156 |
return ChatOpenAI(
|
| 157 |
-
model=self.format_model_name_for_provider(
|
|
|
|
|
|
|
| 158 |
api_key=SecretStr(api_key),
|
| 159 |
base_url=self.OPENROUTER_API_URL,
|
| 160 |
-
**config
|
| 161 |
)
|
| 162 |
-
|
| 163 |
def _create_dspy_instance(self, model: str, **config) -> dspy.LM:
|
| 164 |
"""Create DSPy instance.
|
| 165 |
-
|
| 166 |
Example model: "openai/gpt-4"
|
| 167 |
"""
|
| 168 |
-
api_key = config.pop(
|
| 169 |
-
|
| 170 |
return dspy.LM(
|
| 171 |
-
model=self.format_model_name_for_provider(model), # ✅ Use as-is: "openai/gpt-4"
|
| 172 |
api_key=api_key,
|
| 173 |
api_base=self.OPENROUTER_API_URL,
|
| 174 |
-
**config
|
| 175 |
)
|
| 176 |
|
| 177 |
|
| 178 |
class CerebrasChatProvider(LLMProvider):
|
| 179 |
"""Provider for Cerebras.
|
| 180 |
-
|
| 181 |
Model format:
|
| 182 |
- LangChain: "llama3.1-8b", "llama3.1-70b" (direct names)
|
| 183 |
- DSPy: "openai/llama3.1-8b" (needs openai/ prefix for compatibility)
|
| 184 |
-
|
| 185 |
Docs: https://inference-docs.cerebras.ai/
|
| 186 |
"""
|
| 187 |
-
|
| 188 |
CEREBRAS_API_URL = "https://api.cerebras.ai/v1"
|
| 189 |
-
|
| 190 |
def get_default_config(self) -> Dict[str, Any]:
|
| 191 |
-
return {
|
| 192 |
-
|
| 193 |
def get_langchain_params(self) -> set[str]:
|
| 194 |
-
return {
|
| 195 |
-
|
| 196 |
-
'stop', 'stream', 'seed'
|
| 197 |
-
}
|
| 198 |
-
|
| 199 |
def get_dspy_params(self) -> set[str]:
|
| 200 |
-
return {
|
| 201 |
-
|
| 202 |
def format_model_name_for_provider(self, model: str) -> str:
|
| 203 |
"""Cerebras models need 'cerebras/' prefix.
|
| 204 |
-
|
| 205 |
Examples:
|
| 206 |
"llama3.1-8b" -> "cerebras/llama3.1-8b"
|
| 207 |
"llama3.1-70b" -> "cerebras/llama3.1-70b"
|
| 208 |
"""
|
| 209 |
return f"cerebras/{model}" # ✅ Add openai/ prefix for OpenAI-compatible API
|
| 210 |
-
|
| 211 |
def validate_config(self, **config) -> Dict[str, Any]:
|
| 212 |
-
if
|
| 213 |
-
temp = config[
|
| 214 |
if not 0 <= temp <= 1.5:
|
| 215 |
raise ValueError(f"Temperature must be 0-1.5, got {temp}")
|
| 216 |
-
|
| 217 |
-
if
|
| 218 |
-
api_key = os.getenv(
|
| 219 |
if not api_key:
|
| 220 |
raise ValueError("CEREBRAS_API_KEY not set")
|
| 221 |
-
config[
|
| 222 |
-
|
| 223 |
return config
|
| 224 |
-
|
| 225 |
def _create_langchain_instance(self, model: str, **config) -> ChatCerebras:
|
| 226 |
"""Create LangChain instance.
|
| 227 |
-
|
| 228 |
Example model: "llama3.1-8b"
|
| 229 |
"""
|
| 230 |
|
| 231 |
return ChatCerebras(
|
| 232 |
model=model, # Direct name: "llama3.1-8b"
|
| 233 |
-
**config
|
| 234 |
)
|
| 235 |
|
| 236 |
-
|
| 237 |
@DeprecationWarning
|
| 238 |
-
def _create_langchain_instance_openaiclient(
|
|
|
|
|
|
|
| 239 |
"""
|
| 240 |
Create LangChain instance
|
| 241 |
Example model: "llama3.1-8b"
|
| 242 |
"""
|
| 243 |
-
|
| 244 |
-
api_key = config.pop(
|
| 245 |
-
|
| 246 |
return ChatOpenAI(
|
| 247 |
-
model=self.format_model_name_for_provider(
|
|
|
|
|
|
|
| 248 |
api_key=SecretStr(api_key),
|
| 249 |
base_url=self.CEREBRAS_API_URL,
|
| 250 |
-
**config
|
| 251 |
)
|
| 252 |
-
|
| 253 |
def _create_dspy_instance(self, model: str, **config) -> dspy.LM:
|
| 254 |
"""Create DSPy instance.
|
| 255 |
-
|
| 256 |
Example model input: "llama3.1-8b"
|
| 257 |
DSPy format: "openai/llama3.1-8b"
|
| 258 |
"""
|
| 259 |
-
api_key = config.pop(
|
| 260 |
-
|
| 261 |
return dspy.LM(
|
| 262 |
-
model=self.format_model_name_for_provider(
|
|
|
|
|
|
|
| 263 |
api_key=api_key,
|
| 264 |
api_base=self.CEREBRAS_API_URL,
|
| 265 |
-
**config
|
| 266 |
)
|
| 267 |
|
| 268 |
|
| 269 |
class OllamaChatProvider(LLMProvider):
|
| 270 |
"""Provider for Ollama.
|
| 271 |
-
|
| 272 |
Model format:
|
| 273 |
- LangChain: "llama3.2", "llama3.2:latest" (direct names with optional tags)
|
| 274 |
- DSPy: "ollama_chat/llama3.2" (needs ollama_chat/ prefix)
|
| 275 |
-
|
| 276 |
Docs: https://ollama.com/
|
| 277 |
"""
|
| 278 |
-
|
| 279 |
def get_default_config(self) -> Dict[str, Any]:
|
| 280 |
-
return {
|
| 281 |
-
|
| 282 |
def get_langchain_params(self) -> set[str]:
|
| 283 |
return {
|
| 284 |
-
|
| 285 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
}
|
| 287 |
-
|
| 288 |
def get_dspy_params(self) -> set[str]:
|
| 289 |
-
return {
|
| 290 |
-
|
| 291 |
def format_model_name_for_provider(self, model: str) -> str:
|
| 292 |
"""Ollama models need 'ollama_chat/' prefix for DSPy.
|
| 293 |
-
|
| 294 |
Examples:
|
| 295 |
"llama3.2" -> "ollama_chat/llama3.2"
|
| 296 |
"llama3.2:latest" -> "ollama_chat/llama3.2:latest"
|
| 297 |
"""
|
| 298 |
return f"ollama_chat/{model}" # ✅ Add ollama_chat/ prefix
|
| 299 |
-
|
| 300 |
def validate_config(self, **config) -> Dict[str, Any]:
|
| 301 |
-
if
|
| 302 |
-
temp = config[
|
| 303 |
if not 0 <= temp <= 2:
|
| 304 |
raise ValueError(f"Temperature must be 0-2, got {temp}")
|
| 305 |
-
|
| 306 |
-
if
|
| 307 |
-
if not isinstance(config[
|
| 308 |
raise ValueError("top_k must be positive integer")
|
| 309 |
-
|
| 310 |
return config
|
| 311 |
-
|
| 312 |
def _create_langchain_instance(self, model: str, **config) -> ChatOllama:
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
model=self.format_model_name_for_provider(model),
|
| 316 |
-
**config)
|
| 317 |
-
|
| 318 |
def _create_dspy_instance(self, model: str, **config) -> dspy.LM:
|
| 319 |
-
|
| 320 |
return dspy.LM(
|
| 321 |
-
model=self.format_model_name_for_provider(
|
| 322 |
-
|
| 323 |
-
)
|
|
|
|
|
|
|
|
|
| 14 |
logger = logging.getLogger(__name__)
|
| 15 |
|
| 16 |
__all__ = [
|
| 17 |
+
"OllamaChatProvider",
|
| 18 |
+
"CerebrasChatProvider",
|
| 19 |
+
"OpenRouterChatProvider",
|
| 20 |
]
|
| 21 |
|
| 22 |
+
|
| 23 |
class LLMProvider(ABC):
|
| 24 |
"""Base class for LLM provider strategies."""
|
| 25 |
+
|
| 26 |
@abstractmethod
|
| 27 |
def get_default_config(self) -> Dict[str, Any]:
|
| 28 |
pass
|
| 29 |
+
|
| 30 |
@abstractmethod
|
| 31 |
def get_langchain_params(self) -> set[str]:
|
| 32 |
pass
|
| 33 |
+
|
| 34 |
@abstractmethod
|
| 35 |
def get_dspy_params(self) -> set[str]:
|
| 36 |
pass
|
| 37 |
+
|
| 38 |
@abstractmethod
|
| 39 |
def format_model_name_for_provider(self, model: str) -> str:
|
| 40 |
"""Convert model name to DSPy format.
|
| 41 |
+
|
| 42 |
Different providers require different prefixes in DSPy.
|
| 43 |
+
|
| 44 |
Args:
|
| 45 |
model: Model name as used in LangChain
|
| 46 |
+
|
| 47 |
Returns:
|
| 48 |
Model name formatted for DSPy
|
| 49 |
"""
|
| 50 |
pass
|
| 51 |
+
|
| 52 |
@abstractmethod
|
| 53 |
def validate_config(self, **config) -> Dict[str, Any]:
|
| 54 |
pass
|
| 55 |
+
|
| 56 |
def create_llm_instance(
|
| 57 |
+
self,
|
| 58 |
+
model: str,
|
| 59 |
+
framework: Literal["langchain", "dspy"] = "langchain",
|
| 60 |
+
**config,
|
| 61 |
) -> BaseChatModel | dspy.LM:
|
| 62 |
"""Create LLM instance for specified framework."""
|
| 63 |
defaults = self.get_default_config()
|
| 64 |
+
|
| 65 |
# Get framework-specific supported params
|
| 66 |
+
if framework == "langchain":
|
| 67 |
supported = self.get_langchain_params()
|
| 68 |
else:
|
| 69 |
supported = self.get_dspy_params()
|
| 70 |
+
|
| 71 |
# Filter unsupported params
|
| 72 |
filtered_config = {k: v for k, v in config.items() if k in supported}
|
| 73 |
+
|
| 74 |
# Warn about ignored params
|
| 75 |
ignored = set(config.keys()) - supported
|
| 76 |
if ignored:
|
| 77 |
+
logger.warning(
|
| 78 |
+
f"Ignoring unsupported parameters for {framework}: {ignored}"
|
| 79 |
+
)
|
| 80 |
+
|
| 81 |
# Merge configs
|
| 82 |
merged_config = {**defaults, **filtered_config}
|
| 83 |
+
|
| 84 |
# Validate
|
| 85 |
validated_config = self.validate_config(**merged_config)
|
| 86 |
+
|
| 87 |
# Create instance based on framework
|
| 88 |
+
if framework == "langchain":
|
| 89 |
return self._create_langchain_instance(model, **validated_config)
|
| 90 |
+
elif framework == "dspy":
|
| 91 |
return self._create_dspy_instance(model, **validated_config)
|
| 92 |
else:
|
| 93 |
raise ValueError(f"Unsupported framework: {framework}")
|
| 94 |
+
|
| 95 |
@abstractmethod
|
| 96 |
def _create_langchain_instance(self, model: str, **config) -> BaseChatModel:
|
| 97 |
pass
|
| 98 |
+
|
| 99 |
@abstractmethod
|
| 100 |
def _create_dspy_instance(self, model: str, **config) -> dspy.LM:
|
| 101 |
pass
|
|
|
|
| 103 |
|
| 104 |
class OpenRouterChatProvider(LLMProvider):
|
| 105 |
"""Provider for OpenRouter.
|
| 106 |
+
|
| 107 |
Model format:
|
| 108 |
- LangChain: "openai/gpt-4", "anthropic/claude-3-opus"
|
| 109 |
- DSPy: Same - "openai/gpt-4", "anthropic/claude-3-opus"
|
| 110 |
+
|
| 111 |
Docs: https://openrouter.ai/docs
|
| 112 |
"""
|
| 113 |
+
|
| 114 |
OPENROUTER_API_URL = "https://openrouter.ai/api/v1"
|
| 115 |
+
|
| 116 |
def get_default_config(self) -> Dict[str, Any]:
|
| 117 |
+
return {"temperature": 0.2}
|
| 118 |
+
|
| 119 |
def get_langchain_params(self) -> set[str]:
|
| 120 |
return {
|
| 121 |
+
"temperature",
|
| 122 |
+
"max_tokens",
|
| 123 |
+
"top_p",
|
| 124 |
+
"frequency_penalty",
|
| 125 |
+
"presence_penalty",
|
| 126 |
+
"stop",
|
| 127 |
+
"n",
|
| 128 |
+
"stream",
|
| 129 |
}
|
| 130 |
+
|
| 131 |
def get_dspy_params(self) -> set[str]:
|
| 132 |
+
return {"temperature", "max_tokens", "top_p", "stop", "n"}
|
| 133 |
+
|
| 134 |
def format_model_name_for_provider(self, model: str) -> str:
|
| 135 |
"""OpenRouter models are used as-is in DSPy.
|
| 136 |
+
|
| 137 |
Examples:
|
| 138 |
"openai/gpt-4" -> "openai/gpt-4"
|
| 139 |
"anthropic/claude-3-opus" -> "anthropic/claude-3-opus"
|
| 140 |
"""
|
| 141 |
return f"{model}" # ✅ Use as-is - already has provider/model format
|
| 142 |
+
|
| 143 |
def validate_config(self, **config) -> Dict[str, Any]:
|
| 144 |
+
if "temperature" in config:
|
| 145 |
+
temp = config["temperature"]
|
| 146 |
if not 0 <= temp <= 2:
|
| 147 |
logger.warning(f"Temperature must be 0-2, got {temp}")
|
| 148 |
+
|
| 149 |
+
if "api_key" not in config:
|
| 150 |
+
api_key = os.getenv("OPENROUTER_API_KEY")
|
| 151 |
if not api_key:
|
| 152 |
raise ValueError("OPENROUTER_API_KEY not set")
|
| 153 |
+
config["api_key"] = api_key
|
| 154 |
+
|
| 155 |
return config
|
| 156 |
+
|
| 157 |
def _create_langchain_instance(self, model: str, **config) -> ChatOpenAI:
|
| 158 |
"""Create LangChain instance.
|
| 159 |
+
|
| 160 |
Example model: "openai/gpt-4"
|
| 161 |
"""
|
| 162 |
+
api_key = config.pop("api_key")
|
| 163 |
+
|
| 164 |
return ChatOpenAI(
|
| 165 |
+
model=self.format_model_name_for_provider(
|
| 166 |
+
model
|
| 167 |
+
), # ✅ Use model as-is: "openai/gpt-4"
|
| 168 |
api_key=SecretStr(api_key),
|
| 169 |
base_url=self.OPENROUTER_API_URL,
|
| 170 |
+
**config,
|
| 171 |
)
|
| 172 |
+
|
| 173 |
def _create_dspy_instance(self, model: str, **config) -> dspy.LM:
|
| 174 |
"""Create DSPy instance.
|
| 175 |
+
|
| 176 |
Example model: "openai/gpt-4"
|
| 177 |
"""
|
| 178 |
+
api_key = config.pop("api_key")
|
| 179 |
+
|
| 180 |
return dspy.LM(
|
| 181 |
+
model=f"openrouter/{self.format_model_name_for_provider(model)}", # ✅ Use as-is: "openai/gpt-4"
|
| 182 |
api_key=api_key,
|
| 183 |
api_base=self.OPENROUTER_API_URL,
|
| 184 |
+
**config,
|
| 185 |
)
|
| 186 |
|
| 187 |
|
| 188 |
class CerebrasChatProvider(LLMProvider):
|
| 189 |
"""Provider for Cerebras.
|
| 190 |
+
|
| 191 |
Model format:
|
| 192 |
- LangChain: "llama3.1-8b", "llama3.1-70b" (direct names)
|
| 193 |
- DSPy: "cerebras/llama3.1-8b" (needs cerebras/ prefix for LiteLLM routing)
|
| 194 |
+
|
| 195 |
Docs: https://inference-docs.cerebras.ai/
|
| 196 |
"""
|
| 197 |
+
|
| 198 |
CEREBRAS_API_URL = "https://api.cerebras.ai/v1"
|
| 199 |
+
|
| 200 |
def get_default_config(self) -> Dict[str, Any]:
|
| 201 |
+
return {"temperature": 0.2, "max_tokens": 1024}
|
| 202 |
+
|
| 203 |
def get_langchain_params(self) -> set[str]:
|
| 204 |
+
return {"temperature", "max_tokens", "top_p", "stop", "stream", "seed"}
|
| 205 |
+
|
|
|
|
|
|
|
|
|
|
| 206 |
def get_dspy_params(self) -> set[str]:
|
| 207 |
+
return {"temperature", "max_tokens", "top_p", "stop"}
|
| 208 |
+
|
| 209 |
def format_model_name_for_provider(self, model: str) -> str:
|
| 210 |
"""Cerebras models need 'cerebras/' prefix.
|
| 211 |
+
|
| 212 |
Examples:
|
| 213 |
"llama3.1-8b" -> "cerebras/llama3.1-8b"
|
| 214 |
"llama3.1-70b" -> "cerebras/llama3.1-70b"
|
| 215 |
"""
|
| 216 |
return f"cerebras/{model}"  # ✅ Add cerebras/ prefix for LiteLLM/DSPy routing
|
| 217 |
+
|
| 218 |
def validate_config(self, **config) -> Dict[str, Any]:
|
| 219 |
+
if "temperature" in config:
|
| 220 |
+
temp = config["temperature"]
|
| 221 |
if not 0 <= temp <= 1.5:
|
| 222 |
raise ValueError(f"Temperature must be 0-1.5, got {temp}")
|
| 223 |
+
|
| 224 |
+
if "api_key" not in config:
|
| 225 |
+
api_key = os.getenv("CEREBRAS_API_KEY")
|
| 226 |
if not api_key:
|
| 227 |
raise ValueError("CEREBRAS_API_KEY not set")
|
| 228 |
+
config["api_key"] = api_key
|
| 229 |
+
|
| 230 |
return config
|
| 231 |
+
|
| 232 |
def _create_langchain_instance(self, model: str, **config) -> ChatCerebras:
|
| 233 |
"""Create LangChain instance.
|
| 234 |
+
|
| 235 |
Example model: "llama3.1-8b"
|
| 236 |
"""
|
| 237 |
|
| 238 |
return ChatCerebras(
|
| 239 |
model=model, # Direct name: "llama3.1-8b"
|
| 240 |
+
**config,
|
| 241 |
)
|
| 242 |
|
|
|
|
| 243 |
@DeprecationWarning
|
| 244 |
+
def _create_langchain_instance_openaiclient(
|
| 245 |
+
self, model: str, **config
|
| 246 |
+
) -> ChatOpenAI:
|
| 247 |
"""
|
| 248 |
Create LangChain instance
|
| 249 |
Example model: "llama3.1-8b"
|
| 250 |
"""
|
| 251 |
+
|
| 252 |
+
api_key = config.pop("api_key")
|
| 253 |
+
|
| 254 |
return ChatOpenAI(
|
| 255 |
+
model=self.format_model_name_for_provider(
|
| 256 |
+
model
|
| 257 |
+
), # Direct name: "llama3.1-8b"
|
| 258 |
api_key=SecretStr(api_key),
|
| 259 |
base_url=self.CEREBRAS_API_URL,
|
| 260 |
+
**config,
|
| 261 |
)
|
| 262 |
+
|
| 263 |
def _create_dspy_instance(self, model: str, **config) -> dspy.LM:
|
| 264 |
"""Create DSPy instance.
|
| 265 |
+
|
| 266 |
Example model input: "llama3.1-8b"
|
| 267 |
DSPy format: "cerebras/llama3.1-8b"
|
| 268 |
"""
|
| 269 |
+
api_key = config.pop("api_key")
|
| 270 |
+
|
| 271 |
return dspy.LM(
|
| 272 |
+
model=self.format_model_name_for_provider(
|
| 273 |
+
model
|
| 274 |
+
),  # With prefix: "cerebras/llama3.1-8b"
|
| 275 |
api_key=api_key,
|
| 276 |
api_base=self.CEREBRAS_API_URL,
|
| 277 |
+
**config,
|
| 278 |
)
|
| 279 |
|
| 280 |
|
| 281 |
class OllamaChatProvider(LLMProvider):
|
| 282 |
"""Provider for Ollama.
|
| 283 |
+
|
| 284 |
Model format:
|
| 285 |
- LangChain: "llama3.2", "llama3.2:latest" (direct names with optional tags)
|
| 286 |
- DSPy: "ollama_chat/llama3.2" (needs ollama_chat/ prefix)
|
| 287 |
+
|
| 288 |
Docs: https://ollama.com/
|
| 289 |
"""
|
| 290 |
+
|
| 291 |
def get_default_config(self) -> Dict[str, Any]:
|
| 292 |
+
return {"temperature": 0.2, "top_k": 40, "top_p": 0.9}
|
| 293 |
+
|
| 294 |
def get_langchain_params(self) -> set[str]:
|
| 295 |
return {
|
| 296 |
+
"temperature",
|
| 297 |
+
"top_k",
|
| 298 |
+
"top_p",
|
| 299 |
+
"repeat_penalty",
|
| 300 |
+
"num_ctx",
|
| 301 |
+
"num_predict",
|
| 302 |
+
"format",
|
| 303 |
+
"seed",
|
| 304 |
}
|
| 305 |
+
|
| 306 |
def get_dspy_params(self) -> set[str]:
|
| 307 |
+
return {"temperature", "top_p", "num_ctx", "seed"}
|
| 308 |
+
|
| 309 |
def format_model_name_for_provider(self, model: str) -> str:
|
| 310 |
"""Ollama models need 'ollama_chat/' prefix for DSPy.
|
| 311 |
+
|
| 312 |
Examples:
|
| 313 |
"llama3.2" -> "ollama_chat/llama3.2"
|
| 314 |
"llama3.2:latest" -> "ollama_chat/llama3.2:latest"
|
| 315 |
"""
|
| 316 |
return f"ollama_chat/{model}" # ✅ Add ollama_chat/ prefix
|
| 317 |
+
|
| 318 |
def validate_config(self, **config) -> Dict[str, Any]:
|
| 319 |
+
if "temperature" in config:
|
| 320 |
+
temp = config["temperature"]
|
| 321 |
if not 0 <= temp <= 2:
|
| 322 |
raise ValueError(f"Temperature must be 0-2, got {temp}")
|
| 323 |
+
|
| 324 |
+
if "top_k" in config:
|
| 325 |
+
if not isinstance(config["top_k"], int) or config["top_k"] < 1:
|
| 326 |
raise ValueError("top_k must be positive integer")
|
| 327 |
+
|
| 328 |
return config
|
| 329 |
+
|
| 330 |
def _create_langchain_instance(self, model: str, **config) -> ChatOllama:
|
| 331 |
+
return ChatOllama(model=self.format_model_name_for_provider(model), **config)
|
| 332 |
+
|
|
|
|
|
|
|
|
|
|
| 333 |
def _create_dspy_instance(self, model: str, **config) -> dspy.LM:
|
|
|
|
| 334 |
return dspy.LM(
|
| 335 |
+
model=self.format_model_name_for_provider(
|
| 336 |
+
model
|
| 337 |
+
), # ✅ With prefix: "ollama_chat/llama3.2"
|
| 338 |
+
**config,
|
| 339 |
+
)
|
src/job_writing_agent/utils/llm_provider_factory.py
CHANGED
|
@@ -10,6 +10,7 @@ from .llm_client import (
|
|
| 10 |
OllamaChatProvider,
|
| 11 |
OpenRouterChatProvider,
|
| 12 |
)
|
|
|
|
| 13 |
|
| 14 |
logger = logging.getLogger(__name__)
|
| 15 |
|
|
@@ -32,6 +33,7 @@ class LLMFactory:
|
|
| 32 |
>>> dspy.configure(lm=lm)
|
| 33 |
"""
|
| 34 |
|
|
|
|
| 35 |
def __init__(self, default_provider: str = "openrouter"):
|
| 36 |
"""Initialize factory with available providers.
|
| 37 |
|
|
@@ -50,6 +52,7 @@ class LLMFactory:
|
|
| 50 |
f"default: {default_provider}"
|
| 51 |
)
|
| 52 |
|
|
|
|
| 53 |
def create(
|
| 54 |
self,
|
| 55 |
model: str,
|
|
|
|
| 10 |
OllamaChatProvider,
|
| 11 |
OpenRouterChatProvider,
|
| 12 |
)
|
| 13 |
+
from .logging.logging_decorators import log_execution
|
| 14 |
|
| 15 |
logger = logging.getLogger(__name__)
|
| 16 |
|
|
|
|
| 33 |
>>> dspy.configure(lm=lm)
|
| 34 |
"""
|
| 35 |
|
| 36 |
+
@log_execution
|
| 37 |
def __init__(self, default_provider: str = "openrouter"):
|
| 38 |
"""Initialize factory with available providers.
|
| 39 |
|
|
|
|
| 52 |
f"default: {default_provider}"
|
| 53 |
)
|
| 54 |
|
| 55 |
+
@log_execution
|
| 56 |
def create(
|
| 57 |
self,
|
| 58 |
model: str,
|
src/job_writing_agent/utils/logging/logging_config.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Logging configuration for the application
|
| 3 |
+
|
| 4 |
+
This module provides a centralized logging manager that configures
|
| 5 |
+
logging once at application startup, ensuring consistent log format
|
| 6 |
+
and behavior across all modules.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
import sys
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing_extensions import Optional
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class LoggingManager:
|
| 16 |
+
"""
|
| 17 |
+
Centralized logging configuration manager.
|
| 18 |
+
|
| 19 |
+
Uses Singleton pattern to ensure logging is configured only once.
|
| 20 |
+
|
| 21 |
+
Example:
|
| 22 |
+
>>> manager = LoggingManager()
|
| 23 |
+
>>> manager.configure_logging(log_level=logging.INFO)
|
| 24 |
+
>>> logger = logging.getLogger(__name__)
|
| 25 |
+
>>> logger.info("This will be logged consistently")
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
_instance: Optional["LoggingManager"] = None
|
| 29 |
+
_configured: bool = False
|
| 30 |
+
|
| 31 |
+
def __new__(cls):
|
| 32 |
+
if cls._instance is None:
|
| 33 |
+
cls._instance = super().__new__(cls)
|
| 34 |
+
cls._configured = False
|
| 35 |
+
return cls._instance
|
| 36 |
+
|
| 37 |
+
def configure_logging(
|
| 38 |
+
self,
|
| 39 |
+
log_level: int = logging.INFO,
|
| 40 |
+
log_file: Optional[Path] = None,
|
| 41 |
+
log_format: Optional[str] = None,
|
| 42 |
+
date_format: Optional[str] = None,
|
| 43 |
+
) -> None:
|
| 44 |
+
"""
|
| 45 |
+
Configure logging for the entire application.
|
| 46 |
+
|
| 47 |
+
This should be called once at application startup (e.g., in main()).
|
| 48 |
+
Subsequent calls are ignored if already configured.
|
| 49 |
+
|
| 50 |
+
Args:
|
| 51 |
+
log_level: Logging level (logging.DEBUG, INFO, WARNING, ERROR)
|
| 52 |
+
log_file: Optional path to log file. If None, logs only to console.
|
| 53 |
+
log_format: Optional custom format string. Default includes timestamp, level, module, message.
|
| 54 |
+
date_format: Optional date format string. Default: "%Y-%m-%d %H:%M:%S"
|
| 55 |
+
|
| 56 |
+
Example:
|
| 57 |
+
>>> manager = LoggingManager()
|
| 58 |
+
>>> manager.configure_logging(
|
| 59 |
+
... log_level=logging.INFO,
|
| 60 |
+
... log_file=Path("logs/app.log")
|
| 61 |
+
... )
|
| 62 |
+
"""
|
| 63 |
+
if self._configured:
|
| 64 |
+
# Already configured - don't reconfigure
|
| 65 |
+
return
|
| 66 |
+
|
| 67 |
+
# Default format: [2025-01-15 10:30:45] INFO module_name: message
|
| 68 |
+
if log_format is None:
|
| 69 |
+
log_format = "[%(asctime)s] %(levelname)-8s %(name)s: %(message)s"
|
| 70 |
+
|
| 71 |
+
if date_format is None:
|
| 72 |
+
date_format = "%Y-%m-%d %H:%M:%S"
|
| 73 |
+
|
| 74 |
+
# Create formatter
|
| 75 |
+
formatter = logging.Formatter(log_format, datefmt=date_format)
|
| 76 |
+
|
| 77 |
+
# Configure root logger
|
| 78 |
+
root_logger = logging.getLogger()
|
| 79 |
+
root_logger.setLevel(log_level)
|
| 80 |
+
|
| 81 |
+
# Remove existing handlers to avoid duplicates
|
| 82 |
+
root_logger.handlers.clear()
|
| 83 |
+
|
| 84 |
+
# Console handler (always add)
|
| 85 |
+
console_handler = logging.StreamHandler(sys.stdout)
|
| 86 |
+
console_handler.setLevel(log_level)
|
| 87 |
+
console_handler.setFormatter(formatter)
|
| 88 |
+
root_logger.addHandler(console_handler)
|
| 89 |
+
|
| 90 |
+
# File handler (if log_file specified)
|
| 91 |
+
if log_file:
|
| 92 |
+
# Create log directory if it doesn't exist
|
| 93 |
+
log_file.parent.mkdir(parents=True, exist_ok=True)
|
| 94 |
+
|
| 95 |
+
file_handler = logging.FileHandler(log_file, mode="a", encoding="utf-8")
|
| 96 |
+
file_handler.setLevel(log_level)
|
| 97 |
+
file_handler.setFormatter(formatter)
|
| 98 |
+
root_logger.addHandler(file_handler)
|
| 99 |
+
|
| 100 |
+
self._configured = True
|
| 101 |
+
|
| 102 |
+
# Log that logging is configured
|
| 103 |
+
logger = logging.getLogger(__name__)
|
| 104 |
+
logger.info(
|
| 105 |
+
f"Logging configured: level={logging.getLevelName(log_level)}, "
|
| 106 |
+
f"file={'enabled' if log_file else 'disabled'}"
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
def is_configured(self) -> bool:
|
| 110 |
+
"""Check if logging has been configured."""
|
| 111 |
+
return self._configured
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
# Convenience function for easy access
|
| 115 |
+
def get_logger(name: str) -> logging.Logger:
|
| 116 |
+
"""
|
| 117 |
+
Get a logger instance for a module.
|
| 118 |
+
|
| 119 |
+
This is a convenience function that ensures consistent logger creation.
|
| 120 |
+
Use this instead of logging.getLogger(__name__) for consistency.
|
| 121 |
+
|
| 122 |
+
Args:
|
| 123 |
+
name: Logger name (typically __name__)
|
| 124 |
+
|
| 125 |
+
Returns:
|
| 126 |
+
Logger instance
|
| 127 |
+
|
| 128 |
+
Example:
|
| 129 |
+
>>> logger = get_logger(__name__)
|
| 130 |
+
>>> logger.info("Application started")
|
| 131 |
+
"""
|
| 132 |
+
return logging.getLogger(name)
|
src/job_writing_agent/utils/logging/logging_decorators.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Simple decorators for logging.
|
| 3 |
+
|
| 4 |
+
These decorators add logging behavior without cluttering your function code.
|
| 5 |
+
Keep it simple - just the essentials.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import functools
|
| 9 |
+
import logging
|
| 10 |
+
import time
|
| 11 |
+
from typing import Callable, TypeVar
|
| 12 |
+
|
| 13 |
+
# Type variable for function signatures
|
| 14 |
+
F = TypeVar("F", bound=Callable)
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def log_execution(func: F) -> F:
|
| 20 |
+
"""
|
| 21 |
+
Simple decorator to log when a function starts and finishes.
|
| 22 |
+
|
| 23 |
+
Logs entry, exit, and how long it took.
|
| 24 |
+
|
| 25 |
+
Example:
|
| 26 |
+
>>> @log_execution
|
| 27 |
+
>>> def process_data(data: str) -> str:
|
| 28 |
+
... return data.upper()
|
| 29 |
+
>>> process_data("hello")
|
| 30 |
+
# Logs: "Entering process_data" ... "Exiting process_data (took 0.001s)"
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
@functools.wraps(func)
|
| 34 |
+
def log_execution_wrapper(*args, **kwargs):
|
| 35 |
+
func_name = func.__name__
|
| 36 |
+
logger.info(f"Entering {func_name}")
|
| 37 |
+
|
| 38 |
+
start_time = time.time()
|
| 39 |
+
try:
|
| 40 |
+
result = func(*args, **kwargs)
|
| 41 |
+
elapsed = time.time() - start_time
|
| 42 |
+
logger.info(f"Exiting {func_name} (took {elapsed:.3f}s)")
|
| 43 |
+
return result
|
| 44 |
+
except Exception as e:
|
| 45 |
+
elapsed = time.time() - start_time
|
| 46 |
+
logger.error(f"{func_name} failed after {elapsed:.3f}s: {e}", exc_info=True)
|
| 47 |
+
raise
|
| 48 |
+
|
| 49 |
+
return log_execution_wrapper
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def log_async(func: F) -> F:
|
| 53 |
+
"""
|
| 54 |
+
Simple decorator for async functions - logs entry, exit, and timing.
|
| 55 |
+
|
| 56 |
+
Example:
|
| 57 |
+
>>> @log_async
|
| 58 |
+
>>> async def fetch_data(url: str) -> dict:
|
| 59 |
+
... return await http.get(url)
|
| 60 |
+
"""
|
| 61 |
+
|
| 62 |
+
@functools.wraps(func)
|
| 63 |
+
async def log_async_wrapper(*args, **kwargs):
|
| 64 |
+
func_name = func.__name__
|
| 65 |
+
logger.info(f"Entering async {func_name}")
|
| 66 |
+
|
| 67 |
+
start_time = time.time()
|
| 68 |
+
try:
|
| 69 |
+
result = await func(*args, **kwargs)
|
| 70 |
+
elapsed = time.time() - start_time
|
| 71 |
+
logger.info(f"Exiting async {func_name} (took {elapsed:.3f}s)")
|
| 72 |
+
return result
|
| 73 |
+
except Exception as e:
|
| 74 |
+
elapsed = time.time() - start_time
|
| 75 |
+
logger.error(f"{func_name} failed after {elapsed:.3f}s: {e}", exc_info=True)
|
| 76 |
+
raise
|
| 77 |
+
|
| 78 |
+
return log_async_wrapper
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def log_errors(func: F) -> F:
|
| 82 |
+
"""
|
| 83 |
+
Simple decorator to catch and log exceptions.
|
| 84 |
+
|
| 85 |
+
Logs the error, then re-raises it so your code still fails normally.
|
| 86 |
+
|
| 87 |
+
Example:
|
| 88 |
+
>>> @log_errors
|
| 89 |
+
>>> def risky_operation():
|
| 90 |
+
... raise ValueError("Something went wrong")
|
| 91 |
+
>>> risky_operation()
|
| 92 |
+
# Logs the error, then raises it
|
| 93 |
+
"""
|
| 94 |
+
|
| 95 |
+
@functools.wraps(func)
|
| 96 |
+
def log_errors_wrapper(*args, **kwargs):
|
| 97 |
+
try:
|
| 98 |
+
return func(*args, **kwargs)
|
| 99 |
+
except Exception as e:
|
| 100 |
+
logger.error(f"Error in {func.__name__}: {e}", exc_info=True)
|
| 101 |
+
raise
|
| 102 |
+
|
| 103 |
+
return log_errors_wrapper
|
src/job_writing_agent/workflow.py
CHANGED
|
@@ -6,13 +6,13 @@ This module provides the JobWorkflow class and CLI runner.
|
|
| 6 |
import asyncio
|
| 7 |
import logging
|
| 8 |
import sys
|
|
|
|
| 9 |
from datetime import datetime
|
| 10 |
from functools import cached_property
|
| 11 |
from typing import Optional, Dict, Any
|
| 12 |
|
| 13 |
-
from langchain_core.tracers import ConsoleCallbackHandler
|
| 14 |
from langgraph.graph import StateGraph
|
| 15 |
-
from langfuse import Langfuse
|
| 16 |
from langgraph.graph.state import CompiledStateGraph
|
| 17 |
|
| 18 |
from job_writing_agent.agents.nodes import (
|
|
@@ -21,96 +21,274 @@ from job_writing_agent.agents.nodes import (
|
|
| 21 |
finalize_document,
|
| 22 |
human_approval,
|
| 23 |
)
|
| 24 |
-
from job_writing_agent.classes import
|
| 25 |
-
from job_writing_agent.nodes import
|
| 26 |
from job_writing_agent.nodes.research_workflow import research_workflow
|
| 27 |
from job_writing_agent.utils.application_cli_interface import handle_cli
|
| 28 |
from job_writing_agent.utils.result_utils import print_result, save_result
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
logger = logging.getLogger(__name__)
|
| 32 |
|
| 33 |
|
| 34 |
class JobWorkflow:
|
| 35 |
"""
|
| 36 |
-
Workflow
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
"""
|
| 38 |
|
| 39 |
def __init__(self, resume: str, job_description_source: str, content: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
self.resume = resume
|
| 41 |
self.job_description_source = job_description_source
|
| 42 |
self.content = content
|
| 43 |
-
self.dataloading = Dataloading()
|
| 44 |
-
self.langfuse = Langfuse()
|
| 45 |
|
| 46 |
@cached_property
|
| 47 |
-
def app_state(self) ->
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
content
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
-
@cached_property
|
| 60 |
def job_app_graph(self) -> StateGraph:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
graph = StateGraph(DataLoadState)
|
| 62 |
-
|
| 63 |
-
|
|
|
|
| 64 |
graph.add_node("research", research_workflow)
|
| 65 |
graph.add_node("create_draft", create_draft)
|
| 66 |
-
graph.add_node("variations", generate_variations)
|
| 67 |
-
graph.add_node("self_consistency", self_consistency_vote)
|
| 68 |
graph.add_node("critique", critique_draft)
|
| 69 |
graph.add_node("human_approval", human_approval)
|
| 70 |
graph.add_node("finalize", finalize_document)
|
| 71 |
|
| 72 |
-
|
|
|
|
| 73 |
graph.set_finish_point("finalize")
|
| 74 |
-
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
graph.add_edge("research", "create_draft")
|
| 77 |
-
graph.add_edge("create_draft", "
|
| 78 |
-
graph.add_edge("variations", "self_consistency")
|
| 79 |
-
graph.add_edge("self_consistency", "critique")
|
| 80 |
graph.add_edge("critique", "human_approval")
|
| 81 |
graph.add_edge("human_approval", "finalize")
|
|
|
|
| 82 |
return graph
|
| 83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
async def run(self) -> Optional[Dict[str, Any]]:
|
| 85 |
"""
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
"""
|
| 88 |
try:
|
| 89 |
compiled_graph = self.compile()
|
| 90 |
except Exception as e:
|
| 91 |
-
logger.error("Error compiling graph: %s", e)
|
| 92 |
return None
|
| 93 |
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
config = {
|
| 96 |
"configurable": {
|
| 97 |
-
"thread_id":
|
| 98 |
-
"callbacks":
|
| 99 |
"run_name": run_name,
|
| 100 |
-
"
|
|
|
|
| 101 |
},
|
| 102 |
"recursion_limit": 10,
|
| 103 |
}
|
|
|
|
| 104 |
try:
|
| 105 |
-
self.app_state["current_node"] = "
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
graph_output = await compiled_graph.ainvoke(self.app_state, config=config)
|
|
|
|
|
|
|
| 107 |
except Exception as e:
|
| 108 |
-
logger.error("Error running graph: %s", e)
|
| 109 |
return None
|
| 110 |
-
return graph_output
|
| 111 |
|
|
|
|
|
|
|
| 112 |
def compile(self) -> CompiledStateGraph:
|
| 113 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
return self.job_app_graph.compile()
|
| 115 |
|
| 116 |
|
|
@@ -122,9 +300,10 @@ def main():
|
|
| 122 |
content=args.content_type,
|
| 123 |
)
|
| 124 |
result = asyncio.run(workflow.run())
|
|
|
|
| 125 |
if result:
|
| 126 |
-
print_result(args.content_type, result["
|
| 127 |
-
save_result(args.content_type, result["
|
| 128 |
print("Workflow completed successfully.")
|
| 129 |
else:
|
| 130 |
print("Error running workflow.")
|
|
|
|
| 6 |
import asyncio
|
| 7 |
import logging
|
| 8 |
import sys
|
| 9 |
+
import os
|
| 10 |
from datetime import datetime
|
| 11 |
from functools import cached_property
|
| 12 |
from typing import Optional, Dict, Any
|
| 13 |
|
| 14 |
+
from langchain_core.tracers import ConsoleCallbackHandler, LangChainTracer
|
| 15 |
from langgraph.graph import StateGraph
|
|
|
|
| 16 |
from langgraph.graph.state import CompiledStateGraph
|
| 17 |
|
| 18 |
from job_writing_agent.agents.nodes import (
|
|
|
|
| 21 |
finalize_document,
|
| 22 |
human_approval,
|
| 23 |
)
|
| 24 |
+
from job_writing_agent.classes import DataLoadState
|
| 25 |
+
from job_writing_agent.nodes.initializing import data_loading_workflow
|
| 26 |
from job_writing_agent.nodes.research_workflow import research_workflow
|
| 27 |
from job_writing_agent.utils.application_cli_interface import handle_cli
|
| 28 |
from job_writing_agent.utils.result_utils import print_result, save_result
|
| 29 |
+
from job_writing_agent.utils.logging.logging_decorators import (
|
| 30 |
+
log_execution,
|
| 31 |
+
log_errors,
|
| 32 |
+
)
|
| 33 |
|
| 34 |
logger = logging.getLogger(__name__)
|
| 35 |
|
| 36 |
|
| 37 |
class JobWorkflow:
|
| 38 |
"""
|
| 39 |
+
Workflow orchestrator for the job application writer.
|
| 40 |
+
|
| 41 |
+
This class coordinates the execution of the job application writing workflow,
|
| 42 |
+
managing the LangGraph state machine and LangSmith tracing. It follows the
|
| 43 |
+
orchestrator pattern, coordinating multiple subgraphs and nodes without
|
| 44 |
+
implementing business logic itself.
|
| 45 |
+
|
| 46 |
+
The workflow consists of:
|
| 47 |
+
1. Data Loading: Parse resume and job description (parallel subgraph)
|
| 48 |
+
2. Research: Company research and relevance filtering (subgraph)
|
| 49 |
+
3. Draft Creation: Generate initial application material
|
| 50 |
+
4. Critique: AI-powered feedback on the draft
|
| 51 |
+
5. Human Approval: User feedback collection
|
| 52 |
+
6. Finalization: Incorporate feedback and produce final output
|
| 53 |
"""
|
| 54 |
|
| 55 |
def __init__(self, resume: str, job_description_source: str, content: str):
|
| 56 |
+
"""
|
| 57 |
+
Initialize the JobWorkflow orchestrator.
|
| 58 |
+
|
| 59 |
+
Parameters
|
| 60 |
+
----------
|
| 61 |
+
resume: str
|
| 62 |
+
Path to the resume file or resume text.
|
| 63 |
+
job_description_source: str
|
| 64 |
+
URL, file path, or text content of the job description.
|
| 65 |
+
content: str
|
| 66 |
+
Type of application material to generate ("cover_letter", "bullets", "linkedin_note").
|
| 67 |
+
"""
|
| 68 |
self.resume = resume
|
| 69 |
self.job_description_source = job_description_source
|
| 70 |
self.content = content
|
|
|
|
|
|
|
| 71 |
|
| 72 |
@cached_property
|
| 73 |
+
def app_state(self) -> DataLoadState:
|
| 74 |
+
"""
|
| 75 |
+
Get the initial application state for the workflow.
|
| 76 |
+
|
| 77 |
+
Returns
|
| 78 |
+
-------
|
| 79 |
+
DataLoadState
|
| 80 |
+
Initialized state dictionary with resume path, job description source,
|
| 81 |
+
content type, and empty messages list.
|
| 82 |
+
"""
|
| 83 |
+
|
| 84 |
+
return {
|
| 85 |
+
"resume_path": self.resume,
|
| 86 |
+
"job_description_source": self.job_description_source,
|
| 87 |
+
"content": self.content,
|
| 88 |
+
"current_node": "",
|
| 89 |
+
"messages": [],
|
| 90 |
+
"company_research_data": {},
|
| 91 |
+
}
|
| 92 |
|
|
|
|
| 93 |
def job_app_graph(self) -> StateGraph:
|
| 94 |
+
"""
|
| 95 |
+
Build and configure the job application workflow graph.
|
| 96 |
+
|
| 97 |
+
This method constructs the LangGraph state machine with all nodes and edges.
|
| 98 |
+
The graph is cached as a property to avoid rebuilding on each access.
|
| 99 |
+
|
| 100 |
+
Workflow Structure:
|
| 101 |
+
- Entry: Data loading subgraph (parallel resume + job description parsing)
|
| 102 |
+
- Research: Company research subgraph
|
| 103 |
+
- Draft Creation: Generate initial application material
|
| 104 |
+
- Critique: AI feedback on draft
|
| 105 |
+
- Human Approval: User feedback collection
|
| 106 |
+
- Finalization: Produce final output
|
| 107 |
+
- Exit: Finalize node
|
| 108 |
+
|
| 109 |
+
Returns
|
| 110 |
+
-------
|
| 111 |
+
StateGraph
|
| 112 |
+
Configured LangGraph state machine ready for compilation.
|
| 113 |
+
"""
|
| 114 |
graph = StateGraph(DataLoadState)
|
| 115 |
+
|
| 116 |
+
# Add workflow nodes (subgraphs and individual nodes)
|
| 117 |
+
graph.add_node("load", data_loading_workflow)
|
| 118 |
graph.add_node("research", research_workflow)
|
| 119 |
graph.add_node("create_draft", create_draft)
|
|
|
|
|
|
|
| 120 |
graph.add_node("critique", critique_draft)
|
| 121 |
graph.add_node("human_approval", human_approval)
|
| 122 |
graph.add_node("finalize", finalize_document)
|
| 123 |
|
| 124 |
+
# Set entry and exit points
|
| 125 |
+
graph.set_entry_point("load")
|
| 126 |
graph.set_finish_point("finalize")
|
| 127 |
+
|
| 128 |
+
# Conditional routing after data loading
|
| 129 |
+
def route_after_load(state: DataLoadState) -> str:
|
| 130 |
+
"""
|
| 131 |
+
Route based on next_node set by data loading subgraph.
|
| 132 |
+
|
| 133 |
+
The data loading subgraph sets next_node to either "load" (if validation
|
| 134 |
+
fails) or "research" (if validation passes).
|
| 135 |
+
|
| 136 |
+
Parameters
|
| 137 |
+
----------
|
| 138 |
+
state: DataLoadState
|
| 139 |
+
Current workflow state.
|
| 140 |
+
|
| 141 |
+
Returns
|
| 142 |
+
-------
|
| 143 |
+
str
|
| 144 |
+
Next node name: "load" or "research".
|
| 145 |
+
"""
|
| 146 |
+
next_node = state.get("next_node", "research") # Default to research
|
| 147 |
+
logger.info(f"Routing after load: {next_node}")
|
| 148 |
+
return next_node
|
| 149 |
+
|
| 150 |
+
graph.add_conditional_edges(
|
| 151 |
+
"load",
|
| 152 |
+
route_after_load,
|
| 153 |
+
{
|
| 154 |
+
"load": "load", # Loop back to load subgraph if validation fails
|
| 155 |
+
"research": "research", # Proceed to research if validation passes
|
| 156 |
+
},
|
| 157 |
+
)
|
| 158 |
+
|
| 159 |
+
# Sequential edges for main workflow
|
| 160 |
graph.add_edge("research", "create_draft")
|
| 161 |
+
graph.add_edge("create_draft", "critique")
|
|
|
|
|
|
|
| 162 |
graph.add_edge("critique", "human_approval")
|
| 163 |
graph.add_edge("human_approval", "finalize")
|
| 164 |
+
|
| 165 |
return graph
|
| 166 |
|
| 167 |
+
def _get_callbacks(self) -> list:
|
| 168 |
+
"""
|
| 169 |
+
Get list of callbacks including LangSmith tracer with enhanced metadata.
|
| 170 |
+
|
| 171 |
+
This method creates callback handlers for LangGraph execution, including
|
| 172 |
+
LangSmith tracing with workflow-level metadata and tags for better
|
| 173 |
+
observability and filtering in the LangSmith UI.
|
| 174 |
+
|
| 175 |
+
Returns
|
| 176 |
+
-------
|
| 177 |
+
list
|
| 178 |
+
List of callback handlers for LangGraph execution, including:
|
| 179 |
+
- ConsoleCallbackHandler: Console output
|
| 180 |
+
- LangChainTracer: LangSmith tracing (if enabled)
|
| 181 |
+
"""
|
| 182 |
+
callbacks = [ConsoleCallbackHandler()]
|
| 183 |
+
|
| 184 |
+
# Add LangSmith tracer if tracing is enabled via environment variable
|
| 185 |
+
if os.getenv("LANGSMITH_TRACING", "").lower() == "true":
|
| 186 |
+
try:
|
| 187 |
+
# LangChainTracer automatically reads from environment variables:
|
| 188 |
+
# - LANGSMITH_API_KEY
|
| 189 |
+
# - LANGSMITH_PROJECT (optional, defaults to "default")
|
| 190 |
+
# - LANGSMITH_ENDPOINT (optional, defaults to https://api.smith.langchain.com)
|
| 191 |
+
langsmith_tracer = LangChainTracer(
|
| 192 |
+
project_name=os.getenv(
|
| 193 |
+
"LANGSMITH_PROJECT", "job_application_writer"
|
| 194 |
+
)
|
| 195 |
+
)
|
| 196 |
+
callbacks.append(langsmith_tracer)
|
| 197 |
+
logger.info("LangSmith tracing enabled with metadata")
|
| 198 |
+
except Exception as e:
|
| 199 |
+
logger.warning(
|
| 200 |
+
f"Failed to initialize LangSmith tracer: {e}. Continuing without tracing."
|
| 201 |
+
)
|
| 202 |
+
else:
|
| 203 |
+
logger.debug(
|
| 204 |
+
"LangSmith tracing is not enabled (LANGSMITH_TRACING != 'true')"
|
| 205 |
+
)
|
| 206 |
+
|
| 207 |
+
return callbacks
|
| 208 |
+
|
| 209 |
+
@log_execution
|
| 210 |
+
@log_errors
|
| 211 |
async def run(self) -> Optional[Dict[str, Any]]:
|
| 212 |
"""
|
| 213 |
+
Execute the complete job application writer workflow.
|
| 214 |
+
|
| 215 |
+
This method compiles the graph, configures LangSmith tracing with
|
| 216 |
+
enhanced metadata, and executes the workflow. It handles errors
|
| 217 |
+
gracefully and returns the final state or None if execution fails.
|
| 218 |
+
|
| 219 |
+
Returns
|
| 220 |
+
-------
|
| 221 |
+
Optional[Dict[str, Any]]
|
| 222 |
+
Final workflow state containing the generated application material
|
| 223 |
+
in the "output_data" field, or None if execution fails.
|
| 224 |
"""
|
| 225 |
try:
|
| 226 |
compiled_graph = self.compile()
|
| 227 |
except Exception as e:
|
| 228 |
+
logger.error("Error compiling graph: %s", e, exc_info=True)
|
| 229 |
return None
|
| 230 |
|
| 231 |
+
# Prepare enhanced LangSmith metadata and tags
|
| 232 |
+
content = self.app_state.get("content", "cover_letter")
|
| 233 |
+
thread_id = f"job_app_session_{datetime.now():%Y%m%d%H%M%S}"
|
| 234 |
+
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
|
| 235 |
+
|
| 236 |
+
# Enhanced metadata for better trace filtering and analysis
|
| 237 |
+
metadata = {
|
| 238 |
+
"workflow": "job_application_writer",
|
| 239 |
+
"content_type": content,
|
| 240 |
+
"session_id": thread_id,
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
# Enhanced tags for trace organization
|
| 244 |
+
tags = [
|
| 245 |
+
"job-application",
|
| 246 |
+
content,
|
| 247 |
+
]
|
| 248 |
+
|
| 249 |
+
# Descriptive run name for LangSmith UI
|
| 250 |
+
run_name = f"JobAppWriter.{content}.{timestamp}"
|
| 251 |
+
|
| 252 |
config = {
|
| 253 |
"configurable": {
|
| 254 |
+
"thread_id": thread_id,
|
| 255 |
+
"callbacks": self._get_callbacks(),
|
| 256 |
"run_name": run_name,
|
| 257 |
+
"metadata": metadata,
|
| 258 |
+
"tags": tags,
|
| 259 |
},
|
| 260 |
"recursion_limit": 10,
|
| 261 |
}
|
| 262 |
+
|
| 263 |
try:
|
| 264 |
+
self.app_state["current_node"] = "load"
|
| 265 |
+
logger.info(
|
| 266 |
+
f"Starting workflow execution: {run_name} "
|
| 267 |
+
f"(content_type={content}, session_id={thread_id})"
|
| 268 |
+
)
|
| 269 |
graph_output = await compiled_graph.ainvoke(self.app_state, config=config)
|
| 270 |
+
logger.info("Workflow execution completed successfully")
|
| 271 |
+
return graph_output
|
| 272 |
except Exception as e:
|
| 273 |
+
logger.error("Error running graph: %s", e, exc_info=True)
|
| 274 |
return None
|
|
|
|
| 275 |
|
| 276 |
+
@log_execution
|
| 277 |
+
@log_errors
|
| 278 |
def compile(self) -> CompiledStateGraph:
|
| 279 |
+
"""
|
| 280 |
+
Compile the workflow graph into an executable state machine.
|
| 281 |
+
|
| 282 |
+
Returns
|
| 283 |
+
-------
|
| 284 |
+
CompiledStateGraph
|
| 285 |
+
Compiled LangGraph state machine ready for execution.
|
| 286 |
+
|
| 287 |
+
Raises
|
| 288 |
+
------
|
| 289 |
+
Exception
|
| 290 |
+
If graph compilation fails (e.g., invalid edges, missing nodes).
|
| 291 |
+
"""
|
| 292 |
return self.job_app_graph.compile()
|
| 293 |
|
| 294 |
|
|
|
|
| 300 |
content=args.content_type,
|
| 301 |
)
|
| 302 |
result = asyncio.run(workflow.run())
|
| 303 |
+
# print(f"result: {result}")
|
| 304 |
if result:
|
| 305 |
+
print_result(args.content_type, result["output_data"])
|
| 306 |
+
save_result(args.content_type, result["output_data"])
|
| 307 |
print("Workflow completed successfully.")
|
| 308 |
else:
|
| 309 |
print("Error running workflow.")
|
uv.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|