Spaces:
Sleeping
Sleeping
Decim@97 committed on
Commit Β·
2fe451c
1
Parent(s): 758cd70
Askmyresume version 1.0.0
Browse files- .env.example +8 -0
- .gitattributes +27 -0
- .gitignore +141 -0
- README.md +61 -1
- app.py +12 -0
- download.py +42 -0
- ingestion/__init__.py +0 -0
- ingestion/ingest_all.py +33 -0
- ingestion/pdf_loader.py +28 -0
- ingestion/text_loader.py +28 -0
- local_agents/__init__.py +0 -0
- local_agents/escalation_agent.py +48 -0
- local_agents/prompt.py +47 -0
- local_agents/resume_agent.py +99 -0
- notifications/__init__.py +0 -0
- notifications/pushover_client.py +56 -0
- notifications/sendgrid_client.py +61 -0
- rag/__init__.py +0 -0
- rag/build_index.py +52 -0
- rag/chunker.py +55 -0
- rag/embedder.py +52 -0
- rag/main.py +40 -0
- rag/retriever.py +59 -0
- rag/test.py +45 -0
- rag/vector_store.py +89 -0
- requirements.txt +0 -0
- ui/__init__.py +0 -0
- ui/chat_handler.py +47 -0
- ui/contact_handler.py +34 -0
- ui/generate_session.py +4 -0
- ui/gradio_app.py +115 -0
- utils/__init__.py +0 -0
- utils/central_logging.py +51 -0
.env.example
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
OPENAI_API_KEY=
|
| 2 |
+
ANTHROPIC_API_KEY=
|
| 3 |
+
SENDGRID_API_KEY=
|
| 4 |
+
PUSHOVER_USER_KEY=
|
| 5 |
+
PUSHOVER_API_TOKEN=
|
| 6 |
+
SENDGRID_FROM_EMAIL=
|
| 7 |
+
SENDGRID_TO_EMAIL=
|
| 8 |
+
HF_TOKEN=
|
.gitattributes
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 3 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
|
@@ -8,6 +9,8 @@
|
|
| 8 |
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
| 11 |
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
|
@@ -33,3 +36,27 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.avro filter=lfs diff=lfs merge=lfs -text
|
| 4 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 5 |
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 6 |
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 9 |
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 10 |
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 11 |
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.lz4 filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.mds filter=lfs diff=lfs merge=lfs -text
|
| 14 |
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 15 |
*.model filter=lfs diff=lfs merge=lfs -text
|
| 16 |
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 36 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 37 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 38 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
# Audio files - uncompressed
|
| 40 |
+
*.pcm filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
*.sam filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
*.raw filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
# Audio files - compressed
|
| 44 |
+
*.aac filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
*.flac filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
*.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
*.ogg filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
*.wav filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
# Image files - uncompressed
|
| 50 |
+
*.bmp filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
*.gif filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
*.tiff filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
# Image files - compressed
|
| 55 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
*.webp filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
# Video files - compressed
|
| 59 |
+
*.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
*.webm filter=lfs diff=lfs merge=lfs -text
|
| 61 |
+
faiss.index filter=lfs diff=lfs merge=lfs -text
|
| 62 |
+
resume.pdf filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ===============================
|
| 2 |
+
# Python
|
| 3 |
+
# ===============================
|
| 4 |
+
__pycache__/
|
| 5 |
+
*.py[cod]
|
| 6 |
+
*.pyo
|
| 7 |
+
*.pyd
|
| 8 |
+
*.so
|
| 9 |
+
*.egg-info/
|
| 10 |
+
.eggs/
|
| 11 |
+
dist/
|
| 12 |
+
build/
|
| 13 |
+
|
| 14 |
+
# Virtual environments
|
| 15 |
+
.env
|
| 16 |
+
.venv
|
| 17 |
+
venv/
|
| 18 |
+
env/
|
| 19 |
+
myenv/
|
| 20 |
+
ENV/
|
| 21 |
+
|
| 22 |
+
# ===============================
|
| 23 |
+
# Environment & Secrets
|
| 24 |
+
# ===============================
|
| 25 |
+
.env.local
|
| 26 |
+
.env.*.local
|
| 27 |
+
.env.production
|
| 28 |
+
.env.development
|
| 29 |
+
.env.test
|
| 30 |
+
*.key
|
| 31 |
+
*.pem
|
| 32 |
+
|
| 33 |
+
# API keys / credentials
|
| 34 |
+
secrets/
|
| 35 |
+
credentials/
|
| 36 |
+
config/secrets.yaml
|
| 37 |
+
config/secrets.json
|
| 38 |
+
|
| 39 |
+
# ===============================
|
| 40 |
+
# Jupyter / Data Science
|
| 41 |
+
# ===============================
|
| 42 |
+
.ipynb_checkpoints/
|
| 43 |
+
*.ipynb
|
| 44 |
+
|
| 45 |
+
# ===============================
|
| 46 |
+
# ML / AI Artifacts
|
| 47 |
+
# ===============================
|
| 48 |
+
models/
|
| 49 |
+
checkpoints/
|
| 50 |
+
weights/
|
| 51 |
+
*.pt
|
| 52 |
+
*.pth
|
| 53 |
+
*.onnx
|
| 54 |
+
*.joblib
|
| 55 |
+
*.pkl
|
| 56 |
+
|
| 57 |
+
# Vector stores / RAG indexes
|
| 58 |
+
faiss_index/
|
| 59 |
+
chroma/
|
| 60 |
+
vectorstore/
|
| 61 |
+
embeddings/
|
| 62 |
+
|
| 63 |
+
# ===============================
|
| 64 |
+
# Logs & Runtime Files
|
| 65 |
+
# ===============================
|
| 66 |
+
logs/
|
| 67 |
+
*.log
|
| 68 |
+
*.out
|
| 69 |
+
*.err
|
| 70 |
+
|
| 71 |
+
# ===============================
|
| 72 |
+
# Gradio / FastAPI
|
| 73 |
+
# ===============================
|
| 74 |
+
gradio_cached_examples/
|
| 75 |
+
.gradio/
|
| 76 |
+
tmp/
|
| 77 |
+
uploads/
|
| 78 |
+
|
| 79 |
+
# ===============================
|
| 80 |
+
# Cache / Temp
|
| 81 |
+
# ===============================
|
| 82 |
+
.cache/
|
| 83 |
+
.mypy_cache/
|
| 84 |
+
.pytest_cache/
|
| 85 |
+
ruff_cache/
|
| 86 |
+
coverage/
|
| 87 |
+
htmlcov/
|
| 88 |
+
|
| 89 |
+
# ===============================
|
| 90 |
+
# OS / Editor
|
| 91 |
+
# ===============================
|
| 92 |
+
.DS_Store
|
| 93 |
+
Thumbs.db
|
| 94 |
+
.idea/
|
| 95 |
+
.vscode/
|
| 96 |
+
*.swp
|
| 97 |
+
*.swo
|
| 98 |
+
|
| 99 |
+
# ===============================
|
| 100 |
+
# Docker
|
| 101 |
+
# ===============================
|
| 102 |
+
docker-data/
|
| 103 |
+
*.tar
|
| 104 |
+
|
| 105 |
+
# ===============================
|
| 106 |
+
# Deployment
|
| 107 |
+
# ===============================
|
| 108 |
+
*.local
|
| 109 |
+
*.tfstate
|
| 110 |
+
*.tfstate.backup
|
| 111 |
+
.envrc
|
| 112 |
+
|
| 113 |
+
# ===============================
|
| 114 |
+
# Reports / Generated Content
|
| 115 |
+
# ===============================
|
| 116 |
+
reports/
|
| 117 |
+
outputs/
|
| 118 |
+
generated_images/
|
| 119 |
+
charts/
|
| 120 |
+
visualizations/
|
| 121 |
+
|
| 122 |
+
# ===============================
|
| 123 |
+
# Misc
|
| 124 |
+
# ===============================
|
| 125 |
+
*.bak
|
| 126 |
+
*.tmp
|
| 127 |
+
|
| 128 |
+
#==================================
|
| 129 |
+
# FAISS / embeddings
|
| 130 |
+
data/*.index
|
| 131 |
+
data/*.bin
|
| 132 |
+
data/chunks_metadata.json
|
| 133 |
+
|
| 134 |
+
# Resume files
|
| 135 |
+
data/*.pdf
|
| 136 |
+
documents/*.pdf
|
| 137 |
+
|
| 138 |
+
# Assets
|
| 139 |
+
data/*.png
|
| 140 |
+
data/*.jpg
|
| 141 |
+
data/*.jpeg
|
README.md
CHANGED
|
@@ -10,4 +10,64 @@ pinned: false
|
|
| 10 |
short_description: Interactive Resume Chatbot
|
| 11 |
---
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
short_description: Interactive Resume Chatbot
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# AskMyResume π€π
|
| 14 |
+
**Resume-Aware LLM Chatbot with RAG + Human Escalation + Recruiter Contact Bridge**
|
| 15 |
+
|
| 16 |
+
AskMyResume is a recruiter-friendly chatbot powered by **OpenAI Agents SDK** and a **FAISS RAG pipeline**.
|
| 17 |
+
It answers questions about my resume, portfolio, GitHub projects, and LinkedIn summary.
|
| 18 |
+
|
| 19 |
+
If the chatbot is unsure or detects a high-intent recruiter question (salary, relocation, interview request, etc.), it escalates by sending you a **push notification (Pushover)** and an **email (SendGrid)**.
|
| 20 |
+
|
| 21 |
+
---
|
| 22 |
+
|
| 23 |
+
## π Features
|
| 24 |
+
|
| 25 |
+
### β
Resume Q&A (RAG-powered)
|
| 26 |
+
- Loads resume PDF + portfolio + GitHub READMEs + LinkedIn summary
|
| 27 |
+
- Splits text using **RecursiveCharacterTextSplitter**
|
| 28 |
+
- Stores embeddings in **FAISS (cosine similarity)**
|
| 29 |
+
- Retrieves top-k relevant chunks and generates grounded answers
|
| 30 |
+
|
| 31 |
+
### β
Recruiter-Friendly Responses
|
| 32 |
+
- Concise, professional tone
|
| 33 |
+
- Provides sources from retrieved chunks
|
| 34 |
+
- Avoids hallucinations
|
| 35 |
+
|
| 36 |
+
### β
Human Escalation (High Intent Detection)
|
| 37 |
+
Triggers escalation when:
|
| 38 |
+
- confidence score is low
|
| 39 |
+
- question contains keywords (salary, visa, relocation, interview, etc.)
|
| 40 |
+
- recruiter requests contact information
|
| 41 |
+
|
| 42 |
+
Escalation actions:
|
| 43 |
+
- π² Push notification via **Pushover**
|
| 44 |
+
- π§ Email notification via **SendGrid**
|
| 45 |
+
|
| 46 |
+
### β
Recruiter Contact Bridge
|
| 47 |
+
Recruiters can:
|
| 48 |
+
- leave their name, email, company, LinkedIn
|
| 49 |
+
- send a message directly to you
|
| 50 |
+
|
| 51 |
+
### β
Gradio UI (v6.5)
|
| 52 |
+
- Streaming chatbot UI
|
| 53 |
+
- Contact form
|
| 54 |
+
- Resume download button
|
| 55 |
+
- Session IDs
|
| 56 |
+
- Escalation log panel
|
| 57 |
+
- Custom bot avatar (your picture)
|
| 58 |
+
|
| 59 |
+
---
|
| 60 |
+
|
| 61 |
+
## π§± Tech Stack
|
| 62 |
+
|
| 63 |
+
- **Python 3.10+**
|
| 64 |
+
- **OpenAI Agents SDK**
|
| 65 |
+
- **FAISS** (cosine similarity retrieval)
|
| 66 |
+
- **PyPDF / pypdf**
|
| 67 |
+
- **Gradio 6.5**
|
| 68 |
+
- **SendGrid API**
|
| 69 |
+
- **Pushover API**
|
| 70 |
+
|
| 71 |
+
---
|
| 72 |
+
|
| 73 |
+
|
app.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Application entry point: configure logging, fetch assets, launch the Gradio UI."""
from utils.central_logging import setup_logging
from download import download_assets
from ui.gradio_app import launch_ui


# Run at import time so logging and local assets are ready before the UI starts.
# NOTE(review): download.py also calls download_assets() at module import in the
# current code, so assets may be fetched twice — confirm and keep only one call.
setup_logging()
download_assets()

if __name__ == "__main__":
    launch_ui()
download.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from utils.central_logging import get_logger
from huggingface_hub import snapshot_download
from huggingface_hub import login
from dotenv import load_dotenv
import os

logger = get_logger("download")

load_dotenv(override=True)

# HF_TOKEN authenticates against the Hub; required for a private dataset repo.
hf_token = os.getenv("HF_TOKEN")
path = "data"


if hf_token:
    logger.info("Hugging face token has been set")
    # Only attempt login when a token exists — the original called login(None)
    # even after logging that the token was missing.
    try:
        login(hf_token)
        logger.info("Logging in to Hugging face has been successful")
    except Exception as e:
        logger.exception(f"An error occurred when logging in to Hugging face: {e}")
else:
    logger.error("Error no hugging face token found")


def download_assets():
    """Download the asset dataset repo (resume PDF, FAISS index, metadata) into ./data."""
    os.makedirs(path, exist_ok=True)
    repo_id = "Cedric07/data"
    local_download_path = snapshot_download(
        repo_id=repo_id,
        repo_type="dataset",
        local_dir=path,
        local_dir_use_symlinks=False,
    )
    logger.info(f"Downloaded to: {local_download_path}")


# Guarded so importing this module (app.py does) no longer triggers a second
# download on top of app.py's own explicit download_assets() call.
if __name__ == "__main__":
    download_assets()
ingestion/__init__.py
ADDED
|
File without changes
|
ingestion/ingest_all.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json
from ingestion.pdf_loader import extract_text_from_pdf
from ingestion.text_loader import load_text, load_markdown_folder
from utils.central_logging import setup_logging, get_logger


setup_logging()
logger = get_logger("loader")


def ingest_data(resume_pdf_path: str, linked_path: str, github_folder_path: str) -> list[dict]:
    """Load the resume PDF, LinkedIn export PDF, and GitHub READMEs into one flat list.

    Each element is a dict carrying at least "text" and "source" keys, as
    produced by extract_text_from_pdf / load_markdown_folder.
    """
    dataset = []

    # Bug fix: the loaders each return list[dict]; the original append()ed the
    # lists themselves, producing a nested list[list[dict]] that contradicts
    # the declared return type. extend() keeps the result flat.
    # NOTE(review): confirm rag.chunker expects a flat list of dicts.
    dataset.extend(extract_text_from_pdf(resume_pdf_path, "resume.pdf"))
    dataset.extend(extract_text_from_pdf(linked_path, "linked_page.pdf"))
    dataset.extend(load_markdown_folder(github_folder_path, "github_readmes"))

    return dataset


if __name__ == "__main__":

    dataset = ingest_data("./data/resume.pdf", "./data/profile.pdf", "./data")

    with open("./data/dataset.json", "w", encoding="utf-8") as file:
        json.dump(dataset, file, indent=2, ensure_ascii=False)

    logger.info(f"Ingestion completed. Total document loaded: {len(dataset)}")
    logger.info("Saved to: data/dataset.json")
ingestion/pdf_loader.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from pypdf import PdfReader


def extract_text_from_pdf(pdf_path: str, source: str) -> list[dict]:
    """Read every page of *pdf_path* and return one dict per page.

    Each dict has "page" (1-based page number), "text" (stripped page text,
    "" when extraction yields nothing), and "source" (the supplied label).
    """
    reader = PdfReader(pdf_path)
    pages_data = []

    for i, page in enumerate(reader.pages):
        # extract_text() can return None for image-only pages
        text = page.extract_text() or ""
        pages_data.append(
            {
                "page": i + 1,
                "text": text.strip(),
                "source": source
            }
        )

    return pages_data


def pdf_to_text(pdf_path: str, source: str) -> str:
    """Flatten a PDF into a single string with per-page "---Page N---" markers."""
    pages = extract_text_from_pdf(pdf_path, source)
    # Bug fix: the original nested double quotes ({p["page"]}) inside a
    # double-quoted f-string — a SyntaxError on Python < 3.12.
    full_text = "\n\n".join(
        f"---Page {p['page']}--- {p['source']} \n --- \n {p['text']}" for p in pages
    )
    return full_text
ingestion/text_loader.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import glob


def load_text(file_path: str) -> str:
    """Return the stripped contents of *file_path*, or "" when it does not exist."""
    # Removed accidental `from unittest import result` (unused IDE auto-import)
    # and the dead `text = ""` initializer.
    if not os.path.exists(file_path):
        return ""

    with open(file_path, "r", encoding="utf-8") as file:
        return file.read().strip()


def load_markdown_folder(folder_path: str, source: str) -> list[dict]:
    """Load every *.md file directly under *folder_path*.

    Returns one dict per file with "source" (the supplied label), "file"
    (basename), and "text" (stripped contents).
    """
    results = []
    md_files = glob.glob(os.path.join(folder_path, "*.md"))

    for md_file in md_files:
        results.append({
            "source": source,
            "file": os.path.basename(md_file),
            "text": load_text(md_file),
        })

    return results
local_agents/__init__.py
ADDED
|
File without changes
|
local_agents/escalation_agent.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import re

from notifications.pushover_client import PushoverClient
from notifications.sendgrid_client import SendGridClient

# Keywords signalling a high-intent recruiter question worth escalating.
ESCALATION_KEYWORDS = [
    "salary", "compensation", "pay", "rate",
    "visa", "sponsorship", "relocation", "relocate",
    "interview", "availability", "schedule",
    "start date", "notice period",
    "contract", "freelance",
    "phone", "email", "contact", "meeting", "call"
]

# Pre-compiled word-boundary pattern. Bug fix: the original used plain
# substring matching, so short keywords fired inside unrelated words
# ("pay" in "payload", "rate" in "generate", "call" in "locally").
_KEYWORD_PATTERN = re.compile(
    r"\b(?:" + "|".join(re.escape(kw) for kw in ESCALATION_KEYWORDS) + r")\b"
)


class EscalationAgent:
    """Decides when a question must be escalated and fires push + email alerts."""

    def __init__(self, confidence_threshold: float = 0.65):
        # Answers with retrieval confidence below this threshold are escalated.
        self.confidence_threshold = confidence_threshold
        self.pushover = PushoverClient()
        self.sendgrid = SendGridClient()

    def should_escalate(self, question: str, confidence: float) -> bool:
        """Return True when confidence is low or the question hits a sensitive keyword."""
        if confidence < self.confidence_threshold:
            return True
        return _KEYWORD_PATTERN.search(question.lower()) is not None

    def notify(self, question: str, answer: str, confidence: float):
        """Send the escalation alert through both Pushover and SendGrid."""
        title = "Resume Bot Escalation"

        push_message = (
            f"Recruiter Question:\n{question}\n\n"
            f"Confidence: {confidence:.2f}\n\n"
            f"Draft Answer:\n{answer[:800]}"  # Pushover messages are size-limited
        )

        email_subject = "Resume Bot Escalation Alert"
        email_content = (
            f"Recruiter Question:\n{question}\n\n"
            f"Confidence: {confidence:.2f}\n\n"
            f"Draft Answer:\n{answer}\n"
        )

        self.pushover.send(title=title, message=push_message, priority=1)
        self.sendgrid.send_email(subject=email_subject, content=email_content)
local_agents/prompt.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# The agent's full behavioural contract, built once at import time as a
# module-level constant instead of a literal inside the function body.
_SYSTEM_PROMPT = """
You are a recruiter-facing Resume Chatbot for Cheun Da.

You MUST follow these rules:

1. Always call retrieve_resume_context(query) before answering.
2. Answer ONLY from the retrieved chunks.
3. If the answer is not clearly supported by the context:
- do NOT guess
- do NOT hallucinate
- instead say:
"I don’t have that information on the resume. I can notify Cheun Da directly."
- then draft a short helpful response and call notify_candidate()

4. If the question involves any sensitive topic, ALWAYS escalate:
- salary / compensation
- relocation / visa / sponsorship
- interview scheduling / availability
- start date / notice period
- direct contact request
- contract rates / freelance terms

5. When escalating:
- generate a recruiter-friendly draft response
- call notify_candidate(question, draft_answer, confidence)

6. Tone:
- concise
- professional
- recruiter-friendly
- bullet points preferred
- no long essays

7. When possible, include:
- relevant project names
- relevant technologies
- links if they exist in the context

8. If recruiter wants to stay in touch:
- ask for their name, email, Message, company, and optional LinkedIn
- then call save_recruiter_contact()

OUTPUT FORMAT:
- Provide a clear answer
- Optionally include: "Relevant Sources:" with 1–3 short citations (resume, GitHub, LinkedIn)
"""


def get_system_prompt():
    """Return the system prompt that governs the resume agent's behaviour."""
    return _SYSTEM_PROMPT
local_agents/resume_agent.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from typing import List, Dict, Any, Optional
from agents import Agent, Runner, trace, function_tool
from rag.retriever import ResumeRetriever
from local_agents.escalation_agent import EscalationAgent
from local_agents.prompt import get_system_prompt
import os

# Shared singletons used by every tool invocation.
retriever = ResumeRetriever()
escalation_agent = EscalationAgent(confidence_threshold=0.65)


@function_tool
def retrieve_resume_context(query: str, top_k: int) -> Dict[str, Any]:
    """Tool: return the top_k resume chunks most relevant to *query*.

    Result dict: {"confidence": top retrieval score (0.0 when nothing matched),
    "chunks": the retrieved chunk dicts}.
    """
    chunks = retriever.retrieve(query=query, top_k=top_k)

    if not chunks:
        return {
            "confidence": 0.0,
            "chunks": []
        }

    top_score = chunks[0]["score"]

    # Bug fix: this branch previously returned the key "confident", so callers
    # reading "confidence" only ever saw it on the empty-result path.
    return {
        "confidence": float(top_score),
        "chunks": chunks
    }


@function_tool
def notify_candidate(question: str, draft_answer: str, confidence: float) -> str:
    """Tool: escalate a recruiter question to the candidate via push + email."""
    escalation_agent.notify(question, draft_answer, confidence)
    return "Candidate have been notify via push + email"


def _save_recruiter_contact(name: str, email: str, message: str, linkedin: Optional[str] = None, company: Optional[str] = None) -> str:
    """Forward a recruiter's contact details to the candidate (push + email).

    Plain-function form so non-agent callers (e.g. the contact form) can use it.
    """
    summary = (
        f"📩 New recruiter contact request\n\n"
        f"Name: {name}\n"
        f"Email: {email}\n"
        f"Company: {company or 'N/A'}\n"
        f"LinkedIn: {linkedin or 'N/A'}\n\n"
        f"Message:\n{message}\n"
    )

    title = "Recruiter contact request"
    escalation_agent.pushover.send(title=title, message=summary, priority=1)
    escalation_agent.sendgrid.send_email(subject=title, content=summary)

    return "Recruiter contact info received. Cheun Da has been notified."


@function_tool
def save_recruiter_contact(name: str, email: str, message: str, linkedin: Optional[str] = None, company: Optional[str] = None) -> str:
    """Tool wrapper for the agent; delegates to _save_recruiter_contact.

    The original duplicated the entire body in both functions; the
    notification logic now lives in exactly one place.
    """
    return _save_recruiter_contact(name, email, message, linkedin=linkedin, company=company)


resume_agent = Agent(
    name="Resume Aware LLM Chatbot",
    model="gpt-4o-mini",
    instructions=get_system_prompt(),
    tools=[retrieve_resume_context, notify_candidate, save_recruiter_contact]
)


async def answer_resume_question(question: str):
    """Run the agent on *question*, yielding streamed output-text deltas."""
    with trace("resume-agent-stream"):
        result = Runner.run_streamed(resume_agent, input=question)

        async for event in result.stream_events():
            # Only raw model-response events carry streamed text.
            if event.type != "raw_response_event":
                continue

            data = event.data
            if getattr(data, "type", None) == "response.output_text.delta":
                yield data.delta
notifications/__init__.py
ADDED
|
File without changes
|
notifications/pushover_client.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from dotenv import load_dotenv
from utils.central_logging import get_logger
import os
import requests


logger = get_logger("pushover")
load_dotenv(override=True)


pushover_api_token = os.getenv("PUSHOVER_API_TOKEN")
pushover_user_key = os.getenv("PUSHOVER_USER_KEY")

# Fail fast at import time: notifications are useless without credentials.
if pushover_api_token:
    logger.info("Pushover api token has been set")
else:
    # Bug fix: the original message read "has been set" for the missing case.
    message = "Error pushover api token has not been set"
    logger.error(message)
    raise ValueError(message)

if pushover_user_key:
    logger.info("Pushover user key has been set")
else:
    message = "Error pushover user key has not been set"
    logger.error(message)
    raise ValueError(message)


class PushoverClient:
    """Thin wrapper around the Pushover messages API."""

    def __init__(self):
        self.api_token = pushover_api_token
        self.user_key = pushover_user_key

    def send(self, title: str, message: str, priority: int = 1) -> dict:
        """POST a notification and return the parsed JSON response.

        Raises Exception on any non-200 status.
        """
        url = "https://api.pushover.net/1/messages.json"

        payload = {
            "token": self.api_token,
            "user": self.user_key,
            "title": title,
            "message": message,
            "priority": priority
        }

        # timeout so a Pushover outage cannot hang the caller indefinitely
        response = requests.post(url, data=payload, timeout=10)

        if response.status_code != 200:
            message = f"Pushover error: {response.status_code} - {response.text}"
            # logger.error, not .exception — we are not inside an except block
            logger.error(message)
            raise Exception(message)

        return response.json()
notifications/sendgrid_client.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from utils.central_logging import get_logger
from dotenv import load_dotenv
from sendgrid import SendGridAPIClient
from sendgrid.helpers.mail import Mail
import os

logger = get_logger("sendgrid")

load_dotenv(override=True)

sendgrid_api_key = os.getenv("SENDGRID_API_KEY")
sendgrid_from_email = os.getenv("SENDGRID_FROM_EMAIL")
sendgrid_to_email = os.getenv("SENDGRID_TO_EMAIL")

# Fail fast at import time: all three values are required to send mail.
if sendgrid_api_key:
    logger.info("Sendgrid api key has been set")
else:
    message = "Missing sendgrid api key"
    logger.error(message)
    raise ValueError(message)

if sendgrid_from_email:
    logger.info("Sendgrid from email has been set")
else:
    message = "Missing sendgrid from email"
    logger.error(message)
    raise ValueError(message)

if sendgrid_to_email:
    logger.info("Sendgrid to email has been set")
else:
    message = "Missing sendgrid to email"
    logger.error(message)
    raise ValueError(message)


class SendGridClient:
    """Thin wrapper around the SendGrid v3 mail-send API."""

    def __init__(self):
        self.sendgrid_api_key = sendgrid_api_key
        self.from_email = sendgrid_from_email
        self.to_email = sendgrid_to_email
        self.client = SendGridAPIClient(self.sendgrid_api_key)

    def send_email(self, subject: str, content: str) -> int:
        """Send a plain-text email and return the HTTP status code.

        Raises Exception on any status other than 200/202 (202 = accepted).
        """
        message = Mail(
            from_email=self.from_email,
            to_emails=self.to_email,
            subject=subject,
            plain_text_content=content
        )

        response = self.client.send(message)

        # central logger instead of print so output lands in the shared log
        logger.info(f"status code: {response.status_code}")

        if response.status_code not in (200, 202):
            message = f"SendGrid failed: {response.status_code} {response.body}"
            # logger.error, not .exception — not inside an except block
            logger.error(message)
            raise Exception(message)

        return response.status_code
rag/__init__.py
ADDED
|
File without changes
|
rag/build_index.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
from rag.embedder import OpenaiEmbedder
|
| 4 |
+
from rag.vector_store import FaissVectorStore
|
| 5 |
+
from rag.chunker import chunk_document
|
| 6 |
+
from utils.central_logging import setup_logging,get_logger
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
setup_logging()
|
| 11 |
+
logger = get_logger("index_building")
|
| 12 |
+
|
| 13 |
+
DATASET_PATH = "./data/dataset.json"
|
| 14 |
+
FAISS_INDEX_PATH = "./data/faiss.index"
|
| 15 |
+
CHUNKS_METADATA_PATH = "./data/chunks_metadata.json"
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def build_index(chunk_size: int = 800, chunk_overlap: int = 150):
    """Build the FAISS index and chunk metadata from the JSON dataset.

    Args:
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Overlap in characters between consecutive chunks.

    Raises:
        FileNotFoundError: if the dataset file is missing.
        ValueError: if chunking produces no chunks.
    """
    if not os.path.exists(DATASET_PATH):
        message = f"Missing dataset file: {DATASET_PATH}"
        logger.error(message)
        raise FileNotFoundError(message)

    with open(DATASET_PATH, "r", encoding="utf-8") as file:
        dataset = json.load(file)

    chunks = chunk_document(dataset, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    if not chunks:
        message = "No chunks were created. Check dataset content."
        logger.error(message)
        raise ValueError(message)

    texts = [chunk["text"] for chunk in chunks]

    embedder = OpenaiEmbedder(model="text-embedding-3-small")
    embedding = embedder.embed_multiple_texts(texts)

    # The index dimension is inferred from the first embedding vector.
    dim = len(embedding[0])

    store = FaissVectorStore(dim)
    store.add_embedding(embedding, chunks)

    # Make sure the output directory exists before writing the index files.
    os.makedirs(os.path.dirname(FAISS_INDEX_PATH), exist_ok=True)

    # Log before saving (previously this was logged after the save completed,
    # which was misleading on failure).
    logger.info("Saving FAISS index + chunk metadata...")
    store.save_data(FAISS_INDEX_PATH, CHUNKS_METADATA_PATH)


if __name__ == "__main__":
    build_index(chunk_size=800, chunk_overlap=150)
|
rag/chunker.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import hashlib
|
| 2 |
+
from typing import List, Dict
|
| 3 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def generate_chink_id(text: str, prefix: str = "chunk") -> str:
    """Return a deterministic chunk id: `<prefix>_<first 12 hex chars of md5(text)>`.

    The same text with the same prefix always yields the same id, so ids are
    stable across rebuilds but collide for identical inputs.
    """
    # NOTE(review): name contains a typo ("chink" -> "chunk"); kept as-is
    # because chunk_document() calls it by this name.
    h = hashlib.md5(text.encode("utf-8")).hexdigest()[:12]
    return f"{prefix}_{h}"
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def chunk_document(dataset: List[Dict], chunk_size: int = 800, chunk_overlap: int = 150) -> List[Dict]:
    """Split every document in the dataset into overlapping text chunks.

    Args:
        dataset: Nested list of document dicts; each document has a "text"
            field and optional "source", "page" and "file" metadata.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Overlap in characters between consecutive chunks.

    Returns:
        Flat list of chunk dicts with a unique "chunk_id", the chunk "text",
        and the originating document metadata.
    """
    all_chunks = []
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ". ", " ", ""]
    )

    for docs in dataset:
        for doc in docs:
            raw_text = doc.get("text", "")

            if not raw_text or not raw_text.strip():
                continue

            chunks = splitter.split_text(raw_text)

            for i, chunk in enumerate(chunks):
                chunk = chunk.strip()
                if not chunk:
                    continue

                prefix = f"{doc.get('source', 'unknown')}"

                if doc.get("page") is not None:
                    prefix += f"_page{doc['page']}"

                if doc.get("file") is not None:
                    prefix += f"_{doc['file']}"

                # Bug fix: hash the chunk itself (with its position in the
                # prefix) rather than the whole document text -- previously
                # every chunk of a document received the same id.
                chunk_id = generate_chink_id(chunk, f"{prefix}_{i}")

                all_chunks.append({
                    "chunk_id": chunk_id,
                    # Bug fix: store the chunk text, not the full document
                    # text, otherwise every chunk carried the whole document.
                    "text": chunk,
                    "source": doc.get("source"),
                    "page": doc.get("page"),
                    "file": doc.get("file")
                })

    return all_chunks
|
| 55 |
+
|
rag/embedder.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from utils.central_logging import get_logger
|
| 2 |
+
from openai import OpenAI
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
logger = get_logger("embedding")
|
| 7 |
+
|
| 8 |
+
load_dotenv(override=True)
|
| 9 |
+
|
| 10 |
+
openai_api_key = os.getenv("OPENAI_API_KEY")
|
| 11 |
+
|
| 12 |
+
if openai_api_key:
|
| 13 |
+
logger.info("Openai api key has been set")
|
| 14 |
+
else:
|
| 15 |
+
logger.error("Openai api key has not been found")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class OpenaiEmbedder:
    """Wrapper around the OpenAI embeddings endpoint."""

    def __init__(self, model: str = "text-embedding-3-small"):
        self.client = OpenAI(api_key=openai_api_key)
        self.model = model

    def embed_text(self, text: str) -> list[float]:
        """Embed a single text and return its embedding vector.

        Raises:
            ValueError: if the text is empty or whitespace-only.
        """
        # Robustness fix: also reject whitespace-only strings, which
        # previously slipped past the emptiness check and hit the API.
        if not text or not text.strip():
            message = "Can not be an empty text"
            logger.error(message)
            raise ValueError(message)

        response = self.client.embeddings.create(
            model=self.model,
            input=text
        )

        return response.data[0].embedding

    def embed_multiple_texts(self, texts: list[str]) -> list[list[float]]:
        """Embed a batch of texts, preserving input order.

        Raises:
            ValueError: if the list is empty.
        """
        if not texts:
            message = "Can not be an empty list of texts"
            logger.error(message)
            raise ValueError(message)

        response = self.client.embeddings.create(
            model=self.model,
            input=texts
        )

        return [item.embedding for item in response.data]
|
rag/main.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from rag.embedder import OpenaiEmbedder
|
| 2 |
+
from rag.vector_store import FaissVectorStore
|
| 3 |
+
from rag.retriever import ResumeRetriever
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
FAISS_INDEX_PATH = "data/faiss.index"
|
| 7 |
+
CHUNKS_METADATA_PATH = "data/chunks_metadata.json"
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def main():
    """Interactive smoke test: load the persisted FAISS index and answer one query.

    Reads a question from stdin and prints the top-5 retrieved chunks.
    """

    # Must match the dimension of the persisted embeddings
    # (text-embedding-3-small vectors).
    dim = 1536

    store = FaissVectorStore(dim=dim)
    store.load(FAISS_INDEX_PATH, CHUNKS_METADATA_PATH)

    print(f"Index loaded. Total vectors: {store.index.ntotal}")

    embedder = OpenaiEmbedder(model="text-embedding-3-small")
    retriever = ResumeRetriever(embedder, store)

    query = input("\nAsk a resume question: ")

    results = retriever.retrieve(query, top_k=5)

    print("\nπ Top Results:\n")
    for i, r in enumerate(results, start=1):
        print(f"#{i}")
        print(f"Score: {r['score']:.4f}")
        print(f"Source: {r['source']}")
        print(f"Page: {r.get('page')}")
        print(f"File: {r.get('file')}")
        print(f"Chunk ID: {r.get('chunk_id')}")
        print("Text Preview:")
        print(r["text"][:400])
        print("-" * 60)


if __name__ == "__main__":
    main()
|
rag/retriever.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from rag.embedder import OpenaiEmbedder
|
| 2 |
+
from rag.vector_store import FaissVectorStore
|
| 3 |
+
from typing import List, Dict, Any, Optional
|
| 4 |
+
|
| 5 |
+
class ResumeRetriever:
    """Singleton retriever: embeds a query and searches the FAISS vector store."""

    DIM = 1536  # dimension of text-embedding-3-small vectors
    FAISS_INDEX_PATH = "./data/faiss.index"
    CHUNKS_METADATA_PATH = "./data/chunks_metadata.json"
    EMBEDDING_MODEL = "text-embedding-3-small"

    # Shared singleton instance (see __new__).
    _instance = None

    def __new__(cls, *args, **kwargs):
        # Bug fix: accept (and ignore) constructor arguments. The previous
        # (cls)-only signature made ResumeRetriever(embedder, store) raise
        # TypeError, yet callers (rag/main.py, rag/test.py) pass arguments.
        if cls._instance is None:
            cls._instance = super(ResumeRetriever, cls).__new__(cls)
            cls._instance._initialized = False
        return cls._instance

    def __init__(self, embedder: Optional[OpenaiEmbedder] = None, vector_store: Optional[FaissVectorStore] = None):
        # NOTE(review): because of the singleton, arguments passed on any
        # construction after the first are silently ignored.
        if self._initialized:
            return

        if embedder is None:
            embedder = OpenaiEmbedder(model=self.EMBEDDING_MODEL)

        if vector_store is None:
            vector_store = FaissVectorStore(dim=self.DIM)
            vector_store.load(self.FAISS_INDEX_PATH, self.CHUNKS_METADATA_PATH)

        self.embedder = embedder
        self.vector_store = vector_store
        self._initialized = True

    def retrieve(self, query: str, top_k: int = 5) -> list[dict]:
        """Return the top_k most similar chunks for `query`.

        Blank or empty queries return an empty list without hitting the API.
        """
        formatted = []

        if not query or not query.strip():
            return []

        query_embedding = self.embedder.embed_text(query)
        results = self.vector_store.search(query_embedding, top_k=top_k)

        for r in results:
            meta = r["metadata"]
            formatted.append({
                "score": r["score"],
                "source": meta.get("source", "unknown"),
                "text": meta.get("text", ""),
                "page": meta.get("page", ""),
                "file": meta.get("file", ""),
                "chunk_id": meta.get("chunk_id", ""),
                "metadata": meta
            })

        return formatted
|
| 59 |
+
|
rag/test.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
from rag.embedder import OpenaiEmbedder
from rag.vector_store import FaissVectorStore
from rag.retriever import ResumeRetriever


# Ad-hoc smoke test: embed the raw dataset documents (no chunking), build a
# FAISS index, persist and reload it, then run a single retrieval query.

with open("./data/dataset.json", "r", encoding="utf-8") as file:
    dataset = json.load(file)

metadatas = []
texts = []

# The dataset is a nested list: each element is itself a list of doc dicts.
for docs in dataset:
    for doc in docs:
        texts.append(doc["text"])
        metadatas.append(doc)


embedder = OpenaiEmbedder()
embeddings = embedder.embed_multiple_texts(texts)


# Infer the embedding dimension from the first vector.
dim = len(embeddings[0])
store = FaissVectorStore(dim=dim)

store.add_embedding(embeddings, metadatas)


store.save_data("./data/faiss.index", "./data/metadata.json")

# Round-trip check: reload the persisted index into a fresh store.
store2 = FaissVectorStore(dim=dim)
store2.load("data/faiss.index", "data/metadata.json")

retriever = ResumeRetriever(embedder, store2)

query = "What is knowbot?"
results = retriever.retrieve(query, top_k=5)


print("\nTop Results:\n")
for r in results:
    print("Score:", r["score"])
    print("Source:", r["source"])
    print("Preview:", r["text"][:200])
    print("-" * 50)
|
rag/vector_store.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from utils.central_logging import get_logger
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import faiss
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
logger = get_logger("vectore_store")
|
| 9 |
+
|
| 10 |
+
def normalize_vectors(vectors: np.ndarray) -> np.ndarray:
    """L2-normalize `vectors` IN PLACE and return the same array.

    faiss.normalize_L2 mutates its argument; the return value is the input
    array itself, provided only for call-chaining convenience.
    """
    faiss.normalize_L2(vectors)
    return vectors
|
| 13 |
+
|
| 14 |
+
class FaissVectorStore:
    """FAISS inner-product index with a parallel chunk-metadata list.

    Vectors are L2-normalized before insertion, so inner-product scores are
    cosine similarities. metadata[i] describes the vector at index position i.
    """

    def __init__(self, dim: int):
        self.dim = dim
        self.index = faiss.IndexFlatIP(dim)
        self.metadata = []

    def add_embedding(self, embeddings: list[list[float]], metadatas: list[dict]):
        """Add embeddings and their metadata to the index.

        Raises:
            ValueError: if the embedding list is empty or lengths differ.
        """
        if len(embeddings) == 0:
            message = "Embedding list is empty"
            logger.error(message)
            raise ValueError(message)

        if len(embeddings) != len(metadatas):
            message = "Embeddings and metadata must have same length."
            logger.error(message)
            raise ValueError(message)

        vectors = np.array(embeddings, dtype="float32")
        vectors = normalize_vectors(vectors)
        self.index.add(vectors)
        self.metadata.extend(metadatas)

    def search(self, query_embedding: list[float], top_k: int = 5):
        """Return up to top_k results as [{"score", "metadata"}], best first."""
        results = []

        if self.index.ntotal == 0:
            return []

        query_vec = np.array([query_embedding], dtype="float32")
        # Bug fix: normalize the query vector too. Stored vectors are
        # L2-normalized, so an unnormalized query made the inner-product
        # scores scale with query magnitude instead of cosine similarity.
        query_vec = normalize_vectors(query_vec)
        distances, indices = self.index.search(query_vec, top_k)

        for dist, idx in zip(distances[0], indices[0]):
            # FAISS pads with -1 when fewer than top_k vectors exist.
            if idx == -1:
                continue

            results.append({
                "score": float(dist),
                "metadata": self.metadata[idx]
            })

        return results

    def save_data(self, index_path: str, metadata_path: str):
        """Persist the FAISS index and the metadata JSON to disk."""
        faiss.write_index(self.index, index_path)

        with open(metadata_path, "w", encoding="utf-8") as file:
            json.dump(self.metadata, file, indent=2, ensure_ascii=False)

    def load(self, index_path: str, metadata_path: str):
        """Load a persisted index + metadata, replacing current contents.

        Raises:
            FileNotFoundError: if either file is missing.
        """
        if not os.path.exists(index_path):
            message = f"FAISS index file not found: {index_path}"
            logger.error(message)
            raise FileNotFoundError(message)

        if not os.path.exists(metadata_path):
            message = f"Metadata file not found: {metadata_path}"
            logger.error(message)
            raise FileNotFoundError(message)

        self.index = faiss.read_index(index_path)

        with open(metadata_path, 'r', encoding="utf-8") as file:
            self.metadata = json.load(file)
|
| 88 |
+
|
| 89 |
+
|
requirements.txt
ADDED
|
Binary file (3.71 kB). View file
|
|
|
ui/__init__.py
ADDED
|
File without changes
|
ui/chat_handler.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from local_agents.resume_agent import answer_resume_question
|
| 2 |
+
from utils.central_logging import get_logger
|
| 3 |
+
from ui.generate_session import generate_session_id
|
| 4 |
+
|
| 5 |
+
logger = get_logger("chat_handler")
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
async def chat_handler(message, history, session_id):
    """Stream an answer for `message` into the chat history.

    Yields (history, session_id, textbox_value) triples so Gradio can update
    the chatbot, preserve the session id, and clear the input box.
    """
    if not session_id:
        session_id = generate_session_id()

    if history is None:
        history = []

    history.append({"role": "user", "content": message})

    partial_answer = ""

    try:
        async for token in answer_resume_question(message):
            partial_answer += token

            # Update the streaming assistant turn in place.
            if len(history) > 0 and history[-1]["role"] == "assistant":
                history[-1]["content"] = partial_answer
            else:
                history.append({"role": "assistant", "content": partial_answer})

            yield history, session_id, ""

        # Heuristic escalation detection based on the model's wording.
        if "notified" in partial_answer.lower() and "anthony" in partial_answer.lower():
            logger.info({
                "type": "escalation",
                "session_id": session_id,
                "question": message,
                "answer_preview": partial_answer[:500]
            })

    except Exception as e:
        note = f"An error occured: {str(e)}"
        logger.exception(note)
        # Bug fix: attach the error as an assistant turn instead of
        # overwriting the last entry, which was the *user's* message when the
        # failure happened before any token was streamed.
        if history and history[-1]["role"] == "assistant":
            history[-1]["content"] = note
        else:
            history.append({"role": "assistant", "content": note})
        # Bug fix: yield the same 3-tuple shape as the success path; the old
        # 2-tuple did not match Gradio's three output components.
        yield history, session_id, ""
|
ui/contact_handler.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from local_agents.resume_agent import _save_recruiter_contact
|
| 2 |
+
from ui.generate_session import generate_session_id
|
| 3 |
+
from utils.central_logging import get_logger
|
| 4 |
+
|
| 5 |
+
logger = get_logger("contact_handler")
|
| 6 |
+
|
| 7 |
+
def contact_handler(name, email, message, linkedin, company, session_id):
    """Handle the recruiter contact form submission.

    Always returns a 6-tuple (status, name, email, message, linkedin, company)
    to match the six Gradio output components: field values are cleared ("")
    on success and preserved on validation failure.
    """
    if not session_id:
        session_id = generate_session_id()

    if not name or not email or not message:
        # Bug fix: return the full 6-tuple (keeping what the user typed);
        # returning a bare string mismatched Gradio's six output components.
        return "β οΈ Name, Email, and Message are required.", name, email, message, linkedin, company

    # Use the logger instead of print so submissions land in the log file.
    logger.info("contact form: name=%s email=%s linkedin=%s company=%s", name, email, linkedin, company)

    try:
        result = _save_recruiter_contact(
            name,
            email,
            message,
            linkedin=linkedin if linkedin else None,
            company=company if company else None,
        )
        logger.info({
            "type": "contact_request",
            "session_id": session_id,
            "recruiter_name": name,
            "recruiter_email": email,
            "company": company,
            "linkedin": linkedin,
            "message_preview": message[:500]
        })
        return f"β Thanks! Your message has been sent.\n\n{result}", "", "", "", "", ""
    except Exception as e:
        note = f"β οΈ Failed to send contact info: {str(e)}"
        # Bug fix: logger.exception records the traceback; logger.info hid it.
        logger.exception(note)
        return note, "", "", "", "", ""
|
ui/generate_session.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import uuid
|
| 2 |
+
|
| 3 |
+
def generate_session_id():
    """Return a short random session identifier (8 hex characters)."""
    return uuid.uuid4().hex[:8]
|
ui/gradio_app.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ui.generate_session import generate_session_id
|
| 2 |
+
from utils.central_logging import get_logger
|
| 3 |
+
from ui.chat_handler import chat_handler
|
| 4 |
+
from ui.contact_handler import contact_handler
|
| 5 |
+
import gradio as gr
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
RESUME_FILE_PATH = "./data/resume.pdf"
|
| 10 |
+
USER_AVATAR = None
|
| 11 |
+
BOT_AVATAR = "./data/thony.png"
|
| 12 |
+
|
| 13 |
+
logger = get_logger("gradio")
|
| 14 |
+
|
| 15 |
+
def get_resume_file():
    """Return the resume PDF path when it exists on disk, otherwise None."""
    return RESUME_FILE_PATH if os.path.exists(RESUME_FILE_PATH) else None
|
| 19 |
+
|
| 20 |
+
def reset_session():
    """Mint a fresh session id for the 'odd New Session' button."""
    new_id = generate_session_id()
    return new_id
|
| 22 |
+
|
| 23 |
+
css = """
|
| 24 |
+
.avatar-container {
|
| 25 |
+
width:80px !important;
|
| 26 |
+
height: 80px !important;
|
| 27 |
+
}
|
| 28 |
+
"""
|
| 29 |
+
|
| 30 |
+
def launch_ui():
    """Build and launch the Gradio UI: streaming chat, recruiter contact form,
    and resume download tabs."""
    with gr.Blocks(title="ResumeBot - Cheun Da", theme=gr.themes.Soft(), css=css) as demo:

        gr.Markdown(
            """
            #AskMyResume (Cheun Da)

            Answer questions about my resume**.

            **Features**
            - π¬ Chat Resume Q&A
            - π© Stay in touch form
            - π Resume download """ )

        # One session id per page load, kept in Gradio state.
        session_state = gr.State(generate_session_id())

        with gr.Row():
            session_display = gr.Textbox(
                label="Session Initialisation",
                interactive=False
            )
            reset_session_btn = gr.Button("π New Session")

        # Show the initial session id once the page loads.
        demo.load(
            fn=lambda sid: sid,
            inputs=session_state,
            outputs=session_display
        )

        # New session: mint a fresh id, then mirror it into the display box.
        reset_session_btn.click(
            fn=reset_session,
            outputs=session_state
        ).then(
            fn=lambda sid: sid,
            inputs=session_state,
            outputs=session_display
        )

        with gr.Tabs():
            with gr.Tab("π¬ Chat"):
                chatbot = gr.Chatbot(label="Resume Chat", height=450, avatar_images=(USER_AVATAR, BOT_AVATAR))
                msg = gr.Textbox(label="Ask a question", placeholder="e.g. What projects have you worked on?")
                #send_btn = gr.Button("Send")

                #send_btn.click(
                #    fn=chat_handler,
                #    inputs=[msg, chatbot, session_state],
                #    outputs=[chatbot, session_state,msg])

                # chat_handler is an async generator yielding
                # (history, session_id, "") triples as tokens stream in.
                msg.submit(
                    fn=chat_handler,
                    inputs=[msg, chatbot, session_state],
                    outputs=[chatbot, session_state, msg]
                )

                clear_btn = gr.Button("π§Ή Clear Chat")
                clear_btn.click(lambda: [], outputs=chatbot)

            with gr.Tab("π© Stay In Touch"):
                gr.Markdown("### Recruiter Contact Form (Message Cheun Da Directly)")

                name = gr.Textbox(label="Your Name *")
                email = gr.Textbox(label="Your Email *")
                company = gr.Textbox(label="Company (optional)")
                linkedin = gr.Textbox(label="LinkedIn Profile (optional)")
                message = gr.Textbox(label="Message *", lines=5)

                contact_btn = gr.Button("Send Message")
                contact_status = gr.Textbox(label="Status", interactive=False)

                # contact_handler returns the status plus the field values.
                contact_btn.click(
                    fn=contact_handler,
                    inputs=[name, email, message, linkedin, company, session_state],
                    outputs=[contact_status, name, email, message, linkedin, company]
                )

            with gr.Tab("π Download Resume"):
                gr.Markdown("### Download Resume PDF")
                resume_file = gr.File(label="Resume File", value=get_resume_file)

        # Queueing is required for streaming generators; cap concurrency at 64.
        demo.queue(default_concurrency_limit=64)
        demo.launch(debug=True, share=False)
|
| 114 |
+
|
| 115 |
+
|
utils/__init__.py
ADDED
|
File without changes
|
utils/central_logging.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import logging.handlers
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
LOG_DIR = Path("logs")
|
| 6 |
+
LOG_DIR.mkdir(exist_ok=True)
|
| 7 |
+
|
| 8 |
+
LOG_FILE = LOG_DIR / "advisor.log"
|
| 9 |
+
|
| 10 |
+
LOG_FORMAT = (
|
| 11 |
+
"%(asctime)s | %(levelname)s | %(name)s | "
|
| 12 |
+
"%(funcName)s:%(lineno)d | %(message)s"
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def setup_logging(log_level=logging.INFO):
    """Global logging configuration.

    Configures the root logger with a rotating file handler (logs/advisor.log,
    10 MB x 5 backups) and a console handler, both using LOG_FORMAT. Safe to
    call more than once: it returns early if handlers are already attached.
    """

    root_logger = logging.getLogger()
    root_logger.setLevel(log_level)

    # Prevent duplicate logs in notebooks / reloads
    if root_logger.handlers:
        return

    formatter = logging.Formatter(LOG_FORMAT, DATE_FORMAT)

    # ---- File Handler (advisor.log) ----
    file_handler = logging.handlers.RotatingFileHandler(
        LOG_FILE,
        maxBytes=10 * 1024 * 1024,  # 10 MB
        backupCount=5,
        encoding="utf-8",
    )
    file_handler.setFormatter(formatter)
    file_handler.setLevel(log_level)

    # ---- Console Handler ----
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    console_handler.setLevel(log_level)

    root_logger.addHandler(file_handler)
    root_logger.addHandler(console_handler)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def get_logger(name: str) -> logging.Logger:
    """Return the logger registered under *name*."""
    named_logger = logging.getLogger(name)
    return named_logger
|