Spaces:
Running
Running
github-actions[bot] committed on
Commit ·
0064c35
0
Parent(s):
Deploy snapshot to HF (binaries stripped)
Browse files- .github/workflows/push_to_hf_space.yml +62 -0
- .gitignore +9 -0
- .gradio/certificate.pem +31 -0
- README.md +11 -0
- app.py +414 -0
- exp.ipynb +0 -0
- rag_core/FusedRetreiver.py +107 -0
- rag_core/PrefixRetreiver.py +59 -0
- rag_core/config.py +13 -0
- rag_core/crawler.py +54 -0
- rag_core/embeddings_model.py +18 -0
- rag_core/evaluator.py +85 -0
- rag_core/evaluator_schema.py +11 -0
- rag_core/index_builder.py +300 -0
- rag_core/models_groq.py +15 -0
- rag_core/rag_chain.py +200 -0
- rag_core/rag_chain_helper.py +137 -0
- rag_core/sources.py +25 -0
- requirements.txt +14 -0
.github/workflows/push_to_hf_space.yml
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
name: Sync to Hugging Face Space (always strip binaries)

on:
  push:
    branches: [ "hf-deploy" ]
  workflow_dispatch:

jobs:
  deploy:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout source
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: hf-deploy

      - name: Create clean snapshot (remove .pkl/.faiss no matter what)
        run: |
          set -euxo pipefail

          # Export current commit as plain files (no .git history)
          rm -rf /tmp/snapshot
          mkdir -p /tmp/snapshot
          git archive --format=tar HEAD | tar -x -C /tmp/snapshot

          echo "== Files matching .pkl/.faiss BEFORE removal =="
          (cd /tmp/snapshot && find . -type f \( -name "*.pkl" -o -name "*.faiss" \) -print) || true

          # Always delete these binaries (even if someone committed them)
          (cd /tmp/snapshot && find . -type f \( -name "*.pkl" -o -name "*.faiss" \) -print -delete) || true

          echo "== Files matching .pkl/.faiss AFTER removal =="
          (cd /tmp/snapshot && find . -type f \( -name "*.pkl" -o -name "*.faiss" \) -print) || true

          # Ensure they won't be committed into the snapshot repo
          printf "\n# Never deploy vectorstore binaries to Spaces\n*.pkl\n*.faiss\n" >> /tmp/snapshot/.gitignore

          # Build a new single-commit git repo from the cleaned snapshot
          cd /tmp/snapshot
          git init
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git add -A
          git commit -m "Deploy snapshot to HF (binaries stripped)"

      - name: Push snapshot to Hugging Face Space (HF main)
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          HF_USERNAME: ritup3 # CHANGE if needed
          SPACE_NAME: PersonaRag # CHANGE if needed
        run: |
          # SECURITY FIX: no `-x` in this step. xtrace would echo the remote
          # URL — which embeds HF_TOKEN — into the public Actions log.
          set -euo pipefail
          cd /tmp/snapshot

          # Authenticated remote for HF Space (token stays out of the trace)
          git remote add space "https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$SPACE_NAME"

          # Spaces deploy from branch "main"
          git branch -M main
          git push space main --force
|
.gitignore
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__pycache__/
*.pyc
.env
.DS_Store

# Never deploy vectorstore binaries to Spaces
# (deduplicated: *.faiss and *.pkl were previously listed twice)
*.faiss
*.pkl
|
.gradio/certificate.pem
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-----BEGIN CERTIFICATE-----
|
| 2 |
+
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
|
| 3 |
+
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
|
| 4 |
+
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
|
| 5 |
+
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
|
| 6 |
+
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
|
| 7 |
+
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
|
| 8 |
+
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
|
| 9 |
+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
|
| 10 |
+
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
|
| 11 |
+
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
|
| 12 |
+
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
|
| 13 |
+
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
|
| 14 |
+
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
|
| 15 |
+
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
|
| 16 |
+
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
|
| 17 |
+
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
|
| 18 |
+
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
|
| 19 |
+
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
|
| 20 |
+
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
|
| 21 |
+
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
|
| 22 |
+
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
|
| 23 |
+
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
|
| 24 |
+
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
|
| 25 |
+
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
|
| 26 |
+
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
|
| 27 |
+
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
|
| 28 |
+
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
|
| 29 |
+
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
|
| 30 |
+
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
|
| 31 |
+
-----END CERTIFICATE-----
|
README.md
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: PersonaRag
|
| 3 |
+
emoji: "🤖"
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: "5.49.1"
|
| 8 |
+
python_version: "3.10.13"
|
| 9 |
+
app_file: app.py
|
| 10 |
+
pinned: false
|
| 11 |
+
---
|
app.py
ADDED
|
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app.py
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
import logging
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
import gradio as gr
|
| 8 |
+
import time
|
| 9 |
+
import threading
|
| 10 |
+
import atexit
|
| 11 |
+
|
| 12 |
+
from rag_core.index_builder import load_vectorstore
|
| 13 |
+
from rag_core.rag_chain_helper import rewrite_question_with_history
|
| 14 |
+
from rag_core.rag_chain import build_rag_chain
|
| 15 |
+
from rag_core.evaluator import evaluate_answer
|
| 16 |
+
from rag_core.index_builder import build_and_save_index
|
| 17 |
+
|
| 18 |
+
from rag_core.config import VECTORSTORE_PATH
|
| 19 |
+
|
| 20 |
+
# ---------- Refresh config ----------
# BUGFIX: load .env BEFORE reading any REFRESH_* variables. Previously
# load_dotenv() ran after the os.getenv() calls below, so values supplied
# via a .env file were silently ignored.
load_dotenv()

# Master switch for the background re-index thread.
REFRESH_ENABLED = os.getenv("REFRESH_ENABLED", "true").lower() == "true"
# Fallback refresh interval in seconds; defaults to one day.
REFRESH_INTERVAL_SECONDS = int(os.getenv("REFRESH_INTERVAL_SECONDS", str(24 * 60 * 60)))
# Local wall-clock time at which the daily refresh fires (default 03:00).
REFRESH_AT_HOUR = int(os.getenv("REFRESH_AT_HOUR", "3"))
REFRESH_AT_MINUTE = int(os.getenv("REFRESH_AT_MINUTE", "0"))
# Forwarded to logs only in this module; presumably consumed by the crawler —
# TODO confirm against rag_core.crawler.
REFRESH_ONLY_FIXED_URLS = os.getenv("REFRESH_ONLY_FIXED_URLS", "false").lower() == "true"

# Guards swaps of the global RAG objects; reentrant so nested reads are safe.
state_lock = threading.RLock()
# Signals the daily-refresh thread to exit promptly on shutdown.
stop_refresh_event = threading.Event()
|
| 32 |
+
|
| 33 |
+
# ---------- Logging (Model Flow) ----------
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper()

logger = logging.getLogger("model_flow")
logger.setLevel(LOG_LEVEL)

# Attach a stream handler exactly once (module may be re-imported by Gradio).
if not logger.handlers:
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(LOG_LEVEL)
    stream_handler.setFormatter(
        logging.Formatter("%(asctime)s | %(levelname)s | %(name)s | %(message)s")
    )
    logger.addHandler(stream_handler)


def log_event(event: str, **payload):
    """Log *event* plus a JSON rendering of *payload* at INFO level.

    Values json.dumps cannot serialize are stringified, so logging never
    raises inside a request path.
    """
    def _coerce(value):
        try:
            json.dumps(value)  # probe serializability only
            return value
        except TypeError:
            return str(value)

    safe = {key: _coerce(value) for key, value in payload.items()}
    logger.info("%s | %s", event, json.dumps(safe, ensure_ascii=False))
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
# ---------- Global state ----------
# Populated by init_rag() and swapped atomically by refresh_rag_once();
# request handlers snapshot these under state_lock before use.
vectorstore = None
rag_chain = None
retriever = None
system_prompt = None
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def init_rag():
    """Load (or build, if missing) the FAISS index and assemble the RAG chain.

    Runs once at import time. Sets the module-level vectorstore, rag_chain,
    retriever and system_prompt globals.
    """
    global vectorstore, rag_chain, retriever, system_prompt

    # Build the index from a fresh crawl only when no index exists on disk.
    # NOTE(review): an earlier comment claimed crawling was "hard disabled",
    # but build_and_save_index() is in fact invoked on a cold start.
    index_path = Path(VECTORSTORE_PATH) / "index.faiss"
    if not index_path.exists():
        n_chunks, _ = build_and_save_index()
        log_event("refresh.index_built", mode="crawl", chunks=n_chunks)

    vectorstore = load_vectorstore()

    # k/max_docs mirror the values used by refresh_rag_once().
    rag_chain, retriever, system_prompt = build_rag_chain(
        vectorstore,
        k=5,
        max_docs=2
    )
    log_event("init_rag.ready", vectorstore_path=VECTORSTORE_PATH)


init_rag()
|
| 87 |
+
|
| 88 |
+
def refresh_rag_once():
    """Refetch website docs and rebuild the index + chain.

    Best-effort: any failure is logged and the previous in-memory
    chain/vectorstore keep serving requests. Never raises.
    """
    global vectorstore, rag_chain, retriever, system_prompt

    log_event("refresh.start", only_fixed_urls=REFRESH_ONLY_FIXED_URLS)

    try:
        # build_and_save_index / load_vectorstore / build_rag_chain are already
        # imported at module level; the previous local re-imports were redundant.
        n_chunks, _ = build_and_save_index()
        log_event("refresh.index_built", mode="crawl", chunks=n_chunks)

        # Reload from disk (ensures consistent serialization)
        vs = load_vectorstore()

        # Build a fresh chain against the reloaded store; k/max_docs mirror init_rag().
        new_chain, new_retriever, new_system_prompt = build_rag_chain(
            vs,
            k=5,
            max_docs=2,
        )

        # Atomic swap so concurrent requests see a consistent object set.
        with state_lock:
            vectorstore = vs
            rag_chain = new_chain
            retriever = new_retriever
            system_prompt = new_system_prompt

        log_event("refresh.done", status="ok")

    except Exception as e:
        # Deliberate broad catch: a failed refresh must never take down the app.
        log_event("refresh.error", error=str(e))
|
| 125 |
+
|
| 126 |
+
def _seconds_until_next_run(hour: int, minute: int) -> int:
|
| 127 |
+
# compute sleep until next local time (hour:minute)
|
| 128 |
+
now = time.localtime()
|
| 129 |
+
target = time.mktime((
|
| 130 |
+
now.tm_year, now.tm_mon, now.tm_mday,
|
| 131 |
+
hour, minute, 0,
|
| 132 |
+
now.tm_wday, now.tm_yday, now.tm_isdst
|
| 133 |
+
))
|
| 134 |
+
now_ts = time.time()
|
| 135 |
+
if target <= now_ts:
|
| 136 |
+
target += 24 * 60 * 60
|
| 137 |
+
return int(target - now_ts)
|
| 138 |
+
|
| 139 |
+
def _daily_refresh_loop():
    """Background loop: sleep until the configured time, then refresh.

    Sleeps in short increments so stop_refresh_event can interrupt the
    wait promptly during shutdown.
    """
    # Small startup delay so application initialization settles first.
    time.sleep(3)

    while not stop_refresh_event.is_set():
        remaining = _seconds_until_next_run(REFRESH_AT_HOUR, REFRESH_AT_MINUTE)
        log_event(
            "refresh.sleep",
            seconds=remaining,
            at_hour=REFRESH_AT_HOUR,
            at_minute=REFRESH_AT_MINUTE,
        )

        # Chunked sleep: wake every few seconds to honor shutdown quickly.
        while remaining > 0:
            if stop_refresh_event.is_set():
                return
            time.sleep(min(5, remaining))
            remaining -= 5

        if stop_refresh_event.is_set():
            return

        refresh_rag_once()
|
| 158 |
+
|
| 159 |
+
def start_refresh_thread():
    """Start the daily refresh loop in a daemon thread (no-op when disabled)."""
    if not REFRESH_ENABLED:
        log_event("refresh.disabled")
        return
    # Daemon thread: will not block interpreter shutdown.
    t = threading.Thread(target=_daily_refresh_loop, daemon=True)
    t.start()
    log_event("refresh.thread_started", daily_at=f"{REFRESH_AT_HOUR:02d}:{REFRESH_AT_MINUTE:02d}")

# Ask the refresh loop to stop when the process exits.
atexit.register(lambda: stop_refresh_event.set())
start_refresh_thread()
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
# ---------- Helpers ----------
|
| 173 |
+
def _history_to_text(history):
|
| 174 |
+
"""Convert Gradio history ([[user, bot], ...]) to a readable text snippet."""
|
| 175 |
+
if not history:
|
| 176 |
+
return ""
|
| 177 |
+
lines = []
|
| 178 |
+
for turn in history:
|
| 179 |
+
if not turn or len(turn) < 2:
|
| 180 |
+
continue
|
| 181 |
+
user_msg, assistant_msg = turn[0], turn[1]
|
| 182 |
+
lines.append(f"User: {user_msg}")
|
| 183 |
+
lines.append(f"Assistant: {assistant_msg}")
|
| 184 |
+
return "\n".join(lines)
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def _docs_to_loggable(docs, max_chars=220):
|
| 188 |
+
"""Return lightweight doc info for logs (no full dump)."""
|
| 189 |
+
out = []
|
| 190 |
+
for d in (docs or []):
|
| 191 |
+
src = (d.metadata or {}).get("source", "unknown")
|
| 192 |
+
txt = (d.page_content or "").strip().replace("\n", " ")
|
| 193 |
+
out.append({
|
| 194 |
+
"source": src,
|
| 195 |
+
"preview": (txt[:max_chars] + ("..." if len(txt) > max_chars else "")),
|
| 196 |
+
"metadata": (d.metadata or {}),
|
| 197 |
+
})
|
| 198 |
+
return out
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
def generate_answer(message, history):
    """
    Core logic:
      - rewrite question with history (best-effort)
      - run RAG (required; if this fails, return a fallback reply)
      - evaluate (best-effort; if this fails, skip retry)
      - optionally retry once based on evaluator signal
    Returns ONLY the final answer string (no sources/context/evaluator in UI).
    """
    # Single source of truth for the retry threshold (the old code compared
    # against 0.70 but logged threshold=0.90).
    RETRY_SCORE_THRESHOLD = 0.70

    log_event("request.start", user_message=message)

    # ---------- 1. Rewrite with history (best-effort) ----------
    try:
        standalone_question = rewrite_question_with_history(history, message)
    except Exception as e:
        log_event("rewrite.error", error=str(e))
        standalone_question = message  # fallback: use original message

    history_text = _history_to_text(history)

    log_event(
        "rewrite.done",
        standalone_question=standalone_question,
        history_chars=len(history_text),
    )

    # ---------- 2. Run RAG (if this fails, we bail with generic error text) ----------
    # Snapshot the globals under the lock so a concurrent refresh cannot swap
    # the chain/prompt mid-request; both attempts below use this snapshot.
    with state_lock:
        local_rag_chain = rag_chain
        local_system_prompt = system_prompt

    try:
        rag_res = local_rag_chain.invoke({
            "input": standalone_question,
            "chat_history": history_text,
        })
    except Exception as e:
        log_event("rag.error", error=str(e))
        fallback = (
            "I'm having trouble accessing my knowledge base right now. "
            "Please try again in a moment."
        )
        log_event(
            "request.end",
            final_answer_preview=fallback[:400] + ("..." if len(fallback) > 400 else "")
        )
        return fallback

    answer_1 = rag_res.get("answer", "") or ""
    ctx_docs_1 = rag_res.get("context", []) or []

    log_event(
        "rag.done",
        answer_preview=answer_1[:400] + ("..." if len(answer_1) > 400 else ""),
        retrieved_count=len(ctx_docs_1),
        retrieved_docs=_docs_to_loggable(ctx_docs_1),
    )

    # ---------- 3. Evaluate (best-effort; never crash on judge failure) ----------
    eval_res_1 = None
    try:
        eval_res_1 = evaluate_answer(
            system_prompt=local_system_prompt,
            question=message,
            context_docs=ctx_docs_1,
            answer=answer_1,
        )
        log_event(
            "eval.done",
            overall_score=float(eval_res_1.overall_score),
            grounded=float(eval_res_1.grounded_in_context_score),
            hallucination=bool(eval_res_1.hallucination_detected),
            feedback=str(eval_res_1.feedback),
        )
    except Exception as e:
        log_event("eval.error", error=str(e))
        # We just skip retry logic; answer_1 is still valid.

    final_answer = answer_1

    # ---------- 4. Single retry (only if evaluator succeeded & says to retry) ----------
    try:
        # BUGFIX: should_retry previously defaulted to True when the attribute
        # was absent, forcing a retry on every successful evaluation.
        needs_retry = (
            eval_res_1 is not None
            and (
                eval_res_1.overall_score < RETRY_SCORE_THRESHOLD
                or getattr(eval_res_1, "should_retry", False)
            )
        )
        if needs_retry:
            revision_prompt = (
                f"{standalone_question}\n\n"
                f"You previously answered this:\n{answer_1}\n\n"
                "An evaluator found issues. Revise your answer to address the feedback below.\n"
                "Rules:\n"
                "- Use ONLY the provided context.\n"
                "- If the context does not support the claim, say \"I don't know\".\n"
                "- Be specific and grounded.\n\n"
                f"Evaluator feedback:\n{eval_res_1.feedback}\n"
            )

            log_event(
                "retry.triggered",
                reason="eval_score_below_threshold",
                threshold=RETRY_SCORE_THRESHOLD,
            )

            # RAG retry — if this fails, we keep the original answer_1.
            try:
                # BUGFIX: use the locked snapshot, not the live global, so the
                # retry runs against the same chain as the first attempt.
                rag_res_2 = local_rag_chain.invoke({
                    "input": revision_prompt,
                    "chat_history": history_text,
                })
                answer_2 = rag_res_2.get("answer", "") or ""
                ctx_docs_2 = rag_res_2.get("context", []) or []

                log_event(
                    "rag.retry_done",
                    answer_preview=answer_2[:400] + ("..." if len(answer_2) > 400 else ""),
                    retrieved_count=len(ctx_docs_2),
                    retrieved_docs=_docs_to_loggable(ctx_docs_2),
                )

                # Re-evaluate the revised answer (best-effort; ignore errors).
                eval_res_2 = None
                try:
                    eval_res_2 = evaluate_answer(
                        system_prompt=local_system_prompt,
                        question=message,
                        context_docs=ctx_docs_2,
                        answer=answer_2,
                    )
                    log_event(
                        "eval.retry_done",
                        overall_score=float(eval_res_2.overall_score),
                        grounded=float(eval_res_2.grounded_in_context_score),
                        hallucination=bool(eval_res_2.hallucination_detected),
                        feedback=str(eval_res_2.feedback),
                    )
                except Exception as e_eval2:
                    log_event("eval.retry_error", error=str(e_eval2))

                # BUGFIX: eval_res_2 could previously be unbound here when the
                # re-evaluation failed, raising NameError that was misreported
                # as "rag.retry_error" and silently discarded answer_2.
                # Keep whichever answer scored higher; without a second score,
                # trust the revised answer.
                if eval_res_2 is not None and eval_res_1.overall_score > eval_res_2.overall_score:
                    final_answer = answer_1
                else:
                    final_answer = answer_2

            except Exception as e_rag2:
                # Retry RAG failed; log and fall back to first answer.
                log_event("rag.retry_error", error=str(e_rag2))
                final_answer = answer_1

    except Exception as e_retry_block:
        # Any unexpected error in retry logic should not crash the whole request.
        log_event("retry.block_error", error=str(e_retry_block))
        final_answer = answer_1

    # ---------- 5. Final logging & return ----------
    log_event(
        "request.end",
        final_answer_preview=final_answer[:400] + ("..." if len(final_answer) > 400 else "")
    )

    return final_answer
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
def respond(message, history):
    """Gradio callback wrapper; never lets an exception escape to the UI.

    Returns ("", updated_history) matching the (textbox, chatbot) outputs:
    the textbox is cleared and the new turn is appended to the chat.
    """
    if not message:
        # Nothing to answer; leave the conversation untouched.
        return "", history

    try:
        answer = generate_answer(message, history)
    except Exception as e:
        log_event("respond.fatal_error", error=str(e))
        answer = (
            "Something went wrong on my side while trying to answer. "
            "Please try again in a moment."
        )

    return "", history + [[message, answer]]
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
# ---------- Gradio UI ----------
with gr.Blocks(title="Ask Ritam (Career QA Bot)") as demo:
    gr.Markdown(
        "# Ask Ritam\n"
        "A RAG-powered career assistant over my resume, website, and projects.\n"
        "Ask anything about my experience, projects, research, or education."
    )

    with gr.Row():
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(label="Conversation", height=500)

            with gr.Row():
                msg = gr.Textbox(
                    placeholder="Ask anything about my career, projects, or research...",
                    lines=2,
                    scale=4,
                    show_label=False,
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)

            clear_btn = gr.Button("Clear chat")

    # Both the Send button and Enter-submit route through respond(),
    # which returns (cleared_textbox, updated_history).
    send_btn.click(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
    msg.submit(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])

    # Reset the conversation and clear the input box.
    clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])

demo.launch()
|
exp.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rag_core/FusedRetreiver.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, List, Set, Tuple
|
| 2 |
+
from langchain.schema import BaseRetriever, Document
|
| 3 |
+
|
| 4 |
+
class FusedRetriever(BaseRetriever):
    """
    Pydantic-compatible fused retriever that wraps a header-aware PrefixRetriever
    and a vector retriever. Declares fields so Pydantic validation succeeds.

    Candidates from both retrievers are merged (prefix-first by default) and
    deduplicated on (source, first 200 chars of content), capped at ``k``.
    """
    prefix_retriever: Any
    vector_retriever: Any
    k: int = 4
    prefix_first: bool = True

    class Config:
        # allows storing arbitrary Python objects in model fields
        arbitrary_types_allowed = True

    def _fuse(self, prefix_docs: List[Document], vector_docs: List[Document]) -> List[Document]:
        """Merge both candidate lists with dedupe, honoring prefix_first and k."""
        seen: Set[Tuple[str, str]] = set()
        out: List[Document] = []

        def add_docs(docs: List[Document]) -> bool:
            # Returns True once `k` docs have been collected.
            for d in docs:
                key = (d.metadata.get("source"), d.page_content[:200])
                if key in seen:
                    continue
                seen.add(key)
                out.append(d)
                if len(out) >= self.k:
                    return True
            return False

        first, second = (
            (prefix_docs, vector_docs) if self.prefix_first else (vector_docs, prefix_docs)
        )
        if not add_docs(first):
            add_docs(second)
        return out[: self.k]

    @staticmethod
    def _call_sync(retriever: Any, query: str) -> List[Document]:
        """Best-effort synchronous retrieval across retriever API variants."""
        if hasattr(retriever, "get_relevant_documents"):
            return retriever.get_relevant_documents(query)
        if hasattr(retriever, "_get_relevant_documents"):
            return retriever._get_relevant_documents(query)
        if hasattr(retriever, "retrieve"):
            return retriever.retrieve(query)
        return []

    def get_relevant_documents(self, query: str) -> List[Document]:
        # 1) prefix candidates (errors propagate, as before)
        prefix_docs = self._call_sync(self.prefix_retriever, query)

        # 2) vector candidates (best-effort, swallow failures as before).
        # BUGFIX: the old fallback called get_relevant_documents_async without
        # awaiting it, assigning a coroutine object instead of documents.
        vector_docs: List[Document] = []
        try:
            vector_docs = self._call_sync(self.vector_retriever, query)
        except Exception:
            vector_docs = []

        return self._fuse(prefix_docs, vector_docs)

    async def aget_relevant_documents(self, query: str) -> List[Document]:
        # Prefer async variants when available, else fall back to sync calls.
        if hasattr(self.prefix_retriever, "aget_relevant_documents"):
            prefix_docs = await self.prefix_retriever.aget_relevant_documents(query)
        else:
            prefix_docs = self._call_sync(self.prefix_retriever, query)

        if hasattr(self.vector_retriever, "aget_relevant_documents"):
            vector_docs = await self.vector_retriever.aget_relevant_documents(query)
        else:
            vector_docs = self._call_sync(self.vector_retriever, query)

        # Shared fuse logic (previously duplicated inline in both methods).
        return self._fuse(prefix_docs, vector_docs)
|
rag_core/PrefixRetreiver.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Optional
|
| 2 |
+
from langchain_core.documents import Document
|
| 3 |
+
from langchain_core.retrievers import BaseRetriever
|
| 4 |
+
from pydantic import Field
|
| 5 |
+
from langchain.schema import Document
|
| 6 |
+
import re
|
| 7 |
+
|
| 8 |
+
def _norm(text: str) -> str:
|
| 9 |
+
# lower, remove punctuation-ish chars, compact whitespace
|
| 10 |
+
text = text or ""
|
| 11 |
+
text = text.lower()
|
| 12 |
+
text = re.sub(r"[^a-z0-9\s]", " ", text)
|
| 13 |
+
text = re.sub(r"\s+", " ", text).strip()
|
| 14 |
+
return text
|
| 15 |
+
|
| 16 |
+
class PrefixRetriever:
    """Match queries against section headers / leading lines of documents.

    Plain class (not a LangChain BaseRetriever subclass); exposes the
    private-style ``_get_relevant_documents`` hook that FusedRetriever
    duck-types against.
    """

    def __init__(self, docs: List[Document], k: int = 3, max_lines: int = 8):
        self.docs = docs
        # k: maximum number of documents to return.
        self.k = k
        # max_lines: how many leading non-blank lines participate in matching.
        self.max_lines = max_lines

    def _head(self, content: str) -> str:
        """Return up to max_lines non-blank leading lines for prefix matching."""
        kept = [ln for ln in content.splitlines() if ln.strip()]
        return "\n".join(kept[: self.max_lines])

    def _get_relevant_documents(self, query: str) -> List[Document]:
        """Return up to k docs whose header/head matches the query.

        A doc matches when the normalized query is a substring of the
        normalized head (strongest signal), or when every query token
        occurs somewhere in the head.
        """
        norm_query = _norm(query)
        query_tokens = norm_query.split()
        matches: List[Document] = []

        for doc in self.docs:
            # Prefer explicit header metadata; fall back to the content head.
            header = doc.metadata.get("section_header") or doc.metadata.get("section_label") or ""
            head = _norm(header + "\n" + self._head(doc.page_content))

            substring_hit = bool(norm_query) and norm_query in head
            token_hit = bool(query_tokens) and all(tok in head for tok in query_tokens)

            if substring_hit or token_hit:
                matches.append(doc)
                if len(matches) >= self.k:
                    break

        return matches
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# --- Helper to recover Documents from a FAISS vectorstore
|
| 56 |
+
def faiss_all_docs(faiss_store):
    """Return every Document stored inside a FAISS vectorstore's docstore."""
    docstore = faiss_store.docstore
    return [
        docstore.search(doc_id)
        for doc_id in faiss_store.index_to_docstore_id.values()
    ]
|
| 59 |
+
|
rag_core/config.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path

# Project root: this file lives in <root>/rag_core/, so parent.parent is <root>.
BASE_DIR = Path(__file__).resolve().parent.parent
DATA_DIR = BASE_DIR / "data"
VECTORSTORE_DIR = DATA_DIR / "vectorstore"

# Created eagerly at import time so downstream save_local() never fails.
VECTORSTORE_DIR.mkdir(parents=True, exist_ok=True)

# FAISS index location as a string, the form FAISS.save_local/load_local expect.
VECTORSTORE_PATH = str(VECTORSTORE_DIR / "career_faiss_index")

# Groq model used to generate user-facing answers.
GROQ_CHAT_MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"
# Smaller/faster Groq model used by the LLM judge.
GROQ_EVAL_MODEL = "llama-3.1-8b-instant"
# sentence-transformers model used for local embeddings.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
rag_core/crawler.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# rag_core/crawler.py
|
| 2 |
+
from urllib.parse import urljoin, urlparse
|
| 3 |
+
from collections import deque
|
| 4 |
+
import requests
|
| 5 |
+
from bs4 import BeautifulSoup
|
| 6 |
+
|
| 7 |
+
def crawl_subpages(root_url: str, max_pages: int = 30, max_depth: int = 1):
    """
    Breadth-first crawl of subpages under the same path as root_url.

    E.g., root_url = https://site.com/projects/
    will collect https://site.com/projects/* links, staying on the same
    domain and under the same path prefix.

    Args:
        root_url: page to start from; its path defines the crawl scope.
        max_pages: hard cap on the number of URLs collected (incl. root).
        max_depth: maximum link depth followed from the root page.

    Returns:
        list[str]: unique collected URLs (order unspecified).
    """
    parsed_root = urlparse(root_url)
    base_domain = parsed_root.netloc
    base_path = parsed_root.path.rstrip("/")  # e.g. "/projects"

    visited = set()
    to_visit = deque([(root_url, 0)])
    collected = {root_url}

    while to_visit and len(collected) < max_pages:
        url, depth = to_visit.popleft()
        if url in visited or depth > max_depth:
            continue
        visited.add(url)

        try:
            resp = requests.get(url, timeout=10)
            if not resp.ok:
                continue
            soup = BeautifulSoup(resp.text, "html.parser")
        except Exception:
            # Network/parse failure: skip this page, keep crawling others.
            continue

        for a in soup.find_all("a", href=True):
            # Enforce max_pages inside the link loop too; previously a single
            # link-rich page could push `collected` past the cap.
            if len(collected) >= max_pages:
                break

            full_url = urljoin(url, a["href"].strip())
            parsed = urlparse(full_url)

            # Stay in same domain
            if parsed.netloc != base_domain:
                continue

            # Stay under the same base path
            if not parsed.path.startswith(base_path):
                continue

            if full_url not in collected:
                collected.add(full_url)
                to_visit.append((full_url, depth + 1))

    return list(collected)
|
rag_core/embeddings_model.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# rag_core/embeddings_model.py
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 4 |
+
from .config import EMBEDDING_MODEL
|
| 5 |
+
|
| 6 |
+
load_dotenv() # loads HUGGINGFACEHUB_API_TOKEN if present
|
| 7 |
+
|
| 8 |
+
def get_embeddings():
    """
    Build the local HuggingFace embedding model (all-MiniLM).

    Runs locally via sentence-transformers; the model weights are
    downloaded from Hugging Face (using your HF token if needed).
    """
    model_kwargs = {"device": "cpu"}  # change to "cuda" if you have a GPU
    encode_kwargs = {"normalize_embeddings": True}  # optional but often helpful
    return HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )
|
rag_core/evaluator.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# rag_core/evaluator.py
|
| 2 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 3 |
+
|
| 4 |
+
from .models_groq import get_judge_llm
|
| 5 |
+
from .evaluator_schema import EvalResult
|
| 6 |
+
|
| 7 |
+
# Base judge LLM (deterministic Groq model).
_base_judge_llm = get_judge_llm()

# Wrap LLM with structured output (EvalResult) so responses parse into the schema.
_judge_llm_structured = _base_judge_llm.with_structured_output(EvalResult)

# Prompt template for the judge.
# Fix: the original check list numbered two different items "3."; renumbered
# and reworded item 3 into a proper question.
JUDGE_PROMPT = ChatPromptTemplate.from_messages(
    [
        ("system", """
You are an impartial evaluator for a RAG-based assistant about Ritam's career.

You get:
- System prompt,
- User question,
- Retrieved context,
- Assistant answer.

Check:
1. Does the answer follow the system prompt? (only Ritam's career / projects / research.)
2. Does the answer actually respond to the user's question?
3. Does the answer contain too much extra information?
4. How well is the answer grounded in the context? Penalize hallucinations.

Return a strict JSON object matching the EvalResult schema.
"""),
        ("human", """
System prompt:
----------------
{system_prompt}

User question:
----------------
{question}

Retrieved context:
----------------
{context}

Assistant answer:
----------------
{answer}
"""),
    ]
)
|
| 52 |
+
|
| 53 |
+
def evaluate_answer(system_prompt: str, question: str, context_docs, answer: str) -> EvalResult:
    """
    Run the structured-output LLM judge over one answer.

    If the judge fails for any reason, return a permissive EvalResult
    (high scores, should_retry=False) so the main answer path is never
    retried or crashed because of evaluator problems.
    """
    formatted_docs = []
    for idx, doc in enumerate(context_docs):
        src = doc.metadata.get("source", "unknown")
        formatted_docs.append(f"[DOC {idx}] (source={src})\n{doc.page_content}")
    ctx_text = "\n\n".join(formatted_docs)

    judge_chain = JUDGE_PROMPT | _judge_llm_structured

    payload = {
        "system_prompt": system_prompt,
        "question": question,
        "context": ctx_text,
        "answer": answer,
    }

    try:
        result: EvalResult = judge_chain.invoke(payload)
        return result
    except Exception as e:
        # Judge died / timed out / returned a malformed response:
        # report optimistic defaults so threshold logic never triggers a retry.
        return EvalResult(
            follows_system_prompt=True,
            answers_user_question=True,
            grounded_in_context_score=1.0,
            hallucination_detected=False,
            overall_score=1.0,
            should_retry=False,
            feedback=f"[Evaluator failure] Judge could not evaluate this answer: {e}",
        )
|
rag_core/evaluator_schema.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# rag_core/evaluator_schema.py
from langchain_core.pydantic_v1 import BaseModel, Field

class EvalResult(BaseModel):
    """Structured verdict produced by the LLM judge for a single answer."""
    follows_system_prompt: bool = Field(...)  # answer stays within the system prompt's scope
    answers_user_question: bool = Field(...)  # answer actually addresses the question asked
    grounded_in_context_score: float = Field(ge=0.0, le=1.0)  # 0..1 grounding in retrieved context
    hallucination_detected: bool = Field(...)  # judge spotted claims unsupported by the context
    overall_score: float = Field(ge=0.0, le=1.0)  # 0..1 aggregate quality score
    should_retry: bool = Field(...)  # caller should regenerate the answer
    feedback: str = Field(...)  # free-text explanation from the judge
|
rag_core/index_builder.py
ADDED
|
@@ -0,0 +1,300 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# rag_core/index_builder.py
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from typing import List
|
| 5 |
+
|
| 6 |
+
import requests
|
| 7 |
+
from langchain_community.document_loaders import OnlinePDFLoader
|
| 8 |
+
from langchain_text_splitters import HTMLSectionSplitter, RecursiveCharacterTextSplitter
|
| 9 |
+
from langchain_community.vectorstores import FAISS
|
| 10 |
+
|
| 11 |
+
from .embeddings_model import get_embeddings
|
| 12 |
+
from .config import VECTORSTORE_PATH
|
| 13 |
+
from .sources import CRAWL_ROOTS, FIXED_URLS
|
| 14 |
+
from .crawler import crawl_subpages
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _is_gdrive_file(url: str) -> bool:
|
| 18 |
+
"""Return True if this looks like a Google Drive file view URL."""
|
| 19 |
+
return "drive.google.com" in url and "/file/d/" in url
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _gdrive_view_to_download(url: str) -> str:
|
| 23 |
+
"""
|
| 24 |
+
Convert a Google Drive view URL to a direct download URL.
|
| 25 |
+
|
| 26 |
+
Example:
|
| 27 |
+
https://drive.google.com/file/d/<ID>/view
|
| 28 |
+
-> https://drive.google.com/uc?export=download&id=<ID>
|
| 29 |
+
"""
|
| 30 |
+
m = re.search(r"/file/d/([^/]+)/", url)
|
| 31 |
+
if not m:
|
| 32 |
+
return url
|
| 33 |
+
file_id = m.group(1)
|
| 34 |
+
return f"https://drive.google.com/uc?export=download&id={file_id}"
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def _infer_section_label_from_url(url: str) -> str:
|
| 38 |
+
"""
|
| 39 |
+
Heuristic: guess a section label from the URL path.
|
| 40 |
+
e.g.
|
| 41 |
+
https://your-site.com/about -> 'about'
|
| 42 |
+
https://your-site.com/experience/juniper -> 'experience/juniper'
|
| 43 |
+
"""
|
| 44 |
+
try:
|
| 45 |
+
path = url.split("://", 1)[-1].split("/", 1)[-1]
|
| 46 |
+
except Exception:
|
| 47 |
+
return url
|
| 48 |
+
path = path.strip("/")
|
| 49 |
+
if not path:
|
| 50 |
+
return "root"
|
| 51 |
+
return path
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def _normalize_label(text: str, max_len: int = 80) -> str:
|
| 55 |
+
"""
|
| 56 |
+
Create a short normalized label from a header text.
|
| 57 |
+
- lowercases, removes newlines, collapses whitespace
|
| 58 |
+
- trims to max_len and replaces spaces with '-' for compact labels
|
| 59 |
+
"""
|
| 60 |
+
if not text:
|
| 61 |
+
return ""
|
| 62 |
+
lab = " ".join(text.split()) # collapse whitespace/newlines
|
| 63 |
+
lab = lab.strip().lower()
|
| 64 |
+
# shorten if too long
|
| 65 |
+
if len(lab) > max_len:
|
| 66 |
+
lab = lab[: max_len - 3].rstrip() + "..."
|
| 67 |
+
# use a compact label form (slug-like) but keep readability
|
| 68 |
+
slug = lab.replace(" ", "-")
|
| 69 |
+
# remove characters that would be awkward in metadata keys
|
| 70 |
+
slug = re.sub(r"[^a-z0-9_\-\.]+", "", slug)
|
| 71 |
+
return slug
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def load_web_docs(urls: List[str]):
    """
    Load documents from a list of URLs.

    - HTML URLs:
        * Fetch raw HTML with `requests`
        * Split into sections with HTMLSectionSplitter (h1/h2)
        * Infer section labels from the actual section header lines (preferred)
    - PDF URLs (including Google Drive file links):
        * Load with OnlinePDFLoader (one doc per page)

    Returns:
        list: loaded documents; HTML sections carry "source",
        "section_header" and "section_type" metadata, PDF pages carry
        "source", "section_label" and "section_type".
    """
    html_urls: List[str] = []
    pdf_urls: List[str] = []

    # Partition the input URLs into PDF-like and HTML-like.
    for url in urls:
        u = url.strip()
        if not u:
            continue

        # Google Drive file links (view) -> direct download PDF
        if _is_gdrive_file(u):
            pdf_urls.append(_gdrive_view_to_download(u))
        # Direct PDF URLs
        elif u.lower().endswith(".pdf"):
            pdf_urls.append(u)
        # Everything else is treated as HTML
        else:
            html_urls.append(u)

    docs = []

    # --- HTML via HTMLSectionSplitter on raw HTML string ---
    if html_urls:
        print(f"[index_builder] Loading HTML for {len(html_urls)} URLs with HTMLSectionSplitter")

        headers_to_split_on = [
            ("h1", "Header 1"),
            ("h2", "Header 2"),
        ]
        html_splitter = HTMLSectionSplitter(headers_to_split_on=headers_to_split_on)

        for url in html_urls:
            try:
                print(f"[index_builder] Fetching HTML from {url}")
                resp = requests.get(url, timeout=15)
                resp.raise_for_status()
                html_string = resp.text

                # HTMLSectionSplitter returns a list[Document] where each Document starts with the header line
                html_header_splits = html_splitter.split_text(html_string)

                # NOTE(review): currently unused — was meant for per-page header
                # de-duplication (see removed commented-out logic below).
                seen_labels_in_page = set()

                print(f"[index_builder] {url}: {len(html_header_splits)} HTML sections")

                for d in html_header_splits:
                    # source URL
                    d.metadata["source"] = url
                    # extract the first meaningful non-empty line as the header
                    first_lines = [ln.strip() for ln in d.page_content.splitlines() if ln.strip()]
                    header_line = first_lines[0] if first_lines else ""

                    # Keep section_header as a human-readable short header
                    # (truncated to 300 chars if necessary).
                    human_header = header_line
                    if len(human_header) > 300:
                        human_header = human_header[:300] + "..."

                    # "More Project" is a generic site-nav heading; use the URL
                    # endpoint as a more specific fallback header in that case.
                    if human_header != "More Project":
                        d.metadata["section_header"] = human_header
                    else:
                        # Extract endpoint from URL as fallback header
                        try:
                            # e.g. https://site.com/projects/my-cool-project -> "my-cool-project"
                            endpoint = url.rstrip("/").split("/")[-1]
                            # Make it human-readable
                            endpoint = endpoint.replace("-", " ").replace("_", " ").title()
                            d.metadata["section_header"] = endpoint
                            d.metadata["section_label"] = endpoint
                        except Exception:
                            # absolute fallback
                            d.metadata["section_header"] = human_header
                            d.metadata["section_label"] = human_header

                    d.metadata["section_type"] = "remote_html"
                # append docs
                docs.extend(html_header_splits)
            except Exception as e:
                print(f"[index_builder] Error processing HTML from {url}: {e}")

    # --- PDF files (including Drive) via OnlinePDFLoader ---
    for pdf_url in pdf_urls:
        print(f"[index_builder] Loading PDF from {pdf_url}")
        try:
            pdf_loader = OnlinePDFLoader(pdf_url)
            pdf_docs = pdf_loader.load()
            # One label per PDF, derived from its URL path.
            section_label = _infer_section_label_from_url(pdf_url)
            for d in pdf_docs:
                d.metadata["source"] = pdf_url
                d.metadata["section_label"] = section_label
                d.metadata["section_type"] = "remote_pdf"
            docs.extend(pdf_docs)
            print(f"[index_builder] {pdf_url}: {len(pdf_docs)} PDF pages")
        except Exception as e:
            print(f"[index_builder] Failed to load PDF from {pdf_url}: {e}")

    return docs
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def split_docs(docs, chunk_size: int = 1000, chunk_overlap: int = 200):
    """
    Turn loaded documents into embedding-ready chunks.

    HTML sections (section_type == "remote_html") are already section-level
    chunks and pass through untouched; everything else (PDF pages, etc.)
    goes through RecursiveCharacterTextSplitter.
    """
    html_docs = []
    other_docs = []
    for doc in docs:
        if doc.metadata.get("section_type") == "remote_html":
            html_docs.append(doc)
        else:
            other_docs.append(doc)

    # HTML sections are kept as-is.
    chunks: List = list(html_docs)

    # Split the remaining docs (PDFs, etc.) into overlapping text chunks.
    if other_docs:
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            add_start_index=True,
        )
        for chunk in splitter.split_documents(other_docs):
            chunk.metadata.setdefault("section_label", chunk.metadata.get("source", "unknown"))
            chunk.metadata.setdefault(
                "section_type",
                chunk.metadata.get("section_type", "remote_pdf"),
            )
            chunks.append(chunk)

    print(
        f"[index_builder] split_docs: {len(html_docs)} HTML section chunks, "
        f"{len(chunks) - len(html_docs)} non-HTML chunks"
    )
    return chunks
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
def build_and_save_index():
    """
    Crawl URLs, load docs, split into chunks, build FAISS index, and save it.

    Returns:
        (int, list): number of chunks indexed, and the chunks themselves.
        Returns (0, []) if nothing could be loaded or chunked.
    """
    # 1. Crawl project roots (and any other CRAWL_ROOTS) to get sub-URLs.
    #    Each root failure is logged and skipped, never fatal.
    crawl_urls: List[str] = []
    for root in CRAWL_ROOTS:
        try:
            urls = crawl_subpages(root)
            print(f"[index_builder] Crawled {len(urls)} URLs under {root}")
            crawl_urls.extend(urls)
        except Exception as e:
            print(f"[index_builder] Failed to crawl {root}: {e}")

    # 2. Combine fixed URLs (resume, about, scholar, GitHub, etc.) + crawled URLs.
    #    set() de-duplicates; note this also makes the ordering non-deterministic.
    all_urls = list(set(FIXED_URLS + crawl_urls))

    print(f"[index_builder] Total URLs to load: {len(all_urls)}")
    for u in all_urls:
        print(f"  - {u}")

    docs = load_web_docs(all_urls)
    print(f"[index_builder] Loaded {len(docs)} raw documents")

    if not docs:
        print("[index_builder] WARNING: No documents loaded; aborting index build.")
        return 0, []

    # 3. Split into chunks (HTML via HTMLSectionSplitter, PDFs via recursive splitter).
    chunks = split_docs(docs)
    print(f"[index_builder] Split into {len(chunks)} chunks")

    if not chunks:
        print("[index_builder] WARNING: No chunks produced; aborting FAISS build.")
        return 0, []

    # 4. Build vector store with HF embeddings (e.g., all-MiniLM).
    embeddings = get_embeddings()
    vs = FAISS.from_documents(chunks, embeddings)

    # 5. Save to disk (path comes from rag_core.config).
    vs.save_local(VECTORSTORE_PATH)
    print(
        f"[index_builder] Saved FAISS index to {VECTORSTORE_PATH} "
        f"(chunks={len(chunks)})"
    )

    return len(chunks), chunks
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
def load_vectorstore():
    """
    Reload the persisted FAISS index from disk, using the same embedding
    model that built it.

    allow_dangerous_deserialization is required because FAISS stores its
    docstore via pickle; only load indexes this app created itself.
    """
    return FAISS.load_local(
        VECTORSTORE_PATH,
        get_embeddings(),
        allow_dangerous_deserialization=True,
    )
|
rag_core/models_groq.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# rag_core/models_groq.py
from dotenv import load_dotenv
# NOTE(review): Callbacks/BaseCache appear unused but are presumably imported
# so ChatGroq.model_rebuild() below can resolve its forward references — confirm
# before removing.
from langchain_core.callbacks import Callbacks
from langchain_core.caches import BaseCache
from langchain_groq import ChatGroq
from .config import GROQ_CHAT_MODEL, GROQ_EVAL_MODEL

# Load GROQ_API_KEY (and friends) from .env before any model is constructed.
load_dotenv()
ChatGroq.model_rebuild()

def get_answer_llm():
    """Groq chat model used to generate user-facing answers (slightly creative)."""
    return ChatGroq(model=GROQ_CHAT_MODEL, temperature=0.1)

def get_judge_llm():
    """Deterministic Groq model used by the LLM judge (temperature 0)."""
    return ChatGroq(model=GROQ_EVAL_MODEL, temperature=0.0)
|
rag_core/rag_chain.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# rag_core/rag_chain.py (top of file)
|
| 2 |
+
|
| 3 |
+
from typing import List, Tuple, Dict, Any
|
| 4 |
+
from langchain.schema import Document, BaseRetriever
|
| 5 |
+
from transformers import pipeline
|
| 6 |
+
from rag_core.index_builder import load_vectorstore, build_and_save_index
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# ---- Label routing helpers ----
|
| 10 |
+
|
| 11 |
+
# Global zero-shot classifier (loaded once)
|
| 12 |
+
label_classifier = pipeline(
|
| 13 |
+
"zero-shot-classification",
|
| 14 |
+
model="facebook/bart-large-mnli"
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
def build_label_vocab(docs: List[Document]) -> List[str]:
    """
    Collect unique, whitespace-normalized section labels from document
    metadata (section_header first, then section_label), preserving the
    order of first appearance. Labels longer than 120 chars are truncated
    with a "..." suffix.
    """
    vocab: List[str] = []
    known = set()
    for doc in docs:
        for raw in (
            (doc.metadata.get("section_header") or "").strip(),
            (doc.metadata.get("section_label") or "").strip(),
        ):
            if not raw:
                continue
            label = " ".join(raw.split())
            if len(label) > 120:
                label = label[:120] + "..."
            if label in known:
                continue
            known.add(label)
            vocab.append(label)
    return vocab
|
| 34 |
+
|
| 35 |
+
def map_query_to_labels_zero_shot(
    query: str,
    candidate_labels: List[str],
    top_k: int = 5,
    score_threshold: float = 0.40,
) -> List[Tuple[str, float]]:
    """
    Zero-shot classify *query* against candidate labels and return the
    confident (label, score) pairs, at most top_k of them. If nothing
    clears the threshold, the single best label is returned anyway.
    """
    if not candidate_labels:
        return []

    result = label_classifier(query, candidate_labels, multi_label=True)
    ranked = list(zip(result["labels"], result["scores"]))

    picked = [
        (label, float(score))
        for label, score in ranked[:top_k]
        if score >= score_threshold
    ]
    if not picked and ranked:
        # Fall back to the top-ranked label so routing never goes empty-handed.
        best_label, best_score = ranked[0]
        picked = [(best_label, float(best_score))]
    return picked
|
| 53 |
+
|
| 54 |
+
def fetch_docs_by_labels_with_scores(
    selected_labels: List[Tuple[str, float]],
    docs: List[Document],
) -> List[Tuple[Document, float, List[str]]]:
    """
    For each doc, find which of selected_labels it matches via
    case-insensitive substring match on section_header / section_label.
    Returns (Document, score, [matched_labels]) triples where score is
    the maximum score among the matched labels.
    """
    if not selected_labels:
        return []

    score_by_label: Dict[str, float] = {lbl.lower(): sc for lbl, sc in selected_labels}

    matches: List[Tuple[Document, float, List[str]]] = []
    for doc in docs:
        haystack = " ".join(
            [
                (doc.metadata.get("section_header") or "").lower(),
                (doc.metadata.get("section_label") or "").lower(),
            ]
        )

        hits = [
            (lbl, sc)
            for lbl, sc in score_by_label.items()
            if lbl and lbl in haystack
        ]
        if hits:
            matched_names = [lbl for lbl, _ in hits]
            best_score = max(sc for _, sc in hits)
            matches.append((doc, best_score, matched_names))
    return matches
|
| 83 |
+
|
| 84 |
+
class LabelRoutingRetriever(BaseRetriever):
    """
    Retriever that:
    1) Uses BART-MNLI to map query -> section labels.
    2) Fetches all docs whose header/label match those labels.
    3) Ranks docs by label confidence.
    4) Falls back to vector retriever if no labels match.

    NOTE(review): this overrides the *public* get_relevant_documents
    instead of LangChain's _get_relevant_documents hook, and rebuilds the
    label vocabulary on every query — both worth revisiting.
    """

    # Pydantic fields (BaseRetriever is a pydantic model).
    docs: List[Document]                 # all indexed documents, used for label matching
    vector_retriever: Any = None         # semantic fallback retriever (optional)
    top_k_labels: int = 5                # labels considered per query
    label_score_threshold: float = 0.35  # min zero-shot confidence to keep a label
    k_docs: int = 6                      # max documents returned

    class Config:
        # Document/retriever objects are not pydantic-native types.
        arbitrary_types_allowed = True

    def __init__(
        self,
        docs: List[Document],
        vector_retriever: Any = None,
        top_k_labels: int = 5,
        label_score_threshold: float = 0.35,
        k_docs: int = 6,
        **kwargs,
    ):
        # Forward everything to the pydantic BaseRetriever constructor.
        super().__init__(
            docs=docs,
            vector_retriever=vector_retriever,
            top_k_labels=top_k_labels,
            label_score_threshold=label_score_threshold,
            k_docs=k_docs,
            **kwargs,
        )

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Route *query* to documents by section label, with vector fallback."""
        # 1) build candidate labels from docs (recomputed on every call)
        candidate_labels = build_label_vocab(self.docs)

        # 2) map query -> (label, score)
        mapped = map_query_to_labels_zero_shot(
            query,
            candidate_labels,
            top_k=self.top_k_labels,
            score_threshold=self.label_score_threshold,
        )

        # 3) fetch docs with scores
        docs_with_scores = fetch_docs_by_labels_with_scores(mapped, self.docs)

        if not docs_with_scores and self.vector_retriever is not None:
            # fallback to vector retriever (semantic retrieval)
            vec_docs = self.vector_retriever.get_relevant_documents(query)
            return vec_docs[: self.k_docs]

        # 4) sort by score desc + dedupe (key = source + content prefix)
        seen_keys = set()
        ranked_docs: List[Document] = []
        for d, sc, matched in sorted(docs_with_scores, key=lambda x: x[1], reverse=True):
            # Skip the generic "More Project" navigation sections.
            if d.metadata.get("Header 2")=='More Project':
                continue
            key = (d.metadata.get("source"), d.page_content[:200])
            if key in seen_keys:
                continue
            seen_keys.add(key)
            ranked_docs.append(d)
            if len(ranked_docs) >= self.k_docs:
                break
        return ranked_docs

    async def aget_relevant_documents(self, query: str) -> List[Document]:
        # simple async wrapper (runs the sync path; no true concurrency)
        return self.get_relevant_documents(query)
|
| 158 |
+
|
| 159 |
+
from langchain.chains.combine_documents import create_stuff_documents_chain
|
| 160 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 161 |
+
from langchain.chains import create_retrieval_chain
|
| 162 |
+
from .models_groq import get_answer_llm, get_judge_llm
|
| 163 |
+
|
| 164 |
+
SYSTEM_PROMPT = """You are Ritam's personal QA bot.
|
| 165 |
+
Use the following context from his website and resume to answer.
|
| 166 |
+
|
| 167 |
+
Question: {input}
|
| 168 |
+
Context:
|
| 169 |
+
{context}
|
| 170 |
+
|
| 171 |
+
Answer in first person as Ritam."""
|
| 172 |
+
|
| 173 |
+
def build_rag_chain(vs, k=5, max_docs=6):
    """Build a label-routed retrieval-augmented QA chain over *vs*.

    Args:
        vs: FAISS vector store; its docstore supplies the full corpus.
        k: number of labels the router may match per query (top_k_labels).
        max_docs: maximum number of documents fed to the LLM.

    Returns:
        (rag_chain, retriever, SYSTEM_PROMPT)
    """
    # NOTE(review): reaches into the private FAISS docstore dict — assumes
    # the langchain FAISS implementation; confirm if the store type changes.
    all_docs = list(vs.docstore._dict.values())
    vector_retriever = vs.as_retriever(search_kwargs={"k": 8})

    # Label-routing retriever with a semantic fallback (replaces the old
    # ensemble/PrefixRetriever setup).
    retriever = LabelRoutingRetriever(
        docs=all_docs,
        vector_retriever=vector_retriever,
        top_k_labels=k,
        label_score_threshold=0.4,
        k_docs=max_docs,
    )

    llm = get_answer_llm()
    # BUGFIX: the original built a from_template() prompt here and then
    # immediately overwrote it with this from_messages() prompt; the dead
    # first assignment has been removed.
    prompt = ChatPromptTemplate.from_messages([
        ("system", SYSTEM_PROMPT),
        ("human", "{input}"),
    ])

    combine_docs_chain = create_stuff_documents_chain(llm, prompt)
    rag_chain = create_retrieval_chain(retriever, combine_docs_chain)
    return rag_chain, retriever, SYSTEM_PROMPT
|
rag_core/rag_chain_helper.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# rag_core/rag_chain.py
|
| 2 |
+
from langchain.chains import create_retrieval_chain
|
| 3 |
+
from langchain.chains.combine_documents import create_stuff_documents_chain
|
| 4 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 5 |
+
from rag_core.FusedRetreiver import FusedRetriever
|
| 6 |
+
from rag_core.PrefixRetreiver import PrefixRetriever
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
from .models_groq import get_answer_llm, get_judge_llm
|
| 10 |
+
|
| 11 |
+
def faiss_all_docs(faiss_store):
    """Return every Document held by *faiss_store*, in FAISS index order.

    FAISS keeps a mapping from vector index position to docstore id; walk
    that mapping and resolve each id through the docstore.
    """
    return [
        faiss_store.docstore.search(doc_id)
        for doc_id in faiss_store.index_to_docstore_id.values()
    ]
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
SYSTEM_PROMPT = """
|
| 19 |
+
You are a QA assistant for Ritam that answers questions about Ritam's career posing as Ritam's digital twin.
|
| 20 |
+
|
| 21 |
+
You will receive:
|
| 22 |
+
- `context`: text chunks retrieved from Ritam's resume, website, project pages, scholar profile, etc.
|
| 23 |
+
- `chat_history`: a short summary of the prior conversation turns for coherence.
|
| 24 |
+
|
| 25 |
+
Context:
|
| 26 |
+
{context}
|
| 27 |
+
|
| 28 |
+
Chat history (for reference):
|
| 29 |
+
{chat_history}
|
| 30 |
+
|
| 31 |
+
Rules:
|
| 32 |
+
- Use ONLY the provided context for factual claims.
|
| 33 |
+
- If the answer is not clearly supported by the context, say "I don't know"
|
| 34 |
+
and suggest rephrasing in terms of Ritam's work, projects, or research.
|
| 35 |
+
- If the user asks generic questions unrelated to Ritam, politely refuse saying that please ask only career related questions to me.
|
| 36 |
+
|
| 37 |
+
The chat_history is only for understanding the follow-up question; do NOT invent facts
|
| 38 |
+
that aren't in the context even if they appear earlier in chat_history.
|
| 39 |
+
|
| 40 |
+
"""
|
| 41 |
+
|
| 42 |
+
def build_rag_chain(vectorstore, k=2, max_lines=3, weights=(0.35, 0), top_k=None):
    """
    Returns (rag_chain, retriever, SYSTEM_PROMPT)

    - vectorstore: your FAISS/Chroma/… already loaded
    - k: per-retriever fetch size (each retriever will pull up to k docs)
    - max_lines: how many leading lines to consider for prefix matching
    - weights: accepted for backward compatibility; currently UNUSED
      (FusedRetriever interleaves results rather than weighting them)
    - top_k: final number of docs returned by the fused retriever
      (defaults to 6 when not given)
    """
    # ---- Retrievers
    all_docs = faiss_all_docs(vectorstore)
    # BUGFIX: k and max_lines were previously hard-coded to 2/3 and the
    # function arguments silently ignored; wire them through. Defaults are
    # unchanged, so default-call behavior is identical.
    prefix_retriever = PrefixRetriever(docs=all_docs, k=k, max_lines=max_lines)

    # Pull a few extra from vectorstore to give the fusion more to work with
    vector_retriever = vectorstore.as_retriever(search_kwargs={"k": max(k, 1)})

    fused_retriever = FusedRetriever(
        prefix_retriever=prefix_retriever,
        vector_retriever=vector_retriever,
        # BUGFIX: top_k was previously ignored in favor of a hard-coded 6.
        k=top_k if top_k is not None else 6,
        prefix_first=True,
    )

    # ---- LLM + prompt
    llm = get_answer_llm()
    prompt = ChatPromptTemplate.from_messages([
        ("system", SYSTEM_PROMPT),
        ("human", "{input}"),
    ])

    # ---- Stuff docs into the prompt, then make the RAG chain
    document_chain = create_stuff_documents_chain(llm, prompt)
    rag_chain = create_retrieval_chain(fused_retriever, document_chain)

    return rag_chain, fused_retriever, SYSTEM_PROMPT
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
# ---------- Conversational question rewriter ----------

# Use a low-temperature model (the judge LLM) so rewrites stay literal and
# faithful to the user's wording rather than creative.
_rewriter_llm = get_judge_llm()

# Prompt that turns (chat_history, question) into a single standalone
# question; consumed by rewrite_question_with_history().
QUESTION_REWRITE_PROMPT = ChatPromptTemplate.from_messages(
    [
        ("system", """
You are a helpful assistant that rewrites follow-up questions into standalone questions.

You are given a chat history between a user and an assistant, plus the user's new question.
Your job is to rewrite the new question so that it is self-contained and can be understood
without the previous turns.

The rewritten question MUST stay faithful to the user's intent and be about Ritam's
career, projects, research, or education.
If the question is already standalone, return it as-is.
"""),
        ("human", """
Chat history:
------------
{chat_history}

New user question:
------------
{question}

Rewrite the new question into a single, self-contained question:
"""),
    ]
)
|
| 112 |
+
|
| 113 |
+
def rewrite_question_with_history(history, question: str) -> str:
    """
    Turn a follow-up question into a standalone one using the chat history.

    history: list of [user_msg, assistant_msg] pairs (from Gradio ChatInterface)
    question: current user string

    Returns a standalone question string that incorporates context from history.
    """
    # Nothing to resolve against — pass the question through untouched.
    if not history:
        return question

    # Flatten the turns into a plain-text transcript for the rewriter prompt.
    transcript = []
    for pair in history:
        # Skip malformed turns that don't carry both sides of the exchange.
        if not pair or len(pair) < 2:
            continue
        transcript.append(f"User: {pair[0]}")
        transcript.append(f"Assistant: {pair[1]}")
    joined_history = "\n".join(transcript)

    rewriter = QUESTION_REWRITE_PROMPT | _rewriter_llm
    response = rewriter.invoke({"chat_history": joined_history, "question": question})
    rewritten = response.content.strip()
    # Guard against an empty rewrite: fall back to the original question.
    return rewritten or question
|
rag_core/sources.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# rag_core/sources.py
# Central registry of the URLs the crawler/indexer ingests.

# LinkedIn profile URL (note: the variable holds LinkedIn, not a resume file)
LINKEDIN_URL = "https://www.linkedin.com/in/ritam-upadhyay-51ba81192/"

# Root projects page that links to each project subpage
PROJECTS_ROOT_URL = "https://fearless-writers-028990.framer.app/project"

# Other URLs directly relevant to your career
OTHER_CAREER_URLS = [
    "https://fearless-writers-028990.framer.app/old-home",
    "https://fearless-writers-028990.framer.app/",
    "https://scholar.google.com/citations?user=04o0bdcAAAAJ&hl=en",
    "https://fearless-writers-028990.framer.app/stack",
]

# These roots will be crawled (link-followed) by the crawler:
CRAWL_ROOTS = [
    PROJECTS_ROOT_URL,
]

# These are direct, non-crawling URLs fetched as-is:
FIXED_URLS = [
    *OTHER_CAREER_URLS,
]
|
requirements.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==5.49.1
|
| 2 |
+
langchain==0.3.12
|
| 3 |
+
python-dotenv==1.0.1
|
| 4 |
+
langchain-core==0.3.28
|
| 5 |
+
langchain-openai==0.2.12
|
| 6 |
+
langchain-community==0.3.12
|
| 7 |
+
langchain-text-splitters==0.3.4
|
| 8 |
+
langchain-groq==0.2.1
|
| 9 |
+
sentence-transformers==3.3.1
|
| 10 |
+
faiss-cpu==1.9.0.post1
|
| 11 |
+
beautifulsoup4==4.12.3
|
| 12 |
+
requests==2.32.3
|
| 13 |
+
pypdf==5.1.0
|
| 14 |
+
lxml==6.0.2
|