github-actions[bot] committed on
Commit
0064c35
·
0 Parent(s):

Deploy snapshot to HF (binaries stripped)

Browse files
.github/workflows/push_to_hf_space.yml ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Deploys a cleaned snapshot of the hf-deploy branch to a Hugging Face Space.
# The snapshot is rebuilt as a single-commit repo with all .pkl/.faiss
# vectorstore binaries stripped, then force-pushed to the Space's main branch.
name: Sync to Hugging Face Space (always strip binaries)

on:
  push:
    branches: [ "hf-deploy" ]
  workflow_dispatch:

jobs:
  deploy:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout source
        uses: actions/checkout@v4
        with:
          # full history so git archive of HEAD works predictably
          fetch-depth: 0
          ref: hf-deploy

      - name: Create clean snapshot (remove .pkl/.faiss no matter what)
        run: |
          set -euxo pipefail

          # Export current commit as plain files (no .git history)
          rm -rf /tmp/snapshot
          mkdir -p /tmp/snapshot
          git archive --format=tar HEAD | tar -x -C /tmp/snapshot

          echo "== Files matching .pkl/.faiss BEFORE removal =="
          (cd /tmp/snapshot && find . -type f \( -name "*.pkl" -o -name "*.faiss" \) -print) || true

          # Always delete these binaries (even if someone committed them)
          (cd /tmp/snapshot && find . -type f \( -name "*.pkl" -o -name "*.faiss" \) -print -delete) || true

          echo "== Files matching .pkl/.faiss AFTER removal =="
          (cd /tmp/snapshot && find . -type f \( -name "*.pkl" -o -name "*.faiss" \) -print) || true

          # Ensure they won't be committed into the snapshot repo
          printf "\n# Never deploy vectorstore binaries to Spaces\n*.pkl\n*.faiss\n" >> /tmp/snapshot/.gitignore

          # Build a new single-commit git repo from the cleaned snapshot
          cd /tmp/snapshot
          git init
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git add -A
          git commit -m "Deploy snapshot to HF (binaries stripped)"

      - name: Push snapshot to Hugging Face Space (HF main)
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          HF_USERNAME: ritup3 # CHANGE if needed
          SPACE_NAME: PersonaRag # CHANGE if needed
        run: |
          set -euxo pipefail
          cd /tmp/snapshot

          # Authenticated remote for HF Space
          git remote add space https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$SPACE_NAME

          # Spaces deploy from branch "main"
          git branch -M main
          git push space main --force
.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ *.faiss
4
+ *.pkl
5
+ .env
6
+ .DS_Store
7
+ # Never deploy vectorstore binaries to Spaces
8
+ *.pkl
9
+ *.faiss
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: PersonaRag
3
+ emoji: "🤖"
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: "5.49.1"
8
+ python_version: "3.10.13"
9
+ app_file: app.py
10
+ pinned: false
11
+ ---
app.py ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import os
3
+ import json
4
+ import logging
5
+ from pathlib import Path
6
+ from dotenv import load_dotenv
7
+ import gradio as gr
8
+ import time
9
+ import threading
10
+ import atexit
11
+
12
+ from rag_core.index_builder import load_vectorstore
13
+ from rag_core.rag_chain_helper import rewrite_question_with_history
14
+ from rag_core.rag_chain import build_rag_chain
15
+ from rag_core.evaluator import evaluate_answer
16
+ from rag_core.index_builder import build_and_save_index
17
+
18
+ from rag_core.config import VECTORSTORE_PATH
19
+
20
# ---------- Refresh config ----------
# All knobs are environment-overridable. Defaults: refresh enabled, daily
# at 03:00 local time.
REFRESH_ENABLED = os.getenv("REFRESH_ENABLED", "true").lower() == "true"
# NOTE(review): REFRESH_INTERVAL_SECONDS is read here but the scheduler below
# uses REFRESH_AT_HOUR/MINUTE, not this interval — confirm it is still needed.
REFRESH_INTERVAL_SECONDS = int(os.getenv("REFRESH_INTERVAL_SECONDS", str(24 * 60 * 60)))
REFRESH_AT_HOUR = int(os.getenv("REFRESH_AT_HOUR", "3"))
REFRESH_AT_MINUTE = int(os.getenv("REFRESH_AT_MINUTE", "0"))
REFRESH_ONLY_FIXED_URLS = os.getenv("REFRESH_ONLY_FIXED_URLS", "false").lower() == "true"

# Guards the rag_chain/retriever/system_prompt globals during the refresh swap.
state_lock = threading.RLock()
# Set at interpreter shutdown to make the background refresh loop exit.
stop_refresh_event = threading.Event()
29
+
30
+
31
+ load_dotenv()
32
+
33
# ---------- Logging (Model Flow) ----------
# Dedicated "model_flow" logger for structured request tracing; level is
# env-configurable (default INFO).
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper()

logger = logging.getLogger("model_flow")
logger.setLevel(LOG_LEVEL)

# Attach a stream handler only once, so re-imports don't duplicate output.
if not logger.handlers:
    h = logging.StreamHandler()
    h.setLevel(LOG_LEVEL)
    formatter = logging.Formatter(
        "%(asctime)s | %(levelname)s | %(name)s | %(message)s"
    )
    h.setFormatter(formatter)
    logger.addHandler(h)
47
+
48
def log_event(event: str, **payload):
    """Emit one structured-ish log line: '<event> | <json payload>'.

    Payload values that json cannot serialize are stringified first, so
    logging never raises regardless of what callers pass in.
    """
    serializable = {}
    for key, value in payload.items():
        try:
            json.dumps(value)  # probe serializability
        except TypeError:
            serializable[key] = str(value)
        else:
            serializable[key] = value
    logger.info("%s | %s", event, json.dumps(serializable, ensure_ascii=False))
58
+
59
+
60
+ # ---------- Global state ----------
61
+ vectorstore = None
62
+ rag_chain = None
63
+ retriever = None
64
+ system_prompt = None
65
+
66
+
67
def init_rag():
    """Load (or build, if missing) the vectorstore and construct the RAG chain.

    Populates the module globals vectorstore / rag_chain / retriever /
    system_prompt. Called once at import time.
    """
    global vectorstore, rag_chain, retriever, system_prompt

    # Build the index only when no serialized FAISS index exists on disk.
    # NOTE(review): the original comment said "HARD DISABLE: no crawling /
    # no auto-index build", but this branch DOES build the index (and
    # build_and_save_index presumably crawls) — confirm intended behavior.
    index_path = Path(VECTORSTORE_PATH) / "index.faiss"
    if not index_path.exists():
        n_chunks, _ = build_and_save_index()
        log_event("refresh.index_built", mode="crawl", chunks=n_chunks)

    vectorstore = load_vectorstore()

    rag_chain, retriever, system_prompt = build_rag_chain(
        vectorstore,
        k=5,
        max_docs=2
    )
    log_event("init_rag.ready", vectorstore_path=VECTORSTORE_PATH)
84
+
85
+
86
+ init_rag()
87
+
88
def refresh_rag_once():
    """
    Refetch website docs, rebuild the index, and swap in a fresh RAG chain.

    Never raises: any failure is logged and the previously active
    vectorstore/chain stay live.
    """
    global vectorstore, rag_chain, retriever, system_prompt

    log_event("refresh.start", only_fixed_urls=REFRESH_ONLY_FIXED_URLS)

    try:
        # NOTE: build_rag_chain and load_vectorstore are already imported at
        # module level; the redundant function-local re-imports that shadowed
        # them were removed.
        n_chunks, _ = build_and_save_index()
        log_event("refresh.index_built", mode="crawl", chunks=n_chunks)

        # Reload from disk (ensures consistent serialization)
        vs = load_vectorstore()

        # Build the new chain before touching any shared state.
        new_chain, new_retriever, new_system_prompt = build_rag_chain(
            vs,
            k=5,
            max_docs=2,
        )

        # Atomic swap under the lock so in-flight requests snapshot a
        # consistent (chain, prompt) pair.
        with state_lock:
            vectorstore = vs
            rag_chain = new_chain
            retriever = new_retriever
            system_prompt = new_system_prompt

        log_event("refresh.done", status="ok")

    except Exception as e:
        # Best-effort by design: keep serving with the old chain.
        log_event("refresh.error", error=str(e))
125
+
126
+ def _seconds_until_next_run(hour: int, minute: int) -> int:
127
+ # compute sleep until next local time (hour:minute)
128
+ now = time.localtime()
129
+ target = time.mktime((
130
+ now.tm_year, now.tm_mon, now.tm_mday,
131
+ hour, minute, 0,
132
+ now.tm_wday, now.tm_yday, now.tm_isdst
133
+ ))
134
+ now_ts = time.time()
135
+ if target <= now_ts:
136
+ target += 24 * 60 * 60
137
+ return int(target - now_ts)
138
+
139
def _daily_refresh_loop():
    """Background loop: sleep until the configured local time, then refresh.

    Runs until stop_refresh_event is set; sleeps in short slices so a
    shutdown request is noticed within a few seconds.
    """
    # small startup delay so initialization finishes first
    time.sleep(3)

    while not stop_refresh_event.is_set():
        remaining = _seconds_until_next_run(REFRESH_AT_HOUR, REFRESH_AT_MINUTE)
        log_event(
            "refresh.sleep",
            seconds=remaining,
            at_hour=REFRESH_AT_HOUR,
            at_minute=REFRESH_AT_MINUTE,
        )

        # Chunked sleep: wake every <=5s to check for shutdown.
        while remaining > 0 and not stop_refresh_event.is_set():
            slice_s = min(5, remaining)
            time.sleep(slice_s)
            remaining -= slice_s

        if stop_refresh_event.is_set():
            break

        refresh_rag_once()
158
+
159
def start_refresh_thread():
    """Spawn the daily-refresh daemon thread, unless refresh is disabled."""
    if not REFRESH_ENABLED:
        log_event("refresh.disabled")
        return
    worker = threading.Thread(target=_daily_refresh_loop, daemon=True)
    worker.start()
    log_event(
        "refresh.thread_started",
        daily_at=f"{REFRESH_AT_HOUR:02d}:{REFRESH_AT_MINUTE:02d}",
    )
166
+
167
+ atexit.register(lambda: stop_refresh_event.set())
168
+ start_refresh_thread()
169
+
170
+
171
+
172
+ # ---------- Helpers ----------
173
+ def _history_to_text(history):
174
+ """Convert Gradio history ([[user, bot], ...]) to a readable text snippet."""
175
+ if not history:
176
+ return ""
177
+ lines = []
178
+ for turn in history:
179
+ if not turn or len(turn) < 2:
180
+ continue
181
+ user_msg, assistant_msg = turn[0], turn[1]
182
+ lines.append(f"User: {user_msg}")
183
+ lines.append(f"Assistant: {assistant_msg}")
184
+ return "\n".join(lines)
185
+
186
+
187
+ def _docs_to_loggable(docs, max_chars=220):
188
+ """Return lightweight doc info for logs (no full dump)."""
189
+ out = []
190
+ for d in (docs or []):
191
+ src = (d.metadata or {}).get("source", "unknown")
192
+ txt = (d.page_content or "").strip().replace("\n", " ")
193
+ out.append({
194
+ "source": src,
195
+ "preview": (txt[:max_chars] + ("..." if len(txt) > max_chars else "")),
196
+ "metadata": (d.metadata or {}),
197
+ })
198
+ return out
199
+
200
+
201
def generate_answer(message, history):
    """
    Produce the final answer string for one user message.

    Pipeline:
      1. rewrite the question with chat history (best-effort),
      2. run the RAG chain (required; on failure return a fallback reply),
      3. evaluate the answer with the LLM judge (best-effort),
      4. retry once when the evaluator flags the answer, keeping whichever
         of the two answers the judge scores higher.

    Returns ONLY the final answer string (no sources/context/evaluator in UI).
    """
    log_event("request.start", user_message=message)

    # ---------- 1. Rewrite with history (best-effort) ----------
    try:
        standalone_question = rewrite_question_with_history(history, message)
    except Exception as e:
        log_event("rewrite.error", error=str(e))
        standalone_question = message  # fallback: use original message

    history_text = _history_to_text(history)

    log_event(
        "rewrite.done",
        standalone_question=standalone_question,
        history_chars=len(history_text),
    )

    # ---------- 2. Run RAG (if this fails, we bail with generic error text) ----------
    try:
        # Snapshot chain + prompt under the lock so a concurrent background
        # refresh cannot swap them mid-request.
        with state_lock:
            local_rag_chain = rag_chain
            local_system_prompt = system_prompt

        rag_res = local_rag_chain.invoke({
            "input": standalone_question,
            "chat_history": history_text,
        })
    except Exception as e:
        log_event("rag.error", error=str(e))
        fallback = (
            "I'm having trouble accessing my knowledge base right now. "
            "Please try again in a moment."
        )
        log_event(
            "request.end",
            final_answer_preview=fallback[:400] + ("..." if len(fallback) > 400 else "")
        )
        return fallback

    answer_1 = rag_res.get("answer", "") or ""
    ctx_docs_1 = rag_res.get("context", []) or []

    log_event(
        "rag.done",
        answer_preview=answer_1[:400] + ("..." if len(answer_1) > 400 else ""),
        retrieved_count=len(ctx_docs_1),
        retrieved_docs=_docs_to_loggable(ctx_docs_1),
    )

    # ---------- 3. Evaluate (best-effort; never crash on judge failure) ----------
    eval_res_1 = None
    try:
        eval_res_1 = evaluate_answer(
            system_prompt=local_system_prompt,
            question=message,
            context_docs=ctx_docs_1,
            answer=answer_1,
        )
        log_event(
            "eval.done",
            overall_score=float(eval_res_1.overall_score),
            grounded=float(eval_res_1.grounded_in_context_score),
            hallucination=bool(eval_res_1.hallucination_detected),
            feedback=str(eval_res_1.feedback),
        )
    except Exception as e:
        log_event("eval.error", error=str(e))
        # We just skip retry logic; answer_1 is still valid.

    final_answer = answer_1

    # ---------- 4. Single retry (only if evaluator succeeded & says to retry) ----------
    try:
        needs_retry = eval_res_1 is not None and (
            eval_res_1.overall_score < 0.70
            # BUGFIX: default False — a missing should_retry flag must not
            # force a retry (was getattr(..., True)).
            or getattr(eval_res_1, "should_retry", False)
        )
        if needs_retry:
            revision_prompt = (
                f"{standalone_question}\n\n"
                f"You previously answered this:\n{answer_1}\n\n"
                "An evaluator found issues. Revise your answer to address the feedback below.\n"
                "Rules:\n"
                "- Use ONLY the provided context.\n"
                "- If the context does not support the claim, say \"I don't know\".\n"
                "- Be specific and grounded.\n\n"
                f"Evaluator feedback:\n{eval_res_1.feedback}\n"
            )

            log_event(
                "retry.triggered",
                reason="eval_score_below_threshold",
                threshold=0.70,  # BUGFIX: log the threshold actually used (was 0.90)
            )

            # RAG retry — if this fails, we keep the original answer_1
            try:
                # BUGFIX: use the lock-snapshotted chain, not the global
                # rag_chain, so a concurrent refresh can't race this call.
                rag_res_2 = local_rag_chain.invoke({
                    "input": revision_prompt,
                    "chat_history": history_text,
                })
                answer_2 = rag_res_2.get("answer", "") or ""
                ctx_docs_2 = rag_res_2.get("context", []) or []

                log_event(
                    "rag.retry_done",
                    answer_preview=answer_2[:400] + ("..." if len(answer_2) > 400 else ""),
                    retrieved_count=len(ctx_docs_2),
                    retrieved_docs=_docs_to_loggable(ctx_docs_2),
                )

                # Re-evaluate the revised answer (ignore errors).
                # BUGFIX: eval_res_2 is pre-initialized so a judge failure no
                # longer raises NameError in the comparison below (previously
                # surfaced as a spurious rag.retry_error).
                eval_res_2 = None
                try:
                    eval_res_2 = evaluate_answer(
                        system_prompt=local_system_prompt,
                        question=message,
                        context_docs=ctx_docs_2,
                        answer=answer_2,
                    )
                    log_event(
                        "eval.retry_done",
                        overall_score=float(eval_res_2.overall_score),
                        grounded=float(eval_res_2.grounded_in_context_score),
                        hallucination=bool(eval_res_2.hallucination_detected),
                        feedback=str(eval_res_2.feedback),
                    )
                except Exception as e_eval2:
                    log_event("eval.retry_error", error=str(e_eval2))

                # Keep whichever answer scored higher; without a second score
                # we keep the original (matches prior effective behavior).
                if eval_res_2 is not None and eval_res_2.overall_score >= eval_res_1.overall_score:
                    final_answer = answer_2
                else:
                    final_answer = answer_1

            except Exception as e_rag2:
                # Retry RAG failed; log and fall back to first answer
                log_event("rag.retry_error", error=str(e_rag2))
                final_answer = answer_1

    except Exception as e_retry_block:
        # Any unexpected error in retry logic should not crash the whole request
        log_event("retry.block_error", error=str(e_retry_block))
        final_answer = answer_1

    # ---------- 5. Final logging & return ----------
    log_event(
        "request.end",
        final_answer_preview=final_answer[:400] + ("..." if len(final_answer) > 400 else "")
    )

    return final_answer
362
+
363
+
364
def respond(message, history):
    """Gradio callback: answer `message`, append the turn, clear the textbox.

    Any unexpected failure inside generate_answer is logged and replaced
    with a safe fallback reply so the UI never sees an exception.
    """
    if not message:
        return "", history

    try:
        reply = generate_answer(message, history)
    except Exception as exc:
        log_event("respond.fatal_error", error=str(exc))
        reply = (
            "Something went wrong on my side while trying to answer. "
            "Please try again in a moment."
        )

    return "", history + [[message, reply]]
384
+
385
+
386
# ---------- Gradio UI ----------
# Chat layout: a Chatbot panel, a multiline textbox with a Send button, and
# a Clear button. Both Send and Enter-submit route through respond().
with gr.Blocks(title="Ask Ritam (Career QA Bot)") as demo:
    gr.Markdown(
        "# Ask Ritam\n"
        "A RAG-powered career assistant over my resume, website, and projects.\n"
        "Ask anything about my experience, projects, research, or education."
    )

    with gr.Row():
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(label="Conversation", height=500)

            with gr.Row():
                msg = gr.Textbox(
                    placeholder="Ask anything about my career, projects, or research...",
                    lines=2,
                    scale=4,
                    show_label=False,
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)

            clear_btn = gr.Button("Clear chat")

    # respond returns ("", new_history): clears the textbox, updates the chat.
    send_btn.click(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
    msg.submit(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])

    # Reset both the conversation and the textbox.
    clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])

demo.launch()
exp.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
rag_core/FusedRetreiver.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, List, Set, Tuple
2
+ from langchain.schema import BaseRetriever, Document
3
+
4
class FusedRetriever(BaseRetriever):
    """
    Pydantic-compatible fused retriever that merges candidates from a
    header-aware PrefixRetriever and a vector retriever.

    Results are deduplicated by (source, first 200 chars of content) and
    truncated to at most `k` documents; `prefix_first` decides which
    candidate list has priority.
    """
    prefix_retriever: Any   # duck-typed: any object exposing a retrieval method
    vector_retriever: Any
    k: int = 4              # maximum number of fused documents returned
    prefix_first: bool = True

    class Config:
        # allows storing arbitrary Python objects in model fields
        arbitrary_types_allowed = True

    def _fuse(self, primary: List[Document], secondary: List[Document]) -> List[Document]:
        """Merge two candidate lists in priority order, deduped, capped at k."""
        seen: Set[Tuple[str, str]] = set()
        fused: List[Document] = []
        for d in list(primary) + list(secondary):
            key = (d.metadata.get("source"), d.page_content[:200])
            if key in seen:
                continue
            seen.add(key)
            fused.append(d)
            if len(fused) >= self.k:
                break
        return fused

    @staticmethod
    def _call_sync(retriever, query: str) -> List[Document]:
        """Best-effort synchronous retrieval across the APIs we support.

        BUGFIX: the original sync path could fall through to a
        `get_relevant_documents_async` branch and store an un-awaited
        coroutine as the result list; that branch was removed.
        """
        if hasattr(retriever, "get_relevant_documents"):
            return retriever.get_relevant_documents(query)
        if hasattr(retriever, "_get_relevant_documents"):
            return retriever._get_relevant_documents(query)
        if hasattr(retriever, "retrieve"):
            return retriever.retrieve(query)
        return []

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Return fused prefix + vector results for `query` (synchronous)."""
        prefix_docs = self._call_sync(self.prefix_retriever, query)

        # Vector retrieval is best-effort: on any failure fall back to
        # prefix hits only (matches the original behavior).
        try:
            vector_docs = self._call_sync(self.vector_retriever, query)
        except Exception:
            vector_docs = []

        if self.prefix_first:
            return self._fuse(prefix_docs, vector_docs)
        return self._fuse(vector_docs, prefix_docs)

    async def aget_relevant_documents(self, query: str) -> List[Document]:
        """Async variant: await async retrievers when available, else sync."""
        if hasattr(self.prefix_retriever, "aget_relevant_documents"):
            prefix_docs = await self.prefix_retriever.aget_relevant_documents(query)
        else:
            prefix_docs = self._call_sync(self.prefix_retriever, query)

        if hasattr(self.vector_retriever, "aget_relevant_documents"):
            vector_docs = await self.vector_retriever.aget_relevant_documents(query)
        else:
            vector_docs = self._call_sync(self.vector_retriever, query)

        if self.prefix_first:
            return self._fuse(prefix_docs, vector_docs)
        return self._fuse(vector_docs, prefix_docs)
rag_core/PrefixRetreiver.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional
2
+ from langchain_core.documents import Document
3
+ from langchain_core.retrievers import BaseRetriever
4
+ from pydantic import Field
5
+ from langchain.schema import Document
6
+ import re
7
+
8
+ def _norm(text: str) -> str:
9
+ # lower, remove punctuation-ish chars, compact whitespace
10
+ text = text or ""
11
+ text = text.lower()
12
+ text = re.sub(r"[^a-z0-9\s]", " ", text)
13
+ text = re.sub(r"\s+", " ", text).strip()
14
+ return text
15
+
16
class PrefixRetriever:
    """Lexical header/prefix matcher over a fixed document list.

    A document matches when, after `_norm`, either the whole query appears
    in its "head" (explicit section header metadata plus the first
    `max_lines` non-blank content lines) or every query token does.
    """

    def __init__(self, docs: List[Document], k: int = 3, max_lines: int = 8):
        self.docs = docs            # candidate documents to scan
        self.k = k                  # maximum number of matches returned
        self.max_lines = max_lines  # leading lines that form the matching "head"

    def _head(self, content: str) -> str:
        """First `max_lines` non-blank lines of `content`."""
        non_blank = [line for line in content.splitlines() if line.strip()]
        return "\n".join(non_blank[: self.max_lines])

    def _get_relevant_documents(self, query: str) -> List[Document]:
        normalized_query = _norm(query)
        query_tokens = normalized_query.split()
        matches: List[Document] = []

        for doc in self.docs:
            # Prefer explicit header metadata; fall back to the content head.
            header = doc.metadata.get("section_header") or doc.metadata.get("section_label") or ""
            head = _norm(header + "\n" + self._head(doc.page_content))

            # exact substring wins; otherwise require every token to appear
            is_match = bool(normalized_query) and normalized_query in head
            if not is_match and query_tokens:
                is_match = all(token in head for token in query_tokens)

            if is_match:
                matches.append(doc)
                if len(matches) >= self.k:
                    break

        return matches
53
+
54
+
55
# --- Helper to recover Documents from a FAISS vectorstore
def faiss_all_docs(faiss_store):
    """Return every stored Document by looking each docstore id up in order."""
    docstore = faiss_store.docstore
    return [
        docstore.search(doc_id)
        for doc_id in faiss_store.index_to_docstore_id.values()
    ]
59
+
rag_core/config.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pathlib import Path

# Repository root: two levels up from this file (rag_core/config.py -> repo/).
BASE_DIR = Path(__file__).resolve().parent.parent
DATA_DIR = BASE_DIR / "data"
VECTORSTORE_DIR = DATA_DIR / "vectorstore"

# NOTE: import-time side effect — guarantees the vectorstore directory
# exists before any index save/load happens.
VECTORSTORE_DIR.mkdir(parents=True, exist_ok=True)

# Directory that FAISS serializes the index into (as a string path).
VECTORSTORE_PATH = str(VECTORSTORE_DIR / "career_faiss_index")

# Groq model ids: one for chat answers, a smaller one for the LLM judge;
# embeddings run locally via sentence-transformers.
GROQ_CHAT_MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"
GROQ_EVAL_MODEL = "llama-3.1-8b-instant"
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
rag_core/crawler.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # rag_core/crawler.py
2
+ from urllib.parse import urljoin, urlparse
3
+ from collections import deque
4
+ import requests
5
+ from bs4 import BeautifulSoup
6
+
7
def crawl_subpages(root_url: str, max_pages: int = 30, max_depth: int = 1):
    """
    BFS-crawl subpages under the same path prefix as root_url.

    E.g. root_url = https://site.com/projects/ collects
    https://site.com/projects/* links, staying on the same domain, up to
    max_pages URLs and max_depth link hops. Fetch/parse errors are skipped
    silently (best-effort crawl).

    Returns a list of collected URLs (order is not guaranteed: a set is used
    for dedup).
    """
    parsed_root = urlparse(root_url)
    base_domain = parsed_root.netloc
    base_path = parsed_root.path.rstrip("/")  # e.g. "/projects"

    visited = set()
    to_visit = deque([(root_url, 0)])
    collected = {root_url}

    while to_visit and len(collected) < max_pages:
        url, depth = to_visit.popleft()
        if url in visited or depth > max_depth:
            continue
        # Mark before fetching so a failed fetch is never re-attempted.
        # (BUGFIX: a redundant second visited.add(url) at the end of the
        # loop body was removed.)
        visited.add(url)

        try:
            resp = requests.get(url, timeout=10)
            if not resp.ok:
                continue
            soup = BeautifulSoup(resp.text, "html.parser")
        except Exception:
            # network/parse failure: skip this page, keep crawling
            continue

        for a in soup.find_all("a", href=True):
            href = a["href"].strip()
            full_url = urljoin(url, href)
            parsed = urlparse(full_url)

            # Stay in same domain
            if parsed.netloc != base_domain:
                continue

            # Stay under the same base path
            if not parsed.path.startswith(base_path):
                continue

            if full_url not in collected:
                collected.add(full_url)
                to_visit.append((full_url, depth + 1))
                # BUGFIX: honor the max_pages budget while collecting links,
                # instead of only checking it between pages (which let
                # `collected` overshoot the limit).
                if len(collected) >= max_pages:
                    break

    return list(collected)
rag_core/embeddings_model.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # rag_core/embeddings_model.py
2
+ from dotenv import load_dotenv
3
+ from langchain_community.embeddings import HuggingFaceEmbeddings
4
+ from .config import EMBEDDING_MODEL
5
+
6
+ load_dotenv() # loads HUGGINGFACEHUB_API_TOKEN if present
7
+
8
def get_embeddings():
    """
    Build the HuggingFaceEmbeddings instance for the configured MiniLM model.

    Runs locally via sentence-transformers; the model is downloaded from
    Hugging Face on first use (with your HF token if one is configured).
    """
    embedder = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL,
        model_kwargs={"device": "cpu"},  # change to "cuda" if you have a GPU
        encode_kwargs={"normalize_embeddings": True},  # optional but often helpful
    )
    return embedder
rag_core/evaluator.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # rag_core/evaluator.py
2
+ from langchain_core.prompts import ChatPromptTemplate
3
+
4
+ from .models_groq import get_judge_llm
5
+ from .evaluator_schema import EvalResult
6
+
7
+ # Base LLM
8
+ _base_judge_llm = get_judge_llm()
9
+
10
+ # Wrap LLM with structured output (EvalResult)
11
+ _judge_llm_structured = _base_judge_llm.with_structured_output(EvalResult)
12
+
13
+ # Prompt template for the judge
14
+ JUDGE_PROMPT = ChatPromptTemplate.from_messages(
15
+ [
16
+ ("system", """
17
+ You are an impartial evaluator for a RAG-based assistant about Ritam's career.
18
+
19
+ You get:
20
+ - System prompt,
21
+ - User question,
22
+ - Retrieved context,
23
+ - Assistant answer.
24
+
25
+ Check:
26
+ 1. Does the answer follow the system prompt? (only Ritam's career / projects / research.)
27
+ 2. Does the answer actually respond to the user's question?
28
+ 3. Is the answer having too much extra information
29
+ 3. How well is the answer grounded in the context? Penalize hallucinations.
30
+
31
+ Return a strict JSON object matching the EvalResult schema.
32
+ """),
33
+ ("human", """
34
+ System prompt:
35
+ ----------------
36
+ {system_prompt}
37
+
38
+ User question:
39
+ ----------------
40
+ {question}
41
+
42
+ Retrieved context:
43
+ ----------------
44
+ {context}
45
+
46
+ Assistant answer:
47
+ ----------------
48
+ {answer}
49
+ """),
50
+ ]
51
+ )
52
+
53
def evaluate_answer(system_prompt: str, question: str, context_docs, answer: str) -> EvalResult:
    """
    Judge `answer` against the system prompt, question, and retrieved context
    using the structured-output LLM judge.

    On any judge failure (timeout, malformed response, ...) a permissive
    EvalResult with should_retry=False is returned so the caller keeps the
    original answer instead of crashing or retrying.
    """
    # Flatten the retrieved docs into one labeled context string.
    context_blocks = []
    for i, d in enumerate(context_docs):
        context_blocks.append(
            f"[DOC {i}] (source={d.metadata.get('source', 'unknown')})\n{d.page_content}"
        )
    ctx_text = "\n\n".join(context_blocks)

    judge_chain = JUDGE_PROMPT | _judge_llm_structured

    try:
        verdict: EvalResult = judge_chain.invoke({
            "system_prompt": system_prompt,
            "question": question,
            "context": ctx_text,
            "answer": answer,
        })
        return verdict

    except Exception as e:
        # Judge died / timed out / bad response
        # -> don't retry, but don't crash the main answer either.
        return EvalResult(
            follows_system_prompt=True,  # neutral / optimistic defaults
            answers_user_question=True,
            grounded_in_context_score=1.0,
            hallucination_detected=False,
            overall_score=1.0,  # high score so threshold logic won't trigger
            should_retry=False,  # explicitly NO RETRY
            feedback=f"[Evaluator failure] Judge could not evaluate this answer: {e}",
        )
rag_core/evaluator_schema.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # rag_core/evaluator_schema.py
2
+ from langchain_core.pydantic_v1 import BaseModel, Field
3
+
4
class EvalResult(BaseModel):
    """Structured verdict produced by the LLM judge for one RAG answer.

    NOTE(review): imported from langchain_core.pydantic_v1, which is a
    deprecated compatibility shim — consider migrating to plain pydantic.
    """
    # Did the answer respect the system prompt's scope/rules?
    follows_system_prompt: bool = Field(...)
    # Did the answer actually address the user's question?
    answers_user_question: bool = Field(...)
    # 0..1: how well the answer is supported by the retrieved context.
    grounded_in_context_score: float = Field(ge=0.0, le=1.0)
    # True when the judge found unsupported (hallucinated) claims.
    hallucination_detected: bool = Field(...)
    # 0..1 aggregate quality score; the caller retries below a threshold.
    overall_score: float = Field(ge=0.0, le=1.0)
    # Judge's explicit recommendation to regenerate the answer.
    should_retry: bool = Field(...)
    # Free-text critique, fed back into the retry prompt.
    feedback: str = Field(...)
rag_core/index_builder.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # rag_core/index_builder.py
2
+
3
+ import re
4
+ from typing import List
5
+
6
+ import requests
7
+ from langchain_community.document_loaders import OnlinePDFLoader
8
+ from langchain_text_splitters import HTMLSectionSplitter, RecursiveCharacterTextSplitter
9
+ from langchain_community.vectorstores import FAISS
10
+
11
+ from .embeddings_model import get_embeddings
12
+ from .config import VECTORSTORE_PATH
13
+ from .sources import CRAWL_ROOTS, FIXED_URLS
14
+ from .crawler import crawl_subpages
15
+
16
+
17
+ def _is_gdrive_file(url: str) -> bool:
18
+ """Return True if this looks like a Google Drive file view URL."""
19
+ return "drive.google.com" in url and "/file/d/" in url
20
+
21
+
22
+ def _gdrive_view_to_download(url: str) -> str:
23
+ """
24
+ Convert a Google Drive view URL to a direct download URL.
25
+
26
+ Example:
27
+ https://drive.google.com/file/d/<ID>/view
28
+ -> https://drive.google.com/uc?export=download&id=<ID>
29
+ """
30
+ m = re.search(r"/file/d/([^/]+)/", url)
31
+ if not m:
32
+ return url
33
+ file_id = m.group(1)
34
+ return f"https://drive.google.com/uc?export=download&id={file_id}"
35
+
36
+
37
+ def _infer_section_label_from_url(url: str) -> str:
38
+ """
39
+ Heuristic: guess a section label from the URL path.
40
+ e.g.
41
+ https://your-site.com/about -> 'about'
42
+ https://your-site.com/experience/juniper -> 'experience/juniper'
43
+ """
44
+ try:
45
+ path = url.split("://", 1)[-1].split("/", 1)[-1]
46
+ except Exception:
47
+ return url
48
+ path = path.strip("/")
49
+ if not path:
50
+ return "root"
51
+ return path
52
+
53
+
54
+ def _normalize_label(text: str, max_len: int = 80) -> str:
55
+ """
56
+ Create a short normalized label from a header text.
57
+ - lowercases, removes newlines, collapses whitespace
58
+ - trims to max_len and replaces spaces with '-' for compact labels
59
+ """
60
+ if not text:
61
+ return ""
62
+ lab = " ".join(text.split()) # collapse whitespace/newlines
63
+ lab = lab.strip().lower()
64
+ # shorten if too long
65
+ if len(lab) > max_len:
66
+ lab = lab[: max_len - 3].rstrip() + "..."
67
+ # use a compact label form (slug-like) but keep readability
68
+ slug = lab.replace(" ", "-")
69
+ # remove characters that would be awkward in metadata keys
70
+ slug = re.sub(r"[^a-z0-9_\-\.]+", "", slug)
71
+ return slug
72
+
73
+
74
def load_web_docs(urls: List[str]):
    """
    Load documents from a list of URLs.

    - HTML URLs:
        * Fetch raw HTML with `requests`
        * Split into sections with HTMLSectionSplitter (h1/h2)
        * Infer section labels from the actual section header lines (preferred)
    - PDF URLs (including Google Drive file links):
        * Load with OnlinePDFLoader (one doc per page)

    Returns:
        list: Documents carrying `source`, `section_header`,
        `section_type` (and `section_label` on the PDF / fallback paths)
        in their metadata. Per-URL failures are logged and skipped.
    """
    html_urls: List[str] = []
    pdf_urls: List[str] = []

    # Partition the input URLs into the HTML vs PDF loading paths.
    for url in urls:
        u = url.strip()
        if not u:
            continue

        # Google Drive file links (view) -> direct download PDF
        if _is_gdrive_file(u):
            pdf_urls.append(_gdrive_view_to_download(u))
        # Direct PDF URLs
        elif u.lower().endswith(".pdf"):
            pdf_urls.append(u)
        # Everything else is treated as HTML
        else:
            html_urls.append(u)

    docs = []

    # --- HTML via HTMLSectionSplitter on raw HTML string ---
    if html_urls:
        print(f"[index_builder] Loading HTML for {len(html_urls)} URLs with HTMLSectionSplitter")

        headers_to_split_on = [
            ("h1", "Header 1"),
            ("h2", "Header 2"),
        ]
        html_splitter = HTMLSectionSplitter(headers_to_split_on=headers_to_split_on)

        for url in html_urls:
            try:
                print(f"[index_builder] Fetching HTML from {url}")
                resp = requests.get(url, timeout=15)
                resp.raise_for_status()
                html_string = resp.text

                # HTMLSectionSplitter returns a list[Document] where each Document starts with the header line
                html_header_splits = html_splitter.split_text(html_string)

                # we will prefer to use the header line as the canonical label for each section
                # but ensure we deduplicate very similar headers within the same page
                # NOTE(review): currently unused — the dedupe logic that read
                # this set is commented out below.
                seen_labels_in_page = set()

                print(f"[index_builder] {url}: {len(html_header_splits)} HTML sections")

                for d in html_header_splits:
                    # source URL
                    d.metadata["source"] = url
                    # extract the first meaningful non-empty line as the header
                    first_lines = [ln.strip() for ln in d.page_content.splitlines() if ln.strip()]
                    header_line = first_lines[0] if first_lines else ""

                    # normalize header_line for metadata and label
                    # keep section_header as human readable short header (truncated if necessary)
                    human_header = header_line
                    if len(human_header) > 300:
                        human_header = human_header[:300] + "..."

                    # "More Project" is a generic navigation header on project
                    # pages; derive a header from the URL endpoint instead.
                    if human_header != "More Project":
                        d.metadata["section_header"] = human_header
                    else:
                        # Extract endpoint from URL as fallback header
                        try:
                            # e.g. https://site.com/projects/my-cool-project -> "my-cool-project"
                            endpoint = url.rstrip("/").split("/")[-1]
                            # Make it human-readable
                            endpoint = endpoint.replace("-", " ").replace("_", " ").title()
                            d.metadata["section_header"] = endpoint
                            d.metadata["section_label"] = endpoint
                        except Exception:
                            # absolute fallback
                            d.metadata["section_header"] = human_header
                            d.metadata["section_label"] = human_header

                    # produce a short machine-friendly label from header; fallback to URL-based label
                    # label_from_header = _normalize_label(header_line)
                    # if not label_from_header:
                    #     label_from_header = _infer_section_label_from_url(url)

                    # # dedupe labels within this page (if splitter produced repeated headers)
                    # dedup_label = label_from_header
                    # suffix = 1
                    # while dedup_label in seen_labels_in_page:
                    #     dedup_label = f"{label_from_header}-{suffix}"
                    #     suffix += 1
                    # seen_labels_in_page.add(dedup_label)

                    # d.metadata["section_label"] = dedup_label
                    d.metadata["section_type"] = "remote_html"
                # append docs
                docs.extend(html_header_splits)
            except Exception as e:
                print(f"[index_builder] Error processing HTML from {url}: {e}")

    # --- PDF files (including Drive) via OnlinePDFLoader ---
    for pdf_url in pdf_urls:
        print(f"[index_builder] Loading PDF from {pdf_url}")
        try:
            pdf_loader = OnlinePDFLoader(pdf_url)
            pdf_docs = pdf_loader.load()
            section_label = _infer_section_label_from_url(pdf_url)
            for d in pdf_docs:
                d.metadata["source"] = pdf_url
                d.metadata["section_label"] = section_label
                d.metadata["section_type"] = "remote_pdf"
            docs.extend(pdf_docs)
            print(f"[index_builder] {pdf_url}: {len(pdf_docs)} PDF pages")
        except Exception as e:
            print(f"[index_builder] Failed to load PDF from {pdf_url}: {e}")

    return docs
197
+
198
+
199
def split_docs(docs, chunk_size: int = 1000, chunk_overlap: int = 200):
    """
    Prepare loaded documents for embedding.

    HTML documents (already section-level chunks from HTMLSectionSplitter)
    pass through untouched; everything else (PDF pages, etc.) is
    re-chunked with a RecursiveCharacterTextSplitter.
    """
    html_docs, other_docs = [], []
    for doc in docs:
        bucket = html_docs if doc.metadata.get("section_type") == "remote_html" else other_docs
        bucket.append(doc)

    # HTML sections are kept as-is.
    chunks = list(html_docs)

    # Split the remaining docs (PDFs, etc.) into overlapping text chunks.
    if other_docs:
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            add_start_index=True,
        )
        for piece in splitter.split_documents(other_docs):
            piece.metadata.setdefault("section_label", piece.metadata.get("source", "unknown"))
            piece.metadata.setdefault(
                "section_type",
                piece.metadata.get("section_type", "remote_pdf"),
            )
            chunks.append(piece)

    print(
        f"[index_builder] split_docs: {len(html_docs)} HTML section chunks, "
        f"{len(chunks) - len(html_docs)} non-HTML chunks"
    )
    return chunks
235
+
236
+
237
def build_and_save_index():
    """
    Crawl URLs, load docs, split into chunks, build FAISS index, and save it.

    Returns:
        (int, list): number of chunks indexed, and the chunks themselves.
        Returns (0, []) if loading or splitting produced nothing.
    """
    # 1. Crawl project roots (and any other CRAWL_ROOTS) to get sub-URLs
    crawl_urls: List[str] = []
    for root in CRAWL_ROOTS:
        try:
            urls = crawl_subpages(root)
            print(f"[index_builder] Crawled {len(urls)} URLs under {root}")
            crawl_urls.extend(urls)
        except Exception as e:
            # One failed root must not abort the whole build.
            print(f"[index_builder] Failed to crawl {root}: {e}")

    # 2. Combine fixed URLs (resume, about, scholar, GitHub, etc.) + crawled URLs
    # set() dedupes; NOTE: this also makes the load order non-deterministic.
    all_urls = list(set(FIXED_URLS + crawl_urls))

    print(f"[index_builder] Total URLs to load: {len(all_urls)}")
    for u in all_urls:
        print(f"  - {u}")

    docs = load_web_docs(all_urls)
    print(f"[index_builder] Loaded {len(docs)} raw documents")

    if not docs:
        print("[index_builder] WARNING: No documents loaded; aborting index build.")
        return 0, []

    # 3. Split into chunks (HTML via HTMLSectionSplitter, PDFs via recursive splitter)
    chunks = split_docs(docs)
    print(f"[index_builder] Split into {len(chunks)} chunks")

    if not chunks:
        print("[index_builder] WARNING: No chunks produced; aborting FAISS build.")
        return 0, []

    # 4. Build vector store with HF embeddings (e.g., all-MiniLM)
    embeddings = get_embeddings()
    vs = FAISS.from_documents(chunks, embeddings)

    # 5. Save to disk
    vs.save_local(VECTORSTORE_PATH)
    print(
        f"[index_builder] Saved FAISS index to {VECTORSTORE_PATH} "
        f"(chunks={len(chunks)})"
    )

    return len(chunks), chunks
288
+
289
+
290
def load_vectorstore():
    """
    Load the FAISS vector store from disk using the same embedding model
    that built it (the embedding dimensionality must match the saved index).
    """
    embeddings = get_embeddings()
    # NOTE(security): load_local deserializes a pickle file; only ever load
    # indexes produced by this project's own build_and_save_index.
    vs = FAISS.load_local(
        VECTORSTORE_PATH,
        embeddings,
        allow_dangerous_deserialization=True,
    )
    return vs
rag_core/models_groq.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # rag_core/models_groq.py
2
+ from dotenv import load_dotenv
3
+ from langchain_core.callbacks import Callbacks
4
+ from langchain_core.caches import BaseCache
5
+ from langchain_groq import ChatGroq
6
+ from .config import GROQ_CHAT_MODEL, GROQ_EVAL_MODEL
7
+
8
+ load_dotenv()
9
+ ChatGroq.model_rebuild()
10
+
11
def get_answer_llm():
    """Return the (slightly creative) Groq chat model that drafts answers."""
    return ChatGroq(model=GROQ_CHAT_MODEL, temperature=0.1)
13
+
14
def get_judge_llm():
    """Return the deterministic (temperature=0) Groq model used for judging."""
    return ChatGroq(model=GROQ_EVAL_MODEL, temperature=0.0)
rag_core/rag_chain.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # rag_core/rag_chain.py (top of file)
2
+
3
+ from typing import List, Tuple, Dict, Any
4
+ from langchain.schema import Document, BaseRetriever
5
+ from transformers import pipeline
6
+ from rag_core.index_builder import load_vectorstore, build_and_save_index
7
+
8
+
9
+ # ---- Label routing helpers ----
10
+
11
+ # Global zero-shot classifier (loaded once)
12
+ label_classifier = pipeline(
13
+ "zero-shot-classification",
14
+ model="facebook/bart-large-mnli"
15
+ )
16
+
17
def build_label_vocab(docs: List[Document]) -> List[str]:
    """
    Collect the unique section labels present in *docs*.

    Each document contributes its `section_header` and `section_label`
    metadata (whitespace-normalized, truncated to 120 chars with '...').
    Duplicates are dropped while preserving first-seen order.
    """
    vocab = []
    seen = set()
    for doc in docs:
        meta = doc.metadata
        header = (meta.get("section_header") or "").strip()
        section = (meta.get("section_label") or "").strip()
        for raw in (header, section):
            if not raw:
                continue
            candidate = " ".join(raw.split())
            if len(candidate) > 120:
                candidate = candidate[:120] + "..."
            if candidate in seen:
                continue
            seen.add(candidate)
            vocab.append(candidate)
    return vocab
34
+
35
def map_query_to_labels_zero_shot(
    query: str,
    candidate_labels: List[str],
    top_k: int = 5,
    score_threshold: float = 0.40,
) -> List[Tuple[str, float]]:
    """
    Route *query* onto section labels via zero-shot classification.

    Returns up to *top_k* (label, score) pairs whose classifier score
    clears *score_threshold*. If nothing clears it, the single best
    label is returned so downstream routing always has a candidate.
    An empty candidate list yields [].
    """
    if not candidate_labels:
        return []

    prediction = label_classifier(query, candidate_labels, multi_label=True)
    ranked = list(zip(prediction["labels"], prediction["scores"]))

    picked = [
        (label, float(score))
        for label, score in ranked[:top_k]
        if score >= score_threshold
    ]
    if not picked and prediction["labels"]:
        best_label = prediction["labels"][0]
        best_score = prediction["scores"][0]
        picked = [(best_label, float(best_score))]
    return picked
53
+
54
def fetch_docs_by_labels_with_scores(
    selected_labels: List[Tuple[str, float]],
    docs: List[Document],
) -> List[Tuple[Document, float, List[str]]]:
    """
    Match docs against the chosen labels.

    A doc matches a label when the lowercased label is a substring of its
    `section_header` + " " + `section_label` metadata. Each matching doc
    is returned as (doc, best_matching_label_score, [matched_labels]).
    Empty *selected_labels* yields [].
    """
    if not selected_labels:
        return []

    score_by_label = {label.lower(): score for label, score in selected_labels}

    matches = []
    for doc in docs:
        header = (doc.metadata.get("section_header") or "").lower()
        section = (doc.metadata.get("section_label") or "").lower()
        haystack = header + " " + section

        hits = [
            (lbl, sc)
            for lbl, sc in score_by_label.items()
            if lbl and lbl in haystack
        ]
        if hits:
            best = max(sc for _, sc in hits)
            matches.append((doc, best, [lbl for lbl, _ in hits]))
    return matches
83
+
84
class LabelRoutingRetriever(BaseRetriever):
    """
    Retriever that:
      1) Uses BART-MNLI to map query -> section labels.
      2) Fetches all docs whose header/label match those labels.
      3) Ranks docs by label confidence.
      4) Falls back to vector retriever if no labels match.
    """

    # Full corpus the router matches labels against.
    docs: List[Document]
    # Semantic fallback retriever, used when no label matches.
    vector_retriever: Any = None
    # Max number of labels the zero-shot classifier may select per query.
    top_k_labels: int = 5
    # Minimum classifier score for a label to count as selected.
    label_score_threshold: float = 0.35
    # Cap on documents returned per query.
    k_docs: int = 6

    class Config:
        # Document / retriever objects are not pydantic models.
        arbitrary_types_allowed = True

    def __init__(
        self,
        docs: List[Document],
        vector_retriever: Any = None,
        top_k_labels: int = 5,
        label_score_threshold: float = 0.35,
        k_docs: int = 6,
        **kwargs,
    ):
        # Forward everything to BaseRetriever (a pydantic model) so the
        # declared class fields above are validated and assigned there.
        super().__init__(
            docs=docs,
            vector_retriever=vector_retriever,
            top_k_labels=top_k_labels,
            label_score_threshold=label_score_threshold,
            k_docs=k_docs,
            **kwargs,
        )

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Route *query* to documents via label matching, with vector fallback."""
        # NOTE(review): recent LangChain versions expect subclasses to
        # override `_get_relevant_documents`; overriding the public method
        # works on older releases — confirm against the pinned version.
        # 1) build candidate labels from docs
        candidate_labels = build_label_vocab(self.docs)

        # 2) map query -> (label, score)
        mapped = map_query_to_labels_zero_shot(
            query,
            candidate_labels,
            top_k=self.top_k_labels,
            score_threshold=self.label_score_threshold,
        )

        # 3) fetch docs with scores
        docs_with_scores = fetch_docs_by_labels_with_scores(mapped, self.docs)

        if not docs_with_scores and self.vector_retriever is not None:
            # fallback to vector retriever (semantic retrieval)
            vec_docs = self.vector_retriever.get_relevant_documents(query)
            return vec_docs[: self.k_docs]

        # 4) sort by score desc + dedupe
        seen_keys = set()
        ranked_docs: List[Document] = []
        for d, sc, matched in sorted(docs_with_scores, key=lambda x: x[1], reverse=True):
            # Skip generic "More Project" navigation sections.
            if d.metadata.get("Header 2")=='More Project':
                continue
            # Dedupe on (source URL, first 200 chars of content).
            key = (d.metadata.get("source"), d.page_content[:200])
            if key in seen_keys:
                continue
            seen_keys.add(key)
            ranked_docs.append(d)
            if len(ranked_docs) >= self.k_docs:
                break
        return ranked_docs

    async def aget_relevant_documents(self, query: str) -> List[Document]:
        # simple async wrapper — runs synchronously; the classifier call blocks
        return self.get_relevant_documents(query)
158
+
159
+ from langchain.chains.combine_documents import create_stuff_documents_chain
160
+ from langchain_core.prompts import ChatPromptTemplate
161
+ from langchain.chains import create_retrieval_chain
162
+ from .models_groq import get_answer_llm, get_judge_llm
163
+
164
+ SYSTEM_PROMPT = """You are Ritam's personal QA bot.
165
+ Use the following context from his website and resume to answer.
166
+
167
+ Question: {input}
168
+ Context:
169
+ {context}
170
+
171
+ Answer in first person as Ritam."""
172
+
173
def build_rag_chain(vs, k=5, max_docs=6):
    """
    Build the label-routing RAG chain on top of vector store *vs*.

    Args:
        vs: loaded FAISS vector store (must expose `.docstore` and
            `.as_retriever`).
        k: number of candidate labels the router may select per query.
        max_docs: cap on documents handed to the LLM.

    Returns:
        (rag_chain, retriever, SYSTEM_PROMPT)
    """
    # NOTE(review): reaches into FAISS internals (`docstore._dict`); works
    # for langchain's InMemoryDocstore but is not a public API.
    all_docs = list(vs.docstore._dict.values())
    vector_retriever = vs.as_retriever(search_kwargs={"k": 8})

    # old: ensemble/PrefixRetriever
    # new: label routing retriever
    retriever = LabelRoutingRetriever(
        docs=all_docs,
        vector_retriever=vector_retriever,
        top_k_labels=k,
        label_score_threshold=0.4,
        k_docs=max_docs,
    )

    llm = get_answer_llm()
    # Fix: a dead `prompt = ChatPromptTemplate.from_template(SYSTEM_PROMPT)`
    # assignment that was immediately overwritten has been removed; only the
    # messages-based prompt below was ever used.
    prompt = ChatPromptTemplate.from_messages([
        ("system", SYSTEM_PROMPT),
        ("human", "{input}"),
    ])

    combine_docs_chain = create_stuff_documents_chain(llm, prompt)
    rag_chain = create_retrieval_chain(retriever, combine_docs_chain)
    return rag_chain, retriever, SYSTEM_PROMPT
rag_core/rag_chain_helper.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # rag_core/rag_chain.py
2
+ from langchain.chains import create_retrieval_chain
3
+ from langchain.chains.combine_documents import create_stuff_documents_chain
4
+ from langchain_core.prompts import ChatPromptTemplate
5
+ from rag_core.FusedRetreiver import FusedRetriever
6
+ from rag_core.PrefixRetreiver import PrefixRetriever
7
+
8
+
9
+ from .models_groq import get_answer_llm, get_judge_llm
10
+
11
def faiss_all_docs(faiss_store):
    """Return every Document stored in *faiss_store*, in index order."""
    # FAISS index position -> docstore id; resolve each id to its Document.
    doc_ids = list(faiss_store.index_to_docstore_id.values())
    return [faiss_store.docstore.search(doc_id) for doc_id in doc_ids]
16
+
17
+
18
+ SYSTEM_PROMPT = """
19
+ You are a QA assistant for Ritam that answers questions about Ritam's career posing as Ritam's digital twin.
20
+
21
+ You will receive:
22
+ - `context`: text chunks retrieved from Ritam's resume, website, project pages, scholar profile, etc.
23
+ - `chat_history`: a short summary of the prior conversation turns for coherence.
24
+
25
+ Context:
26
+ {context}
27
+
28
+ Chat history (for reference):
29
+ {chat_history}
30
+
31
+ Rules:
32
+ - Use ONLY the provided context for factual claims.
33
+ - If the answer is not clearly supported by the context, say "I don't know"
34
+ and suggest rephrasing in terms of Ritam's work, projects, or research.
35
+ - If the user asks generic questions unrelated to Ritam, politely refuse saying that please ask only career related questions to me.
36
+
37
+ The chat_history is only for understanding the follow-up question; do NOT invent facts
38
+ that aren't in the context even if they appear earlier in chat_history.
39
+
40
+ """
41
+
42
def build_rag_chain(vectorstore, k=2, max_lines=3, weights=(0.35, 0), top_k=None):
    """
    Returns (rag_chain, retriever, SYSTEM_PROMPT)

    Args:
        vectorstore: your FAISS/Chroma/… already loaded.
        k: per-retriever fetch size (each retriever will pull up to k docs).
        max_lines: how many leading lines to consider for prefix matching.
        weights: contribution of (Prefix, Vector) retrievers in the fusion.
            NOTE(review): currently unused — FusedRetriever takes no weights
            parameter; kept only for interface compatibility.
        top_k: final number of docs returned by the fused retriever
            (defaults to 6 when None, matching the previous hard-coded value).

    Fix: `k`, `max_lines` and `top_k` were previously accepted but ignored
    in favour of hard-coded values (k=2, max_lines=3, fused k=6); they are
    now wired through, with defaults preserving the old behaviour.
    """
    # ---- Retrievers
    all_docs = faiss_all_docs(vectorstore)
    prefix_retriever = PrefixRetriever(docs=all_docs, k=k, max_lines=max_lines)

    # Pull a few extra from vectorstore to give the fusion more to work with
    vector_retriever = vectorstore.as_retriever(search_kwargs={"k": max(k, 1)})

    fused_retriever = FusedRetriever(
        prefix_retriever=prefix_retriever,
        vector_retriever=vector_retriever,
        k=top_k if top_k is not None else 6,
        prefix_first=True,
    )

    # ---- LLM + prompt
    llm = get_answer_llm()
    prompt = ChatPromptTemplate.from_messages([
        ("system", SYSTEM_PROMPT),
        ("human", "{input}"),
    ])

    # ---- Stuff docs into the prompt, then make the RAG chain
    document_chain = create_stuff_documents_chain(llm, prompt)
    rag_chain = create_retrieval_chain(fused_retriever, document_chain)

    return rag_chain, fused_retriever, SYSTEM_PROMPT
78
+
79
+
80
+
81
+ # ---------- Conversational question rewriter ----------
82
+
83
+ # We’ll use a low-temperature model (judge_llm) to rewrite follow-up questions
84
+ _rewriter_llm = get_judge_llm()
85
+
86
+ QUESTION_REWRITE_PROMPT = ChatPromptTemplate.from_messages(
87
+ [
88
+ ("system", """
89
+ You are a helpful assistant that rewrites follow-up questions into standalone questions.
90
+
91
+ You are given a chat history between a user and an assistant, plus the user's new question.
92
+ Your job is to rewrite the new question so that it is self-contained and can be understood
93
+ without the previous turns.
94
+
95
+ The rewritten question MUST stay faithful to the user's intent and be about Ritam's
96
+ career, projects, research, or education.
97
+ If the question is already standalone, return it as-is.
98
+ """),
99
+ ("human", """
100
+ Chat history:
101
+ ------------
102
+ {chat_history}
103
+
104
+ New user question:
105
+ ------------
106
+ {question}
107
+
108
+ Rewrite the new question into a single, self-contained question:
109
+ """),
110
+ ]
111
+ )
112
+
113
def rewrite_question_with_history(history, question: str) -> str:
    """
    Rewrite a follow-up question into a standalone one.

    Args:
        history: list of [user_msg, assistant_msg] pairs (Gradio format).
        question: the current user question.

    Returns:
        A self-contained question string. Falls back to *question* when
        there is no history or the rewriter produces an empty string.
    """
    # Nothing to incorporate -> question is already standalone.
    if not history:
        return question

    # Flatten the history into a "User: ... / Assistant: ..." transcript.
    transcript = []
    for pair in history:
        if pair and len(pair) >= 2:
            transcript.append(f"User: {pair[0]}")
            transcript.append(f"Assistant: {pair[1]}")
    chat_history_text = "\n".join(transcript)

    rewriter = QUESTION_REWRITE_PROMPT | _rewriter_llm
    response = rewriter.invoke({"chat_history": chat_history_text, "question": question})
    rewritten = response.content.strip()
    return rewritten or question
rag_core/sources.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # rag_core/sources.py
2
+
3
+ # Resume URL (HTML or PDF, both okay; PDF will be text-extracted)
4
+ LINKEDIN_URL = "https://www.linkedin.com/in/ritam-upadhyay-51ba81192/"
5
+
6
+ # Root projects page that links to each project subpage
7
+ PROJECTS_ROOT_URL = "https://fearless-writers-028990.framer.app/project"
8
+
9
+ # Other URLs directly relevant to your career
10
+ OTHER_CAREER_URLS = [
11
+ "https://fearless-writers-028990.framer.app/old-home",
12
+ "https://fearless-writers-028990.framer.app/",
13
+ "https://scholar.google.com/citations?user=04o0bdcAAAAJ&hl=en",
14
+ "https://fearless-writers-028990.framer.app/stack",
15
+ ]
16
+
17
+ # These roots will be crawled:
18
+ CRAWL_ROOTS = [
19
+ PROJECTS_ROOT_URL,
20
+ ]
21
+
22
+ # These are direct, non-crawling URLs:
23
+ FIXED_URLS = [
24
+ *OTHER_CAREER_URLS,
25
+ ]
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio==5.49.1
2
+ langchain==0.3.12
3
+ python-dotenv==1.0.1
4
+ langchain-core==0.3.28
5
+ langchain-openai==0.2.12
6
+ langchain-community==0.3.12
7
+ langchain-text-splitters==0.3.4
8
+ langchain-groq==0.2.1
9
+ sentence-transformers==3.3.1
10
+ faiss-cpu==1.9.0.post1
11
+ beautifulsoup4==4.12.3
12
+ requests==2.32.3
13
+ pypdf==5.1.0
14
+ lxml==6.0.2