Spaces:
Sleeping
Sleeping
anirudh-np-ds commited on
Commit ·
6b6d437
1
Parent(s): 95ac500
feat: add chat memory + web URL + YouTube ingestion
Browse files- requirements.txt +3 -1
- src/streamlit_app.py +421 -322
- upgraded_app.py +567 -0
- upgraded_requirements.txt +7 -0
requirements.txt
CHANGED
|
@@ -2,4 +2,6 @@ streamlit>=1.32.0
|
|
| 2 |
chromadb>=0.4.22
|
| 3 |
sentence-transformers>=2.7.0
|
| 4 |
requests>=2.31.0
|
| 5 |
-
PyMuPDF>=1.24.0
|
|
|
|
|
|
|
|
|
| 2 |
chromadb>=0.4.22
|
| 3 |
sentence-transformers>=2.7.0
|
| 4 |
requests>=2.31.0
|
| 5 |
+
PyMuPDF>=1.24.0
|
| 6 |
+
beautifulsoup4>=4.12.0
|
| 7 |
+
youtube-transcript-api>=0.6.2
|
src/streamlit_app.py
CHANGED
|
@@ -4,13 +4,16 @@ from sentence_transformers import SentenceTransformer
|
|
| 4 |
import fitz # PyMuPDF
|
| 5 |
import os
|
| 6 |
import requests
|
| 7 |
-
import re
|
| 8 |
import hashlib
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
# ─── Page Config ──────────────────────────────────────────────────────────────
|
| 11 |
st.set_page_config(
|
| 12 |
-
page_title="
|
| 13 |
-
page_icon="
|
| 14 |
layout="wide",
|
| 15 |
initial_sidebar_state="expanded"
|
| 16 |
)
|
|
@@ -28,441 +31,537 @@ html, body, [class*="css"] { font-family: 'IBM Plex Sans', sans-serif; }
|
|
| 28 |
border: 1px solid #1e2a3e;
|
| 29 |
border-top: 3px solid #22d3ee;
|
| 30 |
border-radius: 12px;
|
| 31 |
-
padding:
|
| 32 |
-
margin-bottom:
|
| 33 |
}
|
| 34 |
-
.hero h1 { font-size: 1.
|
| 35 |
-
.hero p { color: #64748b; font-size: 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
| 40 |
}
|
| 41 |
-
.
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
|
|
|
| 45 |
}
|
| 46 |
-
.
|
| 47 |
-
.
|
| 48 |
-
.
|
| 49 |
-
.phase-icon { font-size: 1.1rem; display: block; margin-bottom: 2px; }
|
| 50 |
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
border-radius: 10px;
|
| 55 |
-
padding: 14px 16px;
|
| 56 |
-
margin: 8px 0;
|
| 57 |
-
display: flex;
|
| 58 |
-
align-items: center;
|
| 59 |
-
justify-content: space-between;
|
| 60 |
}
|
| 61 |
-
.
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
border: 1px solid rgba(34,211,238,0.25); padding: 3px 10px; border-radius: 20px;
|
| 67 |
}
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
background: #0d1424;
|
| 71 |
-
border: 1px solid #1e3a4a;
|
| 72 |
-
border-left: 3px solid #22d3ee;
|
| 73 |
-
border-radius: 10px;
|
| 74 |
-
padding: 22px 24px;
|
| 75 |
-
color: #e2e8f0;
|
| 76 |
-
line-height: 1.75;
|
| 77 |
-
font-size: 0.96rem;
|
| 78 |
-
margin: 12px 0 20px 0;
|
| 79 |
}
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
background: #
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
padding: 14px 18px;
|
| 86 |
-
margin: 8px 0;
|
| 87 |
}
|
| 88 |
-
.
|
| 89 |
-
|
| 90 |
-
|
|
|
|
|
|
|
| 91 |
}
|
| 92 |
-
.
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
.score-bar {
|
| 96 |
-
height: 4px; border-radius: 2px; background: #1e2a3e; width: 80px; overflow: hidden;
|
| 97 |
}
|
| 98 |
-
.
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
flex: 1; background: #0d1424; border: 1px solid #1e2a3e;
|
| 105 |
-
border-radius: 8px; padding: 12px; text-align: center;
|
| 106 |
}
|
| 107 |
-
.stat-val { font-size: 1.35rem; font-weight: 600; color: #22d3ee; }
|
| 108 |
-
.stat-lbl { font-size: 0.7rem; color: #475569; margin-top: 2px; }
|
| 109 |
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
|
|
|
| 113 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
-
|
| 116 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
}
|
| 118 |
|
| 119 |
-
|
|
|
|
|
|
|
| 120 |
text-align: center; padding: 48px 24px;
|
| 121 |
-
border: 2px dashed #1e2a3e; border-radius: 12px;
|
| 122 |
}
|
| 123 |
-
.empty-state .icon { font-size: 2.5rem; margin-bottom: 12px; }
|
| 124 |
-
.empty-state p { font-size: 0.9rem; line-height: 1.6; }
|
| 125 |
</style>
|
| 126 |
""", unsafe_allow_html=True)
|
| 127 |
|
| 128 |
|
| 129 |
# ─── Session State ────────────────────────────────────────────────────────────
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
|
|
|
|
|
|
| 138 |
|
| 139 |
|
| 140 |
-
# ───
|
| 141 |
@st.cache_resource(show_spinner=False)
|
| 142 |
def load_embed_model():
|
| 143 |
return SentenceTransformer('all-MiniLM-L6-v2')
|
| 144 |
|
| 145 |
|
| 146 |
-
|
| 147 |
-
def extract_text_from_pdf(pdf_bytes: bytes) -> list[dict]:
|
| 148 |
-
"""Returns list of {page, text} dicts."""
|
| 149 |
-
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 150 |
-
pages = []
|
| 151 |
-
for page_num, page in enumerate(doc, start=1):
|
| 152 |
-
text = page.get_text("text").strip()
|
| 153 |
-
if text:
|
| 154 |
-
pages.append({"page": page_num, "text": text})
|
| 155 |
-
doc.close()
|
| 156 |
-
return pages
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
# ─── Chunking ─────────────────────────────────────────────────────────────────
|
| 160 |
-
def chunk_text(pages: list[dict], chunk_size: int = 400, overlap: int = 60) -> list[dict]:
|
| 161 |
-
"""Splits page text into overlapping word-based chunks."""
|
| 162 |
-
chunks = []
|
| 163 |
-
for p in pages:
|
| 164 |
-
words = p["text"].split()
|
| 165 |
-
start = 0
|
| 166 |
-
while start < len(words):
|
| 167 |
-
end = start + chunk_size
|
| 168 |
-
chunk_words = words[start:end]
|
| 169 |
-
chunk_text_str = " ".join(chunk_words).strip()
|
| 170 |
-
if len(chunk_text_str) > 60:
|
| 171 |
-
chunks.append({"text": chunk_text_str, "page": p["page"]})
|
| 172 |
-
start += chunk_size - overlap
|
| 173 |
-
return chunks
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
# ─── Index PDF into ChromaDB ──────────────────────────────────────────────────
|
| 177 |
-
def index_pdf(filename: str, pdf_bytes: bytes, embed_model):
|
| 178 |
-
# Init or reuse ChromaDB
|
| 179 |
if st.session_state.chroma_client is None:
|
| 180 |
st.session_state.chroma_client = chromadb.Client()
|
| 181 |
st.session_state.chroma_collection = st.session_state.chroma_client.get_or_create_collection(
|
| 182 |
-
name="
|
| 183 |
)
|
|
|
|
| 184 |
|
| 185 |
-
collection = st.session_state.chroma_collection
|
| 186 |
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
|
| 191 |
-
if not chunks:
|
| 192 |
-
return 0, 0
|
| 193 |
|
| 194 |
-
|
|
|
|
| 195 |
texts = [c["text"] for c in chunks]
|
| 196 |
embeddings = embed_model.encode(texts, batch_size=32, show_progress_bar=False).tolist()
|
| 197 |
-
|
| 198 |
ids, docs, metas, embeds = [], [], [], []
|
| 199 |
for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
|
| 200 |
-
|
| 201 |
-
ids.append(chunk_id)
|
| 202 |
docs.append(chunk["text"])
|
| 203 |
-
metas.append({"
|
|
|
|
| 204 |
embeds.append(emb)
|
| 205 |
-
|
| 206 |
collection.add(ids=ids, embeddings=embeds, documents=docs, metadatas=metas)
|
| 207 |
-
|
| 208 |
-
st.session_state.indexed_files[filename] = {
|
| 209 |
-
"chunks": len(chunks),
|
| 210 |
-
"pages": len(pages),
|
| 211 |
-
"size_kb": round(len(pdf_bytes) / 1024, 1)
|
| 212 |
-
}
|
| 213 |
st.session_state.total_chunks += len(chunks)
|
| 214 |
-
|
|
|
|
|
|
|
|
|
|
| 215 |
|
| 216 |
|
| 217 |
-
# ───
|
| 218 |
-
|
| 219 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
q_emb = embed_model.encode(question).tolist()
|
| 221 |
results = collection.query(query_embeddings=[q_emb], n_results=top_k)
|
| 222 |
|
| 223 |
chunks = []
|
| 224 |
for i in range(len(results["documents"][0])):
|
| 225 |
-
|
|
|
|
| 226 |
chunks.append({
|
| 227 |
"text": results["documents"][0][i],
|
| 228 |
-
"
|
| 229 |
-
"
|
| 230 |
-
"
|
|
|
|
|
|
|
| 231 |
})
|
| 232 |
|
| 233 |
context = "\n\n".join([
|
| 234 |
-
f"[Source: {c['
|
|
|
|
| 235 |
])
|
| 236 |
|
| 237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
|
| 239 |
-
|
| 240 |
-
|
|
|
|
|
|
|
| 241 |
|
| 242 |
-
|
|
|
|
| 243 |
|
| 244 |
-
|
|
|
|
| 245 |
|
| 246 |
headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
|
| 247 |
payload = {
|
| 248 |
"model": "llama-3.3-70b-versatile",
|
| 249 |
"messages": [{"role": "user", "content": prompt}],
|
| 250 |
-
"max_tokens":
|
| 251 |
-
"temperature": 0.
|
| 252 |
}
|
| 253 |
-
r = requests.post("https://api.groq.com/openai/v1/chat/completions",
|
|
|
|
| 254 |
r.raise_for_status()
|
| 255 |
answer = r.json()["choices"][0]["message"]["content"]
|
| 256 |
return answer, chunks
|
| 257 |
|
| 258 |
|
| 259 |
-
# ─── Determine current phase ──────────────────────────────────────────────────
|
| 260 |
-
has_docs = len(st.session_state.indexed_files) > 0
|
| 261 |
-
phase = 1 if not has_docs else 2
|
| 262 |
-
|
| 263 |
-
|
| 264 |
# ─── Sidebar ──────────────────────────────────────────────────────────────────
|
| 265 |
with st.sidebar:
|
| 266 |
-
st.markdown("##
|
| 267 |
-
st.markdown("<div style='color:#374151;font-size:0.
|
| 268 |
st.markdown("---")
|
| 269 |
|
| 270 |
env_key = os.environ.get("GROQ_API_KEY", "")
|
| 271 |
-
if env_key
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
st.caption("Get free key → [console.groq.com](https://console.groq.com)")
|
| 278 |
|
| 279 |
st.markdown("---")
|
| 280 |
-
st.markdown("<div class='section-label'>Indexed
|
| 281 |
|
| 282 |
-
if st.session_state.
|
| 283 |
-
for
|
|
|
|
|
|
|
|
|
|
| 284 |
st.markdown(f"""
|
| 285 |
-
<div
|
| 286 |
-
<div
|
| 287 |
-
|
| 288 |
-
|
| 289 |
</div>
|
|
|
|
| 290 |
</div>""", unsafe_allow_html=True)
|
| 291 |
|
| 292 |
-
st.markdown("
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 296 |
st.rerun()
|
| 297 |
else:
|
| 298 |
-
st.markdown("<div style='color:#374151;font-size:0.82rem'>
|
| 299 |
|
| 300 |
st.markdown("---")
|
| 301 |
st.markdown("""
|
| 302 |
-
<div style='font-size:0.
|
| 303 |
<b style='color:#4b5563'>Stack</b><br>
|
| 304 |
-
📄 PDF
|
| 305 |
-
|
|
|
|
| 306 |
🔢 Embeddings: all-MiniLM-L6-v2<br>
|
| 307 |
-
🗄️ Vector DB: ChromaDB
|
| 308 |
-
🧠 LLM: Groq · Llama 3.3 70B
|
| 309 |
-
|
| 310 |
-
</div>
|
| 311 |
-
""", unsafe_allow_html=True)
|
| 312 |
|
| 313 |
|
| 314 |
-
# ───
|
| 315 |
st.markdown("""
|
| 316 |
<div class='hero'>
|
| 317 |
-
<h1>
|
| 318 |
-
<p>
|
| 319 |
</div>
|
| 320 |
""", unsafe_allow_html=True)
|
| 321 |
|
| 322 |
-
# Phase bar
|
| 323 |
-
st.markdown(f"""
|
| 324 |
-
<div class='phase-bar'>
|
| 325 |
-
<div class='phase {"done" if phase > 1 else "active"}'>
|
| 326 |
-
<span class='phase-icon'>📤</span>Upload PDFs
|
| 327 |
-
</div>
|
| 328 |
-
<div class='phase {"active" if phase == 1 else "done"}'>
|
| 329 |
-
<span class='phase-icon'>📑</span>Extract Text
|
| 330 |
-
</div>
|
| 331 |
-
<div class='phase {"active" if phase == 1 else "done"}'>
|
| 332 |
-
<span class='phase-icon'>✂️</span>Chunk
|
| 333 |
-
</div>
|
| 334 |
-
<div class='phase {"active" if phase == 1 else "done"}'>
|
| 335 |
-
<span class='phase-icon'>🔢</span>Embed
|
| 336 |
-
</div>
|
| 337 |
-
<div class='phase {"active" if phase == 1 else "done"}'>
|
| 338 |
-
<span class='phase-icon'>🗄️</span>Index
|
| 339 |
-
</div>
|
| 340 |
-
<div class='phase {"active" if phase == 2 else ""}'>
|
| 341 |
-
<span class='phase-icon'>💬</span>Ask Questions
|
| 342 |
-
</div>
|
| 343 |
-
</div>
|
| 344 |
-
""", unsafe_allow_html=True)
|
| 345 |
-
|
| 346 |
-
# ─── Load model ───────────────────────────────────────────────────────────────
|
| 347 |
with st.spinner("⚙️ Loading embedding model..."):
|
| 348 |
embed_model = load_embed_model()
|
| 349 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
|
| 351 |
-
# ════════════════════════════════════════════════════════════
|
| 352 |
-
# PHASE 1 — Upload & Index
|
| 353 |
-
# ════════════════════════════════════════════════════════════
|
| 354 |
-
st.markdown("<div class='section-label'>Step 1 — Upload PDF Documents</div>", unsafe_allow_html=True)
|
| 355 |
-
|
| 356 |
-
uploaded_files = st.file_uploader(
|
| 357 |
-
"Drop your PDF files here",
|
| 358 |
-
type=["pdf"],
|
| 359 |
-
accept_multiple_files=True,
|
| 360 |
-
label_visibility="collapsed"
|
| 361 |
-
)
|
| 362 |
-
|
| 363 |
-
if uploaded_files:
|
| 364 |
-
new_files = [f for f in uploaded_files if f.name not in st.session_state.indexed_files]
|
| 365 |
-
|
| 366 |
-
if new_files:
|
| 367 |
-
st.markdown(f"**{len(new_files)} new file(s) ready to index:**")
|
| 368 |
-
for f in new_files:
|
| 369 |
-
st.markdown(f"<div class='pdf-card'><div><div class='pdf-name'>📄 {f.name}</div><div class='pdf-meta'>{round(f.size/1024,1)} KB</div></div><div class='pdf-badge'>ready</div></div>", unsafe_allow_html=True)
|
| 370 |
-
|
| 371 |
-
if st.button(f"⚡ Extract & Index {len(new_files)} PDF(s)", type="primary", use_container_width=True):
|
| 372 |
-
progress = st.progress(0, text="Starting...")
|
| 373 |
-
for idx, f in enumerate(new_files):
|
| 374 |
-
progress.progress((idx) / len(new_files), text=f"Processing: {f.name}")
|
| 375 |
-
pdf_bytes = f.read()
|
| 376 |
-
|
| 377 |
-
with st.spinner(f"Extracting & indexing **{f.name}**..."):
|
| 378 |
-
n_chunks, n_pages = index_pdf(f.name, pdf_bytes, embed_model)
|
| 379 |
-
|
| 380 |
-
st.success(f"✅ **{f.name}** → {n_pages} pages · {n_chunks} chunks indexed")
|
| 381 |
-
|
| 382 |
-
progress.progress(1.0, text="Done!")
|
| 383 |
-
st.balloons()
|
| 384 |
-
st.rerun()
|
| 385 |
-
|
| 386 |
-
else:
|
| 387 |
-
st.info("All uploaded files are already indexed. Upload new files or ask questions below.")
|
| 388 |
-
|
| 389 |
-
elif not has_docs:
|
| 390 |
-
st.markdown("""
|
| 391 |
-
<div class='empty-state'>
|
| 392 |
-
<div class='icon'>📂</div>
|
| 393 |
-
<p><b style='color:#94a3b8'>No documents uploaded yet</b><br>
|
| 394 |
-
Upload one or more PDF files above to get started.<br>
|
| 395 |
-
Any topic works — reports, manuals, research papers, policies.</p>
|
| 396 |
-
</div>
|
| 397 |
-
""", unsafe_allow_html=True)
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
# ════════════════════════════════════════════════════════════
|
| 401 |
-
# PHASE 2 — Stats & Query
|
| 402 |
-
# ════════════════════════════════════════════════════════════
|
| 403 |
-
if has_docs:
|
| 404 |
-
total_pages = sum(v["pages"] for v in st.session_state.indexed_files.values())
|
| 405 |
-
|
| 406 |
-
st.markdown("<div class='section-label' style='margin-top:24px'>Index Summary</div>", unsafe_allow_html=True)
|
| 407 |
st.markdown(f"""
|
| 408 |
<div class='stat-row'>
|
| 409 |
-
<div class='stat-box'><div class='stat-val'>{
|
| 410 |
-
<div class='stat-box'><div class='stat-val'>{
|
| 411 |
-
<div class='stat-box'><div class='stat-val'>{
|
| 412 |
-
<div class='stat-box'><div class='stat-val'>
|
|
|
|
| 413 |
</div>
|
| 414 |
""", unsafe_allow_html=True)
|
| 415 |
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
st.markdown("
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
top_k = st.selectbox("Top K", [2, 3, 4, 5], index=1, help="Number of chunks to retrieve")
|
| 428 |
|
| 429 |
-
|
|
|
|
|
|
|
| 430 |
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
try:
|
| 434 |
-
answer, chunks = rag_query(question, embed_model, top_k, api_key)
|
| 435 |
|
| 436 |
-
|
| 437 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
|
| 439 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 440 |
|
| 441 |
-
|
| 442 |
-
|
|
|
|
|
|
|
|
|
|
| 443 |
st.markdown(f"""
|
| 444 |
<div class='chunk-card'>
|
| 445 |
-
<div class='chunk-
|
| 446 |
-
<div>
|
| 447 |
-
|
| 448 |
-
<div class='chunk-page'>Page {chunk['page']}</div>
|
| 449 |
-
</div>
|
| 450 |
-
<div class='score-bar-wrap'>
|
| 451 |
-
<div class='score-bar'><div class='score-fill' style='width:{bar_width}%'></div></div>
|
| 452 |
-
<div class='score-num'>{chunk['relevance']}%</div>
|
| 453 |
-
</div>
|
| 454 |
</div>
|
| 455 |
-
<div class='chunk-text'>{chunk['text']}</div>
|
| 456 |
-
</div>
|
| 457 |
-
""", unsafe_allow_html=True)
|
| 458 |
-
|
| 459 |
-
except requests.HTTPError as e:
|
| 460 |
-
if e.response.status_code == 401:
|
| 461 |
-
st.error("❌ Invalid Groq API key.")
|
| 462 |
-
else:
|
| 463 |
-
st.error(f"❌ API error: {str(e)}")
|
| 464 |
-
except Exception as e:
|
| 465 |
-
st.error(f"❌ Error: {str(e)}")
|
| 466 |
|
| 467 |
-
|
| 468 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
import fitz # PyMuPDF
|
| 5 |
import os
|
| 6 |
import requests
|
|
|
|
| 7 |
import hashlib
|
| 8 |
+
import re
|
| 9 |
+
from urllib.parse import urlparse, parse_qs
|
| 10 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
| 11 |
+
from bs4 import BeautifulSoup
|
| 12 |
|
| 13 |
# ─── Page Config ──────────────────────────────────────────────────────────────
|
| 14 |
st.set_page_config(
|
| 15 |
+
page_title="RAG Assistant · Chat",
|
| 16 |
+
page_icon="🤖",
|
| 17 |
layout="wide",
|
| 18 |
initial_sidebar_state="expanded"
|
| 19 |
)
|
|
|
|
| 31 |
border: 1px solid #1e2a3e;
|
| 32 |
border-top: 3px solid #22d3ee;
|
| 33 |
border-radius: 12px;
|
| 34 |
+
padding: 24px 28px;
|
| 35 |
+
margin-bottom: 20px;
|
| 36 |
}
|
| 37 |
+
.hero h1 { font-size: 1.7rem; font-weight: 600; color: #e2e8f0; margin: 0 0 4px 0; }
|
| 38 |
+
.hero p { color: #64748b; font-size: 0.88rem; margin: 0; }
|
| 39 |
+
|
| 40 |
+
/* Source type tabs */
|
| 41 |
+
.source-tabs { display: flex; gap: 8px; margin-bottom: 16px; }
|
| 42 |
+
.source-tab {
|
| 43 |
+
flex: 1; padding: 10px; text-align: center;
|
| 44 |
+
background: #0d1424; border: 1px solid #1e2a3e;
|
| 45 |
+
border-radius: 8px; font-size: 0.82rem; color: #64748b; cursor: pointer;
|
| 46 |
+
}
|
| 47 |
+
.source-tab.active { border-color: #22d3ee; color: #22d3ee; background: rgba(34,211,238,0.07); }
|
| 48 |
|
| 49 |
+
/* Indexed source cards */
|
| 50 |
+
.source-card {
|
| 51 |
+
background: #0d1424; border: 1px solid #1e2a3e;
|
| 52 |
+
border-radius: 8px; padding: 10px 14px; margin: 6px 0;
|
| 53 |
+
display: flex; align-items: center; justify-content: space-between;
|
| 54 |
}
|
| 55 |
+
.source-name { font-size: 0.82rem; color: #e2e8f0; font-weight: 500; white-space: nowrap; overflow: hidden; text-overflow: ellipsis; max-width: 160px; }
|
| 56 |
+
.source-meta { font-family: 'IBM Plex Mono', monospace; font-size: 0.68rem; color: #475569; }
|
| 57 |
+
.source-type-badge {
|
| 58 |
+
font-size: 0.68rem; padding: 2px 8px; border-radius: 20px;
|
| 59 |
+
font-family: 'IBM Plex Mono', monospace; white-space: nowrap;
|
| 60 |
}
|
| 61 |
+
.badge-pdf { background: rgba(99,102,241,0.12); color: #a5b4fc; border: 1px solid rgba(99,102,241,0.25); }
|
| 62 |
+
.badge-url { background: rgba(34,197,94,0.1); color: #4ade80; border: 1px solid rgba(34,197,94,0.25); }
|
| 63 |
+
.badge-yt { background: rgba(239,68,68,0.1); color: #f87171; border: 1px solid rgba(239,68,68,0.25); }
|
|
|
|
| 64 |
|
| 65 |
+
/* Chat messages */
|
| 66 |
+
.chat-user {
|
| 67 |
+
display: flex; justify-content: flex-end; margin: 10px 0;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
}
|
| 69 |
+
.chat-user-bubble {
|
| 70 |
+
background: rgba(34,211,238,0.1); border: 1px solid rgba(34,211,238,0.2);
|
| 71 |
+
border-radius: 16px 16px 4px 16px;
|
| 72 |
+
padding: 12px 18px; max-width: 70%;
|
| 73 |
+
color: #e2e8f0; font-size: 0.92rem; line-height: 1.6;
|
|
|
|
| 74 |
}
|
| 75 |
+
.chat-assistant {
|
| 76 |
+
display: flex; justify-content: flex-start; margin: 10px 0; gap: 10px;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
}
|
| 78 |
+
.chat-avatar {
|
| 79 |
+
width: 32px; height: 32px; border-radius: 50%;
|
| 80 |
+
background: linear-gradient(135deg, #22d3ee, #6366f1);
|
| 81 |
+
display: flex; align-items: center; justify-content: center;
|
| 82 |
+
font-size: 0.9rem; flex-shrink: 0; margin-top: 2px;
|
|
|
|
|
|
|
| 83 |
}
|
| 84 |
+
.chat-assistant-bubble {
|
| 85 |
+
background: #0d1424; border: 1px solid #1e2a3e;
|
| 86 |
+
border-radius: 4px 16px 16px 16px;
|
| 87 |
+
padding: 14px 18px; max-width: 75%;
|
| 88 |
+
color: #e2e8f0; font-size: 0.92rem; line-height: 1.7;
|
| 89 |
}
|
| 90 |
+
.chat-sources {
|
| 91 |
+
margin-top: 10px; padding-top: 10px;
|
| 92 |
+
border-top: 1px solid #1e2a3e;
|
|
|
|
|
|
|
| 93 |
}
|
| 94 |
+
.chat-source-chip {
|
| 95 |
+
display: inline-block; font-size: 0.72rem;
|
| 96 |
+
font-family: 'IBM Plex Mono', monospace;
|
| 97 |
+
background: #0b0f1a; border: 1px solid #1e2a3e;
|
| 98 |
+
border-radius: 20px; padding: 2px 10px; margin: 3px 3px 0 0;
|
| 99 |
+
color: #475569;
|
|
|
|
|
|
|
| 100 |
}
|
|
|
|
|
|
|
| 101 |
|
| 102 |
+
/* Chunk expander styling */
|
| 103 |
+
.chunk-card {
|
| 104 |
+
background: #0b0f1a; border: 1px solid #1e2a3e;
|
| 105 |
+
border-radius: 8px; padding: 12px 16px; margin: 6px 0;
|
| 106 |
}
|
| 107 |
+
.chunk-header { display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px; }
|
| 108 |
+
.chunk-src { font-size: 0.75rem; font-weight: 600; color: #22d3ee; text-transform: uppercase; letter-spacing: 0.04em; }
|
| 109 |
+
.chunk-score { font-family: 'IBM Plex Mono', monospace; font-size: 0.72rem; color: #475569; }
|
| 110 |
+
.chunk-text { font-size: 0.84rem; color: #94a3b8; line-height: 1.6; }
|
| 111 |
|
| 112 |
+
.stat-row { display: flex; gap: 8px; margin: 12px 0; }
|
| 113 |
+
.stat-box { flex: 1; background: #0d1424; border: 1px solid #1e2a3e; border-radius: 8px; padding: 10px; text-align: center; }
|
| 114 |
+
.stat-val { font-size: 1.2rem; font-weight: 600; color: #22d3ee; }
|
| 115 |
+
.stat-lbl { font-size: 0.68rem; color: #475569; margin-top: 2px; }
|
| 116 |
+
|
| 117 |
+
.section-label {
|
| 118 |
+
font-size: 0.68rem; text-transform: uppercase; letter-spacing: 0.1em;
|
| 119 |
+
color: #374151; font-weight: 600; margin: 16px 0 8px 0;
|
| 120 |
}
|
| 121 |
|
| 122 |
+
section[data-testid="stSidebar"] { background-color: #080c14; border-right: 1px solid #131c2e; }
|
| 123 |
+
|
| 124 |
+
.empty-chat {
|
| 125 |
text-align: center; padding: 48px 24px;
|
| 126 |
+
color: #374151; border: 2px dashed #1e2a3e; border-radius: 12px;
|
| 127 |
}
|
|
|
|
|
|
|
| 128 |
</style>
|
| 129 |
""", unsafe_allow_html=True)
|
| 130 |
|
| 131 |
|
| 132 |
# ─── Session State ────────────────────────────────────────────────────────────
|
| 133 |
+
defaults = {
|
| 134 |
+
"indexed_sources": {}, # name → {type, chunks, meta}
|
| 135 |
+
"chroma_collection": None,
|
| 136 |
+
"chroma_client": None,
|
| 137 |
+
"total_chunks": 0,
|
| 138 |
+
"chat_history": [], # [{role, content, sources}]
|
| 139 |
+
}
|
| 140 |
+
for k, v in defaults.items():
|
| 141 |
+
if k not in st.session_state:
|
| 142 |
+
st.session_state[k] = v
|
| 143 |
|
| 144 |
|
| 145 |
+
# ─── Helpers ──────────────────────────────────────────────────────────────────
|
| 146 |
@st.cache_resource(show_spinner=False)
|
| 147 |
def load_embed_model():
|
| 148 |
return SentenceTransformer('all-MiniLM-L6-v2')
|
| 149 |
|
| 150 |
|
| 151 |
+
def get_or_create_collection():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
if st.session_state.chroma_client is None:
|
| 153 |
st.session_state.chroma_client = chromadb.Client()
|
| 154 |
st.session_state.chroma_collection = st.session_state.chroma_client.get_or_create_collection(
|
| 155 |
+
name="rag_store", metadata={"hnsw:space": "cosine"}
|
| 156 |
)
|
| 157 |
+
return st.session_state.chroma_collection
|
| 158 |
|
|
|
|
| 159 |
|
| 160 |
+
def chunk_text(text: str, source_name: str, source_type: str, meta: dict,
|
| 161 |
+
chunk_size: int = 400, overlap: int = 60) -> list[dict]:
|
| 162 |
+
words = text.split()
|
| 163 |
+
chunks = []
|
| 164 |
+
start = 0
|
| 165 |
+
while start < len(words):
|
| 166 |
+
end = start + chunk_size
|
| 167 |
+
chunk_str = " ".join(words[start:end]).strip()
|
| 168 |
+
if len(chunk_str) > 60:
|
| 169 |
+
chunks.append({"text": chunk_str, "source": source_name, "type": source_type, **meta})
|
| 170 |
+
start += chunk_size - overlap
|
| 171 |
+
return chunks
|
| 172 |
|
|
|
|
|
|
|
| 173 |
|
| 174 |
+
def index_chunks(chunks: list[dict], source_name: str, source_type: str, embed_model):
|
| 175 |
+
collection = get_or_create_collection()
|
| 176 |
texts = [c["text"] for c in chunks]
|
| 177 |
embeddings = embed_model.encode(texts, batch_size=32, show_progress_bar=False).tolist()
|
| 178 |
+
prefix = hashlib.md5(source_name.encode()).hexdigest()[:8]
|
| 179 |
ids, docs, metas, embeds = [], [], [], []
|
| 180 |
for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
|
| 181 |
+
ids.append(f"{prefix}_chunk_{i}")
|
|
|
|
| 182 |
docs.append(chunk["text"])
|
| 183 |
+
metas.append({"source": chunk["source"], "type": chunk["type"],
|
| 184 |
+
"page": chunk.get("page", 1), "timestamp": chunk.get("timestamp", "")})
|
| 185 |
embeds.append(emb)
|
|
|
|
| 186 |
collection.add(ids=ids, embeddings=embeds, documents=docs, metadatas=metas)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
st.session_state.total_chunks += len(chunks)
|
| 188 |
+
st.session_state.indexed_sources[source_name] = {
|
| 189 |
+
"type": source_type, "chunks": len(chunks),
|
| 190 |
+
"meta": {k: v for k, v in chunks[0].items() if k not in ["text", "source", "type"]}
|
| 191 |
+
}
|
| 192 |
|
| 193 |
|
| 194 |
+
# ─── Source-specific extractors ───────────────────────────────────────────────
|
| 195 |
+
|
| 196 |
+
## PDF
|
| 197 |
+
def process_pdf(filename: str, pdf_bytes: bytes, embed_model):
|
| 198 |
+
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 199 |
+
chunks = []
|
| 200 |
+
for page_num, page in enumerate(doc, start=1):
|
| 201 |
+
text = page.get_text("text").strip()
|
| 202 |
+
if text:
|
| 203 |
+
page_chunks = chunk_text(text, filename, "pdf", {"page": page_num})
|
| 204 |
+
chunks.extend(page_chunks)
|
| 205 |
+
doc.close()
|
| 206 |
+
index_chunks(chunks, filename, "pdf", embed_model)
|
| 207 |
+
return len(chunks)
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
## Web URL
|
| 211 |
+
def process_url(url: str, embed_model):
|
| 212 |
+
headers = {"User-Agent": "Mozilla/5.0 (compatible; RAGBot/1.0)"}
|
| 213 |
+
r = requests.get(url, headers=headers, timeout=15)
|
| 214 |
+
r.raise_for_status()
|
| 215 |
+
soup = BeautifulSoup(r.text, "html.parser")
|
| 216 |
+
# Remove nav, footer, script, style tags
|
| 217 |
+
for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
|
| 218 |
+
tag.decompose()
|
| 219 |
+
text = soup.get_text(separator=" ", strip=True)
|
| 220 |
+
text = re.sub(r'\s+', ' ', text).strip()
|
| 221 |
+
if len(text) < 100:
|
| 222 |
+
raise ValueError("Could not extract meaningful text from this URL.")
|
| 223 |
+
parsed = urlparse(url)
|
| 224 |
+
source_name = parsed.netloc + parsed.path[:40]
|
| 225 |
+
chunks = chunk_text(text, source_name, "url", {"page": 1})
|
| 226 |
+
index_chunks(chunks, source_name, "url", embed_model)
|
| 227 |
+
return len(chunks), source_name
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
## YouTube
|
| 231 |
+
def get_youtube_id(url: str) -> str:
|
| 232 |
+
patterns = [
|
| 233 |
+
r'(?:v=|youtu\.be/)([a-zA-Z0-9_-]{11})',
|
| 234 |
+
r'(?:embed/)([a-zA-Z0-9_-]{11})',
|
| 235 |
+
]
|
| 236 |
+
for p in patterns:
|
| 237 |
+
m = re.search(p, url)
|
| 238 |
+
if m:
|
| 239 |
+
return m.group(1)
|
| 240 |
+
raise ValueError("Could not extract YouTube video ID from URL.")
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
def process_youtube(url: str, embed_model):
|
| 244 |
+
video_id = get_youtube_id(url)
|
| 245 |
+
transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
|
| 246 |
+
# Build text with timestamps
|
| 247 |
+
chunks = []
|
| 248 |
+
buffer_text = ""
|
| 249 |
+
buffer_start = None
|
| 250 |
+
word_count = 0
|
| 251 |
+
for entry in transcript_list:
|
| 252 |
+
if buffer_start is None:
|
| 253 |
+
buffer_start = int(entry["start"])
|
| 254 |
+
buffer_text += " " + entry["text"]
|
| 255 |
+
word_count += len(entry["text"].split())
|
| 256 |
+
if word_count >= 350:
|
| 257 |
+
ts = f"{buffer_start//60}:{buffer_start%60:02d}"
|
| 258 |
+
chunks.append({
|
| 259 |
+
"text": buffer_text.strip(),
|
| 260 |
+
"source": f"youtube:{video_id}",
|
| 261 |
+
"type": "youtube",
|
| 262 |
+
"page": 1,
|
| 263 |
+
"timestamp": ts
|
| 264 |
+
})
|
| 265 |
+
buffer_text = ""
|
| 266 |
+
buffer_start = None
|
| 267 |
+
word_count = 0
|
| 268 |
+
if buffer_text.strip():
|
| 269 |
+
ts = f"{buffer_start//60}:{buffer_start%60:02d}" if buffer_start else "0:00"
|
| 270 |
+
chunks.append({
|
| 271 |
+
"text": buffer_text.strip(),
|
| 272 |
+
"source": f"youtube:{video_id}",
|
| 273 |
+
"type": "youtube",
|
| 274 |
+
"page": 1,
|
| 275 |
+
"timestamp": ts
|
| 276 |
+
})
|
| 277 |
+
index_chunks(chunks, f"youtube:{video_id}", "youtube", embed_model)
|
| 278 |
+
return len(chunks), video_id
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
# ─── RAG Query with Chat Memory ───────────────────────────────────────────────
def rag_query(question: str, embed_model, top_k: int, api_key: str) -> tuple[str, list]:
    """Answer `question` grounded in the indexed sources plus recent chat turns.

    Retrieves `top_k` nearest chunks from the Chroma collection, builds a
    prompt containing the last 3 conversation turns and the retrieved
    context, and calls the Groq chat-completions API.

    Returns:
        (answer_text, retrieved_chunks) where each chunk dict carries
        source/type/page/timestamp/relevance for display.

    Raises:
        requests.HTTPError: on a non-2xx response from the Groq API.
    """
    collection = get_or_create_collection()
    q_emb = embed_model.encode(question).tolist()
    results = collection.query(query_embeddings=[q_emb], n_results=top_k)

    chunks = []
    for i in range(len(results["documents"][0])):
        dist = results["distances"][0][i]
        meta = results["metadatas"][0][i]
        chunks.append({
            "text": results["documents"][0][i],
            "source": meta["source"],
            "type": meta["type"],
            "page": meta.get("page", 1),
            "timestamp": meta.get("timestamp", ""),
            # Cosine distance → similarity percentage for the UI.
            "relevance": round((1 - dist) * 100, 1),
        })

    def _locator(c):
        # BUG FIX: the original used `c['page'] or c['timestamp']`, but
        # YouTube chunks always store page == 1 (truthy), so the timestamp
        # was never shown. Prefer the timestamp for YouTube chunks.
        if c["type"] == "youtube" and c["timestamp"]:
            return c["timestamp"]
        return str(c["page"])

    context = "\n\n".join(
        f"[Source: {c['source']} | Type: {c['type']} | Page/Time: {_locator(c)}]\n{c['text']}"
        for c in chunks
    )

    # Last 6 messages (3 user/assistant turns) give the model short-term memory.
    history_text = ""
    if st.session_state.chat_history:
        recent = st.session_state.chat_history[-6:]
        for msg in recent:
            role = "User" if msg["role"] == "user" else "Assistant"
            history_text += f"{role}: {msg['content']}\n"

    prompt = f"""You are a helpful assistant that answers questions based on indexed documents. Use ONLY the context below to answer. Be concise and conversational. Always cite your source (filename, URL, or YouTube timestamp) inline. If the answer isn't in the context, say "I couldn't find that in the indexed sources."

Conversation so far:
{history_text if history_text else "(This is the start of the conversation)"}

Relevant context from documents:
{context}

User: {question}
Assistant:"""

    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    payload = {
        "model": "llama-3.3-70b-versatile",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 700,
        "temperature": 0.3,
    }
    r = requests.post("https://api.groq.com/openai/v1/chat/completions",
                      headers=headers, json=payload, timeout=30)
    r.raise_for_status()
    answer = r.json()["choices"][0]["message"]["content"]
    return answer, chunks
|
| 336 |
|
| 337 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
# ─── Sidebar ──────────────────────────────────────────────────────────────────
with st.sidebar:
    st.markdown("## 🤖 RAG Chat Assistant")
    st.markdown("<div style='color:#374151;font-size:0.78rem'>PDF · Web · YouTube → Chat</div>", unsafe_allow_html=True)
    st.markdown("---")

    # Prefer a key configured in the environment; fall back to manual entry.
    env_key = os.environ.get("GROQ_API_KEY", "")
    api_key = env_key if env_key else st.text_input(
        "🔑 Groq API Key", type="password", placeholder="gsk_...",
        help="Free at console.groq.com"
    )
    if not env_key and not api_key:
        st.caption("Get free key → [console.groq.com](https://console.groq.com)")

    st.markdown("---")
    st.markdown("<div class='section-label'>Indexed Sources</div>", unsafe_allow_html=True)

    if st.session_state.indexed_sources:
        for name, info in st.session_state.indexed_sources.items():
            # BUG FIX: the CSS defines .badge-pdf / .badge-url / .badge-yt,
            # but the stored type for videos is "youtube", so the original
            # f"badge-{type}" produced the nonexistent class "badge-youtube".
            badge_class = "badge-yt" if info["type"] == "youtube" else f"badge-{info['type']}"
            icon = "📄" if info['type'] == 'pdf' else "🌐" if info['type'] == 'url' else "▶️"
            label = info['type'].upper()
            st.markdown(f"""
            <div class='source-card'>
                <div>
                    <div class='source-name'>{icon} {name}</div>
                    <div class='source-meta'>{info['chunks']} chunks</div>
                </div>
                <div class='source-type-badge {badge_class}'>{label}</div>
            </div>""", unsafe_allow_html=True)

        st.markdown("")
        col1, col2 = st.columns(2)
        if col1.button("🗑️ Clear index", use_container_width=True):
            # Drop index state; defaults re-initialize on the next rerun.
            for k in ["indexed_sources", "chroma_collection", "chroma_client", "total_chunks"]:
                del st.session_state[k]
            st.rerun()
        if col2.button("💬 Clear chat", use_container_width=True):
            st.session_state.chat_history = []
            st.rerun()
    else:
        st.markdown("<div style='color:#374151;font-size:0.82rem'>Nothing indexed yet.</div>", unsafe_allow_html=True)

    st.markdown("---")
    st.markdown("""
    <div style='font-size:0.75rem;color:#374151;line-height:2'>
    <b style='color:#4b5563'>Stack</b><br>
    📄 PDF: PyMuPDF<br>
    🌐 Web: BeautifulSoup4<br>
    ▶️ YouTube: youtube-transcript-api<br>
    🔢 Embeddings: all-MiniLM-L6-v2<br>
    🗄️ Vector DB: ChromaDB<br>
    🧠 LLM: Groq · Llama 3.3 70B
    </div>""", unsafe_allow_html=True)
|
|
|
|
|
|
|
| 392 |
|
| 393 |
|
| 394 |
+
# ─── Main UI ──────────────────────────────────────────────────────────────────
# Hero banner at the top of the page.
st.markdown("""
<div class='hero'>
    <h1>🤖 RAG Chat Assistant</h1>
    <p>Index PDFs · Web pages · YouTube videos — then have a multi-turn conversation across all of them</p>
</div>
""", unsafe_allow_html=True)

# load_embed_model is cached with st.cache_resource, so only the first run
# of the process actually downloads/loads the model.
with st.spinner("⚙️ Loading embedding model..."):
    embed_model = load_embed_model()
|
| 404 |
|
| 405 |
+
# ════════════════════════════════════════════════════════
# INGEST PANEL
# ════════════════════════════════════════════════════════
# Expanded by default only while nothing has been indexed yet.
with st.expander("➕ Add a new source (PDF / Web URL / YouTube)", expanded=len(st.session_state.indexed_sources) == 0):
    tab_pdf, tab_url, tab_yt = st.tabs(["📄 PDF Upload", "🌐 Web URL", "▶️ YouTube"])

    # ── PDF Tab ── multiple uploads allowed; already-indexed names skipped.
    with tab_pdf:
        uploaded = st.file_uploader("Upload PDF files", type=["pdf"], accept_multiple_files=True, label_visibility="collapsed")
        if uploaded:
            new = [f for f in uploaded if f.name not in st.session_state.indexed_sources]
            if new:
                if st.button(f"⚡ Index {len(new)} PDF(s)", type="primary", key="idx_pdf"):
                    for f in new:
                        with st.spinner(f"Indexing {f.name}..."):
                            n = process_pdf(f.name, f.read(), embed_model)
                        st.success(f"✅ {f.name} → {n} chunks")
                    # Rerun so the sidebar source list and stats refresh.
                    st.rerun()
            else:
                st.info("Already indexed.")

    # ── URL Tab ── fetch and index a single public web page.
    with tab_url:
        url_input = st.text_input("Paste a public webpage URL", placeholder="https://en.wikipedia.org/wiki/...", label_visibility="collapsed")
        if st.button("⚡ Fetch & Index URL", type="primary", key="idx_url"):
            if url_input:
                with st.spinner(f"Fetching and indexing {url_input}..."):
                    try:
                        n, source_name = process_url(url_input, embed_model)
                        st.success(f"✅ {source_name} → {n} chunks indexed")
                        st.rerun()
                    except Exception as e:
                        # Surface fetch/parse errors inline rather than crashing the app.
                        st.error(f"❌ {str(e)}")
            else:
                st.warning("Please enter a URL.")

    # ── YouTube Tab ── pull the caption transcript and index it.
    with tab_yt:
        yt_input = st.text_input("Paste a YouTube video URL", placeholder="https://www.youtube.com/watch?v=...", label_visibility="collapsed")
        st.caption("Works with any video that has English captions/subtitles enabled.")
        if st.button("⚡ Fetch Transcript & Index", type="primary", key="idx_yt"):
            if yt_input:
                with st.spinner("Fetching YouTube transcript..."):
                    try:
                        n, vid_id = process_youtube(yt_input, embed_model)
                        st.success(f"✅ youtube:{vid_id} → {n} chunks indexed")
                        st.rerun()
                    except Exception as e:
                        st.error(f"❌ {str(e)}")
            else:
                st.warning("Please enter a YouTube URL.")
|
| 456 |
+
|
| 457 |
+
# ════════════════════════════════════════════════════════
# STATS
# ════════════════════════════════════════════════════════
# Per-type counts over the indexed-source registry, rendered as stat boxes.
if st.session_state.indexed_sources:
    pdf_count = sum(1 for s in st.session_state.indexed_sources.values() if s["type"] == "pdf")
    url_count = sum(1 for s in st.session_state.indexed_sources.values() if s["type"] == "url")
    yt_count = sum(1 for s in st.session_state.indexed_sources.values() if s["type"] == "youtube")

    st.markdown(f"""
    <div class='stat-row'>
        <div class='stat-box'><div class='stat-val'>{pdf_count}</div><div class='stat-lbl'>PDFs</div></div>
        <div class='stat-box'><div class='stat-val'>{url_count}</div><div class='stat-lbl'>Web Pages</div></div>
        <div class='stat-box'><div class='stat-val'>{yt_count}</div><div class='stat-lbl'>YouTube Videos</div></div>
        <div class='stat-box'><div class='stat-val'>{st.session_state.total_chunks}</div><div class='stat-lbl'>Total Chunks</div></div>
        <div class='stat-box'><div class='stat-val'>{len(st.session_state.chat_history)}</div><div class='stat-lbl'>Messages</div></div>
    </div>
    """, unsafe_allow_html=True)
|
| 474 |
|
| 475 |
+
# ════════════════════════════════════════════════════════
# CHAT UI
# ════════════════════════════════════════════════════════
# Gate 1: nothing indexed yet → show an empty-state panel and halt the script.
if not st.session_state.indexed_sources:
    st.markdown("""
    <div class='empty-chat'>
        <div style='font-size:2.5rem;margin-bottom:12px'>📂</div>
        <p style='color:#4b5563'>Add at least one source above to start chatting.<br>
        Try a PDF, a Wikipedia URL, or a YouTube video.</p>
    </div>""", unsafe_allow_html=True)
    st.stop()

# Gate 2: no API key → prompt for it and halt.
if not api_key:
    st.warning("👈 Add your Groq API key in the sidebar to start chatting.")
    st.stop()

st.markdown("---")
st.markdown("<div class='section-label'>Conversation</div>", unsafe_allow_html=True)

# Placeholder shown before the first message exists.
if not st.session_state.chat_history:
    st.markdown("""
    <div class='empty-chat' style='padding:28px'>
        <p style='color:#4b5563;margin:0'>Ask anything about your indexed sources below 👇</p>
    </div>""", unsafe_allow_html=True)

# Re-render the whole transcript on every rerun.
# NOTE(review): message content is interpolated into raw HTML with
# unsafe_allow_html=True, so user input or model output containing markup is
# rendered as HTML — consider html.escape() before interpolation.
for msg in st.session_state.chat_history:
    if msg["role"] == "user":
        st.markdown(f"""
        <div class='chat-user'>
            <div class='chat-user-bubble'>{msg['content']}</div>
        </div>""", unsafe_allow_html=True)
    else:
        # Compact citation chips (max 4) shown inside the assistant bubble.
        source_chips = ""
        if msg.get("sources"):
            for s in msg["sources"][:4]:
                label = f"{s['source']} · {s['relevance']}%"
                if s.get("timestamp"):
                    label += f" @ {s['timestamp']}"
                source_chips += f"<span class='chat-source-chip'>{label}</span>"

        st.markdown(f"""
        <div class='chat-assistant'>
            <div class='chat-avatar'>🤖</div>
            <div class='chat-assistant-bubble'>
                {msg['content']}
                {f"<div class='chat-sources'>{source_chips}</div>" if source_chips else ""}
            </div>
        </div>""", unsafe_allow_html=True)

        # Full retrieved-chunk detail, collapsed by default.
        if msg.get("sources"):
            with st.expander("🔍 View retrieved chunks", expanded=False):
                for chunk in msg["sources"]:
                    icon = "📄" if chunk["type"] == "pdf" else "🌐" if chunk["type"] == "url" else "▶️"
                    detail = f"Page {chunk['page']}" if chunk["type"] != "youtube" else f"@ {chunk['timestamp']}"
                    st.markdown(f"""
                    <div class='chunk-card'>
                        <div class='chunk-header'>
                            <div class='chunk-src'>{icon} {chunk['source']}</div>
                            <div class='chunk-score'>{detail} · {chunk['relevance']}% match</div>
                        </div>
                        <div class='chunk-text'>{chunk['text'][:400]}{'...' if len(chunk['text']) > 400 else ''}</div>
                    </div>""", unsafe_allow_html=True)

# Chat input row: text box + top-k selector + send button.
st.markdown("")
col_input, col_k, col_btn = st.columns([6, 1, 1])
with col_input:
    user_input = st.text_input("", placeholder="Ask something about your indexed sources...", label_visibility="collapsed", key="chat_input")
with col_k:
    top_k = st.selectbox("K", [2, 3, 4, 5], index=1, label_visibility="collapsed")
with col_btn:
    send = st.button("Send ➤", type="primary", use_container_width=True)

if send and user_input:
    # Append the user turn first so rag_query's history window can include it.
    st.session_state.chat_history.append({"role": "user", "content": user_input})

    with st.spinner("Thinking..."):
        try:
            answer, chunks = rag_query(user_input, embed_model, top_k, api_key)
            st.session_state.chat_history.append({
                "role": "assistant",
                "content": answer,
                "sources": chunks
            })
        # NOTE(review): only HTTPError is caught; timeouts and connection
        # errors will surface as an uncaught exception — confirm if intended.
        except requests.HTTPError as e:
            st.session_state.chat_history.append({
                "role": "assistant",
                "content": f"❌ API error: {str(e)}",
                "sources": []
            })
    st.rerun()
|
upgraded_app.py
ADDED
|
@@ -0,0 +1,567 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import chromadb
|
| 3 |
+
from sentence_transformers import SentenceTransformer
|
| 4 |
+
import fitz # PyMuPDF
|
| 5 |
+
import os
|
| 6 |
+
import requests
|
| 7 |
+
import hashlib
|
| 8 |
+
import re
|
| 9 |
+
from urllib.parse import urlparse, parse_qs
|
| 10 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
| 11 |
+
from bs4 import BeautifulSoup
|
| 12 |
+
|
| 13 |
+
# ─── Page Config ──────────────────────────────────────────────────────────────
|
| 14 |
+
st.set_page_config(
|
| 15 |
+
page_title="RAG Assistant · Chat",
|
| 16 |
+
page_icon="🤖",
|
| 17 |
+
layout="wide",
|
| 18 |
+
initial_sidebar_state="expanded"
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
# ─── CSS ──────────────────────────────────────────────────────────────────────
|
| 22 |
+
st.markdown("""
|
| 23 |
+
<style>
|
| 24 |
+
@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@300;400;500;600&family=IBM+Plex+Mono:wght@400;500&display=swap');
|
| 25 |
+
|
| 26 |
+
html, body, [class*="css"] { font-family: 'IBM Plex Sans', sans-serif; }
|
| 27 |
+
.main { background-color: #0b0f1a; }
|
| 28 |
+
|
| 29 |
+
.hero {
|
| 30 |
+
background: linear-gradient(160deg, #0d1424 0%, #0b0f1a 100%);
|
| 31 |
+
border: 1px solid #1e2a3e;
|
| 32 |
+
border-top: 3px solid #22d3ee;
|
| 33 |
+
border-radius: 12px;
|
| 34 |
+
padding: 24px 28px;
|
| 35 |
+
margin-bottom: 20px;
|
| 36 |
+
}
|
| 37 |
+
.hero h1 { font-size: 1.7rem; font-weight: 600; color: #e2e8f0; margin: 0 0 4px 0; }
|
| 38 |
+
.hero p { color: #64748b; font-size: 0.88rem; margin: 0; }
|
| 39 |
+
|
| 40 |
+
/* Source type tabs */
|
| 41 |
+
.source-tabs { display: flex; gap: 8px; margin-bottom: 16px; }
|
| 42 |
+
.source-tab {
|
| 43 |
+
flex: 1; padding: 10px; text-align: center;
|
| 44 |
+
background: #0d1424; border: 1px solid #1e2a3e;
|
| 45 |
+
border-radius: 8px; font-size: 0.82rem; color: #64748b; cursor: pointer;
|
| 46 |
+
}
|
| 47 |
+
.source-tab.active { border-color: #22d3ee; color: #22d3ee; background: rgba(34,211,238,0.07); }
|
| 48 |
+
|
| 49 |
+
/* Indexed source cards */
|
| 50 |
+
.source-card {
|
| 51 |
+
background: #0d1424; border: 1px solid #1e2a3e;
|
| 52 |
+
border-radius: 8px; padding: 10px 14px; margin: 6px 0;
|
| 53 |
+
display: flex; align-items: center; justify-content: space-between;
|
| 54 |
+
}
|
| 55 |
+
.source-name { font-size: 0.82rem; color: #e2e8f0; font-weight: 500; white-space: nowrap; overflow: hidden; text-overflow: ellipsis; max-width: 160px; }
|
| 56 |
+
.source-meta { font-family: 'IBM Plex Mono', monospace; font-size: 0.68rem; color: #475569; }
|
| 57 |
+
.source-type-badge {
|
| 58 |
+
font-size: 0.68rem; padding: 2px 8px; border-radius: 20px;
|
| 59 |
+
font-family: 'IBM Plex Mono', monospace; white-space: nowrap;
|
| 60 |
+
}
|
| 61 |
+
.badge-pdf { background: rgba(99,102,241,0.12); color: #a5b4fc; border: 1px solid rgba(99,102,241,0.25); }
|
| 62 |
+
.badge-url { background: rgba(34,197,94,0.1); color: #4ade80; border: 1px solid rgba(34,197,94,0.25); }
|
| 63 |
+
.badge-yt { background: rgba(239,68,68,0.1); color: #f87171; border: 1px solid rgba(239,68,68,0.25); }
|
| 64 |
+
|
| 65 |
+
/* Chat messages */
|
| 66 |
+
.chat-user {
|
| 67 |
+
display: flex; justify-content: flex-end; margin: 10px 0;
|
| 68 |
+
}
|
| 69 |
+
.chat-user-bubble {
|
| 70 |
+
background: rgba(34,211,238,0.1); border: 1px solid rgba(34,211,238,0.2);
|
| 71 |
+
border-radius: 16px 16px 4px 16px;
|
| 72 |
+
padding: 12px 18px; max-width: 70%;
|
| 73 |
+
color: #e2e8f0; font-size: 0.92rem; line-height: 1.6;
|
| 74 |
+
}
|
| 75 |
+
.chat-assistant {
|
| 76 |
+
display: flex; justify-content: flex-start; margin: 10px 0; gap: 10px;
|
| 77 |
+
}
|
| 78 |
+
.chat-avatar {
|
| 79 |
+
width: 32px; height: 32px; border-radius: 50%;
|
| 80 |
+
background: linear-gradient(135deg, #22d3ee, #6366f1);
|
| 81 |
+
display: flex; align-items: center; justify-content: center;
|
| 82 |
+
font-size: 0.9rem; flex-shrink: 0; margin-top: 2px;
|
| 83 |
+
}
|
| 84 |
+
.chat-assistant-bubble {
|
| 85 |
+
background: #0d1424; border: 1px solid #1e2a3e;
|
| 86 |
+
border-radius: 4px 16px 16px 16px;
|
| 87 |
+
padding: 14px 18px; max-width: 75%;
|
| 88 |
+
color: #e2e8f0; font-size: 0.92rem; line-height: 1.7;
|
| 89 |
+
}
|
| 90 |
+
.chat-sources {
|
| 91 |
+
margin-top: 10px; padding-top: 10px;
|
| 92 |
+
border-top: 1px solid #1e2a3e;
|
| 93 |
+
}
|
| 94 |
+
.chat-source-chip {
|
| 95 |
+
display: inline-block; font-size: 0.72rem;
|
| 96 |
+
font-family: 'IBM Plex Mono', monospace;
|
| 97 |
+
background: #0b0f1a; border: 1px solid #1e2a3e;
|
| 98 |
+
border-radius: 20px; padding: 2px 10px; margin: 3px 3px 0 0;
|
| 99 |
+
color: #475569;
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
/* Chunk expander styling */
|
| 103 |
+
.chunk-card {
|
| 104 |
+
background: #0b0f1a; border: 1px solid #1e2a3e;
|
| 105 |
+
border-radius: 8px; padding: 12px 16px; margin: 6px 0;
|
| 106 |
+
}
|
| 107 |
+
.chunk-header { display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px; }
|
| 108 |
+
.chunk-src { font-size: 0.75rem; font-weight: 600; color: #22d3ee; text-transform: uppercase; letter-spacing: 0.04em; }
|
| 109 |
+
.chunk-score { font-family: 'IBM Plex Mono', monospace; font-size: 0.72rem; color: #475569; }
|
| 110 |
+
.chunk-text { font-size: 0.84rem; color: #94a3b8; line-height: 1.6; }
|
| 111 |
+
|
| 112 |
+
.stat-row { display: flex; gap: 8px; margin: 12px 0; }
|
| 113 |
+
.stat-box { flex: 1; background: #0d1424; border: 1px solid #1e2a3e; border-radius: 8px; padding: 10px; text-align: center; }
|
| 114 |
+
.stat-val { font-size: 1.2rem; font-weight: 600; color: #22d3ee; }
|
| 115 |
+
.stat-lbl { font-size: 0.68rem; color: #475569; margin-top: 2px; }
|
| 116 |
+
|
| 117 |
+
.section-label {
|
| 118 |
+
font-size: 0.68rem; text-transform: uppercase; letter-spacing: 0.1em;
|
| 119 |
+
color: #374151; font-weight: 600; margin: 16px 0 8px 0;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
section[data-testid="stSidebar"] { background-color: #080c14; border-right: 1px solid #131c2e; }
|
| 123 |
+
|
| 124 |
+
.empty-chat {
|
| 125 |
+
text-align: center; padding: 48px 24px;
|
| 126 |
+
color: #374151; border: 2px dashed #1e2a3e; border-radius: 12px;
|
| 127 |
+
}
|
| 128 |
+
</style>
|
| 129 |
+
""", unsafe_allow_html=True)
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
# ─── Session State ────────────────────────────────────────────────────────────
# One-time initialization of all session keys; existing values survive reruns.
defaults = {
    "indexed_sources": {},       # name → {type, chunks, meta}
    "chroma_collection": None,   # lazily created in get_or_create_collection()
    "chroma_client": None,       # shared in-memory Chroma client
    "total_chunks": 0,           # running chunk count across all sources
    "chat_history": [],          # [{role, content, sources}]
}
for k, v in defaults.items():
    if k not in st.session_state:
        st.session_state[k] = v
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
# ─── Helpers ──────────────────────────────────────────────────────────────────
|
| 146 |
+
@st.cache_resource(show_spinner=False)
def load_embed_model():
    """Load (once per process, via st.cache_resource) the sentence-transformer
    model used to embed both documents and queries."""
    return SentenceTransformer('all-MiniLM-L6-v2')
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def get_or_create_collection():
    """Return the shared Chroma collection, lazily creating the client and
    collection in session state on first use."""
    if st.session_state.chroma_client is None:
        client = chromadb.Client()
        st.session_state.chroma_client = client
        # Cosine space so query distances map cleanly to similarity scores.
        st.session_state.chroma_collection = client.get_or_create_collection(
            name="rag_store", metadata={"hnsw:space": "cosine"}
        )
    return st.session_state.chroma_collection
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def chunk_text(text: str, source_name: str, source_type: str, meta: dict,
               chunk_size: int = 400, overlap: int = 60) -> list[dict]:
    """Split `text` into overlapping word-window chunks ready for indexing.

    Args:
        text: Raw text to split.
        source_name: Display name of the originating source.
        source_type: One of "pdf", "url", "youtube".
        meta: Extra metadata merged into every chunk dict (e.g. page number).
        chunk_size: Window size in words.
        overlap: Number of words shared between consecutive windows.

    Returns:
        List of chunk dicts; fragments of 60 characters or fewer are dropped
        as noise. Empty input yields an empty list.
    """
    # BUG FIX: if overlap >= chunk_size the original stride (chunk_size -
    # overlap) was <= 0 and the while-loop never terminated; clamp to 1 word.
    step = max(chunk_size - overlap, 1)
    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        piece = " ".join(words[start:start + chunk_size]).strip()
        if len(piece) > 60:  # skip trivially short tail fragments
            chunks.append({"text": piece, "source": source_name, "type": source_type, **meta})
    return chunks
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def index_chunks(chunks: list[dict], source_name: str, source_type: str, embed_model):
    """Embed `chunks`, add them to the Chroma collection, and update the
    session-state bookkeeping (total count and per-source registry).

    A no-op for an empty chunk list — the original raised IndexError on
    `chunks[0]` when a source produced no chunks.
    """
    if not chunks:
        return
    collection = get_or_create_collection()
    texts = [c["text"] for c in chunks]
    embeddings = embed_model.encode(texts, batch_size=32, show_progress_bar=False).tolist()
    # Stable per-source ID prefix so chunk IDs from different sources never collide.
    prefix = hashlib.md5(source_name.encode()).hexdigest()[:8]
    ids, docs, metas, embeds = [], [], [], []
    for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
        ids.append(f"{prefix}_chunk_{i}")
        docs.append(chunk["text"])
        metas.append({"source": chunk["source"], "type": chunk["type"],
                      "page": chunk.get("page", 1), "timestamp": chunk.get("timestamp", "")})
        embeds.append(emb)
    collection.add(ids=ids, embeddings=embeds, documents=docs, metadatas=metas)
    st.session_state.total_chunks += len(chunks)
    st.session_state.indexed_sources[source_name] = {
        "type": source_type, "chunks": len(chunks),
        # Keep only the extra metadata (page/timestamp) for the registry entry.
        "meta": {k: v for k, v in chunks[0].items() if k not in ["text", "source", "type"]}
    }
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
# ─── Source-specific extractors ───────────────────────────────────────────────
|
| 195 |
+
|
| 196 |
+
## PDF
|
| 197 |
+
def process_pdf(filename: str, pdf_bytes: bytes, embed_model):
    """Extract per-page text from an in-memory PDF, chunk and index it.

    Returns:
        The number of chunks indexed.

    Raises:
        ValueError: if the PDF contains no extractable text (e.g. an
            image-only scan) — previously this crashed later in indexing.
    """
    chunks = []
    # Context manager guarantees the document is closed even if text
    # extraction raises (the original leaked the handle on exception).
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text("text").strip()
            if text:
                chunks.extend(chunk_text(text, filename, "pdf", {"page": page_num}))
    if not chunks:
        raise ValueError("No extractable text found in this PDF.")
    index_chunks(chunks, filename, "pdf", embed_model)
    return len(chunks)
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
## Web URL
|
| 211 |
+
def process_url(url: str, embed_model):
    """Download a public web page, strip boilerplate markup, then chunk and
    index its visible text.

    Returns (chunk_count, source_name). Raises ValueError when no meaningful
    text can be extracted, or requests.HTTPError on a bad response.
    """
    response = requests.get(
        url,
        headers={"User-Agent": "Mozilla/5.0 (compatible; RAGBot/1.0)"},
        timeout=15,
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    # Drop page chrome before extracting visible text.
    for boilerplate in soup(["script", "style", "nav", "footer", "header", "aside"]):
        boilerplate.decompose()

    text = re.sub(r'\s+', ' ', soup.get_text(separator=" ", strip=True)).strip()
    if len(text) < 100:
        raise ValueError("Could not extract meaningful text from this URL.")

    parsed = urlparse(url)
    source_name = parsed.netloc + parsed.path[:40]
    chunks = chunk_text(text, source_name, "url", {"page": 1})
    index_chunks(chunks, source_name, "url", embed_model)
    return len(chunks), source_name
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
## YouTube
|
| 231 |
+
def get_youtube_id(url: str) -> str:
    """Extract the 11-character video ID from a YouTube URL.

    Supports watch?v=..., youtu.be short links, /embed/, and additionally
    /shorts/ and /live/ URL shapes (backward-compatible generalization).

    Raises:
        ValueError: if no recognizable video ID is present.
    """
    patterns = [
        r'(?:v=|youtu\.be/)([a-zA-Z0-9_-]{11})',
        r'(?:embed/)([a-zA-Z0-9_-]{11})',
        r'(?:shorts/|live/)([a-zA-Z0-9_-]{11})',  # shorts and live-stream URLs
    ]
    for p in patterns:
        m = re.search(p, url)
        if m:
            return m.group(1)
    raise ValueError("Could not extract YouTube video ID from URL.")
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
def process_youtube(url: str, embed_model):
    """Fetch a YouTube caption transcript, chunk it (~350 words per chunk,
    each tagged with the start timestamp of its first caption), index it,
    and return (chunk_count, video_id).

    Raises:
        ValueError: if the URL has no video ID or the transcript yields no
            usable text.
    """
    video_id = get_youtube_id(url)
    transcript_list = YouTubeTranscriptApi.get_transcript(video_id)

    source_name = f"youtube:{video_id}"
    chunks = []

    def _flush(text, start):
        # Turn the buffered caption text into one chunk with an m:ss stamp.
        start = 0 if start is None else start
        chunks.append({
            "text": text.strip(),
            "source": source_name,
            "type": "youtube",
            "page": 1,
            "timestamp": f"{start // 60}:{start % 60:02d}",
        })

    buffer_text = ""
    buffer_start = None
    word_count = 0
    for entry in transcript_list:
        if buffer_start is None:
            buffer_start = int(entry["start"])
        buffer_text += " " + entry["text"]
        word_count += len(entry["text"].split())
        if word_count >= 350:  # ~350-word chunks keep embeddings focused
            _flush(buffer_text, buffer_start)
            buffer_text, buffer_start, word_count = "", None, 0
    if buffer_text.strip():
        # Original used a truthiness test (`if buffer_start`) which treated a
        # valid start time of 0 as missing; `is None` is the right sentinel
        # check (handled inside _flush).
        _flush(buffer_text, buffer_start)

    if not chunks:
        # Avoid indexing an empty source (index_chunks reads chunks[0]).
        raise ValueError("Transcript contained no usable text.")
    index_chunks(chunks, source_name, "youtube", embed_model)
    return len(chunks), video_id
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
# ─── RAG Query with Chat Memory ───────────────────────────────────────────────
|
| 282 |
+
def rag_query(question: str, embed_model, top_k: int, api_key: str) -> tuple[str, list]:
    """Answer a question via RAG over the indexed ChromaDB collection.

    Embeds the question, retrieves the ``top_k`` nearest chunks, prepends up
    to the last 3 conversation turns (6 messages) from
    ``st.session_state.chat_history`` for multi-turn memory, and asks the
    Groq-hosted Llama 3.3 70B model for a grounded answer.

    Args:
        question: The user's current question.
        embed_model: Sentence-transformer model used to embed the query.
        top_k: Number of chunks to retrieve from the vector store.
        api_key: Groq API key.

    Returns:
        Tuple of (answer text, list of retrieved chunk dicts with
        source/type/page/timestamp/relevance metadata).

    Raises:
        requests.HTTPError: If the Groq API returns an error status.
    """
    collection = get_or_create_collection()
    q_emb = embed_model.encode(question).tolist()
    results = collection.query(query_embeddings=[q_emb], n_results=top_k)

    chunks = []
    for doc, dist, meta in zip(
        results["documents"][0], results["distances"][0], results["metadatas"][0]
    ):
        chunks.append({
            "text": doc,
            "source": meta["source"],
            "type": meta["type"],
            "page": meta.get("page", 1),
            "timestamp": meta.get("timestamp", ""),
            # Chroma returns a distance; map it to a 0-100 relevance score.
            "relevance": round((1 - dist) * 100, 1),
        })

    def _location(c: dict):
        # Bug fix: YouTube chunks are stored with page=1 (truthy), so the old
        # `c['page'] or c['timestamp']` expression never showed the timestamp.
        return c["timestamp"] if c["type"] == "youtube" else c["page"]

    context = "\n\n".join(
        f"[Source: {c['source']} | Type: {c['type']} | Page/Time: {_location(c)}]\n{c['text']}"
        for c in chunks
    )

    # Build conversation history for multi-turn memory (last 3 turns).
    history_text = ""
    if st.session_state.chat_history:
        recent = st.session_state.chat_history[-6:]  # last 3 turns
        for msg in recent:
            role = "User" if msg["role"] == "user" else "Assistant"
            history_text += f"{role}: {msg['content']}\n"

    prompt = f"""You are a helpful assistant that answers questions based on indexed documents. Use ONLY the context below to answer. Be concise and conversational. Always cite your source (filename, URL, or YouTube timestamp) inline. If the answer isn't in the context, say "I couldn't find that in the indexed sources."

Conversation so far:
{history_text if history_text else "(This is the start of the conversation)"}

Relevant context from documents:
{context}

User: {question}
Assistant:"""

    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    payload = {
        "model": "llama-3.3-70b-versatile",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 700,
        "temperature": 0.3,
    }
    r = requests.post("https://api.groq.com/openai/v1/chat/completions",
                      headers=headers, json=payload, timeout=30)
    r.raise_for_status()
    answer = r.json()["choices"][0]["message"]["content"]
    return answer, chunks
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
# ─── Sidebar ──────────────────────────────────────────────────────────────────
|
| 339 |
+
# ─── Sidebar ──────────────────────────────────────────────────────────────────
with st.sidebar:
    st.markdown("## 🤖 RAG Chat Assistant")
    st.markdown("<div style='color:#374151;font-size:0.78rem'>PDF · Web · YouTube → Chat</div>", unsafe_allow_html=True)
    st.markdown("---")

    # Prefer a key provisioned via the environment (e.g. a Space secret);
    # fall back to manual entry only when none is configured.
    env_key = os.environ.get("GROQ_API_KEY", "")
    api_key = env_key if env_key else st.text_input(
        "🔑 Groq API Key", type="password", placeholder="gsk_...",
        help="Free at console.groq.com"
    )
    if not env_key and not api_key:
        st.caption("Get free key → [console.groq.com](https://console.groq.com)")

    st.markdown("---")
    st.markdown("<div class='section-label'>Indexed Sources</div>", unsafe_allow_html=True)

    if st.session_state.indexed_sources:
        for name, info in st.session_state.indexed_sources.items():
            badge_class = f"badge-{info['type']}"
            icon = "📄" if info['type'] == 'pdf' else "🌐" if info['type'] == 'url' else "▶️"
            label = info['type'].upper()
            st.markdown(f"""
            <div class='source-card'>
                <div>
                    <div class='source-name'>{icon} {name}</div>
                    <div class='source-meta'>{info['chunks']} chunks</div>
                </div>
                <div class='source-type-badge {badge_class}'>{label}</div>
            </div>""", unsafe_allow_html=True)

        st.markdown("")
        col1, col2 = st.columns(2)
        if col1.button("🗑️ Clear index", use_container_width=True):
            # pop() instead of del: some of these keys may be absent (e.g.
            # after a partial clear), and a bare `del` would raise KeyError.
            for k in ["indexed_sources", "chroma_collection", "chroma_client", "total_chunks"]:
                st.session_state.pop(k, None)
            st.rerun()
        if col2.button("💬 Clear chat", use_container_width=True):
            st.session_state.chat_history = []
            st.rerun()
    else:
        st.markdown("<div style='color:#374151;font-size:0.82rem'>Nothing indexed yet.</div>", unsafe_allow_html=True)

    st.markdown("---")
    st.markdown("""
    <div style='font-size:0.75rem;color:#374151;line-height:2'>
    <b style='color:#4b5563'>Stack</b><br>
    📄 PDF: PyMuPDF<br>
    🌐 Web: BeautifulSoup4<br>
    ▶️ YouTube: youtube-transcript-api<br>
    🔢 Embeddings: all-MiniLM-L6-v2<br>
    🗄️ Vector DB: ChromaDB<br>
    🧠 LLM: Groq · Llama 3.3 70B
    </div>""", unsafe_allow_html=True)
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
# ─── Main UI ──────────────────────────────────────────────────────────────────
|
| 395 |
+
st.markdown("""
<div class='hero'>
    <h1>🤖 RAG Chat Assistant</h1>
    <p>Index PDFs · Web pages · YouTube videos — then have a multi-turn conversation across all of them</p>
</div>
""", unsafe_allow_html=True)

with st.spinner("⚙️ Loading embedding model..."):
    embed_model = load_embed_model()

# ════════════════════════════════════════════════════════
# INGEST PANEL
# ════════════════════════════════════════════════════════
# Expanded by default only while nothing has been indexed yet.
with st.expander("➕ Add a new source (PDF / Web URL / YouTube)", expanded=len(st.session_state.indexed_sources) == 0):
    tab_pdf, tab_url, tab_yt = st.tabs(["📄 PDF Upload", "🌐 Web URL", "▶️ YouTube"])

    # ── PDF Tab ──
    with tab_pdf:
        uploaded = st.file_uploader("Upload PDF files", type=["pdf"], accept_multiple_files=True, label_visibility="collapsed")
        if uploaded:
            # Skip files that are already indexed (keyed by filename).
            new = [f for f in uploaded if f.name not in st.session_state.indexed_sources]
            if new:
                if st.button(f"⚡ Index {len(new)} PDF(s)", type="primary", key="idx_pdf"):
                    for f in new:
                        with st.spinner(f"Indexing {f.name}..."):
                            try:
                                n = process_pdf(f.name, f.read(), embed_model)
                                st.success(f"✅ {f.name} → {n} chunks")
                            except Exception as e:
                                # Consistency with the URL/YouTube tabs: a
                                # corrupt PDF shows an error instead of
                                # crashing the whole script run.
                                st.error(f"❌ {f.name}: {str(e)}")
                    st.rerun()
            else:
                st.info("Already indexed.")

    # ── URL Tab ──
    with tab_url:
        url_input = st.text_input("Paste a public webpage URL", placeholder="https://en.wikipedia.org/wiki/...", label_visibility="collapsed")
        if st.button("⚡ Fetch & Index URL", type="primary", key="idx_url"):
            if url_input:
                with st.spinner(f"Fetching and indexing {url_input}..."):
                    try:
                        n, source_name = process_url(url_input, embed_model)
                        st.success(f"✅ {source_name} → {n} chunks indexed")
                        st.rerun()
                    except Exception as e:
                        st.error(f"❌ {str(e)}")
            else:
                st.warning("Please enter a URL.")

    # ── YouTube Tab ──
    with tab_yt:
        yt_input = st.text_input("Paste a YouTube video URL", placeholder="https://www.youtube.com/watch?v=...", label_visibility="collapsed")
        st.caption("Works with any video that has English captions/subtitles enabled.")
        if st.button("⚡ Fetch Transcript & Index", type="primary", key="idx_yt"):
            if yt_input:
                with st.spinner("Fetching YouTube transcript..."):
                    try:
                        n, vid_id = process_youtube(yt_input, embed_model)
                        st.success(f"✅ youtube:{vid_id} → {n} chunks indexed")
                        st.rerun()
                    except Exception as e:
                        st.error(f"❌ {str(e)}")
            else:
                st.warning("Please enter a YouTube URL.")
|
| 456 |
+
|
| 457 |
+
# ════════════════════════════════════════════════════════
|
| 458 |
+
# STATS
|
| 459 |
+
# ════════════════════════════════════════════════════════
|
| 460 |
+
if st.session_state.indexed_sources:
    # Tally indexed sources by type in a single pass over the registry.
    counts = {"pdf": 0, "url": 0, "youtube": 0}
    for src in st.session_state.indexed_sources.values():
        counts[src["type"]] = counts.get(src["type"], 0) + 1

    st.markdown(f"""
    <div class='stat-row'>
    <div class='stat-box'><div class='stat-val'>{counts["pdf"]}</div><div class='stat-lbl'>PDFs</div></div>
    <div class='stat-box'><div class='stat-val'>{counts["url"]}</div><div class='stat-lbl'>Web Pages</div></div>
    <div class='stat-box'><div class='stat-val'>{counts["youtube"]}</div><div class='stat-lbl'>YouTube Videos</div></div>
    <div class='stat-box'><div class='stat-val'>{st.session_state.total_chunks}</div><div class='stat-lbl'>Total Chunks</div></div>
    <div class='stat-box'><div class='stat-val'>{len(st.session_state.chat_history)}</div><div class='stat-lbl'>Messages</div></div>
    </div>
    """, unsafe_allow_html=True)
|
| 474 |
+
|
| 475 |
+
# ════════════════════════════════════════════════════════
|
| 476 |
+
# CHAT UI
|
| 477 |
+
# ════════════════════════════════════════════════════════
|
| 478 |
+
# Guard: chatting requires at least one indexed source and an API key.
if not st.session_state.indexed_sources:
    st.markdown("""
    <div class='empty-chat'>
        <div style='font-size:2.5rem;margin-bottom:12px'>📂</div>
        <p style='color:#4b5563'>Add at least one source above to start chatting.<br>
        Try a PDF, a Wikipedia URL, or a YouTube video.</p>
    </div>""", unsafe_allow_html=True)
    st.stop()

if not api_key:
    st.warning("👈 Add your Groq API key in the sidebar to start chatting.")
    st.stop()

st.markdown("---")
st.markdown("<div class='section-label'>Conversation</div>", unsafe_allow_html=True)

# Render chat history
if not st.session_state.chat_history:
    st.markdown("""
    <div class='empty-chat' style='padding:28px'>
        <p style='color:#4b5563;margin:0'>Ask anything about your indexed sources below 👇</p>
    </div>""", unsafe_allow_html=True)

for msg in st.session_state.chat_history:
    if msg["role"] == "user":
        st.markdown(f"""
        <div class='chat-user'>
            <div class='chat-user-bubble'>{msg['content']}</div>
        </div>""", unsafe_allow_html=True)
    else:
        # Compact source chips under the assistant bubble (top 4 chunks).
        source_chips = ""
        if msg.get("sources"):
            for s in msg["sources"][:4]:
                label = f"{s['source']} · {s['relevance']}%"
                if s.get("timestamp"):
                    label += f" @ {s['timestamp']}"
                source_chips += f"<span class='chat-source-chip'>{label}</span>"

        st.markdown(f"""
        <div class='chat-assistant'>
            <div class='chat-avatar'>🤖</div>
            <div class='chat-assistant-bubble'>
            {msg['content']}
            {f"<div class='chat-sources'>{source_chips}</div>" if source_chips else ""}
            </div>
        </div>""", unsafe_allow_html=True)

        if msg.get("sources"):
            with st.expander("🔍 View retrieved chunks", expanded=False):
                for chunk in msg["sources"]:
                    icon = "📄" if chunk["type"] == "pdf" else "🌐" if chunk["type"] == "url" else "▶️"
                    detail = f"Page {chunk['page']}" if chunk["type"] != "youtube" else f"@ {chunk['timestamp']}"
                    st.markdown(f"""
                    <div class='chunk-card'>
                        <div class='chunk-header'>
                            <div class='chunk-src'>{icon} {chunk['source']}</div>
                            <div class='chunk-score'>{detail} · {chunk['relevance']}% match</div>
                        </div>
                        <div class='chunk-text'>{chunk['text'][:400]}{'...' if len(chunk['text']) > 400 else ''}</div>
                    </div>""", unsafe_allow_html=True)

# Chat input
st.markdown("")
col_input, col_k, col_btn = st.columns([6, 1, 1])
with col_input:
    user_input = st.text_input("", placeholder="Ask something about your indexed sources...", label_visibility="collapsed", key="chat_input")
with col_k:
    top_k = st.selectbox("K", [2, 3, 4, 5], index=1, label_visibility="collapsed")
with col_btn:
    send = st.button("Send ➤", type="primary", use_container_width=True)

if send and user_input:
    # Add user message
    st.session_state.chat_history.append({"role": "user", "content": user_input})

    with st.spinner("Thinking..."):
        try:
            answer, chunks = rag_query(user_input, embed_model, top_k, api_key)
            st.session_state.chat_history.append({
                "role": "assistant",
                "content": answer,
                "sources": chunks
            })
        except requests.RequestException as e:
            # Broadened from HTTPError: connection failures and timeouts are
            # RequestException subclasses that previously escaped this handler
            # and crashed the run after the user message was already appended.
            st.session_state.chat_history.append({
                "role": "assistant",
                "content": f"❌ API error: {str(e)}",
                "sources": []
            })
    st.rerun()
|
upgraded_requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit>=1.32.0
|
| 2 |
+
chromadb>=0.4.22
|
| 3 |
+
sentence-transformers>=2.7.0
|
| 4 |
+
requests>=2.31.0
|
| 5 |
+
PyMuPDF>=1.24.0
|
| 6 |
+
beautifulsoup4>=4.12.0
|
| 7 |
+
youtube-transcript-api>=0.6.2
|