anirudh-np-ds committed on
Commit
6b6d437
·
1 Parent(s): 95ac500

feat: add chat memory + web URL + YouTube ingestion

Browse files
requirements.txt CHANGED
@@ -2,4 +2,6 @@ streamlit>=1.32.0
2
  chromadb>=0.4.22
3
  sentence-transformers>=2.7.0
4
  requests>=2.31.0
5
- PyMuPDF>=1.24.0
 
 
 
2
  chromadb>=0.4.22
3
  sentence-transformers>=2.7.0
4
  requests>=2.31.0
5
+ PyMuPDF>=1.24.0
6
+ beautifulsoup4>=4.12.0
7
+ youtube-transcript-api>=0.6.2
src/streamlit_app.py CHANGED
@@ -4,13 +4,16 @@ from sentence_transformers import SentenceTransformer
4
  import fitz # PyMuPDF
5
  import os
6
  import requests
7
- import re
8
  import hashlib
 
 
 
 
9
 
10
  # ─── Page Config ──────────────────────────────────────────────────────────────
11
  st.set_page_config(
12
- page_title="PDF RAG · Upload & Ask",
13
- page_icon="📂",
14
  layout="wide",
15
  initial_sidebar_state="expanded"
16
  )
@@ -28,441 +31,537 @@ html, body, [class*="css"] { font-family: 'IBM Plex Sans', sans-serif; }
28
  border: 1px solid #1e2a3e;
29
  border-top: 3px solid #22d3ee;
30
  border-radius: 12px;
31
- padding: 28px 32px;
32
- margin-bottom: 24px;
33
  }
34
- .hero h1 { font-size: 1.8rem; font-weight: 600; color: #e2e8f0; margin: 0 0 6px 0; }
35
- .hero p { color: #64748b; font-size: 0.95rem; margin: 0; }
 
 
 
 
 
 
 
 
 
36
 
37
- .phase-bar {
38
- display: flex; gap: 0; margin-bottom: 28px;
39
- border: 1px solid #1e2a3e; border-radius: 10px; overflow: hidden;
 
 
40
  }
41
- .phase {
42
- flex: 1; padding: 10px 6px; text-align: center;
43
- font-size: 0.75rem; color: #4b5563; background: #0d1117;
44
- border-right: 1px solid #1e2a3e; line-height: 1.5;
 
45
  }
46
- .phase:last-child { border-right: none; }
47
- .phase.done { color: #22d3ee; background: rgba(34,211,238,0.05); }
48
- .phase.active { color: #f8fafc; background: rgba(34,211,238,0.1); font-weight: 600; }
49
- .phase-icon { font-size: 1.1rem; display: block; margin-bottom: 2px; }
50
 
51
- .pdf-card {
52
- background: #0d1424;
53
- border: 1px solid #1e2a3e;
54
- border-radius: 10px;
55
- padding: 14px 16px;
56
- margin: 8px 0;
57
- display: flex;
58
- align-items: center;
59
- justify-content: space-between;
60
  }
61
- .pdf-name { font-size: 0.85rem; color: #e2e8f0; font-weight: 500; }
62
- .pdf-meta { font-family: 'IBM Plex Mono', monospace; font-size: 0.72rem; color: #475569; margin-top: 3px; }
63
- .pdf-badge {
64
- font-size: 0.72rem; font-family: 'IBM Plex Mono', monospace;
65
- background: rgba(34,211,238,0.1); color: #22d3ee;
66
- border: 1px solid rgba(34,211,238,0.25); padding: 3px 10px; border-radius: 20px;
67
  }
68
-
69
- .answer-box {
70
- background: #0d1424;
71
- border: 1px solid #1e3a4a;
72
- border-left: 3px solid #22d3ee;
73
- border-radius: 10px;
74
- padding: 22px 24px;
75
- color: #e2e8f0;
76
- line-height: 1.75;
77
- font-size: 0.96rem;
78
- margin: 12px 0 20px 0;
79
  }
80
-
81
- .chunk-card {
82
- background: #0d1117;
83
- border: 1px solid #1e2a3e;
84
- border-radius: 9px;
85
- padding: 14px 18px;
86
- margin: 8px 0;
87
  }
88
- .chunk-top {
89
- display: flex; justify-content: space-between;
90
- align-items: center; margin-bottom: 8px;
 
 
91
  }
92
- .chunk-source { font-size: 0.77rem; font-weight: 600; color: #22d3ee; text-transform: uppercase; letter-spacing: 0.05em; }
93
- .chunk-page { font-family: 'IBM Plex Mono', monospace; font-size: 0.72rem; color: #475569; }
94
- .score-bar-wrap { display: flex; align-items: center; gap: 8px; }
95
- .score-bar {
96
- height: 4px; border-radius: 2px; background: #1e2a3e; width: 80px; overflow: hidden;
97
  }
98
- .score-fill { height: 100%; border-radius: 2px; background: #22d3ee; }
99
- .score-num { font-family: 'IBM Plex Mono', monospace; font-size: 0.72rem; color: #22d3ee; }
100
- .chunk-text { font-size: 0.86rem; color: #94a3b8; line-height: 1.65; }
101
-
102
- .stat-row { display: flex; gap: 10px; margin: 16px 0; }
103
- .stat-box {
104
- flex: 1; background: #0d1424; border: 1px solid #1e2a3e;
105
- border-radius: 8px; padding: 12px; text-align: center;
106
  }
107
- .stat-val { font-size: 1.35rem; font-weight: 600; color: #22d3ee; }
108
- .stat-lbl { font-size: 0.7rem; color: #475569; margin-top: 2px; }
109
 
110
- .section-label {
111
- font-size: 0.7rem; text-transform: uppercase; letter-spacing: 0.1em;
112
- color: #374151; font-weight: 600; margin: 18px 0 8px 0;
 
113
  }
 
 
 
 
114
 
115
- section[data-testid="stSidebar"] {
116
- background-color: #080c14; border-right: 1px solid #131c2e;
 
 
 
 
 
 
117
  }
118
 
119
- .empty-state {
 
 
120
  text-align: center; padding: 48px 24px;
121
- border: 2px dashed #1e2a3e; border-radius: 12px; color: #374151;
122
  }
123
- .empty-state .icon { font-size: 2.5rem; margin-bottom: 12px; }
124
- .empty-state p { font-size: 0.9rem; line-height: 1.6; }
125
  </style>
126
  """, unsafe_allow_html=True)
127
 
128
 
129
  # ─── Session State ────────────────────────────────────────────────────────────
130
- if "indexed_files" not in st.session_state:
131
- st.session_state.indexed_files = {} # filename → {chunks, pages, size}
132
- if "chroma_collection" not in st.session_state:
133
- st.session_state.chroma_collection = None
134
- if "chroma_client" not in st.session_state:
135
- st.session_state.chroma_client = None
136
- if "total_chunks" not in st.session_state:
137
- st.session_state.total_chunks = 0
 
 
138
 
139
 
140
- # ─── Load embedding model (cached globally) ───────────────────────────────────
141
  @st.cache_resource(show_spinner=False)
142
  def load_embed_model():
143
  return SentenceTransformer('all-MiniLM-L6-v2')
144
 
145
 
146
- # ─── PDF Extraction ───────────────────────────────────────────────────────────
147
- def extract_text_from_pdf(pdf_bytes: bytes) -> list[dict]:
148
- """Returns list of {page, text} dicts."""
149
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
150
- pages = []
151
- for page_num, page in enumerate(doc, start=1):
152
- text = page.get_text("text").strip()
153
- if text:
154
- pages.append({"page": page_num, "text": text})
155
- doc.close()
156
- return pages
157
-
158
-
159
- # ─── Chunking ─────────────────────────────────────────────────────────────────
160
- def chunk_text(pages: list[dict], chunk_size: int = 400, overlap: int = 60) -> list[dict]:
161
- """Splits page text into overlapping word-based chunks."""
162
- chunks = []
163
- for p in pages:
164
- words = p["text"].split()
165
- start = 0
166
- while start < len(words):
167
- end = start + chunk_size
168
- chunk_words = words[start:end]
169
- chunk_text_str = " ".join(chunk_words).strip()
170
- if len(chunk_text_str) > 60:
171
- chunks.append({"text": chunk_text_str, "page": p["page"]})
172
- start += chunk_size - overlap
173
- return chunks
174
-
175
-
176
- # ─── Index PDF into ChromaDB ──────────────────────────────────────────────────
177
- def index_pdf(filename: str, pdf_bytes: bytes, embed_model):
178
- # Init or reuse ChromaDB
179
  if st.session_state.chroma_client is None:
180
  st.session_state.chroma_client = chromadb.Client()
181
  st.session_state.chroma_collection = st.session_state.chroma_client.get_or_create_collection(
182
- name="pdf_rag", metadata={"hnsw:space": "cosine"}
183
  )
 
184
 
185
- collection = st.session_state.chroma_collection
186
 
187
- # Extract & chunk
188
- pages = extract_text_from_pdf(pdf_bytes)
189
- chunks = chunk_text(pages)
 
 
 
 
 
 
 
 
 
190
 
191
- if not chunks:
192
- return 0, 0
193
 
194
- # Embed & add
 
195
  texts = [c["text"] for c in chunks]
196
  embeddings = embed_model.encode(texts, batch_size=32, show_progress_bar=False).tolist()
197
-
198
  ids, docs, metas, embeds = [], [], [], []
199
  for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
200
- chunk_id = f"{hashlib.md5(filename.encode()).hexdigest()[:8]}_chunk_{i}"
201
- ids.append(chunk_id)
202
  docs.append(chunk["text"])
203
- metas.append({"filename": filename, "page": chunk["page"]})
 
204
  embeds.append(emb)
205
-
206
  collection.add(ids=ids, embeddings=embeds, documents=docs, metadatas=metas)
207
-
208
- st.session_state.indexed_files[filename] = {
209
- "chunks": len(chunks),
210
- "pages": len(pages),
211
- "size_kb": round(len(pdf_bytes) / 1024, 1)
212
- }
213
  st.session_state.total_chunks += len(chunks)
214
- return len(chunks), len(pages)
 
 
 
215
 
216
 
217
- # ─── RAG Query ────────────────────────────────────────────────────────────────
218
- def rag_query(question: str, embed_model, top_k: int, api_key: str):
219
- collection = st.session_state.chroma_collection
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  q_emb = embed_model.encode(question).tolist()
221
  results = collection.query(query_embeddings=[q_emb], n_results=top_k)
222
 
223
  chunks = []
224
  for i in range(len(results["documents"][0])):
225
- distance = results["distances"][0][i]
 
226
  chunks.append({
227
  "text": results["documents"][0][i],
228
- "filename": results["metadatas"][0][i]["filename"],
229
- "page": results["metadatas"][0][i]["page"],
230
- "relevance": round((1 - distance) * 100, 1),
 
 
231
  })
232
 
233
  context = "\n\n".join([
234
- f"[Source: {c['filename']}, Page {c['page']}]\n{c['text']}" for c in chunks
 
235
  ])
236
 
237
- prompt = f"""You are a helpful assistant. Answer the user's question using ONLY the document context provided below. Be concise and clear. Always mention the source filename and page number when referencing specific information. If the answer cannot be found in the provided context, say "I couldn't find that information in the uploaded documents."
 
 
 
 
 
 
238
 
239
- Document Context:
240
- {context}
 
 
241
 
242
- Question: {question}
 
243
 
244
- Answer:"""
 
245
 
246
  headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
247
  payload = {
248
  "model": "llama-3.3-70b-versatile",
249
  "messages": [{"role": "user", "content": prompt}],
250
- "max_tokens": 600,
251
- "temperature": 0.2,
252
  }
253
- r = requests.post("https://api.groq.com/openai/v1/chat/completions", headers=headers, json=payload, timeout=30)
 
254
  r.raise_for_status()
255
  answer = r.json()["choices"][0]["message"]["content"]
256
  return answer, chunks
257
 
258
 
259
- # ─── Determine current phase ──────────────────────────────────────────────────
260
- has_docs = len(st.session_state.indexed_files) > 0
261
- phase = 1 if not has_docs else 2
262
-
263
-
264
  # ─── Sidebar ──────────────────────────────────────────────────────────────────
265
  with st.sidebar:
266
- st.markdown("## 📂 PDF RAG Demo")
267
- st.markdown("<div style='color:#374151;font-size:0.8rem'>Upload Extract IndexAsk</div>", unsafe_allow_html=True)
268
  st.markdown("---")
269
 
270
  env_key = os.environ.get("GROQ_API_KEY", "")
271
- if env_key:
272
- api_key = env_key
273
- st.success(" Groq key loaded from secrets")
274
- else:
275
- api_key = st.text_input("🔑 Groq API Key", type="password", placeholder="gsk_...", help="Free at console.groq.com")
276
- if not api_key:
277
- st.caption("Get free key → [console.groq.com](https://console.groq.com)")
278
 
279
  st.markdown("---")
280
- st.markdown("<div class='section-label'>Indexed Documents</div>", unsafe_allow_html=True)
281
 
282
- if st.session_state.indexed_files:
283
- for fname, info in st.session_state.indexed_files.items():
 
 
 
284
  st.markdown(f"""
285
- <div style='padding:6px 0;border-bottom:1px solid #131c2e'>
286
- <div style='font-size:0.8rem;color:#e2e8f0'>📄 {fname}</div>
287
- <div style='font-size:0.72rem;color:#475569;font-family:IBM Plex Mono,monospace'>
288
- {info["pages"]}p · {info["chunks"]} chunks · {info["size_kb"]}KB
289
  </div>
 
290
  </div>""", unsafe_allow_html=True)
291
 
292
- st.markdown("---")
293
- if st.button("🗑️ Clear all & reset", use_container_width=True):
294
- for key in ["indexed_files", "chroma_collection", "chroma_client", "total_chunks"]:
295
- del st.session_state[key]
 
 
 
 
296
  st.rerun()
297
  else:
298
- st.markdown("<div style='color:#374151;font-size:0.82rem'>No documents indexed yet.</div>", unsafe_allow_html=True)
299
 
300
  st.markdown("---")
301
  st.markdown("""
302
- <div style='font-size:0.77rem;color:#374151;line-height:1.9'>
303
  <b style='color:#4b5563'>Stack</b><br>
304
- 📄 PDF parsing: PyMuPDF<br>
305
- ✂️ Chunking: word-overlap (400w)<br>
 
306
  🔢 Embeddings: all-MiniLM-L6-v2<br>
307
- 🗄️ Vector DB: ChromaDB in-memory<br>
308
- 🧠 LLM: Groq · Llama 3.3 70B<br>
309
- 🌐 Hosting: HuggingFace Spaces
310
- </div>
311
- """, unsafe_allow_html=True)
312
 
313
 
314
- # ─── Hero ───────────────────────────────────────��─────────────────────────────
315
  st.markdown("""
316
  <div class='hero'>
317
- <h1>📂 PDF RAG Upload & Ask</h1>
318
- <p>Upload any PDF documents · They get extracted, chunked, embedded, and indexed · Then ask questions across all of them</p>
319
  </div>
320
  """, unsafe_allow_html=True)
321
 
322
- # Phase bar
323
- st.markdown(f"""
324
- <div class='phase-bar'>
325
- <div class='phase {"done" if phase > 1 else "active"}'>
326
- <span class='phase-icon'>📤</span>Upload PDFs
327
- </div>
328
- <div class='phase {"active" if phase == 1 else "done"}'>
329
- <span class='phase-icon'>📑</span>Extract Text
330
- </div>
331
- <div class='phase {"active" if phase == 1 else "done"}'>
332
- <span class='phase-icon'>✂️</span>Chunk
333
- </div>
334
- <div class='phase {"active" if phase == 1 else "done"}'>
335
- <span class='phase-icon'>🔢</span>Embed
336
- </div>
337
- <div class='phase {"active" if phase == 1 else "done"}'>
338
- <span class='phase-icon'>🗄️</span>Index
339
- </div>
340
- <div class='phase {"active" if phase == 2 else ""}'>
341
- <span class='phase-icon'>💬</span>Ask Questions
342
- </div>
343
- </div>
344
- """, unsafe_allow_html=True)
345
-
346
- # ─── Load model ───────────────────────────────────────────────────────────────
347
  with st.spinner("⚙️ Loading embedding model..."):
348
  embed_model = load_embed_model()
349
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
 
351
- # ════════════════════════════════════════════════════════════
352
- # PHASE 1 — Upload & Index
353
- # ════════════════════════════════════════════════════════════
354
- st.markdown("<div class='section-label'>Step 1 — Upload PDF Documents</div>", unsafe_allow_html=True)
355
-
356
- uploaded_files = st.file_uploader(
357
- "Drop your PDF files here",
358
- type=["pdf"],
359
- accept_multiple_files=True,
360
- label_visibility="collapsed"
361
- )
362
-
363
- if uploaded_files:
364
- new_files = [f for f in uploaded_files if f.name not in st.session_state.indexed_files]
365
-
366
- if new_files:
367
- st.markdown(f"**{len(new_files)} new file(s) ready to index:**")
368
- for f in new_files:
369
- st.markdown(f"<div class='pdf-card'><div><div class='pdf-name'>📄 {f.name}</div><div class='pdf-meta'>{round(f.size/1024,1)} KB</div></div><div class='pdf-badge'>ready</div></div>", unsafe_allow_html=True)
370
-
371
- if st.button(f"⚡ Extract & Index {len(new_files)} PDF(s)", type="primary", use_container_width=True):
372
- progress = st.progress(0, text="Starting...")
373
- for idx, f in enumerate(new_files):
374
- progress.progress((idx) / len(new_files), text=f"Processing: {f.name}")
375
- pdf_bytes = f.read()
376
-
377
- with st.spinner(f"Extracting & indexing **{f.name}**..."):
378
- n_chunks, n_pages = index_pdf(f.name, pdf_bytes, embed_model)
379
-
380
- st.success(f"✅ **{f.name}** → {n_pages} pages · {n_chunks} chunks indexed")
381
-
382
- progress.progress(1.0, text="Done!")
383
- st.balloons()
384
- st.rerun()
385
-
386
- else:
387
- st.info("All uploaded files are already indexed. Upload new files or ask questions below.")
388
-
389
- elif not has_docs:
390
- st.markdown("""
391
- <div class='empty-state'>
392
- <div class='icon'>📂</div>
393
- <p><b style='color:#94a3b8'>No documents uploaded yet</b><br>
394
- Upload one or more PDF files above to get started.<br>
395
- Any topic works — reports, manuals, research papers, policies.</p>
396
- </div>
397
- """, unsafe_allow_html=True)
398
-
399
-
400
- # ════════════════════════════════════════════════════════════
401
- # PHASE 2 — Stats & Query
402
- # ════════════════════════════════════════════════════════════
403
- if has_docs:
404
- total_pages = sum(v["pages"] for v in st.session_state.indexed_files.values())
405
-
406
- st.markdown("<div class='section-label' style='margin-top:24px'>Index Summary</div>", unsafe_allow_html=True)
407
  st.markdown(f"""
408
  <div class='stat-row'>
409
- <div class='stat-box'><div class='stat-val'>{len(st.session_state.indexed_files)}</div><div class='stat-lbl'>Documents</div></div>
410
- <div class='stat-box'><div class='stat-val'>{total_pages}</div><div class='stat-lbl'>Pages Parsed</div></div>
411
- <div class='stat-box'><div class='stat-val'>{st.session_state.total_chunks}</div><div class='stat-lbl'>Chunks Indexed</div></div>
412
- <div class='stat-box'><div class='stat-val'>384</div><div class='stat-lbl'>Embedding Dims</div></div>
 
413
  </div>
414
  """, unsafe_allow_html=True)
415
 
416
- if not api_key:
417
- st.warning("👈 Enter your Groq API key in the sidebar to start asking questions.")
418
- st.stop()
419
-
420
- st.markdown("---")
421
- st.markdown("<div class='section-label'>Step 2 — Ask a Question</div>", unsafe_allow_html=True)
422
-
423
- col1, col2 = st.columns([5, 1])
424
- with col1:
425
- question = st.text_input("", placeholder="What does the document say about...?", label_visibility="collapsed")
426
- with col2:
427
- top_k = st.selectbox("Top K", [2, 3, 4, 5], index=1, help="Number of chunks to retrieve")
428
 
429
- ask_btn = st.button("🔍 Search & Answer", type="primary", use_container_width=True)
 
 
430
 
431
- if ask_btn and question:
432
- with st.spinner("🔍 Searching index and generating answer..."):
433
- try:
434
- answer, chunks = rag_query(question, embed_model, top_k, api_key)
435
 
436
- st.markdown(f"<div class='section-label'>Answer</div>", unsafe_allow_html=True)
437
- st.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)
 
 
 
 
438
 
439
- st.markdown("<div class='section-label'>Retrieved Chunks (context sent to LLM)</div>", unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
 
441
- for i, chunk in enumerate(chunks):
442
- bar_width = int(chunk['relevance'])
 
 
 
443
  st.markdown(f"""
444
  <div class='chunk-card'>
445
- <div class='chunk-top'>
446
- <div>
447
- <div class='chunk-source'>📄 {chunk['filename']}</div>
448
- <div class='chunk-page'>Page {chunk['page']}</div>
449
- </div>
450
- <div class='score-bar-wrap'>
451
- <div class='score-bar'><div class='score-fill' style='width:{bar_width}%'></div></div>
452
- <div class='score-num'>{chunk['relevance']}%</div>
453
- </div>
454
  </div>
455
- <div class='chunk-text'>{chunk['text']}</div>
456
- </div>
457
- """, unsafe_allow_html=True)
458
-
459
- except requests.HTTPError as e:
460
- if e.response.status_code == 401:
461
- st.error("❌ Invalid Groq API key.")
462
- else:
463
- st.error(f"❌ API error: {str(e)}")
464
- except Exception as e:
465
- st.error(f"❌ Error: {str(e)}")
466
 
467
- elif ask_btn and not question:
468
- st.warning("Please enter a question.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import fitz # PyMuPDF
5
  import os
6
  import requests
 
7
  import hashlib
8
+ import re
9
+ from urllib.parse import urlparse, parse_qs
10
+ from youtube_transcript_api import YouTubeTranscriptApi
11
+ from bs4 import BeautifulSoup
12
 
13
  # ─── Page Config ──────────────────────────────────────────────────────────────
14
  st.set_page_config(
15
+ page_title="RAG Assistant · Chat",
16
+ page_icon="🤖",
17
  layout="wide",
18
  initial_sidebar_state="expanded"
19
  )
 
31
  border: 1px solid #1e2a3e;
32
  border-top: 3px solid #22d3ee;
33
  border-radius: 12px;
34
+ padding: 24px 28px;
35
+ margin-bottom: 20px;
36
  }
37
+ .hero h1 { font-size: 1.7rem; font-weight: 600; color: #e2e8f0; margin: 0 0 4px 0; }
38
+ .hero p { color: #64748b; font-size: 0.88rem; margin: 0; }
39
+
40
+ /* Source type tabs */
41
+ .source-tabs { display: flex; gap: 8px; margin-bottom: 16px; }
42
+ .source-tab {
43
+ flex: 1; padding: 10px; text-align: center;
44
+ background: #0d1424; border: 1px solid #1e2a3e;
45
+ border-radius: 8px; font-size: 0.82rem; color: #64748b; cursor: pointer;
46
+ }
47
+ .source-tab.active { border-color: #22d3ee; color: #22d3ee; background: rgba(34,211,238,0.07); }
48
 
49
+ /* Indexed source cards */
50
+ .source-card {
51
+ background: #0d1424; border: 1px solid #1e2a3e;
52
+ border-radius: 8px; padding: 10px 14px; margin: 6px 0;
53
+ display: flex; align-items: center; justify-content: space-between;
54
  }
55
+ .source-name { font-size: 0.82rem; color: #e2e8f0; font-weight: 500; white-space: nowrap; overflow: hidden; text-overflow: ellipsis; max-width: 160px; }
56
+ .source-meta { font-family: 'IBM Plex Mono', monospace; font-size: 0.68rem; color: #475569; }
57
+ .source-type-badge {
58
+ font-size: 0.68rem; padding: 2px 8px; border-radius: 20px;
59
+ font-family: 'IBM Plex Mono', monospace; white-space: nowrap;
60
  }
61
+ .badge-pdf { background: rgba(99,102,241,0.12); color: #a5b4fc; border: 1px solid rgba(99,102,241,0.25); }
62
+ .badge-url { background: rgba(34,197,94,0.1); color: #4ade80; border: 1px solid rgba(34,197,94,0.25); }
63
+ .badge-yt { background: rgba(239,68,68,0.1); color: #f87171; border: 1px solid rgba(239,68,68,0.25); }
 
64
 
65
+ /* Chat messages */
66
+ .chat-user {
67
+ display: flex; justify-content: flex-end; margin: 10px 0;
 
 
 
 
 
 
68
  }
69
+ .chat-user-bubble {
70
+ background: rgba(34,211,238,0.1); border: 1px solid rgba(34,211,238,0.2);
71
+ border-radius: 16px 16px 4px 16px;
72
+ padding: 12px 18px; max-width: 70%;
73
+ color: #e2e8f0; font-size: 0.92rem; line-height: 1.6;
 
74
  }
75
+ .chat-assistant {
76
+ display: flex; justify-content: flex-start; margin: 10px 0; gap: 10px;
 
 
 
 
 
 
 
 
 
77
  }
78
+ .chat-avatar {
79
+ width: 32px; height: 32px; border-radius: 50%;
80
+ background: linear-gradient(135deg, #22d3ee, #6366f1);
81
+ display: flex; align-items: center; justify-content: center;
82
+ font-size: 0.9rem; flex-shrink: 0; margin-top: 2px;
 
 
83
  }
84
+ .chat-assistant-bubble {
85
+ background: #0d1424; border: 1px solid #1e2a3e;
86
+ border-radius: 4px 16px 16px 16px;
87
+ padding: 14px 18px; max-width: 75%;
88
+ color: #e2e8f0; font-size: 0.92rem; line-height: 1.7;
89
  }
90
+ .chat-sources {
91
+ margin-top: 10px; padding-top: 10px;
92
+ border-top: 1px solid #1e2a3e;
 
 
93
  }
94
+ .chat-source-chip {
95
+ display: inline-block; font-size: 0.72rem;
96
+ font-family: 'IBM Plex Mono', monospace;
97
+ background: #0b0f1a; border: 1px solid #1e2a3e;
98
+ border-radius: 20px; padding: 2px 10px; margin: 3px 3px 0 0;
99
+ color: #475569;
 
 
100
  }
 
 
101
 
102
+ /* Chunk expander styling */
103
+ .chunk-card {
104
+ background: #0b0f1a; border: 1px solid #1e2a3e;
105
+ border-radius: 8px; padding: 12px 16px; margin: 6px 0;
106
  }
107
+ .chunk-header { display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px; }
108
+ .chunk-src { font-size: 0.75rem; font-weight: 600; color: #22d3ee; text-transform: uppercase; letter-spacing: 0.04em; }
109
+ .chunk-score { font-family: 'IBM Plex Mono', monospace; font-size: 0.72rem; color: #475569; }
110
+ .chunk-text { font-size: 0.84rem; color: #94a3b8; line-height: 1.6; }
111
 
112
+ .stat-row { display: flex; gap: 8px; margin: 12px 0; }
113
+ .stat-box { flex: 1; background: #0d1424; border: 1px solid #1e2a3e; border-radius: 8px; padding: 10px; text-align: center; }
114
+ .stat-val { font-size: 1.2rem; font-weight: 600; color: #22d3ee; }
115
+ .stat-lbl { font-size: 0.68rem; color: #475569; margin-top: 2px; }
116
+
117
+ .section-label {
118
+ font-size: 0.68rem; text-transform: uppercase; letter-spacing: 0.1em;
119
+ color: #374151; font-weight: 600; margin: 16px 0 8px 0;
120
  }
121
 
122
+ section[data-testid="stSidebar"] { background-color: #080c14; border-right: 1px solid #131c2e; }
123
+
124
+ .empty-chat {
125
  text-align: center; padding: 48px 24px;
126
+ color: #374151; border: 2px dashed #1e2a3e; border-radius: 12px;
127
  }
 
 
128
  </style>
129
  """, unsafe_allow_html=True)
130
 
131
 
132
  # ─── Session State ────────────────────────────────────────────────────────────
133
# Seed st.session_state with every key the app reads, once per session.
defaults = {
    "indexed_sources": {},    # source name → {type, chunks, meta}
    "chroma_collection": None,
    "chroma_client": None,
    "total_chunks": 0,
    "chat_history": [],       # list of {role, content, sources}
}
for state_key, initial_value in defaults.items():
    if state_key not in st.session_state:
        st.session_state[state_key] = initial_value
143
 
144
 
145
+ # ─── Helpers ──────────────────────────────────────────────────────────────────
146
@st.cache_resource(show_spinner=False)
def load_embed_model():
    """Load the sentence-embedding model once and cache it for the app's lifetime."""
    model = SentenceTransformer('all-MiniLM-L6-v2')
    return model
149
 
150
 
151
def get_or_create_collection():
    """Return the session's ChromaDB collection, creating client + collection on first use.

    The client and collection handles live in st.session_state so they survive
    Streamlit reruns within the same session.
    """
    state = st.session_state
    if state.chroma_client is None:
        # Lazily bootstrap an in-memory Chroma client with cosine distance.
        client = chromadb.Client()
        state.chroma_client = client
        state.chroma_collection = client.get_or_create_collection(
            name="rag_store", metadata={"hnsw:space": "cosine"}
        )
    return state.chroma_collection
158
 
 
159
 
160
def chunk_text(text: str, source_name: str, source_type: str, meta: dict,
               chunk_size: int = 400, overlap: int = 60) -> list[dict]:
    """Split *text* into overlapping word-window chunks tagged with source metadata.

    Args:
        text: Raw text to split.
        source_name: Identifier of the originating document/URL/video.
        source_type: One of the app's source kinds (e.g. "pdf", "url", "youtube").
        meta: Extra metadata merged into every chunk dict (e.g. {"page": 3}).
        chunk_size: Window size in words.
        overlap: Words shared between consecutive windows; must be < chunk_size.

    Returns:
        List of {"text", "source", "type", **meta} dicts. Fragments of 60
        characters or fewer are dropped as too short to be useful context.

    Raises:
        ValueError: If chunk_size/overlap would make the window step
            non-positive (which previously caused an infinite loop).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap < 0 or overlap >= chunk_size:
        # step below is chunk_size - overlap; a zero/negative step never terminates
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        chunk_str = " ".join(words[start:start + chunk_size]).strip()
        if len(chunk_str) > 60:  # skip fragments too short to embed usefully
            chunks.append({"text": chunk_str, "source": source_name, "type": source_type, **meta})
        start += chunk_size - overlap
    return chunks
172
 
 
 
173
 
174
def index_chunks(chunks: list[dict], source_name: str, source_type: str, embed_model):
    """Embed *chunks* and add them to the ChromaDB collection; record the source.

    Side effects: adds documents to the collection, increments
    st.session_state.total_chunks, and registers *source_name* in
    st.session_state.indexed_sources.
    """
    # Guard: an empty list previously raised IndexError at chunks[0] below
    # (and collection.add rejects empty batches). Nothing to do in that case.
    if not chunks:
        return
    collection = get_or_create_collection()
    texts = [c["text"] for c in chunks]
    embeddings = embed_model.encode(texts, batch_size=32, show_progress_bar=False).tolist()
    # Stable per-source id prefix so re-indexing the same source overwrites ids.
    prefix = hashlib.md5(source_name.encode()).hexdigest()[:8]
    ids, docs, metas, embeds = [], [], [], []
    for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
        ids.append(f"{prefix}_chunk_{i}")
        docs.append(chunk["text"])
        metas.append({"source": chunk["source"], "type": chunk["type"],
                      "page": chunk.get("page", 1), "timestamp": chunk.get("timestamp", "")})
        embeds.append(emb)
    collection.add(ids=ids, embeddings=embeds, documents=docs, metadatas=metas)
    st.session_state.total_chunks += len(chunks)
    st.session_state.indexed_sources[source_name] = {
        "type": source_type, "chunks": len(chunks),
        # Keep only the per-source extras (e.g. page/timestamp) as display meta.
        "meta": {k: v for k, v in chunks[0].items() if k not in ["text", "source", "type"]}
    }
192
 
193
 
194
+ # ─── Source-specific extractors ───────────────────────────────────────────────
195
+
196
+ ## PDF
197
def process_pdf(filename: str, pdf_bytes: bytes, embed_model):
    """Extract text from a PDF page by page, chunk it, and index the chunks.

    Returns the number of chunks indexed (0 for PDFs with no extractable
    text, e.g. scanned images — previously this crashed index_chunks).
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    chunks = []
    try:
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text("text").strip()
            if text:
                chunks.extend(chunk_text(text, filename, "pdf", {"page": page_num}))
    finally:
        # Close the document even if extraction raises (was leaked on error).
        doc.close()
    if not chunks:
        return 0
    index_chunks(chunks, filename, "pdf", embed_model)
    return len(chunks)
208
+
209
+
210
+ ## Web URL
211
def process_url(url: str, embed_model):
    """Fetch a web page, strip boilerplate markup, chunk the text, and index it.

    Returns (number of chunks indexed, display name derived from the URL).
    Raises ValueError when the page yields too little text to be useful.
    """
    req_headers = {"User-Agent": "Mozilla/5.0 (compatible; RAGBot/1.0)"}
    resp = requests.get(url, headers=req_headers, timeout=15)
    resp.raise_for_status()
    page = BeautifulSoup(resp.text, "html.parser")
    # Drop non-content elements before extracting visible text.
    boilerplate = ["script", "style", "nav", "footer", "header", "aside"]
    for element in page(boilerplate):
        element.decompose()
    body_text = re.sub(r'\s+', ' ', page.get_text(separator=" ", strip=True)).strip()
    if len(body_text) < 100:
        raise ValueError("Could not extract meaningful text from this URL.")
    parts = urlparse(url)
    source_name = parts.netloc + parts.path[:40]
    url_chunks = chunk_text(body_text, source_name, "url", {"page": 1})
    index_chunks(url_chunks, source_name, "url", embed_model)
    return len(url_chunks), source_name
228
+
229
+
230
+ ## YouTube
231
def get_youtube_id(url: str) -> str:
    """Extract the 11-character YouTube video ID from a URL.

    Supports watch?v=, youtu.be/, embed/, and (new) shorts/ and live/ forms.

    Raises:
        ValueError: If no video ID can be found in *url*.
    """
    patterns = [
        r'(?:v=|youtu\.be/)([a-zA-Z0-9_-]{11})',
        r'(?:embed/)([a-zA-Z0-9_-]{11})',
        r'(?:shorts/)([a-zA-Z0-9_-]{11})',
        r'(?:live/)([a-zA-Z0-9_-]{11})',
    ]
    for p in patterns:
        m = re.search(p, url)
        if m:
            return m.group(1)
    raise ValueError("Could not extract YouTube video ID from URL.")
241
+
242
+
243
def process_youtube(url: str, embed_model):
    """Fetch a YouTube transcript, bucket it into ~350-word timestamped chunks,
    and index them under the source name "youtube:<video_id>".

    Returns (number of chunks indexed, video_id).
    """
    video_id = get_youtube_id(url)
    # NOTE(review): get_transcript() was removed in youtube-transcript-api 1.0+
    # (replaced by an instance .fetch()); requirements pin only >=0.6.2 — verify
    # the installed version still exposes this classmethod.
    transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
    # Accumulate transcript entries into a word buffer; emit a chunk whenever
    # the buffer reaches ~350 words, labeling it with the start time of the
    # first entry that went into the buffer.
    chunks = []
    buffer_text = ""
    buffer_start = None  # start second of the current buffer, None = buffer empty
    word_count = 0
    for entry in transcript_list:
        if buffer_start is None:
            buffer_start = int(entry["start"])
        buffer_text += " " + entry["text"]
        word_count += len(entry["text"].split())
        if word_count >= 350:
            # mm:ss label for the chunk's first transcript entry
            ts = f"{buffer_start//60}:{buffer_start%60:02d}"
            chunks.append({
                "text": buffer_text.strip(),
                "source": f"youtube:{video_id}",
                "type": "youtube",
                "page": 1,
                "timestamp": ts
            })
            buffer_text = ""
            buffer_start = None
            word_count = 0
    # Flush any trailing partial buffer as a final chunk.
    if buffer_text.strip():
        # Falls back to "0:00" when buffer_start is falsy (0 or None); for
        # buffer_start == 0 the computed label would be "0:00" anyway.
        ts = f"{buffer_start//60}:{buffer_start%60:02d}" if buffer_start else "0:00"
        chunks.append({
            "text": buffer_text.strip(),
            "source": f"youtube:{video_id}",
            "type": "youtube",
            "page": 1,
            "timestamp": ts
        })
    index_chunks(chunks, f"youtube:{video_id}", "youtube", embed_model)
    return len(chunks), video_id
279
+
280
+
281
+ # ─── RAG Query with Chat Memory ───────────────────────────────────────────────
282
def rag_query(question: str, embed_model, top_k: int, api_key: str) -> tuple[str, list]:
    """Retrieve top-k chunks for *question*, then ask the Groq LLM with chat memory.

    Args:
        question: The user's current question.
        embed_model: SentenceTransformer used to embed the query.
        top_k: Number of chunks to retrieve from the vector store.
        api_key: Groq API key (bearer token).

    Returns:
        (answer text, list of retrieved chunk dicts with relevance scores).

    Raises:
        requests.HTTPError: On a non-2xx response from the Groq API.
    """
    collection = get_or_create_collection()
    q_emb = embed_model.encode(question).tolist()
    results = collection.query(query_embeddings=[q_emb], n_results=top_k)

    chunks = []
    for i in range(len(results["documents"][0])):
        dist = results["distances"][0][i]
        meta = results["metadatas"][0][i]
        chunks.append({
            "text": results["documents"][0][i],
            "source": meta["source"],
            "type": meta["type"],
            "page": meta.get("page", 1),
            "timestamp": meta.get("timestamp", ""),
            # cosine distance → percentage relevance
            "relevance": round((1 - dist) * 100, 1),
        })

    # BUG FIX: page defaults to 1 (truthy), so `page or timestamp` never showed
    # YouTube timestamps. Prefer the timestamp when one exists.
    context = "\n\n".join([
        f"[Source: {c['source']} | Type: {c['type']} | Page/Time: {c['timestamp'] or c['page']}]\n{c['text']}"
        for c in chunks
    ])

    # Build conversation history for multi-turn memory (last 3 user/assistant turns).
    history_text = ""
    if st.session_state.chat_history:
        recent = st.session_state.chat_history[-6:]  # last 3 turns
        for msg in recent:
            role = "User" if msg["role"] == "user" else "Assistant"
            history_text += f"{role}: {msg['content']}\n"

    prompt = f"""You are a helpful assistant that answers questions based on indexed documents. Use ONLY the context below to answer. Be concise and conversational. Always cite your source (filename, URL, or YouTube timestamp) inline. If the answer isn't in the context, say "I couldn't find that in the indexed sources."

Conversation so far:
{history_text if history_text else "(This is the start of the conversation)"}

Relevant context from documents:
{context}

User: {question}
Assistant:"""

    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    payload = {
        "model": "llama-3.3-70b-versatile",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 700,
        "temperature": 0.3,
    }
    r = requests.post("https://api.groq.com/openai/v1/chat/completions",
                      headers=headers, json=payload, timeout=30)
    r.raise_for_status()
    answer = r.json()["choices"][0]["message"]["content"]
    return answer, chunks
336
 
337
 
 
 
 
 
 
338
  # ─── Sidebar ──────────────────────────────────────────────────────────────────
339
  with st.sidebar:
340
+ st.markdown("## 🤖 RAG Chat Assistant")
341
+ st.markdown("<div style='color:#374151;font-size:0.78rem'>PDF · Web · YouTube → Chat</div>", unsafe_allow_html=True)
342
  st.markdown("---")
343
 
344
  env_key = os.environ.get("GROQ_API_KEY", "")
345
+ api_key = env_key if env_key else st.text_input(
346
+ "🔑 Groq API Key", type="password", placeholder="gsk_...",
347
+ help="Free at console.groq.com"
348
+ )
349
+ if not env_key and not api_key:
350
+ st.caption("Get free key → [console.groq.com](https://console.groq.com)")
 
351
 
352
  st.markdown("---")
353
+ st.markdown("<div class='section-label'>Indexed Sources</div>", unsafe_allow_html=True)
354
 
355
+ if st.session_state.indexed_sources:
356
+ for name, info in st.session_state.indexed_sources.items():
357
+ badge_class = f"badge-{info['type']}"
358
+ icon = "📄" if info['type'] == 'pdf' else "🌐" if info['type'] == 'url' else "▶️"
359
+ label = info['type'].upper()
360
  st.markdown(f"""
361
+ <div class='source-card'>
362
+ <div>
363
+ <div class='source-name'>{icon} {name}</div>
364
+ <div class='source-meta'>{info['chunks']} chunks</div>
365
  </div>
366
+ <div class='source-type-badge {badge_class}'>{label}</div>
367
  </div>""", unsafe_allow_html=True)
368
 
369
+ st.markdown("")
370
+ col1, col2 = st.columns(2)
371
+ if col1.button("🗑️ Clear index", use_container_width=True):
372
+ for k in ["indexed_sources", "chroma_collection", "chroma_client", "total_chunks"]:
373
+ del st.session_state[k]
374
+ st.rerun()
375
+ if col2.button("💬 Clear chat", use_container_width=True):
376
+ st.session_state.chat_history = []
377
  st.rerun()
378
  else:
379
+ st.markdown("<div style='color:#374151;font-size:0.82rem'>Nothing indexed yet.</div>", unsafe_allow_html=True)
380
 
381
  st.markdown("---")
382
  st.markdown("""
383
+ <div style='font-size:0.75rem;color:#374151;line-height:2'>
384
  <b style='color:#4b5563'>Stack</b><br>
385
+ 📄 PDF: PyMuPDF<br>
386
+ 🌐 Web: BeautifulSoup4<br>
387
+ ▶️ YouTube: youtube-transcript-api<br>
388
  🔢 Embeddings: all-MiniLM-L6-v2<br>
389
+ 🗄️ Vector DB: ChromaDB<br>
390
+ 🧠 LLM: Groq · Llama 3.3 70B
391
+ </div>""", unsafe_allow_html=True)
 
 
392
 
393
 
394
+ # ─── Main UI ──────────────────────────────────────────────────────────────────
395
  st.markdown("""
396
  <div class='hero'>
397
+ <h1>🤖 RAG Chat Assistant</h1>
398
+ <p>Index PDFs · Web pages · YouTube videos — then have a multi-turn conversation across all of them</p>
399
  </div>
400
  """, unsafe_allow_html=True)
401
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
  with st.spinner("⚙️ Loading embedding model..."):
403
  embed_model = load_embed_model()
404
 
405
+ # ════════════════════════════════════════════════════════
406
+ # INGEST PANEL
407
+ # ════════════════════════════════════════════════════════
408
+ with st.expander("➕ Add a new source (PDF / Web URL / YouTube)", expanded=len(st.session_state.indexed_sources) == 0):
409
+ tab_pdf, tab_url, tab_yt = st.tabs(["📄 PDF Upload", "🌐 Web URL", "▶️ YouTube"])
410
+
411
+ # ── PDF Tab ──
412
+ with tab_pdf:
413
+ uploaded = st.file_uploader("Upload PDF files", type=["pdf"], accept_multiple_files=True, label_visibility="collapsed")
414
+ if uploaded:
415
+ new = [f for f in uploaded if f.name not in st.session_state.indexed_sources]
416
+ if new:
417
+ if st.button(f"⚡ Index {len(new)} PDF(s)", type="primary", key="idx_pdf"):
418
+ for f in new:
419
+ with st.spinner(f"Indexing {f.name}..."):
420
+ n = process_pdf(f.name, f.read(), embed_model)
421
+ st.success(f"✅ {f.name} → {n} chunks")
422
+ st.rerun()
423
+ else:
424
+ st.info("Already indexed.")
425
+
426
+ # ── URL Tab ──
427
+ with tab_url:
428
+ url_input = st.text_input("Paste a public webpage URL", placeholder="https://en.wikipedia.org/wiki/...", label_visibility="collapsed")
429
+ if st.button("⚡ Fetch & Index URL", type="primary", key="idx_url"):
430
+ if url_input:
431
+ with st.spinner(f"Fetching and indexing {url_input}..."):
432
+ try:
433
+ n, source_name = process_url(url_input, embed_model)
434
+ st.success(f"✅ {source_name} → {n} chunks indexed")
435
+ st.rerun()
436
+ except Exception as e:
437
+ st.error(f"❌ {str(e)}")
438
+ else:
439
+ st.warning("Please enter a URL.")
440
+
441
+ # ── YouTube Tab ──
442
+ with tab_yt:
443
+ yt_input = st.text_input("Paste a YouTube video URL", placeholder="https://www.youtube.com/watch?v=...", label_visibility="collapsed")
444
+ st.caption("Works with any video that has English captions/subtitles enabled.")
445
+ if st.button("⚡ Fetch Transcript & Index", type="primary", key="idx_yt"):
446
+ if yt_input:
447
+ with st.spinner("Fetching YouTube transcript..."):
448
+ try:
449
+ n, vid_id = process_youtube(yt_input, embed_model)
450
+ st.success(f"✅ youtube:{vid_id} → {n} chunks indexed")
451
+ st.rerun()
452
+ except Exception as e:
453
+ st.error(f"❌ {str(e)}")
454
+ else:
455
+ st.warning("Please enter a YouTube URL.")
456
+
457
+ # ════════════════════════════════════════════════════════
458
+ # STATS
459
+ # ════════════════════════════════════════════════════════
460
+ if st.session_state.indexed_sources:
461
+ pdf_count = sum(1 for s in st.session_state.indexed_sources.values() if s["type"] == "pdf")
462
+ url_count = sum(1 for s in st.session_state.indexed_sources.values() if s["type"] == "url")
463
+ yt_count = sum(1 for s in st.session_state.indexed_sources.values() if s["type"] == "youtube")
464
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
465
  st.markdown(f"""
466
  <div class='stat-row'>
467
+ <div class='stat-box'><div class='stat-val'>{pdf_count}</div><div class='stat-lbl'>PDFs</div></div>
468
+ <div class='stat-box'><div class='stat-val'>{url_count}</div><div class='stat-lbl'>Web Pages</div></div>
469
+ <div class='stat-box'><div class='stat-val'>{yt_count}</div><div class='stat-lbl'>YouTube Videos</div></div>
470
+ <div class='stat-box'><div class='stat-val'>{st.session_state.total_chunks}</div><div class='stat-lbl'>Total Chunks</div></div>
471
+ <div class='stat-box'><div class='stat-val'>{len(st.session_state.chat_history)}</div><div class='stat-lbl'>Messages</div></div>
472
  </div>
473
  """, unsafe_allow_html=True)
474
 
475
+ # ════════════════════════════════════════════════════════
476
+ # CHAT UI
477
+ # ════════════════════════════════════════════════════════
478
+ if not st.session_state.indexed_sources:
479
+ st.markdown("""
480
+ <div class='empty-chat'>
481
+ <div style='font-size:2.5rem;margin-bottom:12px'>📂</div>
482
+ <p style='color:#4b5563'>Add at least one source above to start chatting.<br>
483
+ Try a PDF, a Wikipedia URL, or a YouTube video.</p>
484
+ </div>""", unsafe_allow_html=True)
485
+ st.stop()
 
486
 
487
+ if not api_key:
488
+ st.warning("👈 Add your Groq API key in the sidebar to start chatting.")
489
+ st.stop()
490
 
491
+ st.markdown("---")
492
+ st.markdown("<div class='section-label'>Conversation</div>", unsafe_allow_html=True)
 
 
493
 
494
+ # Render chat history
495
+ if not st.session_state.chat_history:
496
+ st.markdown("""
497
+ <div class='empty-chat' style='padding:28px'>
498
+ <p style='color:#4b5563;margin:0'>Ask anything about your indexed sources below 👇</p>
499
+ </div>""", unsafe_allow_html=True)
500
 
501
+ for msg in st.session_state.chat_history:
502
+ if msg["role"] == "user":
503
+ st.markdown(f"""
504
+ <div class='chat-user'>
505
+ <div class='chat-user-bubble'>{msg['content']}</div>
506
+ </div>""", unsafe_allow_html=True)
507
+ else:
508
+ source_chips = ""
509
+ if msg.get("sources"):
510
+ for s in msg["sources"][:4]:
511
+ label = f"{s['source']} · {s['relevance']}%"
512
+ if s.get("timestamp"):
513
+ label += f" @ {s['timestamp']}"
514
+ source_chips += f"<span class='chat-source-chip'>{label}</span>"
515
+
516
+ st.markdown(f"""
517
+ <div class='chat-assistant'>
518
+ <div class='chat-avatar'>🤖</div>
519
+ <div class='chat-assistant-bubble'>
520
+ {msg['content']}
521
+ {f"<div class='chat-sources'>{source_chips}</div>" if source_chips else ""}
522
+ </div>
523
+ </div>""", unsafe_allow_html=True)
524
 
525
+ if msg.get("sources"):
526
+ with st.expander("🔍 View retrieved chunks", expanded=False):
527
+ for chunk in msg["sources"]:
528
+ icon = "📄" if chunk["type"] == "pdf" else "🌐" if chunk["type"] == "url" else "▶️"
529
+ detail = f"Page {chunk['page']}" if chunk["type"] != "youtube" else f"@ {chunk['timestamp']}"
530
  st.markdown(f"""
531
  <div class='chunk-card'>
532
+ <div class='chunk-header'>
533
+ <div class='chunk-src'>{icon} {chunk['source']}</div>
534
+ <div class='chunk-score'>{detail} · {chunk['relevance']}% match</div>
 
 
 
 
 
 
535
  </div>
536
+ <div class='chunk-text'>{chunk['text'][:400]}{'...' if len(chunk['text']) > 400 else ''}</div>
537
+ </div>""", unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
538
 
539
+ # Chat input
540
+ st.markdown("")
541
+ col_input, col_k, col_btn = st.columns([6, 1, 1])
542
+ with col_input:
543
+ user_input = st.text_input("", placeholder="Ask something about your indexed sources...", label_visibility="collapsed", key="chat_input")
544
+ with col_k:
545
+ top_k = st.selectbox("K", [2, 3, 4, 5], index=1, label_visibility="collapsed")
546
+ with col_btn:
547
+ send = st.button("Send ➤", type="primary", use_container_width=True)
548
+
549
+ if send and user_input:
550
+ # Add user message
551
+ st.session_state.chat_history.append({"role": "user", "content": user_input})
552
+
553
+ with st.spinner("Thinking..."):
554
+ try:
555
+ answer, chunks = rag_query(user_input, embed_model, top_k, api_key)
556
+ st.session_state.chat_history.append({
557
+ "role": "assistant",
558
+ "content": answer,
559
+ "sources": chunks
560
+ })
561
+ except requests.HTTPError as e:
562
+ st.session_state.chat_history.append({
563
+ "role": "assistant",
564
+ "content": f"❌ API error: {str(e)}",
565
+ "sources": []
566
+ })
567
+ st.rerun()
upgraded_app.py ADDED
@@ -0,0 +1,567 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import chromadb
3
+ from sentence_transformers import SentenceTransformer
4
+ import fitz # PyMuPDF
5
+ import os
6
+ import requests
7
+ import hashlib
8
+ import re
9
+ from urllib.parse import urlparse, parse_qs
10
+ from youtube_transcript_api import YouTubeTranscriptApi
11
+ from bs4 import BeautifulSoup
12
+
13
+ # ─── Page Config ──────────────────────────────────────────────────────────────
14
+ st.set_page_config(
15
+ page_title="RAG Assistant · Chat",
16
+ page_icon="🤖",
17
+ layout="wide",
18
+ initial_sidebar_state="expanded"
19
+ )
20
+
21
+ # ─── CSS ──────────────────────────────────────────────────────────────────────
22
+ st.markdown("""
23
+ <style>
24
+ @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@300;400;500;600&family=IBM+Plex+Mono:wght@400;500&display=swap');
25
+
26
+ html, body, [class*="css"] { font-family: 'IBM Plex Sans', sans-serif; }
27
+ .main { background-color: #0b0f1a; }
28
+
29
+ .hero {
30
+ background: linear-gradient(160deg, #0d1424 0%, #0b0f1a 100%);
31
+ border: 1px solid #1e2a3e;
32
+ border-top: 3px solid #22d3ee;
33
+ border-radius: 12px;
34
+ padding: 24px 28px;
35
+ margin-bottom: 20px;
36
+ }
37
+ .hero h1 { font-size: 1.7rem; font-weight: 600; color: #e2e8f0; margin: 0 0 4px 0; }
38
+ .hero p { color: #64748b; font-size: 0.88rem; margin: 0; }
39
+
40
+ /* Source type tabs */
41
+ .source-tabs { display: flex; gap: 8px; margin-bottom: 16px; }
42
+ .source-tab {
43
+ flex: 1; padding: 10px; text-align: center;
44
+ background: #0d1424; border: 1px solid #1e2a3e;
45
+ border-radius: 8px; font-size: 0.82rem; color: #64748b; cursor: pointer;
46
+ }
47
+ .source-tab.active { border-color: #22d3ee; color: #22d3ee; background: rgba(34,211,238,0.07); }
48
+
49
+ /* Indexed source cards */
50
+ .source-card {
51
+ background: #0d1424; border: 1px solid #1e2a3e;
52
+ border-radius: 8px; padding: 10px 14px; margin: 6px 0;
53
+ display: flex; align-items: center; justify-content: space-between;
54
+ }
55
+ .source-name { font-size: 0.82rem; color: #e2e8f0; font-weight: 500; white-space: nowrap; overflow: hidden; text-overflow: ellipsis; max-width: 160px; }
56
+ .source-meta { font-family: 'IBM Plex Mono', monospace; font-size: 0.68rem; color: #475569; }
57
+ .source-type-badge {
58
+ font-size: 0.68rem; padding: 2px 8px; border-radius: 20px;
59
+ font-family: 'IBM Plex Mono', monospace; white-space: nowrap;
60
+ }
61
+ .badge-pdf { background: rgba(99,102,241,0.12); color: #a5b4fc; border: 1px solid rgba(99,102,241,0.25); }
62
+ .badge-url { background: rgba(34,197,94,0.1); color: #4ade80; border: 1px solid rgba(34,197,94,0.25); }
63
+ .badge-yt { background: rgba(239,68,68,0.1); color: #f87171; border: 1px solid rgba(239,68,68,0.25); }
64
+
65
+ /* Chat messages */
66
+ .chat-user {
67
+ display: flex; justify-content: flex-end; margin: 10px 0;
68
+ }
69
+ .chat-user-bubble {
70
+ background: rgba(34,211,238,0.1); border: 1px solid rgba(34,211,238,0.2);
71
+ border-radius: 16px 16px 4px 16px;
72
+ padding: 12px 18px; max-width: 70%;
73
+ color: #e2e8f0; font-size: 0.92rem; line-height: 1.6;
74
+ }
75
+ .chat-assistant {
76
+ display: flex; justify-content: flex-start; margin: 10px 0; gap: 10px;
77
+ }
78
+ .chat-avatar {
79
+ width: 32px; height: 32px; border-radius: 50%;
80
+ background: linear-gradient(135deg, #22d3ee, #6366f1);
81
+ display: flex; align-items: center; justify-content: center;
82
+ font-size: 0.9rem; flex-shrink: 0; margin-top: 2px;
83
+ }
84
+ .chat-assistant-bubble {
85
+ background: #0d1424; border: 1px solid #1e2a3e;
86
+ border-radius: 4px 16px 16px 16px;
87
+ padding: 14px 18px; max-width: 75%;
88
+ color: #e2e8f0; font-size: 0.92rem; line-height: 1.7;
89
+ }
90
+ .chat-sources {
91
+ margin-top: 10px; padding-top: 10px;
92
+ border-top: 1px solid #1e2a3e;
93
+ }
94
+ .chat-source-chip {
95
+ display: inline-block; font-size: 0.72rem;
96
+ font-family: 'IBM Plex Mono', monospace;
97
+ background: #0b0f1a; border: 1px solid #1e2a3e;
98
+ border-radius: 20px; padding: 2px 10px; margin: 3px 3px 0 0;
99
+ color: #475569;
100
+ }
101
+
102
+ /* Chunk expander styling */
103
+ .chunk-card {
104
+ background: #0b0f1a; border: 1px solid #1e2a3e;
105
+ border-radius: 8px; padding: 12px 16px; margin: 6px 0;
106
+ }
107
+ .chunk-header { display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px; }
108
+ .chunk-src { font-size: 0.75rem; font-weight: 600; color: #22d3ee; text-transform: uppercase; letter-spacing: 0.04em; }
109
+ .chunk-score { font-family: 'IBM Plex Mono', monospace; font-size: 0.72rem; color: #475569; }
110
+ .chunk-text { font-size: 0.84rem; color: #94a3b8; line-height: 1.6; }
111
+
112
+ .stat-row { display: flex; gap: 8px; margin: 12px 0; }
113
+ .stat-box { flex: 1; background: #0d1424; border: 1px solid #1e2a3e; border-radius: 8px; padding: 10px; text-align: center; }
114
+ .stat-val { font-size: 1.2rem; font-weight: 600; color: #22d3ee; }
115
+ .stat-lbl { font-size: 0.68rem; color: #475569; margin-top: 2px; }
116
+
117
+ .section-label {
118
+ font-size: 0.68rem; text-transform: uppercase; letter-spacing: 0.1em;
119
+ color: #374151; font-weight: 600; margin: 16px 0 8px 0;
120
+ }
121
+
122
+ section[data-testid="stSidebar"] { background-color: #080c14; border-right: 1px solid #131c2e; }
123
+
124
+ .empty-chat {
125
+ text-align: center; padding: 48px 24px;
126
+ color: #374151; border: 2px dashed #1e2a3e; border-radius: 12px;
127
+ }
128
+ </style>
129
+ """, unsafe_allow_html=True)
130
+
131
+
132
+ # ─── Session State ────────────────────────────────────────────────────────────
133
+ defaults = {
134
+ "indexed_sources": {}, # name → {type, chunks, meta}
135
+ "chroma_collection": None,
136
+ "chroma_client": None,
137
+ "total_chunks": 0,
138
+ "chat_history": [], # [{role, content, sources}]
139
+ }
140
+ for k, v in defaults.items():
141
+ if k not in st.session_state:
142
+ st.session_state[k] = v
143
+
144
+
145
+ # ─── Helpers ──────────────────────────────────────────────────────────────────
146
@st.cache_resource(show_spinner=False)
def load_embed_model():
    """Load the sentence-transformer embedding model once per process.

    ``st.cache_resource`` memoizes the model across Streamlit reruns, so the
    slow download/initialization only happens on first use.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')
149
+
150
+
151
def get_or_create_collection():
    """Return the session's ChromaDB collection, creating it lazily on first use.

    Client and collection live in ``st.session_state`` so the in-memory index
    survives Streamlit reruns within a user session. The collection uses
    cosine distance (``hnsw:space``), which ``rag_query`` converts to a
    relevance percentage via ``1 - distance``.
    """
    if st.session_state.chroma_client is None:
        st.session_state.chroma_client = chromadb.Client()
        # Client and collection are always created together, so checking only
        # chroma_client above is sufficient.
        st.session_state.chroma_collection = st.session_state.chroma_client.get_or_create_collection(
            name="rag_store", metadata={"hnsw:space": "cosine"}
        )
    return st.session_state.chroma_collection
158
+
159
+
160
def chunk_text(text: str, source_name: str, source_type: str, meta: dict,
               chunk_size: int = 400, overlap: int = 60) -> list[dict]:
    """Split *text* into overlapping word-window chunks ready for indexing.

    Args:
        text: Raw text to split.
        source_name: Display name of the originating source.
        source_type: One of "pdf", "url", "youtube".
        meta: Extra metadata (e.g. ``page`` or ``timestamp``) merged into
            every chunk dict.
        chunk_size: Window length in words.
        overlap: Number of words shared between consecutive windows.

    Returns:
        List of chunk dicts ``{"text", "source", "type", **meta}``. Fragments
        of 60 characters or fewer are dropped — too little signal to embed.

    Raises:
        ValueError: if ``overlap >= chunk_size`` (the window would never
            advance, looping forever — previously an undetected hang).
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    chunks = []
    step = chunk_size - overlap  # guaranteed positive by the guard above
    start = 0
    while start < len(words):
        chunk_str = " ".join(words[start:start + chunk_size]).strip()
        if len(chunk_str) > 60:
            chunks.append({"text": chunk_str, "source": source_name, "type": source_type, **meta})
        start += step
    return chunks
172
+
173
+
174
def index_chunks(chunks: list[dict], source_name: str, source_type: str, embed_model):
    """Embed *chunks* and add them to the ChromaDB collection.

    Also updates the session-state bookkeeping the sidebar renders from:
    ``total_chunks`` and the ``indexed_sources`` registry.

    Args:
        chunks: Chunk dicts as produced by ``chunk_text``.
        source_name: Registry key / display name of the source.
        source_type: One of "pdf", "url", "youtube".
        embed_model: SentenceTransformer used to embed chunk texts.

    Raises:
        ValueError: if *chunks* is empty — previously this crashed with an
            IndexError on ``chunks[0]`` (e.g. a scanned PDF with no
            extractable text).
    """
    if not chunks:
        raise ValueError(f"No indexable text found in '{source_name}'.")
    collection = get_or_create_collection()
    texts = [c["text"] for c in chunks]
    embeddings = embed_model.encode(texts, batch_size=32, show_progress_bar=False).tolist()
    # Per-source hash prefix keeps chunk IDs unique across the collection.
    prefix = hashlib.md5(source_name.encode()).hexdigest()[:8]
    ids, docs, metas, embeds = [], [], [], []
    for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
        ids.append(f"{prefix}_chunk_{i}")
        docs.append(chunk["text"])
        metas.append({"source": chunk["source"], "type": chunk["type"],
                      "page": chunk.get("page", 1), "timestamp": chunk.get("timestamp", "")})
        embeds.append(emb)
    collection.add(ids=ids, embeddings=embeds, documents=docs, metadatas=metas)
    st.session_state.total_chunks += len(chunks)
    st.session_state.indexed_sources[source_name] = {
        "type": source_type, "chunks": len(chunks),
        # Safe: chunks is non-empty (guard above).
        "meta": {k: v for k, v in chunks[0].items() if k not in ["text", "source", "type"]}
    }
192
+
193
+
194
+ # ─── Source-specific extractors ───────────────────────────────────────────────
195
+
196
+ ## PDF
197
def process_pdf(filename: str, pdf_bytes: bytes, embed_model):
    """Extract text from a PDF page by page, chunk it, and index it.

    Each chunk records the 1-based page number it came from so answers can
    cite a page. Returns the number of chunks indexed under *filename*.
    """
    pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    all_chunks = []
    page_no = 0
    for page in pdf_doc:
        page_no += 1
        page_text = page.get_text("text").strip()
        if not page_text:
            continue  # nothing extractable on this page (e.g. pure image)
        all_chunks.extend(chunk_text(page_text, filename, "pdf", {"page": page_no}))
    pdf_doc.close()
    index_chunks(all_chunks, filename, "pdf", embed_model)
    return len(all_chunks)
208
+
209
+
210
+ ## Web URL
211
def process_url(url: str, embed_model):
    """Fetch a webpage, strip boilerplate markup, and index its visible text.

    Returns ``(number_of_chunks, derived_source_name)`` where the source name
    is the host plus up to 40 characters of the path.

    Raises:
        ValueError: when fewer than 100 characters of text survive cleanup.
        requests.HTTPError: when the page cannot be fetched.
    """
    response = requests.get(
        url,
        headers={"User-Agent": "Mozilla/5.0 (compatible; RAGBot/1.0)"},
        timeout=15,
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    # Drop page chrome so only article-like body text remains.
    for noise in soup(["script", "style", "nav", "footer", "header", "aside"]):
        noise.decompose()

    cleaned = re.sub(r'\s+', ' ', soup.get_text(separator=" ", strip=True)).strip()
    if len(cleaned) < 100:
        raise ValueError("Could not extract meaningful text from this URL.")

    parts = urlparse(url)
    source_name = parts.netloc + parts.path[:40]
    url_chunks = chunk_text(cleaned, source_name, "url", {"page": 1})
    index_chunks(url_chunks, source_name, "url", embed_model)
    return len(url_chunks), source_name
228
+
229
+
230
+ ## YouTube
231
def get_youtube_id(url: str) -> str:
    """Extract the 11-character YouTube video ID from a URL.

    Supports watch (``?v=``), ``youtu.be`` short links, ``/embed/``, and —
    newly — ``/shorts/`` and ``/live/`` URL forms (backward compatible).

    Raises:
        ValueError: if no video ID can be found in *url*.
    """
    patterns = [
        r'(?:v=|youtu\.be/)([a-zA-Z0-9_-]{11})',
        r'(?:embed/)([a-zA-Z0-9_-]{11})',
        r'(?:shorts/|live/)([a-zA-Z0-9_-]{11})',  # newer YouTube URL formats
    ]
    for p in patterns:
        m = re.search(p, url)
        if m:
            return m.group(1)
    raise ValueError("Could not extract YouTube video ID from URL.")
241
+
242
+
243
def process_youtube(url: str, embed_model):
    """Fetch a YouTube video's transcript, chunk it with timestamps, index it.

    Transcript entries are accumulated into ~350-word chunks; each chunk is
    stamped with the mm:ss time of its first entry so answers can cite a
    point in the video. Returns ``(number_of_chunks, video_id)``.

    Raises:
        ValueError: if the video ID cannot be parsed from *url*.
    """
    video_id = get_youtube_id(url)
    transcript_list = YouTubeTranscriptApi.get_transcript(video_id)

    def _make_chunk(parts: list, start_sec: int) -> dict:
        # mm:ss label derived from the chunk's first transcript entry.
        ts = f"{start_sec // 60}:{start_sec % 60:02d}"
        return {
            "text": " ".join(parts).strip(),
            "source": f"youtube:{video_id}",
            "type": "youtube",
            "page": 1,
            "timestamp": ts,
        }

    chunks = []
    buffer_parts = []    # entry texts accumulated for the current chunk
    buffer_start = None  # start second of the current chunk's first entry
    word_count = 0
    for entry in transcript_list:
        if buffer_start is None:
            buffer_start = int(entry["start"])
        # Collect parts and join once per chunk instead of quadratic `+=`.
        buffer_parts.append(entry["text"])
        word_count += len(entry["text"].split())
        if word_count >= 350:
            chunks.append(_make_chunk(buffer_parts, buffer_start))
            buffer_parts = []
            buffer_start = None
            word_count = 0
    if buffer_parts and " ".join(buffer_parts).strip():
        # BUG FIX: the old `if buffer_start else "0:00"` treated a legitimate
        # start time of 0 seconds as missing; test against None instead.
        chunks.append(_make_chunk(buffer_parts, buffer_start if buffer_start is not None else 0))
    index_chunks(chunks, f"youtube:{video_id}", "youtube", embed_model)
    return len(chunks), video_id
279
+
280
+
281
+ # ─── RAG Query with Chat Memory ───────────────────────────────────────────────
282
def rag_query(question: str, embed_model, top_k: int, api_key: str) -> tuple[str, list]:
    """Answer *question* with retrieval-augmented generation over the indexed store.

    Embeds the question, retrieves the ``top_k`` nearest chunks from ChromaDB,
    builds a prompt containing recent chat history (multi-turn memory) plus the
    retrieved context, and calls Groq's OpenAI-compatible chat-completions API.

    Args:
        question: The user's current question.
        embed_model: SentenceTransformer used to embed the question.
        top_k: Number of chunks to retrieve.
        api_key: Groq API key for the completion call.

    Returns:
        (answer, chunks): the LLM answer and the retrieved chunk dicts
        (text / source / type / page / timestamp / relevance) for the UI.

    Raises:
        requests.HTTPError: if the Groq API returns an error status.
    """
    collection = get_or_create_collection()
    q_emb = embed_model.encode(question).tolist()
    results = collection.query(query_embeddings=[q_emb], n_results=top_k)

    chunks = []
    for i in range(len(results["documents"][0])):
        dist = results["distances"][0][i]
        meta = results["metadatas"][0][i]
        chunks.append({
            "text": results["documents"][0][i],
            "source": meta["source"],
            "type": meta["type"],
            "page": meta.get("page", 1),
            "timestamp": meta.get("timestamp", ""),
            # Cosine distance -> similarity percentage for display.
            "relevance": round((1 - dist) * 100, 1),
        })

    # BUG FIX: the old expression `c['page'] or c['timestamp']` always chose
    # the page (page defaults to 1 and is never falsy), so YouTube chunks lost
    # their timestamps in the prompt. Choose the field by source type instead.
    context = "\n\n".join([
        f"[Source: {c['source']} | Type: {c['type']} | "
        f"Page/Time: {c['timestamp'] if c['type'] == 'youtube' and c['timestamp'] else c['page']}]\n{c['text']}"
        for c in chunks
    ])

    # Include the last few turns so follow-up questions can resolve references.
    history_text = ""
    if st.session_state.chat_history:
        recent = st.session_state.chat_history[-6:]  # last 3 turns
        for msg in recent:
            role = "User" if msg["role"] == "user" else "Assistant"
            history_text += f"{role}: {msg['content']}\n"

    prompt = f"""You are a helpful assistant that answers questions based on indexed documents. Use ONLY the context below to answer. Be concise and conversational. Always cite your source (filename, URL, or YouTube timestamp) inline. If the answer isn't in the context, say "I couldn't find that in the indexed sources."

Conversation so far:
{history_text if history_text else "(This is the start of the conversation)"}

Relevant context from documents:
{context}

User: {question}
Assistant:"""

    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    payload = {
        "model": "llama-3.3-70b-versatile",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 700,
        "temperature": 0.3,
    }
    r = requests.post("https://api.groq.com/openai/v1/chat/completions",
                      headers=headers, json=payload, timeout=30)
    r.raise_for_status()
    answer = r.json()["choices"][0]["message"]["content"]
    return answer, chunks
336
+
337
+
338
+ # ─── Sidebar ──────────────────────────────────────────────────────────────────
339
+ with st.sidebar:
340
+ st.markdown("## 🤖 RAG Chat Assistant")
341
+ st.markdown("<div style='color:#374151;font-size:0.78rem'>PDF · Web · YouTube → Chat</div>", unsafe_allow_html=True)
342
+ st.markdown("---")
343
+
344
+ env_key = os.environ.get("GROQ_API_KEY", "")
345
+ api_key = env_key if env_key else st.text_input(
346
+ "🔑 Groq API Key", type="password", placeholder="gsk_...",
347
+ help="Free at console.groq.com"
348
+ )
349
+ if not env_key and not api_key:
350
+ st.caption("Get free key → [console.groq.com](https://console.groq.com)")
351
+
352
+ st.markdown("---")
353
+ st.markdown("<div class='section-label'>Indexed Sources</div>", unsafe_allow_html=True)
354
+
355
+ if st.session_state.indexed_sources:
356
+ for name, info in st.session_state.indexed_sources.items():
357
+ badge_class = f"badge-{info['type']}"
358
+ icon = "📄" if info['type'] == 'pdf' else "🌐" if info['type'] == 'url' else "▶️"
359
+ label = info['type'].upper()
360
+ st.markdown(f"""
361
+ <div class='source-card'>
362
+ <div>
363
+ <div class='source-name'>{icon} {name}</div>
364
+ <div class='source-meta'>{info['chunks']} chunks</div>
365
+ </div>
366
+ <div class='source-type-badge {badge_class}'>{label}</div>
367
+ </div>""", unsafe_allow_html=True)
368
+
369
+ st.markdown("")
370
+ col1, col2 = st.columns(2)
371
+ if col1.button("🗑️ Clear index", use_container_width=True):
372
+ for k in ["indexed_sources", "chroma_collection", "chroma_client", "total_chunks"]:
373
+ del st.session_state[k]
374
+ st.rerun()
375
+ if col2.button("💬 Clear chat", use_container_width=True):
376
+ st.session_state.chat_history = []
377
+ st.rerun()
378
+ else:
379
+ st.markdown("<div style='color:#374151;font-size:0.82rem'>Nothing indexed yet.</div>", unsafe_allow_html=True)
380
+
381
+ st.markdown("---")
382
+ st.markdown("""
383
+ <div style='font-size:0.75rem;color:#374151;line-height:2'>
384
+ <b style='color:#4b5563'>Stack</b><br>
385
+ 📄 PDF: PyMuPDF<br>
386
+ 🌐 Web: BeautifulSoup4<br>
387
+ ▶️ YouTube: youtube-transcript-api<br>
388
+ 🔢 Embeddings: all-MiniLM-L6-v2<br>
389
+ 🗄️ Vector DB: ChromaDB<br>
390
+ 🧠 LLM: Groq · Llama 3.3 70B
391
+ </div>""", unsafe_allow_html=True)
392
+
393
+
394
+ # ─── Main UI ──────────────────────────────────────────────────────────────────
395
+ st.markdown("""
396
+ <div class='hero'>
397
+ <h1>🤖 RAG Chat Assistant</h1>
398
+ <p>Index PDFs · Web pages · YouTube videos — then have a multi-turn conversation across all of them</p>
399
+ </div>
400
+ """, unsafe_allow_html=True)
401
+
402
+ with st.spinner("⚙️ Loading embedding model..."):
403
+ embed_model = load_embed_model()
404
+
405
+ # ════════════════════════════════════════════════════════
406
+ # INGEST PANEL
407
+ # ════════════════════════════════════════════════════════
408
+ with st.expander("➕ Add a new source (PDF / Web URL / YouTube)", expanded=len(st.session_state.indexed_sources) == 0):
409
+ tab_pdf, tab_url, tab_yt = st.tabs(["📄 PDF Upload", "🌐 Web URL", "▶️ YouTube"])
410
+
411
+ # ── PDF Tab ──
412
+ with tab_pdf:
413
+ uploaded = st.file_uploader("Upload PDF files", type=["pdf"], accept_multiple_files=True, label_visibility="collapsed")
414
+ if uploaded:
415
+ new = [f for f in uploaded if f.name not in st.session_state.indexed_sources]
416
+ if new:
417
+ if st.button(f"⚡ Index {len(new)} PDF(s)", type="primary", key="idx_pdf"):
418
+ for f in new:
419
+ with st.spinner(f"Indexing {f.name}..."):
420
+ n = process_pdf(f.name, f.read(), embed_model)
421
+ st.success(f"✅ {f.name} → {n} chunks")
422
+ st.rerun()
423
+ else:
424
+ st.info("Already indexed.")
425
+
426
+ # ── URL Tab ──
427
+ with tab_url:
428
+ url_input = st.text_input("Paste a public webpage URL", placeholder="https://en.wikipedia.org/wiki/...", label_visibility="collapsed")
429
+ if st.button("⚡ Fetch & Index URL", type="primary", key="idx_url"):
430
+ if url_input:
431
+ with st.spinner(f"Fetching and indexing {url_input}..."):
432
+ try:
433
+ n, source_name = process_url(url_input, embed_model)
434
+ st.success(f"✅ {source_name} → {n} chunks indexed")
435
+ st.rerun()
436
+ except Exception as e:
437
+ st.error(f"❌ {str(e)}")
438
+ else:
439
+ st.warning("Please enter a URL.")
440
+
441
+ # ── YouTube Tab ──
442
+ with tab_yt:
443
+ yt_input = st.text_input("Paste a YouTube video URL", placeholder="https://www.youtube.com/watch?v=...", label_visibility="collapsed")
444
+ st.caption("Works with any video that has English captions/subtitles enabled.")
445
+ if st.button("⚡ Fetch Transcript & Index", type="primary", key="idx_yt"):
446
+ if yt_input:
447
+ with st.spinner("Fetching YouTube transcript..."):
448
+ try:
449
+ n, vid_id = process_youtube(yt_input, embed_model)
450
+ st.success(f"✅ youtube:{vid_id} → {n} chunks indexed")
451
+ st.rerun()
452
+ except Exception as e:
453
+ st.error(f"❌ {str(e)}")
454
+ else:
455
+ st.warning("Please enter a YouTube URL.")
456
+
457
+ # ════════════════════════════════════════════════════════
458
+ # STATS
459
+ # ════════════════════════════════════════════════════════
460
+ if st.session_state.indexed_sources:
461
+ pdf_count = sum(1 for s in st.session_state.indexed_sources.values() if s["type"] == "pdf")
462
+ url_count = sum(1 for s in st.session_state.indexed_sources.values() if s["type"] == "url")
463
+ yt_count = sum(1 for s in st.session_state.indexed_sources.values() if s["type"] == "youtube")
464
+
465
+ st.markdown(f"""
466
+ <div class='stat-row'>
467
+ <div class='stat-box'><div class='stat-val'>{pdf_count}</div><div class='stat-lbl'>PDFs</div></div>
468
+ <div class='stat-box'><div class='stat-val'>{url_count}</div><div class='stat-lbl'>Web Pages</div></div>
469
+ <div class='stat-box'><div class='stat-val'>{yt_count}</div><div class='stat-lbl'>YouTube Videos</div></div>
470
+ <div class='stat-box'><div class='stat-val'>{st.session_state.total_chunks}</div><div class='stat-lbl'>Total Chunks</div></div>
471
+ <div class='stat-box'><div class='stat-val'>{len(st.session_state.chat_history)}</div><div class='stat-lbl'>Messages</div></div>
472
+ </div>
473
+ """, unsafe_allow_html=True)
474
+
475
+ # ════════════════════════════════════════════════════════
476
+ # CHAT UI
477
+ # ════════════════════════════════════════════════════════
478
+ if not st.session_state.indexed_sources:
479
+ st.markdown("""
480
+ <div class='empty-chat'>
481
+ <div style='font-size:2.5rem;margin-bottom:12px'>📂</div>
482
+ <p style='color:#4b5563'>Add at least one source above to start chatting.<br>
483
+ Try a PDF, a Wikipedia URL, or a YouTube video.</p>
484
+ </div>""", unsafe_allow_html=True)
485
+ st.stop()
486
+
487
+ if not api_key:
488
+ st.warning("👈 Add your Groq API key in the sidebar to start chatting.")
489
+ st.stop()
490
+
491
+ st.markdown("---")
492
+ st.markdown("<div class='section-label'>Conversation</div>", unsafe_allow_html=True)
493
+
494
+ # Render chat history
495
+ if not st.session_state.chat_history:
496
+ st.markdown("""
497
+ <div class='empty-chat' style='padding:28px'>
498
+ <p style='color:#4b5563;margin:0'>Ask anything about your indexed sources below 👇</p>
499
+ </div>""", unsafe_allow_html=True)
500
+
501
+ for msg in st.session_state.chat_history:
502
+ if msg["role"] == "user":
503
+ st.markdown(f"""
504
+ <div class='chat-user'>
505
+ <div class='chat-user-bubble'>{msg['content']}</div>
506
+ </div>""", unsafe_allow_html=True)
507
+ else:
508
+ source_chips = ""
509
+ if msg.get("sources"):
510
+ for s in msg["sources"][:4]:
511
+ label = f"{s['source']} · {s['relevance']}%"
512
+ if s.get("timestamp"):
513
+ label += f" @ {s['timestamp']}"
514
+ source_chips += f"<span class='chat-source-chip'>{label}</span>"
515
+
516
+ st.markdown(f"""
517
+ <div class='chat-assistant'>
518
+ <div class='chat-avatar'>🤖</div>
519
+ <div class='chat-assistant-bubble'>
520
+ {msg['content']}
521
+ {f"<div class='chat-sources'>{source_chips}</div>" if source_chips else ""}
522
+ </div>
523
+ </div>""", unsafe_allow_html=True)
524
+
525
+ if msg.get("sources"):
526
+ with st.expander("🔍 View retrieved chunks", expanded=False):
527
+ for chunk in msg["sources"]:
528
+ icon = "📄" if chunk["type"] == "pdf" else "🌐" if chunk["type"] == "url" else "▶️"
529
+ detail = f"Page {chunk['page']}" if chunk["type"] != "youtube" else f"@ {chunk['timestamp']}"
530
+ st.markdown(f"""
531
+ <div class='chunk-card'>
532
+ <div class='chunk-header'>
533
+ <div class='chunk-src'>{icon} {chunk['source']}</div>
534
+ <div class='chunk-score'>{detail} · {chunk['relevance']}% match</div>
535
+ </div>
536
+ <div class='chunk-text'>{chunk['text'][:400]}{'...' if len(chunk['text']) > 400 else ''}</div>
537
+ </div>""", unsafe_allow_html=True)
538
+
539
+ # Chat input
540
+ st.markdown("")
541
+ col_input, col_k, col_btn = st.columns([6, 1, 1])
542
+ with col_input:
543
+ user_input = st.text_input("", placeholder="Ask something about your indexed sources...", label_visibility="collapsed", key="chat_input")
544
+ with col_k:
545
+ top_k = st.selectbox("K", [2, 3, 4, 5], index=1, label_visibility="collapsed")
546
+ with col_btn:
547
+ send = st.button("Send ➤", type="primary", use_container_width=True)
548
+
549
+ if send and user_input:
550
+ # Add user message
551
+ st.session_state.chat_history.append({"role": "user", "content": user_input})
552
+
553
+ with st.spinner("Thinking..."):
554
+ try:
555
+ answer, chunks = rag_query(user_input, embed_model, top_k, api_key)
556
+ st.session_state.chat_history.append({
557
+ "role": "assistant",
558
+ "content": answer,
559
+ "sources": chunks
560
+ })
561
+ except requests.HTTPError as e:
562
+ st.session_state.chat_history.append({
563
+ "role": "assistant",
564
+ "content": f"❌ API error: {str(e)}",
565
+ "sources": []
566
+ })
567
+ st.rerun()
upgraded_requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ streamlit>=1.32.0
2
+ chromadb>=0.4.22
3
+ sentence-transformers>=2.7.0
4
+ requests>=2.31.0
5
+ PyMuPDF>=1.24.0
6
+ beautifulsoup4>=4.12.0
7
+ youtube-transcript-api>=0.6.2