uumerrr684 committed on
Commit
8b2b880
·
verified ·
1 Parent(s): fe0ef09

Delete src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +0 -673
src/streamlit_app.py DELETED
@@ -1,673 +0,0 @@
1
- import streamlit as st
2
- import requests
3
- import os
4
- import json
5
- import uuid
6
- from datetime import datetime, timedelta
7
- from sentence_transformers import SentenceTransformer
8
- import chromadb
9
- from langchain_text_splitters import RecursiveCharacterTextSplitter
10
- import re
11
-
12
# Page configuration: browser-tab title and icon, with the sidebar opened
# by default when the app loads.
st.set_page_config(
    page_title="RAG Chat Flow 📚",
    page_icon="📚",
    initial_sidebar_state="expanded",
)
18
-
19
# Enhanced CSS styling.
# The stylesheet is kept in one constant and injected once per script run via
# st.markdown with raw HTML enabled. It hides Streamlit's default chrome
# (menu, footer, header, deploy button) and defines the classes used by the
# HTML snippets rendered further down (document-status, rag-attribution, ...).
_CUSTOM_CSS = """
<style>
    .stApp {
        background: white;
    }

    .main .block-container {
        max-width: 900px;
    }

    #MainMenu {visibility: hidden;}
    footer {visibility: hidden;}
    header {visibility: hidden;}
    .stDeployButton {display: none;}

    .model-id {
        color: #28a745;
        font-family: monospace;
    }

    .model-attribution {
        color: #28a745;
        font-size: 0.8em;
        font-style: italic;
    }

    .rag-attribution {
        color: #6f42c1;
        font-size: 0.8em;
        font-style: italic;
        background: #f8f9fa;
        padding: 8px;
        border-radius: 4px;
        border-left: 3px solid #6f42c1;
        margin-top: 8px;
    }

    /* NEW CHAT BUTTON - Black background */
    .stButton > button[kind="primary"] {
        background-color: #000000 !important;
        border-color: #000000 !important;
        color: #ffffff !important;
    }

    .stButton > button[kind="primary"]:hover {
        background-color: #333333 !important;
        border-color: #333333 !important;
        color: #ffffff !important;
    }

    /* Chat history styling */
    .chat-history-item {
        padding: 8px 12px;
        margin: 4px 0;
        border-radius: 8px;
        border: 1px solid #e0e0e0;
        background: #f8f9fa;
        cursor: pointer;
        transition: all 0.2s;
    }

    .chat-history-item:hover {
        background: #e9ecef;
        border-color: #28a745;
    }

    .document-status {
        background: #e3f2fd;
        padding: 10px;
        border-radius: 8px;
        border-left: 4px solid #2196f3;
        margin: 10px 0;
    }

    .rag-stats {
        background: #f3e5f5;
        padding: 8px;
        border-radius: 6px;
        font-size: 0.85em;
        color: #4a148c;
    }
</style>
"""

st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
103
-
104
# File paths (relative to the process working directory).
# Chat transcript written by save_chat_history / read by load_chat_history.
HISTORY_FILE = "rag_chat_history.json"
# NOTE(review): not referenced anywhere else in the visible code.
SESSIONS_FILE = "rag_chat_sessions.json"
# Presence records maintained by update_online_users.
USERS_FILE = "online_users.json"
108
-
109
- # ================= RAG SYSTEM CLASS =================
110
-
111
@st.cache_resource
def initialize_rag_system():
    """Construct the application's ``ProductionRAGSystem``.

    The function is wrapped in ``st.cache_resource``, so the instance is
    obtained through Streamlit's resource cache.

    Returns:
        A ``ProductionRAGSystem`` built with its default collection name.
    """
    rag = ProductionRAGSystem()
    return rag
115
-
116
class ProductionRAGSystem:
    """Retrieval helper for the app: chunk, embed, store and query ``.txt`` files.

    Attributes (each stays ``None`` when its setup step failed or was never
    reached, so callers can test readiness without risking AttributeError):
        collection_name: name of the ChromaDB collection in use.
        model: ``SentenceTransformer`` used to embed chunks and queries.
        client: persistent ChromaDB client rooted at ``./chroma_db``.
        collection: the ChromaDB collection named ``collection_name``.
        text_splitter: ``RecursiveCharacterTextSplitter`` used for chunking.
    """

    # Number of chunks written to ChromaDB per ``add`` call while indexing.
    _ADD_BATCH_SIZE = 64

    def __init__(self, collection_name="streamlit_rag_docs"):
        """Load the embedding model, open ChromaDB and build the splitter.

        Failures are reported with ``st.error`` and leave the remaining
        attributes as ``None`` instead of raising.
        """
        self.collection_name = collection_name

        # Define every attribute before any step that can fail; an early
        # return must not leave the instance half-built.
        self.model = None
        self.client = None
        self.collection = None
        self.text_splitter = None

        # Initialize embedding model
        try:
            self.model = SentenceTransformer('all-mpnet-base-v2')
        except Exception as e:
            st.error(f"Error loading embedding model: {e}")
            return

        # Initialize ChromaDB
        try:
            self.client = chromadb.PersistentClient(path="./chroma_db")
            try:
                self.collection = self.client.get_collection(collection_name)
            except Exception:
                # The "collection missing" exception type differs between
                # chromadb releases, so fall back to creation on any error.
                self.collection = self._create_collection()
        except Exception as e:
            st.error(f"Error initializing ChromaDB: {e}")
            self.client = None
            self.collection = None
            return

        # Initialize text splitter
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=800,
            chunk_overlap=100,
            length_function=len,
            separators=["\n\n", "\n", ". ", " ", ""]
        )

    def _create_collection(self):
        """Create the collection with cosine distance and return it.

        ``search`` converts distances with ``1 - distance``, which is only a
        similarity for cosine distance.
        NOTE(review): Chroma documents squared L2 as the default space; confirm
        the ``hnsw:space`` metadata key against the installed chromadb
        version. Collections created earlier keep their space until the next
        re-index.
        """
        return self.client.create_collection(
            self.collection_name,
            metadata={"hnsw:space": "cosine"},
        )

    def get_collection_count(self):
        """Return the number of stored chunks, or 0 when unavailable."""
        if self.collection is None:
            return 0
        try:
            return self.collection.count()
        except Exception:
            # Best effort: an unreadable collection is shown as empty.
            return 0

    def load_documents_from_folder(self, folder_path="documents"):
        """Read every ``.txt`` file in ``folder_path`` and split it into chunks.

        Args:
            folder_path: directory to scan (not recursive).

        Returns:
            A list of dicts with keys ``content``, ``source_file``,
            ``chunk_index`` and ``char_count``; empty when the folder is
            missing, holds no ``.txt`` files, or the splitter is unavailable.
            Files that cannot be read are reported with ``st.error`` and
            skipped.
        """
        if not os.path.isdir(folder_path):
            return []
        if self.text_splitter is None:
            return []

        # Sorted so that chunk order does not depend on directory listing order.
        txt_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.txt'))

        all_chunks = []
        for filename in txt_files:
            filepath = os.path.join(folder_path, filename)
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    content = f.read().strip()

                if not content:
                    continue

                for i, chunk in enumerate(self.text_splitter.split_text(content)):
                    all_chunks.append({
                        'content': chunk,
                        'source_file': filename,
                        'chunk_index': i,
                        'char_count': len(chunk)
                    })
            except Exception as e:
                st.error(f"Error reading {filename}: {e}")

        return all_chunks

    def index_documents(self, document_folder="documents"):
        """Rebuild the collection from the ``.txt`` files in ``document_folder``.

        Shows a Streamlit progress bar while embeddings are stored.

        Returns:
            ``True`` when all chunks were stored; ``False`` when the system is
            not initialised, no chunks were found, or an error occurred (the
            error is reported with ``st.error``).
        """
        if not self.model or not self.client:
            return False

        chunks = self.load_documents_from_folder(document_folder)
        if not chunks:
            return False

        # Clear existing collection. A failed delete usually means there was
        # nothing to delete; a real problem resurfaces in the create step.
        try:
            self.client.delete_collection(self.collection_name)
        except Exception:
            pass
        try:
            self.collection = self._create_collection()
        except Exception as e:
            st.error(f"Error during indexing: {e}")
            return False

        # Create embeddings with progress bar
        progress_bar = st.progress(0)
        status_text = st.empty()

        try:
            status_text.text("Creating embeddings...")
            chunk_texts = [chunk['content'] for chunk in chunks]
            embeddings = self.model.encode(chunk_texts, show_progress_bar=False)

            status_text.text("Storing in database...")
            total = len(chunks)
            for start in range(0, total, self._ADD_BATCH_SIZE):
                batch = chunks[start:start + self._ADD_BATCH_SIZE]
                vectors = embeddings[start:start + len(batch)]

                self.collection.add(
                    documents=[c['content'] for c in batch],
                    ids=[f"{c['source_file']}_{c['chunk_index']}" for c in batch],
                    embeddings=[vector.tolist() for vector in vectors],
                    metadatas=[
                        {
                            "source_file": c['source_file'],
                            "chunk_index": c['chunk_index'],
                            "char_count": c['char_count']
                        }
                        for c in batch
                    ]
                )

                progress_bar.progress(min(1.0, (start + len(batch)) / total))

            return True

        except Exception as e:
            st.error(f"Error during indexing: {e}")
            return False

        finally:
            # Always remove the temporary widgets, on success and on failure.
            progress_bar.empty()
            status_text.empty()

    def search(self, query, n_results=3):
        """Return the chunks closest to ``query``.

        Args:
            query: free-text question.
            n_results: maximum number of chunks requested from ChromaDB.

        Returns:
            A list of dicts with keys ``content``, ``metadata`` and
            ``similarity`` (clamped to ``[0, 1]``), or ``None`` when the
            system is not ready, nothing matched, or the query failed.
        """
        if not self.model or self.collection is None:
            return None

        try:
            query_embedding = self.model.encode([query])[0].tolist()

            results = self.collection.query(
                query_embeddings=[query_embedding],
                n_results=n_results
            )

            if not results['documents'][0]:
                return None

            search_results = []
            for chunk, distance, metadata in zip(
                results['documents'][0],
                results['distances'][0],
                results['metadatas'][0]
            ):
                search_results.append({
                    'content': chunk,
                    'metadata': metadata,
                    'similarity': max(0.0, min(1.0, 1 - distance))
                })

            return search_results
        except Exception as e:
            st.error(f"Search error: {e}")
            return None

    def extract_direct_answer(self, query, content):
        """Pick the sentence of ``content`` that best matches ``query``.

        Sentences are scored by the number of words shared with the query
        (two points each) plus a bonus of three for the hard-coded topics
        ``401k``, ``sick`` and ``vacation``. Words are compared without
        surrounding punctuation.

        Returns:
            The best-scoring sentence; otherwise the first sentence longer
            than 30 characters; otherwise the first 200 characters of
            ``content`` followed by ``"..."``.
        """
        def finished(sentence):
            # Keep existing terminal punctuation; add a period only if absent.
            if sentence.endswith(('.', '!', '?')):
                return sentence
            return sentence + '.'

        query_lower = query.lower()

        # Split after terminal punctuation only when whitespace follows (or at
        # line breaks), so decimals such as "1.5" are not cut in half.
        pieces = re.split(r'(?<=[.!?])\s+|\n+', content)
        sentences = [s.strip() for s in pieces if len(s.strip()) > 20]

        query_words = set(re.findall(r'\w+', query_lower))
        scored_sentences = []

        for sentence in sentences:
            sentence_lower = sentence.lower()
            sentence_words = set(re.findall(r'\w+', sentence_lower))
            exact_matches = len(query_words & sentence_words)

            # Bonus scoring for key terms
            bonus_score = 0
            if '401k' in query_lower and ('401' in sentence_lower or 'retirement' in sentence_lower):
                bonus_score += 3
            if 'sick' in query_lower and 'sick' in sentence_lower:
                bonus_score += 3
            if 'vacation' in query_lower and 'vacation' in sentence_lower:
                bonus_score += 3

            total_score = exact_matches * 2 + bonus_score
            if total_score > 0:
                scored_sentences.append((sentence, total_score))

        if scored_sentences:
            # max() returns the first of equally scored sentences, i.e. the
            # earliest one in the text.
            best_sentence = max(scored_sentences, key=lambda item: item[1])[0]
            return finished(best_sentence)

        # Fallback
        for sentence in sentences:
            if len(sentence) > 30:
                return finished(sentence)

        return content[:200] + "..."

    def generate_answer(self, query, search_results):
        """Build the answer payload for ``query`` from ``search_results``.

        An extracted answer is always produced from the top result. When the
        ``OPENROUTER_API_KEY`` environment variable is set, a one-sentence
        answer is also requested from the OpenRouter chat-completions
        endpoint; any failure there is shown with ``st.warning`` and leaves
        ``ai_answer`` as ``None``.

        Returns:
            A dict with keys ``ai_answer``, ``extracted_answer``, ``sources``
            (file names of the top two results, in rank order, without
            duplicates), ``confidence`` (mean similarity of the top two
            results) and ``has_both``.
        """
        if not search_results:
            return {
                'ai_answer': "No information found in documents.",
                'extracted_answer': "No information found in documents.",
                'sources': [],
                'confidence': 0,
                'has_both': False
            }

        best_result = search_results[0]
        top_results = search_results[:2]
        # dict.fromkeys removes duplicates while keeping rank order.
        sources = list(dict.fromkeys(r['metadata']['source_file'] for r in top_results))
        avg_confidence = sum(r['similarity'] for r in top_results) / len(top_results)

        # Always generate extracted answer
        extracted_answer = self.extract_direct_answer(query, best_result['content'])

        # Try AI answer if API key available
        ai_answer = None
        openrouter_key = os.environ.get("OPENROUTER_API_KEY")

        if openrouter_key:
            context = best_result['content'][:500]
            prompt = f"Answer briefly: {query}\n\nContext: {context}\n\nAnswer (1 sentence):"

            try:
                response = requests.post(
                    "https://openrouter.ai/api/v1/chat/completions",
                    headers={
                        "Authorization": f"Bearer {openrouter_key}",
                        "Content-Type": "application/json"
                    },
                    json={
                        "model": "openai/gpt-3.5-turbo",
                        "messages": [{"role": "user", "content": prompt}],
                        "max_tokens": 100,
                        "temperature": 0.1
                    },
                    timeout=10
                )
                # Route HTTP error statuses to the warning below instead of
                # dropping them silently.
                response.raise_for_status()
                ai_answer = response.json()['choices'][0]['message']['content'].strip()
            except Exception as e:
                st.warning(f"AI API error: {e}")

        return {
            'ai_answer': ai_answer,
            'extracted_answer': extracted_answer,
            'sources': sources,
            'confidence': avg_confidence,
            'has_both': ai_answer is not None
        }
369
-
370
- # ================= UTILITY FUNCTIONS =================
371
-
372
def get_user_id():
    """Return this session's user id, creating it on first use.

    The id is the first eight characters of a random UUID4 string and is
    stored in ``st.session_state.user_id``.
    """
    state = st.session_state
    if 'user_id' not in state:
        state.user_id = str(uuid.uuid4())[:8]
    return state.user_id
377
-
378
def update_online_users():
    """Record this session as active and return the number of active users.

    Reads ``USERS_FILE``, refreshes the entry for the current user id, drops
    entries whose ``last_seen`` is missing, unparsable or at least five
    minutes old, and writes the result back.

    Returns:
        The number of entries kept, or ``1`` when the file cannot be read or
        written.
    """
    try:
        users = {}
        if os.path.exists(USERS_FILE):
            with open(USERS_FILE, 'r', encoding='utf-8') as f:
                try:
                    users = json.load(f)
                except ValueError:
                    # Corrupt JSON: start over so the file is rewritten below
                    # rather than failing on every later call.
                    users = {}
        if not isinstance(users, dict):
            users = {}

        now = datetime.now()
        user_id = get_user_id()
        previous = users.get(user_id)
        if not isinstance(previous, dict):
            previous = {}
        users[user_id] = {
            'last_seen': now.isoformat(),
            'name': f'User-{user_id}',
            'session_start': previous.get('session_start', now.isoformat())
        }

        # Clean up old users
        max_idle = timedelta(minutes=5)
        active_users = {}
        for uid, data in users.items():
            try:
                idle = now - datetime.fromisoformat(data['last_seen'])
            except (KeyError, TypeError, ValueError):
                # Malformed entry: drop it.
                continue
            if idle < max_idle:
                active_users[uid] = data

        with open(USERS_FILE, 'w', encoding='utf-8') as f:
            json.dump(active_users, f, indent=2)

        return len(active_users)
    except (OSError, TypeError, ValueError):
        # Presence tracking is best effort; fall back to "just you".
        return 1
410
-
411
def load_chat_history():
    """Load the saved chat transcript from ``HISTORY_FILE``.

    Returns:
        The list stored in the file, or an empty list when the file is
        missing, unreadable, not valid JSON, or does not contain a list.
    """
    try:
        with open(HISTORY_FILE, 'r', encoding='utf-8') as f:
            history = json.load(f)
    except (OSError, ValueError):
        # Missing file, I/O error, bad encoding or invalid JSON.
        return []
    # The rendering loop iterates over messages, so anything but a list is
    # treated as "no history".
    return history if isinstance(history, list) else []
420
-
421
def save_chat_history(messages):
    """Write ``messages`` to ``HISTORY_FILE`` as indented UTF-8 JSON.

    Any failure is reported with ``st.error`` instead of being raised.
    """
    try:
        with open(HISTORY_FILE, 'w', encoding='utf-8') as history_file:
            json.dump(messages, history_file, ensure_ascii=False, indent=2)
    except Exception as exc:
        st.error(f"Error saving history: {exc}")
428
-
429
def start_new_chat():
    """Reset the in-session transcript and assign a fresh session id.

    Only ``st.session_state`` is modified.
    """
    state = st.session_state
    state.messages = []
    state.session_id = str(uuid.uuid4())
433
-
434
- # ================= MAIN APP =================
435
-
436
# Initialize session state.
# First script run of a session: restore the transcript saved on disk.
if "messages" not in st.session_state:
    st.session_state.messages = load_chat_history()

# Each session gets its own random identifier.
if "session_id" not in st.session_state:
    st.session_state.session_id = str(uuid.uuid4())

# Initialize RAG system
rag_system = initialize_rag_system()

# Header
st.title("RAG Chat Flow 📚")
st.caption("Ask questions about your documents with AI-powered retrieval")
449
-
450
# Sidebar: chat reset, document management, presence, settings and history
# controls. The three checkbox values defined here (use_ai_enhancement,
# show_sources, show_confidence) are read by the main chat area below.
with st.sidebar:
    # New Chat Button
    if st.button("➕ New Chat", use_container_width=True, type="primary"):
        start_new_chat()
        st.rerun()

    st.divider()

    # Document Management
    st.header("📂 Document Management")

    if rag_system and rag_system.model:
        doc_count = rag_system.get_collection_count()

        if doc_count > 0:
            st.markdown(f"""
            <div class="document-status">
                <strong>📊 Documents Indexed:</strong> {doc_count} chunks<br>
                <strong>🔍 Status:</strong> Ready for queries
            </div>
            """, unsafe_allow_html=True)
        else:
            st.warning("No documents indexed. Upload documents to get started.")

        # Document indexing
        if st.button("🔄 Re-index Documents", use_container_width=True):
            with st.spinner("Indexing documents..."):
                if rag_system.index_documents("documents"):
                    st.success("Documents indexed successfully!")
                    st.rerun()
                else:
                    st.error("Failed to index documents. Check your documents folder.")

        # Upload interface
        st.subheader("📤 Upload Documents")
        uploaded_files = st.file_uploader(
            "Upload text files",
            type=['txt'],
            accept_multiple_files=True,
            help="Upload .txt files to add to your knowledge base"
        )

        if uploaded_files:
            if st.button("💾 Save & Index Files"):
                os.makedirs("documents", exist_ok=True)
                saved_files = []

                for uploaded_file in uploaded_files:
                    # The file name is supplied by the client. Keep only its
                    # last path component so a crafted name such as
                    # "../app.py" cannot be written outside "documents".
                    safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
                    if safe_name in ("", ".", ".."):
                        st.warning(f"Skipped file with invalid name: {uploaded_file.name}")
                        continue

                    file_path = os.path.join("documents", safe_name)
                    with open(file_path, "wb") as f:
                        f.write(uploaded_file.getbuffer())
                    saved_files.append(safe_name)

                st.success(f"Saved {len(saved_files)} files!")

                # Auto-index
                with st.spinner("Auto-indexing new documents..."):
                    if rag_system.index_documents("documents"):
                        st.success("Documents indexed successfully!")
                        st.rerun()
                    else:
                        # Report the failure, as the re-index button does.
                        st.error("Failed to index documents. Check your documents folder.")
    else:
        st.error("RAG system initialization failed. Check your setup.")

    st.divider()

    # Online Users
    st.header("👥 Online Users")
    online_count = update_online_users()

    if online_count == 1:
        st.success("🟢 Just you online")
    else:
        st.success(f"🟢 {online_count} people online")

    st.divider()

    # Settings
    st.header("⚙️ Settings")

    # API Status
    openrouter_key = os.environ.get("OPENROUTER_API_KEY")
    if openrouter_key:
        st.success("🟢 AI API Connected")
    else:
        st.warning("⚠️ No AI API Key (using extracted answers only)")

    # RAG Settings
    use_ai_enhancement = st.checkbox("Use AI Enhancement", value=bool(openrouter_key))
    show_sources = st.checkbox("Show Sources", value=True)
    show_confidence = st.checkbox("Show Confidence Scores", value=True)

    st.divider()

    # Chat History Controls
    st.header("💾 Chat History")

    if st.session_state.messages:
        st.info(f"Messages: {len(st.session_state.messages)}")

        col1, col2 = st.columns(2)
        with col1:
            if st.button("💾 Save", use_container_width=True):
                save_chat_history(st.session_state.messages)
                st.success("Saved!")

        with col2:
            if st.button("🗑️ Clear", use_container_width=True):
                start_new_chat()
                st.success("Cleared!")
                st.rerun()
561
-
562
- # ================= MAIN CHAT AREA =================
563
-
564
import html  # used below to escape file names that are rendered as raw HTML

# Display chat messages.
# Every stored message is re-rendered on each script run. Assistant messages
# that carry "rag_info" also get an attribution box and, when it differs from
# the displayed text, the extracted answer.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

        if message["role"] != "assistant" or "rag_info" not in message:
            continue

        # Display RAG information
        rag_info = message["rag_info"]
        sources = rag_info.get("sources")

        if sources:
            # Each sidebar checkbox controls its own line of the box.
            attribution_lines = []
            if show_sources:
                # Source names are uploaded file names; escape them because
                # the box is rendered with unsafe_allow_html=True.
                source_names = ', '.join(html.escape(str(name)) for name in sources)
                attribution_lines.append(f"<strong>📁 Sources:</strong> {source_names}")
            if show_confidence:
                # .get: entries loaded from an older history file may lack the key.
                confidence_pct = rag_info.get("confidence", 0) * 100
                attribution_lines.append(f"<strong>🎯 Confidence:</strong> {confidence_pct:.1f}%")

            if attribution_lines:
                attribution_html = "<br>".join(attribution_lines)
                st.markdown(
                    f'<div class="rag-attribution">{attribution_html}</div>',
                    unsafe_allow_html=True,
                )

        # Show extracted answer if different
        extracted = rag_info.get("extracted_answer")
        if extracted and extracted != message["content"]:
            st.markdown("**📄 Extracted Answer:**")
            st.markdown(f"_{extracted}_")
588
-
589
import html  # used below to escape file names that are rendered as raw HTML

# Chat input: runs once per submitted question. The question is appended to
# the transcript, answered through the RAG system when it is ready, and the
# updated transcript is saved to disk.
if prompt := st.chat_input("Ask questions about your documents..."):
    # Update user tracking
    update_online_users()

    # Add user message
    user_message = {"role": "user", "content": prompt}
    st.session_state.messages.append(user_message)

    # Display user message
    with st.chat_message("user"):
        st.markdown(prompt)

    # Get RAG response
    with st.chat_message("assistant"):
        rag_ready = bool(
            rag_system and rag_system.model and rag_system.get_collection_count() > 0
        )
        # Search documents only when there is something to search.
        search_results = rag_system.search(prompt, n_results=3) if rag_ready else None

        if not rag_ready:
            # RAG system not ready
            error_msg = "Document system not ready. Please upload and index documents first."
            st.error(error_msg)

            assistant_message = {
                "role": "assistant",
                "content": error_msg,
                "rag_info": {"sources": [], "confidence": 0}
            }

        elif not search_results:
            # No relevant documents found
            no_info_msg = "I couldn't find relevant information in your documents. Try rephrasing your question or check if the information exists in your uploaded documents."
            st.markdown(no_info_msg)

            assistant_message = {
                "role": "assistant",
                "content": no_info_msg,
                "rag_info": {"sources": [], "confidence": 0}
            }

        else:
            # Generate answer
            result = rag_system.generate_answer(prompt, search_results)

            # Display AI answer or extracted answer
            if use_ai_enhancement and result['has_both']:
                answer_text = result['ai_answer']
                st.markdown(f"🤖 **AI Answer:** {answer_text}")
            else:
                answer_text = result['extracted_answer']
                st.markdown(f"📄 **Answer:** {answer_text}")

            # Show RAG info; each sidebar checkbox controls its own line.
            if result['sources']:
                attribution_lines = []
                if show_sources:
                    # Source names are uploaded file names; escape them because
                    # the box is rendered with unsafe_allow_html=True.
                    source_names = ', '.join(html.escape(str(name)) for name in result['sources'])
                    attribution_lines.append(f"<strong>📁 Sources:</strong> {source_names}")
                if show_confidence:
                    confidence_pct = result['confidence'] * 100
                    attribution_lines.append(f"<strong>🎯 Confidence:</strong> {confidence_pct:.1f}%")
                if attribution_lines:
                    attribution_lines.append(
                        f"<strong>📊 Found:</strong> {len(search_results)} relevant sections"
                    )
                    attribution_html = "<br>".join(attribution_lines)
                    st.markdown(
                        f'<div class="rag-attribution">{attribution_html}</div>',
                        unsafe_allow_html=True,
                    )

            # Add to messages with RAG info
            assistant_message = {
                "role": "assistant",
                "content": answer_text,
                "rag_info": {
                    "sources": result['sources'],
                    "confidence": result['confidence'],
                    "extracted_answer": result['extracted_answer'],
                    "has_ai": result['has_both']
                }
            }

    # Add assistant message to history
    st.session_state.messages.append(assistant_message)

    # Auto-save
    save_chat_history(st.session_state.messages)
669
-
670
# Footer info: shown only when the RAG system and its embedding model loaded.
if rag_system and rag_system.model:
    indexed_chunks = rag_system.get_collection_count()
    st.caption(f"📚 Knowledge Base: {indexed_chunks} indexed chunks | 🔍 RAG System Active")