Spaces:

uumerrr684
/

RAG_Chat_Flow

Sleeping

App Files Files Community

uumerrr684 committed on Aug 20, 2025

Commit

fe0ef09

verified ·

1 Parent(s): 8295f5e

Create app.py

Browse files

Files changed (1) hide show

app.py +673 -0

app.py ADDED Viewed

	@@ -0,0 +1,673 @@

+import html
+import json
+import os
+import re
+import uuid
+from datetime import datetime, timedelta
+
+import chromadb
+import requests
+import streamlit as st
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from sentence_transformers import SentenceTransformer
+# Page configuration
+st.set_page_config(
+    page_title="RAG Chat Flow 📚",
+    page_icon="📚",
+    initial_sidebar_state="expanded"
+)
+# Enhanced CSS styling
+st.markdown("""
+<style>
+    .stApp {
+        background: white;
+    }
+    .main .block-container {
+        max-width: 900px;
+    }
+    #MainMenu {visibility: hidden;}
+    footer {visibility: hidden;}
+    header {visibility: hidden;}
+    .stDeployButton {display: none;}
+    .model-id {
+        color: #28a745;
+        font-family: monospace;
+    }
+    .model-attribution {
+        color: #28a745;
+        font-size: 0.8em;
+        font-style: italic;
+    }
+    .rag-attribution {
+        color: #6f42c1;
+        font-size: 0.8em;
+        font-style: italic;
+        background: #f8f9fa;
+        padding: 8px;
+        border-radius: 4px;
+        border-left: 3px solid #6f42c1;
+        margin-top: 8px;
+    }
+    /* NEW CHAT BUTTON - Black background */
+    .stButton > button[kind="primary"] {
+        background-color: #000000 !important;
+        border-color: #000000 !important;
+        color: #ffffff !important;
+    }
+    .stButton > button[kind="primary"]:hover {
+        background-color: #333333 !important;
+        border-color: #333333 !important;
+        color: #ffffff !important;
+    }
+    /* Chat history styling */
+    .chat-history-item {
+        padding: 8px 12px;
+        margin: 4px 0;
+        border-radius: 8px;
+        border: 1px solid #e0e0e0;
+        background: #f8f9fa;
+        cursor: pointer;
+        transition: all 0.2s;
+    }
+    .chat-history-item:hover {
+        background: #e9ecef;
+        border-color: #28a745;
+    }
+    .document-status {
+        background: #e3f2fd;
+        padding: 10px;
+        border-radius: 8px;
+        border-left: 4px solid #2196f3;
+        margin: 10px 0;
+    }
+    .rag-stats {
+        background: #f3e5f5;
+        padding: 8px;
+        border-radius: 6px;
+        font-size: 0.85em;
+        color: #4a148c;
+    }
+</style>
+""", unsafe_allow_html=True)
+# File paths
+HISTORY_FILE = "rag_chat_history.json"
+SESSIONS_FILE = "rag_chat_sessions.json"
+USERS_FILE = "online_users.json"
+# ================= RAG SYSTEM CLASS =================
+@st.cache_resource
+def initialize_rag_system():
+    """Initialize RAG system with caching"""
+    return ProductionRAGSystem()
+class ProductionRAGSystem:
+    def __init__(self, collection_name="streamlit_rag_docs"):
+        self.collection_name = collection_name
+        # Initialize embedding model
+        try:
+            self.model = SentenceTransformer('all-mpnet-base-v2')
+        except Exception as e:
+            st.error(f"Error loading embedding model: {e}")
+            self.model = None
+            return
+        # Initialize ChromaDB
+        try:
+            self.client = chromadb.PersistentClient(path="./chroma_db")
+            try:
+                self.collection = self.client.get_collection(collection_name)
+            except:
+                self.collection = self.client.create_collection(collection_name)
+        except Exception as e:
+            st.error(f"Error initializing ChromaDB: {e}")
+            self.client = None
+            return
+        # Initialize text splitter
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=800,
+            chunk_overlap=100,
+            length_function=len,
+            separators=["\n\n", "\n", ". ", " ", ""]
+        )
+    def get_collection_count(self):
+        """Get number of documents in collection"""
+        try:
+            return self.collection.count() if self.collection else 0
+        except:
+            return 0
+    def load_documents_from_folder(self, folder_path="documents"):
+        """Load documents from folder"""
+        if not os.path.exists(folder_path):
+            return []
+        txt_files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]
+        if not txt_files:
+            return []
+        all_chunks = []
+        for filename in txt_files:
+            filepath = os.path.join(folder_path, filename)
+            try:
+                with open(filepath, 'r', encoding='utf-8') as f:
+                    content = f.read().strip()
+                if content:
+                    chunks = self.text_splitter.split_text(content)
+                    for i, chunk in enumerate(chunks):
+                        all_chunks.append({
+                            'content': chunk,
+                            'source_file': filename,
+                            'chunk_index': i,
+                            'char_count': len(chunk)
+                        })
+            except Exception as e:
+                st.error(f"Error reading {filename}: {e}")
+        return all_chunks
+    def index_documents(self, document_folder="documents"):
+        """Index documents with progress bar"""
+        if not self.model or not self.client:
+            return False
+        chunks = self.load_documents_from_folder(document_folder)
+        if not chunks:
+            return False
+        # Clear existing collection
+        try:
+            self.client.delete_collection(self.collection_name)
+            self.collection = self.client.create_collection(self.collection_name)
+        except:
+            pass
+        # Create embeddings with progress bar
+        progress_bar = st.progress(0)
+        status_text = st.empty()
+        chunk_texts = [chunk['content'] for chunk in chunks]
+        try:
+            status_text.text("Creating embeddings...")
+            embeddings = self.model.encode(chunk_texts, show_progress_bar=False)
+            status_text.text("Storing in database...")
+            for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
+                chunk_id = f"{chunk['source_file']}_{chunk['chunk_index']}"
+                metadata = {
+                    "source_file": chunk['source_file'],
+                    "chunk_index": chunk['chunk_index'],
+                    "char_count": chunk['char_count']
+                }
+                self.collection.add(
+                    documents=[chunk['content']],
+                    ids=[chunk_id],
+                    embeddings=[embedding.tolist()],
+                    metadatas=[metadata]
+                )
+                progress_bar.progress((i + 1) / len(chunks))
+            progress_bar.empty()
+            status_text.empty()
+            return True
+        except Exception as e:
+            st.error(f"Error during indexing: {e}")
+            progress_bar.empty()
+            status_text.empty()
+            return False
+    def search(self, query, n_results=3):
+        """Search for relevant chunks"""
+        if not self.model or not self.collection:
+            return None
+        try:
+            query_embedding = self.model.encode([query])[0].tolist()
+            results = self.collection.query(
+                query_embeddings=[query_embedding],
+                n_results=n_results
+            )
+            if not results['documents'][0]:
+                return None
+            search_results = []
+            for chunk, distance, metadata in zip(
+                results['documents'][0],
+                results['distances'][0],
+                results['metadatas'][0]
+            ):
+                similarity = max(0, 1 - distance)
+                search_results.append({
+                    'content': chunk,
+                    'metadata': metadata,
+                    'similarity': similarity
+                })
+            return search_results
+        except Exception as e:
+            st.error(f"Search error: {e}")
+            return None
+    def extract_direct_answer(self, query, content):
+        """Extract direct answer from content"""
+        query_lower = query.lower()
+        sentences = re.split(r'[.!?]+', content)
+        sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
+        query_words = set(query_lower.split())
+        scored_sentences = []
+        for sentence in sentences:
+            sentence_words = set(sentence.lower().split())
+            exact_matches = len(query_words.intersection(sentence_words))
+            # Bonus scoring for key terms
+            bonus_score = 0
+            if '401k' in query_lower and ('401' in sentence.lower() or 'retirement' in sentence.lower()):
+                bonus_score += 3
+            if 'sick' in query_lower and 'sick' in sentence.lower():
+                bonus_score += 3
+            if 'vacation' in query_lower and 'vacation' in sentence.lower():
+                bonus_score += 3
+            total_score = exact_matches * 2 + bonus_score
+            if total_score > 0:
+                scored_sentences.append((sentence, total_score))
+        if scored_sentences:
+            scored_sentences.sort(key=lambda x: x[1], reverse=True)
+            best_sentence = scored_sentences[0][0]
+            if not best_sentence.endswith('.'):
+                best_sentence += '.'
+            return best_sentence
+        # Fallback
+        for sentence in sentences:
+            if len(sentence) > 30:
+                return sentence + ('.' if not sentence.endswith('.') else '')
+        return content[:200] + "..."
+    def generate_answer(self, query, search_results):
+        """Generate both AI and extracted answers"""
+        if not search_results:
+            return {
+                'ai_answer': "No information found in documents.",
+                'extracted_answer': "No information found in documents.",
+                'sources': [],
+                'confidence': 0,
+                'has_both': False
+            }
+        best_result = search_results[0]
+        sources = list(set([r['metadata']['source_file'] for r in search_results[:2]]))
+        avg_confidence = sum(r['similarity'] for r in search_results[:2]) / len(search_results[:2])
+        # Always generate extracted answer
+        extracted_answer = self.extract_direct_answer(query, best_result['content'])
+        # Try AI answer if API key available
+        ai_answer = None
+        openrouter_key = os.environ.get("OPENROUTER_API_KEY")
+        if openrouter_key:
+            context = search_results[0]['content'][:500]
+            prompt = f"Answer briefly: {query}\n\nContext: {context}\n\nAnswer (1 sentence):"
+            try:
+                response = requests.post(
+                    "https://openrouter.ai/api/v1/chat/completions",
+                    headers={
+                        "Authorization": f"Bearer {openrouter_key}",
+                        "Content-Type": "application/json"
+                    },
+                    json={
+                        "model": "openai/gpt-3.5-turbo",
+                        "messages": [{"role": "user", "content": prompt}],
+                        "max_tokens": 100,
+                        "temperature": 0.1
+                    },
+                    timeout=10
+                )
+                if response.status_code == 200:
+                    ai_answer = response.json()['choices'][0]['message']['content'].strip()
+            except Exception as e:
+                st.warning(f"AI API error: {e}")
+        return {
+            'ai_answer': ai_answer,
+            'extracted_answer': extracted_answer,
+            'sources': sources,
+            'confidence': avg_confidence,
+            'has_both': ai_answer is not None
+        }
+# ================= UTILITY FUNCTIONS =================
+def get_user_id():
+    """Get unique ID for this user session"""
+    if 'user_id' not in st.session_state:
+        st.session_state.user_id = str(uuid.uuid4())[:8]
+    return st.session_state.user_id
+def update_online_users():
+    """Update user status"""
+    try:
+        users = {}
+        if os.path.exists(USERS_FILE):
+            with open(USERS_FILE, 'r') as f:
+                users = json.load(f)
+        user_id = get_user_id()
+        users[user_id] = {
+            'last_seen': datetime.now().isoformat(),
+            'name': f'User-{user_id}',
+            'session_start': users.get(user_id, {}).get('session_start', datetime.now().isoformat())
+        }
+        # Clean up old users
+        current_time = datetime.now()
+        active_users = {}
+        for uid, data in users.items():
+            try:
+                last_seen = datetime.fromisoformat(data['last_seen'])
+                if current_time - last_seen < timedelta(minutes=5):
+                    active_users[uid] = data
+            except:
+                continue
+        with open(USERS_FILE, 'w') as f:
+            json.dump(active_users, f, indent=2)
+        return len(active_users)
+    except:
+        return 1
+def load_chat_history():
+    """Load chat history"""
+    try:
+        if os.path.exists(HISTORY_FILE):
+            with open(HISTORY_FILE, 'r', encoding='utf-8') as f:
+                return json.load(f)
+    except:
+        pass
+    return []
+def save_chat_history(messages):
+    """Save chat history"""
+    try:
+        with open(HISTORY_FILE, 'w', encoding='utf-8') as f:
+            json.dump(messages, f, ensure_ascii=False, indent=2)
+    except Exception as e:
+        st.error(f"Error saving history: {e}")
+def start_new_chat():
+    """Start new chat session"""
+    st.session_state.messages = []
+    st.session_state.session_id = str(uuid.uuid4())
+# ================= MAIN APP =================
+# Initialize session state
+if "messages" not in st.session_state:
+    st.session_state.messages = load_chat_history()
+if "session_id" not in st.session_state:
+    st.session_state.session_id = str(uuid.uuid4())
+# Initialize RAG system
+rag_system = initialize_rag_system()
+# Header
+st.title("RAG Chat Flow 📚")
+st.caption("Ask questions about your documents with AI-powered retrieval")
+# Sidebar
+with st.sidebar:
+    # New Chat Button
+    if st.button("➕ New Chat", use_container_width=True, type="primary"):
+        start_new_chat()
+        st.rerun()
+    st.divider()
+    # Document Management
+    st.header("📂 Document Management")
+    if rag_system and rag_system.model:
+        doc_count = rag_system.get_collection_count()
+        if doc_count > 0:
+            st.markdown(f"""
+            <div class="document-status">
+                <strong>📊 Documents Indexed:</strong> {doc_count} chunks<br>
+                <strong>🔍 Status:</strong> Ready for queries
+            </div>
+            """, unsafe_allow_html=True)
+        else:
+            st.warning("No documents indexed. Upload documents to get started.")
+        # Document indexing
+        if st.button("🔄 Re-index Documents", use_container_width=True):
+            with st.spinner("Indexing documents..."):
+                if rag_system.index_documents("documents"):
+                    st.success("Documents indexed successfully!")
+                    st.rerun()
+                else:
+                    st.error("Failed to index documents. Check your documents folder.")
+        # Upload interface
+        st.subheader("📤 Upload Documents")
+        uploaded_files = st.file_uploader(
+            "Upload text files",
+            type=['txt'],
+            accept_multiple_files=True,
+            help="Upload .txt files to add to your knowledge base"
+        )
+        if uploaded_files:
+            if st.button("💾 Save & Index Files"):
+                os.makedirs("documents", exist_ok=True)
+                saved_files = []
+                for uploaded_file in uploaded_files:
+                    file_path = os.path.join("documents", uploaded_file.name)
+                    with open(file_path, "wb") as f:
+                        f.write(uploaded_file.getbuffer())
+                    saved_files.append(uploaded_file.name)
+                st.success(f"Saved {len(saved_files)} files!")
+                # Auto-index
+                with st.spinner("Auto-indexing new documents..."):
+                    if rag_system.index_documents("documents"):
+                        st.success("Documents indexed successfully!")
+                        st.rerun()
+    else:
+        st.error("RAG system initialization failed. Check your setup.")
+    st.divider()
+    # Online Users
+    st.header("👥 Online Users")
+    online_count = update_online_users()
+    if online_count == 1:
+        st.success("🟢 Just you online")
+    else:
+        st.success(f"🟢 {online_count} people online")
+    st.divider()
+    # Settings
+    st.header("⚙️ Settings")
+    # API Status
+    openrouter_key = os.environ.get("OPENROUTER_API_KEY")
+    if openrouter_key:
+        st.success("🟢 AI API Connected")
+    else:
+        st.warning("⚠️ No AI API Key (using extracted answers only)")
+    # RAG Settings
+    use_ai_enhancement = st.checkbox("Use AI Enhancement", value=bool(openrouter_key))
+    show_sources = st.checkbox("Show Sources", value=True)
+    show_confidence = st.checkbox("Show Confidence Scores", value=True)
+    st.divider()
+    # Chat History Controls
+    st.header("💾 Chat History")
+    if st.session_state.messages:
+        st.info(f"Messages: {len(st.session_state.messages)}")
+    col1, col2 = st.columns(2)
+    with col1:
+        if st.button("💾 Save", use_container_width=True):
+            save_chat_history(st.session_state.messages)
+            st.success("Saved!")
+    with col2:
+        if st.button("🗑️ Clear", use_container_width=True):
+            start_new_chat()
+            st.success("Cleared!")
+            st.rerun()
+# ================= MAIN CHAT AREA =================
+# Display chat messages
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        if message["role"] == "assistant" and "rag_info" in message:
+            # Display AI answer
+            st.markdown(message["content"])
+            # Display RAG information
+            rag_info = message["rag_info"]
+            if show_sources and rag_info.get("sources"):
+                st.markdown(f"""
+                <div class="rag-attribution">
+                    <strong>📁 Sources:</strong> {', '.join(rag_info['sources'])}<br>
+                    <strong>🎯 Confidence:</strong> {rag_info['confidence']*100:.1f}%
+                </div>
+                """, unsafe_allow_html=True)
+            # Show extracted answer if different
+            if rag_info.get("extracted_answer") and rag_info["extracted_answer"] != message["content"]:
+                st.markdown("**📄 Extracted Answer:**")
+                st.markdown(f"_{rag_info['extracted_answer']}_")
+        else:
+            st.markdown(message["content"])
+# Chat input
+if prompt := st.chat_input("Ask questions about your documents..."):
+    # Update user tracking
+    update_online_users()
+    # Add user message
+    user_message = {"role": "user", "content": prompt}
+    st.session_state.messages.append(user_message)
+    # Display user message
+    with st.chat_message("user"):
+        st.markdown(prompt)
+    # Get RAG response
+    with st.chat_message("assistant"):
+        if rag_system and rag_system.model and rag_system.get_collection_count() > 0:
+            # Search documents
+            search_results = rag_system.search(prompt, n_results=3)
+            if search_results:
+                # Generate answer
+                result = rag_system.generate_answer(prompt, search_results)
+                # Display AI answer or extracted answer
+                if use_ai_enhancement and result['has_both']:
+                    answer_text = result['ai_answer']
+                    st.markdown(f"🤖 **AI Answer:** {answer_text}")
+                else:
+                    answer_text = result['extracted_answer']
+                    st.markdown(f"📄 **Answer:** {answer_text}")
+                # Show RAG info
+                if show_sources and result['sources']:
+                    st.markdown(f"""
+                    <div class="rag-attribution">
+                        <strong>📁 Sources:</strong> {', '.join(result['sources'])}<br>
+                        <strong>🎯 Confidence:</strong> {result['confidence']*100:.1f}%<br>
+                        <strong>📊 Found:</strong> {len(search_results)} relevant sections
+                    </div>
+                    """, unsafe_allow_html=True)
+                # Add to messages with RAG info
+                assistant_message = {
+                    "role": "assistant",
+                    "content": answer_text,
+                    "rag_info": {
+                        "sources": result['sources'],
+                        "confidence": result['confidence'],
+                        "extracted_answer": result['extracted_answer'],
+                        "has_ai": result['has_both']
+                    }
+                }
+            else:
+                # No relevant documents found
+                no_info_msg = "I couldn't find relevant information in your documents. Try rephrasing your question or check if the information exists in your uploaded documents."
+                st.markdown(no_info_msg)
+                assistant_message = {
+                    "role": "assistant",
+                    "content": no_info_msg,
+                    "rag_info": {"sources": [], "confidence": 0}
+                }
+        else:
+            # RAG system not ready
+            error_msg = "Document system not ready. Please upload and index documents first."
+            st.error(error_msg)
+            assistant_message = {
+                "role": "assistant",
+                "content": error_msg,
+                "rag_info": {"sources": [], "confidence": 0}
+            }
+    # Add assistant message to history
+    st.session_state.messages.append(assistant_message)
+    # Auto-save
+    save_chat_history(st.session_state.messages)
+# Footer info
+if rag_system and rag_system.model:
+    doc_count = rag_system.get_collection_count()
+    st.caption(f"📚 Knowledge Base: {doc_count} indexed chunks | 🔍 RAG System Active")