import os
import streamlit as st
from chromadb import PersistentClient
from dotenv import load_dotenv
from urllib.parse import urlparse, urlunparse

from utils.processor import process_pdf, process_web
from utils.vector_store import create_vector_store
from utils.agent import get_query_rewriter_agent, get_web_search_agent, get_rag_agent

# --- Constants and Configuration ---
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "rag_system") # Provide a default
DB_PATH = os.getenv("DB_PATH", "chroma_db")
DEFAULT_SIMILARITY_THRESHOLD = 0.7
RETRIEVER_K = 5 # Number of documents to retrieve

# --- Helper Functions ---

def initialize_session_state():
    """Initializes Streamlit session state variables if they don't exist."""
    defaults = {
        'google_api_key': GOOGLE_API_KEY,
        'history': [],
        'use_web_search': False,
        'force_web_search': False,
        'similarity_threshold': DEFAULT_SIMILARITY_THRESHOLD,
        'vector_store': None,
        'processed_documents': [],
        'chroma_client': None,
        'chroma_collection': None,
        'url_input': "",
        'clear_url_input_flag': False
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value

def normalize_url(url: str) -> str:
    """
    Normalizes a URL for consistent checking and storage.
    - Adds 'http' if no scheme is present.
    - Converts scheme and domain to lowercase.
    - Removes 'www.' prefix.
    - Removes trailing slashes from the path.
    - Removes query strings and fragments.
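
    Example (illustrative of the rules above):
        normalize_url("WWW.Example.com/docs/") -> "http://example.com/docs"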
    """
    url = url.strip()
    if not url:
        return ""

    # Add scheme if missing (default to http for parsing)
    if '://' not in url:
        url = 'http://' + url

    try:
        parts = urlparse(url)

        # Lowercase scheme and netloc (domain)
        scheme = parts.scheme.lower()
        netloc = parts.netloc.lower()

        # Remove 'www.' prefix
        if netloc.startswith('www.'):
            netloc = netloc[4:]

        # Remove trailing slashes so "example.com/" and "example.com"
        # normalize to the same string; the root path collapses to ''.
        path = parts.path.rstrip('/')

        # Reconstruct without query params and fragment for basic normalization
        # Note: Ignoring query params for simplicity here. Robust normalization might sort/handle them.
        normalized = urlunparse((scheme, netloc, path, '', '', ''))
        return normalized
    except ValueError:
        st.warning(f"⚠️ Could not properly normalize URL: {url}. Using original.")
        return url


def load_vector_store():
    """Loads or initializes the ChromaDB vector store and retrieves processed documents."""
    if st.session_state.vector_store is None:
        try:
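            # PersistentClient keeps the collection on disk at DB_PATH, so
            # previously indexed documents survive app restarts.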
            st.session_state.chroma_client = PersistentClient(path=DB_PATH)
            st.session_state.chroma_collection = st.session_state.chroma_client.get_or_create_collection(name=COLLECTION_NAME)

            # Wrap the collection in a LangChain vector store
            st.session_state.vector_store = create_vector_store(
                st.session_state.google_api_key,
                client=st.session_state.chroma_client
            )

            # Retrieve metadata (source names) of already processed documents
            results = st.session_state.chroma_collection.get(include=['metadatas'])
            if results and 'metadatas' in results and results['metadatas']:
                processed_docs = set()
                for meta in results['metadatas']:
                    if meta and 'source' in meta:
                        processed_docs.add(meta['source'])
                st.session_state.processed_documents = list(processed_docs) # Convert back to list for consistency
                st.success(f"βœ… Loaded {len(st.session_state.processed_documents)} documents from database.")
            else:
                st.session_state.processed_documents = []
                st.info("ℹ️ No existing documents found in the database.")

        except Exception as e:
            st.session_state.vector_store = None
            st.session_state.processed_documents = []
            st.session_state.chroma_client = None
            st.session_state.chroma_collection = None
            st.warning(f"⚠️ Error loading/creating vector store: {e}")

def add_texts_to_vector_store(texts, source_name):
    """Adds processed text documents to the vector store."""
    if not texts:
        st.warning(f"⚠️ No text extracted from {source_name}. Skipping.")
        return False
    try:
        if st.session_state.vector_store is None:
            # Initialize the vector store on first use
            st.session_state.vector_store = create_vector_store(
                st.session_state.google_api_key,
                texts=texts,  # Pass initial texts if needed by create_vector_store
                client=st.session_state.chroma_client
            )
            # Refresh the collection handle now that the store exists
            st.session_state.chroma_collection = st.session_state.chroma_client.get_or_create_collection(name=COLLECTION_NAME)

        else:
            st.session_state.vector_store.add_documents(texts)

        st.session_state.processed_documents.append(source_name)
        st.success(f"βœ… Added source: {source_name} to the database.")
        return True
    except Exception as e:
        st.error(f"❌ Error adding {source_name} to vector store: {e}")
        return False

def clear_chat_history():
    """Clears the chat history."""
    st.session_state.history = []
    st.success("Chat history cleared.")

def clear_vector_database():
    """Clears all documents from the ChromaDB collection."""
    if st.session_state.chroma_collection:
        try:
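            # Delete by id rather than dropping the collection, so the client,
            # collection, and vector-store handles in session state stay valid.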
            existing_ids = st.session_state.chroma_collection.get(include=[])['ids']
            if existing_ids:
                st.session_state.chroma_collection.delete(ids=existing_ids)
                st.session_state.processed_documents = []
                st.success("βœ… Database cleared successfully. Note that this action does not delete the uploaded files in current session state.")
            else:
                st.info("ℹ️ Database is already empty.")
        except Exception as e:
            st.error(f"❌ Error clearing database: {e}")
    else:
        st.warning("⚠️ Vector store not initialized. Cannot clear database.")

def display_processed_sources():
    """Displays the list of processed documents/URLs in the sidebar."""
    if st.session_state.processed_documents:
        st.sidebar.header("πŸ“š Processed Sources")
        for source in sorted(list(set(st.session_state.processed_documents))): # Ensure uniqueness and sort
            icon = "πŸ“„" if source.lower().endswith(".pdf") else "🌐"
            st.sidebar.text(f"{icon} {source}")

def display_chat_history():
    """Displays the chat messages from session state."""
    for chat in st.session_state.history:
        with st.chat_message(chat["role"]):
            st.write(chat["content"])

def rewrite_query(query):
    """Rewrites the user query using the query rewriter agent."""
    try:
        query_rewriter = get_query_rewriter_agent()
        rewritten_query = query_rewriter.run(query).content
        # Optionally display the rewritten query
        # with st.expander("πŸ”„ Rewritten Query"):
        #     st.write(f"Original: {query}")
        #     st.write(f"Rewritten: {rewritten_query}")
        return rewritten_query
    except Exception as e:
        st.error(f"❌ Error rewriting query: {str(e)}")
        return query

def search_documents(query):
    """Searches the vector store for relevant documents."""
    if not st.session_state.vector_store:
        st.info("ℹ️ Vector store is not available for document search.")
        return [], ""

    retriever = st.session_state.vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={
            "k": RETRIEVER_K,
            "score_threshold": st.session_state.similarity_threshold
        }
    )
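    # With "similarity_score_threshold", the retriever returns only chunks whose
    # relevance score meets the configured threshold; an empty result therefore
    # means "nothing relevant enough", not a retrieval failure.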
    try:
        with st.spinner("Searching documents..."):
            docs = retriever.invoke(query)
            if docs:
                context = "\n\n".join([d.page_content for d in docs])
                st.info(f"πŸ“Š Found {len(docs)} relevant document chunks.")
                return docs, context
            else:
                st.info("ℹ️ No relevant documents found matching the threshold.")
                return [], ""
    except Exception as e:
        st.error(f"❌ Error searching documents: {e}")
        return [], ""

def search_web(query):
    """Searches the web using the web search agent."""
    try:
        with st.spinner("πŸ” Searching the web..."):
            web_search_agent = get_web_search_agent()
            web_results = web_search_agent.run(query).content
            if web_results:
                st.info("🌐 Web search successful.")
                return f"Web Search Results:\n{web_results}"
            else:
                st.info("πŸ•ΈοΈ Web search returned no results.")
                return ""
    except Exception as e:
        st.error(f"❌ Web search error: {str(e)}")
        return ""

def generate_response(original_query, rewritten_query, context):
    """Generates the final response using the RAG agent."""
    try:
        with st.spinner("πŸ€– Generating response..."):
            rag_agent = get_rag_agent()

            if context:
                full_prompt = f"""Based on the following context, answer the question.

Context:
{context}

Original Question: {original_query}
Rewritten Question (for context search): {rewritten_query}

Answer:"""
            else:
                # Fallback if no context from documents or web
                full_prompt = f"Answer the following question: {rewritten_query}"
                st.info("ℹ️ No specific context found. Answering based on general knowledge.")

            response = rag_agent.run(full_prompt)
            return response.content
    except Exception as e:
        st.error(f"❌ Error generating response: {str(e)}")
        return "Sorry, I encountered an error while generating the response."

# --- Streamlit App UI and Logic ---

def main():
    st.set_page_config(layout="wide")
    st.title("πŸ€” RAG System")

    initialize_session_state()
    load_vector_store()

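    # Streamlit forbids mutating a widget's session-state key once the widget
    # has been rendered in the current run, so the URL box is cleared via this
    # flag at the top of the next rerun, before the text_input is created.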
    if st.session_state.get('clear_url_input_flag', False):
        st.session_state.url_input = ""
        st.session_state.clear_url_input_flag = False

    # --- Sidebar ---
    with st.sidebar:
        st.header("βš™οΈ Controls")
        if st.button("πŸ—‘οΈ Clear Chat History"):
            clear_chat_history()
        if st.button("⚠️ Clear Document Database"):
            clear_vector_database()
        
        st.header("πŸ”§ Configuration")
        st.session_state.use_web_search = st.checkbox(
            "Enable Web Search", value=st.session_state.use_web_search
        )
        st.session_state.force_web_search = st.checkbox(
            "Force Web Search", value=st.session_state.force_web_search,
            help="Always use web search, even if documents are found."
        )
        st.session_state.similarity_threshold = st.slider(
            "Document Similarity Threshold",
            min_value=0.0, max_value=1.0, value=st.session_state.similarity_threshold, step=0.05,
            help="Minimum relevance score for document retrieval (higher is stricter)."
        )

        st.header("πŸ’Ύ Data Input")
        uploaded_files = st.file_uploader(
            "Upload PDF Files", type=["pdf"], accept_multiple_files=True
        )
        web_url = st.text_input(
            "Enter Website URL",
            key="url_input"
        )

        display_processed_sources()

    # --- Process Uploads ---
    # Process PDFs
    if uploaded_files:
        for uploaded_file in uploaded_files:
            file_name = uploaded_file.name
            if file_name not in st.session_state.processed_documents:
                with st.spinner(f'Processing PDF: {file_name}...'):
                    texts = process_pdf(uploaded_file)
                    add_texts_to_vector_store(texts, file_name)
    # Process web URL
    if web_url:
        normalized_url = normalize_url(web_url)
        if normalized_url:
            # Check if the *normalized* URL has already been processed
            if normalized_url not in st.session_state.processed_documents:
                with st.spinner(f'Processing URL: {web_url}...'):
                    # Process using the *original* URL input
                    texts = process_web(web_url)
                    if add_texts_to_vector_store(texts, normalized_url):
                        st.session_state.clear_url_input_flag = True
                        st.rerun()

    # --- Chat Interface ---
    display_chat_history()

    # Get user input
    prompt = st.chat_input("Ask a question about your documents or the web...")

    if prompt:
        # Add user message to UI and history
        st.chat_message("user").write(prompt)
        st.session_state.history.append({"role": "user", "content": prompt})

        # 1. Rewrite Query
        rewritten_query = rewrite_query(prompt)

        # 2. Search Strategy
        doc_context = ""
        web_context = ""
        docs = []

        # Try document search first unless web search is forced
        if not st.session_state.force_web_search:
            docs, doc_context = search_documents(rewritten_query)

        # Decide if web search is needed
        use_web = st.session_state.force_web_search or (st.session_state.use_web_search and not doc_context)

        if use_web:
            web_context = search_web(rewritten_query)
            if st.session_state.force_web_search and not web_context:
                st.warning("Forced web search did not return results.")
            elif not doc_context and web_context:
                st.info("Using web search results as fallback.")
            elif st.session_state.force_web_search and web_context:
                st.info("Using forced web search results.")

        # 3. Combine Context (prioritize document context if available and not forcing web)
        final_context = ""
        if st.session_state.force_web_search:
            final_context = web_context # Use only web if forced
        elif doc_context:
            final_context = doc_context # Use docs if found
        elif web_context:  # Use web only if docs weren't found (and web search was enabled/successful)
            final_context = web_context

        # 4. Generate Response
        assistant_response = generate_response(prompt, rewritten_query, final_context)

        # Add assistant response to UI and history
        st.chat_message("assistant").write(assistant_response)
        st.session_state.history.append({"role": "assistant", "content": assistant_response})

        # Optional: Display sources used if context came from documents
        # if not st.session_state.force_web_search and docs:
        #     with st.expander("πŸ“š Document Sources Used"):
        #         for i, doc in enumerate(docs):
        #             source = doc.metadata.get('source', 'Unknown Source')
        #             st.write(f"**{i+1}. {source}**")
        #             st.caption(f"{doc.page_content[:250]}...") # Show snippet

if __name__ == "__main__":
    main()