""" Intelligent Audit Report Chatbot UI """ import os import sys # ===== CRITICAL: Fix OMP_NUM_THREADS FIRST, before ANY other imports ===== # Some libraries load at import time and will fail if OMP_NUM_THREADS is invalid omp_threads = os.environ.get("OMP_NUM_THREADS", "") try: if omp_threads: # Handle invalid formats like "3500m" by extracting just the number # Remove any non-numeric suffix and convert to int cleaned = ''.join(filter(str.isdigit, omp_threads)) if cleaned: threads = int(cleaned) if threads <= 0: os.environ["OMP_NUM_THREADS"] = "1" else: # Set the cleaned integer value back os.environ["OMP_NUM_THREADS"] = str(threads) else: os.environ["OMP_NUM_THREADS"] = "1" else: os.environ["OMP_NUM_THREADS"] = "1" except (ValueError, TypeError): os.environ["OMP_NUM_THREADS"] = "1" # ===== Setup HuggingFace cache directories BEFORE any model imports ===== # CRITICAL: Set these before any imports that might use HuggingFace (like sentence-transformers) # This ensures models downloaded during Docker build are found at runtime cache_dir = "/app/.cache/huggingface" os.environ["HF_HOME"] = cache_dir os.environ["TRANSFORMERS_CACHE"] = cache_dir os.environ["HF_DATASETS_CACHE"] = cache_dir os.environ["HF_HUB_CACHE"] = cache_dir os.environ["SENTENCE_TRANSFORMERS_HOME"] = cache_dir # Ensure cache directory exists (created in Dockerfile, but ensure it's there) try: os.makedirs(cache_dir, mode=0o755, exist_ok=True) except (PermissionError, OSError) as e: # If we can't create it, log but continue (might already exist from Dockerfile) # HuggingFace will try to create subdirectories, but we need parent to exist pass import time import json import uuid import logging from pathlib import Path import argparse import streamlit as st from langchain_core.messages import HumanMessage, AIMessage from multi_agent_chatbot import get_multi_agent_chatbot from smart_chatbot import get_chatbot as get_smart_chatbot from src.reporting.feedback_schema import create_feedback_from_dict # Configure 
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Log environment setup for debugging
logger.info(f"📁 HuggingFace cache directory: {os.environ.get('HF_HOME', 'NOT SET')}")
logger.info(f"🔧 OMP_NUM_THREADS: {os.environ.get('OMP_NUM_THREADS', 'NOT SET')}")

# Page config
st.set_page_config(
    layout="wide",
    page_icon="🤖",
    initial_sidebar_state="expanded",
    page_title="Intelligent Audit Report Chatbot"
)

# Custom CSS
# NOTE(review): the original stylesheet content was lost when this file was
# mangled; the call is kept so restored CSS can be dropped back in.
st.markdown(""" """, unsafe_allow_html=True)


def get_system_type():
    """Return a human-readable label for the configured chatbot system."""
    system = os.environ.get('CHATBOT_SYSTEM', 'multi-agent')
    return "Smart Chatbot System" if system == 'smart' else "Multi-Agent System"


def get_chatbot():
    """Initialize and return the chatbot selected via CHATBOT_SYSTEM."""
    system = os.environ.get('CHATBOT_SYSTEM', 'multi-agent')
    if system == 'smart':
        return get_smart_chatbot()
    return get_multi_agent_chatbot()


def serialize_messages(messages):
    """Serialize LangChain messages to plain dictionaries.

    Messages without a ``content`` attribute are skipped.
    """
    return [
        {"type": type(msg).__name__, "content": str(msg.content)}
        for msg in messages
        if hasattr(msg, 'content')
    ]


def serialize_documents(sources):
    """Serialize document objects to dictionaries, dropping exact-content duplicates."""
    serialized = []
    seen_content = set()
    for doc in sources:
        content = getattr(doc, 'page_content', getattr(doc, 'content', ''))
        # Skip if we've seen this exact content before
        if content in seen_content:
            continue
        seen_content.add(content)
        # Hoisted: the original re-fetched getattr(doc, 'metadata', {}) for
        # every single field below.
        metadata = getattr(doc, 'metadata', {})
        serialized.append({
            "content": content,
            "metadata": metadata,
            "score": metadata.get('reranked_score', metadata.get('original_score', 0.0)),
            "id": metadata.get('_id', 'unknown'),
            "source": metadata.get('source', 'unknown'),
            "year": metadata.get('year', 'unknown'),
            "district": metadata.get('district', 'unknown'),
            "page": metadata.get('page', 'unknown'),
            "chunk_id": metadata.get('chunk_id', 'unknown'),
            "page_label": metadata.get('page_label', 'unknown'),
            "original_score": metadata.get('original_score', 0.0),
            "reranked_score": metadata.get('reranked_score', None)
        })
    return serialized


@st.cache_data
def load_filter_options():
    """Load sidebar filter options from src/config/filter_options.json."""
    try:
        with open("src/config/filter_options.json", "r") as f:
            return json.load(f)
    except FileNotFoundError:
        # Fixed: the original dumped the working directory's JSON listing into
        # the UI via st.info — a debug leftover that leaked filesystem
        # contents to users. Log it for operators instead.
        logger.warning("filter_options.json missing; JSON files in cwd: %s",
                       [x for x in os.listdir() if x.endswith('.json')])
        st.error("filter_options.json not found. Please run the metadata analysis script.")
        return {"sources": [], "years": [], "districts": [], 'filenames': []}


def main():
    """Render the chatbot UI: sidebar filters, chat/document tabs, feedback."""
    # ---- Session state initialization ----
    if 'messages' not in st.session_state:
        st.session_state.messages = []
    if 'conversation_id' not in st.session_state:
        st.session_state.conversation_id = f"session_{uuid.uuid4().hex[:8]}"
    if 'session_start_time' not in st.session_state:
        st.session_state.session_start_time = time.time()
    if 'active_filters' not in st.session_state:
        st.session_state.active_filters = {'sources': [], 'years': [], 'districts': [], 'filenames': []}
    # Track RAG retrieval history for feedback
    if 'rag_retrieval_history' not in st.session_state:
        st.session_state.rag_retrieval_history = []

    # Initialize chatbot only once per app session (cached)
    if 'chatbot' not in st.session_state:
        with st.spinner("🔄 Loading AI models and connecting to database..."):
            st.session_state.chatbot = get_chatbot()
        st.success("✅ AI system ready!")

    # Reset conversation history if needed (but keep chatbot cached)
    if 'reset_conversation' in st.session_state and st.session_state.reset_conversation:
        st.session_state.messages = []
        st.session_state.conversation_id = f"session_{uuid.uuid4().hex[:8]}"
        st.session_state.session_start_time = time.time()
        st.session_state.rag_retrieval_history = []
        st.session_state.feedback_submitted = False
        st.session_state.reset_conversation = False
        st.rerun()

    # ---- Header with system indicator ----
    col1, col2 = st.columns([3, 1])
    with col1:
        # NOTE(review): the original HTML wrapper markup was lost in the
        # mangled source; only the visible text survives.
        st.markdown('🤖 Intelligent Audit Report Chatbot', unsafe_allow_html=True)
    with col2:
        system_type = get_system_type()
        if "Multi-Agent" in system_type:
            st.success(f"🔧 {system_type}")
        else:
            st.info(f"🔧 {system_type}")

    st.markdown(
        'Ask questions about audit reports. Use the sidebar filters to narrow down your search!',
        unsafe_allow_html=True
    )

    # Session info
    duration = int(time.time() - st.session_state.session_start_time)
    duration_str = f"{duration // 60}m {duration % 60}s"
    st.markdown(
        f'Session Info: Messages: {len(st.session_state.messages)} | '
        f'Duration: {duration_str} | Status: Active | ID: {st.session_state.conversation_id}',
        unsafe_allow_html=True
    )

    # Load filter options
    filter_options = load_filter_options()

    # ---- Sidebar filters ----
    with st.sidebar:
        st.markdown("### 🔍 Search Filters")
        st.markdown("Select filters to narrow down your search. Leave empty to search all data.")

        st.markdown('📄 Specific Reports (Filename Filter)', unsafe_allow_html=True)
        st.markdown('⚠️ Selecting specific reports will ignore all other filters', unsafe_allow_html=True)
        selected_filenames = st.multiselect(
            "Select specific reports:",
            options=filter_options.get('filenames', []),
            default=st.session_state.active_filters.get('filenames', []),
            key="filenames_filter",
            help="Choose specific reports to search. When enabled, all other filters are ignored."
        )

        # Filename filter overrides every other filter when non-empty
        filename_mode = len(selected_filenames) > 0

        # Sources filter
        st.markdown('📊 Sources', unsafe_allow_html=True)
        selected_sources = st.multiselect(
            "Select sources:",
            options=filter_options['sources'],
            default=st.session_state.active_filters['sources'],
            disabled=filename_mode,
            key="sources_filter",
            help="Choose which types of reports to search"
        )

        # Years filter
        st.markdown('📅 Years', unsafe_allow_html=True)
        selected_years = st.multiselect(
            "Select years:",
            options=filter_options['years'],
            default=st.session_state.active_filters['years'],
            disabled=filename_mode,
            key="years_filter",
            help="Choose which years to search"
        )

        # Districts filter
        st.markdown('🏘️ Districts', unsafe_allow_html=True)
        selected_districts = st.multiselect(
            "Select districts:",
            options=filter_options['districts'],
            default=st.session_state.active_filters['districts'],
            disabled=filename_mode,
            key="districts_filter",
            help="Choose which districts to search"
        )

        # Update active filters (non-filename filters are cleared while
        # filename mode is on, matching the disabled widgets above)
        st.session_state.active_filters = {
            'sources': selected_sources if not filename_mode else [],
            'years': selected_years if not filename_mode else [],
            'districts': selected_districts if not filename_mode else [],
            'filenames': selected_filenames
        }

        if st.button("🗑️ Clear All Filters", key="clear_filters_button"):
            st.session_state.active_filters = {'sources': [], 'years': [], 'districts': [], 'filenames': []}
            st.rerun()

    # ---- Main content area with tabs ----
    tab1, tab2 = st.tabs(["💬 Chat", "📄 Retrieved Documents"])

    with tab1:
        chat_container = st.container()
        with chat_container:
            # Display conversation history
            for message in st.session_state.messages:
                if isinstance(message, HumanMessage):
                    st.markdown(f'{message.content}', unsafe_allow_html=True)
                elif isinstance(message, AIMessage):
                    st.markdown(f'{message.content}', unsafe_allow_html=True)

        # Input area (original spacer markup lost)
        st.markdown("", unsafe_allow_html=True)

        # Two columns for input and button
        col1, col2 = st.columns([4, 1])
        with col1:
            # A counter in the widget key forces Streamlit to create a fresh
            # (empty) text input after each send.
            if 'input_counter' not in st.session_state:
                st.session_state.input_counter = 0
            user_input = st.text_input(
                "Type your message here...",
                placeholder="Ask about budget allocations, expenditures, or audit findings...",
                key=f"user_input_{st.session_state.input_counter}",
                label_visibility="collapsed"
            )
        with col2:
            send_button = st.button("Send", key="send_button", use_container_width=True)

        # Clear chat button
        if st.button("🗑️ Clear Chat", key="clear_chat_button"):
            st.session_state.reset_conversation = True
            # Clear all persisted conversation files. (os is already imported
            # at module level; the original redundantly re-imported it here.)
            conversations_dir = "conversations"
            if os.path.exists(conversations_dir):
                for file in os.listdir(conversations_dir):
                    if file.endswith('.json'):
                        os.remove(os.path.join(conversations_dir, file))
            st.rerun()

        # Handle user input
        if send_button and user_input:
            # Prepend filter context so the agent can scope retrieval;
            # filename filters take precedence over all other filters.
            filter_context_str = ""
            if selected_filenames:
                filter_context_str += "FILTER CONTEXT:\n"
                filter_context_str += f"Filenames: {', '.join(selected_filenames)}\n"
                filter_context_str += "USER QUERY:\n"
            elif selected_sources or selected_years or selected_districts:
                filter_context_str += "FILTER CONTEXT:\n"
                if selected_sources:
                    filter_context_str += f"Sources: {', '.join(selected_sources)}\n"
                if selected_years:
                    filter_context_str += f"Years: {', '.join(selected_years)}\n"
                if selected_districts:
                    filter_context_str += f"Districts: {', '.join(selected_districts)}\n"
                filter_context_str += "USER QUERY:\n"
            full_query = filter_context_str + user_input

            # Add user message to history (without the filter preamble)
            st.session_state.messages.append(HumanMessage(content=user_input))

            with st.spinner("🤔 Thinking..."):
                try:
                    # Pass the full query with filter context
                    chat_result = st.session_state.chatbot.chat(full_query, st.session_state.conversation_id)
                    # Handle both old format (string) and new format (dict)
                    if isinstance(chat_result, dict):
                        response = chat_result['response']
                        rag_result = chat_result.get('rag_result')
                        st.session_state.last_rag_result = rag_result
                        # Track RAG retrieval for feedback
                        if rag_result:
                            sources = (rag_result.get('sources', [])
                                       if isinstance(rag_result, dict)
                                       else (rag_result.sources if hasattr(rag_result, 'sources') else []))
                            # Get the actual RAG query
                            actual_rag_query = chat_result.get('actual_rag_query', '')
                            if actual_rag_query:
                                # Format it like the log message
                                timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                                formatted_query = f"{timestamp} - INFO - 🔍 ACTUAL RAG QUERY: '{actual_rag_query}'"
                            else:
                                formatted_query = "No RAG query available"
                            st.session_state.rag_retrieval_history.append({
                                "conversation_up_to": serialize_messages(st.session_state.messages),
                                "rag_query_expansion": formatted_query,
                                "docs_retrieved": serialize_documents(sources)
                            })
                    else:
                        response = chat_result
                        st.session_state.last_rag_result = None

                    # Add bot response to history
                    st.session_state.messages.append(AIMessage(content=response))
                except Exception as e:
                    error_msg = f"Sorry, I encountered an error: {str(e)}"
                    st.session_state.messages.append(AIMessage(content=error_msg))

            # Clear input (via the key counter) and rerun
            st.session_state.input_counter += 1
            st.rerun()

    with tab2:
        # Document retrieval panel for the most recent query
        if hasattr(st.session_state, 'last_rag_result') and st.session_state.last_rag_result:
            rag_result = st.session_state.last_rag_result
            # Handle both PipelineResult object and dictionary formats
            sources = None
            if hasattr(rag_result, 'sources'):
                # PipelineResult object format
                sources = rag_result.sources
            elif isinstance(rag_result, dict) and 'sources' in rag_result:
                # Dictionary format from multi-agent system
                sources = rag_result['sources']

            if sources and len(sources) > 0:
                # Count unique filenames (a document yields multiple chunks)
                unique_filenames = set()
                for doc in sources:
                    unique_filenames.add(getattr(doc, 'metadata', {}).get('filename', 'Unknown'))

                st.markdown(f"**Found {len(sources)} document chunks from {len(unique_filenames)} unique documents (showing top 10):**")
                if len(unique_filenames) < len(sources):
                    st.info(f"💡 **Note**: Each document is split into multiple chunks. You're seeing {len(sources)} chunks from {len(unique_filenames)} documents.")

                for i, doc in enumerate(sources[:10]):  # Show top 10
                    # Hoisted: the original fetched metadata twice per doc.
                    metadata = getattr(doc, 'metadata', {})
                    score = metadata.get('reranked_score', metadata.get('original_score', None))
                    chunk_id = metadata.get('_id', 'Unknown')
                    score_text = (f" (Score: {score:.3f}, ID: {chunk_id[:8]}...)"
                                  if score is not None else f" (ID: {chunk_id[:8]}...)")
                    with st.expander(f"📄 Document {i+1}: {metadata.get('filename', 'Unknown')[:50]}...{score_text}"):
                        # Display document metadata with emojis
                        col1, col2, col3, col4 = st.columns([2, 1.5, 1, 1])
                        with col1:
                            st.write(f"📄 **File:** {metadata.get('filename', 'Unknown')}")
                        with col2:
                            st.write(f"🏛️ **Source:** {metadata.get('source', 'Unknown')}")
                        with col3:
                            st.write(f"📅 **Year:** {metadata.get('year', 'Unknown')}")
                        with col4:
                            # Display page number and chunk ID
                            page = metadata.get('page_label', metadata.get('page', 'Unknown'))
                            st.write(f"📖 **Page:** {page}")
                            st.write(f"🆔 **ID:** {chunk_id}")
                        # Display full content (no truncation)
                        content = getattr(doc, 'page_content', 'No content available')
                        st.write("**Full Content:**")
                        st.text_area("Full Content", value=content, height=300, disabled=True,
                                     label_visibility="collapsed", key=f"preview_{i}")
            else:
                st.info("No documents were retrieved for the last query.")
        else:
            st.info("No documents have been retrieved yet. Start a conversation to see retrieved documents here.")

    # ---- Feedback Dashboard Section ----
    # NOTE(review): placement relative to the tabs is reconstructed; the
    # original indentation was lost — confirm it belongs at main level.
    st.markdown("---")
    st.markdown("### 💬 Feedback Dashboard")

    # Check if there's any conversation to provide feedback on
    has_conversation = len(st.session_state.messages) > 0
    has_retrievals = len(st.session_state.rag_retrieval_history) > 0

    if not has_conversation:
        st.info("💡 Start a conversation to provide feedback!")
        st.markdown("The feedback dashboard will be enabled once you begin chatting.")
    else:
        st.markdown("Help us improve by providing feedback on this conversation.")

        # Initialize feedback state if not exists
        if 'feedback_submitted' not in st.session_state:
            st.session_state.feedback_submitted = False

        # Feedback form
        with st.form("feedback_form", clear_on_submit=False):
            col1, col2 = st.columns([1, 1])
            with col1:
                feedback_score = st.slider(
                    "Rate this conversation (1-5)",
                    min_value=1,
                    max_value=5,
                    help="How satisfied are you with the conversation?"
                )
            with col2:
                is_feedback_about_last_retrieval = st.checkbox(
                    "Feedback about last retrieval only",
                    value=True,
                    help="If checked, feedback applies to the most recent document retrieval"
                )
            open_ended_feedback = st.text_area(
                "Your feedback (optional)",
                placeholder="Tell us what went well or what could be improved...",
                height=100
            )
            # st.slider always returns a value, so this is effectively never
            # disabled; kept for parity with the original logic.
            submit_disabled = feedback_score is None
            submitted = st.form_submit_button(
                "📤 Submit Feedback",
                use_container_width=True,
                disabled=submit_disabled
            )

        if submitted and not st.session_state.feedback_submitted:
            # Log the feedback data being submitted
            print("=" * 80)
            print("🔄 FEEDBACK SUBMISSION: Starting...")
            print("=" * 80)
            st.write("🔍 **Debug: Feedback Data Being Submitted:**")

            # Create feedback data dictionary
            feedback_dict = {
                "open_ended_feedback": open_ended_feedback,
                "score": feedback_score,
                "is_feedback_about_last_retrieval": is_feedback_about_last_retrieval,
                "retrieved_data": st.session_state.rag_retrieval_history.copy() if st.session_state.rag_retrieval_history else [],
                "conversation_id": st.session_state.conversation_id,
                "timestamp": time.time(),
                "message_count": len(st.session_state.messages),
                "has_retrievals": has_retrievals,
                "retrieval_count": len(st.session_state.rag_retrieval_history)
            }
            print(f"📝 FEEDBACK SUBMISSION: Score={feedback_score}, Retrievals={len(st.session_state.rag_retrieval_history) if st.session_state.rag_retrieval_history else 0}")

            # Create UserFeedback dataclass instance
            feedback_obj = None  # initialized outside try so later code can test it
            try:
                feedback_obj = create_feedback_from_dict(feedback_dict)
                print(f"✅ FEEDBACK SUBMISSION: Feedback object created - ID={feedback_obj.feedback_id}")
                st.write("✅ **Feedback Object Created**")
                st.write(f"- Feedback ID: {feedback_obj.feedback_id}")
                st.write(f"- Score: {feedback_obj.score}/5")
                st.write(f"- Has Retrievals: {feedback_obj.has_retrievals}")
                # Convert back to dict for JSON serialization
                feedback_data = feedback_obj.to_dict()
            except Exception as e:
                print(f"❌ FEEDBACK SUBMISSION: Failed to create feedback object: {e}")
                st.error(f"Failed to create feedback object: {e}")
                feedback_data = feedback_dict

            # Display the data being submitted
            st.json(feedback_data)

            # Save feedback to file - absolute path in /app to ensure writability
            feedback_dir = Path("/app/feedback")
            try:
                # 777 for container-compatibility (matches original behavior)
                feedback_dir.mkdir(parents=True, mode=0o777, exist_ok=True)
            except OSError as e:
                # PermissionError is a subclass of OSError
                logger.warning(f"Could not create feedback directory at {feedback_dir}: {e}")
                # Fallback to relative path
                feedback_dir = Path("feedback")
                feedback_dir.mkdir(parents=True, mode=0o777, exist_ok=True)

            feedback_file = feedback_dir / f"feedback_{st.session_state.conversation_id}_{int(time.time())}.json"
            try:
                # Ensure parent directory exists before writing
                feedback_file.parent.mkdir(parents=True, mode=0o777, exist_ok=True)
                print(f"💾 FEEDBACK SAVE: Saving to local file: {feedback_file}")
                with open(feedback_file, 'w') as f:
                    json.dump(feedback_data, f, indent=2, default=str)
                print("✅ FEEDBACK SAVE: Local file saved successfully")
                st.success("✅ Thank you for your feedback! It has been saved locally.")
                st.balloons()

                # Save to Snowflake if enabled and credentials available
                logger.info("🔄 FEEDBACK SAVE: Starting Snowflake save process...")
                logger.info(f"📊 FEEDBACK SAVE: feedback_obj={'exists' if feedback_obj else 'None'}")
                try:
                    # (os is already imported at module level)
                    snowflake_enabled = os.getenv("SNOWFLAKE_ENABLED", "false").lower() == "true"
                    logger.info(f"🔍 SNOWFLAKE CHECK: enabled={snowflake_enabled}")
                    if snowflake_enabled:
                        if feedback_obj:
                            try:
                                from src.reporting.snowflake_connector import save_to_snowflake
                                logger.info("📤 SNOWFLAKE UI: Attempting to save feedback to Snowflake...")
                                print("📤 SNOWFLAKE UI: Attempting to save feedback to Snowflake...")
                                if save_to_snowflake(feedback_obj):
                                    logger.info("✅ SNOWFLAKE UI: Successfully saved to Snowflake")
                                    print("✅ SNOWFLAKE UI: Successfully saved to Snowflake")
                                    st.success("✅ Feedback also saved to Snowflake!")
                                else:
                                    logger.warning("⚠️ SNOWFLAKE UI: Save failed")
                                    print("⚠️ SNOWFLAKE UI: Save failed")
                                    st.warning("⚠️ Snowflake save failed, but local save succeeded")
                            except Exception as e:
                                logger.error(f"❌ SNOWFLAKE UI ERROR: {e}")
                                print(f"❌ SNOWFLAKE UI ERROR: {e}")
                                import traceback
                                traceback.print_exc()
                                st.warning(f"⚠️ Could not save to Snowflake: {e}")
                        else:
                            logger.warning("⚠️ SNOWFLAKE UI: Skipping (feedback object not created)")
                            print("⚠️ SNOWFLAKE UI: Skipping (feedback object not created)")
                            st.warning("⚠️ Skipping Snowflake save (feedback object not created)")
                    else:
                        logger.info("💡 SNOWFLAKE UI: Integration disabled")
                        print("💡 SNOWFLAKE UI: Integration disabled")
                        st.info("💡 Snowflake integration disabled (set SNOWFLAKE_ENABLED=true to enable)")
                except NameError as e:
                    import traceback
                    traceback.print_exc()
                    logger.error(f"❌ NameError in Snowflake save: {e}")
                    print(f"❌ NameError in Snowflake save: {e}")
                    st.warning(f"⚠️ Snowflake save error: {e}")
                except Exception as e:
                    logger.error(f"❌ Exception in Snowflake save: {type(e).__name__}: {e}")
                    print(f"❌ Exception in Snowflake save: {type(e).__name__}: {e}")
                    st.warning(f"⚠️ Snowflake save error: {e}")

                # Mark feedback as submitted to prevent resubmission
                st.session_state.feedback_submitted = True
                print("=" * 80)
                print("✅ FEEDBACK SUBMISSION: Completed successfully")
                print("=" * 80)
                # Log file location
                st.info(f"📁 Feedback saved to: {feedback_file}")
            except Exception as e:
                print(f"❌ FEEDBACK SUBMISSION: Error saving feedback: {e}")
                print(f"❌ FEEDBACK SUBMISSION: Error type: {type(e).__name__}")
                import traceback
                traceback.print_exc()
                st.error(f"❌ Error saving feedback: {e}")
                st.write(f"Debug error: {str(e)}")
        elif st.session_state.feedback_submitted:
            st.success("✅ Feedback already submitted for this conversation!")
            if st.button("🔄 Submit New Feedback", key="new_feedback_button"):
                st.session_state.feedback_submitted = False
                st.rerun()

    # ---- Retrieval history stats ----
    if st.session_state.rag_retrieval_history:
        st.markdown("---")
        st.markdown("#### 📊 Retrieval History")
        with st.expander(f"View {len(st.session_state.rag_retrieval_history)} retrieval entries", expanded=False):
            for idx, entry in enumerate(st.session_state.rag_retrieval_history, 1):
                st.markdown(f"**Retrieval #{idx}**")
                # Display the actual RAG query
                rag_query_expansion = entry.get("rag_query_expansion", "No query available")
                st.code(rag_query_expansion, language="text")
                # Display summary stats
                st.json({
                    "conversation_length": len(entry.get("conversation_up_to", [])),
                    "documents_retrieved": len(entry.get("docs_retrieved", []))
                })
                st.markdown("---")

    # Auto-scroll to bottom
    # NOTE(review): original script/markup content lost in the mangled source.
    st.markdown(""" """, unsafe_allow_html=True)


if __name__ == "__main__":
    main()