Ara Yeroyan commited on
Commit
b4984e2
Β·
2 Parent(s): 264ca84 69de8d2

Merge branch 'main' of https://huggingface.co/spaces/akryldigital/audit_assistant

Browse files
app.py CHANGED
@@ -10,18 +10,19 @@ import uuid
10
  import logging
11
  import traceback
12
  from pathlib import Path
13
- from typing import List, Dict, Any
14
  from collections import Counter
 
 
15
 
16
- import streamlit as st
17
- from langchain_core.messages import HumanMessage, AIMessage
18
  import pandas as pd
 
19
  import plotly.express as px
 
20
 
21
  from multi_agent_chatbot import get_multi_agent_chatbot
22
  from smart_chatbot import get_chatbot as get_smart_chatbot
23
- from src.reporting.feedback_schema import create_feedback_from_dict
24
  from src.reporting.snowflake_connector import save_to_snowflake
 
25
  from src.config.paths import (
26
  IS_DEPLOYED,
27
  PROJECT_DIR,
@@ -98,6 +99,8 @@ st.markdown("""
98
  color: #1f77b4;
99
  text-align: center;
100
  margin-bottom: 1rem;
 
 
101
  }
102
 
103
  .subtitle {
@@ -105,54 +108,8 @@ st.markdown("""
105
  color: #666;
106
  text-align: center;
107
  margin-bottom: 2rem;
108
- }
109
-
110
- .example-questions-header {
111
- text-align: center;
112
- margin-bottom: 1rem;
113
- }
114
-
115
- .example-questions-description {
116
- text-align: center;
117
- color: #666;
118
- margin-bottom: 2rem;
119
- }
120
-
121
- /* Hide ALL default Streamlit text input help messages about Enter key */
122
- /* This is the key one - hides "Press Enter to apply" message inside input field */
123
- div[data-testid="InputInstructions"],
124
- span[data-testid="InputInstructions"],
125
- *[data-testid="InputInstructions"] {
126
- display: none !important;
127
- visibility: hidden !important;
128
- opacity: 0 !important;
129
- height: 0 !important;
130
- width: 0 !important;
131
- overflow: hidden !important;
132
- position: absolute !important;
133
- left: -9999px !important;
134
- }
135
-
136
- /* Also hide other potential locations */
137
- div[data-testid="stTextInput"] + div > small,
138
- div[data-testid="stTextInput"] ~ div > small,
139
- div[data-testid="stTextInputContainer"] + div > small,
140
- div[data-testid="stTextInputContainer"] ~ div > small,
141
- div[data-baseweb="input"] + div > small,
142
- div[data-baseweb="input"] ~ div > small {
143
- display: none !important;
144
- visibility: hidden !important;
145
- opacity: 0 !important;
146
- height: 0 !important;
147
- overflow: hidden !important;
148
- }
149
-
150
- /* Custom help text for input */
151
- .input-help-text {
152
- font-size: 0.85rem;
153
- color: #666;
154
- margin-top: 0.25rem;
155
- text-align: left;
156
  }
157
 
158
  .session-info {
@@ -304,6 +261,114 @@ def serialize_documents(sources):
304
 
305
  return serialized
306
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  def extract_chunk_statistics(sources: List[Any]) -> Dict[str, Any]:
308
  """Extract statistics from retrieved chunks."""
309
  if not sources:
@@ -483,11 +548,10 @@ def display_chunk_statistics_table(stats: Dict[str, Any], title: str = "Retrieva
483
  return
484
 
485
  # Wrap in styled container
486
- <<<<<<< HEAD
487
  st.markdown('<div class="retrieval-distribution-container">', unsafe_allow_html=True)
488
- =======
489
  # st.markdown('<div class="retrieval-distribution-container">', unsafe_allow_html=True)
490
- >>>>>>> 21eb407535b7e67a7dc3ea192c84831c0ae680d3
491
 
492
  st.subheader(f"πŸ“Š {title}")
493
 
@@ -604,7 +668,7 @@ def main():
604
  st.session_state.reset_conversation = False
605
  st.rerun()
606
 
607
- # Header - centered
608
  st.markdown('<h1 class="main-header">πŸ€– Intelligent Audit Report Chatbot</h1>', unsafe_allow_html=True)
609
  st.markdown('<p class="subtitle">Ask questions about audit reports. Use the sidebar filters to narrow down your search!</p>', unsafe_allow_html=True)
610
 
@@ -631,21 +695,15 @@ def main():
631
 
632
  2. **Leave filters empty** to search across all data
633
 
634
- 3. **Type your question** in the chat and click "Send"
635
 
636
- 4. **Choose sample questions from the bottom of the page**
637
 
638
  #### πŸ’‘ Tips
639
 
640
  - Use specific questions for better results
641
  - Combine multiple filters for precise searches
642
- - Check the "Retrieved Documents" tab to get various insights
643
-
644
- #### πŸ’¬ Feedback Section
645
-
646
- - Rate your experience (1-5 stars)
647
- - Provide optional text feedback
648
- - Located at the bottom of the page
649
 
650
  #### ⚠️ Important
651
 
@@ -670,13 +728,11 @@ def main():
670
  help="Choose specific reports to search. When enabled, all other filters are ignored."
671
  )
672
  st.markdown('</div>', unsafe_allow_html=True)
673
-
674
- st.markdown('---')
675
 
676
  # Determine if filename filter is active
677
  filename_mode = len(selected_filenames) > 0
678
  # Sources filter
679
- # st.markdown('<div class="filter-section">', unsafe_allow_html=True)
680
  st.markdown('<div class="filter-title">πŸ“Š Sources</div>', unsafe_allow_html=True)
681
  selected_sources = st.multiselect(
682
  "Select sources:",
@@ -771,67 +827,6 @@ def main():
771
  label_visibility="collapsed",
772
  value=default_value if default_value else None
773
  )
774
-
775
- # Use JavaScript to specifically target and hide "Press Enter to apply" message
776
- st.markdown("""
777
- <script>
778
- (function() {
779
- // Hide InputInstructions element (contains "Press Enter to apply")
780
- function hideInputInstructions() {
781
- // Target the specific Streamlit element
782
- const instructions = document.querySelector('[data-testid="InputInstructions"]');
783
- if (instructions) {
784
- instructions.style.display = 'none';
785
- instructions.style.visibility = 'hidden';
786
- instructions.style.opacity = '0';
787
- instructions.style.height = '0';
788
- instructions.style.width = '0';
789
- instructions.style.overflow = 'hidden';
790
- instructions.style.position = 'absolute';
791
- instructions.style.left = '-9999px';
792
- }
793
-
794
- // Also search for any text containing "Press Enter" or "apply" inside input containers
795
- const allElements = document.querySelectorAll('*');
796
- allElements.forEach(el => {
797
- const text = el.textContent || el.innerText || '';
798
- if ((text.toLowerCase().includes('press enter') ||
799
- text.toLowerCase().includes('enter to') ||
800
- text.toLowerCase().includes('to apply')) &&
801
- (el.tagName === 'SPAN' || el.tagName === 'DIV' || el.tagName === 'SMALL')) {
802
- const style = window.getComputedStyle(el);
803
- const fontSize = parseFloat(style.fontSize);
804
- // Hide if it's small text (likely help text)
805
- if (fontSize < 14 || el.hasAttribute('data-testid')) {
806
- el.style.display = 'none';
807
- el.style.visibility = 'hidden';
808
- el.style.height = '0';
809
- el.style.overflow = 'hidden';
810
- }
811
- }
812
- });
813
- }
814
-
815
- // Run immediately and after delays to catch dynamic elements
816
- hideInputInstructions();
817
- setTimeout(hideInputInstructions, 50);
818
- setTimeout(hideInputInstructions, 100);
819
- setTimeout(hideInputInstructions, 500);
820
-
821
- // Observe for new elements added by Streamlit
822
- const observer = new MutationObserver(function(mutations) {
823
- hideInputInstructions();
824
- });
825
- observer.observe(document.body, { childList: true, subtree: true, attributes: true });
826
- })();
827
- </script>
828
- """, unsafe_allow_html=True)
829
-
830
- # # Show custom help text below input - this replaces the default "Press Enter" message
831
- # st.markdown(
832
- # "<div class='input-help-text'>πŸ’‘ Press the <strong>Send</strong> button to submit your question</div>",
833
- # unsafe_allow_html=True
834
- # )
835
 
836
  with col2:
837
  send_button = st.button("Send", key="send_button", use_container_width=True)
@@ -934,8 +929,7 @@ def main():
934
  # Count unique filenames
935
  unique_filenames = set()
936
  for doc in sources:
937
- metadata = getattr(doc, 'metadata', {})
938
- filename = metadata.get('filename', 'Unknown')
939
  unique_filenames.add(filename)
940
 
941
  st.markdown(f"**Found {len(sources)} document chunks from {len(unique_filenames)} unique documents (showing top 20):**")
@@ -990,44 +984,6 @@ def main():
990
  st.info("No documents were retrieved for the last query.")
991
  else:
992
  st.info("No documents have been retrieved yet. Start a conversation to see retrieved documents here.")
993
-
994
- # Display retrieval history stats
995
- st.markdown("---")
996
- if st.session_state.rag_retrieval_history:
997
- st.markdown("#### πŸ“Š Retrieval History")
998
- st.markdown(f"This conversation has **{len(st.session_state.rag_retrieval_history)}** retrieval entries.")
999
-
1000
- with st.expander(f"View {len(st.session_state.rag_retrieval_history)} retrieval entries", expanded=False):
1001
- for idx, entry in enumerate(st.session_state.rag_retrieval_history, 1):
1002
- with st.expander(f"Entry {idx}: {entry.get('rag_query_expansion', 'N/A')[:50]}...", expanded=False):
1003
- st.markdown(f"**Query:** {entry.get('rag_query_expansion', 'N/A')}")
1004
- st.markdown(f"**Documents Retrieved:** {len(entry.get('docs_retrieved', []))}")
1005
-
1006
- # Show conversation up to this point
1007
- conversation = entry.get('conversation_up_to', [])
1008
- if conversation:
1009
- st.markdown("**Conversation Context:**")
1010
- for msg in conversation[-3:]: # Show last 3 messages
1011
- role = msg.get('type', 'unknown')
1012
- content = msg.get('content', '')[:200] + "..." if len(msg.get('content', '')) > 200 else msg.get('content', '')
1013
- if role == 'human':
1014
- st.markdown(f"- **You:** {content}")
1015
- elif role == 'ai':
1016
- st.markdown(f"- **Bot:** {content}")
1017
-
1018
- # Show retrieved documents summary
1019
- docs = entry.get('docs_retrieved', [])
1020
- if docs:
1021
- st.markdown("**Retrieved Documents:**")
1022
- for doc_idx, doc in enumerate(docs[:5], 1): # Show first 5
1023
- doc_meta = doc.get('metadata', {})
1024
- filename = doc_meta.get('filename', 'Unknown')[:50]
1025
- st.markdown(f"{doc_idx}. {filename}")
1026
- if len(docs) > 5:
1027
- st.markdown(f"... and {len(docs) - 5} more documents")
1028
- else:
1029
- st.markdown("---")
1030
- st.info("πŸ“Š Retrieval history will appear here after you start asking questions.")
1031
 
1032
  # Feedback Dashboard Section
1033
  st.markdown("---")
@@ -1089,17 +1045,39 @@ def main():
1089
  print("=" * 80)
1090
  st.write("πŸ” **Debug: Feedback Data Being Submitted:**")
1091
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1092
  # Create feedback data dictionary
1093
  feedback_dict = {
1094
  "open_ended_feedback": open_ended_feedback,
1095
  "score": feedback_score,
1096
  "is_feedback_about_last_retrieval": is_feedback_about_last_retrieval,
1097
- "retrieved_data": st.session_state.rag_retrieval_history.copy() if st.session_state.rag_retrieval_history else [],
1098
  "conversation_id": st.session_state.conversation_id,
1099
  "timestamp": time.time(),
1100
  "message_count": len(st.session_state.messages),
1101
  "has_retrievals": has_retrievals,
1102
- "retrieval_count": len(st.session_state.rag_retrieval_history)
 
 
 
 
1103
  }
1104
 
1105
  print(f"πŸ“ FEEDBACK SUBMISSION: Score={feedback_score}, Retrievals={len(st.session_state.rag_retrieval_history) if st.session_state.rag_retrieval_history else 0}")
@@ -1141,19 +1119,18 @@ def main():
1141
  # Ensure parent directory exists before writing
1142
  feedback_file.parent.mkdir(parents=True, mode=0o777, exist_ok=True)
1143
 
1144
- # Save to local file
1145
  print(f"πŸ’Ύ FEEDBACK SAVE: Saving to local file: {feedback_file}")
1146
  with open(feedback_file, 'w') as f:
1147
  json.dump(feedback_data, f, indent=2, default=str)
1148
 
1149
  print(f"βœ… FEEDBACK SAVE: Local file saved successfully")
1150
- st.success("βœ… Thank you for your feedback! It has been saved locally.")
1151
- st.balloons()
1152
 
1153
  # Save to Snowflake if enabled and credentials available
1154
  logger.info("πŸ”„ FEEDBACK SAVE: Starting Snowflake save process...")
1155
  logger.info(f"πŸ“Š FEEDBACK SAVE: feedback_obj={'exists' if feedback_obj else 'None'}")
1156
 
 
1157
  try:
1158
  snowflake_enabled = os.getenv("SNOWFLAKE_ENABLED", "false").lower() == "true"
1159
  logger.info(f"πŸ” SNOWFLAKE CHECK: enabled={snowflake_enabled}")
@@ -1164,36 +1141,39 @@ def main():
1164
  logger.info("πŸ“€ SNOWFLAKE UI: Attempting to save feedback to Snowflake...")
1165
  print("πŸ“€ SNOWFLAKE UI: Attempting to save feedback to Snowflake...")
1166
 
1167
- if save_to_snowflake(feedback_obj):
 
1168
  logger.info("βœ… SNOWFLAKE UI: Successfully saved to Snowflake")
1169
  print("βœ… SNOWFLAKE UI: Successfully saved to Snowflake")
1170
- st.success("βœ… Feedback also saved to Snowflake!")
1171
  else:
1172
  logger.warning("⚠️ SNOWFLAKE UI: Save failed")
1173
  print("⚠️ SNOWFLAKE UI: Save failed")
1174
- st.warning("⚠️ Snowflake save failed, but local save succeeded")
1175
  except Exception as e:
1176
  logger.error(f"❌ SNOWFLAKE UI ERROR: {e}")
1177
  print(f"❌ SNOWFLAKE UI ERROR: {e}")
1178
  traceback.print_exc()
1179
- st.warning(f"⚠️ Could not save to Snowflake: {e}")
1180
  else:
1181
  logger.warning("⚠️ SNOWFLAKE UI: Skipping (feedback object not created)")
1182
  print("⚠️ SNOWFLAKE UI: Skipping (feedback object not created)")
1183
- st.warning("⚠️ Skipping Snowflake save (feedback object not created)")
1184
  else:
1185
  logger.info("πŸ’‘ SNOWFLAKE UI: Integration disabled")
1186
  print("πŸ’‘ SNOWFLAKE UI: Integration disabled")
1187
- st.info("πŸ’‘ Snowflake integration disabled (set SNOWFLAKE_ENABLED=true to enable)")
1188
- except NameError as e:
1189
- traceback.print_exc()
1190
- logger.error(f"❌ NameError in Snowflake save: {e}")
1191
- print(f"❌ NameError in Snowflake save: {e}")
1192
- st.warning(f"⚠️ Snowflake save error: {e}")
1193
  except Exception as e:
1194
  logger.error(f"❌ Exception in Snowflake save: {type(e).__name__}: {e}")
1195
  print(f"❌ Exception in Snowflake save: {type(e).__name__}: {e}")
1196
- st.warning(f"⚠️ Snowflake save error: {e}")
 
 
 
 
 
 
 
1197
 
1198
  # Mark feedback as submitted to prevent resubmission
1199
  st.session_state.feedback_submitted = True
@@ -1229,16 +1209,30 @@ def main():
1229
  # Scroll to conversation - this is handled by the auto-scroll at bottom
1230
  pass
1231
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1232
  # Example Questions Section
1233
  st.markdown("---")
1234
- st.markdown(
1235
- "<h3 class='example-questions-header'>πŸ’‘ Example Questions</h3>",
1236
- unsafe_allow_html=True
1237
- )
1238
- st.markdown(
1239
- "<p class='example-questions-description'>Click on any question below to use it, or modify the editable examples:</p>",
1240
- unsafe_allow_html=True
1241
- )
1242
 
1243
  # Initialize example question state
1244
  if 'custom_question_1' not in st.session_state:
@@ -1261,53 +1255,56 @@ def main():
1261
 
1262
  st.markdown("---")
1263
 
1264
- # Questions 2 & 3: Editable examples (collapsible, side by side)
1265
- with st.expander("#### ✏️ Customizable Questions (Edit and use)", expanded=False):
1266
- # Place questions side by side
1267
- col1, col2 = st.columns(2)
1268
-
1269
- # Question 2
1270
- with col1:
1271
- st.markdown("**Question 2:**")
1272
- custom_q1 = st.text_area(
1273
- "Edit question 2:",
1274
- value=st.session_state.custom_question_1,
1275
- height=100,
1276
- key="edit_question_2",
1277
- help="Modify this question to fit your needs, then click 'Use This Question'",
1278
- label_visibility="collapsed"
1279
- )
1280
- if st.button("πŸ“‹ Use Question 2", key="use_custom_1", use_container_width=True):
1281
- if custom_q1.strip():
1282
- st.session_state.pending_question = custom_q1.strip()
1283
- st.session_state.custom_question_1 = custom_q1.strip()
1284
- st.session_state.input_counter = (st.session_state.get('input_counter', 0) + 1) % 1000
1285
- st.rerun()
1286
- else:
1287
- st.warning("Please enter a question first!")
1288
- st.caption("πŸ’‘ Tip: Add specific details like dates, names, or amounts to get more precise answers")
1289
- st.info("πŸ’‘ **Filter to apply:** Select District(s) and Year(s) from sidebar panel")
1290
-
1291
- # Question 3
1292
- with col2:
1293
- st.markdown("**Question 3:**")
1294
- custom_q2 = st.text_area(
1295
- "Edit question 3:",
1296
- value=st.session_state.custom_question_2,
1297
- height=100,
1298
- key="edit_question_3",
1299
- help="Modify this question to fit your needs, then click 'Use This Question'",
1300
- label_visibility="collapsed"
1301
- )
1302
- if st.button("πŸ“‹ Use Question 3", key="use_custom_2", use_container_width=True):
1303
- if custom_q2.strip():
1304
- st.session_state.pending_question = custom_q2.strip()
1305
- st.session_state.custom_question_2 = custom_q2.strip()
1306
- st.session_state.input_counter = (st.session_state.get('input_counter', 0) + 1) % 1000
1307
- st.rerun()
1308
- else:
1309
- st.warning("Please enter a question first!")
1310
- st.caption("πŸ’‘ Tip: Use specific terms from the documents (e.g., 'PDM', 'SACCOs', 'FY 2022/23')")
 
 
 
1311
 
1312
 
1313
  # Store selected question for next render (handled in input section above)
@@ -1407,5 +1404,32 @@ def main():
1407
  </script>
1408
  """, unsafe_allow_html=True)
1409
 
 
1410
  if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1411
  main()
 
10
  import logging
11
  import traceback
12
  from pathlib import Path
 
13
  from collections import Counter
14
+ from typing import List, Dict, Any, Optional
15
+
16
 
 
 
17
  import pandas as pd
18
+ import streamlit as st
19
  import plotly.express as px
20
+ from langchain_core.messages import HumanMessage, AIMessage
21
 
22
  from multi_agent_chatbot import get_multi_agent_chatbot
23
  from smart_chatbot import get_chatbot as get_smart_chatbot
 
24
  from src.reporting.snowflake_connector import save_to_snowflake
25
+ from src.reporting.feedback_schema import create_feedback_from_dict
26
  from src.config.paths import (
27
  IS_DEPLOYED,
28
  PROJECT_DIR,
 
99
  color: #1f77b4;
100
  text-align: center;
101
  margin-bottom: 1rem;
102
+ width: 100%;
103
+ display: block;
104
  }
105
 
106
  .subtitle {
 
108
  color: #666;
109
  text-align: center;
110
  margin-bottom: 2rem;
111
+ width: 100%;
112
+ display: block;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  }
114
 
115
  .session-info {
 
261
 
262
  return serialized
263
 
264
+
265
+ def extract_transcript(messages: List[Any]) -> List[Dict[str, str]]:
266
+ """Extract transcript from messages - only user and bot messages, no extra metadata"""
267
+ transcript = []
268
+ for msg in messages:
269
+ if isinstance(msg, HumanMessage):
270
+ transcript.append({
271
+ "role": "user",
272
+ "content": str(msg.content) if hasattr(msg, 'content') else str(msg)
273
+ })
274
+ elif isinstance(msg, AIMessage):
275
+ transcript.append({
276
+ "role": "assistant",
277
+ "content": str(msg.content) if hasattr(msg, 'content') else str(msg)
278
+ })
279
+ return transcript
280
+
281
+
282
+ def build_retrievals_structure(rag_retrieval_history: List[Dict[str, Any]], messages: List[Any]) -> List[Dict[str, Any]]:
283
+ """Build retrievals structure from retrieval history"""
284
+ retrievals = []
285
+
286
+ for entry in rag_retrieval_history:
287
+ # Get the user message that triggered this retrieval
288
+ # The entry has conversation_up_to which includes messages up to that point
289
+ conversation_up_to = entry.get("conversation_up_to", [])
290
+
291
+ # Find the last user message in conversation_up_to (this is the trigger)
292
+ user_message_trigger = ""
293
+ for msg_dict in reversed(conversation_up_to):
294
+ if msg_dict.get("type") == "HumanMessage":
295
+ user_message_trigger = msg_dict.get("content", "")
296
+ break
297
+
298
+ # Fallback: if not found in conversation_up_to, get from actual messages
299
+ # This handles edge cases where conversation_up_to might be incomplete
300
+ if not user_message_trigger:
301
+ # Find which retrieval this is (0-indexed)
302
+ retrieval_idx = rag_retrieval_history.index(entry)
303
+ # The user message that triggered this retrieval is at position (retrieval_idx * 2)
304
+ # because each retrieval is preceded by: user message, bot response, user message, ...
305
+ # But we need to account for the fact that the first retrieval happens after the first user message
306
+ user_msgs = [msg for msg in messages if isinstance(msg, HumanMessage)]
307
+ if retrieval_idx < len(user_msgs):
308
+ user_message_trigger = str(user_msgs[retrieval_idx].content)
309
+ elif user_msgs:
310
+ # Fallback to last user message
311
+ user_message_trigger = str(user_msgs[-1].content)
312
+
313
+ # Get retrieved documents and truncate content to 100 chars
314
+ docs_retrieved = entry.get("docs_retrieved", [])
315
+ retrieved_docs = []
316
+ for doc in docs_retrieved:
317
+ doc_copy = doc.copy()
318
+ # Truncate content to 100 characters (keep all other fields)
319
+ if "content" in doc_copy:
320
+ doc_copy["content"] = doc_copy["content"][:100]
321
+ retrieved_docs.append(doc_copy)
322
+
323
+ retrievals.append({
324
+ "retrieved_docs": retrieved_docs,
325
+ "user_message_trigger": user_message_trigger
326
+ })
327
+
328
+ return retrievals
329
+
330
+
331
+ def build_feedback_score_related_retrieval_docs(
332
+ is_feedback_about_last_retrieval: bool,
333
+ messages: List[Any],
334
+ rag_retrieval_history: List[Dict[str, Any]]
335
+ ) -> Optional[Dict[str, Any]]:
336
+ """Build feedback_score_related_retrieval_docs structure"""
337
+ if not rag_retrieval_history:
338
+ return None
339
+
340
+ # Get the relevant retrieval entry
341
+ if is_feedback_about_last_retrieval:
342
+ relevant_entry = rag_retrieval_history[-1]
343
+ else:
344
+ # If feedback is about all retrievals, use the last one as default
345
+ relevant_entry = rag_retrieval_history[-1]
346
+
347
+ # Get conversation up to that point
348
+ conversation_up_to = relevant_entry.get("conversation_up_to", [])
349
+
350
+ # Convert to transcript format (role/content)
351
+ conversation_up_to_point = []
352
+ for msg_dict in conversation_up_to:
353
+ if msg_dict.get("type") == "HumanMessage":
354
+ conversation_up_to_point.append({
355
+ "role": "user",
356
+ "content": msg_dict.get("content", "")
357
+ })
358
+ elif msg_dict.get("type") == "AIMessage":
359
+ conversation_up_to_point.append({
360
+ "role": "assistant",
361
+ "content": msg_dict.get("content", "")
362
+ })
363
+
364
+ # Get retrieved docs with full content (not truncated)
365
+ retrieved_docs = relevant_entry.get("docs_retrieved", [])
366
+
367
+ return {
368
+ "conversation_up_to_point": conversation_up_to_point,
369
+ "retrieved_docs": retrieved_docs
370
+ }
371
+
372
  def extract_chunk_statistics(sources: List[Any]) -> Dict[str, Any]:
373
  """Extract statistics from retrieved chunks."""
374
  if not sources:
 
548
  return
549
 
550
  # Wrap in styled container
551
+
552
  st.markdown('<div class="retrieval-distribution-container">', unsafe_allow_html=True)
 
553
  # st.markdown('<div class="retrieval-distribution-container">', unsafe_allow_html=True)
554
+
555
 
556
  st.subheader(f"πŸ“Š {title}")
557
 
 
668
  st.session_state.reset_conversation = False
669
  st.rerun()
670
 
671
+ # Header - fully center aligned
672
  st.markdown('<h1 class="main-header">πŸ€– Intelligent Audit Report Chatbot</h1>', unsafe_allow_html=True)
673
  st.markdown('<p class="subtitle">Ask questions about audit reports. Use the sidebar filters to narrow down your search!</p>', unsafe_allow_html=True)
674
 
 
695
 
696
  2. **Leave filters empty** to search across all data
697
 
698
+ 3. **Type your question** in the chat input at the bottom
699
 
700
+ 4. **Click "Send"** to submit your question
701
 
702
  #### πŸ’‘ Tips
703
 
704
  - Use specific questions for better results
705
  - Combine multiple filters for precise searches
706
+ - Check the "Retrieved Documents" tab to see source material
 
 
 
 
 
 
707
 
708
  #### ⚠️ Important
709
 
 
728
  help="Choose specific reports to search. When enabled, all other filters are ignored."
729
  )
730
  st.markdown('</div>', unsafe_allow_html=True)
 
 
731
 
732
  # Determine if filename filter is active
733
  filename_mode = len(selected_filenames) > 0
734
  # Sources filter
735
+ st.markdown('<div class="filter-section">', unsafe_allow_html=True)
736
  st.markdown('<div class="filter-title">πŸ“Š Sources</div>', unsafe_allow_html=True)
737
  selected_sources = st.multiselect(
738
  "Select sources:",
 
827
  label_visibility="collapsed",
828
  value=default_value if default_value else None
829
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
830
 
831
  with col2:
832
  send_button = st.button("Send", key="send_button", use_container_width=True)
 
929
  # Count unique filenames
930
  unique_filenames = set()
931
  for doc in sources:
932
+ filename = getattr(doc, 'metadata', {}).get('filename', 'Unknown')
 
933
  unique_filenames.add(filename)
934
 
935
  st.markdown(f"**Found {len(sources)} document chunks from {len(unique_filenames)} unique documents (showing top 20):**")
 
984
  st.info("No documents were retrieved for the last query.")
985
  else:
986
  st.info("No documents have been retrieved yet. Start a conversation to see retrieved documents here.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
987
 
988
  # Feedback Dashboard Section
989
  st.markdown("---")
 
1045
  print("=" * 80)
1046
  st.write("πŸ” **Debug: Feedback Data Being Submitted:**")
1047
 
1048
+ # Extract transcript from messages
1049
+ transcript = extract_transcript(st.session_state.messages)
1050
+
1051
+ # Build retrievals structure
1052
+ retrievals = build_retrievals_structure(
1053
+ st.session_state.rag_retrieval_history.copy() if st.session_state.rag_retrieval_history else [],
1054
+ st.session_state.messages
1055
+ )
1056
+
1057
+ # Build feedback_score_related_retrieval_docs
1058
+ feedback_score_related_retrieval_docs = build_feedback_score_related_retrieval_docs(
1059
+ is_feedback_about_last_retrieval,
1060
+ st.session_state.messages,
1061
+ st.session_state.rag_retrieval_history.copy() if st.session_state.rag_retrieval_history else []
1062
+ )
1063
+
1064
+ # Preserve old retrieved_data format for backward compatibility
1065
+ retrieved_data_old_format = st.session_state.rag_retrieval_history.copy() if st.session_state.rag_retrieval_history else []
1066
+
1067
  # Create feedback data dictionary
1068
  feedback_dict = {
1069
  "open_ended_feedback": open_ended_feedback,
1070
  "score": feedback_score,
1071
  "is_feedback_about_last_retrieval": is_feedback_about_last_retrieval,
 
1072
  "conversation_id": st.session_state.conversation_id,
1073
  "timestamp": time.time(),
1074
  "message_count": len(st.session_state.messages),
1075
  "has_retrievals": has_retrievals,
1076
+ "retrieval_count": len(st.session_state.rag_retrieval_history) if st.session_state.rag_retrieval_history else 0,
1077
+ "transcript": transcript,
1078
+ "retrievals": retrievals,
1079
+ "feedback_score_related_retrieval_docs": feedback_score_related_retrieval_docs,
1080
+ "retrieved_data": retrieved_data_old_format # Preserved old column
1081
  }
1082
 
1083
  print(f"πŸ“ FEEDBACK SUBMISSION: Score={feedback_score}, Retrievals={len(st.session_state.rag_retrieval_history) if st.session_state.rag_retrieval_history else 0}")
 
1119
  # Ensure parent directory exists before writing
1120
  feedback_file.parent.mkdir(parents=True, mode=0o777, exist_ok=True)
1121
 
1122
+ # Save to local file first
1123
  print(f"πŸ’Ύ FEEDBACK SAVE: Saving to local file: {feedback_file}")
1124
  with open(feedback_file, 'w') as f:
1125
  json.dump(feedback_data, f, indent=2, default=str)
1126
 
1127
  print(f"βœ… FEEDBACK SAVE: Local file saved successfully")
 
 
1128
 
1129
  # Save to Snowflake if enabled and credentials available
1130
  logger.info("πŸ”„ FEEDBACK SAVE: Starting Snowflake save process...")
1131
  logger.info(f"πŸ“Š FEEDBACK SAVE: feedback_obj={'exists' if feedback_obj else 'None'}")
1132
 
1133
+ snowflake_success = False
1134
  try:
1135
  snowflake_enabled = os.getenv("SNOWFLAKE_ENABLED", "false").lower() == "true"
1136
  logger.info(f"πŸ” SNOWFLAKE CHECK: enabled={snowflake_enabled}")
 
1141
  logger.info("πŸ“€ SNOWFLAKE UI: Attempting to save feedback to Snowflake...")
1142
  print("πŸ“€ SNOWFLAKE UI: Attempting to save feedback to Snowflake...")
1143
 
1144
+ snowflake_success = save_to_snowflake(feedback_obj)
1145
+ if snowflake_success:
1146
  logger.info("βœ… SNOWFLAKE UI: Successfully saved to Snowflake")
1147
  print("βœ… SNOWFLAKE UI: Successfully saved to Snowflake")
 
1148
  else:
1149
  logger.warning("⚠️ SNOWFLAKE UI: Save failed")
1150
  print("⚠️ SNOWFLAKE UI: Save failed")
 
1151
  except Exception as e:
1152
  logger.error(f"❌ SNOWFLAKE UI ERROR: {e}")
1153
  print(f"❌ SNOWFLAKE UI ERROR: {e}")
1154
  traceback.print_exc()
1155
+ snowflake_success = False
1156
  else:
1157
  logger.warning("⚠️ SNOWFLAKE UI: Skipping (feedback object not created)")
1158
  print("⚠️ SNOWFLAKE UI: Skipping (feedback object not created)")
1159
+ snowflake_success = False
1160
  else:
1161
  logger.info("πŸ’‘ SNOWFLAKE UI: Integration disabled")
1162
  print("πŸ’‘ SNOWFLAKE UI: Integration disabled")
1163
+ # If Snowflake is disabled, consider it successful (local save only)
1164
+ snowflake_success = True
1165
+
 
 
 
1166
  except Exception as e:
1167
  logger.error(f"❌ Exception in Snowflake save: {type(e).__name__}: {e}")
1168
  print(f"❌ Exception in Snowflake save: {type(e).__name__}: {e}")
1169
+ snowflake_success = False
1170
+
1171
+ # Only show success if Snowflake save succeeded (or if Snowflake is disabled)
1172
+ if snowflake_success:
1173
+ st.success("βœ… Thank you for your feedback! It has been saved successfully.")
1174
+ st.balloons()
1175
+ else:
1176
+ st.warning("⚠️ Feedback saved locally, but Snowflake save failed. Please check logs.")
1177
 
1178
  # Mark feedback as submitted to prevent resubmission
1179
  st.session_state.feedback_submitted = True
 
1209
  # Scroll to conversation - this is handled by the auto-scroll at bottom
1210
  pass
1211
 
1212
+ # Display retrieval history stats
1213
+ if st.session_state.rag_retrieval_history:
1214
+ st.markdown("---")
1215
+ st.markdown("#### πŸ“Š Retrieval History")
1216
+
1217
+ with st.expander(f"View {len(st.session_state.rag_retrieval_history)} retrieval entries", expanded=False):
1218
+ for idx, entry in enumerate(st.session_state.rag_retrieval_history, 1):
1219
+ st.markdown(f"**Retrieval #{idx}**")
1220
+
1221
+ # Display the actual RAG query
1222
+ rag_query_expansion = entry.get("rag_query_expansion", "No query available")
1223
+ st.code(rag_query_expansion, language="text")
1224
+
1225
+ # Display summary stats
1226
+ st.json({
1227
+ "conversation_length": len(entry.get("conversation_up_to", [])),
1228
+ "documents_retrieved": len(entry.get("docs_retrieved", []))
1229
+ })
1230
+ st.markdown("---")
1231
+
1232
  # Example Questions Section
1233
  st.markdown("---")
1234
+ st.markdown("### πŸ’‘ Example Questions")
1235
+ st.markdown("Click on any question below to use it, or modify the editable examples:")
 
 
 
 
 
 
1236
 
1237
  # Initialize example question state
1238
  if 'custom_question_1' not in st.session_state:
 
1255
 
1256
  st.markdown("---")
1257
 
1258
+ # Questions 2 & 3: Editable examples
1259
+ st.markdown("#### ✏️ Customizable Questions (Edit and use)")
1260
+
1261
+ # Question 2
1262
+ # st.markdown("**Question 2:**")
1263
+ custom_q1 = st.text_area(
1264
+ "Edit question 2:",
1265
+ value=st.session_state.custom_question_1,
1266
+ height=80,
1267
+ key="edit_question_2",
1268
+ help="Modify this question to fit your needs, then click 'Use This Question'"
1269
+ )
1270
+ col1, col2 = st.columns([1, 4])
1271
+ with col1:
1272
+ if st.button("πŸ“‹ Use Question 2", key="use_custom_1", use_container_width=True):
1273
+ if custom_q1.strip():
1274
+ st.session_state.pending_question = custom_q1.strip()
1275
+ st.session_state.custom_question_1 = custom_q1.strip()
1276
+ st.session_state.input_counter = (st.session_state.get('input_counter', 0) + 1) % 1000
1277
+ st.rerun()
1278
+ else:
1279
+ st.warning("Please enter a question first!")
1280
+ with col2:
1281
+ st.caption("πŸ’‘ Tip: Add specific details like dates, names, or amounts to get more precise answers")
1282
+
1283
+ st.info("πŸ’‘ **Filter to apply:** Select District(s) and Year(s) sidebar panel before asking this question.")
1284
+
1285
+ st.markdown("---")
1286
+
1287
+ # Question 3
1288
+ # st.markdown("**Question 3:**")
1289
+ custom_q2 = st.text_area(
1290
+ "Edit question 3:",
1291
+ value=st.session_state.custom_question_2,
1292
+ height=80,
1293
+ key="edit_question_3",
1294
+ help="Modify this question to fit your needs, then click 'Use This Question'"
1295
+ )
1296
+ col1, col2 = st.columns([1, 4])
1297
+ with col1:
1298
+ if st.button("πŸ“‹ Use Question 3", key="use_custom_2", use_container_width=True):
1299
+ if custom_q2.strip():
1300
+ st.session_state.pending_question = custom_q2.strip()
1301
+ st.session_state.custom_question_2 = custom_q2.strip()
1302
+ st.session_state.input_counter = (st.session_state.get('input_counter', 0) + 1) % 1000
1303
+ st.rerun()
1304
+ else:
1305
+ st.warning("Please enter a question first!")
1306
+ with col2:
1307
+ st.caption("πŸ’‘ Tip: Use specific terms from the documents (e.g., 'PDM', 'SACCOs', 'FY 2022/23')")
1308
 
1309
 
1310
  # Store selected question for next render (handled in input section above)
 
1404
  </script>
1405
  """, unsafe_allow_html=True)
1406
 
1407
+
1408
  if __name__ == "__main__":
1409
+ # Check if running in Streamlit context
1410
+ try:
1411
+ from streamlit.runtime.scriptrunner import get_script_run_ctx
1412
+ if get_script_run_ctx() is None:
1413
+ # Not in Streamlit runtime - show helpful message
1414
+ print("=" * 80)
1415
+ print("⚠️ WARNING: This is a Streamlit app!")
1416
+ print("=" * 80)
1417
+ print("\nPlease run this app using:")
1418
+ print(" streamlit run app.py")
1419
+ print("\nNot: python app.py")
1420
+ print("\nThe app will not function correctly when run with 'python app.py'")
1421
+ print("=" * 80)
1422
+ import sys
1423
+ sys.exit(1)
1424
+ except ImportError:
1425
+ # Streamlit not installed or not in Streamlit context
1426
+ print("=" * 80)
1427
+ print("⚠️ WARNING: This is a Streamlit app!")
1428
+ print("=" * 80)
1429
+ print("\nPlease run this app using:")
1430
+ print(" streamlit run app.py")
1431
+ print("\nNot: python app.py")
1432
+ print("=" * 80)
1433
+ import sys
1434
+ sys.exit(1)
1435
  main()
src/reporting/feedback_schema.py CHANGED
@@ -4,10 +4,12 @@ Feedback Schema for RAG Chatbot
4
  This module defines dataclasses for feedback data structures
5
  and provides Snowflake schema generation.
6
  """
7
-
 
8
  from dataclasses import dataclass, asdict, field
9
  from typing import List, Optional, Dict, Any, Union
10
- from datetime import datetime
 
11
 
12
 
13
  @dataclass
@@ -39,34 +41,20 @@ class UserFeedback:
39
  open_ended_feedback: Optional[str]
40
  score: int
41
  is_feedback_about_last_retrieval: bool
42
- retrieved_data: List[RetrievalEntry]
43
  conversation_id: str
44
  timestamp: float
45
  message_count: int
46
  has_retrievals: bool
47
  retrieval_count: int
48
- user_query: Optional[str] = None
49
- bot_response: Optional[str] = None
 
 
50
  created_at: str = field(default_factory=lambda: datetime.now().isoformat())
51
 
52
  def to_dict(self) -> Dict[str, Any]:
53
  """Convert to dictionary with nested data structures"""
54
  result = asdict(self)
55
- # Handle nested objects
56
- if self.retrieved_data:
57
- result['retrieved_data'] = [self._serialize_retrieval_entry(entry) for entry in self.retrieved_data]
58
- return result
59
-
60
- def _serialize_retrieval_entry(self, entry: RetrievalEntry) -> Dict[str, Any]:
61
- """Serialize retrieval entry to dict"""
62
- # If raw data exists, use it (it's already properly formatted)
63
- if hasattr(entry, '_raw_data') and entry._raw_data:
64
- return entry._raw_data
65
-
66
- # Otherwise, serialize the dataclass
67
- result = asdict(entry)
68
- if entry.documents_retrieved:
69
- result['documents_retrieved'] = [asdict(doc) for doc in entry.documents_retrieved]
70
  return result
71
 
72
  def to_snowflake_schema(self) -> Dict[str, Any]:
@@ -81,28 +69,28 @@ class UserFeedback:
81
  "message_count": "INTEGER",
82
  "has_retrievals": "BOOLEAN",
83
  "retrieval_count": "INTEGER",
84
- "user_query": "VARCHAR(16777216)",
85
- "bot_response": "VARCHAR(16777216)",
 
 
86
  "created_at": "TIMESTAMP_NTZ",
87
- "retrieved_data": "VARIANT", # Array of retrieval entries
88
- # retrieved_data structure:
89
- # [
90
  # {
91
- # "rag_query": "...",
92
- # "conversation_length": 5,
93
- # "timestamp": 1234567890,
94
- # "docs_retrieved": [
95
- # {"filename": "...", "page": 14, "score": 0.95, ...},
96
- # ...
97
- # ]
98
  # },
99
  # ...
100
  # ]
 
 
 
 
101
  }
102
  return schema
103
 
104
  @classmethod
105
- def get_snowflake_create_table_sql(cls, table_name: str = "user_feedback") -> str:
106
  """Generate CREATE TABLE SQL for Snowflake"""
107
  schema = cls.to_snowflake_schema(None)
108
 
@@ -117,16 +105,13 @@ class UserFeedback:
117
  sql = f"""CREATE TABLE IF NOT EXISTS {table_name} (
118
  {columns_str},
119
  PRIMARY KEY (feedback_id)
120
- );
121
-
122
- -- Create index on timestamp for querying by time
123
- CREATE INDEX IF NOT EXISTS idx_feedback_timestamp ON {table_name} (timestamp);
124
-
125
- -- Create index on conversation_id for querying by conversation
126
- CREATE INDEX IF NOT EXISTS idx_feedback_conversation ON {table_name} (conversation_id);
127
-
128
- -- Create index on score for feedback analysis
129
- CREATE INDEX IF NOT EXISTS idx_feedback_score ON {table_name} (score);
130
  """
131
  return sql
132
 
@@ -150,47 +135,27 @@ DOCUMENT_SCHEMA = {
150
  }
151
 
152
 
153
- def generate_snowflake_schema_sql() -> str:
154
  """Generate complete Snowflake schema SQL for feedback system"""
155
- return UserFeedback.get_snowflake_create_table_sql("user_feedback")
 
 
156
 
157
 
158
  def create_feedback_from_dict(data: Dict[str, Any]) -> UserFeedback:
159
  """Create UserFeedback instance from dictionary"""
160
- # Parse retrieved_data if present
161
- retrieved_data = []
162
- if "retrieved_data" in data and data["retrieved_data"]:
163
- for entry_dict in data.get("retrieved_data", []):
164
- # Map the actual structure from rag_retrieval_history
165
- # Entry has: conversation_up_to, rag_query_expansion, docs_retrieved
166
- try:
167
- # Try to map to expected structure
168
- entry = RetrievalEntry(
169
- rag_query=entry_dict.get("rag_query_expansion", ""),
170
- documents_retrieved=[], # Empty for now, will store as raw data
171
- conversation_length=len(entry_dict.get("conversation_up_to", [])),
172
- filters_applied=None,
173
- timestamp=entry_dict.get("timestamp", None)
174
- )
175
- # Store raw data in the entry
176
- entry._raw_data = entry_dict # Store original for preservation
177
- retrieved_data.append(entry)
178
- except Exception as e:
179
- # If mapping fails, store as-is without strict typing
180
- pass
181
-
182
  return UserFeedback(
183
  feedback_id=data.get("feedback_id", f"feedback_{data.get('timestamp', 'unknown')}"),
184
  open_ended_feedback=data.get("open_ended_feedback"),
185
  score=data["score"],
186
  is_feedback_about_last_retrieval=data["is_feedback_about_last_retrieval"],
187
- retrieved_data=retrieved_data,
188
  conversation_id=data["conversation_id"],
189
  timestamp=data["timestamp"],
190
  message_count=data["message_count"],
191
  has_retrievals=data["has_retrievals"],
192
  retrieval_count=data["retrieval_count"],
193
- user_query=data.get("user_query"),
194
- bot_response=data.get("bot_response")
 
 
195
  )
196
-
 
4
  This module defines dataclasses for feedback data structures
5
  and provides Snowflake schema generation.
6
  """
7
+ import os
8
+ from datetime import datetime
9
  from dataclasses import dataclass, asdict, field
10
  from typing import List, Optional, Dict, Any, Union
11
+
12
+
13
 
14
 
15
  @dataclass
 
41
  open_ended_feedback: Optional[str]
42
  score: int
43
  is_feedback_about_last_retrieval: bool
 
44
  conversation_id: str
45
  timestamp: float
46
  message_count: int
47
  has_retrievals: bool
48
  retrieval_count: int
49
+ transcript: List[Dict[str, str]] # List of {"role": "user"/"assistant", "content": "..."}
50
+ retrievals: List[Dict[str, Any]] # List of retrieval objects with retrieved_docs and user_message_trigger
51
+ feedback_score_related_retrieval_docs: Optional[Dict[str, Any]] = None # Conversation subset + retrieved docs
52
+ retrieved_data: Optional[List[Dict[str, Any]]] = None # Preserved old column for backward compatibility
53
  created_at: str = field(default_factory=lambda: datetime.now().isoformat())
54
 
55
  def to_dict(self) -> Dict[str, Any]:
56
  """Convert to dictionary with nested data structures"""
57
  result = asdict(self)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  return result
59
 
60
  def to_snowflake_schema(self) -> Dict[str, Any]:
 
69
  "message_count": "INTEGER",
70
  "has_retrievals": "BOOLEAN",
71
  "retrieval_count": "INTEGER",
72
+ "transcript": "VARCHAR(16777216)", # JSON string of ARRAY of {"role": "user"/"assistant", "content": "..."}
73
+ "retrievals": "VARCHAR(16777216)", # JSON string of ARRAY of retrieval objects
74
+ "feedback_score_related_retrieval_docs": "VARCHAR(16777216)", # JSON string of OBJECT with conversation subset + retrieved docs
75
+ "retrieved_data": "VARCHAR(16777216)", # JSON string - preserved old column for backward compatibility
76
  "created_at": "TIMESTAMP_NTZ",
77
+ # transcript structure: [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}, ...]
78
+ # retrievals structure: [
 
79
  # {
80
+ # "retrieved_docs": [{"content": "...", "metadata": {...}, ...}], # content truncated to 100 chars
81
+ # "user_message_trigger": "final user message that triggered this retrieval"
 
 
 
 
 
82
  # },
83
  # ...
84
  # ]
85
+ # feedback_score_related_retrieval_docs structure: {
86
+ # "conversation_up_to_point": [{"role": "user", "content": "..."}, ...], # subset of transcript
87
+ # "retrieved_docs": [{"content": "...", "metadata": {...}, ...}] # full chunks with all info
88
+ # }
89
  }
90
  return schema
91
 
92
  @classmethod
93
+ def get_snowflake_create_table_sql(cls, table_name: str = "USER_FEEDBACK_V3") -> str:
94
  """Generate CREATE TABLE SQL for Snowflake"""
95
  schema = cls.to_snowflake_schema(None)
96
 
 
105
  sql = f"""CREATE TABLE IF NOT EXISTS {table_name} (
106
  {columns_str},
107
  PRIMARY KEY (feedback_id)
108
+ )
109
+ CLUSTER BY (timestamp, conversation_id, score);
110
+ -- Note: Snowflake doesn't support traditional indexes on regular tables.
111
+ -- Instead, we use CLUSTER BY to optimize queries on these columns.
112
+ -- Snowflake automatically maintains clustering for efficient querying.
113
+ -- Note: transcript, retrievals, and feedback_score_related_retrieval_docs are stored as VARCHAR (JSON strings),
114
+ -- same approach as the old retrieved_data column. This allows easy storage and retrieval without VARIANT type complexity.
 
 
 
115
  """
116
  return sql
117
 
 
135
  }
136
 
137
 
138
+ def generate_snowflake_schema_sql(table_name: Optional[str] = None) -> str:
139
  """Generate complete Snowflake schema SQL for feedback system"""
140
+ if table_name is None:
141
+ table_name = os.getenv("SNOWFLAKE_FEEDBACK_TABLE", "USER_FEEDBACK_V3")
142
+ return UserFeedback.get_snowflake_create_table_sql(table_name)
143
 
144
 
145
  def create_feedback_from_dict(data: Dict[str, Any]) -> UserFeedback:
146
  """Create UserFeedback instance from dictionary"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  return UserFeedback(
148
  feedback_id=data.get("feedback_id", f"feedback_{data.get('timestamp', 'unknown')}"),
149
  open_ended_feedback=data.get("open_ended_feedback"),
150
  score=data["score"],
151
  is_feedback_about_last_retrieval=data["is_feedback_about_last_retrieval"],
 
152
  conversation_id=data["conversation_id"],
153
  timestamp=data["timestamp"],
154
  message_count=data["message_count"],
155
  has_retrievals=data["has_retrievals"],
156
  retrieval_count=data["retrieval_count"],
157
+ transcript=data.get("transcript", []),
158
+ retrievals=data.get("retrievals", []),
159
+ feedback_score_related_retrieval_docs=data.get("feedback_score_related_retrieval_docs"),
160
+ retrieved_data=data.get("retrieved_data")
161
  )
 
src/reporting/snowflake_connector.py CHANGED
@@ -8,8 +8,11 @@ import os
8
  import json
9
  import logging
10
  from typing import Dict, Any, Optional
 
 
11
  from src.reporting.feedback_schema import UserFeedback
12
 
 
13
  # Try to import snowflake connector
14
  try:
15
  import snowflake.connector
@@ -79,12 +82,16 @@ class SnowflakeFeedbackConnector:
79
  self._connection.close()
80
  print("βœ… Disconnected from Snowflake")
81
 
82
- def insert_feedback(self, feedback: UserFeedback) -> bool:
83
  """Insert a single feedback record into Snowflake"""
84
  logger.info("=" * 80)
85
  logger.info("πŸ”„ SNOWFLAKE INSERT: Starting feedback insertion process")
86
  logger.info(f"πŸ“ Feedback ID: {feedback.feedback_id}")
87
 
 
 
 
 
88
  if not self._connection:
89
  logger.error("❌ Not connected to Snowflake. Call connect() first.")
90
  raise RuntimeError("Not connected to Snowflake. Call connect() first.")
@@ -131,38 +138,53 @@ class SnowflakeFeedbackConnector:
131
  logger.error(f"❌ Could not set context: {e}")
132
  raise
133
 
134
- # Prepare data
135
- logger.info("πŸ”§ DATA PREPARATION: Preparing retrieved_data...")
136
- retrieved_data_raw = feedback.to_dict()['retrieved_data']
137
 
138
- logger.info(f" - Retrieved data type (raw): {type(retrieved_data_raw).__name__}")
139
- logger.info(f" - Retrieved data: {repr(retrieved_data_raw)[:200]}")
 
 
 
 
 
 
 
140
 
141
- # If retrieved_data is already a string (from UI), parse it
142
- if isinstance(retrieved_data_raw, str):
143
- logger.info(" - Parsing string to Python object")
144
- retrieved_data = json.loads(retrieved_data_raw)
145
- elif retrieved_data_raw is None:
146
- retrieved_data = None
147
  else:
148
- # It's already a Python object (list/dict)
149
- logger.info(" - Data is already a Python object")
150
- retrieved_data = retrieved_data_raw
151
 
152
- logger.info(f" - Retrieved data size: {len(str(retrieved_data)) if retrieved_data else 0} characters")
153
- logger.info(f" - Retrieved data type: {type(retrieved_data).__name__}")
 
 
 
 
 
 
 
154
 
155
- # Convert to JSON string for TEXT column
156
- if retrieved_data:
157
- retrieved_data_for_db = json.dumps(retrieved_data)
158
- logger.info(f" - Converting to JSON string for TEXT column")
159
- logger.info(f" - JSON string length: {len(retrieved_data_for_db)}")
 
160
  else:
161
- logger.info(f" - Retrieved data is None, using NULL")
162
  retrieved_data_for_db = None
 
163
 
164
- # Build SQL with retrieved_data as a TEXT column parameter
165
- sql = f"""INSERT INTO user_feedback (
 
166
  feedback_id,
167
  open_ended_feedback,
168
  score,
@@ -172,23 +194,25 @@ class SnowflakeFeedbackConnector:
172
  message_count,
173
  has_retrievals,
174
  retrieval_count,
175
- user_query,
176
- bot_response,
177
- created_at,
178
- retrieved_data
 
179
  ) VALUES (
180
  %(feedback_id)s, %(open_ended_feedback)s, %(score)s, %(is_feedback_about_last_retrieval)s,
181
  %(conversation_id)s, %(timestamp)s, %(message_count)s, %(has_retrievals)s,
182
- %(retrieval_count)s, %(user_query)s, %(bot_response)s, %(created_at)s,
183
- %(retrieved_data)s
184
  )"""
185
 
186
  logger.info("πŸ“ SQL PREPARATION: Building INSERT statement...")
187
- logger.info(f" - Target table: user_feedback")
188
  logger.info(f" - Database: {self.database}")
189
  logger.info(f" - Schema: {self.schema}")
190
 
191
  # Prepare parameters
 
192
  params = {
193
  'feedback_id': feedback.feedback_id,
194
  'open_ended_feedback': feedback.open_ended_feedback,
@@ -199,10 +223,11 @@ class SnowflakeFeedbackConnector:
199
  'message_count': feedback.message_count,
200
  'has_retrievals': feedback.has_retrievals,
201
  'retrieval_count': feedback.retrieval_count,
202
- 'user_query': feedback.user_query,
203
- 'bot_response': feedback.bot_response,
204
- 'created_at': feedback.created_at,
205
- 'retrieved_data': retrieved_data_for_db
 
206
  }
207
 
208
  # Execute insert
@@ -265,12 +290,16 @@ def get_snowflake_connector_from_env() -> Optional[SnowflakeFeedbackConnector]:
265
  )
266
 
267
 
268
- def save_to_snowflake(feedback: UserFeedback) -> bool:
269
  """Helper function to save feedback to Snowflake"""
270
  logger.info("=" * 80)
271
  logger.info("πŸ”΅ SNOWFLAKE SAVE: Starting save process")
272
  logger.info(f"πŸ“ Feedback ID: {feedback.feedback_id}")
273
 
 
 
 
 
274
  connector = get_snowflake_connector_from_env()
275
 
276
  if not connector:
@@ -285,7 +314,7 @@ def save_to_snowflake(feedback: UserFeedback) -> bool:
285
  logger.info("βœ… SNOWFLAKE SAVE: Connection established")
286
 
287
  logger.info("πŸ“₯ SNOWFLAKE SAVE: Attempting to insert feedback...")
288
- success = connector.insert_feedback(feedback)
289
 
290
  logger.info("πŸ”Œ SNOWFLAKE SAVE: Disconnecting...")
291
  connector.disconnect()
@@ -302,4 +331,3 @@ def save_to_snowflake(feedback: UserFeedback) -> bool:
302
  logger.error(f" - Error: {e}")
303
  logger.info("=" * 80)
304
  return False
305
-
 
8
  import json
9
  import logging
10
  from typing import Dict, Any, Optional
11
+
12
+
13
  from src.reporting.feedback_schema import UserFeedback
14
 
15
+
16
  # Try to import snowflake connector
17
  try:
18
  import snowflake.connector
 
82
  self._connection.close()
83
  print("βœ… Disconnected from Snowflake")
84
 
85
+ def insert_feedback(self, feedback: UserFeedback, table_name: Optional[str] = None) -> bool:
86
  """Insert a single feedback record into Snowflake"""
87
  logger.info("=" * 80)
88
  logger.info("πŸ”„ SNOWFLAKE INSERT: Starting feedback insertion process")
89
  logger.info(f"πŸ“ Feedback ID: {feedback.feedback_id}")
90
 
91
+ # Get table name from parameter, env var, or default
92
+ if table_name is None:
93
+ table_name = os.getenv("SNOWFLAKE_FEEDBACK_TABLE", "USER_FEEDBACK_V3")
94
+
95
  if not self._connection:
96
  logger.error("❌ Not connected to Snowflake. Call connect() first.")
97
  raise RuntimeError("Not connected to Snowflake. Call connect() first.")
 
138
  logger.error(f"❌ Could not set context: {e}")
139
  raise
140
 
141
+ # Prepare data - convert to JSON strings for VARIANT columns (same approach as old retrieved_data)
142
+ logger.info("πŸ”§ DATA PREPARATION: Preparing VARIANT columns...")
143
+ feedback_dict = feedback.to_dict()
144
 
145
+ # Prepare transcript (ARRAY) - convert to JSON string
146
+ transcript_raw = feedback_dict.get('transcript', [])
147
+ if transcript_raw:
148
+ # Convert to JSON string (same approach as old retrieved_data)
149
+ transcript_for_db = json.dumps(transcript_raw)
150
+ logger.info(f" - Transcript: {len(transcript_raw)} messages, JSON length: {len(transcript_for_db)}")
151
+ else:
152
+ transcript_for_db = None
153
+ logger.info(" - Transcript: None")
154
 
155
+ # Prepare retrievals (ARRAY) - convert to JSON string
156
+ retrievals_raw = feedback_dict.get('retrievals', [])
157
+ if retrievals_raw:
158
+ # Convert to JSON string (same approach as old retrieved_data)
159
+ retrievals_for_db = json.dumps(retrievals_raw)
160
+ logger.info(f" - Retrievals: {len(retrievals_raw)} entries, JSON length: {len(retrievals_for_db)}")
161
  else:
162
+ retrievals_for_db = None
163
+ logger.info(" - Retrievals: None")
 
164
 
165
+ # Prepare feedback_score_related_retrieval_docs (OBJECT) - convert to JSON string
166
+ feedback_score_related_raw = feedback_dict.get('feedback_score_related_retrieval_docs')
167
+ if feedback_score_related_raw:
168
+ # Convert to JSON string (same approach as old retrieved_data)
169
+ feedback_score_related_for_db = json.dumps(feedback_score_related_raw)
170
+ logger.info(f" - Feedback score related docs: present, JSON length: {len(feedback_score_related_for_db)}")
171
+ else:
172
+ feedback_score_related_for_db = None
173
+ logger.info(" - Feedback score related docs: None")
174
 
175
+ # Prepare retrieved_data (preserved old column) - convert to JSON string
176
+ retrieved_data_raw = feedback_dict.get('retrieved_data')
177
+ if retrieved_data_raw:
178
+ # Convert to JSON string (same approach as old retrieved_data)
179
+ retrieved_data_for_db = json.dumps(retrieved_data_raw)
180
+ logger.info(f" - Retrieved data (preserved): present, JSON length: {len(retrieved_data_for_db)}")
181
  else:
 
182
  retrieved_data_for_db = None
183
+ logger.info(" - Retrieved data (preserved): None")
184
 
185
+ # Build SQL with new column structure
186
+ # Columns are VARCHAR (storing JSON strings), same approach as old retrieved_data
187
+ sql = f"""INSERT INTO {table_name} (
188
  feedback_id,
189
  open_ended_feedback,
190
  score,
 
194
  message_count,
195
  has_retrievals,
196
  retrieval_count,
197
+ transcript,
198
+ retrievals,
199
+ feedback_score_related_retrieval_docs,
200
+ retrieved_data,
201
+ created_at
202
  ) VALUES (
203
  %(feedback_id)s, %(open_ended_feedback)s, %(score)s, %(is_feedback_about_last_retrieval)s,
204
  %(conversation_id)s, %(timestamp)s, %(message_count)s, %(has_retrievals)s,
205
+ %(retrieval_count)s, %(transcript)s, %(retrievals)s, %(feedback_score_related_retrieval_docs)s,
206
+ %(retrieved_data)s, %(created_at)s
207
  )"""
208
 
209
  logger.info("πŸ“ SQL PREPARATION: Building INSERT statement...")
210
+ logger.info(f" - Target table: {table_name}")
211
  logger.info(f" - Database: {self.database}")
212
  logger.info(f" - Schema: {self.schema}")
213
 
214
  # Prepare parameters
215
+ # Pass JSON strings for VARIANT columns (same approach as old retrieved_data)
216
  params = {
217
  'feedback_id': feedback.feedback_id,
218
  'open_ended_feedback': feedback.open_ended_feedback,
 
223
  'message_count': feedback.message_count,
224
  'has_retrievals': feedback.has_retrievals,
225
  'retrieval_count': feedback.retrieval_count,
226
+ 'transcript': transcript_for_db, # JSON string
227
+ 'retrievals': retrievals_for_db, # JSON string
228
+ 'feedback_score_related_retrieval_docs': feedback_score_related_for_db, # JSON string
229
+ 'retrieved_data': retrieved_data_for_db, # JSON string - preserved old column
230
+ 'created_at': feedback.created_at
231
  }
232
 
233
  # Execute insert
 
290
  )
291
 
292
 
293
+ def save_to_snowflake(feedback: UserFeedback, table_name: Optional[str] = None) -> bool:
294
  """Helper function to save feedback to Snowflake"""
295
  logger.info("=" * 80)
296
  logger.info("πŸ”΅ SNOWFLAKE SAVE: Starting save process")
297
  logger.info(f"πŸ“ Feedback ID: {feedback.feedback_id}")
298
 
299
+ # Get table name from parameter or env var
300
+ if table_name is None:
301
+ table_name = os.getenv("SNOWFLAKE_FEEDBACK_TABLE", "USER_FEEDBACK_V3")
302
+
303
  connector = get_snowflake_connector_from_env()
304
 
305
  if not connector:
 
314
  logger.info("βœ… SNOWFLAKE SAVE: Connection established")
315
 
316
  logger.info("πŸ“₯ SNOWFLAKE SAVE: Attempting to insert feedback...")
317
+ success = connector.insert_feedback(feedback, table_name=table_name)
318
 
319
  logger.info("πŸ”Œ SNOWFLAKE SAVE: Disconnecting...")
320
  connector.disconnect()
 
331
  logger.error(f" - Error: {e}")
332
  logger.info("=" * 80)
333
  return False