File size: 12,050 Bytes
fa0d641
 
 
 
 
 
 
 
 
 
 
 
 
 
51f8f29
fa0d641
 
 
 
51f8f29
 
fa0d641
 
 
 
51f8f29
 
 
 
 
 
fa0d641
 
51f8f29
 
 
 
 
 
 
 
 
 
 
 
fa0d641
 
 
 
 
 
51f8f29
 
 
 
 
 
 
 
 
 
 
 
 
fa0d641
 
 
 
 
 
 
 
 
 
51f8f29
 
 
 
 
 
fa0d641
 
 
 
 
 
 
 
 
51f8f29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa0d641
 
 
 
 
 
 
 
 
51f8f29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa0d641
 
 
 
 
 
 
 
 
 
51f8f29
 
 
 
 
 
 
 
 
 
 
fa0d641
6ae7580
51f8f29
 
 
 
 
 
fa0d641
 
 
 
 
51f8f29
 
 
 
 
 
 
 
fa0d641
 
 
 
 
 
 
 
 
 
51f8f29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa0d641
51f8f29
 
 
fa0d641
 
 
 
 
 
 
 
 
51f8f29
 
 
 
 
 
 
 
 
 
 
 
fa0d641
 
 
 
 
 
 
 
 
9459b34
51f8f29
 
 
 
 
fa0d641
 
 
 
 
 
 
 
 
51f8f29
 
 
 
 
9459b34
51f8f29
 
 
fa0d641
51f8f29
 
 
fa0d641
 
 
 
 
 
 
 
 
 
51f8f29
 
 
7013e30
51f8f29
 
 
fa0d641
 
51f8f29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa0d641
51f8f29
 
fa0d641
 
22c9599
7013e30
fa0d641
9459b34
51f8f29
7013e30
 
 
51f8f29
a3066cf
 
7013e30
 
 
 
 
 
51f8f29
a3066cf
7013e30
51f8f29
a3066cf
 
 
 
 
 
 
 
 
 
 
 
 
51f8f29
 
fa0d641
 
 
51f8f29
 
 
 
fa0d641
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
"""
RevAI Chatbot Application

This Streamlit application provides a chat interface for users to interact with their documents 
using AI-powered semantic search and natural language processing. The application uses 
OpenAI's embeddings and chat models to provide relevant responses based on document content.

Features:
- Document-based Q&A using semantic search
- User-specific document repositories
- Interactive chat interface
"""

# Standard library imports
import json
import logging
import os
import re
import time
import uuid
from datetime import datetime

# Third-party imports
import streamlit as st
from dotenv import load_dotenv
from openai import OpenAI
from supabase import create_client
# Configure logging once for the whole module; the format and level set here
# are shared by every logger obtained below.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Load variables from a local .env file into os.environ (no-op if absent)
load_dotenv()

# OpenAI client placeholder; populated later by init_openai_client()
client = None

# Initialize the Supabase client — fail fast at import time when credentials
# are missing, since every document lookup depends on this connection.
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")
if not SUPABASE_URL or not SUPABASE_KEY:
    raise ValueError("SUPABASE_URL and SUPABASE_KEY environment variables must be set")
supabase = create_client(SUPABASE_URL, SUPABASE_KEY)

def init_openai_client():
    """
    Create the module-level OpenAI client from the OPENAI_API_KEY env var.

    Shows a Streamlit error banner (and leaves the client untouched) when
    the key is not set.

    Returns:
        OpenAI or None: The initialized client, or None if no key was found.
    """
    global client
    key = os.environ.get("OPENAI_API_KEY")
    if not key:
        # Surface the misconfiguration in the UI rather than raising.
        st.error("OpenAI API key not found in environment variables")
        return client
    client = OpenAI(api_key=key)
    return client

# Initialize session state variables
if 'messages' not in st.session_state:
    st.session_state.messages = []

def sanitize_customer_id(customer_id):
    """
    Validate a customer ID so it is safe to use in database lookups.

    Only non-empty strings made of ASCII letters, digits, underscores, and
    hyphens are accepted; everything else (including None) is rejected.

    Args:
        customer_id (str): The raw customer ID input

    Returns:
        str or None: The customer ID unchanged if valid, None otherwise
    """
    # Guard first: re.fullmatch would raise TypeError on None / non-string
    # input instead of rejecting it.
    if not isinstance(customer_id, str):
        logger.warning(f"Invalid customer ID: {customer_id}")
        return None
    # fullmatch anchors both ends of the pattern, so no unsafe prefix or
    # suffix can slip past the character whitelist.
    if re.fullmatch(r"[a-zA-Z0-9_-]+", customer_id):
        return customer_id
    logger.warning(f"Invalid customer ID: {customer_id}")
    return None

def get_customer_table(customer_id):
    """
    Resolve the documents table registered for a given customer.

    Args:
        customer_id (str): The customer ID to look up

    Returns:
        str or None: The mapped table name, or None when the ID fails
        validation, no mapping row exists, or the lookup raises.
    """
    safe_id = sanitize_customer_id(customer_id)
    if not safe_id:
        logger.error("Customer ID is invalid or potentially malicious")
        return None

    try:
        logger.info(f"Fetching table for customer ID: {safe_id}")
        query = supabase.table("customer_mappings").select("table_name")
        response = query.eq("customer_id", safe_id).execute()
        rows = response.data
        if not rows:
            logger.warning(f"No table found for customer ID: {customer_id}")
            return None
        table_name = rows[0]["table_name"]
        logger.info(f"Found table '{table_name}' for customer ID: {customer_id}")
        return table_name
    except Exception as e:
        logger.error(f"Error finding customer table: {str(e)}")
        return None

def get_embeddings(text):
    """
    Embed the given text with OpenAI's text-embedding-3-large model.

    Args:
        text (str): The text to generate embeddings for

    Returns:
        list: The embedding vector on success, an empty list on blank input
        or on any API failure.
    """
    try:
        logger.info("Generating embeddings for text")
        # Reject None / empty / whitespace-only input before spending an
        # API call on it.
        if not (text and text.strip()):
            logger.warning("Empty or invalid text provided for embeddings")
            return []

        api_response = client.embeddings.create(
            model="text-embedding-3-large", input=text
        )
        vector = api_response.data[0].embedding
        logger.info("Embeddings generated successfully")
        return vector
    except Exception as exc:
        logger.error(f"Error generating embeddings: {str(exc)}")
        return []

def perform_similarity_search(customer_id, query_embedding):
    """
    Run a vector similarity search over the customer's document table.

    Args:
        customer_id (str): The customer ID whose documents are searched
        query_embedding (list): The embedding vector for the query

    Returns:
        dict or None: {"matches": [...], "count": int} on success (count may
        be 0), or None when the customer has no table or the RPC fails.
    """
    try:
        logger.info(f"Performing similarity search for customer ID: {customer_id}")
        table_name = get_customer_table(customer_id)
        if not table_name:
            logger.warning(f"No table found for customer ID: {customer_id}")
            return None

        # match_documents is a server-side RPC; threshold and count are
        # retrieval tuning knobs.
        result = supabase.rpc(
            "match_documents",
            {
                "query_embedding": query_embedding,
                "match_threshold": 0.3,
                "match_count": 3,
                "table_name": table_name,
            },
        ).execute()

        matches = result.data or []
        if matches:
            logger.info(f"Similarity search returned {len(matches)} results")
            # Dump raw results only when DEBUG logging is on: writing customer
            # document content to disk on every query is a data-exposure risk
            # and was clearly a debugging leftover.
            if logger.isEnabledFor(logging.DEBUG):
                output_file = "search_results.json"
                with open(output_file, "w") as f:
                    json.dump(matches, f, indent=2)
                logger.debug(f"Search results saved to {output_file}")
        else:
            logger.info("Similarity search returned no results")
        return {"matches": matches, "count": len(matches)}
    except Exception as e:
        logger.error(f"Error in similarity search: {str(e)}")
        return None

def call_inference_api(prompt, user_id):
    """
    Embed the user's query and retrieve matching document chunks.

    Args:
        prompt (str): The user's query
        user_id (str): The user's ID for document access

    Returns:
        dict: {"results": [...]} on success, or {"error": "..."} describing
        what failed.
    """
    try:
        embedding = get_embeddings(prompt)
        if not embedding:
            return {"error": "Failed to generate embeddings for query"}

        search_results = perform_similarity_search(user_id, embedding)
        if search_results is None:
            return {"error": f"No data found for customer ID: {user_id}"}

        # Flatten each match into the shape the formatting step expects.
        results = [
            {
                "chunk_content": match.get("content", ""),
                "extra_key_data": match.get("metadata", {}).get("extra_key_data", {}),
                "document_name": match.get("metadata", {}).get("document_name", "Unknown document"),
            }
            for match in search_results.get("matches", [])
        ]
        return {"results": results}
    except Exception as e:
        logger.error(f"Error during inference: {str(e)}")
        return {"error": f"Error during inference: {str(e)}"}

def format_chunks(chunks):
    """
    Render retrieved document chunks as numbered plain-text sources.

    Args:
        chunks (list): List of chunk dicts from the similarity search

    Returns:
        str: One "Source N (doc):" section per chunk, each ending in a
        blank line; empty string for an empty list.
    """
    sections = []
    for number, chunk in enumerate(chunks, start=1):
        lines = [
            f"Source {number} ({chunk.get('document_name', 'Unknown document')}):",
            f"Content: {chunk.get('chunk_content', 'No content')}",
        ]
        summary = chunk.get("extra_key_data", "")
        if summary:
            lines.append(f"Summary: {summary}")
        # Trailing blank line separates consecutive sources.
        sections.append("\n".join(lines) + "\n\n")
    return "".join(sections)

def prepare_messages(chunks_text):
    """
    Build the OpenAI chat payload: system context plus recent history.

    Args:
        chunks_text (str): Formatted document chunks to provide as context

    Returns:
        list: Chat-completion messages — one system message followed by up
        to the last 10 entries of the session's chat history.
    """
    system_prompt = f"You are a helpful assistant. Use the following information to answer the user's question:\n\n{chunks_text} keep your answer clear and complete and fast "
    # Cap history at 10 messages to bound prompt size.
    history = [
        {"role": entry["role"], "content": entry["content"]}
        for entry in st.session_state.messages[-10:]
    ]
    return [{"role": "system", "content": system_prompt}] + history

def generate_ai_response(messages):
    """
    Ask gpt-4o-mini to complete the prepared chat messages.

    Args:
        messages (list): Chat-completion messages (system + history)

    Returns:
        str: The model's reply, or an error string if the API call fails.
    """
    request = {
        "model": "gpt-4o-mini",
        "messages": messages,
        "temperature": 0.7,
        "max_tokens": 400,
    }
    try:
        completion = client.chat.completions.create(**request)
        return completion.choices[0].message.content
    except Exception as exc:
        logger.error(f"Error generating AI response: {str(exc)}")
        return f"Error generating response: {str(exc)}"

def get_response(prompt, user_id):
    """
    Run the full retrieve-then-generate pipeline for one user query.

    Args:
        prompt (str): The user's query
        user_id (str): The user's ID for document access

    Returns:
        tuple: (response text or error message, elapsed seconds)
    """
    started = time.time()

    # Retrieval: embed the query and fetch matching chunks.
    with st.spinner("Gathering insights..."):
        data = call_inference_api(prompt, user_id)

    if "error" in data:
        logger.error(f"Error in inference API: {data['error']}")
        response = data["error"]
    elif "results" in data:
        # Generation: format the context, build the prompt, call the model.
        with st.spinner("Formatting document chunks..."):
            chunks_text = format_chunks(data["results"])
        with st.spinner("Preparing messages for OpenAI..."):
            messages = prepare_messages(chunks_text)
        with st.spinner("Generating AI response..."):
            response = generate_ai_response(messages)
    else:
        # Unexpected payload shape — fall back to a server-style message.
        response = data.get("response", "No response received from server")

    return response, time.time() - started

def chat_page():
    """
    Render the chat interface page in the Streamlit app.

    NOTE(review): uses a hardcoded user_id for document access — presumably
    a placeholder until real authentication exists.
    """
    st.title("StartSmart AI")

    user_id = "670e5a5c-149f-4090-bae6-4c69e4cda540"  # hardcoded user
    logger.info(f"Using hardcoded User ID: {user_id}")

    # Container declared first so messages render above the input box.
    chat_container = st.container()

    # chat_input submits on Enter and returns None otherwise.
    user_query = st.chat_input("Type your message here...")
    if user_query:
        # Record the user turn before generating, so prepare_messages sees it.
        st.session_state.messages.append({"role": "user", "content": user_query})

        response, turnaround_time = get_response(user_query, user_id)

        st.session_state.messages.append(
            {"role": "assistant", "content": response, "turnaround_time": turnaround_time}
        )

        # Rerun so the new turn is drawn by the history loop below.
        st.rerun()

    # Replay the full conversation from session state.
    with chat_container:
        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                st.write(message["content"])
                if message["role"] == "assistant" and "turnaround_time" in message:
                    st.caption(f"Response time: {message['turnaround_time']:.2f} seconds")

def main():
    """Entry point: set up the OpenAI client, then render the chat page."""
    init_openai_client()
    chat_page()

# Run the app only when executed directly (not when imported as a module)
if __name__ == "__main__":
    main()