Swaroop Ingavale committed on
Commit
02ce8d2
·
1 Parent(s): e2b56c1
Files changed (1) hide show
  1. app.py +226 -262
app.py CHANGED
@@ -1,272 +1,236 @@
1
- import gradio as gr
2
- from sentence_transformers import SentenceTransformer
3
  from sklearn.metrics.pairwise import cosine_similarity
4
- import numpy as np
5
  from groq import Groq
6
- import os
7
- import datetime
8
-
9
- client = Groq(
10
- api_key=os.environ.get("GROQ_API_KEY"),
11
- )
12
-
13
- # Initialize sentence transformer model
14
- embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
15
-
16
- # Global memory buffer with embeddings
17
- memory = []
18
-
19
- def add_to_memory(role, content):
20
- """
21
- Add a message to memory along with its embedding.
22
- """
23
- embedding = embedding_model.encode(content, convert_to_numpy=True)
24
- memory.append({"role": role, "content": content, "embedding": embedding})
25
-
26
- def retrieve_relevant_memory(user_input, top_k=5):
27
- """
28
- Retrieve the top-k most relevant messages from memory based on cosine similarity.
29
- """
30
- if not memory:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  return []
32
 
33
- # Compute the embedding of the user input
34
- user_embedding = embedding_model.encode(user_input, convert_to_numpy=True)
35
-
36
- # Calculate similarities
37
- similarities = [cosine_similarity([user_embedding], [m["embedding"]])[0][0] for m in memory]
38
-
39
- # Sort memory by similarity and return the top-k messages
40
- relevant_messages = sorted(zip(similarities, memory), key=lambda x: x[0], reverse=True)
41
- return [m[1] for m in relevant_messages[:top_k]]
42
-
43
- def construct_prompt(memory, user_input, max_tokens=500):
44
- """
45
- Construct the prompt by combining relevant memory and the current user input.
46
- """
47
- relevant_memory = retrieve_relevant_memory(user_input)
48
-
49
- # Combine relevant memory into the prompt
50
- prompt = ""
51
- token_count = 0
52
- for message in relevant_memory:
53
- message_text = f'{message["role"]}: {message["content"]}\n'
54
- token_count += len(message_text.split())
55
- if token_count > max_tokens:
56
- break
57
- prompt += message_text
58
-
59
- # Add the user input at the end
60
- prompt += f'user: {user_input}\n'
61
- return prompt
62
-
63
- def trim_memory(max_size=50):
64
- """
65
- Trim the memory to keep it within the specified max size.
66
- """
67
- if len(memory) > max_size:
68
- memory.pop(0) # Remove the oldest entry
69
-
70
- def summarize_memory():
71
- """
72
- Summarize the memory buffer to free up space.
73
- """
74
- if not memory:
75
- return
76
-
77
- long_term_memory = " ".join([m["content"] for m in memory])
78
- summary = client.chat.completions.create(
79
- messages=[
80
- {"role": "system", "content": "Summarize the following text for key points."},
81
- {"role": "user", "content": long_term_memory},
82
- ],
83
- model="meta-llama/llama-4-scout-17b-16e-instruct",
84
- max_tokens=4096,
85
- )
86
- memory.clear()
87
- # Match the access pattern from main.py if needed
 
 
 
 
88
  try:
89
- # Try the format in app.py first
90
- summary_content = summary.choices[0].message.content
91
- except AttributeError:
92
- # Fall back to the format in main.py
93
- summary_content = summary.choices[0].text
94
-
95
- memory.append({"role": "system", "content": summary_content})
96
-
97
- def get_chatbot_response(
98
- message,
99
- history,
100
- system_message,
101
- max_tokens,
102
- temperature,
103
- top_p,
104
- use_memory=True,
105
- memory_size=50,
106
- ):
107
- """
108
- Generate a response using the chatbot with memory capabilities.
109
- """
110
- if use_memory:
111
- # Process history to maintain memory
112
- for i, (user_msg, bot_msg) in enumerate(history):
113
- if i < len(history) - 1: # Skip the current message which is already in the history
114
- add_to_memory("user", user_msg)
115
- if bot_msg: # Check if bot message exists (might be None for the most recent one)
116
- add_to_memory("assistant", bot_msg)
117
-
118
- # Construct prompt with relevant memory
119
- prompt = construct_prompt(memory, message)
120
-
121
- # Use the prompt with groq client
122
- completion = client.chat.completions.create(
123
  messages=[
124
- {"role": "system", "content": system_message},
125
- {"role": "user", "content": prompt}
126
  ],
127
- model="deepseek-r1-distill-llama-70b",
128
- temperature=temperature,
129
- max_tokens=max_tokens,
130
- top_p=top_p,
131
- stream=True,
132
  )
133
-
134
- # Stream the response
135
- response = ""
136
- for chunk in completion:
137
- response_part = chunk.choices[0].delta.content or ""
138
- response += response_part
139
- yield response
140
-
141
- # Update memory with the current message and response
142
- add_to_memory("user", message)
143
- add_to_memory("assistant", response)
144
-
145
- # Trim memory if needed
146
- trim_memory(max_size=memory_size)
147
-
148
- else:
149
- # If not using memory, just use regular chat completion
150
- messages = [{"role": "system", "content": system_message}]
151
-
152
- for val in history:
153
- if val[0]:
154
- messages.append({"role": "user", "content": val[0]})
155
- if val[1]:
156
- messages.append({"role": "assistant", "content": val[1]})
157
-
158
- messages.append({"role": "user", "content": message})
159
-
 
 
 
 
 
 
 
160
  completion = client.chat.completions.create(
161
- messages=messages,
162
- model="deepseek-r1-distill-llama-70b",
163
- temperature=temperature,
164
- max_tokens=max_tokens,
165
- top_p=top_p,
166
- stream=True,
 
167
  )
168
-
169
- response = ""
170
- for chunk in completion:
171
- response_part = chunk.choices[0].delta.content or ""
172
- response += response_part
173
- yield response
174
-
175
- def view_memory():
176
- """
177
- Create a formatted string showing the current memory contents.
178
- """
179
- if not memory:
180
- return "Memory is empty."
181
-
182
- memory_view = "Current Memory Contents:\n\n"
183
- for i, m in enumerate(memory):
184
- memory_view += f"Memory {i+1}: {m['role']}: {m['content']}\n\n"
185
-
186
- return memory_view
187
-
188
- def clear_memory_action():
189
- """
190
- Clear the memory buffer.
191
- """
192
- memory.clear()
193
- return "Memory has been cleared."
194
-
195
- # Custom CSS for the chat interface - apply using elem_classes
196
- custom_css = """
197
- .user-message {
198
- background-color: #e3f2fd !important;
199
- border-radius: 15px !important;
200
- padding: 10px 15px !important;
201
- }
202
-
203
- .bot-message {
204
- background-color: #f1f8e9 !important;
205
- border-radius: 15px !important;
206
- padding: 10px 15px !important;
207
- }
208
- """
209
-
210
- # Create the Gradio interface
211
- with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
212
- # Header
213
- with gr.Row(elem_classes="header-row"):
214
- gr.Markdown("""
215
- <div style="text-align: center; margin-bottom: 10px; padding: 10px; background-color: #f0f4f8; border-radius: 8px;">
216
- <h1 style="margin: 0; color: #2c3e50;">AI Chatbot With Memory</h1>
217
- <h3 style="margin: 5px 0 0 0; color: #34495e;">Developed by Dhiraj and Swaroop</h3>
218
- </div>
219
- """)
220
-
221
- with gr.Row():
222
- with gr.Column(scale=3):
223
- # Create ChatInterface without css_classes parameter
224
- chatbot = gr.ChatInterface(
225
- get_chatbot_response,
226
- additional_inputs=[
227
- gr.Textbox(value="You are a helpful assistant with memory capabilities.", label="System message"),
228
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
229
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
230
- gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
231
- gr.Checkbox(value=True, label="Use Memory", info="Enable or disable memory capabilities"),
232
- gr.Slider(minimum=10, maximum=200, value=50, step=10, label="Memory Size", info="Maximum number of entries in memory"),
233
- ],
234
- examples=[
235
- ["Tell me about machine learning"],
236
- ["What are the best practices for data preprocessing?"],
237
- ["Can you explain neural networks?"],
238
- ],
239
- title="Chat with AI Assistant",
240
- # Removed css_classes parameter
241
- )
242
-
243
- with gr.Column(scale=1):
244
- with gr.Group():
245
- gr.Markdown("## Memory Management")
246
- memory_display = gr.Textbox(label="Memory Contents", lines=20, max_lines=30, interactive=False)
247
- view_memory_btn = gr.Button("View Memory Contents")
248
- clear_memory_btn = gr.Button("Clear Memory")
249
- summarize_memory_btn = gr.Button("Summarize Memory")
250
- memory_status = gr.Textbox(label="Memory Status", lines=2, interactive=False)
251
-
252
- # Set up button actions
253
- view_memory_btn.click(view_memory, inputs=[], outputs=[memory_display])
254
- clear_memory_btn.click(clear_memory_action, inputs=[], outputs=[memory_status])
255
- summarize_memory_btn.click(
256
- lambda: (summarize_memory(), "Memory summarized successfully."),
257
- inputs=[],
258
- outputs=[memory_status]
259
- )
260
-
261
- # Footer
262
- with gr.Row(elem_classes="footer-row"):
263
- gr.Markdown(f"""
264
- <div style="text-align: center; margin-top: 20px; padding: 10px; background-color: #f0f4f8; border-radius: 8px;">
265
- <p style="margin: 0; color: #2c3e50;">
266
- Developed by Dhiraj and Swaroop | © {datetime.datetime.now().year} | Version 1.0
267
- </p>
268
- </div>
269
- """)
270
-
271
- if __name__ == "__main__":
272
- demo.launch()
 
1
+ import os
2
+ from flask import Flask, render_template, request, jsonify, session
3
  from sklearn.metrics.pairwise import cosine_similarity
 
4
  from groq import Groq
5
+ import numpy as np
6
+ import logging
7
+ from transformers import AutoTokenizer, AutoModel # Keep these
8
+ import torch
9
+ import torch.nn.functional as F
10
+
11
+ # Configure logging
12
+ logging.basicConfig(level=logging.INFO)
13
+
14
+ # --- Flask App Setup --- (MUST come before routes or app-dependent code) ---
15
+ app = Flask(__name__)
16
+ app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', 'a_default_secret_key_please_change')
17
+
18
+ # --- Initialize Models ---
19
+ device = torch.device("cpu") # Force CPU for free tier
20
+ if torch.cuda.is_available():
21
+ device = torch.device("cuda") # Should not happen on free tier
22
+ logging.info(f"Using device: {device}")
23
+
24
+ tokenizer = None
25
+ model = None
26
+ client = None
27
+
28
+ try:
29
+ # Load tokenizer and model from HuggingFace Hub using transformers
30
+ tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
31
+ # Re-add from_tf=True here for AutoModel.from_pretrained
32
+ model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2', from_tf=True).to(device)
33
+ logging.info("Tokenizer and AutoModel loaded successfully with from_tf=True.")
34
+ except Exception as e:
35
+ logging.error(f"Error loading Transformer models: {e}")
36
+ tokenizer = None
37
+ model = None
38
+
39
+ # Initialize the Groq client
40
+ groq_api_key = os.environ.get("GROQ_API_KEY")
41
+ if not groq_api_key:
42
+ logging.error("GROQ_API_KEY environment variable not set.")
43
+ client = None
44
+ else:
45
+ client = Groq(api_key=groq_api_key)
46
+ logging.info("Groq client initialized.")
47
+
48
+
49
+ # --- Helper function for Mean Pooling ---
50
+ def mean_pooling(model_output, attention_mask):
51
+ token_embeddings = model_output[0]
52
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float().to(token_embeddings.device)
53
+ return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
54
+
55
+ # --- Function to get embedding ---
56
+ def get_embedding(text):
57
+ if tokenizer is None or model is None:
58
+ logging.error("Embedding models not loaded. Cannot generate embedding.")
59
+ return None
60
+ try:
61
+ encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(device)
62
+ with torch.no_grad():
63
+ model_output = model(**encoded_input)
64
+ sentence_embedding = mean_pooling(model_output, encoded_input['attention_mask'])
65
+ sentence_embedding = F.normalize(sentence_embedding, p=2, dim=1)
66
+ return sentence_embedding.cpu().numpy()[0]
67
+ except Exception as e:
68
+ logging.error(f"Error generating embedding: {e}")
69
+ return None
70
+
71
+ # --- Memory Management Functions (rely on get_embedding) ---
72
+ # ... (add_to_memory, retrieve_relevant_memory, construct_prompt, trim_memory, summarize_memory - these remain the same, calling get_embedding) ...
73
+
74
+ def add_to_memory(mem_list, role, content):
75
+ if not content or not content.strip():
76
+ logging.warning(f"Attempted to add empty content to memory for role: {role}")
77
+ return mem_list
78
+ embedding = get_embedding(content)
79
+ if embedding is not None:
80
+ mem_list.append({"role": role, "content": content, "embedding": embedding.tolist()})
81
+ else:
82
+ logging.warning(f"Failed to get embedding for message: {content[:50]}...")
83
+ mem_list.append({"role": role, "content": content, "embedding": None})
84
+ return mem_list
85
+
86
+ def retrieve_relevant_memory(mem_list, user_input, top_k=5):
87
+ if not mem_list or tokenizer is None or model is None:
88
+ return []
89
+ user_embedding = get_embedding(user_input)
90
+ if user_embedding is None:
91
+ logging.error("Failed to get user input embedding for retrieval.")
92
  return []
93
 
94
+ valid_memory_items = []
95
+ memory_embeddings_np = []
96
+ for m in mem_list:
97
+ if m.get("embedding") is not None and isinstance(m["embedding"], list):
98
+ try:
99
+ np_embedding = np.array(m["embedding"])
100
+ if np_embedding.shape == (model.config.hidden_size,): # Use model config for dimension
101
+ valid_memory_items.append(m)
102
+ memory_embeddings_np.append(np_embedding)
103
+ else:
104
+ logging.warning(f"Embedding dimension mismatch for memory entry: {m['content'][:50]}...")
105
+ except Exception as conv_e:
106
+ logging.warning(f"Could not convert embedding for memory entry: {m['content'][:50]}... Error: {conv_e}")
107
+ pass
108
+
109
+ if not valid_memory_items:
110
+ return []
111
+ similarities = cosine_similarity([user_embedding], np.array(memory_embeddings_np))[0]
112
+ relevant_messages_sorted = sorted(zip(similarities, valid_memory_items), key=lambda x: x[0], reverse=True)
113
+ return [m[1] for m in relevant_messages_sorted[:top_k]]
114
+
115
+ def construct_prompt(mem_list, user_input, max_tokens_in_prompt=1000):
116
+ relevant_memory_items = retrieve_relevant_memory(mem_list, user_input)
117
+ relevant_content_set = {m["content"] for m in relevant_memory_items if "content" in m}
118
+
119
+ messages_for_api = []
120
+ messages_for_api.append({"role": "system", "content": "You are a helpful and friendly AI assistant."})
121
+ current_prompt_tokens = len(messages_for_api[0]["content"].split())
122
+
123
+ context_messages = []
124
+ for msg in mem_list:
125
+ if "content" in msg and msg["content"] in relevant_content_set and msg["role"] in ["user", "assistant", "system"]:
126
+ msg_text = f'{msg["role"]}: {msg["content"]}\n'
127
+ msg_tokens = len(msg_text.split())
128
+ if current_prompt_tokens + msg_tokens > max_tokens_in_prompt:
129
+ break
130
+ context_messages.append({"role": msg["role"], "content": msg["content"]})
131
+ current_prompt_tokens += msg_tokens
132
+
133
+ messages_for_api.extend(context_messages)
134
+ user_input_tokens = len(user_input.split())
135
+ if current_prompt_tokens + user_input_tokens > max_tokens_in_prompt and len(messages_for_api) > 1:
136
+ logging.warning(f"User input exceeds max_tokens_in_prompt with existing context. Context may be truncated.")
137
+ messages_for_api.append({"role": "user", "content": user_input})
138
+ return messages_for_api
139
+
140
+ def trim_memory(mem_list, max_size=50):
141
+ while len(mem_list) > max_size:
142
+ mem_list.pop(0)
143
+ return mem_list
144
+
145
+ def summarize_memory(mem_list):
146
+ if not mem_list or client is None:
147
+ logging.warning("Memory is empty or Groq client not initialized. Cannot summarize.")
148
+ return []
149
+ long_term_memory = " ".join([m["content"] for m in mem_list if "content" in m])
150
+ if not long_term_memory.strip():
151
+ logging.warning("Memory content is empty. Cannot summarize.")
152
+ return []
153
  try:
154
+ summary_completion = client.chat.completions.create(
155
+             model="llama-3.1-8b-instant",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  messages=[
157
+ {"role": "system", "content": "Summarize the following conversation for key points. Keep it concise."},
158
+ {"role": "user", "content": long_term_memory},
159
  ],
160
+             max_tokens=500,
 
 
 
 
161
  )
162
+ summary_text = summary_completion.choices[0].message.content
163
+ logging.info("Memory summarized.")
164
+ return [{"role": "system", "content": f"Previous conversation summary: {summary_text}"}]
165
+ except Exception as e:
166
+ logging.error(f"Error summarizing memory: {e}")
167
+ return mem_list
168
+
169
+
170
+ # --- Flask Routes --- (MUST come AFTER app is defined) ---
171
+
172
+ @app.route('/')
173
+ def index():
174
+ if 'chat_memory' not in session:
175
+ session['chat_memory'] = []
176
+ return render_template('index.html')
177
+
178
+ @app.route('/chat', methods=['POST'])
179
+ def chat():
180
+ # Check if Groq client AND embedding models are initialized
181
+ if client is None or tokenizer is None or model is None:
182
+ status_code = 500
183
+ error_message = "Chatbot backend is not fully initialized (API key or embedding models missing)."
184
+ logging.error(error_message)
185
+ return jsonify({"response": error_message}), status_code
186
+
187
+ user_input = request.json.get('message')
188
+ if not user_input or not user_input.strip():
189
+ return jsonify({"response": "Please enter a message."}), 400
190
+
191
+ current_memory_serializable = session.get('chat_memory', [])
192
+
193
+ messages_for_api = construct_prompt(current_memory_serializable, user_input)
194
+
195
+ try:
196
  completion = client.chat.completions.create(
197
+             model="llama-3.1-8b-instant",
198
+ messages=messages_for_api,
199
+ temperature=0.6,
200
+ max_tokens=1024,
201
+ top_p=0.95,
202
+ stream=False,
203
+ stop=None,
204
  )
205
+ ai_response_content = completion.choices[0].message.content
206
+
207
+ except Exception as e:
208
+ logging.error(f"Error calling Groq API: {e}")
209
+ ai_response_content = "Sorry, I encountered an error when trying to respond. Please try again later."
210
+
211
+ current_memory_serializable = add_to_memory(current_memory_serializable, "user", user_input)
212
+     current_memory_serializable = add_to_memory(current_memory_serializable, "assistant", ai_response_content)
213
+
214
+ current_memory_serializable = trim_memory(current_memory_serializable, max_size=20)
215
+
216
+ session['chat_memory'] = current_memory_serializable
217
+
218
+ return jsonify({"response": ai_response_content})
219
+
220
+
221
+ @app.route('/clear_memory', methods=['POST'])
222
+ def clear_memory():
223
+ session['chat_memory'] = []
224
+ logging.info("Chat memory cleared.")
225
+ return jsonify({"status": "Memory cleared."})
226
+
227
+
228
+ # --- Running the App ---
229
+ if __name__ == '__main__':
230
+     # Serve with Flask's built-in WSGI server instead of Uvicorn:
231
+     # uvicorn is an ASGI server and does NOT auto-detect WSGI apps, so
232
+     # passing the Flask app to uvicorn.run directly fails at request time.
233
+     logging.info("Starting Flask server...")
234
+     port = int(os.environ.get('PORT', 7860))
235
+     app.run(host="0.0.0.0", port=port)