Dua Rajper committed
Commit 5144ee6 · verified · 1 Parent(s): 238fc8b

Update app.py

Files changed (1)
  1. app.py +271 -87
app.py CHANGED
@@ -2,16 +2,15 @@ import streamlit as st
  import os
  import google.generativeai as genai
  from dotenv import load_dotenv
- import numpy as np
  import time
  from typing import Any, List, Optional
  from sklearn.metrics.pairwise import cosine_similarity
- from sentence_transformers import SentenceTransformer
  import tensorflow as tf
  from tensorflow.keras.models import Sequential
  from tensorflow.keras.layers import Dense, Input
  from tensorflow.keras.utils import to_categorical
- from tensorflow.keras.optimizers import Adam

  # Load environment variables
  load_dotenv()
@@ -21,140 +20,325 @@ GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
  if GOOGLE_API_KEY:
      genai.configure(api_key=GOOGLE_API_KEY)
  else:
-     st.error("Google AI Studio API key not found. Please add it to your .env file.")
      st.stop()

- # Load embedding model (local)
- embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

- st.title("AI App: Embeddings, RAG, Similarity & Classification")

  # --- Helper Functions ---
- def generate_embeddings(texts: List[str], model_name: str = "") -> List[List[float]]:
      try:
-         return embedding_model.encode(texts).tolist()
      except Exception as e:
-         st.error(f"Error generating embeddings: {e}")
-         return []

  def generate_with_retry(prompt: str, model_name: str, generation_config: genai.types.GenerationConfig, max_retries: int = 3, delay: int = 5) -> Any:
      for i in range(max_retries):
          try:
              model = genai.GenerativeModel(model_name)
-             return model.generate_content(prompt, generation_config=generation_config)
          except Exception as e:
-             if i < max_retries - 1:
-                 st.warning(f"Error: {e}. Retrying in {delay} seconds...")
                  time.sleep(delay)
              else:
-                 st.error(f"Failed after {max_retries} attempts: {e}")
-                 return None

  def calculate_similarity(embedding1: List[float], embedding2: List[float]) -> float:
      return cosine_similarity(np.array(embedding1).reshape(1, -1), np.array(embedding2).reshape(1, -1))[0][0]

- def create_and_train_model(embeddings, labels, num_classes, epochs, batch_size, learning_rate, optimizer_str):
      model = Sequential([
          Input(shape=(len(embeddings[0]),)),
-         Dense(64, activation='relu'),
          Dense(32, activation='relu'),
          Dense(num_classes, activation='softmax')
      ])

-     if optimizer_str == 'adam':
          optimizer = Adam(learning_rate=learning_rate)
-     elif optimizer_str == 'sgd':
          optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
-     else:
          optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)

      model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
      encoded_labels = to_categorical(labels, num_classes=num_classes)
      model.fit(np.array(embeddings), encoded_labels, epochs=epochs, batch_size=batch_size, verbose=0)
      return model

- def process_classification_data(data: str) -> Optional[tuple[List[str], List[int]]]:
-     data_pairs = [line.split(',') for line in data.split('\n') if ',' in line]
-     texts, labels = [], []
-     for i, pair in enumerate(data_pairs):
-         if len(pair) != 2:
-             st.error(f"Invalid line {i+1}: {pair}")
-             return None
-         try:
-             texts.append(pair[0].strip())
-             labels.append(int(pair[1].strip()))
-         except ValueError:
-             st.error(f"Invalid label in line {i+1}")
-             return None
-     return texts, labels

- # --- RAG ---
- st.header("🔎 RAG: Retrieval-Augmented Generation")
- rag_model_name = st.selectbox("Text generation model", ["gemini-pro"])
- rag_context = st.text_area("Context documents (separated by new lines)", height=150)
- rag_question = st.text_area("Your question", height=70)
- rag_max_context_length = st.slider("Max context length", 100, 2000, 500)

  if st.button("Answer with RAG"):
      if not rag_context or not rag_question:
          st.warning("Please provide both context and a question.")
      else:
-         with st.spinner("Generating..."):
              try:
-                 context_lines = rag_context.split('\n')
-                 context_embeddings = generate_embeddings(context_lines)
-                 question_embedding = generate_embeddings([rag_question])[0]
-                 similarities = cosine_similarity([question_embedding], context_embeddings)[0]
-                 best_match_index = np.argmax(similarities)
-                 selected_context = context_lines[best_match_index][:rag_max_context_length]
-
-                 prompt = f"Use the context below to answer the question.\n\nContext:\n{selected_context}\n\nQuestion: {rag_question}"
-                 response = generate_with_retry(prompt, rag_model_name, genai.types.GenerationConfig())
                  if response:
-                     st.subheader("Answer:")
-                     st.markdown(response.text)
              except Exception as e:
-                 st.error(f"Error: {e}")

  # --- Text Similarity ---
- st.header("🧠 Text Similarity")
- text1 = st.text_area("Text 1", height=70)
- text2 = st.text_area("Text 2", height=70)

  if st.button("Calculate Similarity"):
      if not text1 or not text2:
-         st.warning("Please enter both texts.")
      else:
-         try:
-             embeddings = generate_embeddings([text1, text2])
-             similarity = calculate_similarity(embeddings[0], embeddings[1])
-             st.write(f"Cosine Similarity: **{similarity:.4f}**")
-         except Exception as e:
-             st.error(f"Error: {e}")

  # --- Neural Classification ---
- st.header("🧪 Neural Classification")
- classification_data = st.text_area("Training data (text,label)", "text1,0\ntext2,1", height=100)
- classification_prompt = st.text_area("Text to classify", "This is a sample input.", height=70)
- num_epochs = st.number_input("Epochs", 1, 100, 10)
- batch_size = st.number_input("Batch Size", 1, 128, 32)
- learning_rate = st.number_input("Learning Rate", 0.0001, 0.1, 0.0001, format="%.4f")
- optimizer_str = st.selectbox("Optimizer", ["adam", "sgd", "rmsprop"])

  if st.button("Classify"):
-     try:
-         result = process_classification_data(classification_data)
-         if not result:
-             st.stop()
-         train_texts, train_labels = result
-         train_embeddings = generate_embeddings(train_texts)
-         model = create_and_train_model(train_embeddings, train_labels, len(set(train_labels)),
-                                        num_epochs, batch_size, learning_rate, optimizer_str)
-
-         predict_embedding = generate_embeddings([classification_prompt])[0]
-         prediction = model.predict(np.array([predict_embedding]), verbose=0)
-         predicted_class = int(np.argmax(prediction[0]))
-         st.success(f"Predicted Class: **{predicted_class}**")
-         st.write("Prediction Probabilities:", prediction)
-     except Exception as e:
-         st.error(f"Classification Error: {e}")

@@ -2,16 +2,15 @@ import streamlit as st
  import os
  import google.generativeai as genai
  from dotenv import load_dotenv
  import time
  from typing import Any, List, Optional
+ import numpy as np
  from sklearn.metrics.pairwise import cosine_similarity
  import tensorflow as tf
  from tensorflow.keras.models import Sequential
  from tensorflow.keras.layers import Dense, Input
  from tensorflow.keras.utils import to_categorical
+ from tensorflow.keras.optimizers import Adam  # Import Adam optimizer

  # Load environment variables
  load_dotenv()

@@ -21,140 +20,325 @@ GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
  if GOOGLE_API_KEY:
      genai.configure(api_key=GOOGLE_API_KEY)
  else:
+     st.error(
+         "Google AI Studio API key not found. Please add it to your .env file. "
+         "You can obtain an API key from https://makersuite.google.com/."
+     )
      st.stop()

+ st.title("Embeddings and Vector Search Demo")
+ st.subheader("Explore Embeddings and Vector Databases")

+ # Sidebar for explanations
+ with st.sidebar:
+     st.header("Embeddings and Vector Search")
+     st.markdown(
+         """
+         This app demonstrates how embeddings and vector databases can be used for various tasks.
+         """
+     )
+     st.subheader("Key Concepts:")
+     st.markdown(
+         """
+         - **Embeddings**: Numerical representations of text, capturing semantic meaning.
+         - **Vector Databases**: Databases optimized for storing and querying vectors (simulated here).
+         - **Retrieval Augmented Generation (RAG)**: Combining retrieval with LLM generation.
+         - **Cosine Similarity**: A measure of similarity between two vectors.
+         - **Neural Networks**: Using embeddings as input for classification.
+         """
+     )
+     st.subheader("Whitepaper Insights")
+     st.markdown(
+         """
+         - Efficient similarity search using vector indexes (e.g., ANN).
+         - Handling large datasets and scalability considerations.
+         - Applications of embeddings: search, recommendation, classification, etc.
+         """
+     )

  # --- Helper Functions ---
+ def code_block(text: str, language: str = "text") -> None:
+     """Displays text as a formatted code block in Streamlit."""
+     st.markdown(f"```{language}\n{text}\n```", unsafe_allow_html=True)
+
+
+ def display_response(response: Any) -> None:
+     """Displays the model's response."""
+     if response and hasattr(response, "text"):
+         st.subheader("Generated Response:")
+         st.markdown(response.text)
+     else:
+         st.error("Failed to generate a response.")
+
+
+ def generate_embeddings(texts: List[str], model_name: str) -> Optional[List[List[float]]]:
+     """Generates embeddings for a list of texts using a specified model.
+
+     Args:
+         texts: List of text strings.
+         model_name: Name of the embedding model.
+
+     Returns:
+         List of embeddings (list of floats) or None on error.
+     """
      try:
+         model = genai.GenerativeModel(model_name)
+         response = model.generate_embeddings(texts=texts)
+         return [embedding.values for embedding in response.embeddings]
      except Exception as e:
+         st.error(f"Error generating embeddings with model '{model_name}': {e}")
+         return None
+
+
  def generate_with_retry(prompt: str, model_name: str, generation_config: genai.types.GenerationConfig, max_retries: int = 3, delay: int = 5) -> Any:
+     """Generates content with retry logic and error handling.
+
+     Args:
+         prompt: The prompt string.
+         model_name: The name of the language model.
+         generation_config: The generation configuration.
+         max_retries: Maximum number of retries.
+         delay: Delay in seconds between retries.
+
+     Returns:
+         The generated response or None on error.
+     """
      for i in range(max_retries):
          try:
              model = genai.GenerativeModel(model_name)
+             response = model.generate_content(prompt, generation_config=generation_config)
+             return response
          except Exception as e:
+             error_message = str(e)
+             st.warning(f"Error during generation (attempt {i + 1}/{max_retries}): {error_message}")
+             if "404" in error_message and "not found" in error_message:
+                 st.error(
+                     f"Model '{model_name}' is not available or not supported. Please select a different model."
+                 )
+                 return None  # Return None to signal a non-retryable error
+             elif i < max_retries - 1:
+                 st.info(f"Retrying in {delay} seconds...")
                  time.sleep(delay)
              else:
+                 st.error(f"Failed to generate content after {max_retries} attempts. Please check your prompt and model.")
+                 return None  # Return None after max retries
+     return None  # Should never reach here
+
+
  def calculate_similarity(embedding1: List[float], embedding2: List[float]) -> float:
+     """Calculates the cosine similarity between two embeddings."""
      return cosine_similarity(np.array(embedding1).reshape(1, -1), np.array(embedding2).reshape(1, -1))[0][0]

+
+ def create_and_train_model(
+     embeddings: List[List[float]],
+     labels: List[int],
+     num_classes: int,
+     epochs: int,
+     batch_size: int,
+     learning_rate: float,
+     optimizer_str: str
+ ) -> tf.keras.Model:
+     """Creates and trains a neural network for classification.
+
+     Args:
+         embeddings: List of input embeddings.
+         labels: List of integer labels.
+         num_classes: Number of classes.
+         epochs: Number of training epochs.
+         batch_size: Batch size for training.
+         learning_rate: Learning rate for the optimizer.
+         optimizer_str: Name of the optimizer ('adam', 'sgd', 'rmsprop')
+
+     Returns:
+         Trained Keras model.
+     """
      model = Sequential([
          Input(shape=(len(embeddings[0]),)),
+         Dense(64, activation='relu'),  # Increased hidden layer size
          Dense(32, activation='relu'),
          Dense(num_classes, activation='softmax')
      ])

+     if optimizer_str.lower() == 'adam':
          optimizer = Adam(learning_rate=learning_rate)
+     elif optimizer_str.lower() == 'sgd':
          optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
+     elif optimizer_str.lower() == 'rmsprop':
          optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)
+     else:
+         optimizer = Adam(learning_rate=learning_rate)  # default

      model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
      encoded_labels = to_categorical(labels, num_classes=num_classes)
      model.fit(np.array(embeddings), encoded_labels, epochs=epochs, batch_size=batch_size, verbose=0)
      return model

+
+ # --- RAG Question Answering ---
+ st.header("RAG Question Answering")
+ rag_model_name = st.selectbox("Select model for RAG:", ["gemini-pro"], index=0)
+ rag_embedding_model = st.selectbox("Select embedding model for RAG:", ["gemini-pro"], index=0)
+ rag_context = st.text_area(
+     "Enter your context documents:",
+     "Relevant information to answer the question. Separate documents with newlines.",
+     height=150,
+ )
+ rag_question = st.text_area("Ask a question about the context:", "What is the main topic?", height=70)
+ rag_max_context_length = st.number_input("Maximum Context Length", min_value=100, max_value=2000, value=500, step=100)
+

  if st.button("Answer with RAG"):
      if not rag_context or not rag_question:
          st.warning("Please provide both context and a question.")
      else:
+         with st.spinner("Generating answer..."):
              try:
+                 # 1. Generate embeddings for the context
+                 context_embeddings = generate_embeddings(rag_context.split('\n'), rag_embedding_model)
+                 if not context_embeddings:
+                     st.stop()
+
+                 # 2. Generate embedding for the question
+                 question_embedding = generate_embeddings([rag_question], rag_embedding_model)
+                 if not question_embedding:
+                     st.stop()
+
+                 # 3. Calculate similarity scores
+                 similarities = cosine_similarity(np.array(question_embedding).reshape(1, -1), np.array(context_embeddings))[0]
+
+                 # 4. Find the most relevant document(s)
+                 most_relevant_index = np.argmax(similarities)
+                 relevant_context = rag_context.split('\n')[most_relevant_index]
+                 # truncate context
+                 if len(relevant_context) > rag_max_context_length:
+                     relevant_context = relevant_context[:rag_max_context_length]
+
+                 # 5. Construct the prompt
+                 rag_prompt = f"Use the following context to answer the question: '{rag_question}'.\nContext: {relevant_context}"
+
+                 # 6. Generate the answer
+                 response = generate_with_retry(rag_prompt, rag_model_name, generation_config=genai.types.GenerationConfig())
                  if response:
+                     display_response(response)
              except Exception as e:
+                 st.error(f"An error occurred: {e}")
+
+
  # --- Text Similarity ---
+ st.header("Text Similarity")
+ similarity_embedding_model = st.selectbox("Select embedding model for similarity:", ["gemini-pro"], index=0)
+ text1 = st.text_area("Enter text 1:", "This is the first sentence.", height=70)
+ text2 = st.text_area("Enter text 2:", "This is a similar sentence.", height=70)

  if st.button("Calculate Similarity"):
      if not text1 or not text2:
+         st.warning("Please provide both texts.")
      else:
+         with st.spinner("Calculating similarity..."):
+             try:
+                 # 1. Generate embeddings
+                 embeddings = generate_embeddings([text1, text2], similarity_embedding_model)
+                 if not embeddings:
+                     st.stop()
+
+                 # 2. Calculate cosine similarity
+                 similarity = calculate_similarity(embeddings[0], embeddings[1])
+                 st.subheader("Cosine Similarity:")
+                 st.write(similarity)
+             except Exception as e:
+                 st.error(f"An error occurred: {e}")
+
+
  # --- Neural Classification ---
+ st.header("Neural Classification with Embeddings")
+ classification_embedding_model = st.selectbox("Select embedding model for classification:", ["gemini-pro"], index=0)
+ classification_data = st.text_area(
+     "Enter your training data (text, label pairs), separated by newlines. Example: text1,0\\ntext2,1",
+     "text1,0\ntext2,1\ntext3,0\ntext4,1",
+     height=150,
+ )
+ classification_prompt = st.text_area("Enter text to classify:", "This is a test text.", height=70)
+ num_epochs = st.number_input("Number of Epochs", min_value=1, max_value=200, value=10, step=1)
+ batch_size = st.number_input("Batch Size", min_value=1, max_value=128, value=32, step=1)
+ learning_rate = st.number_input("Learning Rate", min_value=0.0001, max_value=0.1, value=0.0001, step=0.0001, format="%.4f")
+ optimizer_str = st.selectbox("Optimizer", ['adam', 'sgd', 'rmsprop'], index=0)
+
+
+ def process_classification_data(data: str) -> Optional[tuple[List[str], List[int]]]:
+     """Processes the classification data string into lists of texts and labels.
+
+     Args:
+         data: The input string.
+
+     Returns:
+         A tuple of (texts, labels) or None on error
+     """
+     data_pairs = [line.split(',') for line in data.split('\n') if ',' in line]
+     if not data_pairs:
+         st.error("No valid data pairs found. Please ensure each line contains 'text,label'.")
+         return None
+     texts = []
+     labels = []
+     for i, pair in enumerate(data_pairs):
+         if len(pair) != 2:
+             st.error(f"Invalid data format in line {i + 1}: '{','.join(pair)}'. Expected 'text,label'.")
+             return None
+         text = pair[0].strip()
+         label_str = pair[1].strip()
+         try:
+             label = int(label_str)
+             texts.append(text)
+             labels.append(label)
+         except ValueError:
+             st.error(f"Invalid label value in line {i + 1}: '{label_str}'. Label must be an integer.")
+             return None
+     return texts, labels
+
+

  if st.button("Classify"):
+     if not classification_data or not classification_prompt:
+         st.warning("Please provide training data and text to classify.")
+     else:
+         with st.spinner("Classifying..."):
+             try:
+                 # 1. Process the training data
+                 processed_data = process_classification_data(classification_data)
+                 if not processed_data:
+                     st.stop()
+                 train_texts, train_labels = processed_data
+                 num_classes = len(set(train_labels))
+
+                 # 2. Generate embeddings for training data
+                 train_embeddings = generate_embeddings(train_texts, classification_embedding_model)
+                 if not train_embeddings:
+                     st.stop()
+
+                 # 3. Create and train the model
+                 model = create_and_train_model(
+                     train_embeddings, train_labels, num_classes, num_epochs, batch_size, learning_rate, optimizer_str
+                 )
+
+                 # 4. Generate embedding for the text to classify
+                 predict_embedding = generate_embeddings([classification_prompt], classification_embedding_model)
+                 if not predict_embedding:
+                     st.stop()
+
+                 # 5. Make the prediction
+                 prediction = model.predict(np.array([predict_embedding]), verbose=0)
+                 predicted_class = np.argmax(prediction[0])
+                 st.subheader("Predicted Class:")
+                 st.write(predicted_class)
+                 st.subheader("Prediction Probabilities:")
+                 st.write(prediction)
+
+             except Exception as e:
+                 st.error(f"An error occurred: {e}")