Dua Rajper committed on
Commit
0edb972
·
verified ·
1 Parent(s): 80cdccf

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +238 -0
app.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import google.generativeai as genai
4
+ from dotenv import load_dotenv
5
+ import json
6
+ import textwrap
7
+ import time
8
+ from typing import Any, List
9
+ import numpy as np
10
+ from sklearn.metrics.pairwise import cosine_similarity
11
+ import tensorflow as tf
12
+ from tensorflow.keras.models import Sequential
13
+ from tensorflow.keras.layers import Dense, Input
14
+ from tensorflow.keras.utils import to_categorical
15
+
16
# Pull configuration from a local .env file so the API key is never
# hard-coded in the source tree.
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Wire up the Generative AI client once at startup. Without a key the
# rest of the app cannot function, so stop rendering with a clear hint.
if GOOGLE_API_KEY:
    genai.configure(api_key=GOOGLE_API_KEY)
    # Default text-generation model; swap for another supported model if desired.
    model = genai.GenerativeModel('gemini-pro')
else:
    st.error(
        "Google AI Studio API key not found. Please add it to your .env file. "
        "You can obtain an API key from https://makersuite.google.com/."
    )
    st.stop()
30
+
31
st.title("Embeddings and Vector Search Demo")
st.subheader("Explore Embeddings and Vector Databases")

# Static sidebar: explains the concepts the demos below exercise.
with st.sidebar:
    st.header("Embeddings and Vector Search")
    st.markdown(
        """
This app demonstrates how embeddings and vector databases can be used for various tasks.
"""
    )
    st.subheader("Key Concepts:")
    st.markdown(
        """
- **Embeddings**: Numerical representations of text, capturing semantic meaning.
- **Vector Databases**: Databases optimized for storing and querying vectors.
- **Retrieval Augmented Generation (RAG)**: Combining retrieval with LLM generation.
- **Cosine Similarity**: A measure of similarity between two vectors.
"""
    )
    st.subheader("Whitepaper Insights")
    st.markdown(
        """
- Efficient similarity search using vector indexes (e.g., ANN).
- Handling large datasets and scalability.
- Applications of embeddings: search, recommendation, classification.
"""
    )
59
+
60
# --- Helper Functions ---
def code_block(text: str, language: str = "text") -> None:
    """Display *text* as a syntax-highlighted code block in Streamlit.

    Args:
        text: The raw text to render verbatim.
        language: Syntax-highlighting hint (e.g. "python", "text").

    Fix: the previous implementation built a markdown fenced block by
    hand with ``unsafe_allow_html=True`` — it broke whenever *text*
    contained a ``` fence and needlessly allowed raw-HTML injection.
    ``st.code`` escapes content safely and highlights natively.
    """
    st.code(text, language=language)
64
+
65
+
66
def display_response(response: Any) -> None:
    """Render the model's reply, or an error banner when it is missing.

    Args:
        response: A generation result expected to expose a ``.text`` attribute.
    """
    # Guard clause: a falsy response or one without .text means generation failed.
    if not (response and hasattr(response, "text")):
        st.error("Failed to generate a response.")
        return
    st.subheader("Generated Response:")
    st.markdown(response.text)
73
+
74
+
75
def generate_embeddings(texts: List[str], model_name: str = 'models/embedding-001') -> List[List[float]]:
    """Generate embedding vectors for a list of texts via the Gemini API.

    Args:
        texts: The strings to embed.
        model_name: An embedding-capable model identifier.

    Returns:
        One embedding vector per input text, or ``[]`` on failure
        (an error banner is shown instead of raising).

    Fix: ``google.generativeai`` has no ``EmbeddingModel`` class and
    ``embed_content`` takes no ``texts=`` keyword — the original raised
    ``AttributeError`` on every call. The public entry point is
    ``genai.embed_content(model=..., content=...)``, whose result dict
    holds the vectors under the ``'embedding'`` key (a list of vectors
    when ``content`` is a list).
    """
    try:
        result = genai.embed_content(model=model_name, content=texts)
        embeddings = result["embedding"]
        # A single-string input yields one flat vector; normalize so
        # callers always receive a list of vectors.
        if embeddings and isinstance(embeddings[0], (int, float)):
            embeddings = [embeddings]
        return embeddings
    except Exception as e:
        st.error(f"Error generating embeddings: {e}")
        return []
84
+
85
+
86
+
87
def generate_with_retry(prompt: str, model_name: str, generation_config: genai.types.GenerationConfig, max_retries: int = 3, delay: int = 5) -> Any:
    """Generate content from *model_name*, retrying transient failures.

    Args:
        prompt: The text prompt to send.
        model_name: Identifier of the generative model to use.
        generation_config: Generation parameters passed through unchanged.
        max_retries: Total attempts before giving up.
        delay: Seconds to sleep between attempts.

    Returns:
        The model response, or ``None`` when the model does not exist
        (a 404 is not retryable, so an error banner is shown instead).

    Raises:
        Exception: re-raises the last API error once retries are exhausted.
    """
    # The model handle is invariant across attempts — build it once
    # instead of re-constructing it inside the retry loop.
    model = genai.GenerativeModel(model_name)
    for attempt in range(max_retries):
        try:
            return model.generate_content(prompt, generation_config=generation_config)
        except Exception as e:
            error_message = str(e)
            st.warning(f"Error during generation (attempt {attempt + 1}/{max_retries}): {error_message}")
            if "404" in error_message and "not found" in error_message:
                # An unknown model will never succeed on retry.
                st.error(
                    f"Model '{model_name}' is not available or not supported. Please select a different model."
                )
                return None
            if attempt < max_retries - 1:
                st.info(f"Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                # Last attempt: surface the original exception to the caller.
                raise
    # Only reachable when max_retries < 1; kept as an explicit guard so
    # the function never silently returns None for that misuse.
    raise Exception("Failed to generate content after maximum retries")
108
+
109
+
110
+
111
# --- RAG Question Answering ---
st.header("RAG Question Answering")
rag_model_name = st.selectbox("Select model for RAG:", ["gemini-pro"], index=0)
rag_context = st.text_area(
    "Enter your context documents:",
    "Relevant information to answer the question. Separate documents with newlines.",
    height=150,
)
rag_question = st.text_area("Ask a question about the context:", "What is the main topic?", height=50)

if st.button("Answer with RAG"):
    if not rag_context or not rag_question:
        st.warning("Please provide both context and a question.")
    else:
        with st.spinner("Generating answer..."):
            try:
                # Split once and drop blank lines: the original split the
                # context twice and embedded empty strings, wasting API
                # calls and skewing the argmax below.
                documents = [doc for doc in rag_context.split('\n') if doc.strip()]

                # 1. Generate embeddings for the context documents.
                context_embeddings = generate_embeddings(documents)
                if not context_embeddings:
                    st.stop()

                # 2. Embed the question; guard before indexing — the
                # original raised IndexError when the embed call failed
                # and returned an empty list.
                question_embeddings = generate_embeddings([rag_question])
                if not question_embeddings:
                    st.stop()
                question_embedding = question_embeddings[0]

                # 3. Cosine similarity of the question against every document.
                similarities = cosine_similarity(
                    np.array(question_embedding).reshape(1, -1),
                    np.array(context_embeddings),
                )[0]

                # 4. Pick the single most relevant document.
                most_relevant_index = int(np.argmax(similarities))
                relevant_context = documents[most_relevant_index]

                # 5. Ground the prompt in the retrieved document.
                rag_prompt = f"Use the following context to answer the question: '{rag_question}'.\nContext: {relevant_context}"

                # 6. Generate and display the answer.
                response = generate_with_retry(rag_prompt, rag_model_name, generation_config=genai.types.GenerationConfig())
                display_response(response)
            except Exception as e:
                st.error(f"An error occurred: {e}")
150
+
151
+
152
+
153
# --- Text Similarity ---
st.header("Text Similarity")
# Must be an embedding-capable model, not a chat model.
similarity_model_name = st.selectbox("Select model for similarity:", ["models/embedding-001"], index=0)
text1 = st.text_area("Enter text 1:", "This is the first sentence.", height=50)
text2 = st.text_area("Enter text 2:", "This is a similar sentence.", height=50)

if st.button("Calculate Similarity"):
    if not (text1 and text2):
        st.warning("Please provide both texts.")
    else:
        with st.spinner("Calculating similarity..."):
            try:
                # Embed both texts in a single batch call.
                vectors = generate_embeddings([text1, text2], similarity_model_name)
                if not vectors:
                    st.stop()
                # Cosine similarity between the two embedding vectors.
                score = cosine_similarity([vectors[0]], [vectors[1]])[0][0]
                st.subheader("Cosine Similarity:")
                st.write(score)
            except Exception as e:
                st.error(f"An error occurred: {e}")
175
+
176
+
177
+
178
# --- Neural Classification ---
st.header("Neural Classification with Embeddings")
# An embedding model is required here — the classifier trains on vectors.
classification_model_name = st.selectbox("Select model for classification:", ["models/embedding-001"], index=0)
classification_data = st.text_area(
    "Enter your training data (text, label pairs), separated by newlines. Example: text1,0\\ntext2,1",
    "text1,0\ntext2,1\ntext3,0\ntext4,1",
    height=150,
)
classification_prompt = st.text_area("Enter text to classify:", "This is a test text.", height=50)
num_epochs = st.number_input("Number of Epochs", min_value=1, max_value=100, value=10, step=1)
188
+
189
+
190
def create_and_train_model(embeddings: List[List[float]], labels: List[int], num_classes: int, epochs: int):
    """Build and fit a small dense classifier on embedding vectors.

    Args:
        embeddings: One embedding vector per training example.
        labels: Integer class label per example.
        num_classes: Width of the softmax output layer.
        epochs: Number of training epochs.

    Returns:
        The trained Keras model.
    """
    embedding_dim = len(embeddings[0])
    classifier = Sequential([
        Input(shape=(embedding_dim,)),              # one input unit per embedding dimension
        Dense(16, activation='relu'),               # small hidden layer
        Dense(num_classes, activation='softmax'),   # class-probability output
    ])
    classifier.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    # One-hot encode integer labels for categorical cross-entropy.
    one_hot_labels = to_categorical(labels, num_classes=num_classes)
    # verbose=0 keeps the per-epoch training log out of the Streamlit page.
    classifier.fit(np.array(embeddings), one_hot_labels, epochs=epochs, verbose=0)
    return classifier
202
+
203
+
204
+
205
if st.button("Classify"):
    if not classification_data or not classification_prompt:
        st.warning("Please provide training data and text to classify.")
    else:
        with st.spinner("Classifying..."):
            try:
                # 1. Parse "text,label" lines; lines without a comma are skipped.
                data_pairs = [line.split(',') for line in classification_data.split('\n') if ',' in line]
                train_texts = [pair[0].strip() for pair in data_pairs]
                train_labels = [int(pair[1].strip()) for pair in data_pairs]
                if not train_texts:
                    st.warning("Please provide training data and text to classify.")
                    st.stop()
                # to_categorical indexes by label value, so the output width
                # must cover the largest label. The original used
                # len(set(labels)), which crashes on non-contiguous label
                # sets such as {0, 2}.
                num_classes = max(train_labels) + 1

                # 2. Generate embeddings for the training texts.
                train_embeddings = generate_embeddings(train_texts, classification_model_name)
                if not train_embeddings:
                    st.stop()

                # 3. Create and train the classifier on the embeddings.
                model = create_and_train_model(train_embeddings, train_labels, num_classes, num_epochs)

                # 4. Embed the text to classify; guard before indexing —
                # the original raised IndexError when the embed call
                # failed and returned an empty list.
                predict_embeddings = generate_embeddings([classification_prompt], classification_model_name)
                if not predict_embeddings:
                    st.stop()
                predict_embedding = predict_embeddings[0]

                # 5. Predict and report the most likely class.
                prediction = model.predict(np.array([predict_embedding]), verbose=0)
                predicted_class = int(np.argmax(prediction[0]))
                st.subheader("Predicted Class:")
                st.write(predicted_class)
                st.subheader("Prediction Probabilities:")
                st.write(prediction)

            except Exception as e:
                st.error(f"An error occurred: {e}")
238
+