Spaces:

louiecerv
/

VQA_unsupervised_machine_learning

Build error

App Files Files Community

louiecerv committed on Feb 16, 2025

Commit

815ea8d

1 Parent(s): b13d8d1

Sync with remote

Browse files

Files changed (2) hide show

app.py +248 -0
requirements.txt +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,248 @@

+import streamlit as st
+import os
+import google.generativeai as genai
+from huggingface_hub import hf_hub_download
+import base64
+MODEL_ID = "gemini-2.0-flash-exp"  # Keep the model ID as is
+try:
+    api_key = os.getenv("GEMINI_API_KEY")
+    model_id = MODEL_ID
+    genai.configure(api_key=api_key)
+except Exception as e:
+    st.error(f"Error: {e}")
+    st.stop
+model = genai.GenerativeModel(MODEL_ID)
+chat = model.start_chat()
+def download_pdf():
+    """
+    Downloads the PDF file from the Hugging Face Hub using the correct repo path and filename.
+    """
+    try:
+        hf_token = os.getenv("HF_TOKEN")
+        repo_id = "louiecerv/vqa_machine_learning_dataset"  # Corrected dataset repo path
+        filename = "Unsupervised_Learning_Algorithms.pdf"
+        filepath = hf_hub_download(repo_id=repo_id, filename=filename, token=hf_token, repo_type="dataset")
+        return filepath
+    except Exception as e:
+        st.error(f"Failed to download PDF from Hugging Face Hub: {e}")
+        st.stop()  # Stop if the download fails
+# Initialize conversation history in Streamlit session state
+if "conversation_history" not in st.session_state:
+    st.session_state.conversation_history = []
+if "uploaded_file_part" not in st.session_state:  # Store the file *part*
+    st.session_state.uploaded_file_part = None
+if "uploaded_pdf_path" not in st.session_state:
+    st.session_state.uploaded_pdf_path = download_pdf()
+def multimodal_prompt(pdf_path, text_prompt):
+    """
+    Sends a multimodal prompt to Gemini, handling file uploads efficiently.
+    Args:
+        pdf_path: The path to the PDF file.
+        text_prompt: The text prompt for the model.
+    Returns:
+        The model's response as a string, or an error message.
+    """
+    try:
+        if st.session_state.uploaded_file_part is None:  # First time, upload
+            pdf_part = genai.upload_file(pdf_path, mime_type="application/pdf")
+            st.session_state.uploaded_file_part = pdf_part
+            prompt = [text_prompt, pdf_part] # First turn includes the actual file
+        else: # Subsequent turns, reference the file
+            prompt = [text_prompt, st.session_state.uploaded_file_part] # Subsequent turns include the file reference
+        response = chat.send_message(prompt)
+        # Update conversation history
+        st.session_state.conversation_history.append({"role": "user", "content": text_prompt, "has_pdf": True})
+        st.session_state.conversation_history.append({"role": "assistant", "content": response.text})
+        return response.text
+    except Exception as e:
+        return f"An error occurred: {e}"
+def display_download_button(file_path, file_name):
+    try:
+        with open(file_path, "rb") as f:
+            file_bytes = f.read()
+        b64 = base64.b64encode(file_bytes).decode()
+        href = f'<a href="data:application/pdf;base64,{b64}" download="{file_name}">Download the source document (PDF)</a>'
+        st.markdown(href, unsafe_allow_html=True)
+    except FileNotFoundError:
+        st.error("File not found for download.")
+    except Exception as e:
+        st.error(f"Error during download: {e}")
+# Define the ML Models
+models = ["K-Means Clustering", "Hierarchical Clustering",
+         "DBSCAN", "Gaussian Mixture Models", "Principal Component Analysis (PCA)",
+         "t-Distributed Stochastic Neighbor Embedding", "Autoencoders", "Self-Organizing Maps (SOM)", "Association Rule Learning"]
+# --- Sidebar ---
+st.sidebar.title("🤖 Visual Q and A")
+selected_model = st.sidebar.selectbox("Select the ML Model", models)
+# --- Main Page ---
+st.title("📚 VQA on the Unsupervised Machine Learning Algorithms")
+about = """
+**How to use this App**
+This app leverages Gemini 2.0 to provide insights on the provided document.
+Select a question from the dropdown menu or enter your own question to get
+Gemini's generated response based on the provided document.
+"""
+with st.expander("How to use this App"):
+    st.markdown(about)
+# --- Q and A Tab ---
+st.header("Questions and Answers")
+# Generate 5 questions based on the selected model
+if selected_model == "K-Means Clustering":
+    questions = [
+    "What is the fundamental objective of the K-Means clustering algorithm, and how does it achieve this objective?",
+    "Explain the concept of 'inertia' in the context of K-Means clustering and its role in the algorithm's operation.",
+    "Describe the four key steps involved in the K-Means clustering process, providing details about each step.",
+    "What are the main advantages and disadvantages of using the K-Means clustering algorithm?",
+    "How does the selection of the 'k' value (number of clusters) influence the results of K-Means clustering? What are some common methods for determining the optimal 'k'?",
+    "Discuss the issue of sensitivity to initialization in K-Means clustering. How can this sensitivity affect the clustering results, and what strategies can be employed to mitigate this issue?",
+    "Explain why K-Means clustering might struggle with datasets containing clusters of varying shapes and densities. Are there any modifications or alternative algorithms that can address this limitation?",
+    "How can outliers impact the performance of K-Means clustering? Discuss techniques for identifying and handling outliers in the context of this algorithm.",
+    "Describe several real-world applications where K-Means clustering can be effectively utilized, providing specific examples.",
+    "Compare and contrast K-Means clustering with other unsupervised learning algorithms, such as hierarchical clustering or DBSCAN, highlighting their relative strengths and weaknesses."
+    ]
+if selected_model == "Hierarchical Clustering":
+    questions = [
+    "What is the primary objective of hierarchical clustering, and how does it differ from other clustering techniques like k-means?",
+    "Explain the difference between the agglomerative and divisive approaches to hierarchical clustering, and provide a real-world example where each approach might be preferred.",
+    "Describe the concept of 'linkage criteria' in hierarchical clustering. Discuss the three common types of linkage (single, complete, and average) and how they influence cluster formation.",
+    "How can a dendrogram be used to interpret the results of hierarchical clustering? What information can you glean from its structure and branch lengths?",
+    "Discuss the advantages and disadvantages of hierarchical clustering compared to other unsupervised learning methods. When might you choose hierarchical clustering over k-means or DBSCAN?",
+    "How does the choice of distance metric affect the results of hierarchical clustering? Explain the impact of using different distance metrics like Euclidean, Manhattan, and cosine distance.",
+    "Hierarchical clustering can be sensitive to noise and outliers. How can you identify and address these issues when applying this technique?",
+    "Explain how hierarchical clustering can be used for exploratory data analysis. Provide an example of how you might use it to gain insights into a new dataset.",
+    "Discuss the computational complexity of hierarchical clustering. How does it scale with the number of data points, and what are some strategies for handling large datasets?",
+    "Can hierarchical clustering be used with categorical data? If so, how would you adapt the distance metric and linkage criteria to handle such data?"
+    ]
+if selected_model == "DBSCAN":
+    questions = [
+    "What are the core differences between DBSCAN and traditional clustering algorithms like K-Means, and how do these differences impact the types of data structures they can effectively cluster?",
+    "Explain the concept of density-based clustering and how DBSCAN utilizes this concept to identify clusters.",
+    "How does DBSCAN handle outliers, and why is this approach beneficial in certain datasets compared to other clustering techniques?",
+    "What are the two key parameters in DBSCAN, and how do they influence the clustering outcome?",
+    "Describe the process of identifying core points, border points, and noise points in DBSCAN.",
+    "Discuss the advantages and disadvantages of using DBSCAN, particularly its ability to handle arbitrarily shaped clusters and its sensitivity to parameter settings.",
+    "In what scenarios would DBSCAN be a more suitable choice than K-Means or hierarchical clustering?",
+    "How does DBSCAN's ability to identify noise contribute to its effectiveness in anomaly detection tasks?",
+    "What are some real-world applications of DBSCAN, and how does its density-based approach address the specific challenges of these applications?",
+    "How does DBSCAN compare to other density-based clustering algorithms, and what factors might lead you to choose DBSCAN over alternative methods?"
+    ]
+if selected_model == "Gaussian Mixture Models":
+    questions = [
+    "Explain the underlying assumption of Gaussian Mixture Models (GMMs) and how it differs from the assumptions made by K-Means clustering.",
+    "Describe the role of Gaussian distributions in GMMs and how they contribute to the model's flexibility in capturing cluster shapes.",
+    "How does the Expectation-Maximization (EM) algorithm facilitate the estimation of parameters in GMMs?",
+    "What are the advantages of using GMMs over K-Means for clustering data with varying shapes and densities?",
+    "Explain the concept of 'soft clustering' in GMMs and how it provides a more nuanced understanding of cluster assignments compared to 'hard clustering' methods.",
+    "How can GMMs be used for density estimation, and what are the benefits of this probabilistic approach?",
+    "Discuss the challenges associated with initializing GMMs and the potential impact on the final clustering results.",
+    "In what situations might GMMs be a preferred choice over other clustering algorithms, considering their strengths and weaknesses?",
+    "How does the concept of 'responsibility' in the E-step of the EM algorithm help in assigning data points to Gaussian components?",
+    "Provide examples of real-world applications where GMMs have been successfully employed for clustering or density estimation tasks."
+    ]
+if selected_model == "Principal Component Analysis (PCA)":
+    questions = [
+    "How does PCA achieve dimensionality reduction, and what are the key mathematical concepts involved in this process?",
+    "Explain the role of eigenvectors and eigenvalues in PCA, and how they contribute to identifying principal components.",
+    "What are the benefits of using PCA for dimensionality reduction, particularly in the context of large datasets?",
+    "How does PCA help in addressing the curse of dimensionality, and why is this important in machine learning?",
+    "Describe the steps involved in performing PCA, including data standardization and the selection of principal components.",
+    "Discuss the limitations of PCA, such as its linearity assumption and potential issues with interpretability.",
+    "In what situations might PCA not be suitable for dimensionality reduction, and what alternative techniques could be considered?",
+    "How can PCA be used to improve the performance of other machine learning algorithms, and what types of algorithms benefit most from this preprocessing step?",
+    "What are some real-world applications of PCA, and how does its ability to reduce dimensionality contribute to solving these problems?",
+    "How does PCA compare to other dimensionality reduction techniques, and what factors would influence your choice between PCA and alternative methods?"
+    ]
+if selected_model == "Self-Organizing Maps (SOM)":
+    questions = [
+    "Explain the concept of a Self-Organizing Map (SOM) and its role in unsupervised learning.",
+    "Describe the structure of a SOM, including its layers and the connections between neurons.",
+    "How does the competitive learning process work in a SOM, and how is the Best Matching Unit (BMU) determined?",
+    "Explain the process of weight adaptation in a SOM and how it leads to the formation of a topological map.",
+    "What are the key parameters involved in training a SOM, and how do they affect the resulting map?",
+    "Discuss the advantages and disadvantages of using SOMs for dimensionality reduction and visualization.",
+    "How does a SOM preserve the topological properties of the input data, and why is this important?",
+    "What are some common applications of SOMs in fields like data analysis, image processing, and pattern recognition?",
+    "Compare and contrast SOMs with other unsupervised learning techniques such as K-Means clustering and Principal Component Analysis (PCA).",
+    "How can SOMs be used for clustering and classification tasks, and what are the limitations of this approach?"
+    ]
+if selected_model == "t-Distributed Stochastic Neighbor Embedding":
+    questions = [
+    "What is the primary objective of t-SNE, and how does it differ from the goals of principal component analysis (PCA)?",
+    "Explain the concept of 'perplexity' in t-SNE and its role in balancing local and global structure preservation.",
+    "How does t-SNE use probability distributions to represent relationships between data points in high-dimensional and low-dimensional spaces?",
+    "Describe the optimization process in t-SNE and the challenges associated with minimizing the Kullback-Leibler divergence.",
+    "What are the advantages of t-SNE over linear dimensionality reduction techniques like PCA, particularly for visualizing complex datasets?",
+    "Discuss the limitations of t-SNE, including its computational cost and sensitivity to parameter settings.",
+    "How does the 'crowding problem' affect t-SNE visualizations, and what strategies can be used to mitigate this issue?",
+    "In what situations would t-SNE be the preferred choice for dimensionality reduction and visualization compared to other techniques?",
+    "Provide examples of real-world applications where t-SNE has been successfully used to gain insights from high-dimensional data.",
+    "How can t-SNE be combined with other machine learning techniques, such as clustering or classification, to improve data analysis and visualization?"
+    ]
+if selected_model == "Autoencoders":
+    questions = [
+    "What is the fundamental purpose of an autoencoder, and how does it differ from other unsupervised learning techniques like clustering or dimensionality reduction?",
+    "Describe the two main components of an autoencoder and their respective roles in the learning process.",
+    "Explain the concept of a latent space representation in the context of autoencoders. How does this representation contribute to dimensionality reduction and feature extraction?",
+    "How does the training process of an autoencoder work, and what is the significance of minimizing reconstruction error?",
+    "What are the advantages of using autoencoders for non-linear dimensionality reduction compared to linear techniques like PCA?",
+    "Discuss how autoencoders can be applied to tasks such as denoising and anomaly detection.",
+    "What are some potential challenges or drawbacks of using autoencoders, such as overfitting or the need for large datasets?",
+    "How can techniques like regularization help to mitigate the risk of overfitting in autoencoders?",
+    "Explain how the flexibility of autoencoders allows them to be adapted to various architectures and applications.",
+    "Can you provide examples of real-world applications where autoencoders have been successfully used for dimensionality reduction, feature extraction, or other unsupervised learning tasks?"
+    ]
+if selected_model == "Association Rule Learning":
+    questions = [
+    "What is the primary goal of Association Rule Learning, and how does it differ from other unsupervised learning techniques like clustering or dimensionality reduction?",
+    "Explain the concept of 'support' and 'confidence' in Association Rule Learning, and how these metrics are used to evaluate the strength of an association rule.",
+    "Describe the Apriori algorithm, focusing on its key steps for generating frequent itemsets and association rules.",
+    "How does the Apriori algorithm address the challenge of computational complexity when dealing with a large number of possible itemsets?",
+    "What are the advantages and disadvantages of using Association Rule Learning, particularly in terms of interpretability and computational cost?",
+    "In what real-world scenarios is Association Rule Learning most applicable, and what types of insights can be gained from its application?",
+    "How does the choice of support and confidence thresholds impact the number and quality of discovered rules, and what factors should be considered when setting these thresholds?",
+    "What are some potential challenges or limitations of Association Rule Learning, such as dealing with rare items or handling continuous variables?",
+    "How can Association Rule Learning be used in conjunction with other data mining or machine learning techniques to enhance its effectiveness?",
+    "Discuss the ethical considerations surrounding the application of Association Rule Learning, particularly in areas like customer privacy and targeted advertising."
+    ]
+# Create a selection box
+selected_question = st.selectbox("Choose a question", questions)
+# Display a checkbox
+if st.checkbox('Check this box to ask a question not listed above'):
+    # If the checkbox is checked, display a text box
+    selected_question = st.text_input('Enter a question')
+if st.button("Ask AI"):
+    with st.spinner("AI is thinking..."):
+        if st.session_state.uploaded_pdf_path is None:
+            st.session_state.uploaded_pdf_path = download_pdf()
+        filepath = st.session_state.uploaded_pdf_path
+        text_prompt = f"Use the provided document focus on rhe topic: {selected_model} to answer the following question: {selected_question}.  Use your own knowledge as well as sources from the web and the provided document. Always cite your sourcss."
+        response = multimodal_prompt(filepath, text_prompt)  # Use the downloaded filepath
+        st.markdown(f"**Question:** {selected_question}")
+        st.markdown(f"**Response:** {response}")
+if st.session_state.uploaded_pdf_path:
+    display_download_button(st.session_state.uploaded_pdf_path, "Unsupervised_Learning_Algorithms.pdf")
+st.markdown("[Visit our Hugging Face Space!](https://huggingface.co/wvsuaidev)")
+st.markdown("© 2025 WVSU AI Dev Team 🤖 ✨")

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+streamlit
+huggingface_hub
+google-generativeai