Spaces:
Build error
Build error
| import streamlit as st | |
| import os | |
| import google.generativeai as genai | |
| from huggingface_hub import hf_hub_download | |
| import base64 | |
# Gemini model used for every request made by this app.
MODEL_ID = "gemini-2.0-flash-exp"  # Keep the model ID as is

# Configure the Gemini client from the GEMINI_API_KEY environment variable.
# A failure here is reported in the UI and the script run is halted.
try:
    api_key = os.getenv("GEMINI_API_KEY")
    model_id = MODEL_ID
    genai.configure(api_key=api_key)
except Exception as e:
    st.error(f"Error: {e}")
    # st.stop must be *called*; a bare `st.stop` reference does nothing and
    # would let the script continue after a configuration failure.
    st.stop()

# Model handle and chat session used by multimodal_prompt().
model = genai.GenerativeModel(MODEL_ID)
chat = model.start_chat()
def download_pdf():
    """
    Fetch the source PDF from its Hugging Face Hub dataset repository.

    The access token is read from the HF_TOKEN environment variable. If
    anything goes wrong, the error is shown in the Streamlit UI and
    st.stop() is called.

    Returns:
        The file path returned by hf_hub_download for the requested PDF.
    """
    dataset_repo = "louiecerv/vqa_machine_learning_dataset"  # Corrected dataset repo path
    pdf_name = "Unsupervised_Learning_Algorithms.pdf"
    try:
        local_path = hf_hub_download(
            repo_id=dataset_repo,
            filename=pdf_name,
            token=os.getenv("HF_TOKEN"),
            repo_type="dataset",
        )
    except Exception as e:
        st.error(f"Failed to download PDF from Hugging Face Hub: {e}")
        st.stop()  # Stop if the download fails
    else:
        return local_path
# Seed the Streamlit session state. Each entry is (key, factory); a factory
# is only invoked when its key is missing, so the PDF download runs only
# when "uploaded_pdf_path" has not been stored yet.
_SESSION_DEFAULTS = (
    ("conversation_history", list),       # running list of chat turns
    ("uploaded_file_part", lambda: None),  # Store the file *part*
    ("uploaded_pdf_path", download_pdf),   # local path of the source PDF
)
for _key, _factory in _SESSION_DEFAULTS:
    if _key not in st.session_state:
        st.session_state[_key] = _factory()
def multimodal_prompt(pdf_path, text_prompt):
    """
    Ask Gemini a question about the PDF and record the exchange.

    The PDF is uploaded through genai.upload_file only when no uploaded file
    object is stored in the session state yet; the stored object is then
    attached to every message sent through the module-level chat session.

    Args:
        pdf_path: The path to the PDF file.
        text_prompt: The text prompt for the model.

    Returns:
        The model's response text, or an "An error occurred: ..." message
        if any step raised an exception.
    """
    try:
        state = st.session_state

        # Upload once per session and remember the resulting file object.
        if state.uploaded_file_part is None:
            state.uploaded_file_part = genai.upload_file(pdf_path, mime_type="application/pdf")

        # Every turn carries the text plus the stored file object.
        reply = chat.send_message([text_prompt, state.uploaded_file_part])

        # Update conversation history (user turn first, then the answer).
        history = state.conversation_history
        history.append({"role": "user", "content": text_prompt, "has_pdf": True})
        history.append({"role": "assistant", "content": reply.text})
        return reply.text
    except Exception as e:
        return f"An error occurred: {e}"
def display_download_button(file_path, file_name):
    """
    Render a download button for a local PDF file.

    Uses Streamlit's download button rather than a hand-built HTML anchor
    with a base64 data URI, so no raw HTML is injected into the page and
    file_name is never interpolated into markup.

    Args:
        file_path: Path of the file on disk to offer for download.
        file_name: File name suggested to the browser for the saved copy.

    Returns:
        None. Failures are reported in the Streamlit UI instead of raised.
    """
    try:
        with open(file_path, "rb") as f:
            file_bytes = f.read()
        st.download_button(
            label="Download the source document (PDF)",
            data=file_bytes,
            file_name=file_name,
            mime="application/pdf",
        )
    except FileNotFoundError:
        st.error("File not found for download.")
    except Exception as e:
        st.error(f"Error during download: {e}")
# Algorithms offered in the sidebar selector.
models = [
    "K-Means Clustering",
    "Hierarchical Clustering",
    "DBSCAN",
    "Gaussian Mixture Models",
    "Principal Component Analysis (PCA)",
    "t-Distributed Stochastic Neighbor Embedding",
    "Autoencoders",
    "Self-Organizing Maps (SOM)",
    "Association Rule Learning",
]

# --- Sidebar ---
st.sidebar.title("🤖 Visual Q and A")
selected_model = st.sidebar.selectbox("Select the ML Model", models)

# --- Main Page ---
st.title("📚 VQA on the Unsupervised Machine Learning Algorithms")

# Help text shown inside the collapsible panel below the title.
about = """
**How to use this App**
This app leverages Gemini 2.0 to provide insights on the provided document.
Select a question from the dropdown menu or enter your own question to get
Gemini's generated response based on the provided document.
"""
how_to_panel = st.expander("How to use this App")
how_to_panel.markdown(about)

# --- Q and A Tab ---
st.header("Questions and Answers")
# Ten suggested questions for each algorithm, keyed by the exact label used
# in the sidebar selector. A lookup table replaces a chain of independent
# `if` statements, so only one comparison is made per run.
QUESTIONS_BY_MODEL = {
    "K-Means Clustering": [
        "What is the fundamental objective of the K-Means clustering algorithm, and how does it achieve this objective?",
        "Explain the concept of 'inertia' in the context of K-Means clustering and its role in the algorithm's operation.",
        "Describe the four key steps involved in the K-Means clustering process, providing details about each step.",
        "What are the main advantages and disadvantages of using the K-Means clustering algorithm?",
        "How does the selection of the 'k' value (number of clusters) influence the results of K-Means clustering? What are some common methods for determining the optimal 'k'?",
        "Discuss the issue of sensitivity to initialization in K-Means clustering. How can this sensitivity affect the clustering results, and what strategies can be employed to mitigate this issue?",
        "Explain why K-Means clustering might struggle with datasets containing clusters of varying shapes and densities. Are there any modifications or alternative algorithms that can address this limitation?",
        "How can outliers impact the performance of K-Means clustering? Discuss techniques for identifying and handling outliers in the context of this algorithm.",
        "Describe several real-world applications where K-Means clustering can be effectively utilized, providing specific examples.",
        "Compare and contrast K-Means clustering with other unsupervised learning algorithms, such as hierarchical clustering or DBSCAN, highlighting their relative strengths and weaknesses.",
    ],
    "Hierarchical Clustering": [
        "What is the primary objective of hierarchical clustering, and how does it differ from other clustering techniques like k-means?",
        "Explain the difference between the agglomerative and divisive approaches to hierarchical clustering, and provide a real-world example where each approach might be preferred.",
        "Describe the concept of 'linkage criteria' in hierarchical clustering. Discuss the three common types of linkage (single, complete, and average) and how they influence cluster formation.",
        "How can a dendrogram be used to interpret the results of hierarchical clustering? What information can you glean from its structure and branch lengths?",
        "Discuss the advantages and disadvantages of hierarchical clustering compared to other unsupervised learning methods. When might you choose hierarchical clustering over k-means or DBSCAN?",
        "How does the choice of distance metric affect the results of hierarchical clustering? Explain the impact of using different distance metrics like Euclidean, Manhattan, and cosine distance.",
        "Hierarchical clustering can be sensitive to noise and outliers. How can you identify and address these issues when applying this technique?",
        "Explain how hierarchical clustering can be used for exploratory data analysis. Provide an example of how you might use it to gain insights into a new dataset.",
        "Discuss the computational complexity of hierarchical clustering. How does it scale with the number of data points, and what are some strategies for handling large datasets?",
        "Can hierarchical clustering be used with categorical data? If so, how would you adapt the distance metric and linkage criteria to handle such data?",
    ],
    "DBSCAN": [
        "What are the core differences between DBSCAN and traditional clustering algorithms like K-Means, and how do these differences impact the types of data structures they can effectively cluster?",
        "Explain the concept of density-based clustering and how DBSCAN utilizes this concept to identify clusters.",
        "How does DBSCAN handle outliers, and why is this approach beneficial in certain datasets compared to other clustering techniques?",
        "What are the two key parameters in DBSCAN, and how do they influence the clustering outcome?",
        "Describe the process of identifying core points, border points, and noise points in DBSCAN.",
        "Discuss the advantages and disadvantages of using DBSCAN, particularly its ability to handle arbitrarily shaped clusters and its sensitivity to parameter settings.",
        "In what scenarios would DBSCAN be a more suitable choice than K-Means or hierarchical clustering?",
        "How does DBSCAN's ability to identify noise contribute to its effectiveness in anomaly detection tasks?",
        "What are some real-world applications of DBSCAN, and how does its density-based approach address the specific challenges of these applications?",
        "How does DBSCAN compare to other density-based clustering algorithms, and what factors might lead you to choose DBSCAN over alternative methods?",
    ],
    "Gaussian Mixture Models": [
        "Explain the underlying assumption of Gaussian Mixture Models (GMMs) and how it differs from the assumptions made by K-Means clustering.",
        "Describe the role of Gaussian distributions in GMMs and how they contribute to the model's flexibility in capturing cluster shapes.",
        "How does the Expectation-Maximization (EM) algorithm facilitate the estimation of parameters in GMMs?",
        "What are the advantages of using GMMs over K-Means for clustering data with varying shapes and densities?",
        "Explain the concept of 'soft clustering' in GMMs and how it provides a more nuanced understanding of cluster assignments compared to 'hard clustering' methods.",
        "How can GMMs be used for density estimation, and what are the benefits of this probabilistic approach?",
        "Discuss the challenges associated with initializing GMMs and the potential impact on the final clustering results.",
        "In what situations might GMMs be a preferred choice over other clustering algorithms, considering their strengths and weaknesses?",
        "How does the concept of 'responsibility' in the E-step of the EM algorithm help in assigning data points to Gaussian components?",
        "Provide examples of real-world applications where GMMs have been successfully employed for clustering or density estimation tasks.",
    ],
    "Principal Component Analysis (PCA)": [
        "How does PCA achieve dimensionality reduction, and what are the key mathematical concepts involved in this process?",
        "Explain the role of eigenvectors and eigenvalues in PCA, and how they contribute to identifying principal components.",
        "What are the benefits of using PCA for dimensionality reduction, particularly in the context of large datasets?",
        "How does PCA help in addressing the curse of dimensionality, and why is this important in machine learning?",
        "Describe the steps involved in performing PCA, including data standardization and the selection of principal components.",
        "Discuss the limitations of PCA, such as its linearity assumption and potential issues with interpretability.",
        "In what situations might PCA not be suitable for dimensionality reduction, and what alternative techniques could be considered?",
        "How can PCA be used to improve the performance of other machine learning algorithms, and what types of algorithms benefit most from this preprocessing step?",
        "What are some real-world applications of PCA, and how does its ability to reduce dimensionality contribute to solving these problems?",
        "How does PCA compare to other dimensionality reduction techniques, and what factors would influence your choice between PCA and alternative methods?",
    ],
    "Self-Organizing Maps (SOM)": [
        "Explain the concept of a Self-Organizing Map (SOM) and its role in unsupervised learning.",
        "Describe the structure of a SOM, including its layers and the connections between neurons.",
        "How does the competitive learning process work in a SOM, and how is the Best Matching Unit (BMU) determined?",
        "Explain the process of weight adaptation in a SOM and how it leads to the formation of a topological map.",
        "What are the key parameters involved in training a SOM, and how do they affect the resulting map?",
        "Discuss the advantages and disadvantages of using SOMs for dimensionality reduction and visualization.",
        "How does a SOM preserve the topological properties of the input data, and why is this important?",
        "What are some common applications of SOMs in fields like data analysis, image processing, and pattern recognition?",
        "Compare and contrast SOMs with other unsupervised learning techniques such as K-Means clustering and Principal Component Analysis (PCA).",
        "How can SOMs be used for clustering and classification tasks, and what are the limitations of this approach?",
    ],
    "t-Distributed Stochastic Neighbor Embedding": [
        "What is the primary objective of t-SNE, and how does it differ from the goals of principal component analysis (PCA)?",
        "Explain the concept of 'perplexity' in t-SNE and its role in balancing local and global structure preservation.",
        "How does t-SNE use probability distributions to represent relationships between data points in high-dimensional and low-dimensional spaces?",
        "Describe the optimization process in t-SNE and the challenges associated with minimizing the Kullback-Leibler divergence.",
        "What are the advantages of t-SNE over linear dimensionality reduction techniques like PCA, particularly for visualizing complex datasets?",
        "Discuss the limitations of t-SNE, including its computational cost and sensitivity to parameter settings.",
        "How does the 'crowding problem' affect t-SNE visualizations, and what strategies can be used to mitigate this issue?",
        "In what situations would t-SNE be the preferred choice for dimensionality reduction and visualization compared to other techniques?",
        "Provide examples of real-world applications where t-SNE has been successfully used to gain insights from high-dimensional data.",
        "How can t-SNE be combined with other machine learning techniques, such as clustering or classification, to improve data analysis and visualization?",
    ],
    "Autoencoders": [
        "What is the fundamental purpose of an autoencoder, and how does it differ from other unsupervised learning techniques like clustering or dimensionality reduction?",
        "Describe the two main components of an autoencoder and their respective roles in the learning process.",
        "Explain the concept of a latent space representation in the context of autoencoders. How does this representation contribute to dimensionality reduction and feature extraction?",
        "How does the training process of an autoencoder work, and what is the significance of minimizing reconstruction error?",
        "What are the advantages of using autoencoders for non-linear dimensionality reduction compared to linear techniques like PCA?",
        "Discuss how autoencoders can be applied to tasks such as denoising and anomaly detection.",
        "What are some potential challenges or drawbacks of using autoencoders, such as overfitting or the need for large datasets?",
        "How can techniques like regularization help to mitigate the risk of overfitting in autoencoders?",
        "Explain how the flexibility of autoencoders allows them to be adapted to various architectures and applications.",
        "Can you provide examples of real-world applications where autoencoders have been successfully used for dimensionality reduction, feature extraction, or other unsupervised learning tasks?",
    ],
    "Association Rule Learning": [
        "What is the primary goal of Association Rule Learning, and how does it differ from other unsupervised learning techniques like clustering or dimensionality reduction?",
        "Explain the concept of 'support' and 'confidence' in Association Rule Learning, and how these metrics are used to evaluate the strength of an association rule.",
        "Describe the Apriori algorithm, focusing on its key steps for generating frequent itemsets and association rules.",
        "How does the Apriori algorithm address the challenge of computational complexity when dealing with a large number of possible itemsets?",
        "What are the advantages and disadvantages of using Association Rule Learning, particularly in terms of interpretability and computational cost?",
        "In what real-world scenarios is Association Rule Learning most applicable, and what types of insights can be gained from its application?",
        "How does the choice of support and confidence thresholds impact the number and quality of discovered rules, and what factors should be considered when setting these thresholds?",
        "What are some potential challenges or limitations of Association Rule Learning, such as dealing with rare items or handling continuous variables?",
        "How can Association Rule Learning be used in conjunction with other data mining or machine learning techniques to enhance its effectiveness?",
        "Discuss the ethical considerations surrounding the application of Association Rule Learning, particularly in areas like customer privacy and targeted advertising.",
    ],
}

# Questions for the currently selected algorithm. An unrecognised label
# yields an empty list instead of leaving `questions` undefined.
questions = QUESTIONS_BY_MODEL.get(selected_model, [])
# Create a selection box with the suggested questions
selected_question = st.selectbox("Choose a question", questions)

# Display a checkbox
if st.checkbox('Check this box to ask a question not listed above'):
    # If the checkbox is checked, a free-text question replaces the selection
    selected_question = st.text_input('Enter a question')

if st.button("Ask AI"):
    if not (selected_question or "").strip():
        # Nothing was selected or typed; do not send an empty question to the model.
        st.warning("Please enter a question before asking the AI.")
    else:
        with st.spinner("AI is thinking..."):
            # Make sure the source PDF is available locally before prompting.
            if st.session_state.uploaded_pdf_path is None:
                st.session_state.uploaded_pdf_path = download_pdf()
            filepath = st.session_state.uploaded_pdf_path
            text_prompt = (
                f"Use the provided document, focusing on the topic: {selected_model}, "
                f"to answer the following question: {selected_question}. "
                "Use your own knowledge as well as sources from the web and the provided document. "
                "Always cite your sources."
            )
            response = multimodal_prompt(filepath, text_prompt)  # Use the downloaded filepath
            st.markdown(f"**Question:** {selected_question}")
            st.markdown(f"**Response:** {response}")

# Offer the source document for download whenever a local copy exists.
if st.session_state.uploaded_pdf_path:
    display_download_button(st.session_state.uploaded_pdf_path, "Unsupervised_Learning_Algorithms.pdf")

# Footer
st.markdown("[Visit our Hugging Face Space!](https://huggingface.co/wvsuaidev)")
st.markdown("© 2025 WVSU AI Dev Team 🤖 ✨")