louiecerv commited on
Commit
815ea8d
·
1 Parent(s): b13d8d1

Sync with remote

Browse files
Files changed (2) hide show
  1. app.py +248 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import google.generativeai as genai
4
+ from huggingface_hub import hf_hub_download
5
+ import base64
6
+
7
+ MODEL_ID = "gemini-2.0-flash-exp" # Keep the model ID as is
8
+ try:
9
+ api_key = os.getenv("GEMINI_API_KEY")
10
+ model_id = MODEL_ID
11
+ genai.configure(api_key=api_key)
12
+ except Exception as e:
13
+ st.error(f"Error: {e}")
14
+ st.stop
15
+
16
+ model = genai.GenerativeModel(MODEL_ID)
17
+ chat = model.start_chat()
18
+
19
+ def download_pdf():
20
+ """
21
+ Downloads the PDF file from the Hugging Face Hub using the correct repo path and filename.
22
+ """
23
+ try:
24
+ hf_token = os.getenv("HF_TOKEN")
25
+ repo_id = "louiecerv/vqa_machine_learning_dataset" # Corrected dataset repo path
26
+ filename = "Unsupervised_Learning_Algorithms.pdf"
27
+ filepath = hf_hub_download(repo_id=repo_id, filename=filename, token=hf_token, repo_type="dataset")
28
+ return filepath
29
+ except Exception as e:
30
+ st.error(f"Failed to download PDF from Hugging Face Hub: {e}")
31
+ st.stop() # Stop if the download fails
32
+
33
+ # Initialize conversation history in Streamlit session state
34
+ if "conversation_history" not in st.session_state:
35
+ st.session_state.conversation_history = []
36
+ if "uploaded_file_part" not in st.session_state: # Store the file *part*
37
+ st.session_state.uploaded_file_part = None
38
+ if "uploaded_pdf_path" not in st.session_state:
39
+ st.session_state.uploaded_pdf_path = download_pdf()
40
+
41
+ def multimodal_prompt(pdf_path, text_prompt):
42
+ """
43
+ Sends a multimodal prompt to Gemini, handling file uploads efficiently.
44
+ Args:
45
+ pdf_path: The path to the PDF file.
46
+ text_prompt: The text prompt for the model.
47
+ Returns:
48
+ The model's response as a string, or an error message.
49
+ """
50
+ try:
51
+ if st.session_state.uploaded_file_part is None: # First time, upload
52
+ pdf_part = genai.upload_file(pdf_path, mime_type="application/pdf")
53
+ st.session_state.uploaded_file_part = pdf_part
54
+ prompt = [text_prompt, pdf_part] # First turn includes the actual file
55
+ else: # Subsequent turns, reference the file
56
+
57
+ prompt = [text_prompt, st.session_state.uploaded_file_part] # Subsequent turns include the file reference
58
+
59
+ response = chat.send_message(prompt)
60
+
61
+ # Update conversation history
62
+ st.session_state.conversation_history.append({"role": "user", "content": text_prompt, "has_pdf": True})
63
+ st.session_state.conversation_history.append({"role": "assistant", "content": response.text})
64
+ return response.text
65
+
66
+ except Exception as e:
67
+ return f"An error occurred: {e}"
68
+
69
+ def display_download_button(file_path, file_name):
70
+ try:
71
+ with open(file_path, "rb") as f:
72
+ file_bytes = f.read()
73
+ b64 = base64.b64encode(file_bytes).decode()
74
+ href = f'<a href="data:application/pdf;base64,{b64}" download="{file_name}">Download the source document (PDF)</a>'
75
+ st.markdown(href, unsafe_allow_html=True)
76
+ except FileNotFoundError:
77
+ st.error("File not found for download.")
78
+ except Exception as e:
79
+ st.error(f"Error during download: {e}")
80
+
81
+ # Define the ML Models
82
+ models = ["K-Means Clustering", "Hierarchical Clustering",
83
+ "DBSCAN", "Gaussian Mixture Models", "Principal Component Analysis (PCA)",
84
+ "t-Distributed Stochastic Neighbor Embedding", "Autoencoders", "Self-Organizing Maps (SOM)", "Association Rule Learning"]
85
+
86
+ # --- Sidebar ---
87
+ st.sidebar.title("🤖 Visual Q and A")
88
+ selected_model = st.sidebar.selectbox("Select the ML Model", models)
89
+
90
+ # --- Main Page ---
91
+ st.title("📚 VQA on the Unsupervised Machine Learning Algorithms")
92
+ about = """
93
+
94
+ **How to use this App**
95
+ This app leverages Gemini 2.0 to provide insights on the provided document.
96
+ Select a question from the dropdown menu or enter your own question to get
97
+ Gemini's generated response based on the provided document.
98
+ """
99
+
100
+ with st.expander("How to use this App"):
101
+ st.markdown(about)
102
+
103
+ # --- Q and A Tab ---
104
+ st.header("Questions and Answers")
105
+
106
+ # Generate 5 questions based on the selected model
107
+ if selected_model == "K-Means Clustering":
108
+ questions = [
109
+ "What is the fundamental objective of the K-Means clustering algorithm, and how does it achieve this objective?",
110
+ "Explain the concept of 'inertia' in the context of K-Means clustering and its role in the algorithm's operation.",
111
+ "Describe the four key steps involved in the K-Means clustering process, providing details about each step.",
112
+ "What are the main advantages and disadvantages of using the K-Means clustering algorithm?",
113
+ "How does the selection of the 'k' value (number of clusters) influence the results of K-Means clustering? What are some common methods for determining the optimal 'k'?",
114
+ "Discuss the issue of sensitivity to initialization in K-Means clustering. How can this sensitivity affect the clustering results, and what strategies can be employed to mitigate this issue?",
115
+ "Explain why K-Means clustering might struggle with datasets containing clusters of varying shapes and densities. Are there any modifications or alternative algorithms that can address this limitation?",
116
+ "How can outliers impact the performance of K-Means clustering? Discuss techniques for identifying and handling outliers in the context of this algorithm.",
117
+ "Describe several real-world applications where K-Means clustering can be effectively utilized, providing specific examples.",
118
+ "Compare and contrast K-Means clustering with other unsupervised learning algorithms, such as hierarchical clustering or DBSCAN, highlighting their relative strengths and weaknesses."
119
+ ]
120
+ if selected_model == "Hierarchical Clustering":
121
+ questions = [
122
+ "What is the primary objective of hierarchical clustering, and how does it differ from other clustering techniques like k-means?",
123
+ "Explain the difference between the agglomerative and divisive approaches to hierarchical clustering, and provide a real-world example where each approach might be preferred.",
124
+ "Describe the concept of 'linkage criteria' in hierarchical clustering. Discuss the three common types of linkage (single, complete, and average) and how they influence cluster formation.",
125
+ "How can a dendrogram be used to interpret the results of hierarchical clustering? What information can you glean from its structure and branch lengths?",
126
+ "Discuss the advantages and disadvantages of hierarchical clustering compared to other unsupervised learning methods. When might you choose hierarchical clustering over k-means or DBSCAN?",
127
+ "How does the choice of distance metric affect the results of hierarchical clustering? Explain the impact of using different distance metrics like Euclidean, Manhattan, and cosine distance.",
128
+ "Hierarchical clustering can be sensitive to noise and outliers. How can you identify and address these issues when applying this technique?",
129
+ "Explain how hierarchical clustering can be used for exploratory data analysis. Provide an example of how you might use it to gain insights into a new dataset.",
130
+ "Discuss the computational complexity of hierarchical clustering. How does it scale with the number of data points, and what are some strategies for handling large datasets?",
131
+ "Can hierarchical clustering be used with categorical data? If so, how would you adapt the distance metric and linkage criteria to handle such data?"
132
+ ]
133
+ if selected_model == "DBSCAN":
134
+ questions = [
135
+ "What are the core differences between DBSCAN and traditional clustering algorithms like K-Means, and how do these differences impact the types of data structures they can effectively cluster?",
136
+ "Explain the concept of density-based clustering and how DBSCAN utilizes this concept to identify clusters.",
137
+ "How does DBSCAN handle outliers, and why is this approach beneficial in certain datasets compared to other clustering techniques?",
138
+ "What are the two key parameters in DBSCAN, and how do they influence the clustering outcome?",
139
+ "Describe the process of identifying core points, border points, and noise points in DBSCAN.",
140
+ "Discuss the advantages and disadvantages of using DBSCAN, particularly its ability to handle arbitrarily shaped clusters and its sensitivity to parameter settings.",
141
+ "In what scenarios would DBSCAN be a more suitable choice than K-Means or hierarchical clustering?",
142
+ "How does DBSCAN's ability to identify noise contribute to its effectiveness in anomaly detection tasks?",
143
+ "What are some real-world applications of DBSCAN, and how does its density-based approach address the specific challenges of these applications?",
144
+ "How does DBSCAN compare to other density-based clustering algorithms, and what factors might lead you to choose DBSCAN over alternative methods?"
145
+ ]
146
+ if selected_model == "Gaussian Mixture Models":
147
+ questions = [
148
+ "Explain the underlying assumption of Gaussian Mixture Models (GMMs) and how it differs from the assumptions made by K-Means clustering.",
149
+ "Describe the role of Gaussian distributions in GMMs and how they contribute to the model's flexibility in capturing cluster shapes.",
150
+ "How does the Expectation-Maximization (EM) algorithm facilitate the estimation of parameters in GMMs?",
151
+ "What are the advantages of using GMMs over K-Means for clustering data with varying shapes and densities?",
152
+ "Explain the concept of 'soft clustering' in GMMs and how it provides a more nuanced understanding of cluster assignments compared to 'hard clustering' methods.",
153
+ "How can GMMs be used for density estimation, and what are the benefits of this probabilistic approach?",
154
+ "Discuss the challenges associated with initializing GMMs and the potential impact on the final clustering results.",
155
+ "In what situations might GMMs be a preferred choice over other clustering algorithms, considering their strengths and weaknesses?",
156
+ "How does the concept of 'responsibility' in the E-step of the EM algorithm help in assigning data points to Gaussian components?",
157
+ "Provide examples of real-world applications where GMMs have been successfully employed for clustering or density estimation tasks."
158
+ ]
159
+ if selected_model == "Principal Component Analysis (PCA)":
160
+ questions = [
161
+ "How does PCA achieve dimensionality reduction, and what are the key mathematical concepts involved in this process?",
162
+ "Explain the role of eigenvectors and eigenvalues in PCA, and how they contribute to identifying principal components.",
163
+ "What are the benefits of using PCA for dimensionality reduction, particularly in the context of large datasets?",
164
+ "How does PCA help in addressing the curse of dimensionality, and why is this important in machine learning?",
165
+ "Describe the steps involved in performing PCA, including data standardization and the selection of principal components.",
166
+ "Discuss the limitations of PCA, such as its linearity assumption and potential issues with interpretability.",
167
+ "In what situations might PCA not be suitable for dimensionality reduction, and what alternative techniques could be considered?",
168
+ "How can PCA be used to improve the performance of other machine learning algorithms, and what types of algorithms benefit most from this preprocessing step?",
169
+ "What are some real-world applications of PCA, and how does its ability to reduce dimensionality contribute to solving these problems?",
170
+ "How does PCA compare to other dimensionality reduction techniques, and what factors would influence your choice between PCA and alternative methods?"
171
+ ]
172
+ if selected_model == "Self-Organizing Maps (SOM)":
173
+ questions = [
174
+ "Explain the concept of a Self-Organizing Map (SOM) and its role in unsupervised learning.",
175
+ "Describe the structure of a SOM, including its layers and the connections between neurons.",
176
+ "How does the competitive learning process work in a SOM, and how is the Best Matching Unit (BMU) determined?",
177
+ "Explain the process of weight adaptation in a SOM and how it leads to the formation of a topological map.",
178
+ "What are the key parameters involved in training a SOM, and how do they affect the resulting map?",
179
+ "Discuss the advantages and disadvantages of using SOMs for dimensionality reduction and visualization.",
180
+ "How does a SOM preserve the topological properties of the input data, and why is this important?",
181
+ "What are some common applications of SOMs in fields like data analysis, image processing, and pattern recognition?",
182
+ "Compare and contrast SOMs with other unsupervised learning techniques such as K-Means clustering and Principal Component Analysis (PCA).",
183
+ "How can SOMs be used for clustering and classification tasks, and what are the limitations of this approach?"
184
+ ]
185
+ if selected_model == "t-Distributed Stochastic Neighbor Embedding":
186
+ questions = [
187
+ "What is the primary objective of t-SNE, and how does it differ from the goals of principal component analysis (PCA)?",
188
+ "Explain the concept of 'perplexity' in t-SNE and its role in balancing local and global structure preservation.",
189
+ "How does t-SNE use probability distributions to represent relationships between data points in high-dimensional and low-dimensional spaces?",
190
+ "Describe the optimization process in t-SNE and the challenges associated with minimizing the Kullback-Leibler divergence.",
191
+ "What are the advantages of t-SNE over linear dimensionality reduction techniques like PCA, particularly for visualizing complex datasets?",
192
+ "Discuss the limitations of t-SNE, including its computational cost and sensitivity to parameter settings.",
193
+ "How does the 'crowding problem' affect t-SNE visualizations, and what strategies can be used to mitigate this issue?",
194
+ "In what situations would t-SNE be the preferred choice for dimensionality reduction and visualization compared to other techniques?",
195
+ "Provide examples of real-world applications where t-SNE has been successfully used to gain insights from high-dimensional data.",
196
+ "How can t-SNE be combined with other machine learning techniques, such as clustering or classification, to improve data analysis and visualization?"
197
+ ]
198
+ if selected_model == "Autoencoders":
199
+ questions = [
200
+ "What is the fundamental purpose of an autoencoder, and how does it differ from other unsupervised learning techniques like clustering or dimensionality reduction?",
201
+ "Describe the two main components of an autoencoder and their respective roles in the learning process.",
202
+ "Explain the concept of a latent space representation in the context of autoencoders. How does this representation contribute to dimensionality reduction and feature extraction?",
203
+ "How does the training process of an autoencoder work, and what is the significance of minimizing reconstruction error?",
204
+ "What are the advantages of using autoencoders for non-linear dimensionality reduction compared to linear techniques like PCA?",
205
+ "Discuss how autoencoders can be applied to tasks such as denoising and anomaly detection.",
206
+ "What are some potential challenges or drawbacks of using autoencoders, such as overfitting or the need for large datasets?",
207
+ "How can techniques like regularization help to mitigate the risk of overfitting in autoencoders?",
208
+ "Explain how the flexibility of autoencoders allows them to be adapted to various architectures and applications.",
209
+ "Can you provide examples of real-world applications where autoencoders have been successfully used for dimensionality reduction, feature extraction, or other unsupervised learning tasks?"
210
+ ]
211
+ if selected_model == "Association Rule Learning":
212
+ questions = [
213
+ "What is the primary goal of Association Rule Learning, and how does it differ from other unsupervised learning techniques like clustering or dimensionality reduction?",
214
+ "Explain the concept of 'support' and 'confidence' in Association Rule Learning, and how these metrics are used to evaluate the strength of an association rule.",
215
+ "Describe the Apriori algorithm, focusing on its key steps for generating frequent itemsets and association rules.",
216
+ "How does the Apriori algorithm address the challenge of computational complexity when dealing with a large number of possible itemsets?",
217
+ "What are the advantages and disadvantages of using Association Rule Learning, particularly in terms of interpretability and computational cost?",
218
+ "In what real-world scenarios is Association Rule Learning most applicable, and what types of insights can be gained from its application?",
219
+ "How does the choice of support and confidence thresholds impact the number and quality of discovered rules, and what factors should be considered when setting these thresholds?",
220
+ "What are some potential challenges or limitations of Association Rule Learning, such as dealing with rare items or handling continuous variables?",
221
+ "How can Association Rule Learning be used in conjunction with other data mining or machine learning techniques to enhance its effectiveness?",
222
+ "Discuss the ethical considerations surrounding the application of Association Rule Learning, particularly in areas like customer privacy and targeted advertising."
223
+ ]
224
+
225
+ # Create a selection box
226
+ selected_question = st.selectbox("Choose a question", questions)
227
+
228
+ # Display a checkbox
229
+ if st.checkbox('Check this box to ask a question not listed above'):
230
+ # If the checkbox is checked, display a text box
231
+ selected_question = st.text_input('Enter a question')
232
+
233
+ if st.button("Ask AI"):
234
+ with st.spinner("AI is thinking..."):
235
+ if st.session_state.uploaded_pdf_path is None:
236
+ st.session_state.uploaded_pdf_path = download_pdf()
237
+
238
+ filepath = st.session_state.uploaded_pdf_path
239
+ text_prompt = f"Use the provided document focus on rhe topic: {selected_model} to answer the following question: {selected_question}. Use your own knowledge as well as sources from the web and the provided document. Always cite your sourcss."
240
+ response = multimodal_prompt(filepath, text_prompt) # Use the downloaded filepath
241
+ st.markdown(f"**Question:** {selected_question}")
242
+ st.markdown(f"**Response:** {response}")
243
+
244
+ if st.session_state.uploaded_pdf_path:
245
+ display_download_button(st.session_state.uploaded_pdf_path, "Unsupervised_Learning_Algorithms.pdf")
246
+
247
+ st.markdown("[Visit our Hugging Face Space!](https://huggingface.co/wvsuaidev)")
248
+ st.markdown("© 2025 WVSU AI Dev Team 🤖 ✨")
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ streamlit
2
+ huggingface_hub
3
+ google-generativeai