Dua Rajper committed
Commit 5144ee6 · verified · 1 Parent(s): 238fc8b

Update app.py

Files changed (1)
  1. app.py +271 -87
app.py CHANGED
@@ -2,16 +2,15 @@ import streamlit as st
  import os
  import google.generativeai as genai
  from dotenv import load_dotenv
- import numpy as np
  import time
  from typing import Any, List, Optional
  from sklearn.metrics.pairwise import cosine_similarity
- from sentence_transformers import SentenceTransformer
  import tensorflow as tf
  from tensorflow.keras.models import Sequential
  from tensorflow.keras.layers import Dense, Input
  from tensorflow.keras.utils import to_categorical
- from tensorflow.keras.optimizers import Adam

  # Load environment variables
  load_dotenv()
@@ -21,140 +20,325 @@ GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
  if GOOGLE_API_KEY:
      genai.configure(api_key=GOOGLE_API_KEY)
  else:
-     st.error("Google AI Studio API key not found. Please add it to your .env file.")
      st.stop()

- # Load embedding model (local)
- embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

- st.title("AI App: Embeddings, RAG, Similarity & Classification")

  # --- Helper Functions ---
- def generate_embeddings(texts: List[str], model_name: str = "") -> List[List[float]]:
      try:
-         return embedding_model.encode(texts).tolist()
      except Exception as e:
-         st.error(f"Error generating embeddings: {e}")
-         return []

  def generate_with_retry(prompt: str, model_name: str, generation_config: genai.types.GenerationConfig, max_retries: int = 3, delay: int = 5) -> Any:
      for i in range(max_retries):
          try:
              model = genai.GenerativeModel(model_name)
-             return model.generate_content(prompt, generation_config=generation_config)
          except Exception as e:
-             if i < max_retries - 1:
-                 st.warning(f"Error: {e}. Retrying in {delay} seconds...")
                  time.sleep(delay)
              else:
-                 st.error(f"Failed after {max_retries} attempts: {e}")
-                 return None

  def calculate_similarity(embedding1: List[float], embedding2: List[float]) -> float:
      return cosine_similarity(np.array(embedding1).reshape(1, -1), np.array(embedding2).reshape(1, -1))[0][0]

- def create_and_train_model(embeddings, labels, num_classes, epochs, batch_size, learning_rate, optimizer_str):
      model = Sequential([
          Input(shape=(len(embeddings[0]),)),
-         Dense(64, activation='relu'),
          Dense(32, activation='relu'),
          Dense(num_classes, activation='softmax')
      ])

-     if optimizer_str == 'adam':
          optimizer = Adam(learning_rate=learning_rate)
-     elif optimizer_str == 'sgd':
          optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
-     else:
          optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)

      model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
      encoded_labels = to_categorical(labels, num_classes=num_classes)
      model.fit(np.array(embeddings), encoded_labels, epochs=epochs, batch_size=batch_size, verbose=0)
      return model

- def process_classification_data(data: str) -> Optional[tuple[List[str], List[int]]]:
-     data_pairs = [line.split(',') for line in data.split('\n') if ',' in line]
-     texts, labels = [], []
-     for i, pair in enumerate(data_pairs):
-         if len(pair) != 2:
-             st.error(f"Invalid line {i+1}: {pair}")
-             return None
-         try:
-             texts.append(pair[0].strip())
-             labels.append(int(pair[1].strip()))
-         except ValueError:
-             st.error(f"Invalid label in line {i+1}")
-             return None
-     return texts, labels

- # --- RAG ---
- st.header("🔎 RAG: Retrieval-Augmented Generation")
- rag_model_name = st.selectbox("Text generation model", ["gemini-pro"])
- rag_context = st.text_area("Context documents (separated by new lines)", height=150)
- rag_question = st.text_area("Your question", height=70)
- rag_max_context_length = st.slider("Max context length", 100, 2000, 500)

  if st.button("Answer with RAG"):
      if not rag_context or not rag_question:
          st.warning("Please provide both context and a question.")
      else:
-         with st.spinner("Generating..."):
              try:
-                 context_lines = rag_context.split('\n')
-                 context_embeddings = generate_embeddings(context_lines)
-                 question_embedding = generate_embeddings([rag_question])[0]
-                 similarities = cosine_similarity([question_embedding], context_embeddings)[0]
-                 best_match_index = np.argmax(similarities)
-                 selected_context = context_lines[best_match_index][:rag_max_context_length]
-
-                 prompt = f"Use the context below to answer the question.\n\nContext:\n{selected_context}\n\nQuestion: {rag_question}"
-                 response = generate_with_retry(prompt, rag_model_name, genai.types.GenerationConfig())
                  if response:
-                     st.subheader("Answer:")
-                     st.markdown(response.text)
              except Exception as e:
-                 st.error(f"Error: {e}")

  # --- Text Similarity ---
- st.header("🧠 Text Similarity")
- text1 = st.text_area("Text 1", height=70)
- text2 = st.text_area("Text 2", height=70)

  if st.button("Calculate Similarity"):
      if not text1 or not text2:
-         st.warning("Please enter both texts.")
      else:
-         try:
-             embeddings = generate_embeddings([text1, text2])
-             similarity = calculate_similarity(embeddings[0], embeddings[1])
-             st.write(f"Cosine Similarity: **{similarity:.4f}**")
-         except Exception as e:
-             st.error(f"Error: {e}")

  # --- Neural Classification ---
- st.header("🧪 Neural Classification")
- classification_data = st.text_area("Training data (text,label)", "text1,0\ntext2,1", height=100)
- classification_prompt = st.text_area("Text to classify", "This is a sample input.", height=70)
- num_epochs = st.number_input("Epochs", 1, 100, 10)
- batch_size = st.number_input("Batch Size", 1, 128, 32)
- learning_rate = st.number_input("Learning Rate", 0.0001, 0.1, 0.0001, format="%.4f")
- optimizer_str = st.selectbox("Optimizer", ["adam", "sgd", "rmsprop"])

  if st.button("Classify"):
-     try:
-         result = process_classification_data(classification_data)
-         if not result:
-             st.stop()
-         train_texts, train_labels = result
-         train_embeddings = generate_embeddings(train_texts)
-         model = create_and_train_model(train_embeddings, train_labels, len(set(train_labels)),
-                                        num_epochs, batch_size, learning_rate, optimizer_str)
-
-         predict_embedding = generate_embeddings([classification_prompt])[0]
-         prediction = model.predict(np.array([predict_embedding]), verbose=0)
-         predicted_class = int(np.argmax(prediction[0]))
-         st.success(f"Predicted Class: **{predicted_class}**")
-         st.write("Prediction Probabilities:", prediction)
-     except Exception as e:
-         st.error(f"Classification Error: {e}")

@@ -2,16 +2,15 @@ import streamlit as st
  import os
  import google.generativeai as genai
  from dotenv import load_dotenv
  import time
  from typing import Any, List, Optional
+ import numpy as np
  from sklearn.metrics.pairwise import cosine_similarity
  import tensorflow as tf
  from tensorflow.keras.models import Sequential
  from tensorflow.keras.layers import Dense, Input
  from tensorflow.keras.utils import to_categorical
+ from tensorflow.keras.optimizers import Adam  # Import Adam optimizer

  # Load environment variables
  load_dotenv()

@@ -21,140 +20,325 @@ GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
  if GOOGLE_API_KEY:
      genai.configure(api_key=GOOGLE_API_KEY)
  else:
+     st.error(
+         "Google AI Studio API key not found. Please add it to your .env file. "
+         "You can obtain an API key from https://makersuite.google.com/."
+     )
      st.stop()

+ st.title("Embeddings and Vector Search Demo")
+ st.subheader("Explore Embeddings and Vector Databases")

+ # Sidebar for explanations
+ with st.sidebar:
+     st.header("Embeddings and Vector Search")
+     st.markdown(
+         """
+         This app demonstrates how embeddings and vector databases can be used for various tasks.
+         """
+     )
+     st.subheader("Key Concepts:")
+     st.markdown(
+         """
+         - **Embeddings**: Numerical representations of text, capturing semantic meaning.
+         - **Vector Databases**: Databases optimized for storing and querying vectors (simulated here).
+         - **Retrieval Augmented Generation (RAG)**: Combining retrieval with LLM generation.
+         - **Cosine Similarity**: A measure of similarity between two vectors.
+         - **Neural Networks**: Using embeddings as input for classification.
+         """
+     )
+     st.subheader("Whitepaper Insights")
+     st.markdown(
+         """
+         - Efficient similarity search using vector indexes (e.g., ANN).
+         - Handling large datasets and scalability considerations.
+         - Applications of embeddings: search, recommendation, classification, etc.
+         """
+     )

  # --- Helper Functions ---
+ def code_block(text: str, language: str = "text") -> None:
+     """Displays text as a formatted code block in Streamlit."""
+     st.markdown(f"```{language}\n{text}\n```", unsafe_allow_html=True)
+
+
+ def display_response(response: Any) -> None:
+     """Displays the model's response."""
+     if response and hasattr(response, "text"):
+         st.subheader("Generated Response:")
+         st.markdown(response.text)
+     else:
+         st.error("Failed to generate a response.")
+
+
+ def generate_embeddings(texts: List[str], model_name: str) -> Optional[List[List[float]]]:
+     """Generates embeddings for a list of texts using a specified model.
+
+     Args:
+         texts: List of text strings.
+         model_name: Name of the embedding model.
+
+     Returns:
+         List of embeddings (list of floats) or None on error.
+     """
      try:
+         model = genai.GenerativeModel(model_name)
+         response = model.generate_embeddings(texts=texts)
+         return [embedding.values for embedding in response.embeddings]
      except Exception as e:
+         st.error(f"Error generating embeddings with model '{model_name}': {e}")
+         return None
+
+
  def generate_with_retry(prompt: str, model_name: str, generation_config: genai.types.GenerationConfig, max_retries: int = 3, delay: int = 5) -> Any:
+     """Generates content with retry logic and error handling.
+
+     Args:
+         prompt: The prompt string.
+         model_name: The name of the language model.
+         generation_config: The generation configuration.
+         max_retries: Maximum number of retries.
+         delay: Delay in seconds between retries.
+
+     Returns:
+         The generated response or None on error.
+     """
      for i in range(max_retries):
          try:
              model = genai.GenerativeModel(model_name)
+             response = model.generate_content(prompt, generation_config=generation_config)
+             return response
          except Exception as e:
+             error_message = str(e)
+             st.warning(f"Error during generation (attempt {i + 1}/{max_retries}): {error_message}")
+             if "404" in error_message and "not found" in error_message:
+                 st.error(
+                     f"Model '{model_name}' is not available or not supported. Please select a different model."
+                 )
+                 return None  # Return None to signal a non-retryable error
+             elif i < max_retries - 1:
+                 st.info(f"Retrying in {delay} seconds...")
                  time.sleep(delay)
              else:
+                 st.error(f"Failed to generate content after {max_retries} attempts. Please check your prompt and model.")
+                 return None  # Return None after max retries
+     return None  # Should never reach here
+
+
  def calculate_similarity(embedding1: List[float], embedding2: List[float]) -> float:
+     """Calculates the cosine similarity between two embeddings."""
      return cosine_similarity(np.array(embedding1).reshape(1, -1), np.array(embedding2).reshape(1, -1))[0][0]

+
+ def create_and_train_model(
+     embeddings: List[List[float]],
+     labels: List[int],
+     num_classes: int,
+     epochs: int,
+     batch_size: int,
+     learning_rate: float,
+     optimizer_str: str
+ ) -> tf.keras.Model:
+     """Creates and trains a neural network for classification.
+
+     Args:
+         embeddings: List of input embeddings.
+         labels: List of integer labels.
+         num_classes: Number of classes.
+         epochs: Number of training epochs.
+         batch_size: Batch size for training.
+         learning_rate: Learning rate for the optimizer.
+         optimizer_str: Name of the optimizer ('adam', 'sgd', 'rmsprop')
+
+     Returns:
+         Trained Keras model.
+     """
      model = Sequential([
          Input(shape=(len(embeddings[0]),)),
+         Dense(64, activation='relu'),  # Increased hidden layer size
          Dense(32, activation='relu'),
          Dense(num_classes, activation='softmax')
      ])

+     if optimizer_str.lower() == 'adam':
          optimizer = Adam(learning_rate=learning_rate)
+     elif optimizer_str.lower() == 'sgd':
          optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
+     elif optimizer_str.lower() == 'rmsprop':
          optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)
+     else:
+         optimizer = Adam(learning_rate=learning_rate)  # default

      model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
      encoded_labels = to_categorical(labels, num_classes=num_classes)
      model.fit(np.array(embeddings), encoded_labels, epochs=epochs, batch_size=batch_size, verbose=0)
      return model

+
+ # --- RAG Question Answering ---
+ st.header("RAG Question Answering")
+ rag_model_name = st.selectbox("Select model for RAG:", ["gemini-pro"], index=0)
+ rag_embedding_model = st.selectbox("Select embedding model for RAG:", ["gemini-pro"], index=0)
+ rag_context = st.text_area(
+     "Enter your context documents:",
+     "Relevant information to answer the question. Separate documents with newlines.",
+     height=150,
+ )
+ rag_question = st.text_area("Ask a question about the context:", "What is the main topic?", height=70)
+ rag_max_context_length = st.number_input("Maximum Context Length", min_value=100, max_value=2000, value=500, step=100)
+

  if st.button("Answer with RAG"):
      if not rag_context or not rag_question:
          st.warning("Please provide both context and a question.")
      else:
+         with st.spinner("Generating answer..."):
              try:
+                 # 1. Generate embeddings for the context
+                 context_embeddings = generate_embeddings(rag_context.split('\n'), rag_embedding_model)
+                 if not context_embeddings:
+                     st.stop()
+
+                 # 2. Generate embedding for the question
+                 question_embedding = generate_embeddings([rag_question], rag_embedding_model)
+                 if not question_embedding:
+                     st.stop()
+
+                 # 3. Calculate similarity scores
+                 similarities = cosine_similarity(np.array(question_embedding).reshape(1, -1), np.array(context_embeddings))[0]
+
+                 # 4. Find the most relevant document(s)
+                 most_relevant_index = np.argmax(similarities)
+                 relevant_context = rag_context.split('\n')[most_relevant_index]
+                 # truncate context
+                 if len(relevant_context) > rag_max_context_length:
+                     relevant_context = relevant_context[:rag_max_context_length]
+
+                 # 5. Construct the prompt
+                 rag_prompt = f"Use the following context to answer the question: '{rag_question}'.\nContext: {relevant_context}"
+
+                 # 6. Generate the answer
+                 response = generate_with_retry(rag_prompt, rag_model_name, generation_config=genai.types.GenerationConfig())
                  if response:
+                     display_response(response)
              except Exception as e:
+                 st.error(f"An error occurred: {e}")
+
+
  # --- Text Similarity ---
+ st.header("Text Similarity")
+ similarity_embedding_model = st.selectbox("Select embedding model for similarity:", ["gemini-pro"], index=0)
+ text1 = st.text_area("Enter text 1:", "This is the first sentence.", height=70)
+ text2 = st.text_area("Enter text 2:", "This is a similar sentence.", height=70)

  if st.button("Calculate Similarity"):
      if not text1 or not text2:
+         st.warning("Please provide both texts.")
      else:
+         with st.spinner("Calculating similarity..."):
+             try:
+                 # 1. Generate embeddings
+                 embeddings = generate_embeddings([text1, text2], similarity_embedding_model)
+                 if not embeddings:
+                     st.stop()
+
+                 # 2. Calculate cosine similarity
+                 similarity = calculate_similarity(embeddings[0], embeddings[1])
+                 st.subheader("Cosine Similarity:")
+                 st.write(similarity)
+             except Exception as e:
+                 st.error(f"An error occurred: {e}")
+
+
  # --- Neural Classification ---
+ st.header("Neural Classification with Embeddings")
+ classification_embedding_model = st.selectbox("Select embedding model for classification:", ["gemini-pro"], index=0)
+ classification_data = st.text_area(
+     "Enter your training data (text, label pairs), separated by newlines. Example: text1,0\\ntext2,1",
+     "text1,0\ntext2,1\ntext3,0\ntext4,1",
+     height=150,
+ )
+ classification_prompt = st.text_area("Enter text to classify:", "This is a test text.", height=70)
+ num_epochs = st.number_input("Number of Epochs", min_value=1, max_value=200, value=10, step=1)
+ batch_size = st.number_input("Batch Size", min_value=1, max_value=128, value=32, step=1)
+ learning_rate = st.number_input("Learning Rate", min_value=0.0001, max_value=0.1, value=0.0001, step=0.0001, format="%.4f")
+ optimizer_str = st.selectbox("Optimizer", ['adam', 'sgd', 'rmsprop'], index=0)
+
+
+ def process_classification_data(data: str) -> Optional[tuple[List[str], List[int]]]:
+     """Processes the classification data string into lists of texts and labels.
+
+     Args:
+         data: The input string.
+
+     Returns:
+         A tuple of (texts, labels) or None on error
+     """
+     data_pairs = [line.split(',') for line in data.split('\n') if ',' in line]
+     if not data_pairs:
+         st.error("No valid data pairs found. Please ensure each line contains 'text,label'.")
+         return None
+     texts = []
+     labels = []
+     for i, pair in enumerate(data_pairs):
+         if len(pair) != 2:
+             st.error(f"Invalid data format in line {i + 1}: '{','.join(pair)}'. Expected 'text,label'.")
+             return None
+         text = pair[0].strip()
+         label_str = pair[1].strip()
+         try:
+             label = int(label_str)
+             texts.append(text)
+             labels.append(label)
+         except ValueError:
+             st.error(f"Invalid label value in line {i + 1}: '{label_str}'. Label must be an integer.")
+             return None
+     return texts, labels
+
+

  if st.button("Classify"):
+     if not classification_data or not classification_prompt:
+         st.warning("Please provide training data and text to classify.")
+     else:
+         with st.spinner("Classifying..."):
+             try:
+                 # 1. Process the training data
+                 processed_data = process_classification_data(classification_data)
+                 if not processed_data:
+                     st.stop()
+                 train_texts, train_labels = processed_data
+                 num_classes = len(set(train_labels))
+
+                 # 2. Generate embeddings for training data
+                 train_embeddings = generate_embeddings(train_texts, classification_embedding_model)
+                 if not train_embeddings:
+                     st.stop()
+
+                 # 3. Create and train the model
+                 model = create_and_train_model(
+                     train_embeddings, train_labels, num_classes, num_epochs, batch_size, learning_rate, optimizer_str
+                 )
+
+                 # 4. Generate embedding for the text to classify
+                 predict_embedding = generate_embeddings([classification_prompt], classification_embedding_model)
+                 if not predict_embedding:
+                     st.stop()
+
+                 # 5. Make the prediction
+                 prediction = model.predict(np.array([predict_embedding]), verbose=0)
+                 predicted_class = np.argmax(prediction[0])
+                 st.subheader("Predicted Class:")
+                 st.write(predicted_class)
+                 st.subheader("Prediction Probabilities:")
+                 st.write(prediction)
+
+             except Exception as e:
+                 st.error(f"An error occurred: {e}")