Dua Rajper committed on
Commit
60f12fd
·
verified ·
1 Parent(s): c431480

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +157 -49
app.py CHANGED
@@ -5,13 +5,14 @@ from dotenv import load_dotenv
5
  import json
6
  import textwrap
7
  import time
8
- from typing import Any, List
9
  import numpy as np
10
  from sklearn.metrics.pairwise import cosine_similarity
11
  import tensorflow as tf
12
  from tensorflow.keras.models import Sequential
13
  from tensorflow.keras.layers import Dense, Input
14
  from tensorflow.keras.utils import to_categorical
 
15
 
16
  # Load environment variables
17
  load_dotenv()
@@ -20,7 +21,6 @@ GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
20
  # Configure Generative AI model
21
  if GOOGLE_API_KEY:
22
  genai.configure(api_key=GOOGLE_API_KEY)
23
- model = genai.GenerativeModel('gemini-pro') # You can choose a suitable model.
24
  else:
25
  st.error(
26
  "Google AI Studio API key not found. Please add it to your .env file. "
@@ -43,17 +43,18 @@ with st.sidebar:
43
  st.markdown(
44
  """
45
  - **Embeddings**: Numerical representations of text, capturing semantic meaning.
46
- - **Vector Databases**: Databases optimized for storing and querying vectors.
47
  - **Retrieval Augmented Generation (RAG)**: Combining retrieval with LLM generation.
48
  - **Cosine Similarity**: A measure of similarity between two vectors.
 
49
  """
50
  )
51
  st.subheader("Whitepaper Insights")
52
  st.markdown(
53
  """
54
- - Efficient similarity search using vector indexes (e.g., ANN).
55
- - Handling large datasets and scalability.
56
- - Applications of embeddings: search, recommendation, classification.
57
  """
58
  )
59
 
@@ -72,20 +73,40 @@ def display_response(response: Any) -> None:
72
  st.error("Failed to generate a response.")
73
 
74
 
75
- def generate_embeddings(texts: List[str], model_name: str = 'models/embedding-001') -> List[List[float]]:
76
- """Generates embeddings for a list of texts."""
 
 
 
 
 
 
 
 
 
77
  try:
78
  embedding_model = genai.EmbeddingModel(model_name)
79
  embeddings = embedding_model.embed_content(texts=texts)
80
- return [embedding.values for embedding in embeddings.embeddings] # Extract embedding values
81
  except Exception as e:
82
- st.error(f"Error generating embeddings: {e}")
83
- return []
84
 
85
 
86
 
87
  def generate_with_retry(prompt: str, model_name: str, generation_config: genai.types.GenerationConfig, max_retries: int = 3, delay: int = 5) -> Any:
88
- """Generates content with retry logic."""
 
 
 
 
 
 
 
 
 
 
 
89
  for i in range(max_retries):
90
  try:
91
  model = genai.GenerativeModel(model_name)
@@ -98,25 +119,81 @@ def generate_with_retry(prompt: str, model_name: str, generation_config: genai.t
98
  st.error(
99
  f"Model '{model_name}' is not available or not supported. Please select a different model."
100
  )
101
- return None
102
  elif i < max_retries - 1:
103
  st.info(f"Retrying in {delay} seconds...")
104
  time.sleep(delay)
105
  else:
106
- raise
107
- raise Exception("Failed to generate content after maximum retries")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
 
110
 
111
  # --- RAG Question Answering ---
112
  st.header("RAG Question Answering")
113
  rag_model_name = st.selectbox("Select model for RAG:", ["gemini-pro"], index=0)
 
114
  rag_context = st.text_area(
115
  "Enter your context documents:",
116
  "Relevant information to answer the question. Separate documents with newlines.",
117
  height=150,
118
  )
119
- rag_question = st.text_area("Ask a question about the context:", "What is the main topic?", height=50)
 
 
120
 
121
  if st.button("Answer with RAG"):
122
  if not rag_context or not rag_question:
@@ -125,12 +202,14 @@ if st.button("Answer with RAG"):
125
  with st.spinner("Generating answer..."):
126
  try:
127
  # 1. Generate embeddings for the context
128
- context_embeddings = generate_embeddings(rag_context.split('\n'))
129
  if not context_embeddings:
130
  st.stop()
131
 
132
  # 2. Generate embedding for the question
133
- question_embedding = generate_embeddings([rag_question])[0]
 
 
134
 
135
  # 3. Calculate similarity scores
136
  similarities = cosine_similarity(np.array(question_embedding).reshape(1, -1), np.array(context_embeddings))[0]
@@ -138,13 +217,17 @@ if st.button("Answer with RAG"):
138
  # 4. Find the most relevant document(s)
139
  most_relevant_index = np.argmax(similarities)
140
  relevant_context = rag_context.split('\n')[most_relevant_index]
 
 
 
141
 
142
  # 5. Construct the prompt
143
  rag_prompt = f"Use the following context to answer the question: '{rag_question}'.\nContext: {relevant_context}"
144
 
145
  # 6. Generate the answer
146
  response = generate_with_retry(rag_prompt, rag_model_name, generation_config=genai.types.GenerationConfig())
147
- display_response(response)
 
148
  except Exception as e:
149
  st.error(f"An error occurred: {e}")
150
 
@@ -152,9 +235,9 @@ if st.button("Answer with RAG"):
152
 
153
  # --- Text Similarity ---
154
  st.header("Text Similarity")
155
- similarity_model_name = st.selectbox("Select model for similarity:", ["models/embedding-001"], index=0) # Use a model that supports embeddings
156
- text1 = st.text_area("Enter text 1:", "This is the first sentence.", height=50)
157
- text2 = st.text_area("Enter text 2:", "This is a similar sentence.", height=50)
158
 
159
  if st.button("Calculate Similarity"):
160
  if not text1 or not text2:
@@ -163,11 +246,12 @@ if st.button("Calculate Similarity"):
163
  with st.spinner("Calculating similarity..."):
164
  try:
165
  # 1. Generate embeddings
166
- embeddings = generate_embeddings([text1, text2], similarity_model_name)
167
  if not embeddings:
168
  st.stop()
 
169
  # 2. Calculate cosine similarity
170
- similarity = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
171
  st.subheader("Cosine Similarity:")
172
  st.write(similarity)
173
  except Exception as e:
@@ -177,28 +261,48 @@ if st.button("Calculate Similarity"):
177
 
178
  # --- Neural Classification ---
179
  st.header("Neural Classification with Embeddings")
180
- classification_model_name = st.selectbox("Select model for classification:", ["models/embedding-001"], index=0) #use embedding model
181
  classification_data = st.text_area(
182
  "Enter your training data (text, label pairs), separated by newlines. Example: text1,0\\ntext2,1",
183
  "text1,0\ntext2,1\ntext3,0\ntext4,1",
184
  height=150,
185
  )
186
- classification_prompt = st.text_area("Enter text to classify:", "This is a test text.", height=50)
187
- num_epochs = st.number_input("Number of Epochs", min_value=1, max_value=100, value=10, step=1)
188
-
189
-
190
- def create_and_train_model(embeddings: List[List[float]], labels: List[int], num_classes: int, epochs: int):
191
- """Creates and trains a simple neural network for classification."""
192
- model = Sequential([
193
- Input(shape=(len(embeddings[0]),)), # Input shape is the embedding size
194
- Dense(16, activation='relu'),
195
- Dense(num_classes, activation='softmax') # Output layer with softmax
196
- ])
197
-
198
- model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
199
- encoded_labels = to_categorical(labels, num_classes=num_classes) #one hot encode
200
- model.fit(np.array(embeddings), encoded_labels, epochs=epochs, verbose=0) # Suppress training output
201
- return model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
 
204
 
@@ -209,21 +313,26 @@ if st.button("Classify"):
209
  with st.spinner("Classifying..."):
210
  try:
211
  # 1. Process the training data
212
- data_pairs = [line.split(',') for line in classification_data.split('\n') if ',' in line]
213
- train_texts = [pair[0].strip() for pair in data_pairs]
214
- train_labels = [int(pair[1].strip()) for pair in data_pairs]
215
- num_classes = len(set(train_labels)) #number of classes
 
216
 
217
  # 2. Generate embeddings for training data
218
- train_embeddings = generate_embeddings(train_texts, classification_model_name)
219
  if not train_embeddings:
220
  st.stop()
221
 
222
  # 3. Create and train the model
223
- model = create_and_train_model(train_embeddings, train_labels, num_classes, num_epochs)
 
 
224
 
225
  # 4. Generate embedding for the text to classify
226
- predict_embedding = generate_embeddings([classification_prompt], classification_model_name)[0]
 
 
227
 
228
  # 5. Make the prediction
229
  prediction = model.predict(np.array([predict_embedding]), verbose=0)
@@ -235,4 +344,3 @@ if st.button("Classify"):
235
 
236
  except Exception as e:
237
  st.error(f"An error occurred: {e}")
238
-
 
5
  import json
6
  import textwrap
7
  import time
8
+ from typing import Any, List, Optional
9
  import numpy as np
10
  from sklearn.metrics.pairwise import cosine_similarity
11
  import tensorflow as tf
12
  from tensorflow.keras.models import Sequential
13
  from tensorflow.keras.layers import Dense, Input
14
  from tensorflow.keras.utils import to_categorical
15
+ from tensorflow.keras.optimizers import Adam # Import Adam optimizer
16
 
17
  # Load environment variables
18
  load_dotenv()
 
21
  # Configure Generative AI model
22
  if GOOGLE_API_KEY:
23
  genai.configure(api_key=GOOGLE_API_KEY)
 
24
  else:
25
  st.error(
26
  "Google AI Studio API key not found. Please add it to your .env file. "
 
43
  st.markdown(
44
  """
45
  - **Embeddings**: Numerical representations of text, capturing semantic meaning.
46
+ - **Vector Databases**: Databases optimized for storing and querying vectors (simulated here).
47
  - **Retrieval Augmented Generation (RAG)**: Combining retrieval with LLM generation.
48
  - **Cosine Similarity**: A measure of similarity between two vectors.
49
+ - **Neural Networks**: Using embeddings as input for classification.
50
  """
51
  )
52
  st.subheader("Whitepaper Insights")
53
  st.markdown(
54
  """
55
+ - Efficient similarity search using vector indexes (e.g., ANN).
56
+ - Handling large datasets and scalability considerations.
57
+ - Applications of embeddings: search, recommendation, classification, etc.
58
  """
59
  )
60
 
 
73
  st.error("Failed to generate a response.")
74
 
75
 
76
+
77
def generate_embeddings(texts: List[str], model_name: str) -> Optional[List[List[float]]]:
    """Embed each string in *texts* with the named embedding model.

    Args:
        texts: Strings to embed.
        model_name: Identifier of the embedding model (e.g. 'models/embedding-001').

    Returns:
        One embedding (list of floats) per input text, or None when the
        embedding call fails; the failure is reported to the UI via st.error.
    """
    try:
        # NOTE(review): assumes the genai client exposes EmbeddingModel with
        # an embed_content(texts=...) batch API — confirm against the
        # installed google-generativeai version.
        embedder = genai.EmbeddingModel(model_name)
        response = embedder.embed_content(texts=texts)
        return [item.values for item in response.embeddings]
    except Exception as exc:
        st.error(f"Error generating embeddings with model '{model_name}': {exc}")
        return None
94
 
95
 
96
 
97
  def generate_with_retry(prompt: str, model_name: str, generation_config: genai.types.GenerationConfig, max_retries: int = 3, delay: int = 5) -> Any:
98
+ """Generates content with retry logic and error handling.
99
+
100
+ Args:
101
+ prompt: The prompt string.
102
+ model_name: The name of the language model.
103
+ generation_config: The generation configuration.
104
+ max_retries: Maximum number of retries.
105
+ delay: Delay in seconds between retries.
106
+
107
+ Returns:
108
+ The generated response or None on error.
109
+ """
110
  for i in range(max_retries):
111
  try:
112
  model = genai.GenerativeModel(model_name)
 
119
  st.error(
120
  f"Model '{model_name}' is not available or not supported. Please select a different model."
121
  )
122
+ return None # Return None to signal a non-retryable error
123
  elif i < max_retries - 1:
124
  st.info(f"Retrying in {delay} seconds...")
125
  time.sleep(delay)
126
  else:
127
+ st.error(f"Failed to generate content after {max_retries} attempts. Please check your prompt and model.")
128
+ return None # Return None after max retries
129
+ return None #Should never reach here
130
+
131
+
132
+
133
def calculate_similarity(embedding1: List[float], embedding2: List[float]) -> float:
    """Return the cosine similarity of two embedding vectors.

    Args:
        embedding1: First embedding vector.
        embedding2: Second embedding vector (same length as embedding1).

    Returns:
        Cosine similarity in [-1.0, 1.0] as a plain Python float. (The
        previous implementation returned a numpy scalar despite the
        ``-> float`` annotation, and built two 2-D arrays plus the full
        sklearn pairwise machinery for a single vector pair.)
    """
    v1 = np.asarray(embedding1, dtype=float)
    v2 = np.asarray(embedding2, dtype=float)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0.0:
        # Match sklearn's convention: zero-norm vectors have similarity 0.
        return 0.0
    return float(np.dot(v1, v2) / norm_product)
136
+
137
+
138
+
139
def create_and_train_model(
    embeddings: List[List[float]],
    labels: List[int],
    num_classes: int,
    epochs: int,
    batch_size: int,
    learning_rate: float,
    optimizer_str: str
) -> tf.keras.Model:
    """Build and fit a small feed-forward classifier over embedding inputs.

    Args:
        embeddings: Training inputs, one embedding vector per sample.
        labels: Integer class label for each sample.
        num_classes: Number of distinct classes.
        epochs: Number of training epochs.
        batch_size: Mini-batch size used by fit().
        learning_rate: Learning rate passed to the optimizer.
        optimizer_str: 'adam', 'sgd' or 'rmsprop' (case-insensitive);
            any other value falls back to Adam.

    Returns:
        The trained Keras model.
    """
    embedding_dim = len(embeddings[0])
    classifier = Sequential([
        Input(shape=(embedding_dim,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])

    # Dispatch table instead of an if/elif chain; unknown names default to Adam.
    optimizer_factories = {
        'adam': Adam,
        'sgd': tf.keras.optimizers.SGD,
        'rmsprop': tf.keras.optimizers.RMSprop,
    }
    factory = optimizer_factories.get(optimizer_str.lower(), Adam)

    classifier.compile(
        optimizer=factory(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    one_hot_labels = to_categorical(labels, num_classes=num_classes)
    # verbose=0 keeps Keras training chatter out of the Streamlit UI.
    classifier.fit(
        np.array(embeddings),
        one_hot_labels,
        epochs=epochs,
        batch_size=batch_size,
        verbose=0,
    )
    return classifier
182
 
183
 
184
 
185
  # --- RAG Question Answering ---
186
  st.header("RAG Question Answering")
187
  rag_model_name = st.selectbox("Select model for RAG:", ["gemini-pro"], index=0)
188
+ rag_embedding_model = st.selectbox("Select embedding model for RAG:", ["models/embedding-001"], index=0)
189
  rag_context = st.text_area(
190
  "Enter your context documents:",
191
  "Relevant information to answer the question. Separate documents with newlines.",
192
  height=150,
193
  )
194
+ rag_question = st.text_area("Ask a question about the context:", "What is the main topic?", height=70) # Changed height to 70
195
+ rag_max_context_length = st.number_input("Maximum Context Length", min_value=100, max_value=2000, value=500, step=100)
196
+
197
 
198
  if st.button("Answer with RAG"):
199
  if not rag_context or not rag_question:
 
202
  with st.spinner("Generating answer..."):
203
  try:
204
  # 1. Generate embeddings for the context
205
+ context_embeddings = generate_embeddings(rag_context.split('\n'), rag_embedding_model)
206
  if not context_embeddings:
207
  st.stop()
208
 
209
  # 2. Generate embedding for the question
210
+ question_embedding = generate_embeddings([rag_question], rag_embedding_model)
211
+ if not question_embedding:
212
+ st.stop()
213
 
214
  # 3. Calculate similarity scores
215
  similarities = cosine_similarity(np.array(question_embedding).reshape(1, -1), np.array(context_embeddings))[0]
 
217
  # 4. Find the most relevant document(s)
218
  most_relevant_index = np.argmax(similarities)
219
  relevant_context = rag_context.split('\n')[most_relevant_index]
220
+ #truncate context
221
+ if len(relevant_context) > rag_max_context_length:
222
+ relevant_context = relevant_context[:rag_max_context_length]
223
 
224
  # 5. Construct the prompt
225
  rag_prompt = f"Use the following context to answer the question: '{rag_question}'.\nContext: {relevant_context}"
226
 
227
  # 6. Generate the answer
228
  response = generate_with_retry(rag_prompt, rag_model_name, generation_config=genai.types.GenerationConfig())
229
+ if response:
230
+ display_response(response)
231
  except Exception as e:
232
  st.error(f"An error occurred: {e}")
233
 
 
235
 
236
  # --- Text Similarity ---
237
  st.header("Text Similarity")
238
+ similarity_embedding_model = st.selectbox("Select embedding model for similarity:", ["models/embedding-001"], index=0)
239
+ text1 = st.text_area("Enter text 1:", "This is the first sentence.", height=70) # Changed height to 70
240
+ text2 = st.text_area("Enter text 2:", "This is a similar sentence.", height=70) # Changed height to 70
241
 
242
  if st.button("Calculate Similarity"):
243
  if not text1 or not text2:
 
246
  with st.spinner("Calculating similarity..."):
247
  try:
248
  # 1. Generate embeddings
249
+ embeddings = generate_embeddings([text1, text2], similarity_embedding_model)
250
  if not embeddings:
251
  st.stop()
252
+
253
  # 2. Calculate cosine similarity
254
+ similarity = calculate_similarity(embeddings[0], embeddings[1])
255
  st.subheader("Cosine Similarity:")
256
  st.write(similarity)
257
  except Exception as e:
 
261
 
262
  # --- Neural Classification ---
263
  st.header("Neural Classification with Embeddings")
264
+ classification_embedding_model = st.selectbox("Select embedding model for classification:", ["models/embedding-001"], index=0)
265
  classification_data = st.text_area(
266
  "Enter your training data (text, label pairs), separated by newlines. Example: text1,0\\ntext2,1",
267
  "text1,0\ntext2,1\ntext3,0\ntext4,1",
268
  height=150,
269
  )
270
+ classification_prompt = st.text_area("Enter text to classify:", "This is a test text.", height=70) # Changed height to 70
271
+ num_epochs = st.number_input("Number of Epochs", min_value=1, max_value=200, value=10, step=1)
272
+ batch_size = st.number_input("Batch Size", min_value=1, max_value=128, value=32, step=1)
273
+ learning_rate = st.number_input("Learning Rate", min_value=0.0001, max_value=0.1, value=0.0001, step=0.0001, format="%.4f")
274
+ optimizer_str = st.selectbox("Optimizer", ['adam', 'sgd', 'rmsprop'], index=0)
275
+
276
+
277
def process_classification_data(data: str) -> Optional[tuple[List[str], List[int]]]:
    """Parse newline-separated 'text,label' pairs into parallel lists.

    Args:
        data: Raw multi-line input; each data line is 'text,label' with an
            integer label. Lines that contain no comma are ignored.

    Returns:
        A (texts, labels) tuple on success, or None after reporting the
        problem to the UI via st.error.
    """
    texts: List[str] = []
    labels: List[int] = []
    found_pair = False
    # Enumerate the raw input lines so error messages cite the actual line
    # number. (The old code numbered only the comma-containing lines, so the
    # reported position was wrong whenever non-data lines were present.)
    for lineno, line in enumerate(data.split('\n'), start=1):
        if ',' not in line:
            continue
        found_pair = True
        pair = line.split(',')
        if len(pair) != 2:
            st.error(f"Invalid data format in line {lineno}: '{','.join(pair)}'. Expected 'text,label'.")
            return None
        text = pair[0].strip()
        label_str = pair[1].strip()
        try:
            label = int(label_str)
        except ValueError:
            st.error(f"Invalid label value in line {lineno}: '{label_str}'. Label must be an integer.")
            return None
        texts.append(text)
        labels.append(label)
    if not found_pair:
        st.error("No valid data pairs found. Please ensure each line contains 'text,label'.")
        return None
    return texts, labels
306
 
307
 
308
 
 
313
  with st.spinner("Classifying..."):
314
  try:
315
  # 1. Process the training data
316
+ processed_data = process_classification_data(classification_data)
317
+ if not processed_data:
318
+ st.stop()
319
+ train_texts, train_labels = processed_data
320
+ num_classes = len(set(train_labels))
321
 
322
  # 2. Generate embeddings for training data
323
+ train_embeddings = generate_embeddings(train_texts, classification_embedding_model)
324
  if not train_embeddings:
325
  st.stop()
326
 
327
  # 3. Create and train the model
328
+ model = create_and_train_model(
329
+ train_embeddings, train_labels, num_classes, num_epochs, batch_size, learning_rate, optimizer_str
330
+ )
331
 
332
  # 4. Generate embedding for the text to classify
333
+ predict_embedding = generate_embeddings([classification_prompt], classification_embedding_model)
334
+ if not predict_embedding:
335
+ st.stop()
336
 
337
  # 5. Make the prediction
338
  prediction = model.predict(np.array([predict_embedding]), verbose=0)
 
344
 
345
  except Exception as e:
346
  st.error(f"An error occurred: {e}")