Dua Rajper committed on
Commit
b073183
·
verified ·
1 Parent(s): b6e01f5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -289
app.py CHANGED
@@ -10,7 +10,7 @@ import tensorflow as tf
10
  from tensorflow.keras.models import Sequential
11
  from tensorflow.keras.layers import Dense, Input
12
  from tensorflow.keras.utils import to_categorical
13
- from tensorflow.keras.optimizers import Adam # Import Adam optimizer
14
 
15
  # Load environment variables
16
  load_dotenv()
@@ -21,7 +21,7 @@ if GOOGLE_API_KEY:
21
  genai.configure(api_key=GOOGLE_API_KEY)
22
  else:
23
  st.error(
24
- "Google AI Studio API key not found. Please add it to your .env file. "
25
  "You can obtain an API key from https://makersuite.google.com/."
26
  )
27
  st.stop()
@@ -40,19 +40,11 @@ with st.sidebar:
40
  st.subheader("Key Concepts:")
41
  st.markdown(
42
  """
43
- - **Embeddings**: Numerical representations of text, capturing semantic meaning.
44
- - **Vector Databases**: Databases optimized for storing and querying vectors (simulated here).
45
- - **Retrieval Augmented Generation (RAG)**: Combining retrieval with LLM generation.
46
- - **Cosine Similarity**: A measure of similarity between two vectors.
47
- - **Neural Networks**: Using embeddings as input for classification.
48
- """
49
- )
50
- st.subheader("Whitepaper Insights")
51
- st.markdown(
52
- """
53
- - Efficient similarity search using vector indexes (e.g., ANN).
54
- - Handling large datasets and scalability considerations.
55
- - Applications of embeddings: search, recommendation, classification, etc.
56
  """
57
  )
58
 
@@ -61,7 +53,6 @@ def code_block(text: str, language: str = "text") -> None:
61
  """Displays text as a formatted code block in Streamlit."""
62
  st.markdown(f"```{language}\n{text}\n```", unsafe_allow_html=True)
63
 
64
-
65
  def display_response(response: Any) -> None:
66
  """Displays the model's response."""
67
  if response and hasattr(response, "text"):
@@ -70,276 +61,4 @@ def display_response(response: Any) -> None:
70
  else:
71
  st.error("Failed to generate a response.")
72
 
73
-
74
-
75
- def generate_embeddings(texts: List[str], model_name: str) -> Optional[List[List[float]]]:
76
- """Generates embeddings for a list of texts using a specified model.
77
-
78
- Args:
79
- texts: List of text strings.
80
- model_name: Name of the embedding model.
81
-
82
- Returns:
83
- List of embeddings (list of floats) or None on error.
84
- """
85
- try:
86
- model = genai.GenerativeModel(model_name)
87
- response = model.generate_embeddings(texts=texts) # Use generate_embeddings
88
- return [embedding.values for embedding in response.embeddings] # changed
89
- except Exception as e:
90
- st.error(f"Error generating embeddings with model '{model_name}': {e}")
91
- return None
92
-
93
-
94
-
95
- def generate_with_retry(prompt: str, model_name: str, generation_config: genai.types.GenerationConfig, max_retries: int = 3, delay: int = 5) -> Any:
96
- """Generates content with retry logic and error handling.
97
-
98
- Args:
99
- prompt: The prompt string.
100
- model_name: The name of the language model.
101
- generation_config: The generation configuration.
102
- max_retries: Maximum number of retries.
103
- delay: Delay in seconds between retries.
104
-
105
- Returns:
106
- The generated response or None on error.
107
- """
108
- for i in range(max_retries):
109
- try:
110
- model = genai.GenerativeModel(model_name)
111
- response = model.generate_content(prompt, generation_config=generation_config)
112
- return response
113
- except Exception as e:
114
- error_message = str(e)
115
- st.warning(f"Error during generation (attempt {i + 1}/{max_retries}): {error_message}")
116
- if "404" in error_message and "not found" in error_message:
117
- st.error(
118
- f"Model '{model_name}' is not available or not supported. Please select a different model."
119
- )
120
- return None # Return None to signal a non-retryable error
121
- elif i < max_retries - 1:
122
- st.info(f"Retrying in {delay} seconds...")
123
- time.sleep(delay)
124
- else:
125
- st.error(f"Failed to generate content after {max_retries} attempts. Please check your prompt and model.")
126
- return None # Return None after max retries
127
- return None #Should never reach here
128
-
129
-
130
-
131
- def calculate_similarity(embedding1: List[float], embedding2: List[float]) -> float:
132
- """Calculates the cosine similarity between two embeddings."""
133
- return cosine_similarity(np.array(embedding1).reshape(1, -1), np.array(embedding2).reshape(1, -1))[0][0]
134
-
135
-
136
-
137
- def create_and_train_model(
138
- embeddings: List[List[float]],
139
- labels: List[int],
140
- num_classes: int,
141
- epochs: int,
142
- batch_size: int,
143
- learning_rate: float,
144
- optimizer_str: str
145
- ) -> tf.keras.Model:
146
- """Creates and trains a neural network for classification.
147
-
148
- Args:
149
- embeddings: List of input embeddings.
150
- labels: List of integer labels.
151
- num_classes: Number of classes.
152
- epochs: Number of training epochs.
153
- batch_size: Batch size for training.
154
- learning_rate: Learning rate for the optimizer.
155
- optimizer_str: Name of the optimizer ('adam', 'sgd', 'rmsprop')
156
-
157
- Returns:
158
- Trained Keras model.
159
- """
160
- model = Sequential([
161
- Input(shape=(len(embeddings[0]),)),
162
- Dense(64, activation='relu'), # Increased hidden layer size
163
- Dense(32, activation='relu'),
164
- Dense(num_classes, activation='softmax')
165
- ])
166
-
167
- if optimizer_str.lower() == 'adam':
168
- optimizer = Adam(learning_rate=learning_rate)
169
- elif optimizer_str.lower() == 'sgd':
170
- optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
171
- elif optimizer_str.lower() == 'rmsprop':
172
- optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)
173
- else:
174
- optimizer = Adam(learning_rate=learning_rate) #default
175
-
176
- model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
177
- encoded_labels = to_categorical(labels, num_classes=num_classes)
178
- model.fit(np.array(embeddings), encoded_labels, epochs=epochs, batch_size=batch_size, verbose=0)
179
- return model
180
-
181
-
182
-
183
- # --- RAG Question Answering ---
184
- st.header("RAG Question Answering")
185
- rag_model_name = st.selectbox("Select model for RAG:", ["gemini-pro"], index=0)
186
- rag_embedding_model = st.selectbox("Select embedding model for RAG:", ["gemini-pro"], index=0)
187
- rag_context = st.text_area(
188
- "Enter your context documents:",
189
- "Relevant information to answer the question. Separate documents with newlines.",
190
- height=150,
191
- )
192
- rag_question = st.text_area("Ask a question about the context:", "What is the main topic?", height=70)
193
- rag_max_context_length = st.number_input("Maximum Context Length", min_value=100, max_value=2000, value=500, step=100)
194
-
195
-
196
- if st.button("Answer with RAG"):
197
- if not rag_context or not rag_question:
198
- st.warning("Please provide both context and a question.")
199
- else:
200
- with st.spinner("Generating answer..."):
201
- try:
202
- # 1. Generate embeddings for the context
203
- context_embeddings = generate_embeddings(rag_context.split('\n'), rag_embedding_model)
204
- if not context_embeddings:
205
- st.stop()
206
-
207
- # 2. Generate embedding for the question
208
- question_embedding = generate_embeddings([rag_question], rag_embedding_model)
209
- if not question_embedding:
210
- st.stop()
211
-
212
- # 3. Calculate similarity scores
213
- similarities = cosine_similarity(np.array(question_embedding).reshape(1, -1), np.array(context_embeddings))[0]
214
-
215
- # 4. Find the most relevant document(s)
216
- most_relevant_index = np.argmax(similarities)
217
- relevant_context = rag_context.split('\n')[most_relevant_index]
218
- #truncate context
219
- if len(relevant_context) > rag_max_context_length:
220
- relevant_context = relevant_context[:rag_max_context_length]
221
-
222
- # 5. Construct the prompt
223
- rag_prompt = f"Use the following context to answer the question: '{rag_question}'.\nContext: {relevant_context}"
224
-
225
- # 6. Generate the answer
226
- response = generate_with_retry(rag_prompt, rag_model_name, generation_config=genai.types.GenerationConfig())
227
- if response:
228
- display_response(response)
229
- except Exception as e:
230
- st.error(f"An error occurred: {e}")
231
-
232
-
233
-
234
- # --- Text Similarity ---
235
- st.header("Text Similarity")
236
- similarity_embedding_model = st.selectbox("Select embedding model for similarity:", ["gemini-pro"], index=0)
237
- text1 = st.text_area("Enter text 1:", "This is the first sentence.", height=70)
238
- text2 = st.text_area("Enter text 2:", "This is a similar sentence.", height=70)
239
-
240
- if st.button("Calculate Similarity"):
241
- if not text1 or not text2:
242
- st.warning("Please provide both texts.")
243
- else:
244
- with st.spinner("Calculating similarity..."):
245
- try:
246
- # 1. Generate embeddings
247
- embeddings = generate_embeddings([text1, text2], similarity_embedding_model)
248
- if not embeddings:
249
- st.stop()
250
-
251
- # 2. Calculate cosine similarity
252
- similarity = calculate_similarity(embeddings[0], embeddings[1])
253
- st.subheader("Cosine Similarity:")
254
- st.write(similarity)
255
- except Exception as e:
256
- st.error(f"An error occurred: {e}")
257
-
258
-
259
-
260
- # --- Neural Classification ---
261
- st.header("Neural Classification with Embeddings")
262
- classification_embedding_model = st.selectbox("Select embedding model for classification:", ["gemini-pro"], index=0)
263
- classification_data = st.text_area(
264
- "Enter your training data (text, label pairs), separated by newlines. Example: text1,0\\ntext2,1",
265
- "text1,0\ntext2,1\ntext3,0\ntext4,1",
266
- height=150,
267
- )
268
- classification_prompt = st.text_area("Enter text to classify:", "This is a test text.", height=70)
269
- num_epochs = st.number_input("Number of Epochs", min_value=1, max_value=200, value=10, step=1)
270
- batch_size = st.number_input("Batch Size", min_value=1, max_value=128, value=32, step=1)
271
- learning_rate = st.number_input("Learning Rate", min_value=0.0001, max_value=0.1, value=0.0001, step=0.0001, format="%.4f")
272
- optimizer_str = st.selectbox("Optimizer", ['adam', 'sgd', 'rmsprop'], index=0)
273
-
274
-
275
- def process_classification_data(data: str) -> Optional[tuple[List[str], List[int]]]:
276
- """Processes the classification data string into lists of texts and labels.
277
-
278
- Args:
279
- data: The input string.
280
-
281
- Returns:
282
- A tuple of (texts, labels) or None on error
283
- """
284
- data_pairs = [line.split(',') for line in data.split('\n') if ',' in line]
285
- if not data_pairs:
286
- st.error("No valid data pairs found. Please ensure each line contains 'text,label'.")
287
- return None
288
- texts = []
289
- labels = []
290
- for i, pair in enumerate(data_pairs):
291
- if len(pair) != 2:
292
- st.error(f"Invalid data format in line {i + 1}: '{','.join(pair)}'. Expected 'text,label'.")
293
- return None
294
- text = pair[0].strip()
295
- label_str = pair[1].strip()
296
- try:
297
- label = int(label_str)
298
- texts.append(text)
299
- labels.append(label)
300
- except ValueError:
301
- st.error(f"Invalid label value in line {i + 1}: '{label_str}'. Label must be an integer.")
302
- return None
303
- return texts, labels
304
-
305
-
306
-
307
- if st.button("Classify"):
308
- if not classification_data or not classification_prompt:
309
- st.warning("Please provide training data and text to classify.")
310
- else:
311
- with st.spinner("Classifying..."):
312
- try:
313
- # 1. Process the training data
314
- processed_data = process_classification_data(classification_data)
315
- if not processed_data:
316
- st.stop()
317
- train_texts, train_labels = processed_data
318
- num_classes = len(set(train_labels))
319
-
320
- # 2. Generate embeddings for training data
321
- train_embeddings = generate_embeddings(train_texts, classification_embedding_model)
322
- if not train_embeddings:
323
- st.stop()
324
-
325
- # 3. Create and train the model
326
- model = create_and_train_model(
327
- train_embeddings, train_labels, num_classes, num_epochs, batch_size, learning_rate, optimizer_str
328
- )
329
-
330
- # 4. Generate embedding for the text to classify
331
- predict_embedding = generate_embeddings([classification_prompt], classification_embedding_model)
332
- if not predict_embedding:
333
- st.stop()
334
-
335
- # 5. Make the prediction
336
- prediction = model.predict(np.array([predict_embedding]), verbose=0)
337
- predicted_class = np.argmax(prediction[0])
338
- st.subheader("Predicted Class:")
339
- st.write(predicted_class)
340
- st.subheader("Prediction Probabilities:")
341
- st.write(prediction)
342
-
343
- except Exception as e:
344
- st.error(f"An error occurred: {e}")
345
-
 
10
  from tensorflow.keras.models import Sequential
11
  from tensorflow.keras.layers import Dense, Input
12
  from tensorflow.keras.utils import to_categorical
13
+ from tensorflow.keras.optimizers import Adam
14
 
15
  # Load environment variables
16
  load_dotenv()
 
21
  genai.configure(api_key=GOOGLE_API_KEY)
22
  else:
23
  st.error(
24
+ "Google AI Studio API key not found. Please add it to your .env file. "
25
  "You can obtain an API key from https://makersuite.google.com/."
26
  )
27
  st.stop()
 
40
  st.subheader("Key Concepts:")
41
  st.markdown(
42
  """
43
+ - **Embeddings**: Numerical representations of text, capturing semantic meaning.
44
+ - **Vector Databases**: Databases optimized for storing and querying vectors (simulated here).
45
+ - **Retrieval Augmented Generation (RAG)**: Combining retrieval with LLM generation.
46
+ - **Cosine Similarity**: A measure of similarity between two vectors.
47
+ - **Neural Networks**: Using embeddings as input for classification.
 
 
 
 
 
 
 
 
48
  """
49
  )
50
 
 
53
  """Displays text as a formatted code block in Streamlit."""
54
  st.markdown(f"```{language}\n{text}\n```", unsafe_allow_html=True)
55
 
 
56
  def display_response(response: Any) -> None:
57
  """Displays the model's response."""
58
  if response and hasattr(response, "text"):
 
61
  else:
62
  st.error("Failed to generate a response.")
63
 
64
+ def generate_embeddings(texts: List[str], model_name