Dua Rajper committed on
Update app.py
Browse files
app.py
CHANGED
|
@@ -10,7 +10,7 @@ import tensorflow as tf
|
|
| 10 |
from tensorflow.keras.models import Sequential
|
| 11 |
from tensorflow.keras.layers import Dense, Input
|
| 12 |
from tensorflow.keras.utils import to_categorical
|
| 13 |
-
from tensorflow.keras.optimizers import Adam
|
| 14 |
|
| 15 |
# Load environment variables
|
| 16 |
load_dotenv()
|
|
@@ -21,7 +21,7 @@ if GOOGLE_API_KEY:
|
|
| 21 |
genai.configure(api_key=GOOGLE_API_KEY)
|
| 22 |
else:
|
| 23 |
st.error(
|
| 24 |
-
"Google AI Studio API key not found. Please add it to your .env file.
|
| 25 |
"You can obtain an API key from https://makersuite.google.com/."
|
| 26 |
)
|
| 27 |
st.stop()
|
|
@@ -40,19 +40,11 @@ with st.sidebar:
|
|
| 40 |
st.subheader("Key Concepts:")
|
| 41 |
st.markdown(
|
| 42 |
"""
|
| 43 |
-
-
|
| 44 |
-
-
|
| 45 |
-
-
|
| 46 |
-
-
|
| 47 |
-
-
|
| 48 |
-
"""
|
| 49 |
-
)
|
| 50 |
-
st.subheader("Whitepaper Insights")
|
| 51 |
-
st.markdown(
|
| 52 |
-
"""
|
| 53 |
-
- Efficient similarity search using vector indexes (e.g., ANN).
|
| 54 |
-
- Handling large datasets and scalability considerations.
|
| 55 |
-
- Applications of embeddings: search, recommendation, classification, etc.
|
| 56 |
"""
|
| 57 |
)
|
| 58 |
|
|
@@ -61,7 +53,6 @@ def code_block(text: str, language: str = "text") -> None:
|
|
| 61 |
"""Displays text as a formatted code block in Streamlit."""
|
| 62 |
st.markdown(f"```{language}\n{text}\n```", unsafe_allow_html=True)
|
| 63 |
|
| 64 |
-
|
| 65 |
def display_response(response: Any) -> None:
|
| 66 |
"""Displays the model's response."""
|
| 67 |
if response and hasattr(response, "text"):
|
|
@@ -70,276 +61,4 @@ def display_response(response: Any) -> None:
|
|
| 70 |
else:
|
| 71 |
st.error("Failed to generate a response.")
|
| 72 |
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
def generate_embeddings(texts: List[str], model_name: str) -> Optional[List[List[float]]]:
    """Generates embeddings for a list of texts using a specified model.

    Args:
        texts: List of text strings to embed as one batch.
        model_name: Name of an embedding-capable model, passed straight to
            ``genai.embed_content`` (for example ``"models/text-embedding-004"``).

    Returns:
        One embedding (list of floats) per input text, in input order.
        An empty list when ``texts`` is empty, or None when the request fails
        (the error is reported through ``st.error``).
    """
    # Nothing to embed: skip the API round-trip entirely.
    if not texts:
        return []
    try:
        # Embeddings come from the module-level embed_content call;
        # GenerativeModel only covers content generation.
        result = genai.embed_content(model=model_name, content=texts)
        # For a list of inputs, "embedding" holds one vector per text.
        return [list(vector) for vector in result["embedding"]]
    except Exception as e:
        st.error(f"Error generating embeddings with model '{model_name}': {e}")
        return None
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
def generate_with_retry(prompt: str, model_name: str, generation_config: genai.types.GenerationConfig, max_retries: int = 3, delay: int = 5) -> Any:
    """Calls the model's ``generate_content`` and retries on failure.

    Args:
        prompt: The prompt string sent to the model.
        model_name: The name of the language model.
        generation_config: The generation configuration forwarded to the call.
        max_retries: Maximum number of attempts.
        delay: Seconds to sleep between attempts.

    Returns:
        The response of the first successful attempt, or None when the model
        is reported as not found (no retry) or every attempt failed.
    """
    final_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            return genai.GenerativeModel(model_name).generate_content(
                prompt, generation_config=generation_config
            )
        except Exception as exc:
            reason = str(exc)
            st.warning(f"Error during generation (attempt {attempt + 1}/{max_retries}): {reason}")

            # A missing model will not appear on retry, so give up at once.
            if "404" in reason and "not found" in reason:
                st.error(
                    f"Model '{model_name}' is not available or not supported. Please select a different model."
                )
                return None

            # Out of attempts: report and stop.
            if attempt >= final_attempt:
                st.error(f"Failed to generate content after {max_retries} attempts. Please check your prompt and model.")
                return None

            st.info(f"Retrying in {delay} seconds...")
            time.sleep(delay)
    # Only reached when max_retries is zero or negative (the loop never runs).
    return None
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
def calculate_similarity(embedding1: List[float], embedding2: List[float]) -> float:
    """Returns the cosine similarity of two embedding vectors.

    Each embedding is reshaped into a single-row matrix before being handed to
    ``cosine_similarity``; the only cell of the resulting matrix is returned.
    """
    first_row = np.array(embedding1).reshape(1, -1)
    second_row = np.array(embedding2).reshape(1, -1)
    score_matrix = cosine_similarity(first_row, second_row)
    return score_matrix[0][0]
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
def create_and_train_model(
    embeddings: List[List[float]],
    labels: List[int],
    num_classes: int,
    epochs: int,
    batch_size: int,
    learning_rate: float,
    optimizer_str: str
) -> tf.keras.Model:
    """Builds a small dense classifier over embeddings and fits it.

    Args:
        embeddings: Input embeddings; the first one fixes the input width.
        labels: Integer class label for each embedding.
        num_classes: Size of the softmax output layer and of the one-hot labels.
        epochs: Number of training epochs.
        batch_size: Batch size for training.
        learning_rate: Learning rate handed to the optimizer.
        optimizer_str: 'adam', 'sgd' or 'rmsprop' (case-insensitive); any
            other value falls back to Adam.

    Returns:
        The fitted Keras model.
    """
    input_width = len(embeddings[0])
    classifier = Sequential([
        Input(shape=(input_width,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])

    # Map the (lower-cased) optimizer name to its class; unknown names use Adam.
    optimizer_classes = {
        'adam': Adam,
        'sgd': tf.keras.optimizers.SGD,
        'rmsprop': tf.keras.optimizers.RMSprop,
    }
    optimizer_class = optimizer_classes.get(optimizer_str.lower(), Adam)
    chosen_optimizer = optimizer_class(learning_rate=learning_rate)

    classifier.compile(optimizer=chosen_optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    # categorical_crossentropy expects one-hot targets.
    one_hot_labels = to_categorical(labels, num_classes=num_classes)
    classifier.fit(np.array(embeddings), one_hot_labels, epochs=epochs, batch_size=batch_size, verbose=0)
    return classifier
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
# --- RAG Question Answering ---
# Embeds each context line and the question, picks the context line most
# similar to the question, and asks the generation model to answer from it.
st.header("RAG Question Answering")
rag_model_name = st.selectbox("Select model for RAG:", ["gemini-pro"], index=0)
# Embedding requests need an embedding model, not a text-generation model.
rag_embedding_model = st.selectbox(
    "Select embedding model for RAG:",
    ["models/text-embedding-004", "models/embedding-001"],
    index=0,
)
rag_context = st.text_area(
    "Enter your context documents:",
    "Relevant information to answer the question. Separate documents with newlines.",
    height=150,
)
rag_question = st.text_area("Ask a question about the context:", "What is the main topic?", height=70)
rag_max_context_length = st.number_input("Maximum Context Length", min_value=100, max_value=2000, value=500, step=100)


if st.button("Answer with RAG"):
    # Split once and drop blank lines so empty strings are never embedded or
    # selected as the "most relevant" document.
    rag_documents = [doc.strip() for doc in rag_context.split('\n') if doc.strip()]
    if not rag_documents or not rag_question.strip():
        st.warning("Please provide both context and a question.")
    else:
        with st.spinner("Generating answer..."):
            try:
                # 1. Generate embeddings for the context documents
                context_embeddings = generate_embeddings(rag_documents, rag_embedding_model)
                if not context_embeddings:
                    st.stop()

                # 2. Generate embedding for the question
                question_embedding = generate_embeddings([rag_question], rag_embedding_model)
                if not question_embedding:
                    st.stop()

                # 3. Similarity of the question against every document
                similarities = cosine_similarity(
                    np.array(question_embedding).reshape(1, -1),
                    np.array(context_embeddings),
                )[0]

                # 4. Pick the most relevant document, truncated to the limit
                most_relevant_index = int(np.argmax(similarities))
                relevant_context = rag_documents[most_relevant_index][:rag_max_context_length]

                # 5. Construct the prompt
                rag_prompt = f"Use the following context to answer the question: '{rag_question}'.\nContext: {relevant_context}"

                # 6. Generate the answer
                response = generate_with_retry(rag_prompt, rag_model_name, generation_config=genai.types.GenerationConfig())
                if response:
                    display_response(response)
            except Exception as e:
                st.error(f"An error occurred: {e}")
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
# --- Text Similarity ---
# Embeds two user-supplied texts in one batch and shows their cosine similarity.
st.header("Text Similarity")
# Embedding requests need an embedding model, not a text-generation model.
similarity_embedding_model = st.selectbox(
    "Select embedding model for similarity:",
    ["models/text-embedding-004", "models/embedding-001"],
    index=0,
)
text1 = st.text_area("Enter text 1:", "This is the first sentence.", height=70)
text2 = st.text_area("Enter text 2:", "This is a similar sentence.", height=70)

if st.button("Calculate Similarity"):
    if not text1 or not text2:
        st.warning("Please provide both texts.")
    else:
        with st.spinner("Calculating similarity..."):
            try:
                # 1. Generate embeddings for both texts in a single request
                embeddings = generate_embeddings([text1, text2], similarity_embedding_model)
                if not embeddings:
                    st.stop()

                # 2. Calculate cosine similarity
                similarity = calculate_similarity(embeddings[0], embeddings[1])
                st.subheader("Cosine Similarity:")
                st.write(similarity)
            except Exception as e:
                st.error(f"An error occurred: {e}")
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
# --- Neural Classification ---
# Input widgets for the embedding-based classifier: training data, the text to
# classify, and the training hyperparameters.
st.header("Neural Classification with Embeddings")
# Embedding requests need an embedding model, not a text-generation model.
classification_embedding_model = st.selectbox(
    "Select embedding model for classification:",
    ["models/text-embedding-004", "models/embedding-001"],
    index=0,
)
classification_data = st.text_area(
    "Enter your training data (text, label pairs), separated by newlines. Example: text1,0\\ntext2,1",
    "text1,0\ntext2,1\ntext3,0\ntext4,1",
    height=150,
)
classification_prompt = st.text_area("Enter text to classify:", "This is a test text.", height=70)
num_epochs = st.number_input("Number of Epochs", min_value=1, max_value=200, value=10, step=1)
batch_size = st.number_input("Batch Size", min_value=1, max_value=128, value=32, step=1)
learning_rate = st.number_input("Learning Rate", min_value=0.0001, max_value=0.1, value=0.0001, step=0.0001, format="%.4f")
optimizer_str = st.selectbox("Optimizer", ['adam', 'sgd', 'rmsprop'], index=0)
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
def process_classification_data(data: str) -> Optional[tuple[List[str], List[int]]]:
    """Processes the classification data string into lists of texts and labels.

    Each line is expected to look like ``text,label``. The label is whatever
    follows the LAST comma, so the text itself may contain commas. Lines
    without any comma are ignored.

    Args:
        data: The raw multi-line input string.

    Returns:
        A tuple of (texts, labels), or None when a label is not an integer or
        no usable line was found (the problem is reported through ``st.error``).
    """
    texts: List[str] = []
    labels: List[int] = []
    # Number lines from the raw input so error messages point at the line the
    # user actually typed, even when earlier lines were skipped.
    for line_number, line in enumerate(data.split('\n'), start=1):
        if ',' not in line:
            continue
        # Split on the last comma only: everything before it is the text.
        raw_text, raw_label = line.rsplit(',', 1)
        label_str = raw_label.strip()
        try:
            label = int(label_str)
        except ValueError:
            st.error(f"Invalid label value in line {line_number}: '{label_str}'. Label must be an integer.")
            return None
        texts.append(raw_text.strip())
        labels.append(label)

    if not texts:
        st.error("No valid data pairs found. Please ensure each line contains 'text,label'.")
        return None
    return texts, labels
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
# Trains a classifier on the embedded training texts, then classifies the
# user's text and shows the predicted class and the class probabilities.
if st.button("Classify"):
    if not classification_data or not classification_prompt:
        st.warning("Please provide training data and text to classify.")
    else:
        with st.spinner("Classifying..."):
            try:
                # 1. Process the training data
                processed_data = process_classification_data(classification_data)
                if not processed_data:
                    st.stop()
                train_texts, train_labels = processed_data
                # One-hot encoding indexes by label value, so the output width
                # must exceed the largest label, not just count distinct ones
                # (labels 0 and 2 need three classes).
                num_classes = max(train_labels) + 1

                # 2. Generate embeddings for training data
                train_embeddings = generate_embeddings(train_texts, classification_embedding_model)
                if not train_embeddings:
                    st.stop()

                # 3. Create and train the model
                model = create_and_train_model(
                    train_embeddings, train_labels, num_classes, num_epochs, batch_size, learning_rate, optimizer_str
                )

                # 4. Generate embedding for the text to classify
                predict_embedding = generate_embeddings([classification_prompt], classification_embedding_model)
                if not predict_embedding:
                    st.stop()

                # 5. Make the prediction. predict_embedding is already a batch
                # of one embedding, so it must not be wrapped in another list.
                prediction = model.predict(np.array(predict_embedding), verbose=0)
                predicted_class = np.argmax(prediction[0])
                st.subheader("Predicted Class:")
                st.write(predicted_class)
                st.subheader("Prediction Probabilities:")
                st.write(prediction)

            except Exception as e:
                st.error(f"An error occurred: {e}")
|
| 345 |
-
|
|
|
|
| 10 |
from tensorflow.keras.models import Sequential
|
| 11 |
from tensorflow.keras.layers import Dense, Input
|
| 12 |
from tensorflow.keras.utils import to_categorical
|
| 13 |
+
from tensorflow.keras.optimizers import Adam
|
| 14 |
|
| 15 |
# Load environment variables
|
| 16 |
load_dotenv()
|
|
|
|
| 21 |
genai.configure(api_key=GOOGLE_API_KEY)
|
| 22 |
else:
|
| 23 |
st.error(
|
| 24 |
+
"Google AI Studio API key not found. Please add it to your .env file. "
|
| 25 |
"You can obtain an API key from https://makersuite.google.com/."
|
| 26 |
)
|
| 27 |
st.stop()
|
|
|
|
| 40 |
st.subheader("Key Concepts:")
|
| 41 |
st.markdown(
|
| 42 |
"""
|
| 43 |
+
- **Embeddings**: Numerical representations of text, capturing semantic meaning.
|
| 44 |
+
- **Vector Databases**: Databases optimized for storing and querying vectors (simulated here).
|
| 45 |
+
- **Retrieval Augmented Generation (RAG)**: Combining retrieval with LLM generation.
|
| 46 |
+
- **Cosine Similarity**: A measure of similarity between two vectors.
|
| 47 |
+
- **Neural Networks**: Using embeddings as input for classification.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
"""
|
| 49 |
)
|
| 50 |
|
|
|
|
| 53 |
"""Displays text as a formatted code block in Streamlit."""
|
| 54 |
st.markdown(f"```{language}\n{text}\n```", unsafe_allow_html=True)
|
| 55 |
|
|
|
|
| 56 |
def display_response(response: Any) -> None:
|
| 57 |
"""Displays the model's response."""
|
| 58 |
if response and hasattr(response, "text"):
|
|
|
|
| 61 |
else:
|
| 62 |
st.error("Failed to generate a response.")
|
| 63 |
|
| 64 |
+
def generate_embeddings(texts: List[str], model_name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|