Dua Rajper committed on
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,13 +5,14 @@ from dotenv import load_dotenv
|
|
| 5 |
import json
|
| 6 |
import textwrap
|
| 7 |
import time
|
| 8 |
-
from typing import Any, List
|
| 9 |
import numpy as np
|
| 10 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 11 |
import tensorflow as tf
|
| 12 |
from tensorflow.keras.models import Sequential
|
| 13 |
from tensorflow.keras.layers import Dense, Input
|
| 14 |
from tensorflow.keras.utils import to_categorical
|
|
|
|
| 15 |
|
| 16 |
# Load environment variables
|
| 17 |
load_dotenv()
|
|
@@ -20,7 +21,6 @@ GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
|
|
| 20 |
# Configure Generative AI model
|
| 21 |
if GOOGLE_API_KEY:
|
| 22 |
genai.configure(api_key=GOOGLE_API_KEY)
|
| 23 |
-
model = genai.GenerativeModel('gemini-pro') # You can choose a suitable model.
|
| 24 |
else:
|
| 25 |
st.error(
|
| 26 |
"Google AI Studio API key not found. Please add it to your .env file. "
|
|
@@ -43,17 +43,18 @@ with st.sidebar:
|
|
| 43 |
st.markdown(
|
| 44 |
"""
|
| 45 |
- **Embeddings**: Numerical representations of text, capturing semantic meaning.
|
| 46 |
-
- **Vector Databases**: Databases optimized for storing and querying vectors.
|
| 47 |
- **Retrieval Augmented Generation (RAG)**: Combining retrieval with LLM generation.
|
| 48 |
- **Cosine Similarity**: A measure of similarity between two vectors.
|
|
|
|
| 49 |
"""
|
| 50 |
)
|
| 51 |
st.subheader("Whitepaper Insights")
|
| 52 |
st.markdown(
|
| 53 |
"""
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
"""
|
| 58 |
)
|
| 59 |
|
|
@@ -72,20 +73,40 @@ def display_response(response: Any) -> None:
|
|
| 72 |
st.error("Failed to generate a response.")
|
| 73 |
|
| 74 |
|
| 75 |
-
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
try:
|
| 78 |
embedding_model = genai.EmbeddingModel(model_name)
|
| 79 |
embeddings = embedding_model.embed_content(texts=texts)
|
| 80 |
-
return [embedding.values for embedding in embeddings.embeddings]
|
| 81 |
except Exception as e:
|
| 82 |
-
st.error(f"Error generating embeddings: {e}")
|
| 83 |
-
return
|
| 84 |
|
| 85 |
|
| 86 |
|
| 87 |
def generate_with_retry(prompt: str, model_name: str, generation_config: genai.types.GenerationConfig, max_retries: int = 3, delay: int = 5) -> Any:
|
| 88 |
-
"""Generates content with retry logic.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
for i in range(max_retries):
|
| 90 |
try:
|
| 91 |
model = genai.GenerativeModel(model_name)
|
|
@@ -98,25 +119,81 @@ def generate_with_retry(prompt: str, model_name: str, generation_config: genai.t
|
|
| 98 |
st.error(
|
| 99 |
f"Model '{model_name}' is not available or not supported. Please select a different model."
|
| 100 |
)
|
| 101 |
-
return None
|
| 102 |
elif i < max_retries - 1:
|
| 103 |
st.info(f"Retrying in {delay} seconds...")
|
| 104 |
time.sleep(delay)
|
| 105 |
else:
|
| 106 |
-
|
| 107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
|
| 110 |
|
| 111 |
# --- RAG Question Answering ---
|
| 112 |
st.header("RAG Question Answering")
|
| 113 |
rag_model_name = st.selectbox("Select model for RAG:", ["gemini-pro"], index=0)
|
|
|
|
| 114 |
rag_context = st.text_area(
|
| 115 |
"Enter your context documents:",
|
| 116 |
"Relevant information to answer the question. Separate documents with newlines.",
|
| 117 |
height=150,
|
| 118 |
)
|
| 119 |
-
rag_question = st.text_area("Ask a question about the context:", "What is the main topic?", height=
|
|
|
|
|
|
|
| 120 |
|
| 121 |
if st.button("Answer with RAG"):
|
| 122 |
if not rag_context or not rag_question:
|
|
@@ -125,12 +202,14 @@ if st.button("Answer with RAG"):
|
|
| 125 |
with st.spinner("Generating answer..."):
|
| 126 |
try:
|
| 127 |
# 1. Generate embeddings for the context
|
| 128 |
-
context_embeddings = generate_embeddings(rag_context.split('\n'))
|
| 129 |
if not context_embeddings:
|
| 130 |
st.stop()
|
| 131 |
|
| 132 |
# 2. Generate embedding for the question
|
| 133 |
-
question_embedding = generate_embeddings([rag_question])
|
|
|
|
|
|
|
| 134 |
|
| 135 |
# 3. Calculate similarity scores
|
| 136 |
similarities = cosine_similarity(np.array(question_embedding).reshape(1, -1), np.array(context_embeddings))[0]
|
|
@@ -138,13 +217,17 @@ if st.button("Answer with RAG"):
|
|
| 138 |
# 4. Find the most relevant document(s)
|
| 139 |
most_relevant_index = np.argmax(similarities)
|
| 140 |
relevant_context = rag_context.split('\n')[most_relevant_index]
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
# 5. Construct the prompt
|
| 143 |
rag_prompt = f"Use the following context to answer the question: '{rag_question}'.\nContext: {relevant_context}"
|
| 144 |
|
| 145 |
# 6. Generate the answer
|
| 146 |
response = generate_with_retry(rag_prompt, rag_model_name, generation_config=genai.types.GenerationConfig())
|
| 147 |
-
|
|
|
|
| 148 |
except Exception as e:
|
| 149 |
st.error(f"An error occurred: {e}")
|
| 150 |
|
|
@@ -152,9 +235,9 @@ if st.button("Answer with RAG"):
|
|
| 152 |
|
| 153 |
# --- Text Similarity ---
|
| 154 |
st.header("Text Similarity")
|
| 155 |
-
|
| 156 |
-
text1 = st.text_area("Enter text 1:", "This is the first sentence.", height=
|
| 157 |
-
text2 = st.text_area("Enter text 2:", "This is a similar sentence.", height=
|
| 158 |
|
| 159 |
if st.button("Calculate Similarity"):
|
| 160 |
if not text1 or not text2:
|
|
@@ -163,11 +246,12 @@ if st.button("Calculate Similarity"):
|
|
| 163 |
with st.spinner("Calculating similarity..."):
|
| 164 |
try:
|
| 165 |
# 1. Generate embeddings
|
| 166 |
-
embeddings = generate_embeddings([text1, text2],
|
| 167 |
if not embeddings:
|
| 168 |
st.stop()
|
|
|
|
| 169 |
# 2. Calculate cosine similarity
|
| 170 |
-
similarity =
|
| 171 |
st.subheader("Cosine Similarity:")
|
| 172 |
st.write(similarity)
|
| 173 |
except Exception as e:
|
|
@@ -177,28 +261,48 @@ if st.button("Calculate Similarity"):
|
|
| 177 |
|
| 178 |
# --- Neural Classification ---
|
| 179 |
st.header("Neural Classification with Embeddings")
|
| 180 |
-
|
| 181 |
classification_data = st.text_area(
|
| 182 |
"Enter your training data (text, label pairs), separated by newlines. Example: text1,0\\ntext2,1",
|
| 183 |
"text1,0\ntext2,1\ntext3,0\ntext4,1",
|
| 184 |
height=150,
|
| 185 |
)
|
| 186 |
-
classification_prompt = st.text_area("Enter text to classify:", "This is a test text.", height=
|
| 187 |
-
num_epochs = st.number_input("Number of Epochs", min_value=1, max_value=
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
|
| 203 |
|
| 204 |
|
|
@@ -209,21 +313,26 @@ if st.button("Classify"):
|
|
| 209 |
with st.spinner("Classifying..."):
|
| 210 |
try:
|
| 211 |
# 1. Process the training data
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
|
|
|
| 216 |
|
| 217 |
# 2. Generate embeddings for training data
|
| 218 |
-
train_embeddings = generate_embeddings(train_texts,
|
| 219 |
if not train_embeddings:
|
| 220 |
st.stop()
|
| 221 |
|
| 222 |
# 3. Create and train the model
|
| 223 |
-
model = create_and_train_model(
|
|
|
|
|
|
|
| 224 |
|
| 225 |
# 4. Generate embedding for the text to classify
|
| 226 |
-
predict_embedding = generate_embeddings([classification_prompt],
|
|
|
|
|
|
|
| 227 |
|
| 228 |
# 5. Make the prediction
|
| 229 |
prediction = model.predict(np.array([predict_embedding]), verbose=0)
|
|
@@ -235,4 +344,3 @@ if st.button("Classify"):
|
|
| 235 |
|
| 236 |
except Exception as e:
|
| 237 |
st.error(f"An error occurred: {e}")
|
| 238 |
-
|
|
|
|
| 5 |
import json
|
| 6 |
import textwrap
|
| 7 |
import time
|
| 8 |
+
from typing import Any, List, Optional
|
| 9 |
import numpy as np
|
| 10 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 11 |
import tensorflow as tf
|
| 12 |
from tensorflow.keras.models import Sequential
|
| 13 |
from tensorflow.keras.layers import Dense, Input
|
| 14 |
from tensorflow.keras.utils import to_categorical
|
| 15 |
+
from tensorflow.keras.optimizers import Adam # Import Adam optimizer
|
| 16 |
|
| 17 |
# Load environment variables
|
| 18 |
load_dotenv()
|
|
|
|
| 21 |
# Configure Generative AI model
|
| 22 |
if GOOGLE_API_KEY:
|
| 23 |
genai.configure(api_key=GOOGLE_API_KEY)
|
|
|
|
| 24 |
else:
|
| 25 |
st.error(
|
| 26 |
"Google AI Studio API key not found. Please add it to your .env file. "
|
|
|
|
| 43 |
st.markdown(
|
| 44 |
"""
|
| 45 |
- **Embeddings**: Numerical representations of text, capturing semantic meaning.
|
| 46 |
+
- **Vector Databases**: Databases optimized for storing and querying vectors (simulated here).
|
| 47 |
- **Retrieval Augmented Generation (RAG)**: Combining retrieval with LLM generation.
|
| 48 |
- **Cosine Similarity**: A measure of similarity between two vectors.
|
| 49 |
+
- **Neural Networks**: Using embeddings as input for classification.
|
| 50 |
"""
|
| 51 |
)
|
| 52 |
st.subheader("Whitepaper Insights")
|
| 53 |
st.markdown(
|
| 54 |
"""
|
| 55 |
+
- Efficient similarity search using vector indexes (e.g., ANN).
|
| 56 |
+
- Handling large datasets and scalability considerations.
|
| 57 |
+
- Applications of embeddings: search, recommendation, classification, etc.
|
| 58 |
"""
|
| 59 |
)
|
| 60 |
|
|
|
|
| 73 |
st.error("Failed to generate a response.")
|
| 74 |
|
| 75 |
|
| 76 |
+
|
| 77 |
+
def generate_embeddings(texts: List[str], model_name: str) -> Optional[List[List[float]]]:
    """Generate one embedding vector per input text.

    Uses the module-level ``genai.embed_content`` helper. The SDK has no
    ``EmbeddingModel`` class, so the previous implementation always ended up
    in the ``except`` branch and returned None.

    Args:
        texts: Text strings to embed. They are sent as a single batch and the
            returned vectors keep the same order, so callers can index the
            result with positions from ``texts``.
        model_name: Name of the embedding model (the callers in this file
            pass ``"models/embedding-001"``).

    Returns:
        A list with one embedding (list of floats) per input text, or None if
        the request failed; the failure is reported through ``st.error``.
    """
    try:
        # Passing a list as ``content`` makes the SDK embed it as a batch and
        # return every vector under the "embedding" key.
        result = genai.embed_content(model=model_name, content=texts)
        return [list(vector) for vector in result["embedding"]]
    except Exception as e:
        # UI boundary: surface the problem to the user instead of crashing.
        st.error(f"Error generating embeddings with model '{model_name}': {e}")
        return None
|
| 94 |
|
| 95 |
|
| 96 |
|
| 97 |
def generate_with_retry(prompt: str, model_name: str, generation_config: genai.types.GenerationConfig, max_retries: int = 3, delay: int = 5) -> Any:
|
| 98 |
+
"""Generates content with retry logic and error handling.
|
| 99 |
+
|
| 100 |
+
Args:
|
| 101 |
+
prompt: The prompt string.
|
| 102 |
+
model_name: The name of the language model.
|
| 103 |
+
generation_config: The generation configuration.
|
| 104 |
+
max_retries: Maximum number of retries.
|
| 105 |
+
delay: Delay in seconds between retries.
|
| 106 |
+
|
| 107 |
+
Returns:
|
| 108 |
+
The generated response or None on error.
|
| 109 |
+
"""
|
| 110 |
for i in range(max_retries):
|
| 111 |
try:
|
| 112 |
model = genai.GenerativeModel(model_name)
|
|
|
|
| 119 |
st.error(
|
| 120 |
f"Model '{model_name}' is not available or not supported. Please select a different model."
|
| 121 |
)
|
| 122 |
+
return None # Return None to signal a non-retryable error
|
| 123 |
elif i < max_retries - 1:
|
| 124 |
st.info(f"Retrying in {delay} seconds...")
|
| 125 |
time.sleep(delay)
|
| 126 |
else:
|
| 127 |
+
st.error(f"Failed to generate content after {max_retries} attempts. Please check your prompt and model.")
|
| 128 |
+
return None # Return None after max retries
|
| 129 |
+
return None #Should never reach here
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def calculate_similarity(embedding1: List[float], embedding2: List[float]) -> float:
    """Return the cosine similarity of two embedding vectors.

    Each vector is reshaped into a single-row matrix because
    ``cosine_similarity`` works on 2-D inputs; the only cell of the resulting
    1x1 matrix is the score.
    """
    first_row = np.array(embedding1).reshape(1, -1)
    second_row = np.array(embedding2).reshape(1, -1)
    score_matrix = cosine_similarity(first_row, second_row)
    return score_matrix[0][0]
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def create_and_train_model(
    embeddings: List[List[float]],
    labels: List[int],
    num_classes: int,
    epochs: int,
    batch_size: int,
    learning_rate: float,
    optimizer_str: str
) -> "tf.keras.Model":
    """Create and train a small dense network that classifies embeddings.

    The network is ``Input -> Dense(64, relu) -> Dense(32, relu) ->
    Dense(num_classes, softmax)`` trained with categorical cross-entropy on
    one-hot encoded labels.

    Args:
        embeddings: Training embeddings, one vector per example. Must be
            non-empty; the first vector's length fixes the input size.
        labels: Integer class labels, one per embedding, each in
            ``[0, num_classes)``.
        num_classes: Number of output classes.
        epochs: Number of training epochs.
        batch_size: Batch size for training.
        learning_rate: Learning rate for the optimizer.
        optimizer_str: Name of the optimizer ('adam', 'sgd', 'rmsprop'),
            case-insensitive. Any other value falls back to Adam.

    Returns:
        The trained Keras model.

    Raises:
        ValueError: If ``embeddings`` is empty, if ``embeddings`` and
            ``labels`` differ in length, or if any label lies outside
            ``[0, num_classes)``.
    """
    # Validate up front so the user gets a clear message instead of an opaque
    # IndexError (or, for negative labels, a silently wrong one-hot encoding
    # caused by negative indexing inside to_categorical).
    if not embeddings:
        raise ValueError("At least one training embedding is required.")
    if len(embeddings) != len(labels):
        raise ValueError(
            f"Got {len(embeddings)} embeddings but {len(labels)} labels; they must match."
        )
    out_of_range = sorted({label for label in labels if not 0 <= label < num_classes})
    if out_of_range:
        raise ValueError(
            f"Labels must be integers in the range 0..{num_classes - 1}; "
            f"got out-of-range labels {out_of_range}."
        )

    model = Sequential([
        Input(shape=(len(embeddings[0]),)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])

    # Dispatch table instead of an if/elif chain; unknown names use Adam.
    optimizer_classes = {
        'adam': Adam,
        'sgd': tf.keras.optimizers.SGD,
        'rmsprop': tf.keras.optimizers.RMSprop,
    }
    optimizer_cls = optimizer_classes.get(optimizer_str.lower(), Adam)
    optimizer = optimizer_cls(learning_rate=learning_rate)

    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    encoded_labels = to_categorical(labels, num_classes=num_classes)
    model.fit(np.array(embeddings), encoded_labels, epochs=epochs, batch_size=batch_size, verbose=0)
    return model
|
| 182 |
|
| 183 |
|
| 184 |
|
| 185 |
# --- RAG Question Answering ---
|
| 186 |
st.header("RAG Question Answering")
|
| 187 |
rag_model_name = st.selectbox("Select model for RAG:", ["gemini-pro"], index=0)
|
| 188 |
+
rag_embedding_model = st.selectbox("Select embedding model for RAG:", ["models/embedding-001"], index=0)
|
| 189 |
rag_context = st.text_area(
|
| 190 |
"Enter your context documents:",
|
| 191 |
"Relevant information to answer the question. Separate documents with newlines.",
|
| 192 |
height=150,
|
| 193 |
)
|
| 194 |
+
rag_question = st.text_area("Ask a question about the context:", "What is the main topic?", height=70) # Changed height to 70
|
| 195 |
+
rag_max_context_length = st.number_input("Maximum Context Length", min_value=100, max_value=2000, value=500, step=100)
|
| 196 |
+
|
| 197 |
|
| 198 |
if st.button("Answer with RAG"):
|
| 199 |
if not rag_context or not rag_question:
|
|
|
|
| 202 |
with st.spinner("Generating answer..."):
|
| 203 |
try:
|
| 204 |
# 1. Generate embeddings for the context
|
| 205 |
+
context_embeddings = generate_embeddings(rag_context.split('\n'), rag_embedding_model)
|
| 206 |
if not context_embeddings:
|
| 207 |
st.stop()
|
| 208 |
|
| 209 |
# 2. Generate embedding for the question
|
| 210 |
+
question_embedding = generate_embeddings([rag_question], rag_embedding_model)
|
| 211 |
+
if not question_embedding:
|
| 212 |
+
st.stop()
|
| 213 |
|
| 214 |
# 3. Calculate similarity scores
|
| 215 |
similarities = cosine_similarity(np.array(question_embedding).reshape(1, -1), np.array(context_embeddings))[0]
|
|
|
|
| 217 |
# 4. Find the most relevant document(s)
|
| 218 |
most_relevant_index = np.argmax(similarities)
|
| 219 |
relevant_context = rag_context.split('\n')[most_relevant_index]
|
| 220 |
+
#truncate context
|
| 221 |
+
if len(relevant_context) > rag_max_context_length:
|
| 222 |
+
relevant_context = relevant_context[:rag_max_context_length]
|
| 223 |
|
| 224 |
# 5. Construct the prompt
|
| 225 |
rag_prompt = f"Use the following context to answer the question: '{rag_question}'.\nContext: {relevant_context}"
|
| 226 |
|
| 227 |
# 6. Generate the answer
|
| 228 |
response = generate_with_retry(rag_prompt, rag_model_name, generation_config=genai.types.GenerationConfig())
|
| 229 |
+
if response:
|
| 230 |
+
display_response(response)
|
| 231 |
except Exception as e:
|
| 232 |
st.error(f"An error occurred: {e}")
|
| 233 |
|
|
|
|
| 235 |
|
| 236 |
# --- Text Similarity ---
|
| 237 |
st.header("Text Similarity")
|
| 238 |
+
similarity_embedding_model = st.selectbox("Select embedding model for similarity:", ["models/embedding-001"], index=0)
|
| 239 |
+
text1 = st.text_area("Enter text 1:", "This is the first sentence.", height=70) # Changed height to 70
|
| 240 |
+
text2 = st.text_area("Enter text 2:", "This is a similar sentence.", height=70) # Changed height to 70
|
| 241 |
|
| 242 |
if st.button("Calculate Similarity"):
|
| 243 |
if not text1 or not text2:
|
|
|
|
| 246 |
with st.spinner("Calculating similarity..."):
|
| 247 |
try:
|
| 248 |
# 1. Generate embeddings
|
| 249 |
+
embeddings = generate_embeddings([text1, text2], similarity_embedding_model)
|
| 250 |
if not embeddings:
|
| 251 |
st.stop()
|
| 252 |
+
|
| 253 |
# 2. Calculate cosine similarity
|
| 254 |
+
similarity = calculate_similarity(embeddings[0], embeddings[1])
|
| 255 |
st.subheader("Cosine Similarity:")
|
| 256 |
st.write(similarity)
|
| 257 |
except Exception as e:
|
|
|
|
| 261 |
|
| 262 |
# --- Neural Classification ---
|
| 263 |
st.header("Neural Classification with Embeddings")
|
| 264 |
+
classification_embedding_model = st.selectbox("Select embedding model for classification:", ["models/embedding-001"], index=0)
|
| 265 |
classification_data = st.text_area(
|
| 266 |
"Enter your training data (text, label pairs), separated by newlines. Example: text1,0\\ntext2,1",
|
| 267 |
"text1,0\ntext2,1\ntext3,0\ntext4,1",
|
| 268 |
height=150,
|
| 269 |
)
|
| 270 |
+
classification_prompt = st.text_area("Enter text to classify:", "This is a test text.", height=70) # Changed height to 70
|
| 271 |
+
num_epochs = st.number_input("Number of Epochs", min_value=1, max_value=200, value=10, step=1)
|
| 272 |
+
batch_size = st.number_input("Batch Size", min_value=1, max_value=128, value=32, step=1)
|
| 273 |
+
learning_rate = st.number_input("Learning Rate", min_value=0.0001, max_value=0.1, value=0.0001, step=0.0001, format="%.4f")
|
| 274 |
+
optimizer_str = st.selectbox("Optimizer", ['adam', 'sgd', 'rmsprop'], index=0)
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
def process_classification_data(data: str) -> Optional[tuple[List[str], List[int]]]:
    """Parse newline-separated ``text,label`` rows into parallel lists.

    Each row is split on its LAST comma, so the text part may itself contain
    commas. Lines without any comma (including blank lines) are ignored.
    Surrounding whitespace is stripped from both the text and the label.

    Args:
        data: The raw input string, one ``text,label`` pair per line.

    Returns:
        A tuple ``(texts, labels)``, or None if no usable row was found or a
        label is not an integer. The problem is reported through ``st.error``,
        using the line number as it appears in ``data``.
    """
    texts: List[str] = []
    labels: List[int] = []
    # Enumerate the original lines (not the filtered ones) so the line number
    # in an error message matches what the user sees in the text area.
    for line_number, line in enumerate(data.split('\n'), start=1):
        if ',' not in line:
            continue
        raw_text, raw_label = line.rsplit(',', 1)
        label_str = raw_label.strip()
        try:
            label = int(label_str)
        except ValueError:
            st.error(f"Invalid label value in line {line_number}: '{label_str}'. Label must be an integer.")
            return None
        texts.append(raw_text.strip())
        labels.append(label)

    if not texts:
        st.error("No valid data pairs found. Please ensure each line contains 'text,label'.")
        return None
    return texts, labels
|
| 306 |
|
| 307 |
|
| 308 |
|
|
|
|
| 313 |
with st.spinner("Classifying..."):
|
| 314 |
try:
|
| 315 |
# 1. Process the training data
|
| 316 |
+
processed_data = process_classification_data(classification_data)
|
| 317 |
+
if not processed_data:
|
| 318 |
+
st.stop()
|
| 319 |
+
train_texts, train_labels = processed_data
|
| 320 |
+
num_classes = len(set(train_labels))
|
| 321 |
|
| 322 |
# 2. Generate embeddings for training data
|
| 323 |
+
train_embeddings = generate_embeddings(train_texts, classification_embedding_model)
|
| 324 |
if not train_embeddings:
|
| 325 |
st.stop()
|
| 326 |
|
| 327 |
# 3. Create and train the model
|
| 328 |
+
model = create_and_train_model(
|
| 329 |
+
train_embeddings, train_labels, num_classes, num_epochs, batch_size, learning_rate, optimizer_str
|
| 330 |
+
)
|
| 331 |
|
| 332 |
# 4. Generate embedding for the text to classify
|
| 333 |
+
predict_embedding = generate_embeddings([classification_prompt], classification_embedding_model)
|
| 334 |
+
if not predict_embedding:
|
| 335 |
+
st.stop()
|
| 336 |
|
| 337 |
# 5. Make the prediction
|
| 338 |
prediction = model.predict(np.array([predict_embedding]), verbose=0)
|
|
|
|
| 344 |
|
| 345 |
except Exception as e:
|
| 346 |
st.error(f"An error occurred: {e}")
|
|
|