Spaces:

Datasaur
/

few-shot-demo

Sleeping

App Files Files Community

spdin committed on Mar 23, 2023

Commit

333cd19

1 Parent(s): 0da7162

initial commit

Browse files

Files changed (6) hide show

app.py +33 -0
model.py +47 -0
prediction.py +48 -0
training.py +74 -0
utils.py +12 -0
validation.py +65 -0

app.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import uuid
+import streamlit as st
+import training
+import validation
+import prediction
+# Session initialization
+if "key" not in st.session_state:
+    st.session_state["key"] = str(uuid.uuid4()).split("-")[-1]
+def training_page():
+    """Render the "Training" page by delegating to ``training.main``."""
+    training.main()
+def validation_page():
+    """Render the "Validation" page by delegating to ``validation.main``."""
+    validation.main()
+def prediction_page():
+    """Render the "Prediction" page by delegating to ``prediction.main``."""
+    prediction.main()
+page_names_to_funcs = {
+    "Training": training_page,
+    "Validation": validation_page,
+    "Prediction": prediction_page,
+}
+selected_page = st.sidebar.selectbox("Select a page", page_names_to_funcs.keys())
+page_names_to_funcs[selected_page]()

model.py ADDED Viewed

	@@ -0,0 +1,47 @@

+from setfit import SetFitModel, SetFitTrainer
+from sentence_transformers.losses import CosineSimilarityLoss
+# Function to create a pipeline for text classification using the trained model
+def create_classifier(model_path):
+    classifier = SetFitModel.from_pretrained(
+        model_path,
+        local_files_only=True,
+    )
+    return classifier
+def run_setfit_training(
+    session_id, model_id, model_name, train_dataset, batch_size, num_iterations
+):
+    model = SetFitModel.from_pretrained(model_id)
+    # Create trainer
+    trainer = SetFitTrainer(
+        model=model,
+        train_dataset=train_dataset,
+        eval_dataset=train_dataset,
+        loss_class=CosineSimilarityLoss,
+        metric="accuracy",
+        batch_size=batch_size,
+        num_iterations=num_iterations,  # The number of text pairs to generate for contrastive learning
+        num_epochs=1,  # The number of epochs to use for constrastive learning
+        column_mapping={"text": "text", "label": "label"},
+    )
+    trainer.train()
+    # metrics = trainer.evaluate()
+    # accuracy = metrics["accuracy"]
+    print(f"model used: {model_id}")
+    print(f"train dataset: {len(train_dataset)} samples")
+    # print(f"accuracy: {accuracy}")
+    save_model_path = f"./models/{session_id}/{model_id}_{model_name}"
+    trainer.model._save_pretrained(
+        save_directory=f"./models/{session_id}/{model_id}_{model_name}"
+    )
+    return save_model_path

prediction.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import os
+import pickle
+import streamlit as st
+import model
+def main():
+    st.title("Model Prediction")
+    st.write(f"Session ID: {st.session_state.key}")
+    session_id = st.session_state.key
+    if not os.path.isdir(f"models/{session_id}"):
+        st.write("Model is not available")
+        st.stop()
+    model_options = [model_name for model_name in os.listdir(f"models/{session_id}")]
+    models = {
+        model_name: os.path.abspath(os.path.join(f"models/{session_id}", model_name))
+        for model_name in model_options
+    }
+    model_name = st.selectbox("Select a model", options=model_options)
+    # Text input
+    text = st.text_area("Enter some text here", height=200)
+    # Prediction button
+    if st.button("Predict"):
+        with open(f"{models[model_name]}/label.pkl", "rb") as f:
+            label_map = pickle.load(f)
+        classifier = model.create_classifier(models[model_name])
+        prediction = classifier([text])
+        prediction_class = prediction[0].item()
+        confidence_score = classifier.predict_proba([text])[0][prediction_class].item()
+        st.write(
+            "The predicted label is:",
+            label_map[prediction_class],
+            f"{round(confidence_score*100,2)}%",
+        )

training.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import pickle
+import pandas as pd
+import streamlit as st
+from datasets import Dataset
+import model
+from utils import check_columns, count_labels
+# Main function to run the Streamlit app
+def main():
+    # Set app title
+    st.title("Few Shot Learning Demo using SetFit")
+    # Display the session ID
+    st.write(f"Session ID: {st.session_state.key}")
+    session_id = st.session_state.key
+    # Create file uploader
+    uploaded_file = st.file_uploader("Choose a CSV file to upload", type="csv")
+    # Check if file was uploaded
+    if uploaded_file is not None:
+        # Read CSV file into pandas DataFrame
+        df = pd.read_csv(uploaded_file)
+        # Check if DataFrame has expected columns
+        if check_columns(df):
+            # Display DataFrame as a table
+            st.write(df)
+            # Calculate the number of instances of each label class
+            label_counts = count_labels(df)
+            st.write(f"Number of instances of each label class: {label_counts}")
+            labels = set(df["label"].tolist())
+            label_map = {label: idx for idx, label in enumerate(labels)}
+            df["label"] = df["label"].map(label_map)
+            dataset = Dataset.from_pandas(df)
+            model_name = st.text_input("Input the model name")
+            pretrained_model_options = ["all-MiniLM-L6-v2", "paraphrase-MiniLM-L3-v2"]
+            pretrained_model = st.selectbox(
+                "Select a pretrained model", options=pretrained_model_options
+            )
+            # Add Train button
+            if st.button("Train"):
+                # Train the model
+                with st.spinner("Training model..."):
+                    model_path = model.run_setfit_training(
+                        session_id,
+                        pretrained_model,
+                        model_name,
+                        dataset,
+                        1,
+                        10,
+                    )
+                st.write(f"Model checkpoint saved {model_path.split('/')[-1]}")
+                label_map = {v: k for k, v in label_map.items()}
+                with open(f"{model_path}/label.pkl", "wb") as f:
+                    pickle.dump(label_map, f)
+                st.write("Training Finished")
+                st.write("Go to Validation Page")
+        else:
+            st.error("File must have 'text' and 'label' columns.")

utils.py ADDED Viewed

	@@ -0,0 +1,12 @@

+# Function to check if the uploaded file has the expected columns
+def check_columns(df):
+    if set(df.columns) == set(["text", "label"]):
+        return True
+    else:
+        return False
+# Function to calculate the number of instances of each label class
+def count_labels(df):
+    counts = df["label"].value_counts()
+    return counts.to_dict()

validation.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import os
+import pickle
+import pandas as pd
+import streamlit as st
+import model
+from utils import check_columns
+# Function to validate the trained model with a new uploaded CSV file
+def main():
+    st.title("Model Validation")
+    # Display the session ID
+    st.write(f"Session ID: {st.session_state.key}")
+    session_id = st.session_state.key
+    if not os.path.isdir(f"models/{session_id}"):
+        st.write("Model is not available")
+        st.stop()
+    model_options = [model_name for model_name in os.listdir(f"models/{session_id}")]
+    models = {
+        model_name: os.path.abspath(os.path.join(f"models/{session_id}", model_name))
+        for model_name in model_options
+    }
+    model_name = st.selectbox("Select a model", options=model_options)
+    # Create file uploader for validation CSV file
+    validation_file = st.file_uploader(
+        "Choose a CSV file to validate the model", type="csv"
+    )
+    # Check if validation file was uploaded
+    if validation_file is not None:
+        # Read CSV file into pandas DataFrame
+        validation_df = pd.read_csv(validation_file)
+        # Check if DataFrame has expected columns
+        if check_columns(validation_df):
+            # Display DataFrame as a table
+            st.write(validation_df)
+            # Create pipeline for text classification using the trained model
+            classifier = model.create_classifier(models[model_name])
+            with open(f"{models[model_name]}/label.pkl", "rb") as f:
+                label_map = pickle.load(f)
+            results = classifier(validation_df["text"].tolist())
+            # Predict labels for validation DataFrame
+            validation_df["predicted_label"] = [
+                label_map[result.item()] for result in results
+            ]
+            # Display validation DataFrame with predicted labels
+            st.write("Validation results:")
+            st.write(validation_df)
+        else:
+            st.error("Validation file must have 'text' and 'label' columns.")