Spaces:

WhoLetMeCook
/

DistilBERTDemo

Sleeping

App Files Files Community

Vincent Qin committed on Sep 7, 2024

Commit

4c41360

1 Parent(s): e5c1f52

Added DistilBERT test

Browse files

Files changed (1) hide show

app.py +98 -2

app.py CHANGED Viewed

@@ -1,4 +1,100 @@
 import streamlit as st
-x = st.slider('Select a value')
-st.write(x, 'squared is', x * x)

 import streamlit as st
+import numpy as np
+from datasets import load_dataset, Dataset
+from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
+from datasets import load_metric
+import torch
+# Load datasets (IMDB and SST2) and combine them
+@st.cache_resource
+def load_datasets():
+    """Build the combined IMDB + SST-2 sentiment corpus and split it.
+
+    Returns:
+        A ``(train, validation)`` pair of ``Dataset`` objects, each carrying
+        ``text`` and ``label`` columns.
+    """
+    # The IMDB train split is stored grouped by label, so a plain head slice
+    # such as 'train[:5000]' yields reviews of a single class. Shuffle with a
+    # fixed seed before taking 5000 rows so both classes are represented.
+    imdb = load_dataset('imdb', split='train').shuffle(seed=42).select(range(5000))
+    sst2 = load_dataset('glue', 'sst2', split='train[:5000]')
+    # Normalise both sources to the same {'text', 'label'} schema
+    imdb_rows = [{'text': row['text'], 'label': row['label']} for row in imdb]
+    sst2_rows = [{'text': row['sentence'], 'label': row['label']} for row in sst2]
+    full_data = Dataset.from_list(imdb_rows + sst2_rows)
+    # First split: hold out 20% as a test set (this function does not return it)
+    outer_split = full_data.train_test_split(test_size=0.2, seed=42)
+    # Second split: 25% of the remaining 80% -> 60% train / 20% validation overall
+    inner_split = outer_split['train'].train_test_split(test_size=0.25, seed=42)
+    return inner_split['train'], inner_split['test']
+train_dataset, val_dataset = load_datasets()
+# Load the tokenizer and model
+@st.cache_resource
+def load_tokenizer_model():
+    """Load the DistilBERT tokenizer and a two-label classification model.
+
+    Both are created from the same pretrained checkpoint name.
+
+    Returns:
+        A ``(tokenizer, model)`` tuple.
+    """
+    checkpoint = 'distilbert-base-uncased'
+    bert_tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
+    classifier = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
+    return bert_tokenizer, classifier
+tokenizer, model = load_tokenizer_model()
+# Preprocess function for tokenization
+def preprocess_function(examples):
+    """Tokenize a batch of examples.
+
+    Args:
+        examples: A batch mapping whose ``'text'`` entry holds the raw strings.
+
+    Returns:
+        The tokenizer output, truncated and padded to 512 tokens per example.
+    """
+    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)
+# Tokenize datasets
+@st.cache_resource
+def _tokenize_datasets():
+    """Tokenize the train and validation splits once per server process.
+
+    Streamlit re-executes this script on every widget interaction; caching
+    keeps the two ``map`` passes from being re-invoked on each rerun.
+
+    Returns:
+        A ``(tokenized_train, tokenized_val)`` tuple.
+    """
+    tokenized_train = train_dataset.map(preprocess_function, batched=True)
+    tokenized_val = val_dataset.map(preprocess_function, batched=True)
+    return tokenized_train, tokenized_val
+tokenized_train_dataset, tokenized_val_dataset = _tokenize_datasets()
+# Define the training arguments
+# Hyperparameters are collected in a mapping and expanded into keyword arguments.
+# NOTE(review): newer transformers releases rename `evaluation_strategy` to
+# `eval_strategy` - confirm against the version this Space installs.
+training_args = TrainingArguments(**{
+    'output_dir': './results',
+    'logging_dir': './logs',
+    'evaluation_strategy': 'epoch',
+    'num_train_epochs': 3,
+    'per_device_train_batch_size': 16,
+    'per_device_eval_batch_size': 16,
+    'learning_rate': 2e-5,
+    'weight_decay': 0.01,
+})
+# Accuracy is computed directly with NumPy (already imported by this file)
+# instead of through `load_metric`, which `datasets` has deprecated; this also
+# avoids loading a metric every time Streamlit reruns the script.
+# Function to compute metrics
+def compute_metrics(eval_pred):
+    """Compute classification accuracy for one evaluation pass.
+
+    Args:
+        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
+            row is the argmax of ``logits`` over the last axis.
+
+    Returns:
+        ``{'accuracy': float}`` - the fraction of predictions equal to the
+        corresponding label.
+    """
+    logits, labels = eval_pred
+    predictions = np.argmax(logits, axis=-1)
+    labels = np.asarray(labels)
+    return {'accuracy': float(np.mean(predictions == labels))}
+# Initialize the trainer
+# Wires the model, hyperparameters, tokenized splits and metric callback together.
+trainer = Trainer(
+    model=model, args=training_args,
+    train_dataset=tokenized_train_dataset, eval_dataset=tokenized_val_dataset,
+    compute_metrics=compute_metrics,
+)
+# Streamlit UI
+st.title("DistilBERT Sentiment Training and Inference")
+# Training only starts when the user presses the button
+train_requested = st.button("Train the Model")
+if train_requested:
+    st.write("Training the model... This will take some time.")
+    trainer.train()
+    st.write("Model training complete!")
+# Free-text box whose content is classified by the analysis button further down
+st.write("Once the model is trained, you can enter a sentence for sentiment analysis:")
+user_input = st.text_area("Enter a sentence:")
+# Function to make predictions
+def predict_sentiment(text):
+    """Classify *text* with the module-level model.
+
+    Args:
+        text: The sentence to classify; it is truncated to 512 tokens.
+
+    Returns:
+        ``"Positive"`` when the highest-scoring class index is 1, otherwise
+        ``"Negative"``.
+    """
+    # Make sure dropout is disabled, whatever mode training left the model in
+    model.eval()
+    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
+    # Trainer may have moved the model to an accelerator; the input tensors
+    # must be on the same device or the forward pass raises a device mismatch.
+    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
+    with torch.no_grad():
+        logits = model(**inputs).logits
+    prediction = torch.argmax(logits, dim=-1).item()
+    return "Positive" if prediction == 1 else "Negative"
+# Button to generate predictions after training
+if st.button("Analyze Sentiment"):
+    # Whitespace-only input is treated as empty
+    if not user_input.strip():
+        st.write("Please enter a sentence.")
+    else:
+        st.write(f"Predicted Sentiment: **{predict_sentiment(user_input)}**")