Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| from datasets import load_dataset | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline | |
| import torch | |
| import pandas as pd | |
def load_orca_dataset():
    """Show a loading notice in the app, then fetch the Orca AgentInstruct dataset.

    Returns:
        Whatever ``load_dataset`` returns for
        ``microsoft/orca-agentinstruct-1M-v1``.
    """
    dataset_id = "microsoft/orca-agentinstruct-1M-v1"
    st.info("Loading dataset... This may take a while.")
    orca = load_dataset(dataset_id)
    return orca
def load_model_and_tokenizer(model_name):
    """Load the tokenizer and the sequence-classification model for ``model_name``.

    Args:
        model_name: Identifier passed unchanged to both ``from_pretrained`` calls.

    Returns:
        A ``(tokenizer, model)`` tuple; the tokenizer is loaded first.
    """
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForSequenceClassification.from_pretrained(model_name),
    )
def evaluate_model(ds, tokenizer, model, max_samples, text_field):
    """Classify up to ``max_samples`` examples from ``ds`` and collect predictions.

    Args:
        ds: Iterable of examples, each indexable by ``text_field``.
        tokenizer: Tokenizer handed to the text-classification pipeline.
        model: Model handed to the text-classification pipeline.
        max_samples: Upper bound on the number of examples classified.
        text_field: Key of the example field that is passed to the classifier.

    Returns:
        A list of dicts with the keys ``"input"``, ``"label"`` and ``"score"``,
        one per classified example, in iteration order.
    """
    st.info("Evaluating the model...")

    # Use the first CUDA device when one is available, otherwise the CPU (-1).
    device = 0 if torch.cuda.is_available() else -1
    classifier = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        device=device,
    )

    results = []
    for i, example in enumerate(ds):
        if i >= max_samples:
            break
        input_text = example[text_field]
        # truncation=True is forwarded to the tokenizer so that texts longer
        # than the model's maximum sequence length are cut down instead of
        # failing inside the model.
        result = classifier(input_text, truncation=True)[0]
        results.append(
            {
                "input": input_text,
                "label": result["label"],
                "score": result["score"],
            }
        )
    return results
def main():
    """Render the Streamlit app: a dataset browser plus a model evaluator.

    The dataset is loaded on demand via the sidebar button and kept in
    ``st.session_state["dataset"]``. Evaluation results are kept in
    ``st.session_state["evaluation_results"]`` so that they (and the CSV
    download button) survive the script rerun Streamlit performs whenever any
    widget, including the download button itself, is clicked.
    """
    st.title("Orca Dataset Browser and Model Evaluator")
    st.sidebar.header("Configuration")

    load_dataset_btn = st.sidebar.button("Load Dataset")
    if load_dataset_btn:
        st.session_state["dataset"] = load_orca_dataset()

    # Nothing else can be rendered until a dataset has been loaded.
    if "dataset" not in st.session_state:
        return
    dataset = st.session_state["dataset"]

    # List available splits
    available_splits = list(dataset.keys())
    st.sidebar.subheader("Available Dataset Splits")
    selected_split = st.sidebar.selectbox("Select Split", available_splits)
    split = dataset[selected_split]

    st.subheader("Dataset Explorer")
    st.write(f"Displaying information for split: `{selected_split}`")
    st.write(split.info)

    # Indexing row 0 and select(range(n)) both fail on an empty split, so
    # stop here with a message instead of a traceback.
    num_rows = len(split)
    if num_rows == 0:
        st.warning(f"Split `{selected_split}` is empty; nothing to display or evaluate.")
        return

    # Determine available fields
    sample_entry = split[0]
    st.sidebar.subheader("Available Fields in Dataset")
    available_fields = list(sample_entry.keys())
    st.sidebar.write(available_fields)
    text_field = st.sidebar.selectbox("Select Text Field", available_fields)

    sample_size = st.slider("Number of Samples to Display", min_value=1, max_value=20, value=5)
    # Shuffle once per run and reuse it for both the preview and the evaluation.
    shuffled = split.shuffle(seed=42)
    # Never request more rows than the split contains.
    st.write(shuffled.select(range(min(sample_size, num_rows))))

    st.subheader("Model Evaluator")
    model_name = st.text_input("Enter Hugging Face Model Name", value="distilbert-base-uncased-finetuned-sst-2-english")
    max_samples = st.number_input("Number of Samples to Evaluate", min_value=1, max_value=100, value=10)

    if st.button("Load Model and Evaluate"):
        eval_count = min(int(max_samples), num_rows)
        try:
            tokenizer, model = load_model_and_tokenizer(model_name)
        except (OSError, ValueError) as err:
            # The model name is free-form user input; a mistyped or unsupported
            # name should produce a readable message, not a raw traceback.
            # NOTE(review): assumes from_pretrained signals these failures as
            # OSError/ValueError - confirm against the installed transformers.
            st.error(f"Could not load model `{model_name}`: {err}")
        else:
            st.session_state["evaluation_results"] = evaluate_model(
                shuffled.select(range(eval_count)),
                tokenizer,
                model,
                eval_count,
                text_field,
            )

    # Rendered outside the button branch: st.button is only True on the run
    # triggered by its click, so anything inside that branch disappears on the
    # next rerun (for example, right after the download button is pressed).
    if "evaluation_results" in st.session_state:
        results = st.session_state["evaluation_results"]
        st.subheader("Evaluation Results")
        st.write(results)
        st.download_button(
            label="Download Results as CSV",
            data=pd.DataFrame(results).to_csv(index=False),
            file_name="evaluation_results.csv",
            mime="text/csv",
        )
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()