# FiLM_Benchmark / app.py
import gradio as gr
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import openai
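import os

# The OpenAI branches below need an API key, and they use the legacy
# ChatCompletion interface, which requires openai<1.0. A minimal setup
# sketch, assuming the key arrives via the conventional OPENAI_API_KEY
# environment variable (e.g. configured as a Space secret):
openai.api_key = os.environ.get("OPENAI_API_KEY")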
# Define the available tasks and Hugging Face models
TASKS = ["sentiment-analysis", "ner", "text-classification"]

MODELS = {
    "DistilBERT": "distilbert-base-uncased",
    "BERT": "bert-base-uncased",
    "RoBERTa": "roberta-base",
    "LLaMA2_7B_chat": "meta-llama/Llama-2-7b-chat-hf",
    "LLaMA2_70B": "meta-llama/Llama-2-70b-hf",
    "ChatGLM3_6B": "THUDM/chatglm3-6b",
    "InternLM_7B": "internlm/internlm-7b",
    "Falcon_7B": "tiiuae/falcon-7b",
    # Add other Hugging Face models here
}
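# NOTE: the meta-llama checkpoints are gated on the Hugging Face Hub; loading
# them requires accepting Meta's license and authenticating (for example via
# `huggingface-cli login` or an HF token configured as a Space secret).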
# Function to load a Hugging Face pipeline for the selected task and model
def load_pipeline(task, model):
    model_name = MODELS[model]
    return pipeline(task, model=model_name)
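# Reloading weights on every request is slow for the larger checkpoints. A
# minimal caching sketch (the wrapper name and cache size are assumptions,
# not part of the original app): memoize one pipeline per (task, model) pair.
from functools import lru_cache

@lru_cache(maxsize=4)
def load_pipeline_cached(task, model):
    # Both arguments are hashable strings, so lru_cache can key on them
    return pipeline(task, model=MODELS[model])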
# Function to predict using Hugging Face models and OpenAI models
def predict(task, model, text):
    try:
        if model in ["ChatGPT", "GPT4"]:
            # OpenAI API request (legacy openai<1.0 ChatCompletion interface)
            response = openai.ChatCompletion.create(
                model="gpt-4" if model == "GPT4" else "gpt-3.5-turbo",
                messages=[{"role": "user", "content": text}]
            )
            return response['choices'][0]['message']['content']
        else:
            # Hugging Face pipeline; loaded only in this branch, since MODELS
            # has no entries for the OpenAI choices and loading it
            # unconditionally would raise a KeyError for them
            selected_pipeline = load_pipeline(task, model)
            return selected_pipeline(text)
    except Exception as e:
        print(f"Error in prediction: {e}")
        return {"error": str(e)}
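# Illustrative call: predict("sentiment-analysis", "DistilBERT", "Great film!")
# returns the raw pipeline output, e.g. [{"label": ..., "score": ...}]. Note
# that the base (non-fine-tuned) checkpoints listed above emit generic label
# names such as LABEL_0/LABEL_1 rather than POSITIVE/NEGATIVE.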
# Function to benchmark a model against a labelled CSV
# (expects 'query' and 'answer' columns)
def benchmark(task, model, file):
    try:
        data = pd.read_csv(file.name)
        texts = data['query'].tolist()
        true_labels = data['answer'].tolist()
        predictions = []
        if model in ["ChatGPT", "GPT4"]:
            for text in texts:
                response = openai.ChatCompletion.create(
                    model="gpt-4" if model == "GPT4" else "gpt-3.5-turbo",
                    messages=[{"role": "user", "content": text}]
                )
                predictions.append(response['choices'][0]['message']['content'].strip())
        else:
            selected_pipeline = load_pipeline(task, model)
            # Take the top label per input. This fits classification-style
            # tasks (sentiment-analysis, text-classification); the NER
            # pipeline returns per-entity spans rather than a single label,
            # so it would need different handling here.
            predictions = [selected_pipeline(text)[0]['label'] for text in texts]
        accuracy = accuracy_score(true_labels, predictions)
        precision, recall, f1, _ = precision_recall_fscore_support(
            true_labels, predictions, average='macro'
        )
        return {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1
        }
    except Exception as e:
        print(f"Error in benchmarking: {e}")
        return {"error": str(e)}
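# Expected CSV layout (column names taken from the code above); the strings in
# 'answer' must match the model's output labels exactly for the metrics to be
# meaningful:
#
#   query,answer
#   "I loved this film.",POSITIVE
#   "The plot made no sense.",NEGATIVE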
# Define the Gradio interface
with gr.Blocks() as demo:
    with gr.Row():
        task_input = gr.Dropdown(TASKS, label="Task")
        model_input = gr.Dropdown(list(MODELS.keys()) + ["ChatGPT", "GPT4"], label="Model")

    with gr.Tab("Predict"):
        with gr.Row():
            text_input = gr.Textbox(lines=2, placeholder="Enter text here...", label="Text")
        predict_button = gr.Button("Predict")
        predict_output = gr.JSON(label="Prediction Output")
        predict_button.click(predict, inputs=[task_input, model_input, text_input], outputs=predict_output)

    with gr.Tab("Benchmark"):
        with gr.Row():
            file_input = gr.File(label="Upload CSV for Benchmarking")
        benchmark_button = gr.Button("Benchmark")
        benchmark_output = gr.JSON(label="Benchmark Output")
        benchmark_button.click(benchmark, inputs=[task_input, model_input, file_input], outputs=benchmark_output)

demo.launch()