ActiveYixiao's picture
Update app.py
8a7992e verified
import gradio as gr
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import traceback
import logging
import torch
# --- logging setup so errors appear in HF logs ---
# Only ERROR and above are emitted on the root logger.
logging.basicConfig(level=logging.ERROR)

# --- Model paths ---
# Hub identifiers of the fine-tuned checkpoints, one per test item.
# classify_csv pairs the checkpoint at position i with column "test-{i+1}".
# Note that the suffixes run 0-4 and 6-9; there is no "ToM5" entry.
model_names = [
    f"ActiveYixiao/roberta-large-ToM{suffix}"
    for suffix in (0, 1, 2, 3, 4, 6, 7, 8, 9)
]
# Pipeline labels that are counted as a score of 1; anything else scores 0.
_POSITIVE_LABELS = frozenset({"LABEL_1", "1", "positive", "POSITIVE"})


def _read_input_csv(file):
    """Load the uploaded CSV and drop rows that hold no data.

    Args:
        file: Value delivered by the ``gr.File`` input. Either a path string
            or an object exposing the path as ``.name``.

    Returns:
        A DataFrame with all-NaN and all-blank rows removed and a fresh
        0-based index.

    Raises:
        ValueError: If nothing was uploaded.
    """
    if file is None:
        raise ValueError("No file uploaded. Please upload a CSV file.")

    # Resolve to a path once so the latin1 retry re-opens the file from the
    # start instead of continuing on a partially consumed file object.
    path = getattr(file, "name", file)
    try:
        df = pd.read_csv(path, encoding="utf-8")
    except UnicodeDecodeError:
        df = pd.read_csv(path, encoding="latin1")

    # Remove rows that are entirely NaN, then rows whose cells are all
    # empty/whitespace once rendered as strings.
    df = df.dropna(how="all")
    blank_rows = df.astype(str).apply(lambda col: col.str.strip() == "").all(axis=1)
    return df[~blank_rows].reset_index(drop=True)


def classify_csv(file, progress=gr.Progress()):
    """Score every ``test-N`` column of an uploaded CSV with its own model.

    For each entry of ``model_names`` (position ``i``) the column
    ``test-{i+1}`` is classified row by row and a ``score-{i+1}`` column of
    0/1 values is produced. The first input column is copied to the output
    as ``ID``.

    Args:
        file: Uploaded CSV as provided by the ``gr.File`` input.
        progress: Gradio progress tracker; called with a fraction and a
            status message.

    Returns:
        A ``(path, message)`` tuple. On success ``path`` points to the scored
        CSV and ``message`` is ``None``, or a warning listing columns/rows
        that could not be scored and were filled with 0. On failure ``path``
        is ``None`` and ``message`` holds the error text with traceback.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import tempfile

    try:
        df = _read_input_csv(file)

        # Verify required columns exist (one test column per model).
        required_columns = [f"test-{k}" for k in range(1, len(model_names) + 1)]
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            raise ValueError(f"Missing columns: {missing_columns} in input CSV")

        n_rows = len(df)
        n_models = len(model_names)
        total_steps = n_models * n_rows
        current_step = 0
        device = 0 if torch.cuda.is_available() else -1

        output_data = {"ID": df.iloc[:, 0]}  # First column as ID
        problems = []  # human-readable notes about zero-filled scores

        for i, model_name in enumerate(model_names):
            test_col = f"test-{i+1}"
            score_col = f"score-{i+1}"
            output_data[test_col] = df[test_col]
            progress(i / n_models, f"Loading model {i+1}/{n_models}")

            # fillna must come before astype(str); the other way round NaN
            # becomes the literal text "nan" and fillna has nothing to fill.
            texts = df[test_col].fillna("").astype(str)

            clf = model = tokenizer = None
            try:
                tokenizer = AutoTokenizer.from_pretrained(model_name)
                model = AutoModelForSequenceClassification.from_pretrained(model_name)
                clf = pipeline(
                    "text-classification",
                    model=model,
                    tokenizer=tokenizer,
                    return_all_scores=False,
                    device=device,
                )

                preds = []
                failed_rows = 0
                for j, text in enumerate(texts):
                    current_step += 1
                    progress(
                        current_step / total_steps,
                        f"Processing {test_col}, row {j+1}/{n_rows}",
                    )
                    try:
                        label = clf(text)[0]["label"]
                        preds.append(1 if label in _POSITIVE_LABELS else 0)
                    except Exception as e:
                        # Best effort: keep going, score the row 0, but
                        # remember it so the user is told.
                        logging.error(f"Error processing row {j+1} in {test_col}: {str(e)}")
                        preds.append(0)
                        failed_rows += 1

                output_data[score_col] = preds
                if failed_rows:
                    problems.append(
                        f"{test_col}: {failed_rows} row(s) could not be scored and were set to 0"
                    )
            except Exception as e:
                logging.error(f"Error loading model {model_name}: {str(e)}")
                # Fill with zeros if model fails
                output_data[score_col] = [0] * n_rows
                problems.append(
                    f"{score_col}: model {model_name} failed ({e}); all scores set to 0"
                )
            finally:
                # Drop references before the next model is loaded to free memory.
                clf = model = tokenizer = None
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

        out_df = pd.DataFrame(output_data)
        # Write into a per-request directory so simultaneous requests cannot
        # overwrite each other's results; the download keeps its file name.
        out_dir = tempfile.mkdtemp(prefix="classified_")
        out_path = os.path.join(out_dir, "classified_output.csv")
        out_df.to_csv(out_path, index=False)

        progress(1, "Done! File ready for download.")
        if problems:
            return out_path, "⚠️ Some scores are placeholders:\n" + "\n".join(problems)
        return out_path, None
    except Exception as e:
        err_msg = f"❌ Error: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        logging.error(err_msg)
        return None, err_msg
# --- Gradio interface ---
# One CSV upload in; two outputs out, matching classify_csv's return tuple:
# the scored CSV for download and a textbox for any error message.
demo = gr.Interface(
    fn=classify_csv,
    inputs=gr.File(file_types=[".csv"], label="Upload CSV"),
    outputs=[
        gr.File(label="Download Scored CSV"),
        gr.Textbox(label="Error Message", lines=5)
    ],
    title="Mind-Reading Response Classification System (9 Test Items)",
    # Shown above the form: upload instructions and an example of the
    # expected CSV layout.
    description=(
        "### 📋 Thank you for accessing the online marking system for the Birmingham Advanced Mindreading Stories. "
        "Please follow the instructions below to prepare your dataset and then click ‘Submit’ to have your participant responses marked by our fine-tuned language models.\n\n"
        "Upload Instructions\n"
        "- The file must be a **CSV** with exactly **10 columns**\n"
        "- The **first column** must contain participant IDs\n"
        "- The **next 9 columns** must be named `test-1`, `test-2`, ..., `test-9`\n\n"
        "**Example format:**\n"
        "<pre style='background-color:#f8f9fa;border:1px solid #ccc;border-radius:8px;padding:10px;font-family:monospace;overflow-x:auto;white-space:pre;'>"
        "ID,test-1,test-2,test-3,test-4,test-5,test-6,test-7,test-8,test-9\n"
        "1,response,response,response,response,response,response,response,response,response\n"
        "2,response,response,response,response,response,response,response,response,response\n"
        "3,response,response,response,response,response,response,response,response,response\n"
        "...\n"
        "N,response,response,response,response,response,response,response,response,response"
        "</pre>"
    ),
    # Shown below the form: description of the output file and the citation.
    article=(
        """
<p>The output CSV will include the original test responses plus <code>score-1</code> ... <code>score-9</code> columns.</p>
<hr>
<p><strong>Reference:</strong></p>
<pre style="background-color:#f8f9fa;border:1px solid #ccc;border-radius:8px;padding:10px;font-family:monospace;overflow-x:auto;white-space:pre;">
@inproceedings{wang2025automatic,
title={Automatic Scoring of an Open-Response Measure of Advanced Mind-Reading Using Large Language Models},
author={Wang, Yixiao and Dsouza, Russel and Lee, Robert and Apperly, Ian and Devine, Rory T and van der Kleij, Sanne W and Lee, Mark},
booktitle={The 10th Workshop on Computational Linguistics and Clinical Psychology},
pages={79},
year={2025}
}
</pre>
"""
    )
)

# Start the app only when run as a script.
if __name__ == "__main__":
    demo.launch(share=True)