|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline |
|
|
import traceback |
|
|
import logging |
|
|
import torch |
|
|
|
|
|
|
|
|
# Root logger configured at ERROR level for the whole app.
logging.basicConfig(level=logging.ERROR)

# Model identifiers handed to `from_pretrained`, in test-item order:
# position k in this list scores column `test-{k+1}`.
# NOTE(review): suffix 5 is absent (ToM4 is followed by ToM6) -- confirm
# that this gap is intentional and not a missing/misnumbered model.
_MODEL_SUFFIXES = (0, 1, 2, 3, 4, 6, 7, 8, 9)
model_names = [
    f"ActiveYixiao/roberta-large-ToM{suffix}" for suffix in _MODEL_SUFFIXES
]
|
|
|
|
|
def classify_csv(file, progress=gr.Progress()):
    """Score each `test-k` column of an uploaded CSV with its matching model.

    The CSV is read (UTF-8 first, latin1 as a fallback), fully empty or
    whitespace-only rows are dropped, and every column `test-k` is classified
    row by row with `model_names[k-1]`. A prediction is recorded as 1 when the
    pipeline label is one of the known "positive" labels and 0 otherwise. The
    result is written to ``classified_output.csv`` with the first input column
    copied as ``ID`` followed by each ``test-k`` / ``score-k`` pair.

    Args:
        file: Value supplied by the `gr.File` input; it is forwarded as-is to
            `pandas.read_csv`.
        progress: Gradio progress tracker, called with a fraction in [0, 1]
            and a status message.

    Returns:
        A ``(path, message)`` tuple. On success ``path`` is the output CSV
        path and ``message`` is ``None``, or a newline-joined list of warnings
        if a model failed or some rows could not be scored (those scores are
        0). On a fatal error ``path`` is ``None`` and ``message`` holds the
        error text with its traceback.
    """
    try:
        if file is None:
            raise ValueError("No file uploaded; please select a CSV file")

        # Not every upload is UTF-8; latin1 can decode any byte sequence.
        try:
            df = pd.read_csv(file, encoding="utf-8")
        except UnicodeDecodeError:
            df = pd.read_csv(file, encoding="latin1")

        # Drop rows that are entirely NaN, then rows that are only whitespace.
        df = df.dropna(how="all")
        df = df[~(df.astype(str).apply(lambda x: x.str.strip() == '').all(axis=1))]
        df = df.reset_index(drop=True)

        # One response column is required per model, so derive the list from
        # model_names instead of duplicating it by hand.
        test_cols = [f"test-{k}" for k in range(1, len(model_names) + 1)]
        missing_columns = [col for col in test_cols if col not in df.columns]
        if missing_columns:
            raise ValueError(f"Missing columns: {missing_columns} in input CSV")

        output_data = {"ID": df.iloc[:, 0]}

        total_steps = len(model_names) * len(df)
        current_step = 0

        # Loop invariants.
        device = 0 if torch.cuda.is_available() else -1
        positive_labels = frozenset(("LABEL_1", "1", "positive", "POSITIVE"))

        # Problems that do not abort the run but make some scores unreliable.
        warnings = []

        for i, model_name in enumerate(model_names):
            test_col = test_cols[i]
            score_col = f"score-{i+1}"

            output_data[test_col] = df[test_col]

            progress((i / len(model_names)), f"Loading model {i+1}/{len(model_names)}")

            # Pre-bind so the cleanup in `finally` is valid even when loading
            # fails part-way through.
            clf = model = tokenizer = None
            try:
                tokenizer = AutoTokenizer.from_pretrained(model_name)
                model = AutoModelForSequenceClassification.from_pretrained(model_name)
                clf = pipeline(
                    "text-classification",
                    model=model,
                    tokenizer=tokenizer,
                    return_all_scores=False,
                    device=device,
                )

                # Replace missing values BEFORE stringifying: casting first
                # can turn NaN into the literal text "nan", which would then
                # be scored as if the participant had written it.
                texts = ["" if pd.isna(value) else str(value) for value in df[test_col]]

                preds = []
                failed_rows = 0
                for j, text in enumerate(texts):
                    current_step += 1
                    progress((current_step / total_steps), f"Processing {test_col}, row {j+1}/{len(df)}")

                    try:
                        label = clf(text)[0]["label"]
                    except Exception as e:
                        # Best effort: a single bad row must not abort the run.
                        logging.error("Error processing row %d in %s: %s", j + 1, test_col, e)
                        failed_rows += 1
                        preds.append(0)
                    else:
                        preds.append(1 if label in positive_labels else 0)

                output_data[score_col] = preds
                if failed_rows:
                    warnings.append(
                        f"⚠️ {failed_rows} row(s) in {test_col} could not be scored and were set to 0."
                    )

            except Exception as e:
                logging.error("Error loading model %s: %s", model_name, e)
                output_data[score_col] = [0] * len(df)
                warnings.append(
                    f"⚠️ Model {model_name} could not be loaded or run; "
                    f"all {score_col} values were set to 0."
                )

            finally:
                # Drop references before the next model is loaded.
                clf = model = tokenizer = None
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

        out_df = pd.DataFrame(output_data)
        out_path = "classified_output.csv"
        out_df.to_csv(out_path, index=False)

        progress(1, "Done! File ready for download.")
        return out_path, ("\n".join(warnings) if warnings else None)

    except Exception as e:
        err_msg = f"❌ Error: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        logging.error(err_msg)
        return None, err_msg
|
|
|
|
|
|
|
|
# Text rendered above the upload widget (Markdown with an inline HTML <pre>).
_DESCRIPTION = "".join(
    (
        "### 📋 Thank-you for accessing the online marking system for the Birmingham Advanced Mindreading Stories. ",
        "Please follow the instructions below to prepare your dataset and then click ‘Submit’ to have your participant responses marked by our fine-tuned language models.\n\n",
        "Upload Instructions\n",
        "- The file must be a **CSV** with exactly **10 columns**\n",
        "- The **first column** must contain participant IDs\n",
        "- The **next 9 columns** must be named `test-1`, `test-2`, ..., `test-9`\n\n",
        "**Example format:**\n",
        "<pre style='background-color:#f8f9fa;border:1px solid #ccc;border-radius:8px;padding:10px;font-family:monospace;overflow-x:auto;white-space:pre;'>",
        "ID,test-1,test-2,test-3,test-4,test-5,test-6,test-7,test-8,test-9\n",
        "1,response,response,response,response,response,response,response,response,response\n",
        "2,response,response,response,response,response,response,response,response,response\n",
        "3,response,response,response,response,response,response,response,response,response\n",
        "...\n",
        "N,response,response,response,response,response,response,response,response,response",
        "</pre>",
    )
)

# Text rendered below the interface: output format note and citation.
_ARTICLE = """
<p>The output CSV will include the original test responses plus <code>score-1</code> ... <code>score-9</code> columns.</p>
<hr>
<p><strong>Reference:</strong></p>
<pre style="background-color:#f8f9fa;border:1px solid #ccc;border-radius:8px;padding:10px;font-family:monospace;overflow-x:auto;white-space:pre;">
@inproceedings{wang2025automatic,
title={Automatic Scoring of an Open-Response Measure of Advanced Mind-Reading Using Large Language Models},
author={Wang, Yixiao and Dsouza, Russel and Lee, Robert and Apperly, Ian and Devine, Rory T and van der Kleij, Sanne W and Lee, Mark},
booktitle={The 10th Workshop on Computational Linguistics and Clinical Psychology},
pages={79},
year={2025}
}
</pre>
"""

# Single-function UI: one CSV upload in, a scored CSV plus a message box out.
demo = gr.Interface(
    fn=classify_csv,
    inputs=gr.File(file_types=[".csv"], label="Upload CSV"),
    outputs=[
        gr.File(label="Download Scored CSV"),
        gr.Textbox(label="Error Message", lines=5),
    ],
    title="Mind-Reading Response Classification System (9 Test Items)",
    description=_DESCRIPTION,
    article=_ARTICLE,
)


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(share=True)
|
|
|