# Streamlit app by tudorgeorgescu — fix (commit e566037, verified)
import streamlit as st
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import evaluate  # NOTE(review): imported but never used in this file — confirm before removing
import torch
# Set device (use GPU if available); the same device string is reused for
# both training (via Trainer) and inference (manual .to(device) below).
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the pre-trained T5 model and tokenizer once at module import time;
# these module-level instances are shared by the Training and Inference tabs.
MODEL_NAME = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
# Define Streamlit app: page title and a one-line description.
st.title("Text Normalization with T5")
st.write("A simple app to normalize noisy text into standardized formats.")
# Two tabs: fine-tune the model on an uploaded CSV, or run inference on free text.
tab1, tab2 = st.tabs(["Training", "Inference"])
# Training Tab
# Training Tab: fine-tune the shared T5 model on a user-uploaded CSV of
# (input, output) text pairs.
with tab1:
    st.header("Train the Model")
    # Upload training data
    uploaded_file = st.file_uploader("Upload a CSV file with `input` and `output` columns", type=["csv"])
    if uploaded_file:
        import pandas as pd
        data = pd.read_csv(uploaded_file)
        if "input" in data.columns and "output" in data.columns:
            st.write("Sample Data:")
            st.write(data.head())

            # Preprocess data into Hugging Face Dataset format.
            def preprocess_function(examples):
                """Tokenize inputs/targets to a fixed 128-token length.

                Pad-token ids in the labels are replaced with -100 so the
                padding positions are ignored by the cross-entropy loss
                (otherwise the model is trained to predict padding).
                """
                inputs = [str(x) for x in examples["input"]]
                targets = [str(x) for x in examples["output"]]
                model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
                labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")["input_ids"]
                # Mask padding in the labels: -100 is the ignore_index used
                # by the Trainer's loss computation.
                model_inputs["labels"] = [
                    [tok if tok != tokenizer.pad_token_id else -100 for tok in seq]
                    for seq in labels
                ]
                return model_inputs

            dataset = Dataset.from_pandas(data)
            tokenized_dataset = dataset.map(preprocess_function, batched=True)

            # Split data into train/test (80/20).
            train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
            train_dataset = train_test_split["train"]
            eval_dataset = train_test_split["test"]

            # Define Trainer and training arguments. Evaluation and
            # checkpointing both happen per epoch so load_best_model_at_end
            # can pick the best checkpoint by eval loss.
            training_args = TrainingArguments(
                output_dir="./results",
                evaluation_strategy="epoch",
                learning_rate=5e-5,
                per_device_train_batch_size=16,
                per_device_eval_batch_size=16,
                num_train_epochs=3,
                weight_decay=0.01,
                logging_dir="./logs",
                save_strategy="epoch",
                load_best_model_at_end=True,
                push_to_hub=False,
            )
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=eval_dataset,
                tokenizer=tokenizer,
            )

            # Train button: blocks the Streamlit session while training runs.
            if st.button("Train the Model"):
                with st.spinner("Training in progress..."):
                    trainer.train()
                st.success("Training completed! The model is ready to use.")
        else:
            st.error("The uploaded CSV must have `input` and `output` columns.")
# Inference Tab
# Inference Tab: run the (possibly fine-tuned) shared model on free text.
with tab2:
    st.header("Normalize Text")
    # Input text for inference
    input_text = st.text_area("Enter the noisy text:", "")
    if st.button("Normalize"):
        if input_text.strip():
            with st.spinner("Generating normalized text..."):
                # Truncate to the same 128-token budget used at training
                # time so an arbitrarily long paste cannot exceed the
                # sequence length the model was trained with.
                input_ids = tokenizer(
                    input_text, return_tensors="pt", max_length=128, truncation=True
                ).input_ids.to(device)
                # Inference only — no autograd state needed.
                with torch.no_grad():
                    outputs = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
                normalized_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            st.write(f"Normalized Text: **{normalized_text}**")
        else:
            st.warning("Please enter some text for normalization.")