"""Streamlit app to fine-tune and run a T5 model for text normalization.

Two tabs:
  * Training  — upload a CSV with `input`/`output` columns and fine-tune t5-small.
  * Inference — normalize a single piece of noisy text with beam search.
"""

import streamlit as st
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import evaluate
import torch

# Model/sequence-length constants used by both training and inference.
MODEL_NAME = "t5-small"
MAX_LENGTH = 128

# Set device (use GPU if available).
device = "cuda" if torch.cuda.is_available() else "cpu"


@st.cache_resource
def _load_model_and_tokenizer(model_name: str = MODEL_NAME):
    """Load the pre-trained tokenizer and model exactly once per session.

    Without caching, Streamlit re-executes the whole script on every widget
    interaction and would reload the model from disk each time.
    """
    tok = T5Tokenizer.from_pretrained(model_name)
    mdl = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
    return tok, mdl


tokenizer, model = _load_model_and_tokenizer()

# Define Streamlit app
st.title("Text Normalization with T5")
st.write("A simple app to normalize noisy text into standardized formats.")

# Tabs for training and inference
tab1, tab2 = st.tabs(["Training", "Inference"])

# Training Tab
with tab1:
    st.header("Train the Model")

    # Upload training data
    uploaded_file = st.file_uploader(
        "Upload a CSV file with `input` and `output` columns", type=["csv"]
    )

    if uploaded_file:
        import pandas as pd

        data = pd.read_csv(uploaded_file)
        if "input" in data.columns and "output" in data.columns:
            st.write("Sample Data:")
            st.write(data.head())

            # Preprocess data into Hugging Face Dataset format.
            def preprocess_function(examples):
                """Tokenize inputs and targets for seq2seq training.

                Pad-token ids in the labels are replaced with -100 so that
                padded positions are ignored by the cross-entropy loss
                (otherwise the model is trained to emit pad tokens).
                """
                inputs = [str(x) for x in examples["input"]]
                targets = [str(x) for x in examples["output"]]
                model_inputs = tokenizer(
                    inputs, max_length=MAX_LENGTH, truncation=True, padding="max_length"
                )
                labels = tokenizer(
                    targets, max_length=MAX_LENGTH, truncation=True, padding="max_length"
                )["input_ids"]
                # Mask padding in labels: -100 is ignored by the loss function.
                model_inputs["labels"] = [
                    [tok if tok != tokenizer.pad_token_id else -100 for tok in seq]
                    for seq in labels
                ]
                return model_inputs

            dataset = Dataset.from_pandas(data)
            tokenized_dataset = dataset.map(preprocess_function, batched=True)

            # Split data into train/test (80/20).
            train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
            train_dataset = train_test_split["train"]
            eval_dataset = train_test_split["test"]

            # Define Trainer and training arguments. Save/eval strategies must
            # match ("epoch") for load_best_model_at_end to work.
            training_args = TrainingArguments(
                output_dir="./results",
                evaluation_strategy="epoch",
                learning_rate=5e-5,
                per_device_train_batch_size=16,
                per_device_eval_batch_size=16,
                num_train_epochs=3,
                weight_decay=0.01,
                logging_dir="./logs",
                save_strategy="epoch",
                load_best_model_at_end=True,
                push_to_hub=False,
            )

            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=eval_dataset,
                tokenizer=tokenizer,
            )

            # Train button
            if st.button("Train the Model"):
                with st.spinner("Training in progress..."):
                    trainer.train()
                st.success("Training completed! The model is ready to use.")
        else:
            st.error("The uploaded CSV must have `input` and `output` columns.")

# Inference Tab
with tab2:
    st.header("Normalize Text")

    # Input text for inference
    input_text = st.text_area("Enter the noisy text:", "")

    if st.button("Normalize"):
        if input_text.strip():
            with st.spinner("Generating normalized text..."):
                # Truncate to the model's training length so oversized inputs
                # cannot overflow the encoder.
                input_ids = tokenizer(
                    input_text,
                    return_tensors="pt",
                    max_length=MAX_LENGTH,
                    truncation=True,
                ).input_ids.to(device)
                # Inference only: disable gradient tracking to save memory.
                with torch.no_grad():
                    outputs = model.generate(
                        input_ids,
                        max_length=MAX_LENGTH,
                        num_beams=4,
                        early_stopping=True,
                    )
                normalized_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
                st.write(f"Normalized Text: **{normalized_text}**")
        else:
            st.warning("Please enter some text for normalization.")