# Hugging Face Space: Text Normalization with T5 (Streamlit app)
import streamlit as st
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import evaluate
import torch

# Set device (use GPU if available).
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load pre-trained T5 model and tokenizer once at import time.
# NOTE(review): Streamlit re-executes the whole script on every interaction,
# so this download/load repeats per rerun — consider wrapping the load in a
# function decorated with @st.cache_resource. Left as-is to preserve behavior.
MODEL_NAME = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
# --- Streamlit app layout ---
st.title("Text Normalization with T5")
st.write("A simple app to normalize noisy text into standardized formats.")

# Two tabs: one to fine-tune the model on uploaded data, one to run inference.
tab1, tab2 = st.tabs(["Training", "Inference"])
| # Training Tab | |
# --- Training tab: upload a CSV, tokenize it, and fine-tune the model ---
with tab1:
    st.header("Train the Model")

    # Expect a CSV with `input` (noisy text) and `output` (normalized text) columns.
    uploaded_file = st.file_uploader("Upload a CSV file with `input` and `output` columns", type=["csv"])

    if uploaded_file:
        import pandas as pd
        data = pd.read_csv(uploaded_file)

        if "input" in data.columns and "output" in data.columns:
            st.write("Sample Data:")
            st.write(data.head())

            def preprocess_function(examples):
                """Tokenize input/output pairs into model inputs and labels.

                Pads/truncates both sides to 128 tokens. Pad positions in the
                labels are replaced with -100 so the loss ignores them.
                """
                inputs = [str(x) for x in examples["input"]]
                targets = [str(x) for x in examples["output"]]
                model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
                labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")["input_ids"]
                # BUGFIX: with padding="max_length" the labels are filled with the
                # pad token id; unless those positions are set to -100 (the ignore
                # index of CrossEntropyLoss), the model is trained to emit padding.
                labels = [
                    [(tok if tok != tokenizer.pad_token_id else -100) for tok in seq]
                    for seq in labels
                ]
                model_inputs["labels"] = labels
                return model_inputs

            dataset = Dataset.from_pandas(data)
            tokenized_dataset = dataset.map(preprocess_function, batched=True)

            # 80/20 train/eval split. (Renamed local: the original name
            # `train_test_split` shadowed the Dataset method of the same name.)
            split = tokenized_dataset.train_test_split(test_size=0.2)
            train_dataset = split["train"]
            eval_dataset = split["test"]

            # Trainer configuration: evaluate and checkpoint each epoch, keep
            # the best checkpoint at the end.
            training_args = TrainingArguments(
                output_dir="./results",
                evaluation_strategy="epoch",
                learning_rate=5e-5,
                per_device_train_batch_size=16,
                per_device_eval_batch_size=16,
                num_train_epochs=3,
                weight_decay=0.01,
                logging_dir="./logs",
                save_strategy="epoch",
                load_best_model_at_end=True,
                push_to_hub=False,
            )
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=eval_dataset,
                tokenizer=tokenizer,
            )

            # Kick off fine-tuning on demand.
            if st.button("Train the Model"):
                with st.spinner("Training in progress..."):
                    trainer.train()
                st.success("Training completed! The model is ready to use.")
        else:
            st.error("The uploaded CSV must have `input` and `output` columns.")
| # Inference Tab | |
| with tab2: | |
| st.header("Normalize Text") | |
| # Input text for inference | |
| input_text = st.text_area("Enter the noisy text:", "") | |
| if st.button("Normalize"): | |
| if input_text.strip(): | |
| with st.spinner("Generating normalized text..."): | |
| input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device) | |
| outputs = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True) | |
| normalized_text = tokenizer.decode(outputs[0], skip_special_tokens=True) | |
| st.write(f"Normalized Text: **{normalized_text}**") | |
| else: | |
| st.warning("Please enter some text for normalization.") |