"""Streamlit app to fine-tune and run a T5 model for text normalization.

Two tabs:
  * Training  — upload a CSV with `input`/`output` columns and fine-tune t5-small.
  * Inference — normalize a single piece of noisy text with beam search.
"""

import streamlit as st
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import evaluate
import torch

# Model/sequence-length constants used by both training and inference.
MODEL_NAME = "t5-small"
MAX_LENGTH = 128

# Set device (use GPU if available).
device = "cuda" if torch.cuda.is_available() else "cpu"


@st.cache_resource
def _load_model_and_tokenizer(model_name: str = MODEL_NAME):
    """Load the pre-trained tokenizer and model exactly once per session.

    Without caching, Streamlit re-executes the whole script on every widget
    interaction and would reload the model from disk each time.
    """
    tok = T5Tokenizer.from_pretrained(model_name)
    mdl = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
    return tok, mdl


tokenizer, model = _load_model_and_tokenizer()

# Define Streamlit app
st.title("Text Normalization with T5")
st.write("A simple app to normalize noisy text into standardized formats.")

# Tabs for training and inference
tab1, tab2 = st.tabs(["Training", "Inference"])

# Training Tab
with tab1:
    st.header("Train the Model")

    # Upload training data
    uploaded_file = st.file_uploader(
        "Upload a CSV file with `input` and `output` columns", type=["csv"]
    )

    if uploaded_file:
        import pandas as pd

        data = pd.read_csv(uploaded_file)
        if "input" in data.columns and "output" in data.columns:
            st.write("Sample Data:")
            st.write(data.head())

            # Preprocess data into Hugging Face Dataset format.
            def preprocess_function(examples):
                """Tokenize inputs and targets for seq2seq training.

                Pad-token ids in the labels are replaced with -100 so that
                padded positions are ignored by the cross-entropy loss
                (otherwise the model is trained to emit pad tokens).
                """
                inputs = [str(x) for x in examples["input"]]
                targets = [str(x) for x in examples["output"]]
                model_inputs = tokenizer(
                    inputs, max_length=MAX_LENGTH, truncation=True, padding="max_length"
                )
                labels = tokenizer(
                    targets, max_length=MAX_LENGTH, truncation=True, padding="max_length"
                )["input_ids"]
                # Mask padding in labels: -100 is ignored by the loss function.
                model_inputs["labels"] = [
                    [tok if tok != tokenizer.pad_token_id else -100 for tok in seq]
                    for seq in labels
                ]
                return model_inputs

            dataset = Dataset.from_pandas(data)
            tokenized_dataset = dataset.map(preprocess_function, batched=True)

            # Split data into train/test (80/20).
            train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
            train_dataset = train_test_split["train"]
            eval_dataset = train_test_split["test"]

            # Define Trainer and training arguments. Save/eval strategies must
            # match ("epoch") for load_best_model_at_end to work.
            training_args = TrainingArguments(
                output_dir="./results",
                evaluation_strategy="epoch",
                learning_rate=5e-5,
                per_device_train_batch_size=16,
                per_device_eval_batch_size=16,
                num_train_epochs=3,
                weight_decay=0.01,
                logging_dir="./logs",
                save_strategy="epoch",
                load_best_model_at_end=True,
                push_to_hub=False,
            )

            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=eval_dataset,
                tokenizer=tokenizer,
            )

            # Train button
            if st.button("Train the Model"):
                with st.spinner("Training in progress..."):
                    trainer.train()
                st.success("Training completed! The model is ready to use.")
        else:
            st.error("The uploaded CSV must have `input` and `output` columns.")

# Inference Tab
with tab2:
    st.header("Normalize Text")

    # Input text for inference
    input_text = st.text_area("Enter the noisy text:", "")

    if st.button("Normalize"):
        if input_text.strip():
            with st.spinner("Generating normalized text..."):
                # Truncate to the model's training length so oversized inputs
                # cannot overflow the encoder.
                input_ids = tokenizer(
                    input_text,
                    return_tensors="pt",
                    max_length=MAX_LENGTH,
                    truncation=True,
                ).input_ids.to(device)
                # Inference only: disable gradient tracking to save memory.
                with torch.no_grad():
                    outputs = model.generate(
                        input_ids,
                        max_length=MAX_LENGTH,
                        num_beams=4,
                        early_stopping=True,
                    )
                normalized_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
                st.write(f"Normalized Text: **{normalized_text}**")
        else:
            st.warning("Please enter some text for normalization.")