# Streamlit app by tudorgeorgescu — fix (commit e566037, verified)
import streamlit as st
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import evaluate  # NOTE(review): imported but never used in this file — confirm before removing
import torch
# Set device (use GPU if available); the same device string is reused for
# both training (via Trainer) and inference (manual .to(device) below).
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the pre-trained T5 model and tokenizer once at module import time;
# these module-level instances are shared by the Training and Inference tabs.
MODEL_NAME = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
# Define Streamlit app: page title and a one-line description.
st.title("Text Normalization with T5")
st.write("A simple app to normalize noisy text into standardized formats.")
# Two tabs: fine-tune the model on an uploaded CSV, or run inference on free text.
tab1, tab2 = st.tabs(["Training", "Inference"])
# Training Tab
# Training Tab: fine-tune the shared T5 model on a user-uploaded CSV of
# (input, output) text pairs.
with tab1:
    st.header("Train the Model")
    # Upload training data
    uploaded_file = st.file_uploader("Upload a CSV file with `input` and `output` columns", type=["csv"])
    if uploaded_file:
        import pandas as pd
        data = pd.read_csv(uploaded_file)
        if "input" in data.columns and "output" in data.columns:
            st.write("Sample Data:")
            st.write(data.head())

            # Preprocess data into Hugging Face Dataset format.
            def preprocess_function(examples):
                """Tokenize inputs/targets to a fixed 128-token length.

                Pad-token ids in the labels are replaced with -100 so the
                padding positions are ignored by the cross-entropy loss
                (otherwise the model is trained to predict padding).
                """
                inputs = [str(x) for x in examples["input"]]
                targets = [str(x) for x in examples["output"]]
                model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
                labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")["input_ids"]
                # Mask padding in the labels: -100 is the ignore_index used
                # by the Trainer's loss computation.
                model_inputs["labels"] = [
                    [tok if tok != tokenizer.pad_token_id else -100 for tok in seq]
                    for seq in labels
                ]
                return model_inputs

            dataset = Dataset.from_pandas(data)
            tokenized_dataset = dataset.map(preprocess_function, batched=True)

            # Split data into train/test (80/20).
            train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
            train_dataset = train_test_split["train"]
            eval_dataset = train_test_split["test"]

            # Define Trainer and training arguments. Evaluation and
            # checkpointing both happen per epoch so load_best_model_at_end
            # can pick the best checkpoint by eval loss.
            training_args = TrainingArguments(
                output_dir="./results",
                evaluation_strategy="epoch",
                learning_rate=5e-5,
                per_device_train_batch_size=16,
                per_device_eval_batch_size=16,
                num_train_epochs=3,
                weight_decay=0.01,
                logging_dir="./logs",
                save_strategy="epoch",
                load_best_model_at_end=True,
                push_to_hub=False,
            )
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=eval_dataset,
                tokenizer=tokenizer,
            )

            # Train button: blocks the Streamlit session while training runs.
            if st.button("Train the Model"):
                with st.spinner("Training in progress..."):
                    trainer.train()
                st.success("Training completed! The model is ready to use.")
        else:
            st.error("The uploaded CSV must have `input` and `output` columns.")
# Inference Tab
# Inference Tab: run the (possibly fine-tuned) shared model on free text.
with tab2:
    st.header("Normalize Text")
    # Input text for inference
    input_text = st.text_area("Enter the noisy text:", "")
    if st.button("Normalize"):
        if input_text.strip():
            with st.spinner("Generating normalized text..."):
                # Truncate to the same 128-token budget used at training
                # time so an arbitrarily long paste cannot exceed the
                # sequence length the model was trained with.
                input_ids = tokenizer(
                    input_text, return_tensors="pt", max_length=128, truncation=True
                ).input_ids.to(device)
                # Inference only — no autograd state needed.
                with torch.no_grad():
                    outputs = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
                normalized_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            st.write(f"Normalized Text: **{normalized_text}**")
        else:
            st.warning("Please enter some text for normalization.")