Spaces:

Faruna01
/

igala-streamlit-app

Running

Igala dataset Streamlit app

7fdce62 4 months ago

1.56 kB

	import pandas as pd

	# Load the cleaned sentence dataset (path is relative to the working directory).
	df = pd.read_csv("data/igala_sentences.csv")

	# Quick sanity check: preview the first rows, then the column summary.
	print(df.head())
	# DataFrame.info() writes its report to stdout itself and returns None, so it
	# is called directly; wrapping it in print() would emit a stray "None" line.
	df.info()

	import matplotlib.pyplot as plt

	# Both charts and the summary figures below read the same "length" column.
	sentence_lengths = df["length"]

	# Histogram of the sentence lengths, grouped into 10 bins.
	plt.hist(sentence_lengths, bins=10)
	plt.xlabel("Sentence Length")
	plt.ylabel("Number of Sentences")
	plt.title("Sentence Length Distribution")
	plt.show()

	# Boxplot of the same values, drawn on a fresh figure.
	plt.figure()
	plt.boxplot(sentence_lengths)
	plt.ylabel("Length")
	plt.title("Sentence Length Boxplot")
	plt.show()

	# Minimum, maximum and mean of the length column.
	print("Shortest sentence length:", sentence_lengths.min())
	print("Longest sentence length:", sentence_lengths.max())
	print("Average sentence length:", sentence_lengths.mean())

	# Create labels (all Igala for now)
	# NOTE(review): every row receives the same class, so any classifier trained on
	# `y` sees a single label until rows from another class are added.
	df["label"] = 1
	X = df["sentence"]  # input text
	y = df["label"]  # output label

	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.model_selection import train_test_split

	# Split the raw sentences BEFORE vectorizing so the TF-IDF vocabulary and IDF
	# weights are learned from the training rows only (no test-set leakage).
	X_train_text, X_test_text, y_train, y_test = train_test_split(
	    X, y, test_size=0.3, random_state=42
	)

	# Unigrams and bigrams, capped at 500 features.
	vectorizer = TfidfVectorizer(
	    max_features=500,
	    ngram_range=(1, 2),
	)

	# Fit on the training text, then reuse the fitted vocabulary for the test text.
	X_train = vectorizer.fit_transform(X_train_text)
	X_test = vectorizer.transform(X_test_text)

	# Shape of the vectorized training matrix (rows, features).
	print("Vectorized shape:", X_train.shape)

	print("Training samples:", X_train.shape[0])
	print("Testing samples:", X_test.shape[0])

	from sklearn.naive_bayes import MultinomialNB

	# Build the multinomial Naive Bayes classifier and train it in one step;
	# fit() returns the fitted estimator, which is kept as `model`.
	model = MultinomialNB().fit(X_train, y_train)

	print("Model training complete")

	from sklearn.metrics import accuracy_score, confusion_matrix

	# Predict on the held-out rows, then score the predictions against y_test.
	y_pred = model.predict(X_test)
	accuracy = accuracy_score(y_test, y_pred)
	matrix = confusion_matrix(y_test, y_pred)

	print("Accuracy:", accuracy)
	# Heading and matrix go on separate lines, as two consecutive prints would.
	print("Confusion Matrix:", matrix, sep="\n")