Spaces:
Running
Running
Commit ·
7fdce62
0
Parent(s):
Igala dataset Streamlit app
Browse files- app.py +87 -0
- data/igala_sentences.csv +12 -0
- hello.py +14 -0
- igala.txt +12 -0
- requirements.txt +5 -0
- week2_practice.py +39 -0
- week3_ml.py +67 -0
app.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Streamlit app for exploring an Igala text dataset.

Run with: python -m streamlit run app.py
(serves on http://localhost:8501 by default)
"""

import string
from collections import Counter

import pandas as pd
import streamlit as st
from sklearn.feature_extraction.text import CountVectorizer

# ---------------- Page config ----------------
st.set_page_config(
    page_title="Igala Dataset Explorer",
    layout="wide",
)

st.title("Igala Dataset Explorer")
st.write("Welcome! This app explores an Igala text dataset.")


# ---------------- Load data ----------------
@st.cache_data  # avoid re-reading the CSV on every widget interaction
def load_data(path: str) -> pd.DataFrame:
    """Read the sentence dataset (columns: sentence, length) from *path*."""
    return pd.read_csv(path)


try:
    df = load_data("data/igala_sentences.csv")
except FileNotFoundError:
    # Fail with a readable message instead of a raw traceback in the UI.
    st.error("data/igala_sentences.csv not found — run week2_practice.py first.")
    st.stop()

# ---------------- Preview ----------------
st.subheader("📄 Dataset Preview")
st.dataframe(df.head())

# ---------------- Statistics ----------------
st.subheader("📊 Dataset Statistics")

col1, col2, col3, col4 = st.columns(4)
col1.metric("Total Sentences", len(df))
col2.metric("Average Length", round(df["length"].mean(), 2))
col3.metric("Shortest", df["length"].min())
col4.metric("Longest", df["length"].max())

# ---------------- Word Frequency ----------------
st.subheader("🔤 Top 20 Most Common Words")

# Normalise the whole corpus: lowercase, strip ASCII punctuation.
text = " ".join(df["sentence"].astype(str)).lower()
text = text.translate(str.maketrans("", "", string.punctuation))

# Drop single-character tokens (mostly leftovers from punctuation stripping).
words = [w for w in text.split() if len(w) > 1]
word_counts = Counter(words)

# Counter.most_common already returns the top-k sorted by frequency,
# so no separate sort_values/head step is needed.
freq_df = pd.DataFrame(word_counts.most_common(20), columns=["word", "frequency"])
st.bar_chart(freq_df.set_index("word"))

# ---------------- Bigrams ----------------
st.subheader("Top Bigrams")

vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=10)
X = vectorizer.fit_transform(df["sentence"].astype(str))
bigrams = vectorizer.get_feature_names_out()

st.write(bigrams)

# ---------------- Sentence Length Distribution ----------------
st.subheader("📐 Sentence Length Distribution")
st.bar_chart(df["length"])

# ---------------- Sidebar Filter ----------------
st.sidebar.header("🔧 Filters")

min_len = st.sidebar.slider(
    "Minimum sentence length",
    int(df["length"].min()),   # slider lower bound
    int(df["length"].max()),   # slider upper bound
    int(df["length"].min()),   # default: show everything
)

filtered_df = df[df["length"] >= min_len]

st.subheader("Sentence Length Filter")
st.write(filtered_df)

# ---------------- Footer ----------------
st.markdown("---")
st.caption("Built with ❤️ using Python, Pandas & Streamlit")
|
data/igala_sentences.csv
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
sentence,length
|
| 2 |
+
jọshua,6
|
| 3 |
+
ọjọ jẹ ñwu jọshua koji mosẹs,28
|
| 4 |
+
1 oñ i lewatu abu ku mosẹs adu jihofa lẹ leku jihofa ka ñwu jọshua ọma nun ki ache ẹwñ ñwu mosẹs lẹ kakini,106
|
| 5 |
+
2 mosẹs adumi la leku mẹ todulẹ uwẹ ki kwanẹ kẹ daba jọrdani uwẹ kpai amonẹi chaka mẹ nyoj anẹ ku nadu ñwu ma owñ chamọma israẹli,129
|
| 6 |
+
3 ugbo du kadaba ẹrẹwẹ manẹ uwẹ omi du ñwu alu ku ka ñwu mosẹs lẹ,65
|
| 7 |
+
4 akwo ajẹtachi kpai anẹ lẹbanọn kidẹ gbogbo wohimini elile owñ chohimini yufretis kpai anẹ abo hẹt chaka gbogbo wohimini elile ọwọ olanẹ elẹ chalọmẹwẹ,151
|
| 8 |
+
5 i nẹnẹ du ki aneke gbọna nyuwẹ ọjọ du kudejuwẹ kade n alu ku dugbo mosẹs alu lẹ nadugbo wẹ gẹ nawe ehi iñmi kẹ n nahi uwẹ tinyọ n,131
|
| 9 |
+
6 feju gbiti kẹ fẹdọ nẹ todu uwẹ ajẹ ñw amonẹi jẹ anẹ ku gọ ñw amatama ka ku nadu ñwu mai ẹwñ ogu,97
|
| 10 |
+
7 feju gbiti kate kẹ fẹdọ nẹ nana kẹ fejutẹ kẹ che teju alu du kukọọla ki mosẹs adumi ñwa ñw ẹ lẹ ka ẹ ki rida bọ tẹ tọwọ awọtọ abẹk awohi n todu ki ra ñw ẹ ugbo du kuwẹ anyi,174
|
| 11 |
+
8 ọla ọtakada ukọọlai akwo buwẹ alu tẹ n ama ẹ nayayewñ tọrọka todu todu kuwẹ ki dejutẹ kẹ che teju alu du ku ma kọrubutu efuwñ todu ẹgbalẹ ẹwñwẹ kẹ che anawo ñw ẹ ẹgbalẹ tak i ara ñw ẹ,185
|
| 12 |
+
9 i chomi ñwa ñw ẹ n ẹ feju gbiti kẹ la fẹdọ nẹ ẹ ki chokpo n ẹdọwẹ ki da kpu n todu jihofa ọjọwẹ dugbo wẹ ugbo du kuwẹ anyi,124
|
hello.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Print the 20 most frequent words of the Igala corpus in igala.txt."""

import string
from collections import Counter

# Read the raw Igala corpus.
with open("igala.txt", "r", encoding="utf-8") as fh:
    raw = fh.read()

# Normalise: lowercase, drop the modifier apostrophe, strip punctuation.
strip_punct = str.maketrans("", "", string.punctuation)
cleaned = raw.lower().replace("ʼ", "").translate(strip_punct)

# Keep only tokens longer than one character, count, and report the top 20.
tokens = [tok for tok in cleaned.split() if len(tok) > 1]
print(Counter(tokens).most_common(20))
|
igala.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Jọshua
|
| 2 |
+
1
|
| 3 |
+
Ọjọ jẹ ñwu Jọshua koji Mosẹs
|
| 4 |
+
1 Oñ i lewatu abu ku Mosẹs, adu Jihofa lẹ, leku, Jihofa ka ñwu Jọshua, ọma Nun kʼi ache ẹwñ ñwu Mosẹs lẹ, kakini,
|
| 5 |
+
2 Mosẹs adu-mi la leku mẹ; todulẹ uwẹ ki kwanẹ kʼẹ daba Jọrdan-i, uwẹ kpai amonẹ-i chaka, mẹ nyʼoj anẹ kʼu nadu ñwu ma, owñ chʼamọma Israẹl-i.
|
| 6 |
+
3 Ugbo du kʼadaba ẹrẹ-wẹ mʼanẹ, uwẹ omi du ñwu, alu kʼu ka ñwu Mosẹs lẹ.
|
| 7 |
+
4 Akwo ajẹtachi kpai anẹ Lẹbanọn kidẹ, gbogbo wʼohimini elile, owñ chʼohimini Yufretis, kpai anẹ abo Hẹt chaka gbogbo wʼOhimini Elile ọwọ olanẹ, e-lẹ chʼalọmẹ-wẹ.
|
| 8 |
+
5 I nʼẹnẹ du kʼi aneke gbʼọna nyʼuwẹ ọjọ du kʼudeju-wẹ kade n; alu kʼu dʼugbo Mosẹs, alu lẹ nadʼugbo wẹ gẹ; nawe ehi iñmi kʼẹ n, nahi uwẹ tinyọ n.
|
| 9 |
+
6 Fʼeju gbiti kʼẹ fʼẹdọ nẹ; todu uwẹ ajẹ ñw amonẹ-i jẹ anẹ kʼu gọ ñw amata-ma ka kʼu nadu ñwu ma-i ẹwñ ogu.
|
| 10 |
+
7 Fʼeju gbiti kate, kʼẹ fʼẹdọ nẹ nana, kʼẹ fejutẹ kʼẹ che teju alu du kʼukọọla ki Mosẹs, adu-mi, ñwa ñw ẹ lẹ ka; ẹ ki rida bʼọ tẹ tʼọwọ awọtọ abẹk awohi n, todu ki ra ñw ẹ ugbo du kʼuwẹ anyi.
|
| 11 |
+
8 Ọla ọtakada ukọọla-i akwo bʼuwẹ alu tẹ n, ama ẹ nayaye-wñ tʼọrọka tʼodu, todu kʼuwẹ ki dejutẹ kʼẹ che teju alu du ku ma kʼọrubutu efu-wñ; todu ẹgbalẹ ẹwñ-wẹ kʼẹ che anawo ñw ẹ, ẹgbalẹ tak i ara ñw ẹ.
|
| 12 |
+
9 I chʼomi ñwa ñw ẹ n? Ẹ fʼeju gbiti kʼẹ la fʼẹdọ nẹ; ẹ ki chokpo n, ẹdọ-wẹ ki da kpu n; todu Jihofa Ọjọ-wẹ dʼugbo wẹ ugbo du kʼuwẹ anyi.
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
pandas
|
| 3 |
+
numpy
|
| 4 |
+
scikit-learn
|
| 5 |
+
matplotlib
|
week2_practice.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Clean the raw Igala text into a sentence-level CSV dataset.

Reads igala.txt, normalises the text, splits it into sentences (one per
line in the source file), and writes data/igala_sentences.csv with a
character-length column.
"""

import string
from pathlib import Path

import pandas as pd

# 1. Read raw text
with open("igala.txt", "r", encoding="utf-8") as f:
    text = f.read()

# 2. Basic cleaning: lowercase, drop the modifier apostrophe used in the
#    source orthography, then strip ASCII punctuation.
text = text.lower()
text = text.replace("ʼ", "")
text = text.translate(str.maketrans("", "", string.punctuation))

# 3. Split into sentences (the source file has one sentence per line)
sentences = text.splitlines()

# 4. Remove empty lines
sentences = [s for s in sentences if s.strip() != ""]

# 5. Create DataFrame
df = pd.DataFrame({"sentence": sentences})

# 6. Add sentence length (characters, including spaces)
df["length"] = df["sentence"].str.len()

# 7. Filter very short sentences (headings / bare chapter numbers)
df = df[df["length"] > 3]

print(df.head())
print("Total sentences:", len(df))

# Quick exploratory summary of the numeric columns.
print(df.describe())

df = df.reset_index(drop=True)

# 8. Save cleaned dataset. Create data/ first: to_csv raises OSError on a
#    fresh checkout where the directory does not yet exist.
Path("data").mkdir(exist_ok=True)
df.to_csv("data/igala_sentences.csv", index=False)
print("Saved to data/igala_sentences.csv")
|
week3_ml.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Week 3: EDA plots and a first (toy) classifier on the Igala dataset.

NOTE: every row currently gets the same label (1), so the train/test
split, the Naive Bayes fit, and the reported accuracy are only a
pipeline smoke test — the model cannot learn anything until a second
class (e.g. non-Igala sentences) is added.
"""

# All imports consolidated at the top of the file (they were previously
# scattered between code sections).
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Load cleaned dataset
df = pd.read_csv("data/igala_sentences.csv")

print(df.head())
print(df.info())

# Plot sentence length distribution
plt.hist(df["length"], bins=10)
plt.title("Sentence Length Distribution")
plt.xlabel("Sentence Length")
plt.ylabel("Number of Sentences")
plt.show()

# Boxplot of the same column, to spot outliers
plt.figure()
plt.boxplot(df["length"])
plt.title("Sentence Length Boxplot")
plt.ylabel("Length")
plt.show()

print("Shortest sentence length:", df["length"].min())
print("Longest sentence length:", df["length"].max())
print("Average sentence length:", df["length"].mean())

# Create labels — all rows are Igala for now, so this is a single-class
# placeholder (see module docstring: metrics below are not meaningful).
df["label"] = 1
X = df["sentence"]  # input text
y = df["label"]     # output label

# TF-IDF unigrams + bigrams, capped at 500 features
vectorizer = TfidfVectorizer(
    max_features=500,
    ngram_range=(1, 2),
)

X_vec = vectorizer.fit_transform(X)

print("Vectorized shape:", X_vec.shape)

X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.3, random_state=42
)

print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])

model = MultinomialNB()
model.fit(X_train, y_train)

print("Model training complete")

y_pred = model.predict(X_test)

# With one class, accuracy is trivially 1.0 and the confusion matrix is 1x1.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
|