Spaces:
Running
Running
Commit ·
7fdce62
0
Parent(s):
Igala dataset Streamlit app
Browse files- app.py +87 -0
- data/igala_sentences.csv +12 -0
- hello.py +14 -0
- igala.txt +12 -0
- requirements.txt +5 -0
- week2_practice.py +39 -0
- week3_ml.py +67 -0
app.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Streamlit app for exploring an Igala text dataset.

Run with: python -m streamlit run app.py
(serves on http://localhost:8501 by default)
"""

import string
from collections import Counter

import pandas as pd
import streamlit as st
from sklearn.feature_extraction.text import CountVectorizer

# ---------------- Page config ----------------
st.set_page_config(
    page_title="Igala Dataset Explorer",
    layout="wide",
)

st.title("Igala Dataset Explorer")
st.write("Welcome! This app explores an Igala text dataset.")


# ---------------- Load data ----------------
@st.cache_data  # avoid re-reading the CSV on every widget interaction
def load_data(path: str) -> pd.DataFrame:
    """Read the sentence dataset (columns: sentence, length) from *path*."""
    return pd.read_csv(path)


try:
    df = load_data("data/igala_sentences.csv")
except FileNotFoundError:
    # Fail with a readable message instead of a raw traceback in the UI.
    st.error("data/igala_sentences.csv not found — run week2_practice.py first.")
    st.stop()

# ---------------- Preview ----------------
st.subheader("📄 Dataset Preview")
st.dataframe(df.head())

# ---------------- Statistics ----------------
st.subheader("📊 Dataset Statistics")

col1, col2, col3, col4 = st.columns(4)
col1.metric("Total Sentences", len(df))
col2.metric("Average Length", round(df["length"].mean(), 2))
col3.metric("Shortest", df["length"].min())
col4.metric("Longest", df["length"].max())

# ---------------- Word Frequency ----------------
st.subheader("🔤 Top 20 Most Common Words")

# Normalise the whole corpus: lowercase, strip ASCII punctuation.
text = " ".join(df["sentence"].astype(str)).lower()
text = text.translate(str.maketrans("", "", string.punctuation))

# Drop single-character tokens (mostly leftovers from punctuation stripping).
words = [w for w in text.split() if len(w) > 1]
word_counts = Counter(words)

# Counter.most_common already returns the top-k sorted by frequency,
# so no separate sort_values/head step is needed.
freq_df = pd.DataFrame(word_counts.most_common(20), columns=["word", "frequency"])
st.bar_chart(freq_df.set_index("word"))

# ---------------- Bigrams ----------------
st.subheader("Top Bigrams")

vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=10)
X = vectorizer.fit_transform(df["sentence"].astype(str))
bigrams = vectorizer.get_feature_names_out()

st.write(bigrams)

# ---------------- Sentence Length Distribution ----------------
st.subheader("📐 Sentence Length Distribution")
st.bar_chart(df["length"])

# ---------------- Sidebar Filter ----------------
st.sidebar.header("🔧 Filters")

min_len = st.sidebar.slider(
    "Minimum sentence length",
    int(df["length"].min()),   # slider lower bound
    int(df["length"].max()),   # slider upper bound
    int(df["length"].min()),   # default: show everything
)

filtered_df = df[df["length"] >= min_len]

st.subheader("Sentence Length Filter")
st.write(filtered_df)

# ---------------- Footer ----------------
st.markdown("---")
st.caption("Built with ❤️ using Python, Pandas & Streamlit")
|
data/igala_sentences.csv
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
sentence,length
|
| 2 |
+
jọshua,6
|
| 3 |
+
ọjọ jẹ ñwu jọshua koji mosẹs,28
|
| 4 |
+
1 oñ i lewatu abu ku mosẹs adu jihofa lẹ leku jihofa ka ñwu jọshua ọma nun ki ache ẹwñ ñwu mosẹs lẹ kakini,106
|
| 5 |
+
2 mosẹs adumi la leku mẹ todulẹ uwẹ ki kwanẹ kẹ daba jọrdani uwẹ kpai amonẹi chaka mẹ nyoj anẹ ku nadu ñwu ma owñ chamọma israẹli,129
|
| 6 |
+
3 ugbo du kadaba ẹrẹwẹ manẹ uwẹ omi du ñwu alu ku ka ñwu mosẹs lẹ,65
|
| 7 |
+
4 akwo ajẹtachi kpai anẹ lẹbanọn kidẹ gbogbo wohimini elile owñ chohimini yufretis kpai anẹ abo hẹt chaka gbogbo wohimini elile ọwọ olanẹ elẹ chalọmẹwẹ,151
|
| 8 |
+
5 i nẹnẹ du ki aneke gbọna nyuwẹ ọjọ du kudejuwẹ kade n alu ku dugbo mosẹs alu lẹ nadugbo wẹ gẹ nawe ehi iñmi kẹ n nahi uwẹ tinyọ n,131
|
| 9 |
+
6 feju gbiti kẹ fẹdọ nẹ todu uwẹ ajẹ ñw amonẹi jẹ anẹ ku gọ ñw amatama ka ku nadu ñwu mai ẹwñ ogu,97
|
| 10 |
+
7 feju gbiti kate kẹ fẹdọ nẹ nana kẹ fejutẹ kẹ che teju alu du kukọọla ki mosẹs adumi ñwa ñw ẹ lẹ ka ẹ ki rida bọ tẹ tọwọ awọtọ abẹk awohi n todu ki ra ñw ẹ ugbo du kuwẹ anyi,174
|
| 11 |
+
8 ọla ọtakada ukọọlai akwo buwẹ alu tẹ n ama ẹ nayayewñ tọrọka todu todu kuwẹ ki dejutẹ kẹ che teju alu du ku ma kọrubutu efuwñ todu ẹgbalẹ ẹwñwẹ kẹ che anawo ñw ẹ ẹgbalẹ tak i ara ñw ẹ,185
|
| 12 |
+
9 i chomi ñwa ñw ẹ n ẹ feju gbiti kẹ la fẹdọ nẹ ẹ ki chokpo n ẹdọwẹ ki da kpu n todu jihofa ọjọwẹ dugbo wẹ ugbo du kuwẹ anyi,124
|
hello.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Print the 20 most frequent words of the Igala corpus in igala.txt."""

import string
from collections import Counter

# Read the raw Igala corpus.
with open("igala.txt", "r", encoding="utf-8") as fh:
    raw = fh.read()

# Normalise: lowercase, drop the modifier apostrophe, strip punctuation.
strip_punct = str.maketrans("", "", string.punctuation)
cleaned = raw.lower().replace("ʼ", "").translate(strip_punct)

# Keep only tokens longer than one character, count, and report the top 20.
tokens = [tok for tok in cleaned.split() if len(tok) > 1]
print(Counter(tokens).most_common(20))
|
igala.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Jọshua
|
| 2 |
+
1
|
| 3 |
+
Ọjọ jẹ ñwu Jọshua koji Mosẹs
|
| 4 |
+
1 Oñ i lewatu abu ku Mosẹs, adu Jihofa lẹ, leku, Jihofa ka ñwu Jọshua, ọma Nun kʼi ache ẹwñ ñwu Mosẹs lẹ, kakini,
|
| 5 |
+
2 Mosẹs adu-mi la leku mẹ; todulẹ uwẹ ki kwanẹ kʼẹ daba Jọrdan-i, uwẹ kpai amonẹ-i chaka, mẹ nyʼoj anẹ kʼu nadu ñwu ma, owñ chʼamọma Israẹl-i.
|
| 6 |
+
3 Ugbo du kʼadaba ẹrẹ-wẹ mʼanẹ, uwẹ omi du ñwu, alu kʼu ka ñwu Mosẹs lẹ.
|
| 7 |
+
4 Akwo ajẹtachi kpai anẹ Lẹbanọn kidẹ, gbogbo wʼohimini elile, owñ chʼohimini Yufretis, kpai anẹ abo Hẹt chaka gbogbo wʼOhimini Elile ọwọ olanẹ, e-lẹ chʼalọmẹ-wẹ.
|
| 8 |
+
5 I nʼẹnẹ du kʼi aneke gbʼọna nyʼuwẹ ọjọ du kʼudeju-wẹ kade n; alu kʼu dʼugbo Mosẹs, alu lẹ nadʼugbo wẹ gẹ; nawe ehi iñmi kʼẹ n, nahi uwẹ tinyọ n.
|
| 9 |
+
6 Fʼeju gbiti kʼẹ fʼẹdọ nẹ; todu uwẹ ajẹ ñw amonẹ-i jẹ anẹ kʼu gọ ñw amata-ma ka kʼu nadu ñwu ma-i ẹwñ ogu.
|
| 10 |
+
7 Fʼeju gbiti kate, kʼẹ fʼẹdọ nẹ nana, kʼẹ fejutẹ kʼẹ che teju alu du kʼukọọla ki Mosẹs, adu-mi, ñwa ñw ẹ lẹ ka; ẹ ki rida bʼọ tẹ tʼọwọ awọtọ abẹk awohi n, todu ki ra ñw ẹ ugbo du kʼuwẹ anyi.
|
| 11 |
+
8 Ọla ọtakada ukọọla-i akwo bʼuwẹ alu tẹ n, ama ẹ nayaye-wñ tʼọrọka tʼodu, todu kʼuwẹ ki dejutẹ kʼẹ che teju alu du ku ma kʼọrubutu efu-wñ; todu ẹgbalẹ ẹwñ-wẹ kʼẹ che anawo ñw ẹ, ẹgbalẹ tak i ara ñw ẹ.
|
| 12 |
+
9 I chʼomi ñwa ñw ẹ n? Ẹ fʼeju gbiti kʼẹ la fʼẹdọ nẹ; ẹ ki chokpo n, ẹdọ-wẹ ki da kpu n; todu Jihofa Ọjọ-wẹ dʼugbo wẹ ugbo du kʼuwẹ anyi.
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
pandas
|
| 3 |
+
numpy
|
| 4 |
+
scikit-learn
|
| 5 |
+
matplotlib
|
week2_practice.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Clean the raw Igala text into a sentence-level CSV dataset.

Reads igala.txt, normalises the text, splits it into sentences (one per
line in the source file), and writes data/igala_sentences.csv with a
character-length column.
"""

import string
from pathlib import Path

import pandas as pd

# 1. Read raw text
with open("igala.txt", "r", encoding="utf-8") as f:
    text = f.read()

# 2. Basic cleaning: lowercase, drop the modifier apostrophe used in the
#    source orthography, then strip ASCII punctuation.
text = text.lower()
text = text.replace("ʼ", "")
text = text.translate(str.maketrans("", "", string.punctuation))

# 3. Split into sentences (the source file has one sentence per line)
sentences = text.splitlines()

# 4. Remove empty lines
sentences = [s for s in sentences if s.strip() != ""]

# 5. Create DataFrame
df = pd.DataFrame({"sentence": sentences})

# 6. Add sentence length (characters, including spaces)
df["length"] = df["sentence"].str.len()

# 7. Filter very short sentences (headings / bare chapter numbers)
df = df[df["length"] > 3]

print(df.head())
print("Total sentences:", len(df))

# Quick exploratory summary of the numeric columns.
print(df.describe())

df = df.reset_index(drop=True)

# 8. Save cleaned dataset. Create data/ first: to_csv raises OSError on a
#    fresh checkout where the directory does not yet exist.
Path("data").mkdir(exist_ok=True)
df.to_csv("data/igala_sentences.csv", index=False)
print("Saved to data/igala_sentences.csv")
|
week3_ml.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Week 3: EDA plots and a first (toy) classifier on the Igala dataset.

NOTE: every row currently gets the same label (1), so the train/test
split, the Naive Bayes fit, and the reported accuracy are only a
pipeline smoke test — the model cannot learn anything until a second
class (e.g. non-Igala sentences) is added.
"""

# All imports consolidated at the top of the file (they were previously
# scattered between code sections).
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Load cleaned dataset
df = pd.read_csv("data/igala_sentences.csv")

print(df.head())
print(df.info())

# Plot sentence length distribution
plt.hist(df["length"], bins=10)
plt.title("Sentence Length Distribution")
plt.xlabel("Sentence Length")
plt.ylabel("Number of Sentences")
plt.show()

# Boxplot of the same column, to spot outliers
plt.figure()
plt.boxplot(df["length"])
plt.title("Sentence Length Boxplot")
plt.ylabel("Length")
plt.show()

print("Shortest sentence length:", df["length"].min())
print("Longest sentence length:", df["length"].max())
print("Average sentence length:", df["length"].mean())

# Create labels — all rows are Igala for now, so this is a single-class
# placeholder (see module docstring: metrics below are not meaningful).
df["label"] = 1
X = df["sentence"]  # input text
y = df["label"]     # output label

# TF-IDF unigrams + bigrams, capped at 500 features
vectorizer = TfidfVectorizer(
    max_features=500,
    ngram_range=(1, 2),
)

X_vec = vectorizer.fit_transform(X)

print("Vectorized shape:", X_vec.shape)

X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.3, random_state=42
)

print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])

model = MultinomialNB()
model.fit(X_train, y_train)

print("Model training complete")

y_pred = model.predict(X_test)

# With one class, accuracy is trivially 1.0 and the confusion matrix is 1x1.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
|