Faruna01 committed on
Commit
7fdce62
·
0 Parent(s):

Igala dataset Streamlit app

Browse files
Files changed (7) hide show
  1. app.py +87 -0
  2. data/igala_sentences.csv +12 -0
  3. hello.py +14 -0
  4. igala.txt +12 -0
  5. requirements.txt +5 -0
  6. week2_practice.py +39 -0
  7. week3_ml.py +67 -0
app.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from collections import Counter
4
+ import string
5
+ from sklearn.feature_extraction.text import CountVectorizer
6
+
7
+ # ---------------- Page config ----------------
8
+ st.set_page_config(
9
+ page_title="Igala Dataset Explorer",
10
+ layout="wide"
11
+ )
12
+
13
+ st.title("Igala Dataset Explorer")
14
+ st.write("Welcome! This app explores an Igala text dataset.")
15
+
16
+ # ---------------- Load data ----------------
17
+ df = pd.read_csv("data/igala_sentences.csv")
18
+
19
+ # ---------------- Preview ----------------
20
+ st.subheader("📄 Dataset Preview")
21
+ st.dataframe(df.head())
22
+
23
+ # ---------------- Statistics ----------------
24
+ st.subheader("📊 Dataset Statistics")
25
+
26
+ col1, col2, col3, col4 = st.columns(4)
27
+ col1.metric("Total Sentences", len(df))
28
+ col2.metric("Average Length", round(df["length"].mean(), 2))
29
+ col3.metric("Shortest", df["length"].min())
30
+ col4.metric("Longest", df["length"].max())
31
+
32
+ # ---------------- Word Frequency ----------------
33
+ st.subheader("🔤 Top 20 Most Common Words")
34
+
35
+ text = " ".join(df["sentence"].astype(str))
36
+ text = text.lower()
37
+ text = text.translate(str.maketrans("", "", string.punctuation))
38
+
39
+ words = text.split()
40
+ words = [w for w in words if len(w) > 1]
41
+
42
+ # ✅ THIS WAS MISSING
43
+ word_counts = Counter(words)
44
+
45
+ word_freq_df = (
46
+ pd.DataFrame(word_counts.items(), columns=["word", "frequency"])
47
+ .sort_values(by="frequency", ascending=False)
48
+ )
49
+
50
+ freq_df = word_freq_df.head(20)
51
+ st.bar_chart(freq_df.set_index("word"))
52
+
53
+ # ---------------- Bigrams ----------------
54
+ st.subheader("Top Bigrams")
55
+
56
+ vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=10)
57
+ X = vectorizer.fit_transform(df["sentence"].astype(str))
58
+ bigrams = vectorizer.get_feature_names_out()
59
+
60
+ st.write(bigrams)
61
+
62
+ # ---------------- Sentence Length Distribution ----------------
63
+ st.subheader("📐 Sentence Length Distribution")
64
+ st.bar_chart(df["length"])
65
+
66
+ # ---------------- Sidebar Filter ----------------
67
+ st.sidebar.header("🔧 Filters")
68
+
69
+ min_len = st.sidebar.slider(
70
+ "Minimum sentence length",
71
+ int(df["length"].min()),
72
+ int(df["length"].max()),
73
+ int(df["length"].min())
74
+ )
75
+
76
+ filtered_df = df[df["length"] >= min_len]
77
+
78
+ st.subheader("Sentence Length Filter")
79
+ st.write(filtered_df)
80
+
81
+ # ---------------- Footer ----------------
82
+ st.markdown("---")
83
+ st.caption("Built with ❤️ using Python, Pandas & Streamlit")
84
+
85
+ #python -m streamlit run app.py
86
+ #Local URL: http://localhost:8501
87
+ # Network URL: http://192.168.0.191:8501
data/igala_sentences.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ sentence,length
2
+ jọshua,6
3
+ ọjọ jẹ ñwu jọshua koji mosẹs,28
4
+ 1 oñ i lewatu abu ku mosẹs adu jihofa lẹ leku jihofa ka ñwu jọshua ọma nun ki ache ẹwñ ñwu mosẹs lẹ kakini,106
5
+ 2 mosẹs adumi la leku mẹ todulẹ uwẹ ki kwanẹ kẹ daba jọrdani uwẹ kpai amonẹi chaka mẹ nyoj anẹ ku nadu ñwu ma owñ chamọma israẹli,129
6
+ 3 ugbo du kadaba ẹrẹwẹ manẹ uwẹ omi du ñwu alu ku ka ñwu mosẹs lẹ,65
7
+ 4 akwo ajẹtachi kpai anẹ lẹbanọn kidẹ gbogbo wohimini elile owñ chohimini yufretis kpai anẹ abo hẹt chaka gbogbo wohimini elile ọwọ olanẹ elẹ chalọmẹwẹ,151
8
+ 5 i nẹnẹ du ki aneke gbọna nyuwẹ ọjọ du kudejuwẹ kade n alu ku dugbo mosẹs alu lẹ nadugbo wẹ gẹ nawe ehi iñmi kẹ n nahi uwẹ tinyọ n,131
9
+ 6 feju gbiti kẹ fẹdọ nẹ todu uwẹ ajẹ ñw amonẹi jẹ anẹ ku gọ ñw amatama ka ku nadu ñwu mai ẹwñ ogu,97
10
+ 7 feju gbiti kate kẹ fẹdọ nẹ nana kẹ fejutẹ kẹ che teju alu du kukọọla ki mosẹs adumi ñwa ñw ẹ lẹ ka ẹ ki rida bọ tẹ tọwọ awọtọ abẹk awohi n todu ki ra ñw ẹ ugbo du kuwẹ anyi,174
11
+ 8 ọla ọtakada ukọọlai akwo buwẹ alu tẹ n ama ẹ nayayewñ tọrọka todu todu kuwẹ ki dejutẹ kẹ che teju alu du ku ma kọrubutu efuwñ todu ẹgbalẹ ẹwñwẹ kẹ che anawo ñw ẹ ẹgbalẹ tak i ara ñw ẹ,185
12
+ 9 i chomi ñwa ñw ẹ n ẹ feju gbiti kẹ la fẹdọ nẹ ẹ ki chokpo n ẹdọwẹ ki da kpu n todu jihofa ọjọwẹ dugbo wẹ ugbo du kuwẹ anyi,124
hello.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import string
from collections import Counter

# Quick word-frequency check over the raw Igala scripture text.
with open("igala.txt", "r", encoding="utf-8") as fh:
    raw = fh.read()

# Normalise: lowercase, drop the modifier apostrophe, strip ASCII punctuation.
cleaned = (
    raw.lower()
    .replace("ʼ", "")
    .translate(str.maketrans("", "", string.punctuation))
)

# Tokenise on whitespace; keep only tokens longer than one character
# (filters out verse numbers and stray letters).
tokens = [tok for tok in cleaned.split() if len(tok) > 1]

print(Counter(tokens).most_common(20))
igala.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Jọshua
2
+ 1
3
+ Ọjọ jẹ ñwu Jọshua koji Mosẹs
4
+ 1 Oñ i lewatu abu ku Mosẹs, adu Jihofa lẹ, leku, Jihofa ka ñwu Jọshua, ọma Nun kʼi ache ẹwñ ñwu Mosẹs lẹ, kakini,
5
+ 2 Mosẹs adu-mi la leku mẹ; todulẹ uwẹ ki kwanẹ kʼẹ daba Jọrdan-i, uwẹ kpai amonẹ-i chaka, mẹ nyʼoj anẹ kʼu nadu ñwu ma, owñ chʼamọma Israẹl-i.
6
+ 3 Ugbo du kʼadaba ẹrẹ-wẹ mʼanẹ, uwẹ omi du ñwu, alu kʼu ka ñwu Mosẹs lẹ.
7
+ 4 Akwo ajẹtachi kpai anẹ Lẹbanọn kidẹ, gbogbo wʼohimini elile, owñ chʼohimini Yufretis, kpai anẹ abo Hẹt chaka gbogbo wʼOhimini Elile ọwọ olanẹ, e-lẹ chʼalọmẹ-wẹ.
8
+ 5 I nʼẹnẹ du kʼi aneke gbʼọna nyʼuwẹ ọjọ du kʼudeju-wẹ kade n; alu kʼu dʼugbo Mosẹs, alu lẹ nadʼugbo wẹ gẹ; nawe ehi iñmi kʼẹ n, nahi uwẹ tinyọ n.
9
+ 6 Fʼeju gbiti kʼẹ fʼẹdọ nẹ; todu uwẹ ajẹ ñw amonẹ-i jẹ anẹ kʼu gọ ñw amata-ma ka kʼu nadu ñwu ma-i ẹwñ ogu.
10
+ 7 Fʼeju gbiti kate, kʼẹ fʼẹdọ nẹ nana, kʼẹ fejutẹ kʼẹ che teju alu du kʼukọọla ki Mosẹs, adu-mi, ñwa ñw ẹ lẹ ka; ẹ ki rida bʼọ tẹ tʼọwọ awọtọ abẹk awohi n, todu ki ra ñw ẹ ugbo du kʼuwẹ anyi.
11
+ 8 Ọla ọtakada ukọọla-i akwo bʼuwẹ alu tẹ n, ama ẹ nayaye-wñ tʼọrọka tʼodu, todu kʼuwẹ ki dejutẹ kʼẹ che teju alu du ku ma kʼọrubutu efu-wñ; todu ẹgbalẹ ẹwñ-wẹ kʼẹ che anawo ñw ẹ, ẹgbalẹ tak i ara ñw ẹ.
12
+ 9 I chʼomi ñwa ñw ẹ n? Ẹ fʼeju gbiti kʼẹ la fʼẹdọ nẹ; ẹ ki chokpo n, ẹdọ-wẹ ki da kpu n; todu Jihofa Ọjọ-wẹ dʼugbo wẹ ugbo du kʼuwẹ anyi.
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ numpy
4
+ scikit-learn
5
+ matplotlib
week2_practice.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import string

import pandas as pd

# Build the cleaned Igala sentence dataset (data/igala_sentences.csv)
# from the raw scripture text in igala.txt.

# 1. Read raw text
with open("igala.txt", "r", encoding="utf-8") as f:
    text = f.read()

# 2. Basic cleaning: lowercase, drop the modifier apostrophe, strip punctuation
text = text.lower()
text = text.replace("ʼ", "")
text = text.translate(str.maketrans("", "", string.punctuation))

# 3. Split into sentences (one per line)
sentences = text.splitlines()

# 4. Remove empty lines
sentences = [s for s in sentences if s.strip() != ""]

# 5. Create DataFrame
df = pd.DataFrame({"sentence": sentences})

# 6. Add sentence length (character count)
df["length"] = df["sentence"].str.len()

# 7. Filter very short sentences (headings / bare verse numbers)
df = df[df["length"] > 3]

print(df.head())
print("Total sentences:", len(df))

# Quick EDA: summary statistics of the length column
print(df.describe())

df = df.reset_index(drop=True)

# 8. Save cleaned dataset — create the target directory first so
#    to_csv doesn't raise FileNotFoundError on a fresh checkout.
os.makedirs("data", exist_ok=True)
df.to_csv("data/igala_sentences.csv", index=False)
print("Saved to data/igala_sentences.csv")
week3_ml.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Week 3: explore the cleaned Igala dataset, vectorize the sentences with
# TF-IDF, and fit a toy Naive Bayes classifier.

# Load cleaned dataset
df = pd.read_csv("data/igala_sentences.csv")

print(df.head())
# info() prints its report directly and returns None, so don't wrap it
# in print() (that would emit a spurious "None" line).
df.info()

# Plot sentence length distribution
plt.hist(df["length"], bins=10)
plt.title("Sentence Length Distribution")
plt.xlabel("Sentence Length")
plt.ylabel("Number of Sentences")
plt.show()

# Boxplot of sentence lengths
plt.figure()
plt.boxplot(df["length"])
plt.title("Sentence Length Boxplot")
plt.ylabel("Length")
plt.show()

print("Shortest sentence length:", df["length"].min())
print("Longest sentence length:", df["length"].max())
print("Average sentence length:", df["length"].mean())

# Create labels — every sentence is Igala for now, so this is a single-class
# placeholder until a second language is added for a real classification task.
df["label"] = 1
X = df["sentence"]  # input text
y = df["label"]     # output label

vectorizer = TfidfVectorizer(
    max_features=500,
    ngram_range=(1, 2)
)

X_vec = vectorizer.fit_transform(X)
print("Vectorized shape:", X_vec.shape)

X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.3, random_state=42
)
print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])

model = MultinomialNB()
model.fit(X_train, y_train)
print("Model training complete")

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))