Spaces:

duyguerisken
/

NLP

Sleeping

App Files Community

duyguerisken committed on Feb 27

Commit

d8ac5e3

verified ·

1 Parent(s): 91386b9

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +63 -38

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,65 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

 import streamlit as st
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+import neattext.functions as nfx
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.decomposition import LatentDirichletAllocation
+import nltk
+# Download the NLTK data packages at startup (required for the app to run on HF).
+for resource in ('punkt', 'wordnet', 'stopwords'):
+    nltk.download(resource)
+# Page configuration, followed by the title and a one-line description.
+st.set_page_config(page_title="Konu Modelleme Analizi", layout="wide")
+st.title("📂 Konu Modelleme (Topic Modeling) Analizi")
+st.markdown("Bu uygulama, metin veri kümeleri içerisindeki gizli temaları tespit eder.")
+# 1. Data loading
+@st.cache_data
+def load_data():
+    """Read the articles CSV into a DataFrame; the result is cached by Streamlit."""
+    # Read with latin1 encoding, the same way the notebook does.
+    return pd.read_csv("src/articles.csv", encoding='latin1')
+# Main app flow. Only the CSV load sits inside the try block, so that a
+# FileNotFoundError raised by any later step is not misreported as a
+# missing dataset.
+try:
+    df = load_data()
+except FileNotFoundError:
+    st.error("Lütfen 'articles.csv' dosyasını uygulama klasörüne ekleyin.")
+else:
+    st.success("Veri seti başarıyla yüklendi!")
+    # 2. Optional preprocessing (same cleaning steps as the notebook):
+    # strip punctuation, then remove English stop words.
+    # The model trains on the raw 'Article' column unless cleaning is enabled.
+    text_column = 'Article'
+    if st.checkbox("Veriyi Ön İşlemden Geçir (Cleaning)"):
+        with st.spinner("Metinler temizleniyor..."):
+            df['Processed_Article'] = df['Article'].apply(nfx.remove_punctuations)
+            df['Processed_Article'] = df['Processed_Article'].apply(lambda x: nfx.remove_stopwords(x, lang='english'))
+            st.write(df[['Article', 'Processed_Article']].head())
+        # Feed the cleaned text to the model so the checkbox actually
+        # influences the topics that are found.
+        text_column = 'Processed_Article'
+    # 3. LDA topic modelling
+    st.sidebar.header("Model Ayarları")
+    n_topics = st.sidebar.slider("Konu Sayısı", min_value=2, max_value=15, value=10)
+    if st.button("Modeli Eğit ve Konuları Bul"):
+        vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
+        doc_term_matrix = vectorizer.fit_transform(df[text_column])
+        lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
+        lda.fit(doc_term_matrix)
+        # Visualise the topics (same chart logic as the notebook):
+        # one bar chart per topic showing its seven highest-weighted terms.
+        st.subheader(f"Belirlenen {n_topics} Konu ve Anahtar Kelimeler")
+        feature_names = vectorizer.get_feature_names_out()
+        for topic_number, topic in enumerate(lda.components_, start=1):
+            # argsort is ascending: take the last seven and reverse them
+            # so the heaviest term comes first.
+            top_words_indices = topic.argsort()[-7:][::-1]
+            top_words = [feature_names[i] for i in top_words_indices]
+            top_weights = topic[top_words_indices]
+            fig, ax = plt.subplots(figsize=(8, 4))
+            sns.barplot(x=top_weights, y=top_words, palette="viridis", ax=ax)
+            ax.set_title(f"Konu {topic_number}")
+            st.pyplot(fig)
+            # Close the figure once rendered so figures do not accumulate
+            # in memory across Streamlit reruns.
+            plt.close(fig)