Spaces:
Build error
Build error
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import re | |
| import string | |
| import joblib | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from wordcloud import WordCloud | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from sklearn.metrics import silhouette_score | |
# ---------------------------------------------------------------------------
# One-time setup
#
# Streamlit re-executes this script from top to bottom on every widget
# interaction, so anything expensive (network download, unpickling models,
# parsing the CSV) is wrapped in a cached loader and only runs once per
# server process instead of once per rerun.
# ---------------------------------------------------------------------------
@st.cache_resource
def _load_models():
    """Load the persisted K-Means model, LDA model and TF-IDF vectorizer.

    Returns:
        A ``(kmeans, lda, vectorizer)`` tuple read from the ``.pkl`` files
        that sit next to this script.
    """
    # NOTE(review): joblib.load unpickles arbitrary Python objects. These
    # files must only ever come from a trusted source.
    return (
        joblib.load("kmeans_fake_news.pkl"),
        joblib.load("lda_fake_news.pkl"),
        joblib.load("tfidf_vectorizer.pkl"),
    )


@st.cache_data
def _load_dataset(path):
    """Read the raw article CSV at *path* into a DataFrame."""
    return pd.read_csv(path)


@st.cache_resource
def _load_stop_words():
    """Download the NLTK stopword corpus if needed and return the English set."""
    nltk.download("stopwords")
    return set(stopwords.words("english"))


# Load models and vectorizer
kmeans, lda, vectorizer = _load_models()

# Load dataset
DATASET_URL = "https://www.kaggle.com/datasets/mrisdal/fake-news"
fake_df = _load_dataset("Fake.csv")

# Preprocessing
stop_words = _load_stop_words()
# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text):
    """Normalize raw article text before it is vectorized.

    The text is lowercased, then ASCII punctuation, digit runs and English
    stopwords are removed. Remaining words are re-joined with single spaces.

    Args:
        text: The raw text to clean (must be a ``str``).

    Returns:
        The cleaned text; an empty string if nothing survives the filtering.
    """
    text = text.lower()
    # Delete punctuation with a translation table rather than the regex
    # f"[{string.punctuation}]": in that unescaped character class the
    # sequence `\]` is read as an escaped bracket, so backslashes were
    # never actually removed.
    text = text.translate(_PUNCTUATION_TABLE)
    text = re.sub(r"\d+", "", text)  # Remove numbers
    # `stop_words` is the module-level set of English stopwords.
    return " ".join(word for word in text.split() if word not in stop_words)
@st.cache_resource
def _build_corpus_features():
    """Clean, vectorize and cluster the whole corpus exactly once.

    Streamlit reruns the script on every interaction; without caching, the
    per-article cleaning, the TF-IDF transform and the K-Means prediction
    would all be repeated each time a widget changes.

    Returns:
        A ``(corpus, features, vocabulary, topic_keywords)`` tuple:
        the DataFrame with ``content``, ``clean_text`` and ``cluster``
        columns added, the TF-IDF feature matrix, the vectorizer's feature
        names as an array, and one space-joined keyword string per LDA
        component.

    The returned objects are shared between reruns, so callers must treat
    them as read-only.
    """
    corpus = fake_df[['title', 'text']].dropna()
    corpus['content'] = corpus['title'] + " " + corpus['text']
    corpus['clean_text'] = corpus['content'].apply(clean_text)

    # Transform text into TF-IDF features
    features = vectorizer.transform(corpus['clean_text'])
    corpus['cluster'] = kmeans.predict(features)

    # Get top words for LDA topics: the ten highest-weighted terms of each
    # component, listed in ascending order of weight.
    vocabulary = np.array(vectorizer.get_feature_names_out())
    topic_keywords = [
        " ".join(vocabulary[np.argsort(topic)[-10:]]) for topic in lda.components_
    ]
    return corpus, features, vocabulary, topic_keywords


fake_df, X, words, top_words = _build_corpus_features()
# Sidebar Navigation
st.sidebar.title("Navigation")
page = st.sidebar.radio("Go to", ["Dataset", "Visualizations", "Model Info", "Model Metrics", "Predictor"])

# Markdown bodies are kept flush-left so their rendering does not depend on
# how the surrounding code is indented.
_MODEL_INFO_MARKDOWN = """
- **K-Means Clustering**: Used to group fake news articles into clusters based on their content similarity.
- **Latent Dirichlet Allocation (LDA)**: Used for topic modeling to extract the main topics from fake news articles.
- **TF-IDF Vectorizer**: Transforms the textual content into numerical features to be used by the models.
"""

_DATASET_MARKDOWN = """
The dataset contains **fake news articles** collected from multiple sources.
It includes titles, article texts, and publishing dates.
We use this dataset for **unsupervised clustering and topic modeling**.
"""


def _show_figure(fig):
    """Render a Matplotlib figure in the app and then release it.

    Figures created through ``plt.subplots()`` stay registered with pyplot
    until they are closed; since the script reruns on every interaction,
    leaving them open grows memory without bound.
    """
    st.pyplot(fig)
    plt.close(fig)


@st.cache_resource
def _corpus_wordcloud():
    """Build the word cloud over every cleaned article once and reuse it."""
    return WordCloud(width=800, height=400, background_color="white").generate(" ".join(fake_df['clean_text']))


@st.cache_data
def _cached_silhouette(_features, _labels):
    """Return the silhouette score for the given features and cluster labels.

    Both parameters carry a leading underscore so Streamlit does not hash
    them; the score is therefore computed once per server process, which is
    sufficient because the corpus and the models are fixed for its lifetime.
    """
    return silhouette_score(_features, _labels)


def _render_model_info():
    """Model Information page: short description of each model in use."""
    st.title("Model Information")
    st.write("### Machine Learning Models Used")
    st.markdown(_MODEL_INFO_MARKDOWN)


def _render_dataset():
    """Dataset page: description, raw/cleaned samples and a word cloud."""
    st.title("Fake News Topic Analyzer")
    st.write("### About the Dataset")
    st.markdown(_DATASET_MARKDOWN)
    # NOTE(review): the leading glyph below looks like a mis-encoded emoji;
    # it is kept as-is because the intended character cannot be recovered.
    st.write(f"๐ **Dataset Source:** [Kaggle: Fake News](<{DATASET_URL}>)")
    st.write("### Sample Data (Raw)")
    st.dataframe(fake_df[['title', 'text']].head())
    st.write("### Sample Data (Cleaned)")
    st.dataframe(fake_df[['clean_text']].head())
    st.write("### Word Cloud of Most Frequent Words")
    fig, ax = plt.subplots()
    ax.imshow(_corpus_wordcloud(), interpolation="bilinear")
    ax.axis("off")
    _show_figure(fig)


def _render_visualizations():
    """Visualizations page: cluster sizes and the keywords of each LDA topic."""
    st.title("Fake News Clustering & Topic Modeling")
    st.write("### Cluster Distribution")
    fig, ax = plt.subplots()
    sns.countplot(x=fake_df['cluster'], ax=ax, palette="viridis")
    ax.set_xlabel("Cluster")
    ax.set_ylabel("Number of Articles")
    _show_figure(fig)
    st.write("### Topic Words from LDA")
    # Distinct loop names: reusing `words` here would overwrite the
    # module-level vocabulary array.
    for topic_id, keywords in enumerate(top_words):
        st.write(f"**Topic {topic_id}:** {keywords}")


def _render_model_metrics():
    """Model Metrics page: silhouette score and sample articles per cluster."""
    st.title("Model Clustering Performance")
    sil_score = _cached_silhouette(X, fake_df['cluster'])
    st.write(f"### Silhouette Score (K-Means Clustering): **{sil_score:.4f}**")
    st.write("### Sample Articles per Cluster")
    for cluster_id in sorted(fake_df['cluster'].unique()):
        st.write(f"#### Cluster {cluster_id} Samples")
        st.dataframe(fake_df[fake_df['cluster'] == cluster_id][['title', 'text']].head(3))


def _render_predictor():
    """Predictor page: assign user-supplied text to a cluster and an LDA topic."""
    st.title("Fake News Topic Analyzer")
    user_input = st.text_area("Enter news content:")
    if not st.button("Analyze"):
        return
    if not user_input.strip():
        # Previously a blank submission was ignored without any feedback.
        st.warning("Please enter some text to analyze.")
        return

    cleaned_input = clean_text(user_input)
    vectorized_input = vectorizer.transform([cleaned_input])
    cluster_pred = kmeans.predict(vectorized_input)[0]
    topic_pred = np.argmax(lda.transform(vectorized_input))
    st.write(f"### Predicted Cluster: {cluster_pred}")
    # Handle out-of-range topic index
    if topic_pred < len(top_words):
        st.write(f"### Predicted Topic: {topic_pred} - {top_words[topic_pred]}")
    else:
        st.write(f"### Predicted Topic: {topic_pred} (No keywords available)")


# Maps each sidebar option to the function that draws that page.
_PAGE_RENDERERS = {
    "Model Info": _render_model_info,
    "Dataset": _render_dataset,
    "Visualizations": _render_visualizations,
    "Model Metrics": _render_model_metrics,
    "Predictor": _render_predictor,
}

_PAGE_RENDERERS[page]()