# Streamlit app: unsupervised KMeans clustering of uploaded PDF documents.
import fitz  # PyMuPDF
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from wordcloud import WordCloud
def extract_text_from_pdf(pdf_file):
    """Return the full text of an uploaded PDF.

    Reads the whole uploaded-file buffer, opens it in-memory with PyMuPDF,
    and joins the text of every page with a newline.

    Args:
        pdf_file: a file-like object (e.g. a Streamlit UploadedFile)
            positioned at the start of a PDF byte stream.

    Returns:
        str: concatenated text of all pages, newline-separated.
    """
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        pages = [page.get_text() for page in doc]
    return "\n".join(pages)
# --------------------------------------------------------------------------
# Streamlit UI: upload PDFs, vectorize with TF-IDF, cluster with KMeans,
# then report metrics, per-cluster word clouds, and cluster assignments.
# --------------------------------------------------------------------------
st.title("π Document Clustering App")
st.write("This app performs unsupervised clustering on uploaded PDF documents.")

# Upload one or more PDF files.
uploaded_files = st.file_uploader(
    "Upload one or more PDF files", type=["pdf"], accept_multiple_files=True
)

# Requested number of clusters (may exceed the number of uploads; clamped below).
num_clusters = st.slider(
    "Select Number of Clusters", min_value=2, max_value=10, value=3, step=1
)

if uploaded_files:
    # Extract raw text from every uploaded PDF.
    documents = [extract_text_from_pdf(file) for file in uploaded_files]

    # Convert documents to TF-IDF features (English stop words removed).
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(documents)

    # KMeans raises if n_clusters > n_samples; clamp so a user who uploads
    # fewer documents than the slider value does not crash the app.
    effective_clusters = min(num_clusters, len(documents))
    model = KMeans(n_clusters=effective_clusters, random_state=42)
    clusters = model.fit_predict(X)

    # --- Model metrics ----------------------------------------------------
    st.subheader("π Model Metrics")
    # silhouette_score requires 2 <= distinct labels <= n_samples - 1;
    # checking len(uploaded_files) > 1 alone is not enough (e.g. every
    # document could land in a single cluster).
    n_labels = len(set(clusters))
    if 2 <= n_labels <= len(documents) - 1:
        silhouette_avg = silhouette_score(X, clusters)
        st.write(f"**Silhouette Score:** {silhouette_avg:.3f}")
    else:
        st.write("**Silhouette Score:** N/A (Need at least 2 documents)")

    # --- Cluster size distribution ----------------------------------------
    st.write("### Cluster Size Distribution")
    cluster_counts = pd.Series(clusters).value_counts().sort_index()
    fig, ax = plt.subplots()
    sns.barplot(x=cluster_counts.index, y=cluster_counts.values, ax=ax, palette="viridis")
    ax.set_xlabel("Cluster")
    ax.set_ylabel("Number of Documents")
    ax.set_title("Cluster Size Distribution")
    st.pyplot(fig)

    # --- Word clouds per cluster ------------------------------------------
    st.subheader("π₯ Word Clouds for Each Cluster")
    # Iterate the effective (clamped) cluster count so labels match KMeans.
    for i in range(effective_clusters):
        cluster_docs = [documents[j] for j in range(len(documents)) if clusters[j] == i]
        cluster_text = " ".join(cluster_docs)
        if cluster_text:
            wordcloud = WordCloud(
                width=800, height=400, max_words=100, background_color='white'
            ).generate(cluster_text)
            st.write(f"### Cluster {i+1}")
            st.image(wordcloud.to_array())
        else:
            st.write(f"### Cluster {i+1} (No documents in this cluster)")

    # --- Cluster assignments table ----------------------------------------
    st.subheader("π Clustered Documents")
    df = pd.DataFrame(
        {"Document": [file.name for file in uploaded_files], "Cluster": clusters}
    )
    st.dataframe(df)