Spaces:

Tassawar
/

WordCloud

Sleeping

App Files Files Community

Tassawar committed on Feb 5, 2025

Commit

4f4de74

verified ·

1 Parent(s): 6e1a8dc

txt

Browse files

Files changed (2) hide show

requirements.txt +8 -0
word_cloud_app.py +111 -0

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+docx==0.2.4
+matplotlib==3.9.2
+numpy==2.1.0
+pandas==2.2.2
+PyPDF2==3.0.1
+python_docx==1.1.2
+streamlit==1.38.0
+wordcloud==1.9.3

word_cloud_app.py ADDED Viewed

	@@ -0,0 +1,111 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+from wordcloud import WordCloud, STOPWORDS
+import matplotlib.pyplot as plt
+import PyPDF2
+from docx import Document
+import base64
+from io import BytesIO
+# Functions for file reading
def read_txt(file):
    """Return the text content of an uploaded plain-text file.

    Args:
        file: File-like object exposing ``getvalue()`` that returns the raw
            bytes of the upload (for example ``io.BytesIO``).

    Returns:
        The decoded text. A leading UTF-8 byte-order mark is dropped, and
        bytes that are not valid UTF-8 are replaced with U+FFFD instead of
        raising ``UnicodeDecodeError``.
    """
    raw = file.getvalue()
    try:
        # "utf-8-sig" decodes ordinary UTF-8 and also strips a leading BOM,
        # which would otherwise end up glued to the first word.
        return raw.decode("utf-8-sig")
    except UnicodeDecodeError:
        # Best effort for files saved in another encoding: keep everything
        # that decodes and mark the rest, rather than failing the upload.
        return raw.decode("utf-8", errors="replace")
def read_docx(file):
    """Return the text of a DOCX file as one space-separated string.

    ``file`` is opened with ``Document``; the ``text`` of each entry in
    ``paragraphs`` is joined with single spaces, in document order.
    """
    document = Document(file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return " ".join(paragraph_texts)
def read_pdf(file):
    """Return the text of a PDF file as one space-separated string.

    ``file`` is handed to ``PyPDF2.PdfReader``; the ``extract_text()`` result
    of each page is joined with single spaces, in page order.
    """
    reader = PyPDF2.PdfReader(file)
    page_texts = (page.extract_text() for page in reader.pages)
    return " ".join(page_texts)
+# Function to filter out stopwords
def filter_stopwords(text, additional_stopwords=None):
    """Remove stopwords from whitespace-separated text.

    Args:
        text: Input string; it is split on whitespace.
        additional_stopwords: Optional iterable of extra words to drop in
            addition to ``STOPWORDS``. Matching is case-insensitive.

    Returns:
        The remaining words, in their original order and casing, joined by
        single spaces.
    """
    # Each word is lowercased before the membership test below, so the extra
    # stopwords must be lowercased too; otherwise an entry such as "The"
    # could never match anything.
    extra = {word.lower() for word in (additional_stopwords or ())}
    all_stopwords = STOPWORDS.union(extra)
    return " ".join(
        word for word in text.split() if word.lower() not in all_stopwords
    )
+# Function to create download link for plot
def get_image_download_link(buffered, format_):
    """Build an HTML download link embedding an image as a base64 data URI.

    Args:
        buffered: File-like object exposing ``getvalue()`` that returns the
            encoded image bytes (for example ``io.BytesIO``).
        format_: File format / extension such as ``"png"``, ``"jpeg"``,
            ``"svg"`` or ``"pdf"``.

    Returns:
        An ``<a>`` element (as a string) whose ``href`` is a data URI holding
        the content and whose ``download`` name is ``wordcloud.<format_>``.
    """
    # Not every supported format is "image/<format>": SVG and PDF have their
    # own registered media types, and "jpg" is an alias for JPEG.
    media_types = {
        "jpg": "image/jpeg",
        "svg": "image/svg+xml",
        "pdf": "application/pdf",
    }
    media_type = media_types.get(format_.lower(), f"image/{format_}")
    image_base64 = base64.b64encode(buffered.getvalue()).decode()
    return (
        f'<a href="data:{media_type};base64,{image_base64}" '
        f'download="wordcloud.{format_}">Download Word Cloud as {format_}</a>'
    )
+# Function to generate a download link for a DataFrame
def get_table_download_link(df, filename, file_label):
    """Build an HTML download link embedding a DataFrame as base64 CSV.

    Args:
        df: ``pandas.DataFrame`` to export; the index is not written.
        filename: Name offered to the browser for the downloaded file.
        file_label: Visible text of the link.

    Returns:
        An ``<a>`` element (as a string) whose ``href`` is a ``text/csv``
        data URI containing the UTF-8 encoded CSV.
    """
    csv_text = df.to_csv(index=False)
    b64 = base64.b64encode(csv_text.encode("utf-8")).decode()
    # "text/csv" is the registered media type for CSV; the charset parameter
    # tells the browser how the embedded bytes were encoded.
    return (
        f'<a href="data:text/csv;charset=utf-8;base64,{b64}" '
        f'download="{filename}">{file_label}</a>'
    )
+# Streamlit UI
# Page header and upload widget.
st.title("🌥️ Word Cloud Generator")
st.subheader("Upload a PDF, DOCX, or TXT file to generate a Word Cloud")
uploaded_file = st.file_uploader("Choose a file", type=["txt", "pdf", "docx"])
if uploaded_file:
    file_details = {
        "File Name": uploaded_file.name,
        "File Type": uploaded_file.type,
        "File Size (KB)": round(uploaded_file.size / 1024, 2)
    }
    st.sidebar.write("**File Details:**", file_details)

    # Pick the reader from the file extension, the same criterion the
    # uploader filters on. The MIME type is reported by the browser and is
    # not reliable for every platform/file combination.
    readers = {"txt": read_txt, "pdf": read_pdf, "docx": read_docx}
    extension = uploaded_file.name.rsplit(".", 1)[-1].lower()
    reader = readers.get(extension)
    if reader is None:
        st.error("Unsupported file type. Please upload a TXT, PDF, or DOCX file.")
        st.stop()
    try:
        text = reader(uploaded_file)
    except Exception as exc:  # UI boundary: report the failure, no traceback
        st.error(f"Could not read the uploaded file: {exc}")
        st.stop()

    # Word count table (raw whitespace-separated tokens, most frequent first).
    words = text.split()
    word_count = (
        pd.DataFrame({'Word': words})
        .groupby('Word')
        .size()
        .reset_index(name='Count')
        .sort_values('Count', ascending=False)
    )

    # Sidebar: stopword selection.
    use_standard_stopwords = st.sidebar.checkbox("Use standard stopwords?", True)
    top_words = word_count['Word'].head(50).tolist()
    additional_stopwords = st.sidebar.multiselect("Add stopwords:", sorted(top_words))
    # Words are compared in lowercase, so the user's picks are lowercased too.
    all_stopwords = {word.lower() for word in additional_stopwords}
    if use_standard_stopwords:
        all_stopwords |= STOPWORDS
    # Filter here with exactly the chosen set so that unticking the checkbox
    # really disables the standard stopwords.
    filtered_text = " ".join(
        word for word in words if word.lower() not in all_stopwords
    )

    if filtered_text:
        # Word Cloud dimensions
        width = st.sidebar.slider("Word Cloud Width (px)", 400, 2000, 1200, 50)
        height = st.sidebar.slider("Word Cloud Height (px)", 200, 2000, 800, 50)

        # Generate Word Cloud
        st.subheader("Generated Word Cloud")
        try:
            # Pass the stopword set explicitly: with stopwords=None WordCloud
            # falls back to its built-in STOPWORDS list.
            wordcloud_img = WordCloud(
                width=width,
                height=height,
                background_color='white',
                max_words=200,
                stopwords=all_stopwords,
            ).generate(filtered_text)
        except ValueError:
            # Raised by WordCloud when no plottable word remains.
            wordcloud_img = None
            st.warning("No words left to plot. Try removing some stopwords.")

        if wordcloud_img is not None:
            fig, ax = plt.subplots(figsize=(width / 100, height / 100))
            ax.imshow(wordcloud_img, interpolation='bilinear')
            ax.axis('off')

            # Display and save options
            st.pyplot(fig)
            format_ = st.selectbox("Save Word Cloud as:", ["png", "jpeg", "svg", "pdf"])
            resolution = st.slider("Image Resolution (DPI)", 100, 500, 300, 50)
            if st.button(f"Save Word Cloud as {format_.upper()}"):
                buffered = BytesIO()
                # Save this figure explicitly instead of pyplot's implicit
                # "current figure".
                fig.savefig(buffered, format=format_, dpi=resolution)
                st.markdown(get_image_download_link(buffered, format_), unsafe_allow_html=True)
            # The script reruns on every interaction; close the figure so
            # matplotlib does not keep one alive per rerun.
            plt.close(fig)

        # Display word count table
        st.subheader("Word Count Table")
        st.write(word_count)
        if st.button('Download Word Count Table as CSV'):
            st.markdown(get_table_download_link(word_count, "word_count.csv", "Download CSV"), unsafe_allow_html=True)
    else:
        st.warning("No words left after removing stopwords.")
+# Footer with author info
# Author and contact links, rendered in the sidebar in this order.
footer_lines = (
    "### Created by: [Tassawar Abbas](https://github.com/Abbas829)",
    "Contact: [Email](mailto:abbas829@gmail.com)",
    "Facebook: [Tassawar Abbas](https://www.facebook.com/abbas829)",
    "Linkedin: [Tassawar Abbas](https://www.linkedin.com/in/abbas829pro)",
)
for footer_line in footer_lines:
    st.sidebar.markdown(footer_line)