| import streamlit as st |
| import pandas as pd |
| import numpy as np |
| from wordcloud import WordCloud, STOPWORDS |
| import matplotlib.pyplot as plt |
| import PyPDF2 |
| from docx import Document |
| import base64 |
| from io import BytesIO |
|
|
| |
def read_txt(file):
    """Return the contents of an uploaded plain-text file as a string.

    Args:
        file: An object exposing ``getvalue()`` that returns the raw bytes
            of the upload (e.g. a Streamlit ``UploadedFile`` or ``BytesIO``).

    Returns:
        The decoded text. A leading UTF-8 byte-order mark is dropped, and
        bytes that are not valid UTF-8 are replaced with U+FFFD instead of
        raising ``UnicodeDecodeError``.
    """
    raw_bytes = file.getvalue()
    # "utf-8-sig" behaves like "utf-8" but strips an optional BOM, which would
    # otherwise stay attached to the first word of the text.
    return raw_bytes.decode("utf-8-sig", errors="replace")
|
|
def read_docx(file):
    """Return the text of a DOCX file as one space-separated string.

    Args:
        file: Whatever ``docx.Document`` accepts as its source (here, the
            uploaded file object).

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``,
        joined with single spaces.
    """
    paragraphs = Document(file).paragraphs
    return " ".join(paragraph.text for paragraph in paragraphs)
|
|
def read_pdf(file):
    """Return the extracted text of a PDF file as one space-separated string.

    Args:
        file: Whatever ``PyPDF2.PdfReader`` accepts as its source (here, the
            uploaded file object).

    Returns:
        The result of ``extract_text()`` for each page, in page order,
        joined with single spaces.
    """
    reader = PyPDF2.PdfReader(file)
    page_texts = []
    for page in reader.pages:
        page_texts.append(page.extract_text())
    return " ".join(page_texts)
|
|
| |
def filter_stopwords(text, additional_stopwords=None):
    """Remove stopwords from whitespace-separated text.

    The wordcloud ``STOPWORDS`` set is always applied; ``additional_stopwords``
    extends it. Matching is case-insensitive on both sides.

    Args:
        text: The text to filter. It is split on whitespace.
        additional_stopwords: Optional iterable of extra words to drop.
            ``None`` (the default) means no extra words.

    Returns:
        The remaining words, in their original casing and order, joined
        with single spaces.
    """
    # A None default replaces the original mutable ``[]`` default argument.
    # Extras are lowercased because each word is lowercased before the lookup;
    # without this, an extra such as "The" could never match anything.
    extra_stopwords = {word.lower() for word in (additional_stopwords or ())}
    all_stopwords = STOPWORDS.union(extra_stopwords)
    return " ".join(
        word for word in text.split() if word.lower() not in all_stopwords
    )
|
|
| |
def get_image_download_link(buffered, format_):
    """Build an HTML download link that embeds an image as a base64 data URI.

    Args:
        buffered: An object exposing ``getvalue()`` that returns the encoded
            image bytes (e.g. a ``BytesIO`` filled by ``savefig``).
        format_: The file format / extension, e.g. ``"png"``, ``"jpeg"``,
            ``"svg"`` or ``"pdf"``.

    Returns:
        An ``<a>`` element as a string whose ``href`` is the data URI and
        whose ``download`` attribute is ``wordcloud.<format_>``.
    """
    # Formats whose media type is not simply "image/<format>".
    special_mime_types = {
        "svg": "image/svg+xml",
        "pdf": "application/pdf",
        "jpg": "image/jpeg",
    }
    format_key = format_.lower()
    mime_type = special_mime_types.get(format_key, f"image/{format_key}")
    image_base64 = base64.b64encode(buffered.getvalue()).decode()
    return (
        f'<a href="data:{mime_type};base64,{image_base64}" '
        f'download="wordcloud.{format_}">Download Word Cloud as {format_}</a>'
    )
|
|
| |
def get_table_download_link(df, filename, file_label):
    """Build an HTML download link that embeds a DataFrame as a CSV data URI.

    Args:
        df: The pandas DataFrame to export (written without its index).
        filename: The file name placed in the link's ``download`` attribute.
        file_label: The visible link text.

    Returns:
        An ``<a>`` element as a string whose ``href`` is a base64-encoded
        ``text/csv`` data URI.
    """
    csv_text = df.to_csv(index=False)
    b64 = base64.b64encode(csv_text.encode("utf-8")).decode()
    # "text/csv" is the registered media type for CSV; the original
    # "file/csv" is not a valid one. The charset matches the encoding above.
    return (
        f'<a href="data:text/csv;charset=utf-8;base64,{b64}" '
        f'download="{filename}">{file_label}</a>'
    )
|
|
| |
# Page header, rendered before any file has been uploaded.
st.title("🌥️ Word Cloud Generator")
st.subheader("Upload a PDF, DOCX, or TXT file to generate a Word Cloud")

# The upload widget only offers these file extensions.
accepted_extensions = ["txt", "pdf", "docx"]
uploaded_file = st.file_uploader("Choose a file", type=accepted_extensions)
|
|
# Main flow: everything below runs only once a file has been uploaded.
# NOTE(review): the original indentation was lost; the nesting here is inferred
# from which names each statement needs. Confirm against the intended layout.
if uploaded_file:
    file_details = {
        "File Name": uploaded_file.name,
        "File Type": uploaded_file.type,
        "File Size (KB)": round(uploaded_file.size / 1024, 2),
    }
    st.sidebar.write("**File Details:**", file_details)

    # Choose a reader from the reported MIME type, falling back to the file
    # extension for uploads whose MIME type is missing or generic.
    readers_by_mime = {
        "text/plain": read_txt,
        "application/pdf": read_pdf,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": read_docx,
    }
    readers_by_extension = {"txt": read_txt, "pdf": read_pdf, "docx": read_docx}
    extension = uploaded_file.name.lower().rsplit(".", 1)[-1]
    reader = readers_by_mime.get(uploaded_file.type) or readers_by_extension.get(extension)
    if reader is None:
        st.error("Unsupported file type. Please upload a TXT, PDF, or DOCX file.")
        st.stop()
    text = reader(uploaded_file)

    # Raw word frequencies (case-sensitive, before stopword removal),
    # most frequent first.
    words = text.split()
    word_count = (
        pd.DataFrame({"Word": words})
        .groupby("Word")
        .size()
        .reset_index(name="Count")
        .sort_values("Count", ascending=False)
    )

    # Stopword controls: the 50 most frequent words are offered as extras.
    use_standard_stopwords = st.sidebar.checkbox("Use standard stopwords?", True)
    top_words = word_count["Word"].head(50).tolist()
    additional_stopwords = st.sidebar.multiselect("Add stopwords:", sorted(top_words))

    # Lowercase the chosen extras: words are lowercased before the lookup, so a
    # capitalised selection such as "The" would otherwise never match.
    extra_stopwords = {word.lower() for word in additional_stopwords}
    if use_standard_stopwords:
        all_stopwords = STOPWORDS.union(extra_stopwords)
    else:
        all_stopwords = extra_stopwords

    # Filter inline rather than via filter_stopwords(): that helper always adds
    # the standard list, which would make the checkbox above a no-op.
    filtered_text = " ".join(
        word for word in words if word.lower() not in all_stopwords
    )

    if filtered_text:
        width = st.sidebar.slider("Word Cloud Width (px)", 400, 2000, 1200, 50)
        height = st.sidebar.slider("Word Cloud Height (px)", 200, 2000, 800, 50)

        st.subheader("Generated Word Cloud")
        # Pass the stopword set explicitly: WordCloud falls back to its own
        # STOPWORDS when none is given, again overriding the user's choice.
        wordcloud_img = WordCloud(
            width=width,
            height=height,
            background_color="white",
            max_words=200,
            stopwords=all_stopwords,
        ).generate(filtered_text)

        fig, ax = plt.subplots(figsize=(width / 100, height / 100))
        ax.imshow(wordcloud_img, interpolation="bilinear")
        ax.axis("off")
        st.pyplot(fig)

        format_ = st.selectbox("Save Word Cloud as:", ["png", "jpeg", "svg", "pdf"])
        resolution = st.slider("Image Resolution (DPI)", 100, 500, 300, 50)

        if st.button(f"Save Word Cloud as {format_.upper()}"):
            buffered = BytesIO()
            # Save this figure explicitly instead of pyplot's implicit
            # "current figure".
            fig.savefig(buffered, format=format_, dpi=resolution)
            st.markdown(get_image_download_link(buffered, format_), unsafe_allow_html=True)

        # The script reruns on every widget interaction; close the figure so
        # matplotlib does not keep one alive per rerun.
        plt.close(fig)
    else:
        st.warning("No words left after removing stopwords, so no word cloud was generated.")

    st.subheader("Word Count Table")
    st.write(word_count)
    if st.button('Download Word Count Table as CSV'):
        st.markdown(get_table_download_link(word_count, "word_count.csv", "Download CSV"), unsafe_allow_html=True)
|
|
| |
# Author credits and contact links, rendered in the sidebar in this order.
sidebar_credit_lines = (
    "### Created by: [Tassawar Abbas](https://github.com/Abbas829)",
    "Contact: [Email](mailto:abbas829@gmail.com)",
    "Facebook: [Tassawar Abbas](https://www.facebook.com/abbas829)",
    "Linkedin: [Tassawar Abbas](https://www.linkedin.com/in/abbas829pro)",
)
for credit_line in sidebar_credit_lines:
    st.sidebar.markdown(credit_line)
|
|