|
|
import base64
import html
from io import BytesIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PyPDF2
import streamlit as st
from docx import Document
from wordcloud import WordCloud, STOPWORDS
|
|
|
|
|
|
|
|
def read_txt(file):
    """Return the contents of an uploaded plain-text file as a string.

    Args:
        file: Object exposing ``getvalue()`` that returns the raw bytes of
            the upload (for example a ``BytesIO`` or a Streamlit upload).

    Returns:
        The decoded text. A leading UTF-8 byte-order mark is dropped and any
        bytes that are not valid UTF-8 become U+FFFD instead of raising
        ``UnicodeDecodeError``.
    """
    raw_bytes = file.getvalue()
    # "utf-8-sig" strips a BOM if present (otherwise it would stick to the
    # first word); errors="replace" keeps a single bad byte from aborting the
    # whole upload.
    return raw_bytes.decode("utf-8-sig", errors="replace")
|
|
|
|
|
def read_docx(file):
    """Return the text of a DOCX file as one space-separated string.

    Args:
        file: Path or file-like object passed straight to ``Document``.

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``, joined
        with single spaces.
    """
    paragraphs = Document(file).paragraphs
    return " ".join(paragraph.text for paragraph in paragraphs)
|
|
|
|
|
def read_pdf(file):
    """Return the text of a PDF file as one space-separated string.

    Args:
        file: Path or file-like object passed straight to ``PyPDF2.PdfReader``.

    Returns:
        The result of ``extract_text()`` for every page, joined with single
        spaces in page order.
    """
    reader = PyPDF2.PdfReader(file)
    page_texts = []
    for page in reader.pages:
        page_texts.append(page.extract_text())
    return " ".join(page_texts)
|
|
|
|
|
|
|
|
def filter_stopwords(text, additional_stopwords=None):
    """Remove stopwords from whitespace-separated text.

    The built-in ``STOPWORDS`` set is always applied; ``additional_stopwords``
    extends it. Matching is case-insensitive on both sides.

    Args:
        text: Input text; it is split on whitespace.
        additional_stopwords: Optional iterable of extra words to drop.
            ``None`` (the default) or an empty iterable adds nothing.

    Returns:
        The remaining words, in their original order and casing, joined with
        single spaces.
    """
    # Words are compared in lowercase below, so the extra stopwords must be
    # lowercased too; otherwise an entry such as "Python" would never match.
    extra_stopwords = {str(word).lower() for word in (additional_stopwords or ())}
    all_stopwords = STOPWORDS.union(extra_stopwords)
    kept_words = [word for word in text.split() if word.lower() not in all_stopwords]
    return " ".join(kept_words)
|
|
|
|
|
|
|
|
def get_image_download_link(buffered, format_):
    """Build an HTML anchor that downloads the rendered word cloud.

    The image bytes are embedded in the link as a base64 ``data:`` URI.

    Args:
        buffered: Buffer exposing ``getvalue()`` that returns the saved
            image bytes (for example a ``BytesIO``).
        format_: Output format name such as ``"png"``, ``"jpeg"``, ``"svg"``
            or ``"pdf"``. It is used for the media type, the file extension
            and the link text.

    Returns:
        The ``<a>`` element as a string.
    """
    # "image/svg" and "image/pdf" are not valid media types, so map the
    # formats whose type is not simply "image/<format>".
    media_types = {
        "jpg": "image/jpeg",
        "svg": "image/svg+xml",
        "pdf": "application/pdf",
    }
    media_type = media_types.get(format_.lower(), f"image/{format_}")
    image_base64 = base64.b64encode(buffered.getvalue()).decode()
    return (
        f'<a href="data:{media_type};base64,{image_base64}" '
        f'download="wordcloud.{format_}">Download Word Cloud as {format_}</a>'
    )
|
|
|
|
|
|
|
|
def get_table_download_link(df, filename, file_label):
    """Build an HTML anchor that downloads a DataFrame as a CSV file.

    The CSV (written without the index column) is embedded in the link as a
    base64 ``data:`` URI.

    Args:
        df: DataFrame to export.
        filename: File name suggested to the browser for the download.
        file_label: Visible text of the link.

    Returns:
        The ``<a>`` element as a string.
    """
    csv_text = df.to_csv(index=False)
    b64 = base64.b64encode(csv_text.encode()).decode()
    # Both values end up inside HTML, so escape them; a quote in the file
    # name or markup in the label must not break the anchor.
    safe_filename = html.escape(filename, quote=True)
    safe_label = html.escape(file_label)
    # "text/csv" is the registered media type for CSV ("file/csv" is not).
    return f'<a href="data:text/csv;base64,{b64}" download="{safe_filename}">{safe_label}</a>'
|
|
|
|
|
|
|
|
# --- Page header and file upload -------------------------------------------
st.title("🌥️ Word Cloud Generator")
st.subheader("Upload a PDF, DOCX, or TXT file to generate a Word Cloud")

uploaded_file = st.file_uploader("Choose a file", type=["txt", "pdf", "docx"])

if uploaded_file:
    file_details = {
        "File Name": uploaded_file.name,
        "File Type": uploaded_file.type,
        "File Size (KB)": round(uploaded_file.size / 1024, 2),
    }
    st.sidebar.write("**File Details:**", file_details)

    # --- Text extraction ----------------------------------------------------
    # Pick the reader from the reported MIME type first. The MIME type comes
    # from the client, so fall back to the file extension when it is not one
    # of the expected values.
    readers_by_mime = {
        "text/plain": read_txt,
        "application/pdf": read_pdf,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": read_docx,
    }
    readers_by_extension = {"txt": read_txt, "pdf": read_pdf, "docx": read_docx}
    extension = uploaded_file.name.rsplit(".", 1)[-1].lower()
    reader = readers_by_mime.get(uploaded_file.type) or readers_by_extension.get(extension)
    if reader is None:
        st.error("Unsupported file type. Please upload a TXT, PDF, or DOCX file.")
        st.stop()
    text = reader(uploaded_file)

    # --- Word frequencies (computed on the unfiltered text) -----------------
    words = text.split()
    word_count = pd.DataFrame({'Word': words}).groupby('Word').size().reset_index(name='Count').sort_values('Count', ascending=False)

    # --- Stopword selection -------------------------------------------------
    use_standard_stopwords = st.sidebar.checkbox("Use standard stopwords?", True)
    top_words = word_count['Word'].head(50).tolist()
    additional_stopwords = st.sidebar.multiselect("Add stopwords:", sorted(top_words))

    # Words are matched in lowercase, so lowercase the user's picks as well;
    # otherwise selecting "The" would never remove anything.
    custom_stopwords = {word.lower() for word in additional_stopwords}
    if use_standard_stopwords:
        all_stopwords = STOPWORDS.union(custom_stopwords)
        filtered_text = filter_stopwords(text, custom_stopwords)
    else:
        # filter_stopwords() always merges in the built-in STOPWORDS, so it
        # cannot be used when the standard list is switched off.
        all_stopwords = custom_stopwords
        filtered_text = " ".join(
            word for word in text.split() if word.lower() not in all_stopwords
        )

    # --- Word cloud ---------------------------------------------------------
    if filtered_text:
        width = st.sidebar.slider("Word Cloud Width (px)", 400, 2000, 1200, 50)
        height = st.sidebar.slider("Word Cloud Height (px)", 200, 2000, 800, 50)

        st.subheader("Generated Word Cloud")
        try:
            # Pass the stopword set explicitly: without it WordCloud applies
            # its own built-in list, which would ignore the checkbox above.
            wordcloud_img = WordCloud(
                width=width,
                height=height,
                background_color='white',
                max_words=200,
                stopwords=all_stopwords,
            ).generate(filtered_text)
        except ValueError:
            # WordCloud raises ValueError when its own tokenizer finds no
            # plottable word (e.g. only numbers or punctuation remain).
            wordcloud_img = None
            st.warning("No words left to plot. Try removing some stopwords.")

        if wordcloud_img is not None:
            fig, ax = plt.subplots(figsize=(width / 100, height / 100))
            ax.imshow(wordcloud_img, interpolation='bilinear')
            ax.axis('off')

            st.pyplot(fig)
            format_ = st.selectbox("Save Word Cloud as:", ["png", "jpeg", "svg", "pdf"])
            resolution = st.slider("Image Resolution (DPI)", 100, 500, 300, 50)

            if st.button(f"Save Word Cloud as {format_.upper()}"):
                buffered = BytesIO()
                # Save this figure explicitly instead of whatever pyplot
                # currently considers the "current" figure.
                fig.savefig(buffered, format=format_, dpi=resolution)
                st.markdown(get_image_download_link(buffered, format_), unsafe_allow_html=True)

            # The script reruns on every widget interaction; close the figure
            # so pyplot does not accumulate one per rerun.
            plt.close(fig)
    else:
        st.warning("No words left to plot. Try removing some stopwords.")

    # --- Word count table ---------------------------------------------------
    st.subheader("Word Count Table")
    st.write(word_count)
    if st.button('Download Word Count Table as CSV'):
        st.markdown(get_table_download_link(word_count, "word_count.csv", "Download CSV"), unsafe_allow_html=True)

# --- Sidebar credits ----------------------------------------------------------
st.sidebar.markdown("### Created by: [Tassawar Abbas](https://github.com/Abbas829)")
st.sidebar.markdown("Contact: [Email](mailto:abbas829@gmail.com)")
st.sidebar.markdown("Facebook: [Tassawar Abbas](https://www.facebook.com/abbas829)")
st.sidebar.markdown("Linkedin: [Tassawar Abbas](https://www.linkedin.com/in/abbas829pro)")