Tassawar committed on
Commit
4f4de74
·
verified ·
1 Parent(s): 6e1a8dc
Files changed (2) hide show
  1. requirements.txt +8 -0
  2. word_cloud_app.py +111 -0
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
# NOTE: the legacy "docx" distribution (0.2.4) is intentionally not pinned here.
# It installs a top-level `docx` package, the same import name that python-docx
# provides, so installing both lets the legacy files shadow the `Document` class
# that word_cloud_app.py imports with `from docx import Document`.
matplotlib==3.9.2
numpy==2.1.0
pandas==2.2.2
PyPDF2==3.0.1
python_docx==1.1.2
streamlit==1.38.0
wordcloud==1.9.3
word_cloud_app.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ from wordcloud import WordCloud, STOPWORDS
5
+ import matplotlib.pyplot as plt
6
+ import PyPDF2
7
+ from docx import Document
8
+ import base64
9
+ from io import BytesIO
10
+
11
+ # Functions for file reading
12
def read_txt(file):
    """Return the text of an uploaded plain-text file.

    Args:
        file: Object exposing ``getvalue()`` that returns the raw bytes of
            the upload.

    Returns:
        The decoded text. A leading UTF-8 byte-order mark is removed, and
        bytes that are not valid UTF-8 are replaced with U+FFFD instead of
        raising ``UnicodeDecodeError``.
    """
    raw = file.getvalue()
    try:
        # "utf-8-sig" decodes ordinary UTF-8 and also strips a BOM, which
        # would otherwise stay attached to the first word of the text.
        return raw.decode("utf-8-sig")
    except UnicodeDecodeError:
        # Legacy-encoded files (e.g. Windows-1252) should not crash the app;
        # keep everything that is decodable and mark the rest.
        return raw.decode("utf-8", errors="replace")
14
+
15
def read_docx(file):
    """Return the paragraph text of a DOCX file as one string.

    ``file`` is handed to ``Document``; the ``text`` of every entry in the
    resulting ``paragraphs`` sequence is joined with single spaces.
    """
    document = Document(file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return " ".join(paragraph_texts)
18
+
19
def read_pdf(file):
    """Return the extracted text of a PDF file as one string.

    ``file`` is handed to ``PyPDF2.PdfReader``; ``extract_text()`` is called
    on each page in order and the results are joined with single spaces.
    """
    reader = PyPDF2.PdfReader(file)
    page_texts = []
    for page in reader.pages:
        page_texts.append(page.extract_text())
    return " ".join(page_texts)
22
+
23
+ # Function to filter out stopwords
24
def filter_stopwords(text, additional_stopwords=None, use_standard=True):
    """Remove stopwords from whitespace-separated text.

    Args:
        text: Input text; it is split on whitespace.
        additional_stopwords: Optional iterable of extra words to drop.
            Matching is case-insensitive, so "The" also removes "the".
        use_standard: When True (the default, matching the previous
            behaviour) the wordcloud ``STOPWORDS`` set is applied as well.
            Pass False to filter with ``additional_stopwords`` only.

    Returns:
        The remaining words joined by single spaces, in original order and
        original casing.
    """
    # Tokens are compared in lowercase below, so the user-supplied words
    # must be lowercased too or capitalised entries would never match.
    blocked = {word.lower() for word in (additional_stopwords or ())}
    if use_standard:
        # `blocked` is a fresh set, so the shared STOPWORDS set is not mutated.
        blocked |= STOPWORDS
    return " ".join(word for word in text.split() if word.lower() not in blocked)
29
+
30
+ # Function to create download link for plot
31
def get_image_download_link(buffered, format_):
    """Build an HTML download link embedding an image as a base64 data URI.

    Args:
        buffered: Object exposing ``getvalue()`` that returns the encoded
            image bytes (e.g. a ``BytesIO`` the figure was saved into).
        format_: File format / extension such as "png", "jpeg", "svg" or
            "pdf". It is used for the download filename and the link label.

    Returns:
        An ``<a>`` element as a string, downloading as ``wordcloud.<format_>``.
    """
    # "image/<format>" is only a valid media type for some formats; SVG and
    # PDF need their registered types.
    mime_types = {
        "png": "image/png",
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "svg": "image/svg+xml",
        "pdf": "application/pdf",
    }
    mime = mime_types.get(format_.lower(), f"image/{format_}")
    image_base64 = base64.b64encode(buffered.getvalue()).decode()
    return (
        f'<a href="data:{mime};base64,{image_base64}" '
        f'download="wordcloud.{format_}">Download Word Cloud as {format_}</a>'
    )
34
+
35
+ # Function to generate a download link for a DataFrame
36
def get_table_download_link(df, filename, file_label):
    """Build an HTML download link embedding a DataFrame as base64 CSV.

    Args:
        df: DataFrame to export; written with ``to_csv(index=False)``.
        filename: Name offered to the browser for the downloaded file.
        file_label: Text shown for the link; inserted as given.

    Returns:
        An ``<a>`` element as a string whose href is a ``text/csv`` data URI.
    """
    # Local import so this function does not depend on a new file-level import.
    from html import escape

    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    # The filename lands inside a double-quoted attribute, so quotes and
    # angle brackets must be escaped to keep the markup well-formed.
    safe_name = escape(filename, quote=True)
    # "text/csv" is the registered media type for CSV ("file/csv" is not one).
    return f'<a href="data:text/csv;base64,{b64}" download="{safe_name}">{file_label}</a>'
40
+
41
+ # Streamlit UI
42
st.title("🌥️ Word Cloud Generator")
st.subheader("Upload a PDF, DOCX, or TXT file to generate a Word Cloud")

uploaded_file = st.file_uploader("Choose a file", type=["txt", "pdf", "docx"])

if uploaded_file:
    file_details = {
        "File Name": uploaded_file.name,
        "File Type": uploaded_file.type,
        "File Size (KB)": round(uploaded_file.size / 1024, 2),
    }
    st.sidebar.write("**File Details:**", file_details)

    # Choose the reader from the file extension. The uploader above already
    # restricts uploads by extension, whereas the MIME type is reported by
    # the browser and can be empty or generic for the same file.
    readers = {"txt": read_txt, "pdf": read_pdf, "docx": read_docx}
    extension = uploaded_file.name.rsplit(".", 1)[-1].lower()
    reader = readers.get(extension)
    if reader is None:
        st.error("Unsupported file type. Please upload a TXT, PDF, or DOCX file.")
        st.stop()

    try:
        text = reader(uploaded_file)
    except Exception as exc:  # UI boundary: report a bad file instead of a traceback
        st.error(f"Could not read the uploaded file: {exc}")
        st.stop()

    # Word count table
    words = text.split()
    word_count = (
        pd.DataFrame({"Word": words})
        .groupby("Word")
        .size()
        .reset_index(name="Count")
        .sort_values("Count", ascending=False)
    )

    # Sidebar: stopwords and word cloud customization
    use_standard_stopwords = st.sidebar.checkbox("Use standard stopwords?", True)
    top_words = word_count["Word"].head(50).tolist()
    additional_stopwords = st.sidebar.multiselect("Add stopwords:", sorted(top_words))

    # Tokens are compared in lowercase, so the selected words are lowercased
    # as well; otherwise picking "The" would never remove anything.
    all_stopwords = {word.lower() for word in additional_stopwords}
    if use_standard_stopwords:
        all_stopwords |= STOPWORDS

    # Filter here rather than through filter_stopwords(): by default that
    # helper adds the standard list, which would override the checkbox.
    filtered_text = " ".join(
        word for word in words if word.lower() not in all_stopwords
    )

    if filtered_text:
        # Word cloud dimensions
        width = st.sidebar.slider("Word Cloud Width (px)", 400, 2000, 1200, 50)
        height = st.sidebar.slider("Word Cloud Height (px)", 200, 2000, 800, 50)

        st.subheader("Generated Word Cloud")
        try:
            # Pass the stopwords explicitly: with stopwords=None WordCloud
            # falls back to its built-in list and ignores the checkbox.
            wordcloud_img = WordCloud(
                width=width,
                height=height,
                background_color="white",
                max_words=200,
                stopwords=all_stopwords,
            ).generate(filtered_text)
        except ValueError:
            # Raised when no usable word is left after WordCloud's own
            # tokenisation (for example a text made only of numbers).
            wordcloud_img = None
            st.warning("Not enough words to build a word cloud.")

        if wordcloud_img is not None:
            fig, ax = plt.subplots(figsize=(width / 100, height / 100))
            ax.imshow(wordcloud_img, interpolation="bilinear")
            ax.axis("off")

            # Display and save options
            st.pyplot(fig)
            format_ = st.selectbox("Save Word Cloud as:", ["png", "jpeg", "svg", "pdf"])
            resolution = st.slider("Image Resolution (DPI)", 100, 500, 300, 50)

            if st.button(f"Save Word Cloud as {format_.upper()}"):
                buffered = BytesIO()
                # Save this figure explicitly instead of pyplot's implicit
                # "current figure".
                fig.savefig(buffered, format=format_, dpi=resolution)
                st.markdown(
                    get_image_download_link(buffered, format_),
                    unsafe_allow_html=True,
                )

            # The script is re-executed on every interaction; close the
            # figure so figures do not pile up in memory.
            plt.close(fig)
    else:
        st.warning("No words left after removing stopwords.")

    # Display word count table
    st.subheader("Word Count Table")
    st.write(word_count)
    if st.button("Download Word Count Table as CSV"):
        st.markdown(
            get_table_download_link(word_count, "word_count.csv", "Download CSV"),
            unsafe_allow_html=True,
        )

# Footer with author info
st.sidebar.markdown("### Created by: [Tassawar Abbas](https://github.com/Abbas829)")
st.sidebar.markdown("Contact: [Email](mailto:abbas829@gmail.com)")
st.sidebar.markdown("Facebook: [Tassawar Abbas](https://www.facebook.com/abbas829)")
st.sidebar.markdown("Linkedin: [Tassawar Abbas](https://www.linkedin.com/in/abbas829pro)")