| import gradio as gr |
| import zipfile |
| from collections import Counter |
| import re |
| import os |
| from wordcloud import WordCloud |
|
|
# Words excluded from the word cloud. The list is kept in its original order
# (grouped by initial letter) and still contains "am" twice; membership tests
# are unaffected by the duplicate.
stop_words = [
    # --- common English function words and contractions ---
    "a", "about", "above", "after", "again", "against", "all", "am", "an",
    "and", "any", "are", "aren't", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both",
    "but", "by",
    "can", "can't", "come", "could", "couldn't",
    "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down",
    "during",
    "each",
    "few", "for", "from", "further",
    "get", "got",
    "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he",
    "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself",
    "him", "himself", "his", "how", "how's",
    "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't",
    "it", "it's", "its", "itself",
    "just",
    "let", "let's", "like",
    "made", "me", "more", "most", "mustn't", "my", "myself",
    "no", "nor", "not", "now",
    "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours",
    "ourselves", "out", "over", "own",
    "same", "say", "see", "she", "she'd", "she'll", "she's", "should",
    "shouldn't", "so", "some", "such",
    "than", "that", "that's", "the", "their", "theirs", "them", "themselves",
    "then", "there", "there's", "these", "they", "they'd", "they'll",
    "they're", "they've", "this", "those", "through", "to", "too",
    "under", "until", "up",
    "very",
    "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were",
    "weren't", "what", "what's", "when", "when's", "where", "where's",
    "which", "while", "who", "who's", "whom", "why", "why's", "with",
    "won't", "would", "wouldn't",
    "you", "you'd", "you'll", "you're", "you've", "your", "yours",
    "yourself", "yourselves",
    # --- extra tokens; presumably noise specific to the chat exports this
    # tool is fed (timestamps, phone prefixes, placeholders) - TODO confirm ---
    "-", "+91", "pm", "am", "cse", "nit", "sec", "b", "SECTION", "section",
    "media", "omitted", "message", "deleted", "added",
]
|
|
# Patterns are compiled once at import time instead of on every call, since
# callers may invoke clean_text for each individual word.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # miscellaneous symbols and pictographs
    "\U0001F680-\U0001F6FF"  # transport and map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # geometric shapes extended
    "\U0001F800-\U0001F8FF"  # supplemental arrows-C
    "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    "\U0001FA00-\U0001FA6F"  # chess symbols
    "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
    "\U00002702-\U000027B0"  # dingbats
    # NOTE(review): this last range is very wide and also spans CJK
    # ideographs (U+4E00-U+9FFF), so such text is stripped as well. It is
    # kept unchanged here to preserve existing behaviour.
    "\U000024C2-\U0001F251"
    "]",
    flags=re.UNICODE,
)
_MENTION_PATTERN = re.compile(r'@\d+')
_ANGLE_BRACKETS_PATTERN = re.compile(r'<.*?>')
_SPECIAL_CHAR_PATTERN = re.compile(r'[^\w\s]')


def clean_text(text):
    """Return *text* lower-cased with emoji, mentions, tags and punctuation removed.

    Removal steps, in order:

    1. characters matched by the emoji ranges above;
    2. numeric mentions of the form ``@<digits>``;
    3. anything enclosed in angle brackets (``<...>``, shortest match);
    4. every remaining character that is neither a word character nor
       whitespace.

    Step 3 must run before step 4: stripping punctuation first deletes the
    ``<`` and ``>`` delimiters, after which the angle-bracket pattern can
    never match and the bracketed content leaks into the result.

    Args:
        text: The string to normalise.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    text = _EMOJI_PATTERN.sub('', text)
    text = _MENTION_PATTERN.sub('', text)
    text = _ANGLE_BRACKETS_PATTERN.sub('', text)
    text = _SPECIAL_CHAR_PATTERN.sub('', text)
    return text.lower()
|
|
|
|
def _read_chat_lines(zip_path):
    """Return the stripped text lines of the first usable member of the archive."""
    with zipfile.ZipFile(zip_path, 'r') as archive:
        # Directory entries end with "/" and cannot be read as text.
        members = [name for name in archive.namelist() if not name.endswith('/')]
        if not members:
            return []
        # Prefer a .txt member so that an archive bundling other files next to
        # the text file still works; otherwise fall back to the first file.
        target = next(
            (name for name in members if name.lower().endswith('.txt')),
            members[0],
        )
        with archive.open(target) as handle:
            # errors="replace" keeps one bad byte from aborting the whole run.
            return [raw.decode('utf-8', errors='replace').strip() for raw in handle]


def _collect_words(lines):
    """Return the cleaned, non-stop-word, non-numeric words found in *lines*."""
    ignored = {entry.lower() for entry in stop_words}
    edge_punctuation = ".,!?;:\"'()[]{}*_~"
    words = []
    for line in lines:
        # Only the part after the first ":" is treated as message text.
        # NOTE(review): if the line starts with a timestamp such as "10:15",
        # the first colon is inside the time, so the sender name ends up in
        # the message part too - confirm against the expected export format.
        _, separator, message = line.partition(':')
        if not separator:
            continue
        for token in message.split():
            # Check the token *before* punctuation is stripped as well:
            # clean_text turns "don't" into "dont", which would never match
            # the contraction entries in stop_words.
            bare = token.lower().replace('\u2019', "'").strip(edge_punctuation)
            if bare in ignored:
                continue
            word = clean_text(token)
            if word and word not in ignored and not word.isnumeric():
                words.append(word)
    return words


def generate_wordcloud(zip_file):
    """Build a word-cloud image from the text file inside an uploaded zip.

    Args:
        zip_file: The uploaded archive, given either as a path
            (``str`` / ``os.PathLike``) or as an object whose ``name``
            attribute holds the path. ``None`` (nothing uploaded) is
            tolerated.

    Returns:
        ``"output/output.png"`` after the image has been written there, or
        the string ``"No valid words found"`` when no usable word could be
        extracted.

    Raises:
        zipfile.BadZipFile: If the upload is not a valid zip archive.
    """
    # NOTE(review): the fallback string is returned for backward
    # compatibility, but the interface declares an image output that expects
    # a file path; raising gr.Error would probably surface the message more
    # cleanly - confirm before changing.
    if zip_file is None:
        return "No valid words found"

    if isinstance(zip_file, (str, os.PathLike)):
        zip_path = zip_file
    else:
        zip_path = zip_file.name

    word_counts = Counter(_collect_words(_read_chat_lines(zip_path)))
    if not word_counts:
        return "No valid words found"

    # Scale every count relative to the most frequent word (range (0, 1]).
    highest_count = max(word_counts.values())
    frequencies = {
        word: count / highest_count for word, count in word_counts.items()
    }

    os.makedirs('output', exist_ok=True)
    wordcloud = WordCloud(
        width=800, height=400, background_color='white'
    ).generate_from_frequencies(frequencies)
    output_path = "output/output.png"
    wordcloud.to_file(output_path)
    return output_path
|
|
|
|
# Upload widget that feeds the archive to generate_wordcloud.
_zip_input = gr.File(label="Upload a zip file")

# Image widget; type="filepath" means the callback's return value is a path.
_cloud_output = gr.Image(type="filepath", label="Generated Word Cloud")

# Wire the callback and both widgets into a single-page interface.
iface = gr.Interface(
    fn=generate_wordcloud,
    inputs=_zip_input,
    outputs=_cloud_output,
    title="Word Cloud Generator",
    description="Upload a zip file containing a text file. The most frequent words will be displayed in a word cloud.",
)

# Start the web server as soon as the module is executed.
iface.launch()