YashsharmaPhD committed on
Commit
fbf137c
·
verified ·
1 Parent(s): 6488cb1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -0
app.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import zipfile
4
+ import pandas as pd
5
+ import nltk
6
+ from nltk.tokenize import word_tokenize
7
+ from nltk.util import ngrams
8
+ from nltk.corpus import stopwords
9
+ from sentence_transformers import SentenceTransformer
10
+ from sklearn.manifold import TSNE
11
+ import plotly.express as px
12
+ import gradio as gr
13
+ import tempfile
14
+
15
# Download NLTK assets.
# 'punkt' is what older NLTK releases use for word_tokenize; newer releases
# look up 'punkt_tab' instead and raise LookupError when it is missing, so
# both are requested.  NOTE(review): on an NLTK version that does not know
# 'punkt_tab' the extra download is expected to just report failure and
# return False -- confirm against the pinned NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# English stop words as a set for O(1) membership tests while filtering tokens.
stop_words = set(stopwords.words('english'))

# Global variable: holds the bigram table produced by the latest analysis so
# that the selection callback can read it.
embed_df = pd.DataFrame()
22
+
23
def analyze_bigrams(zip_file, perplexity):
    """Plot a 2-D t-SNE map of the most frequent bigrams in a ZIP of texts.

    The archive is unpacked into a temporary directory, every ``.txt`` file
    (at any folder depth) is lower-cased and tokenized, alphabetic
    non-stop-word tokens are paired into bigrams, and the 100 most frequent
    bigrams are embedded and projected to two dimensions.  The resulting
    table is stored in the module-level ``embed_df``.

    Args:
        zip_file: The uploaded archive, either an object exposing the path
            as ``.name`` or a plain path string; ``None`` when nothing was
            uploaded.
        perplexity: Requested t-SNE perplexity.  ``None`` falls back to 30,
            and the value is clamped to ``[1, n_bigrams - 1]`` because t-SNE
            rejects a perplexity that is not smaller than the sample count.

    Returns:
        A ``(status_message, figure)`` tuple; ``figure`` is ``None`` whenever
        the analysis could not be carried out.
    """
    # Local import keeps the block self-contained without touching the
    # file-level import list.
    from collections import Counter

    global embed_df
    if zip_file is None:
        return "Please upload a ZIP file containing .txt files.", None

    # Accept both an object carrying the path in ``.name`` (what the original
    # code expected) and a bare path string.
    zip_path = getattr(zip_file, "name", zip_file)

    # Extract uploaded zip to a temporary directory
    with tempfile.TemporaryDirectory() as tmpdir:
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(tmpdir)
        except zipfile.BadZipFile:
            return "The uploaded file is not a valid ZIP archive.", None

        # Gather all .txt files, including those inside sub-folders of the
        # archive; sorting makes tie-breaking among equally frequent bigrams
        # independent of directory listing order.
        txt_files = sorted(
            os.path.join(root, name)
            for root, _dirs, names in os.walk(tmpdir)
            for name in names
            if name.lower().endswith(".txt")
        )
        if not txt_files:
            return "No .txt files found in the ZIP file.", None

        all_texts = []
        for file_path in txt_files:
            # errors="replace": one badly encoded file should not abort the
            # whole analysis; replacement characters are dropped later by
            # the isalpha() filter.
            with open(file_path, "r", encoding="utf-8", errors="replace") as file:
                all_texts.append(file.read().lower())

    bigram_counter = Counter()
    for text in all_texts:
        tokens = [
            word for word in word_tokenize(text)
            if word.isalpha() and word not in stop_words
        ]
        bigram_counter.update(' '.join(bg) for bg in ngrams(tokens, 2))

    top_bigrams = bigram_counter.most_common(100)
    if len(top_bigrams) < 2:
        # t-SNE needs at least two points (perplexity must stay below the
        # number of samples).
        return "Not enough bigrams found to build a plot.", None

    bigram_texts = [phrase for phrase, _ in top_bigrams]
    counts = [count for _, count in top_bigrams]

    # Load the embedding model once and reuse it on later calls instead of
    # re-initialising it on every button click.
    model = getattr(analyze_bigrams, "_model", None)
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')
        analyze_bigrams._model = model
    embeddings = model.encode(bigram_texts)

    requested = 30 if perplexity is None else int(perplexity)
    effective_perplexity = max(1, min(requested, len(bigram_texts) - 1))
    tsne = TSNE(n_components=2, perplexity=effective_perplexity, random_state=42)
    tsne_results = tsne.fit_transform(embeddings)

    embed_df = pd.DataFrame({
        'bigram': bigram_texts,
        'count': counts,
        'tsne_1': tsne_results[:, 0],
        'tsne_2': tsne_results[:, 1],
    })

    fig = px.scatter(embed_df, x='tsne_1', y='tsne_2', hover_name='bigram',
                     size='count', title="Bigram Embeddings", template='plotly_white')
    fig.update_layout(dragmode='lasso')

    return "Bigram analysis complete. Select points on the plot below.", fig
73
+
74
def generate_bar_plot(selected_indices=None):
    """Build a horizontal bar chart of counts for the selected bigrams.

    Args:
        selected_indices: Iterable of positional row indices into the
            module-level ``embed_df``.  Defaults to ``None`` so that an event
            wired without inputs does not fail with a missing-argument
            ``TypeError``.  Indices outside the table are ignored.

    Returns:
        A Plotly bar figure sorted by descending count, or ``None`` when no
        analysis has been run yet or no valid row is selected.
    """
    # Read-only access to the module-level table; nothing is rebound here.
    if embed_df.empty or selected_indices is None:
        return None

    n_rows = len(embed_df)
    # list(...) also copes with array-like selections whose truth value
    # would be ambiguous; the range filter prevents IndexError from iloc
    # (negative positions remain valid, as they are for iloc).
    valid_indices = [i for i in list(selected_indices) if -n_rows <= i < n_rows]
    if not valid_indices:
        return None

    selected_df = embed_df.iloc[valid_indices]
    fig = px.bar(selected_df.sort_values("count", ascending=False),
                 x="count", y="bigram", orientation="h",
                 title="Selected Bigram Frequencies")
    return fig
83
+
84
# Gradio front end: upload a ZIP, choose a perplexity, render the bigram
# scatter plot, and show a bar chart for points selected on it.
with gr.Blocks() as demo:
    gr.Markdown("## 📦 Upload a ZIP of .txt files to Analyze Bigrams")

    # --- inputs ---
    # NOTE(review): type="file" hands the callback an object with a `.name`
    # path; newer Gradio releases may expect "filepath" instead -- confirm
    # against the installed version.
    zip_input = gr.File(
        label="Upload ZIP File of .txt Files",
        type="file",
    )
    perplexity_input = gr.Number(
        label="t-SNE Perplexity",
        value=30,
    )

    # --- trigger and outputs (creation order is the on-page order) ---
    generate_btn = gr.Button("Generate Scatter Plot")
    status_output = gr.Label()
    scatter_plot = gr.Plot()
    bar_plot = gr.Plot()

    # Button click runs the full analysis and fills status + scatter plot.
    generate_btn.click(
        fn=analyze_bigrams,
        inputs=[zip_input, perplexity_input],
        outputs=[status_output, scatter_plot],
    )

    # Selecting on the scatter plot refreshes the bar chart.
    # NOTE(review): no inputs are passed to the callback here -- confirm how
    # the selected point indices are meant to reach generate_bar_plot.
    scatter_plot.select(
        fn=generate_bar_plot,
        inputs=[],
        outputs=bar_plot,
    )

demo.launch()