Spaces:

YashsharmaPhD
/

Biagram_analysis_NLP_data

Runtime error

App Files Files Community

YashsharmaPhD committed on May 9, 2025

Commit

fbf137c

verified ·

1 Parent(s): 6488cb1

Create app.py

Browse files

Files changed (1) hide show

app.py +103 -0

app.py ADDED Viewed

	@@ -0,0 +1,103 @@

+import os
+import shutil
+import zipfile
+import pandas as pd
+import nltk
+from nltk.tokenize import word_tokenize
+from nltk.util import ngrams
+from nltk.corpus import stopwords
+from sentence_transformers import SentenceTransformer
+from sklearn.manifold import TSNE
+import plotly.express as px
+import gradio as gr
+import tempfile
+# Download NLTK assets
+nltk.download('punkt')
+nltk.download('stopwords')
+stop_words = set(stopwords.words('english'))
+# Global variable
+embed_df = pd.DataFrame()
+def analyze_bigrams(zip_file, perplexity):
+    global embed_df
+    if zip_file is None:
+        return "Please upload a ZIP file containing .txt files.", None
+    # Extract uploaded zip to a temporary directory
+    with tempfile.TemporaryDirectory() as tmpdir:
+        with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
+            zip_ref.extractall(tmpdir)
+        # Gather all .txt files
+        txt_files = [os.path.join(tmpdir, f) for f in os.listdir(tmpdir) if f.endswith(".txt")]
+        if not txt_files:
+            return "No .txt files found in the ZIP file.", None
+        all_texts = []
+        for file_path in txt_files:
+            with open(file_path, "r", encoding="utf-8") as file:
+                all_texts.append(file.read().lower())
+        bigram_counter = {}
+        for text in all_texts:
+            tokens = [word for word in word_tokenize(text) if word.isalpha() and word not in stop_words]
+            bigrams = ngrams(tokens, 2)
+            for bg in bigrams:
+                phrase = ' '.join(bg)
+                bigram_counter[phrase] = bigram_counter.get(phrase, 0) + 1
+        top_bigrams = sorted(bigram_counter.items(), key=lambda x: x[1], reverse=True)[:100]
+        bigram_texts = [item[0] for item in top_bigrams]
+        counts = [item[1] for item in top_bigrams]
+        model = SentenceTransformer('all-MiniLM-L6-v2')
+        embeddings = model.encode(bigram_texts)
+        tsne = TSNE(n_components=2, perplexity=int(perplexity), random_state=42)
+        tsne_results = tsne.fit_transform(embeddings)
+        embed_df = pd.DataFrame({
+            'bigram': bigram_texts,
+            'count': counts,
+            'tsne_1': tsne_results[:, 0],
+            'tsne_2': tsne_results[:, 1]
+        })
+        fig = px.scatter(embed_df, x='tsne_1', y='tsne_2', hover_name='bigram',
+                         size='count', title="Bigram Embeddings", template='plotly_white')
+        fig.update_layout(dragmode='lasso')
+        return "Bigram analysis complete. Select points on the plot below.", fig
+def generate_bar_plot(selected_indices):
+    global embed_df
+    if not embed_df.empty and selected_indices:
+        selected_df = embed_df.iloc[selected_indices]
+        fig = px.bar(selected_df.sort_values("count", ascending=False),
+                     x="count", y="bigram", orientation="h",
+                     title="Selected Bigram Frequencies")
+        return fig
+    return None
+with gr.Blocks() as demo:
+    gr.Markdown("## 📦 Upload a ZIP of .txt files to Analyze Bigrams")
+    zip_input = gr.File(label="Upload ZIP File of .txt Files", type="file")
+    perplexity_input = gr.Number(label="t-SNE Perplexity", value=30)
+    generate_btn = gr.Button("Generate Scatter Plot")
+    status_output = gr.Label()
+    scatter_plot = gr.Plot()
+    bar_plot = gr.Plot()
+    generate_btn.click(fn=analyze_bigrams,
+                       inputs=[zip_input, perplexity_input],
+                       outputs=[status_output, scatter_plot])
+    scatter_plot.select(fn=generate_bar_plot,
+                        inputs=[],
+                        outputs=bar_plot)
+demo.launch()