"""Gradio app that projects the most frequent bigrams of a text corpus with t-SNE.

The user uploads a ZIP archive of ``.txt`` files. The app counts stop-word
filtered bigrams, embeds the 100 most frequent ones with a sentence-transformer
model, projects the embeddings to 2-D with t-SNE and shows them in a Plotly
scatter plot. A second plot is meant to show the frequencies of the points
selected in the scatter plot.
"""
import functools
import io
import os
import tempfile
import warnings
import zipfile
from collections import Counter

import gradio as gr
import nltk
import pandas as pd
import plotly.express as px
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE

# Download NLTK resources.
nltk.download('punkt')
# Recent NLTK releases make word_tokenize look up 'punkt_tab' instead of
# 'punkt'. Requesting both keeps the script working on old and new versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

# Result of the most recent analysis. It is module-level state shared by both
# callbacks, so every session of the app sees the same data frame.
embed_df = pd.DataFrame()

_TOP_BIGRAM_LIMIT = 100
_DEFAULT_PERPLEXITY = 30


@functools.lru_cache(maxsize=1)
def _get_model():
    """Load the sentence-embedding model once and reuse it across requests."""
    return SentenceTransformer("all-MiniLM-L6-v2")


def _read_texts_from_zip(zip_bytes):
    """Return the lower-cased contents of every ``.txt`` file in the archive.

    Args:
        zip_bytes: Raw bytes of the uploaded ZIP archive.

    Returns:
        A list with one lower-cased string per ``.txt`` file found anywhere in
        the archive (sub-folders included), in sorted traversal order.

    Raises:
        zipfile.BadZipFile: If ``zip_bytes`` is not a valid ZIP archive.
    """
    texts = []
    with tempfile.TemporaryDirectory() as tmpdir:
        with zipfile.ZipFile(io.BytesIO(zip_bytes), 'r') as zip_ref:
            zip_ref.extractall(tmpdir)

        # Zipping a folder nests the files one level down, so walk the whole
        # tree instead of listing only the top-level directory.
        for root, dirs, files in os.walk(tmpdir):
            # '__MACOSX' holds binary resource-fork stubs added by macOS
            # archivers; they are not real text files.
            dirs[:] = sorted(d for d in dirs if d != "__MACOSX")
            for name in sorted(files):
                if not name.lower().endswith(".txt"):
                    continue
                path = os.path.join(root, name)
                # errors="replace" keeps one badly encoded file from aborting
                # the whole analysis; replacement characters are dropped later
                # by the isalpha() token filter.
                with open(path, "r", encoding="utf-8", errors="replace") as f:
                    texts.append(f.read().lower())
    return texts


def _top_bigrams(texts, limit=_TOP_BIGRAM_LIMIT):
    """Return the ``limit`` most frequent bigrams as ``(phrase, count)`` pairs.

    Tokens that are not purely alphabetic or that are English stop words are
    removed before the bigrams are formed.
    """
    counter = Counter()
    for text in texts:
        tokens = [
            w for w in word_tokenize(text)
            if w.isalpha() and w not in stop_words
        ]
        counter.update(' '.join(bg) for bg in ngrams(tokens, 2))
    return counter.most_common(limit)


def analyze_bigrams(zip_file, perplexity):
    """Build the bigram t-SNE scatter plot for an uploaded ZIP archive.

    Args:
        zip_file: Raw bytes of the uploaded ZIP archive, or ``None`` when
            nothing was uploaded.
        perplexity: Requested t-SNE perplexity. ``None`` falls back to the
            default, and the value is clamped so that it stays positive and
            strictly below the number of bigrams being projected.

    Returns:
        A ``(status_message, figure)`` tuple. ``figure`` is ``None`` whenever
        the analysis could not be carried out.

    Side effects:
        Replaces the module-level ``embed_df`` with the new projection.
    """
    global embed_df

    if zip_file is None:
        return "Please upload a ZIP file containing .txt files.", None

    try:
        all_texts = _read_texts_from_zip(zip_file)
    except zipfile.BadZipFile:
        return "The uploaded file is not a valid ZIP archive.", None

    if not all_texts:
        return "No .txt files found.", None

    top_bigrams = _top_bigrams(all_texts)
    # A 2-D projection needs at least two points, and t-SNE requires the
    # perplexity to be smaller than the number of samples.
    if len(top_bigrams) < 2:
        return "Not enough bigrams found to build a projection.", None

    bigram_texts = [phrase for phrase, _ in top_bigrams]
    counts = [count for _, count in top_bigrams]

    embeddings = _get_model().encode(bigram_texts)

    requested = _DEFAULT_PERPLEXITY if perplexity is None else int(perplexity)
    effective_perplexity = max(1, min(requested, len(bigram_texts) - 1))

    tsne = TSNE(n_components=2, perplexity=effective_perplexity, random_state=42)
    tsne_results = tsne.fit_transform(embeddings)

    embed_df = pd.DataFrame({
        "bigram": bigram_texts,
        "count": counts,
        "tsne_1": tsne_results[:, 0],
        "tsne_2": tsne_results[:, 1],
    })

    fig = px.scatter(
        embed_df,
        x="tsne_1",
        y="tsne_2",
        hover_name="bigram",
        size="count",
        template="plotly_white",
        title="Bigram t-SNE Projection",
    )
    fig.update_layout(dragmode="lasso")
    return "✅ Bigram analysis complete. Use lasso to select points.", fig


def generate_bar_plot(events):
    """Return a horizontal bar chart of the counts of the selected bigrams.

    Args:
        events: Sequence of selection records, each a mapping with a
            ``"pointIndex"`` entry giving the row position in ``embed_df``.

    Returns:
        A Plotly figure, or ``None`` when nothing is selected or no analysis
        has been run yet.
    """
    global embed_df

    if not events or embed_df.empty:
        return None

    selected_indices = [pt["pointIndex"] for pt in events]
    selected_df = embed_df.iloc[selected_indices]
    fig = px.bar(
        selected_df.sort_values("count", ascending=False),
        x="count",
        y="bigram",
        orientation="h",
        title="Selected Bigram Frequencies",
    )
    return fig


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## 📦 Upload ZIP of .txt files to Analyze Bigrams")
    zip_input = gr.File(label="Upload ZIP File of .txt Files", type="binary")
    perplexity_input = gr.Number(label="t-SNE Perplexity", value=_DEFAULT_PERPLEXITY)
    analyze_btn = gr.Button("Analyze")
    status = gr.Label()
    scatter_plot = gr.Plot()
    bar_plot = gr.Plot()

    analyze_btn.click(
        analyze_bigrams,
        inputs=[zip_input, perplexity_input],
        outputs=[status, scatter_plot],
    )

    # NOTE(review): `gr.plotly_events` does not appear in Gradio's public API
    # reference - confirm which package was intended. The lookup is guarded so
    # a missing attribute no longer prevents the app from starting; when the
    # attribute exists the call is made exactly as before.
    _plotly_events = getattr(gr, "plotly_events", None)
    if _plotly_events is not None:
        _plotly_events(scatter_plot, select_event=True)(
            generate_bar_plot,
            inputs=None,
            outputs=bar_plot,
        )
    else:
        warnings.warn(
            "gradio has no 'plotly_events'; lasso selection will not update "
            "the bar plot.",
            RuntimeWarning,
        )

demo.launch(share=True)