# YashsharmaPhD's picture
# Update app.py
# 8e26ef9 verified
import io
import os
import tempfile
import zipfile
from collections import Counter

import gradio as gr
import nltk
import pandas as pd
import plotly.express as px
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
# Download the NLTK resources this module relies on.
# `word_tokenize` loads the `punkt_tab` tables on NLTK 3.9 and later, while
# older releases load the pickled `punkt` models, so both are requested.
# `stopwords` backs the English stop-word filter built just below.
for _nltk_resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(_nltk_resource)

# English stop words, as a set for O(1) membership tests while filtering tokens.
stop_words = set(stopwords.words('english'))

# Result of the most recent analysis; written by analyze_bigrams and read by
# generate_bar_plot. Starts empty so "no analysis yet" is detectable.
embed_df = pd.DataFrame()
# Number of most frequent bigrams that are embedded and projected.
_TOP_BIGRAM_COUNT = 100

# Sentence-embedding model, created on first use and then reused so that
# every click on "Analyze" does not reload the weights.
_embedding_model = None


def _load_embedding_model():
    """Return the shared SentenceTransformer, creating it on first use."""
    global _embedding_model
    if _embedding_model is None:
        _embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    return _embedding_model


def _read_txt_members(zip_bytes):
    """Return the lower-cased text of every ``.txt`` member of a ZIP archive.

    Members are read straight from the archive (nothing is extracted to
    disk), at any folder depth. Raises ``zipfile.BadZipFile`` when
    ``zip_bytes`` is not a ZIP archive.
    """
    texts = []
    with zipfile.ZipFile(io.BytesIO(zip_bytes), "r") as archive:
        for info in archive.infolist():
            name = info.filename
            if info.is_dir() or not name.lower().endswith(".txt"):
                continue
            # Archives created on macOS carry AppleDouble metadata entries
            # ("__MACOSX/..." and "._name"); they are binary, not user text.
            base_name = name.rsplit("/", 1)[-1]
            if name.startswith("__MACOSX/") or base_name.startswith("._"):
                continue
            # "utf-8-sig" drops a leading BOM; errors="replace" keeps one
            # badly encoded file from aborting the whole analysis.
            raw = archive.read(info)
            texts.append(raw.decode("utf-8-sig", errors="replace").lower())
    return texts


def _count_bigrams(texts):
    """Count adjacent word pairs over ``texts``, ignoring stop words and non-alphabetic tokens."""
    bigram_counts = Counter()
    for text in texts:
        tokens = [w for w in word_tokenize(text) if w.isalpha() and w not in stop_words]
        bigram_counts.update(" ".join(pair) for pair in ngrams(tokens, 2))
    return bigram_counts


def analyze_bigrams(zip_file, perplexity):
    """Embed the most frequent bigrams of the uploaded texts and project them with t-SNE.

    Args:
        zip_file: Raw bytes of a ZIP archive containing ``.txt`` files, or
            ``None`` when nothing was uploaded.
        perplexity: Requested t-SNE perplexity. It is truncated to an int and
            clamped to the range ``1 .. n_bigrams - 1`` because t-SNE requires
            the perplexity to be smaller than the number of samples.

    Returns:
        A ``(status_message, figure)`` tuple. ``figure`` is a Plotly scatter
        plot on success and ``None`` whenever the analysis could not be run.

    Side effects:
        On success, replaces the module-level ``embed_df`` with a DataFrame
        holding the columns ``bigram``, ``count``, ``tsne_1`` and ``tsne_2``.
    """
    global embed_df

    if zip_file is None:
        return "Please upload a ZIP file containing .txt files.", None

    # A cleared number box arrives as None; NaN/inf cannot be converted either.
    try:
        requested_perplexity = int(perplexity)
    except (TypeError, ValueError, OverflowError):
        return "Perplexity must be a number.", None

    try:
        all_texts = _read_txt_members(zip_file)
    except zipfile.BadZipFile:
        return "The uploaded file is not a valid ZIP archive.", None
    if not all_texts:
        return "No .txt files found.", None

    # most_common(k) matches sorting by count (descending) and slicing;
    # ties keep first-seen order.
    top_bigrams = _count_bigrams(all_texts).most_common(_TOP_BIGRAM_COUNT)
    if len(top_bigrams) < 2:
        # A 2-D projection of fewer than two points is meaningless and the
        # perplexity constraint below could not be satisfied.
        return "Not enough bigrams found to build a projection.", None

    bigram_texts = [phrase for phrase, _ in top_bigrams]
    counts = [count for _, count in top_bigrams]

    embeddings = _load_embedding_model().encode(bigram_texts)

    # Small corpora yield fewer bigrams than the default perplexity of 30,
    # which would otherwise make t-SNE raise.
    effective_perplexity = max(1, min(requested_perplexity, len(bigram_texts) - 1))
    tsne = TSNE(n_components=2, perplexity=effective_perplexity, random_state=42)
    tsne_results = tsne.fit_transform(embeddings)

    embed_df = pd.DataFrame({
        "bigram": bigram_texts,
        "count": counts,
        "tsne_1": tsne_results[:, 0],
        "tsne_2": tsne_results[:, 1],
    })

    fig = px.scatter(
        embed_df,
        x="tsne_1",
        y="tsne_2",
        hover_name="bigram",
        size="count",
        template="plotly_white",
        title="Bigram t-SNE Projection",
    )
    fig.update_layout(dragmode="lasso")
    return "✅ Bigram analysis complete. Use lasso to select points.", fig
def generate_bar_plot(events=None):
    """Build a horizontal bar chart of the counts of the selected bigrams.

    Args:
        events: Iterable of selection points, each a dict carrying a
            ``"pointIndex"`` key that is a row position in ``embed_df``.
            ``None`` or an empty selection yields no chart. The default lets
            the function be invoked by an event wired with no inputs.

    Returns:
        A Plotly bar figure of the selected rows ordered by descending count,
        or ``None`` when there is no selection, no analysis has been run yet,
        or none of the points refers to an existing row.
    """
    if not events or embed_df.empty:
        return None

    # A selection made on an older plot may reference rows that no longer
    # exist after a re-analysis; skip those, and points without an index,
    # instead of letting iloc raise.
    row_count = len(embed_df)
    selected_indices = []
    for point in events:
        index = point.get("pointIndex") if isinstance(point, dict) else None
        if isinstance(index, int) and 0 <= index < row_count:
            selected_indices.append(index)
    if not selected_indices:
        return None

    selected_df = embed_df.iloc[selected_indices]
    fig = px.bar(
        selected_df.sort_values("count", ascending=False),
        x="count",
        y="bigram",
        orientation="h",
        title="Selected Bigram Frequencies",
    )
    return fig
# Gradio UI: upload + perplexity controls, a status label, the t-SNE scatter
# plot and a bar plot for the bigrams selected in the scatter plot.
with gr.Blocks() as demo:
    gr.Markdown("## 📦 Upload ZIP of .txt files to Analyze Bigrams")
    # type="binary" hands the uploaded archive to the handler as raw bytes.
    zip_input = gr.File(label="Upload ZIP File of .txt Files", type="binary")
    perplexity_input = gr.Number(label="t-SNE Perplexity", value=30)
    analyze_btn = gr.Button("Analyze")
    status = gr.Label()
    scatter_plot = gr.Plot()
    bar_plot = gr.Plot()

    analyze_btn.click(
        analyze_bigrams,
        inputs=[zip_input, perplexity_input],
        outputs=[status, scatter_plot],
    )

    # NOTE(review): `plotly_events` does not appear to be part of Gradio's
    # documented API. Looking it up defensively keeps a missing attribute from
    # raising at import time and taking the whole app down; when it is
    # available the wiring is exactly what it was before. Confirm which
    # mechanism is meant to deliver lasso selections to generate_bar_plot.
    plotly_events = getattr(gr, "plotly_events", None)
    if plotly_events is not None:
        plotly_events(scatter_plot, select_event=True)(
            generate_bar_plot,
            inputs=None,
            outputs=bar_plot,
        )
    else:
        print("gr.plotly_events is unavailable; the selection bar plot is disabled.")

demo.launch(share=True)