Spaces:
Runtime error
Runtime error
File size: 3,522 Bytes
fbf137c 45cf4ac fbf137c 45cf4ac fbf137c 45cf4ac fbf137c 45cf4ac fbf137c 45cf4ac fbf137c 45cf4ac fbf137c 53ed032 fbf137c 45cf4ac fbf137c 45cf4ac fbf137c 45cf4ac fbf137c 45cf4ac fbf137c 45cf4ac fbf137c 45cf4ac fbf137c 53ed032 fbf137c 53ed032 45cf4ac 53ed032 0f7f6fa fbf137c 53ed032 45cf4ac 53ed032 45cf4ac fbf137c 53ed032 fbf137c 0f2ab56 fbf137c 0f7f6fa 53ed032 45cf4ac fbf137c 45cf4ac fbf137c 8e26ef9 fbf137c 45cf4ac |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
import io
import os
import tempfile
import zipfile
from collections import Counter

import gradio as gr
import nltk
import pandas as pd
import plotly.express as px
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
# Download the NLTK resources used below: the tokenizer data for
# ``word_tokenize`` and the English stop-word list.
# Newer NLTK releases load the tokenizer from "punkt_tab" instead of the
# pickled "punkt" model, so both are requested; whichever one the installed
# NLTK does not need is simply unused.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Set (not list) so the per-token membership test in the bigram filter is O(1).
stop_words = set(stopwords.words('english'))

# Result of the most recent analysis. Written by ``analyze_bigrams`` and read
# by ``generate_bar_plot``; it is a module-level global, so it is shared by
# every caller of those two callbacks.
embed_df = pd.DataFrame()
# How many of the most frequent bigrams are embedded and plotted.
_TOP_BIGRAM_LIMIT = 100
# Perplexity used when the UI field is empty (same value the UI starts with).
_DEFAULT_PERPLEXITY = 30
_EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
# Loaded models keyed by name, so repeated clicks do not reload the weights.
_embedding_models = {}


def _get_embedding_model(name=_EMBEDDING_MODEL_NAME):
    """Return the SentenceTransformer called *name*, loading it only once."""
    if name not in _embedding_models:
        _embedding_models[name] = SentenceTransformer(name)
    return _embedding_models[name]


def _read_txt_members(zip_bytes):
    """Return the lower-cased text of every ``.txt`` member of a ZIP archive.

    Members are read straight from the in-memory archive, so files inside
    sub-folders are found too and nothing is written to disk.

    Args:
        zip_bytes: Raw bytes of the uploaded archive.

    Returns:
        A list with one lower-cased string per ``.txt`` member, in archive
        order.

    Raises:
        zipfile.BadZipFile: If *zip_bytes* is not a ZIP archive.
    """
    texts = []
    with zipfile.ZipFile(io.BytesIO(zip_bytes), "r") as archive:
        for info in archive.infolist():
            member = info.filename
            if info.is_dir() or not member.lower().endswith(".txt"):
                continue
            # macOS archives carry binary "._name" resource forks (usually
            # under __MACOSX/) that share the .txt suffix but are not text.
            base_name = member.rsplit("/", 1)[-1]
            if member.startswith("__MACOSX/") or base_name.startswith("._"):
                continue
            # errors="replace": one stray byte should not abort the whole
            # analysis; the replacement character is dropped later by the
            # isalpha() token filter.
            raw = archive.read(info)
            texts.append(raw.decode("utf-8", errors="replace").lower())
    return texts


def _count_bigrams(texts):
    """Count adjacent word pairs over *texts*, ignoring stop words and non-alphabetic tokens."""
    tally = Counter()
    for text in texts:
        tokens = [
            word for word in word_tokenize(text)
            if word.isalpha() and word not in stop_words
        ]
        tally.update(" ".join(pair) for pair in ngrams(tokens, 2))
    return tally


def analyze_bigrams(zip_file, perplexity):
    """Extract the most frequent bigrams from a ZIP of text files and plot them.

    The top bigrams are embedded with a sentence-transformer model, projected
    to two dimensions with t-SNE and drawn as a scatter plot whose marker size
    is the bigram count. The resulting table is also stored in the module
    global ``embed_df``.

    Args:
        zip_file: Raw bytes of the uploaded ZIP archive, or ``None`` when
            nothing was uploaded.
        perplexity: Requested t-SNE perplexity. It is truncated to an int and
            clamped to ``[1, number_of_bigrams - 1]`` because scikit-learn
            rejects a perplexity that is not smaller than the sample count.
            ``None`` falls back to 30.

    Returns:
        A ``(status_message, figure)`` tuple. ``figure`` is ``None`` whenever
        the analysis could not be carried out; ``embed_df`` is left untouched
        in that case.
    """
    global embed_df
    if zip_file is None:
        return "Please upload a ZIP file containing .txt files.", None

    try:
        all_texts = _read_txt_members(zip_file)
    except zipfile.BadZipFile:
        return "The uploaded file is not a valid ZIP archive.", None
    if not all_texts:
        return "No .txt files found.", None

    top_bigrams = _count_bigrams(all_texts).most_common(_TOP_BIGRAM_LIMIT)
    # t-SNE needs at least two points to lay out.
    if len(top_bigrams) < 2:
        return "Not enough text to extract bigrams.", None
    bigram_texts = [phrase for phrase, _ in top_bigrams]
    counts = [count for _, count in top_bigrams]

    embeddings = _get_embedding_model().encode(bigram_texts)

    requested = _DEFAULT_PERPLEXITY if perplexity is None else int(perplexity)
    safe_perplexity = max(1, min(requested, len(bigram_texts) - 1))
    tsne = TSNE(n_components=2, perplexity=safe_perplexity, random_state=42)
    tsne_results = tsne.fit_transform(embeddings)

    embed_df = pd.DataFrame({
        "bigram": bigram_texts,
        "count": counts,
        "tsne_1": tsne_results[:, 0],
        "tsne_2": tsne_results[:, 1],
    })
    fig = px.scatter(
        embed_df,
        x="tsne_1",
        y="tsne_2",
        hover_name="bigram",
        size="count",
        template="plotly_white",
        title="Bigram t-SNE Projection",
    )
    fig.update_layout(dragmode="lasso")
    return "✅ Bigram analysis complete. Use lasso to select points.", fig
def generate_bar_plot(events):
    """Build a horizontal bar chart of counts for the selected bigrams.

    Args:
        events: Sequence of selection records; each one is indexed with
            ``"pointIndex"`` to obtain a row position in ``embed_df``.

    Returns:
        A Plotly bar figure with the selected rows sorted by descending
        count, or ``None`` when nothing is selected or no analysis result
        is stored in ``embed_df`` yet.
    """
    # The selection is checked first so an empty selection returns without
    # consulting the stored table at all.
    if not events:
        return None
    if embed_df.empty:
        return None

    row_positions = [point["pointIndex"] for point in events]
    ranked = embed_df.iloc[row_positions].sort_values("count", ascending=False)
    return px.bar(
        ranked,
        x="count",
        y="bigram",
        orientation="h",
        title="Selected Bigram Frequencies",
    )
# Gradio UI: upload + perplexity controls, a status label, the t-SNE scatter
# plot and a bar chart for the points selected in the scatter plot.
with gr.Blocks() as demo:
    gr.Markdown("## 📦 Upload ZIP of .txt files to Analyze Bigrams")
    # type="binary" hands the callback the raw bytes of the upload.
    zip_input = gr.File(label="Upload ZIP File of .txt Files", type="binary")
    perplexity_input = gr.Number(label="t-SNE Perplexity", value=30)
    analyze_btn = gr.Button("Analyze")
    status = gr.Label()
    scatter_plot = gr.Plot()
    bar_plot = gr.Plot()

    analyze_btn.click(
        analyze_bigrams,
        inputs=[zip_input, perplexity_input],
        outputs=[status, scatter_plot],
    )

    # NOTE(review): `gr.plotly_events` does not appear in Gradio's public API;
    # referencing it unconditionally raises AttributeError while the module is
    # being imported, so the app never reaches `demo.launch`. Look it up
    # defensively: the original wiring is kept when the attribute exists,
    # otherwise the app still starts, just without selection support.
    plotly_events = getattr(gr, "plotly_events", None)
    if plotly_events is not None:
        plotly_events(scatter_plot, select_event=True)(
            generate_bar_plot,
            inputs=None,
            outputs=bar_plot,
        )
    else:
        gr.Markdown(
            "⚠️ Lasso-selection events are not available in this Gradio "
            "version, so the bar chart of selected bigrams is disabled."
        )

demo.launch(share=True)
|