Spaces:
Runtime error
Runtime error
| import os | |
| import io | |
| import zipfile | |
| import tempfile | |
| import pandas as pd | |
| import nltk | |
| from nltk.tokenize import word_tokenize | |
| from nltk.util import ngrams | |
| from nltk.corpus import stopwords | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.manifold import TSNE | |
| import plotly.express as px | |
| import gradio as gr | |
# Download the NLTK data this module relies on at import time.
# "punkt" is the legacy tokenizer package; recent NLTK releases load the Punkt
# models from "punkt_tab" instead, so both are requested to keep
# word_tokenize() working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# English stop words, held in a set for O(1) membership tests while filtering.
stop_words = set(stopwords.words('english'))

# Results of the most recent analysis. analyze_bigrams() rebinds this global
# and generate_bar_plot() reads it to resolve selected points to rows.
embed_df = pd.DataFrame()
# Lazily created, process-wide embedding model. Loading it is expensive, so it
# is built on the first analysis and reused by every later one.
_EMBEDDING_MODEL = None


def _get_embedding_model():
    """Return the shared SentenceTransformer, loading it on first use."""
    global _EMBEDDING_MODEL
    if _EMBEDDING_MODEL is None:
        _EMBEDDING_MODEL = SentenceTransformer("all-MiniLM-L6-v2")
    return _EMBEDDING_MODEL


def _read_txt_files_from_zip(zip_bytes):
    """Return the lower-cased contents of every .txt file inside a ZIP archive.

    The archive is unpacked into a temporary directory that is removed before
    returning. Sub-directories are searched as well, because zipping a folder
    places the files one level below the archive root.

    Args:
        zip_bytes: Raw bytes of the ZIP archive.

    Returns:
        A list with one lower-cased string per .txt file (possibly empty).

    Raises:
        zipfile.BadZipFile: If ``zip_bytes`` is not a valid ZIP archive.
    """
    texts = []
    with tempfile.TemporaryDirectory() as tmpdir:
        with zipfile.ZipFile(io.BytesIO(zip_bytes), "r") as zip_ref:
            zip_ref.extractall(tmpdir)

        txt_paths = []
        for root, dirs, names in os.walk(tmpdir):
            # Archives made on macOS carry a "__MACOSX" tree of binary
            # "._name" resource forks; those are metadata, not documents.
            dirs[:] = [d for d in dirs if d != "__MACOSX"]
            for name in names:
                if name.startswith("."):
                    continue
                if name.lower().endswith(".txt"):
                    txt_paths.append(os.path.join(root, name))

        # Sorted so the processing order does not depend on the file system.
        for path in sorted(txt_paths):
            # errors="replace" keeps one badly encoded file from aborting
            # the whole analysis.
            with open(path, "r", encoding="utf-8", errors="replace") as f:
                texts.append(f.read().lower())
    return texts


def analyze_bigrams(zip_file, perplexity):
    """Count bigrams in the uploaded texts and project the top 100 with t-SNE.

    Each text is tokenized, reduced to alphabetic non-stop-word tokens, and
    split into bigrams. The 100 most frequent bigrams are embedded with a
    sentence-transformer model and projected to 2-D. The resulting table is
    stored in the module-level ``embed_df`` so a later selection callback can
    map point indices back to bigrams.

    Args:
        zip_file: Raw bytes of a ZIP archive containing .txt files, or None
            when nothing has been uploaded.
        perplexity: Requested t-SNE perplexity; truncated to an int. It is
            lowered automatically when fewer bigrams than that are available.

    Returns:
        A ``(status_message, figure)`` tuple. ``figure`` is a Plotly scatter
        on success and None whenever the input cannot be analyzed.
    """
    global embed_df
    # Imported locally so this function does not depend on an edit to the
    # file's import block.
    from collections import Counter

    if zip_file is None:
        return "Please upload a ZIP file containing .txt files.", None

    try:
        requested_perplexity = int(perplexity)
    except (TypeError, ValueError, OverflowError):
        # Covers a cleared number field (None) as well as NaN/inf.
        return "Perplexity must be a number.", None
    if requested_perplexity < 1:
        return "Perplexity must be at least 1.", None

    try:
        all_texts = _read_txt_files_from_zip(zip_file)
    except zipfile.BadZipFile:
        return "The uploaded file is not a valid ZIP archive.", None
    if not all_texts:
        return "No .txt files found.", None

    bigram_counter = Counter()
    for text in all_texts:
        tokens = [
            w for w in word_tokenize(text)
            if w.isalpha() and w not in stop_words
        ]
        bigram_counter.update(" ".join(bg) for bg in ngrams(tokens, 2))

    top_bigrams = bigram_counter.most_common(100)
    if len(top_bigrams) < 2:
        return "Not enough bigrams found to build a projection.", None

    bigram_texts = [phrase for phrase, _ in top_bigrams]
    counts = [count for _, count in top_bigrams]

    embeddings = _get_embedding_model().encode(bigram_texts)

    # t-SNE needs perplexity to be strictly below the number of samples, so
    # small corpora get the largest value that is still valid.
    effective_perplexity = min(requested_perplexity, len(bigram_texts) - 1)
    tsne = TSNE(n_components=2, perplexity=effective_perplexity, random_state=42)
    tsne_results = tsne.fit_transform(embeddings)

    embed_df = pd.DataFrame({
        "bigram": bigram_texts,
        "count": counts,
        "tsne_1": tsne_results[:, 0],
        "tsne_2": tsne_results[:, 1],
    })

    fig = px.scatter(embed_df, x="tsne_1", y="tsne_2", hover_name="bigram",
                     size="count", template="plotly_white",
                     title="Bigram t-SNE Projection")
    fig.update_layout(dragmode="lasso")
    return "✅ Bigram analysis complete. Use lasso to select points.", fig
def generate_bar_plot(events):
    """Build a horizontal bar chart of counts for the selected bigrams.

    Args:
        events: Iterable of selected-point records. Each record is looked up
            by the key "pointIndex", which is used as a row position in the
            module-level ``embed_df``. (Presumably a Plotly selection
            payload; confirm against whatever wires up this callback.)

    Returns:
        A Plotly bar figure sorted by descending count, or None when nothing
        is selected, no analysis has been run yet, or none of the selected
        points maps to a row of the current analysis.
    """
    if not events or embed_df.empty:
        return None

    row_count = len(embed_df)
    selected_indices = []
    for pt in events:
        try:
            idx = pt["pointIndex"]
        except (KeyError, TypeError):
            # Record without a usable point index; nothing to plot for it.
            continue
        # A selection may outlive the analysis it was made on, so drop
        # positions that no longer exist instead of letting iloc raise.
        if isinstance(idx, int) and 0 <= idx < row_count:
            selected_indices.append(idx)

    if not selected_indices:
        return None

    selected_df = embed_df.iloc[selected_indices]
    fig = px.bar(selected_df.sort_values("count", ascending=False),
                 x="count", y="bigram", orientation="h",
                 title="Selected Bigram Frequencies")
    return fig
# Gradio UI: upload + perplexity inputs, a status label, the t-SNE scatter and
# a bar chart for the points selected in the scatter.
with gr.Blocks() as demo:
    gr.Markdown("## 📦 Upload ZIP of .txt files to Analyze Bigrams")
    # type="binary" hands the callback the uploaded file's raw bytes.
    zip_input = gr.File(label="Upload ZIP File of .txt Files", type="binary")
    perplexity_input = gr.Number(label="t-SNE Perplexity", value=30)
    analyze_btn = gr.Button("Analyze")
    status = gr.Label()
    scatter_plot = gr.Plot()
    bar_plot = gr.Plot()

    analyze_btn.click(analyze_bigrams,
                      inputs=[zip_input, perplexity_input],
                      outputs=[status, scatter_plot])

    # NOTE(review): `gr.plotly_events` does not appear in Gradio's documented
    # API, and an unguarded attribute access would abort the app at startup.
    # Wire the selection callback only when the installed Gradio provides it;
    # otherwise keep the rest of the app usable and tell the user why the bar
    # chart stays empty. Confirm the intended selection mechanism.
    plotly_events = getattr(gr, "plotly_events", None)
    if plotly_events is not None:
        plotly_events(scatter_plot, select_event=True)(
            generate_bar_plot,
            inputs=None,
            outputs=bar_plot
        )
    else:
        gr.Markdown(
            "⚠️ Point selection is not available in this Gradio version, "
            "so the selected-bigram bar chart is disabled."
        )

demo.launch(share=True)