Spaces:

YashsharmaPhD
/

Biagram_analysis_NLP_data

Runtime error

App Files Files Community

Biagram_analysis_NLP_data / app.py

YashsharmaPhD

Update app.py

8e26ef9 verified 9 months ago

raw

history blame contribute delete

3.52 kB

	import io
	import os
	import tempfile
	import zipfile
	from collections import Counter

	import gradio as gr
	import nltk
	import pandas as pd
	import plotly.express as px
	from nltk.corpus import stopwords
	from nltk.tokenize import word_tokenize
	from nltk.util import ngrams
	from sentence_transformers import SentenceTransformer
	from sklearn.manifold import TSNE

	# Download the NLTK resources used below. `punkt_tab` is requested in
	# addition to `punkt` because recent NLTK releases load the tokenizer
	# tables from `punkt_tab` when `word_tokenize` is called; older releases
	# still use `punkt`, so both are fetched.
	nltk.download('punkt')
	nltk.download('punkt_tab')
	nltk.download('stopwords')

	# English stop words as a set for O(1) membership tests during tokenization.
	stop_words = set(stopwords.words('english'))

	# Result table of the most recent analysis (columns: bigram, count, tsne_1,
	# tsne_2). Written by analyze_bigrams and read by generate_bar_plot.
	embed_df = pd.DataFrame()

	def analyze_bigrams(zip_file, perplexity):
	    """Plot a t-SNE projection of the most frequent bigrams in an uploaded corpus.

	    Args:
	        zip_file: Raw bytes of a ZIP archive containing ``.txt`` files, or
	            ``None`` when nothing was uploaded.
	        perplexity: Requested t-SNE perplexity. It is truncated to an int and
	            clamped to ``[1, n_bigrams - 1]`` because t-SNE rejects a
	            perplexity that is not smaller than the number of samples.
	            ``None`` falls back to 30.

	    Returns:
	        A ``(status_message, figure)`` tuple. ``figure`` is ``None`` whenever
	        the analysis could not be carried out.

	    Side effects:
	        On success, replaces the module-level ``embed_df`` with the table of
	        bigrams, counts and 2-D coordinates that backs the figure.
	    """
	    global embed_df
	    if zip_file is None:
	        return "Please upload a ZIP file containing .txt files.", None

	    try:
	        all_texts = _read_txt_members(zip_file)
	    except zipfile.BadZipFile:
	        return "The uploaded file is not a valid ZIP archive.", None
	    if not all_texts:
	        return "No .txt files found.", None

	    bigram_counter = Counter()
	    for text in all_texts:
	        tokens = [w for w in word_tokenize(text) if w.isalpha() and w not in stop_words]
	        bigram_counter.update(' '.join(bg) for bg in ngrams(tokens, 2))

	    top_bigrams = bigram_counter.most_common(100)
	    # t-SNE needs at least two points (perplexity must be >= 1 and < n_samples).
	    if len(top_bigrams) < 2:
	        return "Not enough bigrams found to build a projection.", None
	    bigram_texts = [phrase for phrase, _ in top_bigrams]
	    counts = [count for _, count in top_bigrams]

	    embeddings = _get_embedding_model().encode(bigram_texts)

	    requested = 30 if perplexity is None else int(perplexity)
	    safe_perplexity = max(1, min(requested, len(bigram_texts) - 1))
	    tsne = TSNE(n_components=2, perplexity=safe_perplexity, random_state=42)
	    tsne_results = tsne.fit_transform(embeddings)

	    embed_df = pd.DataFrame({
	        "bigram": bigram_texts,
	        "count": counts,
	        "tsne_1": tsne_results[:, 0],
	        "tsne_2": tsne_results[:, 1],
	    })

	    fig = px.scatter(embed_df, x="tsne_1", y="tsne_2", hover_name="bigram",
	                     size="count", template="plotly_white", title="Bigram t-SNE Projection")
	    fig.update_layout(dragmode="lasso")

	    return "✅ Bigram analysis complete. Use lasso to select points.", fig


	def _read_txt_members(zip_bytes):
	    """Return the lower-cased text of every ``.txt`` member of the archive, at any depth."""
	    texts = []
	    with zipfile.ZipFile(io.BytesIO(zip_bytes), 'r') as archive:
	        for info in archive.infolist():
	            name = info.filename
	            if info.is_dir() or not name.lower().endswith(".txt"):
	                continue
	            # Skip macOS resource-fork entries, which are binary metadata
	            # that merely share the .txt suffix.
	            if name.startswith("__MACOSX/") or name.rsplit("/", 1)[-1].startswith("._"):
	                continue
	            # Replace undecodable bytes instead of failing the whole upload
	            # on one file that is not valid UTF-8.
	            texts.append(archive.read(info).decode("utf-8", errors="replace").lower())
	    return texts


	# Lazily created sentence-embedding model, shared by all analyses.
	_EMBEDDING_MODEL = None


	def _get_embedding_model():
	    """Return the shared SentenceTransformer, loading it on first use only."""
	    global _EMBEDDING_MODEL
	    if _EMBEDDING_MODEL is None:
	        _EMBEDDING_MODEL = SentenceTransformer("all-MiniLM-L6-v2")
	    return _EMBEDDING_MODEL

	def generate_bar_plot(events):
	    """Build a horizontal bar chart of the counts of the selected bigrams.

	    Args:
	        events: Iterable of selection dicts, each expected to carry a
	            ``"pointIndex"`` key holding a row position in ``embed_df``.
	            ``None`` or an empty list means nothing is selected.

	    Returns:
	        A Plotly bar figure sorted by descending count, or ``None`` when
	        there is no selection, no analysis result yet, or no selected
	        index refers to an existing row.
	    """
	    if not events or embed_df.empty:
	        return None

	    # Keep only indices that point at a real row: a selection made on an
	    # earlier, larger result (or a malformed event) would otherwise raise
	    # KeyError/IndexError instead of just clearing the chart.
	    row_count = len(embed_df)
	    selected_indices = []
	    for pt in events:
	        idx = pt.get("pointIndex") if isinstance(pt, dict) else None
	        if isinstance(idx, int) and 0 <= idx < row_count:
	            selected_indices.append(idx)
	    if not selected_indices:
	        return None

	    selected_df = embed_df.iloc[selected_indices]

	    fig = px.bar(selected_df.sort_values("count", ascending=False),
	                 x="count", y="bigram", orientation="h",
	                 title="Selected Bigram Frequencies")
	    return fig

	# Gradio UI: upload + perplexity inputs, an Analyze button, a status label,
	# the t-SNE scatter plot and a bar plot for the selected points.
	with gr.Blocks() as demo:
	    gr.Markdown("## 📦 Upload ZIP of .txt files to Analyze Bigrams")

	    zip_input = gr.File(label="Upload ZIP File of .txt Files", type="binary")
	    perplexity_input = gr.Number(label="t-SNE Perplexity", value=30)

	    analyze_btn = gr.Button("Analyze")
	    status = gr.Label()
	    scatter_plot = gr.Plot()
	    bar_plot = gr.Plot()

	    analyze_btn.click(analyze_bigrams,
	                      inputs=[zip_input, perplexity_input],
	                      outputs=[status, scatter_plot])

	    # NOTE(review): `plotly_events` does not appear to be part of the Gradio
	    # API (it looks like the Streamlit `plotly_events` component). Calling
	    # it unconditionally raises AttributeError while the UI is being built,
	    # so the app never starts. Look it up defensively: wire the selection
	    # callback when the attribute exists, otherwise tell the user and keep
	    # the rest of the app working.
	    plotly_events = getattr(gr, "plotly_events", None)
	    if plotly_events is not None:
	        plotly_events(scatter_plot, select_event=True)(
	            generate_bar_plot,
	            inputs=None,
	            outputs=bar_plot
	        )
	    else:
	        gr.Markdown("⚠️ Point selection is not supported by the installed Gradio version; "
	                    "the bar chart of selected bigrams is unavailable.")

	demo.launch(share=True)