# Bunka Map — Streamlit app for generating, curating, and exporting topic maps.
import os

import streamlit as st
from bunkatopics import Bunka
from datasets import load_dataset
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceHub
# --- Page header ------------------------------------------------------------
st.title("Bunka Map 🗺️")

# --- User-configurable parameters -------------------------------------------
dataset_id = st.text_input("Dataset ID", "bunkalab/medium-sample-technology")
language = st.text_input("Language", "english")
text_field = st.text_input("Text Field", "title")
embedder_model = st.text_input(
    "Embedder Model",
    "sentence-transformers/distiluse-base-multilingual-cased-v2",
)
sample_size = st.number_input("Sample Size", min_value=100, max_value=10000, value=1000)
n_clusters = st.number_input("Number of Clusters", min_value=5, max_value=50, value=15)
llm_model = st.text_input("LLM Model", "mistralai/Mistral-7B-Instruct-v0.1")

# Token is only required for the optional LLM-based topic relabelling step.
hf_token = st.text_input("Hugging Face API Token", type="password")
# ---------------------------------------------------------------------------
# Map generation and topic-curation workflow.
#
# NOTE(review): the original nested every follow-up action (LLM relabelling,
# manual topic cleaning, topic removal, saving) under the "Generate Bunka Map"
# button. A Streamlit button is True only for the single rerun in which it is
# clicked, so clicking any nested button reran the script with the outer
# button False — the nested code could never execute, and the fitted `bunka`
# object was lost. The fitted model is now persisted in st.session_state and
# the curation UI is gated on its presence instead.
# ---------------------------------------------------------------------------

def load_data(dataset_id: str, text_field: str, sample_size: int) -> list:
    """Stream up to `sample_size` values of `text_field` from the dataset.

    Streaming avoids downloading the full dataset just to take a sample.
    """
    dataset = load_dataset(dataset_id, streaming=True)
    docs_sample = []
    for i, example in enumerate(dataset["train"]):
        if i >= sample_size:
            break
        docs_sample.append(example[text_field])
    return docs_sample


def plot_map(bunka) -> None:
    """Render the current topic map with the app's shared plot styling."""
    st.plotly_chart(
        bunka.visualize_topics(
            width=800,
            height=800,
            colorscale="Portland",
            density=True,
            label_size_ratio=60,
            convex_hull=True,
        )
    )


if st.button("Generate Bunka Map"):
    docs_sample = load_data(dataset_id, text_field, sample_size)

    # Fit the embedding model and the topic model on the sampled documents.
    embedding_model = HuggingFaceEmbeddings(model_name=embedder_model)
    bunka = Bunka(embedding_model=embedding_model, language=language)
    bunka.fit(docs_sample)
    bunka.get_topics(n_clusters=n_clusters, name_length=5, min_count_terms=2)

    # Persist across reruns so the curation buttons below keep working.
    st.session_state["bunka"] = bunka
    plot_map(bunka)

    # Optionally relabel topics with an LLM hosted on the HF Hub.
    if hf_token:
        os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token
        llm = HuggingFaceHub(repo_id=llm_model, huggingfacehub_api_token=hf_token)
        bunka.get_clean_topic_name(llm=llm, language=language)
        plot_map(bunka)
    else:
        st.warning("Please provide a Hugging Face API token to clean labels using LLM.")

if "bunka" in st.session_state:
    bunka = st.session_state["bunka"]

    # Manual topic cleaning: one editable keyword list per topic.
    st.subheader("Manually Clean Topics")
    cleaned_topics = {}
    for topic, keywords in bunka.topics_.items():
        cleaned_topic = st.text_input(f"Topic {topic}", ", ".join(keywords))
        cleaned_topics[topic] = cleaned_topic.split(", ")
    if st.button("Update Topics"):
        bunka.topics_ = cleaned_topics
        plot_map(bunka)

    # Drop whole topics (and their documents) from the map.
    st.subheader("Remove Unwanted Topics")
    topics_to_remove = st.multiselect("Select topics to remove", list(bunka.topics_.keys()))
    if st.button("Remove Topics"):
        bunka.clean_data_by_topics(topics_to_remove)
        plot_map(bunka)

    # Export: '/' in a HF dataset id is not a valid filename character.
    if st.button("Save Cleaned Dataset"):
        name = dataset_id.replace('/', '_') + '_cleaned.csv'
        bunka.df_cleaned_.to_csv(name)
        st.success(f"Dataset saved as {name}")