# poemsforaphrodite's picture
# Upload folder using huggingface_hub
# 541feac verified
import os
from dotenv import load_dotenv
import gradio as gr
import cohere
import requests
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import numpy as np
import plotly.express as px
# Load the API key from the .env file (keeps the secret out of source control).
load_dotenv()
# NOTE(review): returns None when COHERE_API_KEY is unset; cohere.Client would
# then fail on the first API call rather than here — confirm desired behavior.
api_key = os.getenv('COHERE_API_KEY')
# Initialize the Cohere client with your API key; shared by all requests below.
co = cohere.Client(api_key)
def fetch_text_from_url(url):
    """Fetch and return the visible text content of a web page.

    Args:
        url: The URL to download.

    Returns:
        The page text with tags stripped and whitespace collapsed, or an
        empty string if the request fails, times out, or returns an
        HTTP error status (callers treat "" as "no content").
    """
    try:
        # Timeout prevents one slow/unreachable host from hanging the app.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: network/HTTP failures yield an empty document
        # instead of aborting the whole clustering run.
        return ""
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text(separator=' ', strip=True)
def cluster_urls(urls, num_clusters):
    """Cluster URLs by the textual content of their pages.

    Args:
        urls: Newline-separated URLs (blank lines are ignored).
        num_clusters: Desired number of clusters; clamped to the number
            of URLs actually provided, since KMeans requires
            n_samples >= n_clusters.

    Returns:
        A tuple ``(clusters, plot)`` where ``clusters`` maps the cluster
        index (as a string, for JSON output) to its list of URLs, and
        ``plot`` is a Plotly figure of the 2-D projected embeddings.
        Returns ``({}, None)`` when no URLs are given.
    """
    # Split the input into a clean list, dropping empty lines and whitespace.
    url_list = [line.strip() for line in urls.split('\n') if line.strip()]
    if not url_list:
        # Nothing to cluster; avoid calling the embed API with an empty batch.
        return {}, None

    # KMeans raises if asked for more clusters than samples — clamp to be safe.
    num_clusters = int(min(num_clusters, len(url_list)))

    # Fetch the content from each URL (failed fetches become empty strings).
    url_contents = [fetch_text_from_url(url) for url in url_list]

    # Generate embeddings for the URL contents using Cohere.
    embeddings = co.embed(texts=url_contents, model='embed-multilingual-v3.0', input_type='clustering').embeddings

    # Explicit n_init avoids the sklearn FutureWarning on changing defaults;
    # random_state makes results reproducible between runs.
    kmeans = KMeans(n_clusters=num_clusters, init='k-means++', n_init=10, random_state=42)
    labels = kmeans.fit_predict(embeddings)

    # Group URLs by assigned cluster; string keys so the dict serializes as JSON.
    clusters = {str(i): [] for i in range(num_clusters)}
    for url, label in zip(url_list, labels):
        clusters[str(label)].append(url)

    # Ensure no cluster is empty by moving one URL from the largest cluster.
    empty_clusters = [key for key, members in clusters.items() if not members]
    for empty_cluster in empty_clusters:
        max_cluster = max(clusters, key=lambda k: len(clusters[k]))
        if clusters[max_cluster]:
            clusters[empty_cluster].append(clusters[max_cluster].pop())

    # Plotting the clusters.
    plot = plot_clusters(embeddings, labels, url_list, num_clusters)
    return clusters, plot
def plot_clusters(embeddings, labels, url_list, num_clusters):
    """Project the embeddings to two dimensions and draw a scatter plot.

    Args:
        embeddings: High-dimensional embedding vectors, one per URL.
        labels: Cluster label for each embedding (colors the points).
        url_list: URLs shown on hover, aligned with the embeddings.
        num_clusters: Unused here; kept for interface compatibility.

    Returns:
        A Plotly Express scatter figure of the PCA-reduced embeddings.
    """
    # Compress the embedding space to 2-D purely for visualization.
    projector = PCA(n_components=2)
    points = projector.fit_transform(embeddings)

    return px.scatter(
        x=points[:, 0],
        y=points[:, 1],
        color=labels,
        labels={'color': 'Cluster'},
        hover_data={'URL': url_list},
        title='URL Clustering',
    )
# Create a Gradio interface
def gradio_interface(urls, num_clusters):
    """Gradio entry point: delegate to cluster_urls and return its results.

    Args:
        urls: Newline-separated URLs from the textbox widget.
        num_clusters: Cluster count from the slider widget.

    Returns:
        The ``(clusters, plot)`` pair produced by ``cluster_urls``, which
        Gradio unpacks into the JSON and Plot output components.
    """
    return cluster_urls(urls, num_clusters)
# Input widgets: a multiline textbox pre-filled with example URLs (two topical
# groups — sea/farm animals and AI — so the default demo clusters cleanly) and
# a slider choosing how many clusters to form.
inputs = [
    gr.Textbox(label='URLs', lines=5,
               placeholder='Enter URLs, one per line',
               value='https://en.wikipedia.org/wiki/Jellyfish\n'
                     'https://en.wikipedia.org/wiki/Crab\n'
                     'https://en.wikipedia.org/wiki/Goldfish\n'
                     'https://en.wikipedia.org/wiki/Cattle\n'
                     'https://en.wikipedia.org/wiki/Pig\n'
                     'https://en.wikipedia.org/wiki/Artificial_intelligence\n'
                     'https://en.wikipedia.org/wiki/Large_language_model\n'
               ),
    gr.Slider(minimum=1, maximum=10, step=1, label='Number of Clusters', value=3)
]
# Output widgets: the cluster->URLs mapping as JSON plus the 2-D scatter plot.
output = [
    gr.JSON(label='Clusters'),
    gr.Plot(label='Clustering Plot')
]
interface = gr.Interface(
    fn=gradio_interface,
    inputs=inputs,
    outputs=output,
    title='URL Clustering',
    description='Cluster URLs based on the content of the pages using Cohere'
)
# Launch the Gradio interface (blocks and serves the app when run as a script).
interface.launch()