# poemsforaphrodite's picture
# Upload folder using huggingface_hub
# 541feac verified
import os
from dotenv import load_dotenv
import gradio as gr
import cohere
import requests
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import numpy as np
import plotly.express as px
# Load the API key from the .env file (keeps the secret out of source control).
load_dotenv()
# NOTE(review): returns None when COHERE_API_KEY is unset; cohere.Client would
# then fail on the first API call rather than here — confirm desired behavior.
api_key = os.getenv('COHERE_API_KEY')
# Initialize the Cohere client with your API key; shared by all requests below.
co = cohere.Client(api_key)
def fetch_text_from_url(url):
    """Fetch and return the visible text content of a web page.

    Args:
        url: The URL to download.

    Returns:
        The page text with tags stripped and whitespace collapsed, or an
        empty string if the request fails, times out, or returns an
        HTTP error status (callers treat "" as "no content").
    """
    try:
        # Timeout prevents one slow/unreachable host from hanging the app.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: network/HTTP failures yield an empty document
        # instead of aborting the whole clustering run.
        return ""
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text(separator=' ', strip=True)
def cluster_urls(urls, num_clusters):
    """Cluster URLs by the textual content of their pages.

    Args:
        urls: Newline-separated URLs (blank lines are ignored).
        num_clusters: Desired number of clusters; clamped to the number
            of URLs actually provided, since KMeans requires
            n_samples >= n_clusters.

    Returns:
        A tuple ``(clusters, plot)`` where ``clusters`` maps the cluster
        index (as a string, for JSON output) to its list of URLs, and
        ``plot`` is a Plotly figure of the 2-D projected embeddings.
        Returns ``({}, None)`` when no URLs are given.
    """
    # Split the input into a clean list, dropping empty lines and whitespace.
    url_list = [line.strip() for line in urls.split('\n') if line.strip()]
    if not url_list:
        # Nothing to cluster; avoid calling the embed API with an empty batch.
        return {}, None

    # KMeans raises if asked for more clusters than samples — clamp to be safe.
    num_clusters = int(min(num_clusters, len(url_list)))

    # Fetch the content from each URL (failed fetches become empty strings).
    url_contents = [fetch_text_from_url(url) for url in url_list]

    # Generate embeddings for the URL contents using Cohere.
    embeddings = co.embed(texts=url_contents, model='embed-multilingual-v3.0', input_type='clustering').embeddings

    # Explicit n_init avoids the sklearn FutureWarning on changing defaults;
    # random_state makes results reproducible between runs.
    kmeans = KMeans(n_clusters=num_clusters, init='k-means++', n_init=10, random_state=42)
    labels = kmeans.fit_predict(embeddings)

    # Group URLs by assigned cluster; string keys so the dict serializes as JSON.
    clusters = {str(i): [] for i in range(num_clusters)}
    for url, label in zip(url_list, labels):
        clusters[str(label)].append(url)

    # Ensure no cluster is empty by moving one URL from the largest cluster.
    empty_clusters = [key for key, members in clusters.items() if not members]
    for empty_cluster in empty_clusters:
        max_cluster = max(clusters, key=lambda k: len(clusters[k]))
        if clusters[max_cluster]:
            clusters[empty_cluster].append(clusters[max_cluster].pop())

    # Plotting the clusters.
    plot = plot_clusters(embeddings, labels, url_list, num_clusters)
    return clusters, plot
def plot_clusters(embeddings, labels, url_list, num_clusters):
    """Project the embeddings to two dimensions and draw a scatter plot.

    Args:
        embeddings: High-dimensional embedding vectors, one per URL.
        labels: Cluster label for each embedding (colors the points).
        url_list: URLs shown on hover, aligned with the embeddings.
        num_clusters: Unused here; kept for interface compatibility.

    Returns:
        A Plotly Express scatter figure of the PCA-reduced embeddings.
    """
    # Compress the embedding space to 2-D purely for visualization.
    projector = PCA(n_components=2)
    points = projector.fit_transform(embeddings)

    return px.scatter(
        x=points[:, 0],
        y=points[:, 1],
        color=labels,
        labels={'color': 'Cluster'},
        hover_data={'URL': url_list},
        title='URL Clustering',
    )
# Create a Gradio interface
def gradio_interface(urls, num_clusters):
    """Gradio entry point: delegate to cluster_urls and return its results.

    Args:
        urls: Newline-separated URLs from the textbox widget.
        num_clusters: Cluster count from the slider widget.

    Returns:
        The ``(clusters, plot)`` pair produced by ``cluster_urls``, which
        Gradio unpacks into the JSON and Plot output components.
    """
    return cluster_urls(urls, num_clusters)
# Input widgets: a multiline textbox pre-filled with example URLs (two topical
# groups — sea/farm animals and AI — so the default demo clusters cleanly) and
# a slider choosing how many clusters to form.
inputs = [
    gr.Textbox(label='URLs', lines=5,
               placeholder='Enter URLs, one per line',
               value='https://en.wikipedia.org/wiki/Jellyfish\n'
                     'https://en.wikipedia.org/wiki/Crab\n'
                     'https://en.wikipedia.org/wiki/Goldfish\n'
                     'https://en.wikipedia.org/wiki/Cattle\n'
                     'https://en.wikipedia.org/wiki/Pig\n'
                     'https://en.wikipedia.org/wiki/Artificial_intelligence\n'
                     'https://en.wikipedia.org/wiki/Large_language_model\n'
               ),
    gr.Slider(minimum=1, maximum=10, step=1, label='Number of Clusters', value=3)
]
# Output widgets: the cluster->URLs mapping as JSON plus the 2-D scatter plot.
output = [
    gr.JSON(label='Clusters'),
    gr.Plot(label='Clustering Plot')
]
interface = gr.Interface(
    fn=gradio_interface,
    inputs=inputs,
    outputs=output,
    title='URL Clustering',
    description='Cluster URLs based on the content of the pages using Cohere'
)
# Launch the Gradio interface (blocks and serves the app when run as a script).
interface.launch()