Spaces:
Runtime error
Runtime error
Upload folder using huggingface_hub
Browse files- clustering_plot.html +0 -0
- clustering_plot.png +0 -0
- main.py +16 -28
clustering_plot.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
clustering_plot.png
CHANGED
|
|
main.py
CHANGED
|
@@ -5,9 +5,9 @@ import cohere
|
|
| 5 |
import requests
|
| 6 |
from bs4 import BeautifulSoup
|
| 7 |
from sklearn.cluster import KMeans
|
| 8 |
-
import matplotlib.pyplot as plt
|
| 9 |
from sklearn.decomposition import PCA
|
| 10 |
import numpy as np
|
|
|
|
| 11 |
|
| 12 |
# Load the API key from the .env file
|
| 13 |
load_dotenv()
|
|
@@ -54,43 +54,31 @@ def cluster_urls(urls, num_clusters):
|
|
| 54 |
clusters[empty_cluster].append(clusters[max_cluster].pop())
|
| 55 |
|
| 56 |
# Plotting the clusters
|
| 57 |
-
|
| 58 |
|
| 59 |
-
return clusters,
|
| 60 |
|
| 61 |
def plot_clusters(embeddings, labels, url_list, num_clusters):
|
| 62 |
# Reduce dimensions for visualization using PCA
|
| 63 |
pca = PCA(n_components=2)
|
| 64 |
reduced_embeddings = pca.fit_transform(embeddings)
|
| 65 |
|
| 66 |
-
# Create a scatter plot
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
-
|
| 71 |
-
cluster_points = reduced_embeddings[np.array(labels) == i]
|
| 72 |
-
plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f'Cluster {i+1}', color=colors[i])
|
| 73 |
-
|
| 74 |
-
for i, url in enumerate(url_list):
|
| 75 |
-
plt.annotate(f'URL {i+1}', (reduced_embeddings[i, 0], reduced_embeddings[i, 1]))
|
| 76 |
-
|
| 77 |
-
plt.title('URL Clustering')
|
| 78 |
-
plt.xlabel('PCA Component 1')
|
| 79 |
-
plt.ylabel('PCA Component 2')
|
| 80 |
-
plt.legend()
|
| 81 |
-
plt.grid(True)
|
| 82 |
-
|
| 83 |
-
# Save the plot to a file
|
| 84 |
-
plot_path = 'clustering_plot.png'
|
| 85 |
-
plt.savefig(plot_path)
|
| 86 |
-
plt.close()
|
| 87 |
-
|
| 88 |
-
return plot_path
|
| 89 |
|
| 90 |
# Create a Gradio interface
|
| 91 |
def gradio_interface(urls, num_clusters):
|
| 92 |
-
clusters,
|
| 93 |
-
return clusters,
|
| 94 |
|
| 95 |
inputs = [
|
| 96 |
gr.Textbox(label='URLs', lines=5,
|
|
@@ -107,7 +95,7 @@ inputs = [
|
|
| 107 |
]
|
| 108 |
output = [
|
| 109 |
gr.JSON(label='Clusters'),
|
| 110 |
-
gr.
|
| 111 |
]
|
| 112 |
|
| 113 |
interface = gr.Interface(
|
|
|
|
| 5 |
import requests
|
| 6 |
from bs4 import BeautifulSoup
|
| 7 |
from sklearn.cluster import KMeans
|
|
|
|
| 8 |
from sklearn.decomposition import PCA
|
| 9 |
import numpy as np
|
| 10 |
+
import plotly.express as px
|
| 11 |
|
| 12 |
# Load the API key from the .env file
|
| 13 |
load_dotenv()
|
|
|
|
| 54 |
clusters[empty_cluster].append(clusters[max_cluster].pop())
|
| 55 |
|
| 56 |
# Plotting the clusters
|
| 57 |
+
plot = plot_clusters(embeddings, labels, url_list, num_clusters)
|
| 58 |
|
| 59 |
+
return clusters, plot
|
| 60 |
|
| 61 |
def plot_clusters(embeddings, labels, url_list, num_clusters):
|
| 62 |
# Reduce dimensions for visualization using PCA
|
| 63 |
pca = PCA(n_components=2)
|
| 64 |
reduced_embeddings = pca.fit_transform(embeddings)
|
| 65 |
|
| 66 |
+
# Create a scatter plot using Plotly
|
| 67 |
+
fig = px.scatter(
|
| 68 |
+
x=reduced_embeddings[:, 0],
|
| 69 |
+
y=reduced_embeddings[:, 1],
|
| 70 |
+
color=labels,
|
| 71 |
+
labels={'color': 'Cluster'},
|
| 72 |
+
hover_data={'URL': url_list},
|
| 73 |
+
title='URL Clustering',
|
| 74 |
+
)
|
| 75 |
|
| 76 |
+
return fig
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
# Create a Gradio interface
|
| 79 |
def gradio_interface(urls, num_clusters):
|
| 80 |
+
clusters, plot = cluster_urls(urls, num_clusters)
|
| 81 |
+
return clusters, plot
|
| 82 |
|
| 83 |
inputs = [
|
| 84 |
gr.Textbox(label='URLs', lines=5,
|
|
|
|
| 95 |
]
|
| 96 |
output = [
|
| 97 |
gr.JSON(label='Clusters'),
|
| 98 |
+
gr.Plot(label='Clustering Plot')
|
| 99 |
]
|
| 100 |
|
| 101 |
interface = gr.Interface(
|