poemsforaphrodite committed on
Commit
541feac
·
verified ·
1 Parent(s): 9198d66

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. clustering_plot.html +0 -0
  2. clustering_plot.png +0 -0
  3. main.py +16 -28
clustering_plot.html CHANGED
The diff for this file is too large to render. See raw diff
 
clustering_plot.png CHANGED
main.py CHANGED
@@ -5,9 +5,9 @@ import cohere
5
  import requests
6
  from bs4 import BeautifulSoup
7
  from sklearn.cluster import KMeans
8
- import matplotlib.pyplot as plt
9
  from sklearn.decomposition import PCA
10
  import numpy as np
 
11
 
12
  # Load the API key from the .env file
13
  load_dotenv()
@@ -54,43 +54,31 @@ def cluster_urls(urls, num_clusters):
54
  clusters[empty_cluster].append(clusters[max_cluster].pop())
55
 
56
  # Plotting the clusters
57
- plot_path = plot_clusters(embeddings, labels, url_list, num_clusters)
58
 
59
- return clusters, plot_path
60
 
61
  def plot_clusters(embeddings, labels, url_list, num_clusters):
62
  # Reduce dimensions for visualization using PCA
63
  pca = PCA(n_components=2)
64
  reduced_embeddings = pca.fit_transform(embeddings)
65
 
66
- # Create a scatter plot
67
- plt.figure(figsize=(10, 7))
68
- colors = plt.cm.rainbow(np.linspace(0, 1, num_clusters))
 
 
 
 
 
 
69
 
70
- for i in range(num_clusters):
71
- cluster_points = reduced_embeddings[np.array(labels) == i]
72
- plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f'Cluster {i+1}', color=colors[i])
73
-
74
- for i, url in enumerate(url_list):
75
- plt.annotate(f'URL {i+1}', (reduced_embeddings[i, 0], reduced_embeddings[i, 1]))
76
-
77
- plt.title('URL Clustering')
78
- plt.xlabel('PCA Component 1')
79
- plt.ylabel('PCA Component 2')
80
- plt.legend()
81
- plt.grid(True)
82
-
83
- # Save the plot to a file
84
- plot_path = 'clustering_plot.png'
85
- plt.savefig(plot_path)
86
- plt.close()
87
-
88
- return plot_path
89
 
90
  # Create a Gradio interface
91
  def gradio_interface(urls, num_clusters):
92
- clusters, plot_path = cluster_urls(urls, num_clusters)
93
- return clusters, plot_path
94
 
95
  inputs = [
96
  gr.Textbox(label='URLs', lines=5,
@@ -107,7 +95,7 @@ inputs = [
107
  ]
108
  output = [
109
  gr.JSON(label='Clusters'),
110
- gr.Image(label='Clustering Plot')
111
  ]
112
 
113
  interface = gr.Interface(
 
5
  import requests
6
  from bs4 import BeautifulSoup
7
  from sklearn.cluster import KMeans
 
8
  from sklearn.decomposition import PCA
9
  import numpy as np
10
+ import plotly.express as px
11
 
12
  # Load the API key from the .env file
13
  load_dotenv()
 
54
  clusters[empty_cluster].append(clusters[max_cluster].pop())
55
 
56
  # Plotting the clusters
57
+ plot = plot_clusters(embeddings, labels, url_list, num_clusters)
58
 
59
+ return clusters, plot
60
 
61
  def plot_clusters(embeddings, labels, url_list, num_clusters):
62
  # Reduce dimensions for visualization using PCA
63
  pca = PCA(n_components=2)
64
  reduced_embeddings = pca.fit_transform(embeddings)
65
 
66
+ # Create a scatter plot using Plotly
67
+ fig = px.scatter(
68
+ x=reduced_embeddings[:, 0],
69
+ y=reduced_embeddings[:, 1],
70
+ color=labels,
71
+ labels={'color': 'Cluster'},
72
+ hover_data={'URL': url_list},
73
+ title='URL Clustering',
74
+ )
75
 
76
+ return fig
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  # Create a Gradio interface
79
  def gradio_interface(urls, num_clusters):
80
+ clusters, plot = cluster_urls(urls, num_clusters)
81
+ return clusters, plot
82
 
83
  inputs = [
84
  gr.Textbox(label='URLs', lines=5,
 
95
  ]
96
  output = [
97
  gr.JSON(label='Clusters'),
98
+ gr.Plot(label='Clustering Plot')
99
  ]
100
 
101
  interface = gr.Interface(