# NOTE(review): the original paste began with a Hugging Face Spaces banner
# ("Spaces: Runtime error" / "Runtime error") captured along with the source.
# Preserved here as a comment so the file remains valid Python.
import os

import cohere
import gradio as gr
import numpy as np
import plotly.express as px
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Pull the Cohere API key from a local .env file into the environment.
load_dotenv()
api_key = os.getenv('COHERE_API_KEY')

# Single shared Cohere client, reused for every embedding request below.
co = cohere.Client(api_key)
def fetch_text_from_url(url):
    """Fetch a URL and return its visible text content.

    Args:
        url: The URL to download.

    Returns:
        The page text with markup stripped (space-separated), or an
        empty string if anything goes wrong. The best-effort behaviour
        is deliberate: callers embed whatever text is available.
    """
    try:
        # A timeout keeps one slow/unresponsive host from hanging the
        # whole app (the original call could block indefinitely).
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Collapse the DOM into plain visible text.
        return soup.get_text(separator=' ', strip=True)
    except Exception:
        # Network/HTTP/parse failures all degrade to an empty document.
        return ""
def cluster_urls(urls, num_clusters):
    """Cluster a newline-separated list of URLs by page content.

    Args:
        urls: All URLs as one string, one URL per line.
        num_clusters: Requested number of clusters; coerced to int and
            clamped to [1, number of usable URLs].

    Returns:
        A ``(clusters, plot)`` tuple: a dict mapping cluster id
        (string) to its list of URLs, and a Plotly scatter figure of
        the 2-D-projected embeddings (``None`` when no URLs given).
    """
    # Split into individual URLs, dropping blank lines and stray spaces.
    url_list = [url.strip() for url in urls.split('\n') if url.strip()]
    if not url_list:
        # Nothing to cluster — return empty results instead of letting
        # co.embed / KMeans fail on empty input.
        return {}, None

    # Download the text content of every page.
    url_contents = [fetch_text_from_url(url) for url in url_list]

    # Embed the page texts with Cohere's multilingual clustering model.
    embeddings = co.embed(
        texts=url_contents,
        model='embed-multilingual-v3.0',
        input_type='clustering',
    ).embeddings

    # KMeans requires n_clusters <= n_samples and an integer count (the
    # UI slider can deliver a float), so coerce and clamp first.
    num_clusters = max(1, min(int(num_clusters), len(url_list)))

    # n_init pinned explicitly: scikit-learn changed its default, so this
    # keeps behaviour consistent (and warning-free) across versions.
    kmeans = KMeans(n_clusters=num_clusters, init='k-means++', n_init=10)
    labels = kmeans.fit_predict(embeddings)

    # Group URLs by cluster label; string keys keep the JSON output stable.
    clusters = {str(i): [] for i in range(num_clusters)}
    for url, label in zip(url_list, labels):
        clusters[str(label)].append(url)

    # KMeans can leave a requested cluster empty; steal one URL from the
    # largest cluster so every bucket shown to the user is non-empty.
    empty_clusters = [key for key, members in clusters.items() if not members]
    for empty_cluster in empty_clusters:
        max_cluster = max(clusters, key=lambda k: len(clusters[k]))
        clusters[empty_cluster].append(clusters[max_cluster].pop())

    # Visualize the clusters in 2-D.
    plot = plot_clusters(embeddings, labels, url_list, num_clusters)
    return clusters, plot
def plot_clusters(embeddings, labels, url_list, num_clusters):
    """Project embeddings to 2-D with PCA and plot them as a scatter chart.

    Args:
        embeddings: One embedding vector per URL.
        labels: Cluster label per URL (parallel to ``url_list``).
        url_list: The URLs, shown in the hover tooltip.
        num_clusters: Unused here; kept for interface compatibility.

    Returns:
        A Plotly figure with one point per URL, coloured by cluster.
    """
    # Collapse the high-dimensional embeddings onto two principal axes.
    projected = PCA(n_components=2).fit_transform(embeddings)
    xs = projected[:, 0]
    ys = projected[:, 1]
    return px.scatter(
        x=xs,
        y=ys,
        color=labels,
        labels={'color': 'Cluster'},
        hover_data={'URL': url_list},
        title='URL Clustering',
    )
# Create a Gradio interface
def gradio_interface(urls, num_clusters):
    """Gradio callback: thin pass-through to :func:`cluster_urls`."""
    # cluster_urls already returns the (clusters, plot) pair Gradio expects.
    return cluster_urls(urls, num_clusters)
# Default example URLs shown in the input box (one per line).
_EXAMPLE_URLS = (
    'https://en.wikipedia.org/wiki/Jellyfish\n'
    'https://en.wikipedia.org/wiki/Crab\n'
    'https://en.wikipedia.org/wiki/Goldfish\n'
    'https://en.wikipedia.org/wiki/Cattle\n'
    'https://en.wikipedia.org/wiki/Pig\n'
    'https://en.wikipedia.org/wiki/Artificial_intelligence\n'
    'https://en.wikipedia.org/wiki/Large_language_model\n'
)

# Input widgets: a multi-line URL box and a cluster-count slider.
inputs = [
    gr.Textbox(
        label='URLs',
        lines=5,
        placeholder='Enter URLs, one per line',
        value=_EXAMPLE_URLS,
    ),
    gr.Slider(minimum=1, maximum=10, step=1, label='Number of Clusters', value=3),
]

# Output widgets: the cluster mapping as JSON plus the scatter plot.
output = [
    gr.JSON(label='Clusters'),
    gr.Plot(label='Clustering Plot'),
]

interface = gr.Interface(
    fn=gradio_interface,
    inputs=inputs,
    outputs=output,
    title='URL Clustering',
    description='Cluster URLs based on the content of the pages using Cohere',
)

# Launch the Gradio interface
interface.launch()