import os
import io
import base64
import re
import string
import tempfile
from collections import Counter
from urllib.parse import urlparse, urljoin

import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
import seaborn as sns
import gradio as gr
import openai
import nltk
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup, Tag
from gensim.models import Word2Vec
from googlesearch import search
from pytrends.request import TrendReq
from sklearn.manifold import MDS, TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from IPython.display import HTML

# One-time NLTK data downloads (stopword list, tokenizer models, English word list)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('words')
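# A rough sketch of the assumed dependencies (package names are my best guess
# from the imports above; the original Space's requirements are not shown):
#   pip install requests beautifulsoup4 nltk gensim pandas matplotlib seaborn \
#       gradio openai googlesearch-python pytrends scikit-learn numpy ipython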
# Set your OpenAI API key here (read from the OPENAI_API_KEY environment variable)
openai.api_key = os.environ['OPENAI_API_KEY']
# Define functions
def get_image_html(fig):
    # Encode a Matplotlib figure as an inline base64 <img> tag.
    buf = io.BytesIO()
    fig.savefig(buf, format='png')
    buf.seek(0)
    return '<img src="data:image/png;base64,{}"/>'.format(base64.b64encode(buf.getvalue()).decode('ascii'))
def search_top_competitors(keywords, num_results=10):
    competitors = set()
    for keyword in keywords:
        for url in search(keyword, num_results=num_results):
            competitors.add(url)
    return list(competitors)
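# Hedged usage sketch (not executed here; assumes the googlesearch-python package,
# which this code already imports). The result URLs below are hypothetical:
#   search_top_competitors(["seo audit tool"], num_results=5)
#   -> ['https://example-a.com/...', 'https://example-b.com/...']  # order not guaranteed (set-based)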
def get_page_content(url):
    response = requests.get(url)
    return BeautifulSoup(response.text, 'html.parser')
def get_meta_tags(soup):
    meta_tags = soup.find_all('meta')
    return {tag.get('name'): tag.get('content') for tag in meta_tags if tag.get('name')}
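# Example (illustrative): for <meta name="description" content="Acme widgets">,
# get_meta_tags returns {'description': 'Acme widgets'}. Tags keyed by property
# rather than name (e.g. OpenGraph's property="og:title") are skipped, because
# the comprehension above filters on the name attribute.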
def get_heading_tags(soup):
    headings = {}
    for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
        headings[tag] = [heading.text for heading in soup.find_all(tag)]
    return headings
def analyze_keywords(keywords_counter, top_n=10):
    return keywords_counter.most_common(top_n)
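# Example (illustrative): this is a thin wrapper around Counter.most_common, e.g.
#   analyze_keywords(Counter({'seo': 5, 'tool': 3, 'blog': 1}), top_n=2)
#   -> [('seo', 5), ('tool', 3)]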
def visualize_keywords(keywords_counter, top_n=10):
    common_keywords = analyze_keywords(keywords_counter, top_n)
    df = pd.DataFrame(common_keywords, columns=['Keyword', 'Count'])
    df.set_index('Keyword', inplace=True)
    df.plot(kind='bar', figsize=(12, 6))
    plt.title('Top Keywords')
    plt.xlabel('Keywords')
    plt.ylabel('Frequency')
    plt.tight_layout()
    # Save to a temp file so Gradio can serve the image by filepath.
    temp_image_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
    plt.savefig(temp_image_file.name, format='png')
    plt.close()
    return temp_image_file.name
def plot_trends(keywords):
    pytrends = TrendReq(hl='en-US', tz=360, retries=3)
    pytrends.build_payload(keywords, cat=0, timeframe='today 12-m', geo='', gprop='')
    trends_data = pytrends.interest_over_time()
    return trends_data
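# Hedged usage sketch: pytrends' interest_over_time() returns a DataFrame indexed
# by date with one column per keyword (plus an 'isPartial' flag), e.g.
#   plot_trends(['seo'])  ->  columns roughly ['seo', 'isPartial']
# Note: this helper is defined but never wired into the Gradio app below.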
def preprocess_text(text, min_word_length=3):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalnum()]
    words = [word for word in words if len(word) >= min_word_length and word not in stop_words]
    return words
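# Example (illustrative): preprocess_text("The Cats and the DOGS ran fast!")
# -> ['cats', 'dogs', 'ran', 'fast']
# (lowercased, punctuation dropped, tokens shorter than 3 chars and stopwords removed)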
def visualize_clusters(words, model):
    matrix = np.zeros((len(words), model.vector_size))
    for i, word in enumerate(words):
        matrix[i, :] = model.wv[word]
    mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
    distance_matrix = 1 - cosine_similarity(matrix)
    coords = mds.fit_transform(distance_matrix)
    x, y = coords[:, 0], coords[:, 1]
    for i, word in enumerate(words):
        plt.scatter(x[i], y[i], alpha=0.5)
        plt.text(x[i], y[i], word, fontsize=10)
    plt.title('Word Clusters based on Thematic Relatedness')
    plt.show()
def create_cluster_table(words, model, clusters):
    # (`model` is unused here but kept for call-site compatibility.)
    # Create a dictionary to store words per cluster
    cluster_dict = {}
    for i, word in enumerate(words):
        cluster_id = clusters[i]
        if cluster_id not in cluster_dict:
            cluster_dict[cluster_id] = []
        cluster_dict[cluster_id].append(word)
    # Create a DataFrame from the dictionary, padding shorter clusters with None
    max_words = max(len(cluster_words) for cluster_words in cluster_dict.values())
    num_clusters = len(cluster_dict)
    data = {f"Cluster {i}": cluster_dict.get(i, []) + [None] * (max_words - len(cluster_dict.get(i, [])))
            for i in range(num_clusters)}
    df = pd.DataFrame(data)
    return df
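# Illustrative output shape: with words ['seo', 'blog', 'tool'] and clusters [0, 0, 1],
# the DataFrame has column 'Cluster 0' -> ['seo', 'blog'] and 'Cluster 1' -> ['tool', None],
# so every column ends up the same length.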
def clean_text(text):
    # Separate words that are meant to be separated (split camelCase runs)
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Remove nonsensical words: keep dictionary words and Titlecase tokens
    english_words = set(words.words())
    clean_tokens = [token for token in tokens if token.lower() in english_words or token.istitle()]
    # Join tokens back into a string
    return ' '.join(clean_tokens)
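# Example (illustrative): clean_text("ContactUs pricingxzqw") first splits the
# camelCase run into "Contact Us", then drops "pricingxzqw" because it is neither
# in the NLTK word list nor Titlecase, returning roughly "Contact Us".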
def visualize_clusters_og(words, model):
    matrix = np.zeros((len(words), model.vector_size))
    for i, word in enumerate(words):
        matrix[i, :] = model.wv[word]
    n_clusters = 5
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(matrix)
    tsne = TSNE(n_components=2, random_state=42)
    coords = tsne.fit_transform(matrix)
    x, y = coords[:, 0], coords[:, 1]
    colors = cm.rainbow(np.linspace(0, 1, n_clusters))
    plt.figure(figsize=(8, 8))
    for i, word in enumerate(words):
        plt.scatter(x[i], y[i], c=[colors[clusters[i]]], alpha=0.7)
        plt.text(x[i], y[i], word, fontsize=10)
    plt.xticks([])
    plt.yticks([])
    plt.title('Word Clusters based on Thematic Relatedness')
    plt.show()
def visualize_clusters_plot(words, model):
    matrix = np.zeros((len(words), model.vector_size))
    for i, word in enumerate(words):
        matrix[i, :] = model.wv[word]
    n_clusters = 4
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(matrix)
    try:
        tsne = TSNE(n_components=2, random_state=42)
        coords = tsne.fit_transform(matrix)
    except ValueError:
        # t-SNE requires perplexity < n_samples; retry with the largest legal value.
        max_perplexity = len(words) - 1
        tsne = TSNE(n_components=2, random_state=42, perplexity=max_perplexity)
        coords = tsne.fit_transform(matrix)
    x, y = coords[:, 0], coords[:, 1]
    colors = cm.rainbow(np.linspace(0, 1, n_clusters))
    # One subplot per cluster, sized roughly in proportion to cluster populations.
    fig, axs = plt.subplots(2, 2, figsize=(8, 8),
                            gridspec_kw={'width_ratios': [sum(clusters == 0) + sum(clusters == 1),
                                                          sum(clusters == 2) + sum(clusters == 3)],
                                         'height_ratios': [sum(clusters == 0) + sum(clusters == 2),
                                                           sum(clusters == 1) + sum(clusters == 3)]})
    fig.subplots_adjust(wspace=0, hspace=0)
    for ax in axs.ravel():
        ax.axis('off')
    for i, word in enumerate(words):
        cluster_idx = clusters[i]
        ax = axs[cluster_idx // 2, cluster_idx % 2]
        ax.scatter(x[i], y[i], c=[colors[cluster_idx]], alpha=0.7)
        ax.text(x[i], y[i], word, fontsize=10)
    plt.tight_layout()
    temp_image_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
    plt.savefig(temp_image_file.name, format='png')
    plt.close()
    return temp_image_file.name, clusters
def sanitize_url(url):
    if not re.match(r'^(http|https)://', url):
        url = 'http://' + url
    if not re.match(r'^(http|https)://www\.', url):
        url = re.sub(r'^(http|https)://', r'\g<0>www.', url)
    return url
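# Illustrative behavior of the two regexes above:
#   sanitize_url("example.com")         -> "http://www.example.com"
#   sanitize_url("https://example.com") -> "https://www.example.com"
# Note this always forces a www. prefix, which not every site actually serves.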
# Define the inputs and outputs (Gradio 3+ component API; gr.inputs/gr.outputs are deprecated)
competitor_url_input = gr.Textbox(label="Competitor URL", placeholder="Enter a competitor URL")
full_site_scrape_checkbox = gr.Checkbox(label="Tick for full site scrape (otherwise landing page only)")
meta_tags_output = gr.Textbox(label="Meta Tags")
heading_tags_output = gr.Textbox(label="Heading Tags")
top10keywords_output = gr.Textbox(label="Top 10 Keywords")
cluster_table_output = gr.HTML(label="Cluster Table")
cluster_plot_output = gr.Image(type='filepath', label="Cluster Plot")
keyword_plot_output = gr.Image(type='filepath', label="Keyword Plot")
seo_analysis_output = gr.Textbox(label="SEO Analysis")
def append_unique_elements(source, target):
    # Iterate over a static copy: target.append() *moves* a Tag into the target
    # tree, which would otherwise mutate `source` mid-iteration and skip siblings.
    for element in list(source):
        if isinstance(element, Tag) and element not in target:
            target.append(element)
def get_internal_links(url: str):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    internal_links = set()
    for link in soup.find_all("a"):
        href = link.get("href")
        if href:
            joined_url = urljoin(url, href)
            parsed_url = urlparse(joined_url)
            if parsed_url.netloc == urlparse(url).netloc:
                internal_links.add(joined_url)
    return internal_links
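# Hedged usage sketch: get_internal_links("http://www.example.com") returns a set
# of absolute same-host URLs, e.g. {"http://www.example.com/about", ...}. Links to
# any other netloc (including "example.com" without www) fail the netloc check.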
def analyse_SEO(SEO_prompt):
    # Legacy completions endpoint, as in the original code (requires openai<1.0
    # and the text-davinci-003 model). Defined once at module level so both
    # analysis paths can share it.
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=SEO_prompt,
        temperature=0.7,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    gpt3_response = response.get('choices')[0].text
    return gpt3_response, response

def analyze_single_page(competitor_url: str):
    sanitized_url = sanitize_url(competitor_url)
    soup = get_page_content(sanitized_url)
    # Scrape and analyze meta tags
    meta_tags = get_meta_tags(soup)
    topmetatags = ""
    for name, content in meta_tags.items():
        if "description" in name.lower():
            topmetatags += f"{name}: {content}\n"
    # Scrape and analyze heading tags
    heading_tags = get_heading_tags(soup)
    topheadingtags = ""
    for tag, headings in heading_tags.items():
        filtered_headings = [heading for heading in headings if len(heading) > 2]
        if filtered_headings:
            topheadingtags += f"{tag}: {', '.join(filtered_headings)}\n"
    # Scrape, analyze, and visualize keywords from page content
    page_text = soup.get_text()
    page_text_cleaned = clean_text(page_text)
    preprocessed_text = preprocess_text(page_text_cleaned)
    keywords_counter = Counter(preprocessed_text)
    top10keywords = ""
    for keyword, count in analyze_keywords(keywords_counter, top_n=10):
        top10keywords += f"{keyword}: {count}\n"
    # Semantic clustering and visualization: train a small Word2Vec model on
    # ten-word chunks of the page text, then cluster the top 50 keywords.
    sentences = [preprocessed_text[i:i+10] for i in range(0, len(preprocessed_text), 10)]
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    words = [word for word, _ in analyze_keywords(keywords_counter, top_n=50)]
    cluster_plot, clusters = visualize_clusters_plot(words, model)
    cluster_table = create_cluster_table(words, model, clusters)
    keyword_plot = visualize_keywords(keywords_counter, top_n=10)
    table_string = cluster_table.to_string(index=False)
    SEO_prompt = f"""The following information is given about a company's website:
Meta Tags:
{meta_tags}
Heading Tags:
{heading_tags}
Top 10 Keywords:
{top10keywords}
The following table represents clusters of thematically related words identified using NLP and clustering techniques. Each column represents a different cluster, and the words in each column are thematically related.
{table_string}
Please analyze the provided information and perform the following tasks:
1. Predict what the website is all about (the market sector).
2. Based on the market sector of the company, give a name to each cluster based on the theme it represents. The name needs to be the best summary of all the words in the cluster.
3. Perform a SWOT analysis (Strengths, Weaknesses, Opportunities, and Threats) from an SEO perspective for the company as a whole, taking into account the meta tags, heading tags, top 10 keywords, and the clusters.
Please provide your analysis in a clear and concise manner.
4. Lastly, suggest a list of 5 single words and 5 phrases (no longer than 3 words each) that the company should be using to improve their SEO
"""
    seo_analysis = analyse_SEO(SEO_prompt)
    return topmetatags, topheadingtags, top10keywords, cluster_table.to_html(), cluster_plot, keyword_plot, seo_analysis[0]
def analyze_website(competitor_url: str, full_site_scrape: bool = False):
    if not full_site_scrape:
        return analyze_single_page(competitor_url)
    sanitized_url = sanitize_url(competitor_url)
    internal_links = get_internal_links(sanitized_url)
    # Merge every internal page into one synthetic document before analysis.
    soup_collection = BeautifulSoup("<html><head></head><body></body></html>", "html.parser")
    for link in internal_links:
        try:
            soup = get_page_content(link)
            append_unique_elements(soup.head, soup_collection.head)
            append_unique_elements(soup.body, soup_collection.body)
        except Exception as e:
            print(f"Failed to analyze link: {link}. Error: {e}")
    print('got all the links')
    # Scrape and analyze meta tags
    meta_tags = get_meta_tags(soup_collection)
    topmetatags = ""
    for name, content in meta_tags.items():
        if "description" in name.lower():
            topmetatags += f"{name}: {content}\n"
    print('fetched metatags')
    # Scrape and analyze heading tags
    heading_tags = get_heading_tags(soup_collection)
    topheadingtags = ""
    for tag, headings in heading_tags.items():
        filtered_headings = [heading for heading in headings if len(heading) > 2]
        if filtered_headings:
            topheadingtags += f"{tag}: {', '.join(filtered_headings)}\n"
    print("fetched heading tags")
    # Scrape, analyze, and visualize keywords from page content
    page_text = soup_collection.get_text()
    page_text_cleaned = clean_text(page_text)
    preprocessed_text = preprocess_text(page_text_cleaned)
    keywords_counter = Counter(preprocessed_text)
    top10keywords = ""
    for keyword, count in analyze_keywords(keywords_counter, top_n=10):
        top10keywords += f"{keyword}: {count}\n"
    print("fetched keywords")
    # Semantic clustering and visualization
    sentences = [preprocessed_text[i:i+10] for i in range(0, len(preprocessed_text), 10)]
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    words = [word for word, _ in analyze_keywords(keywords_counter, top_n=50)]
    cluster_plot, clusters = visualize_clusters_plot(words, model)
    cluster_table = create_cluster_table(words, model, clusters)
    keyword_plot = visualize_keywords(keywords_counter, top_n=10)
    print("plotted figures")
    table_string = cluster_table.to_string(index=False)
    print("created table string")
    # Keep only the ten most frequent, non-empty headings per tag for the prompt.
    heading_tags_compressed = {}
    for key, values in heading_tags.items():
        count = Counter(values)
        sorted_values = sorted(count.keys(), key=lambda x: count[x], reverse=True)
        filtered_values = [value for value in sorted_values if value.strip() != ""]
        heading_tags_compressed[key] = filtered_values[:10]
    # Deduplicated, frequency-sorted headings per tag for the UI output.
    heading_tags_clean = {}
    for key, values in heading_tags.items():
        count = Counter(values)
        sorted_values_clean = sorted(count.keys(), key=lambda x: count[x], reverse=True)
        heading_tags_clean[key] = [value for value in sorted_values_clean if value.strip() != ""]
    print("cleaned up heading tags")
    SEO_prompt = f"""The following information is given about a company's website:
Meta Tags:
{meta_tags}
Heading Tags:
{heading_tags_compressed}
Top 10 Keywords:
{top10keywords}
The following table represents clusters of thematically related words identified using NLP and clustering techniques. Each column represents a different cluster, and the words in each column are thematically related.
{table_string}
Please analyze the provided information and perform the following tasks:
1. Predict what the website is all about (the market sector).
2. Based on the market sector of the company, give a name to each cluster based on the theme it represents. The name needs to be the best summary of all the words in the cluster.
3. Perform a SWOT analysis (Strengths, Weaknesses, Opportunities, and Threats) from an SEO perspective for the company as a whole, taking into account the meta tags, heading tags, top 10 keywords, and the clusters.
Please provide your analysis in a clear and concise manner.
4. Lastly, suggest a list of 10 words and 10 phrases that the company should be using to improve their SEO
"""
    print("defined SEO prompt")
    seo_analysis = analyse_SEO(SEO_prompt)
    print("ran seo analysis")
    return topmetatags, heading_tags_clean, top10keywords, cluster_table.to_html(), cluster_plot, keyword_plot, seo_analysis[0]
gr.Interface(
    fn=analyze_website,
    inputs=[competitor_url_input, full_site_scrape_checkbox],
    outputs=[
        meta_tags_output,
        heading_tags_output,
        top10keywords_output,
        cluster_table_output,
        cluster_plot_output,
        keyword_plot_output,
        seo_analysis_output,
    ],
    title="SEO Analysis Tool",
    description="Enter a competitor URL to perform an SEO analysis (some JavaScript-heavy pages will deny a full scrape).",
).launch(debug=True)