import os
import io
import base64
import re
import string
import tempfile
from collections import Counter
from urllib.parse import urlparse, urljoin

import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
import seaborn as sns
import gradio as gr
import openai
import nltk
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup, Tag
from gensim.models import Word2Vec
from googlesearch import search
from pytrends.request import TrendReq
from sklearn.manifold import MDS, TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from IPython.display import HTML

# One-time NLTK data downloads (stopword list, tokenizer models, English word list)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('words')
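# A rough sketch of the assumed dependencies (package names are my best guess
# from the imports above; the original Space's requirements are not shown):
#   pip install requests beautifulsoup4 nltk gensim pandas matplotlib seaborn \
#       gradio openai googlesearch-python pytrends scikit-learn numpy ipython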
# Set your OpenAI API key here (read from the OPENAI_API_KEY environment variable)
openai.api_key = os.environ['OPENAI_API_KEY']
# Define functions
def get_image_html(fig):
    # Encode a Matplotlib figure as an inline base64 <img> tag.
    buf = io.BytesIO()
    fig.savefig(buf, format='png')
    buf.seek(0)
    return '<img src="data:image/png;base64,{}"/>'.format(base64.b64encode(buf.getvalue()).decode('ascii'))
def search_top_competitors(keywords, num_results=10):
    competitors = set()
    for keyword in keywords:
        for url in search(keyword, num_results=num_results):
            competitors.add(url)
    return list(competitors)
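# Hedged usage sketch (not executed here; assumes the googlesearch-python package,
# which this code already imports). The result URLs below are hypothetical:
#   search_top_competitors(["seo audit tool"], num_results=5)
#   -> ['https://example-a.com/...', 'https://example-b.com/...']  # order not guaranteed (set-based)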
def get_page_content(url):
    response = requests.get(url)
    return BeautifulSoup(response.text, 'html.parser')
def get_meta_tags(soup):
    meta_tags = soup.find_all('meta')
    return {tag.get('name'): tag.get('content') for tag in meta_tags if tag.get('name')}
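# Example (illustrative): for <meta name="description" content="Acme widgets">,
# get_meta_tags returns {'description': 'Acme widgets'}. Tags keyed by property
# rather than name (e.g. OpenGraph's property="og:title") are skipped, because
# the comprehension above filters on the name attribute.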
def get_heading_tags(soup):
    headings = {}
    for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
        headings[tag] = [heading.text for heading in soup.find_all(tag)]
    return headings
def analyze_keywords(keywords_counter, top_n=10):
    return keywords_counter.most_common(top_n)
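# Example (illustrative): this is a thin wrapper around Counter.most_common, e.g.
#   analyze_keywords(Counter({'seo': 5, 'tool': 3, 'blog': 1}), top_n=2)
#   -> [('seo', 5), ('tool', 3)]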
def visualize_keywords(keywords_counter, top_n=10):
    common_keywords = analyze_keywords(keywords_counter, top_n)
    df = pd.DataFrame(common_keywords, columns=['Keyword', 'Count'])
    df.set_index('Keyword', inplace=True)
    df.plot(kind='bar', figsize=(12, 6))
    plt.title('Top Keywords')
    plt.xlabel('Keywords')
    plt.ylabel('Frequency')
    plt.tight_layout()
    # Save to a temp file so Gradio can serve the image by filepath.
    temp_image_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
    plt.savefig(temp_image_file.name, format='png')
    plt.close()
    return temp_image_file.name
def plot_trends(keywords):
    pytrends = TrendReq(hl='en-US', tz=360, retries=3)
    pytrends.build_payload(keywords, cat=0, timeframe='today 12-m', geo='', gprop='')
    trends_data = pytrends.interest_over_time()
    return trends_data
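# Hedged usage sketch: pytrends' interest_over_time() returns a DataFrame indexed
# by date with one column per keyword (plus an 'isPartial' flag), e.g.
#   plot_trends(['seo'])  ->  columns roughly ['seo', 'isPartial']
# Note: this helper is defined but never wired into the Gradio app below.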
def preprocess_text(text, min_word_length=3):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalnum()]
    words = [word for word in words if len(word) >= min_word_length and word not in stop_words]
    return words
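# Example (illustrative): preprocess_text("The Cats and the DOGS ran fast!")
# -> ['cats', 'dogs', 'ran', 'fast']
# (lowercased, punctuation dropped, tokens shorter than 3 chars and stopwords removed)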
def visualize_clusters(words, model):
    matrix = np.zeros((len(words), model.vector_size))
    for i, word in enumerate(words):
        matrix[i, :] = model.wv[word]
    mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
    distance_matrix = 1 - cosine_similarity(matrix)
    coords = mds.fit_transform(distance_matrix)
    x, y = coords[:, 0], coords[:, 1]
    for i, word in enumerate(words):
        plt.scatter(x[i], y[i], alpha=0.5)
        plt.text(x[i], y[i], word, fontsize=10)
    plt.title('Word Clusters based on Thematic Relatedness')
    plt.show()
def create_cluster_table(words, model, clusters):
    # (`model` is unused here but kept for call-site compatibility.)
    # Create a dictionary to store words per cluster
    cluster_dict = {}
    for i, word in enumerate(words):
        cluster_id = clusters[i]
        if cluster_id not in cluster_dict:
            cluster_dict[cluster_id] = []
        cluster_dict[cluster_id].append(word)
    # Create a DataFrame from the dictionary, padding shorter clusters with None
    max_words = max(len(cluster_words) for cluster_words in cluster_dict.values())
    num_clusters = len(cluster_dict)
    data = {f"Cluster {i}": cluster_dict.get(i, []) + [None] * (max_words - len(cluster_dict.get(i, [])))
            for i in range(num_clusters)}
    df = pd.DataFrame(data)
    return df
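# Illustrative output shape: with words ['seo', 'blog', 'tool'] and clusters [0, 0, 1],
# the DataFrame has column 'Cluster 0' -> ['seo', 'blog'] and 'Cluster 1' -> ['tool', None],
# so every column ends up the same length.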
def clean_text(text):
    # Separate words that are meant to be separated (split camelCase runs)
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Remove nonsensical words: keep dictionary words and Titlecase tokens
    english_words = set(words.words())
    clean_tokens = [token for token in tokens if token.lower() in english_words or token.istitle()]
    # Join tokens back into a string
    return ' '.join(clean_tokens)
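# Example (illustrative): clean_text("ContactUs pricingxzqw") first splits the
# camelCase run into "Contact Us", then drops "pricingxzqw" because it is neither
# in the NLTK word list nor Titlecase, returning roughly "Contact Us".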
def visualize_clusters_og(words, model):
    matrix = np.zeros((len(words), model.vector_size))
    for i, word in enumerate(words):
        matrix[i, :] = model.wv[word]
    n_clusters = 5
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(matrix)
    tsne = TSNE(n_components=2, random_state=42)
    coords = tsne.fit_transform(matrix)
    x, y = coords[:, 0], coords[:, 1]
    colors = cm.rainbow(np.linspace(0, 1, n_clusters))
    plt.figure(figsize=(8, 8))
    for i, word in enumerate(words):
        plt.scatter(x[i], y[i], c=[colors[clusters[i]]], alpha=0.7)
        plt.text(x[i], y[i], word, fontsize=10)
    plt.xticks([])
    plt.yticks([])
    plt.title('Word Clusters based on Thematic Relatedness')
    plt.show()
def visualize_clusters_plot(words, model):
    matrix = np.zeros((len(words), model.vector_size))
    for i, word in enumerate(words):
        matrix[i, :] = model.wv[word]
    n_clusters = 4
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(matrix)
    try:
        tsne = TSNE(n_components=2, random_state=42)
        coords = tsne.fit_transform(matrix)
    except ValueError:
        # t-SNE requires perplexity < n_samples; retry with the largest legal value.
        max_perplexity = len(words) - 1
        tsne = TSNE(n_components=2, random_state=42, perplexity=max_perplexity)
        coords = tsne.fit_transform(matrix)
    x, y = coords[:, 0], coords[:, 1]
    colors = cm.rainbow(np.linspace(0, 1, n_clusters))
    # One subplot per cluster, sized roughly in proportion to cluster populations.
    fig, axs = plt.subplots(2, 2, figsize=(8, 8),
                            gridspec_kw={'width_ratios': [sum(clusters == 0) + sum(clusters == 1),
                                                          sum(clusters == 2) + sum(clusters == 3)],
                                         'height_ratios': [sum(clusters == 0) + sum(clusters == 2),
                                                           sum(clusters == 1) + sum(clusters == 3)]})
    fig.subplots_adjust(wspace=0, hspace=0)
    for ax in axs.ravel():
        ax.axis('off')
    for i, word in enumerate(words):
        cluster_idx = clusters[i]
        ax = axs[cluster_idx // 2, cluster_idx % 2]
        ax.scatter(x[i], y[i], c=[colors[cluster_idx]], alpha=0.7)
        ax.text(x[i], y[i], word, fontsize=10)
    plt.tight_layout()
    temp_image_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
    plt.savefig(temp_image_file.name, format='png')
    plt.close()
    return temp_image_file.name, clusters
def sanitize_url(url):
    if not re.match(r'^(http|https)://', url):
        url = 'http://' + url
    if not re.match(r'^(http|https)://www\.', url):
        url = re.sub(r'^(http|https)://', r'\g<0>www.', url)
    return url
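# Illustrative behavior of the two regexes above:
#   sanitize_url("example.com")         -> "http://www.example.com"
#   sanitize_url("https://example.com") -> "https://www.example.com"
# Note this always forces a www. prefix, which not every site actually serves.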
# Define the inputs and outputs (Gradio 3+ component API; gr.inputs/gr.outputs are deprecated)
competitor_url_input = gr.Textbox(label="Competitor URL", placeholder="Enter a competitor URL")
full_site_scrape_checkbox = gr.Checkbox(label="Tick for full site scrape (otherwise landing page only)")
meta_tags_output = gr.Textbox(label="Meta Tags")
heading_tags_output = gr.Textbox(label="Heading Tags")
top10keywords_output = gr.Textbox(label="Top 10 Keywords")
cluster_table_output = gr.HTML(label="Cluster Table")
cluster_plot_output = gr.Image(type='filepath', label="Cluster Plot")
keyword_plot_output = gr.Image(type='filepath', label="Keyword Plot")
seo_analysis_output = gr.Textbox(label="SEO Analysis")
def append_unique_elements(source, target):
    # Iterate over a static copy: target.append() *moves* a Tag into the target
    # tree, which would otherwise mutate `source` mid-iteration and skip siblings.
    for element in list(source):
        if isinstance(element, Tag) and element not in target:
            target.append(element)
def get_internal_links(url: str):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    internal_links = set()
    for link in soup.find_all("a"):
        href = link.get("href")
        if href:
            joined_url = urljoin(url, href)
            parsed_url = urlparse(joined_url)
            if parsed_url.netloc == urlparse(url).netloc:
                internal_links.add(joined_url)
    return internal_links
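# Hedged usage sketch: get_internal_links("http://www.example.com") returns a set
# of absolute same-host URLs, e.g. {"http://www.example.com/about", ...}. Links to
# any other netloc (including "example.com" without www) fail the netloc check.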
def analyse_SEO(SEO_prompt):
    # Legacy completions endpoint, as in the original code (requires openai<1.0
    # and the text-davinci-003 model). Defined once at module level so both
    # analysis paths can share it.
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=SEO_prompt,
        temperature=0.7,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    gpt3_response = response.get('choices')[0].text
    return gpt3_response, response

def analyze_single_page(competitor_url: str):
    sanitized_url = sanitize_url(competitor_url)
    soup = get_page_content(sanitized_url)
    # Scrape and analyze meta tags
    meta_tags = get_meta_tags(soup)
    topmetatags = ""
    for name, content in meta_tags.items():
        if "description" in name.lower():
            topmetatags += f"{name}: {content}\n"
    # Scrape and analyze heading tags
    heading_tags = get_heading_tags(soup)
    topheadingtags = ""
    for tag, headings in heading_tags.items():
        filtered_headings = [heading for heading in headings if len(heading) > 2]
        if filtered_headings:
            topheadingtags += f"{tag}: {', '.join(filtered_headings)}\n"
    # Scrape, analyze, and visualize keywords from page content
    page_text = soup.get_text()
    page_text_cleaned = clean_text(page_text)
    preprocessed_text = preprocess_text(page_text_cleaned)
    keywords_counter = Counter(preprocessed_text)
    top10keywords = ""
    for keyword, count in analyze_keywords(keywords_counter, top_n=10):
        top10keywords += f"{keyword}: {count}\n"
    # Semantic clustering and visualization: train a small Word2Vec model on
    # ten-word chunks of the page text, then cluster the top 50 keywords.
    sentences = [preprocessed_text[i:i+10] for i in range(0, len(preprocessed_text), 10)]
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    words = [word for word, _ in analyze_keywords(keywords_counter, top_n=50)]
    cluster_plot, clusters = visualize_clusters_plot(words, model)
    cluster_table = create_cluster_table(words, model, clusters)
    keyword_plot = visualize_keywords(keywords_counter, top_n=10)
    table_string = cluster_table.to_string(index=False)
    SEO_prompt = f"""The following information is given about a company's website:
Meta Tags:
{meta_tags}
Heading Tags:
{heading_tags}
Top 10 Keywords:
{top10keywords}
The following table represents clusters of thematically related words identified using NLP and clustering techniques. Each column represents a different cluster, and the words in each column are thematically related.
{table_string}
Please analyze the provided information and perform the following tasks:
1. Predict what the website is all about (the market sector).
2. Based on the market sector of the company, give a name to each cluster based on the theme it represents. The name needs to be the best summary of all the words in the cluster.
3. Perform a SWOT analysis (Strengths, Weaknesses, Opportunities, and Threats) from an SEO perspective for the company as a whole, taking into account the meta tags, heading tags, top 10 keywords, and the clusters.
Please provide your analysis in a clear and concise manner.
4. Lastly, suggest a list of 5 single words and 5 phrases (no longer than 3 words each) that the company should be using to improve their SEO
"""
    seo_analysis = analyse_SEO(SEO_prompt)
    return topmetatags, topheadingtags, top10keywords, cluster_table.to_html(), cluster_plot, keyword_plot, seo_analysis[0]
def analyze_website(competitor_url: str, full_site_scrape: bool = False):
    if not full_site_scrape:
        return analyze_single_page(competitor_url)
    sanitized_url = sanitize_url(competitor_url)
    internal_links = get_internal_links(sanitized_url)
    # Merge every internal page into one synthetic document before analysis.
    soup_collection = BeautifulSoup("<html><head></head><body></body></html>", "html.parser")
    for link in internal_links:
        try:
            soup = get_page_content(link)
            append_unique_elements(soup.head, soup_collection.head)
            append_unique_elements(soup.body, soup_collection.body)
        except Exception as e:
            print(f"Failed to analyze link: {link}. Error: {e}")
    print('got all the links')
    # Scrape and analyze meta tags
    meta_tags = get_meta_tags(soup_collection)
    topmetatags = ""
    for name, content in meta_tags.items():
        if "description" in name.lower():
            topmetatags += f"{name}: {content}\n"
    print('fetched metatags')
    # Scrape and analyze heading tags
    heading_tags = get_heading_tags(soup_collection)
    topheadingtags = ""
    for tag, headings in heading_tags.items():
        filtered_headings = [heading for heading in headings if len(heading) > 2]
        if filtered_headings:
            topheadingtags += f"{tag}: {', '.join(filtered_headings)}\n"
    print("fetched heading tags")
    # Scrape, analyze, and visualize keywords from page content
    page_text = soup_collection.get_text()
    page_text_cleaned = clean_text(page_text)
    preprocessed_text = preprocess_text(page_text_cleaned)
    keywords_counter = Counter(preprocessed_text)
    top10keywords = ""
    for keyword, count in analyze_keywords(keywords_counter, top_n=10):
        top10keywords += f"{keyword}: {count}\n"
    print("fetched keywords")
    # Semantic clustering and visualization
    sentences = [preprocessed_text[i:i+10] for i in range(0, len(preprocessed_text), 10)]
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    words = [word for word, _ in analyze_keywords(keywords_counter, top_n=50)]
    cluster_plot, clusters = visualize_clusters_plot(words, model)
    cluster_table = create_cluster_table(words, model, clusters)
    keyword_plot = visualize_keywords(keywords_counter, top_n=10)
    print("plotted figures")
    table_string = cluster_table.to_string(index=False)
    print("created table string")
    # Keep only the ten most frequent, non-empty headings per tag for the prompt.
    heading_tags_compressed = {}
    for key, values in heading_tags.items():
        count = Counter(values)
        sorted_values = sorted(count.keys(), key=lambda x: count[x], reverse=True)
        filtered_values = [value for value in sorted_values if value.strip() != ""]
        heading_tags_compressed[key] = filtered_values[:10]
    # Deduplicated, frequency-sorted headings per tag for the UI output.
    heading_tags_clean = {}
    for key, values in heading_tags.items():
        count = Counter(values)
        sorted_values_clean = sorted(count.keys(), key=lambda x: count[x], reverse=True)
        heading_tags_clean[key] = [value for value in sorted_values_clean if value.strip() != ""]
    print("cleaned up heading tags")
    SEO_prompt = f"""The following information is given about a company's website:
Meta Tags:
{meta_tags}
Heading Tags:
{heading_tags_compressed}
Top 10 Keywords:
{top10keywords}
The following table represents clusters of thematically related words identified using NLP and clustering techniques. Each column represents a different cluster, and the words in each column are thematically related.
{table_string}
Please analyze the provided information and perform the following tasks:
1. Predict what the website is all about (the market sector).
2. Based on the market sector of the company, give a name to each cluster based on the theme it represents. The name needs to be the best summary of all the words in the cluster.
3. Perform a SWOT analysis (Strengths, Weaknesses, Opportunities, and Threats) from an SEO perspective for the company as a whole, taking into account the meta tags, heading tags, top 10 keywords, and the clusters.
Please provide your analysis in a clear and concise manner.
4. Lastly, suggest a list of 10 words and 10 phrases that the company should be using to improve their SEO
"""
    print("defined SEO prompt")
    seo_analysis = analyse_SEO(SEO_prompt)
    print("ran seo analysis")
    return topmetatags, heading_tags_clean, top10keywords, cluster_table.to_html(), cluster_plot, keyword_plot, seo_analysis[0]
gr.Interface(
    fn=analyze_website,
    inputs=[competitor_url_input, full_site_scrape_checkbox],
    outputs=[
        meta_tags_output,
        heading_tags_output,
        top10keywords_output,
        cluster_table_output,
        cluster_plot_output,
        keyword_plot_output,
        seo_analysis_output,
    ],
    title="SEO Analysis Tool",
    description="Enter a competitor URL to perform an SEO analysis (some JavaScript-heavy pages will deny a full scrape).",
).launch(debug=True)