seo-analysis-tool/.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
seo-analysis-tool/README.md ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Seo Analysis Tool
3
+ emoji: 📉
4
+ colorFrom: yellow
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 3.28.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+
13
+ ## Introduction
14
+ This project is an SEO analysis tool designed to analyze websites for SEO-related metrics and provide insights. The tool offers functionality to scrape web content, analyze keyword distribution, and provide SEO optimization suggestions.
15
+
16
+ ## Original Author
17
+ Originally developed by PhilPome.
18
+
19
+ ## Updates by crconyc
20
+ This project has been updated by crconyc in November 2023 to align with the latest version of Gradio. The updates include modifications to the Gradio interface setup and function calls, ensuring compatibility with the latest Gradio API.
21
+
22
+ ## Update Details
23
+ - Updated Gradio interface calls to match the latest API specifications as of November 2023.
24
+ - Refactored the code for improved clarity and efficiency, especially in the handling of Gradio inputs and outputs (see https://www.gradio.app/docs/interface).
25
+ - Ensured compatibility with Gradio version 4.7.1, addressing previous issues with outdated API usage.
26
+
27
+ ## Installation
28
+ To install the necessary dependencies for this tool, run the following command:
29
+ pip install -r requirements.txt
30
+
31
+
32
+ ## Usage
33
+ To run the tool, navigate to the project directory and execute:
34
+ python app.py
35
+
36
+ Follow the on-screen instructions or prompts to perform the SEO analysis.
37
+
38
+ ## License
39
+ Apache 2.0
40
+
41
+ ## Acknowledgements
42
+ Special thanks to PhilPome for developing the initial version!
seo-analysis-tool/app.py ADDED
@@ -0,0 +1,523 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Standard library
import os
import re
import string
import io
import base64
import tempfile
from collections import Counter
from urllib.parse import urlparse, urljoin

# Third-party
import requests
from bs4 import BeautifulSoup, Tag
import nltk
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
import seaborn as sns
import gradio as gr
import openai
from googlesearch import search
from pytrends.request import TrendReq
from sklearn.manifold import MDS, TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from IPython.display import HTML
import numpy as np
29
+
30
+
31
+
32
+ nltk.download('stopwords')
33
+ nltk.download('punkt')
34
+ nltk.download('words')
35
+
36
+ # Set your OpenAI API key here
37
+ openai.api_key = os.environ['OPENAI_API_KEY']
38
+
39
+
40
+ #@title Define functions
41
+
42
def get_image_html(fig):
    """Render a figure as an inline base64-encoded ``<img>`` HTML tag.

    Args:
        fig: any object exposing ``savefig(buf, format='png')`` — in
            practice a Matplotlib Figure.

    Returns:
        str: an ``<img>`` tag whose ``src`` is a PNG data URI.
    """
    # Bug fix: the original referenced io/base64 without importing them
    # anywhere in the module, so every call raised NameError.
    import base64
    import io

    buf = io.BytesIO()
    fig.savefig(buf, format='png')
    buf.seek(0)
    return '<img src="data:image/png;base64,{}"/>'.format(base64.b64encode(buf.getvalue()).decode('ascii'))
47
+
48
+
49
def search_top_competitors(keywords, num_results=10):
    """Collect the unique Google result URLs for each keyword query.

    Args:
        keywords: iterable of search-query strings.
        num_results: how many results to request per keyword.

    Returns:
        list[str]: de-duplicated result URLs (order not guaranteed).
    """
    seen = set()
    for query in keywords:
        seen.update(search(query, num_results=num_results))
    return list(seen)
55
+
56
+
57
+
58
def get_page_content(url):
    """Fetch *url* over HTTP and return its HTML parsed with BeautifulSoup."""
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    return soup
61
+
62
def get_meta_tags(soup):
    """Map each named ``<meta>`` tag in *soup* to its ``content`` attribute.

    Tags without a ``name`` attribute (e.g. charset declarations) are
    skipped; duplicate names keep the last occurrence.
    """
    result = {}
    for tag in soup.find_all('meta'):
        name = tag.get('name')
        if name:
            result[name] = tag.get('content')
    return result
65
+
66
def get_heading_tags(soup):
    """Return the text of every heading, keyed by level.

    Returns:
        dict: ``{'h1': [text, ...], ..., 'h6': [...]}``; levels with no
        headings map to empty lists.
    """
    return {
        level: [node.text for node in soup.find_all(level)]
        for level in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    }
71
+
72
def analyze_keywords(keywords_counter, top_n=10):
    """Return the *top_n* most frequent ``(keyword, count)`` pairs."""
    ranked = keywords_counter.most_common(top_n)
    return ranked
74
+
75
def visualize_keywords(keywords_counter, top_n=10):
    """Plot the *top_n* keyword frequencies as a bar chart saved to a PNG.

    Args:
        keywords_counter: ``collections.Counter`` of keyword -> count.
        top_n: number of most-frequent keywords to plot.

    Returns:
        str: path of a temporary PNG file. ``delete=False`` is deliberate —
        the file must survive this call so Gradio can serve it.
    """
    common_keywords = analyze_keywords(keywords_counter, top_n)
    df = pd.DataFrame(common_keywords, columns=['Keyword', 'Count'])
    df.set_index('Keyword', inplace=True)
    df.plot(kind='bar', figsize=(12, 6))
    plt.title('Top Keywords')
    plt.xlabel('Keywords')
    plt.ylabel('Frequency')
    plt.tight_layout()

    # Fix: close the NamedTemporaryFile handle before writing by name — the
    # original left it open, leaking a descriptor per call and failing on
    # Windows where an open file cannot be re-opened by its path.
    # (Also dropped the unused `fig = plt.gcf()` local.)
    temp_image_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
    temp_image_file.close()
    plt.savefig(temp_image_file.name, format='png')
    plt.close()
    return temp_image_file.name
91
+
92
+
93
def plot_trends(keywords):
    """Fetch 12 months of Google Trends interest-over-time for *keywords*."""
    client = TrendReq(hl='en-US', tz=360, retries=3)
    client.build_payload(keywords, cat=0, timeframe='today 12-m', geo='', gprop='')
    return client.interest_over_time()
98
+
99
+
100
+
101
def preprocess_text(text, min_word_length=3):
    """Tokenize *text* into lowercase keyword candidates.

    Keeps only alphanumeric tokens of at least *min_word_length* characters
    that are not NLTK English stopwords.
    """
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    return [
        token for token in tokens
        if token.isalnum() and len(token) >= min_word_length and token not in stop_words
    ]
107
+
108
def visualize_clusters(words, model):
    """Scatter-plot *words* in 2-D and display the figure.

    Coordinates come from MDS over the precomputed cosine-distance matrix of
    each word's Word2Vec vector; every point is labelled with its word.
    """
    vectors = np.zeros((len(words), model.vector_size))
    for idx, word in enumerate(words):
        vectors[idx, :] = model.wv[word]

    # Cosine distance matrix -> 2-D MDS embedding.
    distances = 1 - cosine_similarity(vectors)
    embedding = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
    coords = embedding.fit_transform(distances)

    xs, ys = coords[:, 0], coords[:, 1]
    for idx, word in enumerate(words):
        plt.scatter(xs[idx], ys[idx], alpha=0.5)
        plt.text(xs[idx], ys[idx], word, fontsize=10)

    plt.title('Word Clusters based on Thematic Relatedness')
    plt.show()
126
+
127
+
128
+
129
def create_cluster_table(words, model, clusters):
    """Arrange *words* into a DataFrame with one column per cluster.

    Args:
        words: sequence of words, parallel to *clusters*.
        model: unused; kept for interface compatibility (the original built
            a matrix of word vectors here but never read it — dead code).
        clusters: sequence of cluster ids, one per word.

    Returns:
        pandas.DataFrame: column ``"Cluster <id>"`` lists that cluster's
        words, padded with ``None`` so all columns have equal length.
    """
    # Group words by cluster id, preserving input order within each cluster.
    cluster_dict = {}
    for word, cluster_id in zip(words, clusters):
        cluster_dict.setdefault(cluster_id, []).append(word)

    # Fix: iterate the ids actually present (sorted) rather than
    # range(len(cluster_dict)) — with non-contiguous ids (e.g. {0, 2}) the
    # original silently dropped clusters and emitted empty columns.
    max_words = max(len(members) for members in cluster_dict.values())
    data = {
        f"Cluster {cluster_id}": members + [None] * (max_words - len(members))
        for cluster_id, members in sorted(cluster_dict.items())
    }
    return pd.DataFrame(data)
151
+
152
+
153
def clean_text(text):
    """Strip nonsense tokens from scraped page text.

    Splits fused camelCase runs ("homeAbout" -> "home About"), tokenizes,
    and keeps only tokens that are either in the NLTK English vocabulary or
    Titlecased (likely proper nouns).

    Returns:
        str: the surviving tokens joined with single spaces.
    """
    # Separate words that were fused together by HTML text extraction.
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)

    tokens = nltk.word_tokenize(text)

    # Fix: the original attempted ``set(words)`` (iterating the lazy corpus
    # loader object, not the word list) guarded by a bare ``except:``. The
    # vocabulary is what ``words.words()`` returns; use it directly.
    english_words = set(words.words())

    clean_tokens = [token for token in tokens if token.lower() in english_words or token.istitle()]

    # (Renamed the result local — the original shadowed the function name.)
    return ' '.join(clean_tokens)
171
+
172
def visualize_clusters_og(words, model):
    """Original cluster visualisation (kept for reference).

    Runs KMeans (k=5) over the words' Word2Vec vectors, projects to 2-D with
    t-SNE, colours each point by cluster, labels it, and shows the figure.
    """
    vectors = np.zeros((len(words), model.vector_size))
    for idx, word in enumerate(words):
        vectors[idx, :] = model.wv[word]

    n_clusters = 5
    labels = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit_predict(vectors)

    coords = TSNE(n_components=2, random_state=42).fit_transform(vectors)
    xs, ys = coords[:, 0], coords[:, 1]

    palette = cm.rainbow(np.linspace(0, 1, n_clusters))

    plt.figure(figsize=(8, 8))
    for idx, word in enumerate(words):
        plt.scatter(xs[idx], ys[idx], c=[palette[labels[idx]]], alpha=0.7)
        plt.text(xs[idx], ys[idx], word, fontsize=10)

    plt.xticks([])
    plt.yticks([])
    plt.title('Word Clusters based on Thematic Relatedness')
    plt.show()
198
+
199
+
200
def visualize_clusters_plot(words, model):
    """Cluster *words* (KMeans, k=4) and plot each cluster in its own quadrant.

    Returns:
        tuple[str, numpy.ndarray]: (path of a temporary PNG file, per-word
        cluster labels). ``delete=False`` keeps the PNG alive for Gradio.
    """
    matrix = np.zeros((len(words), model.vector_size))

    for i, word in enumerate(words):
        matrix[i, :] = model.wv[word]

    n_clusters = 4
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(matrix)

    # t-SNE's default perplexity must be < n_samples; when the word list is
    # small the first call raises ValueError and we retry with the largest
    # legal perplexity.
    try:
        tsne = TSNE(n_components=2, random_state=42)
        coords = tsne.fit_transform(matrix)
    except ValueError:
        max_perplexity = len(words) - 1
        tsne = TSNE(n_components=2, random_state=42, perplexity=max_perplexity)
        coords = tsne.fit_transform(matrix)

    x, y = coords[:, 0], coords[:, 1]

    colors = cm.rainbow(np.linspace(0, 1, n_clusters))

    # 2x2 grid, one quadrant per cluster id; each row/column is sized by how
    # many points its two clusters contain.
    # NOTE(review): if both clusters on one side are empty the corresponding
    # ratio is 0 — confirm matplotlib tolerates that for the expected inputs.
    fig, axs = plt.subplots(2, 2, figsize=(8, 8), gridspec_kw={'width_ratios': [sum(clusters == 0) + sum(clusters == 1), sum(clusters == 2) + sum(clusters == 3)], 'height_ratios': [sum(clusters == 0) + sum(clusters == 2), sum(clusters == 1) + sum(clusters == 3)]})
    fig.subplots_adjust(wspace=0, hspace=0)

    for ax in axs.ravel():
        ax.axis('off')

    # Cluster id -> quadrant: 0 top-left, 1 top-right, 2 bottom-left, 3 bottom-right.
    for i, word in enumerate(words):
        cluster_idx = clusters[i]
        ax = axs[cluster_idx // 2, cluster_idx % 2]
        ax.scatter(x[i], y[i], c=[colors[cluster_idx]], alpha=0.7)
        ax.text(x[i], y[i], word, fontsize=10)

    # NOTE(review): no artist carries a label, so this legend call draws
    # nothing (matplotlib warns) — looks vestigial.
    plt.legend(loc="best", fontsize=13)
    plt.tight_layout()
    temp_image_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
    plt.savefig(temp_image_file.name, format='png')
    plt.close()
    return temp_image_file.name, clusters
241
+
242
+
243
def sanitize_url(url):
    """Normalize *url* to the canonical ``http(s)://www.<host>`` form.

    Prepends ``http://`` when no scheme is present, then inserts ``www.``
    immediately after the scheme when missing.

    Fix: the patterns are now raw strings — the original's ``'...www\\.'``
    relied on the invalid escape sequence ``\\.`` in a plain string, a
    DeprecationWarning slated to become an error in future Python versions.
    """
    if not re.match(r'^(http|https)://', url):
        url = 'http://' + url

    if not re.match(r'^(http|https)://www\.', url):
        # \g<0> re-inserts the whole matched scheme before "www.".
        url = re.sub(r'^(http|https)://', r'\g<0>www.', url)

    return url
251
+
252
+
253
+
254
+
255
# Define the Gradio inputs and outputs (wired into gr.Interface at the
# bottom of the file; the output order must match the tuple returned by
# analyze_website).

# Inputs
competitor_url_input = gr.Textbox(label="Competitor URL", placeholder="Enter a competitor URL")

full_site_scrape_checkbox = gr.Checkbox(label="Tick for full site scrape (otherwise landing page only)")

# Outputs
meta_tags_output = gr.Textbox(label="Meta Tags")
heading_tags_output = gr.Textbox(label="Heading Tags")
top10keywords_output = gr.Textbox(label="Top 10 Keywords")
cluster_table_output = gr.HTML(label="Cluster Table")
cluster_plot_output = gr.Image(type='filepath', label="Cluster Plot")
keyword_plot_output = gr.Image(type='filepath', label="Keyword Plot")
seo_analysis_output = gr.Textbox(label="SEO Analysis")
268
+
269
def append_unique_elements(source, target):
    """Append to *target* every bs4 ``Tag`` from *source* not already there.

    Non-Tag nodes (e.g. bare strings) are skipped; the membership check uses
    ``in``, i.e. bs4's own equality semantics for tags.
    """
    for node in source:
        if not isinstance(node, Tag):
            continue
        if node not in target:
            target.append(node)
273
+
274
def get_internal_links(url: str):
    """Return the set of same-host links found on *url*'s page.

    Relative hrefs are resolved against *url*; only absolute links whose
    netloc matches the page's own netloc are kept.
    """
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    base_netloc = urlparse(url).netloc
    internal_links = set()

    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if not href:
            continue
        absolute = urljoin(url, href)
        if urlparse(absolute).netloc == base_netloc:
            internal_links.add(absolute)

    return internal_links
290
+
291
def analyze_single_page(competitor_url: str):
    """Scrape a single landing page and produce the full SEO report for it.

    Returns a 7-tuple matching the Gradio outputs: (meta-tag text,
    heading-tag text, top-10-keyword text, cluster-table HTML, cluster-plot
    PNG path, keyword-plot PNG path, GPT analysis text).
    """
    sanitized_url = sanitize_url(competitor_url)
    soup = get_page_content(sanitized_url)

    # Scrape and analyze meta tags (only description-like tags are reported).
    meta_tags = get_meta_tags(soup)
    topmetatags = ""
    for name, content in meta_tags.items():
        if "description" in name.lower():
            topmetatags += (f"{name}: {content}\n")

    # Scrape and analyze heading tags; drop trivial headings (<= 2 chars).
    heading_tags = get_heading_tags(soup)
    topheadingtags = ""
    for tag, headings in heading_tags.items():
        filtered_headings = [heading for heading in headings if len(heading) > 2]
        if filtered_headings:
            topheadingtags += (f"{tag}: {', '.join(filtered_headings)}\n")

    # Scrape, analyze, and visualize keywords from page content
    page_text = soup.get_text()
    page_text_cleaned = clean_text(page_text)
    preprocessed_text = preprocess_text(page_text_cleaned)

    keywords_counter = Counter(preprocessed_text)
    top10keywords = ""

    for keyword, count in analyze_keywords(keywords_counter, top_n=10):
        top10keywords += (f"{keyword}: {count}\n")

    # Semantic clustering: train a throwaway Word2Vec on 10-token chunks of
    # the page text, then cluster/plot the top-50 keywords.
    sentences = [preprocessed_text[i:i+10] for i in range(0, len(preprocessed_text), 10)]
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

    # NOTE(review): this local `words` shadows the nltk corpus import of the
    # same name (harmless here, but confusing).
    words = [word for word, _ in analyze_keywords(keywords_counter, top_n=50)]
    # NOTE(review): dead code — `clusters` is immediately overwritten by
    # visualize_clusters_plot() below, and doesnt_match() returns a word,
    # not a cluster id. Candidate for removal.
    clusters = [model.wv.doesnt_match(words)] * len(words)

    cluster_plot,clusters = visualize_clusters_plot(words, model)
    cluster_table = create_cluster_table(words, model, clusters)
    keyword_plot = visualize_keywords(keywords_counter, top_n=10)

    table_string = cluster_table.to_string(index=False)
    # NOTE(review): this is an f-string AND .format() is applied to its
    # result — the {{...}} placeholders survive the f-string as {...} and
    # are then filled by .format(); convoluted but functional. table_string
    # is interpolated by the f-string, so the .format kwarg of the same name
    # is unused.
    SEO_prompt = f"""The following information is given about a company's website:
Meta Tags:
{{meta_tags}}
Heading Tags:
{{heading_tags}}
Top 10 Keywords:
{{top10keywords}}
The following table represents clusters of thematically related words identified using NLP and clustering techniques. Each column represents a different cluster, and the words in each column are thematically related.
{table_string}
Please analyze the provided information and perform the following tasks:
1. Predict what the website is all about (the market sector).
2. Based on the market sector of the company, give a name to each cluster based on the theme it represents. The name needs to be the best summary of all the words in the cluster.
3. Perform a SWOT analysis (Strengths, Weaknesses, Opportunities, and Threats) from an SEO perspective for the company as a whole, taking into account the meta tags, heading tags, top 10 keywords, and the clusters.
Please provide your analysis in a clear and concise manner.
4. Lastly, suggest a list of 5 single words and 5 phrases (no longer than 3 words each) that the company should be using to improve their SEO
""".format(meta_tags=meta_tags, heading_tags=heading_tags, top10keywords=top10keywords, table_string=table_string)

    def analyse_SEO(SEO_prompt):
        # Send the prompt to GPT-3 and return (text, full response).
        # NOTE(review): openai.Completion with text-davinci-003 is the
        # legacy pre-1.0 OpenAI API on a deprecated model — requires
        # openai<1.0 to run; confirm the pinned dependency.
        response = openai.Completion.create(
            model="text-davinci-003",
            prompt = SEO_prompt,
            temperature=0.7,
            max_tokens=1000,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        gpt3_response = response.get('choices')[0].text
        return gpt3_response,response

    seo_analysis = analyse_SEO(SEO_prompt)

    return topmetatags, topheadingtags, top10keywords, cluster_table.to_html(), cluster_plot, keyword_plot, seo_analysis[0]
370
+
371
+
372
+
373
+
374
def analyze_website(competitor_url: str, full_site_scrape: bool = False):
    """Entry point for the Gradio app: run the SEO analysis for a URL.

    When *full_site_scrape* is False, delegates to analyze_single_page();
    otherwise crawls every internal link, merges all pages into one soup,
    and runs the same pipeline on the combined content.

    Returns a 7-tuple matching the Gradio outputs.
    """
    if not full_site_scrape:
        topmetatags, topheadingtags, top10keywords, cluster_table, cluster_plot, keyword_plot, seo_analysis = analyze_single_page(competitor_url)
        return topmetatags, topheadingtags, top10keywords, cluster_table, cluster_plot, keyword_plot, seo_analysis

    # Crawl: merge the head/body of every internal page into one document.
    sanitized_url = sanitize_url(competitor_url)
    internal_links = get_internal_links(sanitized_url)
    soup_collection = BeautifulSoup("<html><head></head><body></body></html>", "html.parser")

    for link in internal_links:
        try:
            soup = get_page_content(link)
            append_unique_elements(soup.head, soup_collection.head)
            append_unique_elements(soup.body, soup_collection.body)
        except Exception as e:
            # Best-effort crawl: a failing page is logged and skipped.
            print(f"Failed to analyze link: {link}. Error: {e}")

    print('got all the links')

    # Scrape and analyze meta tags (only description-like tags are reported).
    meta_tags = get_meta_tags(soup_collection)
    topmetatags = ""
    for name, content in meta_tags.items():
        if "description" in name.lower():
            topmetatags += (f"{name}: {content}\n")

    print('fetched metatags')

    # Scrape and analyze heading tags; drop trivial headings (<= 2 chars).
    # NOTE(review): topheadingtags is built here but never returned (the
    # return below uses heading_tags_clean instead) — looks vestigial.
    heading_tags = get_heading_tags(soup_collection)
    topheadingtags = ""
    for tag, headings in heading_tags.items():
        filtered_headings = [heading for heading in headings if len(heading) > 2]
        if filtered_headings:
            topheadingtags += (f"{tag}: {', '.join(filtered_headings)}\n")

    print("fetched heading tags")

    # Scrape, analyze, and visualize keywords from page content
    page_text = soup_collection.get_text()
    page_text_cleaned = clean_text(page_text)
    preprocessed_text = preprocess_text(page_text_cleaned)

    keywords_counter = Counter(preprocessed_text)
    top10keywords = ""

    for keyword, count in analyze_keywords(keywords_counter, top_n=10):
        top10keywords += (f"{keyword}: {count}\n")

    print("fetched keywords")

    # Semantic clustering: throwaway Word2Vec over 10-token chunks, then
    # cluster/plot the top-50 keywords.
    sentences = [preprocessed_text[i:i+10] for i in range(0, len(preprocessed_text), 10)]
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

    words = [word for word, _ in analyze_keywords(keywords_counter, top_n=50)]
    # NOTE(review): dead code — `clusters` is overwritten by
    # visualize_clusters_plot() below; doesnt_match() returns a word, not a
    # cluster id.
    clusters = [model.wv.doesnt_match(words)] * len(words)

    print("calculated clusters")

    cluster_plot,clusters = visualize_clusters_plot(words, model)
    cluster_table = create_cluster_table(words, model, clusters)
    keyword_plot = visualize_keywords(keywords_counter, top_n=10)

    print("plotted figures")

    table_string = cluster_table.to_string(index=False)

    print("created table string")

    # Per level, keep at most the 10 most frequent non-blank headings — this
    # compressed form goes into the GPT prompt.
    heading_tags_compressed = {}

    for key, values in heading_tags.items():
        count = Counter(values)
        sorted_values = sorted(count.keys(), key=lambda x: count[x], reverse=True)
        filtered_values = [value for value in sorted_values if value.strip() != ""]
        heading_tags_compressed[key] = filtered_values[:10]

    # NOTE(review): bug — heading_tags_clean is initialised as a dict but
    # rebound to a plain list on every iteration, so after the loop it holds
    # only the LAST heading level's values; that list is what gets returned
    # as the "Heading Tags" output below.
    heading_tags_clean = {}

    for key, values in heading_tags.items():
        count = Counter(values)
        sorted_values_clean = sorted(count.keys(), key=lambda x: count[x], reverse=True)
        heading_tags_clean = [value for value in sorted_values_clean if value.strip() != ""]

    print("cleaned up heading tags")

    # NOTE(review): f-string whose result is then .format()-ed — the {{...}}
    # placeholders survive as {...} and are filled by .format();
    # table_string is already interpolated by the f-string.
    SEO_prompt = f"""The following information is given about a company's website:
Meta Tags:
{{meta_tags}}
Heading Tags:
{{heading_tags_compressed}}
Top 10 Keywords:
{{top10keywords}}
The following table represents clusters of thematically related words identified using NLP and clustering techniques. Each column represents a different cluster, and the words in each column are thematically related.
{table_string}
Please analyze the provided information and perform the following tasks:
1. Predict what the website is all about (the market sector).
2. Based on the market sector of the company, give a name to each cluster based on the theme it represents. The name needs to be the best summary of all the words in the cluster.
3. Perform a SWOT analysis (Strengths, Weaknesses, Opportunities, and Threats) from an SEO perspective for the company as a whole, taking into account the meta tags, heading tags, top 10 keywords, and the clusters.
Please provide your analysis in a clear and concise manner.
4. Lastly, suggest a list of 10 words and 10 phrases that the company should be using to improve their SEO
""".format(meta_tags=meta_tags, heading_tags_compressed=heading_tags_compressed, top10keywords=top10keywords, table_string=table_string)

    print("defined SEO prompt")

    def analyse_SEO(SEO_prompt):
        # Send the prompt to GPT-3 and return (text, full response).
        # NOTE(review): legacy pre-1.0 OpenAI API on the deprecated
        # text-davinci-003 model — requires openai<1.0; confirm pinning.
        response = openai.Completion.create(
            model="text-davinci-003",
            prompt = SEO_prompt,
            temperature=0.7,
            max_tokens=1000,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        gpt3_response = response.get('choices')[0].text
        return gpt3_response,response

    seo_analysis = analyse_SEO(SEO_prompt)

    print("ran seo analysis")

    print(topmetatags, heading_tags_clean,top10keywords,cluster_table.to_html(), cluster_plot, keyword_plot,seo_analysis[0])

    return topmetatags, heading_tags_clean, top10keywords, cluster_table.to_html(), cluster_plot, keyword_plot, seo_analysis[0]
506
+
507
+
508
+
509
# Build and launch the Gradio app: one URL textbox plus a full-site-scrape
# checkbox in; seven outputs in the exact order of analyze_website's return
# tuple. debug=True surfaces tracebacks in the UI/console.
gr.Interface(
    fn=analyze_website,
    inputs=[competitor_url_input, full_site_scrape_checkbox],
    outputs=[
        meta_tags_output,
        heading_tags_output,
        top10keywords_output,
        cluster_table_output,
        cluster_plot_output,
        keyword_plot_output,
        seo_analysis_output,
    ],
    title="SEO Analysis Tool",
    description="Enter a competitor URL to perform a SEO analysis (some javascript pages will deny full scrape).",
).launch(debug=True)
seo-analysis-tool/requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ beautifulsoup4
2
+ gensim
3
+ gradio
4
+ matplotlib
5
+ nltk
6
+ numpy
7
+ openai
8
+ pandas
9
+ requests
10
+ scipy
11
+ seaborn
12
+ googlesearch-python
13
+ pytrends
14
+ scikit-learn
15
+ ipython
16
+ celery
17
+ redis