# Source: Hugging Face Space "Website Keyword Gap Finder" (commit 2f7b0d4)
import gradio as gr
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
# Download once at import time. NLTK data used below:
#   'punkt'     -> tokenizer models required by word_tokenize()
#   'stopwords' -> English stopword corpus used in extract_keywords()
# nltk.download() is effectively a no-op when the data is already cached locally.
nltk.download('punkt')
nltk.download('stopwords')
def fetch_text(url):
    """Download *url* and return its paragraph text.

    Returns a ``(text, error)`` pair where exactly one element is
    ``None``: on success ``text`` holds the concatenated <p> content,
    on failure ``error`` holds a human-readable message.
    """
    try:
        resp = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=5,
        )
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        # Restrict extraction to <p> tags for speed & relevance.
        body = ' '.join(p.get_text() for p in soup.find_all('p'))
        if body.strip():
            return body, None
        return None, f"No readable <p> content found at {url}"
    except Exception as e:
        # Best-effort: any failure (network, HTTP status, parse) is
        # reported back to the UI rather than raised.
        return None, f"Error fetching {url}: {str(e)}"
def extract_keywords(text):
    """Return the set of candidate keywords found in *text*.

    A keyword is a lowercased, purely alphanumeric token longer than
    two characters that is not an English stopword.
    """
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    # Single-pass set comprehension replaces the original
    # list-comprehension-then-set([...]) double pass; result is identical.
    return {
        w for w in tokens
        if w.isalnum() and len(w) > 2 and w not in stop_words
    }
def compare_keywords(url_a, url_b):
    """Report keywords present on site B but missing from site A.

    Fetches both URLs, extracts keyword sets, and returns a formatted
    text report (errors, the gap list, or a "no gap" message).
    """
    text_a, error_a = fetch_text(url_a)
    text_b, error_b = fetch_text(url_b)
    # Surface fetch failures for either URL instead of comparing.
    if error_a or error_b:
        return f"❌ Errors:\n\n{error_a or ''}\n{error_b or ''}"
    keywords_a = extract_keywords(text_a)
    keywords_b = extract_keywords(text_b)
    # Set difference: what the competitor (B) has that we (A) lack.
    missing = sorted(keywords_b - keywords_a)
    if not missing:
        # NOTE(review): this literal was split across two lines with a
        # garbled emoji in the scraped original — repaired to one line.
        return "✅ No unique keywords found in B that are missing in A."
    return "🔍 Keywords in B but not A:\n\n" + "\n".join(missing)
# Gradio interface: two URL textboxes in, one text report out.
# compare_keywords() is invoked on submit with both field values.
demo = gr.Interface(
    fn=compare_keywords,
    inputs=[
        gr.Textbox(label="Your Website (A)"),
        gr.Textbox(label="Competitor Website (B)"),
    ],
    outputs="text",
    # Mojibake repair: title emoji was garbled to "π" in the scrape.
    title="🔍 Website Keyword Gap Finder",
    description="Enter two URLs. See what keywords your competitor uses that you don't.",
)

# Top-level launch is the Hugging Face Spaces convention for app entry.
demo.launch()