Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import word_tokenize | |
| from nltk.util import ngrams | |
| from nltk.stem import PorterStemmer | |
| import pandas as pd | |
| import nltk | |
| import string | |
| import io | |
| import os | |
# Keyword Tool API key, read from the environment once at import time.
# None when API_KEY is unset — the payload in get_keyword_metrics sends it as-is.
api_key = os.getenv("API_KEY")
# ==========1- NLTK DOWNLOADS=========
def ensure_nltk_data():
    """Download any NLTK resources required below that are not already installed.

    Each entry maps the local lookup path (checked with nltk.data.find) to the
    package name passed to nltk.download when the lookup fails.
    """
    required = {
        "tokenizers/punkt": "punkt",
        "corpora/stopwords": "stopwords",
        "tokenizers/punkt_tab": "punkt_tab",
    }
    for lookup_path, package in required.items():
        try:
            nltk.data.find(lookup_path)
        except LookupError:
            nltk.download(package)


ensure_nltk_data()
# =======2-EXTRACT FUNCTION WITH USER AGENT==========
def extract_blog_content(url):
    """Fetch *url* and extract SEO-relevant text pieces.

    Args:
        url: page URL to download.

    Returns:
        Tuple (meta_title, meta_description, article_title, blog_text);
        each element is '' when the page does not provide it. blog_text is
        the space-joined text of every <p> element.

    Raises:
        requests.RequestException: on network failure, timeout, or HTTP
        error status.
    """
    headers = {
        # Some sites block the default python-requests UA; mimic a browser.
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; rv:105.0) "
            "Gecko/20100101 Firefox/105.0"
        )
    }
    # BUG FIX: the original request had no timeout and could hang the app
    # forever on an unresponsive host.
    response = requests.get(url, headers=headers, timeout=15)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Prefer <meta name="title">; otherwise fall back to the <title> tag.
    # BUG FIX: the original fallback kept the <title> *tag* and then looked
    # for a 'content' attribute it never has, so the fallback always
    # produced '' — use the tag's text instead.
    meta_title_tag = soup.find('meta', attrs={'name': 'title'})
    if meta_title_tag and meta_title_tag.has_attr('content'):
        meta_title = meta_title_tag['content']
    else:
        title_tag = soup.find('title')
        meta_title = title_tag.get_text(strip=True) if title_tag else ''

    meta_description_tag = soup.find('meta', attrs={'name': 'description'})
    meta_description = (
        meta_description_tag['content']
        if meta_description_tag and meta_description_tag.has_attr('content')
        else ''
    )

    article_title_element = soup.find('h1')
    article_title = article_title_element.get_text(strip=True) if article_title_element else ''

    blog_text = " ".join(p.get_text() for p in soup.find_all('p'))
    return meta_title, meta_description, article_title, blog_text
#========3- PREPROCESSING + TF-IDF LOGIC=======
def preprocess_text(text):
    """Lower-case and tokenize *text*, dropping English stopwords and punctuation.

    Args:
        text: raw text to clean.

    Returns:
        List of cleaned word tokens (may be empty).
    """
    stop_words = set(stopwords.words('english'))
    # BUG FIX: the original constructed a PorterStemmer that was never used —
    # dead local removed (no stemming is applied anywhere in this pipeline).
    tokens = word_tokenize(text.lower())
    return [
        word for word in tokens
        if word not in stop_words and word not in string.punctuation
    ]
def generate_ngrams(tokens, max_n=3):
    """Return all word n-grams of *tokens* for n = 1..max_n.

    Each n-gram is n consecutive tokens joined by single spaces, listed in
    order of increasing n and, within each n, in token order.

    Args:
        tokens: list of word strings.
        max_n: largest n-gram length to generate (default 3).

    Returns:
        Flat list of n-gram strings; [] when tokens is empty.
    """
    all_ngrams = []
    for n in range(1, max_n + 1):
        # zip over n staggered slices yields every window of n consecutive
        # tokens — identical output to nltk.util.ngrams, but pure stdlib,
        # removing the nltk dependency from this trivial operation.
        windows = zip(*(tokens[i:] for i in range(n)))
        all_ngrams.extend(" ".join(window) for window in windows)
    return all_ngrams
# ======= 4-KEYWORD TOOL API + SELECTION LOGIC ==========
def get_keyword_metrics(keywords):
    """Query the Keyword Tool API for Google search-volume metrics.

    Args:
        keywords: list of keyword strings to look up.

    Returns:
        Parsed JSON response dict on success; {} on any failure (an
        st.error message is shown in the UI instead of raising).
    """
    if not keywords:
        st.error("No keywords to process.")
        return {}
    if not api_key:
        # Fail fast with a clear message rather than sending a request that
        # the API will reject with a cryptic authentication error.
        st.error("API key is not configured. Set the API_KEY environment variable.")
        return {}
    url = "https://api.keywordtool.io/v2/search/volume/google"
    payload = {
        "metrics_network": "googlesearchnetwork",
        "metrics_currency": "USD",
        "complete": False,
        "output": "json",
        "apikey": api_key,
        "keyword": keywords
    }
    headers = {"content-type": "application/json"}
    # BUG FIX: the original POST had no timeout — a stalled API call froze
    # the Streamlit app indefinitely.
    response = requests.post(url, json=payload, headers=headers, timeout=30)
    if response.status_code == 200:
        return response.json()
    else:
        st.error("API Error: " + response.text)
        return {}
def select_top_keywords(metrics_response, percentage, scored_keywords):
    """Rank API-returned keywords by their TF-IDF score and keep the top slice.

    Args:
        metrics_response: Keyword Tool API response; its 'results' dict maps
            keyword -> metrics ('volume', 'trend', 'cpc', 'cmp').
        percentage: percentage (1-100) of ranked keywords to return.
        scored_keywords: iterable of (keyword, tfidf_score) pairs.

    Returns:
        List of (keyword, score_pct, volume, trend, cpc, competition) tuples,
        sorted by score descending, truncated to the requested percentage
        (always at least one entry when any keyword is present).
    """
    keywords_data = metrics_response.get('results', {})
    # BUG FIX (perf): the original re-scanned scored_keywords with next(...)
    # for every keyword — accidental O(n^2). A dict gives O(1) lookups.
    tfidf_by_keyword = dict(scored_keywords)
    keyword_scores = []
    for keyword, data in keywords_data.items():
        # `or 0` guards against explicit None values in the API response.
        search_volume = data.get('volume', 0) or 0
        trend = data.get('trend', 0) or 0
        cpc = data.get('cpc', 0) or 0
        competition = data.get('cmp', 0) or 0
        percentage_score = tfidf_by_keyword.get(keyword, 0) * 100  # Convert to percentage
        keyword_scores.append((keyword, percentage_score, search_volume, trend, cpc, competition))
    keyword_scores.sort(key=lambda row: row[1], reverse=True)
    top_count = max(1, int(len(keyword_scores) * (percentage / 100)))
    return keyword_scores[:top_count]
# =======5-UI & LOGIC FLOW==========
st.title("Keyword Analysis Tool")

# A. URL input
url = st.text_input("Enter the URL:", key="url_input")

# Seed the session-state slots for fetched content so the editable widgets
# below always have a value to display across reruns.
for _field in ("meta_title", "meta_description", "article_title", "article_text"):
    if _field not in st.session_state:
        st.session_state[_field] = ""

# B- Step 1: fetch the page and stash its pieces in session state.
if st.button("Fetch Data"):
    if not url.strip():
        st.error("Please enter a valid URL.")
    else:
        (st.session_state.meta_title,
         st.session_state.meta_description,
         st.session_state.article_title,
         st.session_state.article_text) = extract_blog_content(url)

# C- Show the fetched data so the user can modify it before analysis.
st.subheader("Modify Fetched Content")
st.session_state.meta_title = st.text_input("Meta Title", st.session_state.meta_title)
st.session_state.meta_description = st.text_area("Meta Description", st.session_state.meta_description)
st.session_state.article_title = st.text_input("Article Title", st.session_state.article_title)
st.session_state.article_text = st.text_area("Article Text", st.session_state.article_text)

# D- Checkboxes selecting which parts feed the analysis.
include_meta_title = st.checkbox("Include Meta Title")
include_meta_description = st.checkbox("Include Meta Description")
include_article_title = st.checkbox("Include Article Title")
include_article_text = st.checkbox("Include Article Text")

# E- Share of the ranked keywords to keep in the final table.
top_percentage = st.number_input("Top % of Keywords to Display", min_value=1, max_value=100, value=100, step=1)
# F- Analyze Button -> runs the original logic
if st.button("Analyze"):
    if not url.strip():
        st.error("Please enter a valid URL.")
    else:
        # Build the analysis corpus from whichever sections were ticked above.
        selected_text = ""
        if include_meta_title:
            selected_text += st.session_state.meta_title + " "
        if include_meta_description:
            selected_text += st.session_state.meta_description + " "
        if include_article_title:
            selected_text += st.session_state.article_title + " "
        if include_article_text:
            selected_text += st.session_state.article_text
        if not selected_text.strip():
            st.error("No text selected for analysis. Please check at least one option.")
        else:
            # ========== ORIGINAL ANALYSIS LOGIC (unchanged) ==========
            # Tokenize/clean, then expand into 1-3 word candidate phrases.
            tokens = preprocess_text(selected_text)
            ngrams_list = generate_ngrams(tokens, max_n=3)
            # NOTE(review): set() makes this order non-deterministic between
            # runs, so equal-score ties in the ranking below may shuffle.
            unique_ngrams = list(set(ngrams_list))
            if not unique_ngrams:
                st.error("Vocabulary is empty. Please ensure valid input data.")
            else:
                # TF-IDF fitted on a single document: idf is uniform, so the
                # scores effectively rank phrases by normalized frequency.
                tfidf_vectorizer = TfidfVectorizer(vocabulary=unique_ngrams)
                tfidf_vectorizer.fit([" ".join(tokens)])
                tfidf_scores = tfidf_vectorizer.transform([" ".join(tokens)]).toarray()[0]
                # Keep only the 100 best-scoring phrases for the API call.
                scored_keywords = sorted(
                    zip(unique_ngrams, tfidf_scores),
                    key=lambda x: x[1],
                    reverse=True
                )[:100]
                keywords = [kw for kw, _ in scored_keywords]
                metrics_response = get_keyword_metrics(keywords)
                if metrics_response:
                    # Select top keywords based on user percentage
                    top_keywords_data = select_top_keywords(metrics_response, top_percentage, scored_keywords)
                    data = {
                        "Keyword": [k[0] for k in top_keywords_data],
                        "Score (%)": [f"{k[1]:.2f}" for k in top_keywords_data],
                        "Search Volume": [k[2] for k in top_keywords_data],
                        "Trend": [k[3] for k in top_keywords_data],
                        "CPC": [k[4] for k in top_keywords_data],
                        "Competition": [k[5] for k in top_keywords_data],
                    }
                    df = pd.DataFrame(data)
                    st.dataframe(df)
                    # NOTE(review): these widgets live inside the Analyze
                    # button's branch. Interacting with the selectbox (or a
                    # download button) triggers a Streamlit rerun in which
                    # st.button("Analyze") is False again, so the results and
                    # download UI appear to vanish. Persisting `df` in
                    # st.session_state and rendering the download widgets
                    # outside this branch would fix it — confirm desired UX.
                    output_format = st.selectbox("Download format", ["CSV", "Excel"])
                    if output_format == "CSV":
                        csv_data = df.to_csv(index=False).encode('utf-8')
                        st.download_button(
                            label="Download CSV",
                            data=csv_data,
                            file_name="keywords.csv",
                            mime="text/csv",
                            key="download-csv",
                        )
                    else:  # Excel
                        # Write the Excel file into an in-memory buffer so it
                        # can be handed to st.download_button as bytes.
                        excel_buffer = io.BytesIO()
                        with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
                            df.to_excel(writer, index=False, sheet_name="Sheet1")
                        excel_data = excel_buffer.getvalue()
                        st.download_button(
                            label="Download Excel",
                            data=excel_data,
                            file_name="keywords.xlsx",
                            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                            key="download-excel",
                        )