Spaces: Runtime error
Update app.py
Browse files
app.py CHANGED
@@ -9,579 +9,488 @@ import time
  import logging
  import re
  from retrying import retry
- import gc
 
  # --- Configuration ---
- 
- # Model Options - Added more models with notes on resource needs
  MODEL_OPTIONS = {
-     # …
-     "Mistral-7B-Instruct (Fast, …
-     "Gemma-7B (Google, …
-     "…
-     … (old lines 22-24 truncated in this view)
-     "…
-     "Qwen1.5-14B-Chat (Alibaba, Strong …
- 
-     # …
  }
- # …
- DEFAULT_MODEL_KEY = "Mistral-7B-Instruct (Fast, Good Quality, Med RAM)"
 
  # Scraping & Generation Defaults
- DEFAULT_NUM_RESULTS = …
  REQUEST_TIMEOUT = 15
- MAX_COMPETITOR_TEXT_LENGTH = …
- DEFAULT_MAX_GENERATION_TOKENS = …
 
- # Retry settings
  RETRY_WAIT_FIXED = 2000
  RETRY_STOP_MAX_ATTEMPT = 3
 
  # Tone & Audience Options
- TONE_OPTIONS = ["Conversational", "Professional", "Authoritative", "Technical", "Friendly", "Engaging", "Educational"]
- AUDIENCE_OPTIONS = ["Beginners", "General Audience", "Experts", "Professionals (Specific Field)", "Customers", "Students"]
 
  # --- Logging Setup ---
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s')
  logger = logging.getLogger(__name__)
 
- # --- …
- # Initialize session state
  if 'scraped_urls' not in st.session_state: st.session_state.scraped_urls = []
  if 'competitor_analysis_text' not in st.session_state: st.session_state.competitor_analysis_text = ""
  if 'generated_content' not in st.session_state: st.session_state.generated_content = ""
  if 'internal_link_suggestions' not in st.session_state: st.session_state.internal_link_suggestions = ""
  if 'last_keyword' not in st.session_state: st.session_state.last_keyword = ""
- if 'last_model_id' not in st.session_state: st.session_state.last_model_id = ""
  if 'last_website_url' not in st.session_state: st.session_state.last_website_url = ""
- if '…
- if 'current_model_id' not in st.session_state: st.session_state.current_model_id = ""
 
 
- # …
  def clear_gpu_memory():
      logger.info("Attempting to clear GPU memory...")
      if torch.cuda.is_available():
-         … (old lines 68-70 truncated in this view)
      else:
          logger.info("No GPU available, skipping memory clearing.")
 
- … (old lines 74-84 truncated in this view)
          logger.info(f"Unloading previous model: {st.session_state.current_model_id}")
-         st.session_state.…
-         clear_gpu_memory() # …
-         st.…
 
-     … (old lines 90-91 truncated in this view)
      pipeline_instance = None
      try:
-         # Determine torch_dtype based on availability and model needs
-         # Use bfloat16 if available for better performance on compatible GPUs
          dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16 if torch.cuda.is_available() else torch.float32
          logger.info(f"Using dtype: {dtype}")
 
-         … (old lines 99-100 truncated in this view)
              "microsoft/Phi-3-medium-4k-instruct",
              "deepseek-ai/DeepSeek-Coder-V2-Instruct",
-             # Add …
          ]
-         … (old lines 105-118 truncated in this view)
-         pipeline_instance.tokenizer.pad_token_id …
-         … (old lines 120-123 truncated in this view)
          st.session_state.current_model_pipeline = pipeline_instance
-         st.session_state.current_model_id = …
-         st.toast(f"Model {…
- 
      except ImportError as e:
-         logger.error(f"ImportError loading …
-         st.error(f"Error …
-         return None
      except Exception as e:
-         logger.error(f"…
-         st.error(f"Failed to load {…
-         # …
-         st.session_state.…
-         … (old lines 138-141 truncated in this view)
 
- # User Agent Caching (can remain cache_resource)
  @st.cache_resource
  def get_user_agent():
      logger.info("Initializing FakeUserAgent.")
      try:
-         # Handle potential issues with finding data files for fake_useragent
          return UserAgent(fallback='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
      except Exception as e:
          logger.error(f"Failed to initialize FakeUserAgent: {e}", exc_info=True)
-         st.error(f"Could not initialize User Agent generator. …
          return None
 
- # --- Core Functions (Scraping, …
- … (old lines 155-158 truncated in this view)
-     st.session_state.competitor_analysis_text = ""
-     st.session_state.generated_content = ""
-     st.session_state.internal_link_suggestions = ""
-     st.session_state.last_keyword = ""
-     # Don't reset model pipeline here, only data
-     logger.info("Session state data reset.")
 
  @retry(wait_fixed=RETRY_WAIT_FIXED, stop_max_attempt_number=RETRY_STOP_MAX_ATTEMPT,
         retry_on_exception=lambda e: isinstance(e, (requests.exceptions.Timeout, requests.exceptions.ConnectionError, requests.exceptions.HTTPError)))
  def fetch_url_content(url, headers):
-     """Fetches content for a single URL with retries for specific errors."""
      logger.info(f"Fetching {url} (Attempt {fetch_url_content.retry.attempt_number+1}/{RETRY_STOP_MAX_ATTEMPT})")
      response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
      response.raise_for_status()
      if 'text/html' not in response.headers.get('Content-Type', ''):
-         logger.warning(f"Skipping URL {url} - …
          return None
-     # Check for excessively large pages (potential trap or non-article content)
      if len(response.content) > 10 * 1024 * 1024:  # 10 MB limit
-         logger.warning(f"Skipping URL {url} - Content too large …
          return None
      return response
 
  def clean_text(text):
-     """Enhanced text cleaning."""
-     # Remove multiple spaces and newlines
      text = re.sub(r'\s{2,}', ' ', text)
      text = re.sub(r'\n+', '\n', text)
-     # Remove lines that are likely boilerplate/navigation/ads more aggressively
      lines = text.split('\n')
      cleaned_lines = []
-     min_line_length = 20
-     min_words_per_line = 3
      skip_phrases = [
          'copyright ©', 'all rights reserved', 'privacy policy', 'terms of use', 'terms and conditions',
          'cookie policy', 'subscribe', 'sign up', 'log in', 'advertisement', 'share this', 'related posts',
          'leave a reply', 'comment', 'posted on', 'by author', 'tags:', 'categories:', 'follow us', 'read more',
-         'click here', 'learn more', 'next article', 'previous article'
      ]
      for line in lines:
          stripped_line = line.strip()
          lower_line = stripped_line.lower()
-         # Check length, word count, and if it contains skip phrases
          if len(stripped_line) >= min_line_length and \
             len(stripped_line.split()) >= min_words_per_line and \
             not any(phrase in lower_line for phrase in skip_phrases):
              cleaned_lines.append(stripped_line)
- 
      text = '\n'.join(cleaned_lines)
      return text.strip()
 
  def scrape_page_content(url, user_agent, scrape_status_ui):
-     … (old lines 211-214 truncated in this view)
      try:
          response = fetch_url_content(url, headers)
-         if response is None:
-             … (old line 218 truncated in this view)
-             return ""
- 
-         soup = BeautifulSoup(response.content, 'lxml')  # Use lxml for potentially faster parsing
- 
-         # --- Enhanced Extraction & Cleaning ---
          tags_to_remove = ["script", "style", "nav", "footer", "aside", "form", "header", "noscript", "button", "input", "select", "textarea", "figure", "figcaption", "iframe", "svg", "path", "meta", "link"]
          for element in soup(tags_to_remove): element.decompose()
          for comment in soup.find_all(string=lambda text: isinstance(text, Comment)): comment.extract()
- 
-         main_content = (soup.find('main') or soup.find('article') or
-                         soup.find(role='main') or
                          soup.find('div', class_=re.compile(r'(content|main|body|post|entry|article)', re.I)) or
                          soup.find('div', id=re.compile(r'(content|main|body|post|entry|article)', re.I)))
          target_soup = main_content if main_content else soup.body
-         … (old lines 233-235 truncated in this view)
-         texts = target_soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'td', 'th', 'blockquote', 'span'])  # Added span, sometimes used for content
          content_parts = []
          for elem in texts:
-             # Avoid extracting text from elements likely inside removed sections (double check)
              if elem.find_parent(tags_to_remove): continue
-             # Get text, strip extra whitespace, join parts if nested
              elem_text = elem.get_text(separator=' ', strip=True)
-             # Filter out short/noisy text elements
              if len(elem_text) > 10 and len(elem_text.split()) > 1:
-                 if elem.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'blockquote', 'tr']:
                      content_parts.append(elem_text + "\n")
-                 else:
-                     content_parts.append(elem_text + " ")
- 
          content = "".join(content_parts)
-         cleaned_content = clean_text(content)
-         … (old lines 253-255 truncated in this view)
-             scrape_status_ui.warning(f"⚠️ Low content: {url}", icon="🕸️")
-         else:
-             logger.info(f"Scraped {len(cleaned_content)} chars: {url}")
-             scrape_status_ui.success(f"✅ Scraped: {url} ({len(cleaned_content)} chars)", icon="🕸️")
- 
-         time.sleep(0.6)  # Slightly increased delay
          return cleaned_content
- 
-     except … (old line 264 truncated in this view)
-         logger.warning(f"Final scrape attempt failed: {url}. Error: {e}")
-         scrape_status_ui.error(f"❌ Fail scrape: {url} ({e})", icon="🕸️")
-         return ""
-     except Exception as e:
-         logger.error(f"Unexpected scrape error: {url}: {e}", exc_info=True)
-         scrape_status_ui.error(f"❌ Error scraping: {url} (Check logs)", icon="🕸️")
-         return ""
- 
 
  def get_top_urls(keyword, num_results):
-     # (Function remains the same as previous version - already robust)
      logger.info(f"Fetching top {num_results} URLs for keyword: '{keyword}'")
      try:
-         # Use a longer timeout for the search itself
          urls = list(search(keyword, num_results=num_results, sleep_interval=2.5, lang="en", timeout=15))
          logger.info(f"Found URLs: {urls}")
-         if not urls:
-             st.warning(f"⚠️ No Google search results found for '{keyword}'. Try a different keyword?")
-             return []
          return urls
      except Exception as e:
-         error_message = str(e)
-         … (old lines 287-289 truncated in this view)
-             st.error(f"❌ Google search blocked (Error 429). WAIT before retrying.")
-         elif "timed out" in error_message:
-             st.error(f"❌ Google search request timed out.")
-         else:
-             st.error(f"❌ Failed to fetch Google results. Error: {error_message[:100]}...")  # Truncate long errors
          return []
 
- 
- # --- Prompt Building Functions ---
- 
  def build_content_generation_prompt(keyword, competitor_texts, tone, audience, model_id):
-     "…
-     logger.info(f"Building content gen prompt. Tone: {tone}, Audience: {audience}. Competitor length: {len(competitor_texts)}")
      if len(competitor_texts) > MAX_COMPETITOR_TEXT_LENGTH:
-         competitor_summary = competitor_texts[:MAX_COMPETITOR_TEXT_LENGTH] + "... […
-         logger.warning(f"…
-     else:
-         … (old lines 308-313 truncated in this view)
- **Desired Tone:** {tone}
- 
- **Objective:** Generate an exceptional, SEO-optimized article for "{keyword}" designed to outperform the current top-ranking content by providing substantially more value, unique insights, and a better user experience.
- 
- **Competitor Analysis Context (Analyze this text for topics, depth, strengths, and weaknesses/gaps):**
- --- BEGIN COMPETITOR CONTENT ---
  {competitor_summary}
- --- END COMPETITOR …
- 
- **…
- … (old lines 324-331 truncated in this view)
- * Comprehensive Body: Cover all essential aspects, expanding beyond competitor content.
- * Strong Conclusion: Summarize key takeaways, provide final insight or call-to-action (if appropriate).
- 4. **SEO Integration (Natural):** Seamlessly integrate "{keyword}" and related semantic terms (LSI) into title, headings, intro, body, conclusion. Prioritize topical relevance and natural language over density. Avoid keyword stuffing.
- 5. **Originality & Credibility:** Generate 100% unique content. Use competitor text ONLY for analysis. Do NOT plagiarize. Ensure factual accuracy.
- 6. **Negative Constraints (DO NOT):** Do not rehash competitors; include preambles/sign-offs; use excessive jargon (unless for 'Experts'); write long paragraphs; stuff keywords; invent facts.
- 
- **Output:** Deliver ONLY the generated Markdown article, starting directly with the H2 title.
- """
-     # Use the format expected by the pipeline's chat template (usually system/user roles)
-     # The pipeline should handle model-specific formatting (e.g., [INST], <|im_start|>)
-     messages = [
-         {"role": "system", "content": system_prompt},
-         {"role": "user", "content": user_prompt}
-     ]
-     logger.info(f"Content generation prompt constructed for model {model_id}.")
      return messages
 
- 
- # *** NEW: Internal Linking Prompt ***
  def build_internal_link_prompt(generated_content, keyword, website_url):
-     "…
-     … (old lines 353-354 truncated in this view)
-     system_prompt = "You are an SEO assistant specialized in identifying internal linking opportunities within website content."
- 
      user_prompt = f"""**Website Base URL:** {website_url}
  **Main Topic of Article:** "{keyword}"
- 
- **Task:** Please review the following article content. Identify 3 to 5 specific phrases or sentences within the text that represent good opportunities for internal links to other relevant pages on the website ({website_url}).
- 
  **For each opportunity, provide:**
- 1. …
- 2. …
- 
- **…
- 
- * Choose anchor text that is natural and descriptive.
- * Focus on links that would genuinely add value for a reader seeking more information on that specific point.
- * Format your output as a Markdown numbered list.
- 
- **Article Content to Analyze:**
- --- BEGIN ARTICLE CONTENT ---
  {generated_content[:8000]}
- --- END ARTICLE …
- """
- 
-     messages = [
-         {"role": "system", "content": system_prompt},
-         {"role": "user", "content": user_prompt}
-     ]
      return messages
 
- # --- LLM Generation Functions ---
- 
  def run_llm_generation(pipe, messages, max_tokens):
-     … (old lines 387-390 truncated in this view)
-     model_id = pipe.model.name_or_path  # Get model id from pipeline
- 
-     logger.info(f"Running generation with {model_id}. Max new tokens: {max_tokens}.")
-     generation_start_time = time.time()
- 
      try:
-         … (old lines 397-401 truncated in this view)
-             "do_sample": True,
-             "pad_token_id": pipe.tokenizer.eos_token_id,
-             "eos_token_id": pipe.tokenizer.eos_token_id,
-             # Use pipeline's chat template automatically if available
-         }
-         logger.info(f"Generation arguments: {generation_args}")
- 
-         # --- Execute Pipeline ---
-         results = pipe(messages, **generation_args)
- 
-         # --- Robust Extraction of Assistant's Response ---
-         # (Using the refined extraction logic from previous iteration)
          assistant_response = None
          if results and results[0] and 'generated_text' in results[0]:
              output_data = results[0]['generated_text']
-             if isinstance(output_data, list):
-                 … (old line 418 truncated in this view)
-                 if assistant_message: assistant_response = assistant_message
-             elif isinstance(output_data, str):  # Format: "System...\nUser...\nAssistant..."
-                 # Find the last message in the prompt list to split after it
                  last_prompt_content = messages[-1]['content']
                  last_prompt_index = output_data.rfind(last_prompt_content)
-                 if last_prompt_index != -1:
-                     … (old line 425 truncated in this view)
-                 else:  # Fallback if prompt isn't exactly echoed
-                     potential_response = output_data  # Assume it might just be the response
-                 # Clean potential role markers, </s> tokens etc.
                  assistant_response = re.sub(r"^(assistant|ASSISTANT|</s>|<\|im_end\|>|<\|assistant\|>)\s*[:\n]*", "", potential_response, flags=re.IGNORECASE | re.DOTALL).strip()
-             else: logger.error(f"Unexpected output format …
          else: logger.error(f"Unexpected LLM output structure: {results}")
- 
- 
-         # --- Final Validation and Cleaning ---
          if assistant_response:
-             duration = time.time() - …
-             … (old line 437 truncated in this view)
-             assistant_response …
-             assistant_response = re.sub(r"\n```$", "", assistant_response).strip()
-             # Basic length check
-             if len(assistant_response) < 50:
-                 logger.warning(f"Generated output very short ({len(assistant_response)} chars).")
-                 st.warning("⚠️ Generated output seems very short. Please review.")
              return assistant_response
-         else:
-             … (old lines 446-447 truncated in this view)
-             return None
- 
-     except torch.cuda.OutOfMemoryError:
-         logger.error(f"OOM Error during generation with {model_id}!", exc_info=True)
-         st.error(f"❌ Generation failed: Out of GPU Memory ({model_id}). Try a smaller model, reduce 'Max Generation Tokens', or restart the space.")
-         clear_gpu_memory()  # Attempt to recover
-         return None
-     except Exception as e:
-         logger.error(f"Unhandled error during generation ({model_id}): {e}", exc_info=True)
-         st.error(f"❌ Unexpected error during generation: {e}")
-         return None
- 
 
  # --- Streamlit App UI ---
 
- st.set_page_config(layout="wide", page_title="…
 
- # Sidebar
  with st.sidebar:
      st.header("⚙️ Configuration")
      selected_model_key = st.selectbox(
          "Choose Language Model:",
          options=list(MODEL_OPTIONS.keys()),
          index=list(MODEL_OPTIONS.keys()).index(DEFAULT_MODEL_KEY),
-         … (old line 472 truncated in this view)
      )
      selected_model_id = MODEL_OPTIONS[selected_model_key]
 
-     # …
-     … (old lines 477-479 truncated in this view)
 
-     st.markdown("---")
 
-     … (old lines 483-484 truncated in this view)
      selected_tone = st.selectbox("Content Tone:", options=TONE_OPTIONS, index=TONE_OPTIONS.index("Engaging"))
      selected_audience = st.selectbox("Target Audience:", options=AUDIENCE_OPTIONS, index=AUDIENCE_OPTIONS.index("General Audience"))
-     max_gen_tokens = st.number_input("Max Generation Tokens:", min_value=500, max_value=8192, value=DEFAULT_MAX_GENERATION_TOKENS, step=…
- 
-     with st.expander("Internal Linking (Optional)"):
-         website_url = st.text_input("Your Website URL (for link suggestions):", placeholder="https://www.example.com", value=st.session_state.get("last_website_url", ""))
-         st.session_state.last_website_url = website_url  # Store immediately for reuse
 
-     st.…
-     … (old lines 495-498 truncated in this view)
-         st.warning("⚠️ No model loaded. Choose and click 'Load/Switch'.")
 
      st.info(f"""
  - **Competitors:** Top {num_results}
  - **Max Generation:** ~{max_gen_tokens} tokens
  """)
      st.warning("""
- - **…
- - **…
- - **…
- - **Internal Linking:** Suggestions are AI-based guesses of relevant topics; verify and find the actual URLs yourself.
  """)
-     if st.button("Clear …
-         … (old line 512 truncated in this view)
-         st.toast("Cleared scraped data and generated content.", icon="🗑️")
- 
 
- # Main App Area
- st.title("✨ …
- st.markdown(f"…
 
- # User Input
- … (old lines 521-530 truncated in this view)
 
  st.markdown("---")
 
- # --- Main Workflow ---
- if …
      if not keyword:
          st.warning("⚠️ Please enter a keyword.")
          st.stop()
 
-     st.session_state.last_keyword = keyword  # Store keyword
      ua = get_user_agent()  # Ensure user agent is ready
-     if not ua: st.error("❌ User Agent failed …
 
-     # Reset previous results for this …
      st.session_state.generated_content = ""
      st.session_state.internal_link_suggestions = ""
 
-     # --- Step 1: Scrape Competitors ---
-     # Check if …
      if keyword != st.session_state.get('_internal_last_scrape_keyword', None) or not st.session_state.competitor_analysis_text:
-         logger.info(f"…
-         st.session_state.competitor_analysis_text = ""  # Clear
-         st.session_state.scraped_urls = []
 
          scrape_container = st.container()
          with scrape_container:
-             st.…
-             … (old line 558 truncated in this view)
 
              urls = get_top_urls(keyword, num_results)
              st.session_state.scraped_urls = urls
 
              if urls:
                  all_texts = []
- 
-                 scrape_status_messages = []  # Collect messages
- 
                  for i, url in enumerate(urls):
-                     … (old lines 569-576 truncated in this view)
-                     time.sleep(0.1)  # UI refresh delay
 
                  st.session_state.competitor_analysis_text = "\n\n --- ARTICLE SEPARATOR --- \n\n".join(all_texts)
-                 st.session_state['_internal_last_scrape_keyword'] = keyword  # Mark …
 
                  if st.session_state.competitor_analysis_text:
-                     scrape_container.success(f"✅ Scraped {…
-                     logger.info(f"Scraping complete. Extracted {len(st.session_state.competitor_analysis_text)} chars.")
                  else:
                      scrape_container.error("❌ Failed to scrape sufficient content. Cannot generate article.")
                      st.stop()
@@ -589,22 +498,18 @@ if generate_button:
                  scrape_container.error("❌ Could not retrieve competitor URLs. Cannot proceed.")
                  st.stop()
      else:
-         st.success(f"✔️ Using previously scraped data for '{keyword}'. ({len(st.session_state.competitor_analysis_text)} chars …
-         logger.info(f"Using cached scrape data for keyword '{keyword}'.")
- 
 
      # --- Step 2: Generate Main Content ---
-     st.…
-     generation_status = st.status(…
      with generation_status:
-         st.write(f"**Tone:** {selected_tone}, **Audience:** {selected_audience}")
-         st.write(f"**Max Tokens:** {max_gen_tokens}")
- 
          gen_prompt = build_content_generation_prompt(
              keyword, st.session_state.competitor_analysis_text, selected_tone, selected_audience, st.session_state.current_model_id
          )
          generated_content = run_llm_generation(st.session_state.current_model_pipeline, gen_prompt, max_gen_tokens)
-         st.session_state.generated_content = generated_content
 
      if generated_content:
          generation_status.update(label="✅ Content Generation Complete!", state="complete")

@@ -612,37 +517,31 @@ if generate_button:
          generation_status.update(label="❌ Content Generation Failed.", state="error")
          st.stop()  # Stop if main content fails
 
- # --- Display …
  if st.session_state.generated_content:
      st.markdown("---")
      st.subheader("📝 Generated SEO Content")
      st.markdown(st.session_state.generated_content)
-     st.text_area("Copyable Markdown:", st.session_state.generated_content, height=400, key="…
 
- # --- …
- if …
      st.markdown("---")
      st.subheader("🔗 Internal Linking Suggestions")
-     if st.button("Suggest Internal Links", key="…
-         … (old lines 627-637 truncated in this view)
-         if link_suggestions:
-             link_status.update(label="✅ Link suggestions generated!", state="complete")
-         else:
-             link_status.update(label="❌ Failed to generate link suggestions.", state="error")
- 
-     # Display suggestions if available
      if st.session_state.internal_link_suggestions:
          st.markdown(st.session_state.internal_link_suggestions)
-         st.info("ℹ️ …
      else:
-         … (old line 648 truncated in this view)
  import logging
  import re
  from retrying import retry
+ import gc
 
  # --- Configuration ---
+ # Model Options (Ensure keys clearly indicate resource needs)
  MODEL_OPTIONS = {
+     # Lighter Models (More likely to work on free tiers)
+     "Mistral-7B-Instruct (Fast, Med RAM)": "mistralai/Mistral-7B-Instruct-v0.2",
+     "Gemma-7B-IT (Google, Med RAM)": "google/gemma-7b-it",
+     "Phi-3-Mini-4k-Instruct (Microsoft, Small, Good)": "microsoft/Phi-3-mini-4k-instruct",  # Requires trust_remote_code
+ 
+     # Medium Models (May require upgraded tiers / more RAM/GPU)
+     "Llama-3-8B-Instruct (Meta, High Quality, High RAM/GPU)": "meta-llama/Meta-Llama-3-8B-Instruct",
+     "Phi-3-Medium-4k-Instruct (Microsoft, Strong, High RAM/GPU)": "microsoft/Phi-3-medium-4k-instruct",  # Requires trust_remote_code
+     "Qwen1.5-14B-Chat (Alibaba, Strong, High RAM/GPU)": "Qwen/Qwen1.5-14B-Chat",
+ 
+     # Larger Models (Very likely require significant paid resources)
+     "DeepSeek-Coder-V2-Instruct (DeepSeek, High RAM/GPU)": "deepseek-ai/DeepSeek-Coder-V2-Instruct",  # Requires trust_remote_code
  }
+ DEFAULT_MODEL_KEY = "Mistral-7B-Instruct (Fast, Med RAM)"  # Start with a lighter default selection
 
  # Scraping & Generation Defaults
+ DEFAULT_NUM_RESULTS = 4  # Reduced default slightly
  REQUEST_TIMEOUT = 15
+ MAX_COMPETITOR_TEXT_LENGTH = 5500
+ DEFAULT_MAX_GENERATION_TOKENS = 2800
 
+ # Retry settings
  RETRY_WAIT_FIXED = 2000
  RETRY_STOP_MAX_ATTEMPT = 3
 
  # Tone & Audience Options
+ TONE_OPTIONS = ["Conversational", "Professional", "Authoritative", "Technical", "Friendly", "Engaging", "Educational", "Persuasive"]
+ AUDIENCE_OPTIONS = ["Beginners", "General Audience", "Experts", "Professionals (Specific Field)", "Customers", "Students", "Decision Makers"]
 
  # --- Logging Setup ---
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s')
  logger = logging.getLogger(__name__)
 
+ # --- State Management ---
+ # Initialize session state keys carefully
+ if 'current_model_pipeline' not in st.session_state: st.session_state.current_model_pipeline = None
+ if 'current_model_id' not in st.session_state: st.session_state.current_model_id = ""
+ # Data related state
  if 'scraped_urls' not in st.session_state: st.session_state.scraped_urls = []
  if 'competitor_analysis_text' not in st.session_state: st.session_state.competitor_analysis_text = ""
  if 'generated_content' not in st.session_state: st.session_state.generated_content = ""
  if 'internal_link_suggestions' not in st.session_state: st.session_state.internal_link_suggestions = ""
  if 'last_keyword' not in st.session_state: st.session_state.last_keyword = ""
  if 'last_website_url' not in st.session_state: st.session_state.last_website_url = ""
+ if '_internal_last_scrape_keyword' not in st.session_state: st.session_state._internal_last_scrape_keyword = ""
 
 
+ # --- Helper Functions ---
  def clear_gpu_memory():
+     """Attempts to clear GPU memory cache and run garbage collection."""
      logger.info("Attempting to clear GPU memory...")
      if torch.cuda.is_available():
+         try:
+             st.session_state.current_model_pipeline = None  # Ensure reference is removed FIRST
+             gc.collect()  # Run Python garbage collection
+             torch.cuda.empty_cache()  # Tell PyTorch to release cached memory
+             gc.collect()  # Run GC again
+             logger.info("GPU memory cache cleared and garbage collected.")
+             st.toast("Cleared GPU memory.", icon="🧹")
+         except Exception as e:
+             logger.error(f"Error clearing GPU memory: {e}", exc_info=True)
+             st.toast(f"Error clearing GPU memory: {e}", icon="❌")
      else:
          logger.info("No GPU available, skipping memory clearing.")
+         st.session_state.current_model_pipeline = None  # Still clear the reference
+         gc.collect()
 
+ def reset_app_data():
+     """Clears stored scraping and generation results, keeps model loaded."""
+     st.session_state.scraped_urls = []
+     st.session_state.competitor_analysis_text = ""
+     st.session_state.generated_content = ""
+     st.session_state.internal_link_suggestions = ""
+     st.session_state.last_keyword = ""
+     st.session_state._internal_last_scrape_keyword = ""
+     logger.info("App data state reset (scraped/generated content).")
+     st.toast("Cleared scraped data and generated content.", icon="🗑️")
+ 
+ # --- Model Loading (On Demand) ---
+ def load_model(model_id_to_load):
+     """Loads the selected model, unloading any previous one."""
+     # If the requested model is already loaded, do nothing
+     if st.session_state.get('current_model_id') == model_id_to_load and st.session_state.get('current_model_pipeline') is not None:
+         logger.info(f"Model {model_id_to_load} is already loaded.")
+         st.toast(f"{model_id_to_load} is already loaded.", icon="✅")
+         return True
+ 
+     # Unload previous model if one exists and is different
+     if st.session_state.get('current_model_pipeline') is not None:
          logger.info(f"Unloading previous model: {st.session_state.current_model_id}")
+         st.toast(f"Unloading {st.session_state.current_model_id}...", icon="🧹")
+         clear_gpu_memory()  # This sets pipeline to None and clears cache
+         st.session_state.current_model_id = ""  # Clear model ID state
 
+     # Load the new model
+     st.toast(f"Loading {model_id_to_load}... This may take time & RAM/GPU.", icon="⏳")
+     logger.info(f"Attempting to load LLM pipeline for model: {model_id_to_load}")
      pipeline_instance = None
+     success = False
      try:
          dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16 if torch.cuda.is_available() else torch.float32
          logger.info(f"Using dtype: {dtype}")
 
+         trust_code_models = [
+             "microsoft/Phi-3-mini-4k-instruct",
              "microsoft/Phi-3-medium-4k-instruct",
              "deepseek-ai/DeepSeek-Coder-V2-Instruct",
+             # Add others if needed
          ]
+         trust_code = model_id_to_load in trust_code_models
+         logger.info(f"Trust remote code for {model_id_to_load}: {trust_code}")
+ 
+         # Display spinner during the actual loading
+         with st.spinner(f"Loading {model_id_to_load} into memory..."):
+             pipeline_instance = pipeline(
+                 "text-generation",
+                 model=model_id_to_load,
+                 trust_remote_code=trust_code,
+                 device_map="auto",
+                 torch_dtype=dtype,
+             )
+ 
+         # Handle pad_token
+         if pipeline_instance.tokenizer.pad_token_id is None:
+             pipeline_instance.tokenizer.pad_token_id = pipeline_instance.tokenizer.eos_token_id
+             if hasattr(pipeline_instance.model, 'config'):
+                 pipeline_instance.model.config.pad_token_id = pipeline_instance.tokenizer.eos_token_id
+             logger.warning(f"Set pad_token_id to eos_token_id for {model_id_to_load}")
+ 
+         logger.info(f"LLM pipeline loaded successfully for {model_id_to_load}.")
          st.session_state.current_model_pipeline = pipeline_instance
+         st.session_state.current_model_id = model_id_to_load
+         st.toast(f"Model {model_id_to_load} loaded!", icon="✅")
+         success = True
 
      except ImportError as e:
+         logger.error(f"ImportError loading {model_id_to_load}: {e}. Missing dependency?", exc_info=True)
+         st.error(f"Load Error: Missing library for {model_id_to_load}? Check logs. Details: {e}")
      except Exception as e:
+         logger.error(f"Failed to load {model_id_to_load}: {e}", exc_info=True)
+         st.error(f"Failed to load {model_id_to_load}. Error: {e}. Check resource limits (RAM/GPU) & logs.")
+         clear_gpu_memory()  # Attempt to clean up if loading failed
+         st.session_state.current_model_id = ""  # Ensure state reflects failure
+     finally:
+         return success  # Return status
+ 
+ # --- User Agent Caching ---
  @st.cache_resource
  def get_user_agent():
+     # (Same as previous version)
      logger.info("Initializing FakeUserAgent.")
      try:
          return UserAgent(fallback='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
      except Exception as e:
          logger.error(f"Failed to initialize FakeUserAgent: {e}", exc_info=True)
+         st.error(f"Could not initialize User Agent generator. Error: {e}")
          return None
 
+ # --- Core Functions (Scraping, Prompt Building, Generation Logic) ---
+ # These functions (get_top_urls, scrape_page_content, clean_text, fetch_url_content,
+ # build_content_generation_prompt, build_internal_link_prompt, run_llm_generation)
+ # remain largely the same as the previous version, as they were already quite robust.
+ # Ensure `run_llm_generation` correctly uses the pipeline passed to it (which it did).
 
+ # --- (Include the definitions for the core functions here - unchanged from previous version) ---
  @retry(wait_fixed=RETRY_WAIT_FIXED, stop_max_attempt_number=RETRY_STOP_MAX_ATTEMPT,
         retry_on_exception=lambda e: isinstance(e, (requests.exceptions.Timeout, requests.exceptions.ConnectionError, requests.exceptions.HTTPError)))
  def fetch_url_content(url, headers):
      logger.info(f"Fetching {url} (Attempt {fetch_url_content.retry.attempt_number+1}/{RETRY_STOP_MAX_ATTEMPT})")
      response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
      response.raise_for_status()
      if 'text/html' not in response.headers.get('Content-Type', ''):
+         logger.warning(f"Skipping URL {url} - Not HTML")
          return None
      if len(response.content) > 10 * 1024 * 1024:  # 10 MB limit
+         logger.warning(f"Skipping URL {url} - Content too large")
          return None
      return response
 
  def clean_text(text):
      text = re.sub(r'\s{2,}', ' ', text)
      text = re.sub(r'\n+', '\n', text)
      lines = text.split('\n')
      cleaned_lines = []
+     min_line_length = 20
+     min_words_per_line = 3
      skip_phrases = [
          'copyright ©', 'all rights reserved', 'privacy policy', 'terms of use', 'terms and conditions',
          'cookie policy', 'subscribe', 'sign up', 'log in', 'advertisement', 'share this', 'related posts',
          'leave a reply', 'comment', 'posted on', 'by author', 'tags:', 'categories:', 'follow us', 'read more',
+         'click here', 'learn more', 'next article', 'previous article', 'you may also like', 'related topics'
      ]
      for line in lines:
          stripped_line = line.strip()
          lower_line = stripped_line.lower()
          if len(stripped_line) >= min_line_length and \
             len(stripped_line.split()) >= min_words_per_line and \
             not any(phrase in lower_line for phrase in skip_phrases):
              cleaned_lines.append(stripped_line)
      text = '\n'.join(cleaned_lines)
      return text.strip()
 
  def scrape_page_content(url, user_agent, scrape_status_ui):
+     if not user_agent: logger.error("User Agent missing."); return ""
+     headers = {
+         'User-Agent': user_agent.random,
+         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+         'Accept-Language': 'en-US,en;q=0.5', 'Referer': 'https://www.google.com/',
+         'DNT': '1', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1'
+     }
      try:
          response = fetch_url_content(url, headers)
+         if response is None: scrape_status_ui.warning(f"⚠️ Skip/Fail fetch: {url}", icon="🕸️"); return ""
+         soup = BeautifulSoup(response.content, 'lxml')
          tags_to_remove = ["script", "style", "nav", "footer", "aside", "form", "header", "noscript", "button", "input", "select", "textarea", "figure", "figcaption", "iframe", "svg", "path", "meta", "link"]
          for element in soup(tags_to_remove): element.decompose()
          for comment in soup.find_all(string=lambda text: isinstance(text, Comment)): comment.extract()
+         main_content = (soup.find('main') or soup.find('article') or soup.find(role='main') or
                          soup.find('div', class_=re.compile(r'(content|main|body|post|entry|article)', re.I)) or
                          soup.find('div', id=re.compile(r'(content|main|body|post|entry|article)', re.I)))
          target_soup = main_content if main_content else soup.body
+         if not target_soup: logger.warning(f"No body/main: {url}"); scrape_status_ui.warning(f"⚠️ No body/main: {url}", icon="🕸️"); return ""
+         texts = target_soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'td', 'th', 'blockquote', 'span'])
          content_parts = []
          for elem in texts:
              if elem.find_parent(tags_to_remove): continue
              elem_text = elem.get_text(separator=' ', strip=True)
              if len(elem_text) > 10 and len(elem_text.split()) > 1:
+                 if elem.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'blockquote', 'tr', 'div']:  # Added div for structure
                      content_parts.append(elem_text + "\n")
+                 else: content_parts.append(elem_text + " ")
          content = "".join(content_parts)
+         cleaned_content = clean_text(content)
+         if len(cleaned_content) < 150: logger.warning(f"Low content ({len(cleaned_content)} chars): {url}"); scrape_status_ui.warning(f"⚠️ Low content: {url}", icon="🕸️")
+         else: logger.info(f"Scraped {len(cleaned_content)} chars: {url}"); scrape_status_ui.success(f"✅ Scraped: {url} ({len(cleaned_content)} chars)", icon="🕸️")
+         time.sleep(0.6)
          return cleaned_content
+     except requests.exceptions.RequestException as e: logger.warning(f"Final scrape fail: {url}. Err: {e}"); scrape_status_ui.error(f"❌ Fail scrape: {url} ({e})", icon="🕸️"); return ""
+     except Exception as e: logger.error(f"Unexpected scrape error: {url}: {e}", exc_info=True); scrape_status_ui.error(f"❌ Error scraping: {url} (Logs)", icon="🕸️"); return ""
 
  def get_top_urls(keyword, num_results):
      logger.info(f"Fetching top {num_results} URLs for keyword: '{keyword}'")
      try:
          urls = list(search(keyword, num_results=num_results, sleep_interval=2.5, lang="en", timeout=15))
          logger.info(f"Found URLs: {urls}")
+         if not urls: st.warning(f"⚠️ No Google search results found for '{keyword}'."); return []
          return urls
      except Exception as e:
+         error_message = str(e); logger.error(f"GSearch Error: {error_message}", exc_info=True)
+         if "429" in error_message: st.error(f"❌ Google search blocked (429). WAIT before retrying.")
+         elif "timed out" in error_message: st.error(f"❌ Google search timed out.")
+         else: st.error(f"❌ GSearch Error: {error_message[:100]}...")
          return []
 
  def build_content_generation_prompt(keyword, competitor_texts, tone, audience, model_id):
+     logger.info(f"Build content gen prompt. Tone: {tone}, Audience: {audience}. Comp length: {len(competitor_texts)}")
      if len(competitor_texts) > MAX_COMPETITOR_TEXT_LENGTH:
+         competitor_summary = competitor_texts[:MAX_COMPETITOR_TEXT_LENGTH] + "... [Truncated]"
+         logger.warning(f"Comp text truncated.")
+     else: competitor_summary = competitor_texts
+     system_prompt = f"""You are an expert SEO Content Strategist & world-class Copywriter. Task: Analyze competitor text & generate a significantly superior, comprehensive, user-first article for keyword '{keyword}', targeting '{audience}' audience with '{tone}' tone. Focus on quality, depth, clarity, fulfilling user intent better than competition."""
+     user_prompt = f"""**Keyword:** "{keyword}"
+ **Audience:** {audience}
+ **Tone:** {tone}
+ **Objective:** Generate exceptional, SEO-optimized article for "{keyword}" designed to outperform top content via superior value, insights, UX.
+ **Competitor Analysis Context (Analyze for topics, depth, strengths, WEAKNESSES/GAPS):**
+ --- BEGIN COMPETITOR ---
  {competitor_summary}
+ --- END COMPETITOR ---
+ **Content Gen Instructions:**
+ 1. **Value & Depth:** Be demonstrably better. Deeper, clearer, actionable advice, unique perspectives/data, fill gaps. Address user intent exhaustively.
+ 2. **User-First & Humanized:** Write for '{audience}' in '{tone}'. Clear, concise, short paras, varied sentences, engaging Qs. Logical flow, readable.
+ 3. **Structure (Strict Markdown):** Compelling H2 Title. Engaging Intro (50-150 words): Hook, purpose/value, outline. Logical Sections (H2)/Sub-sections (H3): Descriptive, keyword-aware headings. Readability: Bullets (`* `), Numbered lists (`1. `), **Bold** (strategic). Comprehensive Body: Expand beyond competitors. Strong Conclusion: Summarize takeaways, final insight/CTA.
+ 4. **SEO (Natural):** Weave "{keyword}" & LSI terms into title, headings, intro, body, conclusion. Prioritize relevance/clarity over density. NO keyword stuffing.
+ 5. **Originality & Credibility:** 100% unique. Use comp text ONLY for analysis. NO plagiarism. Factual accuracy.
+ 6. **Negative Constraints:** DO NOT: Rehash competitors; use preambles/sign-offs; use excessive jargon (unless 'Experts'); write long paragraphs; stuff keywords; invent facts.
+ **Output:** ONLY the Markdown article, starting with H2 title."""
+     messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
+     logger.info(f"Content prompt done for {model_id}.")
      return messages
 
  def build_internal_link_prompt(generated_content, keyword, website_url):
+     logger.info(f"Build internal link prompt for URL: {website_url}")
+     system_prompt = "You are an SEO assistant specialized in identifying internal linking opportunities."
      user_prompt = f"""**Website Base URL:** {website_url}
  **Main Topic of Article:** "{keyword}"
+ **Task:** Review the article below. Identify 3-5 phrases/sentences for internal links relevant to {website_url}.
  **For each opportunity, provide:**
+ 1. Exact anchor text phrase/sentence from article.
+ 2. Brief description of the *type* of relevant content needed (e.g., "detailed guide on [sub-topic]", "service page for [service]").
+ **IMPORTANT:** Do NOT invent URLs. Describe the *type* of page. Choose natural anchor text. Focus on value. Format as Markdown numbered list.
+ **Article Content (Analyze first ~8000 chars):**
+ --- BEGIN ARTICLE ---
  {generated_content[:8000]}
+ --- END ARTICLE ---"""
+     messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
      return messages
 
  def run_llm_generation(pipe, messages, max_tokens):
+     if pipe is None: st.error("❌ LLM Pipeline missing."); return None
+     model_id = pipe.model.name_or_path
+     logger.info(f"Running generation: {model_id}. Max tokens: {max_tokens}.")
+     start_time = time.time()
      try:
+         gen_args = {"max_new_tokens": max_tokens, "temperature": 0.7, "top_p": 0.95, "top_k": 40,
+                     "do_sample": True, "pad_token_id": pipe.tokenizer.eos_token_id, "eos_token_id": pipe.tokenizer.eos_token_id}
+         logger.info(f"Gen args: {gen_args}")
+         results = pipe(messages, **gen_args)
+         # --- Robust Extraction ---
          assistant_response = None
          if results and results[0] and 'generated_text' in results[0]:
              output_data = results[0]['generated_text']
+             if isinstance(output_data, list): assistant_message = next((msg['content'] for msg in reversed(output_data) if msg['role'] == 'assistant'), None); assistant_response = assistant_message
+             elif isinstance(output_data, str):
                  last_prompt_content = messages[-1]['content']
                  last_prompt_index = output_data.rfind(last_prompt_content)
+                 if last_prompt_index != -1: potential_response = output_data[last_prompt_index + len(last_prompt_content):].strip()
+                 else: potential_response = output_data
                  assistant_response = re.sub(r"^(assistant|ASSISTANT|</s>|<\|im_end\|>|<\|assistant\|>)\s*[:\n]*", "", potential_response, flags=re.IGNORECASE | re.DOTALL).strip()
+             else: logger.error(f"Unexpected output format: {type(output_data)}")
          else: logger.error(f"Unexpected LLM output structure: {results}")
+         # --- Validation ---
          if assistant_response:
+             duration = time.time() - start_time; logger.info(f"Gen success ({model_id}) {duration:.2f}s. Len: {len(assistant_response)}.")
+             assistant_response = re.sub(r"^```markdown\n", "", assistant_response).strip(); assistant_response = re.sub(r"\n```$", "", assistant_response).strip()
+             if len(assistant_response) < 30: logger.warning(f"Gen output very short ({len(assistant_response)})."); st.warning("⚠️ Gen output very short.")
              return assistant_response
+         else: logger.error(f"Failed parse assistant response. Output: {results}"); st.error("❌ Failed parse LLM response. Check logs."); return None
+     except torch.cuda.OutOfMemoryError: logger.error(f"OOM Error ({model_id})!", exc_info=True); st.error(f"❌ OOM Error ({model_id}). Try smaller model/less tokens/restart."); clear_gpu_memory(); return None
+     except Exception as e: logger.error(f"Unhandled gen error ({model_id}): {e}", exc_info=True); st.error(f"❌ Unexpected gen error: {e}"); return None
 
  # --- Streamlit App UI ---
 
+ st.set_page_config(layout="wide", page_title="On-Demand SEO Content Gen")
 
+ # --- Sidebar ---
  with st.sidebar:
      st.header("⚙️ Configuration")
+ 
+     # Model Selection & Loading Area
+     st.subheader("1. Select & Load Model")
      selected_model_key = st.selectbox(
          "Choose Language Model:",
          options=list(MODEL_OPTIONS.keys()),
          index=list(MODEL_OPTIONS.keys()).index(DEFAULT_MODEL_KEY),
+         key="model_selector",  # Key for potential state access
+         help="Choose AI model. Performance & resources vary. Load required."
      )
      selected_model_id = MODEL_OPTIONS[selected_model_key]
 
+     # Display current status and load button
+     load_button_placeholder = st.empty()  # Placeholder for dynamic button text/state
+     model_status_placeholder = st.empty()  # Placeholder for status message
+ 
+     if st.session_state.get('current_model_id') == selected_model_id and st.session_state.get('current_model_pipeline') is not None:
+         model_status_placeholder.success(f"✅ Loaded: `{selected_model_id}`")
+         load_button_text = f"Switch from {selected_model_key}"  # Or "Reload"
+     elif st.session_state.get('current_model_pipeline') is not None:
+         model_status_placeholder.warning(f"⚠️ Loaded: `{st.session_state.current_model_id}`\nSelected: `{selected_model_id}`")
+         load_button_text = f"Unload Current & Load {selected_model_key}"
+     else:
+         model_status_placeholder.info("ℹ️ No model loaded.")
+         load_button_text = f"Load Selected: {selected_model_key}"
+ 
+     if load_button_placeholder.button(load_button_text, key="load_model"):
+         load_model(selected_model_id)
+         # Rerun to update status placeholders immediately after load attempt
+         st.rerun()
 
+     st.markdown("---")
 
+     # Content Settings
+     st.subheader("2. Content Settings")
+     with st.expander("Adjust Content Parameters", expanded=False):
+         num_results = st.slider("Competitors to Analyze:", min_value=1, max_value=8, value=DEFAULT_NUM_RESULTS, step=1)
          selected_tone = st.selectbox("Content Tone:", options=TONE_OPTIONS, index=TONE_OPTIONS.index("Engaging"))
          selected_audience = st.selectbox("Target Audience:", options=AUDIENCE_OPTIONS, index=AUDIENCE_OPTIONS.index("General Audience"))
+         max_gen_tokens = st.number_input("Max Generation Tokens:", min_value=500, max_value=8192, value=DEFAULT_MAX_GENERATION_TOKENS, step=100)
 
+     # Internal Linking
+     st.subheader("3. Internal Linking (Optional)")
+     with st.expander("Configure Link Suggestions", expanded=False):
+         website_url = st.text_input("Your Website URL:", placeholder="https://www.example.com", value=st.session_state.get("last_website_url", ""), key="website_url_input")
+         # Update state immediately on change if needed, or just read before use
+         st.session_state.last_website_url = website_url
 
+     st.markdown("---")
+     st.header("ℹ️ App Info & Actions")
      st.info(f"""
+ - **Status:** {'Model Loaded' if st.session_state.current_model_pipeline else 'No Model Loaded'}
  - **Competitors:** Top {num_results}
  - **Max Generation:** ~{max_gen_tokens} tokens
  """)
      st.warning("""
+ - **Load Model First:** Select a model and click 'Load' before generating.
+ - **Resource Use:** Models need significant RAM/GPU. Loading WILL fail if resources are insufficient.
+ - **Review Output:** AI provides drafts. ALWAYS review, edit, fact-check.
  """)
+     if st.button("Clear Scraped/Generated Data", key="clear_data"):
+         reset_app_data()
 
+ # --- Main App Area ---
+ st.title("✨ On-Demand SEO Content Generator ✨")
+ st.markdown(f"Load your chosen AI model, then generate SEO-focused content.")
 
+ # User Input Area
+ st.subheader("Keyword & Generation")
+ keyword = st.text_input("Enter Primary Target Keyword:", placeholder="e.g., vertical hydroponics guide", value=st.session_state.get("last_keyword", ""), key="keyword_input")
 
+ # Disable button if model not loaded
+ generate_button_disabled = st.session_state.current_model_pipeline is None
+ generate_button_help = "Load a model from the sidebar first." if generate_button_disabled else "Analyze competitors and generate article."
 
+ analyze_button = st.button(
+     "Analyze Competitors & Generate Content",
+     type="primary",
+     key="generate_button",
+     disabled=generate_button_disabled,
+     help=generate_button_help
+ )
 
  st.markdown("---")
 
+ # --- Main Workflow Triggered by Button ---
+ if analyze_button:
+     # Double check model is loaded (though button should be disabled)
+     if not st.session_state.current_model_pipeline:
+         st.error("❌ Cannot generate: No model loaded. Please use the sidebar.")
+         st.stop()
      if not keyword:
          st.warning("⚠️ Please enter a keyword.")
          st.stop()
 
+     st.session_state.last_keyword = keyword  # Store keyword for potential reuse
      ua = get_user_agent()  # Ensure user agent is ready
+     if not ua: st.error("❌ User Agent failed. Cannot scrape."); st.stop()
 
+     # Reset previous generation results for this run
      st.session_state.generated_content = ""
      st.session_state.internal_link_suggestions = ""
 
+     # --- Step 1: Scrape Competitors (with status updates) ---
+     # Check if scrape needed
      if keyword != st.session_state.get('_internal_last_scrape_keyword', None) or not st.session_state.competitor_analysis_text:
+         logger.info(f"Scraping needed for '{keyword}'.")
+         st.session_state.competitor_analysis_text = ""  # Clear old text
+         st.session_state.scraped_urls = []
+         st.session_state['_internal_last_scrape_keyword'] = ""  # Reset marker until success
 
          scrape_container = st.container()
          with scrape_container:
+             st.info(f"🕸️ Fetching URLs and Scraping Top {num_results} Competitors...")
+             progress_text = "Scraping progress..."
+             scrape_progress_bar = st.progress(0, text=progress_text)
+             status_area = st.container()  # Use container for multiple status lines
 
              urls = get_top_urls(keyword, num_results)
              st.session_state.scraped_urls = urls
 
              if urls:
                  all_texts = []
+                 scraped_count = 0
                  for i, url in enumerate(urls):
+                     with status_area:  # Show status within the designated area
+                         scrape_status_ui = st.empty()  # Placeholder for single URL status
+                         content = scrape_page_content(url, ua, scrape_status_ui)
+                         if content:
+                             all_texts.append(content)
+                             scraped_count += 1
+                     scrape_progress_bar.progress((i + 1) / len(urls), text=f"Processed URL {i+1}/{len(urls)}...")
+                     time.sleep(0.1)  # UI update breather
 
                  st.session_state.competitor_analysis_text = "\n\n --- ARTICLE SEPARATOR --- \n\n".join(all_texts)
+                 st.session_state['_internal_last_scrape_keyword'] = keyword  # Mark scrape success for this keyword
 
                  if st.session_state.competitor_analysis_text:
+                     scrape_container.success(f"✅ Scraped {scraped_count}/{len(urls)} pages. Analysis text: {len(st.session_state.competitor_analysis_text)} chars.")
                  else:
                      scrape_container.error("❌ Failed to scrape sufficient content. Cannot generate article.")
                      st.stop()
                  scrape_container.error("❌ Could not retrieve competitor URLs. Cannot proceed.")
                  st.stop()
      else:
+         st.success(f"✔️ Using previously scraped data for '{keyword}'. ({len(st.session_state.competitor_analysis_text)} chars).")
 
      # --- Step 2: Generate Main Content ---
+     st.info(f"✍️ Generating Content with {st.session_state.current_model_id}...")
+     generation_status = st.status("Sending request to LLM...")
      with generation_status:
+         st.write(f"**Tone:** {selected_tone}, **Audience:** {selected_audience}, **Max Tokens:** {max_gen_tokens}")
          gen_prompt = build_content_generation_prompt(
              keyword, st.session_state.competitor_analysis_text, selected_tone, selected_audience, st.session_state.current_model_id
          )
          generated_content = run_llm_generation(st.session_state.current_model_pipeline, gen_prompt, max_gen_tokens)
+         st.session_state.generated_content = generated_content
 
      if generated_content:
          generation_status.update(label="✅ Content Generation Complete!", state="complete")
          generation_status.update(label="❌ Content Generation Failed.", state="error")
          st.stop()  # Stop if main content fails
 
+ # --- Display Outputs (Outside the button click conditional) ---
  if st.session_state.generated_content:
      st.markdown("---")
      st.subheader("📝 Generated SEO Content")
      st.markdown(st.session_state.generated_content)
+     st.text_area("Copyable Markdown:", st.session_state.generated_content, height=400, key="generated_content_area_display")
 
+     # --- Internal Linking Section ---
+     if st.session_state.last_website_url:  # Only show if URL was provided
          st.markdown("---")
          st.subheader("🔗 Internal Linking Suggestions")
+         if st.button("Suggest Internal Links", key="suggest_links_button_display"):
+             link_status = st.status(f"Analyzing content for link opportunities ({st.session_state.current_model_id})...")
+             with link_status:
+                 st.write(f"Website context: {st.session_state.last_website_url}")
+                 link_prompt = build_internal_link_prompt(st.session_state.generated_content, keyword, st.session_state.last_website_url)
+                 link_suggestions = run_llm_generation(st.session_state.current_model_pipeline, link_prompt, max_tokens=500)  # Use fewer tokens
+                 st.session_state.internal_link_suggestions = link_suggestions
+                 if link_suggestions: link_status.update(label="✅ Link suggestions generated!", state="complete")
+                 else: link_status.update(label="❌ Failed to generate link suggestions.", state="error")
+ 
+         # Display suggestions if they exist in state
          if st.session_state.internal_link_suggestions:
              st.markdown(st.session_state.internal_link_suggestions)
+             st.info("ℹ️ AI suggestions only. Verify relevance and find actual URLs on your site.")
+     else:
+         st.markdown("---")
+         st.info("Provide your website URL in the sidebar to enable internal link suggestions after generating content.")