garyuzair committed on
Commit
c0388eb
·
verified ·
1 Parent(s): 88f139c

Update app.py

Files changed (1)
  1. app.py +312 -413
app.py CHANGED
@@ -9,579 +9,488 @@ import time
9
  import logging
10
  import re
11
  from retrying import retry
12
- import gc # Garbage collector for potentially cleaning up GPU memory
13
 
14
  # --- Configuration ---
15
-
16
- # Model Options - Added more models with notes on resource needs
17
  MODEL_OPTIONS = {
18
- # Smaller/Faster Models
19
- "Mistral-7B-Instruct (Fast, Good Quality, Med RAM)": "mistralai/Mistral-7B-Instruct-v0.2",
20
- "Gemma-7B (Google, Good Quality, Med RAM)": "google/gemma-7b-it", # Instruct-tuned version
21
- "Llama-3-8B (Meta, Very Good Quality, High RAM)": "meta-llama/Meta-Llama-3-8B-Instruct",
22
- "Phi-3-Medium (Microsoft, Strong Reasoning, Med RAM)": "microsoft/Phi-3-medium-4k-instruct", # Needs trust_remote_code
23
-
24
- # Larger/Slower Models (Likely require significant resources / paid tiers)
25
- "DeepSeek-Coder-V2 (DeepSeek, Code/General, High RAM/GPU)": "deepseek-ai/DeepSeek-Coder-V2-Instruct", # Example: Using Coder V2 Instruct variant
26
- "Qwen1.5-14B-Chat (Alibaba, Strong General, High RAM/GPU)": "Qwen/Qwen1.5-14B-Chat",
27
- # Commenting out extremely large models unlikely to run easily:
28
- # "Qwen3-235B-A22B (Very Large, EXPERIMENTAL)": "Qwen/Qwen3-235B-A22B", # Extremely large, needs special setup
 
29
  }
30
- # Default to a reasonably performant model
31
- DEFAULT_MODEL_KEY = "Mistral-7B-Instruct (Fast, Good Quality, Med RAM)"
32
 
33
  # Scraping & Generation Defaults
34
- DEFAULT_NUM_RESULTS = 5
35
  REQUEST_TIMEOUT = 15
36
- MAX_COMPETITOR_TEXT_LENGTH = 6000 # Increased slightly more, but monitor
37
- DEFAULT_MAX_GENERATION_TOKENS = 3000 # Increased default target
38
 
39
- # Retry settings for scraping
40
  RETRY_WAIT_FIXED = 2000
41
  RETRY_STOP_MAX_ATTEMPT = 3
42
 
43
  # Tone & Audience Options
44
- TONE_OPTIONS = ["Conversational", "Professional", "Authoritative", "Technical", "Friendly", "Engaging", "Educational"]
45
- AUDIENCE_OPTIONS = ["Beginners", "General Audience", "Experts", "Professionals (Specific Field)", "Customers", "Students"]
46
 
47
  # --- Logging Setup ---
48
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s')
49
  logger = logging.getLogger(__name__)
50
 
51
- # --- Caching & State ---
52
- # Initialize session state more comprehensively
 
 
 
53
  if 'scraped_urls' not in st.session_state: st.session_state.scraped_urls = []
54
  if 'competitor_analysis_text' not in st.session_state: st.session_state.competitor_analysis_text = ""
55
  if 'generated_content' not in st.session_state: st.session_state.generated_content = ""
56
  if 'internal_link_suggestions' not in st.session_state: st.session_state.internal_link_suggestions = ""
57
  if 'last_keyword' not in st.session_state: st.session_state.last_keyword = ""
58
- if 'last_model_id' not in st.session_state: st.session_state.last_model_id = ""
59
  if 'last_website_url' not in st.session_state: st.session_state.last_website_url = ""
60
- if 'current_model_pipeline' not in st.session_state: st.session_state.current_model_pipeline = None
61
- if 'current_model_id' not in st.session_state: st.session_state.current_model_id = ""
62
 
63
 
64
- # Function to explicitly clear GPU memory
65
  def clear_gpu_memory():
 
66
  logger.info("Attempting to clear GPU memory...")
67
  if torch.cuda.is_available():
68
- torch.cuda.empty_cache()
69
- gc.collect()
70
- logger.info("GPU memory cache cleared and garbage collected.")
 
 
 
 
 
 
 
71
  else:
72
  logger.info("No GPU available, skipping memory clearing.")
 
 
73
 
74
-
75
- # Modified caching: Load models into session state instead of cache_resource
76
- # This allows unloading previous models when switching.
77
- def load_model(model_id):
78
- """Loads the selected model pipeline into session state, unloading the previous one."""
79
- if st.session_state.current_model_id == model_id and st.session_state.current_model_pipeline is not None:
80
- logger.info(f"Model {model_id} is already loaded.")
81
- return st.session_state.current_model_pipeline
82
-
83
- # Unload previous model if different
84
- if st.session_state.current_model_pipeline is not None:
 
 
 
 
 
 
 
 
 
 
 
85
  logger.info(f"Unloading previous model: {st.session_state.current_model_id}")
86
- st.session_state.current_model_pipeline = None # Remove reference
87
- clear_gpu_memory() # Attempt to free memory
88
- st.toast(f"Unloaded previous model: {st.session_state.current_model_id}", icon="🧹")
89
 
90
- st.toast(f"Loading {model_id}... This may take time and significant RAM/GPU.", icon="⏳")
91
- logger.info(f"Attempting to load LLM pipeline for model: {model_id}")
 
92
  pipeline_instance = None
 
93
  try:
94
- # Determine torch_dtype based on availability and model needs
95
- # Use bfloat16 if available for better performance on compatible GPUs
96
  dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16 if torch.cuda.is_available() else torch.float32
97
  logger.info(f"Using dtype: {dtype}")
98
 
99
- # Models requiring trust_remote_code
100
- trust_code = model_id in [
101
  "microsoft/Phi-3-medium-4k-instruct",
102
  "deepseek-ai/DeepSeek-Coder-V2-Instruct",
103
- # Add any other models known to require it
104
  ]
105
- logger.info(f"Trust remote code for {model_id}: {trust_code}")
106
-
107
- pipeline_instance = pipeline(
108
- "text-generation",
109
- model=model_id,
110
- trust_remote_code=trust_code,
111
- device_map="auto",
112
- torch_dtype=dtype,
113
- # Consider adding quantization for very large models if needed, e.g.:
114
- # load_in_8bit=True # Requires bitsandbytes
115
- )
116
-
117
- # Ensure tokenizer has pad_token (important for generation)
118
- if pipeline_instance.tokenizer.pad_token_id is None:
119
- pipeline_instance.tokenizer.pad_token_id = pipeline_instance.tokenizer.eos_token_id
120
- pipeline_instance.model.config.pad_token_id = pipeline_instance.tokenizer.eos_token_id # Also set in config
121
- logger.warning(f"Set pad_token_id to eos_token_id ({pipeline_instance.tokenizer.eos_token_id}) for model {model_id}")
122
-
123
- logger.info(f"LLM pipeline loaded successfully for {model_id}.")
 
 
124
  st.session_state.current_model_pipeline = pipeline_instance
125
- st.session_state.current_model_id = model_id
126
- st.toast(f"Model {model_id} loaded successfully!", icon="βœ…")
127
- return pipeline_instance
128
 
129
  except ImportError as e:
130
- logger.error(f"ImportError loading pipeline for {model_id}: {e}. Missing dependencies?", exc_info=True)
131
- st.error(f"Error loading model {model_id}. Required library missing? Check logs. Error: {e}")
132
- return None
133
  except Exception as e:
134
- logger.error(f"Error loading LLM pipeline for {model_id}: {e}", exc_info=True)
135
- st.error(f"Failed to load {model_id}. Error: {e}. Check resource limits (RAM/GPU) & logs.")
136
- # Clear potentially partially loaded state
137
- st.session_state.current_model_pipeline = None
138
- st.session_state.current_model_id = ""
139
- clear_gpu_memory()
140
- return None
141
-
142
- # User Agent Caching (can remain cache_resource)
143
  @st.cache_resource
144
  def get_user_agent():
 
145
  logger.info("Initializing FakeUserAgent.")
146
  try:
147
- # Handle potential issues with finding data files for fake_useragent
148
  return UserAgent(fallback='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
149
  except Exception as e:
150
  logger.error(f"Failed to initialize FakeUserAgent: {e}", exc_info=True)
151
- st.error(f"Could not initialize User Agent generator. Scraping might fail. Error: {e}")
152
  return None
153
 
154
- # --- Core Functions (Scraping, Content Generation - Minimal changes needed) ---
155
-
156
- def reset_session_state():
157
- """Clears all generated and scraped data from session state."""
158
- st.session_state.scraped_urls = []
159
- st.session_state.competitor_analysis_text = ""
160
- st.session_state.generated_content = ""
161
- st.session_state.internal_link_suggestions = ""
162
- st.session_state.last_keyword = ""
163
- # Don't reset model pipeline here, only data
164
- logger.info("Session state data reset.")
165
 
 
166
  @retry(wait_fixed=RETRY_WAIT_FIXED, stop_max_attempt_number=RETRY_STOP_MAX_ATTEMPT,
167
  retry_on_exception=lambda e: isinstance(e, (requests.exceptions.Timeout, requests.exceptions.ConnectionError, requests.exceptions.HTTPError)))
168
  def fetch_url_content(url, headers):
169
- """Fetches content for a single URL with retries for specific errors."""
170
  logger.info(f"Fetching {url} (Attempt {fetch_url_content.retry.attempt_number+1}/{RETRY_STOP_MAX_ATTEMPT})")
171
  response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
172
  response.raise_for_status()
173
  if 'text/html' not in response.headers.get('Content-Type', ''):
174
- logger.warning(f"Skipping URL {url} - Content-Type is not HTML ({response.headers.get('Content-Type')})")
175
  return None
176
- # Check for excessively large pages (potential trap or non-article content)
177
  if len(response.content) > 10 * 1024 * 1024: # 10 MB limit
178
- logger.warning(f"Skipping URL {url} - Content too large ({len(response.content)} bytes)")
179
  return None
180
  return response
181
 
182
  def clean_text(text):
183
- """Enhanced text cleaning."""
184
- # Remove multiple spaces and newlines
185
  text = re.sub(r'\s{2,}', ' ', text)
186
  text = re.sub(r'\n+', '\n', text)
187
- # Remove lines that are likely boilerplate/navigation/ads more aggressively
188
  lines = text.split('\n')
189
  cleaned_lines = []
190
- min_line_length = 20 # Heuristic: Lines shorter than this are often noise
191
- min_words_per_line = 3 # Heuristic: Lines with few words are often noise
192
  skip_phrases = [
193
  'copyright Β©', 'all rights reserved', 'privacy policy', 'terms of use', 'terms and conditions',
194
  'cookie policy', 'subscribe', 'sign up', 'log in', 'advertisement', 'share this', 'related posts',
195
  'leave a reply', 'comment', 'posted on', 'by author', 'tags:', 'categories:', 'follow us', 'read more',
196
- 'click here', 'learn more', 'next article', 'previous article'
197
  ]
198
  for line in lines:
199
  stripped_line = line.strip()
200
  lower_line = stripped_line.lower()
201
- # Check length, word count, and if it contains skip phrases
202
  if len(stripped_line) >= min_line_length and \
203
  len(stripped_line.split()) >= min_words_per_line and \
204
  not any(phrase in lower_line for phrase in skip_phrases):
205
  cleaned_lines.append(stripped_line)
206
-
207
  text = '\n'.join(cleaned_lines)
208
  return text.strip()
209
 
210
  def scrape_page_content(url, user_agent, scrape_status_ui):
211
- """Scrapes, cleans, and extracts relevant text content with improved logic and retries."""
212
- # (Code similar to previous version, with enhanced cleaning and error logging)
213
- if not user_agent: logger.error("User Agent generator not available."); return ""
214
- headers = { /* ... headers ... */ } # Keep previous headers
 
 
 
215
  try:
216
  response = fetch_url_content(url, headers)
217
- if response is None:
218
- scrape_status_ui.warning(f"⚠️ Skip/Fail fetch: {url}", icon="πŸ•ΈοΈ")
219
- return ""
220
-
221
- soup = BeautifulSoup(response.content, 'lxml') # Use lxml for potentially faster parsing
222
-
223
- # --- Enhanced Extraction & Cleaning ---
224
  tags_to_remove = ["script", "style", "nav", "footer", "aside", "form", "header", "noscript", "button", "input", "select", "textarea", "figure", "figcaption", "iframe", "svg", "path", "meta", "link"]
225
  for element in soup(tags_to_remove): element.decompose()
226
  for comment in soup.find_all(string=lambda text: isinstance(text, Comment)): comment.extract()
227
-
228
- main_content = (soup.find('main') or soup.find('article') or
229
- soup.find(role='main') or
230
  soup.find('div', class_=re.compile(r'(content|main|body|post|entry|article)', re.I)) or
231
  soup.find('div', id=re.compile(r'(content|main|body|post|entry|article)', re.I)))
232
  target_soup = main_content if main_content else soup.body
233
-
234
- if not target_soup: logger.warning(f"No body/main found: {url}"); scrape_status_ui.warning(f"⚠️ No body/main: {url}", icon="🕸️"); return ""
235
-
236
- texts = target_soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'td', 'th', 'blockquote', 'span']) # Added span, sometimes used for content
237
  content_parts = []
238
  for elem in texts:
239
- # Avoid extracting text from elements likely inside removed sections (double check)
240
  if elem.find_parent(tags_to_remove): continue
241
- # Get text, strip extra whitespace, join parts if nested
242
  elem_text = elem.get_text(separator=' ', strip=True)
243
- # Filter out short/noisy text elements
244
  if len(elem_text) > 10 and len(elem_text.split()) > 1:
245
- # Add newline after block elements for structure
246
- if elem.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'blockquote', 'tr']:
247
  content_parts.append(elem_text + "\n")
248
- else:
249
- content_parts.append(elem_text + " ")
250
-
251
  content = "".join(content_parts)
252
- cleaned_content = clean_text(content) # Apply enhanced cleaning
253
-
254
- if len(cleaned_content) < 150: # Increased threshold for meaningful content
255
- logger.warning(f"Low content ({len(cleaned_content)} chars): {url}")
256
- scrape_status_ui.warning(f"⚠️ Low content: {url}", icon="πŸ•ΈοΈ")
257
- else:
258
- logger.info(f"Scraped {len(cleaned_content)} chars: {url}")
259
- scrape_status_ui.success(f"βœ… Scraped: {url} ({len(cleaned_content)} chars)", icon="πŸ•ΈοΈ")
260
-
261
- time.sleep(0.6) # Slightly increased delay
262
  return cleaned_content
263
-
264
- except requests.exceptions.RequestException as e:
265
- logger.warning(f"Final scrape attempt failed: {url}. Error: {e}")
266
- scrape_status_ui.error(f"❌ Fail scrape: {url} ({e})", icon="πŸ•ΈοΈ")
267
- return ""
268
- except Exception as e:
269
- logger.error(f"Unexpected scrape error: {url}: {e}", exc_info=True)
270
- scrape_status_ui.error(f"❌ Error scraping: {url} (Check logs)", icon="πŸ•ΈοΈ")
271
- return ""
272
-
273
 
274
  def get_top_urls(keyword, num_results):
275
- # (Function remains the same as previous version - already robust)
276
  logger.info(f"Fetching top {num_results} URLs for keyword: '{keyword}'")
277
  try:
278
- # Use a longer timeout for the search itself
279
  urls = list(search(keyword, num_results=num_results, sleep_interval=2.5, lang="en", timeout=15))
280
  logger.info(f"Found URLs: {urls}")
281
- if not urls:
282
- st.warning(f"⚠️ No Google search results found for '{keyword}'. Try a different keyword?")
283
- return []
284
  return urls
285
  except Exception as e:
286
- error_message = str(e)
287
- logger.error(f"Error fetching Google search results for '{keyword}': {error_message}", exc_info=True)
288
- # Specific error handling...
289
- if "429" in error_message or "Too Many Requests" in error_message:
290
- st.error(f"❌ Google search blocked (Error 429). WAIT before retrying.")
291
- elif "timed out" in error_message:
292
- st.error(f"❌ Google search request timed out.")
293
- else:
294
- st.error(f"❌ Failed to fetch Google results. Error: {error_message[:100]}...") # Truncate long errors
295
  return []
296
 
297
-
298
- # --- Prompt Building Functions ---
299
-
300
  def build_content_generation_prompt(keyword, competitor_texts, tone, audience, model_id):
301
- """Builds the main content generation prompt (incorporates previous best practices)."""
302
- logger.info(f"Building content gen prompt. Tone: {tone}, Audience: {audience}. Competitor length: {len(competitor_texts)}")
303
  if len(competitor_texts) > MAX_COMPETITOR_TEXT_LENGTH:
304
- competitor_summary = competitor_texts[:MAX_COMPETITOR_TEXT_LENGTH] + "... [Content Truncated]"
305
- logger.warning(f"Competitor text truncated to {MAX_COMPETITOR_TEXT_LENGTH} chars.")
306
- else:
307
- competitor_summary = competitor_texts
308
-
309
- # System prompt can be tailored slightly if needed, but a generic strong one works well
310
- system_prompt = f"""You are an expert SEO Content Strategist and world-class Copywriter. Your task is to analyze competitor text and generate a significantly superior, comprehensive, user-first article for the keyword '{keyword}', targeting a '{audience}' audience with a '{tone}' tone. Focus on quality, depth, clarity, and fulfilling user intent better than the competition."""
311
-
312
- user_prompt = f"""**Primary Keyword:** "{keyword}"
313
- **Target Audience:** {audience}
314
- **Desired Tone:** {tone}
315
-
316
- **Objective:** Generate an exceptional, SEO-optimized article for "{keyword}" designed to outperform the current top-ranking content by providing substantially more value, unique insights, and a better user experience.
317
-
318
- **Competitor Analysis Context (Analyze this text for topics, depth, strengths, and weaknesses/gaps):**
319
- --- BEGIN COMPETITOR CONTENT ---
320
  {competitor_summary}
321
- --- END COMPETITOR CONTENT ---
322
-
323
- **Content Generation Instructions:**
324
-
325
- 1. **Outperform & Add Value:** Create content that is clearly superior to the competitor examples. Go deeper, explain concepts more clearly, provide actionable advice, offer unique perspectives or data, and fill identified content gaps. Address the core user intent behind "{keyword}" comprehensively.
326
- 2. **User-First & Humanized:** Write for the '{audience}' reader in the specified '{tone}'. Use clear, concise language, short paragraphs, varied sentence structure, and potentially engaging questions. Ensure logical flow and high readability.
327
- 3. **Structure (Strict Markdown):**
328
- * Compelling H2 Title (related to "{keyword}").
329
- * Engaging Introduction (50-150 words): Hook reader, state purpose/value, outline content.
330
- * Logical Sections (H2) & Sub-sections (H3): Use descriptive, keyword-aware headings.
331
- * Readability Enhancers: Use bullet points (`* ` or `- `), numbered lists (`1. `), and **bold text** strategically for emphasis.
332
- * Comprehensive Body: Cover all essential aspects, expanding beyond competitor content.
333
- * Strong Conclusion: Summarize key takeaways, provide final insight or call-to-action (if appropriate).
334
- 4. **SEO Integration (Natural):** Seamlessly integrate "{keyword}" and related semantic terms (LSI) into title, headings, intro, body, conclusion. Prioritize topical relevance and natural language over density. Avoid keyword stuffing.
335
- 5. **Originality & Credibility:** Generate 100% unique content. Use competitor text ONLY for analysis. Do NOT plagiarize. Ensure factual accuracy.
336
- 6. **Negative Constraints (DO NOT):** Do not rehash competitors; include preambles/sign-offs; use excessive jargon (unless for 'Experts'); write long paragraphs; stuff keywords; invent facts.
337
-
338
- **Output:** Deliver ONLY the generated Markdown article, starting directly with the H2 title.
339
- """
340
- # Use the format expected by the pipeline's chat template (usually system/user roles)
341
- # The pipeline should handle model-specific formatting (e.g., [INST], <|im_start|>)
342
- messages = [
343
- {"role": "system", "content": system_prompt},
344
- {"role": "user", "content": user_prompt}
345
- ]
346
- logger.info(f"Content generation prompt constructed for model {model_id}.")
347
  return messages
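# [Illustrative sketch, not part of this commit] The messages list above uses the
# role-based chat format; the text-generation pipeline applies the model's own chat
# template to it before generation. Done by hand (with `pipe` being the loaded
# pipeline), the equivalent preprocessing would look roughly like:
prompt_text = pipe.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)  # e.g. wraps turns in [INST] ... [/INST] for Mistral-style models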
348
 
349
-
350
- # *** NEW: Internal Linking Prompt ***
351
  def build_internal_link_prompt(generated_content, keyword, website_url):
352
- """Builds the prompt for suggesting internal links."""
353
- logger.info(f"Building internal link suggestion prompt for URL: {website_url}")
354
-
355
- system_prompt = "You are an SEO assistant specialized in identifying internal linking opportunities within website content."
356
-
357
  user_prompt = f"""**Website Base URL:** {website_url}
358
  **Main Topic of Article:** "{keyword}"
359
-
360
- **Task:** Please review the following article content. Identify 3 to 5 specific phrases or sentences within the text that represent good opportunities for internal links to other relevant pages on the website ({website_url}).
361
-
362
  **For each opportunity, provide:**
363
- 1. The exact phrase/sentence from the article that should be the anchor text.
364
- 2. A brief description of the *type* of relevant content the link should point to on the website (e.g., "a detailed guide on [sub-topic]", "a related service page for [service]", "a case study about [specific example]", "a blog post explaining [related concept]").
365
-
366
- **IMPORTANT:**
367
- * Do NOT invent specific URLs (like `{website_url}/blog/my-post`). Only describe the *type* of page needed.
368
- * Choose anchor text that is natural and descriptive.
369
- * Focus on links that would genuinely add value for a reader seeking more information on that specific point.
370
- * Format your output as a Markdown numbered list.
371
-
372
- **Article Content to Analyze:**
373
- --- BEGIN ARTICLE CONTENT ---
374
  {generated_content[:8000]}
375
- --- END ARTICLE CONTENT ---
376
- """ # Limit context sent for linking analysis
377
-
378
- messages = [
379
- {"role": "system", "content": system_prompt},
380
- {"role": "user", "content": user_prompt}
381
- ]
382
  return messages
383
 
384
- # --- LLM Generation Functions ---
385
-
386
  def run_llm_generation(pipe, messages, max_tokens):
387
- """Runs the LLM pipeline with common settings and robust error handling."""
388
- if pipe is None:
389
- st.error("❌ LLM Pipeline is not available.")
390
- return None
391
- model_id = pipe.model.name_or_path # Get model id from pipeline
392
-
393
- logger.info(f"Running generation with {model_id}. Max new tokens: {max_tokens}.")
394
- generation_start_time = time.time()
395
-
396
  try:
397
- generation_args = {
398
- "max_new_tokens": max_tokens,
399
- "temperature": 0.7,
400
- "top_p": 0.95,
401
- "top_k": 40,
402
- "do_sample": True,
403
- "pad_token_id": pipe.tokenizer.eos_token_id,
404
- "eos_token_id": pipe.tokenizer.eos_token_id,
405
- # Use pipeline's chat template automatically if available
406
- }
407
- logger.info(f"Generation arguments: {generation_args}")
408
-
409
- # --- Execute Pipeline ---
410
- results = pipe(messages, **generation_args)
411
-
412
- # --- Robust Extraction of Assistant's Response ---
413
- # (Using the refined extraction logic from previous iteration)
414
  assistant_response = None
415
  if results and results[0] and 'generated_text' in results[0]:
416
  output_data = results[0]['generated_text']
417
- if isinstance(output_data, list): # Format: [{'role':'user',...}, {'role':'assistant',...}]
418
- assistant_message = next((msg['content'] for msg in reversed(output_data) if msg['role'] == 'assistant'), None)
419
- if assistant_message: assistant_response = assistant_message
420
- elif isinstance(output_data, str): # Format: "System...\nUser...\nAssistant..."
421
- # Find the last message in the prompt list to split after it
422
  last_prompt_content = messages[-1]['content']
423
  last_prompt_index = output_data.rfind(last_prompt_content)
424
- if last_prompt_index != -1:
425
- potential_response = output_data[last_prompt_index + len(last_prompt_content):].strip()
426
- else: # Fallback if prompt isn't exactly echoed
427
- potential_response = output_data # Assume it might just be the response
428
- # Clean potential role markers, </s> tokens etc.
429
  assistant_response = re.sub(r"^(assistant|ASSISTANT|</s>|<\|im_end\|>|<\|assistant\|>)\s*[:\n]*", "", potential_response, flags=re.IGNORECASE | re.DOTALL).strip()
430
- else: logger.error(f"Unexpected output format type: {type(output_data)}")
431
  else: logger.error(f"Unexpected LLM output structure: {results}")
432
-
433
-
434
- # --- Final Validation and Cleaning ---
435
  if assistant_response:
436
- duration = time.time() - generation_start_time
437
- logger.info(f"Generation successful ({model_id}) in {duration:.2f}s. Length: {len(assistant_response)} chars.")
438
- assistant_response = re.sub(r"^```markdown\n", "", assistant_response).strip()
439
- assistant_response = re.sub(r"\n```$", "", assistant_response).strip()
440
- # Basic length check
441
- if len(assistant_response) < 50:
442
- logger.warning(f"Generated output very short ({len(assistant_response)} chars).")
443
- st.warning("⚠️ Generated output seems very short. Please review.")
444
  return assistant_response
445
- else:
446
- logger.error(f"Failed to extract assistant response. Full output: {results}")
447
- st.error("❌ Failed to parse LLM response structure. Check logs.")
448
- return None
449
-
450
- except torch.cuda.OutOfMemoryError:
451
- logger.error(f"OOM Error during generation with {model_id}!", exc_info=True)
452
- st.error(f"❌ Generation failed: Out of GPU Memory ({model_id}). Try a smaller model, reduce 'Max Generation Tokens', or restart the space.")
453
- clear_gpu_memory() # Attempt to recover
454
- return None
455
- except Exception as e:
456
- logger.error(f"Unhandled error during generation ({model_id}): {e}", exc_info=True)
457
- st.error(f"❌ Unexpected error during generation: {e}")
458
- return None
459
-
460
 
461
  # --- Streamlit App UI ---
462
 
463
- st.set_page_config(layout="wide", page_title="Advanced SEO Content Generator v3")
464
 
465
- # Sidebar Setup
466
  with st.sidebar:
467
  st.header("βš™οΈ Configuration")
 
 
 
468
  selected_model_key = st.selectbox(
469
  "Choose Language Model:",
470
  options=list(MODEL_OPTIONS.keys()),
471
  index=list(MODEL_OPTIONS.keys()).index(DEFAULT_MODEL_KEY),
472
- help="Select AI model. Performance & resource needs vary significantly. Larger models may fail on free tiers."
 
473
  )
474
  selected_model_id = MODEL_OPTIONS[selected_model_key]
475
 
476
- # Button to explicitly load/switch model
477
- if st.button(f"Load/Switch to {selected_model_key}", key="load_model_button"):
478
- with st.spinner(f"Loading {selected_model_key}..."):
479
- load_model(selected_model_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
 
481
- st.markdown("---") # Separator
482
 
483
- with st.expander("Content Settings", expanded=True):
484
- num_results = st.slider("Competitors to Analyze:", min_value=1, max_value=10, value=DEFAULT_NUM_RESULTS, step=1)
 
 
485
  selected_tone = st.selectbox("Content Tone:", options=TONE_OPTIONS, index=TONE_OPTIONS.index("Engaging"))
486
  selected_audience = st.selectbox("Target Audience:", options=AUDIENCE_OPTIONS, index=AUDIENCE_OPTIONS.index("General Audience"))
487
- max_gen_tokens = st.number_input("Max Generation Tokens:", min_value=500, max_value=8192, value=DEFAULT_MAX_GENERATION_TOKENS, step=250, help="Approximate max length of generated content. Higher values need more time/resources.")
488
-
489
- with st.expander("Internal Linking (Optional)"):
490
- website_url = st.text_input("Your Website URL (for link suggestions):", placeholder="https://www.example.com", value=st.session_state.get("last_website_url", ""))
491
- st.session_state.last_website_url = website_url # Store immediately for reuse
492
 
493
- st.markdown("---") # Separator
494
- st.header("ℹ️ Info & Notes")
495
- # Display currently loaded model (if any)
496
- if st.session_state.current_model_pipeline:
497
- st.success(f"βœ… Loaded: `{st.session_state.current_model_id}`")
498
- else:
499
- st.warning("⚠️ No model loaded. Choose and click 'Load/Switch'.")
500
 
 
 
501
  st.info(f"""
 
502
  - **Competitors:** Top {num_results}
503
  - **Max Generation:** ~{max_gen_tokens} tokens
504
  """)
505
  st.warning("""
506
- - **Resource Use:** Models vary greatly in RAM/GPU needs. Large models WILL fail on free tiers. Ensure model is loaded before generating.
507
- - **Scraping:** May fail. Success indicators shown during process.
508
- - **Human Review ESSENTIAL:** AI provides drafts. **Always** review, fact-check, edit, add unique value.
509
- - **Internal Linking:** Suggestions are AI-based guesses of relevant topics; verify and find the actual URLs yourself.
510
  """)
511
- if st.button("Clear All Cached Data", key="clear_all_data"):
512
- reset_session_state()
513
- st.toast("Cleared scraped data and generated content.", icon="πŸ—‘οΈ")
514
-
515
 
516
- # Main App Area
517
- st.title("✨ Advanced SEO Content Generator ✨")
518
- st.markdown(f"Leverage AI to analyze competitors and craft superior content.")
519
 
520
- # User Input
521
- keyword = st.text_input("Enter Primary Target Keyword:", placeholder="e.g., benefits of hydroponic gardening at home", value=st.session_state.get("last_keyword", ""))
 
522
 
523
- col1, col2 = st.columns([2, 1])
524
- with col1:
525
- generate_button = st.button("Analyze & Generate Content", type="primary", key="generate_main_content")
526
 
527
- # Check if model is loaded before allowing generation
528
- if not st.session_state.current_model_pipeline:
529
- st.error("Please load a model from the sidebar before generating content.")
530
- st.stop()
 
 
 
531
 
532
  st.markdown("---")
533
 
534
- # --- Main Workflow ---
535
- if generate_button:
 
 
 
 
536
  if not keyword:
537
  st.warning("⚠️ Please enter a keyword.")
538
  st.stop()
539
 
540
- st.session_state.last_keyword = keyword # Store keyword
541
  ua = get_user_agent() # Ensure user agent is ready
542
- if not ua: st.error("❌ User Agent failed to initialize. Cannot scrape."); st.stop()
543
 
544
- # Reset previous results for this keyword if generating anew
545
  st.session_state.generated_content = ""
546
  st.session_state.internal_link_suggestions = ""
547
 
548
- # --- Step 1: Scrape Competitors ---
549
- # Check if we need to re-scrape (different keyword or no cached data)
550
  if keyword != st.session_state.get('_internal_last_scrape_keyword', None) or not st.session_state.competitor_analysis_text:
551
- logger.info(f"New keyword or no cached data. Starting scrape for '{keyword}'.")
552
- st.session_state.competitor_analysis_text = "" # Clear previous text
553
- st.session_state.scraped_urls = [] # Clear previous URLs
 
554
 
555
  scrape_container = st.container()
556
  with scrape_container:
557
- st.subheader(f"πŸ•ΈοΈ Scraping Top {num_results} Competitors...")
558
- status_area = st.empty() # Placeholder for multi-line status
 
 
559
 
560
  urls = get_top_urls(keyword, num_results)
561
  st.session_state.scraped_urls = urls
562
 
563
  if urls:
564
  all_texts = []
565
- progress_bar = st.progress(0, text="Scraping progress...")
566
- scrape_status_messages = [] # Collect messages
567
-
568
  for i, url in enumerate(urls):
569
- scrape_status_ui = st.empty() # Temporary UI element for each URL status
570
- content = scrape_page_content(url, ua, scrape_status_ui)
571
- if content:
572
- all_texts.append(content)
573
- # Use toast for success, keep warnings/errors in main area if needed
574
- # st.toast(f"Scraped: {url[:50]}...", icon="βœ…")
575
- # Update overall progress
576
- progress_bar.progress((i + 1) / len(urls), text=f"Scraping URL {i+1}/{len(urls)}")
577
- time.sleep(0.1) # UI refresh delay
578
 
579
  st.session_state.competitor_analysis_text = "\n\n --- ARTICLE SEPARATOR --- \n\n".join(all_texts)
580
- st.session_state['_internal_last_scrape_keyword'] = keyword # Mark keyword as scraped
581
 
582
  if st.session_state.competitor_analysis_text:
583
- scrape_container.success(f"βœ… Scraped {len(all_texts)}/{len(urls)} pages. Total analysis text: {len(st.session_state.competitor_analysis_text)} chars.")
584
- logger.info(f"Scraping complete. Extracted {len(st.session_state.competitor_analysis_text)} chars.")
585
  else:
586
  scrape_container.error("❌ Failed to scrape sufficient content. Cannot generate article.")
587
  st.stop()
@@ -589,22 +498,18 @@ if generate_button:
589
  scrape_container.error("❌ Could not retrieve competitor URLs. Cannot proceed.")
590
  st.stop()
591
  else:
592
- st.success(f"βœ”οΈ Using previously scraped data for '{keyword}'. ({len(st.session_state.competitor_analysis_text)} chars from {len(st.session_state.scraped_urls)} URLs).")
593
- logger.info(f"Using cached scrape data for keyword '{keyword}'.")
594
-
595
 
596
  # --- Step 2: Generate Main Content ---
597
- st.subheader("✍️ Generating Main Content...")
598
- generation_status = st.status(f"Generating content with {st.session_state.current_model_id}...")
599
  with generation_status:
600
- st.write(f"**Tone:** {selected_tone}, **Audience:** {selected_audience}")
601
- st.write(f"**Max Tokens:** {max_gen_tokens}")
602
-
603
  gen_prompt = build_content_generation_prompt(
604
  keyword, st.session_state.competitor_analysis_text, selected_tone, selected_audience, st.session_state.current_model_id
605
  )
606
  generated_content = run_llm_generation(st.session_state.current_model_pipeline, gen_prompt, max_gen_tokens)
607
- st.session_state.generated_content = generated_content # Store in state
608
 
609
  if generated_content:
610
  generation_status.update(label="βœ… Content Generation Complete!", state="complete")
@@ -612,37 +517,31 @@ if generate_button:
612
  generation_status.update(label="❌ Content Generation Failed.", state="error")
613
  st.stop() # Stop if main content fails
614
 
615
- # --- Display Generated Content (if available) ---
616
  if st.session_state.generated_content:
617
  st.markdown("---")
618
  st.subheader("πŸ“ Generated SEO Content")
619
  st.markdown(st.session_state.generated_content)
620
- st.text_area("Copyable Markdown:", st.session_state.generated_content, height=400, key="generated_content_area")
621
 
622
- # --- Step 3: Internal Linking (Optional) ---
623
- if website_url:
624
  st.markdown("---")
625
  st.subheader("πŸ”— Internal Linking Suggestions")
626
- if st.button("Suggest Internal Links", key="suggest_links_button"):
627
- if not st.session_state.generated_content:
628
- st.warning("⚠️ Generate content first before suggesting links.")
629
- else:
630
- link_status = st.status("Analyzing content for linking opportunities...")
631
- with link_status:
632
- st.write(f"Analyzing based on website: {website_url}")
633
- link_prompt = build_internal_link_prompt(st.session_state.generated_content, keyword, website_url)
634
- # Use fewer tokens for link suggestions
635
- link_suggestions = run_llm_generation(st.session_state.current_model_pipeline, link_prompt, max_tokens=500)
636
- st.session_state.internal_link_suggestions = link_suggestions
637
-
638
- if link_suggestions:
639
- link_status.update(label="βœ… Link suggestions generated!", state="complete")
640
- else:
641
- link_status.update(label="❌ Failed to generate link suggestions.", state="error")
642
-
643
- # Display suggestions if available
644
  if st.session_state.internal_link_suggestions:
645
  st.markdown(st.session_state.internal_link_suggestions)
646
- st.info("ℹ️ Remember: These are AI suggestions. Find the best matching *actual* URL on your site for each.")
647
  else:
648
- st.info("Provide your website URL in the sidebar under 'Advanced Options -> Internal Linking' to enable link suggestions.")
 
 
9
  import logging
10
  import re
11
  from retrying import retry
12
+ import gc
13
 
14
  # --- Configuration ---
15
+ # Model Options (Ensure keys clearly indicate resource needs)
 
16
  MODEL_OPTIONS = {
17
+ # Lighter Models (More likely to work on free tiers)
18
+ "Mistral-7B-Instruct (Fast, Med RAM)": "mistralai/Mistral-7B-Instruct-v0.2",
19
+ "Gemma-7B-IT (Google, Med RAM)": "google/gemma-7b-it",
20
+ "Phi-3-Mini-4k-Instruct (Microsoft, Small, Good)": "microsoft/Phi-3-mini-4k-instruct", # Requires trust_remote_code
21
+
22
+ # Medium Models (May require upgraded tiers / more RAM/GPU)
23
+ "Llama-3-8B-Instruct (Meta, High Quality, High RAM/GPU)": "meta-llama/Meta-Llama-3-8B-Instruct",
24
+ "Phi-3-Medium-4k-Instruct (Microsoft, Strong, High RAM/GPU)": "microsoft/Phi-3-medium-4k-instruct", # Requires trust_remote_code
25
+ "Qwen1.5-14B-Chat (Alibaba, Strong, High RAM/GPU)": "Qwen/Qwen1.5-14B-Chat",
26
+
27
+ # Larger Models (Very likely require significant paid resources)
28
+ "DeepSeek-Coder-V2-Instruct (DeepSeek, High RAM/GPU)": "deepseek-ai/DeepSeek-Coder-V2-Instruct", # Requires trust_remote_code
29
  }
30
+ DEFAULT_MODEL_KEY = "Mistral-7B-Instruct (Fast, Med RAM)" # Start with a lighter default selection
 
31
 
32
  # Scraping & Generation Defaults
33
+ DEFAULT_NUM_RESULTS = 4 # Reduced default slightly
34
  REQUEST_TIMEOUT = 15
35
+ MAX_COMPETITOR_TEXT_LENGTH = 5500
36
+ DEFAULT_MAX_GENERATION_TOKENS = 2800
37
 
38
+ # Retry settings
39
  RETRY_WAIT_FIXED = 2000
40
  RETRY_STOP_MAX_ATTEMPT = 3
41
 
42
  # Tone & Audience Options
43
+ TONE_OPTIONS = ["Conversational", "Professional", "Authoritative", "Technical", "Friendly", "Engaging", "Educational", "Persuasive"]
44
+ AUDIENCE_OPTIONS = ["Beginners", "General Audience", "Experts", "Professionals (Specific Field)", "Customers", "Students", "Decision Makers"]
45
 
46
  # --- Logging Setup ---
47
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s')
48
  logger = logging.getLogger(__name__)
49
 
50
+ # --- State Management ---
51
+ # Initialize session state keys carefully
52
+ if 'current_model_pipeline' not in st.session_state: st.session_state.current_model_pipeline = None
53
+ if 'current_model_id' not in st.session_state: st.session_state.current_model_id = ""
54
+ # Data related state
55
  if 'scraped_urls' not in st.session_state: st.session_state.scraped_urls = []
56
  if 'competitor_analysis_text' not in st.session_state: st.session_state.competitor_analysis_text = ""
57
  if 'generated_content' not in st.session_state: st.session_state.generated_content = ""
58
  if 'internal_link_suggestions' not in st.session_state: st.session_state.internal_link_suggestions = ""
59
  if 'last_keyword' not in st.session_state: st.session_state.last_keyword = ""
 
60
  if 'last_website_url' not in st.session_state: st.session_state.last_website_url = ""
61
+ if '_internal_last_scrape_keyword' not in st.session_state: st.session_state._internal_last_scrape_keyword = ""
 
62
 
63
 
64
+ # --- Helper Functions ---
65
  def clear_gpu_memory():
66
+ """Attempts to clear GPU memory cache and run garbage collection."""
67
  logger.info("Attempting to clear GPU memory...")
68
  if torch.cuda.is_available():
69
+ try:
70
+ st.session_state.current_model_pipeline = None # Ensure reference is removed FIRST
71
+ gc.collect() # Run Python garbage collection
72
+ torch.cuda.empty_cache() # Tell PyTorch to release cached memory
73
+ gc.collect() # Run GC again
74
+ logger.info("GPU memory cache cleared and garbage collected.")
75
+ st.toast("Cleared GPU memory.", icon="🧹")
76
+ except Exception as e:
77
+ logger.error(f"Error clearing GPU memory: {e}", exc_info=True)
78
+ st.toast(f"Error clearing GPU memory: {e}", icon="❌")
79
  else:
80
  logger.info("No GPU available, skipping memory clearing.")
81
+ st.session_state.current_model_pipeline = None # Still clear the reference
82
+ gc.collect()
83
 
84
+ def reset_app_data():
85
+ """Clears stored scraping and generation results, keeps model loaded."""
86
+ st.session_state.scraped_urls = []
87
+ st.session_state.competitor_analysis_text = ""
88
+ st.session_state.generated_content = ""
89
+ st.session_state.internal_link_suggestions = ""
90
+ st.session_state.last_keyword = ""
91
+ st.session_state._internal_last_scrape_keyword = ""
92
+ logger.info("App data state reset (scraped/generated content).")
93
+ st.toast("Cleared scraped data and generated content.", icon="πŸ—‘οΈ")
94
+
95
+ # --- Model Loading (On Demand) ---
96
+ def load_model(model_id_to_load):
97
+ """Loads the selected model, unloading any previous one."""
98
+ # If the requested model is already loaded, do nothing
99
+ if st.session_state.get('current_model_id') == model_id_to_load and st.session_state.get('current_model_pipeline') is not None:
100
+ logger.info(f"Model {model_id_to_load} is already loaded.")
101
+ st.toast(f"{model_id_to_load} is already loaded.", icon="βœ…")
102
+ return True
103
+
104
+ # Unload previous model if one exists and is different
105
+ if st.session_state.get('current_model_pipeline') is not None:
106
  logger.info(f"Unloading previous model: {st.session_state.current_model_id}")
107
+ st.toast(f"Unloading {st.session_state.current_model_id}...", icon="🧹")
108
+ clear_gpu_memory() # This sets pipeline to None and clears cache
109
+ st.session_state.current_model_id = "" # Clear model ID state
110
 
111
+ # Load the new model
112
+ st.toast(f"Loading {model_id_to_load}... This may take time & RAM/GPU.", icon="⏳")
113
+ logger.info(f"Attempting to load LLM pipeline for model: {model_id_to_load}")
114
  pipeline_instance = None
115
+ success = False
116
  try:
 
 
117
  dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16 if torch.cuda.is_available() else torch.float32
118
  logger.info(f"Using dtype: {dtype}")
119
 
120
+ trust_code_models = [
121
+ "microsoft/Phi-3-mini-4k-instruct",
122
  "microsoft/Phi-3-medium-4k-instruct",
123
  "deepseek-ai/DeepSeek-Coder-V2-Instruct",
124
+ # Add others if needed
125
  ]
126
+ trust_code = model_id_to_load in trust_code_models
127
+ logger.info(f"Trust remote code for {model_id_to_load}: {trust_code}")
128
+
129
+ # Display spinner during the actual loading
130
+ with st.spinner(f"Loading {model_id_to_load} into memory..."):
131
+ pipeline_instance = pipeline(
132
+ "text-generation",
133
+ model=model_id_to_load,
134
+ trust_remote_code=trust_code,
135
+ device_map="auto",
136
+ torch_dtype=dtype,
137
+ )
138
+
139
+ # Handle pad_token
140
+ if pipeline_instance.tokenizer.pad_token_id is None:
141
+ pipeline_instance.tokenizer.pad_token_id = pipeline_instance.tokenizer.eos_token_id
142
+ if hasattr(pipeline_instance.model, 'config'):
143
+ pipeline_instance.model.config.pad_token_id = pipeline_instance.tokenizer.eos_token_id
144
+ logger.warning(f"Set pad_token_id to eos_token_id for {model_id_to_load}")
145
+
146
+ logger.info(f"LLM pipeline loaded successfully for {model_id_to_load}.")
147
  st.session_state.current_model_pipeline = pipeline_instance
148
+ st.session_state.current_model_id = model_id_to_load
149
+ st.toast(f"Model {model_id_to_load} loaded!", icon="βœ…")
150
+ success = True
151
 
152
  except ImportError as e:
153
+ logger.error(f"ImportError loading {model_id_to_load}: {e}. Missing dependency?", exc_info=True)
154
+ st.error(f"Load Error: Missing library for {model_id_to_load}? Check logs. Details: {e}")
 
155
  except Exception as e:
156
+ logger.error(f"Failed to load {model_id_to_load}: {e}", exc_info=True)
157
+ st.error(f"Failed to load {model_id_to_load}. Error: {e}. Check resource limits (RAM/GPU) & logs.")
158
+ clear_gpu_memory() # Attempt to clean up if loading failed
159
+ st.session_state.current_model_id = "" # Ensure state reflects failure
160
+ finally:
161
+ return success # Return status
162
+
163
+ # --- User Agent Caching ---
 
164
  @st.cache_resource
165
  def get_user_agent():
166
+ # (Same as previous version)
167
  logger.info("Initializing FakeUserAgent.")
168
  try:
 
169
  return UserAgent(fallback='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
170
  except Exception as e:
171
  logger.error(f"Failed to initialize FakeUserAgent: {e}", exc_info=True)
172
+ st.error(f"Could not initialize User Agent generator. Error: {e}")
173
  return None
174
 
175
+ # --- Core Functions (Scraping, Prompt Building, Generation Logic) ---
176
+ # These functions (get_top_urls, scrape_page_content, clean_text, fetch_url_content,
177
+ # build_content_generation_prompt, build_internal_link_prompt, run_llm_generation)
178
+ # remain largely the same as the previous version, as they were already quite robust.
179
+ # Ensure `run_llm_generation` correctly uses the pipeline passed to it (which it did).
 
 
 
 
 
 
180
 
181
+ # --- (Include the definitions for the core functions here - unchanged from previous version) ---
182
  @retry(wait_fixed=RETRY_WAIT_FIXED, stop_max_attempt_number=RETRY_STOP_MAX_ATTEMPT,
183
  retry_on_exception=lambda e: isinstance(e, (requests.exceptions.Timeout, requests.exceptions.ConnectionError, requests.exceptions.HTTPError)))
184
  def fetch_url_content(url, headers):
 
185
  logger.info(f"Fetching {url} (Attempt {fetch_url_content.retry.attempt_number+1}/{RETRY_STOP_MAX_ATTEMPT})")
186
  response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
187
  response.raise_for_status()
188
  if 'text/html' not in response.headers.get('Content-Type', ''):
189
+ logger.warning(f"Skipping URL {url} - Not HTML")
190
  return None
 
191
  if len(response.content) > 10 * 1024 * 1024: # 10 MB limit
192
+ logger.warning(f"Skipping URL {url} - Content too large")
193
  return None
194
  return response
195
 
196
  def clean_text(text):
 
 
197
  text = re.sub(r'\s{2,}', ' ', text)
198
  text = re.sub(r'\n+', '\n', text)
 
199
  lines = text.split('\n')
200
  cleaned_lines = []
201
+ min_line_length = 20
202
+ min_words_per_line = 3
203
  skip_phrases = [
204
  'copyright Β©', 'all rights reserved', 'privacy policy', 'terms of use', 'terms and conditions',
205
  'cookie policy', 'subscribe', 'sign up', 'log in', 'advertisement', 'share this', 'related posts',
206
  'leave a reply', 'comment', 'posted on', 'by author', 'tags:', 'categories:', 'follow us', 'read more',
207
+ 'click here', 'learn more', 'next article', 'previous article', 'you may also like', 'related topics'
208
  ]
209
  for line in lines:
210
  stripped_line = line.strip()
211
  lower_line = stripped_line.lower()
 
212
  if len(stripped_line) >= min_line_length and \
213
  len(stripped_line.split()) >= min_words_per_line and \
214
  not any(phrase in lower_line for phrase in skip_phrases):
215
  cleaned_lines.append(stripped_line)
 
216
  text = '\n'.join(cleaned_lines)
217
  return text.strip()
218
 
219
  def scrape_page_content(url, user_agent, scrape_status_ui):
220
+ if not user_agent: logger.error("User Agent missing."); return ""
221
+ headers = {
222
+ 'User-Agent': user_agent.random,
223
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
224
+ 'Accept-Language': 'en-US,en;q=0.5', 'Referer': 'https://www.google.com/',
225
+ 'DNT': '1', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1'
226
+ }
227
  try:
228
  response = fetch_url_content(url, headers)
229
+ if response is None: scrape_status_ui.warning(f"⚠️ Skip/Fail fetch: {url}", icon="🕸️"); return ""
230
+ soup = BeautifulSoup(response.content, 'lxml')
 
 
 
 
 
231
  tags_to_remove = ["script", "style", "nav", "footer", "aside", "form", "header", "noscript", "button", "input", "select", "textarea", "figure", "figcaption", "iframe", "svg", "path", "meta", "link"]
232
  for element in soup(tags_to_remove): element.decompose()
233
  for comment in soup.find_all(string=lambda text: isinstance(text, Comment)): comment.extract()
234
+ main_content = (soup.find('main') or soup.find('article') or soup.find(role='main') or
 
 
235
  soup.find('div', class_=re.compile(r'(content|main|body|post|entry|article)', re.I)) or
236
  soup.find('div', id=re.compile(r'(content|main|body|post|entry|article)', re.I)))
237
  target_soup = main_content if main_content else soup.body
238
+ if not target_soup: logger.warning(f"No body/main: {url}"); scrape_status_ui.warning(f"⚠️ No body/main: {url}", icon="🕸️"); return ""
239
+ texts = target_soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'td', 'th', 'blockquote', 'span'])
 
 
240
  content_parts = []
241
  for elem in texts:
 
242
  if elem.find_parent(tags_to_remove): continue
 
243
  elem_text = elem.get_text(separator=' ', strip=True)
 
244
  if len(elem_text) > 10 and len(elem_text.split()) > 1:
245
+ if elem.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'blockquote', 'tr', 'div']: # Added div for structure
 
246
  content_parts.append(elem_text + "\n")
247
+ else: content_parts.append(elem_text + " ")
 
 
248
  content = "".join(content_parts)
249
+ cleaned_content = clean_text(content)
250
+ if len(cleaned_content) < 150: logger.warning(f"Low content ({len(cleaned_content)} chars): {url}"); scrape_status_ui.warning(f"⚠️ Low content: {url}", icon="πŸ•ΈοΈ")
251
+ else: logger.info(f"Scraped {len(cleaned_content)} chars: {url}"); scrape_status_ui.success(f"βœ… Scraped: {url} ({len(cleaned_content)} chars)", icon="πŸ•ΈοΈ")
252
+ time.sleep(0.6)
 
 
 
 
 
 
253
  return cleaned_content
254
+ except requests.exceptions.RequestException as e: logger.warning(f"Final scrape fail: {url}. Err: {e}"); scrape_status_ui.error(f"❌ Fail scrape: {url} ({e})", icon="🕸️"); return ""
255
+ except Exception as e: logger.error(f"Unexpected scrape error: {url}: {e}", exc_info=True); scrape_status_ui.error(f"❌ Error scraping: {url} (Logs)", icon="πŸ•ΈοΈ"); return ""
 
 
 
 
 
 
 
 
256
 
257
  def get_top_urls(keyword, num_results):
 
258
  logger.info(f"Fetching top {num_results} URLs for keyword: '{keyword}'")
259
  try:
 
260
  urls = list(search(keyword, num_results=num_results, sleep_interval=2.5, lang="en", timeout=15))
261
  logger.info(f"Found URLs: {urls}")
262
+ if not urls: st.warning(f"⚠️ No Google search results found for '{keyword}'."); return []
 
 
263
  return urls
264
  except Exception as e:
265
+ error_message = str(e); logger.error(f"GSearch Error: {error_message}", exc_info=True)
266
+ if "429" in error_message: st.error(f"❌ Google search blocked (429). WAIT before retrying.")
267
+ elif "timed out" in error_message: st.error(f"❌ Google search timed out.")
268
+ else: st.error(f"❌ GSearch Error: {error_message[:100]}...")
 
 
 
 
 
269
  return []
270
 
 
 
 
271
  def build_content_generation_prompt(keyword, competitor_texts, tone, audience, model_id):
272
+ logger.info(f"Build content gen prompt. Tone: {tone}, Audience: {audience}. Comp length: {len(competitor_texts)}")
 
273
  if len(competitor_texts) > MAX_COMPETITOR_TEXT_LENGTH:
274
+ competitor_summary = competitor_texts[:MAX_COMPETITOR_TEXT_LENGTH] + "... [Truncated]"
275
+ logger.warning(f"Comp text truncated.")
276
+ else: competitor_summary = competitor_texts
277
+ system_prompt = f"""You are an expert SEO Content Strategist & world-class Copywriter. Task: Analyze competitor text & generate a significantly superior, comprehensive, user-first article for keyword '{keyword}', targeting '{audience}' audience with '{tone}' tone. Focus on quality, depth, clarity, fulfilling user intent better than competition."""
278
+ user_prompt = f"""**Keyword:** "{keyword}"
279
+ **Audience:** {audience}
280
+ **Tone:** {tone}
281
+ **Objective:** Generate exceptional, SEO-optimized article for "{keyword}" designed to outperform top content via superior value, insights, UX.
282
+ **Competitor Analysis Context (Analyze for topics, depth, strengths, WEAKNESSES/GAPS):**
283
+ --- BEGIN COMPETITOR ---
 
 
 
 
 
 
284
  {competitor_summary}
285
+ --- END COMPETITOR ---
286
+ **Content Gen Instructions:**
287
+ 1. **Value & Depth:** Be demonstrably better. Deeper, clearer, actionable advice, unique perspectives/data, fill gaps. Address user intent exhaustively.
288
+ 2. **User-First & Humanized:** Write for '{audience}' in '{tone}'. Clear, concise, short paras, varied sentences, engaging Qs. Logical flow, readable.
289
+ 3. **Structure (Strict Markdown):** Compelling H2 Title. Engaging Intro (50-150 words): Hook, purpose/value, outline. Logical Sections (H2)/Sub-sections (H3): Descriptive, keyword-aware headings. Readability: Bullets (`* `), Numbered lists (`1. `), **Bold** (strategic). Comprehensive Body: Expand beyond competitors. Strong Conclusion: Summarize takeaways, final insight/CTA.
290
+ 4. **SEO (Natural):** Weave "{keyword}" & LSI terms into title, headings, intro, body, conclusion. Prioritize relevance/clarity over density. NO keyword stuffing.
291
+ 5. **Originality & Credibility:** 100% unique. Use comp text ONLY for analysis. NO plagiarism. Factual accuracy.
292
+ 6. **Negative Constraints:** DO NOT: Rehash competitors; use preambles/sign-offs; use excessive jargon (unless 'Experts'); write long paragraphs; stuff keywords; invent facts.
293
+ **Output:** ONLY the Markdown article, starting with H2 title."""
294
+ messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
295
+ logger.info(f"Content prompt done for {model_id}.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  return messages
297
 
 
 
298
  def build_internal_link_prompt(generated_content, keyword, website_url):
299
+ logger.info(f"Build internal link prompt for URL: {website_url}")
300
+ system_prompt = "You are an SEO assistant specialized in identifying internal linking opportunities."
 
 
 
301
  user_prompt = f"""**Website Base URL:** {website_url}
302
  **Main Topic of Article:** "{keyword}"
303
+ **Task:** Review the article below. Identify 3-5 phrases/sentences for internal links relevant to {website_url}.
 
 
304
  **For each opportunity, provide:**
305
+ 1. Exact anchor text phrase/sentence from article.
306
+ 2. Brief description of the *type* of relevant content needed (e.g., "detailed guide on [sub-topic]", "service page for [service]").
307
+ **IMPORTANT:** Do NOT invent URLs. Describe the *type* of page. Choose natural anchor text. Focus on value. Format as Markdown numbered list.
308
+ **Article Content (Analyze first ~8000 chars):**
309
+ --- BEGIN ARTICLE ---
 
 
 
 
 
 
310
  {generated_content[:8000]}
311
+ --- END ARTICLE ---"""
312
+ messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
 
 
 
 
 
313
  return messages
314
 
 
 
315
  def run_llm_generation(pipe, messages, max_tokens):
316
+ if pipe is None: st.error("❌ LLM Pipeline missing."); return None
317
+ model_id = pipe.model.name_or_path
318
+ logger.info(f"Running generation: {model_id}. Max tokens: {max_tokens}.")
319
+ start_time = time.time()
 
 
 
 
 
320
  try:
321
+ gen_args = {"max_new_tokens": max_tokens, "temperature": 0.7, "top_p": 0.95, "top_k": 40,
322
+ "do_sample": True, "pad_token_id": pipe.tokenizer.eos_token_id, "eos_token_id": pipe.tokenizer.eos_token_id}
323
+ logger.info(f"Gen args: {gen_args}")
324
+ results = pipe(messages, **gen_args)
325
+ # --- Robust Extraction ---
 
 
 
 
 
 
 
 
 
 
 
 
326
  assistant_response = None
327
  if results and results[0] and 'generated_text' in results[0]:
328
  output_data = results[0]['generated_text']
329
+ if isinstance(output_data, list): assistant_message = next((msg['content'] for msg in reversed(output_data) if msg['role'] == 'assistant'), None); assistant_response = assistant_message
330
+ elif isinstance(output_data, str):
 
 
 
331
  last_prompt_content = messages[-1]['content']
332
  last_prompt_index = output_data.rfind(last_prompt_content)
333
+ if last_prompt_index != -1: potential_response = output_data[last_prompt_index + len(last_prompt_content):].strip()
334
+ else: potential_response = output_data
 
 
 
335
  assistant_response = re.sub(r"^(assistant|ASSISTANT|</s>|<\|im_end\|>|<\|assistant\|>)\s*[:\n]*", "", potential_response, flags=re.IGNORECASE | re.DOTALL).strip()
336
+ else: logger.error(f"Unexpected output format: {type(output_data)}")
337
  else: logger.error(f"Unexpected LLM output structure: {results}")
338
+ # --- Validation ---
 
 
339
  if assistant_response:
340
+ duration = time.time() - start_time; logger.info(f"Gen success ({model_id}) {duration:.2f}s. Len: {len(assistant_response)}.")
341
+ assistant_response = re.sub(r"^```markdown\n", "", assistant_response).strip(); assistant_response = re.sub(r"\n```$", "", assistant_response).strip()
342
+ if len(assistant_response) < 30: logger.warning(f"Gen output very short ({len(assistant_response)})."); st.warning("⚠️ Gen output very short.")
 
 
 
 
 
343
  return assistant_response
344
+ else: logger.error(f"Failed parse assistant response. Output: {results}"); st.error("❌ Failed parse LLM response. Check logs."); return None
345
+ except torch.cuda.OutOfMemoryError: logger.error(f"OOM Error ({model_id})!", exc_info=True); st.error(f"❌ OOM Error ({model_id}). Try smaller model/less tokens/restart."); clear_gpu_memory(); return None
346
+ except Exception as e: logger.error(f"Unhandled gen error ({model_id}): {e}", exc_info=True); st.error(f"❌ Unexpected gen error: {e}"); return None
 
 
 
 
 
 
 
 
 
 
 
 
347
 
348
  # --- Streamlit App UI ---
349
 
350
+ st.set_page_config(layout="wide", page_title="On-Demand SEO Content Gen")
351
 
352
+ # --- Sidebar ---
353
  with st.sidebar:
354
  st.header("βš™οΈ Configuration")
355
+
356
+ # Model Selection & Loading Area
357
+ st.subheader("1. Select & Load Model")
358
  selected_model_key = st.selectbox(
359
  "Choose Language Model:",
360
  options=list(MODEL_OPTIONS.keys()),
361
  index=list(MODEL_OPTIONS.keys()).index(DEFAULT_MODEL_KEY),
362
+ key="model_selector", # Key for potential state access
363
+ help="Choose the AI model. Performance and resource needs vary; the model must be loaded before generating."
364
  )
365
  selected_model_id = MODEL_OPTIONS[selected_model_key]
366
 
367
+ # Display current status and load button
368
+ load_button_placeholder = st.empty() # Placeholder for dynamic button text/state
369
+ model_status_placeholder = st.empty() # Placeholder for status message
370
+
371
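+ # Three possible states: the selected model is already loaded, a different model is loaded, or nothing is loaded yet.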
+ if st.session_state.get('current_model_id') == selected_model_id and st.session_state.get('current_model_pipeline') is not None:
372
+ model_status_placeholder.success(f"βœ… Loaded: `{selected_model_id}`")
373
+ load_button_text = f"Reload {selected_model_key}"  # Selected model is already loaded
374
+ elif st.session_state.get('current_model_pipeline') is not None:
375
+ model_status_placeholder.warning(f"⚠️ Loaded: `{st.session_state.current_model_id}`\n\nSelected: `{selected_model_id}`")
376
+ load_button_text = f"Unload Current & Load {selected_model_key}"
377
+ else:
378
+ model_status_placeholder.info("ℹ️ No model loaded.")
379
+ load_button_text = f"Load Selected: {selected_model_key}"
380
+
381
+ if load_button_placeholder.button(load_button_text, key="load_model"):
382
+ load_model(selected_model_id)
383
+ # Rerun to update status placeholders immediately after load attempt
384
+ st.rerun()
385
 
386
+ st.markdown("---")
387
 
388
+ # Content Settings
389
+ st.subheader("2. Content Settings")
390
+ with st.expander("Adjust Content Parameters", expanded=False):
391
+ num_results = st.slider("Competitors to Analyze:", min_value=1, max_value=8, value=DEFAULT_NUM_RESULTS, step=1)
392
  selected_tone = st.selectbox("Content Tone:", options=TONE_OPTIONS, index=TONE_OPTIONS.index("Engaging"))
393
  selected_audience = st.selectbox("Target Audience:", options=AUDIENCE_OPTIONS, index=AUDIENCE_OPTIONS.index("General Audience"))
394
+ max_gen_tokens = st.number_input("Max Generation Tokens:", min_value=500, max_value=8192, value=DEFAULT_MAX_GENERATION_TOKENS, step=100)
 
 
 
 
395
 
396
+ # Internal Linking
397
+ st.subheader("3. Internal Linking (Optional)")
398
+ with st.expander("Configure Link Suggestions", expanded=False):
399
+ website_url = st.text_input("Your Website URL:", placeholder="https://www.example.com", value=st.session_state.get("last_website_url", ""), key="website_url_input")
400
+ # Update state immediately on change if needed, or just read before use
401
+ st.session_state.last_website_url = website_url
 
402
 
403
+ st.markdown("---")
404
+ st.header("ℹ️ App Info & Actions")
405
  st.info(f"""
406
+ - **Status:** {'Model Loaded' if st.session_state.current_model_pipeline else 'No Model Loaded'}
407
  - **Competitors:** Top {num_results}
408
  - **Max Generation:** ~{max_gen_tokens} tokens
409
  """)
410
  st.warning("""
411
+ - **Load Model First:** Select a model and click 'Load' before generating.
412
+ - **Resource Use:** Models need significant RAM/GPU. Loading WILL fail if resources are insufficient.
413
+ - **Review Output:** AI provides drafts. ALWAYS review, edit, and fact-check before publishing.
 
414
  """)
415
+ if st.button("Clear Scraped/Generated Data", key="clear_data"):
416
+ reset_app_data()
 
 
417
 
418
+ # --- Main App Area ---
419
+ st.title("✨ On-Demand SEO Content Generator ✨")
420
+ st.markdown("Load your chosen AI model, then generate SEO-focused content.")
421
 
422
+ # User Input Area
423
+ st.subheader("Keyword & Generation")
424
+ keyword = st.text_input("Enter Primary Target Keyword:", placeholder="e.g., vertical hydroponics guide", value=st.session_state.get("last_keyword", ""), key="keyword_input")
425
 
426
+ # Disable button if model not loaded
427
+ generate_button_disabled = st.session_state.current_model_pipeline is None
428
+ generate_button_help = "Load a model from the sidebar first." if generate_button_disabled else "Analyze competitors and generate article."
429
 
430
+ analyze_button = st.button(
431
+ "Analyze Competitors & Generate Content",
432
+ type="primary",
433
+ key="generate_button",
434
+ disabled=generate_button_disabled,
435
+ help=generate_button_help
436
+ )
437
 
438
  st.markdown("---")
439
 
440
+ # --- Main Workflow Triggered by Button ---
441
+ if analyze_button:
442
+ # Double check model is loaded (though button should be disabled)
443
+ if not st.session_state.current_model_pipeline:
444
+ st.error("❌ Cannot generate: No model loaded. Please use the sidebar.")
445
+ st.stop()
446
  if not keyword:
447
  st.warning("⚠️ Please enter a keyword.")
448
  st.stop()
449
 
450
+ st.session_state.last_keyword = keyword # Store keyword for potential reuse
451
  ua = get_user_agent() # Ensure user agent is ready
452
+ if not ua: st.error("❌ Could not initialize a User-Agent. Cannot scrape."); st.stop()
453
 
454
+ # Reset previous generation results for this run
455
  st.session_state.generated_content = ""
456
  st.session_state.internal_link_suggestions = ""
457
 
458
+ # --- Step 1: Scrape Competitors (with status updates) ---
459
+ # Check if scrape needed
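+ # Re-scrape only when the keyword changed or there is no cached competitor text; otherwise reuse the previous scrape.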
460
  if keyword != st.session_state.get('_internal_last_scrape_keyword', None) or not st.session_state.competitor_analysis_text:
461
+ logger.info(f"Scraping needed for '{keyword}'.")
462
+ st.session_state.competitor_analysis_text = "" # Clear old text
463
+ st.session_state.scraped_urls = []
464
+ st.session_state['_internal_last_scrape_keyword'] = "" # Reset marker until success
465
 
466
  scrape_container = st.container()
467
  with scrape_container:
468
+ st.info(f"πŸ•ΈοΈ Fetching URLs and Scraping Top {num_results} Competitors...")
469
+ progress_text = "Scraping progress..."
470
+ scrape_progress_bar = st.progress(0, text=progress_text)
471
+ status_area = st.container() # Use container for multiple status lines
472
 
473
  urls = get_top_urls(keyword, num_results)
474
  st.session_state.scraped_urls = urls
475
 
476
  if urls:
477
  all_texts = []
478
+ scraped_count = 0
 
 
479
  for i, url in enumerate(urls):
480
+ with status_area: # Show status within the designated area
481
+ scrape_status_ui = st.empty() # Placeholder for single URL status
482
+ content = scrape_page_content(url, ua, scrape_status_ui)
483
+ if content:
484
+ all_texts.append(content)
485
+ scraped_count += 1
486
+ scrape_progress_bar.progress((i + 1) / len(urls), text=f"Processed URL {i+1}/{len(urls)}...")
487
+ time.sleep(0.1) # UI update breather
 
488
 
489
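  # Join competitor texts with an explicit separator so the generation prompt can distinguish individual articles.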
  st.session_state.competitor_analysis_text = "\n\n --- ARTICLE SEPARATOR --- \n\n".join(all_texts)
490
+ st.session_state['_internal_last_scrape_keyword'] = keyword # Mark scrape success for this keyword
491
 
492
  if st.session_state.competitor_analysis_text:
493
+ scrape_container.success(f"βœ… Scraped {scraped_count}/{len(urls)} pages. Analysis text: {len(st.session_state.competitor_analysis_text)} chars.")
 
494
  else:
495
  scrape_container.error("❌ Failed to scrape sufficient content. Cannot generate article.")
496
  st.stop()
  else:
498
  scrape_container.error("❌ Could not retrieve competitor URLs. Cannot proceed.")
499
  st.stop()
500
  else:
501
+ st.success(f"βœ”οΈ Using previously scraped data for '{keyword}'. ({len(st.session_state.competitor_analysis_text)} chars).")
 
 
502
 
503
  # --- Step 2: Generate Main Content ---
504
+ st.info(f"✍️ Generating Content with {st.session_state.current_model_id}...")
505
+ generation_status = st.status("Sending request to LLM...")
506
  with generation_status:
507
+ st.write(f"**Tone:** {selected_tone}, **Audience:** {selected_audience}, **Max Tokens:** {max_gen_tokens}")
 
 
508
  gen_prompt = build_content_generation_prompt(
509
  keyword, st.session_state.competitor_analysis_text, selected_tone, selected_audience, st.session_state.current_model_id
510
  )
511
  generated_content = run_llm_generation(st.session_state.current_model_pipeline, gen_prompt, max_gen_tokens)
512
+ st.session_state.generated_content = generated_content
513
 
514
  if generated_content:
515
  generation_status.update(label="βœ… Content Generation Complete!", state="complete")
  else:
517
  generation_status.update(label="❌ Content Generation Failed.", state="error")
518
  st.stop() # Stop if main content fails
519
 
520
+ # --- Display Outputs (Outside the button click conditional) ---
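+ # Rendering from session state keeps results visible across Streamlit reruns (e.g., after clicking other buttons).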
521
  if st.session_state.generated_content:
522
  st.markdown("---")
523
  st.subheader("πŸ“ Generated SEO Content")
524
  st.markdown(st.session_state.generated_content)
525
+ st.text_area("Copyable Markdown:", st.session_state.generated_content, height=400, key="generated_content_area_display")
526
 
527
+ # --- Internal Linking Section ---
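+ # Link suggestions use a separate, shorter LLM call and are stored in session state alongside the article.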
528
+ if st.session_state.last_website_url: # Only show if URL was provided
529
  st.markdown("---")
530
  st.subheader("πŸ”— Internal Linking Suggestions")
531
+ if st.button("Suggest Internal Links", key="suggest_links_button_display"):
532
+ link_status = st.status(f"Analyzing content for link opportunities ({st.session_state.current_model_id})...")
533
+ with link_status:
534
+ st.write(f"Website context: {st.session_state.last_website_url}")
535
+ link_prompt = build_internal_link_prompt(st.session_state.generated_content, keyword, st.session_state.last_website_url)
536
+ link_suggestions = run_llm_generation(st.session_state.current_model_pipeline, link_prompt, max_tokens=500) # Use fewer tokens
537
+ st.session_state.internal_link_suggestions = link_suggestions
538
+ if link_suggestions: link_status.update(label="βœ… Link suggestions generated!", state="complete")
539
+ else: link_status.update(label="❌ Failed to generate link suggestions.", state="error")
540
+
541
+ # Display suggestions if they exist in state
542
  if st.session_state.internal_link_suggestions:
543
  st.markdown(st.session_state.internal_link_suggestions)
544
+ st.info("ℹ️ AI suggestions only. Verify relevance and find actual URLs on your site.")
545
  else:
546
+ st.markdown("---")
547
+ st.info("Provide your website URL in the sidebar to enable internal link suggestions after generating content.")