garyuzair committed on
Commit
c0388eb
·
verified ·
1 Parent(s): 88f139c

Update app.py

Files changed (1)
  1. app.py +312 -413
app.py CHANGED
@@ -9,579 +9,488 @@ import time
9
  import logging
10
  import re
11
  from retrying import retry
12
- import gc # Garbage collector for potentially cleaning up GPU memory
13
 
14
  # --- Configuration ---
15
-
16
- # Model Options - Added more models with notes on resource needs
17
  MODEL_OPTIONS = {
18
- # Smaller/Faster Models
19
- "Mistral-7B-Instruct (Fast, Good Quality, Med RAM)": "mistralai/Mistral-7B-Instruct-v0.2",
20
- "Gemma-7B (Google, Good Quality, Med RAM)": "google/gemma-7b-it", # Instruct-tuned version
21
- "Llama-3-8B (Meta, Very Good Quality, High RAM)": "meta-llama/Meta-Llama-3-8B-Instruct",
22
- "Phi-3-Medium (Microsoft, Strong Reasoning, Med RAM)": "microsoft/Phi-3-medium-4k-instruct", # Needs trust_remote_code
23
-
24
- # Larger/Slower Models (Likely require significant resources / paid tiers)
25
- "DeepSeek-Coder-V2 (DeepSeek, Code/General, High RAM/GPU)": "deepseek-ai/DeepSeek-Coder-V2-Instruct", # Example: Using Coder V2 Instruct variant
26
- "Qwen1.5-14B-Chat (Alibaba, Strong General, High RAM/GPU)": "Qwen/Qwen1.5-14B-Chat",
27
- # Commenting out extremely large models unlikely to run easily:
28
- # "Qwen3-235B-A22B (Very Large, EXPERIMENTAL)": "Qwen/Qwen3-235B-A22B", # Extremely large, needs special setup
 
29
  }
30
- # Default to a reasonably performant model
31
- DEFAULT_MODEL_KEY = "Mistral-7B-Instruct (Fast, Good Quality, Med RAM)"
32
 
33
  # Scraping & Generation Defaults
34
- DEFAULT_NUM_RESULTS = 5
35
  REQUEST_TIMEOUT = 15
36
- MAX_COMPETITOR_TEXT_LENGTH = 6000 # Increased slightly more, but monitor
37
- DEFAULT_MAX_GENERATION_TOKENS = 3000 # Increased default target
38
 
39
- # Retry settings for scraping
40
  RETRY_WAIT_FIXED = 2000
41
  RETRY_STOP_MAX_ATTEMPT = 3
42
 
43
  # Tone & Audience Options
44
- TONE_OPTIONS = ["Conversational", "Professional", "Authoritative", "Technical", "Friendly", "Engaging", "Educational"]
45
- AUDIENCE_OPTIONS = ["Beginners", "General Audience", "Experts", "Professionals (Specific Field)", "Customers", "Students"]
46
 
47
  # --- Logging Setup ---
48
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s')
49
  logger = logging.getLogger(__name__)
50
 
51
- # --- Caching & State ---
52
- # Initialize session state more comprehensively
 
 
 
53
  if 'scraped_urls' not in st.session_state: st.session_state.scraped_urls = []
54
  if 'competitor_analysis_text' not in st.session_state: st.session_state.competitor_analysis_text = ""
55
  if 'generated_content' not in st.session_state: st.session_state.generated_content = ""
56
  if 'internal_link_suggestions' not in st.session_state: st.session_state.internal_link_suggestions = ""
57
  if 'last_keyword' not in st.session_state: st.session_state.last_keyword = ""
58
- if 'last_model_id' not in st.session_state: st.session_state.last_model_id = ""
59
  if 'last_website_url' not in st.session_state: st.session_state.last_website_url = ""
60
- if 'current_model_pipeline' not in st.session_state: st.session_state.current_model_pipeline = None
61
- if 'current_model_id' not in st.session_state: st.session_state.current_model_id = ""
62
 
63
 
64
- # Function to explicitly clear GPU memory
65
  def clear_gpu_memory():
 
66
  logger.info("Attempting to clear GPU memory...")
67
  if torch.cuda.is_available():
68
- torch.cuda.empty_cache()
69
- gc.collect()
70
- logger.info("GPU memory cache cleared and garbage collected.")
 
 
 
 
 
 
 
71
  else:
72
  logger.info("No GPU available, skipping memory clearing.")
 
 
73
 
74
-
75
- # Modified caching: Load models into session state instead of cache_resource
76
- # This allows unloading previous models when switching.
77
- def load_model(model_id):
78
- """Loads the selected model pipeline into session state, unloading the previous one."""
79
- if st.session_state.current_model_id == model_id and st.session_state.current_model_pipeline is not None:
80
- logger.info(f"Model {model_id} is already loaded.")
81
- return st.session_state.current_model_pipeline
82
-
83
- # Unload previous model if different
84
- if st.session_state.current_model_pipeline is not None:
 
 
 
 
 
 
 
 
 
 
 
85
  logger.info(f"Unloading previous model: {st.session_state.current_model_id}")
86
- st.session_state.current_model_pipeline = None # Remove reference
87
- clear_gpu_memory() # Attempt to free memory
88
- st.toast(f"Unloaded previous model: {st.session_state.current_model_id}", icon="🧹")
89
 
90
- st.toast(f"Loading {model_id}... This may take time and significant RAM/GPU.", icon="⏳")
91
- logger.info(f"Attempting to load LLM pipeline for model: {model_id}")
 
92
  pipeline_instance = None
 
93
  try:
94
- # Determine torch_dtype based on availability and model needs
95
- # Use bfloat16 if available for better performance on compatible GPUs
96
  dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16 if torch.cuda.is_available() else torch.float32
97
  logger.info(f"Using dtype: {dtype}")
98
 
99
- # Models requiring trust_remote_code
100
- trust_code = model_id in [
101
  "microsoft/Phi-3-medium-4k-instruct",
102
  "deepseek-ai/DeepSeek-Coder-V2-Instruct",
103
- # Add any other models known to require it
104
  ]
105
- logger.info(f"Trust remote code for {model_id}: {trust_code}")
106
-
107
- pipeline_instance = pipeline(
108
- "text-generation",
109
- model=model_id,
110
- trust_remote_code=trust_code,
111
- device_map="auto",
112
- torch_dtype=dtype,
113
- # Consider adding quantization for very large models if needed, e.g.:
114
- # load_in_8bit=True # Requires bitsandbytes
115
- )
116
-
117
- # Ensure tokenizer has pad_token (important for generation)
118
- if pipeline_instance.tokenizer.pad_token_id is None:
119
- pipeline_instance.tokenizer.pad_token_id = pipeline_instance.tokenizer.eos_token_id
120
- pipeline_instance.model.config.pad_token_id = pipeline_instance.tokenizer.eos_token_id # Also set in config
121
- logger.warning(f"Set pad_token_id to eos_token_id ({pipeline_instance.tokenizer.eos_token_id}) for model {model_id}")
122
-
123
- logger.info(f"LLM pipeline loaded successfully for {model_id}.")
 
 
124
  st.session_state.current_model_pipeline = pipeline_instance
125
- st.session_state.current_model_id = model_id
126
- st.toast(f"Model {model_id} loaded successfully!", icon="βœ…")
127
- return pipeline_instance
128
 
129
  except ImportError as e:
130
- logger.error(f"ImportError loading pipeline for {model_id}: {e}. Missing dependencies?", exc_info=True)
131
- st.error(f"Error loading model {model_id}. Required library missing? Check logs. Error: {e}")
132
- return None
133
  except Exception as e:
134
- logger.error(f"Error loading LLM pipeline for {model_id}: {e}", exc_info=True)
135
- st.error(f"Failed to load {model_id}. Error: {e}. Check resource limits (RAM/GPU) & logs.")
136
- # Clear potentially partially loaded state
137
- st.session_state.current_model_pipeline = None
138
- st.session_state.current_model_id = ""
139
- clear_gpu_memory()
140
- return None
141
-
142
- # User Agent Caching (can remain cache_resource)
143
  @st.cache_resource
144
  def get_user_agent():
 
145
  logger.info("Initializing FakeUserAgent.")
146
  try:
147
- # Handle potential issues with finding data files for fake_useragent
148
  return UserAgent(fallback='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
149
  except Exception as e:
150
  logger.error(f"Failed to initialize FakeUserAgent: {e}", exc_info=True)
151
- st.error(f"Could not initialize User Agent generator. Scraping might fail. Error: {e}")
152
  return None
153
 
154
- # --- Core Functions (Scraping, Content Generation - Minimal changes needed) ---
155
-
156
- def reset_session_state():
157
- """Clears all generated and scraped data from session state."""
158
- st.session_state.scraped_urls = []
159
- st.session_state.competitor_analysis_text = ""
160
- st.session_state.generated_content = ""
161
- st.session_state.internal_link_suggestions = ""
162
- st.session_state.last_keyword = ""
163
- # Don't reset model pipeline here, only data
164
- logger.info("Session state data reset.")
165
 
 
166
  @retry(wait_fixed=RETRY_WAIT_FIXED, stop_max_attempt_number=RETRY_STOP_MAX_ATTEMPT,
167
  retry_on_exception=lambda e: isinstance(e, (requests.exceptions.Timeout, requests.exceptions.ConnectionError, requests.exceptions.HTTPError)))
168
  def fetch_url_content(url, headers):
169
- """Fetches content for a single URL with retries for specific errors."""
170
  logger.info(f"Fetching {url} (Attempt {fetch_url_content.retry.attempt_number+1}/{RETRY_STOP_MAX_ATTEMPT})")
171
  response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
172
  response.raise_for_status()
173
  if 'text/html' not in response.headers.get('Content-Type', ''):
174
- logger.warning(f"Skipping URL {url} - Content-Type is not HTML ({response.headers.get('Content-Type')})")
175
  return None
176
- # Check for excessively large pages (potential trap or non-article content)
177
  if len(response.content) > 10 * 1024 * 1024: # 10 MB limit
178
- logger.warning(f"Skipping URL {url} - Content too large ({len(response.content)} bytes)")
179
  return None
180
  return response
181
 
182
  def clean_text(text):
183
- """Enhanced text cleaning."""
184
- # Remove multiple spaces and newlines
185
  text = re.sub(r'\s{2,}', ' ', text)
186
  text = re.sub(r'\n+', '\n', text)
187
- # Remove lines that are likely boilerplate/navigation/ads more aggressively
188
  lines = text.split('\n')
189
  cleaned_lines = []
190
- min_line_length = 20 # Heuristic: Lines shorter than this are often noise
191
- min_words_per_line = 3 # Heuristic: Lines with few words are often noise
192
  skip_phrases = [
193
  'copyright Β©', 'all rights reserved', 'privacy policy', 'terms of use', 'terms and conditions',
194
  'cookie policy', 'subscribe', 'sign up', 'log in', 'advertisement', 'share this', 'related posts',
195
  'leave a reply', 'comment', 'posted on', 'by author', 'tags:', 'categories:', 'follow us', 'read more',
196
- 'click here', 'learn more', 'next article', 'previous article'
197
  ]
198
  for line in lines:
199
  stripped_line = line.strip()
200
  lower_line = stripped_line.lower()
201
- # Check length, word count, and if it contains skip phrases
202
  if len(stripped_line) >= min_line_length and \
203
  len(stripped_line.split()) >= min_words_per_line and \
204
  not any(phrase in lower_line for phrase in skip_phrases):
205
  cleaned_lines.append(stripped_line)
206
-
207
  text = '\n'.join(cleaned_lines)
208
  return text.strip()
209
 
210
  def scrape_page_content(url, user_agent, scrape_status_ui):
211
- """Scrapes, cleans, and extracts relevant text content with improved logic and retries."""
212
- # (Code similar to previous version, with enhanced cleaning and error logging)
213
- if not user_agent: logger.error("User Agent generator not available."); return ""
214
- headers = { /* ... headers ... */ } # Keep previous headers
 
 
 
215
  try:
216
  response = fetch_url_content(url, headers)
217
- if response is None:
218
- scrape_status_ui.warning(f"⚠️ Skip/Fail fetch: {url}", icon="πŸ•ΈοΈ")
219
- return ""
220
-
221
- soup = BeautifulSoup(response.content, 'lxml') # Use lxml for potentially faster parsing
222
-
223
- # --- Enhanced Extraction & Cleaning ---
224
  tags_to_remove = ["script", "style", "nav", "footer", "aside", "form", "header", "noscript", "button", "input", "select", "textarea", "figure", "figcaption", "iframe", "svg", "path", "meta", "link"]
225
  for element in soup(tags_to_remove): element.decompose()
226
  for comment in soup.find_all(string=lambda text: isinstance(text, Comment)): comment.extract()
227
-
228
- main_content = (soup.find('main') or soup.find('article') or
229
- soup.find(role='main') or
230
  soup.find('div', class_=re.compile(r'(content|main|body|post|entry|article)', re.I)) or
231
  soup.find('div', id=re.compile(r'(content|main|body|post|entry|article)', re.I)))
232
  target_soup = main_content if main_content else soup.body
233
-
234
- if not target_soup: logger.warning(f"No body/main found: {url}"); scrape_status_ui.warning(f"⚠️ No body/main: {url}", icon="🕸️"); return ""
235
-
236
- texts = target_soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'td', 'th', 'blockquote', 'span']) # Added span, sometimes used for content
237
  content_parts = []
238
  for elem in texts:
239
- # Avoid extracting text from elements likely inside removed sections (double check)
240
  if elem.find_parent(tags_to_remove): continue
241
- # Get text, strip extra whitespace, join parts if nested
242
  elem_text = elem.get_text(separator=' ', strip=True)
243
- # Filter out short/noisy text elements
244
  if len(elem_text) > 10 and len(elem_text.split()) > 1:
245
- # Add newline after block elements for structure
246
- if elem.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'blockquote', 'tr']:
247
  content_parts.append(elem_text + "\n")
248
- else:
249
- content_parts.append(elem_text + " ")
250
-
251
  content = "".join(content_parts)
252
- cleaned_content = clean_text(content) # Apply enhanced cleaning
253
-
254
- if len(cleaned_content) < 150: # Increased threshold for meaningful content
255
- logger.warning(f"Low content ({len(cleaned_content)} chars): {url}")
256
- scrape_status_ui.warning(f"⚠️ Low content: {url}", icon="πŸ•ΈοΈ")
257
- else:
258
- logger.info(f"Scraped {len(cleaned_content)} chars: {url}")
259
- scrape_status_ui.success(f"βœ… Scraped: {url} ({len(cleaned_content)} chars)", icon="πŸ•ΈοΈ")
260
-
261
- time.sleep(0.6) # Slightly increased delay
262
  return cleaned_content
263
-
264
- except requests.exceptions.RequestException as e:
265
- logger.warning(f"Final scrape attempt failed: {url}. Error: {e}")
266
- scrape_status_ui.error(f"❌ Fail scrape: {url} ({e})", icon="πŸ•ΈοΈ")
267
- return ""
268
- except Exception as e:
269
- logger.error(f"Unexpected scrape error: {url}: {e}", exc_info=True)
270
- scrape_status_ui.error(f"❌ Error scraping: {url} (Check logs)", icon="πŸ•ΈοΈ")
271
- return ""
272
-
273
 
274
  def get_top_urls(keyword, num_results):
275
- # (Function remains the same as previous version - already robust)
276
  logger.info(f"Fetching top {num_results} URLs for keyword: '{keyword}'")
277
  try:
278
- # Use a longer timeout for the search itself
279
  urls = list(search(keyword, num_results=num_results, sleep_interval=2.5, lang="en", timeout=15))
280
  logger.info(f"Found URLs: {urls}")
281
- if not urls:
282
- st.warning(f"⚠️ No Google search results found for '{keyword}'. Try a different keyword?")
283
- return []
284
  return urls
285
  except Exception as e:
286
- error_message = str(e)
287
- logger.error(f"Error fetching Google search results for '{keyword}': {error_message}", exc_info=True)
288
- # Specific error handling...
289
- if "429" in error_message or "Too Many Requests" in error_message:
290
- st.error(f"❌ Google search blocked (Error 429). WAIT before retrying.")
291
- elif "timed out" in error_message:
292
- st.error(f"❌ Google search request timed out.")
293
- else:
294
- st.error(f"❌ Failed to fetch Google results. Error: {error_message[:100]}...") # Truncate long errors
295
  return []
296
 
297
-
298
- # --- Prompt Building Functions ---
299
-
300
  def build_content_generation_prompt(keyword, competitor_texts, tone, audience, model_id):
301
- """Builds the main content generation prompt (incorporates previous best practices)."""
302
- logger.info(f"Building content gen prompt. Tone: {tone}, Audience: {audience}. Competitor length: {len(competitor_texts)}")
303
  if len(competitor_texts) > MAX_COMPETITOR_TEXT_LENGTH:
304
- competitor_summary = competitor_texts[:MAX_COMPETITOR_TEXT_LENGTH] + "... [Content Truncated]"
305
- logger.warning(f"Competitor text truncated to {MAX_COMPETITOR_TEXT_LENGTH} chars.")
306
- else:
307
- competitor_summary = competitor_texts
308
-
309
- # System prompt can be tailored slightly if needed, but a generic strong one works well
310
- system_prompt = f"""You are an expert SEO Content Strategist and world-class Copywriter. Your task is to analyze competitor text and generate a significantly superior, comprehensive, user-first article for the keyword '{keyword}', targeting a '{audience}' audience with a '{tone}' tone. Focus on quality, depth, clarity, and fulfilling user intent better than the competition."""
311
-
312
- user_prompt = f"""**Primary Keyword:** "{keyword}"
313
- **Target Audience:** {audience}
314
- **Desired Tone:** {tone}
315
-
316
- **Objective:** Generate an exceptional, SEO-optimized article for "{keyword}" designed to outperform the current top-ranking content by providing substantially more value, unique insights, and a better user experience.
317
-
318
- **Competitor Analysis Context (Analyze this text for topics, depth, strengths, and weaknesses/gaps):**
319
- --- BEGIN COMPETITOR CONTENT ---
320
  {competitor_summary}
321
- --- END COMPETITOR CONTENT ---
322
-
323
- **Content Generation Instructions:**
324
-
325
- 1. **Outperform & Add Value:** Create content that is clearly superior to the competitor examples. Go deeper, explain concepts more clearly, provide actionable advice, offer unique perspectives or data, and fill identified content gaps. Address the core user intent behind "{keyword}" comprehensively.
326
- 2. **User-First & Humanized:** Write for the '{audience}' reader in the specified '{tone}'. Use clear, concise language, short paragraphs, varied sentence structure, and potentially engaging questions. Ensure logical flow and high readability.
327
- 3. **Structure (Strict Markdown):**
328
- * Compelling H2 Title (related to "{keyword}").
329
- * Engaging Introduction (50-150 words): Hook reader, state purpose/value, outline content.
330
- * Logical Sections (H2) & Sub-sections (H3): Use descriptive, keyword-aware headings.
331
- * Readability Enhancers: Use bullet points (`* ` or `- `), numbered lists (`1. `), and **bold text** strategically for emphasis.
332
- * Comprehensive Body: Cover all essential aspects, expanding beyond competitor content.
333
- * Strong Conclusion: Summarize key takeaways, provide final insight or call-to-action (if appropriate).
334
- 4. **SEO Integration (Natural):** Seamlessly integrate "{keyword}" and related semantic terms (LSI) into title, headings, intro, body, conclusion. Prioritize topical relevance and natural language over density. Avoid keyword stuffing.
335
- 5. **Originality & Credibility:** Generate 100% unique content. Use competitor text ONLY for analysis. Do NOT plagiarize. Ensure factual accuracy.
336
- 6. **Negative Constraints (DO NOT):** Do not rehash competitors; include preambles/sign-offs; use excessive jargon (unless for 'Experts'); write long paragraphs; stuff keywords; invent facts.
337
-
338
- **Output:** Deliver ONLY the generated Markdown article, starting directly with the H2 title.
339
- """
340
- # Use the format expected by the pipeline's chat template (usually system/user roles)
341
- # The pipeline should handle model-specific formatting (e.g., [INST], <|im_start|>)
342
- messages = [
343
- {"role": "system", "content": system_prompt},
344
- {"role": "user", "content": user_prompt}
345
- ]
346
- logger.info(f"Content generation prompt constructed for model {model_id}.")
347
  return messages
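# [Illustrative sketch, not part of this commit] The messages list above uses the
# role-based chat format; the text-generation pipeline applies the model's own chat
# template to it before generation. Done by hand (with `pipe` being the loaded
# pipeline), the equivalent preprocessing would look roughly like:
prompt_text = pipe.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)  # e.g. wraps turns in [INST] ... [/INST] for Mistral-style models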
348
 
349
-
350
- # *** NEW: Internal Linking Prompt ***
351
  def build_internal_link_prompt(generated_content, keyword, website_url):
352
- """Builds the prompt for suggesting internal links."""
353
- logger.info(f"Building internal link suggestion prompt for URL: {website_url}")
354
-
355
- system_prompt = "You are an SEO assistant specialized in identifying internal linking opportunities within website content."
356
-
357
  user_prompt = f"""**Website Base URL:** {website_url}
358
  **Main Topic of Article:** "{keyword}"
359
-
360
- **Task:** Please review the following article content. Identify 3 to 5 specific phrases or sentences within the text that represent good opportunities for internal links to other relevant pages on the website ({website_url}).
361
-
362
  **For each opportunity, provide:**
363
- 1. The exact phrase/sentence from the article that should be the anchor text.
364
- 2. A brief description of the *type* of relevant content the link should point to on the website (e.g., "a detailed guide on [sub-topic]", "a related service page for [service]", "a case study about [specific example]", "a blog post explaining [related concept]").
365
-
366
- **IMPORTANT:**
367
- * Do NOT invent specific URLs (like `{website_url}/blog/my-post`). Only describe the *type* of page needed.
368
- * Choose anchor text that is natural and descriptive.
369
- * Focus on links that would genuinely add value for a reader seeking more information on that specific point.
370
- * Format your output as a Markdown numbered list.
371
-
372
- **Article Content to Analyze:**
373
- --- BEGIN ARTICLE CONTENT ---
374
  {generated_content[:8000]}
375
- --- END ARTICLE CONTENT ---
376
- """ # Limit context sent for linking analysis
377
-
378
- messages = [
379
- {"role": "system", "content": system_prompt},
380
- {"role": "user", "content": user_prompt}
381
- ]
382
  return messages
383
 
384
- # --- LLM Generation Functions ---
385
-
386
  def run_llm_generation(pipe, messages, max_tokens):
387
- """Runs the LLM pipeline with common settings and robust error handling."""
388
- if pipe is None:
389
- st.error("❌ LLM Pipeline is not available.")
390
- return None
391
- model_id = pipe.model.name_or_path # Get model id from pipeline
392
-
393
- logger.info(f"Running generation with {model_id}. Max new tokens: {max_tokens}.")
394
- generation_start_time = time.time()
395
-
396
  try:
397
- generation_args = {
398
- "max_new_tokens": max_tokens,
399
- "temperature": 0.7,
400
- "top_p": 0.95,
401
- "top_k": 40,
402
- "do_sample": True,
403
- "pad_token_id": pipe.tokenizer.eos_token_id,
404
- "eos_token_id": pipe.tokenizer.eos_token_id,
405
- # Use pipeline's chat template automatically if available
406
- }
407
- logger.info(f"Generation arguments: {generation_args}")
408
-
409
- # --- Execute Pipeline ---
410
- results = pipe(messages, **generation_args)
411
-
412
- # --- Robust Extraction of Assistant's Response ---
413
- # (Using the refined extraction logic from previous iteration)
414
  assistant_response = None
415
  if results and results[0] and 'generated_text' in results[0]:
416
  output_data = results[0]['generated_text']
417
- if isinstance(output_data, list): # Format: [{'role':'user',...}, {'role':'assistant',...}]
418
- assistant_message = next((msg['content'] for msg in reversed(output_data) if msg['role'] == 'assistant'), None)
419
- if assistant_message: assistant_response = assistant_message
420
- elif isinstance(output_data, str): # Format: "System...\nUser...\nAssistant..."
421
- # Find the last message in the prompt list to split after it
422
  last_prompt_content = messages[-1]['content']
423
  last_prompt_index = output_data.rfind(last_prompt_content)
424
- if last_prompt_index != -1:
425
- potential_response = output_data[last_prompt_index + len(last_prompt_content):].strip()
426
- else: # Fallback if prompt isn't exactly echoed
427
- potential_response = output_data # Assume it might just be the response
428
- # Clean potential role markers, </s> tokens etc.
429
  assistant_response = re.sub(r"^(assistant|ASSISTANT|</s>|<\|im_end\|>|<\|assistant\|>)\s*[:\n]*", "", potential_response, flags=re.IGNORECASE | re.DOTALL).strip()
430
- else: logger.error(f"Unexpected output format type: {type(output_data)}")
431
  else: logger.error(f"Unexpected LLM output structure: {results}")
432
-
433
-
434
- # --- Final Validation and Cleaning ---
435
  if assistant_response:
436
- duration = time.time() - generation_start_time
437
- logger.info(f"Generation successful ({model_id}) in {duration:.2f}s. Length: {len(assistant_response)} chars.")
438
- assistant_response = re.sub(r"^```markdown\n", "", assistant_response).strip()
439
- assistant_response = re.sub(r"\n```$", "", assistant_response).strip()
440
- # Basic length check
441
- if len(assistant_response) < 50:
442
- logger.warning(f"Generated output very short ({len(assistant_response)} chars).")
443
- st.warning("⚠️ Generated output seems very short. Please review.")
444
  return assistant_response
445
- else:
446
- logger.error(f"Failed to extract assistant response. Full output: {results}")
447
- st.error("❌ Failed to parse LLM response structure. Check logs.")
448
- return None
449
-
450
- except torch.cuda.OutOfMemoryError:
451
- logger.error(f"OOM Error during generation with {model_id}!", exc_info=True)
452
- st.error(f"❌ Generation failed: Out of GPU Memory ({model_id}). Try a smaller model, reduce 'Max Generation Tokens', or restart the space.")
453
- clear_gpu_memory() # Attempt to recover
454
- return None
455
- except Exception as e:
456
- logger.error(f"Unhandled error during generation ({model_id}): {e}", exc_info=True)
457
- st.error(f"❌ Unexpected error during generation: {e}")
458
- return None
459
-
460
 
461
  # --- Streamlit App UI ---
462
 
463
- st.set_page_config(layout="wide", page_title="Advanced SEO Content Generator v3")
464
 
465
- # Sidebar Setup
466
  with st.sidebar:
467
  st.header("βš™οΈ Configuration")
 
 
 
468
  selected_model_key = st.selectbox(
469
  "Choose Language Model:",
470
  options=list(MODEL_OPTIONS.keys()),
471
  index=list(MODEL_OPTIONS.keys()).index(DEFAULT_MODEL_KEY),
472
- help="Select AI model. Performance & resource needs vary significantly. Larger models may fail on free tiers."
 
473
  )
474
  selected_model_id = MODEL_OPTIONS[selected_model_key]
475
 
476
- # Button to explicitly load/switch model
477
- if st.button(f"Load/Switch to {selected_model_key}", key="load_model_button"):
478
- with st.spinner(f"Loading {selected_model_key}..."):
479
- load_model(selected_model_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
 
481
- st.markdown("---") # Separator
482
 
483
- with st.expander("Content Settings", expanded=True):
484
- num_results = st.slider("Competitors to Analyze:", min_value=1, max_value=10, value=DEFAULT_NUM_RESULTS, step=1)
 
 
485
  selected_tone = st.selectbox("Content Tone:", options=TONE_OPTIONS, index=TONE_OPTIONS.index("Engaging"))
486
  selected_audience = st.selectbox("Target Audience:", options=AUDIENCE_OPTIONS, index=AUDIENCE_OPTIONS.index("General Audience"))
487
- max_gen_tokens = st.number_input("Max Generation Tokens:", min_value=500, max_value=8192, value=DEFAULT_MAX_GENERATION_TOKENS, step=250, help="Approximate max length of generated content. Higher values need more time/resources.")
488
-
489
- with st.expander("Internal Linking (Optional)"):
490
- website_url = st.text_input("Your Website URL (for link suggestions):", placeholder="https://www.example.com", value=st.session_state.get("last_website_url", ""))
491
- st.session_state.last_website_url = website_url # Store immediately for reuse
492
 
493
- st.markdown("---") # Separator
494
- st.header("ℹ️ Info & Notes")
495
- # Display currently loaded model (if any)
496
- if st.session_state.current_model_pipeline:
497
- st.success(f"βœ… Loaded: `{st.session_state.current_model_id}`")
498
- else:
499
- st.warning("⚠️ No model loaded. Choose and click 'Load/Switch'.")
500
 
 
 
501
  st.info(f"""
 
502
  - **Competitors:** Top {num_results}
503
  - **Max Generation:** ~{max_gen_tokens} tokens
504
  """)
505
  st.warning("""
506
- - **Resource Use:** Models vary greatly in RAM/GPU needs. Large models WILL fail on free tiers. Ensure model is loaded before generating.
507
- - **Scraping:** May fail. Success indicators shown during process.
508
- - **Human Review ESSENTIAL:** AI provides drafts. **Always** review, fact-check, edit, add unique value.
509
- - **Internal Linking:** Suggestions are AI-based guesses of relevant topics; verify and find the actual URLs yourself.
510
  """)
511
- if st.button("Clear All Cached Data", key="clear_all_data"):
512
- reset_session_state()
513
- st.toast("Cleared scraped data and generated content.", icon="πŸ—‘οΈ")
514
-
515
 
516
- # Main App Area
517
- st.title("✨ Advanced SEO Content Generator ✨")
518
- st.markdown(f"Leverage AI to analyze competitors and craft superior content.")
519
 
520
- # User Input
521
- keyword = st.text_input("Enter Primary Target Keyword:", placeholder="e.g., benefits of hydroponic gardening at home", value=st.session_state.get("last_keyword", ""))
 
522
 
523
- col1, col2 = st.columns([2, 1])
524
- with col1:
525
- generate_button = st.button("Analyze & Generate Content", type="primary", key="generate_main_content")
526
 
527
- # Check if model is loaded before allowing generation
528
- if not st.session_state.current_model_pipeline:
529
- st.error("Please load a model from the sidebar before generating content.")
530
- st.stop()
 
 
 
531
 
532
  st.markdown("---")
533
 
534
- # --- Main Workflow ---
535
- if generate_button:
 
 
 
 
536
  if not keyword:
537
  st.warning("⚠️ Please enter a keyword.")
538
  st.stop()
539
 
540
- st.session_state.last_keyword = keyword # Store keyword
541
  ua = get_user_agent() # Ensure user agent is ready
542
- if not ua: st.error("❌ User Agent failed to initialize. Cannot scrape."); st.stop()
543
 
544
- # Reset previous results for this keyword if generating anew
545
  st.session_state.generated_content = ""
546
  st.session_state.internal_link_suggestions = ""
547
 
548
- # --- Step 1: Scrape Competitors ---
549
- # Check if we need to re-scrape (different keyword or no cached data)
550
  if keyword != st.session_state.get('_internal_last_scrape_keyword', None) or not st.session_state.competitor_analysis_text:
551
- logger.info(f"New keyword or no cached data. Starting scrape for '{keyword}'.")
552
- st.session_state.competitor_analysis_text = "" # Clear previous text
553
- st.session_state.scraped_urls = [] # Clear previous URLs
 
554
 
555
  scrape_container = st.container()
556
  with scrape_container:
557
- st.subheader(f"πŸ•ΈοΈ Scraping Top {num_results} Competitors...")
558
- status_area = st.empty() # Placeholder for multi-line status
 
 
559
 
560
  urls = get_top_urls(keyword, num_results)
561
  st.session_state.scraped_urls = urls
562
 
563
  if urls:
564
  all_texts = []
565
- progress_bar = st.progress(0, text="Scraping progress...")
566
- scrape_status_messages = [] # Collect messages
567
-
568
  for i, url in enumerate(urls):
569
- scrape_status_ui = st.empty() # Temporary UI element for each URL status
570
- content = scrape_page_content(url, ua, scrape_status_ui)
571
- if content:
572
- all_texts.append(content)
573
- # Use toast for success, keep warnings/errors in main area if needed
574
- # st.toast(f"Scraped: {url[:50]}...", icon="βœ…")
575
- # Update overall progress
576
- progress_bar.progress((i + 1) / len(urls), text=f"Scraping URL {i+1}/{len(urls)}")
577
- time.sleep(0.1) # UI refresh delay
578
 
579
  st.session_state.competitor_analysis_text = "\n\n --- ARTICLE SEPARATOR --- \n\n".join(all_texts)
580
- st.session_state['_internal_last_scrape_keyword'] = keyword # Mark keyword as scraped
581
 
582
  if st.session_state.competitor_analysis_text:
583
- scrape_container.success(f"βœ… Scraped {len(all_texts)}/{len(urls)} pages. Total analysis text: {len(st.session_state.competitor_analysis_text)} chars.")
584
- logger.info(f"Scraping complete. Extracted {len(st.session_state.competitor_analysis_text)} chars.")
585
  else:
586
  scrape_container.error("❌ Failed to scrape sufficient content. Cannot generate article.")
587
  st.stop()
@@ -589,22 +498,18 @@ if generate_button:
589
  scrape_container.error("❌ Could not retrieve competitor URLs. Cannot proceed.")
590
  st.stop()
591
  else:
592
- st.success(f"βœ”οΈ Using previously scraped data for '{keyword}'. ({len(st.session_state.competitor_analysis_text)} chars from {len(st.session_state.scraped_urls)} URLs).")
593
- logger.info(f"Using cached scrape data for keyword '{keyword}'.")
594
-
595
 
596
  # --- Step 2: Generate Main Content ---
597
- st.subheader("✍️ Generating Main Content...")
598
- generation_status = st.status(f"Generating content with {st.session_state.current_model_id}...")
599
  with generation_status:
600
- st.write(f"**Tone:** {selected_tone}, **Audience:** {selected_audience}")
601
- st.write(f"**Max Tokens:** {max_gen_tokens}")
602
-
603
  gen_prompt = build_content_generation_prompt(
604
  keyword, st.session_state.competitor_analysis_text, selected_tone, selected_audience, st.session_state.current_model_id
605
  )
606
  generated_content = run_llm_generation(st.session_state.current_model_pipeline, gen_prompt, max_gen_tokens)
607
- st.session_state.generated_content = generated_content # Store in state
608
 
609
  if generated_content:
610
  generation_status.update(label="βœ… Content Generation Complete!", state="complete")
@@ -612,37 +517,31 @@ if generate_button:
612
  generation_status.update(label="❌ Content Generation Failed.", state="error")
613
  st.stop() # Stop if main content fails
614
 
615
- # --- Display Generated Content (if available) ---
616
  if st.session_state.generated_content:
617
  st.markdown("---")
618
  st.subheader("πŸ“ Generated SEO Content")
619
  st.markdown(st.session_state.generated_content)
620
- st.text_area("Copyable Markdown:", st.session_state.generated_content, height=400, key="generated_content_area")
621
 
622
- # --- Step 3: Internal Linking (Optional) ---
623
- if website_url:
624
  st.markdown("---")
625
  st.subheader("πŸ”— Internal Linking Suggestions")
626
- if st.button("Suggest Internal Links", key="suggest_links_button"):
627
- if not st.session_state.generated_content:
628
- st.warning("⚠️ Generate content first before suggesting links.")
629
- else:
630
- link_status = st.status("Analyzing content for linking opportunities...")
631
- with link_status:
632
- st.write(f"Analyzing based on website: {website_url}")
633
- link_prompt = build_internal_link_prompt(st.session_state.generated_content, keyword, website_url)
634
- # Use fewer tokens for link suggestions
635
- link_suggestions = run_llm_generation(st.session_state.current_model_pipeline, link_prompt, max_tokens=500)
636
- st.session_state.internal_link_suggestions = link_suggestions
637
-
638
- if link_suggestions:
639
- link_status.update(label="βœ… Link suggestions generated!", state="complete")
640
- else:
641
- link_status.update(label="❌ Failed to generate link suggestions.", state="error")
642
-
643
- # Display suggestions if available
644
  if st.session_state.internal_link_suggestions:
645
  st.markdown(st.session_state.internal_link_suggestions)
646
- st.info("ℹ️ Remember: These are AI suggestions. Find the best matching *actual* URL on your site for each.")
647
  else:
648
- st.info("Provide your website URL in the sidebar under 'Advanced Options -> Internal Linking' to enable link suggestions.")
 
 
9
  import logging
10
  import re
11
  from retrying import retry
12
+ import gc
13
 
14
  # --- Configuration ---
15
+ # Model Options (Ensure keys clearly indicate resource needs)
 
16
  MODEL_OPTIONS = {
17
+ # Lighter Models (More likely to work on free tiers)
18
+ "Mistral-7B-Instruct (Fast, Med RAM)": "mistralai/Mistral-7B-Instruct-v0.2",
19
+ "Gemma-7B-IT (Google, Med RAM)": "google/gemma-7b-it",
20
+ "Phi-3-Mini-4k-Instruct (Microsoft, Small, Good)": "microsoft/Phi-3-mini-4k-instruct", # Requires trust_remote_code
21
+
22
+ # Medium Models (May require upgraded tiers / more RAM/GPU)
23
+ "Llama-3-8B-Instruct (Meta, High Quality, High RAM/GPU)": "meta-llama/Meta-Llama-3-8B-Instruct",
24
+ "Phi-3-Medium-4k-Instruct (Microsoft, Strong, High RAM/GPU)": "microsoft/Phi-3-medium-4k-instruct", # Requires trust_remote_code
25
+ "Qwen1.5-14B-Chat (Alibaba, Strong, High RAM/GPU)": "Qwen/Qwen1.5-14B-Chat",
26
+
27
+ # Larger Models (Very likely require significant paid resources)
28
+ "DeepSeek-Coder-V2-Instruct (DeepSeek, High RAM/GPU)": "deepseek-ai/DeepSeek-Coder-V2-Instruct", # Requires trust_remote_code
29
  }
30
+ DEFAULT_MODEL_KEY = "Mistral-7B-Instruct (Fast, Med RAM)" # Start with a lighter default selection
 
31
 
32
  # Scraping & Generation Defaults
33
+ DEFAULT_NUM_RESULTS = 4 # Reduced default slightly
34
  REQUEST_TIMEOUT = 15
35
+ MAX_COMPETITOR_TEXT_LENGTH = 5500
36
+ DEFAULT_MAX_GENERATION_TOKENS = 2800
37
 
38
+ # Retry settings
39
  RETRY_WAIT_FIXED = 2000
40
  RETRY_STOP_MAX_ATTEMPT = 3
41
 
42
  # Tone & Audience Options
43
+ TONE_OPTIONS = ["Conversational", "Professional", "Authoritative", "Technical", "Friendly", "Engaging", "Educational", "Persuasive"]
44
+ AUDIENCE_OPTIONS = ["Beginners", "General Audience", "Experts", "Professionals (Specific Field)", "Customers", "Students", "Decision Makers"]
45
 
46
  # --- Logging Setup ---
47
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s')
48
  logger = logging.getLogger(__name__)
49
 
50
+ # --- State Management ---
51
+ # Initialize session state keys carefully
52
+ if 'current_model_pipeline' not in st.session_state: st.session_state.current_model_pipeline = None
53
+ if 'current_model_id' not in st.session_state: st.session_state.current_model_id = ""
54
+ # Data related state
55
  if 'scraped_urls' not in st.session_state: st.session_state.scraped_urls = []
56
  if 'competitor_analysis_text' not in st.session_state: st.session_state.competitor_analysis_text = ""
57
  if 'generated_content' not in st.session_state: st.session_state.generated_content = ""
58
  if 'internal_link_suggestions' not in st.session_state: st.session_state.internal_link_suggestions = ""
59
  if 'last_keyword' not in st.session_state: st.session_state.last_keyword = ""
 
60
  if 'last_website_url' not in st.session_state: st.session_state.last_website_url = ""
61
+ if '_internal_last_scrape_keyword' not in st.session_state: st.session_state._internal_last_scrape_keyword = ""
 
62
 
63
 
64
+ # --- Helper Functions ---
65
  def clear_gpu_memory():
66
+ """Attempts to clear GPU memory cache and run garbage collection."""
67
  logger.info("Attempting to clear GPU memory...")
68
  if torch.cuda.is_available():
69
+ try:
70
+ st.session_state.current_model_pipeline = None # Ensure reference is removed FIRST
71
+ gc.collect() # Run Python garbage collection
72
+ torch.cuda.empty_cache() # Tell PyTorch to release cached memory
73
+ gc.collect() # Run GC again
74
+ logger.info("GPU memory cache cleared and garbage collected.")
75
+ st.toast("Cleared GPU memory.", icon="🧹")
76
+ except Exception as e:
77
+ logger.error(f"Error clearing GPU memory: {e}", exc_info=True)
78
+ st.toast(f"Error clearing GPU memory: {e}", icon="❌")
79
  else:
80
  logger.info("No GPU available, skipping memory clearing.")
81
+ st.session_state.current_model_pipeline = None # Still clear the reference
82
+ gc.collect()
83
 
84
+ def reset_app_data():
85
+ """Clears stored scraping and generation results, keeps model loaded."""
86
+ st.session_state.scraped_urls = []
87
+ st.session_state.competitor_analysis_text = ""
88
+ st.session_state.generated_content = ""
89
+ st.session_state.internal_link_suggestions = ""
90
+ st.session_state.last_keyword = ""
91
+ st.session_state._internal_last_scrape_keyword = ""
92
+ logger.info("App data state reset (scraped/generated content).")
93
+ st.toast("Cleared scraped data and generated content.", icon="πŸ—‘οΈ")
94
+
95
+ # --- Model Loading (On Demand) ---
96
+ def load_model(model_id_to_load):
97
+ """Loads the selected model, unloading any previous one."""
98
+ # If the requested model is already loaded, do nothing
99
+ if st.session_state.get('current_model_id') == model_id_to_load and st.session_state.get('current_model_pipeline') is not None:
100
+ logger.info(f"Model {model_id_to_load} is already loaded.")
101
+ st.toast(f"{model_id_to_load} is already loaded.", icon="βœ…")
102
+ return True
103
+
104
+ # Unload previous model if one exists and is different
105
+ if st.session_state.get('current_model_pipeline') is not None:
106
  logger.info(f"Unloading previous model: {st.session_state.current_model_id}")
107
+ st.toast(f"Unloading {st.session_state.current_model_id}...", icon="🧹")
108
+ clear_gpu_memory() # This sets pipeline to None and clears cache
109
+ st.session_state.current_model_id = "" # Clear model ID state
110
 
111
+ # Load the new model
112
+ st.toast(f"Loading {model_id_to_load}... This may take time & RAM/GPU.", icon="⏳")
113
+ logger.info(f"Attempting to load LLM pipeline for model: {model_id_to_load}")
114
  pipeline_instance = None
115
+ success = False
116
  try:
 
 
117
  dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16 if torch.cuda.is_available() else torch.float32
118
  logger.info(f"Using dtype: {dtype}")
119
 
120
+ trust_code_models = [
121
+ "microsoft/Phi-3-mini-4k-instruct",
122
  "microsoft/Phi-3-medium-4k-instruct",
123
  "deepseek-ai/DeepSeek-Coder-V2-Instruct",
124
+ # Add others if needed
125
  ]
126
+ trust_code = model_id_to_load in trust_code_models
127
+ logger.info(f"Trust remote code for {model_id_to_load}: {trust_code}")
128
+
129
+ # Display spinner during the actual loading
130
+ with st.spinner(f"Loading {model_id_to_load} into memory..."):
131
+ pipeline_instance = pipeline(
132
+ "text-generation",
133
+ model=model_id_to_load,
134
+ trust_remote_code=trust_code,
135
+ device_map="auto",
136
+ torch_dtype=dtype,
137
+ )
138
+
139
+ # Handle pad_token
140
+ if pipeline_instance.tokenizer.pad_token_id is None:
141
+ pipeline_instance.tokenizer.pad_token_id = pipeline_instance.tokenizer.eos_token_id
142
+ if hasattr(pipeline_instance.model, 'config'):
143
+ pipeline_instance.model.config.pad_token_id = pipeline_instance.tokenizer.eos_token_id
144
+ logger.warning(f"Set pad_token_id to eos_token_id for {model_id_to_load}")
145
+
146
+ logger.info(f"LLM pipeline loaded successfully for {model_id_to_load}.")
147
  st.session_state.current_model_pipeline = pipeline_instance
148
+ st.session_state.current_model_id = model_id_to_load
149
+ st.toast(f"Model {model_id_to_load} loaded!", icon="βœ…")
150
+ success = True
151
 
152
  except ImportError as e:
153
+ logger.error(f"ImportError loading {model_id_to_load}: {e}. Missing dependency?", exc_info=True)
154
+ st.error(f"Load Error: Missing library for {model_id_to_load}? Check logs. Details: {e}")
 
155
  except Exception as e:
156
+ logger.error(f"Failed to load {model_id_to_load}: {e}", exc_info=True)
157
+ st.error(f"Failed to load {model_id_to_load}. Error: {e}. Check resource limits (RAM/GPU) & logs.")
158
+ clear_gpu_memory() # Attempt to clean up if loading failed
159
+ st.session_state.current_model_id = "" # Ensure state reflects failure
160
+ finally:
161
+ return success # Return status
162
+
163
+ # --- User Agent Caching ---
 
164
  @st.cache_resource
165
  def get_user_agent():
166
+ # (Same as previous version)
167
  logger.info("Initializing FakeUserAgent.")
168
  try:
 
169
  return UserAgent(fallback='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
170
  except Exception as e:
171
  logger.error(f"Failed to initialize FakeUserAgent: {e}", exc_info=True)
172
+ st.error(f"Could not initialize User Agent generator. Error: {e}")
173
  return None
174
 
175
+ # --- Core Functions (Scraping, Prompt Building, Generation Logic) ---
176
+ # These functions (get_top_urls, scrape_page_content, clean_text, fetch_url_content,
177
+ # build_content_generation_prompt, build_internal_link_prompt, run_llm_generation)
178
+ # remain largely the same as the previous version, as they were already quite robust.
179
+ # Ensure `run_llm_generation` correctly uses the pipeline passed to it (which it did).
 
 
 
 
 
 
180
 
181
+ # --- (Include the definitions for the core functions here - unchanged from previous version) ---
182
  @retry(wait_fixed=RETRY_WAIT_FIXED, stop_max_attempt_number=RETRY_STOP_MAX_ATTEMPT,
183
  retry_on_exception=lambda e: isinstance(e, (requests.exceptions.Timeout, requests.exceptions.ConnectionError, requests.exceptions.HTTPError)))
184
  def fetch_url_content(url, headers):
 
185
  logger.info(f"Fetching {url} (Attempt {fetch_url_content.retry.attempt_number+1}/{RETRY_STOP_MAX_ATTEMPT})")
186
  response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
187
  response.raise_for_status()
188
  if 'text/html' not in response.headers.get('Content-Type', ''):
189
+ logger.warning(f"Skipping URL {url} - Not HTML")
190
  return None
 
191
  if len(response.content) > 10 * 1024 * 1024: # 10 MB limit
192
+ logger.warning(f"Skipping URL {url} - Content too large")
193
  return None
194
  return response
195
 
196
  def clean_text(text):
 
 
197
  text = re.sub(r'\s{2,}', ' ', text)
198
  text = re.sub(r'\n+', '\n', text)
 
199
  lines = text.split('\n')
200
  cleaned_lines = []
201
+ min_line_length = 20
202
+ min_words_per_line = 3
203
  skip_phrases = [
204
  'copyright Β©', 'all rights reserved', 'privacy policy', 'terms of use', 'terms and conditions',
205
  'cookie policy', 'subscribe', 'sign up', 'log in', 'advertisement', 'share this', 'related posts',
206
  'leave a reply', 'comment', 'posted on', 'by author', 'tags:', 'categories:', 'follow us', 'read more',
207
+ 'click here', 'learn more', 'next article', 'previous article', 'you may also like', 'related topics'
208
  ]
209
  for line in lines:
210
  stripped_line = line.strip()
211
  lower_line = stripped_line.lower()
 
212
  if len(stripped_line) >= min_line_length and \
213
  len(stripped_line.split()) >= min_words_per_line and \
214
  not any(phrase in lower_line for phrase in skip_phrases):
215
  cleaned_lines.append(stripped_line)
 
216
  text = '\n'.join(cleaned_lines)
217
  return text.strip()
218
 
219
  def scrape_page_content(url, user_agent, scrape_status_ui):
220
+ if not user_agent: logger.error("User Agent missing."); return ""
221
+ headers = {
222
+ 'User-Agent': user_agent.random,
223
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
224
+ 'Accept-Language': 'en-US,en;q=0.5', 'Referer': 'https://www.google.com/',
225
+ 'DNT': '1', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1'
226
+ }
227
  try:
228
  response = fetch_url_content(url, headers)
229
+ if response is None: scrape_status_ui.warning(f"⚠️ Skip/Fail fetch: {url}", icon="🕸️"); return ""
230
+ soup = BeautifulSoup(response.content, 'lxml')
 
 
 
 
 
231
  tags_to_remove = ["script", "style", "nav", "footer", "aside", "form", "header", "noscript", "button", "input", "select", "textarea", "figure", "figcaption", "iframe", "svg", "path", "meta", "link"]
232
  for element in soup(tags_to_remove): element.decompose()
233
  for comment in soup.find_all(string=lambda text: isinstance(text, Comment)): comment.extract()
234
+ main_content = (soup.find('main') or soup.find('article') or soup.find(role='main') or
 
 
235
  soup.find('div', class_=re.compile(r'(content|main|body|post|entry|article)', re.I)) or
236
  soup.find('div', id=re.compile(r'(content|main|body|post|entry|article)', re.I)))
237
  target_soup = main_content if main_content else soup.body
238
+ if not target_soup: logger.warning(f"No body/main: {url}"); scrape_status_ui.warning(f"⚠️ No body/main: {url}", icon="🕸️"); return ""
239
+ texts = target_soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'td', 'th', 'blockquote', 'span'])
 
 
240
  content_parts = []
241
  for elem in texts:
 
242
  if elem.find_parent(tags_to_remove): continue
 
243
  elem_text = elem.get_text(separator=' ', strip=True)
 
244
  if len(elem_text) > 10 and len(elem_text.split()) > 1:
245
+ if elem.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'blockquote', 'tr', 'div']: # Added div for structure
 
246
  content_parts.append(elem_text + "\n")
247
+ else: content_parts.append(elem_text + " ")
 
 
248
  content = "".join(content_parts)
249
+ cleaned_content = clean_text(content)
250
+ if len(cleaned_content) < 150: logger.warning(f"Low content ({len(cleaned_content)} chars): {url}"); scrape_status_ui.warning(f"⚠️ Low content: {url}", icon="πŸ•ΈοΈ")
251
+ else: logger.info(f"Scraped {len(cleaned_content)} chars: {url}"); scrape_status_ui.success(f"βœ… Scraped: {url} ({len(cleaned_content)} chars)", icon="πŸ•ΈοΈ")
252
+ time.sleep(0.6)
 
 
 
 
 
 
253
  return cleaned_content
254
+ except requests.exceptions.RequestException as e: logger.warning(f"Final scrape fail: {url}. Err: {e}"); scrape_status_ui.error(f"❌ Fail scrape: {url} ({e})", icon="🕸️"); return ""
255
+ except Exception as e: logger.error(f"Unexpected scrape error: {url}: {e}", exc_info=True); scrape_status_ui.error(f"❌ Error scraping: {url} (Logs)", icon="πŸ•ΈοΈ"); return ""
 
 
 
 
 
 
 
 
256
 
257
  def get_top_urls(keyword, num_results):
 
258
  logger.info(f"Fetching top {num_results} URLs for keyword: '{keyword}'")
259
  try:
 
260
  urls = list(search(keyword, num_results=num_results, sleep_interval=2.5, lang="en", timeout=15))
261
  logger.info(f"Found URLs: {urls}")
262
+ if not urls: st.warning(f"⚠️ No Google search results found for '{keyword}'."); return []
 
 
263
  return urls
264
  except Exception as e:
265
+ error_message = str(e); logger.error(f"GSearch Error: {error_message}", exc_info=True)
266
+ if "429" in error_message: st.error(f"❌ Google search blocked (429). WAIT before retrying.")
267
+ elif "timed out" in error_message: st.error(f"❌ Google search timed out.")
268
+ else: st.error(f"❌ GSearch Error: {error_message[:100]}...")
 
 
 
 
 
269
  return []
270
 
 
 
 
271
  def build_content_generation_prompt(keyword, competitor_texts, tone, audience, model_id):
272
+ logger.info(f"Build content gen prompt. Tone: {tone}, Audience: {audience}. Comp length: {len(competitor_texts)}")
 
273
  if len(competitor_texts) > MAX_COMPETITOR_TEXT_LENGTH:
274
+ competitor_summary = competitor_texts[:MAX_COMPETITOR_TEXT_LENGTH] + "... [Truncated]"
275
+ logger.warning(f"Comp text truncated.")
276
+ else: competitor_summary = competitor_texts
277
+ system_prompt = f"""You are an expert SEO Content Strategist & world-class Copywriter. Task: Analyze competitor text & generate a significantly superior, comprehensive, user-first article for keyword '{keyword}', targeting '{audience}' audience with '{tone}' tone. Focus on quality, depth, clarity, fulfilling user intent better than competition."""
278
+ user_prompt = f"""**Keyword:** "{keyword}"
279
+ **Audience:** {audience}
280
+ **Tone:** {tone}
281
+ **Objective:** Generate exceptional, SEO-optimized article for "{keyword}" designed to outperform top content via superior value, insights, UX.
282
+ **Competitor Analysis Context (Analyze for topics, depth, strengths, WEAKNESSES/GAPS):**
283
+ --- BEGIN COMPETITOR ---
 
 
 
 
 
 
284
  {competitor_summary}
285
+ --- END COMPETITOR ---
286
+ **Content Gen Instructions:**
287
+ 1. **Value & Depth:** Be demonstrably better. Deeper, clearer, actionable advice, unique perspectives/data, fill gaps. Address user intent exhaustively.
288
+ 2. **User-First & Humanized:** Write for '{audience}' in '{tone}'. Clear, concise, short paras, varied sentences, engaging Qs. Logical flow, readable.
289
+ 3. **Structure (Strict Markdown):** Compelling H2 Title. Engaging Intro (50-150 words): Hook, purpose/value, outline. Logical Sections (H2)/Sub-sections (H3): Descriptive, keyword-aware headings. Readability: Bullets (`* `), Numbered lists (`1. `), **Bold** (strategic). Comprehensive Body: Expand beyond competitors. Strong Conclusion: Summarize takeaways, final insight/CTA.
290
+ 4. **SEO (Natural):** Weave "{keyword}" & LSI terms into title, headings, intro, body, conclusion. Prioritize relevance/clarity over density. NO keyword stuffing.
291
+ 5. **Originality & Credibility:** 100% unique. Use comp text ONLY for analysis. NO plagiarism. Factual accuracy.
292
+ 6. **Negative Constraints:** DO NOT: Rehash competitors; use preambles/sign-offs; use excessive jargon (unless 'Experts'); write long paragraphs; stuff keywords; invent facts.
293
+ **Output:** ONLY the Markdown article, starting with H2 title."""
294
+ messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
295
+ logger.info(f"Content prompt done for {model_id}.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  return messages
297
 
 
 
298
  def build_internal_link_prompt(generated_content, keyword, website_url):
299
+ logger.info(f"Build internal link prompt for URL: {website_url}")
300
+ system_prompt = "You are an SEO assistant specialized in identifying internal linking opportunities."
 
 
 
301
  user_prompt = f"""**Website Base URL:** {website_url}
302
  **Main Topic of Article:** "{keyword}"
303
+ **Task:** Review the article below. Identify 3-5 phrases/sentences for internal links relevant to {website_url}.
 
 
304
  **For each opportunity, provide:**
305
+ 1. Exact anchor text phrase/sentence from article.
306
+ 2. Brief description of the *type* of relevant content needed (e.g., "detailed guide on [sub-topic]", "service page for [service]").
307
+ **IMPORTANT:** Do NOT invent URLs. Describe the *type* of page. Choose natural anchor text. Focus on value. Format as Markdown numbered list.
308
+ **Article Content (Analyze first ~8000 chars):**
309
+ --- BEGIN ARTICLE ---
 
 
 
 
 
 
310
  {generated_content[:8000]}
311
+ --- END ARTICLE ---"""
312
+ messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
 
 
 
 
 
313
  return messages
314
 
 
 
315
  def run_llm_generation(pipe, messages, max_tokens):
316
+ if pipe is None: st.error("❌ LLM Pipeline missing."); return None
317
+ model_id = pipe.model.name_or_path
318
+ logger.info(f"Running generation: {model_id}. Max tokens: {max_tokens}.")
319
+ start_time = time.time()
 
 
 
 
 
320
  try:
321
+ gen_args = {"max_new_tokens": max_tokens, "temperature": 0.7, "top_p": 0.95, "top_k": 40,
322
+ "do_sample": True, "pad_token_id": pipe.tokenizer.eos_token_id, "eos_token_id": pipe.tokenizer.eos_token_id}
323
+ logger.info(f"Gen args: {gen_args}")
324
+ results = pipe(messages, **gen_args)
325
+ # --- Robust Extraction ---
 
 
 
 
 
 
 
 
 
 
 
 
326
  assistant_response = None
327
  if results and results[0] and 'generated_text' in results[0]:
328
  output_data = results[0]['generated_text']
329
+ if isinstance(output_data, list): assistant_message = next((msg['content'] for msg in reversed(output_data) if msg['role'] == 'assistant'), None); assistant_response = assistant_message
330
+ elif isinstance(output_data, str):
 
 
 
331
  last_prompt_content = messages[-1]['content']
332
  last_prompt_index = output_data.rfind(last_prompt_content)
333
+ if last_prompt_index != -1: potential_response = output_data[last_prompt_index + len(last_prompt_content):].strip()
334
+ else: potential_response = output_data
 
 
 
335
  assistant_response = re.sub(r"^(assistant|ASSISTANT|</s>|<\|im_end\|>|<\|assistant\|>)\s*[:\n]*", "", potential_response, flags=re.IGNORECASE | re.DOTALL).strip()
336
+ else: logger.error(f"Unexpected output format: {type(output_data)}")
337
  else: logger.error(f"Unexpected LLM output structure: {results}")
338
+ # --- Validation ---
 
 
339
  if assistant_response:
340
+ duration = time.time() - start_time; logger.info(f"Gen success ({model_id}) {duration:.2f}s. Len: {len(assistant_response)}.")
341
+ assistant_response = re.sub(r"^```markdown\n", "", assistant_response).strip(); assistant_response = re.sub(r"\n```$", "", assistant_response).strip()
342
+ if len(assistant_response) < 30: logger.warning(f"Gen output very short ({len(assistant_response)})."); st.warning("⚠️ Gen output very short.")
 
 
 
 
 
343
  return assistant_response
344
+ else: logger.error(f"Failed parse assistant response. Output: {results}"); st.error("❌ Failed parse LLM response. Check logs."); return None
345
+ except torch.cuda.OutOfMemoryError: logger.error(f"OOM Error ({model_id})!", exc_info=True); st.error(f"❌ OOM Error ({model_id}). Try smaller model/less tokens/restart."); clear_gpu_memory(); return None
346
+ except Exception as e: logger.error(f"Unhandled gen error ({model_id}): {e}", exc_info=True); st.error(f"❌ Unexpected gen error: {e}"); return None
 
 
 
 
 
 
 
 
 
 
 
 
347
 
348
  # --- Streamlit App UI ---
349
 
350
+ st.set_page_config(layout="wide", page_title="On-Demand SEO Content Gen")
351
 
352
+ # --- Sidebar ---
353
  with st.sidebar:
354
  st.header("βš™οΈ Configuration")
355
+
356
+ # Model Selection & Loading Area
357
+ st.subheader("1. Select & Load Model")
358
  selected_model_key = st.selectbox(
359
  "Choose Language Model:",
360
  options=list(MODEL_OPTIONS.keys()),
361
  index=list(MODEL_OPTIONS.keys()).index(DEFAULT_MODEL_KEY),
362
+ key="model_selector", # Key for potential state access
363
+ help="Choose the AI model. Performance and resource needs vary; the model must be loaded before generating."
364
  )
365
  selected_model_id = MODEL_OPTIONS[selected_model_key]
366
 
367
+ # Display current status and load button
368
+ load_button_placeholder = st.empty() # Placeholder for dynamic button text/state
369
+ model_status_placeholder = st.empty() # Placeholder for status message
370
+
371
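+ # Three possible states: the selected model is already loaded, a different model is loaded, or nothing is loaded yet.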
+ if st.session_state.get('current_model_id') == selected_model_id and st.session_state.get('current_model_pipeline') is not None:
372
+ model_status_placeholder.success(f"βœ… Loaded: `{selected_model_id}`")
373
+ load_button_text = f"Reload {selected_model_key}"  # Selected model is already loaded
374
+ elif st.session_state.get('current_model_pipeline') is not None:
375
+ model_status_placeholder.warning(f"⚠️ Loaded: `{st.session_state.current_model_id}`\n\nSelected: `{selected_model_id}`")
376
+ load_button_text = f"Unload Current & Load {selected_model_key}"
377
+ else:
378
+ model_status_placeholder.info("ℹ️ No model loaded.")
379
+ load_button_text = f"Load Selected: {selected_model_key}"
380
+
381
+ if load_button_placeholder.button(load_button_text, key="load_model"):
382
+ load_model(selected_model_id)
383
+ # Rerun to update status placeholders immediately after load attempt
384
+ st.rerun()
385
 
386
+ st.markdown("---")
387
 
388
+ # Content Settings
389
+ st.subheader("2. Content Settings")
390
+ with st.expander("Adjust Content Parameters", expanded=False):
391
+ num_results = st.slider("Competitors to Analyze:", min_value=1, max_value=8, value=DEFAULT_NUM_RESULTS, step=1)
392
  selected_tone = st.selectbox("Content Tone:", options=TONE_OPTIONS, index=TONE_OPTIONS.index("Engaging"))
393
  selected_audience = st.selectbox("Target Audience:", options=AUDIENCE_OPTIONS, index=AUDIENCE_OPTIONS.index("General Audience"))
394
+ max_gen_tokens = st.number_input("Max Generation Tokens:", min_value=500, max_value=8192, value=DEFAULT_MAX_GENERATION_TOKENS, step=100)
 
 
 
 
395
 
396
+ # Internal Linking
397
+ st.subheader("3. Internal Linking (Optional)")
398
+ with st.expander("Configure Link Suggestions", expanded=False):
399
+ website_url = st.text_input("Your Website URL:", placeholder="https://www.example.com", value=st.session_state.get("last_website_url", ""), key="website_url_input")
400
+ # Update state immediately on change if needed, or just read before use
401
+ st.session_state.last_website_url = website_url
 
402
 
403
+ st.markdown("---")
404
+ st.header("ℹ️ App Info & Actions")
405
  st.info(f"""
406
+ - **Status:** {'Model Loaded' if st.session_state.current_model_pipeline else 'No Model Loaded'}
407
  - **Competitors:** Top {num_results}
408
  - **Max Generation:** ~{max_gen_tokens} tokens
409
  """)
410
  st.warning("""
411
+ - **Load Model First:** Select a model and click 'Load' before generating.
412
+ - **Resource Use:** Models need significant RAM/GPU. Loading WILL fail if resources are insufficient.
413
+ - **Review Output:** AI provides drafts. ALWAYS review, edit, and fact-check before publishing.
 
414
  """)
415
+ if st.button("Clear Scraped/Generated Data", key="clear_data"):
416
+ reset_app_data()
 
 
417
 
418
+ # --- Main App Area ---
419
+ st.title("✨ On-Demand SEO Content Generator ✨")
420
+ st.markdown("Load your chosen AI model, then generate SEO-focused content.")
421
 
422
+ # User Input Area
423
+ st.subheader("Keyword & Generation")
424
+ keyword = st.text_input("Enter Primary Target Keyword:", placeholder="e.g., vertical hydroponics guide", value=st.session_state.get("last_keyword", ""), key="keyword_input")
425
 
426
+ # Disable button if model not loaded
427
+ generate_button_disabled = st.session_state.current_model_pipeline is None
428
+ generate_button_help = "Load a model from the sidebar first." if generate_button_disabled else "Analyze competitors and generate article."
429
 
430
+ analyze_button = st.button(
431
+ "Analyze Competitors & Generate Content",
432
+ type="primary",
433
+ key="generate_button",
434
+ disabled=generate_button_disabled,
435
+ help=generate_button_help
436
+ )
437
 
438
  st.markdown("---")
439
 
440
+ # --- Main Workflow Triggered by Button ---
441
+ if analyze_button:
442
+ # Double check model is loaded (though button should be disabled)
443
+ if not st.session_state.current_model_pipeline:
444
+ st.error("❌ Cannot generate: No model loaded. Please use the sidebar.")
445
+ st.stop()
446
  if not keyword:
447
  st.warning("⚠️ Please enter a keyword.")
448
  st.stop()
449
 
450
+ st.session_state.last_keyword = keyword # Store keyword for potential reuse
451
  ua = get_user_agent() # Ensure user agent is ready
452
+ if not ua: st.error("❌ Could not initialize a User-Agent. Cannot scrape."); st.stop()
453
 
454
+ # Reset previous generation results for this run
455
  st.session_state.generated_content = ""
456
  st.session_state.internal_link_suggestions = ""
457
 
458
+ # --- Step 1: Scrape Competitors (with status updates) ---
459
+ # Check if scrape needed
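+ # Re-scrape only when the keyword changed or there is no cached competitor text; otherwise reuse the previous scrape.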
460
  if keyword != st.session_state.get('_internal_last_scrape_keyword', None) or not st.session_state.competitor_analysis_text:
461
+ logger.info(f"Scraping needed for '{keyword}'.")
462
+ st.session_state.competitor_analysis_text = "" # Clear old text
463
+ st.session_state.scraped_urls = []
464
+ st.session_state['_internal_last_scrape_keyword'] = "" # Reset marker until success
465
 
466
  scrape_container = st.container()
467
  with scrape_container:
468
+ st.info(f"πŸ•ΈοΈ Fetching URLs and Scraping Top {num_results} Competitors...")
469
+ progress_text = "Scraping progress..."
470
+ scrape_progress_bar = st.progress(0, text=progress_text)
471
+ status_area = st.container() # Use container for multiple status lines
472
 
473
  urls = get_top_urls(keyword, num_results)
474
  st.session_state.scraped_urls = urls
475
 
476
  if urls:
477
  all_texts = []
478
+ scraped_count = 0
 
 
479
  for i, url in enumerate(urls):
480
+ with status_area: # Show status within the designated area
481
+ scrape_status_ui = st.empty() # Placeholder for single URL status
482
+ content = scrape_page_content(url, ua, scrape_status_ui)
483
+ if content:
484
+ all_texts.append(content)
485
+ scraped_count += 1
486
+ scrape_progress_bar.progress((i + 1) / len(urls), text=f"Processed URL {i+1}/{len(urls)}...")
487
+ time.sleep(0.1) # UI update breather
 
488
 
489
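  # Join competitor texts with an explicit separator so the generation prompt can distinguish individual articles.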
  st.session_state.competitor_analysis_text = "\n\n --- ARTICLE SEPARATOR --- \n\n".join(all_texts)
490
+ st.session_state['_internal_last_scrape_keyword'] = keyword # Mark scrape success for this keyword
491
 
492
  if st.session_state.competitor_analysis_text:
493
+ scrape_container.success(f"βœ… Scraped {scraped_count}/{len(urls)} pages. Analysis text: {len(st.session_state.competitor_analysis_text)} chars.")
 
494
  else:
495
  scrape_container.error("❌ Failed to scrape sufficient content. Cannot generate article.")
496
  st.stop()
  else:
498
  scrape_container.error("❌ Could not retrieve competitor URLs. Cannot proceed.")
499
  st.stop()
500
  else:
501
+ st.success(f"βœ”οΈ Using previously scraped data for '{keyword}'. ({len(st.session_state.competitor_analysis_text)} chars).")
 
 
502
 
503
  # --- Step 2: Generate Main Content ---
504
+ st.info(f"✍️ Generating Content with {st.session_state.current_model_id}...")
505
+ generation_status = st.status("Sending request to LLM...")
506
  with generation_status:
507
+ st.write(f"**Tone:** {selected_tone}, **Audience:** {selected_audience}, **Max Tokens:** {max_gen_tokens}")
 
 
508
  gen_prompt = build_content_generation_prompt(
509
  keyword, st.session_state.competitor_analysis_text, selected_tone, selected_audience, st.session_state.current_model_id
510
  )
511
  generated_content = run_llm_generation(st.session_state.current_model_pipeline, gen_prompt, max_gen_tokens)
512
+ st.session_state.generated_content = generated_content
513
 
514
  if generated_content:
515
  generation_status.update(label="βœ… Content Generation Complete!", state="complete")
  else:
517
  generation_status.update(label="❌ Content Generation Failed.", state="error")
518
  st.stop() # Stop if main content fails
519
 
520
+ # --- Display Outputs (Outside the button click conditional) ---
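+ # Rendering from session state keeps results visible across Streamlit reruns (e.g., after clicking other buttons).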
521
  if st.session_state.generated_content:
522
  st.markdown("---")
523
  st.subheader("πŸ“ Generated SEO Content")
524
  st.markdown(st.session_state.generated_content)
525
+ st.text_area("Copyable Markdown:", st.session_state.generated_content, height=400, key="generated_content_area_display")
526
 
527
+ # --- Internal Linking Section ---
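+ # Link suggestions use a separate, shorter LLM call and are stored in session state alongside the article.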
528
+ if st.session_state.last_website_url: # Only show if URL was provided
529
  st.markdown("---")
530
  st.subheader("πŸ”— Internal Linking Suggestions")
531
+ if st.button("Suggest Internal Links", key="suggest_links_button_display"):
532
+ link_status = st.status(f"Analyzing content for link opportunities ({st.session_state.current_model_id})...")
533
+ with link_status:
534
+ st.write(f"Website context: {st.session_state.last_website_url}")
535
+ link_prompt = build_internal_link_prompt(st.session_state.generated_content, keyword, st.session_state.last_website_url)
536
+ link_suggestions = run_llm_generation(st.session_state.current_model_pipeline, link_prompt, max_tokens=500) # Use fewer tokens
537
+ st.session_state.internal_link_suggestions = link_suggestions
538
+ if link_suggestions: link_status.update(label="βœ… Link suggestions generated!", state="complete")
539
+ else: link_status.update(label="❌ Failed to generate link suggestions.", state="error")
540
+
541
+ # Display suggestions if they exist in state
542
  if st.session_state.internal_link_suggestions:
543
  st.markdown(st.session_state.internal_link_suggestions)
544
+ st.info("ℹ️ AI suggestions only. Verify relevance and find actual URLs on your site.")
545
  else:
546
+ st.markdown("---")
547
+ st.info("Provide your website URL in the sidebar to enable internal link suggestions after generating content.")