rsm-roguchi committed
Commit 60c1d6e · 1 Parent(s): 93589c3

docker changes

Files changed (3)
  1. Dockerfile +8 -2
  2. code/llm_connect.py +2 -2
  3. server/blog.py +175 -117
Dockerfile CHANGED
```diff
@@ -2,14 +2,20 @@
 FROM mcr.microsoft.com/playwright/python:v1.53.0-noble
 
 # Optional system extras
+# FIXED: Added 'build-essential' and 'python3-dev' to allow compiling C libraries like SHAP
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    ffmpeg fonts-noto-color-emoji fonts-liberation \
+    build-essential \
+    python3-dev \
+    ffmpeg \
+    fonts-noto-color-emoji \
+    fonts-liberation \
     && rm -rf /var/lib/apt/lists/*
 
 WORKDIR /app
 
 # Python deps
 COPY requirements.txt .
+
 # Make sure requirements.txt has: shap==0.48.0
 RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt
 
@@ -20,4 +26,4 @@ RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt
 COPY . .
 
 EXPOSE 7860
-CMD ["shiny", "run", "--host", "0.0.0.0", "--port", "7860", "app:app"]
+CMD ["shiny", "run", "--host", "0.0.0.0", "--port", "7860", "app:app"]
```
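The point of adding build-essential and python3-dev is to let pip compile C extensions such as shap (pinned to 0.48.0 per the comment) when no prebuilt wheel matches the image. A minimal smoke test, hypothetical and not part of this commit, that could run inside the built image:

```python
# smoke_test.py (hypothetical check, not part of this commit).
# If the compiler toolchain were missing and pip had to build shap from
# source, the install would have failed; importing it here confirms the
# compiled extension actually loads inside the image.
import shap

print(f"shap {shap.__version__} imported successfully")
```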
code/llm_connect.py CHANGED
```diff
@@ -91,7 +91,7 @@ def test_llama_connection(api_key: str, timeout: int = 20) -> bool:
 
 def query_gemini(
     messages: List[dict],
-    model: str = "gemini-2.0-flash",
+    model: str = "gemma-3-12b-it",
     max_tokens: int = 4000,
     temperature: int = 0.4,
     api_key: str = "",
@@ -173,7 +173,7 @@ def get_response(
             api_key=os.getenv("GEMINI_API_KEY"),
             temperature=temperature,
             max_tokens=max_tokens,
-            model=model_name if model_name else 'gemini-2.0-flash'
+            model=model_name if model_name else 'gemma-3-12b-it'
         )
     else:
         raise ValueError("LLM: Invalid LLM specified")
```
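Both defaults now point at gemma-3-12b-it, so any caller that omits the model silently switches models. A hedged sketch of the fallback path through get_response, using only the parameters visible in this diff (the exact signature may differ):

```python
# Hypothetical call (assumes GEMINI_API_KEY is set in the environment;
# keyword names inferred from this diff). With model_name omitted, the
# call now resolves to 'gemma-3-12b-it'; pass
# model_name="gemini-2.0-flash" explicitly to keep the old model.
summary = get_response(
    input="Summarize this article in one line.",
    template=lambda x: x.strip(),
    llm="gemini",
    md=False,
    temperature=0.4,
    max_tokens=100,
)
```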
server/blog.py CHANGED
```diff
@@ -40,6 +40,57 @@ async def scrape_div_content_from_url(url: str) -> str:
         print(f"[ERROR] Failed to render or scrape: {e}")
         return ""
 
+# === Step 6: Semantic Validation (The "Double Check") ===
+# ==========================================
+# 1. HELPER: Semantic Keyword Validation (Fixed)
+# ==========================================
+def filter_irrelevant_keywords(keywords: list, article_text: str) -> list:
+    print(f"[INFO] Validating {len(keywords)} keywords for relevance...")
+
+    validation_prompt = (
+        f"Role: You are an elite SEO Editor.\n"
+        f"Task: Review the list of keywords below against the provided Article Content.\n"
+        f"Action: REMOVE any keywords that are irrelevant, hallucinatory, or completely off-topic.\n"
+        f"Criteria: Keep specific, long-tail, and topically related keywords. Remove generic terms.\n\n"
+        f"--- KEYWORDS TO REVIEW ---\n"
+        f"{', '.join(keywords)}\n\n"
+        f"--- ARTICLE CONTEXT ---\n"
+        f"{article_text[:1500]}\n\n"
+        f"OUTPUT FORMAT:\n"
+        f"Return the CLEANED list as a simple BULLET LIST (one per line).\n"
+        f"Example:\n- keyword one\n- keyword two"
+    )
+
+    try:
+        validated_raw = get_response(
+            input=validation_prompt,
+            template=lambda x: x.strip(),
+            llm="gemini",
+            md=False,
+            temperature=0.1,
+            max_tokens=1000
+        )
+
+        # Robust Line-by-Line Parsing (No more SyntaxErrors)
+        clean_list = []
+        for line in validated_raw.split("\n"):
+            # Strip bullets (*, -) and surrounding whitespace
+            clean_item = line.lstrip("*- ").strip().lower()
+
+            # Basic sanity checks to avoid empty lines or conversational filler
+            if clean_item and len(clean_item) > 2 and "here are" not in clean_item:
+                clean_list.append(clean_item)
+
+        dropped_count = len(keywords) - len(clean_list)
+        if dropped_count > 0:
+            print(f"[INFO] Validation removed {dropped_count} irrelevant keywords.")
+
+        return clean_list
+
+    except Exception as e:
+        print(f"[WARN] Validation failed: {e}. Returning originals.")
+        return keywords
+
 # === Async keyword + scrape + fallback logic ===
 async def get_keywords_and_content(url: str, top_n=5, llm_n=25):
     scraped_text = await scrape_div_content_from_url(url)
```
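The important change in the helper is the parsing strategy: rather than requesting a Python list and eval'ing it (the old Step 3 approach, which could raise SyntaxError on malformed replies), it requests bullets and strips them line by line. A standalone demo of that parsing on an invented, typically chatty reply:

```python
# Invented sample reply; demonstrates the same line-by-line cleanup
# used in filter_irrelevant_keywords.
sample_reply = """Here are the cleaned keywords:
- vintage charizard value
* graded card pricing
- psa 10 umbreon
"""

clean_list = []
for line in sample_reply.split("\n"):
    clean_item = line.lstrip("*- ").strip().lower()
    if clean_item and len(clean_item) > 2 and "here are" not in clean_item:
        clean_list.append(clean_item)

print(clean_list)
# ['vintage charizard value', 'graded card pricing', 'psa 10 umbreon']
```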
```diff
@@ -50,124 +101,101 @@ async def get_keywords_and_content(url: str, top_n=5, llm_n=25):
     # === Step 1: Extract condensed topic keywords ===
     try:
         condensed_prompt = (
-            "Extract exactly 5 to 7 Google search phrases from the content below that reflect real user search intent. "
-            "Each phrase should describe a specific product, use case, or collector topic — not generic brands or categories.\n\n"
-            "⚠️ Rules:\n"
-            "- Each phrase must be 2 to 5 words\n"
-            "- All phrases must be lowercase and ASCII-only\n"
-            "- Do NOT include apostrophes, single quotes, or quotation marks — rewrite or skip any phrases that contain them\n"
-            "- Do NOT include single words or overly broad terms like 'pokemon'\n"
-            "- Do NOT return line breaks, bullet points, or list formatting\n\n"
-            "✅ Output format:\n"
-            "Return a single comma-separated string of keyword phrases, with no brackets, no quotes, and no explanation.\n"
-            "Example output:\n"
-            "vintage charizard value, graded card pricing, rare booster packs, psa 10 umbreon, tcg price trends\n\n"
-            f"Content:\n{scraped_text}"
+            "You are an SEO expert. Identify exactly 5 distinct main topics from the text below.\n"
+            "Format: Return a BULLET LIST only.\n"
+            "Rules: NO intro text. NO numbering. NO explanations.\n"
+            f"TEXT TO ANALYZE:\n{scraped_text[:3000]}"
         )
 
-
         condensed_topic_raw = get_response(
             input=condensed_prompt,
             template=lambda x: x.strip(),
             llm="gemini",
             md=False,
-            temperature=0.6,
-            max_tokens=100
+            temperature=0.3,  # Lower temp = less chatty
+            max_tokens=200
         )
-        print(condensed_topic_raw)
+
+        # Cleaner parsing logic
+        condensed_topic = []
+        for line in condensed_topic_raw.split("\n"):
+            clean = line.replace("*", "").replace("-", "").strip().lower()
+            if clean and "here are" not in clean:
+                condensed_topic.append(clean)
 
-        # Parse comma-separated string
-        condensed_topic = [kw.strip() for kw in condensed_topic_raw.split(",") if kw.strip()]
+        if len(condensed_topic) < 2:
+            condensed_topic = [k.strip() for k in condensed_topic_raw.split(",") if k.strip()]
 
-        if not condensed_topic:
-            condensed_topic = ["trading cards"]
-
-        print(f"[INFO] Condensed topic keywords: {condensed_topic}")
     except Exception as e:
-        print(f"[WARN] Could not infer topics: {e}")
         condensed_topic = ["trading cards"]
 
-    # === Step 2: Pull suggestions from PyTrends ===
-    time.sleep(3)
+    # === Step 2: PyTrends Logic (Fixed) ===
+    print(f"[INFO] Starting PyTrends for topics: {condensed_topic[:3]}")
     all_suggestions = set()
+
+    # FIX: Initialize with retries=0 to bypass the 'method_whitelist' crash
+    # We will handle retries manually in the loop below.
     try:
-        pytrends = TrendReq(hl="en-US", tz=360, timeout=10)
-        for topic in condensed_topic:
-            time.sleep(5)
-            suggestions = pytrends.suggestions(keyword=topic)
-            if suggestions:
-                titles = [s["title"] for s in suggestions]
-                all_suggestions.update(titles)
-                print(f"[INFO] Suggestions for '{topic}': {titles[:3]}")
+        pytrends = TrendReq(hl="en-US", tz=360, timeout=10, retries=0)
     except Exception as e:
-        print(f"[WARN] PyTrends suggestions failed: {e}")
-
-    all_suggestions = list(all_suggestions)
-
-    # === Step 3: Let Gemini filter suggestions for relevance ===
-    filtered_keywords = []
-    if all_suggestions:
-        filter_prompt = (
-            f"The following article was scraped:\n\n{scraped_text[:1500]}\n\n"
-            f"Here is a list of keyword suggestions:\n{all_suggestions}\n\n"
-            "Return only the keywords that are clearly relevant to the article topic. "
-            "Return a valid Python list of strings only. No explanation, bullets, or formatting."
-        )
+        print(f"[ERROR] Could not initialize PyTrends: {e}")
+        pytrends = None
 
-        raw_filtered = get_response(
-            input=filter_prompt,
-            template=lambda x: x.strip(),
-            llm="gemini",
-            md=False,
-            temperature=0.3,
-            max_tokens=200
-        )
+    if pytrends:
+        for topic in condensed_topic[:3]:
+            print(f"[INFO] Querying PyTrends for: '{topic}'...")
+
+            # Manual Retry Logic (since we disabled the internal one)
+            for attempt in range(3):
+                try:
+                    # Sleep to prevent 429 Too Many Requests
+                    time.sleep(2)
+
+                    suggestions = pytrends.suggestions(keyword=topic)
+
+                    if suggestions:
+                        titles = [s["title"].lower().strip() for s in suggestions]
+                        print(f" -> Found {len(titles)} suggestions: {titles}")
+                        all_suggestions.update(titles)
+                        break  # Success, stop retrying this keyword
+                    else:
+                        print(" -> No suggestions found.")
+                        break  # No data, stop retrying
+
+                except Exception as inner_e:
+                    # If it's a 429 error, wait longer and try again
+                    if "429" in str(inner_e):
+                        print(f" -> [WARN] Rate limited on '{topic}'. Waiting 5s...")
+                        time.sleep(5)
+                    else:
+                        print(f" -> [WARN] Failed for '{topic}' (Attempt {attempt+1}/3): {inner_e}")
+                        if attempt == 2:  # Last attempt failed
+                            print(" -> Giving up on this keyword.")
+
+    # Convert set to list
+    combined_keywords = list(all_suggestions)
+
+    if not combined_keywords:
+        print("[INFO] PyTrends returned 0 results. Switching to LLM Fallback.")
+    else:
+        print(f"[INFO] PyTrends successful. Total keywords: {len(combined_keywords)}")
 
-        match = re.search(r"\[.*?\]", raw_filtered)
-        if match:
-            try:
-                filtered_keywords = ast.literal_eval(match.group(0))
-            except:
-                filtered_keywords = []
-
-    # === Step 4: Fallback to Gemini keyword generation if needed ===
-    if not filtered_keywords:
-        fallback_prompt = (
-            f"You are an SEO expert. Generate {llm_n} niche-relevant SEO keywords "
-            f"based on this content:\n\n{scraped_text}\n\n"
-            "Return a comma-separated list of lowercase 2–5 word search phrases. No formatting."
-        )
-        fallback_keywords_raw = get_response(
-            input=fallback_prompt,
-            template=lambda x: x.strip(),
-            llm="gemini",
-            md=False,
-            temperature=0.7,
-            max_tokens=400
-        )
-        filtered_keywords = [kw.strip() for kw in fallback_keywords_raw.split(",") if kw.strip()]
-        print(f"[INFO] Fallback keywords used: {filtered_keywords[:top_n]}")
+    # === Step 3: Fallback / Filtering ===
+    # If PyTrends gave results, we trust them. If not, we use LLM.
+    combined_keywords = list(all_suggestions)
 
-    # === Step 5: Enforce minimum of 30 keywords ===
-    time.sleep(3)
-    combined_keywords = list(dict.fromkeys(filtered_keywords))  # remove duplicates
+    # === Step 4: Padding (The Fix for "Okay here are...") ===
     if len(combined_keywords) < 30:
-        needed = 30 - len(combined_keywords)
-        print(f"[INFO] Need {needed} more keywords to reach 30. Using Gemini to pad.")
-
+        needed = 35 - len(combined_keywords)
+
         pad_prompt = (
-            f"The following article content is missing SEO keyword coverage:\n\n"
-            f"{scraped_text}\n\n"
-            f"Generate exactly {needed} additional SEO keyword phrases.\n"
-            "Each keyword must:\n"
-            "- be 2 to 5 words long\n"
-            "- be lowercase only\n"
-            "- use ASCII characters only (no symbols or accents)\n"
-            "- be clearly relevant to the article\n"
-            "- avoid generic terms like 'pokemon'\n\n"
-            "Return only the keywords as a single comma-separated string, with no extra formatting or explanation.\n"
-            "Example output:\n"
-            "keyword one, keyword two, keyword three"
+            f"Generate exactly {needed} NEW, DISTINCT long-tail SEO keywords based on this text.\n"
+            f"STRICT OUTPUT RULES:\n"
+            f"1. Return ONLY a raw bullet list (one keyword per line).\n"
+            f"2. DO NOT write 'Here are the keywords'.\n"
+            f"3. DO NOT add parentheses or explanations like '(best for beginners)'.\n"
+            f"4. Just the keywords.\n\n"
+            f"Context:\n{scraped_text[:2500]}"
         )
 
         pad_raw = get_response(
```
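The retries=0 workaround targets a real pytrends incompatibility: when retries > 0, pytrends builds a urllib3 Retry with the method_whitelist keyword, which urllib3 renamed to allowed_methods in 1.26 and removed in 2.0. A minimal reproduction of the crash being bypassed, assuming urllib3 2.x is installed:

```python
# Reproduces the TypeError that retries=0 sidesteps (urllib3 >= 2.0).
from urllib3.util.retry import Retry

Retry(total=3, allowed_methods=frozenset(["GET", "POST"]))  # current keyword: fine
try:
    Retry(total=3, method_whitelist=frozenset(["GET", "POST"]))  # what pytrends passes
except TypeError as exc:
    print(f"urllib3 2.x rejects method_whitelist: {exc}")
```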
```diff
@@ -175,23 +203,41 @@ async def get_keywords_and_content(url: str, top_n=5, llm_n=25):
             template=lambda x: x.strip(),
             llm="gemini",
             md=False,
-            temperature=0.7,
-            max_tokens=200
+            temperature=0.5,  # Lower temp prevents hallucinated explanations
+            max_tokens=1000
         )
 
         pad_keywords = []
-        print(pad_raw)
-
-        try:
-            pad_keywords = [kw.strip() for kw in pad_raw.split(",") if kw.strip()]
-        except Exception as e:
-            print(f"[WARN] Keyword parsing failed: {e}")
-            pad_keywords = []
-
-        combined_keywords = list(dict.fromkeys(combined_keywords + pad_keywords))
-        print(f"[INFO] Padded {len(pad_keywords)} keywords:", pad_keywords)
+        for line in pad_raw.split("\n"):
+            # remove bullets
+            clean_line = line.strip().lstrip("*-+1234567890. ").strip()
+
+            # remove parenthetical explanations using regex
+            # e.g., "op13 cards (rare)" -> "op13 cards"
+            clean_line = re.sub(r"\(.*?\)", "", clean_line).strip()
+
+            # Filter out chatty lines
+            if (len(clean_line) > 3
+                    and "here are" not in clean_line.lower()
+                    and "formatted as" not in clean_line.lower()
+                    and ":" not in clean_line):
+
+                pad_keywords.append(clean_line.lower())
+
+        combined_keywords = list(set(combined_keywords + pad_keywords))
+
+    # Double check relevance before returning
+    if len(combined_keywords) > 10:
+        validated_keywords = filter_irrelevant_keywords(combined_keywords, scraped_text)
+    else:
+        validated_keywords = combined_keywords
+
+    # Fallback if validation was too aggressive
+    if len(validated_keywords) < 10:
+        validated_keywords = combined_keywords
 
-    return combined_keywords[:30], scraped_text
+    print(f"[INFO] Final validated count: {len(validated_keywords)}")
+    return validated_keywords[:30], scraped_text
 
 
 
```
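The new pad parsing is what fixes the "Okay here are..." failure mode: bullets and numbering are stripped, parenthetical asides are cut with a regex, and conversational lines are filtered out. A standalone demo on an invented reply:

```python
import re

# Invented sample reply; mirrors the pad-keyword cleanup above.
raw = """Okay, here are the keywords:
1. op13 cards (rare)
- psa 10 umbreon prices
* Note: formatted as requested
"""

pad_keywords = []
for line in raw.split("\n"):
    clean_line = line.strip().lstrip("*-+1234567890. ").strip()
    clean_line = re.sub(r"\(.*?\)", "", clean_line).strip()
    if (len(clean_line) > 3
            and "here are" not in clean_line.lower()
            and "formatted as" not in clean_line.lower()
            and ":" not in clean_line):
        pad_keywords.append(clean_line.lower())

print(pad_keywords)  # ['op13 cards', 'psa 10 umbreon prices']
```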
 
```diff
@@ -236,19 +282,31 @@ def server(input, output, session):
     keyword_str = ", ".join(keywords)
 
     # Title generation from scraped text
+    # Title generation with stricter prompt
    infer_topic_prompt = (
-        f"Based on the following article content:\n\n{scraped[:2000]}\n\n"
-        f"Return a short, descriptive blog post title (max 70 characters)."
-        f"Return ONLY the TITLE"
+        f"Write ONE catchy, click-worthy H1 Blog Title for the content below.\n"
+        f"STRICT RULES:\n"
+        f"- Return ONLY the title string.\n"
+        f"- Do NOT write 'Title:' or 'Here is a title'.\n"
+        f"- Do NOT use quotation marks.\n"
+        f"- Max 15 words.\n\n"
+        f"Content:\n{scraped[:2000]}"
     )
-    seo_title = get_response(
+
+    seo_title_raw = get_response(
         input=infer_topic_prompt,
-        template=lambda x: x.strip().replace('"', ''),
+        template=lambda x: x.strip(),
         llm="gemini",
         md=False,
-        temperature=0.5,
-        max_tokens=20
+        temperature=0.7,
+        max_tokens=60
     )
+
+    # Cleanup: Remove quotes and "Title:" prefix if the LLM ignores rules
+    seo_title = seo_title_raw.replace('"', '').replace("Title:", "").strip()
+    # If it gave multiple options (detected by newlines), take the first one
+    if "\n" in seo_title:
+        seo_title = seo_title.split("\n")[0].strip()
 
     # Blog generation with injected SEO
     prompt = (
```
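Even with the stricter prompt, the post-processing is the safety net: quotes and a "Title:" prefix are stripped, and if the model returns several options the first line wins. A quick demo on an invented raw reply:

```python
# Invented raw reply; same cleanup steps as the diff above.
seo_title_raw = 'Title: "Top Vintage Charizard Cards Worth Grading"\nOr: Grading Guide'

seo_title = seo_title_raw.replace('"', '').replace("Title:", "").strip()
if "\n" in seo_title:
    seo_title = seo_title.split("\n")[0].strip()

print(seo_title)  # Top Vintage Charizard Cards Worth Grading
```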
 