rsm-roguchi committed
Commit · 60c1d6e
Parent(s): 93589c3

docker changes

Files changed:
- Dockerfile +8 -2
- code/llm_connect.py +2 -2
- server/blog.py +175 -117
Dockerfile
CHANGED

@@ -2,14 +2,20 @@
 FROM mcr.microsoft.com/playwright/python:v1.53.0-noble

 # Optional system extras
+# FIXED: Added 'build-essential' and 'python3-dev' to allow compiling C libraries like SHAP
 RUN apt-get update && apt-get install -y --no-install-recommends \
-
+    build-essential \
+    python3-dev \
+    ffmpeg \
+    fonts-noto-color-emoji \
+    fonts-liberation \
  && rm -rf /var/lib/apt/lists/*

 WORKDIR /app

 # Python deps
 COPY requirements.txt .
+
 # Make sure requirements.txt has: shap==0.48.0
 RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt

@@ -20,4 +26,4 @@ RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt
 COPY . .

 EXPOSE 7860
-CMD ["shiny", "run", "--host", "0.0.0.0", "--port", "7860", "app:app"]
+CMD ["shiny", "run", "--host", "0.0.0.0", "--port", "7860", "app:app"]
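The new build-essential and python3-dev packages are there so pip can compile SHAP's C extension against the Playwright base image. A minimal smoke test, assuming shap and scikit-learn are both in requirements.txt and that the script and image names are placeholders, that would confirm the compiled extension imports and runs inside the built container:

# smoke_test_shap.py -- hypothetical check, e.g. run as:
#   docker run --rm <image> python smoke_test_shap.py
import numpy as np
import shap
from sklearn.ensemble import RandomForestRegressor

# Tiny synthetic problem; enough to exercise the compiled tree explainer.
rng = np.random.default_rng(0)
X = rng.random((100, 4))
y = X[:, 0] * 2.0 + X[:, 1]

model = RandomForestRegressor(n_estimators=10, random_state=0).fit(X, y)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X[:5])

print("shap", shap.__version__, "OK, values shape:", np.asarray(shap_values).shape)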
code/llm_connect.py
CHANGED

@@ -91,7 +91,7 @@ def test_llama_connection(api_key: str, timeout: int = 20) -> bool:

 def query_gemini(
     messages: List[dict],
-    model: str = "
+    model: str = "gemma-3-12b-it",
     max_tokens: int = 4000,
     temperature: int = 0.4,
     api_key: str = "",

@@ -173,7 +173,7 @@ def get_response(
         api_key=os.getenv("GEMINI_API_KEY"),
         temperature=temperature,
         max_tokens=max_tokens,
-        model=model_name if model_name else '
+        model=model_name if model_name else 'gemma-3-12b-it'
     )
 else:
     raise ValueError("LLM: Invalid LLM specified")
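Both hunks pin the same default, so the two call paths stay in sync: an explicit model_name wins, and None or an empty string falls through to gemma-3-12b-it. A standalone sketch of that resolution rule (the helper name is illustrative, not from the repo); note, too, that the unchanged temperature: int = 0.4 hint would more accurately be float:

from typing import Optional

DEFAULT_GEMINI_MODEL = "gemma-3-12b-it"  # default pinned by this commit

def resolve_model(model_name: Optional[str] = None) -> str:
    # None and "" are both falsy, so either falls back to the default,
    # mirroring: model=model_name if model_name else 'gemma-3-12b-it'
    return model_name if model_name else DEFAULT_GEMINI_MODEL

assert resolve_model(None) == DEFAULT_GEMINI_MODEL
assert resolve_model("") == DEFAULT_GEMINI_MODEL
assert resolve_model("gemini-1.5-flash") == "gemini-1.5-flash"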
server/blog.py
CHANGED

@@ -40,6 +40,57 @@ async def scrape_div_content_from_url(url: str) -> str:
         print(f"[ERROR] Failed to render or scrape: {e}")
         return ""

+# === Step 6: Semantic Validation (The "Double Check") ===
+# ==========================================
+# 1. HELPER: Semantic Keyword Validation (Fixed)
+# ==========================================
+def filter_irrelevant_keywords(keywords: list, article_text: str) -> list:
+    print(f"[INFO] Validating {len(keywords)} keywords for relevance...")
+
+    validation_prompt = (
+        f"Role: You are an elite SEO Editor.\n"
+        f"Task: Review the list of keywords below against the provided Article Content.\n"
+        f"Action: REMOVE any keywords that are irrelevant, hallucinatory, or completely off-topic.\n"
+        f"Criteria: Keep specific, long-tail, and topically related keywords. Remove generic terms.\n\n"
+        f"--- KEYWORDS TO REVIEW ---\n"
+        f"{', '.join(keywords)}\n\n"
+        f"--- ARTICLE CONTEXT ---\n"
+        f"{article_text[:1500]}\n\n"
+        f"OUTPUT FORMAT:\n"
+        f"Return the CLEANED list as a simple BULLET LIST (one per line).\n"
+        f"Example:\n- keyword one\n- keyword two"
+    )
+
+    try:
+        validated_raw = get_response(
+            input=validation_prompt,
+            template=lambda x: x.strip(),
+            llm="gemini",
+            md=False,
+            temperature=0.1,
+            max_tokens=1000
+        )
+
+        # Robust Line-by-Line Parsing (No more SyntaxErrors)
+        clean_list = []
+        for line in validated_raw.split("\n"):
+            # Strip bullets (*, -) and surrounding whitespace
+            clean_item = line.lstrip("*- ").strip().lower()
+
+            # Basic sanity checks to avoid empty lines or conversational filler
+            if clean_item and len(clean_item) > 2 and "here are" not in clean_item:
+                clean_list.append(clean_item)
+
+        dropped_count = len(keywords) - len(clean_list)
+        if dropped_count > 0:
+            print(f"[INFO] Validation removed {dropped_count} irrelevant keywords.")
+
+        return clean_list
+
+    except Exception as e:
+        print(f"[WARN] Validation failed: {e}. Returning originals.")
+        return keywords
+
 # === Async keyword + scrape + fallback logic ===
 async def get_keywords_and_content(url: str, top_n=5, llm_n=25):
     scraped_text = await scrape_div_content_from_url(url)
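The helper's bullet-list contract plus line-by-line parse replaces the ast.literal_eval call that hunk two below deletes, which raised SyntaxError whenever Gemini answered with anything but a clean Python list. A standalone trace of the same stripping rules (the sample reply is made up):

# Made-up model reply; mirrors the parsing inside filter_irrelevant_keywords.
sample = (
    "Here are the cleaned keywords:\n"
    "* Vintage Charizard Value\n"
    "- PSA 10 Umbreon\n"
    "\n"
    "* TCG Price Trends"
)

clean_list = []
for line in sample.split("\n"):
    clean_item = line.lstrip("*- ").strip().lower()
    if clean_item and len(clean_item) > 2 and "here are" not in clean_item:
        clean_list.append(clean_item)

print(clean_list)
# ['vintage charizard value', 'psa 10 umbreon', 'tcg price trends']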
@@ -50,124 +101,101 @@ async def get_keywords_and_content(url: str, top_n=5, llm_n=25):
     # === Step 1: Extract condensed topic keywords ===
     try:
         condensed_prompt = (
-            "
-            "
-            "
-            "
-            "- All phrases must be lowercase and ASCII-only\n"
-            "- Do NOT include apostrophes, single quotes, or quotation marks — rewrite or skip any phrases that contain them\n"
-            "- Do NOT include single words or overly broad terms like 'pokemon'\n"
-            "- Do NOT return line breaks, bullet points, or list formatting\n\n"
-            "✅ Output format:\n"
-            "Return a single comma-separated string of keyword phrases, with no brackets, no quotes, and no explanation.\n"
-            "Example output:\n"
-            "vintage charizard value, graded card pricing, rare booster packs, psa 10 umbreon, tcg price trends\n\n"
-            f"Content:\n{scraped_text}"
+            "You are an SEO expert. Identify exactly 5 distinct main topics from the text below.\n"
+            "Format: Return a BULLET LIST only.\n"
+            "Rules: NO intro text. NO numbering. NO explanations.\n"
+            f"TEXT TO ANALYZE:\n{scraped_text[:3000]}"
         )

-
         condensed_topic_raw = get_response(
             input=condensed_prompt,
             template=lambda x: x.strip(),
             llm="gemini",
             md=False,
-            temperature=0.
-            max_tokens=
+            temperature=0.3,  # Lower temp = less chatty
+            max_tokens=200
         )
-
-
-
+
+        # Cleaner parsing logic
+        condensed_topic = []
+        for line in condensed_topic_raw.split("\n"):
+            clean = line.replace("*", "").replace("-", "").strip().lower()
+            if clean and "here are" not in clean:
+                condensed_topic.append(clean)

-        if not condensed_topic:
-            condensed_topic = ["trading cards"]
-
-        print(f"[INFO] Condensed topic keywords: {condensed_topic}")
+        if len(condensed_topic) < 2:
+            condensed_topic = [k.strip() for k in condensed_topic_raw.split(",") if k.strip()]

     except Exception as e:
-        print(f"[WARN] Could not infer topics: {e}")
         condensed_topic = ["trading cards"]

-    # === Step 2:
-
+    # === Step 2: PyTrends Logic (Fixed) ===
+    print(f"[INFO] Starting PyTrends for topics: {condensed_topic[:3]}")
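Step 1's stricter prompt and lower temperature trim the conversational filler, and the parser then strips bullet characters before lowercasing, with a comma split as fallback when fewer than two topics survive. One caveat: replace("-", "") also deletes in-word hyphens, so "long-tail" becomes "longtail". A standalone trace of both paths (function name and samples invented):

def parse_topics(raw: str) -> list:
    # Mirrors the Step 1 logic: strip bullet characters, lowercase,
    # drop blank lines and "here are..." filler.
    topics = []
    for line in raw.split("\n"):
        clean = line.replace("*", "").replace("-", "").strip().lower()
        if clean and "here are" not in clean:
            topics.append(clean)
    # Fallback: if the model ignored the bullet format, try comma-splitting.
    if len(topics) < 2:
        topics = [k.strip() for k in raw.split(",") if k.strip()]
    return topics

print(parse_topics("* Graded Card Pricing\n* Booster Box Values"))
# ['graded card pricing', 'booster box values']
print(parse_topics("graded card pricing, booster box values"))
# ['graded card pricing', 'booster box values']  (comma fallback path)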
     all_suggestions = set()
+
+    # FIX: Initialize with retries=0 to bypass the 'method_whitelist' crash
+    # We will handle retries manually in the loop below.
     try:
-        pytrends = TrendReq(hl="en-US", tz=360, timeout=10)
-        for topic in condensed_topic:
-            time.sleep(5)
-            suggestions = pytrends.suggestions(keyword=topic)
-            if suggestions:
-                titles = [s["title"] for s in suggestions]
-                all_suggestions.update(titles)
-                print(f"[INFO] Suggestions for '{topic}': {titles[:3]}")
+        pytrends = TrendReq(hl="en-US", tz=360, timeout=10, retries=0)
     except Exception as e:
-        print(f"[
-
-    all_suggestions = list(all_suggestions)
-
+        print(f"[ERROR] Could not initialize PyTrends: {e}")
+        pytrends = None
+
+    if pytrends:
+        for topic in condensed_topic[:3]:
+            print(f"[INFO] Querying PyTrends for: '{topic}'...")
+
+            # Manual Retry Logic (since we disabled the internal one)
+            for attempt in range(3):
+                try:
+                    # Sleep to prevent 429 Too Many Requests
+                    time.sleep(2)
+
+                    suggestions = pytrends.suggestions(keyword=topic)
+
+                    if suggestions:
+                        titles = [s["title"].lower().strip() for s in suggestions]
+                        print(f" -> Found {len(titles)} suggestions: {titles}")
+                        all_suggestions.update(titles)
+                        break  # Success, stop retrying this keyword
+                    else:
+                        print(" -> No suggestions found.")
+                        break  # No data, stop retrying
+
+                except Exception as inner_e:
+                    # If it's a 429 error, wait longer and try again
+                    if "429" in str(inner_e):
+                        print(f" -> [WARN] Rate limited on '{topic}'. Waiting 5s...")
+                        time.sleep(5)
+                    else:
+                        print(f" -> [WARN] Failed for '{topic}' (Attempt {attempt+1}/3): {inner_e}")
+                        if attempt == 2:  # Last attempt failed
+                            print(" -> Giving up on this keyword.")
+
+    # Convert set to list
+    combined_keywords = list(all_suggestions)
+
+    if not combined_keywords:
+        print("[INFO] PyTrends returned 0 results. Switching to LLM Fallback.")
+    else:
+        print(f"[INFO] PyTrends successful. Total keywords: {len(combined_keywords)}")

-    # === Step 3: Let Gemini filter suggestions for relevance ===
-    filtered_keywords = []
-    if all_suggestions:
-        filter_prompt = (
-            f"The following article was scraped:\n\n{scraped_text[:1500]}\n\n"
-            f"Here is a list of keyword suggestions:\n{all_suggestions}\n\n"
-            "Return only the keywords that are clearly relevant to the article topic. "
-            "Return a valid Python list of strings only. No explanation, bullets, or formatting."
-        )
-
-
-
-
-
-
-
-
-
-
-
-
-            filtered_keywords = ast.literal_eval(match.group(0))
-        except:
-            filtered_keywords = []
-
-    # === Step 4: Fallback to Gemini keyword generation if needed ===
-    if not filtered_keywords:
-        fallback_prompt = (
-            f"You are an SEO expert. Generate {llm_n} niche-relevant SEO keywords "
-            f"based on this content:\n\n{scraped_text}\n\n"
-            "Return a comma-separated list of lowercase 2–5 word search phrases. No formatting."
-        )
-        fallback_keywords_raw = get_response(
-            input=fallback_prompt,
-            template=lambda x: x.strip(),
-            llm="gemini",
-            md=False,
-            temperature=0.7,
-            max_tokens=400
-        )
-        filtered_keywords = [kw.strip() for kw in fallback_keywords_raw.split(",") if kw.strip()]
-        print(f"[INFO] Fallback keywords used: {filtered_keywords[:top_n]}")
+    # === Step 3: Fallback / Filtering ===
+    # If PyTrends gave results, we trust them. If not, we use LLM.
+    combined_keywords = list(all_suggestions)

-    # === Step
-    time.sleep(3)
-    combined_keywords = list(dict.fromkeys(filtered_keywords))  # remove duplicates
+    # === Step 4: Padding (The Fix for "Okay here are...") ===
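The retries=0 change matters because pytrends' internal Retry setup still passes the method_whitelist argument that newer urllib3 releases removed, so any positive retries value crashes TrendReq at construction time; the loop above re-implements the backoff by hand. The same pattern in isolation (a generic sketch, with an assumed fn callable, not the repo's code):

import time

def with_retries(fn, attempts=3, base_delay=2, rate_limit_delay=5):
    # Generic version of the manual retry loop: short fixed delay per try,
    # longer wait when the error looks like HTTP 429, give up after N tries.
    for attempt in range(attempts):
        try:
            time.sleep(base_delay)  # spacing requests to avoid 429s
            return fn()
        except Exception as e:
            if "429" in str(e):
                time.sleep(rate_limit_delay)  # rate limited: wait longer, retry
            elif attempt == attempts - 1:
                print(f"Giving up after {attempts} attempts: {e}")
    return None

# Usage (assuming a configured TrendReq instance named `pytrends`):
# titles = with_retries(lambda: pytrends.suggestions(keyword="graded card pricing"))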
     if len(combined_keywords) < 30:
-        needed =
-
-
+        needed = 35 - len(combined_keywords)
+
         pad_prompt = (
-            f"
-            f"
-            f"
-            "
-            "
-            "
-            "
-            "- be clearly relevant to the article\n"
-            "- avoid generic terms like 'pokemon'\n\n"
-            "Return only the keywords as a single comma-separated string, with no extra formatting or explanation.\n"
-            "Example output:\n"
-            "keyword one, keyword two, keyword three"
+            f"Generate exactly {needed} NEW, DISTINCT long-tail SEO keywords based on this text.\n"
+            f"STRICT OUTPUT RULES:\n"
+            f"1. Return ONLY a raw bullet list (one keyword per line).\n"
+            f"2. DO NOT write 'Here are the keywords'.\n"
+            f"3. DO NOT add parentheses or explanations like '(best for beginners)'.\n"
+            f"4. Just the keywords.\n\n"
+            f"Context:\n{scraped_text[:2500]}"
         )

         pad_raw = get_response(
@@ -175,23 +203,41 @@ async def get_keywords_and_content(url: str, top_n=5, llm_n=25):
             template=lambda x: x.strip(),
             llm="gemini",
             md=False,
-            temperature=0.
-            max_tokens=
+            temperature=0.5,  # Lower temp prevents hallucinated explanations
+            max_tokens=1000
         )

         pad_keywords = []
-
-
-
-
-
-
-
-
-
-
+        for line in pad_raw.split("\n"):
+            # remove bullets
+            clean_line = line.strip().lstrip("*-+1234567890. ").strip()
+
+            # remove parenthetical explanations using regex
+            # e.g., "op13 cards (rare)" -> "op13 cards"
+            clean_line = re.sub(r"\(.*?\)", "", clean_line).strip()
+
+            # Filter out chatty lines
+            if (len(clean_line) > 3
+                    and "here are" not in clean_line.lower()
+                    and "formatted as" not in clean_line.lower()
+                    and ":" not in clean_line):
+                pad_keywords.append(clean_line.lower())
+
+        combined_keywords = list(set(combined_keywords + pad_keywords))

-
+    # Double check relevance before returning
+    if len(combined_keywords) > 10:
+        validated_keywords = filter_irrelevant_keywords(combined_keywords, scraped_text)
+    else:
+        validated_keywords = combined_keywords
+
+    # Fallback if validation was too aggressive
+    if len(validated_keywords) < 10:
+        validated_keywords = combined_keywords
+
+    print(f"[INFO] Final validated count: {len(validated_keywords)}")
+    return validated_keywords[:30], scraped_text
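The padding parser above de-bullets each line with lstrip("*-+1234567890. "), strips parentheticals with re.sub, and rejects anything that still looks conversational. The same rules traced on an invented chatty reply:

import re

# Invented reply; mirrors the pad-keyword cleanup in the hunk above.
raw_reply = (
    "Here are the keywords:\n"
    "1. op13 cards (rare)\n"
    "- vintage booster box values\n"
    "Formatted as requested: done\n"
)

pad_keywords = []
for line in raw_reply.split("\n"):
    clean_line = line.strip().lstrip("*-+1234567890. ").strip()
    clean_line = re.sub(r"\(.*?\)", "", clean_line).strip()
    if (len(clean_line) > 3
            and "here are" not in clean_line.lower()
            and "formatted as" not in clean_line.lower()
            and ":" not in clean_line):
        pad_keywords.append(clean_line.lower())

print(pad_keywords)  # ['op13 cards', 'vintage booster box values']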
@@ -236,19 +282,31 @@ def server(input, output, session):
         keyword_str = ", ".join(keywords)

         # Title generation from scraped text
+        # Title generation with stricter prompt
         infer_topic_prompt = (
-            f"
-            f"
-            f"Return ONLY the
+            f"Write ONE catchy, click-worthy H1 Blog Title for the content below.\n"
+            f"STRICT RULES:\n"
+            f"- Return ONLY the title string.\n"
+            f"- Do NOT write 'Title:' or 'Here is a title'.\n"
+            f"- Do NOT use quotation marks.\n"
+            f"- Max 15 words.\n\n"
+            f"Content:\n{scraped[:2000]}"
         )
-
+
+        seo_title_raw = get_response(
             input=infer_topic_prompt,
-            template=lambda x: x.strip()
+            template=lambda x: x.strip(),
             llm="gemini",
             md=False,
-            temperature=0.
-            max_tokens=
+            temperature=0.7,
+            max_tokens=60
         )
+
+        # Cleanup: Remove quotes and "Title:" prefix if the LLM ignores rules
+        seo_title = seo_title_raw.replace('"', '').replace("Title:", "").strip()
+        # If it gave multiple options (detected by newlines), take the first one
+        if "\n" in seo_title:
+            seo_title = seo_title.split("\n")[0].strip()

         # Blog generation with injected SEO
         prompt = (
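The title cleanup assumes the model may still ignore the rules: stray quotes and a leading Title: label are stripped, and if several options come back on separate lines only the first survives. The same steps traced on a fabricated response:

# Fabricated model output; mirrors the seo_title cleanup in the hunk above.
seo_title_raw = 'Title: "Top Graded Cards To Watch"\nOr maybe: Another Option'

seo_title = seo_title_raw.replace('"', '').replace("Title:", "").strip()
if "\n" in seo_title:
    seo_title = seo_title.split("\n")[0].strip()

print(seo_title)  # Top Graded Cards To Watch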
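End to end, get_keywords_and_content now returns at most 30 validated keywords together with the scraped article text. A hypothetical call site (URL invented), included because the coroutine must be awaited:

import asyncio

async def main():
    keywords, article_text = await get_keywords_and_content(
        "https://example.com/some-post", top_n=5, llm_n=25
    )
    print(f"{len(keywords)} keywords, {len(article_text)} chars of article")

asyncio.run(main())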