rsm-roguchi committed
Commit 60c1d6e · 1 Parent(s): 93589c3

docker changes

Files changed (3)
  1. Dockerfile +8 -2
  2. code/llm_connect.py +2 -2
  3. server/blog.py +175 -117
Dockerfile CHANGED
```diff
@@ -2,14 +2,20 @@
 FROM mcr.microsoft.com/playwright/python:v1.53.0-noble
 
 # Optional system extras
+# FIXED: Added 'build-essential' and 'python3-dev' to allow compiling C libraries like SHAP
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    ffmpeg fonts-noto-color-emoji fonts-liberation \
+    build-essential \
+    python3-dev \
+    ffmpeg \
+    fonts-noto-color-emoji \
+    fonts-liberation \
     && rm -rf /var/lib/apt/lists/*
 
 WORKDIR /app
 
 # Python deps
 COPY requirements.txt .
+
 # Make sure requirements.txt has: shap==0.48.0
 RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt
 
@@ -20,4 +26,4 @@ RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt
 COPY . .
 
 EXPOSE 7860
-CMD ["shiny", "run", "--host", "0.0.0.0", "--port", "7860", "app:app"]
+CMD ["shiny", "run", "--host", "0.0.0.0", "--port", "7860", "app:app"]
```
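The point of adding build-essential and python3-dev is to let pip compile C extensions such as shap (pinned to 0.48.0 per the comment) when no prebuilt wheel matches the image. A minimal smoke test, hypothetical and not part of this commit, that could run inside the built image:

```python
# smoke_test.py (hypothetical check, not part of this commit).
# If the compiler toolchain were missing and pip had to build shap from
# source, the install would have failed; importing it here confirms the
# compiled extension actually loads inside the image.
import shap

print(f"shap {shap.__version__} imported successfully")
```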
code/llm_connect.py CHANGED
```diff
@@ -91,7 +91,7 @@ def test_llama_connection(api_key: str, timeout: int = 20) -> bool:
 
 def query_gemini(
     messages: List[dict],
-    model: str = "gemini-2.0-flash",
+    model: str = "gemma-3-12b-it",
     max_tokens: int = 4000,
     temperature: int = 0.4,
     api_key: str = "",
@@ -173,7 +173,7 @@ def get_response(
             api_key=os.getenv("GEMINI_API_KEY"),
             temperature=temperature,
             max_tokens=max_tokens,
-            model=model_name if model_name else 'gemini-2.0-flash'
+            model=model_name if model_name else 'gemma-3-12b-it'
         )
     else:
         raise ValueError("LLM: Invalid LLM specified")
```
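Both defaults now point at gemma-3-12b-it, so any caller that omits the model silently switches models. A hedged sketch of the fallback path through get_response, using only the parameters visible in this diff (the exact signature may differ):

```python
# Hypothetical call (assumes GEMINI_API_KEY is set in the environment;
# keyword names inferred from this diff). With model_name omitted, the
# call now resolves to 'gemma-3-12b-it'; pass
# model_name="gemini-2.0-flash" explicitly to keep the old model.
summary = get_response(
    input="Summarize this article in one line.",
    template=lambda x: x.strip(),
    llm="gemini",
    md=False,
    temperature=0.4,
    max_tokens=100,
)
```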
server/blog.py CHANGED
```diff
@@ -40,6 +40,57 @@ async def scrape_div_content_from_url(url: str) -> str:
         print(f"[ERROR] Failed to render or scrape: {e}")
         return ""
 
+# === Step 6: Semantic Validation (The "Double Check") ===
+# ==========================================
+# 1. HELPER: Semantic Keyword Validation (Fixed)
+# ==========================================
+def filter_irrelevant_keywords(keywords: list, article_text: str) -> list:
+    print(f"[INFO] Validating {len(keywords)} keywords for relevance...")
+
+    validation_prompt = (
+        f"Role: You are an elite SEO Editor.\n"
+        f"Task: Review the list of keywords below against the provided Article Content.\n"
+        f"Action: REMOVE any keywords that are irrelevant, hallucinatory, or completely off-topic.\n"
+        f"Criteria: Keep specific, long-tail, and topically related keywords. Remove generic terms.\n\n"
+        f"--- KEYWORDS TO REVIEW ---\n"
+        f"{', '.join(keywords)}\n\n"
+        f"--- ARTICLE CONTEXT ---\n"
+        f"{article_text[:1500]}\n\n"
+        f"OUTPUT FORMAT:\n"
+        f"Return the CLEANED list as a simple BULLET LIST (one per line).\n"
+        f"Example:\n- keyword one\n- keyword two"
+    )
+
+    try:
+        validated_raw = get_response(
+            input=validation_prompt,
+            template=lambda x: x.strip(),
+            llm="gemini",
+            md=False,
+            temperature=0.1,
+            max_tokens=1000
+        )
+
+        # Robust Line-by-Line Parsing (No more SyntaxErrors)
+        clean_list = []
+        for line in validated_raw.split("\n"):
+            # Strip bullets (*, -) and surrounding whitespace
+            clean_item = line.lstrip("*- ").strip().lower()
+
+            # Basic sanity checks to avoid empty lines or conversational filler
+            if clean_item and len(clean_item) > 2 and "here are" not in clean_item:
+                clean_list.append(clean_item)
+
+        dropped_count = len(keywords) - len(clean_list)
+        if dropped_count > 0:
+            print(f"[INFO] Validation removed {dropped_count} irrelevant keywords.")
+
+        return clean_list
+
+    except Exception as e:
+        print(f"[WARN] Validation failed: {e}. Returning originals.")
+        return keywords
+
 # === Async keyword + scrape + fallback logic ===
 async def get_keywords_and_content(url: str, top_n=5, llm_n=25):
     scraped_text = await scrape_div_content_from_url(url)
```
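The important change in the helper is the parsing strategy: rather than requesting a Python list and eval'ing it (the old Step 3 approach, which could raise SyntaxError on malformed replies), it requests bullets and strips them line by line. A standalone demo of that parsing on an invented, typically chatty reply:

```python
# Invented sample reply; demonstrates the same line-by-line cleanup
# used in filter_irrelevant_keywords.
sample_reply = """Here are the cleaned keywords:
- vintage charizard value
* graded card pricing
- psa 10 umbreon
"""

clean_list = []
for line in sample_reply.split("\n"):
    clean_item = line.lstrip("*- ").strip().lower()
    if clean_item and len(clean_item) > 2 and "here are" not in clean_item:
        clean_list.append(clean_item)

print(clean_list)
# ['vintage charizard value', 'graded card pricing', 'psa 10 umbreon']
```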
```diff
@@ -50,124 +101,101 @@ async def get_keywords_and_content(url: str, top_n=5, llm_n=25):
     # === Step 1: Extract condensed topic keywords ===
     try:
         condensed_prompt = (
-            "Extract exactly 5 to 7 Google search phrases from the content below that reflect real user search intent. "
-            "Each phrase should describe a specific product, use case, or collector topic — not generic brands or categories.\n\n"
-            "⚠️ Rules:\n"
-            "- Each phrase must be 2 to 5 words\n"
-            "- All phrases must be lowercase and ASCII-only\n"
-            "- Do NOT include apostrophes, single quotes, or quotation marks — rewrite or skip any phrases that contain them\n"
-            "- Do NOT include single words or overly broad terms like 'pokemon'\n"
-            "- Do NOT return line breaks, bullet points, or list formatting\n\n"
-            "✅ Output format:\n"
-            "Return a single comma-separated string of keyword phrases, with no brackets, no quotes, and no explanation.\n"
-            "Example output:\n"
-            "vintage charizard value, graded card pricing, rare booster packs, psa 10 umbreon, tcg price trends\n\n"
-            f"Content:\n{scraped_text}"
+            "You are an SEO expert. Identify exactly 5 distinct main topics from the text below.\n"
+            "Format: Return a BULLET LIST only.\n"
+            "Rules: NO intro text. NO numbering. NO explanations.\n"
+            f"TEXT TO ANALYZE:\n{scraped_text[:3000]}"
         )
 
-
         condensed_topic_raw = get_response(
             input=condensed_prompt,
             template=lambda x: x.strip(),
             llm="gemini",
             md=False,
-            temperature=0.6,
-            max_tokens=100
+            temperature=0.3,  # Lower temp = less chatty
+            max_tokens=200
         )
-        print(condensed_topic_raw)
+
+        # Cleaner parsing logic
+        condensed_topic = []
+        for line in condensed_topic_raw.split("\n"):
+            clean = line.replace("*", "").replace("-", "").strip().lower()
+            if clean and "here are" not in clean:
+                condensed_topic.append(clean)
 
-        # Parse comma-separated string
-        condensed_topic = [kw.strip() for kw in condensed_topic_raw.split(",") if kw.strip()]
+        if len(condensed_topic) < 2:
+            condensed_topic = [k.strip() for k in condensed_topic_raw.split(",") if k.strip()]
 
-        if not condensed_topic:
-            condensed_topic = ["trading cards"]
-
-        print(f"[INFO] Condensed topic keywords: {condensed_topic}")
     except Exception as e:
-        print(f"[WARN] Could not infer topics: {e}")
         condensed_topic = ["trading cards"]
 
-    # === Step 2: Pull suggestions from PyTrends ===
-    time.sleep(3)
+    # === Step 2: PyTrends Logic (Fixed) ===
+    print(f"[INFO] Starting PyTrends for topics: {condensed_topic[:3]}")
     all_suggestions = set()
+
+    # FIX: Initialize with retries=0 to bypass the 'method_whitelist' crash
+    # We will handle retries manually in the loop below.
     try:
-        pytrends = TrendReq(hl="en-US", tz=360, timeout=10)
-        for topic in condensed_topic:
-            time.sleep(5)
-            suggestions = pytrends.suggestions(keyword=topic)
-            if suggestions:
-                titles = [s["title"] for s in suggestions]
-                all_suggestions.update(titles)
-                print(f"[INFO] Suggestions for '{topic}': {titles[:3]}")
+        pytrends = TrendReq(hl="en-US", tz=360, timeout=10, retries=0)
     except Exception as e:
-        print(f"[WARN] PyTrends suggestions failed: {e}")
-
-    all_suggestions = list(all_suggestions)
-
-    # === Step 3: Let Gemini filter suggestions for relevance ===
-    filtered_keywords = []
-    if all_suggestions:
-        filter_prompt = (
-            f"The following article was scraped:\n\n{scraped_text[:1500]}\n\n"
-            f"Here is a list of keyword suggestions:\n{all_suggestions}\n\n"
-            "Return only the keywords that are clearly relevant to the article topic. "
-            "Return a valid Python list of strings only. No explanation, bullets, or formatting."
-        )
+        print(f"[ERROR] Could not initialize PyTrends: {e}")
+        pytrends = None
 
-        raw_filtered = get_response(
-            input=filter_prompt,
-            template=lambda x: x.strip(),
-            llm="gemini",
-            md=False,
-            temperature=0.3,
-            max_tokens=200
-        )
+    if pytrends:
+        for topic in condensed_topic[:3]:
+            print(f"[INFO] Querying PyTrends for: '{topic}'...")
+
+            # Manual Retry Logic (since we disabled the internal one)
+            for attempt in range(3):
+                try:
+                    # Sleep to prevent 429 Too Many Requests
+                    time.sleep(2)
+
+                    suggestions = pytrends.suggestions(keyword=topic)
+
+                    if suggestions:
+                        titles = [s["title"].lower().strip() for s in suggestions]
+                        print(f" -> Found {len(titles)} suggestions: {titles}")
+                        all_suggestions.update(titles)
+                        break  # Success, stop retrying this keyword
+                    else:
+                        print(" -> No suggestions found.")
+                        break  # No data, stop retrying
+
+                except Exception as inner_e:
+                    # If it's a 429 error, wait longer and try again
+                    if "429" in str(inner_e):
+                        print(f" -> [WARN] Rate limited on '{topic}'. Waiting 5s...")
+                        time.sleep(5)
+                    else:
+                        print(f" -> [WARN] Failed for '{topic}' (Attempt {attempt+1}/3): {inner_e}")
+                        if attempt == 2:  # Last attempt failed
+                            print(" -> Giving up on this keyword.")
+
+    # Convert set to list
+    combined_keywords = list(all_suggestions)
+
+    if not combined_keywords:
+        print("[INFO] PyTrends returned 0 results. Switching to LLM Fallback.")
+    else:
+        print(f"[INFO] PyTrends successful. Total keywords: {len(combined_keywords)}")
 
-        match = re.search(r"\[.*?\]", raw_filtered)
-        if match:
-            try:
-                filtered_keywords = ast.literal_eval(match.group(0))
-            except:
-                filtered_keywords = []
-
-    # === Step 4: Fallback to Gemini keyword generation if needed ===
-    if not filtered_keywords:
-        fallback_prompt = (
-            f"You are an SEO expert. Generate {llm_n} niche-relevant SEO keywords "
-            f"based on this content:\n\n{scraped_text}\n\n"
-            "Return a comma-separated list of lowercase 2–5 word search phrases. No formatting."
-        )
-        fallback_keywords_raw = get_response(
-            input=fallback_prompt,
-            template=lambda x: x.strip(),
-            llm="gemini",
-            md=False,
-            temperature=0.7,
-            max_tokens=400
-        )
-        filtered_keywords = [kw.strip() for kw in fallback_keywords_raw.split(",") if kw.strip()]
-        print(f"[INFO] Fallback keywords used: {filtered_keywords[:top_n]}")
+    # === Step 3: Fallback / Filtering ===
+    # If PyTrends gave results, we trust them. If not, we use LLM.
+    combined_keywords = list(all_suggestions)
 
-    # === Step 5: Enforce minimum of 30 keywords ===
-    time.sleep(3)
-    combined_keywords = list(dict.fromkeys(filtered_keywords))  # remove duplicates
+    # === Step 4: Padding (The Fix for "Okay here are...") ===
     if len(combined_keywords) < 30:
-        needed = 30 - len(combined_keywords)
-        print(f"[INFO] Need {needed} more keywords to reach 30. Using Gemini to pad.")
-
+        needed = 35 - len(combined_keywords)
+
         pad_prompt = (
-            f"The following article content is missing SEO keyword coverage:\n\n"
-            f"{scraped_text}\n\n"
-            f"Generate exactly {needed} additional SEO keyword phrases.\n"
-            "Each keyword must:\n"
-            "- be 2 to 5 words long\n"
-            "- be lowercase only\n"
-            "- use ASCII characters only (no symbols or accents)\n"
-            "- be clearly relevant to the article\n"
-            "- avoid generic terms like 'pokemon'\n\n"
-            "Return only the keywords as a single comma-separated string, with no extra formatting or explanation.\n"
-            "Example output:\n"
-            "keyword one, keyword two, keyword three"
+            f"Generate exactly {needed} NEW, DISTINCT long-tail SEO keywords based on this text.\n"
+            f"STRICT OUTPUT RULES:\n"
+            f"1. Return ONLY a raw bullet list (one keyword per line).\n"
+            f"2. DO NOT write 'Here are the keywords'.\n"
+            f"3. DO NOT add parentheses or explanations like '(best for beginners)'.\n"
+            f"4. Just the keywords.\n\n"
+            f"Context:\n{scraped_text[:2500]}"
         )
 
         pad_raw = get_response(
```
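The retries=0 workaround targets a real pytrends incompatibility: when retries > 0, pytrends builds a urllib3 Retry with the method_whitelist keyword, which urllib3 renamed to allowed_methods in 1.26 and removed in 2.0. A minimal reproduction of the crash being bypassed, assuming urllib3 2.x is installed:

```python
# Reproduces the TypeError that retries=0 sidesteps (urllib3 >= 2.0).
from urllib3.util.retry import Retry

Retry(total=3, allowed_methods=frozenset(["GET", "POST"]))  # current keyword: fine
try:
    Retry(total=3, method_whitelist=frozenset(["GET", "POST"]))  # what pytrends passes
except TypeError as exc:
    print(f"urllib3 2.x rejects method_whitelist: {exc}")
```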
```diff
@@ -175,23 +203,41 @@ async def get_keywords_and_content(url: str, top_n=5, llm_n=25):
             template=lambda x: x.strip(),
             llm="gemini",
             md=False,
-            temperature=0.7,
-            max_tokens=200
+            temperature=0.5,  # Lower temp prevents hallucinated explanations
+            max_tokens=1000
         )
 
         pad_keywords = []
-        print(pad_raw)
-
-        try:
-            pad_keywords = [kw.strip() for kw in pad_raw.split(",") if kw.strip()]
-        except Exception as e:
-            print(f"[WARN] Keyword parsing failed: {e}")
-            pad_keywords = []
-
-        combined_keywords = list(dict.fromkeys(combined_keywords + pad_keywords))
-        print(f"[INFO] Padded {len(pad_keywords)} keywords:", pad_keywords)
+        for line in pad_raw.split("\n"):
+            # remove bullets
+            clean_line = line.strip().lstrip("*-+1234567890. ").strip()
+
+            # remove parenthetical explanations using regex
+            # e.g., "op13 cards (rare)" -> "op13 cards"
+            clean_line = re.sub(r"\(.*?\)", "", clean_line).strip()
+
+            # Filter out chatty lines
+            if (len(clean_line) > 3
+                    and "here are" not in clean_line.lower()
+                    and "formatted as" not in clean_line.lower()
+                    and ":" not in clean_line):
+
+                pad_keywords.append(clean_line.lower())
+
+        combined_keywords = list(set(combined_keywords + pad_keywords))
+
+    # Double check relevance before returning
+    if len(combined_keywords) > 10:
+        validated_keywords = filter_irrelevant_keywords(combined_keywords, scraped_text)
+    else:
+        validated_keywords = combined_keywords
+
+    # Fallback if validation was too aggressive
+    if len(validated_keywords) < 10:
+        validated_keywords = combined_keywords
 
-    return combined_keywords[:30], scraped_text
+    print(f"[INFO] Final validated count: {len(validated_keywords)}")
+    return validated_keywords[:30], scraped_text
 
 
 
```
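The new pad parsing is what fixes the "Okay here are..." failure mode: bullets and numbering are stripped, parenthetical asides are cut with a regex, and conversational lines are filtered out. A standalone demo on an invented reply:

```python
import re

# Invented sample reply; mirrors the pad-keyword cleanup above.
raw = """Okay, here are the keywords:
1. op13 cards (rare)
- psa 10 umbreon prices
* Note: formatted as requested
"""

pad_keywords = []
for line in raw.split("\n"):
    clean_line = line.strip().lstrip("*-+1234567890. ").strip()
    clean_line = re.sub(r"\(.*?\)", "", clean_line).strip()
    if (len(clean_line) > 3
            and "here are" not in clean_line.lower()
            and "formatted as" not in clean_line.lower()
            and ":" not in clean_line):
        pad_keywords.append(clean_line.lower())

print(pad_keywords)  # ['op13 cards', 'psa 10 umbreon prices']
```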
 
```diff
@@ -236,19 +282,31 @@ def server(input, output, session):
     keyword_str = ", ".join(keywords)
 
     # Title generation from scraped text
+    # Title generation with stricter prompt
    infer_topic_prompt = (
-        f"Based on the following article content:\n\n{scraped[:2000]}\n\n"
-        f"Return a short, descriptive blog post title (max 70 characters)."
-        f"Return ONLY the TITLE"
+        f"Write ONE catchy, click-worthy H1 Blog Title for the content below.\n"
+        f"STRICT RULES:\n"
+        f"- Return ONLY the title string.\n"
+        f"- Do NOT write 'Title:' or 'Here is a title'.\n"
+        f"- Do NOT use quotation marks.\n"
+        f"- Max 15 words.\n\n"
+        f"Content:\n{scraped[:2000]}"
     )
-    seo_title = get_response(
+
+    seo_title_raw = get_response(
         input=infer_topic_prompt,
-        template=lambda x: x.strip().replace('"', ''),
+        template=lambda x: x.strip(),
         llm="gemini",
         md=False,
-        temperature=0.5,
-        max_tokens=20
+        temperature=0.7,
+        max_tokens=60
     )
+
+    # Cleanup: Remove quotes and "Title:" prefix if the LLM ignores rules
+    seo_title = seo_title_raw.replace('"', '').replace("Title:", "").strip()
+    # If it gave multiple options (detected by newlines), take the first one
+    if "\n" in seo_title:
+        seo_title = seo_title.split("\n")[0].strip()
 
     # Blog generation with injected SEO
     prompt = (
```
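Even with the stricter prompt, the post-processing is the safety net: quotes and a "Title:" prefix are stripped, and if the model returns several options the first line wins. A quick demo on an invented raw reply:

```python
# Invented raw reply; same cleanup steps as the diff above.
seo_title_raw = 'Title: "Top Vintage Charizard Cards Worth Grading"\nOr: Grading Guide'

seo_title = seo_title_raw.replace('"', '').replace("Title:", "").strip()
if "\n" in seo_title:
    seo_title = seo_title.split("\n")[0].strip()

print(seo_title)  # Top Vintage Charizard Cards Worth Grading
```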
 