Alpha108 committed on
Commit
fb2e00d
·
verified ·
1 Parent(s): 88209fc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -136
app.py CHANGED
@@ -5,19 +5,18 @@ import math
5
  import streamlit as st
6
  import pandas as pd
7
 
8
- # =========================
9
- # 0) CONFIG / CONSTANTS
10
- # =========================
11
- GROQ_DEFAULT_MODEL = "llama-3.3-70b-versatile" # Sidebar lets you change
12
- MAX_KEYWORDS = 12
13
- SEED_STOPWORDS = set("""
14
  a an and the or for nor but so yet of to in on with at by from as is are was were be being been
15
  i you he she it we they them us our your their this that these those here there
16
  """.split())
17
 
18
- # =========================
19
- # 1) GROQ CLIENT
20
- # =========================
21
  try:
22
  from groq import Groq
23
  except ImportError:
@@ -26,29 +25,31 @@ except ImportError:
26
  def get_groq_client():
27
  api_key = os.getenv("GROQ_API_KEY")
28
  if not api_key:
29
- raise RuntimeError("Missing GROQ_API_KEY. Set it in Space → Settings → Variables & Secrets.")
30
  if Groq is None:
31
  raise RuntimeError("Package 'groq' not installed. Add 'groq' to requirements.txt.")
32
  return Groq(api_key=api_key)
33
 
34
- def groq_generate(prompt, model, temperature, top_p, max_tokens):
35
  client = get_groq_client()
36
  resp = client.chat.completions.create(
37
  model=model,
38
  messages=[
39
- {"role": "system", "content": "You craft concise, insightful LinkedIn posts that feel original and practical."},
40
  {"role": "user", "content": prompt}
41
  ],
42
  temperature=temperature,
43
  top_p=top_p,
44
  max_tokens=max_tokens,
45
- n=1
46
  )
47
  return resp.choices[0].message.content.strip()
48
 
49
- # =========================
50
- # 2) TEXT UTILS
51
- # =========================
 
 
 
52
  def dedupe_sentences(text: str) -> str:
53
  parts = re.split(r'(?<=[.!?])\s+', text.strip())
54
  seen = set()
@@ -74,12 +75,9 @@ def strip_labels(text: str) -> str:
74
  cleaned.append(L)
75
  return "\n".join(cleaned).strip()
76
 
77
- def clamp(n, lo, hi):
78
- return max(lo, min(hi, n))
79
-
80
- # =========================
81
- # 3) DATA INGEST & KEYWORDS
82
- # =========================
83
  def load_posts_from_file(file) -> pd.DataFrame:
84
  name = file.name.lower()
85
  if name.endswith(".csv"):
@@ -87,13 +85,12 @@ def load_posts_from_file(file) -> pd.DataFrame:
87
  elif name.endswith(".json"):
88
  df = pd.read_json(file, lines=False)
89
  else:
90
- raise ValueError("Upload a CSV or JSON file containing LinkedIn posts.")
91
- # Normalize to 'text' column
92
- candidate = [c for c in df.columns if c.lower() in ("text", "post", "content", "body")]
93
- if not candidate:
94
- raise ValueError("Dataset must have a 'text' (or post/content/body) column.")
95
  if "text" not in df.columns:
96
- df["text"] = df[candidate[0]]
97
  df["text"] = df["text"].fillna("").astype(str)
98
  return df[["text"]]
99
 
@@ -101,7 +98,7 @@ def simple_rake(text, min_len=2, max_len=3, top_k=12):
101
  words = re.findall(r"[A-Za-z0-9#+\-_/']+", text.lower())
102
  phrases, cur = [], []
103
  for w in words:
104
- if w in SEED_STOPWORDS:
105
  if cur:
106
  phrases.append(" ".join(cur))
107
  cur = []
@@ -109,178 +106,147 @@ def simple_rake(text, min_len=2, max_len=3, top_k=12):
109
  cur.append(w)
110
  if cur:
111
  phrases.append(" ".join(cur))
112
- # Score by frequency+degree
113
  freq, degree = {}, {}
114
  for ph in phrases:
115
  toks = ph.split()
116
  for t in toks:
117
  freq[t] = freq.get(t, 0) + 1
118
- degree[t] = degree.get(t, 0) + (len(toks) - 1)
119
  scores = {}
120
  for ph in phrases:
121
  s = 0.0
122
  for t in ph.split():
123
- s += (degree.get(t, 0) + 1) / (freq.get(t, 1))
124
- scores[ph] = scores.get(ph, 0) + s
125
  ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
126
- filtered = [p for p, _ in ranked if min_len <= len(p.split()) <= max_len]
127
  return filtered[:top_k]
128
 
129
- def tfidf_keywords_builder(texts, top_k=10):
130
  docs = [re.findall(r"[A-Za-z0-9#+\-_/']+", t.lower()) for t in texts]
131
  vocab = {}
132
  for d in docs:
133
  for w in set(d):
134
- vocab.setdefault(w, {"df": 0})
135
- vocab[w]["df"] += 1
136
  N = len(docs)
137
- def score_doc(text):
138
  doc = re.findall(r"[A-Za-z0-9#+\-_/']+", text.lower())
139
  tf = {}
140
  for w in doc:
141
- tf[w] = tf.get(w, 0) + 1
142
  scores = {}
143
- for w, cnt in tf.items():
144
- df = vocab.get(w, {}).get("df", 1)
145
- idf = math.log((N + 1) / (df + 1)) + 1
146
- scores[w] = (cnt / len(doc)) * idf
147
  ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
148
- return [w for w, _ in ranked[:top_k]]
149
- return score_doc
150
-
151
- def extract_keywords(topic: str, posts_df: pd.DataFrame | None):
152
- if posts_df is not None and len(posts_df):
153
- sample = posts_df["text"].sample(min(30, len(posts_df)), random_state=42).tolist()
154
- rake_kw = simple_rake(" ".join(sample + [topic]), min_len=2, max_len=3, top_k=MAX_KEYWORDS)
155
- tfidf_fn = tfidf_keywords_builder(posts_df["text"].tolist(), top_k=MAX_KEYWORDS//2)
156
  kw2 = tfidf_fn(topic + " " + " ".join(sample[:5]))
157
- all_kw = rake_kw + kw2
158
  else:
159
- all_kw = simple_rake(topic, min_len=1, max_len=2, top_k=8)
160
  seen, out = set(), []
161
- for k in all_kw:
162
- k2 = re.sub(r"\s+", " ", k.strip().lower())
163
  if k2 and k2 not in seen:
164
- seen.add(k2)
165
- out.append(k2)
166
- return out[:MAX_KEYWORDS]
167
-
168
- # =========================
169
- # 4) PROMPT (PLAIN OUTPUT)
170
- # =========================
171
- def build_viral_prompt(topic, audience, tone, target_len, style_refs, keywords):
172
- style_block = "\n".join(f"- {s}" for s in style_refs[:4]) if style_refs else "- None"
173
- kw_block = ", ".join(keywords[:8]) if keywords else "N/A"
174
  return (
175
  "You are a senior LinkedIn content strategist.\n"
176
- "Objective: Write a viral, insightful LinkedIn post as plain text only (no section headers, no labels), "
177
- f"around {target_len} words, for the audience and topic below.\n\n"
178
  f"Topic: \"{topic}\"\n"
179
- f"Audience: \"{audience}\"\n"
180
  f"Tone: \"{tone}\"\n"
181
- f"Keywords to naturally weave in: {kw_block}\n\n"
182
- "Style cues (reflect these, do not list them):\n"
183
- f"{style_block}\n\n"
184
- "Apply silently (do not mention these rules):\n"
185
- "- Open with a curiosity-driving first line.\n"
186
- "- Use short sentences and short paragraphs.\n"
187
- "- Include 3–5 concrete insights, examples, or steps (bullets allowed, but no section labels).\n"
188
- "- Be specific, novel, and practical; avoid clichés and filler.\n"
189
- "- Use up to 2 emojis; add 2–4 niche hashtags only at the very end (optional).\n"
190
- "- Never output headings like HOOK/BODY/TAKEAWAY/CTA.\n"
191
- "- Do not repeat the phrase: “it's a great example of how we can make a difference in the world.”\n\n"
192
- "Output: A single cohesive LinkedIn post as plain text only. No headings. No metadata. No explanations."
193
  )
194
 
195
- # =========================
196
- # 5) STREAMLIT UI
197
- # =========================
198
- st.set_page_config(page_title="LinkedIn Post Generator — Groq", layout="centered")
199
- st.title("🔗 LinkedIn Post Generator — Dataset Keywords + Groq")
200
- st.caption("Upload sample posts, extract keywords, and generate plain-text viral posts via Groq.")
201
 
202
  with st.sidebar:
203
  st.subheader("Groq & Decoding")
204
  model = st.selectbox(
205
  "Groq model",
206
- options=[
207
- "llama-3.3-70b-versatile",
208
- "llama-3.1-8b-instant",
209
- "mixtral-8x7b-32768"
210
- ],
211
  index=0
212
  )
213
  temperature = st.slider("Temperature", 0.1, 1.2, 0.6, 0.05)
214
- top_p = st.slider("Top-p", 0.1, 1.0, 0.9, 0.05)
215
  target_len = st.slider("Target length (words)", 60, 300, 140, 10)
216
  st.markdown("Set GROQ_API_KEY in Space → Settings → Variables & Secrets.")
217
 
218
- with st.form("gen_form"):
219
  topic = st.text_input("Topic", "Generative AI for Business")
220
- tone = st.selectbox("Tone", ["Professional", "Friendly", "Inspirational", "Technical", "Concise"], index=0)
221
- audience = st.text_input("Audience", "Startup founders")
222
 
223
- st.markdown("### Upload dataset (CSV/JSON) of LinkedIn posts")
224
- uploaded = st.file_uploader("Dataset must include a 'text' (or 'post'/'content'/'body') column.", type=["csv", "json"])
225
 
226
  st.markdown("Optional: add up to 4 style cues (one per line).")
227
- style_textarea = st.text_area("Style cues", value="", placeholder="Short hooks\nActionable bullets\nStories with numbers\nTactical CTA")
228
 
229
- submitted = st.form_submit_button("Generate Post")
230
 
231
  if submitted:
232
  if not os.getenv("GROQ_API_KEY"):
233
  st.error("GROQ_API_KEY missing. Add it in Space → Settings → Variables & Secrets.")
234
  st.stop()
235
- if not topic.strip():
236
- st.warning("Please enter a topic.")
237
- st.stop()
238
 
239
- # Load dataset if provided
240
  posts_df = None
241
- if uploaded:
242
  try:
243
  posts_df = load_posts_from_file(uploaded)
244
  except Exception as e:
245
  st.error(f"Dataset error: {e}")
246
  st.stop()
247
 
248
- # Extract keywords
249
  keywords = extract_keywords(topic, posts_df)
 
250
 
251
- # Style cues
252
- style_refs = []
253
- if style_textarea.strip():
254
- style_refs = [s.strip() for s in style_textarea.splitlines() if s.strip()]
255
- style_refs = style_refs[:4]
256
-
257
- # Build prompt and generate
258
- prompt = build_viral_prompt(
259
- topic=topic,
260
- audience=audience,
261
- tone=tone,
262
- target_len=target_len,
263
- style_refs=style_refs,
264
- keywords=keywords
265
- )
266
 
 
 
267
  with st.spinner("Generating with Groq..."):
268
  try:
269
- max_tokens = clamp(int(target_len * 1.6) + 120, 200, 1200)
270
- txt = groq_generate(
271
- prompt=prompt,
272
- model=model,
273
- temperature=temperature,
274
- top_p=top_p,
275
- max_tokens=max_tokens
276
- )
277
- # Clean and display
278
- txt = dedupe_sentences(strip_labels(txt))
279
- st.success("Generated Post")
280
- st.write(txt)
281
- st.download_button("Download (.txt)", txt, file_name="linkedin_post.txt")
282
- with st.expander("Debug: keywords & prompt"):
283
- st.write({"keywords": keywords, "style_refs": style_refs})
284
- st.code(prompt)
285
  except Exception as e:
286
  st.error(f"Groq generation failed: {e}")
 
 
 
 
 
 
 
5
  import streamlit as st
6
  import pandas as pd
7
 
8
+ # ─────────────────────────────────────────
9
+ # Config
10
+ # ─────────────────────────────────────────
11
+ DEFAULT_MODEL = "llama-3.3-70b-versatile" # Groq
12
+ STOPWORDS = set("""
 
13
  a an and the or for nor but so yet of to in on with at by from as is are was were be being been
14
  i you he she it we they them us our your their this that these those here there
15
  """.split())
16
 
17
+ # ─────────────────────────────────────────
18
+ # Groq client
19
+ # ─────────────────────────────────────────
20
  try:
21
  from groq import Groq
22
  except ImportError:
 
25
  def get_groq_client():
26
  api_key = os.getenv("GROQ_API_KEY")
27
  if not api_key:
28
+ raise RuntimeError("Missing GROQ_API_KEY. Set in Space → Settings → Variables & Secrets.")
29
  if Groq is None:
30
  raise RuntimeError("Package 'groq' not installed. Add 'groq' to requirements.txt.")
31
  return Groq(api_key=api_key)
32
 
33
+ def groq_chat(prompt, model, temperature, top_p, max_tokens):
34
  client = get_groq_client()
35
  resp = client.chat.completions.create(
36
  model=model,
37
  messages=[
38
+ {"role": "system", "content": "You craft concise, original, high-signal LinkedIn posts."},
39
  {"role": "user", "content": prompt}
40
  ],
41
  temperature=temperature,
42
  top_p=top_p,
43
  max_tokens=max_tokens,
 
44
  )
45
  return resp.choices[0].message.content.strip()
46
 
47
+ # ─────────────────────────────────────────
48
+ # Utilities
49
+ # ─────────────────────────────────────────
50
+ def clamp(n, lo, hi):
51
+ return max(lo, min(hi, n))
52
+
53
  def dedupe_sentences(text: str) -> str:
54
  parts = re.split(r'(?<=[.!?])\s+', text.strip())
55
  seen = set()
 
75
  cleaned.append(L)
76
  return "\n".join(cleaned).strip()
77
 
78
+ # ─────────────────────────────────────────
79
+ # Dataset ingest + keywords (optional, improves relevance)
80
+ # ─────────────────────────────────────────
 
 
 
81
  def load_posts_from_file(file) -> pd.DataFrame:
82
  name = file.name.lower()
83
  if name.endswith(".csv"):
 
85
  elif name.endswith(".json"):
86
  df = pd.read_json(file, lines=False)
87
  else:
88
+ raise ValueError("Upload CSV or JSON.")
89
+ cand = [c for c in df.columns if c.lower() in ("text","post","content","body")]
90
+ if not cand:
91
+ raise ValueError("Dataset must contain a 'text' (or post/content/body) column.")
 
92
  if "text" not in df.columns:
93
+ df["text"] = df[cand[0]]
94
  df["text"] = df["text"].fillna("").astype(str)
95
  return df[["text"]]
96
 
 
98
  words = re.findall(r"[A-Za-z0-9#+\-_/']+", text.lower())
99
  phrases, cur = [], []
100
  for w in words:
101
+ if w in STOPWORDS:
102
  if cur:
103
  phrases.append(" ".join(cur))
104
  cur = []
 
106
  cur.append(w)
107
  if cur:
108
  phrases.append(" ".join(cur))
 
109
  freq, degree = {}, {}
110
  for ph in phrases:
111
  toks = ph.split()
112
  for t in toks:
113
  freq[t] = freq.get(t, 0) + 1
114
+ degree[t] = degree.get(t, 0) + (len(toks)-1)
115
  scores = {}
116
  for ph in phrases:
117
  s = 0.0
118
  for t in ph.split():
119
+ s += (degree.get(t,0)+1)/ (freq.get(t,1))
120
+ scores[ph] = scores.get(ph,0)+s
121
  ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
122
+ filtered = [p for p,_ in ranked if min_len <= len(p.split()) <= max_len]
123
  return filtered[:top_k]
124
 
125
+ def tfidf_builder(texts, top_k=8):
126
  docs = [re.findall(r"[A-Za-z0-9#+\-_/']+", t.lower()) for t in texts]
127
  vocab = {}
128
  for d in docs:
129
  for w in set(d):
130
+ vocab[w] = vocab.get(w,0)+1
 
131
  N = len(docs)
132
+ def score(text):
133
  doc = re.findall(r"[A-Za-z0-9#+\-_/']+", text.lower())
134
  tf = {}
135
  for w in doc:
136
+ tf[w] = tf.get(w,0)+1
137
  scores = {}
138
+ for w,c in tf.items():
139
+ df = vocab.get(w,1)
140
+ idf = math.log((N+1)/(df+1))+1
141
+ scores[w] = (c/len(doc))*idf
142
  ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
143
+ return [w for w,_ in ranked[:top_k]]
144
+ return score
145
+
146
+ def extract_keywords(topic, df: pd.DataFrame|None):
147
+ if df is not None and len(df):
148
+ sample = df["text"].sample(min(30, len(df)), random_state=42).tolist()
149
+ rake_kw = simple_rake(" ".join(sample + [topic]), min_len=2, max_len=3, top_k=12)
150
+ tfidf_fn = tfidf_builder(df["text"].tolist(), top_k=8)
151
  kw2 = tfidf_fn(topic + " " + " ".join(sample[:5]))
152
+ raw = rake_kw + kw2
153
  else:
154
+ raw = simple_rake(topic, min_len=1, max_len=2, top_k=8)
155
  seen, out = set(), []
156
+ for k in raw:
157
+ k2 = re.sub(r"\s+"," ",k.strip().lower())
158
  if k2 and k2 not in seen:
159
+ seen.add(k2); out.append(k2)
160
+ return out[:12]
161
+
162
+ # ─────────────────────────────────────────
163
+ # Stage‑2 Prompt (hidden structure, plain output)
164
+ # ─────────────────────────────────────────
165
+ def build_stage2_prompt(topic, language, target_len, tone, keywords=None, style_cues=None):
166
+ kw_block = ", ".join((keywords or [])[:8]) if keywords else "N/A"
167
+ cues_block = "\n".join(f"- {c}" for c in (style_cues or [])[:4]) if style_cues else "- None"
 
168
  return (
169
  "You are a senior LinkedIn content strategist.\n"
170
+ "Objective: Write a viral, insightful LinkedIn post as plain text only (no section headers, no labels).\n\n"
171
+ f"Language: {language}\n"
172
  f"Topic: \"{topic}\"\n"
 
173
  f"Tone: \"{tone}\"\n"
174
+ f"Approx length: ~{target_len} words\n"
175
+ f"Keywords to weave in naturally: {kw_block}\n"
176
+ "Style cues (apply silently):\n"
177
+ f"{cues_block}\n\n"
178
+ "Apply without mentioning rules:\n"
179
+ "- Curiosity‑driven first line.\n"
180
+ "- Short paragraphs; concrete, novel insights (3–5), examples welcome.\n"
181
+ "- Max 2 emojis; 2–4 niche hashtags only at very end (optional).\n"
182
+ "- No repeated sentences; avoid clichés.\n"
183
+ "- Output must be one cohesive post in plain text. No labels or headings."
 
 
184
  )
185
 
186
+ # ─────────────────────────────────────────
187
+ # UI
188
+ # ─────────────────────────────────────────
189
+ st.set_page_config(page_title="LinkedIn Post Generator — Stage 2 (Groq)", layout="centered")
190
+ st.title("Stage 2: Topic → Prompt → Llama‑3.x (Groq) → 3 Variants")
 
191
 
192
  with st.sidebar:
193
  st.subheader("Groq & Decoding")
194
  model = st.selectbox(
195
  "Groq model",
196
+ options=["llama-3.3-70b-versatile","llama-3.1-8b-instant","mixtral-8x7b-32768"],
 
 
 
 
197
  index=0
198
  )
199
  temperature = st.slider("Temperature", 0.1, 1.2, 0.6, 0.05)
200
+ top_p = st.slider("Top‑p", 0.1, 1.0, 0.9, 0.05)
201
  target_len = st.slider("Target length (words)", 60, 300, 140, 10)
202
  st.markdown("Set GROQ_API_KEY in Space → Settings → Variables & Secrets.")
203
 
204
+ with st.form("stage2_form"):
205
  topic = st.text_input("Topic", "Generative AI for Business")
206
+ language = st.selectbox("Language", ["English","Urdu","Arabic","French","Spanish"], index=0)
207
+ tone = st.selectbox("Tone", ["Professional","Friendly","Inspirational","Technical","Concise"], index=0)
208
 
209
+ st.markdown("Optional: upload a dataset of past LinkedIn posts (CSV/JSON) with a 'text' column.")
210
+ uploaded = st.file_uploader("Upload CSV/JSON", type=["csv","json"])
211
 
212
  st.markdown("Optional: add up to 4 style cues (one per line).")
213
+ style_text = st.text_area("Style cues", value="", placeholder="Short hooks\nActionable bullets\nStories with numbers\nTactical CTA")
214
 
215
+ submitted = st.form_submit_button("Generate 3 Variants")
216
 
217
  if submitted:
218
  if not os.getenv("GROQ_API_KEY"):
219
  st.error("GROQ_API_KEY missing. Add it in Space → Settings → Variables & Secrets.")
220
  st.stop()
 
 
 
221
 
 
222
  posts_df = None
223
+ if uploaded is not None:
224
  try:
225
  posts_df = load_posts_from_file(uploaded)
226
  except Exception as e:
227
  st.error(f"Dataset error: {e}")
228
  st.stop()
229
 
 
230
  keywords = extract_keywords(topic, posts_df)
231
+ style_cues = [s.strip() for s in style_text.splitlines() if s.strip()][:4]
232
 
233
+ prompt = build_stage2_prompt(topic, language, target_len, tone, keywords, style_cues)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
 
235
+ st.subheader("Variants")
236
+ variants = []
237
  with st.spinner("Generating with Groq..."):
238
  try:
239
+ max_tokens = clamp(int(target_len*1.6)+120, 200, 1200)
240
+ # Generate 3 separate candidates
241
+ for i in range(3):
242
+ raw = groq_chat(prompt, model, temperature, top_p, max_tokens)
243
+ clean = dedupe_sentences(strip_labels(raw))
244
+ variants.append(clean)
 
 
 
 
 
 
 
 
 
 
245
  except Exception as e:
246
  st.error(f"Groq generation failed: {e}")
247
+ st.stop()
248
+
249
+ for i, v in enumerate(variants, start=1):
250
+ st.markdown(f"### Post {i}")
251
+ st.write(v)
252
+ st.download_button(f"Download Post {i}", v, file_name=f"post_{i}.txt")