Alpha108 committed on
Commit
e79628e
Β·
verified Β·
1 Parent(s): af89629

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +267 -102
app.py CHANGED
@@ -1,19 +1,34 @@
1
- import streamlit as st
2
- import json
3
  import os
4
  import re
5
- from transformers import pipeline
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
STYLE_SAMPLES_FILE = "style_samples.json"

def load_style_samples():
    """Return the saved style samples as a list; [] when no file exists yet."""
    if not os.path.exists(STYLE_SAMPLES_FILE):
        return []
    with open(STYLE_SAMPLES_FILE, "r") as fh:
        return json.load(fh)
14
 
 
 
 
15
  def dedupe_sentences(text: str) -> str:
16
- # Remove verbatim repeated sentences, keep order
17
  parts = re.split(r'(?<=[.!?])\s+', text.strip())
18
  seen = set()
19
  out = []
@@ -22,120 +37,270 @@ def dedupe_sentences(text: str) -> str:
22
  if norm and norm not in seen:
23
  seen.add(norm)
24
  out.append(p.strip())
25
- return " ".join(out)
26
-
27
@st.cache_resource(show_spinner=False)
def load_pipeline():
    """Build and cache the text2text generation pipeline.

    flan-t5-base is CPU-friendly for free Spaces; swap in a stronger
    instruct model later if one is available. device_map is deliberately
    omitted so the Accelerate package is not required on CPU Spaces.
    """
    return pipeline(
        task="text2text-generation",
        model="google/flan-t5-base",
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
def build_prompt(topic, audience, tone, length, style_example_text):
    """Compose the structured flan-t5 prompt; a fixed schema curbs looping."""
    task_header = "Task: Write a LinkedIn post.\n\n"
    brief = (
        f'Topic: "{topic}"\n'
        f'Audience: "{audience}"\n'
        f'Tone: "{tone}"\n'
        f"Target length: ~{length} words.\n\n"
    )
    style_rules = (
        "Style requirements:\n"
        "- Start with a 1–2 line HOOK with a concrete claim or question.\n"
        "- Use 2–3 short BODY paragraphs; sentences under 20 words.\n"
        "- Add 3–5 specific insights or steps; bullets allowed.\n"
        "- End with a clear CTA inviting comments.\n\n"
    )
    constraints = (
        "Constraints:\n"
        "- Do NOT repeat sentences or phrases.\n"
        "- Avoid clichés like “it's a great example of how we can make a difference in the world.”\n"
        "- Use plain business English.\n\n"
    )
    reference = f"Reference style (optional):\n{style_example_text}\n\n"
    schema = (
        "Output format (use these headers exactly):\n"
        "HOOK:\n"
        "BODY:\n"
        "TAKEAWAY:\n"
        "CTA:\n"
    )
    return task_header + brief + style_rules + constraints + reference + schema
62
 
63
- # Load resources
64
- pipe = load_pipeline()
65
- style_samples = load_style_samples()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
- # UI
68
- st.set_page_config(page_title="LinkedIn Post Generator", layout="centered")
69
- st.title("πŸ”— LinkedIn Post Generator (Hugging Face)")
70
- st.write("Generate concise, structured LinkedIn posts with few-shot style guidance.")
 
 
 
 
 
 
 
 
 
 
 
 
71
 
 
72
  with st.form("gen_form"):
73
- topic = st.text_input("Post Topic", "Generative AI for Business")
74
- tone = st.selectbox("Tone", ["Professional", "Friendly", "Inspirational", "Technical", "Concise"])
75
  audience = st.text_input("Audience", "Startup founders")
76
- length = st.slider("Length (approx words)", 40, 300, 120, 10)
77
 
78
- use_sample = st.selectbox(
79
- "Style Sample (optional)",
80
- ["None"] + [f"Sample {i+1}" for i in range(len(style_samples))]
81
- )
82
- custom_style = st.text_area("Or paste your own style sample (optional)")
83
 
84
- with st.expander("Advanced generation settings"):
85
- temperature = st.slider("Temperature", 0.1, 1.2, 0.7, 0.05)
86
- top_p = st.slider("Top-p (nucleus)", 0.1, 1.0, 0.9, 0.05)
87
- repetition_penalty = st.slider("Repetition penalty", 1.0, 2.0, 1.2, 0.05)
88
- no_repeat_ngram_size = st.slider("No-repeat n-gram size", 1, 6, 3, 1)
89
 
90
  submitted = st.form_submit_button("Generate Post")
91
 
92
- style_example_text = ""
93
- if use_sample != "None":
94
- idx = int(use_sample.split()[1]) - 1
95
- style_example_text += f"Sample style:\n{style_samples[idx]}\n"
96
- if custom_style.strip():
97
- style_example_text += f"Custom style:\n{custom_style}\n"
98
-
99
  if submitted:
 
 
 
 
100
  if not topic.strip():
101
- st.warning("Please enter a topic.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  else:
103
- prompt = build_prompt(topic, audience, tone, length, style_example_text)
104
- with st.spinner("Generating..."):
105
- try:
106
- outputs = pipe(
107
- prompt,
108
- max_new_tokens=length + 120,
109
- temperature=temperature,
110
- top_p=top_p,
111
- repetition_penalty=repetition_penalty,
112
- no_repeat_ngram_size=no_repeat_ngram_size
113
- )
114
- # Handle list/dict return variants
115
- if isinstance(outputs, list) and outputs and "generated_text" in outputs[0]:
116
- raw = outputs[0]["generated_text"].strip()
117
- elif isinstance(outputs, dict) and "generated_text" in outputs:
118
- raw = outputs["generated_text"].strip()
119
- else:
120
- raw = str(outputs)
121
-
122
- result = dedupe_sentences(raw)
123
- st.success("Here's your LinkedIn post:")
124
- st.write(result)
125
- st.download_button("Download post as .txt", result, file_name="linkedin_post.txt")
126
- except Exception as e:
127
- st.error(f"Error generating post: {e}")
128
-
129
- st.markdown("---")
130
- st.write("Upload a JSON array of style sample strings (overwrites existing).")
131
- file = st.file_uploader("Upload style_samples.json", type=["json"])
132
- if file:
133
- try:
134
- data = json.load(file)
135
- if not isinstance(data, list) or not all(isinstance(x, str) for x in data):
136
- raise ValueError("JSON must be a list of strings.")
137
- with open(STYLE_SAMPLES_FILE, "w") as f:
138
- json.dump(data, f)
139
- st.success(f"Saved {len(data)} samples. Reload the app to use them.")
140
- except Exception as e:
141
- st.error(f"Upload failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import re
3
+ import json
4
+ import time
5
+ import math
6
+ import streamlit as st
7
+ import pandas as pd
8
+
9
# ─────────────────────────────────────────────────────────────
# 1) GROQ CLIENT (Chat Completions)
# ─────────────────────────────────────────────────────────────
try:
    from groq import Groq
except ImportError:
    # SDK is optional at import time; get_groq_client reports it clearly.
    Groq = None

def get_groq_client():
    """Return an authenticated Groq client.

    Raises RuntimeError when either the API key or the groq SDK is
    missing, so the UI can show a clear message instead of a traceback.
    """
    key = os.getenv("GROQ_API_KEY")
    if not key:
        raise RuntimeError("Missing GROQ_API_KEY. Add it in Space → Settings → Variables & Secrets.")
    if Groq is None:
        raise RuntimeError("groq package not installed. Ensure 'groq' is listed in requirements.txt.")
    return Groq(api_key=key)

# Default Groq model. You can expose this via UI if you want.
GROQ_MODEL = "llama-3.3-70b-versatile"
 
 
 
27
 
28
+ # ─────────────────────────────────────────────────────────────
29
+ # 2) TEXT UTILITIES (dedupe, clamp)
30
+ # ─────────────────────────────────────────────────────────────
31
  def dedupe_sentences(text: str) -> str:
 
32
  parts = re.split(r'(?<=[.!?])\s+', text.strip())
33
  seen = set()
34
  out = []
 
37
  if norm and norm not in seen:
38
  seen.add(norm)
39
  out.append(p.strip())
40
+ return " ".join(out).strip()
41
+
42
def clamp(n, lo, hi):
    """Clamp n into [lo, hi]; the lower bound wins if the bounds cross."""
    capped = hi if n > hi else n
    return lo if capped < lo else capped
44
+
45
# ─────────────────────────────────────────────────────────────
# 3) DATASET INGEST & KEYWORD EXTRACTION
#    Inspired by Codebasics style-mining workflow
# ─────────────────────────────────────────────────────────────
# Minimal stopword list used to split candidate phrases for RAKE.
STOPWORDS = set("""
a an and the or for nor but so yet of to in on with at by from as is are was were be being been
i you he she it we they them us our your their this that these those here there
""".split())

def simple_rake(text, min_len=3, max_len=3, top_k=10):
    """Lightweight RAKE keyword extraction (no heavy dependencies).

    Candidate phrases are maximal runs of non-stopwords. Each phrase is
    scored by the summed (degree + 1) / frequency ratio of its words, and
    up to top_k phrases with word counts in [min_len, max_len] are
    returned, highest score first.
    """
    tokens = re.findall(r"[A-Za-z0-9#+\-_/']+", text.lower())

    # Split the token stream on stopwords into candidate phrases.
    phrases = []
    run = []
    for tok in tokens:
        if tok not in STOPWORDS:
            run.append(tok)
            continue
        if run:
            phrases.append(" ".join(run))
            run = []
    if run:
        phrases.append(" ".join(run))

    # Per-word statistics: frequency and degree (co-occurrence weight).
    freq = {}
    degree = {}
    for phrase in phrases:
        words = phrase.split()
        extra = len(words) - 1
        for w in words:
            freq[w] = freq.get(w, 0) + 1
            degree[w] = degree.get(w, 0) + extra

    # Phrase score: sum of per-word (degree + 1) / frequency.
    scores = {}
    for phrase in phrases:
        total = sum((degree.get(w, 0) + 1) / freq.get(w, 1) for w in phrase.split())
        scores[phrase] = scores.get(phrase, 0) + total

    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    keep = [ph for ph, _ in ranked if min_len <= len(ph.split()) <= max_len]
    return keep[:top_k]
88
+
89
def tfidf_keywords(texts, top_k=10):
    """Build a tiny TF-IDF scorer over *texts* (robust without sklearn).

    Returns a callable that maps a new document string to its top_k
    highest-scoring words, ranked by TF-IDF against the corpus. Unseen
    words are smoothed with df=1; an empty/tokenless document yields [].
    """
    # Hoist the tokenizer so the returned scorer doesn't rebuild it per call.
    token_re = re.compile(r"[A-Za-z0-9#+\-_/']+")
    docs = [token_re.findall(t.lower()) for t in texts]

    # Document frequency per word (flat int counts; the previous nested
    # {"df": n} dicts and unused enumerate index were needless overhead).
    df_counts = {}
    for doc in docs:
        for w in set(doc):
            df_counts[w] = df_counts.get(w, 0) + 1
    N = len(docs)

    def score_doc(doc):
        # Explicit guard: no tokens → no keywords.
        if not doc:
            return []
        tf = {}
        for w in doc:
            tf[w] = tf.get(w, 0) + 1
        scores = {}
        for w, c in tf.items():
            df = df_counts.get(w, 1)  # smoothing for out-of-corpus words
            idf = math.log((N + 1) / (df + 1)) + 1
            scores[w] = (c / len(doc)) * idf
        ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        return [w for w, _ in ranked[:top_k]]

    # Return a function to score a single new doc compared to the corpus.
    return lambda doc_text: score_doc(token_re.findall(doc_text.lower()))
114
+
115
def load_posts_from_file(file) -> pd.DataFrame:
    """Read an uploaded CSV/JSON of LinkedIn posts into a one-column frame.

    Any of the columns text/post/content/body (case-insensitive) is
    accepted as the post text; the result always exposes it as 'text'.
    Raises ValueError for unsupported extensions or missing text columns.
    """
    fname = file.name.lower()
    if fname.endswith(".csv"):
        df = pd.read_csv(file)
    elif fname.endswith(".json"):
        df = pd.read_json(file, lines=False)
    else:
        raise ValueError("Please upload a CSV or JSON file containing LinkedIn posts.")

    # Normalize columns: expect a column 'text' for post content.
    text_like = [c for c in df.columns if c.lower() in ("text", "post", "content", "body")]
    if not text_like:
        raise ValueError("Dataset must have a 'text' (or post/content/body) column.")
    if "text" not in df.columns:
        df["text"] = df[text_like[0]]
    df["text"] = df["text"].fillna("").astype(str)
    return df[["text"]]
131
+
132
# ─────────────────────────────────────────────────────────────
# 4) PROMPT BUILDING
# ─────────────────────────────────────────────────────────────
def build_structured_prompt(topic, audience, tone, target_len, style_refs, keywords):
    """Assemble the structured generation prompt for the Groq chat call.

    Uses at most 4 style cues (rendered as bullets) and at most 8 seed
    keywords (rendered inline); the fixed header schema anchors output.
    """
    cues = style_refs[:4]
    style_block = "\n".join(f"- {cue}" for cue in cues) if cues else "- None"
    kw_block = ", ".join(keywords[:8]) if keywords else "N/A"

    role = (
        "You are a senior LinkedIn content strategist.\n"
        "Write a high-quality LinkedIn post following the schema below.\n\n"
    )
    brief = (
        f'Topic: "{topic}"\n'
        f'Audience: "{audience}"\n'
        f'Tone: "{tone}"\n'
        f"Target length: ~{target_len} words\n"
        f"Seed keywords to weave in: {kw_block}\n\n"
    )
    cues_section = (
        "Reference style cues (bullet points):\n"
        f"{style_block}\n\n"
    )
    constraints = (
        "Constraints:\n"
        "- No repeated sentences or filler phrases.\n"
        "- Avoid clichés like “it's a great example of how we can make a difference in the world.”\n"
        "- Short sentences (< 20 words); business English; concrete examples.\n"
        "- Use emojis sparingly (0–2), no hashtags inside the body.\n\n"
    )
    schema = (
        "Output format (use headers exactly):\n"
        "HOOK:\n"
        "BODY:\n"
        "- bullet 1\n"
        "- bullet 2\n"
        "- bullet 3\n"
        "TAKEAWAY:\n"
        "CTA:\n"
    )
    return role + brief + cues_section + constraints + schema
163
 
164
# ─────────────────────────────────────────────────────────────
# 5) CALL GROQ CHAT COMPLETIONS
# ─────────────────────────────────────────────────────────────
def groq_generate(prompt, model=GROQ_MODEL, temperature=0.6, top_p=0.9, max_tokens=400):
    """Run one Groq chat completion for *prompt* and return the reply text."""
    messages = [
        {"role": "system", "content": "You craft concise, structured LinkedIn posts."},
        {"role": "user", "content": prompt},
    ]
    client = get_groq_client()
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
        n=1,  # Groq currently supports n=1 in most cases
    )
    return completion.choices[0].message.content.strip()
181
+
182
# ─────────────────────────────────────────────────────────────
# 6) STREAMLIT UI
# ─────────────────────────────────────────────────────────────
# Page chrome.
st.set_page_config(page_title="LinkedIn Post Generator (Groq)", layout="centered")
st.title("🔗 LinkedIn Post Generator — Dataset + Keywords + Groq")
st.caption("Upload sample posts, extract keywords, and generate on Groq LLMs with structured prompts.")

# Sidebar: model choice and decoding parameters.
with st.sidebar:
    st.subheader("Model & Decoding")
    model_choices = [
        "llama-3.3-70b-versatile",
        "llama-3.1-8b-instant",
        "mixtral-8x7b-32768",
    ]
    model = st.selectbox("Groq model", options=model_choices, index=0)
    temperature = st.slider("Temperature", 0.1, 1.2, 0.6, 0.05)
    top_p = st.slider("Top-p", 0.1, 1.0, 0.9, 0.05)
    target_len = st.slider("Target length (words)", 60, 300, 140, 10)
    st.markdown("Secrets: Set GROQ_API_KEY in Space → Settings → Variables & Secrets.")
205
 
206
# Main form: topic/tone/audience, plus optional dataset and style cues.
with st.form("gen_form"):
    topic = st.text_input("Topic", "Generative AI for Business")
    tone = st.selectbox("Tone", ["Professional", "Friendly", "Inspirational", "Technical", "Concise"], index=0)
    audience = st.text_input("Audience", "Startup founders")

    # Optional corpus of reference posts to mine for keywords.
    st.markdown("### Upload dataset of LinkedIn posts (CSV or JSON)")
    uploaded = st.file_uploader("Your dataset should have a 'text' column (or 'post'/'content'/'body').", type=["csv", "json"])

    # Optional free-form style cues, one per line (max 4 are used).
    st.markdown("Optional: add up to 4 style cue snippets (one per line).")
    style_textarea = st.text_area("Style cues", value="", placeholder="e.g.\nShort, punchy hooks\nActionable bullets\nStories with numbers\nTactical CTA")

    submitted = st.form_submit_button("Generate Post")
219
 
220
# Generation flow: runs only after the form is submitted.
if submitted:
    # Guard clauses: credentials and topic are mandatory.
    if not os.getenv("GROQ_API_KEY"):
        st.error("GROQ_API_KEY missing. Add it in Space → Settings → Variables & Secrets (name it exactly GROQ_API_KEY).")
        st.stop()

    if not topic.strip():
        st.warning("Please provide a topic.")
        st.stop()

    # Ingest the optional dataset of reference posts.
    posts_df = None
    if uploaded:
        try:
            posts_df = load_posts_from_file(uploaded)
        except Exception as e:
            st.error(f"Dataset error: {e}")
            st.stop()

    # A TF-IDF scorer only makes sense with a minimal corpus (>= 3 posts).
    tfidf_fn = None
    if posts_df is not None and len(posts_df) >= 3:
        tfidf_fn = tfidf_keywords(posts_df["text"].tolist(), top_k=10)

    # Seed keywords: mine the dataset when present, else the topic alone.
    keywords = []
    if posts_df is not None and len(posts_df):
        # Sample up to 30 posts (fixed seed for reproducibility).
        sample_texts = posts_df["text"].sample(min(30, len(posts_df)), random_state=42).tolist()
        # RAKE over the concatenated sample plus the topic.
        keywords.extend(simple_rake(" ".join(sample_texts + [topic]), min_len=2, max_len=3, top_k=12))
        # TF-IDF of the topic (plus a few posts) relative to the corpus.
        if tfidf_fn is not None:
            keywords.extend(tfidf_fn(topic + " " + " ".join(sample_texts[:5])))
    else:
        # Fallback: RAKE on the topic only.
        keywords = simple_rake(topic, min_len=1, max_len=2, top_k=8)

    # Normalize whitespace/case, drop empties, dedupe preserving order.
    normalized = (re.sub(r"\s+", " ", kw.strip().lower()) for kw in keywords)
    keywords = list(dict.fromkeys(kw for kw in normalized if kw))[:12]

    # Up to four user-provided style cues, one per line.
    style_refs = []
    if style_textarea.strip():
        style_refs = [line.strip() for line in style_textarea.splitlines() if line.strip()][:4]

    prompt = build_structured_prompt(
        topic=topic,
        audience=audience,
        tone=tone,
        target_len=target_len,
        style_refs=style_refs,
        keywords=keywords
    )

    with st.spinner("Generating with Groq..."):
        try:
            # Words → tokens is roughly 1.6x; add headroom, clamp to sane bounds.
            max_tokens = clamp(int(target_len * 1.6) + 120, 200, 1200)
            txt = groq_generate(
                prompt=prompt,
                model=model,
                temperature=temperature,
                top_p=top_p,
                max_tokens=max_tokens
            )
            txt = dedupe_sentences(txt)
            st.success("Generated Post")
            st.write(txt)
            st.download_button("Download post (.txt)", txt, file_name="linkedin_post.txt")
            with st.expander("Debug: keywords & prompt"):
                st.write({"keywords": keywords, "style_refs": style_refs})
                st.code(prompt)
        except Exception as e:
            st.error(f"Groq generation failed: {e}")