LinkedInPostGenerator1.1

Sleeping

App Files Files Community

Alpha108 committed on Nov 8, 2025

Commit

88209fc

verified ·

1 Parent(s): bec91ec

Update app.py

Browse files

Files changed (1) hide show

app.py +135 -155

app.py CHANGED Viewed

@@ -1,14 +1,23 @@
 import os
 import re
 import json
-import time
 import math
 import streamlit as st
 import pandas as pd
-# ─────────────────────────────────────────────────────────────
-# 1) GROQ CLIENT (Chat Completions)
-# ─────────────────────────────────────────────────────────────
 try:
     from groq import Groq
 except ImportError:
@@ -17,17 +26,29 @@ except ImportError:
 def get_groq_client():
     api_key = os.getenv("GROQ_API_KEY")
     if not api_key:
-        raise RuntimeError("Missing GROQ_API_KEY. Add it in Space → Settings → Variables & Secrets.")
     if Groq is None:
-        raise RuntimeError("groq package not installed. Ensure 'groq' is listed in requirements.txt.")
     return Groq(api_key=api_key)
-# Default Groq model. You can expose this via UI if you want.
-GROQ_MODEL = "llama-3.3-70b-versatile"
-# ─────────────────────────────────────────────────────────────
-# 2) TEXT UTILITIES (dedupe, clamp)
-# ─────────────────────────────────────────────────────────────
 def dedupe_sentences(text: str) -> str:
     parts = re.split(r'(?<=[.!?])\s+', text.strip())
     seen = set()
@@ -39,25 +60,48 @@ def dedupe_sentences(text: str) -> str:
             out.append(p.strip())
     return " ".join(out).strip()
 def clamp(n, lo, hi):
     return max(lo, min(hi, n))
-# ─────────────────────────────────────────────────────────────
-# 3) DATASET INGEST & KEYWORD EXTRACTION
-#    Inspired by Codebasics style-mining workflow
-# ─────────────────────────────────────────────────────────────
-# RAKE keyword extraction (simple, no heavy deps)
-STOPWORDS = set("""
-a an and the or for nor but so yet of to in on with at by from as is are was were be being been
-i you he she it we they them us our your their this that these those here there
-""".split())
-def simple_rake(text, min_len=3, max_len=3, top_k=10):
-    # Split by stopwords to get candidate phrases
     words = re.findall(r"[A-Za-z0-9#+\-_/']+", text.lower())
     phrases, cur = [], []
     for w in words:
-        if w in STOPWORDS:
             if cur:
                 phrases.append(" ".join(cur))
                 cur = []
@@ -65,130 +109,98 @@ def simple_rake(text, min_len=3, max_len=3, top_k=10):
             cur.append(w)
     if cur:
         phrases.append(" ".join(cur))
-    # Score by sum of word degrees
-    freq = {}
-    degree = {}
     for ph in phrases:
-        tokens = ph.split()
-        for t in tokens:
             freq[t] = freq.get(t, 0) + 1
-            degree[t] = degree.get(t, 0) + (len(tokens) - 1)
     scores = {}
     for ph in phrases:
         s = 0.0
         for t in ph.split():
             s += (degree.get(t, 0) + 1) / (freq.get(t, 1))
         scores[ph] = scores.get(ph, 0) + s
     ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
-    filtered = [p for p, _ in ranked if len(p.split()) >= min_len and len(p.split()) <= max_len]
     return filtered[:top_k]
-def tfidf_keywords(texts, top_k=10):
-    # Extremely small TF-IDF for robustness without sklearn
-    # Build df
     docs = [re.findall(r"[A-Za-z0-9#+\-_/']+", t.lower()) for t in texts]
     vocab = {}
-    for i, d in enumerate(docs):
         for w in set(d):
             vocab.setdefault(w, {"df": 0})
             vocab[w]["df"] += 1
     N = len(docs)
-    def score_doc(doc):
         tf = {}
         for w in doc:
             tf[w] = tf.get(w, 0) + 1
         scores = {}
-        for w, c in tf.items():
             df = vocab.get(w, {}).get("df", 1)
             idf = math.log((N + 1) / (df + 1)) + 1
-            scores[w] = (c / len(doc)) * idf
         ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
-        return [w for w, s in ranked[:top_k]]
-    # Return a function to score a single new doc compared to corpus
-    return lambda doc_text: score_doc(re.findall(r"[A-Za-z0-9#+\-_/']+", doc_text.lower()))
-def load_posts_from_file(file) -> pd.DataFrame:
-    name = file.name.lower()
-    if name.endswith(".csv"):
-        df = pd.read_csv(file)
-    elif name.endswith(".json"):
-        df = pd.read_json(file, lines=False)
     else:
-        raise ValueError("Please upload a CSV or JSON file containing LinkedIn posts.")
-    # Normalize columns: expect a column 'text' for post content
-    candidate_cols = [c for c in df.columns if c.lower() in ("text", "post", "content", "body")]
-    if not candidate_cols:
-        raise ValueError("Dataset must have a 'text' (or post/content/body) column.")
-    if "text" not in df.columns:
-        df["text"] = df[candidate_cols[0]]
-    df["text"] = df["text"].fillna("").astype(str)
-    return df[["text"]]
-# ─────────────────────────────────────────────────────────────
-# 4) PROMPT BUILDING
-# ─────────────────────────────────────────────────────────────
-def build_structured_prompt(topic, audience, tone, target_len, style_refs, keywords):
     style_block = "\n".join(f"- {s}" for s in style_refs[:4]) if style_refs else "- None"
     kw_block = ", ".join(keywords[:8]) if keywords else "N/A"
     return (
         "You are a senior LinkedIn content strategist.\n"
-        "Write a high-quality LinkedIn post following the schema below.\n\n"
         f"Topic: \"{topic}\"\n"
         f"Audience: \"{audience}\"\n"
         f"Tone: \"{tone}\"\n"
-        f"Target length: ~{target_len} words\n"
-        f"Seed keywords to weave in: {kw_block}\n\n"
-        "Reference style cues (bullet points):\n"
         f"{style_block}\n\n"
-        "Constraints:\n"
-        "- No repeated sentences or filler phrases.\n"
-        "- Avoid clichés like “it's a great example of how we can make a difference in the world.”\n"
-        "- Short sentences (< 20 words); business English; concrete examples.\n"
-        "- Use emojis sparingly (0–2), no hashtags inside the body.\n\n"
-        "Output format (use headers exactly):\n"
-        "HOOK:\n"
-        "BODY:\n"
-        "- bullet 1\n"
-        "- bullet 2\n"
-        "- bullet 3\n"
-        "TAKEAWAY:\n"
-        "CTA:\n"
     )
-# ─────────────────────────────────────────────────────────────
-# 5) CALL GROQ CHAT COMPLETIONS
-# ─────────────────────────────────────────────────────────────
-def groq_generate(prompt, model=GROQ_MODEL, temperature=0.6, top_p=0.9, max_tokens=400):
-    client = get_groq_client()
-    resp = client.chat.completions.create(
-        model=model,
-        messages=[
-            {"role": "system", "content": "You craft concise, structured LinkedIn posts."},
-            {"role": "user", "content": prompt}
-        ],
-        temperature=temperature,
-        top_p=top_p,
-        max_tokens=max_tokens,
-        n=1  # Groq currently supports n=1 in most cases
-    )
-    return resp.choices[0].message.content.strip()
-# ─────────────────────────────────────────────────────────────
-# 6) STREAMLIT UI
-# ─────────────────────────────────────────────────────────────
-st.set_page_config(page_title="LinkedIn Post Generator (Groq)", layout="centered")
-st.title("🔗 LinkedIn Post Generator — Dataset + Keywords + Groq")
-st.caption("Upload sample posts, extract keywords, and generate on Groq LLMs with structured prompts.")
-# Sidebar: Model and decoding controls
 with st.sidebar:
-    st.subheader("Model & Decoding")
     model = st.selectbox(
         "Groq model",
         options=[
@@ -201,33 +213,30 @@ with st.sidebar:
     temperature = st.slider("Temperature", 0.1, 1.2, 0.6, 0.05)
     top_p = st.slider("Top-p", 0.1, 1.0, 0.9, 0.05)
     target_len = st.slider("Target length (words)", 60, 300, 140, 10)
-    st.markdown("Secrets: Set GROQ_API_KEY in Space → Settings → Variables & Secrets.")
-# Main form
 with st.form("gen_form"):
     topic = st.text_input("Topic", "Generative AI for Business")
     tone = st.selectbox("Tone", ["Professional", "Friendly", "Inspirational", "Technical", "Concise"], index=0)
     audience = st.text_input("Audience", "Startup founders")
-    st.markdown("### Upload dataset of LinkedIn posts (CSV or JSON)")
-    uploaded = st.file_uploader("Your dataset should have a 'text' column (or 'post'/'content'/'body').", type=["csv", "json"])
-    st.markdown("Optional: add up to 4 style cue snippets (one per line).")
-    style_textarea = st.text_area("Style cues", value="", placeholder="e.g.\nShort, punchy hooks\nActionable bullets\nStories with numbers\nTactical CTA")
     submitted = st.form_submit_button("Generate Post")
-# Process
 if submitted:
     if not os.getenv("GROQ_API_KEY"):
-        st.error("GROQ_API_KEY missing. Add it in Space → Settings → Variables & Secrets (name it exactly GROQ_API_KEY).")
         st.stop()
     if not topic.strip():
-        st.warning("Please provide a topic.")
         st.stop()
-    # Load posts
     posts_df = None
     if uploaded:
         try:
@@ -236,37 +245,8 @@ if submitted:
             st.error(f"Dataset error: {e}")
             st.stop()
-    # Build keyword extractors
-    tfidf_fn = None
-    if posts_df is not None and len(posts_df) >= 3:
-        # prepare a TF-IDF scorer over the corpus
-        tfidf_fn = tfidf_keywords(posts_df["text"].tolist(), top_k=10)
-    # Extract keywords from dataset context + topic
-    keywords = []
-    if posts_df is not None and len(posts_df):
-        # Use top-k sampled posts to seed keyword candidates
-        sample_texts = posts_df["text"].sample(min(30, len(posts_df)), random_state=42).tolist()
-        # RAKE on concatenated sample
-        rake_kw = simple_rake(" ".join(sample_texts + [topic]), min_len=2, max_len=3, top_k=12)
-        keywords.extend(rake_kw)
-        # TF-IDF relative to corpus on the topic text
-        if tfidf_fn is not None:
-            kw2 = tfidf_fn(topic + " " + " ".join(sample_texts[:5]))
-            keywords.extend(kw2)
-    else:
-        # Fallback: RAKE on topic only
-        keywords = simple_rake(topic, min_len=1, max_len=2, top_k=8)
-    # Normalize and dedupe keywords
-    norm_kw = []
-    seen = set()
-    for k in keywords:
-        k2 = re.sub(r"\s+", " ", k.strip().lower())
-        if k2 and k2 not in seen:
-            seen.add(k2)
-            norm_kw.append(k2)
-    keywords = norm_kw[:12]
     # Style cues
     style_refs = []
@@ -274,8 +254,8 @@ if submitted:
         style_refs = [s.strip() for s in style_textarea.splitlines() if s.strip()]
         style_refs = style_refs[:4]
-    # Prompt
-    prompt = build_structured_prompt(
         topic=topic,
         audience=audience,
         tone=tone,
@@ -286,7 +266,6 @@ if submitted:
     with st.spinner("Generating with Groq..."):
         try:
-            # Convert words to approximate tokens for cap (rough 1.4x)
             max_tokens = clamp(int(target_len * 1.6) + 120, 200, 1200)
             txt = groq_generate(
                 prompt=prompt,
@@ -295,10 +274,11 @@ if submitted:
                 top_p=top_p,
                 max_tokens=max_tokens
             )
-            txt = dedupe_sentences(txt)
             st.success("Generated Post")
             st.write(txt)
-            st.download_button("Download post (.txt)", txt, file_name="linkedin_post.txt")
             with st.expander("Debug: keywords & prompt"):
                 st.write({"keywords": keywords, "style_refs": style_refs})
                 st.code(prompt)

 import os
 import re
 import json
 import math
 import streamlit as st
 import pandas as pd
# =========================
# 0) CONFIG / CONSTANTS
# =========================
# Default Groq model identifier (the original note says the sidebar lets you change it).
GROQ_DEFAULT_MODEL = "llama-3.3-70b-versatile"

# Upper bound on the number of keywords returned by extract_keywords().
MAX_KEYWORDS = 12

# Words that end a candidate phrase in simple_rake(); written as adjacent
# string literals that are split on whitespace into a set.
SEED_STOPWORDS = set(
    (
        "a an and the or for nor but so yet of to in on with at by from as "
        "is are was were be being been "
        "i you he she it we they them us our your their "
        "this that these those here there"
    ).split()
)
+# =========================
+# 1) GROQ CLIENT
+# =========================
 try:
     from groq import Groq
 except ImportError:
 def get_groq_client():
     api_key = os.getenv("GROQ_API_KEY")
     if not api_key:
+        raise RuntimeError("Missing GROQ_API_KEY. Set it in Space → Settings → Variables & Secrets.")
     if Groq is None:
+        raise RuntimeError("Package 'groq' not installed. Add 'groq' to requirements.txt.")
     return Groq(api_key=api_key)
def groq_generate(prompt, model, temperature, top_p, max_tokens):
    """Request one chat completion from Groq and return its text.

    The prompt is sent as the user message after a fixed system message.
    The content of the first returned choice is stripped of surrounding
    whitespace before being returned.
    """
    groq = get_groq_client()
    conversation = [
        {
            "role": "system",
            "content": "You craft concise, insightful LinkedIn posts that feel original and practical.",
        },
        {"role": "user", "content": prompt},
    ]
    request = {
        "model": model,
        "messages": conversation,
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens,
        "n": 1,
    }
    completion = groq.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()
+# =========================
+# 2) TEXT UTILS
+# =========================
 def dedupe_sentences(text: str) -> str:
     parts = re.split(r'(?<=[.!?])\s+', text.strip())
     seen = set()
             out.append(p.strip())
     return " ".join(out).strip()
def strip_labels(text: str) -> str:
    """Remove leading section labels from every line of ``text``.

    A label is one of ``hook:``, ``body:``, ``takeaway:`` or ``cta:`` at the
    start of a line, optionally preceded by whitespace. Matching is
    case-insensitive, so ``HOOK:`` and ``Cta:`` are removed as well as the
    lower-case and capitalised spellings. Several labels chained at the start
    of one line are all removed. Labels that appear later in a line are left
    untouched, as are lines without a leading label.

    Args:
        text: Model output, possibly containing labelled lines.

    Returns:
        The text with leading labels removed, joined with ``\\n`` and stripped
        of surrounding whitespace.
    """
    # One case-insensitive pattern replaces the per-case pattern list; the
    # trailing ``+`` consumes runs such as "Hook: Body: ...".
    label_prefix = re.compile(r'^\s*(?:(?:hook|body|takeaway|cta):\s*)+', re.IGNORECASE)
    return "\n".join(label_prefix.sub('', line) for line in text.splitlines()).strip()
 def clamp(n, lo, hi):
     return max(lo, min(hi, n))
+# =========================
+# 3) DATA INGEST & KEYWORDS
+# =========================
def load_posts_from_file(file) -> pd.DataFrame:
    """Read an uploaded CSV or JSON file into a one-column ``text`` DataFrame.

    The format is chosen from the lower-cased ``file.name`` extension. The
    post text is taken from a ``text`` column when present, otherwise from
    the first column whose lower-cased name is text/post/content/body.
    Missing values become empty strings and all values are cast to ``str``.

    Raises:
        ValueError: if the extension is neither .csv nor .json, or if no
            usable text column exists.
    """
    filename = file.name.lower()
    if filename.endswith(".csv"):
        frame = pd.read_csv(file)
    else:
        if not filename.endswith(".json"):
            raise ValueError("Upload a CSV or JSON file containing LinkedIn posts.")
        frame = pd.read_json(file, lines=False)

    # Normalize to 'text' column
    accepted = ("text", "post", "content", "body")
    matches = []
    for column in frame.columns:
        if column.lower() in accepted:
            matches.append(column)
    if len(matches) == 0:
        raise ValueError("Dataset must have a 'text' (or post/content/body) column.")

    if "text" not in frame.columns:
        frame["text"] = frame[matches[0]]
    frame["text"] = frame["text"].fillna("").astype(str)
    return frame[["text"]]
+def simple_rake(text, min_len=2, max_len=3, top_k=12):
     words = re.findall(r"[A-Za-z0-9#+\-_/']+", text.lower())
     phrases, cur = [], []
     for w in words:
+        if w in SEED_STOPWORDS:
             if cur:
                 phrases.append(" ".join(cur))
                 cur = []
             cur.append(w)
     if cur:
         phrases.append(" ".join(cur))
+    # Score by frequency+degree
+    freq, degree = {}, {}
     for ph in phrases:
+        toks = ph.split()
+        for t in toks:
             freq[t] = freq.get(t, 0) + 1
+            degree[t] = degree.get(t, 0) + (len(toks) - 1)
     scores = {}
     for ph in phrases:
         s = 0.0
         for t in ph.split():
             s += (degree.get(t, 0) + 1) / (freq.get(t, 1))
         scores[ph] = scores.get(ph, 0) + s
     ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
+    filtered = [p for p, _ in ranked if min_len <= len(p.split()) <= max_len]
     return filtered[:top_k]
def tfidf_keywords_builder(texts, top_k=10):
    """Build a TF-IDF keyword scorer from a corpus of texts.

    Document frequencies are counted once over ``texts``. The returned
    callable tokenizes a new text, weights each distinct token by
    ``(count / token_total) * (log((N + 1) / (df + 1)) + 1)`` and returns
    the ``top_k`` highest-weighted tokens. Tokens unseen in the corpus are
    treated as having a document frequency of 1. Ties keep first-occurrence
    order.
    """
    token_pattern = r"[A-Za-z0-9#+\-_/']+"

    doc_freq = {}
    corpus_size = 0
    for raw in texts:
        corpus_size += 1
        # Count each term once per document, however often it repeats.
        for term in set(re.findall(token_pattern, raw.lower())):
            doc_freq[term] = doc_freq.get(term, 0) + 1

    def rank_terms(text):
        tokens = re.findall(token_pattern, text.lower())
        total = len(tokens)
        counts = {}
        for term in tokens:
            counts[term] = counts.get(term, 0) + 1
        weights = {
            term: (count / total)
            * (math.log((corpus_size + 1) / (doc_freq.get(term, 1) + 1)) + 1)
            for term, count in counts.items()
        }
        # A stable sort keeps first-seen order among equal weights.
        return sorted(weights, key=weights.get, reverse=True)[:top_k]

    return rank_terms
def extract_keywords(topic: str, posts_df: pd.DataFrame | None):
    """Collect up to MAX_KEYWORDS normalized, de-duplicated keywords.

    With a non-empty ``posts_df``, RAKE phrases from a seeded sample of up to
    30 posts plus the topic are combined with TF-IDF terms scored against
    the whole ``text`` column. Without posts, RAKE runs on the topic alone.
    Keywords are lower-cased, whitespace-collapsed and kept in
    first-occurrence order.
    """
    has_posts = posts_df is not None and len(posts_df)
    if not has_posts:
        candidates = simple_rake(topic, min_len=1, max_len=2, top_k=8)
    else:
        texts = posts_df["text"]
        # A fixed random_state keeps the sample reproducible across reruns.
        sampled = texts.sample(min(30, len(posts_df)), random_state=42).tolist()
        phrase_kw = simple_rake(
            " ".join(sampled + [topic]), min_len=2, max_len=3, top_k=MAX_KEYWORDS
        )
        scorer = tfidf_keywords_builder(texts.tolist(), top_k=MAX_KEYWORDS // 2)
        term_kw = scorer(topic + " " + " ".join(sampled[:5]))
        candidates = phrase_kw + term_kw

    normalized = (re.sub(r"\s+", " ", kw.strip().lower()) for kw in candidates)
    # dict.fromkeys drops duplicates while preserving first-seen order.
    unique = dict.fromkeys(kw for kw in normalized if kw)
    return list(unique)[:MAX_KEYWORDS]
+# =========================
+# 4) PROMPT (PLAIN OUTPUT)
+# =========================
def build_viral_prompt(topic, audience, tone, target_len, style_refs, keywords):
    """Assemble the user prompt asking for a plain-text LinkedIn post.

    At most the first four style references and the first eight keywords
    are included; "- None" and "N/A" stand in when either list is empty.
    """
    if style_refs:
        style_block = "\n".join(f"- {cue}" for cue in style_refs[:4])
    else:
        style_block = "- None"
    if keywords:
        kw_block = ", ".join(keywords[:8])
    else:
        kw_block = "N/A"

    sections = [
        "You are a senior LinkedIn content strategist.\n",
        "Objective: Write a viral, insightful LinkedIn post as plain text only (no section headers, no labels), ",
        f"around {target_len} words, for the audience and topic below.\n\n",
        f'Topic: "{topic}"\n',
        f'Audience: "{audience}"\n',
        f'Tone: "{tone}"\n',
        f"Keywords to naturally weave in: {kw_block}\n\n",
        "Style cues (reflect these, do not list them):\n",
        f"{style_block}\n\n",
        "Apply silently (do not mention these rules):\n",
        "- Open with a curiosity-driving first line.\n",
        "- Use short sentences and short paragraphs.\n",
        "- Include 3–5 concrete insights, examples, or steps (bullets allowed, but no section labels).\n",
        "- Be specific, novel, and practical; avoid clichés and filler.\n",
        "- Use up to 2 emojis; add 2–4 niche hashtags only at the very end (optional).\n",
        "- Never output headings like HOOK/BODY/TAKEAWAY/CTA.\n",
        "- Do not repeat the phrase: “it's a great example of how we can make a difference in the world.”\n\n",
        "Output: A single cohesive LinkedIn post as plain text only. No headings. No metadata. No explanations.",
    ]
    return "".join(sections)
+# =========================
+# 5) STREAMLIT UI
+# =========================
+st.set_page_config(page_title="LinkedIn Post Generator — Groq", layout="centered")
+st.title("🔗 LinkedIn Post Generator — Dataset Keywords + Groq")
+st.caption("Upload sample posts, extract keywords, and generate plain-text viral posts via Groq.")
 with st.sidebar:
+    st.subheader("Groq & Decoding")
     model = st.selectbox(
         "Groq model",
         options=[
     temperature = st.slider("Temperature", 0.1, 1.2, 0.6, 0.05)
     top_p = st.slider("Top-p", 0.1, 1.0, 0.9, 0.05)
     target_len = st.slider("Target length (words)", 60, 300, 140, 10)
+    st.markdown("Set GROQ_API_KEY in Space → Settings → Variables & Secrets.")
 with st.form("gen_form"):
     topic = st.text_input("Topic", "Generative AI for Business")
     tone = st.selectbox("Tone", ["Professional", "Friendly", "Inspirational", "Technical", "Concise"], index=0)
     audience = st.text_input("Audience", "Startup founders")
+    st.markdown("### Upload dataset (CSV/JSON) of LinkedIn posts")
+    uploaded = st.file_uploader("Dataset must include a 'text' (or 'post'/'content'/'body') column.", type=["csv", "json"])
+    st.markdown("Optional: add up to 4 style cues (one per line).")
+    style_textarea = st.text_area("Style cues", value="", placeholder="Short hooks\nActionable bullets\nStories with numbers\nTactical CTA")
     submitted = st.form_submit_button("Generate Post")
 if submitted:
     if not os.getenv("GROQ_API_KEY"):
+        st.error("GROQ_API_KEY missing. Add it in Space → Settings → Variables & Secrets.")
         st.stop()
     if not topic.strip():
+        st.warning("Please enter a topic.")
         st.stop()
+    # Load dataset if provided
     posts_df = None
     if uploaded:
         try:
             st.error(f"Dataset error: {e}")
             st.stop()
+    # Extract keywords
+    keywords = extract_keywords(topic, posts_df)
     # Style cues
     style_refs = []
         style_refs = [s.strip() for s in style_textarea.splitlines() if s.strip()]
         style_refs = style_refs[:4]
+    # Build prompt and generate
+    prompt = build_viral_prompt(
         topic=topic,
         audience=audience,
         tone=tone,
     with st.spinner("Generating with Groq..."):
         try:
             max_tokens = clamp(int(target_len * 1.6) + 120, 200, 1200)
             txt = groq_generate(
                 prompt=prompt,
                 top_p=top_p,
                 max_tokens=max_tokens
             )
+            # Clean and display
+            txt = dedupe_sentences(strip_labels(txt))
             st.success("Generated Post")
             st.write(txt)
+            st.download_button("Download (.txt)", txt, file_name="linkedin_post.txt")
             with st.expander("Debug: keywords & prompt"):
                 st.write({"keywords": keywords, "style_refs": style_refs})
                 st.code(prompt)