Alpha108 committed on
Commit
e79628e
Β·
verified Β·
1 Parent(s): af89629

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +267 -102
app.py CHANGED
@@ -1,19 +1,34 @@
1
- import streamlit as st
2
- import json
3
  import os
4
  import re
5
- from transformers import pipeline
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
STYLE_SAMPLES_FILE = "style_samples.json"

def load_style_samples():
    """Return the saved style samples as a list; [] when no file exists yet."""
    if not os.path.exists(STYLE_SAMPLES_FILE):
        return []
    with open(STYLE_SAMPLES_FILE, "r") as fh:
        return json.load(fh)
14
 
 
 
 
15
  def dedupe_sentences(text: str) -> str:
16
- # Remove verbatim repeated sentences, keep order
17
  parts = re.split(r'(?<=[.!?])\s+', text.strip())
18
  seen = set()
19
  out = []
@@ -22,120 +37,270 @@ def dedupe_sentences(text: str) -> str:
22
  if norm and norm not in seen:
23
  seen.add(norm)
24
  out.append(p.strip())
25
- return " ".join(out)
26
-
27
@st.cache_resource(show_spinner=False)
def load_pipeline():
    """Build and cache the text2text generation pipeline.

    flan-t5-base is CPU-friendly for free Spaces; swap in a stronger
    instruct model later if one is available. device_map is deliberately
    omitted so the Accelerate package is not required on CPU Spaces.
    """
    return pipeline(
        task="text2text-generation",
        model="google/flan-t5-base",
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
def build_prompt(topic, audience, tone, length, style_example_text):
    """Compose the structured flan-t5 prompt; a fixed schema curbs looping."""
    task_header = "Task: Write a LinkedIn post.\n\n"
    brief = (
        f'Topic: "{topic}"\n'
        f'Audience: "{audience}"\n'
        f'Tone: "{tone}"\n'
        f"Target length: ~{length} words.\n\n"
    )
    style_rules = (
        "Style requirements:\n"
        "- Start with a 1–2 line HOOK with a concrete claim or question.\n"
        "- Use 2–3 short BODY paragraphs; sentences under 20 words.\n"
        "- Add 3–5 specific insights or steps; bullets allowed.\n"
        "- End with a clear CTA inviting comments.\n\n"
    )
    constraints = (
        "Constraints:\n"
        "- Do NOT repeat sentences or phrases.\n"
        "- Avoid clichés like “it's a great example of how we can make a difference in the world.”\n"
        "- Use plain business English.\n\n"
    )
    reference = f"Reference style (optional):\n{style_example_text}\n\n"
    schema = (
        "Output format (use these headers exactly):\n"
        "HOOK:\n"
        "BODY:\n"
        "TAKEAWAY:\n"
        "CTA:\n"
    )
    return task_header + brief + style_rules + constraints + reference + schema
62
 
63
- # Load resources
64
- pipe = load_pipeline()
65
- style_samples = load_style_samples()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
- # UI
68
- st.set_page_config(page_title="LinkedIn Post Generator", layout="centered")
69
- st.title("πŸ”— LinkedIn Post Generator (Hugging Face)")
70
- st.write("Generate concise, structured LinkedIn posts with few-shot style guidance.")
 
 
 
 
 
 
 
 
 
 
 
 
71
 
 
72
  with st.form("gen_form"):
73
- topic = st.text_input("Post Topic", "Generative AI for Business")
74
- tone = st.selectbox("Tone", ["Professional", "Friendly", "Inspirational", "Technical", "Concise"])
75
  audience = st.text_input("Audience", "Startup founders")
76
- length = st.slider("Length (approx words)", 40, 300, 120, 10)
77
 
78
- use_sample = st.selectbox(
79
- "Style Sample (optional)",
80
- ["None"] + [f"Sample {i+1}" for i in range(len(style_samples))]
81
- )
82
- custom_style = st.text_area("Or paste your own style sample (optional)")
83
 
84
- with st.expander("Advanced generation settings"):
85
- temperature = st.slider("Temperature", 0.1, 1.2, 0.7, 0.05)
86
- top_p = st.slider("Top-p (nucleus)", 0.1, 1.0, 0.9, 0.05)
87
- repetition_penalty = st.slider("Repetition penalty", 1.0, 2.0, 1.2, 0.05)
88
- no_repeat_ngram_size = st.slider("No-repeat n-gram size", 1, 6, 3, 1)
89
 
90
  submitted = st.form_submit_button("Generate Post")
91
 
92
- style_example_text = ""
93
- if use_sample != "None":
94
- idx = int(use_sample.split()[1]) - 1
95
- style_example_text += f"Sample style:\n{style_samples[idx]}\n"
96
- if custom_style.strip():
97
- style_example_text += f"Custom style:\n{custom_style}\n"
98
-
99
  if submitted:
 
 
 
 
100
  if not topic.strip():
101
- st.warning("Please enter a topic.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  else:
103
- prompt = build_prompt(topic, audience, tone, length, style_example_text)
104
- with st.spinner("Generating..."):
105
- try:
106
- outputs = pipe(
107
- prompt,
108
- max_new_tokens=length + 120,
109
- temperature=temperature,
110
- top_p=top_p,
111
- repetition_penalty=repetition_penalty,
112
- no_repeat_ngram_size=no_repeat_ngram_size
113
- )
114
- # Handle list/dict return variants
115
- if isinstance(outputs, list) and outputs and "generated_text" in outputs[0]:
116
- raw = outputs[0]["generated_text"].strip()
117
- elif isinstance(outputs, dict) and "generated_text" in outputs:
118
- raw = outputs["generated_text"].strip()
119
- else:
120
- raw = str(outputs)
121
-
122
- result = dedupe_sentences(raw)
123
- st.success("Here's your LinkedIn post:")
124
- st.write(result)
125
- st.download_button("Download post as .txt", result, file_name="linkedin_post.txt")
126
- except Exception as e:
127
- st.error(f"Error generating post: {e}")
128
-
129
- st.markdown("---")
130
- st.write("Upload a JSON array of style sample strings (overwrites existing).")
131
- file = st.file_uploader("Upload style_samples.json", type=["json"])
132
- if file:
133
- try:
134
- data = json.load(file)
135
- if not isinstance(data, list) or not all(isinstance(x, str) for x in data):
136
- raise ValueError("JSON must be a list of strings.")
137
- with open(STYLE_SAMPLES_FILE, "w") as f:
138
- json.dump(data, f)
139
- st.success(f"Saved {len(data)} samples. Reload the app to use them.")
140
- except Exception as e:
141
- st.error(f"Upload failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import re
3
+ import json
4
+ import time
5
+ import math
6
+ import streamlit as st
7
+ import pandas as pd
8
+
9
# ─────────────────────────────────────────────────────────────
# 1) GROQ CLIENT (Chat Completions)
# ─────────────────────────────────────────────────────────────
try:
    from groq import Groq
except ImportError:
    # SDK is optional at import time; get_groq_client reports it clearly.
    Groq = None

def get_groq_client():
    """Return an authenticated Groq client.

    Raises RuntimeError when either the API key or the groq SDK is
    missing, so the UI can show a clear message instead of a traceback.
    """
    key = os.getenv("GROQ_API_KEY")
    if not key:
        raise RuntimeError("Missing GROQ_API_KEY. Add it in Space → Settings → Variables & Secrets.")
    if Groq is None:
        raise RuntimeError("groq package not installed. Ensure 'groq' is listed in requirements.txt.")
    return Groq(api_key=key)

# Default Groq model. You can expose this via UI if you want.
GROQ_MODEL = "llama-3.3-70b-versatile"
 
 
 
27
 
28
+ # ─────────────────────────────────────────────────────────────
29
+ # 2) TEXT UTILITIES (dedupe, clamp)
30
+ # ─────────────────────────────────────────────────────────────
31
  def dedupe_sentences(text: str) -> str:
 
32
  parts = re.split(r'(?<=[.!?])\s+', text.strip())
33
  seen = set()
34
  out = []
 
37
  if norm and norm not in seen:
38
  seen.add(norm)
39
  out.append(p.strip())
40
+ return " ".join(out).strip()
41
+
42
def clamp(n, lo, hi):
    """Clamp n into [lo, hi]; the lower bound wins if the bounds cross."""
    capped = hi if n > hi else n
    return lo if capped < lo else capped
44
+
45
# ─────────────────────────────────────────────────────────────
# 3) DATASET INGEST & KEYWORD EXTRACTION
#    Inspired by Codebasics style-mining workflow
# ─────────────────────────────────────────────────────────────
# Minimal stopword list used to split candidate phrases for RAKE.
STOPWORDS = set("""
a an and the or for nor but so yet of to in on with at by from as is are was were be being been
i you he she it we they them us our your their this that these those here there
""".split())

def simple_rake(text, min_len=3, max_len=3, top_k=10):
    """Lightweight RAKE keyword extraction (no heavy dependencies).

    Candidate phrases are maximal runs of non-stopwords. Each phrase is
    scored by the summed (degree + 1) / frequency ratio of its words, and
    up to top_k phrases with word counts in [min_len, max_len] are
    returned, highest score first.
    """
    tokens = re.findall(r"[A-Za-z0-9#+\-_/']+", text.lower())

    # Split the token stream on stopwords into candidate phrases.
    phrases = []
    run = []
    for tok in tokens:
        if tok not in STOPWORDS:
            run.append(tok)
            continue
        if run:
            phrases.append(" ".join(run))
            run = []
    if run:
        phrases.append(" ".join(run))

    # Per-word statistics: frequency and degree (co-occurrence weight).
    freq = {}
    degree = {}
    for phrase in phrases:
        words = phrase.split()
        extra = len(words) - 1
        for w in words:
            freq[w] = freq.get(w, 0) + 1
            degree[w] = degree.get(w, 0) + extra

    # Phrase score: sum of per-word (degree + 1) / frequency.
    scores = {}
    for phrase in phrases:
        total = sum((degree.get(w, 0) + 1) / freq.get(w, 1) for w in phrase.split())
        scores[phrase] = scores.get(phrase, 0) + total

    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    keep = [ph for ph, _ in ranked if min_len <= len(ph.split()) <= max_len]
    return keep[:top_k]
88
+
89
def tfidf_keywords(texts, top_k=10):
    """Build a tiny TF-IDF scorer over *texts* (robust without sklearn).

    Returns a callable that maps a new document string to its top_k
    highest-scoring words, ranked by TF-IDF against the corpus. Unseen
    words are smoothed with df=1; an empty/tokenless document yields [].
    """
    # Hoist the tokenizer so the returned scorer doesn't rebuild it per call.
    token_re = re.compile(r"[A-Za-z0-9#+\-_/']+")
    docs = [token_re.findall(t.lower()) for t in texts]

    # Document frequency per word (flat int counts; the previous nested
    # {"df": n} dicts and unused enumerate index were needless overhead).
    df_counts = {}
    for doc in docs:
        for w in set(doc):
            df_counts[w] = df_counts.get(w, 0) + 1
    N = len(docs)

    def score_doc(doc):
        # Explicit guard: no tokens → no keywords.
        if not doc:
            return []
        tf = {}
        for w in doc:
            tf[w] = tf.get(w, 0) + 1
        scores = {}
        for w, c in tf.items():
            df = df_counts.get(w, 1)  # smoothing for out-of-corpus words
            idf = math.log((N + 1) / (df + 1)) + 1
            scores[w] = (c / len(doc)) * idf
        ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        return [w for w, _ in ranked[:top_k]]

    # Return a function to score a single new doc compared to the corpus.
    return lambda doc_text: score_doc(token_re.findall(doc_text.lower()))
114
+
115
def load_posts_from_file(file) -> pd.DataFrame:
    """Read an uploaded CSV/JSON of LinkedIn posts into a one-column frame.

    Any of the columns text/post/content/body (case-insensitive) is
    accepted as the post text; the result always exposes it as 'text'.
    Raises ValueError for unsupported extensions or missing text columns.
    """
    fname = file.name.lower()
    if fname.endswith(".csv"):
        df = pd.read_csv(file)
    elif fname.endswith(".json"):
        df = pd.read_json(file, lines=False)
    else:
        raise ValueError("Please upload a CSV or JSON file containing LinkedIn posts.")

    # Normalize columns: expect a column 'text' for post content.
    text_like = [c for c in df.columns if c.lower() in ("text", "post", "content", "body")]
    if not text_like:
        raise ValueError("Dataset must have a 'text' (or post/content/body) column.")
    if "text" not in df.columns:
        df["text"] = df[text_like[0]]
    df["text"] = df["text"].fillna("").astype(str)
    return df[["text"]]
131
+
132
# ─────────────────────────────────────────────────────────────
# 4) PROMPT BUILDING
# ─────────────────────────────────────────────────────────────
def build_structured_prompt(topic, audience, tone, target_len, style_refs, keywords):
    """Assemble the structured generation prompt for the Groq chat call.

    Uses at most 4 style cues (rendered as bullets) and at most 8 seed
    keywords (rendered inline); the fixed header schema anchors output.
    """
    cues = style_refs[:4]
    style_block = "\n".join(f"- {cue}" for cue in cues) if cues else "- None"
    kw_block = ", ".join(keywords[:8]) if keywords else "N/A"

    role = (
        "You are a senior LinkedIn content strategist.\n"
        "Write a high-quality LinkedIn post following the schema below.\n\n"
    )
    brief = (
        f'Topic: "{topic}"\n'
        f'Audience: "{audience}"\n'
        f'Tone: "{tone}"\n'
        f"Target length: ~{target_len} words\n"
        f"Seed keywords to weave in: {kw_block}\n\n"
    )
    cues_section = (
        "Reference style cues (bullet points):\n"
        f"{style_block}\n\n"
    )
    constraints = (
        "Constraints:\n"
        "- No repeated sentences or filler phrases.\n"
        "- Avoid clichés like “it's a great example of how we can make a difference in the world.”\n"
        "- Short sentences (< 20 words); business English; concrete examples.\n"
        "- Use emojis sparingly (0–2), no hashtags inside the body.\n\n"
    )
    schema = (
        "Output format (use headers exactly):\n"
        "HOOK:\n"
        "BODY:\n"
        "- bullet 1\n"
        "- bullet 2\n"
        "- bullet 3\n"
        "TAKEAWAY:\n"
        "CTA:\n"
    )
    return role + brief + cues_section + constraints + schema
163
 
164
# ─────────────────────────────────────────────────────────────
# 5) CALL GROQ CHAT COMPLETIONS
# ─────────────────────────────────────────────────────────────
def groq_generate(prompt, model=GROQ_MODEL, temperature=0.6, top_p=0.9, max_tokens=400):
    """Run one Groq chat completion for *prompt* and return the reply text."""
    messages = [
        {"role": "system", "content": "You craft concise, structured LinkedIn posts."},
        {"role": "user", "content": prompt},
    ]
    client = get_groq_client()
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
        n=1,  # Groq currently supports n=1 in most cases
    )
    return completion.choices[0].message.content.strip()
181
+
182
# ─────────────────────────────────────────────────────────────
# 6) STREAMLIT UI
# ─────────────────────────────────────────────────────────────
# Page chrome.
st.set_page_config(page_title="LinkedIn Post Generator (Groq)", layout="centered")
st.title("🔗 LinkedIn Post Generator — Dataset + Keywords + Groq")
st.caption("Upload sample posts, extract keywords, and generate on Groq LLMs with structured prompts.")

# Sidebar: model choice and decoding parameters.
with st.sidebar:
    st.subheader("Model & Decoding")
    model_choices = [
        "llama-3.3-70b-versatile",
        "llama-3.1-8b-instant",
        "mixtral-8x7b-32768",
    ]
    model = st.selectbox("Groq model", options=model_choices, index=0)
    temperature = st.slider("Temperature", 0.1, 1.2, 0.6, 0.05)
    top_p = st.slider("Top-p", 0.1, 1.0, 0.9, 0.05)
    target_len = st.slider("Target length (words)", 60, 300, 140, 10)
    st.markdown("Secrets: Set GROQ_API_KEY in Space → Settings → Variables & Secrets.")
205
 
206
# Main form: topic/tone/audience, plus optional dataset and style cues.
with st.form("gen_form"):
    topic = st.text_input("Topic", "Generative AI for Business")
    tone = st.selectbox("Tone", ["Professional", "Friendly", "Inspirational", "Technical", "Concise"], index=0)
    audience = st.text_input("Audience", "Startup founders")

    # Optional corpus of reference posts to mine for keywords.
    st.markdown("### Upload dataset of LinkedIn posts (CSV or JSON)")
    uploaded = st.file_uploader("Your dataset should have a 'text' column (or 'post'/'content'/'body').", type=["csv", "json"])

    # Optional free-form style cues, one per line (max 4 are used).
    st.markdown("Optional: add up to 4 style cue snippets (one per line).")
    style_textarea = st.text_area("Style cues", value="", placeholder="e.g.\nShort, punchy hooks\nActionable bullets\nStories with numbers\nTactical CTA")

    submitted = st.form_submit_button("Generate Post")
219
 
220
# Generation flow: runs only after the form is submitted.
if submitted:
    # Guard clauses: credentials and topic are mandatory.
    if not os.getenv("GROQ_API_KEY"):
        st.error("GROQ_API_KEY missing. Add it in Space → Settings → Variables & Secrets (name it exactly GROQ_API_KEY).")
        st.stop()

    if not topic.strip():
        st.warning("Please provide a topic.")
        st.stop()

    # Ingest the optional dataset of reference posts.
    posts_df = None
    if uploaded:
        try:
            posts_df = load_posts_from_file(uploaded)
        except Exception as e:
            st.error(f"Dataset error: {e}")
            st.stop()

    # A TF-IDF scorer only makes sense with a minimal corpus (>= 3 posts).
    tfidf_fn = None
    if posts_df is not None and len(posts_df) >= 3:
        tfidf_fn = tfidf_keywords(posts_df["text"].tolist(), top_k=10)

    # Seed keywords: mine the dataset when present, else the topic alone.
    keywords = []
    if posts_df is not None and len(posts_df):
        # Sample up to 30 posts (fixed seed for reproducibility).
        sample_texts = posts_df["text"].sample(min(30, len(posts_df)), random_state=42).tolist()
        # RAKE over the concatenated sample plus the topic.
        keywords.extend(simple_rake(" ".join(sample_texts + [topic]), min_len=2, max_len=3, top_k=12))
        # TF-IDF of the topic (plus a few posts) relative to the corpus.
        if tfidf_fn is not None:
            keywords.extend(tfidf_fn(topic + " " + " ".join(sample_texts[:5])))
    else:
        # Fallback: RAKE on the topic only.
        keywords = simple_rake(topic, min_len=1, max_len=2, top_k=8)

    # Normalize whitespace/case, drop empties, dedupe preserving order.
    normalized = (re.sub(r"\s+", " ", kw.strip().lower()) for kw in keywords)
    keywords = list(dict.fromkeys(kw for kw in normalized if kw))[:12]

    # Up to four user-provided style cues, one per line.
    style_refs = []
    if style_textarea.strip():
        style_refs = [line.strip() for line in style_textarea.splitlines() if line.strip()][:4]

    prompt = build_structured_prompt(
        topic=topic,
        audience=audience,
        tone=tone,
        target_len=target_len,
        style_refs=style_refs,
        keywords=keywords
    )

    with st.spinner("Generating with Groq..."):
        try:
            # Words → tokens is roughly 1.6x; add headroom, clamp to sane bounds.
            max_tokens = clamp(int(target_len * 1.6) + 120, 200, 1200)
            txt = groq_generate(
                prompt=prompt,
                model=model,
                temperature=temperature,
                top_p=top_p,
                max_tokens=max_tokens
            )
            txt = dedupe_sentences(txt)
            st.success("Generated Post")
            st.write(txt)
            st.download_button("Download post (.txt)", txt, file_name="linkedin_post.txt")
            with st.expander("Debug: keywords & prompt"):
                st.write({"keywords": keywords, "style_refs": style_refs})
                st.code(prompt)
        except Exception as e:
            st.error(f"Groq generation failed: {e}")