Spaces:
Sleeping
Sleeping
| import re | |
| import math | |
| import pandas as pd | |
| from typing import List | |
# Common English function words ignored during phrase/keyword extraction.
_STOPWORD_TEXT = (
    "a an and the or for nor but so yet of to in on with at by from as "
    "is are was were be being been "
    "i you he she it we they them us our your their this that these those "
    "here there"
)
STOPWORDS = set(_STOPWORD_TEXT.split())
def dedupe_sentences(text: str) -> str:
    """Collapse repeated sentences, keeping the first occurrence of each.

    Sentences are split on whitespace following ., ! or ?. Duplicates are
    detected case- and whitespace-insensitively; the originally-cased first
    occurrence is what survives.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    # Insertion-ordered dict: key = normalized form, value = first original form.
    unique = {}
    for sentence in sentences:
        trimmed = sentence.strip()
        key = re.sub(r'\s+', ' ', trimmed.lower())
        if key:
            unique.setdefault(key, trimmed)
    return " ".join(unique.values()).strip()
def strip_labels(text: str) -> str:
    """Remove leading section labels (hook/body/takeaway/cta) from each line.

    The original used two case-specific patterns (all-lowercase and
    Capitalized), which missed mixed- and upper-case labels such as
    "HOOK:" or "BODY:". A single case-insensitive pattern covers all
    casings; the loop also strips stacked labels ("Hook: CTA: ...").
    """
    label = re.compile(r'^\s*(hook|body|takeaway|cta):\s*', re.IGNORECASE)
    cleaned = []
    for line in text.splitlines():
        # Peel labels off the front until none remain.
        while (m := label.match(line)):
            line = line[m.end():]
        cleaned.append(line)
    return "\n".join(cleaned).strip()
def load_posts(file) -> pd.DataFrame:
    """Load an uploaded CSV or JSON file and return a one-column frame of post text.

    The file object must expose ``.name`` (used to pick the parser).
    Accepts any of the columns text/post/content/body (case-insensitive);
    the result always has a single string-valued ``text`` column.
    Raises ValueError for other extensions or when no text-like column exists.
    """
    lowered = file.name.lower()
    if lowered.endswith(".csv"):
        frame = pd.read_csv(file)
    elif lowered.endswith(".json"):
        frame = pd.read_json(file, lines=False)
    else:
        raise ValueError("Upload CSV or JSON.")
    # First column whose lowercased name looks like post text.
    candidate = next(
        (c for c in frame.columns if c.lower() in ("text", "post", "content", "body")),
        None,
    )
    if candidate is None:
        raise ValueError("Dataset must include 'text' (or post/content/body).")
    if "text" not in frame.columns:
        frame["text"] = frame[candidate]
    frame["text"] = frame["text"].fillna("").astype(str)
    return frame[["text"]]
def simple_rake(text, min_len=2, max_len=3, top_k=12):
    """RAKE-style keyword extraction: score stopword-free phrases, return the best.

    Candidate phrases are maximal runs of non-stopword tokens. Each word gets
    a degree/frequency score; a phrase scores the sum of its word scores.
    Only phrases with min_len..max_len words are returned, at most top_k.
    """
    tokens = re.findall(r"[A-Za-z0-9#+\-_/']+", text.lower())
    # Split the token stream on stopwords into candidate phrases.
    phrases = []
    run = []
    for tok in tokens:
        if tok in STOPWORDS:
            if run:
                phrases.append(" ".join(run))
                run = []
        else:
            run.append(tok)
    if run:
        phrases.append(" ".join(run))
    # Word frequency and co-occurrence degree across all phrases.
    word_freq = {}
    word_degree = {}
    for phrase in phrases:
        members = phrase.split()
        span = len(members) - 1
        for w in members:
            word_freq[w] = word_freq.get(w, 0) + 1
            word_degree[w] = word_degree.get(w, 0) + span
    phrase_scores = {
        phrase: sum(
            (word_degree.get(w, 0) + 1) / word_freq.get(w, 1)
            for w in phrase.split()
        )
        for phrase in phrases
    }
    best_first = sorted(phrase_scores, key=phrase_scores.get, reverse=True)
    return [p for p in best_first if min_len <= len(p.split()) <= max_len][:top_k]
def tfidf_builder(texts: List[str], top_k=8):
    """Build a TF-IDF scorer over the corpus *texts*.

    Returns a closure ``score(text) -> list[str]`` giving the up-to-top_k
    tokens of *text* with the highest TF-IDF, using document frequencies
    computed once from *texts*. Unseen words get a document frequency of 1.

    Fix: the original raised ZeroDivisionError when *text* had no tokens
    (empty string, punctuation-only); such input now returns [].
    """
    token_re = re.compile(r"[A-Za-z0-9#+\-_/']+")  # hoisted: reused per call
    docs = [token_re.findall(t.lower()) for t in texts]
    vocab = {}
    for d in docs:
        for w in set(d):  # set(): document frequency, not term frequency
            vocab[w] = vocab.get(w, 0) + 1
    N = len(docs)

    def score(text):
        doc = token_re.findall(text.lower())
        if not doc:
            # Guard against division by zero on token-free input.
            return []
        tf = {}
        for w in doc:
            tf[w] = tf.get(w, 0) + 1
        doc_len = len(doc)
        scores = {}
        for w, c in tf.items():
            df = vocab.get(w, 1)
            # Smoothed IDF so corpus-wide words still score > 0.
            idf = math.log((N + 1) / (df + 1)) + 1
            scores[w] = (c / doc_len) * idf
        ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        return [w for w, _ in ranked[:top_k]]

    return score
def extract_keywords(topic: str, df: pd.DataFrame | None) -> List[str]:
    """Collect up to 12 normalized candidate keywords for *topic*.

    With a non-empty corpus *df* (must have a ``text`` column), combines
    RAKE phrases mined from a deterministic 30-row sample plus the topic
    with TF-IDF terms scored against the full corpus; otherwise falls back
    to RAKE on the topic alone. Results are lowercased, whitespace-collapsed
    and de-duplicated in first-seen order.
    """
    if df is None or not len(df):
        candidates = simple_rake(topic, min_len=1, max_len=2, top_k=8)
    else:
        sampled = df["text"].sample(min(30, len(df)), random_state=42).tolist()
        candidates = simple_rake(
            " ".join(sampled + [topic]), min_len=2, max_len=3, top_k=12
        )
        scorer = tfidf_builder(df["text"].tolist(), top_k=8)
        candidates = candidates + scorer(topic + " " + " ".join(sampled[:5]))
    deduped = []
    seen = set()
    for cand in candidates:
        normalized = re.sub(r"\s+", " ", cand.strip().lower())
        if normalized and normalized not in seen:
            seen.add(normalized)
            deduped.append(normalized)
    return deduped[:12]