Update app.py
Browse files
app.py
CHANGED
|
@@ -6,6 +6,10 @@ import pandas as pd
|
|
| 6 |
import gradio as gr
|
| 7 |
from sentence_transformers import SentenceTransformer, util
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
# =========================
|
| 10 |
# ALWAYS-GENERATE (HF) DATASET
|
| 11 |
# =========================
|
|
@@ -29,8 +33,9 @@ def create_synthetic_influencer_dataset(n=1200, out_csv="synthetic_influencers.c
|
|
| 29 |
"Transformers not installed. Install with: pip install transformers torch"
|
| 30 |
) from e
|
| 31 |
|
| 32 |
-
#
|
| 33 |
-
|
|
|
|
| 34 |
|
| 35 |
countries = [
|
| 36 |
"USA","UK","Canada","Australia","Brazil","India","France","Germany","Italy","Spain",
|
|
@@ -38,43 +43,68 @@ def create_synthetic_influencer_dataset(n=1200, out_csv="synthetic_influencers.c
|
|
| 38 |
]
|
| 39 |
niches = [
|
| 40 |
"Fashion","Travel","Food","Fitness","Tech","Beauty","Gaming","Music","Photography",
|
| 41 |
-
"Lifestyle","Education","Finance","Sports","Parenting","DIY",
|
| 42 |
-
"
|
| 43 |
]
|
| 44 |
platforms = ["youtube", "instagram", "tiktok", "twitch", "x"] # lowercase -> file prefix
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
| 50 |
out = name_gen(
|
| 51 |
prompt,
|
| 52 |
-
max_new_tokens=
|
| 53 |
-
|
| 54 |
do_sample=True,
|
| 55 |
-
temperature=0.
|
| 56 |
-
top_p=0.
|
| 57 |
)[0]["generated_text"]
|
| 58 |
|
| 59 |
-
|
| 60 |
-
name =
|
| 61 |
-
|
| 62 |
-
#
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
# --- Structured fields sampled to look realistic ---
|
| 67 |
followers = random.randint(5_000, 5_000_000)
|
| 68 |
er = round(random.uniform(0.5, 15.0), 2) # %
|
| 69 |
country = random.choice(countries)
|
| 70 |
niche = random.choice(niches)
|
| 71 |
reach = int(followers * random.uniform(0.25, 0.95))
|
| 72 |
-
|
| 73 |
platform_token = random.choice(platforms) # e.g., 'youtube'
|
| 74 |
region_hint = country.lower().replace(" ", "")
|
| 75 |
source_file = f"{platform_token}_data_{region_hint}.csv" # <- first token = platform
|
| 76 |
source_path = f"synthetic/{source_file}"
|
| 77 |
|
|
|
|
|
|
|
|
|
|
| 78 |
rows.append([
|
| 79 |
rank, name, followers, er, country, niche, reach, source_file, source_path
|
| 80 |
])
|
|
@@ -167,7 +197,7 @@ iface = gr.Interface(
|
|
| 167 |
article=(
|
| 168 |
"**Project:** AI-Powered Influencer Recommender for Social Media Marketing\n\n"
|
| 169 |
"**Models:**\n"
|
| 170 |
-
"-
|
| 171 |
"- sentence-transformers/all-MiniLM-L6-v2 for semantic embeddings (recommendations)\n\n"
|
| 172 |
"**Dataset:** 1,200-row synthetic influencer dataset generated at runtime."
|
| 173 |
),
|
|
|
|
| 6 |
import gradio as gr
|
| 7 |
from sentence_transformers import SentenceTransformer, util
|
| 8 |
|
| 9 |
+
# Optional: make Hugging Face Hub downloads less flaky on Spaces.
# setdefault() only fills in a value when the variable is absent, so any
# explicit configuration supplied by the host environment always wins.
os.environ.setdefault("HF_HUB_READ_TIMEOUT", "60")  # seconds before a hub read times out
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")  # opt in to the hf_transfer fast download path
|
| 12 |
+
|
| 13 |
# =========================
|
| 14 |
# ALWAYS-GENERATE (HF) DATASET
|
| 15 |
# =========================
|
|
|
|
| 33 |
"Transformers not installed. Install with: pip install transformers torch"
|
| 34 |
) from e
|
| 35 |
|
| 36 |
+
# Use an instruction-following model for names (much cleaner than distilgpt2)
|
| 37 |
+
# Small & CPU-friendly; you can bump to "google/flan-t5-base" if you want even better quality.
|
| 38 |
+
name_gen = pipeline("text2text-generation", model="google/flan-t5-small")
|
| 39 |
|
| 40 |
countries = [
|
| 41 |
"USA","UK","Canada","Australia","Brazil","India","France","Germany","Italy","Spain",
|
|
|
|
| 43 |
]
|
| 44 |
niches = [
|
| 45 |
"Fashion","Travel","Food","Fitness","Tech","Beauty","Gaming","Music","Photography",
|
| 46 |
+
"Lifestyle","Education","Finance","Sports","Parenting","DIY",
|
| 47 |
+
"Fashion + Lifestyle", "Tech + Gaming", "Food + Fitness", "Beauty + Fashion", "Sports + Fitness"
|
| 48 |
]
|
| 49 |
platforms = ["youtube", "instagram", "tiktok", "twitch", "x"] # lowercase -> file prefix
|
| 50 |
|
| 51 |
+
def generate_person_name(country: str) -> str:
    """Generate one plausible "First Last" influencer name via FLAN-T5.

    Uses the enclosing scope's ``name_gen`` text2text pipeline. If the
    model output does not yield at least two name parts, retries once
    with a simpler prompt, then falls back to a fixed name.

    Args:
        country: Country used to flavor the generated name.

    Returns:
        A two-part name string, e.g. ``"Ana Silva"``.
    """
    # Prompt FLAN to return exactly one First Last
    prompt = (
        f"Generate one realistic influencer full name (first and last) from {country}. "
        "Return only: Firstname Lastname."
    )
    out = name_gen(
        prompt,
        max_new_tokens=16,
        num_beams=1,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )[0]["generated_text"]

    name = out.strip().split("\n")[0]
    # FIX: the character class had been mojibake ("脌-脰脴-枚酶-每" — the
    # UTF-8 bytes of the intended ranges mis-decoded as GBK). Restored the
    # intended Latin-1 accented-letter ranges A-Za-z À-Ö Ø-ö ø-ÿ plus
    # apostrophe, space, and hyphen.
    name = re.sub(r"[^A-Za-zÀ-ÖØ-öø-ÿ' \-]", "", name).strip()

    # Normalize spacing and capitalization
    parts = [p for p in re.split(r"[ \-]+", name) if p]
    if len(parts) < 2:
        # one gentle retry with slightly different prompt
        prompt2 = "Give one realistic full human name. Return only: Firstname Lastname."
        out2 = name_gen(prompt2, max_new_tokens=12, do_sample=True, temperature=0.7, top_p=0.9)[0]["generated_text"]
        name = out2.strip().split("\n")[0]
        name = re.sub(r"[^A-Za-zÀ-ÖØ-öø-ÿ' \-]", "", name).strip()
        parts = [p for p in re.split(r"[ \-]+", name) if p]

    if len(parts) < 2:
        # final minimal fallback (rare)
        return "Alex Morgan"

    def fix_case(s):
        # keep O'Connor-style capitalization: capitalize each chunk
        # around an apostrophe, then rejoin
        chunks = s.split("'")
        chunks = [c.capitalize() for c in chunks]
        return "'".join(chunks)

    first, last = fix_case(parts[0]), fix_case(parts[1])
    return f"{first} {last}"
|
| 91 |
+
|
| 92 |
+
rows = []
|
| 93 |
+
for rank in range(1, n + 1):
|
| 94 |
# --- Structured fields sampled to look realistic ---
|
| 95 |
followers = random.randint(5_000, 5_000_000)
|
| 96 |
er = round(random.uniform(0.5, 15.0), 2) # %
|
| 97 |
country = random.choice(countries)
|
| 98 |
niche = random.choice(niches)
|
| 99 |
reach = int(followers * random.uniform(0.25, 0.95))
|
|
|
|
| 100 |
platform_token = random.choice(platforms) # e.g., 'youtube'
|
| 101 |
region_hint = country.lower().replace(" ", "")
|
| 102 |
source_file = f"{platform_token}_data_{region_hint}.csv" # <- first token = platform
|
| 103 |
source_path = f"synthetic/{source_file}"
|
| 104 |
|
| 105 |
+
# --- Name via HF (FLAN-T5) ---
|
| 106 |
+
name = generate_person_name(country)
|
| 107 |
+
|
| 108 |
rows.append([
|
| 109 |
rank, name, followers, er, country, niche, reach, source_file, source_path
|
| 110 |
])
|
|
|
|
| 197 |
article=(
|
| 198 |
"**Project:** AI-Powered Influencer Recommender for Social Media Marketing\n\n"
|
| 199 |
"**Models:**\n"
|
| 200 |
+
"- google/flan-t5-small for synthetic influencer full names (dataset creation)\n"
|
| 201 |
"- sentence-transformers/all-MiniLM-L6-v2 for semantic embeddings (recommendations)\n\n"
|
| 202 |
"**Dataset:** 1,200-row synthetic influencer dataset generated at runtime."
|
| 203 |
),
|