# Hugging Face Space file view - author: yoniif
# Commit message: Update app.py
# Commit: d14dd35 (verified)
# app.py
import os
import re
import random
import pandas as pd
import gradio as gr
from sentence_transformers import SentenceTransformer, util
# Hugging Face Hub download settings intended to make downloads less flaky on
# Spaces. setdefault() only fills in a value when the variable is not already
# present in the environment, so externally supplied settings win.
# NOTE(review): these are set after sentence_transformers has been imported;
# confirm the hub client still picks them up at that point.
for _hf_var, _hf_value in (
    ("HF_HUB_READ_TIMEOUT", "60"),
    ("HF_HUB_ENABLE_HF_TRANSFER", "1"),
):
    os.environ.setdefault(_hf_var, _hf_value)
del _hf_var, _hf_value

# =========================
# ALWAYS-GENERATE (HF) DATASET
# =========================
CSV_PATH = "synthetic_influencers.csv"  # file name of the generated dataset
NUM_ROWS = 1_200  # at least 1000 rows are required
SEED = 42  # fixed seed for reproducibility

# Seed the process-wide RNG at import time.
random.seed(SEED)
# Built-in (first names, surnames) pools per country. They are used whenever
# the text model's answer cannot be parsed into at least 8 names of each kind.
_COUNTRY_FALLBACKS = {
    "USA": (
        ["Emma", "Olivia", "Ava", "Mia", "Noah", "Liam", "Ethan", "James"],
        ["Smith", "Johnson", "Brown", "Davis", "Miller", "Wilson", "Moore", "Taylor"],
    ),
    "UK": (
        ["Oliver", "George", "Amelia", "Isla", "Jack", "Harry", "Sophia", "Emily"],
        ["Smith", "Jones", "Taylor", "Brown", "Williams", "Wilson", "Johnson", "Davies"],
    ),
    "Canada": (
        ["Liam", "Noah", "William", "Olivia", "Emma", "Charlotte", "Benjamin", "Lucas"],
        ["Smith", "Brown", "Tremblay", "Martin", "Roy", "Wilson", "Taylor", "Johnson"],
    ),
    "Australia": (
        ["Oliver", "Noah", "William", "Charlotte", "Olivia", "Isla", "Jack", "Ethan"],
        ["Smith", "Jones", "Williams", "Brown", "Wilson", "Taylor", "Anderson", "Martin"],
    ),
    "Brazil": (
        ["Gabriel", "Miguel", "Arthur", "Heitor", "Valentina", "Laura", "Julia", "Maria"],
        ["Silva", "Santos", "Oliveira", "Souza", "Rodrigues", "Ferreira", "Almeida", "Lima"],
    ),
    "India": (
        ["Arjun", "Aarav", "Ishaan", "Vihaan", "Aanya", "Anaya", "Diya", "Isha"],
        ["Sharma", "Patel", "Gupta", "Khan", "Singh", "Kumar", "Reddy", "Iyer"],
    ),
    "France": (
        ["Lucas", "Louis", "Hugo", "Jules", "Emma", "Louise", "Alice", "Chloé"],
        ["Martin", "Bernard", "Dubois", "Thomas", "Robert", "Richard", "Petit", "Durand"],
    ),
    "Germany": (
        ["Leon", "Noah", "Elias", "Finn", "Mia", "Emilia", "Hannah", "Sophia"],
        ["Müller", "Schmidt", "Schneider", "Fischer", "Weber", "Meyer", "Wagner", "Becker"],
    ),
    "Italy": (
        ["Alessandro", "Leonardo", "Lorenzo", "Gabriele", "Sofia", "Giulia", "Aurora", "Alice"],
        ["Rossi", "Russo", "Ferrari", "Esposito", "Bianchi", "Romano", "Colombo", "Ricci"],
    ),
    "Spain": (
        ["Hugo", "Mateo", "Martín", "Lucas", "Lucía", "Martina", "Sofía", "Julia"],
        ["García", "Fernández", "González", "Rodríguez", "López", "Martínez", "Sánchez", "Pérez"],
    ),
    "Israel": (
        ["Noa", "Maya", "Tamar", "Yael", "Ariel", "Daniel", "Itai", "Lior"],
        ["Cohen", "Levi", "Mizrahi", "Peretz", "Biton", "Azulay", "Dahan", "Halevi"],
    ),
    "UAE": (
        ["Mohammed", "Omar", "Yousef", "Khalid", "Fatima", "Aisha", "Mariam", "Noora"],
        ["Al Nahyan", "Al Maktoum", "Al Qasimi", "Al Mazrouei", "Al Marri", "Al Ali",
         "Al Hammadi", "Al Ketbi"],
    ),
    "Netherlands": (
        ["Daan", "Sem", "Luuk", "Bram", "Emma", "Sophie", "Julia", "Tess"],
        ["de Jong", "Jansen", "de Vries", "Bakker", "Visser", "Smit", "Meijer", "de Boer"],
    ),
    "Sweden": (
        ["William", "Liam", "Noah", "Ella", "Alva", "Alice", "Maja", "Astrid"],
        ["Johansson", "Andersson", "Karlsson", "Nilsson", "Eriksson", "Larsson", "Olsson",
         "Persson"],
    ),
    "Mexico": (
        ["Santiago", "Mateo", "Sebastián", "Emiliano", "Sofía", "Valentina", "Regina", "Camila"],
        ["Hernández", "García", "Martínez", "López", "González", "Pérez", "Rodríguez", "Sánchez"],
    ),
}

# Brackets and quotes that may wrap a generated list; replaced by spaces.
_LIST_PUNCTUATION = re.compile(r"[\[\]\(\)\"']")
# Separators between list items.
_LIST_SEPARATORS = re.compile(r"[,\n;]+")
# Anything that is not an ASCII/Latin-1 letter, a space or a hyphen.
_NON_NAME_CHARS = re.compile(r"[^A-Za-zÀ-ÖØ-öø-ÿ \-]")
# One run of name characters between spaces, hyphens or apostrophes.
_NAME_PART = re.compile(r"[^\s'\-]+")


def _cap_name(text):
    """Capitalize every part of a name, keeping spaces, hyphens and apostrophes."""
    return _NAME_PART.sub(lambda match: match.group(0).capitalize(), text)


def _clean_list_text(txt, first_token_only=True):
    """Parse free-form list text into a de-duplicated list of names.

    "Emma, Olivia; Ava\\nMia" becomes ["Emma", "Olivia", "Ava", "Mia"].
    With ``first_token_only=False`` multi-word entries such as "de Jong" are
    kept whole instead of being cut down to their first word.
    """
    txt = _LIST_PUNCTUATION.sub(" ", txt)
    seen = set()
    names = []
    for part in _LIST_SEPARATORS.split(txt):
        tokens = _NON_NAME_CHARS.sub("", part).split()
        if not tokens:
            continue
        candidate = tokens[0] if first_token_only else " ".join(tokens)
        candidate = candidate.strip("-")
        if not 2 <= len(candidate) <= 20:
            continue
        candidate = _cap_name(candidate)
        key = candidate.casefold()
        if key not in seen:  # dedupe case-insensitively, keep first-seen order
            seen.add(key)
            names.append(candidate)
    return names


def create_synthetic_influencer_dataset(
    n=1200,
    out_csv="synthetic_influencers.csv",
    seed=42,
    *,
    model_name="google/flan-t5-small",
    text_generator=None,
):
    """Build a synthetic influencer dataset, write it to CSV and return it.

    Columns: Rank, Name, Followers, ER, Country, Niche, Reach, Source File,
    Source Path. For each country a text2text model is asked once for lists
    of first names and surnames; "First Last" names are then sampled from
    those pools. When the model's answer cannot be parsed into at least 8
    names of each kind, or generation fails, a built-in pool for that country
    is used instead.

    Args:
        n: Number of rows to generate.
        out_csv: Path of the CSV file to write (without the index column).
        seed: Seed passed to ``random.seed`` so the output is reproducible.
        model_name: Model id given to the transformers text2text pipeline
            when ``text_generator`` is not supplied.
        text_generator: Optional callable used instead of loading a
            pipeline. It is called as
            ``text_generator(prompt, max_new_tokens=128, do_sample=False)``
            and must return a sequence whose first item has a
            ``"generated_text"`` entry.

    Returns:
        pandas.DataFrame holding the generated rows.

    Raises:
        RuntimeError: If no ``text_generator`` is given and ``transformers``
            cannot be imported.
    """
    random.seed(seed)

    if text_generator is None:
        try:
            from transformers import pipeline
        except Exception as exc:
            # Kept broad on purpose: every import failure is reported to the
            # caller as the same RuntimeError.
            raise RuntimeError(
                "Transformers not installed. Install with: pip install transformers torch"
            ) from exc
        text_generator = pipeline("text2text-generation", model=model_name)

    countries = [
        "USA", "UK", "Canada", "Australia", "Brazil", "India", "France", "Germany",
        "Italy", "Spain", "Israel", "UAE", "Netherlands", "Sweden", "Mexico",
    ]
    niches = [
        "Fashion", "Travel", "Food", "Fitness", "Tech", "Beauty", "Gaming", "Music",
        "Photography", "Lifestyle", "Education", "Finance", "Sports", "Parenting", "DIY",
        "Fashion + Lifestyle", "Tech + Gaming", "Food + Fitness", "Beauty + Fashion",
        "Sports + Fitness",
    ]
    # Lowercase platform tokens; they become the prefix of the source file name.
    platforms = ["youtube", "instagram", "tiktok", "twitch", "x"]

    name_pools = {}  # country -> (first names, surnames), filled lazily

    def get_name_pools(country):
        """Return the cached name pools for *country*, generating them once."""
        if country in name_pools:
            return name_pools[country]
        pools = None
        try:
            first_txt = text_generator(
                f"List 20 common first names in {country}. "
                "Return comma-separated names only.",
                max_new_tokens=128,
                do_sample=False,
            )[0]["generated_text"]
            last_txt = text_generator(
                f"List 20 common surnames in {country}. "
                "Return comma-separated names only.",
                max_new_tokens=128,
                do_sample=False,
            )[0]["generated_text"]
            firsts = _clean_list_text(first_txt, first_token_only=True)
            lasts = _clean_list_text(last_txt, first_token_only=False)
            if len(firsts) >= 8 and len(lasts) >= 8:
                pools = (firsts, lasts)
        except Exception:
            # Best effort: any generation or parsing failure falls back to
            # the built-in pools below instead of aborting the whole build.
            pools = None
        if pools is None:
            pools = _COUNTRY_FALLBACKS.get(country, _COUNTRY_FALLBACKS["USA"])
        name_pools[country] = pools
        return pools

    def sample_full_name(country):
        """Draw a random "First Last" name from the pools of *country*."""
        firsts, lasts = get_name_pools(country)
        first = random.choice(firsts)
        last = random.choice(lasts)
        return f"{_cap_name(first)} {_cap_name(last)}"

    rows = []
    for rank in range(1, n + 1):
        # The order of the random draws is part of what makes a given seed
        # reproducible, so keep it stable.
        followers = random.randint(5_000, 5_000_000)
        er = round(random.uniform(0.5, 15.0), 2)  # engagement rate in percent
        country = random.choice(countries)
        niche = random.choice(niches)
        reach = int(followers * random.uniform(0.25, 0.95))
        platform_token = random.choice(platforms)
        region_hint = country.lower().replace(" ", "")
        source_file = f"{platform_token}_data_{region_hint}.csv"  # first token = platform
        source_path = f"synthetic/{source_file}"
        name = sample_full_name(country)
        rows.append(
            [rank, name, followers, er, country, niche, reach, source_file, source_path]
        )

    df_syn = pd.DataFrame(
        rows,
        columns=[
            "Rank", "Name", "Followers", "ER", "Country", "Niche", "Reach",
            "Source File", "Source Path",
        ],
    )
    df_syn.to_csv(out_csv, index=False)
    return df_syn
def load_or_build_synthetic():
    """Regenerate the synthetic CSV from scratch and return it as a DataFrame.

    Any existing file at CSV_PATH is deleted first so the data is always
    rebuilt. The fresh file is then read back and missing values are
    replaced with empty strings.
    """
    stale_file_present = os.path.exists(CSV_PATH)
    if stale_file_present:
        os.remove(CSV_PATH)

    # The returned frame is ignored on purpose: the CSV on disk is re-read below.
    create_synthetic_influencer_dataset(n=NUM_ROWS, out_csv=CSV_PATH, seed=SEED)

    return pd.read_csv(CSV_PATH).fillna("")
# Build the synthetic dataset once, at import time.
df = load_or_build_synthetic()

# =========================
# FEATURE ENGINEERING
# =========================
# The platform is the part of "Source File" before the first underscore,
# with its first letter upper-cased.
df["Platform"] = (
    df["Source File"]
    .astype(str)
    .str.split("_")
    .str[0]
    .str.capitalize()
)

# Text that gets embedded for every influencer: the profile fields (platform
# included) joined with " - ".
profile_fields = ["Name", "Platform", "Niche", "Country"]
df["profile_text"] = df[profile_fields].apply(
    lambda fields: " - ".join(fields), axis=1
)

# =========================
# EMBEDDINGS & RECOMMENDER
# =========================
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Encode every profile text up front; queries are compared against this tensor.
influencer_embeddings = model.encode(
    df["profile_text"].tolist(),
    convert_to_tensor=True,
)
def recommend_influencers(brand_description, top_k=3):
    """Return the influencers whose profile text best matches a brand brief.

    The brief is embedded with the module-level sentence-transformer model
    and compared by cosine similarity against ``influencer_embeddings``.

    Args:
        brand_description: Free-text description of the brand or campaign.
        top_k: Maximum number of recommendations to return. It is clamped to
            the number of rows in ``df``, because ``topk`` raises when asked
            for more items than exist.

    Returns:
        A list of dicts with the keys Name, Platform, Niche, Country, ER,
        Followers and Reach, best match first. The list is empty when the
        description is blank or ``top_k`` is not positive.
    """
    # A blank brief carries no signal, so skip the model call entirely.
    if not brand_description or not brand_description.strip():
        return []
    if top_k <= 0:
        return []
    k = min(top_k, len(df))
    if k <= 0:
        return []

    query_embedding = model.encode(brand_description, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(query_embedding, influencer_embeddings)[0]
    top_indices = cosine_scores.topk(k).indices.tolist()

    recs = []
    for idx in top_indices:
        row = df.iloc[idx]
        reach = row.get("Reach", "")
        recs.append({
            "Name": row["Name"],
            "Platform": row.get("Platform", ""),
            "Niche": row["Niche"],
            "Country": row["Country"],
            "ER": f"{row.get('ER', 'N/A')}",
            "Followers": int(row["Followers"]),
            # Only whole-number reach values are converted to int; anything
            # else (e.g. an empty string) is passed through unchanged.
            "Reach": int(reach) if str(reach).isdigit() else reach,
        })
    return recs
def format_output(brand_input):
    """Render the recommendations for *brand_input* as HTML cards.

    Args:
        brand_input: Brand or campaign description entered by the user.

    Returns:
        One HTML ``<div>`` card per recommendation, concatenated into a
        single string. The string is empty when there are no recommendations.
    """
    paragraph = "<p style='margin:0.5em 0;'>"
    cards = []
    for i, rec in enumerate(recommend_influencers(brand_input), 1):
        reach = rec["Reach"]
        # The reach line is only shown when reach is an int.
        reach_line = (
            f"{paragraph}<strong>Reach:</strong> {reach:,}</p>"
            if isinstance(reach, int)
            else ""
        )
        cards.append(
            "\n"
            "<div style='background:#ffffff; padding:1em; margin-bottom:1em; "
            "border-radius:8px; box-shadow:0 2px 6px rgba(0,0,0,0.1);'>\n"
            f"<h3 style='margin:0; color:#0a1f44;'>🎯 {i}. {rec['Name']} "
            f"<span style='font-size:0.9em; color:#555;'>({rec['Platform']})</span></h3>\n"
            f"{paragraph}<strong>Niche:</strong> {rec['Niche']}</p>\n"
            f"{paragraph}<strong>Country:</strong> {rec['Country']}</p>\n"
            f"{paragraph}<strong>Engagement:</strong> {rec['ER']}%</p>\n"
            f"{paragraph}<strong>Followers:</strong> {rec['Followers']:,}</p>\n"
            f"{reach_line}\n"
            "</div>\n"
        )
    # join() avoids rebuilding the whole string on every iteration.
    return "".join(cards)
# =========================
# GRADIO UI
# =========================
# Single-textbox interface: the brand description goes to format_output and
# the returned HTML cards are shown as the result.
iface = gr.Interface(
    fn=format_output,
    inputs=gr.Textbox(
        lines=3,
        label="🗣️ Describe Your Campaign or Brand",
        placeholder="e.g., Targeted fitness brand outreach for Gen Z",
    ),
    outputs=gr.HTML(label="📈 Recommended Influencers"),
    title="💡 InfluencerMatch.AI: Targeted Influencer Discovery for Social Media Marketing",
    description=(
        "Enhance your social media marketing by pinpointing the perfect influencers for your niche.\n\n"
        "🛠️ AI-driven matching based on niche, audience, and engagement metrics — get top 3 influencer recommendations instantly."
    ),
    article=(
        "**Project:** AI-Powered Influencer Recommender for Social Media Marketing\n\n"
        "**Models:**\n"
        "- google/flan-t5-small to synthesize country-specific first/last name pools\n"
        "- sentence-transformers/all-MiniLM-L6-v2 for semantic embeddings (recommendations)\n\n"
        "**Dataset:** 1,200-row synthetic influencer dataset generated at runtime."
    ),
    examples=[
        ["Sustainable fashion campaign targeting eco-conscious millennials"],
        ["Tech gadget launch aimed at early adopters in the US"],
        ["Healthy snack brand outreach for fitness enthusiasts"],
        ["Luxury travel experiences for affluent couples in Europe"],
    ],
    theme="soft",
    flagging_mode="never",
)

# Launch the app only when the file is run as a script.
if __name__ == "__main__":
    iface.launch(share=True)