|
|
|
|
|
import os |
|
|
import re |
|
|
import random |
|
|
import pandas as pd |
|
|
import gradio as gr |
|
|
from sentence_transformers import SentenceTransformer, util |
|
|
|
|
|
|
|
|
# Hugging Face Hub tuning. `setdefault` means a value already present in the
# environment always wins; these are only fallbacks.
# Allow slower networks more time before a Hub download times out.
os.environ.setdefault("HF_HUB_READ_TIMEOUT", "60")
# Opt in to the faster `hf_transfer` download backend when it is installed.
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")

# Where the synthetic dataset is written, and how many rows to generate.
CSV_PATH = "synthetic_influencers.csv"
NUM_ROWS = 1200
# Global seed for reproducibility; the generator function re-seeds itself too.
SEED = 42
random.seed(SEED)
|
|
|
|
|
def create_synthetic_influencer_dataset(n=1200, out_csv="synthetic_influencers.csv", seed=42):
    """
    Create a synthetic influencer dataset and persist it to ``out_csv``.

    Columns: Rank, Name, Followers, ER, Country, Niche, Reach, Source File,
    Source Path.  A Hugging Face model (FLAN-T5) generates pools of common
    first/last names per country, and realistic "First Last" names are sampled
    from those pools; hand-curated fallback pools are used whenever the model
    output cannot be parsed.

    Args:
        n: number of rows to generate.
        out_csv: path of the CSV file to write.
        seed: seed for the ``random`` module (reproducible output).

    Returns:
        The generated ``pandas.DataFrame`` (also written to ``out_csv``).

    Raises:
        RuntimeError: if the ``transformers`` package is not installed.
    """
    # Re-seed locally so output is reproducible regardless of randomness
    # consumed elsewhere before this call.
    random.seed(seed)

    # Lazy import: the module stays importable without transformers installed.
    try:
        from transformers import pipeline
    except Exception as e:
        raise RuntimeError(
            "Transformers not installed. Install with: pip install transformers torch"
        ) from e

    # NOTE: the pipeline is (re)loaded on every call; generation below uses
    # greedy decoding (do_sample=False) so pools are deterministic per prompt.
    t5 = pipeline("text2text-generation", model="google/flan-t5-small")

    countries = [
        "USA","UK","Canada","Australia","Brazil","India","France","Germany","Italy","Spain",
        "Israel","UAE","Netherlands","Sweden","Mexico"
    ]
    niches = [
        "Fashion","Travel","Food","Fitness","Tech","Beauty","Gaming","Music","Photography",
        "Lifestyle","Education","Finance","Sports","Parenting","DIY",
        "Fashion + Lifestyle", "Tech + Gaming", "Food + Fitness", "Beauty + Fashion", "Sports + Fitness"
    ]
    platforms = ["youtube", "instagram", "tiktok", "twitch", "x"]

    # Curated (first names, surnames) pools per country, used whenever the LLM
    # output yields fewer than 8 usable names.  Accented characters restored
    # from the previously mis-encoded (mojibake) literals.
    COUNTRY_FALLBACKS = {
        "USA": (["Emma","Olivia","Ava","Mia","Noah","Liam","Ethan","James"],
                ["Smith","Johnson","Brown","Davis","Miller","Wilson","Moore","Taylor"]),
        "UK": (["Oliver","George","Amelia","Isla","Jack","Harry","Sophia","Emily"],
               ["Smith","Jones","Taylor","Brown","Williams","Wilson","Johnson","Davies"]),
        "Canada": (["Liam","Noah","William","Olivia","Emma","Charlotte","Benjamin","Lucas"],
                   ["Smith","Brown","Tremblay","Martin","Roy","Wilson","Taylor","Johnson"]),
        "Australia": (["Oliver","Noah","William","Charlotte","Olivia","Isla","Jack","Ethan"],
                      ["Smith","Jones","Williams","Brown","Wilson","Taylor","Anderson","Martin"]),
        "Brazil": (["Gabriel","Miguel","Arthur","Heitor","Valentina","Laura","Julia","Maria"],
                   ["Silva","Santos","Oliveira","Souza","Rodrigues","Ferreira","Almeida","Lima"]),
        "India": (["Arjun","Aarav","Ishaan","Vihaan","Aanya","Anaya","Diya","Isha"],
                  ["Sharma","Patel","Gupta","Khan","Singh","Kumar","Reddy","Iyer"]),
        "France": (["Lucas","Louis","Hugo","Jules","Emma","Louise","Alice","Chloé"],
                   ["Martin","Bernard","Dubois","Thomas","Robert","Richard","Petit","Durand"]),
        "Germany": (["Leon","Noah","Elias","Finn","Mia","Emilia","Hannah","Sophia"],
                    ["Müller","Schmidt","Schneider","Fischer","Weber","Meyer","Wagner","Becker"]),
        "Italy": (["Alessandro","Leonardo","Lorenzo","Gabriele","Sofia","Giulia","Aurora","Alice"],
                  ["Rossi","Russo","Ferrari","Esposito","Bianchi","Romano","Colombo","Ricci"]),
        "Spain": (["Hugo","Mateo","Martín","Lucas","Lucía","Martina","Sofía","Julia"],
                  ["García","Fernández","González","Rodríguez","López","Martínez","Sánchez","Pérez"]),
        "Israel": (["Noa","Maya","Tamar","Yael","Ariel","Daniel","Itai","Lior"],
                   ["Cohen","Levi","Mizrahi","Peretz","Biton","Azulay","Dahan","Halevi"]),
        "UAE": (["Mohammed","Omar","Yousef","Khalid","Fatima","Aisha","Mariam","Noora"],
                ["Al Nahyan","Al Maktoum","Al Qasimi","Al Mazrouei","Al Marri","Al Ali","Al Hammadi","Al Ketbi"]),
        "Netherlands": (["Daan","Sem","Luuk","Bram","Emma","Sophie","Julia","Tess"],
                        ["de Jong","Jansen","de Vries","Bakker","Visser","Smit","Meijer","de Boer"]),
        "Sweden": (["William","Liam","Noah","Ella","Alva","Alice","Maja","Astrid"],
                   ["Johansson","Andersson","Karlsson","Nilsson","Eriksson","Larsson","Olsson","Persson"]),
        "Mexico": (["Santiago","Mateo","Sebastián","Emiliano","Sofía","Valentina","Regina","Camila"],
                   ["Hernández","García","Martínez","López","González","Pérez","Rodríguez","Sánchez"]),
    }

    # Per-country memoization so the model is queried at most once per country.
    first_cache, last_cache = {}, {}

    def _clean_list_text(txt: str):
        # Parse free-form model output ("Anna, Ben; Carl ...") into a list of
        # capitalized single-word names, deduplicated case-insensitively.
        txt = re.sub(r"[\[\]\(\)\"']", " ", txt)
        parts = re.split(r"[,\n;]+", txt)
        names = []
        for p in parts:
            # Keep only letters (incl. Latin-1 accented ranges), spaces, hyphens.
            p = re.sub(r"[^A-Za-zÀ-ÖØ-öø-ÿ \-]", "", p).strip()
            if 2 <= len(p) <= 20:
                # Only the first word of any multi-word candidate is used.
                names.append(p.split()[0].capitalize())
        # De-duplicate while preserving first-seen order.
        seen = set()
        out = []
        for nm in names:
            if nm.lower() not in seen:
                out.append(nm)
                seen.add(nm.lower())
        return out

    def get_name_pools(country: str):
        """Use HF model once per country to get lists of first names and surnames."""
        if country in first_cache and country in last_cache:
            return first_cache[country], last_cache[country]
        try:
            first_prompt = (
                f"List 20 common first names in {country}. "
                "Return comma-separated names only."
            )
            last_prompt = (
                f"List 20 common surnames in {country}. "
                "Return comma-separated names only."
            )
            first_txt = t5(first_prompt, max_new_tokens=128, do_sample=False)[0]["generated_text"]
            last_txt = t5(last_prompt, max_new_tokens=128, do_sample=False)[0]["generated_text"]
            firsts = _clean_list_text(first_txt)
            lasts = _clean_list_text(last_txt)

            # Reject sparse pools and fall back to the curated lists instead.
            if len(firsts) < 8 or len(lasts) < 8:
                raise ValueError("too few names parsed")
        except Exception:
            firsts, lasts = COUNTRY_FALLBACKS.get(country, COUNTRY_FALLBACKS["USA"])
        first_cache[country], last_cache[country] = firsts, lasts
        return firsts, lasts

    def sample_full_name(country: str) -> str:
        # Draw one first name and one surname from the country's pools.
        firsts, lasts = get_name_pools(country)
        first = random.choice(firsts)
        last = random.choice(lasts)

        def cap_name(s):
            # Capitalize each fragment around apostrophes (O'Brien) or, failing
            # that, around spaces (multi-word surnames like "de Jong").
            if "'" in s:
                return "'".join([p.capitalize() for p in s.split("'")])
            return " ".join([p.capitalize() for p in s.split(" ")])

        return f"{cap_name(first)} {cap_name(last)}"

    rows = []
    for rank in range(1, n + 1):
        followers = random.randint(5_000, 5_000_000)
        er = round(random.uniform(0.5, 15.0), 2)  # engagement rate, percent
        country = random.choice(countries)
        niche = random.choice(niches)
        # Reach is a random 25-95% fraction of the follower count.
        reach = int(followers * random.uniform(0.25, 0.95))
        platform_token = random.choice(platforms)
        region_hint = country.lower().replace(" ", "")
        source_file = f"{platform_token}_data_{region_hint}.csv"
        source_path = f"synthetic/{source_file}"

        name = sample_full_name(country)

        rows.append([
            rank, name, followers, er, country, niche, reach, source_file, source_path
        ])

    df_syn = pd.DataFrame(rows, columns=[
        "Rank","Name","Followers","ER","Country","Niche","Reach","Source File","Source Path"
    ])
    df_syn.to_csv(out_csv, index=False)
    return df_syn
|
|
|
|
|
def load_or_build_synthetic():
    """
    Return the synthetic influencer DataFrame, building it on first use.

    If ``CSV_PATH`` already exists it is loaded as-is (delete the file, or
    change ``CSV_PATH``, to force a rebuild); otherwise the dataset is
    generated via ``create_synthetic_influencer_dataset`` and persisted.
    Missing values are replaced with empty strings before returning.
    """
    # Fix: the previous code deleted any existing CSV and regenerated it on
    # every run, which reloaded FLAN-T5 at each startup and contradicted the
    # "load or build" contract of this function's name.
    if not os.path.exists(CSV_PATH):
        create_synthetic_influencer_dataset(n=NUM_ROWS, out_csv=CSV_PATH, seed=SEED)
    df_local = pd.read_csv(CSV_PATH)
    df_local.fillna("", inplace=True)
    return df_local
|
|
|
|
|
|
|
|
# Build (or load) the working DataFrame once at import time; the Gradio
# handlers below close over this module-level state.
df = load_or_build_synthetic()

# Derive a display platform from the synthetic source-file name, e.g.
# "youtube_data_usa.csv" -> "Youtube" (first underscore-separated token).
df['Platform'] = df['Source File'].astype(str).str.split('_').str[0].str.capitalize()

# Concatenate the searchable fields into one "Name - Platform - Niche - Country"
# string per influencer; this is what gets embedded for semantic matching.
profile_fields = ["Name", "Platform", "Niche", "Country"]
df["profile_text"] = df[profile_fields].agg(" - ".join, axis=1)

# Sentence-embedding model; all influencer profiles are encoded once up front
# so each query only needs a single encode + similarity pass.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
influencer_embeddings = model.encode(df["profile_text"].tolist(), convert_to_tensor=True)
|
|
|
|
|
def recommend_influencers(brand_description):
    """
    Return the top-3 influencer records most similar to a brand description.

    The query is embedded with the module-level sentence-transformer and
    compared against the precomputed profile embeddings by cosine similarity;
    each hit is returned as a plain dict of display-ready fields.
    """
    def _as_record(position):
        # Convert one DataFrame row into the dict shape the UI expects.
        entry = df.iloc[position]
        raw_reach = entry.get("Reach", "")
        # Reach becomes an int only when it is a clean digit string/number;
        # otherwise the raw value is passed through untouched.
        reach_value = int(entry["Reach"]) if str(raw_reach).isdigit() else raw_reach
        return {
            "Name": entry["Name"],
            "Platform": entry.get("Platform", ""),
            "Niche": entry["Niche"],
            "Country": entry["Country"],
            "ER": f"{entry.get('ER', 'N/A')}",
            "Followers": int(entry["Followers"]),
            "Reach": reach_value,
        }

    embedded_query = model.encode(brand_description, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(embedded_query, influencer_embeddings)[0]
    best_positions = similarity.topk(3).indices.tolist()
    return [_as_record(pos) for pos in best_positions]
|
|
|
|
|
def format_output(brand_input):
    """
    Render the top-3 recommendations as HTML cards for the Gradio output pane.

    Delegates to ``recommend_influencers``; the Reach line is emitted only
    when the value was parsed to an int upstream.
    """
    recs = recommend_influencers(brand_input)
    html = ""
    for i, rec in enumerate(recs, 1):
        # Fix: restored the 🎯 emoji that had been mangled by a mis-encoding.
        html += f"""
        <div style='background:#ffffff; padding:1em; margin-bottom:1em; border-radius:8px; box-shadow:0 2px 6px rgba(0,0,0,0.1);'>
            <h3 style='margin:0; color:#0a1f44;'>🎯 {i}. {rec['Name']} <span style='font-size:0.9em; color:#555;'>({rec['Platform']})</span></h3>
            <p style='margin:0.5em 0;'><strong>Niche:</strong> {rec['Niche']}</p>
            <p style='margin:0.5em 0;'><strong>Country:</strong> {rec['Country']}</p>
            <p style='margin:0.5em 0;'><strong>Engagement:</strong> {rec['ER']}%</p>
            <p style='margin:0.5em 0;'><strong>Followers:</strong> {rec['Followers']:,}</p>
            {f"<p style='margin:0.5em 0;'><strong>Reach:</strong> {int(rec['Reach']):,}</p>" if isinstance(rec['Reach'], int) else ""}
        </div>
        """
    return html
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Gradio UI: one free-text box in, rendered HTML recommendation cards out.
# Fix: restored the emoji/dash characters (🗣️, 📋, 💡, 🛠️, —) that had been
# mangled by a mis-encoding in the label/title/description strings.
iface = gr.Interface(
    fn=format_output,
    inputs=gr.Textbox(
        lines=3,
        label="🗣️ Describe Your Campaign or Brand",
        placeholder="e.g., Targeted fitness brand outreach for Gen Z"
    ),
    outputs=gr.HTML(label="📋 Recommended Influencers"),
    title="💡 InfluencerMatch.AI: Targeted Influencer Discovery for Social Media Marketing",
    description=(
        "Enhance your social media marketing by pinpointing the perfect influencers for your niche.\n\n"
        "🛠️ AI-driven matching based on niche, audience, and engagement metrics — get top 3 influencer recommendations instantly."
    ),
    article=(
        "**Project:** AI-Powered Influencer Recommender for Social Media Marketing\n\n"
        "**Models:**\n"
        "- google/flan-t5-small to synthesize country-specific first/last name pools\n"
        "- sentence-transformers/all-MiniLM-L6-v2 for semantic embeddings (recommendations)\n\n"
        "**Dataset:** 1,200-row synthetic influencer dataset generated at runtime."
    ),
    examples=[
        ["Sustainable fashion campaign targeting eco-conscious millennials"],
        ["Tech gadget launch aimed at early adopters in the US"],
        ["Healthy snack brand outreach for fitness enthusiasts"],
        ["Luxury travel experiences for affluent couples in Europe"]
    ],
    theme="soft",
    flagging_mode="never"
)


if __name__ == "__main__":
    # share=True additionally exposes a temporary public gradio.live URL.
    iface.launch(share=True)
|
|
|