# app.py import os import re import random import pandas as pd import gradio as gr from sentence_transformers import SentenceTransformer, util # Make HF downloads less flaky on Spaces os.environ.setdefault("HF_HUB_READ_TIMEOUT", "60") os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1") # ========================= # ALWAYS-GENERATE (HF) DATASET # ========================= CSV_PATH = "synthetic_influencers.csv" NUM_ROWS = 1200 # ≥1000 as required SEED = 42 # reproducibility random.seed(SEED) def create_synthetic_influencer_dataset(n=1200, out_csv="synthetic_influencers.csv", seed=42): """ Creates a synthetic dataset that mirrors your current schema: Columns: Rank, Name, Followers, ER, Country, Niche, Reach, Source File, Source Path Uses a Hugging Face model (FLAN-T5) to create pools of common first/last names per country, then samples realistic First Last names from those pools. """ random.seed(seed) try: from transformers import pipeline except Exception as e: raise RuntimeError( "Transformers not installed. Install with: pip install transformers torch" ) from e # Smaller instruction-following model; you can bump to "google/flan-t5-base" if you want t5 = pipeline("text2text-generation", model="google/flan-t5-small") countries = [ "USA","UK","Canada","Australia","Brazil","India","France","Germany","Italy","Spain", "Israel","UAE","Netherlands","Sweden","Mexico" ] niches = [ "Fashion","Travel","Food","Fitness","Tech","Beauty","Gaming","Music","Photography", "Lifestyle","Education","Finance","Sports","Parenting","DIY", "Fashion + Lifestyle", "Tech + Gaming", "Food + Fitness", "Beauty + Fashion", "Sports + Fitness" ] platforms = ["youtube", "instagram", "tiktok", "twitch", "x"] # lowercase -> file prefix # -------- Name pool builder (uses HF model once per country) -------- COUNTRY_FALLBACKS = { "USA": (["Emma","Olivia","Ava","Mia","Noah","Liam","Ethan","James"], ["Smith","Johnson","Brown","Davis","Miller","Wilson","Moore","Taylor"]), "UK": (["Oliver","George","Amelia","Isla","Jack","Harry","Sophia","Emily"], ["Smith","Jones","Taylor","Brown","Williams","Wilson","Johnson","Davies"]), "Canada": (["Liam","Noah","William","Olivia","Emma","Charlotte","Benjamin","Lucas"], ["Smith","Brown","Tremblay","Martin","Roy","Wilson","Taylor","Johnson"]), "Australia": (["Oliver","Noah","William","Charlotte","Olivia","Isla","Jack","Ethan"], ["Smith","Jones","Williams","Brown","Wilson","Taylor","Anderson","Martin"]), "Brazil": (["Gabriel","Miguel","Arthur","Heitor","Valentina","Laura","Julia","Maria"], ["Silva","Santos","Oliveira","Souza","Rodrigues","Ferreira","Almeida","Lima"]), "India": (["Arjun","Aarav","Ishaan","Vihaan","Aanya","Anaya","Diya","Isha"], ["Sharma","Patel","Gupta","Khan","Singh","Kumar","Reddy","Iyer"]), "France": (["Lucas","Louis","Hugo","Jules","Emma","Louise","Alice","Chloé"], ["Martin","Bernard","Dubois","Thomas","Robert","Richard","Petit","Durand"]), "Germany": (["Leon","Noah","Elias","Finn","Mia","Emilia","Hannah","Sophia"], ["Müller","Schmidt","Schneider","Fischer","Weber","Meyer","Wagner","Becker"]), "Italy": (["Alessandro","Leonardo","Lorenzo","Gabriele","Sofia","Giulia","Aurora","Alice"], ["Rossi","Russo","Ferrari","Esposito","Bianchi","Romano","Colombo","Ricci"]), "Spain": (["Hugo","Mateo","Martín","Lucas","Lucía","Martina","Sofía","Julia"], ["García","Fernández","González","Rodríguez","López","Martínez","Sánchez","Pérez"]), "Israel": (["Noa","Maya","Tamar","Yael","Ariel","Daniel","Itai","Lior"], ["Cohen","Levi","Mizrahi","Peretz","Biton","Azulay","Dahan","Halevi"]), "UAE": (["Mohammed","Omar","Yousef","Khalid","Fatima","Aisha","Mariam","Noora"], ["Al Nahyan","Al Maktoum","Al Qasimi","Al Mazrouei","Al Marri","Al Ali","Al Hammadi","Al Ketbi"]), "Netherlands": (["Daan","Sem","Luuk","Bram","Emma","Sophie","Julia","Tess"], ["de Jong","Jansen","de Vries","Bakker","Visser","Smit","Meijer","de Boer"]), "Sweden": (["William","Liam","Noah","Ella","Alva","Alice","Maja","Astrid"], ["Johansson","Andersson","Karlsson","Nilsson","Eriksson","Larsson","Olsson","Persson"]), "Mexico": (["Santiago","Mateo","Sebastián","Emiliano","Sofía","Valentina","Regina","Camila"], ["Hernández","García","Martínez","López","González","Pérez","Rodríguez","Sánchez"]), } first_cache, last_cache = {}, {} def _clean_list_text(txt: str): # turn "Emma, Olivia; Ava\nMia" -> ["Emma","Olivia","Ava","Mia"] txt = re.sub(r"[\[\]\(\)\"']", " ", txt) parts = re.split(r"[,\n;]+", txt) names = [] for p in parts: p = re.sub(r"[^A-Za-zÀ-ÖØ-öø-ÿ \-]", "", p).strip() if 2 <= len(p) <= 20: # keep one token (first) for first names; for last names allow hyphenated names.append(p.split()[0].capitalize()) # dedupe, keep order seen = set() out = [] for n in names: if n.lower() not in seen: out.append(n) seen.add(n.lower()) return out def get_name_pools(country: str): """Use HF model once per country to get lists of first names and surnames.""" if country in first_cache and country in last_cache: return first_cache[country], last_cache[country] try: first_prompt = ( f"List 20 common first names in {country}. " "Return comma-separated names only." ) last_prompt = ( f"List 20 common surnames in {country}. " "Return comma-separated names only." ) first_txt = t5(first_prompt, max_new_tokens=128, do_sample=False)[0]["generated_text"] last_txt = t5(last_prompt, max_new_tokens=128, do_sample=False)[0]["generated_text"] firsts = _clean_list_text(first_txt) lasts = _clean_list_text(last_txt) # Ensure we have reasonable pools; otherwise fall back if len(firsts) < 8 or len(lasts) < 8: raise ValueError("too few names parsed") except Exception: firsts, lasts = COUNTRY_FALLBACKS.get(country, COUNTRY_FALLBACKS["USA"]) first_cache[country], last_cache[country] = firsts, lasts return firsts, lasts def sample_full_name(country: str) -> str: firsts, lasts = get_name_pools(country) first = random.choice(firsts) last = random.choice(lasts) # Keep O'Connor/Al Nahyan formatting reasonable (space or apostrophe already in last) # Capitalize first token if last is multi-word (e.g., "Al Nahyan" -> keep as-is) def cap_name(s): if "'" in s: return "'".join([p.capitalize() for p in s.split("'")]) return " ".join([p.capitalize() for p in s.split(" ")]) return f"{cap_name(first)} {cap_name(last)}" rows = [] for rank in range(1, n + 1): # --- Structured fields sampled to look realistic --- followers = random.randint(5_000, 5_000_000) er = round(random.uniform(0.5, 15.0), 2) # % country = random.choice(countries) niche = random.choice(niches) reach = int(followers * random.uniform(0.25, 0.95)) platform_token = random.choice(platforms) # e.g., 'youtube' region_hint = country.lower().replace(" ", "") source_file = f"{platform_token}_data_{region_hint}.csv" # <- first token = platform source_path = f"synthetic/{source_file}" # --- Name via HF model generated pools --- name = sample_full_name(country) rows.append([ rank, name, followers, er, country, niche, reach, source_file, source_path ]) df_syn = pd.DataFrame(rows, columns=[ "Rank","Name","Followers","ER","Country","Niche","Reach","Source File","Source Path" ]) df_syn.to_csv(out_csv, index=False) return df_syn def load_or_build_synthetic(): # Always build (or rebuild) to guarantee HF-generated data is used if os.path.exists(CSV_PATH): os.remove(CSV_PATH) _ = create_synthetic_influencer_dataset(n=NUM_ROWS, out_csv=CSV_PATH, seed=SEED) df_local = pd.read_csv(CSV_PATH) df_local.fillna("", inplace=True) return df_local # Build & load synthetic dataset df = load_or_build_synthetic() # ========================= # FEATURE ENGINEERING # ========================= # Extract platform name from Source File (first token before '_'), capitalize df['Platform'] = df['Source File'].astype(str).str.split('_').str[0].str.capitalize() # Prepare text for embedding (include platform) profile_fields = ["Name", "Platform", "Niche", "Country"] df["profile_text"] = df[profile_fields].agg(" - ".join, axis=1) # ========================= # EMBEDDINGS & RECOMMENDER # ========================= model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") influencer_embeddings = model.encode(df["profile_text"].tolist(), convert_to_tensor=True) def recommend_influencers(brand_description): query_embedding = model.encode(brand_description, convert_to_tensor=True) cosine_scores = util.pytorch_cos_sim(query_embedding, influencer_embeddings)[0] top_indices = cosine_scores.topk(3).indices.tolist() recs = [] for idx in top_indices: row = df.iloc[idx] recs.append({ "Name": row["Name"], "Platform": row.get("Platform", ""), "Niche": row["Niche"], "Country": row["Country"], "ER": f"{row.get('ER', 'N/A')}", "Followers": int(row["Followers"]), "Reach": int(row["Reach"]) if str(row.get("Reach", "")).isdigit() else row.get("Reach", "") }) return recs def format_output(brand_input): recs = recommend_influencers(brand_input) html = "" for i, rec in enumerate(recs, 1): html += f"""
Niche: {rec['Niche']}
Country: {rec['Country']}
Engagement: {rec['ER']}%
Followers: {rec['Followers']:,}
{f"Reach: {int(rec['Reach']):,}
" if isinstance(rec['Reach'], int) else ""}