# Spaces: yoniif / final_assignment_yoni_gavriel
#
# Status: Sleeping
#
# File size: 12,717 Bytes

# app.py
import os
import re
import random
import pandas as pd
import gradio as gr
from sentence_transformers import SentenceTransformer, util

# =========================
# ALWAYS-GENERATE (HF) DATASET
# =========================
CSV_PATH = "synthetic_influencers.csv"  # where the generated table is written
NUM_ROWS = 1200                         # ≥1000 as required
SEED = 42                               # reproducibility

# Hugging Face Hub tuning, intended to make downloads less flaky on Spaces.
# setdefault() keeps any value the environment already provides.
# NOTE(review): these are set after the third-party imports at the top of the
# file — confirm the Hub client still reads them at that point, and that both
# variable names are ones it recognises.
for _hf_var, _hf_value in (
    ("HF_HUB_READ_TIMEOUT", "60"),
    ("HF_HUB_ENABLE_HF_TRANSFER", "1"),
):
    os.environ.setdefault(_hf_var, _hf_value)

# Seed the shared module-level generator once at import time.
random.seed(SEED)

def create_synthetic_influencer_dataset(n=1200, out_csv="synthetic_influencers.csv", seed=42):
    """
    Build a synthetic influencer table, write it to ``out_csv`` and return it.

    Columns: Rank, Name, Followers, ER, Country, Niche, Reach, Source File, Source Path

    A Hugging Face text2text pipeline (google/flan-t5-small) is asked once per
    country for common first names and surnames. If the model call fails, or
    fewer than 8 first names or 8 surnames can be parsed from its answer, the
    curated ``COUNTRY_FALLBACKS`` pool for that country is used instead.

    Args:
        n: Number of rows to generate; ``Rank`` runs from 1 to ``n``.
        out_csv: Path of the CSV file to write (index column omitted).
        seed: Seed passed to ``random.seed``. Note that this reseeds the
            shared module-level ``random`` generator as a side effect.

    Returns:
        pandas.DataFrame holding the generated rows.

    Raises:
        RuntimeError: If ``transformers`` cannot be imported.
    """
    random.seed(seed)

    try:
        from transformers import pipeline
    except Exception as e:
        raise RuntimeError(
            "Transformers not installed. Install with: pip install transformers torch"
        ) from e

    # Smaller instruction-following model; you can bump to "google/flan-t5-base" if you want
    t5 = pipeline("text2text-generation", model="google/flan-t5-small")

    countries = [
        "USA","UK","Canada","Australia","Brazil","India","France","Germany","Italy","Spain",
        "Israel","UAE","Netherlands","Sweden","Mexico"
    ]
    niches = [
        "Fashion","Travel","Food","Fitness","Tech","Beauty","Gaming","Music","Photography",
        "Lifestyle","Education","Finance","Sports","Parenting","DIY",
        "Fashion + Lifestyle", "Tech + Gaming", "Food + Fitness", "Beauty + Fashion", "Sports + Fitness"
    ]
    platforms = ["youtube", "instagram", "tiktok", "twitch", "x"]  # lowercase -> file prefix

    # -------- Name pool builder (uses HF model once per country) --------
    # Curated (first names, surnames) per country; entries are already cased
    # the way they should be displayed (e.g. "de Jong", "Al Nahyan").
    COUNTRY_FALLBACKS = {
        "USA": (["Emma","Olivia","Ava","Mia","Noah","Liam","Ethan","James"],
                ["Smith","Johnson","Brown","Davis","Miller","Wilson","Moore","Taylor"]),
        "UK": (["Oliver","George","Amelia","Isla","Jack","Harry","Sophia","Emily"],
               ["Smith","Jones","Taylor","Brown","Williams","Wilson","Johnson","Davies"]),
        "Canada": (["Liam","Noah","William","Olivia","Emma","Charlotte","Benjamin","Lucas"],
                   ["Smith","Brown","Tremblay","Martin","Roy","Wilson","Taylor","Johnson"]),
        "Australia": (["Oliver","Noah","William","Charlotte","Olivia","Isla","Jack","Ethan"],
                      ["Smith","Jones","Williams","Brown","Wilson","Taylor","Anderson","Martin"]),
        "Brazil": (["Gabriel","Miguel","Arthur","Heitor","Valentina","Laura","Julia","Maria"],
                   ["Silva","Santos","Oliveira","Souza","Rodrigues","Ferreira","Almeida","Lima"]),
        "India": (["Arjun","Aarav","Ishaan","Vihaan","Aanya","Anaya","Diya","Isha"],
                  ["Sharma","Patel","Gupta","Khan","Singh","Kumar","Reddy","Iyer"]),
        "France": (["Lucas","Louis","Hugo","Jules","Emma","Louise","Alice","Chloé"],
                   ["Martin","Bernard","Dubois","Thomas","Robert","Richard","Petit","Durand"]),
        "Germany": (["Leon","Noah","Elias","Finn","Mia","Emilia","Hannah","Sophia"],
                    ["Müller","Schmidt","Schneider","Fischer","Weber","Meyer","Wagner","Becker"]),
        "Italy": (["Alessandro","Leonardo","Lorenzo","Gabriele","Sofia","Giulia","Aurora","Alice"],
                  ["Rossi","Russo","Ferrari","Esposito","Bianchi","Romano","Colombo","Ricci"]),
        "Spain": (["Hugo","Mateo","Martín","Lucas","Lucía","Martina","Sofía","Julia"],
                  ["García","Fernández","González","Rodríguez","López","Martínez","Sánchez","Pérez"]),
        "Israel": (["Noa","Maya","Tamar","Yael","Ariel","Daniel","Itai","Lior"],
                   ["Cohen","Levi","Mizrahi","Peretz","Biton","Azulay","Dahan","Halevi"]),
        "UAE": (["Mohammed","Omar","Yousef","Khalid","Fatima","Aisha","Mariam","Noora"],
                ["Al Nahyan","Al Maktoum","Al Qasimi","Al Mazrouei","Al Marri","Al Ali","Al Hammadi","Al Ketbi"]),
        "Netherlands": (["Daan","Sem","Luuk","Bram","Emma","Sophie","Julia","Tess"],
                        ["de Jong","Jansen","de Vries","Bakker","Visser","Smit","Meijer","de Boer"]),
        "Sweden": (["William","Liam","Noah","Ella","Alva","Alice","Maja","Astrid"],
                   ["Johansson","Andersson","Karlsson","Nilsson","Eriksson","Larsson","Olsson","Persson"]),
        "Mexico": (["Santiago","Mateo","Sebastián","Emiliano","Sofía","Valentina","Regina","Camila"],
                   ["Hernández","García","Martínez","López","González","Pérez","Rodríguez","Sánchez"]),
    }

    # Per-country memo so the model is queried at most once per country.
    first_cache, last_cache = {}, {}

    def _capitalize_token(token: str) -> str:
        # Capitalise each hyphen-separated part: "jean-pierre" -> "Jean-Pierre".
        # A plain str.capitalize() would lowercase everything after the first
        # character and yield "Jean-pierre".
        return "-".join(part.capitalize() for part in token.split("-"))

    def _clean_list_text(txt: str):
        # turn "Emma, Olivia; Ava\nMia" -> ["Emma","Olivia","Ava","Mia"]
        txt = re.sub(r"[\[\]\(\)\"']", " ", txt)
        out = []
        seen = set()
        for chunk in re.split(r"[,\n;]+", txt):
            chunk = re.sub(r"[^A-Za-zÀ-ÖØ-öø-ÿ \-]", "", chunk).strip()
            if not 2 <= len(chunk) <= 20:
                continue
            # Keep a single token per entry: the first one that is a plausible
            # name (at least two characters once stray hyphens are removed).
            # This skips list bullets ("- Emma") and lone initials.
            candidates = (tok.strip("-") for tok in chunk.split())
            token = next((tok for tok in candidates if len(tok) >= 2), "")
            if not token:
                continue
            name = _capitalize_token(token)
            key = name.lower()
            # dedupe case-insensitively, keep first-seen order
            if key not in seen:
                seen.add(key)
                out.append(name)
        return out

    def get_name_pools(country: str):
        """Use HF model once per country to get lists of first names and surnames."""
        if country in first_cache and country in last_cache:
            return first_cache[country], last_cache[country]
        first_prompt = (
            f"List 20 common first names in {country}. "
            "Return comma-separated names only."
        )
        last_prompt = (
            f"List 20 common surnames in {country}. "
            "Return comma-separated names only."
        )
        try:
            first_txt = t5(first_prompt, max_new_tokens=128, do_sample=False)[0]["generated_text"]
            last_txt  = t5(last_prompt,  max_new_tokens=128, do_sample=False)[0]["generated_text"]
            firsts = _clean_list_text(first_txt)
            lasts  = _clean_list_text(last_txt)
        except Exception:
            # Deliberate best effort: any model/parsing failure falls through
            # to the curated pool below instead of aborting generation.
            firsts, lasts = [], []
        # Ensure we have reasonable pools; otherwise fall back
        if len(firsts) < 8 or len(lasts) < 8:
            firsts, lasts = COUNTRY_FALLBACKS.get(country, COUNTRY_FALLBACKS["USA"])
        first_cache[country], last_cache[country] = firsts, lasts
        return firsts, lasts

    def sample_full_name(country: str) -> str:
        firsts, lasts = get_name_pools(country)
        # Draw order (first, then last) is part of the seeded random sequence.
        first = random.choice(firsts)
        last  = random.choice(lasts)
        # Pool entries are already display-cased (curated fallbacks as written,
        # model output via _clean_list_text), so they are joined unchanged;
        # re-capitalising here would turn "de Jong" into "De Jong".
        return f"{first} {last}"

    rows = []
    for rank in range(1, n + 1):
        # --- Structured fields sampled to look realistic ---
        # NOTE: the order of the random calls below determines the seeded output.
        followers = random.randint(5_000, 5_000_000)
        er = round(random.uniform(0.5, 15.0), 2)  # %
        country = random.choice(countries)
        niche = random.choice(niches)
        reach = int(followers * random.uniform(0.25, 0.95))
        platform_token = random.choice(platforms)  # e.g., 'youtube'
        region_hint = country.lower().replace(" ", "")
        source_file = f"{platform_token}_data_{region_hint}.csv"  # <- first token = platform
        source_path = f"synthetic/{source_file}"

        # --- Name via HF model generated pools ---
        name = sample_full_name(country)

        rows.append([
            rank, name, followers, er, country, niche, reach, source_file, source_path
        ])

    df_syn = pd.DataFrame(rows, columns=[
        "Rank","Name","Followers","ER","Country","Niche","Reach","Source File","Source Path"
    ])
    df_syn.to_csv(out_csv, index=False)
    return df_syn

def load_or_build_synthetic():
    """Regenerate the synthetic CSV at ``CSV_PATH`` and return it as a DataFrame.

    Any existing file at ``CSV_PATH`` is removed first, so the data is always
    rebuilt by ``create_synthetic_influencer_dataset``. The CSV is then read
    back and missing values are replaced with empty strings.
    """
    stale_copy_present = os.path.exists(CSV_PATH)
    if stale_copy_present:
        os.remove(CSV_PATH)

    # The returned frame is ignored on purpose; the CSV on disk is what gets loaded.
    create_synthetic_influencer_dataset(n=NUM_ROWS, out_csv=CSV_PATH, seed=SEED)

    return pd.read_csv(CSV_PATH).fillna("")

# Build & load the synthetic dataset (regenerated on every import of this module)
df = load_or_build_synthetic()

# =========================
# FEATURE ENGINEERING
# =========================
# Platform = text before the first '_' of "Source File", capitalized
_source_names = df["Source File"].astype(str)
df["Platform"] = _source_names.str.split("_").str[0].str.capitalize()

# Text that is embedded for each influencer: the fields below joined with " - "
profile_fields = ["Name", "Platform", "Niche", "Country"]
df["profile_text"] = df[profile_fields].apply(lambda fields: " - ".join(fields), axis=1)

# =========================
# EMBEDDINGS & RECOMMENDER
# =========================
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# One embedding per row of df, in row order (positions are used with df.iloc later)
_profile_texts = df["profile_text"].tolist()
influencer_embeddings = model.encode(_profile_texts, convert_to_tensor=True)

def recommend_influencers(brand_description, top_k=3):
    """Return the influencers whose profile text best matches a brand description.

    The description is embedded with the module-level ``model`` and compared
    by cosine similarity against ``influencer_embeddings``; the highest
    scoring rows of ``df`` are returned, best match first.

    Args:
        brand_description: Free-text campaign/brand description. ``None`` is
            treated as an empty string.
        top_k: Maximum number of recommendations (default 3). Clamped to the
            number of available influencers; values <= 0 yield an empty list.

    Returns:
        list[dict]: One dict per recommendation with the keys ``Name``,
        ``Platform``, ``Niche``, ``Country``, ``ER`` (string), ``Followers``
        (int) and ``Reach`` (int when the stored value is all digits,
        otherwise the stored value unchanged).
    """
    query_embedding = model.encode(brand_description or "", convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(query_embedding, influencer_embeddings)[0]

    # topk() raises when k exceeds the number of scores, so clamp it.
    k = max(0, min(int(top_k), int(cosine_scores.shape[0])))
    if k == 0:
        return []
    top_indices = cosine_scores.topk(k).indices.tolist()

    recs = []
    for idx in top_indices:
        # Embeddings were built in row order, so idx is a positional index.
        row = df.iloc[idx]
        reach_raw = row.get("Reach", "")
        recs.append({
            "Name": row["Name"],
            "Platform": row.get("Platform", ""),
            "Niche": row["Niche"],
            "Country": row["Country"],
            "ER": f"{row.get('ER', 'N/A')}",
            "Followers": int(row["Followers"]),
            # Only convert when the textual form is purely digits.
            "Reach": int(row["Reach"]) if str(reach_raw).isdigit() else reach_raw,
        })
    return recs

def format_output(brand_input):
    """Render the recommendations for ``brand_input`` as a string of HTML cards.

    Each recommendation from ``recommend_influencers`` becomes one styled
    ``<div>``; the cards are concatenated in ranking order, numbered from 1.
    The "Reach" line is emitted only when the value is an ``int``.
    Note: values are interpolated into the markup without HTML escaping.
    """
    def render_card(position, rec):
        # Optional reach paragraph; empty when Reach is not an int.
        reach_line = (
            f"<p style='margin:0.5em 0;'><strong>Reach:</strong> {int(rec['Reach']):,}</p>"
            if isinstance(rec['Reach'], int)
            else ""
        )
        return f"""
        <div style='background:#ffffff; padding:1em; margin-bottom:1em; border-radius:8px; box-shadow:0 2px 6px rgba(0,0,0,0.1);'>
          <h3 style='margin:0; color:#0a1f44;'>🎯 {position}. {rec['Name']} <span style='font-size:0.9em; color:#555;'>({rec['Platform']})</span></h3>
          <p style='margin:0.5em 0;'><strong>Niche:</strong> {rec['Niche']}</p>
          <p style='margin:0.5em 0;'><strong>Country:</strong> {rec['Country']}</p>
          <p style='margin:0.5em 0;'><strong>Engagement:</strong> {rec['ER']}%</p>
          <p style='margin:0.5em 0;'><strong>Followers:</strong> {rec['Followers']:,}</p>
          {reach_line}
        </div>
        """

    matches = recommend_influencers(brand_input)
    return "".join(render_card(pos, rec) for pos, rec in enumerate(matches, 1))

# =========================
# GRADIO UI
# =========================
# Input/output components are built first, in the same order as before.
_brand_box = gr.Textbox(
    lines=3,
    label="🗣️ Describe Your Campaign or Brand",
    placeholder="e.g., Targeted fitness brand outreach for Gen Z",
)
_results_panel = gr.HTML(label="📈 Recommended Influencers")

# Text shown under the title.
_ui_description = (
    "Enhance your social media marketing by pinpointing the perfect influencers for your niche.\n\n"
    "🛠️ AI-driven matching based on niche, audience, and engagement metrics — get top 3 influencer recommendations instantly."
)

# Markdown footer describing the project, models and dataset.
_ui_article = (
    "**Project:** AI-Powered Influencer Recommender for Social Media Marketing\n\n"
    "**Models:**\n"
    "- google/flan-t5-small to synthesize country-specific first/last name pools\n"
    "- sentence-transformers/all-MiniLM-L6-v2 for semantic embeddings (recommendations)\n\n"
    "**Dataset:** 1,200-row synthetic influencer dataset generated at runtime."
)

# Clickable example prompts (one single-input row each).
_ui_examples = [
    ["Sustainable fashion campaign targeting eco-conscious millennials"],
    ["Tech gadget launch aimed at early adopters in the US"],
    ["Healthy snack brand outreach for fitness enthusiasts"],
    ["Luxury travel experiences for affluent couples in Europe"],
]

iface = gr.Interface(
    fn=format_output,
    inputs=_brand_box,
    outputs=_results_panel,
    title="💡 InfluencerMatch.AI: Targeted Influencer Discovery for Social Media Marketing",
    description=_ui_description,
    article=_ui_article,
    examples=_ui_examples,
    theme="soft",
    flagging_mode="never",
)

# Start the app only when executed as a script.
if __name__ == "__main__":
    iface.launch(share=True)