File size: 12,717 Bytes
b2b432a
6825b01
b2b432a
d14dd35
e815416
0d6d77d
e815416
 
d14dd35
48ed4a4
 
 
6825b01
2476aae
6825b01
 
2476aae
 
d14dd35
2476aae
 
 
 
 
d14dd35
 
2476aae
 
03f7cd5
6825b01
 
 
 
2476aae
6825b01
2476aae
d14dd35
 
6825b01
 
 
 
 
 
 
48ed4a4
 
6825b01
2476aae
6825b01
d14dd35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48ed4a4
 
 
2476aae
 
 
6825b01
 
 
2476aae
6825b01
2476aae
6825b01
 
d14dd35
 
48ed4a4
6825b01
 
 
 
 
 
 
 
 
 
2476aae
 
6825b01
 
 
 
 
 
 
2476aae
 
6825b01
 
b2b432a
6825b01
 
 
764e3a3
 
 
03f7cd5
9f4cdec
6825b01
b2b432a
6825b01
e815416
 
 
2476aae
e815416
 
 
0d6d77d
03f7cd5
e815416
 
03f7cd5
 
 
e815416
 
6825b01
b2b432a
 
e815416
03f7cd5
 
2476aae
e815416
03f7cd5
e815416
03f7cd5
bb0926c
 
 
 
6825b01
b2b432a
 
03f7cd5
 
 
e815416
6825b01
 
 
bb0926c
1c445b4
42e461b
03f7cd5
bb0926c
 
 
6825b01
 
bb0926c
 
 
 
 
febddb7
6825b01
d14dd35
6825b01
f998a9b
42e461b
e815416
00961a0
 
 
 
587dfbd
bb0926c
35dd891
e815416
 
 
bb0926c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
# app.py
import os
import re
import random
import pandas as pd
import gradio as gr
from sentence_transformers import SentenceTransformer, util

# Make HF downloads less flaky on Spaces
# setdefault only writes a value when the variable is not already present in
# the environment, so values configured outside the process take precedence.
# NOTE(review): these lines run after the third-party imports above; if
# huggingface_hub reads its environment at import time they may have no
# effect - confirm, and move them above the imports if so.
# NOTE(review): verify "HF_HUB_READ_TIMEOUT" against the huggingface_hub
# environment-variable reference - confirm it is a recognised name.
os.environ.setdefault("HF_HUB_READ_TIMEOUT", "60")
# NOTE(review): presumably requires the optional `hf_transfer` package to be
# installed - confirm it is listed in the Space's requirements.
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")

# =========================
# ALWAYS-GENERATE (HF) DATASET
# =========================
# Path the generated dataset is written to and read back from.
CSV_PATH = "synthetic_influencers.csv"
NUM_ROWS = 1200       # ≥1000 as required
SEED = 42             # reproducibility
# Seeds the global `random` module at import time.
random.seed(SEED)

def create_synthetic_influencer_dataset(n=1200, out_csv="synthetic_influencers.csv", seed=42):
    """
    Build a synthetic influencer dataset, write it to ``out_csv`` and return it.

    Columns: Rank, Name, Followers, ER, Country, Niche, Reach, Source File, Source Path

    A Hugging Face text2text model (google/flan-t5-small) is asked once per
    country for lists of common first names and surnames. If that call fails,
    or fewer than 8 first names or 8 surnames can be parsed from its output,
    the built-in COUNTRY_FALLBACKS pools are used for that country instead.
    Each row's name is "First Last", sampled from the pools of the row's country.

    Args:
        n: Number of rows to generate; Rank runs from 1 to n.
        out_csv: Path of the CSV file to write (index column omitted).
        seed: Seed of the private random generator that drives all sampling.
            The state of the module-level ``random`` generator is not touched.

    Returns:
        pandas.DataFrame with the same rows that were written to ``out_csv``.

    Raises:
        RuntimeError: if ``transformers`` cannot be imported.
    """
    # A private generator yields the same draws as seeding the global module,
    # but stays reproducible even if other code uses `random` in between.
    rng = random.Random(seed)

    try:
        from transformers import pipeline
    except Exception as e:
        raise RuntimeError(
            "Transformers not installed. Install with: pip install transformers torch"
        ) from e

    # Smaller instruction-following model; you can bump to "google/flan-t5-base" if you want
    t5 = pipeline("text2text-generation", model="google/flan-t5-small")

    countries = [
        "USA","UK","Canada","Australia","Brazil","India","France","Germany","Italy","Spain",
        "Israel","UAE","Netherlands","Sweden","Mexico"
    ]
    niches = [
        "Fashion","Travel","Food","Fitness","Tech","Beauty","Gaming","Music","Photography",
        "Lifestyle","Education","Finance","Sports","Parenting","DIY",
        "Fashion + Lifestyle", "Tech + Gaming", "Food + Fitness", "Beauty + Fashion", "Sports + Fitness"
    ]
    platforms = ["youtube", "instagram", "tiktok", "twitch", "x"]  # lowercase -> file prefix

    # -------- Name pool builder (uses HF model once per country) --------
    # (first names, surnames) used when the model output is unusable.
    COUNTRY_FALLBACKS = {
        "USA": (["Emma","Olivia","Ava","Mia","Noah","Liam","Ethan","James"],
                ["Smith","Johnson","Brown","Davis","Miller","Wilson","Moore","Taylor"]),
        "UK": (["Oliver","George","Amelia","Isla","Jack","Harry","Sophia","Emily"],
               ["Smith","Jones","Taylor","Brown","Williams","Wilson","Johnson","Davies"]),
        "Canada": (["Liam","Noah","William","Olivia","Emma","Charlotte","Benjamin","Lucas"],
                   ["Smith","Brown","Tremblay","Martin","Roy","Wilson","Taylor","Johnson"]),
        "Australia": (["Oliver","Noah","William","Charlotte","Olivia","Isla","Jack","Ethan"],
                      ["Smith","Jones","Williams","Brown","Wilson","Taylor","Anderson","Martin"]),
        "Brazil": (["Gabriel","Miguel","Arthur","Heitor","Valentina","Laura","Julia","Maria"],
                   ["Silva","Santos","Oliveira","Souza","Rodrigues","Ferreira","Almeida","Lima"]),
        "India": (["Arjun","Aarav","Ishaan","Vihaan","Aanya","Anaya","Diya","Isha"],
                  ["Sharma","Patel","Gupta","Khan","Singh","Kumar","Reddy","Iyer"]),
        "France": (["Lucas","Louis","Hugo","Jules","Emma","Louise","Alice","Chloé"],
                   ["Martin","Bernard","Dubois","Thomas","Robert","Richard","Petit","Durand"]),
        "Germany": (["Leon","Noah","Elias","Finn","Mia","Emilia","Hannah","Sophia"],
                    ["Müller","Schmidt","Schneider","Fischer","Weber","Meyer","Wagner","Becker"]),
        "Italy": (["Alessandro","Leonardo","Lorenzo","Gabriele","Sofia","Giulia","Aurora","Alice"],
                  ["Rossi","Russo","Ferrari","Esposito","Bianchi","Romano","Colombo","Ricci"]),
        "Spain": (["Hugo","Mateo","Martín","Lucas","Lucía","Martina","Sofía","Julia"],
                  ["García","Fernández","González","Rodríguez","López","Martínez","Sánchez","Pérez"]),
        "Israel": (["Noa","Maya","Tamar","Yael","Ariel","Daniel","Itai","Lior"],
                   ["Cohen","Levi","Mizrahi","Peretz","Biton","Azulay","Dahan","Halevi"]),
        "UAE": (["Mohammed","Omar","Yousef","Khalid","Fatima","Aisha","Mariam","Noora"],
                ["Al Nahyan","Al Maktoum","Al Qasimi","Al Mazrouei","Al Marri","Al Ali","Al Hammadi","Al Ketbi"]),
        "Netherlands": (["Daan","Sem","Luuk","Bram","Emma","Sophie","Julia","Tess"],
                        ["de Jong","Jansen","de Vries","Bakker","Visser","Smit","Meijer","de Boer"]),
        "Sweden": (["William","Liam","Noah","Ella","Alva","Alice","Maja","Astrid"],
                   ["Johansson","Andersson","Karlsson","Nilsson","Eriksson","Larsson","Olsson","Persson"]),
        "Mexico": (["Santiago","Mateo","Sebastián","Emiliano","Sofía","Valentina","Regina","Camila"],
                   ["Hernández","García","Martínez","López","González","Pérez","Rodríguez","Sánchez"]),
    }

    # Per-country caches so the model is queried at most once per country.
    first_cache, last_cache = {}, {}

    def _cap_name(s: str) -> str:
        # Upper-case the initial of every part separated by spaces, hyphens or
        # apostrophes ("al nahyan" -> "Al Nahyan", "smith-jones" -> "Smith-Jones")
        # while leaving interior capitals alone ("McDonald" stays "McDonald").
        def _fix(match):
            part = match.group(0)
            if len(part) > 1 and part.isupper():
                return part.capitalize()  # all-caps input such as "SMITH"
            return part[0].upper() + part[1:]
        return re.sub(r"[^\s'\-]+", _fix, s)

    def _clean_list_text(txt: str, first_token_only: bool = True):
        # turn "Emma, Olivia; Ava\nMia" -> ["Emma","Olivia","Ava","Mia"]
        # first_token_only=True keeps a single token per entry (first names);
        # False keeps every token so multi-word surnames are not truncated.
        txt = re.sub(r"[\[\]\(\)\"']", " ", txt)
        out = []
        seen = set()
        for raw in re.split(r"[,\n;]+", txt):
            cleaned = re.sub(r"[^A-Za-zÀ-ÖØ-öø-ÿ \-]", "", raw).strip()
            # Entries longer than 20 characters are treated as prose, not names.
            if not 2 <= len(cleaned) <= 20:
                continue
            tokens = cleaned.split()
            name = tokens[0] if first_token_only else " ".join(tokens)
            # Drop stray initials ("A Smith" -> "A") and fragments without letters.
            if len(name) < 2 or not any(ch.isalpha() for ch in name):
                continue
            # dedupe case-insensitively, keep first-seen order
            key = name.lower()
            if key not in seen:
                seen.add(key)
                out.append(_cap_name(name))
        return out

    def get_name_pools(country: str):
        """Use HF model once per country to get lists of first names and surnames."""
        if country in first_cache and country in last_cache:
            return first_cache[country], last_cache[country]
        try:
            first_prompt = (
                f"List 20 common first names in {country}. "
                "Return comma-separated names only."
            )
            last_prompt = (
                f"List 20 common surnames in {country}. "
                "Return comma-separated names only."
            )
            first_txt = t5(first_prompt, max_new_tokens=128, do_sample=False)[0]["generated_text"]
            last_txt  = t5(last_prompt,  max_new_tokens=128, do_sample=False)[0]["generated_text"]
            firsts = _clean_list_text(first_txt)
            lasts  = _clean_list_text(last_txt, first_token_only=False)
            # Ensure we have reasonable pools; otherwise fall back
            if len(firsts) < 8 or len(lasts) < 8:
                raise ValueError("too few names parsed")
        except Exception:
            # Deliberate best-effort: any model or parsing failure falls back to
            # the built-in pools (USA pools for a country without an entry).
            firsts, lasts = COUNTRY_FALLBACKS.get(country, COUNTRY_FALLBACKS["USA"])
        first_cache[country], last_cache[country] = firsts, lasts
        return firsts, lasts

    def sample_full_name(country: str) -> str:
        firsts, lasts = get_name_pools(country)
        first = rng.choice(firsts)
        last  = rng.choice(lasts)
        # Fallback pools bypass _clean_list_text, so normalise casing here too;
        # multi-word surnames such as "Al Nahyan" keep all of their words.
        return f"{_cap_name(first)} {_cap_name(last)}"

    rows = []
    for rank in range(1, n + 1):
        # --- Structured fields sampled to look realistic ---
        # The order of the rng calls determines the generated values; keep it stable.
        followers = rng.randint(5_000, 5_000_000)
        er = round(rng.uniform(0.5, 15.0), 2)  # %
        country = rng.choice(countries)
        niche = rng.choice(niches)
        reach = int(followers * rng.uniform(0.25, 0.95))
        platform_token = rng.choice(platforms)  # e.g., 'youtube'
        region_hint = country.lower().replace(" ", "")
        source_file = f"{platform_token}_data_{region_hint}.csv"  # <- first token = platform
        source_path = f"synthetic/{source_file}"

        # --- Name via HF model generated pools ---
        name = sample_full_name(country)

        rows.append([
            rank, name, followers, er, country, niche, reach, source_file, source_path
        ])

    df_syn = pd.DataFrame(rows, columns=[
        "Rank","Name","Followers","ER","Country","Niche","Reach","Source File","Source Path"
    ])
    df_syn.to_csv(out_csv, index=False)
    return df_syn

def load_or_build_synthetic():
    """
    Regenerate the synthetic dataset and return it as read back from disk.

    Any existing file at CSV_PATH is deleted first, the dataset is rebuilt with
    NUM_ROWS rows and SEED, and the freshly written CSV is loaded with missing
    values replaced by empty strings.
    """
    # Always build (or rebuild) to guarantee HF-generated data is used
    stale_file_present = os.path.exists(CSV_PATH)
    if stale_file_present:
        os.remove(CSV_PATH)

    create_synthetic_influencer_dataset(n=NUM_ROWS, out_csv=CSV_PATH, seed=SEED)

    # Read the file back rather than reusing the in-memory frame.
    return pd.read_csv(CSV_PATH).fillna("")

# Build & load synthetic dataset
# Runs at import time: the dataset is regenerated every time the module loads.
df = load_or_build_synthetic()

# =========================
# FEATURE ENGINEERING
# =========================
# Extract platform name from Source File (first token before '_'), capitalize
# e.g. "youtube_data_usa.csv" -> "Youtube"
df['Platform'] = df['Source File'].astype(str).str.split('_').str[0].str.capitalize()

# Prepare text for embedding (include platform)
profile_fields = ["Name", "Platform", "Niche", "Country"]
# One string per row: the four fields joined with " - ".
df["profile_text"] = df[profile_fields].agg(" - ".join, axis=1)

# =========================
# EMBEDDINGS & RECOMMENDER
# =========================
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Embeddings are computed once, in df row order; recommend_influencers maps
# the positions of the best scores back to rows with df.iloc.
influencer_embeddings = model.encode(df["profile_text"].tolist(), convert_to_tensor=True)

def recommend_influencers(brand_description, top_k=3):
    """
    Return the influencers whose profile text best matches a brand description.

    The description is embedded with the module-level sentence-transformer
    model and compared by cosine similarity against the precomputed
    ``influencer_embeddings``.

    Args:
        brand_description: Free-text description of the campaign or brand.
            ``None`` is treated as an empty string.
        top_k: Maximum number of recommendations to return. Defaults to 3.
            Capped at the number of available influencers, so a small dataset
            no longer makes ``topk`` raise.

    Returns:
        A list of dicts with the keys Name, Platform, Niche, Country, ER,
        Followers and Reach, ordered from highest to lowest similarity.
        Empty when ``top_k`` is not positive or there are no influencers.
    """
    wanted = int(top_k)
    if wanted <= 0 or len(df) == 0:
        return []

    query_text = "" if brand_description is None else str(brand_description)
    query_embedding = model.encode(query_text, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(query_embedding, influencer_embeddings)[0]

    # topk raises when k exceeds the number of scores, so clamp it first.
    limit = min(wanted, cosine_scores.shape[0])
    top_indices = cosine_scores.topk(limit).indices.tolist()

    recs = []
    for idx in top_indices:
        row = df.iloc[idx]
        recs.append({
            "Name": row["Name"],
            "Platform": row.get("Platform", ""),
            "Niche": row["Niche"],
            "Country": row["Country"],
            "ER": f"{row.get('ER', 'N/A')}",
            "Followers": int(row["Followers"]),
            # Reach becomes a plain int only when its text form is all digits;
            # any other value is passed through unchanged.
            "Reach": int(row["Reach"]) if str(row.get("Reach", "")).isdigit() else row.get("Reach", "")
        })
    return recs

def format_output(brand_input):
    """
    Render the recommendations for ``brand_input`` as one HTML string.

    Each recommendation returned by recommend_influencers becomes a card,
    numbered from 1; the Reach line is only included when Reach is an int.
    Returns an empty string when there are no recommendations.
    """
    def _card(i, rec):
        # Markup for a single recommendation; whitespace is part of the output.
        return f"""
        <div style='background:#ffffff; padding:1em; margin-bottom:1em; border-radius:8px; box-shadow:0 2px 6px rgba(0,0,0,0.1);'>
          <h3 style='margin:0; color:#0a1f44;'>🎯 {i}. {rec['Name']} <span style='font-size:0.9em; color:#555;'>({rec['Platform']})</span></h3>
          <p style='margin:0.5em 0;'><strong>Niche:</strong> {rec['Niche']}</p>
          <p style='margin:0.5em 0;'><strong>Country:</strong> {rec['Country']}</p>
          <p style='margin:0.5em 0;'><strong>Engagement:</strong> {rec['ER']}%</p>
          <p style='margin:0.5em 0;'><strong>Followers:</strong> {rec['Followers']:,}</p>
          {f"<p style='margin:0.5em 0;'><strong>Reach:</strong> {int(rec['Reach']):,}</p>" if isinstance(rec['Reach'], int) else ""}
        </div>
        """

    matches = recommend_influencers(brand_input)
    return "".join(_card(position, match) for position, match in enumerate(matches, 1))

# =========================
# GRADIO UI
# =========================
# Single-function interface: the textbox value is passed to format_output and
# the HTML string it returns is shown in the output component.
iface = gr.Interface(
    fn=format_output,
    inputs=gr.Textbox(
        lines=3,
        label="🗣️ Describe Your Campaign or Brand",
        placeholder="e.g., Targeted fitness brand outreach for Gen Z"
    ),
    outputs=gr.HTML(label="📈 Recommended Influencers"),
    title="💡 InfluencerMatch.AI: Targeted Influencer Discovery for Social Media Marketing",
    description=(
        "Enhance your social media marketing by pinpointing the perfect influencers for your niche.\n\n"
        "🛠️ AI-driven matching based on niche, audience, and engagement metrics — get top 3 influencer recommendations instantly."
    ),
    # NOTE: the model names and the row count below are hard-coded text; keep
    # them in sync with NUM_ROWS and the models actually loaded in this file.
    article=(
        "**Project:** AI-Powered Influencer Recommender for Social Media Marketing\n\n"
        "**Models:**\n"
        "- google/flan-t5-small to synthesize country-specific first/last name pools\n"
        "- sentence-transformers/all-MiniLM-L6-v2 for semantic embeddings (recommendations)\n\n"
        "**Dataset:** 1,200-row synthetic influencer dataset generated at runtime."
    ),
    # Each inner list supplies the value for the single textbox input.
    examples=[
        ["Sustainable fashion campaign targeting eco-conscious millennials"],
        ["Tech gadget launch aimed at early adopters in the US"],
        ["Healthy snack brand outreach for fitness enthusiasts"],
        ["Luxury travel experiences for affluent couples in Europe"]
    ],
    theme="soft",
    # NOTE(review): `flagging_mode` is presumably only accepted by newer Gradio
    # releases (older ones used `allow_flagging`) - confirm against the
    # installed version.
    flagging_mode="never"
)

# Start the server only when the file is run as a script, not when imported.
if __name__ == "__main__":
    # NOTE(review): share=True requests a public share link; presumably not
    # needed when hosted on Spaces - confirm.
    iface.launch(share=True)