yoniif committed on
Commit
2476aae
·
verified ·
1 Parent(s): 727364f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -68
app.py CHANGED
@@ -1,59 +1,35 @@
1
- # influencer_match_app.py
2
  import os
3
- import re
4
  import random
5
  import pandas as pd
6
  import gradio as gr
7
  from sentence_transformers import SentenceTransformer, util
8
 
9
  # =========================
10
- # SYNTHETIC DATA (HF) CONFIG
11
  # =========================
12
  CSV_PATH = "synthetic_influencers.csv"
13
- NUM_ROWS = 1200 # >= 1000 for the assignment
14
- SEED = 42 # reproducibility
 
 
 
 
 
 
 
 
15
 
16
- # -------------------------
17
- # Hugging Face name generator
18
- # -------------------------
19
def _load_name_pipeline():
    """Build the Hugging Face text-generation pipeline used for synthetic names.

    Raises:
        RuntimeError: when the `transformers` package cannot be imported.
    """
    try:
        from transformers import pipeline as hf_pipeline
    except Exception as exc:
        raise RuntimeError(
            "Transformers not installed. Install with: pip install transformers torch --upgrade"
        ) from exc

    # distilgpt2 keeps cold-start time low on HF Spaces (used only to build
    # the synthetic dataset, not for recommendations).
    return hf_pipeline("text-generation", model="distilgpt2")
29
-
30
- NAME_PIPE = _load_name_pipeline()
31
-
32
def generate_hf_name(niches_list, platform, country):
    """Generate one brandable influencer username via the HF text-generation pipeline.

    Args:
        niches_list: a list of niche strings, or a single niche string
            (the caller in this file passes a single string).
        platform: platform token used as prompt context, e.g. "youtube".
        country: country name used as prompt context.

    Returns:
        An alphanumeric username of at most 24 characters; falls back to
        "BrandHandle" when the model output yields nothing usable.
    """
    import re  # local import, as in the original, so the block is self-contained

    # BUG FIX: ", ".join(<str>) iterates the CHARACTERS of a single niche
    # string ("Fashion" -> "F, a, s, h, i, o, n"), and the caller in this file
    # passes a plain string. Accept both a string and an iterable of strings.
    if isinstance(niches_list, str):
        niche_phrase = niches_list
    else:
        niche_phrase = ", ".join(niches_list)

    prompt = (
        "Generate ONE short, brandable influencer username.\n"
        f"Context: niches = [{niche_phrase}], country = {country}, platform = {platform}.\n"
        "Constraints: 1–3 words, letters/numbers only, no emojis, no spaces, no hyphens, "
        "avoid the word 'Creator', avoid generic words like 'User' or 'Channel'. "
        "Prefer CamelCase or concise compounds (e.g., FitAtlas, StyleLab, CodeBytes). "
        "Return ONLY the username:"
    )
    out = NAME_PIPE(prompt, max_new_tokens=14, do_sample=True, temperature=0.9, top_p=0.92)[0]["generated_text"]

    # The pipeline echoes the prompt in generated_text; the model's reply is
    # expected on the last line. Guard against an all-whitespace output, which
    # would make splitlines()[-1] raise IndexError.
    lines = out.strip().splitlines()
    name = lines[-1].strip() if lines else ""
    # keep alphanum, remove spaces/hyphens just in case
    name = re.sub(r"[^A-Za-z0-9]", "", name)
    if not name or len(name) > 24:
        # still simple: if the model slips, lightly trim
        name = name[:24] or "BrandHandle"
    return name
51
-
52
- # -------------------------
53
- # Synthetic dataset builder
54
- # -------------------------
55
- def create_synthetic_influencer_dataset(n=1200, out_csv=CSV_PATH, seed=SEED) -> pd.DataFrame:
56
- random.seed(seed)
57
 
58
  countries = [
59
  "USA","UK","Canada","Australia","Brazil","India","France","Germany","Italy","Spain",
@@ -61,27 +37,33 @@ def create_synthetic_influencer_dataset(n=1200, out_csv=CSV_PATH, seed=SEED) ->
61
  ]
62
  niches = [
63
  "Fashion","Travel","Food","Fitness","Tech","Beauty","Gaming","Music","Photography",
64
- "Lifestyle","Education","Finance","Sports","Parenting","DIY", "Fashion & Travel", "Food & Fitness",
65
- "Tech & Gaming", "Lifestyle & Travel", "Sports & Fitness"
66
  ]
67
- platforms = ["youtube", "instagram", "tiktok", "twitch", "x"] # lowercase for file prefix
68
 
69
  rows = []
70
  for rank in range(1, n + 1):
71
- # --- Structured fields first (so name prompt can use them) ---
 
 
 
 
 
 
 
 
 
 
 
72
  country = random.choice(countries)
73
  niche = random.choice(niches)
74
- platform_token = random.choice(platforms) # 'youtube', 'instagram', ...
75
- followers = random.randint(5_000, 5_000_000)
76
- er = round(random.uniform(0.5, 15.0), 2) # percent
77
  reach = int(followers * random.uniform(0.25, 0.95))
 
 
78
  region_hint = country.lower().replace(" ", "")
79
- source_file = f"{platform_token}_data_{region_hint}.csv" # first token remains the platform
80
  source_path = f"synthetic/{source_file}"
81
 
82
- # --- Now generate the name using those fields (HF model) ---
83
- name = generate_hf_name(niche, platform_token, country)
84
-
85
  rows.append([
86
  rank, name, followers, er, country, niche, reach, source_file, source_path
87
  ])
@@ -92,8 +74,8 @@ def create_synthetic_influencer_dataset(n=1200, out_csv=CSV_PATH, seed=SEED) ->
92
  df_syn.to_csv(out_csv, index=False)
93
  return df_syn
94
 
95
- def build_and_load_synthetic() -> pd.DataFrame:
96
- # Always rebuild to ensure HF-generated data is used
97
  if os.path.exists(CSV_PATH):
98
  os.remove(CSV_PATH)
99
  _ = create_synthetic_influencer_dataset(n=NUM_ROWS, out_csv=CSV_PATH, seed=SEED)
@@ -101,13 +83,11 @@ def build_and_load_synthetic() -> pd.DataFrame:
101
  df_local.fillna("", inplace=True)
102
  return df_local
103
 
104
- # =========================
105
- # LOAD DATA (always synthetic)
106
- # =========================
107
- df = build_and_load_synthetic()
108
 
109
  # =========================
110
- # FEATURE ENGINEERING (same logic)
111
  # =========================
112
  # Extract platform name from Source File (first token before '_'), capitalize
113
  df['Platform'] = df['Source File'].astype(str).str.split('_').str[0].str.capitalize()
@@ -117,12 +97,12 @@ profile_fields = ["Name", "Platform", "Niche", "Country"]
117
  df["profile_text"] = df[profile_fields].agg(" - ".join, axis=1)
118
 
119
  # =========================
120
- # EMBEDDINGS & RECOMMENDER
121
  # =========================
122
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
123
  influencer_embeddings = model.encode(df["profile_text"].tolist(), convert_to_tensor=True)
124
 
125
- def recommend_influencers(brand_description: str):
126
  query_embedding = model.encode(brand_description, convert_to_tensor=True)
127
  cosine_scores = util.pytorch_cos_sim(query_embedding, influencer_embeddings)[0]
128
  top_indices = cosine_scores.topk(3).indices.tolist()
@@ -136,12 +116,12 @@ def recommend_influencers(brand_description: str):
136
  "Niche": row["Niche"],
137
  "Country": row["Country"],
138
  "ER": f"{row.get('ER', 'N/A')}",
139
- "Followers": int(row["Followers"]),
140
- "Reach": int(row["Reach"]) if str(row.get("Reach", "")).isdigit() else row.get("Reach", "")
141
  })
142
  return recs
143
 
144
- def format_output(brand_input: str):
145
  recs = recommend_influencers(brand_input)
146
  html = ""
147
  for i, rec in enumerate(recs, 1):
@@ -151,8 +131,8 @@ def format_output(brand_input: str):
151
  <p style='margin:0.5em 0;'><strong>Niche:</strong> {rec['Niche']}</p>
152
  <p style='margin:0.5em 0;'><strong>Country:</strong> {rec['Country']}</p>
153
  <p style='margin:0.5em 0;'><strong>Engagement:</strong> {rec['ER']}%</p>
154
- <p style='margin:0.5em 0;'><strong>Followers:</strong> {rec['Followers']:,}</p>
155
- {f"<p style='margin:0.5em 0;'><strong>Reach:</strong> {int(rec['Reach']):,}</p>" if isinstance(rec['Reach'], int) else ""}
156
  </div>
157
  """
158
  return html
@@ -176,9 +156,9 @@ iface = gr.Interface(
176
  article=(
177
  "**Project:** AI-Powered Influencer Recommender for Social Media Marketing\n\n"
178
  "**Models:**\n"
179
- "- Hugging Face text-generation (distilgpt2) to create a 1,200-row synthetic influencer dataset at runtime\n"
180
  "- sentence-transformers/all-MiniLM-L6-v2 for semantic embeddings (recommendations)\n\n"
181
- "**Dataset:** Synthetic dataset suitable for text modality of influencer data"
182
  ),
183
  examples=[
184
  ["Sustainable fashion campaign targeting eco-conscious millennials"],
@@ -192,3 +172,4 @@ iface = gr.Interface(
192
 
193
  if __name__ == "__main__":
194
  iface.launch(share=True)
 
 
1
+
2
  import os
 
3
  import random
4
  import pandas as pd
5
  import gradio as gr
6
  from sentence_transformers import SentenceTransformer, util
7
 
8
  # =========================
9
+ # ALWAYS-GENERATE (HF) DATASET
10
  # =========================
11
  CSV_PATH = "synthetic_influencers.csv"
12
+ NUM_ROWS = 1200 # 1000 as required
13
+ SEED = 42 # reproducibility
14
+
15
+ def create_synthetic_influencer_dataset(n=1200, out_csv="synthetic_influencers.csv", seed=42):
16
+ """
17
+ Creates a synthetic dataset that mirrors your current schema:
18
+ Columns: Rank, Name, Followers, ER, Country, Niche, Reach, Source File, Source Path
19
+ Uses a Hugging Face text-generation model to generate influencer names.
20
+ """
21
+ random.seed(seed)
22
 
23
+ # Lazy import to keep dependencies minimal in environments where transformers isn't preinstalled
 
 
 
24
  try:
25
  from transformers import pipeline
26
  except Exception as e:
27
  raise RuntimeError(
28
+ "Transformers not installed. Install with: pip install transformers torch"
29
  ) from e
30
+
31
+ # Use a lighter model to keep HF Space startup snappy (swap to 'gpt2' if you prefer)
32
+ name_gen = pipeline("text-generation", model="distilgpt2")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  countries = [
35
  "USA","UK","Canada","Australia","Brazil","India","France","Germany","Italy","Spain",
 
37
  ]
38
  niches = [
39
  "Fashion","Travel","Food","Fitness","Tech","Beauty","Gaming","Music","Photography",
40
+ "Lifestyle","Education","Finance","Sports","Parenting","DIY"
 
41
  ]
42
+ platforms = ["youtube", "instagram", "tiktok", "twitch", "x"] # lowercase -> file prefix
43
 
44
  rows = []
45
  for rank in range(1, n + 1):
46
+ # --- Name via HF text generation ---
47
+ prompt = "Short, catchy influencer name:"
48
+ raw = name_gen(prompt, max_new_tokens=8, num_return_sequences=1, do_sample=True, temperature=0.9)[0]["generated_text"]
49
+ # cleanup
50
+ name = raw.replace(prompt, "").strip().split("\n")[0]
51
+ name = "".join(ch for ch in name if ch.isalnum() or ch in " -_.").strip()
52
+ if len(name) < 3 or len(name) > 40:
53
+ name = f"Creator_{rank}"
54
+
55
+ # --- Structured fields sampled to look realistic ---
56
+ followers = random.randint(5_000, 5_000_000)
57
+ er = round(random.uniform(0.5, 15.0), 2) # %
58
  country = random.choice(countries)
59
  niche = random.choice(niches)
 
 
 
60
  reach = int(followers * random.uniform(0.25, 0.95))
61
+
62
+ platform_token = random.choice(platforms) # e.g., 'youtube'
63
  region_hint = country.lower().replace(" ", "")
64
+ source_file = f"{platform_token}_data_{region_hint}.csv" # <- first token = platform
65
  source_path = f"synthetic/{source_file}"
66
 
 
 
 
67
  rows.append([
68
  rank, name, followers, er, country, niche, reach, source_file, source_path
69
  ])
 
74
  df_syn.to_csv(out_csv, index=False)
75
  return df_syn
76
 
77
+ def load_or_build_synthetic():
78
+ # Always build (or rebuild) to guarantee HF-generated data is used
79
  if os.path.exists(CSV_PATH):
80
  os.remove(CSV_PATH)
81
  _ = create_synthetic_influencer_dataset(n=NUM_ROWS, out_csv=CSV_PATH, seed=SEED)
 
83
  df_local.fillna("", inplace=True)
84
  return df_local
85
 
86
+ # Build & load synthetic dataset
87
+ df = load_or_build_synthetic()
 
 
88
 
89
  # =========================
90
+ # FEATURE ENGINEERING (same as yours)
91
  # =========================
92
  # Extract platform name from Source File (first token before '_'), capitalize
93
  df['Platform'] = df['Source File'].astype(str).str.split('_').str[0].str.capitalize()
 
97
  df["profile_text"] = df[profile_fields].agg(" - ".join, axis=1)
98
 
99
  # =========================
100
+ # EMBEDDINGS & RECOMMENDER (same as yours)
101
  # =========================
102
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
103
  influencer_embeddings = model.encode(df["profile_text"].tolist(), convert_to_tensor=True)
104
 
105
+ def recommend_influencers(brand_description):
106
  query_embedding = model.encode(brand_description, convert_to_tensor=True)
107
  cosine_scores = util.pytorch_cos_sim(query_embedding, influencer_embeddings)[0]
108
  top_indices = cosine_scores.topk(3).indices.tolist()
 
116
  "Niche": row["Niche"],
117
  "Country": row["Country"],
118
  "ER": f"{row.get('ER', 'N/A')}",
119
+ "Followers": row["Followers"],
120
+ "Reach": row.get("Reach", "")
121
  })
122
  return recs
123
 
124
+ def format_output(brand_input):
125
  recs = recommend_influencers(brand_input)
126
  html = ""
127
  for i, rec in enumerate(recs, 1):
 
131
  <p style='margin:0.5em 0;'><strong>Niche:</strong> {rec['Niche']}</p>
132
  <p style='margin:0.5em 0;'><strong>Country:</strong> {rec['Country']}</p>
133
  <p style='margin:0.5em 0;'><strong>Engagement:</strong> {rec['ER']}%</p>
134
+ <p style='margin:0.5em 0;'><strong>Followers:</strong> {rec['Followers']}</p>
135
+ {f"<p style='margin:0.5em 0;'><strong>Reach:</strong> {rec['Reach']}</p>" if rec['Reach'] else ""}
136
  </div>
137
  """
138
  return html
 
156
  article=(
157
  "**Project:** AI-Powered Influencer Recommender for Social Media Marketing\n\n"
158
  "**Models:**\n"
159
+ "- text-generation (Hugging Face) for synthetic influencer names (dataset creation)\n"
160
  "- sentence-transformers/all-MiniLM-L6-v2 for semantic embeddings (recommendations)\n\n"
161
+ "**Dataset:** 1,200-row synthetic influencer dataset generated at runtime."
162
  ),
163
  examples=[
164
  ["Sustainable fashion campaign targeting eco-conscious millennials"],
 
172
 
173
  if __name__ == "__main__":
174
  iface.launch(share=True)
175
+