yoniif committed on
Commit
d14dd35
·
verified ·
1 Parent(s): 48ed4a4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -50
app.py CHANGED
@@ -1,12 +1,12 @@
1
  # app.py
2
  import os
3
- import random
4
  import re
 
5
  import pandas as pd
6
  import gradio as gr
7
  from sentence_transformers import SentenceTransformer, util
8
 
9
- # Optional: make HF downloads less flaky on Spaces
10
  os.environ.setdefault("HF_HUB_READ_TIMEOUT", "60")
11
  os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
12
 
@@ -16,16 +16,17 @@ os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
16
  CSV_PATH = "synthetic_influencers.csv"
17
  NUM_ROWS = 1200 # ≥1000 as required
18
  SEED = 42 # reproducibility
 
19
 
20
  def create_synthetic_influencer_dataset(n=1200, out_csv="synthetic_influencers.csv", seed=42):
21
  """
22
  Creates a synthetic dataset that mirrors your current schema:
23
  Columns: Rank, Name, Followers, ER, Country, Niche, Reach, Source File, Source Path
24
- Uses a Hugging Face text-generation model to generate realistic first + last names.
 
25
  """
26
  random.seed(seed)
27
 
28
- # Lazy import so the app still runs even if transformers isn't preinstalled locally
29
  try:
30
  from transformers import pipeline
31
  except Exception as e:
@@ -33,9 +34,8 @@ def create_synthetic_influencer_dataset(n=1200, out_csv="synthetic_influencers.c
33
  "Transformers not installed. Install with: pip install transformers torch"
34
  ) from e
35
 
36
- # Use an instruction-following model for names (much cleaner than distilgpt2)
37
- # Small & CPU-friendly; you can bump to "google/flan-t5-base" if you want even better quality.
38
- name_gen = pipeline("text2text-generation", model="google/flan-t5-small")
39
 
40
  countries = [
41
  "USA","UK","Canada","Australia","Brazil","India","France","Germany","Italy","Spain",
@@ -48,46 +48,97 @@ def create_synthetic_influencer_dataset(n=1200, out_csv="synthetic_influencers.c
48
  ]
49
  platforms = ["youtube", "instagram", "tiktok", "twitch", "x"] # lowercase -> file prefix
50
 
51
- def generate_person_name(country: str) -> str:
52
- # Prompt FLAN to return exactly one First Last
53
- prompt = (
54
- f"Generate one realistic influencer full name (first and last) from {country}. "
55
- "Return only: Firstname Lastname."
56
- )
57
- out = name_gen(
58
- prompt,
59
- max_new_tokens=16,
60
- num_beams=1,
61
- do_sample=True,
62
- temperature=0.7,
63
- top_p=0.9
64
- )[0]["generated_text"]
65
-
66
- name = out.strip().split("\n")[0]
67
- name = re.sub(r"[^A-Za-zÀ-ÖØ-öø-ÿ' \-]", "", name).strip()
68
-
69
- # Normalize spacing and capitalization
70
- parts = [p for p in re.split(r"[ \-]+", name) if p]
71
- if len(parts) < 2:
72
- # one gentle retry with slightly different prompt
73
- prompt2 = "Give one realistic full human name. Return only: Firstname Lastname."
74
- out2 = name_gen(prompt2, max_new_tokens=12, do_sample=True, temperature=0.7, top_p=0.9)[0]["generated_text"]
75
- name = out2.strip().split("\n")[0]
76
- name = re.sub(r"[^A-Za-zÀ-ÖØ-öø-ÿ' \-]", "", name).strip()
77
- parts = [p for p in re.split(r"[ \-]+", name) if p]
78
-
79
- if len(parts) < 2:
80
- # final minimal fallback (rare)
81
- return "Alex Morgan"
82
-
83
- def fix_case(s):
84
- # keep O'Connor-style capitalization
85
- chunks = s.split("'")
86
- chunks = [c.capitalize() for c in chunks]
87
- return "'".join(chunks)
88
-
89
- first, last = fix_case(parts[0]), fix_case(parts[1])
90
- return f"{first} {last}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
  rows = []
93
  for rank in range(1, n + 1):
@@ -102,8 +153,8 @@ def create_synthetic_influencer_dataset(n=1200, out_csv="synthetic_influencers.c
102
  source_file = f"{platform_token}_data_{region_hint}.csv" # <- first token = platform
103
  source_path = f"synthetic/{source_file}"
104
 
105
- # --- Name via HF (FLAN-T5) ---
106
- name = generate_person_name(country)
107
 
108
  rows.append([
109
  rank, name, followers, er, country, niche, reach, source_file, source_path
@@ -197,7 +248,7 @@ iface = gr.Interface(
197
  article=(
198
  "**Project:** AI-Powered Influencer Recommender for Social Media Marketing\n\n"
199
  "**Models:**\n"
200
- "- google/flan-t5-small for synthetic influencer full names (dataset creation)\n"
201
  "- sentence-transformers/all-MiniLM-L6-v2 for semantic embeddings (recommendations)\n\n"
202
  "**Dataset:** 1,200-row synthetic influencer dataset generated at runtime."
203
  ),
 
1
  # app.py
2
  import os
 
3
  import re
4
+ import random
5
  import pandas as pd
6
  import gradio as gr
7
  from sentence_transformers import SentenceTransformer, util
8
 
9
+ # Make HF downloads less flaky on Spaces
10
  os.environ.setdefault("HF_HUB_READ_TIMEOUT", "60")
11
  os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
12
 
 
16
  CSV_PATH = "synthetic_influencers.csv"
17
  NUM_ROWS = 1200 # ≥1000 as required
18
  SEED = 42 # reproducibility
19
+ random.seed(SEED)
20
 
21
  def create_synthetic_influencer_dataset(n=1200, out_csv="synthetic_influencers.csv", seed=42):
22
  """
23
  Creates a synthetic dataset that mirrors your current schema:
24
  Columns: Rank, Name, Followers, ER, Country, Niche, Reach, Source File, Source Path
25
+ Uses a Hugging Face model (FLAN-T5) to create pools of common first/last names per country,
26
+ then samples realistic First Last names from those pools.
27
  """
28
  random.seed(seed)
29
 
 
30
  try:
31
  from transformers import pipeline
32
  except Exception as e:
 
34
  "Transformers not installed. Install with: pip install transformers torch"
35
  ) from e
36
 
37
+ # Smaller instruction-following model; you can bump to "google/flan-t5-base" if you want
38
+ t5 = pipeline("text2text-generation", model="google/flan-t5-small")
 
39
 
40
  countries = [
41
  "USA","UK","Canada","Australia","Brazil","India","France","Germany","Italy","Spain",
 
48
  ]
49
  platforms = ["youtube", "instagram", "tiktok", "twitch", "x"] # lowercase -> file prefix
50
 
51
+ # -------- Name pool builder (uses HF model once per country) --------
52
+ COUNTRY_FALLBACKS = {
53
+ "USA": (["Emma","Olivia","Ava","Mia","Noah","Liam","Ethan","James"],
54
+ ["Smith","Johnson","Brown","Davis","Miller","Wilson","Moore","Taylor"]),
55
+ "UK": (["Oliver","George","Amelia","Isla","Jack","Harry","Sophia","Emily"],
56
+ ["Smith","Jones","Taylor","Brown","Williams","Wilson","Johnson","Davies"]),
57
+ "Canada": (["Liam","Noah","William","Olivia","Emma","Charlotte","Benjamin","Lucas"],
58
+ ["Smith","Brown","Tremblay","Martin","Roy","Wilson","Taylor","Johnson"]),
59
+ "Australia": (["Oliver","Noah","William","Charlotte","Olivia","Isla","Jack","Ethan"],
60
+ ["Smith","Jones","Williams","Brown","Wilson","Taylor","Anderson","Martin"]),
61
+ "Brazil": (["Gabriel","Miguel","Arthur","Heitor","Valentina","Laura","Julia","Maria"],
62
+ ["Silva","Santos","Oliveira","Souza","Rodrigues","Ferreira","Almeida","Lima"]),
63
+ "India": (["Arjun","Aarav","Ishaan","Vihaan","Aanya","Anaya","Diya","Isha"],
64
+ ["Sharma","Patel","Gupta","Khan","Singh","Kumar","Reddy","Iyer"]),
65
+ "France": (["Lucas","Louis","Hugo","Jules","Emma","Louise","Alice","Chloé"],
66
+ ["Martin","Bernard","Dubois","Thomas","Robert","Richard","Petit","Durand"]),
67
+ "Germany": (["Leon","Noah","Elias","Finn","Mia","Emilia","Hannah","Sophia"],
68
+ ["Müller","Schmidt","Schneider","Fischer","Weber","Meyer","Wagner","Becker"]),
69
+ "Italy": (["Alessandro","Leonardo","Lorenzo","Gabriele","Sofia","Giulia","Aurora","Alice"],
70
+ ["Rossi","Russo","Ferrari","Esposito","Bianchi","Romano","Colombo","Ricci"]),
71
+ "Spain": (["Hugo","Mateo","Martín","Lucas","Lucía","Martina","Sofía","Julia"],
72
+ ["García","Fernández","González","Rodríguez","López","Martínez","Sánchez","Pérez"]),
73
+ "Israel": (["Noa","Maya","Tamar","Yael","Ariel","Daniel","Itai","Lior"],
74
+ ["Cohen","Levi","Mizrahi","Peretz","Biton","Azulay","Dahan","Halevi"]),
75
+ "UAE": (["Mohammed","Omar","Yousef","Khalid","Fatima","Aisha","Mariam","Noora"],
76
+ ["Al Nahyan","Al Maktoum","Al Qasimi","Al Mazrouei","Al Marri","Al Ali","Al Hammadi","Al Ketbi"]),
77
+ "Netherlands": (["Daan","Sem","Luuk","Bram","Emma","Sophie","Julia","Tess"],
78
+ ["de Jong","Jansen","de Vries","Bakker","Visser","Smit","Meijer","de Boer"]),
79
+ "Sweden": (["William","Liam","Noah","Ella","Alva","Alice","Maja","Astrid"],
80
+ ["Johansson","Andersson","Karlsson","Nilsson","Eriksson","Larsson","Olsson","Persson"]),
81
+ "Mexico": (["Santiago","Mateo","Sebastián","Emiliano","Sofía","Valentina","Regina","Camila"],
82
+ ["Hernández","García","Martínez","López","González","Pérez","Rodríguez","Sánchez"]),
83
+ }
84
+
85
+ first_cache, last_cache = {}, {}
86
+
87
+ def _clean_list_text(txt: str):
88
+ # turn "Emma, Olivia; Ava\nMia" -> ["Emma","Olivia","Ava","Mia"]
89
+ txt = re.sub(r"[\[\]\(\)\"']", " ", txt)
90
+ parts = re.split(r"[,\n;]+", txt)
91
+ names = []
92
+ for p in parts:
93
+ p = re.sub(r"[^A-Za-zÀ-ÖØ-öø-ÿ \-]", "", p).strip()
94
+ if 2 <= len(p) <= 20:
95
+ # keep one token (first) for first names; for last names allow hyphenated
96
+ names.append(p.split()[0].capitalize())
97
+ # dedupe, keep order
98
+ seen = set()
99
+ out = []
100
+ for n in names:
101
+ if n.lower() not in seen:
102
+ out.append(n)
103
+ seen.add(n.lower())
104
+ return out
105
+
106
+ def get_name_pools(country: str):
107
+ """Use HF model once per country to get lists of first names and surnames."""
108
+ if country in first_cache and country in last_cache:
109
+ return first_cache[country], last_cache[country]
110
+ try:
111
+ first_prompt = (
112
+ f"List 20 common first names in {country}. "
113
+ "Return comma-separated names only."
114
+ )
115
+ last_prompt = (
116
+ f"List 20 common surnames in {country}. "
117
+ "Return comma-separated names only."
118
+ )
119
+ first_txt = t5(first_prompt, max_new_tokens=128, do_sample=False)[0]["generated_text"]
120
+ last_txt = t5(last_prompt, max_new_tokens=128, do_sample=False)[0]["generated_text"]
121
+ firsts = _clean_list_text(first_txt)
122
+ lasts = _clean_list_text(last_txt)
123
+ # Ensure we have reasonable pools; otherwise fall back
124
+ if len(firsts) < 8 or len(lasts) < 8:
125
+ raise ValueError("too few names parsed")
126
+ except Exception:
127
+ firsts, lasts = COUNTRY_FALLBACKS.get(country, COUNTRY_FALLBACKS["USA"])
128
+ first_cache[country], last_cache[country] = firsts, lasts
129
+ return firsts, lasts
130
+
131
+ def sample_full_name(country: str) -> str:
132
+ firsts, lasts = get_name_pools(country)
133
+ first = random.choice(firsts)
134
+ last = random.choice(lasts)
135
+ # Keep O'Connor/Al Nahyan formatting reasonable (space or apostrophe already in last)
136
+ # Capitalize first token if last is multi-word (e.g., "Al Nahyan" -> keep as-is)
137
+ def cap_name(s):
138
+ if "'" in s:
139
+ return "'".join([p.capitalize() for p in s.split("'")])
140
+ return " ".join([p.capitalize() for p in s.split(" ")])
141
+ return f"{cap_name(first)} {cap_name(last)}"
142
 
143
  rows = []
144
  for rank in range(1, n + 1):
 
153
  source_file = f"{platform_token}_data_{region_hint}.csv" # <- first token = platform
154
  source_path = f"synthetic/{source_file}"
155
 
156
+ # --- Name via HF model generated pools ---
157
+ name = sample_full_name(country)
158
 
159
  rows.append([
160
  rank, name, followers, er, country, niche, reach, source_file, source_path
 
248
  article=(
249
  "**Project:** AI-Powered Influencer Recommender for Social Media Marketing\n\n"
250
  "**Models:**\n"
251
+ "- google/flan-t5-small to synthesize country-specific first/last name pools\n"
252
  "- sentence-transformers/all-MiniLM-L6-v2 for semantic embeddings (recommendations)\n\n"
253
  "**Dataset:** 1,200-row synthetic influencer dataset generated at runtime."
254
  ),