yoniif committed on
Commit
48ed4a4
verified
1 Parent(s): fa98656

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -20
app.py CHANGED
@@ -6,6 +6,10 @@ import pandas as pd
6
  import gradio as gr
7
  from sentence_transformers import SentenceTransformer, util
8
 
 
 
 
 
9
  # =========================
10
  # ALWAYS-GENERATE (HF) DATASET
11
  # =========================
@@ -29,8 +33,9 @@ def create_synthetic_influencer_dataset(n=1200, out_csv="synthetic_influencers.c
29
  "Transformers not installed. Install with: pip install transformers torch"
30
  ) from e
31
 
32
- # Small model for fast startup on Spaces; swap to "gpt2" if you prefer
33
- name_gen = pipeline("text-generation", model="distilgpt2")
 
34
 
35
  countries = [
36
  "USA","UK","Canada","Australia","Brazil","India","France","Germany","Italy","Spain",
@@ -38,43 +43,68 @@ def create_synthetic_influencer_dataset(n=1200, out_csv="synthetic_influencers.c
38
  ]
39
  niches = [
40
  "Fashion","Travel","Food","Fitness","Tech","Beauty","Gaming","Music","Photography",
41
- "Lifestyle","Education","Finance","Sports","Parenting","DIY", "Fashion + Lifestyle",
42
- "Rech + Gaming", "Food + Fitness", "Beauty + Fashion", "Sports + Fitness"
43
  ]
44
  platforms = ["youtube", "instagram", "tiktok", "twitch", "x"] # lowercase -> file prefix
45
 
46
- rows = []
47
- for rank in range(1, n + 1):
48
- # --- Name via HF text generation (realistic first + last name) ---
49
- prompt = "Generate a first and last name:"
 
 
50
  out = name_gen(
51
  prompt,
52
- max_new_tokens=8,
53
- num_return_sequences=1,
54
  do_sample=True,
55
- temperature=0.9,
56
- top_p=0.92
57
  )[0]["generated_text"]
58
 
59
- # cleanup: remove the prompt text, keep letters & spaces only, single line
60
- name = out.replace(prompt, "").strip().split("\n")[0]
61
- name = re.sub(r"[^A-Za-z\s]", "", name).strip()
62
- # ensure it has at least two words; light fallback if the model returns junk
63
- if len(name.split()) < 2:
64
- name = f"Alex {rank}son" # simple readable fallback
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
 
 
 
 
 
66
  # --- Structured fields sampled to look realistic ---
67
  followers = random.randint(5_000, 5_000_000)
68
  er = round(random.uniform(0.5, 15.0), 2) # %
69
  country = random.choice(countries)
70
  niche = random.choice(niches)
71
  reach = int(followers * random.uniform(0.25, 0.95))
72
-
73
  platform_token = random.choice(platforms) # e.g., 'youtube'
74
  region_hint = country.lower().replace(" ", "")
75
  source_file = f"{platform_token}_data_{region_hint}.csv" # <- first token = platform
76
  source_path = f"synthetic/{source_file}"
77
 
 
 
 
78
  rows.append([
79
  rank, name, followers, er, country, niche, reach, source_file, source_path
80
  ])
@@ -167,7 +197,7 @@ iface = gr.Interface(
167
  article=(
168
  "**Project:** AI-Powered Influencer Recommender for Social Media Marketing\n\n"
169
  "**Models:**\n"
170
- "- text-generation (Hugging Face) for synthetic influencer full names (dataset creation)\n"
171
  "- sentence-transformers/all-MiniLM-L6-v2 for semantic embeddings (recommendations)\n\n"
172
  "**Dataset:** 1,200-row synthetic influencer dataset generated at runtime."
173
  ),
 
6
  import gradio as gr
7
  from sentence_transformers import SentenceTransformer, util
8
 
9
+ # Optional: make HF downloads less flaky on Spaces
10
+ os.environ.setdefault("HF_HUB_READ_TIMEOUT", "60")
11
+ os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
12
+
13
  # =========================
14
  # ALWAYS-GENERATE (HF) DATASET
15
  # =========================
 
33
  "Transformers not installed. Install with: pip install transformers torch"
34
  ) from e
35
 
36
+ # Use an instruction-following model for names (much cleaner than distilgpt2)
37
+ # Small & CPU-friendly; you can bump to "google/flan-t5-base" if you want even better quality.
38
+ name_gen = pipeline("text2text-generation", model="google/flan-t5-small")
39
 
40
  countries = [
41
  "USA","UK","Canada","Australia","Brazil","India","France","Germany","Italy","Spain",
 
43
  ]
44
  niches = [
45
  "Fashion","Travel","Food","Fitness","Tech","Beauty","Gaming","Music","Photography",
46
+ "Lifestyle","Education","Finance","Sports","Parenting","DIY",
47
+ "Fashion + Lifestyle", "Tech + Gaming", "Food + Fitness", "Beauty + Fashion", "Sports + Fitness"
48
  ]
49
  platforms = ["youtube", "instagram", "tiktok", "twitch", "x"] # lowercase -> file prefix
50
 
51
+ def generate_person_name(country: str) -> str:
52
+ # Prompt FLAN to return exactly one First Last
53
+ prompt = (
54
+ f"Generate one realistic influencer full name (first and last) from {country}. "
55
+ "Return only: Firstname Lastname."
56
+ )
57
  out = name_gen(
58
  prompt,
59
+ max_new_tokens=16,
60
+ num_beams=1,
61
  do_sample=True,
62
+ temperature=0.7,
63
+ top_p=0.9
64
  )[0]["generated_text"]
65
 
66
+ name = out.strip().split("\n")[0]
67
+ name = re.sub(r"[^A-Za-zÀ-ÖØ-öø-ÿ' \-]", "", name).strip()
68
+
69
+ # Normalize spacing and capitalization
70
+ parts = [p for p in re.split(r"[ \-]+", name) if p]
71
+ if len(parts) < 2:
72
+ # one gentle retry with slightly different prompt
73
+ prompt2 = "Give one realistic full human name. Return only: Firstname Lastname."
74
+ out2 = name_gen(prompt2, max_new_tokens=12, do_sample=True, temperature=0.7, top_p=0.9)[0]["generated_text"]
75
+ name = out2.strip().split("\n")[0]
76
+ name = re.sub(r"[^A-Za-zÀ-ÖØ-öø-ÿ' \-]", "", name).strip()
77
+ parts = [p for p in re.split(r"[ \-]+", name) if p]
78
+
79
+ if len(parts) < 2:
80
+ # final minimal fallback (rare)
81
+ return "Alex Morgan"
82
+
83
+ def fix_case(s):
84
+ # keep O'Connor-style capitalization
85
+ chunks = s.split("'")
86
+ chunks = [c.capitalize() for c in chunks]
87
+ return "'".join(chunks)
88
 
89
+ first, last = fix_case(parts[0]), fix_case(parts[1])
90
+ return f"{first} {last}"
91
+
92
+ rows = []
93
+ for rank in range(1, n + 1):
94
  # --- Structured fields sampled to look realistic ---
95
  followers = random.randint(5_000, 5_000_000)
96
  er = round(random.uniform(0.5, 15.0), 2) # %
97
  country = random.choice(countries)
98
  niche = random.choice(niches)
99
  reach = int(followers * random.uniform(0.25, 0.95))
 
100
  platform_token = random.choice(platforms) # e.g., 'youtube'
101
  region_hint = country.lower().replace(" ", "")
102
  source_file = f"{platform_token}_data_{region_hint}.csv" # <- first token = platform
103
  source_path = f"synthetic/{source_file}"
104
 
105
+ # --- Name via HF (FLAN-T5) ---
106
+ name = generate_person_name(country)
107
+
108
  rows.append([
109
  rank, name, followers, er, country, niche, reach, source_file, source_path
110
  ])
 
197
  article=(
198
  "**Project:** AI-Powered Influencer Recommender for Social Media Marketing\n\n"
199
  "**Models:**\n"
200
+ "- google/flan-t5-small for synthetic influencer full names (dataset creation)\n"
201
  "- sentence-transformers/all-MiniLM-L6-v2 for semantic embeddings (recommendations)\n\n"
202
  "**Dataset:** 1,200-row synthetic influencer dataset generated at runtime."
203
  ),