Spaces:

yoniif
/

final_assignment_yoni_gavriel

Sleeping

App Files Files Community

yoniif committed on Aug 13, 2025

Commit

48ed4a4

verified ·

1 Parent(s): fa98656

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -20

app.py CHANGED Viewed

@@ -6,6 +6,10 @@ import pandas as pd
 import gradio as gr
 from sentence_transformers import SentenceTransformer, util
 # =========================
 # ALWAYS-GENERATE (HF) DATASET
 # =========================
@@ -29,8 +33,9 @@ def create_synthetic_influencer_dataset(n=1200, out_csv="synthetic_influencers.c
             "Transformers not installed. Install with: pip install transformers torch"
         ) from e
-    # Small model for fast startup on Spaces; swap to "gpt2" if you prefer
-    name_gen = pipeline("text-generation", model="distilgpt2")
     countries = [
         "USA","UK","Canada","Australia","Brazil","India","France","Germany","Italy","Spain",
@@ -38,43 +43,68 @@ def create_synthetic_influencer_dataset(n=1200, out_csv="synthetic_influencers.c
     ]
     niches = [
         "Fashion","Travel","Food","Fitness","Tech","Beauty","Gaming","Music","Photography",
-        "Lifestyle","Education","Finance","Sports","Parenting","DIY", "Fashion + Lifestyle",
-        "Rech + Gaming", "Food + Fitness", "Beauty + Fashion", "Sports + Fitness"
     ]
     platforms = ["youtube", "instagram", "tiktok", "twitch", "x"]  # lowercase -> file prefix
-    rows = []
-    for rank in range(1, n + 1):
-        # --- Name via HF text generation (realistic first + last name) ---
-        prompt = "Generate a first and last name:"
         out = name_gen(
             prompt,
-            max_new_tokens=8,
-            num_return_sequences=1,
             do_sample=True,
-            temperature=0.9,
-            top_p=0.92
         )[0]["generated_text"]
-        # cleanup: remove the prompt text, keep letters & spaces only, single line
-        name = out.replace(prompt, "").strip().split("\n")[0]
-        name = re.sub(r"[^A-Za-z\s]", "", name).strip()
-        # ensure it has at least two words; light fallback if the model returns junk
-        if len(name.split()) < 2:
-            name = f"Alex {rank}son"  # simple readable fallback
         # --- Structured fields sampled to look realistic ---
         followers = random.randint(5_000, 5_000_000)
         er = round(random.uniform(0.5, 15.0), 2)  # %
         country = random.choice(countries)
         niche = random.choice(niches)
         reach = int(followers * random.uniform(0.25, 0.95))
         platform_token = random.choice(platforms)  # e.g., 'youtube'
         region_hint = country.lower().replace(" ", "")
         source_file = f"{platform_token}_data_{region_hint}.csv"  # <- first token = platform
         source_path = f"synthetic/{source_file}"
         rows.append([
             rank, name, followers, er, country, niche, reach, source_file, source_path
         ])
@@ -167,7 +197,7 @@ iface = gr.Interface(
     article=(
         "**Project:** AI-Powered Influencer Recommender for Social Media Marketing\n\n"
         "**Models:**\n"
-        "- text-generation (Hugging Face) for synthetic influencer full names (dataset creation)\n"
         "- sentence-transformers/all-MiniLM-L6-v2 for semantic embeddings (recommendations)\n\n"
         "**Dataset:** 1,200-row synthetic influencer dataset generated at runtime."
     ),

 import gradio as gr
 from sentence_transformers import SentenceTransformer, util
+# Optional: make HF downloads less flaky on Spaces
+os.environ.setdefault("HF_HUB_READ_TIMEOUT", "60")
+os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
 # =========================
 # ALWAYS-GENERATE (HF) DATASET
 # =========================
             "Transformers not installed. Install with: pip install transformers torch"
         ) from e
+    # Use an instruction-following model for names (much cleaner than distilgpt2)
+    # Small & CPU-friendly; you can bump to "google/flan-t5-base" if you want even better quality.
+    name_gen = pipeline("text2text-generation", model="google/flan-t5-small")
     countries = [
         "USA","UK","Canada","Australia","Brazil","India","France","Germany","Italy","Spain",
     ]
     niches = [
         "Fashion","Travel","Food","Fitness","Tech","Beauty","Gaming","Music","Photography",
+        "Lifestyle","Education","Finance","Sports","Parenting","DIY",
+        "Fashion + Lifestyle", "Tech + Gaming", "Food + Fitness", "Beauty + Fashion", "Sports + Fitness"
     ]
     platforms = ["youtube", "instagram", "tiktok", "twitch", "x"]  # lowercase -> file prefix
+    def generate_person_name(country: str) -> str:
+        # Prompt FLAN to return exactly one First Last
+        prompt = (
+            f"Generate one realistic influencer full name (first and last) from {country}. "
+            "Return only: Firstname Lastname."
+        )
         out = name_gen(
             prompt,
+            max_new_tokens=16,
+            num_beams=1,
             do_sample=True,
+            temperature=0.7,
+            top_p=0.9
         )[0]["generated_text"]
+        name = out.strip().split("\n")[0]
+        name = re.sub(r"[^A-Za-zÀ-ÖØ-öø-ÿ' \-]", "", name).strip()
+        # Normalize spacing and capitalization
+        parts = [p for p in re.split(r"[ \-]+", name) if p]
+        if len(parts) < 2:
+            # one gentle retry with slightly different prompt
+            prompt2 = "Give one realistic full human name. Return only: Firstname Lastname."
+            out2 = name_gen(prompt2, max_new_tokens=12, do_sample=True, temperature=0.7, top_p=0.9)[0]["generated_text"]
+            name = out2.strip().split("\n")[0]
+            name = re.sub(r"[^A-Za-zÀ-ÖØ-öø-ÿ' \-]", "", name).strip()
+            parts = [p for p in re.split(r"[ \-]+", name) if p]
+        if len(parts) < 2:
+            # final minimal fallback (rare)
+            return "Alex Morgan"
+        def fix_case(s):
+            # keep O'Connor-style capitalization
+            chunks = s.split("'")
+            chunks = [c.capitalize() for c in chunks]
+            return "'".join(chunks)
+        first, last = fix_case(parts[0]), fix_case(parts[1])
+        return f"{first} {last}"
+    rows = []
+    for rank in range(1, n + 1):
         # --- Structured fields sampled to look realistic ---
         followers = random.randint(5_000, 5_000_000)
         er = round(random.uniform(0.5, 15.0), 2)  # %
         country = random.choice(countries)
         niche = random.choice(niches)
         reach = int(followers * random.uniform(0.25, 0.95))
         platform_token = random.choice(platforms)  # e.g., 'youtube'
         region_hint = country.lower().replace(" ", "")
         source_file = f"{platform_token}_data_{region_hint}.csv"  # <- first token = platform
         source_path = f"synthetic/{source_file}"
+        # --- Name via HF (FLAN-T5) ---
+        name = generate_person_name(country)
         rows.append([
             rank, name, followers, er, country, niche, reach, source_file, source_path
         ])
     article=(
         "**Project:** AI-Powered Influencer Recommender for Social Media Marketing\n\n"
         "**Models:**\n"
+        "- google/flan-t5-small for synthetic influencer full names (dataset creation)\n"
         "- sentence-transformers/all-MiniLM-L6-v2 for semantic embeddings (recommendations)\n\n"
         "**Dataset:** 1,200-row synthetic influencer dataset generated at runtime."
     ),