Spaces:

yoniif
/

final_assignment_yoni_gavriel

Sleeping

App Files Files Community

final_assignment_yoni_gavriel / app.py

yoniif

Update app.py

d14dd35 verified 6 months ago

raw

history blame contribute delete

12.7 kB

	# app.py
	import os
	import re
	import random
	import pandas as pd
	import gradio as gr
	from sentence_transformers import SentenceTransformer, util

	# Hugging Face Hub download tuning for Spaces. `setdefault` means a value
	# already present in the environment always wins over these defaults.
	for _env_name, _env_default in (
	    ("HF_HUB_READ_TIMEOUT", "60"),
	    ("HF_HUB_ENABLE_HF_TRANSFER", "1"),
	):
	    os.environ.setdefault(_env_name, _env_default)
	del _env_name, _env_default
	# NOTE(review): these defaults are applied after `sentence_transformers` has
	# already been imported above; confirm the hub client still reads them at
	# download time rather than at import time.

	# =========================
	# ALWAYS-GENERATE (HF) DATASET
	# =========================
	SEED = 42  # fixed seed so repeated runs sample the same rows
	NUM_ROWS = 1200  # number of synthetic rows (requirement: at least 1000)
	CSV_PATH = "synthetic_influencers.csv"  # where the generated dataset is written
	random.seed(SEED)

	# Vocabularies sampled uniformly for every synthetic row. Order matters:
	# changing it changes which value a given seed selects.
	_COUNTRIES = (
	    "USA", "UK", "Canada", "Australia", "Brazil", "India", "France", "Germany", "Italy", "Spain",
	    "Israel", "UAE", "Netherlands", "Sweden", "Mexico",
	)
	_NICHES = (
	    "Fashion", "Travel", "Food", "Fitness", "Tech", "Beauty", "Gaming", "Music", "Photography",
	    "Lifestyle", "Education", "Finance", "Sports", "Parenting", "DIY",
	    "Fashion + Lifestyle", "Tech + Gaming", "Food + Fitness", "Beauty + Fashion", "Sports + Fitness",
	)
	# Lowercase platform tokens; each becomes the prefix of the "Source File" value.
	_PLATFORMS = ("youtube", "instagram", "tiktok", "twitch", "x")

	# Hand-written (first names, surnames) pools, used whenever the model output
	# for a country cannot be parsed into at least 8 first names and 8 surnames.
	_COUNTRY_FALLBACKS = {
	    "USA": (["Emma","Olivia","Ava","Mia","Noah","Liam","Ethan","James"],
	            ["Smith","Johnson","Brown","Davis","Miller","Wilson","Moore","Taylor"]),
	    "UK": (["Oliver","George","Amelia","Isla","Jack","Harry","Sophia","Emily"],
	           ["Smith","Jones","Taylor","Brown","Williams","Wilson","Johnson","Davies"]),
	    "Canada": (["Liam","Noah","William","Olivia","Emma","Charlotte","Benjamin","Lucas"],
	               ["Smith","Brown","Tremblay","Martin","Roy","Wilson","Taylor","Johnson"]),
	    "Australia": (["Oliver","Noah","William","Charlotte","Olivia","Isla","Jack","Ethan"],
	                  ["Smith","Jones","Williams","Brown","Wilson","Taylor","Anderson","Martin"]),
	    "Brazil": (["Gabriel","Miguel","Arthur","Heitor","Valentina","Laura","Julia","Maria"],
	               ["Silva","Santos","Oliveira","Souza","Rodrigues","Ferreira","Almeida","Lima"]),
	    "India": (["Arjun","Aarav","Ishaan","Vihaan","Aanya","Anaya","Diya","Isha"],
	              ["Sharma","Patel","Gupta","Khan","Singh","Kumar","Reddy","Iyer"]),
	    "France": (["Lucas","Louis","Hugo","Jules","Emma","Louise","Alice","Chloé"],
	               ["Martin","Bernard","Dubois","Thomas","Robert","Richard","Petit","Durand"]),
	    "Germany": (["Leon","Noah","Elias","Finn","Mia","Emilia","Hannah","Sophia"],
	                ["Müller","Schmidt","Schneider","Fischer","Weber","Meyer","Wagner","Becker"]),
	    "Italy": (["Alessandro","Leonardo","Lorenzo","Gabriele","Sofia","Giulia","Aurora","Alice"],
	              ["Rossi","Russo","Ferrari","Esposito","Bianchi","Romano","Colombo","Ricci"]),
	    "Spain": (["Hugo","Mateo","Martín","Lucas","Lucía","Martina","Sofía","Julia"],
	              ["García","Fernández","González","Rodríguez","López","Martínez","Sánchez","Pérez"]),
	    "Israel": (["Noa","Maya","Tamar","Yael","Ariel","Daniel","Itai","Lior"],
	               ["Cohen","Levi","Mizrahi","Peretz","Biton","Azulay","Dahan","Halevi"]),
	    "UAE": (["Mohammed","Omar","Yousef","Khalid","Fatima","Aisha","Mariam","Noora"],
	            ["Al Nahyan","Al Maktoum","Al Qasimi","Al Mazrouei","Al Marri","Al Ali","Al Hammadi","Al Ketbi"]),
	    "Netherlands": (["Daan","Sem","Luuk","Bram","Emma","Sophie","Julia","Tess"],
	                    ["de Jong","Jansen","de Vries","Bakker","Visser","Smit","Meijer","de Boer"]),
	    "Sweden": (["William","Liam","Noah","Ella","Alva","Alice","Maja","Astrid"],
	               ["Johansson","Andersson","Karlsson","Nilsson","Eriksson","Larsson","Olsson","Persson"]),
	    "Mexico": (["Santiago","Mateo","Sebastián","Emiliano","Sofía","Valentina","Regina","Camila"],
	               ["Hernández","García","Martínez","López","González","Pérez","Rodríguez","Sánchez"]),
	}

	# Compiled once at import time; used by _clean_list_text.
	_LIST_PUNCT_RE = re.compile(r"[\[\]\(\)\"']")
	_LIST_SPLIT_RE = re.compile(r"[,\n;]+")
	_NON_NAME_CHARS_RE = re.compile(r"[^A-Za-zÀ-ÖØ-öø-ÿ \-]")


	def _clean_list_text(txt):
	    """Parse free-form model output into a de-duplicated list of single-token names.

	    Example: "Emma, Olivia; Ava\\nMia" -> ["Emma", "Olivia", "Ava", "Mia"].
	    Brackets, parentheses and quotes are blanked out, entries are split on
	    commas / semicolons / newlines, and only the first whitespace-separated
	    token of each entry is kept (a hyphenated token survives intact).
	    """
	    txt = _LIST_PUNCT_RE.sub(" ", txt)
	    seen = set()
	    names = []
	    for part in _LIST_SPLIT_RE.split(txt):
	        part = _NON_NAME_CHARS_RE.sub("", part).strip()
	        if not 2 <= len(part) <= 20:
	            continue
	        name = part.split()[0].capitalize()
	        # The bound above is checked on the whole entry; re-check the kept
	        # token so an entry such as "J Smith" cannot yield the name "J".
	        if len(name) < 2:
	            continue
	        key = name.lower()
	        if key not in seen:  # case-insensitive de-dupe, first occurrence wins
	            seen.add(key)
	            names.append(name)
	    return names


	def _cap_name(s):
	    """Capitalize each apostrophe- or space-separated part of a name."""
	    sep = "'" if "'" in s else " "
	    return sep.join(piece.capitalize() for piece in s.split(sep))


	def create_synthetic_influencer_dataset(
	    n=1200,
	    out_csv="synthetic_influencers.csv",
	    seed=42,
	    *,
	    model_name="google/flan-t5-small",
	    text_generator=None,
	):
	    """Build a synthetic influencer dataset, write it to CSV and return it.

	    Columns: Rank, Name, Followers, ER, Country, Niche, Reach, Source File,
	    Source Path. A text2text model is asked, once per country, for lists of
	    common first names and surnames; "First Last" names are then sampled from
	    those pools. If the model call fails or fewer than 8 names of either kind
	    can be parsed, the built-in fallback pool for that country is used.

	    Args:
	        n: Number of rows to generate (rows are ranked 1..n).
	        out_csv: Path the CSV is written to (overwritten if present).
	        seed: Seed for the private random generator. The sampled sequence is
	            the same one the module-level ``random`` functions would give
	            after ``random.seed(seed)``, but the global state is left untouched.
	        model_name: Hugging Face model id passed to the ``text2text-generation``
	            pipeline. Ignored when ``text_generator`` is supplied.
	        text_generator: Optional callable used instead of loading a pipeline.
	            It is called as ``text_generator(prompt, max_new_tokens=128,
	            do_sample=False)`` and must return a sequence whose first item is
	            a mapping with a ``"generated_text"`` string.

	    Returns:
	        pandas.DataFrame with the generated rows (also saved to ``out_csv``).

	    Raises:
	        RuntimeError: If ``text_generator`` is None and ``transformers`` cannot
	            be imported.
	    """
	    rng = random.Random(seed)

	    if text_generator is None:
	        # Imported lazily so that callers injecting their own generator do not
	        # need transformers installed.
	        try:
	            from transformers import pipeline
	        except ImportError as e:
	            raise RuntimeError(
	                "Transformers not installed. Install with: pip install transformers torch"
	            ) from e
	        text_generator = pipeline("text2text-generation", model=model_name)

	    def ask(prompt):
	        """Run one deterministic (non-sampling) generation and return its text."""
	        return text_generator(prompt, max_new_tokens=128, do_sample=False)[0]["generated_text"]

	    pools = {}  # country -> (first names, surnames); filled on first use

	    def get_name_pools(country):
	        """Return the (first names, surnames) pools for a country, querying the model once."""
	        if country in pools:
	            return pools[country]
	        try:
	            firsts = _clean_list_text(ask(
	                f"List 20 common first names in {country}. "
	                "Return comma-separated names only."
	            ))
	            lasts = _clean_list_text(ask(
	                f"List 20 common surnames in {country}. "
	                "Return comma-separated names only."
	            ))
	        except Exception:
	            # Deliberate best-effort: any generation/parsing failure falls
	            # through to the hand-written pools below.
	            firsts, lasts = [], []
	        if len(firsts) < 8 or len(lasts) < 8:
	            firsts, lasts = _COUNTRY_FALLBACKS.get(country, _COUNTRY_FALLBACKS["USA"])
	        pools[country] = (firsts, lasts)
	        return firsts, lasts

	    rows = []
	    for rank in range(1, n + 1):
	        # The order of the rng calls below is part of the reproducibility contract.
	        followers = rng.randint(5_000, 5_000_000)
	        er = round(rng.uniform(0.5, 15.0), 2)  # engagement rate, in percent
	        country = rng.choice(_COUNTRIES)
	        niche = rng.choice(_NICHES)
	        reach = int(followers * rng.uniform(0.25, 0.95))
	        platform_token = rng.choice(_PLATFORMS)
	        region_hint = country.lower().replace(" ", "")
	        # First '_'-separated token of the file name is the platform.
	        source_file = f"{platform_token}_data_{region_hint}.csv"
	        source_path = f"synthetic/{source_file}"

	        firsts, lasts = get_name_pools(country)
	        first = rng.choice(firsts)
	        last = rng.choice(lasts)
	        name = f"{_cap_name(first)} {_cap_name(last)}"

	        rows.append([
	            rank, name, followers, er, country, niche, reach, source_file, source_path,
	        ])

	    df_syn = pd.DataFrame(rows, columns=[
	        "Rank", "Name", "Followers", "ER", "Country", "Niche", "Reach", "Source File", "Source Path",
	    ])
	    df_syn.to_csv(out_csv, index=False)
	    return df_syn

	def load_or_build_synthetic():
	    """Regenerate the synthetic CSV from scratch and return it as a DataFrame.

	    Any existing file at CSV_PATH is deleted first so the data served always
	    comes from a fresh generation run. The CSV is then read back and missing
	    values are replaced with empty strings.
	    """
	    stale_copy_present = os.path.exists(CSV_PATH)
	    if stale_copy_present:
	        os.remove(CSV_PATH)

	    # Only the file written to CSV_PATH is used; the returned frame is discarded.
	    create_synthetic_influencer_dataset(n=NUM_ROWS, out_csv=CSV_PATH, seed=SEED)

	    return pd.read_csv(CSV_PATH).fillna("")

	# Generate the synthetic dataset and load it into memory
	df = load_or_build_synthetic()

	# =========================
	# FEATURE ENGINEERING
	# =========================
	# Platform = text before the first '_' in "Source File", capitalized
	_source_prefix = df["Source File"].astype(str).str.split("_").str[0]
	df["Platform"] = _source_prefix.str.capitalize()

	# One text line per influencer (platform included) to feed the embedding model
	profile_fields = ["Name", "Platform", "Niche", "Country"]
	_field_separator = " - "
	df["profile_text"] = df[profile_fields].agg(_field_separator.join, axis=1)

	# =========================
	# EMBEDDINGS & RECOMMENDER
	# =========================
	model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
	# Encoded once at start-up; queries are compared against these at request time
	_profile_texts = df["profile_text"].tolist()
	influencer_embeddings = model.encode(_profile_texts, convert_to_tensor=True)

	def recommend_influencers(brand_description, top_k=3):
	    """Return the influencers whose profile text best matches a brand brief.

	    The brief is embedded with the module-level sentence-transformer model and
	    compared by cosine similarity against the precomputed influencer embeddings.

	    Args:
	        brand_description: Free-text description of the campaign or brand.
	        top_k: Maximum number of recommendations to return (default 3).
	            Clamped to the number of rows in the dataset.

	    Returns:
	        A list of dicts, best match first, each with the keys "Name",
	        "Platform", "Niche", "Country", "ER", "Followers" and "Reach".
	        The list is empty when the brief is None/blank or top_k is not positive.
	    """
	    # A blank brief has no meaningful embedding; return nothing rather than
	    # an arbitrary ranking.
	    if brand_description is None or not str(brand_description).strip():
	        return []
	    if top_k <= 0:
	        return []

	    # topk raises if asked for more items than exist, so clamp to the dataset size.
	    k = min(int(top_k), len(df))
	    if k == 0:
	        return []

	    query_embedding = model.encode(brand_description, convert_to_tensor=True)
	    cosine_scores = util.pytorch_cos_sim(query_embedding, influencer_embeddings)[0]
	    top_indices = cosine_scores.topk(k).indices.tolist()

	    recs = []
	    for idx in top_indices:
	        row = df.iloc[idx]
	        reach = row.get("Reach", "")
	        recs.append({
	            "Name": row["Name"],
	            "Platform": row.get("Platform", ""),
	            "Niche": row["Niche"],
	            "Country": row["Country"],
	            "ER": f"{row.get('ER', 'N/A')}",
	            "Followers": int(row["Followers"]),
	            # Only an all-digit value is converted; anything else is passed through as-is.
	            "Reach": int(row["Reach"]) if str(reach).isdigit() else reach,
	        })
	    return recs

	def format_output(brand_input):
	    """Render the recommendations for a brand brief as a string of HTML cards.

	    One card is produced per recommendation, numbered from 1. The reach line
	    is included only when the recommendation's "Reach" value is an int.
	    """
	    cards = []
	    for position, match in enumerate(recommend_influencers(brand_input), start=1):
	        reach_value = match['Reach']
	        if isinstance(reach_value, int):
	            reach_html = f"<p style='margin:0.5em 0;'><strong>Reach:</strong> {int(reach_value):,}</p>"
	        else:
	            reach_html = ""
	        cards.append(f"""
	<div style='background:#ffffff; padding:1em; margin-bottom:1em; border-radius:8px; box-shadow:0 2px 6px rgba(0,0,0,0.1);'>
	<h3 style='margin:0; color:#0a1f44;'>🎯 {position}. {match['Name']} <span style='font-size:0.9em; color:#555;'>({match['Platform']})</span></h3>
	<p style='margin:0.5em 0;'><strong>Niche:</strong> {match['Niche']}</p>
	<p style='margin:0.5em 0;'><strong>Country:</strong> {match['Country']}</p>
	<p style='margin:0.5em 0;'><strong>Engagement:</strong> {match['ER']}%</p>
	<p style='margin:0.5em 0;'><strong>Followers:</strong> {match['Followers']:,}</p>
	{reach_html}
	</div>
	""")
	    return "".join(cards)

	# =========================
	# GRADIO UI
	# =========================
	# Input widget: free-text campaign / brand brief
	_brief_box = gr.Textbox(
	    lines=3,
	    label="🗣️ Describe Your Campaign or Brand",
	    placeholder="e.g., Targeted fitness brand outreach for Gen Z"
	)

	# Output widget: the HTML cards produced by format_output
	_results_panel = gr.HTML(label="📈 Recommended Influencers")

	_ui_description = (
	    "Enhance your social media marketing by pinpointing the perfect influencers for your niche.\n\n"
	    "🛠️ AI-driven matching based on niche, audience, and engagement metrics — get top 3 influencer recommendations instantly."
	)

	_ui_article = (
	    "Project: AI-Powered Influencer Recommender for Social Media Marketing\n\n"
	    "Models:\n"
	    "- google/flan-t5-small to synthesize country-specific first/last name pools\n"
	    "- sentence-transformers/all-MiniLM-L6-v2 for semantic embeddings (recommendations)\n\n"
	    "Dataset: 1,200-row synthetic influencer dataset generated at runtime."
	)

	# Clickable sample briefs shown under the input box
	_sample_briefs = [
	    ["Sustainable fashion campaign targeting eco-conscious millennials"],
	    ["Tech gadget launch aimed at early adopters in the US"],
	    ["Healthy snack brand outreach for fitness enthusiasts"],
	    ["Luxury travel experiences for affluent couples in Europe"]
	]

	iface = gr.Interface(
	    fn=format_output,
	    inputs=_brief_box,
	    outputs=_results_panel,
	    title="💡 InfluencerMatch.AI: Targeted Influencer Discovery for Social Media Marketing",
	    description=_ui_description,
	    article=_ui_article,
	    examples=_sample_briefs,
	    theme="soft",
	    flagging_mode="never"
	)

	# Start the web app only when this file is executed directly
	if __name__ == "__main__":
	    iface.launch(share=True)