# app.py
# ============================================================
# VentureMatch — Tinder-style Startup Matcher (HF Spaces / Gradio 6.x)
# ✅ Embeddings (.npy) + FAISS (cosine) for fast search
# ✅ Diverse sampling so same query returns different deck
# ✅ Optional LLM (chat_completion) ONLY for insight/summary (never blocks search)
# ============================================================

import os
import re
import math
import time
import json
import random

import numpy as np
import pandas as pd
import gradio as gr
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import faiss

# Optional LLM via HF Inference (CHAT API) — absence must not break the app.
try:
    from huggingface_hub import InferenceClient
    HF_OK = True
except Exception:
    HF_OK = False

# -------------------------
# CONFIG
# -------------------------
DATASET_REPO = "Yoav-omer/startups"
EMB_PATH = "embeddings_minilm.npy"
EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"  # must match embeddings dim (384)
CANDIDATES_K = 800   # how many nearest neighbours FAISS returns per query
DECK_SIZE = 10       # cards shown per search

# Optional: LLM (only if HF_TOKEN exists). Used for insight, not for retrieval.
LLM_MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"  # chat-friendly on HF Inference
LLM_MAX_TOKENS = 220
LLM_TEMPERATURE = 0.7
LLM_TIMEOUT_S = 18

RNG_SEED = 42
random.seed(RNG_SEED)
np.random.seed(RNG_SEED)

# -------------------------
# LOAD DATASET
# -------------------------
print("🔄 Initializing VentureMatch Engine...")
ds = load_dataset(DATASET_REPO)
# Prefer the "train" split; otherwise fall back to whatever split exists.
split_name = "train" if "train" in ds else list(ds.keys())[0]
df_raw = ds[split_name].to_pandas()

# -------------------------
# COLUMN NORMALIZATION
# -------------------------
# Map the various upstream column spellings onto one canonical schema.
rename_map = {
    "startup_id": "entity_id",
    "id": "entity_id",
    "burn": "BURN_RATE",
    "BURN": "BURN_RATE",
    "ARR_usd": "ARR",
    "arr": "ARR",
    "valuation": "VALUE",
    "valuation_usd": "VALUE",
    "competitors": "competitors_count",
}
df_raw = df_raw.rename(columns={k: v for k, v in rename_map.items() if k in df_raw.columns})

required = ["entity_id", "name", "sector", "stage", "business_model", "ask_usd", "pitch"]
missing = [c for c in required if c not in df_raw.columns]
if missing:
    raise ValueError(f"Dataset is missing required column(s): {missing}")

# Optional columns get neutral defaults so downstream code can assume they exist.
optional_defaults = {
    "elevator_speech": "",
    "keywords": "",
    "ARR": np.nan,
    "BURN_RATE": np.nan,
    "VALUE": np.nan,
    "competitors_count": np.nan,
}
for c, d in optional_defaults.items():
    if c not in df_raw.columns:
        df_raw[c] = d

for c in ["ask_usd", "ARR", "BURN_RATE", "VALUE", "competitors_count"]:
    df_raw[c] = pd.to_numeric(df_raw[c], errors="coerce")

# -------------------------
# LOAD EMBEDDINGS + FAISS
# -------------------------
if not os.path.exists(EMB_PATH):
    raise FileNotFoundError(f"❌ Missing {EMB_PATH}. Upload it to your Space repo root.")

emb = np.load(EMB_PATH).astype(np.float32)
if emb.shape[0] != len(df_raw):
    raise ValueError(
        f"❌ Embeddings rows ({emb.shape[0]}) != dataset rows ({len(df_raw)}).\n"
        "Your .npy must match dataset row order EXACTLY."
    )

# Cosine similarity via dot-product on L2-normalized vectors.
emb /= (np.linalg.norm(emb, axis=1, keepdims=True) + 1e-12)
index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb)

# Query-side embedding model (CPU is enough for single queries).
embedder = SentenceTransformer(EMBED_MODEL_ID, device="cpu")
print(f"✅ Loaded: {len(df_raw)} rows | dim={emb.shape[1]} | FAISS={index.ntotal}")

# -------------------------
# OPTIONAL LLM CLIENT (SAFE)
# -------------------------
HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
llm_client = None
if HF_OK and HF_TOKEN:
    try:
        llm_client = InferenceClient(token=HF_TOKEN)
        print("✅ LLM enabled via HF Inference (chat_completion).")
    except Exception as e:
        llm_client = None
        print(f"⚠️ LLM disabled: {e}")

# -------------------------
# LISTS FOR UI
# -------------------------
SECTOR_LIST = sorted(df_raw["sector"].dropna().astype(str).unique().tolist())
STAGE_LIST = sorted(df_raw["stage"].dropna().astype(str).unique().tolist())
BMODEL_LIST = sorted(df_raw["business_model"].dropna().astype(str).unique().tolist())
# -------------------------
# HELPERS
# -------------------------
# Common English words excluded from thesis tokenization.
STOPWORDS = set(["the", "a", "an", "and", "or", "to", "for", "of", "in", "on",
                 "with", "by", "from", "at", "as", "is", "are"])


def clean_text(s: str) -> str:
    """Coerce to str, collapse whitespace runs to single spaces, trim; NaN/None -> ''."""
    s = "" if pd.isna(s) else str(s)
    return re.sub(r"\s+", " ", s).strip()


def format_currency(value) -> str:
    """Human-friendly USD string: $1.20B / $3.40M / $150K / $75; 'N/A' on bad input or NaN."""
    try:
        v = float(value)
    except (TypeError, ValueError):  # fix: was a bare `except:` — only catch conversion failures
        return "N/A"
    if math.isnan(v):
        return "N/A"
    if v >= 1e9:
        return f"${v/1e9:.2f}B"
    if v >= 1e6:
        return f"${v/1e6:.2f}M"
    if v >= 1e3:
        return f"${v/1e3:.0f}K"
    return f"${v:.0f}"


def clamp01(x: float) -> float:
    """Clamp x into [0, 1]."""
    return max(0.0, min(1.0, x))


def similarity_to_pct(sim: float) -> int:
    """Map cosine similarity from the useful band [0.25, 0.80] linearly onto 0–100%."""
    pct = (sim - 0.25) / (0.80 - 0.25)
    return int(round(100 * clamp01(pct)))


def tokenize_reason(query: str) -> list:
    """Return up to 8 lowercased, de-duplicated keywords (len > 2, stopwords removed)."""
    q = re.sub(r"[^a-zA-Z0-9\s\-]", " ", query.lower())
    toks = [t for t in q.split() if t and t not in STOPWORDS and len(t) > 2]
    seen, out = set(), []
    for t in toks:
        if t not in seen:
            out.append(t)
            seen.add(t)
    return out[:8]
def heuristic_insight(row: dict, query: str) -> str:
    """Cheap, deterministic fallback insight: keyword overlap + key financials."""
    toks = tokenize_reason(query)
    blob = f"{row.get('pitch','')} {row.get('keywords','')} {row.get('elevator_speech','')}".lower()
    hits = [t for t in toks if t in blob][:4]
    # NOTE: the conditional binds the whole expression — fallback text replaces the prefix too.
    reason = "Matches: " + ", ".join(hits) if hits else "Semantically aligned with your thesis."
    return (
        f"{reason} • Ask {format_currency(row.get('ask_usd'))}"
        f" • ARR {format_currency(row.get('ARR'))}"
        f" • Value {format_currency(row.get('VALUE'))}"
    )


def llm_insight(row: dict, query: str) -> str:
    """
    Never blocks the app:
    - If LLM is not available or fails -> heuristic fallback.
    - Uses chat_completion (conversational task).
    """
    if llm_client is None:
        return heuristic_insight(row, query)

    # NOTE(review): internal line breaks of this prompt were lost in extraction;
    # layout below is reconstructed — content is verbatim.
    prompt = f"""
You are a VC analyst. Given a user thesis and a startup profile, write 1 short insight:
- 1 sentence why it's a match (or not)
- Mention 1 key risk or missing detail
Keep it under 35 words.

User thesis: {query}

Startup:
Name: {row.get('name')}
Sector: {row.get('sector')}
Stage: {row.get('stage')}
Business model: {row.get('business_model')}
Ask: {row.get('ask_usd')}
ARR: {row.get('ARR')}
Burn/mo: {row.get('BURN_RATE')}
Pitch: {row.get('pitch')}
""".strip()

    try:
        # chat_completion API (supported task: conversational)
        resp = llm_client.chat_completion(
            model=LLM_MODEL_ID,
            messages=[
                {"role": "system", "content": "You are concise, practical, and skeptical."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=LLM_MAX_TOKENS,
            temperature=LLM_TEMPERATURE,
            timeout=LLM_TIMEOUT_S,
        )
        text = resp.choices[0].message.content.strip()
        text = re.sub(r"\s+", " ", text)
        return text[:300] if text else heuristic_insight(row, query)
    except Exception:
        return heuristic_insight(row, query)


def make_cover_svg(name: str, sector: str, stage: str) -> str:
    """Inline SVG cover art for a card (truncated name/sector/stage).

    NOTE(review): the original SVG markup was stripped during extraction; the
    markup below is a minimal reconstruction — confirm against the deployed Space.
    """
    name = clean_text(name)[:26]
    sector = clean_text(sector)[:18]
    stage = clean_text(stage)[:14]
    return f"""
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 560 180" width="100%" height="180">
  <defs>
    <linearGradient id="vmg" x1="0" y1="0" x2="1" y2="1">
      <stop offset="0%" stop-color="#FD297B"/>
      <stop offset="100%" stop-color="#4CC9F0"/>
    </linearGradient>
  </defs>
  <rect width="560" height="180" rx="18" fill="url(#vmg)"/>
  <text x="28" y="92" font-size="34" font-weight="800" fill="#ffffff">{name}</text>
  <text x="28" y="132" font-size="18" font-weight="600" fill="rgba(255,255,255,0.85)">{sector} • {stage}</text>
</svg>
""".strip()
def card_html(row: dict, sim: float, query: str, insight_text: str, stamp: str = "") -> str:
    """Render one startup as a swipe-card HTML fragment.

    NOTE(review): the original HTML markup was stripped during extraction; the
    structure below is reconstructed from the CSS class names (.vm-*) and the
    surviving field order — confirm against the deployed Space.
    """
    pct = similarity_to_pct(sim)
    cover = make_cover_svg(row.get("name", ""), row.get("sector", ""), row.get("stage", ""))
    comp = row.get("competitors_count")
    comp_txt = "N/A" if pd.isna(comp) else str(int(comp))

    stamp_html = ""
    if stamp == "LIKE":
        stamp_html = '<div class="stamp like">INVEST</div>'
    elif stamp == "NOPE":
        stamp_html = '<div class="stamp nope">PASS</div>'

    return f"""
<div class="vm-wrap">
  <div class="vm-card">
    {stamp_html}
    <div class="vm-top">
      <span class="pill">{pct}% MATCH</span>
      <span class="id">#{row.get("entity_id","")}</span>
    </div>
    <div class="vm-cover">{cover}</div>
    <div class="vm-body">
      <div class="name">{row.get("name","")}</div>
      <div class="meta">{row.get("sector","")} • {row.get("stage","")} • <span class="bmodel">{row.get("business_model","")}</span></div>
      <div class="vm-quote">“{clean_text(row.get("pitch",""))}”</div>
      <div class="vm-grid">
        <div class="vm-stat"><div class="k">Ask</div><div class="v">{format_currency(row.get("ask_usd"))}</div></div>
        <div class="vm-stat"><div class="k">ARR</div><div class="v">{format_currency(row.get("ARR"))}</div></div>
        <div class="vm-stat"><div class="k">Burn/Mo</div><div class="v">{format_currency(row.get("BURN_RATE"))}</div></div>
        <div class="vm-stat"><div class="k">Value</div><div class="v">{format_currency(row.get("VALUE"))}</div></div>
        <div class="vm-stat"><div class="k">Competitors</div><div class="v">{comp_txt}</div></div>
      </div>
      <div class="vm-insight">✨ AI Insight: {clean_text(insight_text)}</div>
    </div>
  </div>
</div>
""".strip()


def semantic_search(query: str):
    """Embed the query and return (scores, row indices) of the top CANDIDATES_K neighbours."""
    qv = embedder.encode([query], normalize_embeddings=True).astype(np.float32)
    scores, idxs = index.search(qv, CANDIDATES_K)
    return scores[0], idxs[0]


def apply_filters(df: pd.DataFrame, sectors, stages, bmodels, ask_min, ask_max):
    """Filter candidates by sector/stage/business-model lists and ask range.

    Rows with NaN ask_usd are deliberately kept so the range filter never
    wipes out results whose ask is simply unknown.
    """
    out = df.copy()
    if sectors:
        out = out[out["sector"].isin(sectors)]
    if stages:
        out = out[out["stage"].isin(stages)]
    if bmodels:
        out = out[out["business_model"].isin(bmodels)]
    out = out[(out["ask_usd"].isna()) | ((out["ask_usd"] >= ask_min) & (out["ask_usd"] <= ask_max))]
    return out


def diverse_sample(df: pd.DataFrame, n: int, diversity: float) -> pd.DataFrame:
    """
    diversity in [0..1]
      0 -> deterministic top-n
      1 -> strong randomness from top pool
    """
    df = df.sort_values("similarity", ascending=False).copy()
    if len(df) <= n:
        return df
    if diversity <= 0.05:
        return df.head(n)

    # Fix: pool must never be smaller than n, or rng.choice(replace=False) raises.
    pool = df.head(min(len(df), max(140, n))).copy()
    sims = pool["similarity"].to_numpy()

    # Temperature controls randomness: softmax over similarity scores.
    temp = 0.06 + 0.55 * float(diversity)
    w = np.exp((sims - sims.max()) / max(1e-6, temp))
    w = w / (w.sum() + 1e-12)

    # Time-based seed so the same thesis yields a different deck each search.
    rng = np.random.default_rng(int(time.time() * 1000) % (2**32 - 1))
    chosen = rng.choice(len(pool), size=n, replace=False, p=w)
    sampled = pool.iloc[chosen].copy()
    sampled = sampled.sort_values("similarity", ascending=False)
    return sampled


def portfolio_to_table(portfolio):
    """Flatten the portfolio (list of row dicts) into rows for the gr.Dataframe."""
    rows = []
    for p in (portfolio or []):
        rows.append([
            p.get("entity_id", ""),
            p.get("name", ""),
            p.get("sector", ""),
            p.get("stage", ""),
            p.get("business_model", ""),
            format_currency(p.get("ask_usd")),
            float(p.get("similarity", 0.0)),
        ])
    return rows
# -------------------------
# MAIN SEARCH
# -------------------------
def start_search(user_query, sectors, stages, bmodels, ask_min, ask_max, diversity, portfolio_state):
    """Run retrieval + filtering + diverse sampling and render the first card.

    Returns the 8-tuple wired to:
    (onboarding visibility, matching visibility, card html, deck, position,
     portfolio, status html, thesis info markdown).

    NOTE(review): the HTML of the status strings was stripped during
    extraction and is reconstructed using the .vm-error CSS class.
    """
    q = clean_text(user_query)
    if len(q) < 6:
        return (
            gr.update(visible=True), gr.update(visible=False), "", [], 0, portfolio_state,
            "<div class='vm-error'>Write a longer thesis (≥ 6 chars).</div>", ""
        )

    # Semantic retrieval
    scores, idxs = semantic_search(q)
    cand = df_raw.iloc[idxs].copy()
    cand["similarity"] = scores

    # Filters
    cand = apply_filters(cand, sectors, stages, bmodels, float(ask_min), float(ask_max))
    if cand.empty:
        return (
            gr.update(visible=True), gr.update(visible=False), "", [], 0, portfolio_state,
            "<div class='vm-error'>No matches. Try broader filters.</div>", ""
        )

    deck_df = diverse_sample(cand, DECK_SIZE, diversity=float(diversity))
    deck = deck_df.to_dict("records")

    # Only the visible card gets an (optionally LLM) insight — never the whole deck.
    first = deck[0]
    insight = llm_insight(first, q)
    html = card_html(first, float(first["similarity"]), q, insight)

    thesis_info = f"**Search mode:** Embeddings + FAISS • **Diversity:** {float(diversity):.2f}"
    if llm_client is not None:
        thesis_info += " • **AI Insight:** LLM enabled"
    else:
        thesis_info += " • **AI Insight:** heuristic"

    return (
        gr.update(visible=False),
        gr.update(visible=True),
        html,
        deck,
        0,
        portfolio_state,
        "",  # status
        thesis_info,
    )
def swipe_action(deck, pos, action, query, portfolio):
    """Advance the deck by one card; on INVEST, append the current card to portfolio.

    Returns (card/status html, new position, action_row visibility update, portfolio).
    NOTE(review): the HTML in the status strings was stripped during extraction
    and is reconstructed from the CSS classes (.vm-error / .vm-end).
    """
    if not deck:
        return "<div class='vm-error'>No deck loaded.</div>", pos, gr.update(visible=True), portfolio
    pos = int(pos or 0)
    if pos >= len(deck):
        return "<div class='vm-end'>🏁 End of deck. Start a new search.</div>", pos, gr.update(visible=False), portfolio

    current = deck[pos]
    if action == "INVEST":
        portfolio = (portfolio or [])
        portfolio.append(dict(current))

    stamp = "LIKE" if action == "INVEST" else "NOPE"
    new_pos = pos + 1
    if new_pos >= len(deck):
        end_html = "<div class='vm-end'>🏁 You reached the end. Check your portfolio below.</div>"
        return end_html, new_pos, gr.update(visible=False), portfolio

    nxt = deck[new_pos]
    insight = llm_insight(nxt, query)
    html = card_html(nxt, float(nxt["similarity"]), query, insight, stamp=stamp)
    return html, new_pos, gr.update(visible=True), portfolio


def remove_selected(portfolio, txt):
    """Drop portfolio entries at the comma-separated integer indices in txt."""
    portfolio = portfolio or []
    txt = "" if txt is None else str(txt)
    parts = [p.strip() for p in txt.split(",") if p.strip()]
    idxs = set()
    for p in parts:
        if p.isdigit():
            idxs.add(int(p))
    new_port = [p for i, p in enumerate(portfolio) if i not in idxs]
    return new_port, portfolio_to_table(new_port)


def clear_portfolio():
    """Reset the portfolio state and its table display."""
    return [], []


# -------------------------
# CSS (Tinder-like)
# -------------------------
CSS = """
:root{
  --pink:#FD297B; --red:#FF5864; --cyan:#4CC9F0; --bg1:#0b0b10;
  --card: rgba(255,255,255,0.92);
  --shadow: 0 30px 70px rgba(0,0,0,0.25);
}
body{
  background:
    radial-gradient(1200px 700px at 20% 20%, rgba(253,41,123,0.20), transparent 60%),
    radial-gradient(900px 600px at 80% 30%, rgba(76,201,240,0.18), transparent 55%),
    linear-gradient(180deg, #0b0b10 0%, #0f111a 70%, #0b0b10 100%) !important;
}
.vm-hero{ padding: 18px 14px 8px 14px; border-radius: 18px; background: rgba(255,255,255,0.04); border: 1px solid rgba(255,255,255,0.08); }
.vm-wrap { display:flex; justify-content:center; padding: 10px 0 16px 0; }
.vm-card {
  width: min(580px, 95vw); border-radius: 30px; background: var(--card);
  box-shadow: var(--shadow); border: 1px solid rgba(255,255,255,0.12);
  overflow: hidden; position: relative; backdrop-filter: blur(8px);
}
.vm-top{
  display:flex; justify-content:space-between; align-items:center; padding: 14px 18px;
  background: linear-gradient(90deg, rgba(253,41,123,0.16), rgba(76,201,240,0.14));
}
.pill{
  font-weight: 900; font-size: 12px; letter-spacing: 0.8px; padding: 7px 12px;
  border-radius: 999px; color: #fff; background: linear-gradient(45deg, var(--pink), var(--red));
  box-shadow: 0 10px 22px rgba(253,41,123,0.28);
}
.id{ color: rgba(0,0,0,0.55); font-size: 12px; font-weight: 700; }
.vm-cover { background: #fff; padding: 12px 12px 0px 12px; }
.vm-body { padding: 16px 18px 18px 18px; }
.name { font-size: 32px; font-weight: 1000; letter-spacing: -0.7px; color: #0c0c10; }
.meta { margin-top: 4px; font-size: 14px; color: rgba(0,0,0,0.65); font-weight: 800; }
.bmodel { color: var(--red); }
.vm-quote{
  margin-top: 14px; background: rgba(0,0,0,0.04); border: 1px solid rgba(0,0,0,0.06);
  border-radius: 18px; padding: 14px 14px; font-size: 15px; line-height: 1.55; color: rgba(0,0,0,0.82);
}
.vm-grid{ margin-top: 14px; display:grid; grid-template-columns: 1fr 1fr; gap: 10px; }
.vm-stat{ background: rgba(255,255,255,0.78); border: 1px solid rgba(0,0,0,0.06); border-radius: 16px; padding: 10px 12px; }
.vm-stat .k{ font-size: 10px; font-weight: 1000; letter-spacing: 0.9px; text-transform: uppercase; color: rgba(0,0,0,0.48); }
.vm-stat .v{ margin-top: 2px; font-size: 16px; font-weight: 1000; color: rgba(0,0,0,0.86); }
.vm-insight{
  margin-top: 14px; border-radius: 16px; padding: 12px 14px; font-size: 13px; line-height: 1.5;
  background: rgba(255,88,100,0.10); border: 1px dashed rgba(255,88,100,0.60); color: rgba(0,0,0,0.78);
}
.vm-error{
  padding: 14px 16px; border-radius: 16px; background: rgba(255,88,100,0.16);
  border: 1px solid rgba(255,88,100,0.28); color: rgba(255,255,255,0.92); font-weight: 800; text-align:center;
}
.vm-end{
  padding: 22px 16px; border-radius: 18px; background: rgba(76,201,240,0.14);
  border: 1px solid rgba(76,201,240,0.28); color: rgba(255,255,255,0.92); font-weight: 900; text-align:center;
}
.stamp{
  position:absolute; top: 102px; left: 22px; transform: rotate(-14deg);
  font-size: 34px; font-weight: 1000; letter-spacing: 1px; padding: 10px 14px;
  border-radius: 14px; opacity: 0.0; animation: pop 0.55s ease forwards; z-index: 10;
}
.stamp.like { border: 6px solid rgba(50,205,50,0.88); color: rgba(50,205,50,0.92); }
.stamp.nope { border: 6px solid rgba(255,59,92,0.88); color: rgba(255,59,92,0.92); }
@keyframes pop{
  0%   { opacity: 0.0; transform: translateY(8px) rotate(-14deg) scale(0.92); }
  60%  { opacity: 1.0; transform: translateY(0px) rotate(-14deg) scale(1.05); }
  100% { opacity: 0.0; transform: translateY(-2px) rotate(-14deg) scale(1.02); }
}
"""
# -------------------------
# UI
# -------------------------
with gr.Blocks() as demo:
    deck_state = gr.State([])
    pos_state = gr.State(0)
    portfolio_state = gr.State([])
    last_query_state = gr.State("")

    with gr.Column(elem_id="onboarding") as onboarding_view:
        gr.Markdown(
            """
# 💘 VentureMatch
### Tinder-style startup search (Embeddings + FAISS)
Write a thesis → filter → get a swipe deck. Same thesis twice? You'll still get **varied** results.
""".strip()
        )
        with gr.Row():
            with gr.Column(scale=2):
                query_input = gr.Textbox(
                    label="Investment Thesis",
                    placeholder="Example: 'Cybersecurity for SMBs, low burn, Seed, B2B SaaS'",
                    lines=4,
                )
                gr.Examples(
                    examples=[
                        ["Cybersecurity for small businesses, phishing defense, low burn"],
                        ["ClimateTech for factories: carbon accounting + compliance"],
                        ["HealthTech remote monitoring for elderly patients, B2B SaaS"],
                    ],
                    inputs=query_input,
                    label="Quick Starters (1-click)",
                )
            with gr.Column(scale=1):
                sectors_input = gr.Dropdown(choices=SECTOR_LIST, multiselect=True, label="Sector (multi-select)")
                stages_input = gr.Dropdown(choices=STAGE_LIST, multiselect=True, label="Stage (multi-select)")
                bmodels_input = gr.Dropdown(choices=BMODEL_LIST, multiselect=True, label="Business Model (multi-select)")
                diversity = gr.Slider(
                    minimum=0.0, maximum=1.0, value=0.50, step=0.05,
                    label="Result Diversity",
                    info="Higher = more different results for same query.",
                )
        with gr.Accordion("Advanced Filters", open=False):
            with gr.Row():
                ask_min = gr.Number(value=0, label="Ask min (USD)")
                ask_max = gr.Number(value=10_000_000, label="Ask max (USD)")
        thesis_info = gr.Markdown("")
        status_box = gr.HTML("")
        start_btn = gr.Button("FIND STARTUPS 🔥", variant="primary")

    with gr.Column(visible=False) as matching_view:
        display_area = gr.HTML()
        with gr.Row(visible=True) as action_row:
            pass_btn = gr.Button("PASS ❌", variant="secondary")
            invest_btn = gr.Button("INVEST 💚", variant="primary")
        back_btn = gr.Button("⬅ Back to Search", variant="secondary")

    gr.Markdown("## 🏆 Portfolio")
    portfolio_table = gr.Dataframe(
        headers=["entity_id", "name", "sector", "stage", "business_model", "ask", "similarity"],
        datatype=["str", "str", "str", "str", "str", "str", "number"],
        interactive=False,
    )
    with gr.Row():
        remove_rows = gr.Textbox(label="Remove rows (indices)", placeholder="Example: 0,2,3")
        remove_btn = gr.Button("Remove Selected", variant="secondary")
        clear_btn = gr.Button("Clear Portfolio", variant="stop")

    # Events
    def on_start(user_query, sectors, stages, bmodels, ask_min_v, ask_max_v, diversity_v, portfolio_v):
        """Validate the ask range, then delegate to start_search().

        NOTE(review): status-div markup was stripped during extraction and is
        reconstructed with the .vm-error class.
        """
        try:
            a_min = float(ask_min_v)
            a_max = float(ask_max_v)
            if a_min > a_max:
                return (
                    gr.update(visible=True), gr.update(visible=False), "", [], 0, portfolio_v,
                    "<div class='vm-error'>Ask: min must be ≤ max</div>",
                    thesis_info.value,
                )
        except (TypeError, ValueError):  # fix: was a bare `except:` — only catch bad numbers
            return (
                gr.update(visible=True), gr.update(visible=False), "", [], 0, portfolio_v,
                "<div class='vm-error'>Bad Ask min/max</div>",
                thesis_info.value,
            )
        return start_search(
            user_query, sectors, stages, bmodels,
            a_min, a_max, float(diversity_v), portfolio_v,
        )

    start_btn.click(
        on_start,
        inputs=[query_input, sectors_input, stages_input, bmodels_input, ask_min, ask_max, diversity, portfolio_state],
        outputs=[onboarding_view, matching_view, display_area, deck_state, pos_state, portfolio_state, status_box, thesis_info],
    ).then(lambda p: portfolio_to_table(p), inputs=portfolio_state, outputs=portfolio_table)

    invest_btn.click(
        lambda deck, pos, query, port: swipe_action(deck, pos, "INVEST", query, port),
        inputs=[deck_state, pos_state, query_input, portfolio_state],
        outputs=[display_area, pos_state, action_row, portfolio_state],
    ).then(lambda p: portfolio_to_table(p), inputs=portfolio_state, outputs=portfolio_table)

    pass_btn.click(
        lambda deck, pos, query, port: swipe_action(deck, pos, "PASS", query, port),
        inputs=[deck_state, pos_state, query_input, portfolio_state],
        outputs=[display_area, pos_state, action_row, portfolio_state],
    ).then(lambda p: portfolio_to_table(p), inputs=portfolio_state, outputs=portfolio_table)

    back_btn.click(
        lambda: (gr.update(visible=True), gr.update(visible=False)),
        outputs=[onboarding_view, matching_view],
    )
    remove_btn.click(remove_selected, inputs=[portfolio_state, remove_rows], outputs=[portfolio_state, portfolio_table])
    clear_btn.click(lambda: clear_portfolio(), outputs=[portfolio_state, portfolio_table])

# Queue helps stability on Spaces
demo.queue(default_concurrency_limit=1, max_size=32)

# IMPORTANT: In Gradio 6.x pass css/theme via launch()
demo.launch(css=CSS, theme=gr.themes.Default(primary_hue="pink"), ssr_mode=False)