# app.py
# ============================================================
# VentureMatch — Tinder-style Startup Matcher (HF Spaces / Gradio 6.x)
# ✅ Embeddings (.npy) + FAISS (cosine) for fast search
# ✅ Diverse sampling so same query returns different deck
# ✅ Optional LLM (chat_completion) ONLY for insight/summary (never blocks search)
# ============================================================
import os
import re
import math
import time
import json
import random
import numpy as np
import pandas as pd
import gradio as gr
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import faiss
# Optional LLM via HF Inference (CHAT API)
try:
from huggingface_hub import InferenceClient
HF_OK = True
except Exception:
HF_OK = False
# -------------------------
# CONFIG
# -------------------------
DATASET_REPO = "Yoav-omer/startups"  # HF dataset holding one row per startup
EMB_PATH = "embeddings_minilm.npy"  # precomputed row embeddings, must be row-aligned with the dataset
EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2" # must match embeddings dim (384)
CANDIDATES_K = 800  # FAISS top-K fetched per query, before filters / diverse sampling
DECK_SIZE = 10  # cards per swipe deck
# Optional: LLM (only if HF_TOKEN exists). Used for insight, not for retrieval.
LLM_MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" # chat-friendly on HF Inference
LLM_MAX_TOKENS = 220
LLM_TEMPERATURE = 0.7
LLM_TIMEOUT_S = 18
RNG_SEED = 42
# NOTE: diverse_sample() reseeds from the wall clock per search, so decks
# still vary between searches despite these fixed seeds.
random.seed(RNG_SEED)
np.random.seed(RNG_SEED)
# -------------------------
# LOAD DATASET
# -------------------------
print("🔄 Initializing VentureMatch Engine...")
ds = load_dataset(DATASET_REPO)
# Prefer the canonical "train" split; otherwise take whichever split exists first.
split_name = "train" if "train" in ds else list(ds.keys())[0]
df_raw = ds[split_name].to_pandas()
# -------------------------
# COLUMN NORMALIZATION
# -------------------------
# Map the various raw column spellings onto the canonical names used below.
rename_map = {
    "startup_id": "entity_id",
    "id": "entity_id",
    "burn": "BURN_RATE",
    "BURN": "BURN_RATE",
    "ARR_usd": "ARR",
    "arr": "ARR",
    "valuation": "VALUE",
    "valuation_usd": "VALUE",
    "competitors": "competitors_count",
}
# Only rename columns that actually exist, so unexpected schemas don't crash here.
df_raw = df_raw.rename(columns={k: v for k, v in rename_map.items() if k in df_raw.columns})
# Hard requirements: search, cards, and filters cannot work without these.
required = ["entity_id", "name", "sector", "stage", "business_model", "ask_usd", "pitch"]
missing = [c for c in required if c not in df_raw.columns]
if missing:
    raise ValueError(f"Dataset is missing required column(s): {missing}")
# Optional columns get safe defaults so downstream formatting never KeyErrors.
optional_defaults = {
    "elevator_speech": "",
    "keywords": "",
    "ARR": np.nan,
    "BURN_RATE": np.nan,
    "VALUE": np.nan,
    "competitors_count": np.nan,
}
for c, d in optional_defaults.items():
    if c not in df_raw.columns:
        df_raw[c] = d
# Coerce financial fields to numeric; unparseable values become NaN (shown as "N/A").
for c in ["ask_usd", "ARR", "BURN_RATE", "VALUE", "competitors_count"]:
    df_raw[c] = pd.to_numeric(df_raw[c], errors="coerce")
# -------------------------
# LOAD EMBEDDINGS + FAISS
# -------------------------
# The .npy ships next to the app file and must be row-aligned with the dataset.
if not os.path.exists(EMB_PATH):
    raise FileNotFoundError(f"❌ Missing {EMB_PATH}. Upload it to your Space repo root.")
emb = np.load(EMB_PATH).astype(np.float32)
if emb.shape[0] != len(df_raw):
    raise ValueError(
        f"❌ Embeddings rows ({emb.shape[0]}) != dataset rows ({len(df_raw)}).\n"
        "Your .npy must match dataset row order EXACTLY."
    )
# cosine via dot-product on normalized vectors
emb /= (np.linalg.norm(emb, axis=1, keepdims=True) + 1e-12)  # epsilon guards all-zero rows
index = faiss.IndexFlatIP(emb.shape[1])  # inner product == cosine after L2 normalization
index.add(emb)
# query embed model (CPU is enough for single-query encoding on Spaces)
embedder = SentenceTransformer(EMBED_MODEL_ID, device="cpu")
print(f"✅ Loaded: {len(df_raw)} rows | dim={emb.shape[1]} | FAISS={index.ntotal}")
# -------------------------
# OPTIONAL LLM CLIENT (SAFE)
# -------------------------
# The LLM is strictly additive: if anything here fails, retrieval still works
# and card insights fall back to the heuristic path (see llm_insight).
HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
llm_client = None
if HF_OK and HF_TOKEN:
    try:
        llm_client = InferenceClient(token=HF_TOKEN)
        print("✅ LLM enabled via HF Inference (chat_completion).")
    except Exception as e:
        # Keep the app running without AI insights rather than crash at startup.
        llm_client = None
        print(f"⚠️ LLM disabled: {e}")
# -------------------------
# LISTS FOR UI
# -------------------------
# Dropdown choices are derived from the actual data so filters always match rows.
SECTOR_LIST = sorted(df_raw["sector"].dropna().astype(str).unique().tolist())
STAGE_LIST = sorted(df_raw["stage"].dropna().astype(str).unique().tolist())
BMODEL_LIST = sorted(df_raw["business_model"].dropna().astype(str).unique().tolist())
# -------------------------
# HELPERS
# -------------------------
# Words too generic to count as keyword-match evidence
# (used by tokenize_reason / heuristic_insight).
# Set literal instead of set([...]): same contents, no throwaway list.
STOPWORDS = {
    "the", "a", "an", "and", "or", "to", "for", "of", "in", "on",
    "with", "by", "from", "at", "as", "is", "are",
}
def clean_text(s: str) -> str:
    """Coerce to str (None/NaN become ""), collapse whitespace runs, strip ends."""
    if pd.isna(s):
        return ""
    return re.sub(r"\s+", " ", str(s)).strip()
def format_currency(value):
    """Render an amount as a compact dollar string: $1.50B / $2.50M / $12K / $500.

    Returns "N/A" for None, NaN, or anything not convertible to float.
    FIX: the original bare `except:` swallowed every exception (including
    KeyboardInterrupt); narrowed to the two errors float() actually raises.
    """
    try:
        v = float(value)
    except (TypeError, ValueError):
        return "N/A"
    if math.isnan(v):
        return "N/A"
    if v >= 1e9:
        return f"${v/1e9:.2f}B"
    if v >= 1e6:
        return f"${v/1e6:.2f}M"
    if v >= 1e3:
        return f"${v/1e3:.0f}K"
    return f"${v:.0f}"
def clamp01(x: float) -> float:
    """Clip *x* to the closed unit interval [0.0, 1.0]."""
    if x < 0.0:
        return 0.0
    return 1.0 if x > 1.0 else x

def similarity_to_pct(sim: float) -> int:
    """Map a cosine similarity onto 0-100%, treating 0.25..0.80 as the useful band."""
    low, high = 0.25, 0.80
    fraction = clamp01((sim - low) / (high - low))
    return int(round(100 * fraction))
def tokenize_reason(query: str) -> list:
    """Pull up to 8 distinct keywords (lowercased, >2 chars, non-stopword) from a thesis."""
    normalized = re.sub(r"[^a-zA-Z0-9\s\-]", " ", query.lower())
    keywords = (w for w in normalized.split() if w not in STOPWORDS and len(w) > 2)
    # dict.fromkeys deduplicates while preserving first-seen order.
    return list(dict.fromkeys(keywords))[:8]
def heuristic_insight(row: dict, query: str) -> str:
    """Rule-based fallback insight: keyword overlap with the thesis plus headline financials."""
    searchable = f"{row.get('pitch','')} {row.get('keywords','')} {row.get('elevator_speech','')}".lower()
    overlap = [tok for tok in tokenize_reason(query) if tok in searchable][:4]
    if overlap:
        reason = "Matches: " + ", ".join(overlap)
    else:
        reason = "Semantically aligned with your thesis."
    financials = (
        f" • Ask {format_currency(row.get('ask_usd'))}"
        f" • ARR {format_currency(row.get('ARR'))}"
        f" • Value {format_currency(row.get('VALUE'))}"
    )
    return reason + financials
def llm_insight(row: dict, query: str) -> str:
    """
    LLM-backed one-liner insight for a card; guaranteed never to block the app.

    Any missing client, API error, or timeout falls back to the rule-based
    heuristic_insight. Uses chat_completion (the conversational task).
    """
    if llm_client is None:
        return heuristic_insight(row, query)
    prompt = f"""
You are a VC analyst. Given a user thesis and a startup profile, write 1 short insight:
- 1 sentence why it's a match (or not)
- Mention 1 key risk or missing detail
Keep it under 35 words.
User thesis:
{query}
Startup:
Name: {row.get('name')}
Sector: {row.get('sector')}
Stage: {row.get('stage')}
Business model: {row.get('business_model')}
Ask: {row.get('ask_usd')}
ARR: {row.get('ARR')}
Burn/mo: {row.get('BURN_RATE')}
Pitch: {row.get('pitch')}
""".strip()
    messages = [
        {"role": "system", "content": "You are concise, practical, and skeptical."},
        {"role": "user", "content": prompt},
    ]
    try:
        # chat_completion API (supported task: conversational)
        response = llm_client.chat_completion(
            model=LLM_MODEL_ID,
            messages=messages,
            max_tokens=LLM_MAX_TOKENS,
            temperature=LLM_TEMPERATURE,
            timeout=LLM_TIMEOUT_S,
        )
        answer = re.sub(r"\s+", " ", response.choices[0].message.content.strip())
    except Exception:
        return heuristic_insight(row, query)
    # Cap the length; an empty reply also falls back to the heuristic.
    return answer[:300] if answer else heuristic_insight(row, query)
def make_cover_svg(name: str, sector: str, stage: str) -> str:
    """Build the card cover banner for a startup.

    NOTE(review): despite the name, this copy returns plain text lines — the
    SVG/HTML markup appears to have been stripped from the template string
    (the CSS below styles a .vm-cover class that no longer appears here).
    Confirm against the original source before relying on rendered output.
    """
    # Truncate each field so the banner never overflows its layout.
    name = clean_text(name)[:26]
    sector = clean_text(sector)[:18]
    stage = clean_text(stage)[:14]
    return f"""
{name}
{sector} • {stage}
""".strip()
def card_html(row: dict, sim: float, query: str, insight_text: str, stamp: str = "") -> str:
    """Render one swipe card as an HTML fragment for the gr.HTML display area.

    NOTE(review): the HTML tags appear to have been stripped from the template
    strings in this copy — the CSS below styles classes (.vm-card, .pill,
    .vm-grid, .stamp, ...) that no longer appear here. Confirm against the
    original source. `query` and `comp_txt` are not referenced in the visible
    template; they were presumably used inside the stripped markup.
    """
    pct = similarity_to_pct(sim)  # 0-100 match badge
    cover = make_cover_svg(row.get("name",""), row.get("sector",""), row.get("stage",""))
    comp = row.get("competitors_count")
    comp_txt = "N/A" if pd.isna(comp) else str(int(comp))
    # Overlay stamp shown right after a swipe: LIKE -> INVEST, NOPE -> PASS.
    stamp_html = ""
    if stamp == "LIKE":
        stamp_html = """
INVEST
"""
    elif stamp == "NOPE":
        stamp_html = """PASS
"""
    return f"""
{stamp_html}
{pct}% MATCH
#{row.get("entity_id","")}
{cover}
{row.get("name","")}
{row.get("sector","")} • {row.get("stage","")} • {row.get("business_model","")}
“{clean_text(row.get("pitch",""))}”
Ask
{format_currency(row.get("ask_usd"))}
ARR
{format_currency(row.get("ARR"))}
Burn/Mo
{format_currency(row.get("BURN_RATE"))}
Value
{format_currency(row.get("VALUE"))}
✨ AI Insight: {clean_text(insight_text)}
""".strip()
def semantic_search(query: str):
    """Encode the thesis and return (similarities, row indices) for the top CANDIDATES_K rows."""
    # normalize_embeddings=True keeps the query on the unit sphere, matching
    # the normalized corpus vectors so inner product equals cosine similarity.
    query_vec = embedder.encode([query], normalize_embeddings=True).astype(np.float32)
    sims, row_ids = index.search(query_vec, CANDIDATES_K)
    return sims[0], row_ids[0]
def apply_filters(df: pd.DataFrame, sectors, stages, bmodels, ask_min, ask_max):
    """Narrow candidates by categorical selections and the ask range.

    Empty/None selections mean "no constraint". Rows with a NaN ask are always
    kept so sparse financial data does not empty the deck.
    """
    filtered = df.copy()
    for column, allowed in (("sector", sectors), ("stage", stages), ("business_model", bmodels)):
        if allowed:
            filtered = filtered[filtered[column].isin(allowed)]
    within_range = (filtered["ask_usd"] >= ask_min) & (filtered["ask_usd"] <= ask_max)
    return filtered[filtered["ask_usd"].isna() | within_range]
def diverse_sample(df: pd.DataFrame, n: int, diversity: float) -> pd.DataFrame:
    """
    Pick up to n rows, trading pure relevance against variety.

    diversity in [0..1]:
      <= 0.05 -> deterministic top-n by similarity
      higher  -> softmax sampling from the top pool, flatter as diversity grows
    """
    ranked = df.sort_values("similarity", ascending=False).copy()
    if len(ranked) <= n:
        return ranked
    if diversity <= 0.05:
        return ranked.head(n)
    # Sample from the best 140 candidates with softmax weights over similarity.
    pool = ranked.head(min(140, len(ranked))).copy()
    sims = pool["similarity"].to_numpy()
    # The slider acts as a temperature: hotter = flatter = more random.
    temperature = 0.06 + 0.55 * float(diversity)
    weights = np.exp((sims - sims.max()) / max(1e-6, temperature))
    weights = weights / (weights.sum() + 1e-12)
    # Wall-clock seed: the same thesis yields a fresh deck on every search.
    rng = np.random.default_rng(int(time.time() * 1000) % (2**32 - 1))
    picked = rng.choice(len(pool), size=n, replace=False, p=weights)
    return pool.iloc[picked].copy().sort_values("similarity", ascending=False)
def portfolio_to_table(portfolio):
    """Convert the saved startup dicts into row lists for the portfolio gr.Dataframe."""
    return [
        [
            entry.get("entity_id",""),
            entry.get("name",""),
            entry.get("sector",""),
            entry.get("stage",""),
            entry.get("business_model",""),
            format_currency(entry.get("ask_usd")),
            float(entry.get("similarity", 0.0)),
        ]
        for entry in (portfolio or [])
    ]
# -------------------------
# MAIN SEARCH
# -------------------------
def start_search(user_query, sectors, stages, bmodels, ask_min, ask_max, diversity, portfolio_state):
    """Run retrieval + filters + diverse sampling and render the first card.

    Returns the 8-tuple wired to: (onboarding visibility, matching visibility,
    card html, deck records, deck position, portfolio, status html, thesis info).

    FIX: the two status strings below were broken string literals in this copy
    (markup stripped mid-literal, a syntax error); rejoined and re-wrapped with
    the .vm-error class that the CSS below defines — confirm against the
    original markup.
    """
    q = clean_text(user_query)
    if len(q) < 6:
        return (
            gr.update(visible=True), gr.update(visible=False),
            "", [], 0, portfolio_state,
            "<div class='vm-error'>Write a longer thesis (≥ 6 chars).</div>",
            ""
        )
    # Semantic retrieval over the FAISS index, then attach scores to the rows.
    scores, idxs = semantic_search(q)
    cand = df_raw.iloc[idxs].copy()
    cand["similarity"] = scores
    # Categorical + ask-range filters (NaN asks survive; see apply_filters).
    cand = apply_filters(cand, sectors, stages, bmodels, float(ask_min), float(ask_max))
    if cand.empty:
        return (
            gr.update(visible=True), gr.update(visible=False),
            "", [], 0, portfolio_state,
            "<div class='vm-error'>No matches. Try broader filters.</div>",
            ""
        )
    deck_df = diverse_sample(cand, DECK_SIZE, diversity=float(diversity))
    deck = deck_df.to_dict("records")
    first = deck[0]
    insight = llm_insight(first, q)  # never blocks: falls back to heuristic
    html = card_html(first, float(first["similarity"]), q, insight)
    thesis_info = f"**Search mode:** Embeddings + FAISS • **Diversity:** {float(diversity):.2f}"
    if llm_client is not None:
        thesis_info += " • **AI Insight:** LLM enabled"
    else:
        thesis_info += " • **AI Insight:** heuristic"
    return (
        gr.update(visible=False), gr.update(visible=True),
        html, deck, 0, portfolio_state,
        "", # status
        thesis_info
    )
def swipe_action(deck, pos, action, query, portfolio):
    """Advance the deck by one card; "INVEST" also appends the current card to the portfolio.

    Returns (card html, new position, action-row visibility update, portfolio).

    FIX: the status strings below were broken string literals in this copy
    (markup stripped mid-literal, a syntax error); rejoined and re-wrapped with
    the .vm-error/.vm-end classes that the CSS below defines — confirm against
    the original markup.
    """
    if not deck:
        return "<div class='vm-error'>No deck loaded.</div>", pos, gr.update(visible=True), portfolio
    pos = int(pos or 0)
    if pos >= len(deck):
        return "<div class='vm-end'>🏁 End of deck. Start a new search.</div>", pos, gr.update(visible=False), portfolio
    current = deck[pos]
    if action == "INVEST":
        portfolio = (portfolio or [])
        portfolio.append(dict(current))
    # The stamp overlay is drawn on the NEXT card to acknowledge the swipe.
    stamp = "LIKE" if action == "INVEST" else "NOPE"
    new_pos = pos + 1
    if new_pos >= len(deck):
        end_html = "<div class='vm-end'>🏁 You reached the end. Check your portfolio below.</div>"
        return end_html, new_pos, gr.update(visible=False), portfolio
    nxt = deck[new_pos]
    insight = llm_insight(nxt, query)
    html = card_html(nxt, float(nxt["similarity"]), query, insight, stamp=stamp)
    return html, new_pos, gr.update(visible=True), portfolio
def remove_selected(portfolio, txt):
    """Drop the comma-separated 0-based indices listed in *txt* from the portfolio.

    Non-numeric tokens are ignored. Returns (new portfolio, table rows).
    """
    entries = portfolio or []
    raw = "" if txt is None else str(txt)
    to_drop = {int(tok.strip()) for tok in raw.split(",") if tok.strip().isdigit()}
    kept = [entry for i, entry in enumerate(entries) if i not in to_drop]
    return kept, portfolio_to_table(kept)
def clear_portfolio():
    """Reset the portfolio: an empty state list plus an empty table for the UI."""
    empty_state, empty_table = [], []
    return empty_state, empty_table
# -------------------------
# CSS (Tinder-like)
# -------------------------
# Stylesheet injected via demo.launch(css=CSS) at the bottom of the file.
# The string is passed verbatim to the browser. Several classes here
# (.vm-card, .pill, .vm-grid, .stamp, ...) are styling hooks for the card
# HTML templates above.
CSS = """
:root{
--pink:#FD297B;
--red:#FF5864;
--cyan:#4CC9F0;
--bg1:#0b0b10;
--card: rgba(255,255,255,0.92);
--shadow: 0 30px 70px rgba(0,0,0,0.25);
}
body{
background: radial-gradient(1200px 700px at 20% 20%, rgba(253,41,123,0.20), transparent 60%),
radial-gradient(900px 600px at 80% 30%, rgba(76,201,240,0.18), transparent 55%),
linear-gradient(180deg, #0b0b10 0%, #0f111a 70%, #0b0b10 100%) !important;
}
.vm-hero{
padding: 18px 14px 8px 14px;
border-radius: 18px;
background: rgba(255,255,255,0.04);
border: 1px solid rgba(255,255,255,0.08);
}
.vm-wrap { display:flex; justify-content:center; padding: 10px 0 16px 0; }
.vm-card {
width: min(580px, 95vw);
border-radius: 30px;
background: var(--card);
box-shadow: var(--shadow);
border: 1px solid rgba(255,255,255,0.12);
overflow: hidden;
position: relative;
backdrop-filter: blur(8px);
}
.vm-top{
display:flex; justify-content:space-between; align-items:center;
padding: 14px 18px;
background: linear-gradient(90deg, rgba(253,41,123,0.16), rgba(76,201,240,0.14));
}
.pill{
font-weight: 900;
font-size: 12px;
letter-spacing: 0.8px;
padding: 7px 12px;
border-radius: 999px;
color: #fff;
background: linear-gradient(45deg, var(--pink), var(--red));
box-shadow: 0 10px 22px rgba(253,41,123,0.28);
}
.id{ color: rgba(0,0,0,0.55); font-size: 12px; font-weight: 700; }
.vm-cover { background: #fff; padding: 12px 12px 0px 12px; }
.vm-body { padding: 16px 18px 18px 18px; }
.name { font-size: 32px; font-weight: 1000; letter-spacing: -0.7px; color: #0c0c10; }
.meta { margin-top: 4px; font-size: 14px; color: rgba(0,0,0,0.65); font-weight: 800; }
.bmodel { color: var(--red); }
.vm-quote{
margin-top: 14px;
background: rgba(0,0,0,0.04);
border: 1px solid rgba(0,0,0,0.06);
border-radius: 18px;
padding: 14px 14px;
font-size: 15px;
line-height: 1.55;
color: rgba(0,0,0,0.82);
}
.vm-grid{
margin-top: 14px;
display:grid;
grid-template-columns: 1fr 1fr;
gap: 10px;
}
.vm-stat{
background: rgba(255,255,255,0.78);
border: 1px solid rgba(0,0,0,0.06);
border-radius: 16px;
padding: 10px 12px;
}
.vm-stat .k{
font-size: 10px;
font-weight: 1000;
letter-spacing: 0.9px;
text-transform: uppercase;
color: rgba(0,0,0,0.48);
}
.vm-stat .v{
margin-top: 2px;
font-size: 16px;
font-weight: 1000;
color: rgba(0,0,0,0.86);
}
.vm-insight{
margin-top: 14px;
border-radius: 16px;
padding: 12px 14px;
font-size: 13px;
line-height: 1.5;
background: rgba(255,88,100,0.10);
border: 1px dashed rgba(255,88,100,0.60);
color: rgba(0,0,0,0.78);
}
.vm-error{
padding: 14px 16px;
border-radius: 16px;
background: rgba(255,88,100,0.16);
border: 1px solid rgba(255,88,100,0.28);
color: rgba(255,255,255,0.92);
font-weight: 800;
text-align:center;
}
.vm-end{
padding: 22px 16px;
border-radius: 18px;
background: rgba(76,201,240,0.14);
border: 1px solid rgba(76,201,240,0.28);
color: rgba(255,255,255,0.92);
font-weight: 900;
text-align:center;
}
.stamp{
position:absolute;
top: 102px;
left: 22px;
transform: rotate(-14deg);
font-size: 34px;
font-weight: 1000;
letter-spacing: 1px;
padding: 10px 14px;
border-radius: 14px;
opacity: 0.0;
animation: pop 0.55s ease forwards;
z-index: 10;
}
.stamp.like { border: 6px solid rgba(50,205,50,0.88); color: rgba(50,205,50,0.92); }
.stamp.nope { border: 6px solid rgba(255,59,92,0.88); color: rgba(255,59,92,0.92); }
@keyframes pop{
0% { opacity: 0.0; transform: translateY(8px) rotate(-14deg) scale(0.92); }
60% { opacity: 1.0; transform: translateY(0px) rotate(-14deg) scale(1.05); }
100% { opacity: 0.0; transform: translateY(-2px) rotate(-14deg) scale(1.02); }
}
"""
# -------------------------
# UI
# -------------------------
with gr.Blocks() as demo:
    # Per-session state: current deck, position within it, saved picks, last thesis.
    deck_state = gr.State([])
    pos_state = gr.State(0)
    portfolio_state = gr.State([])
    last_query_state = gr.State("")
    # --- Onboarding / search form ---
    with gr.Column(elem_id="onboarding") as onboarding_view:
        gr.Markdown(
            """
# 💘 VentureMatch
### Tinder-style startup search (Embeddings + FAISS)
Write a thesis → filter → get a swipe deck.
Same thesis twice? You'll still get **varied** results.
""".strip()
        )
        with gr.Row():
            with gr.Column(scale=2):
                query_input = gr.Textbox(
                    label="Investment Thesis",
                    placeholder="Example: 'Cybersecurity for SMBs, low burn, Seed, B2B SaaS'",
                    lines=4
                )
                gr.Examples(
                    examples=[
                        ["Cybersecurity for small businesses, phishing defense, low burn"],
                        ["ClimateTech for factories: carbon accounting + compliance"],
                        ["HealthTech remote monitoring for elderly patients, B2B SaaS"],
                    ],
                    inputs=query_input,
                    label="Quick Starters (1-click)"
                )
            with gr.Column(scale=1):
                sectors_input = gr.Dropdown(choices=SECTOR_LIST, multiselect=True, label="Sector (multi-select)")
                stages_input = gr.Dropdown(choices=STAGE_LIST, multiselect=True, label="Stage (multi-select)")
                bmodels_input = gr.Dropdown(choices=BMODEL_LIST, multiselect=True, label="Business Model (multi-select)")
                diversity = gr.Slider(
                    minimum=0.0, maximum=1.0, value=0.50, step=0.05,
                    label="Result Diversity",
                    info="Higher = more different results for same query."
                )
        with gr.Accordion("Advanced Filters", open=False):
            with gr.Row():
                ask_min = gr.Number(value=0, label="Ask min (USD)")
                ask_max = gr.Number(value=10_000_000, label="Ask max (USD)")
        thesis_info = gr.Markdown("")
        status_box = gr.HTML("")
        start_btn = gr.Button("FIND STARTUPS 🔥", variant="primary")
    # --- Swipe deck view (hidden until a search succeeds) ---
    with gr.Column(visible=False) as matching_view:
        display_area = gr.HTML()
        with gr.Row(visible=True) as action_row:
            pass_btn = gr.Button("PASS ❌", variant="secondary")
            invest_btn = gr.Button("INVEST 💚", variant="primary")
        back_btn = gr.Button("⬅ Back to Search", variant="secondary")
        gr.Markdown("## 🏆 Portfolio")
        portfolio_table = gr.Dataframe(
            headers=["entity_id","name","sector","stage","business_model","ask","similarity"],
            datatype=["str","str","str","str","str","str","number"],
            interactive=False
        )
        with gr.Row():
            remove_rows = gr.Textbox(label="Remove rows (indices)", placeholder="Example: 0,2,3")
            remove_btn = gr.Button("Remove Selected", variant="secondary")
            clear_btn = gr.Button("Clear Portfolio", variant="stop")
    # Events
    def on_start(user_query, sectors, stages, bmodels, ask_min_v, ask_max_v, diversity_v, portfolio_v):
        """Validate the ask range, then delegate to start_search.

        FIX: the error strings were broken literals in this copy (markup
        stripped mid-literal); rejoined with the .vm-error class. The bare
        `except:` was narrowed to the errors float() actually raises.
        """
        try:
            a_min = float(ask_min_v); a_max = float(ask_max_v)
            if a_min > a_max:
                return (
                    gr.update(visible=True), gr.update(visible=False),
                    "", [], 0, portfolio_v,
                    "<div class='vm-error'>Ask: min must be ≤ max</div>",
                    thesis_info.value
                )
        except (TypeError, ValueError):
            return (
                gr.update(visible=True), gr.update(visible=False),
                "", [], 0, portfolio_v,
                "<div class='vm-error'>Bad Ask min/max</div>",
                thesis_info.value
            )
        return start_search(
            user_query, sectors, stages, bmodels,
            a_min, a_max,
            float(diversity_v),
            portfolio_v
        )
    # Each mutation re-renders the portfolio table afterwards via .then(...).
    start_btn.click(
        on_start,
        inputs=[query_input, sectors_input, stages_input, bmodels_input, ask_min, ask_max, diversity, portfolio_state],
        outputs=[onboarding_view, matching_view, display_area, deck_state, pos_state, portfolio_state, status_box, thesis_info]
    ).then(lambda p: portfolio_to_table(p), inputs=portfolio_state, outputs=portfolio_table)
    invest_btn.click(
        lambda deck, pos, query, port: swipe_action(deck, pos, "INVEST", query, port),
        inputs=[deck_state, pos_state, query_input, portfolio_state],
        outputs=[display_area, pos_state, action_row, portfolio_state]
    ).then(lambda p: portfolio_to_table(p), inputs=portfolio_state, outputs=portfolio_table)
    pass_btn.click(
        lambda deck, pos, query, port: swipe_action(deck, pos, "PASS", query, port),
        inputs=[deck_state, pos_state, query_input, portfolio_state],
        outputs=[display_area, pos_state, action_row, portfolio_state]
    ).then(lambda p: portfolio_to_table(p), inputs=portfolio_state, outputs=portfolio_table)
    back_btn.click(lambda: (gr.update(visible=True), gr.update(visible=False)), outputs=[onboarding_view, matching_view])
    remove_btn.click(remove_selected, inputs=[portfolio_state, remove_rows], outputs=[portfolio_state, portfolio_table])
    clear_btn.click(lambda: clear_portfolio(), outputs=[portfolio_state, portfolio_table])
# Queue helps stability on Spaces
demo.queue(default_concurrency_limit=1, max_size=32)
# IMPORTANT: In Gradio 6.x pass css/theme via launch()
demo.launch(css=CSS, theme=gr.themes.Default(primary_hue="pink"), ssr_mode=False)