# Source: PITCHFIGHT_AI/core/claim_extractor.py (14.5 kB)
# Last commit: 8fab536 - "Prepare PitchFight AI completion" (uploader: Aspectgg)
"""Concrete claim and signal extractor for PitchFight AI scorecard.
Extracts evidence signals from founder answers using regex/rule-based logic.
No API calls — fast, local, deterministic.
Used by scoring_engine.py to:
1. Enrich the Nemotron scoring prompt with pre-extracted signal data
2. Build session-aware fallback scorecards when model scoring fails
"""
from __future__ import annotations
import re
from typing import Any
# ---------------------------------------------------------------------------
# Compiled patterns
# ---------------------------------------------------------------------------
# Quantities mentioned by the founder. Two alternatives, tried in order:
#   1. a number (digits, thousands commas, optional decimal part) followed by
#      "%" or by one of the listed unit words ("users", "pilots", ...);
#   2. a bare run of two or more digits/commas ("2024", "3,000").
# Single-digit numbers without a unit are deliberately not captured.
_NUMBER_METRIC = re.compile(
    r"\b\d[\d,]*\.?\d*\s*"
    # "%" must stay outside the \b-terminated group: "%" is not a word
    # character, so "%\b" only matches when a letter/digit follows it and
    # would reject the usual "50% of ..." / "50%." forms.
    r"(?:%|(?:percent|users?|students?|signups?|schools?|campuses?|"
    r"colleges?|installs?|ambassadors?|interviews?|pilots?|teams?|"
    r"paying|members?|founders?|customers?|clients?|respondents?|"
    r"participants?|responses?)\b)"
    r"|\b\d[\d,]+\b",
    re.IGNORECASE,
)
# A number (optional decimal part) followed by "%" or the word "percent".
_PERCENTAGE = re.compile(
r"\b\d+\.?\d*\s*%|\b\d+\.?\d*\s*percent\b",
re.IGNORECASE,
)
# Money amounts, in three forms:
#   - a currency symbol followed by an amount ("₹499", "$ 1,200.50");
#   - an amount followed by a currency word or code ("500 rupees", "20 usd");
#   - "rs" / "rs." followed by an amount ("Rs. 99").
# NOTE(review): the symbol-led branch has no trailing boundary, so trailing
# separators are part of the match ("$500," is captured with the comma).
_CURRENCY = re.compile(
r"[₹$€£¥]\s*\d[\d,.]*"
r"|\b\d[\d,.]*\s*(?:rupees?|dollars?|usd|inr|cad|euros?)\b"
r"|\brs\.?\s*\d[\d,.]*\b",
re.IGNORECASE,
)
# Head-counts: an integer followed by (optionally "beta") users/signups, or by
# students, members, "paying", or customers.
# NOTE(review): only plain digit runs are accepted, so "1,200 users" is
# captured as "200 users".
_USER_COUNT = re.compile(
r"\b\d+\s*(?:beta\s+)?users?\b"
r"|\b\d+\s*(?:beta\s+)?signups?\b"
r"|\b\d+\s*students?\b"
r"|\b\d+\s*members?\b"
r"|\b\d+\s*paying\b"
r"|\b\d+\s*customers?\b",
re.IGNORECASE,
)
# Vocabulary indicating that something was actually built, tested or tried
# with users (beta, pilot, surveyed, waitlist, ...), plus two product-specific
# phrases: "event-miss report(s)" and "campus ambassador(s)".
_VALIDATION = re.compile(
r"\b(?:beta|pilot|prototype|mvp|tested|validated|launched|surveyed|"
r"interviewed|user research|focus group|waitlist|signups?|onboarding|"
r"usability test|field test|user test|a/b test|experiment)\b"
r"|\bevent.?miss reports?\b"
r"|\bcampus ambassadors?\b",
re.IGNORECASE,
)
# Campus / institution mentions: generic words (campus, college, university,
# ambassador, chapter) and a fixed list of institution names.
# NOTE(review): matching is case-insensitive, so the acronyms also hit
# ordinary words ("bits" in "bits of feedback") - confirm this is acceptable.
_COLLEGE_CAMPUS = re.compile(
r"\b(?:campus(?:es)?|colleges?|universities|university|iit|nit|bits|vit|"
r"mit|stanford|oxford|ambassadors?|chapters?)\b",
re.IGNORECASE,
)
# Competitive awareness: a fixed list of product names, the generic words
# "competitor(s)" / "alternative(s)", and the comparative constructions
# "unlike <word>" and "vs <word>" (the following word is part of the match).
_COMPETITORS = re.compile(
r"\b(?:luma|lu\.ma|eventbrite|devfolio|unstop|meetup|linkedin|facebook|"
r"whatsapp|google|notion|airtable|twitter|x\.com|slack|discord|"
r"internshala|naukri|glassdoor|monster)\b"
r"|\b(?:competitors?|alternatives?)\b"
r"|\bunlike\s+\w+\b"
r"|\bvs\.?\s+\w+\b",
re.IGNORECASE,
)
# Vocabulary showing the founder named a concrete technical mechanism
# (embedding, retrieval, classifier, webhook, ...). Every alternative is a
# whole-word / whole-phrase match; "." inside a phrase (e.g. "skill.based")
# stands for any single separator character.
_TECH_MECHANISM = re.compile(
    # "fine.?tun" is a stem, so it needs an explicit suffix group: with the
    # closing \b directly after "tun" it could never match "fine-tune",
    # "fine-tuned", "fine tuning" or "finetuning".
    r"\b(?:embedding|vector|fine.?tun(?:e[ds]?|ing)?|retrieval|ranking model|"
    r"cosine similarity|"
    r"recommendation engine|nlp|llm|transformer|gpt|bert|neural|classifier|"
    r"semantic search|knowledge graph|rag|inference|profile matching|"
    r"skill.based|personali[sz]ation|latency|throughput|model|algorithm|"
    r"api|webhook|scraping|crawler|pipeline)\b",
    re.IGNORECASE,
)
# Monetisation vocabulary: business-model terms (subscription, freemium, b2b,
# saas), unit-economics acronyms (cac, ltv, arpu, mrr, arr) and pricing words.
# Whole-word / whole-phrase matches only.
_REVENUE = re.compile(
r"\b(?:revenue|mrr|arr|subscription|freemium|pay per|college pays|"
r"sponsor pays|b2b|b2c|saas|monetize|price|pricing|charge|conversion|"
r"cac|ltv|arpu|monthly plan|annual plan|tier|upsell|markup|margin|"
r"transaction fee|commission|licensing|enterprise)\b",
re.IGNORECASE,
)
# Retention / engagement vocabulary (churn, dau/mau/wau, habit, reminders...).
# "." in "re.engagement" and "lock.in" matches any single character, and
# "follow.?up" also accepts "followup".
_RETENTION = re.compile(
r"\b(?:retention|churn|dau|mau|wau|weekly active|daily active|"
r"returning users?|re.engagement|habit|repeat|sticky|lock.in|"
r"notification|reminder|follow.?up|engagement rate)\b",
re.IGNORECASE,
)
# Go-to-market vocabulary: channels, referral/viral loops, partnerships,
# rollout phases ("phase 1").
# NOTE(review): the group is closed by \b, so only the exact listed forms
# match - "launch" and "onboard" match, "launched" and "onboarding" do not.
_GTM = re.compile(
r"\b(?:gtm|go.to.market|acquisition|channel|referral|viral|"
r"word.of.mouth|ambassador|campus rep|partnership|integration|"
r"distribution|onboard|launch|rollout|phase\s*\d)\b",
re.IGNORECASE,
)
# Hand-wavy phrasing: unsized market claims, "everyone"-style audiences,
# "useful for"-style benefits, "AI will/can", and filler adverbs.
_VAGUE_PHRASES = re.compile(
r"\bbig market\b|\bhuge market\b|\blarge market\b"
r"|\beveryone\b|\banybody\b"
r"|\buseful for\b|\bhelpful for\b|\bgood for\b|\bgreat for\b"
r"|\bai will\b|\bai can\b"
r"|\bautomatically\b|\bseamlessly\b|\beasily\b|\bquickly\b"
r"|\bbasically\b|\bgenerally\b|\btypically\b|\bobviously\b",
re.IGNORECASE,
)
# Lower-case filler / evasion phrases. _is_non_answer lower-cases and strips
# the answer before comparing it against these entries.
_NON_ANSWER_EXACT: frozenset[str] = frozenset({
"ok", "okay", "idk", "i don't know", "dont know", "don't know",
"not sure", "maybe", "no idea", "hmm", "i'm not sure", "im not sure",
"i dont know", "i have no idea", "not really", "yeah", "sure",
"fine", "alright", "whatever", "pass", "i'll think about it",
"we'll figure it out", "good question",
})
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _match_all(pattern: re.Pattern, text: str) -> list[str]:
    """Return the full text of each match of *pattern* in *text*.

    Matches are reported in scan order with surrounding whitespace stripped;
    capture groups are ignored (only the whole match is kept).
    """
    hits: list[str] = []
    for found in pattern.finditer(text):
        whole = found.group(0)
        hits.append(whole.strip())
    return hits
def _dedup(lst: list[str]) -> list[str]:
    """Drop blank entries and case/whitespace-insensitive duplicates.

    The first spelling of each distinct value is kept, in its original form
    and original position relative to the other survivors.
    """
    # dict preserves insertion order, and setdefault never overwrites, so the
    # values end up being "first occurrence per normalised key", in order.
    first_by_key: dict[str, str] = {}
    for entry in lst:
        normalised = entry.lower().strip()
        if not normalised:
            continue
        first_by_key.setdefault(normalised, entry)
    return list(first_by_key.values())
def _is_non_answer(text: str) -> bool:
    """Return True when *text* is an evasion or filler reply.

    An answer counts as a non-answer when, after trimming and lower-casing:

    * it is empty or has fewer than four whitespace-separated words, or
    * it equals an entry of ``_NON_ANSWER_EXACT``, or
    * it opens with such an entry as a complete word or phrase
      ("sure, whatever you say ...", "i don't know, maybe later").

    The prefix must end at a word edge. A plain ``startswith`` test would also
    reject substantive answers that merely begin with the same letters
    ("Passive ..." vs "pass", "Surely ..." vs "sure", "Finely ..." vs "fine").

    NOTE(review): an answer that opens with a filler word and then carries
    real content ("Sure, we have 200 paying users ...") is still classified
    as a non-answer, as before - confirm that this is intended.
    """
    stripped = text.strip().lower()
    if not stripped or len(stripped.split()) < 4:
        return True
    if stripped in _NON_ANSWER_EXACT:
        return True
    for phrase in _NON_ANSWER_EXACT:
        if not stripped.startswith(phrase):
            continue
        # Exact equality was handled above, so at least one character
        # follows the prefix and the index below is always in range.
        if not stripped[len(phrase)].isalnum():
            return True
    return False
# ---------------------------------------------------------------------------
# Main extractor
# ---------------------------------------------------------------------------
def _accumulate_signals_from_text(
    text: str,
    *,
    all_numbers: list[str],
    all_pct: list[str],
    all_pricing: list[str],
    all_user_counts: list[str],
    all_validation: list[str],
    all_colleges: list[str],
    all_competitors: list[str],
    all_tech: list[str],
    all_revenue: list[str],
    all_retention: list[str],
    all_gtm: list[str],
    all_vague: list[str],
) -> None:
    """Append every signal found in *text* to the matching accumulator list.

    Each keyword argument is a caller-owned list that is extended in place
    with the raw (non-deduplicated) matches of one compiled pattern. Nothing
    is returned. Falsy or whitespace-only *text* leaves all lists untouched.
    """
    if not (text and str(text).strip()):
        return

    # (pattern, destination) pairs, applied in this order.
    routing = (
        (_NUMBER_METRIC, all_numbers),
        (_PERCENTAGE, all_pct),
        (_CURRENCY, all_pricing),
        (_USER_COUNT, all_user_counts),
        (_VALIDATION, all_validation),
        (_COLLEGE_CAMPUS, all_colleges),
        (_COMPETITORS, all_competitors),
        (_TECH_MECHANISM, all_tech),
        (_REVENUE, all_revenue),
        (_RETENTION, all_retention),
        (_GTM, all_gtm),
        (_VAGUE_PHRASES, all_vague),
    )
    for pattern, sink in routing:
        sink.extend(_match_all(pattern, text))
def extract_startup_context_signals(session: dict) -> dict[str, Any]:
    """Extract evidence signals from the startup form and voice-pitch transcript.

    Intended for sessions where the founder described the idea upfront but
    gave no battle answers; the Q&A ``history`` is not read at all.

    The text blocks scanned are the non-blank ``session["startup"]`` fields
    (name, problem, target_users, solution, why_ai, competitors, traction,
    ask) followed by ``session["voice_pitch"]["transcript"]`` when present.

    Returns:
        A dict with the same keys as ``extract_concrete_signals`` plus
        ``"source": "startup_context"``. ``non_answers`` and
        ``all_user_answers`` are always empty; ``best_user_quotes`` holds up
        to three blocks of six or more words, each cut to 200 characters.
        ``signal_count`` sums the unique matches of the first nine categories
        (retention, GTM and vague phrases are reported but not counted).
    """
    form = session.get("startup", {}) or {}
    form_fields = (
        "name", "problem", "target_users", "solution",
        "why_ai", "competitors", "traction", "ask",
    )
    blocks: list[str] = [
        cleaned
        for cleaned in (str(form.get(field, "")).strip() for field in form_fields)
        if cleaned
    ]

    pitch = session.get("voice_pitch") or {}
    if isinstance(pitch, dict):
        spoken = str(pitch.get("transcript", "")).strip()
        if spoken:
            blocks.append(spoken)

    # Keys are the keyword names expected by _accumulate_signals_from_text.
    counted = (
        "all_numbers", "all_pct", "all_pricing", "all_user_counts",
        "all_validation", "all_colleges", "all_competitors", "all_tech",
        "all_revenue",
    )
    uncounted = ("all_retention", "all_gtm", "all_vague")
    buckets: dict[str, list[str]] = {kw: [] for kw in counted + uncounted}

    for block in blocks:
        _accumulate_signals_from_text(block, **buckets)

    unique = {kw: _dedup(found) for kw, found in buckets.items()}
    quotes = [block[:200] for block in blocks if len(block.split()) >= 6]

    return {
        "numbers": unique["all_numbers"],
        "percentages": unique["all_pct"],
        "pricing": unique["all_pricing"],
        "user_counts": unique["all_user_counts"],
        "validation": unique["all_validation"],
        "college_mentions": unique["all_colleges"],
        "competitors": unique["all_competitors"],
        "technical_mechanisms": unique["all_tech"],
        "revenue_signals": unique["all_revenue"],
        "retention_signals": unique["all_retention"],
        "gtm_signals": unique["all_gtm"],
        "non_answers": [],
        "vague_claims": unique["all_vague"],
        "best_user_quotes": quotes[:3],
        "all_user_answers": [],
        "signal_count": sum(len(unique[kw]) for kw in counted),
        "source": "startup_context",
    }
def extract_concrete_signals(session: dict) -> dict[str, Any]:
    """Extract evidence signals from all user turns in a session.

    Only ``history`` entries whose ``role`` is ``"user"`` and whose
    ``content`` is a non-blank string are processed. No API calls.

    Turns that ``_is_non_answer`` classifies as evasions are recorded under
    ``non_answers`` and contribute no signals. Every other turn is scanned
    once; its *density* is the number of raw matches in the nine scored
    categories (numbers through revenue). Retention, GTM and vague-phrase
    matches are reported but count towards neither density nor
    ``signal_count``.

    Args:
        session: Session dict. ``history`` is expected to be a list of
            ``{"role": ..., "content": ...}`` dicts. A missing or ``None``
            history is treated as empty, and entries that are not dicts or
            whose content is not a string are skipped instead of raising.

    Returns:
        {
            "numbers": list[str],               # raw number matches
            "percentages": list[str],
            "pricing": list[str],               # currency amounts
            "user_counts": list[str],           # "50 beta users", etc.
            "validation": list[str],            # tested / piloted / surveyed
            "college_mentions": list[str],      # campus / college / IIT etc.
            "competitors": list[str],           # named competitors
            "technical_mechanisms": list[str],  # embedding / ranking model etc.
            "revenue_signals": list[str],       # subscription / pricing / CAC
            "retention_signals": list[str],     # churn / DAU / retention
            "gtm_signals": list[str],           # referral / ambassador / launch
            "non_answers": list[str],           # evasion/non-answer turns
            "vague_claims": list[str],          # buzzword phrases
            "best_user_quotes": list[str],      # up to 5 densest answers (density > 0)
            "all_user_answers": list[str],      # every user message
            "signal_count": int,                # unique matches, scored categories only
        }
        All signal lists are de-duplicated case-insensitively.
    """
    history = session.get("history") or []
    user_answers: list[str] = []
    for entry in history:
        if not isinstance(entry, dict) or entry.get("role") != "user":
            continue
        content = entry.get("content")
        if not isinstance(content, str):
            continue
        content = content.strip()
        if content:
            user_answers.append(content)

    # Names double as the keyword arguments of _accumulate_signals_from_text.
    scored = (
        "all_numbers", "all_pct", "all_pricing", "all_user_counts",
        "all_validation", "all_colleges", "all_competitors", "all_tech",
        "all_revenue",
    )
    unscored = ("all_retention", "all_gtm", "all_vague")
    totals: dict[str, list[str]] = {kw: [] for kw in scored + unscored}

    non_answer_turns: list[str] = []
    answer_scores: list[tuple[int, str]] = []

    for ans in user_answers:
        if _is_non_answer(ans):
            non_answer_turns.append(ans)
            answer_scores.append((0, ans))
            continue

        # Scan the answer once into fresh lists so the same matches feed both
        # the per-answer density and the session-wide totals.
        per_answer: dict[str, list[str]] = {kw: [] for kw in totals}
        _accumulate_signals_from_text(ans, **per_answer)
        for kw, found in per_answer.items():
            totals[kw].extend(found)

        density = sum(len(per_answer[kw]) for kw in scored)
        answer_scores.append((density, ans))

    # Stable sort by density, descending: equally dense answers keep their
    # conversation order. Only answers with at least one signal are quoted.
    ranked = sorted(answer_scores, key=lambda pair: pair[0], reverse=True)
    best_quotes = [ans for score, ans in ranked if score > 0][:5]

    unique = {kw: _dedup(found) for kw, found in totals.items()}

    return {
        "numbers": unique["all_numbers"],
        "percentages": unique["all_pct"],
        "pricing": unique["all_pricing"],
        "user_counts": unique["all_user_counts"],
        "validation": unique["all_validation"],
        "college_mentions": unique["all_colleges"],
        "competitors": unique["all_competitors"],
        "technical_mechanisms": unique["all_tech"],
        "revenue_signals": unique["all_revenue"],
        "retention_signals": unique["all_retention"],
        "gtm_signals": unique["all_gtm"],
        "non_answers": non_answer_turns,
        "vague_claims": unique["all_vague"],
        "best_user_quotes": best_quotes,
        "all_user_answers": user_answers,
        "signal_count": sum(len(unique[kw]) for kw in scored),
    }