Upload 2 files
Browse files- app.py +802 -0
- scraper.py +347 -0
app.py
ADDED
|
@@ -0,0 +1,802 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import openai
|
| 3 |
+
import json
|
| 4 |
+
from datetime import datetime, timedelta
|
| 5 |
+
import uuid
|
| 6 |
+
from typing import Dict
|
| 7 |
+
|
| 8 |
+
from config import OPENAI_API_KEY, DB_PATH, EMBED_MODEL
|
| 9 |
+
from utils import get_embedding, cosine_similarity, find_top_k_matches
|
| 10 |
+
from scraper import scrape_workshops_from_squarespace
|
| 11 |
+
from database import (
|
| 12 |
+
fetch_all_embeddings,
|
| 13 |
+
fetch_row_by_id,
|
| 14 |
+
fetch_all_faq_embeddings,
|
| 15 |
+
get_session_state,
|
| 16 |
+
update_session_state,
|
| 17 |
+
log_question
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
# ============================================================================
|
| 21 |
+
# CONFIGURATION
|
| 22 |
+
# ============================================================================
|
| 23 |
+
|
| 24 |
+
# Fail fast at import time: the app cannot embed questions or call the chat
# model without an OpenAI key loaded from the environment via config.py.
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY not found in .env file")

openai.api_key = OPENAI_API_KEY


# Store session ID for the conversation.
# NOTE(review): this is one UUID per *process*, not per user/browser tab —
# concurrent users would share it; confirm whether Gradio session state
# should be used instead.
session_id = str(uuid.uuid4())

# Cache for workshop data and embeddings, refreshed at most once per
# 'cache_duration'. 'last_updated' of None means the cache was never filled.
workshop_cache = {
    'data': [],            # list of workshop dicts from the scraper
    'embeddings': [],      # parallel list of embedding vectors
    'last_updated': None,  # datetime of last successful refresh
    'cache_duration': timedelta(hours=24)
}

# ============================================================================
# KEYWORD LISTS FOR ROUTING
# ============================================================================

# Substring matches against the lowercased question; used by
# detect_response_type() to decide whether to answer in "support" mode.
EMOTIONAL_KEYWORDS = [
    'stuck', 'frustrated', 'discouraged', 'overwhelmed', 'scared',
    'nervous', 'anxious', 'worried', 'fear', 'doubt', 'confidence',
    'insecure', 'lost', 'confused', 'struggling', 'hard time',
    'giving up', 'burnout', 'rejection', 'failed', 'can\'t',
    'feeling', 'feel', 'emotional', 'depressed', 'sad', 'unmotivated',
    'hopeless', 'stressed', 'pressure', 'imposter'
]

# Counterweight to EMOTIONAL_KEYWORDS: career/action-oriented terms that
# push a question back toward the standard (recommendation) response mode.
ACTION_KEYWORDS = [
    'get an agent', 'find agent', 'need agent', 'want agent', 'sign with agent',
    'more auditions', 'book', 'booking', 'callbacks', 'improve',
    'better', 'self-tape', 'materials', 'headshots', 'reel',
    'network', 'connections', 'industry', 'career', 'strategy',
    'agent prep', 'total agent prep', 'workshop', 'class', 'training',
    'results', 'success', 'grow', 'advance', 'level up'
]

# Any hit here short-circuits the bot entirely (see detect_policy_issue):
# refund/attendance/cancellation questions are routed to human support.
POLICY_KEYWORDS = [
    'refund', 'refunds', 'money back',
    'attend', 'attendance', 'miss', 'missed', 'missing', 'absent',
    'late', 'lateness', 'tardy',
    'reschedule', 'change date', 'move class',
    'credit', 'credits',
    'cancel', 'cancellation', 'canceling',
    'policy', 'policies'
]

# Phrases signalling the user wants a long-form answer.
# NOTE(review): not referenced anywhere in this file's visible code —
# presumably consumed elsewhere to set build_enhanced_prompt's
# wants_details flag; confirm before removing.
DETAIL_SYNONYMS = [
    'detail', 'details', 'explain', 'elaborate', 'tell me more',
    'more info', 'describe', 'thorough', 'comprehensive'
]

# System-prompt preamble prepended to every prompt built in this module.
PERSONA_INSTRUCTION = """
You are a warm, encouraging mentor at Get Scene Studios. Your goal is to help actors navigate their careers with confidence.
- Sound natural and human, not scripted or robotic. Use conversational transitions like "I'd suggest starting with..." or "A great way to approach this is..."
- Be encouraging but practical. Acknowledge that the acting journey is a marathon, not a sprint.
- Help the user THINK: Instead of just giving an answer, add a brief "mentorship flourish" that explains the value of a recommendation (e.g., "This workshop is great because it gets you comfortable with the pressure of a real callback.")
"""
|
| 84 |
+
|
| 85 |
+
# ============================================================================
|
| 86 |
+
# HELPER FUNCTIONS
|
| 87 |
+
# ============================================================================
|
| 88 |
+
|
| 89 |
+
def calculate_workshop_confidence(w: Dict) -> float:
    """Score how complete a scraped workshop record is, from 0.0 to 1.0.

    Core identity fields (title, instructor) weigh more than scheduling
    extras; a record with title + instructor + date already reaches 0.8,
    the acceptance threshold used by get_current_workshops().
    """
    # Field -> weight, in the same order the weights are accumulated so the
    # floating-point partial sums (before rounding) match exactly.
    field_weights = (
        ('title', 0.3),
        ('instructor_name', 0.3),
        ('date', 0.2),
        ('time', 0.1),
        ('source_url', 0.1),
    )
    total = sum(weight for field, weight in field_weights if w.get(field))
    return round(total, 2)
|
| 98 |
+
|
| 99 |
+
# ============================================================================
|
| 100 |
+
# WORKSHOP FUNCTIONS
|
| 101 |
+
# ============================================================================
|
| 102 |
+
|
| 103 |
+
def get_current_workshops():
    """Return (workshops, embeddings), serving the 24h cache or re-scraping.

    Flow: 1) serve cache if fresh and non-empty; 2) otherwise scrape both the
    online and in-studio Squarespace pages; 3) drop records whose completeness
    confidence is below 0.8; 4) if nothing valid survives, fall back to stale
    cached data (or ([], []) if the cache was never filled); 5) embed each
    surviving workshop and refresh the cache.

    Returns:
        Tuple (list of workshop dicts, parallel list of embedding vectors).
    """
    global workshop_cache

    now = datetime.now()

    # Check if cache is still valid: must have been filled at least once,
    # be younger than cache_duration, and actually contain data.
    if (workshop_cache['last_updated'] and
        now - workshop_cache['last_updated'] < workshop_cache['cache_duration'] and
        workshop_cache['data']):
        print("Using cached workshop data")
        return workshop_cache['data'], workshop_cache['embeddings']

    print("Fetching fresh workshop data...")

    # Scrape both listing pages; each returns a list of workshop dicts.
    online_workshops = scrape_workshops_from_squarespace("https://www.getscenestudios.com/online")
    instudio_workshops = scrape_workshops_from_squarespace("https://www.getscenestudios.com/instudio")

    all_workshops = online_workshops + instudio_workshops

    # Data integrity: keep only records complete enough to recommend
    # (confidence >= 0.8, i.e. at least title + instructor + date present).
    valid_workshops = []
    total_score = 0
    for w in all_workshops:
        conf = calculate_workshop_confidence(w)
        if conf >= 0.8:
            valid_workshops.append(w)
            total_score += conf
        else:
            print(f"⚠️ Rejecting weak record (Confidence: {conf}): {w.get('title', 'Unknown')}", flush=True)

    avg_conf = total_score / len(valid_workshops) if valid_workshops else 0
    print(f"📊 DATA INTEGRITY: Found {len(all_workshops)} total, {len(valid_workshops)} valid (Confidence >= 0.8)", flush=True)
    print(f"📈 Retrieval Confidence: {avg_conf:.2f} (Average)", flush=True)

    all_workshops = valid_workshops

    # Scrape produced nothing usable: prefer stale cached data over nothing.
    if not all_workshops:
        if workshop_cache['data']:
            print("Scraping failed, using cached data")
            return workshop_cache['data'], workshop_cache['embeddings']
        else:
            print("No workshop data available")
            return [], []

    # Generate embeddings for workshops; on failure insert a zero vector so
    # the embeddings list stays parallel to the workshops list.
    # NOTE(review): 1536 is assumed to be EMBED_MODEL's dimension — confirm
    # it matches the model configured in config.py.
    workshop_embeddings = []
    for workshop in all_workshops:
        try:
            embedding = get_embedding(workshop['full_text'])
            workshop_embeddings.append(embedding)
        except Exception as e:
            print(f"Error generating embedding for workshop: {e}")
            workshop_embeddings.append([0] * 1536)

    # Update cache only after a fully successful refresh.
    workshop_cache['data'] = all_workshops
    workshop_cache['embeddings'] = workshop_embeddings
    workshop_cache['last_updated'] = now

    print(f"Cached {len(all_workshops)} workshops")
    return all_workshops, workshop_embeddings
|
| 166 |
+
|
| 167 |
+
def find_top_workshops(user_embedding, k=3):
    """Return the k workshops most similar to the user's question embedding.

    Each result is a tuple (similarity, index, full_text, workshop_dict),
    sorted best-first. Returns [] when no workshop data is available.
    """
    catalog, catalog_embeddings = get_current_workshops()
    if not catalog:
        return []

    ranked = []
    for idx, (entry, vector) in enumerate(zip(catalog, catalog_embeddings)):
        try:
            similarity = cosine_similarity(user_embedding, vector)
        except Exception as e:
            # Skip entries whose vectors can't be compared (e.g. bad shape).
            print(f"Error calculating similarity: {e}")
            continue
        ranked.append((similarity, idx, entry['full_text'], entry))

    # (similarity, idx) is unique per entry, so sorting on that pair matches
    # a full-tuple sort while never comparing the text/dict elements.
    ranked.sort(key=lambda item: (item[0], item[1]), reverse=True)
    return ranked[:k]
|
| 185 |
+
|
| 186 |
+
# ============================================================================
|
| 187 |
+
# PROMPT BUILDING FUNCTIONS
|
| 188 |
+
# ============================================================================
|
| 189 |
+
|
| 190 |
+
def generate_enriched_links(row):
    """Build a one-line markdown podcast recommendation from a DB row.

    Args:
        row: Mapping with 'youtube_url', 'guest_name', and optionally
             'highlight_json' (a JSON array of {"summary": ...} objects).

    Returns:
        A single-element list containing the markdown line, so callers can
        treat it like a list of link suggestions.
    """
    base_url = row.get("youtube_url")
    guest_name = row.get("guest_name", "")

    # Defensive parse: 'highlight_json' comes from the database and may be
    # missing, None, or malformed. The original json.loads() crashed the
    # whole request in those cases; fall back to "no highlights" instead.
    try:
        highlights = json.loads(row.get("highlight_json", "[]"))
    except (json.JSONDecodeError, TypeError, ValueError):
        highlights = []

    first = highlights[0] if isinstance(highlights, list) and highlights else {}
    # Use .get: a highlight object without a "summary" key is treated the
    # same as having no highlights (the old ["summary"] raised KeyError).
    summary = first.get("summary", "") if isinstance(first, dict) else ""

    # Truncate summary to first sentence only, capped at 120 characters.
    if summary:
        first_sentence = summary.split('.')[0] + '.'
        if len(first_sentence) > 120:
            short_summary = first_sentence[:117] + "..."
        else:
            short_summary = first_sentence
    else:
        short_summary = "Industry insights for actors"

    markdown = f"🎧 [Watch {guest_name}'s episode here]({base_url}) - {short_summary}"
    return [markdown]
|
| 209 |
+
|
| 210 |
+
def build_enhanced_prompt(user_question, context_results, top_workshops, user_preference=None, enriched_podcast_links=None, wants_details=False, current_topic=None):
    """Builds the system prompt with strict formatting rules.

    Args:
        user_question: Raw question text from the user.
        context_results: Retrieved context rows. NOTE(review): not referenced
            in this function's body — kept for interface stability; confirm
            whether it should feed the prompt.
        top_workshops: Scored tuples (score, index, full_text, workshop_dict)
            as produced by find_top_workshops().
        user_preference: 'online' / 'instudio' when known, else None.
        enriched_podcast_links: Markdown lines from generate_enriched_links(),
            or None/empty when no podcast match was found.
        wants_details: True when the user explicitly asked for depth.
        current_topic: Topic label persisted in session state, used only when
            the current question contains no topic keywords.

    Returns:
        The complete system prompt string for the chat model. Emotional
        questions get a short "support mode" prompt with no paid upsell;
        everything else gets the standard three-recommendation template.
    """

    # Free classes are ONLY available online (never in-studio)
    free_class_url = "https://www.getscenestudios.com/online"

    # Helper: render one workshop as a markdown bullet, or None to reject it.
    def format_workshop(w):
        # Reject records missing any of the three essential fields.
        if not w.get('title') or not w.get('instructor_name') or not w.get('date'):
            return None

        # Link to the listing page matching where the record was scraped from.
        link = "https://www.getscenestudios.com/instudio" if "/instudio" in w.get('source_url', '') else "https://www.getscenestudios.com/online"

        # User preference filtering: drop workshops in the wrong format.
        w_type = "Online" if "online" in w.get('source_url', '') else "In-Studio"
        if user_preference:
            if user_preference.lower() != w_type.lower():
                return None

        # Re-check data completeness (slightly looser 0.70 bar than the 0.8
        # used at scrape time, since records may come from the stale cache).
        confidence = calculate_workshop_confidence(w)
        if confidence < 0.70:
            return None

        # Bake the format into the link text so the model can't drop it.
        display_title = f"{w['title']} ({w_type})"
        return f"- [{display_title}]({link}) with {w['instructor_name']} on {w['date']} at {w.get('time', '')}"

    # Prepare workshop list: scan up to 10 candidates, keep the survivors
    # (only the top 3 are rendered below).
    workshop_lines = []
    if top_workshops:
        for _, _, _, w_data in top_workshops[:10]:
            formatted = format_workshop(w_data)
            if formatted:
                workshop_lines.append(formatted)


    workshop_text = ""
    if workshop_lines:
        workshop_text = "\n".join(workshop_lines[:3])
    else:
        # No valid workshop to show: point at the schedule page matching the
        # user's stated preference (defaulting to the online page).
        label = f"{user_preference.capitalize()} " if user_preference else ""
        link = "https://www.getscenestudios.com/online" if user_preference == 'online' else "https://www.getscenestudios.com/instudio" if user_preference == 'instudio' else "https://www.getscenestudios.com/online"
        workshop_text = f"We are constantly updating our schedule! Check our current {label}availability and latest workshops at {link}"

    # Podcast slot: fall back to the channel link when no episode matched.
    if not enriched_podcast_links:
        single_podcast = "Our latest industry insights are available on YouTube: https://www.youtube.com/@GetSceneStudios"
    else:
        single_podcast = enriched_podcast_links[0]

    # --- EMOTIONAL / SUPPORT MODE CHECK ---
    is_emotional = detect_response_type(user_question) == "support"

    if is_emotional:
        # Support mode: empathy first, exactly one free resource, no upsell.
        prompt = f"""{PERSONA_INSTRUCTION}

You are acting in SUPPORT MODE.

CRITICAL INSTRUCTIONS:
1. ACKNOWLEDGE their feelings first (e.g., "I hear how frustrating it is to feel stuck...").
2. Provide SUPPORTIVE language (2-3 sentences max).
3. Offer EXACTLY ONE gentle follow-up resource: either the podcast OR the free class.
4. DO NOT suggest paid workshops or upsell in this response.
5. KEEP IT BRIEF (≤150 words).

USER'S QUESTION: {user_question}

REQUIRED RESPONSE FORMAT:
[Your empathetic, supportive acknowledgment]

Here's a free resource that might help you move forward:
[Pick ONE: {single_podcast} OR Free Class at {free_class_url}]

Questions? Contact info@getscenestudios.com"""
        return prompt

    # --- STANDARD LOGIC FOR CONTEXT SNIPPET ---
    question_lower = user_question.lower()
    context_snippet = ""

    # Priority 1: topic keywords in the current question win outright.
    detected_topic = None
    if any(word in question_lower for word in ['agent', 'representation', 'rep', 'manager']):
        detected_topic = 'agent'
    elif any(word in question_lower for word in ['beginner', 'new', 'start', 'beginning']):
        detected_topic = 'beginner'
    elif any(word in question_lower for word in ['callback', 'audition', 'tape', 'self-tape', 'booking']):
        detected_topic = 'audition'
    elif any(word in question_lower for word in ['mentorship', 'coaching']):
        detected_topic = 'mentorship'
    elif any(word in question_lower for word in ['price', 'cost', 'how much']):
        detected_topic = 'pricing'

    # Priority 2: fall back to the session's remembered topic when the
    # current question is ambiguous (e.g. "tell me more").
    if not detected_topic and current_topic:
        topic_map = {
            'agent_seeking': 'agent',
            'beginner': 'beginner',
            'audition_help': 'audition',
            'mentorship': 'mentorship',
            'pricing': 'pricing'
        }
        detected_topic = topic_map.get(current_topic)

    # Inject a short topic-specific business blurb into the prompt.
    if detected_topic == 'agent':
        context_snippet = "Get Scene Studios has helped 1000+ actors land representation. Total Agent Prep offers live practice with working agents (age 16+, limited to 12 actors)."
    elif detected_topic == 'beginner':
        context_snippet = "Get Scene Studios specializes in getting actors audition-ready fast with camera technique and professional self-tape skills."
    elif detected_topic == 'audition':
        context_snippet = "Get Scene offers Crush the Callback (Zoom simulation) and Perfect Submission (self-tape mastery) for actors refining their technique."
    elif detected_topic == 'mentorship':
        context_snippet = "Working Actor Mentorship is a 6-month program ($3,000) with structured feedback and industry access."
    elif detected_topic == 'pricing':
        context_snippet = "Get Scene Studios pricing varies by program. Most workshops cap at 12-14 actors for personalized feedback."
    else:
        context_snippet = "Get Scene Studios (founded by Jesse Malinowski) offers training for TV/film actors at all levels."

    # Either ask for the online/in-studio preference, or forbid re-asking.
    preference_instruction = ""
    if not user_preference:
        preference_instruction = """
IMPORTANT: We need to know if the user prefers "Online" or "In-Studio" workshops.
If their question implies a location or they haven't specified, ask: "Are you looking for Online or In-Studio training?" as part of your response.
"""
    else:
        preference_instruction = f"""
USER PREFERENCE KNOWN: {user_preference.upper()}
1. DO NOT ask "Online or In-Studio" again.
2. Ensure your recommendations align with {user_preference.upper()} where possible.
"""

    # Brevity control: short answers by default, long only on explicit request.
    detail_instruction = "Answer the user's question briefly (2-3 sentences max, ≤150 words total)."
    if wants_details:
        detail_instruction = "Provide a detailed and thorough explanation for the user's request, but keep it structured and readable."

    prompt = f"""{PERSONA_INSTRUCTION}

{context_snippet}

CRITICAL INSTRUCTIONS:
- {detail_instruction}
- Use natural, human transitions between your answer and the recommendations.
- For each recommendation, add a tiny bit of "mentor advice" on why it helps.
- Then ALWAYS provide exactly these three numbered recommendations (1. 2. 3.):
- Use ONLY the provided links - do not invent recommendations
- Every workshop Title MUST be followed by its format in parentheses, e.g., "Workshop Name (Online)" or "Workshop Name (In-Studio)".
- Focus on clean, readable formatting.{preference_instruction}

USER'S QUESTION: {user_question}

REQUIRED RESPONSE FORMAT:
[Your brief answer to their question, ≤150 words total]

Here's your path forward:
1. Free class (start here, no credit card required): {free_class_url}
2. Recommended podcast episode:
{single_podcast}
3. Relevant paid workshop:
{workshop_text}

Questions? Contact info@getscenestudios.com"""

    return prompt
|
| 376 |
+
|
| 377 |
+
# ============================================================================
|
| 378 |
+
# DETECTION FUNCTIONS
|
| 379 |
+
# ============================================================================
|
| 380 |
+
|
| 381 |
+
def detect_question_category(question):
    """Return every category label whose keywords appear in the question.

    Matching is case-insensitive substring search; the result preserves the
    declaration order of the category table below.
    """
    text = question.lower()

    # Category label -> trigger keywords (substring-matched).
    categories = {
        'agent_seeking': ['agent', 'representation', 'rep', 'manager', 'get an agent'],
        'beginner': ['beginner', 'new', 'start', 'beginning', 'first time', 'never acted'],
        'audition_help': ['audition', 'callback', 'tape', 'self-tape', 'submission'],
        'mentorship': ['mentorship', 'coaching', 'intensive', 'mentor', 'one-on-one'],
        'pricing': ['price', 'cost', 'pricing', '$', 'money', 'payment', 'fee'],
        'classes': ['class', 'workshop', 'training', 'course', 'learn'],
        'membership': ['membership', 'join', 'member', 'gsp', 'plus'],
        'technical': ['self-tape', 'equipment', 'lighting', 'editing', 'camera']
    }

    return [
        label
        for label, triggers in categories.items()
        if any(term in text for term in triggers)
    ]
|
| 402 |
+
|
| 403 |
+
def detect_response_type(question):
    """Classify a question as 'support' (emotional) or 'standard' (action).

    Counts keyword hits from both lists; emotional language wins ties so
    struggling users are never met with an upsell.
    """
    text = question.lower()

    emotional_hits = sum(term in text for term in EMOTIONAL_KEYWORDS)
    action_hits = sum(term in text for term in ACTION_KEYWORDS)

    return "support" if emotional_hits and emotional_hits >= action_hits else "standard"
|
| 413 |
+
|
| 414 |
+
def detect_policy_issue(question):
    """Return True when the question touches refund/attendance/cancel policy.

    Such questions are escalated to email support instead of being answered
    by the model (see the hard policy check in process_question).
    """
    text = question.lower()
    for keyword in POLICY_KEYWORDS:
        if keyword in text:
            return True
    return False
|
| 418 |
+
|
| 419 |
+
def detect_preference(question):
    """Infer 'online' / 'instudio' from the question, or None if ambiguous.

    A question mentioning both formats (or neither) yields None so the bot
    can ask the user to clarify.
    """
    text = question.lower()
    mentions_online = 'online' in text
    mentions_studio = 'studio' in text

    if mentions_online and not mentions_studio:
        return 'online'
    # "person" (in-person) and "atlanta" (the studio's city) also imply studio.
    if (mentions_studio or 'person' in text or 'atlanta' in text) and not mentions_online:
        return 'instudio'
    return None
|
| 427 |
+
|
| 428 |
+
def get_contextual_business_info(categories):
    """Map detected question categories to business-context blurbs.

    Args:
        categories: Category labels, e.g. from detect_question_category().

    Returns:
        Dict keyed by the subset of *categories* that have an entry below,
        each value holding 'programs', 'key_info', and 'journey' strings.
    """

    context_map = {
        'agent_seeking': {
            'programs': ['Total Agent Prep', 'Working Actor Mentorship'],
            'key_info': 'Live pitch practice with real agents, Actors Access optimization',
            'journey': 'Total Agent Prep → GSP → Mentorship for sustained progress'
        },
        'beginner': {
            'programs': ['Free Classes', 'Get Scene 360', 'Get Scene Plus'],
            'key_info': 'Start with holistic foundation, build consistency',
            'journey': 'Free class → Get Scene 360 → GSP membership'
        },
        'audition_help': {
            'programs': ['Perfect Submission', 'Crush the Callback', 'Audition Insight'],
            'key_info': 'Self-tape mastery, callback simulation, pro feedback',
            'journey': 'Perfect Submission → GSP for ongoing Audition Insight'
        },
        'mentorship': {
            'programs': ['Working Actor Mentorship'],
            'key_info': '6-month intensive with structured feedback and accountability',
            'journey': 'Ready for commitment → WAM → Advanced workshops'
        }
    }

    # Categories with no entry (e.g. 'pricing', 'classes') are silently skipped.
    return {name: context_map[name] for name in categories if name in context_map}
|
| 460 |
+
|
| 461 |
+
# ============================================================================
|
| 462 |
+
# MAIN CHATBOT LOGIC
|
| 463 |
+
# ============================================================================
|
| 464 |
+
|
| 465 |
+
def update_knowledge_from_question(session_id: str, question: str):
    """Extract format/topic attributes from the question into session state.

    Persists any detected 'format' (online/instudio) and 'topic' via
    update_session_state(), without bumping the interaction counter.

    Returns:
        The dict of attributes written, or {} when nothing was detected.
    """
    updates = {}

    # Format preference (online vs. in-studio), if the question states one.
    preference = detect_preference(question)
    if preference:
        updates['format'] = preference

    # Topic: prefer the most specific category; fall back to the first match.
    categories = detect_question_category(question)
    if categories:
        priority_topics = ['agent_seeking', 'beginner', 'audition_help', 'mentorship', 'pricing']
        updates['topic'] = next(
            (topic for topic in priority_topics if topic in categories),
            categories[0],
        )

    if updates:
        update_session_state(session_id, knowledge_update=updates, increment_count=False)
        return updates
    return {}
|
| 490 |
+
|
| 491 |
+
def process_question(question: str, current_session_id: str):
    """Answer a user question directly - replaces the old Flask /ask endpoint.

    Resolution order:
      0. Hard policy check (sensitive topics are routed to the support email).
      1. Session/knowledge state update derived from the question itself.
      2. High-confidence FAQ match (cosine similarity >= 0.85).
      3. Ambiguous FAQ match (>= 0.70): may ask one clarifying question.
      4. Hallucination guard, then LLM fallback with workshop/podcast context.

    Returns the answer text as a string.
    """

    if not question:
        return "Question is required"

    # 0. HARD POLICY CHECK
    if detect_policy_issue(question):
        log_question(question, current_session_id)

        return "Please email info@getscenestudios.com."

    # 1. Handle Session & Knowledge State
    update_knowledge_from_question(current_session_id, question)

    session_state = get_session_state(current_session_id)

    # FIX: was a bare `except:` that swallowed every exception (including
    # KeyboardInterrupt). Only malformed/absent JSON should fall back to {}.
    try:
        knowledge = json.loads(session_state.get('knowledge_context', '{}'))
    except (json.JSONDecodeError, TypeError):
        knowledge = {}

    user_preference = knowledge.get('format')
    current_topic = knowledge.get('topic')

    if not user_preference:
        user_preference = session_state.get('preference')

    update_session_state(current_session_id, increment_count=True)

    # Create embedding of user question
    user_embedding = get_embedding(question)

    # Check FAQ embeddings first
    faq_data = fetch_all_faq_embeddings()
    top_faqs = []

    for entry_id, question_text, answer_text, emb in faq_data:
        score = cosine_similarity(user_embedding, emb)
        top_faqs.append((score, entry_id, question_text, answer_text))
    # FIX: sort on the score only; whole-tuple comparison would fall through
    # to comparing ids/strings whenever two scores tie.
    top_faqs.sort(key=lambda entry: entry[0], reverse=True)

    faq_threshold = 0.85
    ambiguous_threshold = 0.70

    # 2. High-confidence FAQ match
    if top_faqs and top_faqs[0][0] >= faq_threshold:
        update_session_state(current_session_id, reset_clarification=True, increment_count=False)

        best_score, faq_id, question_text, answer_text = top_faqs[0]

        mentor_framing_start = "That's a great question! Here's the information on that:"
        mentor_framing_end = "I hope that clears things up! Remember, every bit of knowledge helps you steer your career in the right direction."

        enhanced_answer = f"{mentor_framing_start}\n\n{answer_text}"

        # R5: Policy Guard for FAQ answers - never surface policy-sensitive
        # FAQ text directly; redirect to email instead.
        if any(word in enhanced_answer.lower() for word in POLICY_KEYWORDS):
            enhanced_answer = "Please email info@getscenestudios.com for assistance with this."
        else:
            categories = detect_question_category(question)
            contextual_info = get_contextual_business_info(categories)

            if contextual_info:
                next_steps = [f"A great next step for you: {info['journey']}"
                              for category, info in contextual_info.items()]
                if next_steps:
                    enhanced_answer += f"\n\n{chr(10).join(next_steps)}"

            enhanced_answer += f"\n\n{mentor_framing_end}\n\nQuestions? Contact info@getscenestudios.com"

        # Log question with the answer we actually returned.
        log_question(question, current_session_id, answer=enhanced_answer)

        return enhanced_answer

    elif top_faqs and top_faqs[0][0] >= ambiguous_threshold:
        # 3. AMBIGUOUS ZONE: maybe ask a clarifying question first.
        needs_clarification = False

        if not user_preference:
            needs_clarification = True

        # Generic price/schedule questions are unanswerable without a topic.
        is_generic_query = any(w in question.lower() for w in ['price', 'cost', 'how much', 'schedule', 'when'])
        if is_generic_query and not current_topic:
            needs_clarification = True

        # Never ask twice in a row.
        clarification_count = session_state.get('clarification_count', 0)
        if clarification_count > 0:
            needs_clarification = False

        if needs_clarification:
            update_session_state(current_session_id, increment_clarification=True, increment_count=False)
            best_match_q = top_faqs[0][2]
            return f"Did you mean: {best_match_q}?"

        # Auto-Resolve to the best FAQ match.
        update_session_state(current_session_id, reset_clarification=True, increment_count=False)

        best_score, faq_id, question_text, answer_text = top_faqs[0]

        categories = detect_question_category(question)
        contextual_info = get_contextual_business_info(categories)

        enhanced_answer = answer_text
        if contextual_info:
            next_steps = [f"Next step: Consider {info['journey']}"
                          for category, info in contextual_info.items()]
            if next_steps:
                enhanced_answer += f"\n\n{chr(10).join(next_steps)}"
        enhanced_answer += f"\n\nQuestions? Contact info@getscenestudios.com"

        log_question(question, current_session_id, answer=enhanced_answer)

        return enhanced_answer

    else:
        # 3. HALLUCINATION GUARD: refuse to guess on off-domain questions.
        categories = detect_question_category(question)

        is_acting_related = (
            len(categories) > 0 or
            detect_response_type(question) == "support" or
            any(k in question.lower() for k in ACTION_KEYWORDS) or
            any(k in question.lower() for k in ['acting', 'actor', 'scene', 'audition', 'theatre', 'film', 'tv', 'commercial', 'agent', 'rep', 'manager'])
        )

        if not is_acting_related:
            return "I'm not exactly sure about that. Please email info@getscenestudios.com so a member of our team can get you the most accurate answer!"

        # 4. LLM PATH
        update_session_state(current_session_id, reset_clarification=True, increment_count=False)
        podcast_data = fetch_all_embeddings("podcast_episodes")
        top_workshops = find_top_workshops(user_embedding, k=10)
        top_podcasts = find_top_k_matches(user_embedding, podcast_data, k=3)

        enriched_podcast_links = []
        for _, podcast_id, _ in top_podcasts:
            row = fetch_row_by_id("podcast_episodes", podcast_id)
            enriched_podcast_links.extend(generate_enriched_links(row))

        # FIX: guard the fallback against an empty podcast table; the
        # original indexed podcast_data[0] unconditionally.
        if not enriched_podcast_links and podcast_data:
            fallback = fetch_row_by_id("podcast_episodes", podcast_data[0][0])
            enriched_podcast_links = generate_enriched_links(fallback)

        # 5. Brevity & Detail Detection
        wants_details = any(syn in question.lower() for syn in DETAIL_SYNONYMS)

        final_prompt = build_enhanced_prompt(
            question,
            None,
            top_workshops,
            user_preference=user_preference,
            enriched_podcast_links=enriched_podcast_links,
            wants_details=wants_details,
            current_topic=current_topic
        )

        response = openai.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": final_prompt},
                {"role": "user", "content": question}
            ]
        )

        answer = response.choices[0].message.content.strip()

        # FIX: log the generated answer too, consistent with the FAQ paths.
        log_question(question, current_session_id, answer=answer)

        return answer
|
| 668 |
+
# ============================================================================
|
| 669 |
+
# GRADIO INTERFACE
|
| 670 |
+
# ============================================================================
|
| 671 |
+
|
| 672 |
+
def chat_with_bot(message, history):
    """
    Process a chat message directly, without a Flask API hop.

    Args:
        message: User's current message (may be None/empty on UI glitches).
        history: Chat history as a list of message dicts (Gradio "messages"
                 format); may be None on the first render.

    Returns:
        Updated history including the new user/assistant exchange.
    """
    global session_id

    # FIX: Gradio can hand us None for the history (first render) or the
    # message; the original crashed with AttributeError/TypeError here.
    if history is None:
        history = []
    if not message or not message.strip():
        return history

    try:
        # Process question directly.
        bot_reply = process_question(message, session_id)
    except Exception as e:
        # UI boundary: surface the error in-chat rather than crashing the app.
        bot_reply = f"❌ Error: {str(e)}"

    # Append to history in Gradio 6.0 "messages" format.
    history.append({"role": "user", "content": message})
    history.append({"role": "assistant", "content": bot_reply})
    return history
|
| 699 |
+
def reset_session():
    """Rotate the global session ID to start a brand-new conversation."""
    global session_id
    fresh_id = uuid.uuid4()
    session_id = str(fresh_id)
    # Returning an empty list clears the chatbot history in the UI.
    return []  #, f"🔄 New session started: {session_id[:8]}..."
|
| 705 |
+
# Create Gradio interface.
# Layout: markdown header, chat window, message row (textbox + send button),
# and a row of utility buttons (clear history / new session).
with gr.Blocks(title="Get Scene Studios Chatbot") as demo:

    gr.Markdown(
        """
        # 🎬 Get Scene Studios AI Chatbot

        Ask questions about acting classes, workshops and more!
        """
    )

    # # Session info display
    # session_info = gr.Textbox(
    #     label="Current Session ID",
    #     value=f"Session: {session_id[:8]}...",
    #     interactive=False,
    #     scale=1
    # )

    # Chatbot interface
    chatbot = gr.Chatbot(
        label="Conversation",
        height=500
    )

    # Input area
    with gr.Row():
        msg = gr.Textbox(
            label="Your Message",
            lines=2,
            scale=4
        )
        submit_btn = gr.Button("Send 📤", scale=1, variant="primary")

    # Action buttons
    with gr.Row():
        clear_btn = gr.Button("Clear Chat 🗑️", scale=1)
        reset_btn = gr.Button("New Session 🔄", scale=1)

    # Example questions (kept for reference; enable by uncommenting)
    # gr.Examples(
    #     examples=[
    #         "How much does it cost?",
    #         "I want to get an agent",
    #         "I'm a beginner, where should I start?",
    #         "Tell me about your workshops",
    #         "Do you have online classes?",
    #         "What's the difference between Perfect Submission and Crush the Callback?",
    #         "I prefer in-studio training",
    #         "Tell me about mentorship programs"
    #     ],
    #     inputs=msg,
    #     label="💡 Try these example questions:"
    # )

    # Event handlers.
    # Send button: run the bot, then clear the input box via the chained .then().
    submit_btn.click(
        fn=chat_with_bot,
        inputs=[msg, chatbot],
        outputs=[chatbot]
    ).then(
        fn=lambda: "",
        inputs=None,
        outputs=[msg]
    )

    # Pressing Enter in the textbox behaves exactly like the Send button.
    msg.submit(
        fn=chat_with_bot,
        inputs=[msg, chatbot],
        outputs=[chatbot]
    ).then(
        fn=lambda: "",
        inputs=None,
        outputs=[msg]
    )

    # Clear only wipes the visible history; the session (and its knowledge
    # state) is kept.
    clear_btn.click(
        fn=lambda: [],
        inputs=None,
        outputs=[chatbot]
    )

    # New Session rotates the session ID and clears the chat window.
    reset_btn.click(
        fn=reset_session,
        inputs=None,
        outputs=[chatbot] #, session_info]
    )
|
| 793 |
+
# Launch the app
if __name__ == "__main__":
    banner = "=" * 60
    # Startup banner so the console shows what is being launched.
    print("\n" + banner)
    print("🎬 Get Scene Studios Chatbot")
    print(banner)
    print("\n✅ No Flask API needed - all processing is done directly!")
    print("🌐 Gradio interface will open in your browser")
    print(banner + "\n")

    demo.launch()
|
scraper.py
ADDED
|
@@ -0,0 +1,347 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import json
|
| 3 |
+
import re
|
| 4 |
+
from bs4 import BeautifulSoup
|
| 5 |
+
from typing import List, Dict, Any, Tuple
|
| 6 |
+
from utils import clean_time
|
| 7 |
+
|
| 8 |
+
def scrape_workshops_from_squarespace(url: str) -> List[Dict[str, str]]:
    """Scrape workshop listings from a Squarespace page.

    Tries the `?format=json` endpoint first (Squarespace's JSON view of the
    page) and falls back to parsing the rendered HTML. Returns an empty
    list on any failure.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Attempt 1: the Squarespace JSON API.
        api_url = f"{url}?format=json"
        print(f"🔍 Trying Squarespace JSON API: {api_url}")

        api_response = requests.get(api_url, headers=request_headers, timeout=10)
        if api_response.status_code == 200:
            try:
                payload = api_response.json()
                found = extract_workshops_from_json(payload, api_url)
                if found:
                    print(f"✅ Extracted {len(found)} workshops from JSON API")
                    return found
                print("❌ No workshops found in JSON, falling back to HTML")
            except json.JSONDecodeError:
                print("❌ Invalid JSON response, falling back to HTML")

        # Attempt 2: scrape the rendered HTML page.
        print(f"📄 Falling back to HTML scraping for {url}")
        page_response = requests.get(url, headers=request_headers, timeout=10)
        page_response.raise_for_status()

        page_soup = BeautifulSoup(page_response.content, 'html.parser')
        found = parse_workshops_from_html(page_soup, url)

        if not found:
            print("❌ No workshops found in HTML")
            return []

        print(f"✅ Extracted {len(found)} workshops from HTML parsing")
        return found

    except Exception as e:
        # Best-effort scraper: report and return nothing rather than crash.
        print(f"❌ Error scraping workshops from {url}: {e}")
        return []
|
| 53 |
+
def extract_workshops_from_json(data: Any, source_url: str) -> List[Dict[str, str]]:
    """Pull workshop entries out of a Squarespace JSON payload.

    Squarespace embeds the rendered page HTML under the 'mainContent' key;
    when present (and a string), that fragment is parsed exactly like a
    scraped page. Anything else yields an empty list.
    """
    if not isinstance(data, dict):
        return []

    html_fragment = data.get('mainContent')
    if not isinstance(html_fragment, str):
        return []

    print(f"🎯 Found mainContent HTML! Length: {len(html_fragment)} characters")

    fragment_soup = BeautifulSoup(html_fragment, 'html.parser')
    return parse_workshops_from_html(fragment_soup, source_url)
|
| 71 |
+
def parse_workshops_from_html(soup, source_url: str) -> List[Dict[str, str]]:
    """Enhanced HTML parsing specifically for workshop content.

    Two strategies are combined:
      1. Scan elements whose CSS class looks like a listing container and
         run per-container text extraction.
      2. Run regex patterns over the page's full text.
    Duplicates across both passes are filtered via is_duplicate_workshop.
    Returns a list of workshop dicts.
    """
    workshops = []
    workshop_texts = set()  # raw container texts already consumed (Method 1 dedupe)

    print(f"🔍 ENHANCED HTML PARSING:")

    # Method 1: Find individual workshop containers whose class attribute
    # hints at a listing item (item/card/product/workshop/class).
    potential_containers = soup.find_all(['div', 'section', 'article'],
                                         attrs={'class': re.compile(r'(item|card|product|workshop|class)', re.I)})

    print(f"   Found {len(potential_containers)} potential workshop containers")

    for container in potential_containers:
        workshop_text = container.get_text(strip=True)

        # Skip tiny fragments and texts we've already processed.
        if len(workshop_text) < 30 or workshop_text in workshop_texts:
            continue

        # Only attempt extraction when the text mentions workshop-ish terms.
        if any(keyword in workshop_text.lower() for keyword in ['with', 'casting', 'director', 'agent', 'perfect submission', 'crush the callback', 'get scene']):
            workshop = extract_single_workshop_from_text(workshop_text, source_url)
            if workshop and not is_duplicate_workshop(workshop, workshops):
                workshops.append(workshop)
                workshop_texts.add(workshop_text)

    # Method 2: Pattern-based extraction from full text
    all_text = soup.get_text()

    workshop_patterns = [
        # Pattern 1: "Workshop Title with Professional Title Name on Date @ Time"
        r'((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+with\s+((?:Casting\s+Director|DDO\s+Agent|Manager|Director|Producer|Agent|Acting\s+Coach|Talent\s+Agent|Executive\s+Casting\s+Producer)\s+[A-Za-z\s]+?)\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern 2: "Professional Title Name, Workshop Title on Date @ Time"
        r'((?:Atlanta\s+Models\s+&\s+Talent\s+President|Talent\s+Agent|Casting\s+Director|Manager|Director|Producer|Agent)\s+[A-Za-z\s]+?),\s+((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern 3: "Casting Director Name, Date @ Time"
        r'(Casting\s+Director)\s+([A-Za-z\s\-]+?),\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*(?:at\s+)?([0-9:]+\s*(?:AM|PM))?',
    ]

    for i, pattern in enumerate(workshop_patterns):
        # re.findall yields tuples of groups; parse_refined_workshop_match
        # expects the 1-based pattern number, hence i+1.
        matches = re.findall(pattern, all_text, re.IGNORECASE)
        for match in matches:
            workshop = parse_refined_workshop_match(match, i+1, source_url)
            if workshop and not is_duplicate_workshop(workshop, workshops):
                workshops.append(workshop)

    print(f"🎯 TOTAL UNIQUE WORKSHOPS FOUND: {len(workshops)}")
    return workshops
|
| 120 |
+
def extract_single_workshop_from_text(text: str, source_url: str) -> Dict[str, str]:
    """Extract workshop info from a single text block.

    Strips pricing/status noise, then tries a sequence of increasingly
    permissive regex patterns (A through G). Returns a workshop dict from
    the first pattern that matches, or None when nothing matches. The
    pattern's position in the list is forwarded to parse_pattern_match,
    which selects the matching group layout from it.
    """

    # Clean up the text: drop prices and status badges, collapse whitespace.
    text = re.sub(r'\$[0-9,]+\.00', '', text)
    text = re.sub(r'Featured|Sold Out', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\s+', ' ', text).strip()
    # NOTE(review): this is a no-op after the \s+ collapse above (newlines
    # are already gone); kept for safety.
    text = re.sub(r'\n+', ' ', text)

    patterns = [
        # Pattern A: "Title with Professional Name on Date @ Time"
        r'((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+with\s+((?:Casting\s+Director|CD|DDO\s+Agent|Manager|Director|Producer|Agent|Acting\s+Coach|Talent\s+Agent|Executive\s+Casting\s+Producer|Atlanta\s+Models\s+&\s+Talent\s+President)\s+[A-Za-z\s\-]+?)\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern B: "Professional Name, Title on Date @ Time"
        r'((?:Atlanta\s+Models\s+&\s+Talent\s+President|Talent\s+Agent|Casting\s+Director|Casting\s+Associate|Manager|Director|Producer|Agent|Executive\s+Casting\s+Producer)\s+[A-Za-z\s\-]+?),\s+((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern C: "Casting Director Name, Date at Time"
        r'(Casting\s+Director|Casting\s+Associate)\s+([A-Za-z\s\-]+?),\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*(?:at\s+)?([0-9:]+\s*(?:AM|PM))?',

        # Pattern D: "Company Executive Producer Name on Date"
        r"([A-Za-z']+\s+(?:Executive\s+Casting\s+Producer|Studios\s+Casting\s+Associate))\s+([A-Za-z\s]+?)\s+(?:on\s+)?(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?",

        # Pattern E: "Company Agent Name Date" (fixed "on" issue)
        r'([A-Za-z\s]+)\s+(Agent|Talent)\s+([A-Za-z\s]+?)\s+(?:on\s+)?(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern F: "Company, Person, Title on Date"
        r'([A-Za-z\s]+\s+Talent),\s+([A-Za-z\s\.]+?),\s+((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern G: Flexible fallback
        r'^([A-Za-z\s&\']{3,25}(?:Director|Agent|Manager|Producer|President|Coach))\s+([A-Za-z\s\-]{3,30}?)\s+(?:on\s+)?(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?$'
    ]

    # First match wins: pattern order encodes specificity (A is strictest,
    # G is the catch-all).
    for i, pattern in enumerate(patterns):
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return parse_pattern_match(match, i, source_url)

    return None
|
| 159 |
+
def parse_pattern_match(match, pattern_index: int, source_url: str) -> Dict[str, str]:
    """Parse a regex match or tuple based on pattern type.

    `match` may be an re.Match (from re.search) or a plain tuple of groups
    (from re.findall). `pattern_index` is the 0-based index of the regex
    that produced the match and selects the group layout below. Returns a
    workshop dict, or None when the match can't yield a usable record
    (no instructor name / date, or an over-greedy fallback capture).
    """
    # Use a helper to get group content whether it's a match object or tuple
    def get_grp(m, idx):
        val = ""
        if hasattr(m, 'group'):
            try:
                val = m.group(idx)
            except IndexError:
                val = ""
        # If it's a tuple (from findall), idx is 1-based in standard regex terminology
        # but 0-indexed in the tuple.
        elif isinstance(m, (tuple, list)):
            if 0 <= idx-1 < len(m):
                val = m[idx-1]

        # Optional groups come back as None; normalize to "".
        return val if val is not None else ""

    # Initialize variables
    workshop_title = ""
    instructor_title = ""
    instructor_name = ""
    date_str = ""
    time_str = ""

    try:
        if pattern_index == 0: # Pattern A/1: "Title with Professional Name on Date @ Time"
            workshop_title = get_grp(match, 1).strip()
            professional_full = get_grp(match, 2).strip()
            date_str = get_grp(match, 3).strip()
            time_str = get_grp(match, 4).strip()

            # Expand the "CD" abbreviation before splitting title/name.
            if professional_full.startswith('CD '):
                professional_full = 'Casting Director ' + professional_full[3:]

            instructor_title, instructor_name = parse_professional_info(professional_full)

        elif pattern_index == 1: # Pattern B/2: "Professional Name, Title on Date @ Time"
            professional_full = get_grp(match, 1).strip()
            workshop_title = get_grp(match, 2).strip()
            date_str = get_grp(match, 3).strip()
            time_str = get_grp(match, 4).strip()

            instructor_title, instructor_name = parse_professional_info(professional_full)

        elif pattern_index == 2: # Pattern C/3: "Casting Director Name, Date at Time"
            instructor_title = get_grp(match, 1).strip()
            instructor_name = get_grp(match, 2).strip()
            date_str = get_grp(match, 3).strip()
            time_str = get_grp(match, 4).strip()
            # This pattern carries no explicit workshop title.
            workshop_title = "Casting Workshop"

        elif pattern_index == 3: # Pattern D: "Company Executive Producer Name on Date"
            instructor_title = get_grp(match, 1).strip()
            instructor_name = get_grp(match, 2).strip()
            date_str = get_grp(match, 3).strip()
            time_str = get_grp(match, 4).strip()
            workshop_title = "Industry Workshop"

        elif pattern_index == 4: # Pattern E: "Company Agent Name Date"
            company_name = get_grp(match, 1).strip()
            agent_type = get_grp(match, 2).strip()
            instructor_name = get_grp(match, 3).strip()
            date_str = get_grp(match, 4).strip()
            time_str = get_grp(match, 5).strip()

            instructor_title = f"{company_name} {agent_type}"
            workshop_title = "Industry Workshop"

        elif pattern_index == 5: # Pattern F: "Company, Person, Title on Date"
            company_name = get_grp(match, 1).strip()
            instructor_name = get_grp(match, 2).strip()
            workshop_title = get_grp(match, 3).strip()
            date_str = get_grp(match, 4).strip()
            time_str = get_grp(match, 5).strip()

            instructor_title = company_name

        else: # Pattern G: flexible fallback
            professional_full = get_grp(match, 1).strip() + " " + get_grp(match, 2).strip()
            date_str = get_grp(match, 3).strip()
            time_str = get_grp(match, 4).strip()
            workshop_title = "Industry Workshop"

            # Reject obviously over-greedy fallback captures.
            if len(professional_full) > 50 or '\n' in professional_full:
                return None

            instructor_title, instructor_name = parse_professional_info(professional_full)

        # A record is only usable with at least an instructor and a date.
        if instructor_name and date_str:
            # Create full_text for embedding (required by existing Flask API)
            full_text = f"{workshop_title} with {instructor_title} {instructor_name}"
            if date_str:
                full_text += f" on {date_str}"
            if time_str:
                full_text += f" at {clean_time(time_str)}"

            return {
                'title': workshop_title,
                'instructor_name': instructor_name,
                'instructor_title': instructor_title,
                'date': date_str,
                'time': clean_time(time_str),
                'full_text': full_text, # Required for existing embedding system
                'source_url': source_url
            }

    except Exception as e:
        # Best-effort parser: a malformed match is reported, not fatal.
        print(f"Error parsing pattern match: {e}")

    return None
|
| 271 |
+
def parse_professional_info(professional_full: str) -> tuple:
    """Split a professional's description into a (title, name) pair.

    Handles "Title Name" and "Name, Title" orderings for known multi-word
    titles, then single-word titles with optional qualifiers, and finally
    falls back to treating the first word as the title.
    """
    normalized = re.sub(r'\s+', ' ', professional_full).strip()

    # Known multi-word titles, checked in priority order.
    known_titles = (
        'Atlanta Models & Talent President',
        'Executive Casting Producer',
        'Casting Director',
        'Casting Associate',
        'DDO Agent',
        'Talent Agent',
        'Acting Coach',
    )

    for known in known_titles:
        pos = normalized.find(known)
        if pos < 0:
            continue
        if pos == 0:
            # "Title Name" form: name follows the title.
            return known, normalized[len(known):].strip()
        # "Name, Title" form: name precedes the title.
        return known, normalized[:pos].strip().rstrip(',')

    # Fall back to single-word titles, possibly prefixed by a qualifier word.
    generic_titles = {'Manager', 'Director', 'Producer', 'Agent', 'Coach', 'President'}
    qualifiers = ('Casting', 'Talent', 'Executive', 'DDO', 'Acting')

    tokens = normalized.split()
    for idx, token in enumerate(tokens):
        if token not in generic_titles:
            continue
        if idx > 0 and tokens[idx - 1] in qualifiers:
            found_title = f"{tokens[idx - 1]} {token}"
            remainder = tokens[:idx - 1] + tokens[idx + 1:]
        else:
            found_title = token
            remainder = tokens[:idx] + tokens[idx + 1:]
        return found_title, ' '.join(remainder).strip()

    # Last resort: first word is the title, the rest is the name.
    if len(tokens) >= 2:
        return tokens[0], ' '.join(tokens[1:])

    return '', normalized
|
| 320 |
+
def parse_refined_workshop_match(match, pattern_num: int, source_url: str) -> Dict[str, str]:
    """Adapt a 1-based pattern number to parse_pattern_match's 0-based index."""
    zero_based_index = pattern_num - 1
    return parse_pattern_match(match, zero_based_index, source_url)
|
| 324 |
+
def is_duplicate_workshop(new_workshop: Dict, existing_workshops: List[Dict]) -> bool:
    """Return True when new_workshop matches an already-collected entry.

    Two workshops count as duplicates when instructor and date match
    (case-insensitively) and their titles are equal, contain each other,
    or both contain the generic word 'workshop'.
    """
    candidate_name = new_workshop.get('instructor_name', '').strip().lower()
    candidate_date = new_workshop.get('date', '').strip().lower()
    candidate_title = new_workshop.get('title', '').strip().lower()

    for known in existing_workshops:
        same_instructor = known.get('instructor_name', '').strip().lower() == candidate_name
        same_date = known.get('date', '').strip().lower() == candidate_date
        if not (same_instructor and same_date):
            continue

        known_title = known.get('title', '').strip().lower()
        titles_related = (
            known_title == candidate_title
            or ('workshop' in known_title and 'workshop' in candidate_title)
            or known_title in candidate_title
            or candidate_title in known_title
        )
        if titles_related:
            return True

    return False
|
| 339 |
+
def calculate_workshop_confidence(w: Dict) -> float:
    """Score how complete a scraped workshop record is, from 0.0 to 1.0.

    Each populated field contributes a fixed weight; title and instructor
    matter most, time and source URL least.
    """
    field_weights = (
        ('title', 0.3),
        ('instructor_name', 0.3),
        ('date', 0.2),
        ('time', 0.1),
        ('source_url', 0.1),
    )
    total = sum(weight for field, weight in field_weights if w.get(field))
    return round(total, 2)
|