saim1309 committed on
Commit
e7f736a
·
verified ·
1 Parent(s): 407925d

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +802 -0
  2. scraper.py +347 -0
app.py ADDED
@@ -0,0 +1,802 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import openai
3
+ import json
4
+ from datetime import datetime, timedelta
5
+ import uuid
6
+ from typing import Dict
7
+
8
+ from config import OPENAI_API_KEY, DB_PATH, EMBED_MODEL
9
+ from utils import get_embedding, cosine_similarity, find_top_k_matches
10
+ from scraper import scrape_workshops_from_squarespace
11
+ from database import (
12
+ fetch_all_embeddings,
13
+ fetch_row_by_id,
14
+ fetch_all_faq_embeddings,
15
+ get_session_state,
16
+ update_session_state,
17
+ log_question
18
+ )
19
+
20
+ # ============================================================================
21
+ # CONFIGURATION
22
+ # ============================================================================
23
+
24
# Fail fast at import time: every request path below needs the OpenAI key.
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY not found in .env file")
openai.api_key = OPENAI_API_KEY

# Conversation identifier read by chat_with_bot and rebound by reset_session.
# NOTE(review): this is a single module-level value, so it looks like every
# visitor served by this process shares one session - confirm that is intended.
session_id = str(uuid.uuid4())

# In-memory cache of scraped workshops and their embeddings.
# 'last_updated' stays None until the first successful refresh;
# 'cache_duration' is how long a refresh is considered fresh.
workshop_cache = dict(
    data=[],
    embeddings=[],
    last_updated=None,
    cache_duration=timedelta(hours=24),
)
40
+
41
+ # ============================================================================
42
+ # KEYWORD LISTS FOR ROUTING
43
+ # ============================================================================
44
+
45
# Phrases suggesting the user wants emotional support rather than a plan.
# All routing lists below are matched against the lower-cased question.
EMOTIONAL_KEYWORDS = [
    "stuck", "frustrated", "discouraged", "overwhelmed", "scared",
    "nervous", "anxious", "worried", "fear", "doubt", "confidence",
    "insecure", "lost", "confused", "struggling", "hard time",
    "giving up", "burnout", "rejection", "failed", "can't",
    "feeling", "feel", "emotional", "depressed", "sad", "unmotivated",
    "hopeless", "stressed", "pressure", "imposter",
]

# Phrases suggesting the user wants concrete, results-oriented next steps.
ACTION_KEYWORDS = [
    # representation
    "get an agent", "find agent", "need agent", "want agent", "sign with agent",
    # auditions and bookings
    "more auditions", "book", "booking", "callbacks", "improve",
    # materials
    "better", "self-tape", "materials", "headshots", "reel",
    # career building
    "network", "connections", "industry", "career", "strategy",
    # training
    "agent prep", "total agent prep", "workshop", "class", "training",
    # outcomes
    "results", "success", "grow", "advance", "level up",
]

# Topics that are never answered automatically; the user is sent to email.
POLICY_KEYWORDS = [
    "refund", "refunds", "money back",
    "attend", "attendance", "miss", "missed", "missing", "absent",
    "late", "lateness", "tardy",
    "reschedule", "change date", "move class",
    "credit", "credits",
    "cancel", "cancellation", "canceling",
    "policy", "policies",
]

# Phrases signalling that the user asked for a longer, more detailed answer.
DETAIL_SYNONYMS = [
    "detail", "details", "explain", "elaborate", "tell me more",
    "more info", "describe", "thorough", "comprehensive",
]

# Persona preamble placed at the top of every system prompt.
PERSONA_INSTRUCTION = """
You are a warm, encouraging mentor at Get Scene Studios. Your goal is to help actors navigate their careers with confidence.
- Sound natural and human, not scripted or robotic. Use conversational transitions like "I'd suggest starting with..." or "A great way to approach this is..."
- Be encouraging but practical. Acknowledge that the acting journey is a marathon, not a sprint.
- Help the user THINK: Instead of just giving an answer, add a brief "mentorship flourish" that explains the value of a recommendation (e.g., "This workshop is great because it gets you comfortable with the pressure of a real callback.")
"""
84
+
85
+ # ============================================================================
86
+ # HELPER FUNCTIONS
87
+ # ============================================================================
88
+
89
def calculate_workshop_confidence(w: Dict) -> float:
    """Score how complete a scraped workshop record is, from 0.0 to 1.0.

    Each populated (truthy) field contributes a fixed weight; the sum is
    rounded to two decimals.
    """
    field_weights = (
        ('title', 0.3),
        ('instructor_name', 0.3),
        ('date', 0.2),
        ('time', 0.1),
        ('source_url', 0.1),
    )
    total = 0.0
    for field, weight in field_weights:
        if w.get(field):
            total += weight
    return round(total, 2)
98
+
99
+ # ============================================================================
100
+ # WORKSHOP FUNCTIONS
101
+ # ============================================================================
102
+
103
def get_current_workshops():
    """Return ``(workshops, embeddings)``, refreshing the module cache when stale.

    The cache is reused while it holds data and is younger than its
    'cache_duration'. Otherwise both studio pages are scraped, records scoring
    below 0.8 confidence are dropped, and the survivors are embedded and
    cached. If nothing valid is scraped, previously cached data is returned
    when available, else two empty lists.
    """
    checked_at = datetime.now()

    last_refresh = workshop_cache['last_updated']
    if last_refresh and workshop_cache['data']:
        if checked_at - last_refresh < workshop_cache['cache_duration']:
            print("Using cached workshop data")
            return workshop_cache['data'], workshop_cache['embeddings']

    print("Fetching fresh workshop data...")

    # Scrape both delivery formats and treat them as one candidate pool.
    online_records = scrape_workshops_from_squarespace("https://www.getscenestudios.com/online")
    studio_records = scrape_workshops_from_squarespace("https://www.getscenestudios.com/instudio")
    scraped = online_records + studio_records

    # Data integrity: keep only records complete enough to recommend.
    accepted = []
    confidence_sum = 0
    for record in scraped:
        confidence = calculate_workshop_confidence(record)
        if confidence < 0.8:
            print(f"⚠️ Rejecting weak record (Confidence: {confidence}): {record.get('title', 'Unknown')}", flush=True)
            continue
        accepted.append(record)
        confidence_sum += confidence

    average_confidence = confidence_sum / len(accepted) if accepted else 0
    print(f"📊 DATA INTEGRITY: Found {len(scraped)} total, {len(accepted)} valid (Confidence >= 0.8)", flush=True)
    print(f"📈 Retrieval Confidence: {average_confidence:.2f} (Average)", flush=True)

    if not accepted:
        if workshop_cache['data']:
            # Stale data beats no data when the scrape yields nothing usable.
            print("Scraping failed, using cached data")
            return workshop_cache['data'], workshop_cache['embeddings']
        print("No workshop data available")
        return [], []

    # One embedding per accepted workshop, index-aligned with `accepted`.
    vectors = []
    for record in accepted:
        try:
            vectors.append(get_embedding(record['full_text']))
        except Exception as e:
            print(f"Error generating embedding for workshop: {e}")
            # Placeholder keeps the two lists the same length.
            vectors.append([0] * 1536)

    workshop_cache['data'] = accepted
    workshop_cache['embeddings'] = vectors
    workshop_cache['last_updated'] = checked_at

    print(f"Cached {len(accepted)} workshops")
    return accepted, vectors
166
+
167
def find_top_workshops(user_embedding, k=3):
    """Rank current workshops by similarity to ``user_embedding``.

    Returns up to ``k`` tuples ``(score, index, full_text, workshop)`` ordered
    best-first, or an empty list when no workshops are available. Workshops
    whose similarity computation raises are reported and left out.
    """
    workshops, workshop_embeddings = get_current_workshops()
    if not workshops:
        return []

    ranked = []
    for position, (entry, vector) in enumerate(zip(workshops, workshop_embeddings)):
        try:
            similarity = cosine_similarity(user_embedding, vector)
            ranked.append((similarity, position, entry['full_text'], entry))
        except Exception as e:
            print(f"Error calculating similarity: {e}")

    # Tuples order by score first; the unique index settles ties.
    return sorted(ranked, reverse=True)[:k]
185
+
186
+ # ============================================================================
187
+ # PROMPT BUILDING FUNCTIONS
188
+ # ============================================================================
189
+
190
def generate_enriched_links(row):
    """Build a one-item list holding a markdown podcast link for ``row``.

    Args:
        row: Mapping with ``youtube_url``, ``guest_name`` and
            ``highlight_json`` (a JSON array whose first item may carry a
            ``summary``). May be ``None``.

    Returns:
        ``[markdown]`` with the episode link and a short blurb (the first
        sentence of the summary, capped at 120 characters), or ``[]`` when the
        row is missing or has no URL, so callers can use their own fallback
        instead of emitting a dead link.
    """
    if not row:
        return []

    base_url = row.get("youtube_url")
    if not base_url:
        return []

    guest_name = row.get("guest_name") or ""

    # highlight_json can be NULL or malformed; treat both as "no highlights".
    try:
        highlights = json.loads(row.get("highlight_json") or "[]")
    except (TypeError, ValueError):
        highlights = []

    summary = ""
    if isinstance(highlights, list) and highlights and isinstance(highlights[0], dict):
        summary = str(highlights[0].get("summary") or "")

    if summary:
        # Keep only the first sentence so the blurb stays short.
        first_sentence = summary.split('.')[0] + '.'
        if len(first_sentence) > 120:
            short_summary = first_sentence[:117] + "..."
        else:
            short_summary = first_sentence
    else:
        short_summary = "Industry insights for actors"

    link_text = f"Watch {guest_name}'s episode here" if guest_name else "Watch this episode here"
    markdown = f"🎧 [{link_text}]({base_url}) - {short_summary}"
    return [markdown]
209
+
210
def build_enhanced_prompt(user_question, context_results, top_workshops, user_preference=None, enriched_podcast_links=None, wants_details=False, current_topic=None):
    """Build the system prompt with strict formatting rules.

    Args:
        user_question: Raw text of the user's question.
        context_results: Unused; kept so existing callers keep working.
        top_workshops: Ranked tuples whose fourth item is a workshop dict.
            Only the first 10 are inspected and at most 3 are listed.
        user_preference: Stored delivery preference (e.g. 'online' or
            'instudio'), or None when unknown.
        enriched_podcast_links: Markdown podcast links; only the first is used.
        wants_details: When true, the brevity instruction is relaxed.
        current_topic: Session topic used when the question has no topic
            keyword of its own.

    Returns:
        The prompt string. Emotional questions get a short support-mode
        prompt; all others get the three-recommendation prompt.
    """
    # Free classes are ONLY available online (never in-studio).
    free_class_url = "https://www.getscenestudios.com/online"

    def format_key(label):
        """Reduce a format label to a comparable key ('In-Studio' -> 'instudio')."""
        return (label or "").lower().replace("-", "").replace("_", "").replace(" ", "")

    preferred_format = format_key(user_preference)

    def format_workshop(w):
        """Return a markdown bullet for workshop ``w``, or None if it must be skipped."""
        if not w.get('title') or not w.get('instructor_name') or not w.get('date'):
            return None

        source_url = w.get('source_url') or ''
        link = "https://www.getscenestudios.com/instudio" if "/instudio" in source_url else "https://www.getscenestudios.com/online"

        # Preference filter. Labels are compared in normalized form: the
        # stored preference is 'instudio' while the display label is
        # 'In-Studio', and a plain lower-case comparison never matched them,
        # which hid every in-studio workshop from in-studio users.
        w_type = "Online" if "online" in source_url else "In-Studio"
        if preferred_format and preferred_format != format_key(w_type):
            return None

        if calculate_workshop_confidence(w) < 0.70:
            return None

        # R2: force the format into the link title for robustness.
        display_title = f"{w['title']} ({w_type})"
        return f"- [{display_title}]({link}) with {w['instructor_name']} on {w['date']} at {w.get('time', '')}"

    # Inspect the top 10 candidates so filtering still leaves up to 3 to show.
    workshop_lines = []
    for _, _, _, w_data in (top_workshops or [])[:10]:
        formatted = format_workshop(w_data)
        if formatted:
            workshop_lines.append(formatted)

    if workshop_lines:
        workshop_text = "\n".join(workshop_lines[:3])
    else:
        # Nothing to list: point at the schedule page for the preferred format.
        label = f"{user_preference.capitalize()} " if user_preference else ""
        if preferred_format == 'instudio':
            link = "https://www.getscenestudios.com/instudio"
        else:
            link = "https://www.getscenestudios.com/online"
        workshop_text = f"We are constantly updating our schedule! Check our current {label}availability and latest workshops at {link}"

    # Missing podcast data falls back to the channel link.
    if not enriched_podcast_links:
        single_podcast = "Our latest industry insights are available on YouTube: https://www.youtube.com/@GetSceneStudios"
    else:
        single_podcast = enriched_podcast_links[0]

    # --- EMOTIONAL / SUPPORT MODE ---
    if detect_response_type(user_question) == "support":
        return f"""{PERSONA_INSTRUCTION}

You are acting in SUPPORT MODE.

CRITICAL INSTRUCTIONS:
1. ACKNOWLEDGE their feelings first (e.g., "I hear how frustrating it is to feel stuck...").
2. Provide SUPPORTIVE language (2-3 sentences max).
3. Offer EXACTLY ONE gentle follow-up resource: either the podcast OR the free class.
4. DO NOT suggest paid workshops or upsell in this response.
5. KEEP IT BRIEF (≤150 words).

USER'S QUESTION: {user_question}

REQUIRED RESPONSE FORMAT:
[Your empathetic, supportive acknowledgment]

Here's a free resource that might help you move forward:
[Pick ONE: {single_podcast} OR Free Class at {free_class_url}]

Questions? Contact info@getscenestudios.com"""

    # --- STANDARD MODE: choose a context snippet ---
    question_lower = user_question.lower()

    # Priority 1: keywords in the current question; the first matching row wins.
    keyword_topics = (
        ('agent', ('agent', 'representation', 'rep', 'manager')),
        ('beginner', ('beginner', 'new', 'start', 'beginning')),
        ('audition', ('callback', 'audition', 'tape', 'self-tape', 'booking')),
        ('mentorship', ('mentorship', 'coaching')),
        ('pricing', ('price', 'cost', 'how much')),
    )
    detected_topic = next(
        (topic for topic, words in keyword_topics if any(word in question_lower for word in words)),
        None,
    )

    # Priority 2: fall back to the session topic when the question is ambiguous.
    if not detected_topic and current_topic:
        topic_map = {
            'agent_seeking': 'agent',
            'beginner': 'beginner',
            'audition_help': 'audition',
            'mentorship': 'mentorship',
            'pricing': 'pricing',
        }
        detected_topic = topic_map.get(current_topic)

    snippets = {
        'agent': "Get Scene Studios has helped 1000+ actors land representation. Total Agent Prep offers live practice with working agents (age 16+, limited to 12 actors).",
        'beginner': "Get Scene Studios specializes in getting actors audition-ready fast with camera technique and professional self-tape skills.",
        'audition': "Get Scene offers Crush the Callback (Zoom simulation) and Perfect Submission (self-tape mastery) for actors refining their technique.",
        'mentorship': "Working Actor Mentorship is a 6-month program ($3,000) with structured feedback and industry access.",
        'pricing': "Get Scene Studios pricing varies by program. Most workshops cap at 12-14 actors for personalized feedback.",
    }
    context_snippet = snippets.get(
        detected_topic,
        "Get Scene Studios (founded by Jesse Malinowski) offers training for TV/film actors at all levels.",
    )

    if not user_preference:
        preference_instruction = """
IMPORTANT: We need to know if the user prefers "Online" or "In-Studio" workshops.
If their question implies a location or they haven't specified, ask: "Are you looking for Online or In-Studio training?" as part of your response.
"""
    else:
        preference_instruction = f"""
USER PREFERENCE KNOWN: {user_preference.upper()}
1. DO NOT ask "Online or In-Studio" again.
2. Ensure your recommendations align with {user_preference.upper()} where possible.
"""

    # Brevity and cognitive load: the instruction depends on user intent.
    if wants_details:
        detail_instruction = "Provide a detailed and thorough explanation for the user's request, but keep it structured and readable."
    else:
        detail_instruction = "Answer the user's question briefly (2-3 sentences max, ≤150 words total)."

    prompt = f"""{PERSONA_INSTRUCTION}

{context_snippet}

CRITICAL INSTRUCTIONS:
- {detail_instruction}
- Use natural, human transitions between your answer and the recommendations.
- For each recommendation, add a tiny bit of "mentor advice" on why it helps.
- Then ALWAYS provide exactly these three numbered recommendations (1. 2. 3.):
- Use ONLY the provided links - do not invent recommendations
- Every workshop Title MUST be followed by its format in parentheses, e.g., "Workshop Name (Online)" or "Workshop Name (In-Studio)".
- Focus on clean, readable formatting.{preference_instruction}

USER'S QUESTION: {user_question}

REQUIRED RESPONSE FORMAT:
[Your brief answer to their question, ≤150 words total]

Here's your path forward:
1. Free class (start here, no credit card required): {free_class_url}
2. Recommended podcast episode:
{single_podcast}
3. Relevant paid workshop:
{workshop_text}

Questions? Contact info@getscenestudios.com"""

    return prompt
376
+
377
+ # ============================================================================
378
+ # DETECTION FUNCTIONS
379
+ # ============================================================================
380
+
381
def detect_question_category(question):
    """Return every category whose keywords occur in ``question``.

    Matching is a case-insensitive substring test. The result lists matching
    category names in the fixed order of the table below and is empty when
    nothing matches.
    """
    text = question.lower()

    keyword_table = (
        ('agent_seeking', ('agent', 'representation', 'rep', 'manager', 'get an agent')),
        ('beginner', ('beginner', 'new', 'start', 'beginning', 'first time', 'never acted')),
        ('audition_help', ('audition', 'callback', 'tape', 'self-tape', 'submission')),
        ('mentorship', ('mentorship', 'coaching', 'intensive', 'mentor', 'one-on-one')),
        ('pricing', ('price', 'cost', 'pricing', '$', 'money', 'payment', 'fee')),
        ('classes', ('class', 'workshop', 'training', 'course', 'learn')),
        ('membership', ('membership', 'join', 'member', 'gsp', 'plus')),
        ('technical', ('self-tape', 'equipment', 'lighting', 'editing', 'camera')),
    )

    return [
        label
        for label, terms in keyword_table
        if any(term in text for term in terms)
    ]
402
+
403
def detect_response_type(question):
    """Classify ``question`` as "support" (emotional) or "standard".

    Counts how many emotional and how many action keywords occur as
    substrings of the lower-cased question. The result is "support" only when
    at least one emotional keyword is present and emotional hits are not
    outnumbered by action hits.
    """
    text = question.lower()

    emotional_hits = len([kw for kw in EMOTIONAL_KEYWORDS if kw in text])
    action_hits = len([kw for kw in ACTION_KEYWORDS if kw in text])

    if emotional_hits == 0 or emotional_hits < action_hits:
        return "standard"
    return "support"
413
+
414
def _policy_keyword_pattern(keywords):
    """Compile one regex matching any keyword as a whole word or simple inflection."""
    import re

    # Inflections accepted after a keyword: refunds, missed, attending, cancelled...
    suffixes = r"(?:s|es|d|ed|ing|led|ling)?"
    alternatives = []
    for keyword in keywords:
        keyword = keyword.lower()
        alternatives.append(re.escape(keyword) + suffixes)
        if keyword.endswith("e"):
            # reschedule -> rescheduling (the trailing 'e' is dropped)
            alternatives.append(re.escape(keyword[:-1]) + "ing")
    # Lookarounds anchor the match to word edges, so 'miss' no longer fires
    # inside 'submission' and 'late' no longer fires inside 'related' or 'latest'.
    return re.compile(r"(?<!\w)(?:" + "|".join(alternatives) + r")(?!\w)")


def detect_policy_issue(question, keywords=None):
    """Return True when ``question`` mentions a hard-policy topic.

    Args:
        question: Text to inspect; matching is case-insensitive.
        keywords: Optional iterable of keyword phrases. Defaults to the
            module-level ``POLICY_KEYWORDS``.

    Returns:
        True if any keyword occurs as a whole word or phrase, optionally with
        a simple inflection suffix; False otherwise or when there are no
        keywords.
    """
    if keywords is None:
        keywords = POLICY_KEYWORDS
    keywords = list(keywords)
    if not keywords:
        return False
    return bool(_policy_keyword_pattern(keywords).search(question.lower()))
418
+
419
def detect_preference(question):
    """Infer a delivery-format preference from ``question``.

    Returns 'online' when the text mentions "online" but not "studio",
    'instudio' when it mentions "studio", "person" or "atlanta" but not
    "online", and None otherwise (including when both are mentioned).
    """
    text = question.lower()
    mentions_online = 'online' in text
    mentions_studio = 'studio' in text

    if mentions_online and not mentions_studio:
        return 'online'

    mentions_in_person = mentions_studio or 'person' in text or 'atlanta' in text
    if mentions_in_person and not mentions_online:
        return 'instudio'

    return None
427
+
428
def get_contextual_business_info(categories):
    """Map detected question categories to their business context.

    Returns a dict keyed by each category in ``categories`` that has an entry
    below, in the order given. Each value holds 'programs', 'key_info' and
    'journey'. Categories without an entry are ignored.
    """
    context_map = {
        'agent_seeking': {
            'programs': ['Total Agent Prep', 'Working Actor Mentorship'],
            'key_info': 'Live pitch practice with real agents, Actors Access optimization',
            'journey': 'Total Agent Prep → GSP → Mentorship for sustained progress',
        },
        'beginner': {
            'programs': ['Free Classes', 'Get Scene 360', 'Get Scene Plus'],
            'key_info': 'Start with holistic foundation, build consistency',
            'journey': 'Free class → Get Scene 360 → GSP membership',
        },
        'audition_help': {
            'programs': ['Perfect Submission', 'Crush the Callback', 'Audition Insight'],
            'key_info': 'Self-tape mastery, callback simulation, pro feedback',
            'journey': 'Perfect Submission → GSP for ongoing Audition Insight',
        },
        'mentorship': {
            'programs': ['Working Actor Mentorship'],
            'key_info': '6-month intensive with structured feedback and accountability',
            'journey': 'Ready for commitment → WAM → Advanced workshops',
        },
    }

    return {
        name: context_map[name]
        for name in categories
        if name in context_map
    }
460
+
461
+ # ============================================================================
462
+ # MAIN CHATBOT LOGIC
463
+ # ============================================================================
464
+
465
def update_knowledge_from_question(session_id: str, question: str):
    """Extract format and topic from ``question`` and store them on the session.

    Returns the dict of updates that was saved ('format' and/or 'topic'), or
    an empty dict when nothing was detected. The session is only written when
    there is something to save.
    """
    updates = {}

    # Delivery format ('online' / 'instudio'), when stated.
    detected_format = detect_preference(question)
    if detected_format:
        updates['format'] = detected_format

    # Topic: prefer a specific category, otherwise take the first detected one.
    categories = detect_question_category(question)
    if categories:
        preferred = ('agent_seeking', 'beginner', 'audition_help', 'mentorship', 'pricing')
        updates['topic'] = next(
            (topic for topic in preferred if topic in categories),
            categories[0],
        )

    if not updates:
        return {}

    update_session_state(session_id, knowledge_update=updates, increment_count=False)
    return updates
490
+
491
def _journey_suggestions(question, lead_in):
    """Return one '<lead_in><journey>' line per category of ``question`` that has business context."""
    contextual_info = get_contextual_business_info(detect_question_category(question))
    return [f"{lead_in}{info['journey']}" for info in contextual_info.values()]


def process_question(question: str, current_session_id: str):
    """Answer one user question, choosing between policy, FAQ and LLM paths.

    Routing, in order:
      0. Policy topics get a fixed "email us" reply.
      1. An FAQ match scoring >= 0.85 returns the stored answer with mentor framing.
      2. An FAQ match scoring >= 0.70 asks one clarifying question when
         context is missing, then returns the stored answer.
      3. Anything else that looks acting-related goes to the LLM with a
         prompt built from workshops and podcasts; unrelated questions get a
         fixed "email us" reply.

    Args:
        question: The user's message.
        current_session_id: Key used to load and update session state.

    Returns:
        The reply text.
    """
    if not question:
        return "Question is required"

    # 0. HARD POLICY CHECK
    if detect_policy_issue(question):
        log_question(question, current_session_id)
        return "Please email info@getscenestudios.com."

    # 1. Session and knowledge state
    update_knowledge_from_question(current_session_id, question)

    # A missing session behaves like an empty one instead of crashing.
    session_state = get_session_state(current_session_id) or {}

    try:
        knowledge = json.loads(session_state.get('knowledge_context') or '{}')
    except (TypeError, ValueError):
        knowledge = {}
    if not isinstance(knowledge, dict):
        knowledge = {}

    user_preference = knowledge.get('format') or session_state.get('preference')
    current_topic = knowledge.get('topic')

    update_session_state(current_session_id, increment_count=True)

    user_embedding = get_embedding(question)

    # 2. FAQ lookup: only the single best match is needed.
    scored_faqs = [
        (cosine_similarity(user_embedding, emb), entry_id, question_text, answer_text)
        for entry_id, question_text, answer_text, emb in fetch_all_faq_embeddings()
    ]
    best_faq = max(scored_faqs, default=None)
    best_score = best_faq[0] if best_faq else None

    faq_threshold = 0.85
    ambiguous_threshold = 0.70

    if best_faq and best_score >= faq_threshold:
        # High-confidence FAQ match
        update_session_state(current_session_id, reset_clarification=True, increment_count=False)

        _, _, _, answer_text = best_faq

        mentor_framing_start = "That's a great question! Here's the information on that:"
        mentor_framing_end = "I hope that clears things up! Remember, every bit of knowledge helps you steer your career in the right direction."

        enhanced_answer = f"{mentor_framing_start}\n\n{answer_text}"

        # R5: policy guard for FAQ answers
        if detect_policy_issue(enhanced_answer):
            enhanced_answer = "Please email info@getscenestudios.com for assistance with this."
        else:
            next_steps = _journey_suggestions(question, "A great next step for you: ")
            if next_steps:
                enhanced_answer += "\n\n" + "\n".join(next_steps)
            # NOTE(review): assumed to apply only to non-policy answers - confirm.
            enhanced_answer += f"\n\n{mentor_framing_end}\n\nQuestions? Contact info@getscenestudios.com"

        log_question(question, current_session_id, answer=enhanced_answer)
        return enhanced_answer

    if best_faq and best_score >= ambiguous_threshold:
        # AMBIGUOUS ZONE: ask at most one clarifying question per streak.
        # NOTE(review): the generic-query check is assumed to sit beside the
        # preference check rather than nested inside it - confirm.
        is_generic_query = any(w in question.lower() for w in ['price', 'cost', 'how much', 'schedule', 'when'])
        needs_clarification = (not user_preference) or (is_generic_query and not current_topic)

        if (session_state.get('clarification_count') or 0) > 0:
            needs_clarification = False

        if needs_clarification:
            update_session_state(current_session_id, increment_clarification=True, increment_count=False)
            return f"Did you mean: {best_faq[2]}?"

        # Auto-resolve with the best match.
        update_session_state(current_session_id, reset_clarification=True, increment_count=False)

        enhanced_answer = best_faq[3]
        next_steps = _journey_suggestions(question, "Next step: Consider ")
        if next_steps:
            enhanced_answer += "\n\n" + "\n".join(next_steps)
        enhanced_answer += "\n\nQuestions? Contact info@getscenestudios.com"

        log_question(question, current_session_id, answer=enhanced_answer)
        return enhanced_answer

    # 3. HALLUCINATION GUARD: only acting-related questions reach the LLM.
    question_lower = question.lower()
    is_acting_related = (
        len(detect_question_category(question)) > 0 or
        detect_response_type(question) == "support" or
        any(k in question_lower for k in ACTION_KEYWORDS) or
        any(k in question_lower for k in ['acting', 'actor', 'scene', 'audition', 'theatre', 'film', 'tv', 'commercial', 'agent', 'rep', 'manager'])
    )

    if not is_acting_related:
        return "I'm not exactly sure about that. Please email info@getscenestudios.com so a member of our team can get you the most accurate answer!"

    # 4. LLM PATH
    update_session_state(current_session_id, reset_clarification=True, increment_count=False)
    podcast_data = fetch_all_embeddings("podcast_episodes")
    top_workshops = find_top_workshops(user_embedding, k=10)
    top_podcasts = find_top_k_matches(user_embedding, podcast_data, k=3)

    enriched_podcast_links = []
    for _, podcast_id, _ in top_podcasts:
        row = fetch_row_by_id("podcast_episodes", podcast_id)
        if row:
            enriched_podcast_links.extend(generate_enriched_links(row))

    # Fall back to the first stored episode, but only if one exists; with no
    # links at all the prompt builder supplies its own channel link.
    if not enriched_podcast_links and podcast_data:
        fallback = fetch_row_by_id("podcast_episodes", podcast_data[0][0])
        if fallback:
            enriched_podcast_links = generate_enriched_links(fallback)

    # 5. Brevity and detail detection
    wants_details = any(syn in question_lower for syn in DETAIL_SYNONYMS)

    final_prompt = build_enhanced_prompt(
        question,
        None,
        top_workshops,
        user_preference=user_preference,
        enriched_podcast_links=enriched_podcast_links,
        wants_details=wants_details,
        current_topic=current_topic
    )

    response = openai.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": final_prompt},
            {"role": "user", "content": question}
        ]
    )

    # The API may return no text content; treat that as an empty reply.
    reply = (response.choices[0].message.content or "").strip()

    # Log the answer too, as the FAQ paths do.
    log_question(question, current_session_id, answer=reply)

    return reply
667
+
668
+ # ============================================================================
669
+ # GRADIO INTERFACE
670
+ # ============================================================================
671
+
672
def chat_with_bot(message, history):
    """Handle one chat turn for the Gradio UI.

    Args:
        message: The user's current message. None, empty or whitespace-only
            messages are ignored.
        history: Chat history as a list of ``{"role", "content"}`` dicts, or
            None for a fresh conversation.

    Returns:
        The history list with the user message and the assistant reply
        appended (the list passed in is extended in place), or the unchanged
        history when the message is blank.
    """
    if history is None:
        history = []

    if not message or not message.strip():
        return history

    try:
        # session_id is the module-level conversation identifier.
        bot_reply = process_question(message, session_id)
    except Exception as e:
        # UI boundary: show the failure in the chat instead of breaking the app.
        bot_reply = f"❌ Error: {str(e)}"

    # Messages format: one dict per turn.
    history.append({"role": "user", "content": message})
    history.append({"role": "assistant", "content": bot_reply})
    return history
698
+
699
def reset_session():
    """Start a new conversation: rebind the module session ID and clear the chat.

    Returns an empty list, which the UI uses as the new chat history.
    """
    global session_id
    fresh_id = uuid.uuid4()
    session_id = str(fresh_id)
    empty_history = []
    return empty_history
704
+
705
+ # Create Gradio interface
706
with gr.Blocks(title="Get Scene Studios Chatbot") as demo:

    # Page header
    gr.Markdown(
        """
        # 🎬 Get Scene Studios AI Chatbot

        Ask questions about acting classes, workshops and more!
        """
    )

    # Conversation display
    chatbot = gr.Chatbot(label="Conversation", height=500)

    # Message input and send button
    with gr.Row():
        msg = gr.Textbox(label="Your Message", lines=2, scale=4)
        submit_btn = gr.Button("Send 📤", scale=1, variant="primary")

    # Conversation controls
    with gr.Row():
        clear_btn = gr.Button("Clear Chat 🗑️", scale=1)
        reset_btn = gr.Button("New Session 🔄", scale=1)

    # Clicking Send and submitting the textbox behave the same: run the bot,
    # then empty the input box.
    for register in (submit_btn.click, msg.submit):
        register(
            fn=chat_with_bot,
            inputs=[msg, chatbot],
            outputs=[chatbot],
        ).then(
            fn=lambda: "",
            inputs=None,
            outputs=[msg],
        )

    # Clear only empties the visible history; the session is kept.
    clear_btn.click(fn=lambda: [], inputs=None, outputs=[chatbot])

    # New Session also rebinds the session ID (see reset_session).
    reset_btn.click(fn=reset_session, inputs=None, outputs=[chatbot])
792
+
793
+ # Launch the app
794
if __name__ == "__main__":
    # Startup banner, then hand control to Gradio.
    rule = "=" * 60
    banner_lines = (
        "\n" + rule,
        "🎬 Get Scene Studios Chatbot",
        rule,
        "\n✅ No Flask API needed - all processing is done directly!",
        "🌐 Gradio interface will open in your browser",
        rule + "\n",
    )
    for line in banner_lines:
        print(line)

    demo.launch()
scraper.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+ import re
4
+ from bs4 import BeautifulSoup
5
+ from typing import List, Dict, Any, Tuple
6
+ from utils import clean_time
7
+
8
def scrape_workshops_from_squarespace(url: str) -> List[Dict[str, str]]:
    """
    Scrape workshop listings from a Squarespace page.

    The page's JSON representation (``format=json``) is requested first. If
    that response is not HTTP 200, cannot be decoded, or yields no workshops,
    the regular HTML page is fetched and parsed instead.

    Args:
        url: Address of the page to scrape. It may already contain a query
            string.

    Returns:
        The workshop dictionaries that were extracted, or an empty list when
        nothing was found or any error occurred. Errors are printed, never
        raised.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # First try the Squarespace JSON API. Append with '&' when the URL
        # already has a query string so the request address stays well-formed.
        separator = '&' if '?' in url else '?'
        json_url = f"{url}{separator}format=json"
        print(f"🔍 Trying Squarespace JSON API: {json_url}")

        response = requests.get(json_url, headers=headers, timeout=10)
        if response.status_code == 200:
            try:
                json_data = response.json()
            except ValueError:
                # ValueError is the common base of json.JSONDecodeError and of
                # the simplejson-backed error requests may raise, so a bad
                # body always reaches the HTML fallback below.
                print("❌ Invalid JSON response, falling back to HTML")
            else:
                workshops = extract_workshops_from_json(json_data, json_url)
                if workshops:
                    print(f"✅ Extracted {len(workshops)} workshops from JSON API")
                    return workshops
                print("❌ No workshops found in JSON, falling back to HTML")

        # Fallback to HTML scraping if JSON fails
        print(f"📄 Falling back to HTML scraping for {url}")
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        workshops = parse_workshops_from_html(soup, url)

        if workshops:
            print(f"✅ Extracted {len(workshops)} workshops from HTML parsing")
            return workshops

        print("❌ No workshops found in HTML")
        return []

    except Exception as e:
        # Best-effort scraper: report the failure and hand back nothing
        # rather than propagating network or parsing errors to the caller.
        print(f"❌ Error scraping workshops from {url}: {e}")
        return []
52
+
53
def extract_workshops_from_json(data: Any, source_url: str) -> List[Dict[str, str]]:
    """
    Pull workshops out of a decoded Squarespace JSON payload.

    Only the ``mainContent`` entry is used: when it is a string it is treated
    as HTML and handed to ``parse_workshops_from_html``. Any other payload
    shape produces an empty list.
    """
    # Guard clauses: anything that is not a dict with a string 'mainContent'
    # carries nothing this function knows how to parse.
    if not isinstance(data, dict) or 'mainContent' not in data:
        return []

    embedded_html = data['mainContent']
    if not isinstance(embedded_html, str):
        return []

    print(f"🎯 Found mainContent HTML! Length: {len(embedded_html)} characters")

    parsed_page = BeautifulSoup(embedded_html, 'html.parser')
    return parse_workshops_from_html(parsed_page, source_url)
70
+
71
def parse_workshops_from_html(soup, source_url: str) -> List[Dict[str, str]]:
    """
    Collect workshops from a parsed HTML document in two passes.

    Pass 1 inspects div/section/article elements whose class looks like a
    listing container and parses each element's text on its own. Pass 2 runs
    three listing regexes over the document's full text. A candidate is kept
    only if ``is_duplicate_workshop`` does not report it as already present.

    Args:
        soup: Parsed document; must provide ``find_all`` and ``get_text``.
        source_url: Address recorded on every extracted workshop.

    Returns:
        The unique workshop dictionaries found, in discovery order.
    """
    found = []
    accepted_texts = set()

    print(f"🔍 ENHANCED HTML PARSING:")

    # Method 1: Find individual workshop containers
    container_class = re.compile(r'(item|card|product|workshop|class)', re.I)
    candidates = soup.find_all(['div', 'section', 'article'],
                               attrs={'class': container_class})

    print(f" Found {len(candidates)} potential workshop containers")

    trigger_words = ['with', 'casting', 'director', 'agent', 'perfect submission', 'crush the callback', 'get scene']

    for node in candidates:
        block_text = node.get_text(strip=True)

        # Skip fragments too short to hold a listing, and text already used.
        if len(block_text) < 30 or block_text in accepted_texts:
            continue

        lowered = block_text.lower()
        if not any(word in lowered for word in trigger_words):
            continue

        parsed = extract_single_workshop_from_text(block_text, source_url)
        if parsed and not is_duplicate_workshop(parsed, found):
            found.append(parsed)
            accepted_texts.add(block_text)

    # Method 2: Pattern-based extraction from full text
    page_text = soup.get_text()

    workshop_patterns = [
        # Pattern 1: "Workshop Title with Professional Title Name on Date @ Time"
        r'((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+with\s+((?:Casting\s+Director|DDO\s+Agent|Manager|Director|Producer|Agent|Acting\s+Coach|Talent\s+Agent|Executive\s+Casting\s+Producer)\s+[A-Za-z\s]+?)\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern 2: "Professional Title Name, Workshop Title on Date @ Time"
        r'((?:Atlanta\s+Models\s+&\s+Talent\s+President|Talent\s+Agent|Casting\s+Director|Manager|Director|Producer|Agent)\s+[A-Za-z\s]+?),\s+((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern 3: "Casting Director Name, Date @ Time"
        r'(Casting\s+Director)\s+([A-Za-z\s\-]+?),\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*(?:at\s+)?([0-9:]+\s*(?:AM|PM))?',
    ]

    # Pattern numbers are 1-based here; parse_refined_workshop_match converts
    # them back to the 0-based index used by parse_pattern_match.
    for number, pattern in enumerate(workshop_patterns, start=1):
        for groups in re.findall(pattern, page_text, re.IGNORECASE):
            parsed = parse_refined_workshop_match(groups, number, source_url)
            if parsed and not is_duplicate_workshop(parsed, found):
                found.append(parsed)

    print(f"🎯 TOTAL UNIQUE WORKSHOPS FOUND: {len(found)}")
    return found
119
+
120
def extract_single_workshop_from_text(text: str, source_url: str) -> Dict[str, str]:
    """
    Parse one block of listing text into a workshop dictionary.

    Prices of the form ``$N.00`` and the badges "Featured" / "Sold Out" are
    removed and whitespace is collapsed. The listing patterns A-G are then
    tried in order, and the first one that matches is handed to
    ``parse_pattern_match`` together with its position in the list.

    Returns:
        Whatever ``parse_pattern_match`` returns for the first matching
        pattern, or ``None`` when no pattern matches.
    """
    # Clean up the text
    cleaned = re.sub(r'\$[0-9,]+\.00', '', text)
    cleaned = re.sub(r'Featured|Sold Out', '', cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    cleaned = re.sub(r'\n+', ' ', cleaned)

    # Order matters: the position of a pattern in this tuple is the
    # pattern_index that parse_pattern_match uses to interpret its groups.
    patterns = (
        # Pattern A: "Title with Professional Name on Date @ Time"
        r'((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+with\s+((?:Casting\s+Director|CD|DDO\s+Agent|Manager|Director|Producer|Agent|Acting\s+Coach|Talent\s+Agent|Executive\s+Casting\s+Producer|Atlanta\s+Models\s+&\s+Talent\s+President)\s+[A-Za-z\s\-]+?)\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern B: "Professional Name, Title on Date @ Time"
        r'((?:Atlanta\s+Models\s+&\s+Talent\s+President|Talent\s+Agent|Casting\s+Director|Casting\s+Associate|Manager|Director|Producer|Agent|Executive\s+Casting\s+Producer)\s+[A-Za-z\s\-]+?),\s+((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern C: "Casting Director Name, Date at Time"
        r'(Casting\s+Director|Casting\s+Associate)\s+([A-Za-z\s\-]+?),\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*(?:at\s+)?([0-9:]+\s*(?:AM|PM))?',

        # Pattern D: "Company Executive Producer Name on Date"
        r"([A-Za-z']+\s+(?:Executive\s+Casting\s+Producer|Studios\s+Casting\s+Associate))\s+([A-Za-z\s]+?)\s+(?:on\s+)?(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?",

        # Pattern E: "Company Agent Name Date" (the "on" is optional)
        r'([A-Za-z\s]+)\s+(Agent|Talent)\s+([A-Za-z\s]+?)\s+(?:on\s+)?(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern F: "Company, Person, Title on Date"
        r'([A-Za-z\s]+\s+Talent),\s+([A-Za-z\s\.]+?),\s+((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern G: Flexible fallback
        r'^([A-Za-z\s&\']{3,25}(?:Director|Agent|Manager|Producer|President|Coach))\s+([A-Za-z\s\-]{3,30}?)\s+(?:on\s+)?(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?$',
    )

    for index, pattern in enumerate(patterns):
        hit = re.search(pattern, cleaned, re.IGNORECASE)
        if hit is not None:
            return parse_pattern_match(hit, index, source_url)

    return None
158
+
159
def parse_pattern_match(match, pattern_index: int, source_url: str) -> Dict[str, str]:
    """
    Convert one regex hit into a workshop dictionary.

    Args:
        match: Either a regex match object or a tuple/list of captured groups
            (as produced by ``re.findall``).
        pattern_index: 0-based position of the pattern that produced the hit;
            it decides which capture group holds which field. Indices 0-5
            correspond to patterns A-F, anything else is treated as the
            fallback pattern G.
        source_url: Address stored in the result under ``'source_url'``.

    Returns:
        A dictionary with the keys ``title``, ``instructor_name``,
        ``instructor_title``, ``date``, ``time``, ``full_text`` and
        ``source_url``. ``None`` is returned when no instructor name or date
        was captured, when a fallback hit looks implausible, or when any
        exception occurs (the exception is printed, not raised).
    """

    def grp(position):
        # Match objects are addressed by 1-based group number; tuples from
        # findall hold the same groups at 0-based offsets.
        if hasattr(match, 'group'):
            try:
                captured = match.group(position)
            except IndexError:
                captured = ""
        elif isinstance(match, (tuple, list)) and 0 <= position - 1 < len(match):
            captured = match[position - 1]
        else:
            captured = ""
        # Optional groups that did not participate come back as None.
        return "" if captured is None else captured

    workshop_title = instructor_title = instructor_name = ""
    date_str = time_str = ""

    try:
        if pattern_index == 0:  # Pattern A/1
            workshop_title = grp(1).strip()
            who = grp(2).strip()
            date_str, time_str = grp(3).strip(), grp(4).strip()

            # Expand the "CD" abbreviation before splitting title and name.
            if who.startswith('CD '):
                who = 'Casting Director ' + who[3:]

            instructor_title, instructor_name = parse_professional_info(who)

        elif pattern_index == 1:  # Pattern B/2
            who = grp(1).strip()
            workshop_title = grp(2).strip()
            date_str, time_str = grp(3).strip(), grp(4).strip()

            instructor_title, instructor_name = parse_professional_info(who)

        elif pattern_index in (2, 3):  # Pattern C/3 and Pattern D
            # Both patterns capture title, name, date, time in groups 1-4 and
            # differ only in the generic workshop title they are given.
            instructor_title, instructor_name, date_str, time_str = (
                grp(position).strip() for position in range(1, 5)
            )
            workshop_title = "Casting Workshop" if pattern_index == 2 else "Industry Workshop"

        elif pattern_index == 4:  # Pattern E
            company_name, agent_type = grp(1).strip(), grp(2).strip()
            instructor_name = grp(3).strip()
            date_str, time_str = grp(4).strip(), grp(5).strip()

            instructor_title = f"{company_name} {agent_type}"
            workshop_title = "Industry Workshop"

        elif pattern_index == 5:  # Pattern F
            # The company name doubles as the instructor's title.
            instructor_title = grp(1).strip()
            instructor_name = grp(2).strip()
            workshop_title = grp(3).strip()
            date_str, time_str = grp(4).strip(), grp(5).strip()

        else:  # Pattern G
            who = grp(1).strip() + " " + grp(2).strip()
            date_str, time_str = grp(3).strip(), grp(4).strip()
            workshop_title = "Industry Workshop"

            # The fallback pattern is loose; reject overly long or multi-line
            # captures instead of reporting them as a person.
            if len(who) > 50 or '\n' in who:
                return None

            instructor_title, instructor_name = parse_professional_info(who)

        if not (instructor_name and date_str):
            return None

        # full_text is the single-string form of the listing (the original
        # code notes it is required by the existing embedding system).
        full_text = f"{workshop_title} with {instructor_title} {instructor_name}"
        full_text += f" on {date_str}"
        if time_str:
            full_text += f" at {clean_time(time_str)}"

        return {
            'title': workshop_title,
            'instructor_name': instructor_name,
            'instructor_title': instructor_title,
            'date': date_str,
            'time': clean_time(time_str),
            'full_text': full_text,
            'source_url': source_url
        }

    except Exception as e:
        print(f"Error parsing pattern match: {e}")

    return None
270
+
271
def parse_professional_info(professional_full: str) -> Tuple[str, str]:
    """
    Split a "title + name" string into ``(title, name)``.

    Matching is case-insensitive because the callers capture this text with
    ``re.IGNORECASE``; a recognised title is always returned in its canonical
    capitalisation.

    Resolution order:
      1. Known multi-word titles. If the title starts the string, the rest is
         the name; otherwise the text before the title (minus a trailing
         comma) is the name.
      2. Single-word titles, optionally preceded by one of the qualifiers
         Casting/Talent/Executive/DDO/Acting. All remaining words form the
         name.
      3. No title recognised: the first word is returned as the title and
         the rest as the name.
      4. A single word (or empty string) is returned as the name with an
         empty title.

    Args:
        professional_full: Raw text such as "Casting Director Jane Doe" or
            "Jane Doe, Talent Agent".

    Returns:
        A ``(title, name)`` tuple of strings.
    """
    professional_full = re.sub(r'\s+', ' ', professional_full).strip()

    # Handle specific multi-word titles (longest / most specific first)
    specific_titles = [
        'Atlanta Models & Talent President',
        'Executive Casting Producer',
        'Casting Director',
        'Casting Associate',
        'DDO Agent',
        'Talent Agent',
        'Acting Coach'
    ]

    for title in specific_titles:
        # Search the original string (not a lowered copy) so that the offsets
        # used for slicing are always valid for professional_full itself.
        found = re.search(re.escape(title), professional_full, re.IGNORECASE)
        if found is None:
            continue

        if found.start() == 0:
            return title, professional_full[found.end():].strip()
        return title, professional_full[:found.start()].strip().rstrip(',')

    # Fallback for single-word titles; map lowercase -> canonical spelling.
    single_word_titles = {
        name.lower(): name
        for name in ('Manager', 'Director', 'Producer', 'Agent', 'Coach', 'President')
    }
    title_qualifiers = {
        name.lower(): name
        for name in ('Casting', 'Talent', 'Executive', 'DDO', 'Acting')
    }

    words = professional_full.split()
    for i, word in enumerate(words):
        canonical = single_word_titles.get(word.lower())
        if canonical is None:
            continue

        qualifier = title_qualifiers.get(words[i - 1].lower()) if i > 0 else None
        if qualifier is not None:
            # The qualifier belongs to the title, so drop it from the name.
            title = f"{qualifier} {canonical}"
            name_parts = words[:i - 1] + words[i + 1:]
        else:
            title = canonical
            name_parts = words[:i] + words[i + 1:]

        return title, ' '.join(name_parts)

    # Final fallback
    if len(words) >= 2:
        return words[0], ' '.join(words[1:])

    return '', professional_full
319
+
320
def parse_refined_workshop_match(match, pattern_num: int, source_url: str) -> Dict[str, str]:
    """
    Parse a regex hit identified by a 1-based pattern number.

    Thin adapter around ``parse_pattern_match``, which expects a 0-based
    pattern index; the result is passed through unchanged.
    """
    zero_based_index = pattern_num - 1
    return parse_pattern_match(match, zero_based_index, source_url)
323
+
324
def is_duplicate_workshop(new_workshop: Dict, existing_workshops: List[Dict]) -> bool:
    """
    Report whether ``new_workshop`` repeats an entry of ``existing_workshops``.

    Two workshops are duplicates when instructor name and date are equal
    (ignoring case and surrounding whitespace) and their titles are equal,
    both contain the word "workshop", or one title contains the other.
    """

    def normalised(record, key):
        # Missing keys count as an empty string.
        return record.get(key, '').strip().lower()

    for candidate in existing_workshops:
        if normalised(candidate, 'instructor_name') != normalised(new_workshop, 'instructor_name'):
            continue
        if normalised(candidate, 'date') != normalised(new_workshop, 'date'):
            continue

        known_title = normalised(candidate, 'title')
        incoming_title = normalised(new_workshop, 'title')

        if known_title == incoming_title:
            return True
        # Generic titles such as "Casting Workshop" / "Industry Workshop"
        # describe the same event once instructor and date agree.
        if 'workshop' in known_title and 'workshop' in incoming_title:
            return True
        if known_title in incoming_title or incoming_title in known_title:
            return True

    return False
338
+
339
def calculate_workshop_confidence(w: Dict) -> float:
    """
    Score how complete a workshop record is.

    Each field that is present and truthy contributes a fixed weight:
    title 0.3, instructor_name 0.3, date 0.2, time 0.1, source_url 0.1.
    The total is rounded to two decimal places.
    """
    field_weights = (
        ('title', 0.3),
        ('instructor_name', 0.3),
        ('date', 0.2),
        ('time', 0.1),
        ('source_url', 0.1),
    )

    total = 0.0
    for key, weight in field_weights:
        if w.get(key):
            total += weight

    return round(total, 2)