saim1309 committed on
Commit
9f57d5e
·
verified ·
1 Parent(s): 77b1a0f

Upload 7 files

Browse files
Files changed (6) hide show
  1. app.py +794 -0
  2. config.py +58 -0
  3. database.py +144 -0
  4. getscene_ai.sqlite +3 -0
  5. scraper.py +331 -0
  6. utils.py +42 -0
app.py ADDED
@@ -0,0 +1,794 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import openai
3
+ import json
4
+ from datetime import datetime, timedelta
5
+ import uuid
6
+ from typing import Dict
7
+
8
+ from config import OPENAI_API_KEY, DB_PATH, EMBED_MODEL
9
+ from utils import get_embedding, cosine_similarity, find_top_k_matches
10
+ from scraper import scrape_workshops_from_squarespace
11
+ from database import (
12
+ fetch_all_embeddings,
13
+ fetch_row_by_id,
14
+ fetch_all_faq_embeddings,
15
+ get_session_state,
16
+ update_session_state,
17
+ log_question
18
+ )
19
+
20
+ # ============================================================================
21
+ # CONFIGURATION
22
+ # ============================================================================
23
+
24
# Fail fast at import time: nothing below works without an OpenAI key.
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY not found in .env file")

openai.api_key = OPENAI_API_KEY

# Session identifier used for the conversation.
# NOTE(review): this is one module-level value read by chat_with_bot for every
# request, so it looks like all concurrent visitors share a session -- confirm
# whether per-user state (e.g. gr.State) is wanted.
session_id = str(uuid.uuid4())

# In-memory cache of scraped workshops and their embeddings.
workshop_cache = dict(
    data=[],
    embeddings=[],
    last_updated=None,
    cache_duration=timedelta(hours=24),
)

# ============================================================================
# KEYWORD LISTS FOR ROUTING
# ============================================================================

# Phrases counted as emotional signals by detect_response_type.
EMOTIONAL_KEYWORDS = [
    "stuck", "frustrated", "discouraged", "overwhelmed", "scared",
    "nervous", "anxious", "worried", "fear", "doubt", "confidence",
    "insecure", "lost", "confused", "struggling", "hard time",
    "giving up", "burnout", "rejection", "failed", "can't",
    "feeling", "feel", "emotional", "depressed", "sad", "unmotivated",
    "hopeless", "stressed", "pressure", "imposter",
]

# Phrases counted as action/results signals.
ACTION_KEYWORDS = [
    "get an agent", "find agent", "need agent", "want agent", "sign with agent",
    "more auditions", "book", "booking", "callbacks", "improve",
    "better", "self-tape", "materials", "headshots", "reel",
    "network", "connections", "industry", "career", "strategy",
    "agent prep", "total agent prep", "workshop", "class", "training",
    "results", "success", "grow", "advance", "level up",
]

# Phrases that route a question to the "email us" policy response.
POLICY_KEYWORDS = [
    "refund", "refunds", "money back",
    "attend", "attendance", "miss", "missed", "missing", "absent",
    "late", "lateness", "tardy",
    "reschedule", "change date", "move class",
    "credit", "credits",
    "cancel", "cancellation", "canceling",
    "policy", "policies",
]

# Phrases indicating the user wants a longer, more detailed answer.
DETAIL_SYNONYMS = [
    "detail", "details", "explain", "elaborate", "tell me more",
    "more info", "describe", "thorough", "comprehensive",
]

# Persona text placed at the top of every system prompt.
PERSONA_INSTRUCTION = """
You are a warm, encouraging mentor at Get Scene Studios. Your goal is to help actors navigate their careers with confidence.
- Sound natural and human, not scripted or robotic. Use conversational transitions like "I'd suggest starting with..." or "A great way to approach this is..."
- Be encouraging but practical. Acknowledge that the acting journey is a marathon, not a sprint.
- Help the user THINK: Instead of just giving an answer, add a brief "mentorship flourish" that explains the value of a recommendation (e.g., "This workshop is great because it gets you comfortable with the pressure of a real callback.")
"""
84
+
85
+ # ============================================================================
86
+ # HELPER FUNCTIONS
87
+ # ============================================================================
88
+
89
def calculate_workshop_confidence(w: Dict) -> float:
    """Score how complete a scraped workshop record is.

    Each populated (truthy) field contributes a fixed weight; the weights
    add up to 1.0 when every field is present.

    Args:
        w: Workshop record; read via ``.get`` for the keys listed below.

    Returns:
        The summed weight of the populated fields, rounded to 2 decimals.
    """
    field_weights = (
        ('title', 0.3),
        ('instructor_name', 0.3),
        ('date', 0.2),
        ('time', 0.1),
        ('source_url', 0.1),
    )

    confidence = 0.0
    for field_name, weight in field_weights:
        if w.get(field_name):
            confidence += weight
    return round(confidence, 2)
98
+
99
+ # ============================================================================
100
+ # WORKSHOP FUNCTIONS
101
+ # ============================================================================
102
+
103
def get_current_workshops():
    """Return current workshops and their embeddings, using the module cache.

    The cache in ``workshop_cache`` is served while it is non-empty and
    younger than ``workshop_cache['cache_duration']``. Otherwise both
    schedule pages are scraped, records scoring below 0.8 in
    ``calculate_workshop_confidence`` are rejected, and an embedding is
    generated for each remaining record's ``full_text``.

    Failure handling:
        * A scrape that raises (or returns nothing) for one page no longer
          aborts the refresh; the other page's results are still used.
        * A workshop whose embedding cannot be generated is left out rather
          than stored with an all-zero placeholder vector, so the returned
          lists stay index-aligned and contain only real embeddings.
        * If nothing usable results, previously cached data (even if
          expired) is returned; with no cache, ``([], [])`` is returned.

    Returns:
        tuple[list, list]: ``(workshops, embeddings)`` of equal length.
    """
    now = datetime.now()
    cached_at = workshop_cache['last_updated']

    # Check if cache is still valid
    if (cached_at
            and workshop_cache['data']
            and now - cached_at < workshop_cache['cache_duration']):
        print("Using cached workshop data")
        return workshop_cache['data'], workshop_cache['embeddings']

    def stale_or_empty():
        # Prefer stale data over no data when a refresh yields nothing.
        if workshop_cache['data']:
            print("Scraping failed, using cached data")
            return workshop_cache['data'], workshop_cache['embeddings']
        print("No workshop data available")
        return [], []

    print("Fetching fresh workshop data...")

    # Scrape each schedule page independently so one failure does not
    # discard the other page's workshops.
    all_workshops = []
    for page_url in ("https://www.getscenestudios.com/online",
                     "https://www.getscenestudios.com/instudio"):
        try:
            all_workshops.extend(scrape_workshops_from_squarespace(page_url) or [])
        except Exception as e:
            print(f"Error scraping {page_url}: {e}")

    # Data Integrity: Validate and score workshops
    valid_workshops = []
    total_score = 0
    for w in all_workshops:
        conf = calculate_workshop_confidence(w)
        if conf >= 0.8:
            valid_workshops.append(w)
            total_score += conf
        else:
            print(f"⚠️ Rejecting weak record (Confidence: {conf}): {w.get('title', 'Unknown')}", flush=True)

    avg_conf = total_score / len(valid_workshops) if valid_workshops else 0
    print(f"📊 DATA INTEGRITY: Found {len(all_workshops)} total, {len(valid_workshops)} valid (Confidence >= 0.8)", flush=True)
    print(f"📈 Retrieval Confidence: {avg_conf:.2f} (Average)", flush=True)

    if not valid_workshops:
        return stale_or_empty()

    # Generate embeddings; keep only workshops that actually got one.
    embedded_workshops = []
    workshop_embeddings = []
    for workshop in valid_workshops:
        try:
            embedding = get_embedding(workshop['full_text'])
        except Exception as e:
            print(f"Error generating embedding for workshop: {e}")
            continue
        embedded_workshops.append(workshop)
        workshop_embeddings.append(embedding)

    if not embedded_workshops:
        # Do not overwrite a usable (if stale) cache with an empty result.
        return stale_or_empty()

    # Update cache
    workshop_cache['data'] = embedded_workshops
    workshop_cache['embeddings'] = workshop_embeddings
    workshop_cache['last_updated'] = now

    print(f"Cached {len(embedded_workshops)} workshops")
    return embedded_workshops, workshop_embeddings
166
+
167
def find_top_workshops(user_embedding, k=3):
    """Rank the current workshops by similarity to ``user_embedding``.

    Args:
        user_embedding: Embedding of the user's question.
        k: Maximum number of results to return.

    Returns:
        Up to ``k`` tuples ``(score, index, full_text, workshop)`` in
        descending order, or ``[]`` when no workshops are available.
        Workshops whose scoring raises are logged and skipped.
    """
    workshops, workshop_embeddings = get_current_workshops()
    if not workshops:
        return []

    ranked = []
    for position, (entry, vector) in enumerate(zip(workshops, workshop_embeddings)):
        try:
            ranked.append(
                (cosine_similarity(user_embedding, vector), position, entry['full_text'], entry)
            )
        except Exception as e:
            print(f"Error calculating similarity: {e}")

    # Tuples order by score first, then by position.
    return sorted(ranked, reverse=True)[:k]
185
+
186
+ # ============================================================================
187
+ # PROMPT BUILDING FUNCTIONS
188
+ # ============================================================================
189
+
190
def generate_enriched_links(row):
    """Build a one-item list with a markdown link for a podcast episode row.

    Args:
        row: Dict-like episode record read via ``.get`` for ``youtube_url``,
            ``guest_name`` and ``highlight_json`` (a JSON array whose first
            element may carry a ``summary`` string). May be ``None``.

    Returns:
        ``[markdown]`` where the link text names the guest and is followed
        by the first sentence of the first highlight's summary (truncated to
        120 characters), or a generic blurb when no summary is available.
        Returns ``[]`` when ``row`` is empty/``None`` or has no URL, so that
        callers fall back instead of emitting a link to ``None``.
    """
    if not row:
        return []

    base_url = row.get("youtube_url")
    if not base_url:
        return []

    guest_name = row.get("guest_name") or ""

    # The key may be present with a None value, in which case a .get()
    # default would not apply; malformed JSON is treated as no highlights.
    try:
        highlights = json.loads(row.get("highlight_json") or "[]")
    except (TypeError, ValueError):
        highlights = []

    summary = ""
    if isinstance(highlights, list) and highlights and isinstance(highlights[0], dict):
        candidate = highlights[0].get("summary")
        if isinstance(candidate, str):
            summary = candidate

    if summary:
        # Truncate summary to first sentence only
        first_sentence = summary.split('.')[0] + '.'
        if len(first_sentence) > 120:
            short_summary = first_sentence[:117] + "..."
        else:
            short_summary = first_sentence
    else:
        short_summary = "Industry insights for actors"

    markdown = f"🎧 [Watch {guest_name}'s episode here]({base_url}) - {short_summary}"
    return [markdown]
209
+
210
def build_enhanced_prompt(user_question, context_results, top_workshops, user_preference=None, enriched_podcast_links=None, wants_details=False, current_topic=None):
    """Build the system prompt with strict formatting rules.

    Args:
        user_question: The user's message; it is interpolated into the prompt
            and also used for keyword-based topic detection.
        context_results: Accepted for interface compatibility; not used.
        top_workshops: Sequence of 4-tuples whose last element is a workshop
            dict (as returned by ``find_top_workshops``). The first 10 are
            considered and at most 3 valid ones are listed.
        user_preference: Preferred format, e.g. ``'online'`` or
            ``'instudio'`` (as produced by ``detect_preference``). Spelling
            variants such as ``'In-Studio'`` are normalised before filtering.
        enriched_podcast_links: Markdown podcast links; only the first is
            used. A YouTube channel link is substituted when empty.
        wants_details: When true, ask the model for a detailed answer instead
            of a brief one.
        current_topic: Session topic used when the question itself contains
            no topic keyword.

    Returns:
        str: The support-mode prompt when ``detect_response_type`` classifies
        the question as ``"support"``, otherwise the standard three-step
        recommendation prompt.
    """
    instudio_url = "https://www.getscenestudios.com/instudio"
    online_url = "https://www.getscenestudios.com/online"

    # Free classes are ONLY available online (never in-studio)
    free_class_url = online_url

    # Normalise the preference so 'instudio', 'in-studio' and 'In Studio' all
    # compare equal. Previously 'instudio' was compared against 'in-studio',
    # which filtered out every workshop for in-studio users.
    normalized_preference = None
    if user_preference:
        normalized_preference = (
            user_preference.lower().replace('-', '').replace('_', '').replace(' ', '')
        )

    def format_workshop(w):
        """Return a markdown bullet for ``w``, or None if incomplete/filtered."""
        if not w.get('title') or not w.get('instructor_name') or not w.get('date'):
            return None

        # One test decides both the link and the format, so they cannot disagree.
        is_instudio = "/instudio" in (w.get('source_url') or '')

        # User Preference Filtering
        if normalized_preference:
            workshop_format = 'instudio' if is_instudio else 'online'
            if normalized_preference != workshop_format:
                return None

        link = instudio_url if is_instudio else online_url
        return f"- [{w['title']}]({link}) with {w['instructor_name']} ({w.get('time', '')}) on {w['date']}"

    # Prepare workshop list: check top 10, keep the first 3 that survive filtering
    workshop_lines = []
    if top_workshops:
        for _, _, _, w_data in top_workshops[:10]:
            formatted = format_workshop(w_data)
            if formatted:
                workshop_lines.append(formatted)

    if workshop_lines:
        workshop_text = "\n".join(workshop_lines[:3])
    elif normalized_preference == 'instudio':
        # Fallback link should respect user preference
        workshop_text = "Check our schedule for current availability at https://www.getscenestudios.com/instudio"
    else:
        workshop_text = "Check our schedule for current availability at https://www.getscenestudios.com/online"

    # Handle missing podcast data strictly
    if not enriched_podcast_links:
        single_podcast = "Our latest industry insights are available on YouTube: https://www.youtube.com/@GetSceneStudios"
    else:
        single_podcast = enriched_podcast_links[0]

    # --- EMOTIONAL / SUPPORT MODE CHECK ---
    if detect_response_type(user_question) == "support":
        return f"""{PERSONA_INSTRUCTION}

You are acting in SUPPORT MODE. Provide a response with extra empathy.

CRITICAL INSTRUCTIONS:
- Answer with EMPATHY and ENCOURAGEMENT (2-3 sentences)
- Acknowledge their feelings ("stuck", "frustrated", etc.)
- DO NOT upsell paid workshops in this response.
- Offer ONE gentle step: a free resource (podcast or free class).

USER'S QUESTION: {user_question}

REQUIRED RESPONSE FORMAT:
[Your empathetic, supportive response]

Here's a free resource that might help:
1. Podcast episode: {single_podcast}
2. Or join our free class: {free_class_url}

Questions? Contact info@getscenestudios.com"""

    # --- STANDARD LOGIC FOR CONTEXT SNIPPET ---
    question_lower = user_question.lower()

    # Priority 1: direct keywords in the current question (first match wins).
    # NOTE(review): these are substring tests, so e.g. 'rep' also matches
    # "prepare" and 'new' matches "news" -- confirm whether that is intended.
    topic_keywords = (
        ('agent', ['agent', 'representation', 'rep', 'manager']),
        ('beginner', ['beginner', 'new', 'start', 'beginning']),
        ('audition', ['callback', 'audition', 'tape', 'self-tape', 'booking']),
        ('mentorship', ['mentorship', 'coaching']),
        ('pricing', ['price', 'cost', 'how much']),
    )
    detected_topic = None
    for topic, words in topic_keywords:
        if any(word in question_lower for word in words):
            detected_topic = topic
            break

    # Priority 2: Fallback to session context if current question is ambiguous
    if not detected_topic and current_topic:
        topic_map = {
            'agent_seeking': 'agent',
            'beginner': 'beginner',
            'audition_help': 'audition',
            'mentorship': 'mentorship',
            'pricing': 'pricing'
        }
        detected_topic = topic_map.get(current_topic)

    # Assign snippet based on topic
    snippets = {
        'agent': "Get Scene Studios has helped 1000+ actors land representation. Total Agent Prep offers live practice with working agents (age 16+, limited to 12 actors).",
        'beginner': "Get Scene Studios specializes in getting actors audition-ready fast with camera technique and professional self-tape skills.",
        'audition': "Get Scene offers Crush the Callback (Zoom simulation) and Perfect Submission (self-tape mastery) for actors refining their technique.",
        'mentorship': "Working Actor Mentorship is a 6-month program ($3,000) with structured feedback and industry access.",
        'pricing': "Get Scene Studios pricing varies by program. Most workshops cap at 12-14 actors for personalized feedback.",
    }
    context_snippet = snippets.get(
        detected_topic,
        "Get Scene Studios (founded by Jesse Malinowski) offers training for TV/film actors at all levels.",
    )

    if not user_preference:
        preference_instruction = """
IMPORTANT: We need to know if the user prefers "Online" or "In-Studio" workshops.
If their question implies a location or they haven't specified, ask: "Are you looking for Online or In-Studio training?" as part of your response.
"""
    else:
        preference_instruction = f"""
USER PREFERENCE KNOWN: {user_preference.upper()}
1. DO NOT ask "Online or In-Studio" again.
2. Ensure your recommendations align with {user_preference.upper()} where possible.
"""

    # Brevity & Cognitive Load: Direct instructions based on user intent
    if wants_details:
        detail_instruction = "Provide a detailed and thorough explanation for the user's request, but keep it structured and readable."
    else:
        detail_instruction = "Answer the user's question briefly (2-3 sentences max, ≤150 words total)."

    prompt = f"""{PERSONA_INSTRUCTION}

{context_snippet}

CRITICAL INSTRUCTIONS:
- {detail_instruction}
- Use natural, human transitions between your answer and the recommendations.
- For each recommendation, add a tiny bit of "mentor advice" on why it helps.
- Then ALWAYS provide exactly these three numbered recommendations (1. 2. 3.):
- Use ONLY the provided links - do not invent recommendations
- Focus on clean, readable formatting.{preference_instruction}

USER'S QUESTION: {user_question}

REQUIRED RESPONSE FORMAT:
[Your brief answer to their question]

Here's your path forward:
1. Free class (start here, no credit card required): {free_class_url}
2. Recommended podcast episode:
{single_podcast}
3. Relevant paid workshop:
{workshop_text}

Questions? Contact info@getscenestudios.com"""

    return prompt
371
+
372
+ # ============================================================================
373
+ # DETECTION FUNCTIONS
374
+ # ============================================================================
375
+
376
def detect_question_category(question):
    """Return every category whose trigger phrases occur in ``question``.

    Matching is a case-insensitive substring test. A question can fall into
    several categories; the result follows the order of the table below.
    """
    text = question.lower()

    triggers_by_category = {
        'agent_seeking': ['agent', 'representation', 'rep', 'manager', 'get an agent'],
        'beginner': ['beginner', 'new', 'start', 'beginning', 'first time', 'never acted'],
        'audition_help': ['audition', 'callback', 'tape', 'self-tape', 'submission'],
        'mentorship': ['mentorship', 'coaching', 'intensive', 'mentor', 'one-on-one'],
        'pricing': ['price', 'cost', 'pricing', '$', 'money', 'payment', 'fee'],
        'classes': ['class', 'workshop', 'training', 'course', 'learn'],
        'membership': ['membership', 'join', 'member', 'gsp', 'plus'],
        'technical': ['self-tape', 'equipment', 'lighting', 'editing', 'camera'],
    }

    return [
        name
        for name, triggers in triggers_by_category.items()
        if any(trigger in text for trigger in triggers)
    ]
397
+
398
def detect_response_type(question):
    """Classify a question as ``"support"`` or ``"standard"``.

    Counts how many EMOTIONAL_KEYWORDS and ACTION_KEYWORDS occur (as
    substrings) in the lower-cased question. The result is ``"support"`` only
    when at least one emotional keyword is present and emotional hits are not
    outnumbered by action hits.
    """
    lowered = question.lower()

    emotional_hits = len([kw for kw in EMOTIONAL_KEYWORDS if kw in lowered])
    action_hits = len([kw for kw in ACTION_KEYWORDS if kw in lowered])

    if emotional_hits == 0 or emotional_hits < action_hits:
        return "standard"
    return "support"
408
+
409
def detect_policy_issue(question):
    """Return True when the question touches a policy topic (refunds, etc.).

    A keyword from POLICY_KEYWORDS counts only when it begins at a word
    boundary. Plain substring matching produced false positives inside
    unrelated words: 'miss' matched "Perfect Submission" (a studio program
    named elsewhere in this file) and 'late' matched "related", which sent
    ordinary questions to the "please email us" response. Suffixes are still
    accepted, so "cancelled" or "attending" continue to match.

    Args:
        question: The raw user question.

    Returns:
        bool: True if any policy keyword starts a word in the question.
    """
    import re  # local import keeps this block self-contained

    question_lower = question.lower()
    return any(
        re.search(r'\b' + re.escape(keyword), question_lower) is not None
        for keyword in POLICY_KEYWORDS
    )
413
+
414
+ def detect_preference(question):
415
+ """Detect if user is stating a preference"""
416
+ q_lower = question.lower()
417
+ if 'online' in q_lower and 'studio' not in q_lower:
418
+ return 'online'
419
+ if ('studio' in q_lower or 'person' in q_lower or 'atlanta' in q_lower) and 'online' not in q_lower:
420
+ return 'instudio'
421
+ return None
422
+
423
def get_contextual_business_info(categories):
    """Look up program guidance for each recognised question category.

    Args:
        categories: Iterable of category names (as produced by
            ``detect_question_category``).

    Returns:
        dict: ``{category: info}`` for the categories that have an entry,
        where ``info`` holds ``programs``, ``key_info`` and ``journey``.
        Categories without an entry are omitted.
    """
    guidance = {
        'agent_seeking': {
            'programs': ['Total Agent Prep', 'Working Actor Mentorship'],
            'key_info': 'Live pitch practice with real agents, Actors Access optimization',
            'journey': 'Total Agent Prep → GSP → Mentorship for sustained progress',
        },
        'beginner': {
            'programs': ['Free Classes', 'Get Scene 360', 'Get Scene Plus'],
            'key_info': 'Start with holistic foundation, build consistency',
            'journey': 'Free class → Get Scene 360 → GSP membership',
        },
        'audition_help': {
            'programs': ['Perfect Submission', 'Crush the Callback', 'Audition Insight'],
            'key_info': 'Self-tape mastery, callback simulation, pro feedback',
            'journey': 'Perfect Submission → GSP for ongoing Audition Insight',
        },
        'mentorship': {
            'programs': ['Working Actor Mentorship'],
            'key_info': '6-month intensive with structured feedback and accountability',
            'journey': 'Ready for commitment → WAM → Advanced workshops',
        },
    }

    return {name: guidance[name] for name in categories if name in guidance}
455
+
456
+ # ============================================================================
457
+ # MAIN CHATBOT LOGIC
458
+ # ============================================================================
459
+
460
def update_knowledge_from_question(session_id: str, question: str):
    """Extract format/topic attributes from a question and persist them.

    Args:
        session_id: Session whose knowledge is updated.
        question: The raw user question.

    Returns:
        dict: The attributes that were stored (``'format'`` and/or
        ``'topic'``), or ``{}`` when nothing was detected, in which case the
        session state is not touched.
    """
    updates = {}

    # Format (online / in-studio)
    detected_format = detect_preference(question)
    if detected_format:
        updates['format'] = detected_format

    # Topic: a specific topic wins over a generic one; otherwise take the
    # first detected category.
    detected_categories = detect_question_category(question)
    if detected_categories:
        specific_topics = ('agent_seeking', 'beginner', 'audition_help', 'mentorship', 'pricing')
        updates['topic'] = next(
            (topic for topic in specific_topics if topic in detected_categories),
            detected_categories[0],
        )

    if not updates:
        return {}

    update_session_state(session_id, knowledge_update=updates, increment_count=False)
    return updates
485
+
486
def process_question(question: str, current_session_id: str):
    """Answer a user question; the single entry point behind the chat UI.

    Routing, in order:
        0. Policy questions get a fixed "please email" reply.
        1. Session knowledge (format, topic) is updated and loaded.
        2. The best FAQ match is used directly at similarity >= 0.85; in the
           0.70-0.85 zone a one-time "Did you mean" clarification may be
           asked before the FAQ answer is used.
        3. Questions with no acting-related signal get an "email us" reply.
        4. Otherwise a prompt is built from workshops and podcasts and sent
           to the chat model.

    Args:
        question: The user's message.
        current_session_id: Identifier of the conversation session.

    Returns:
        str: The reply text.

    Raises:
        Exception: Errors from embedding, database or OpenAI calls are not
        caught here (``chat_with_bot`` handles them).
    """
    if not question:
        return "Question is required"

    # 0. HARD POLICY CHECK
    if detect_policy_issue(question):
        log_question(question, current_session_id)
        return "Please email info@getscenestudios.com."

    # 1. Handle Session & Knowledge State
    update_knowledge_from_question(current_session_id, question)

    session_state = get_session_state(current_session_id) or {}

    # A missing, NULL or malformed knowledge blob all mean "no knowledge yet".
    try:
        knowledge = json.loads(session_state.get('knowledge_context') or '{}')
    except (TypeError, ValueError):
        knowledge = {}
    if not isinstance(knowledge, dict):
        knowledge = {}

    user_preference = knowledge.get('format') or session_state.get('preference')
    current_topic = knowledge.get('topic')

    update_session_state(current_session_id, increment_count=True)

    # Create embedding of user question
    user_embedding = get_embedding(question)

    # 2. Check FAQ embeddings first
    top_faqs = []
    for entry_id, question_text, answer_text, emb in fetch_all_faq_embeddings():
        score = cosine_similarity(user_embedding, emb)
        top_faqs.append((score, entry_id, question_text, answer_text))
    top_faqs.sort(reverse=True)

    faq_threshold = 0.85
    ambiguous_threshold = 0.70
    best_faq_score = top_faqs[0][0] if top_faqs else None

    def journey_lines(template):
        """Format one next-step line per category that has business info."""
        info_by_category = get_contextual_business_info(detect_question_category(question))
        return [template.format(journey=info['journey']) for info in info_by_category.values()]

    # If high-confidence FAQ match found
    if best_faq_score is not None and best_faq_score >= faq_threshold:
        update_session_state(current_session_id, reset_clarification=True, increment_count=False)

        _, _, _, answer_text = top_faqs[0]

        mentor_framing_start = "That's a great question! Here's the information on that:"
        mentor_framing_end = "I hope that clears things up! Remember, every bit of knowledge helps you steer your career in the right direction."

        enhanced_answer = f"{mentor_framing_start}\n\n{answer_text}"

        next_steps = journey_lines("A great next step for you: {journey}")
        if next_steps:
            enhanced_answer += "\n\n" + "\n".join(next_steps)

        enhanced_answer += f"\n\n{mentor_framing_end}\n\nQuestions? Contact info@getscenestudios.com"

        log_question(question, current_session_id, answer=enhanced_answer)
        return enhanced_answer

    if best_faq_score is not None and best_faq_score >= ambiguous_threshold:
        # AMBIGUOUS ZONE: ask once for clarification when we lack context.
        is_generic_query = any(w in question.lower() for w in ['price', 'cost', 'how much', 'schedule', 'when'])
        already_asked = (session_state.get('clarification_count', 0) or 0) > 0
        needs_clarification = (
            (not user_preference or (is_generic_query and not current_topic))
            and not already_asked
        )

        if needs_clarification:
            update_session_state(current_session_id, increment_clarification=True, increment_count=False)
            best_match_q = top_faqs[0][2]
            return f"Did you mean: {best_match_q}?"

        # Auto-Resolve
        update_session_state(current_session_id, reset_clarification=True, increment_count=False)

        _, _, _, answer_text = top_faqs[0]

        enhanced_answer = answer_text
        next_steps = journey_lines("Next step: Consider {journey}")
        if next_steps:
            enhanced_answer += "\n\n" + "\n".join(next_steps)
        enhanced_answer += "\n\nQuestions? Contact info@getscenestudios.com"

        log_question(question, current_session_id, answer=enhanced_answer)
        return enhanced_answer

    # 3. HALLUCINATION GUARD
    categories = detect_question_category(question)
    question_lower = question.lower()

    has_session_context = (current_topic is not None) or (user_preference is not None)

    is_acting_related = (
        len(categories) > 0 or
        detect_response_type(question) == "support" or
        any(k in question_lower for k in ACTION_KEYWORDS) or
        any(k in question_lower for k in ['class', 'workshop', 'coaching', 'studio', 'acting', 'online', 'person', 'atlanta', 'training']) or
        (has_session_context and len(question.split()) <= 10)
    )

    if not is_acting_related:
        return "I'm not exactly sure about that. Please email info@getscenestudios.com so a member of our team can get you the most accurate answer!"

    # 4. LLM PATH
    update_session_state(current_session_id, reset_clarification=True, increment_count=False)
    podcast_data = fetch_all_embeddings("podcast_episodes")
    top_workshops = find_top_workshops(user_embedding, k=10)
    top_podcasts = find_top_k_matches(user_embedding, podcast_data, k=3)

    enriched_podcast_links = []
    for _, podcast_id, _ in top_podcasts:
        row = fetch_row_by_id("podcast_episodes", podcast_id)
        if row:  # the id may no longer resolve to a row
            enriched_podcast_links.extend(generate_enriched_links(row))

    # Fall back to the first stored episode, but only if there is one;
    # build_enhanced_prompt substitutes a channel link for an empty list.
    if not enriched_podcast_links and podcast_data:
        fallback = fetch_row_by_id("podcast_episodes", podcast_data[0][0])
        if fallback:
            enriched_podcast_links = generate_enriched_links(fallback)

    # 5. Brevity & Detail Detection
    wants_details = any(syn in question_lower for syn in DETAIL_SYNONYMS)

    final_prompt = build_enhanced_prompt(
        question,
        None,
        top_workshops,
        user_preference=user_preference,
        enriched_podcast_links=enriched_podcast_links,
        wants_details=wants_details,
        current_topic=current_topic
    )

    response = openai.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": final_prompt},
            {"role": "user", "content": question}
        ]
    )

    # Guard against a None message content before stripping.
    answer = (response.choices[0].message.content or "").strip()

    # Log the answer too, matching the FAQ paths above.
    log_question(question, current_session_id, answer=answer)

    return answer
659
+
660
+ # ============================================================================
661
+ # GRADIO INTERFACE
662
+ # ============================================================================
663
+
664
def chat_with_bot(message, history):
    """Handle one chat turn for the Gradio UI.

    Args:
        message: The user's current message (may be empty or ``None``).
        history: Chat history as a list of ``{"role", "content"}`` dicts, or
            ``None`` when the conversation has no history yet.

    Returns:
        list: The history with the user message and the assistant reply
        appended. Blank messages leave the history unchanged. Any exception
        raised while answering is reported as the assistant reply rather than
        propagated to the UI.
    """
    # Treat a missing history as an empty conversation.
    if history is None:
        history = []

    if not message or not message.strip():
        return history

    try:
        # Process question directly (reads the module-level session_id)
        bot_reply = process_question(message, session_id)
    except Exception as e:
        bot_reply = f"❌ Error: {str(e)}"

    # Append to history in Gradio messages format
    history.append({"role": "user", "content": message})
    history.append({"role": "assistant", "content": bot_reply})
    return history
690
+
691
def reset_session():
    """Start a new conversation.

    Replaces the module-level ``session_id`` with a fresh UUID string and
    returns an empty list, which the UI uses to clear the chat history.
    """
    global session_id
    fresh_id = uuid.uuid4()
    session_id = str(fresh_id)
    return []  # , f"🔄 New session started: {session_id[:8]}..."
696
+
697
+ # Create Gradio interface
698
with gr.Blocks(title="Get Scene Studios Chatbot") as demo:

    # Page header
    gr.Markdown(
        """
# 🎬 Get Scene Studios AI Chatbot

Ask questions about acting classes, workshops and more!
"""
    )

    # # Session info display
    # session_info = gr.Textbox(
    #     label="Current Session ID",
    #     value=f"Session: {session_id[:8]}...",
    #     interactive=False,
    #     scale=1
    # )

    # Conversation panel
    chatbot = gr.Chatbot(label="Conversation", height=500)

    # Message box with its send button
    with gr.Row():
        msg = gr.Textbox(label="Your Message", lines=2, scale=4)
        submit_btn = gr.Button("Send 📤", scale=1, variant="primary")

    # Conversation-level actions
    with gr.Row():
        clear_btn = gr.Button("Clear Chat 🗑️", scale=1)
        reset_btn = gr.Button("New Session 🔄", scale=1)

    # Example questions
    # gr.Examples(
    #     examples=[
    #         "How much does it cost?",
    #         "I want to get an agent",
    #         "I'm a beginner, where should I start?",
    #         "Tell me about your workshops",
    #         "Do you have online classes?",
    #         "What's the difference between Perfect Submission and Crush the Callback?",
    #         "I prefer in-studio training",
    #         "Tell me about mentorship programs"
    #     ],
    #     inputs=msg,
    #     label="💡 Try these example questions:"
    # )

    # Event handlers: the Send button and pressing Enter in the textbox both
    # answer the message, then empty the textbox.
    for register_trigger in (submit_btn.click, msg.submit):
        register_trigger(
            fn=chat_with_bot,
            inputs=[msg, chatbot],
            outputs=[chatbot],
        ).then(
            fn=lambda: "",
            inputs=None,
            outputs=[msg],
        )

    # Clearing only empties the visible chat; the session is kept.
    clear_btn.click(fn=lambda: [], inputs=None, outputs=[chatbot])

    # A new session also empties the chat (reset_session returns []).
    reset_btn.click(fn=reset_session, inputs=None, outputs=[chatbot])  # , session_info]
784
+
785
+ # Launch the app
786
if __name__ == "__main__":
    # Startup banner, then hand control to Gradio.
    divider = "=" * 60
    print("\n" + divider)
    print("🎬 Get Scene Studios Chatbot")
    print(divider)
    print("\n✅ No Flask API needed - all processing is done directly!")
    print("🌐 Gradio interface will open in your browser")
    print(divider + "\n")

    demo.launch()
config.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datetime import timedelta
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+
7
# --- Credentials -----------------------------------------------------------
# Read from the environment; None when the variable is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# --- Storage ---------------------------------------------------------------
# SQLite file used by the database helpers.
DB_PATH = "getscene_ai.sqlite"

# --- Models ----------------------------------------------------------------
# Embedding model name passed to the OpenAI embeddings endpoint.
EMBED_MODEL = "text-embedding-3-small"

# --- Caching ---------------------------------------------------------------
CACHE_DURATION = timedelta(hours=24)

# --- Keyword lists ---------------------------------------------------------
# Words/phrases that signal an emotionally framed message.
EMOTIONAL_KEYWORDS = [
    "stuck", "frustrated", "discouraged", "overwhelmed", "scared",
    "nervous", "anxious", "worried", "fear", "doubt", "confidence",
    "insecure", "lost", "confused", "struggling", "hard time",
    "giving up", "burnout", "rejection", "failed", "can't",
    "feeling", "feel", "emotional", "depressed", "sad", "unmotivated",
    "hopeless", "stressed", "pressure", "imposter",
]

# Words/phrases that signal a goal- or action-oriented message.
ACTION_KEYWORDS = [
    "get an agent", "find agent", "need agent", "want agent", "sign with agent",
    "more auditions", "book", "booking", "callbacks", "improve",
    "better", "self-tape", "materials", "headshots", "reel",
    "network", "connections", "industry", "career", "strategy",
    "agent prep", "total agent prep", "workshop", "class", "training",
    "results", "success", "grow", "advance", "level up",
]

# Words/phrases that signal a question about studio policies.
POLICY_KEYWORDS = [
    "refund", "refunds", "money back",
    "attend", "attendance", "miss", "missed", "missing", "absent",
    "late", "lateness", "tardy",
    "reschedule", "change date", "move class",
    "credit", "credits",
    "cancel", "cancellation", "canceling",
    "policy", "policies",
]

# Words/phrases that signal a request for a more detailed answer.
DETAIL_SYNONYMS = [
    "detail", "details", "explain", "elaborate", "tell me more",
    "more info", "describe", "thorough", "comprehensive",
]

# --- Persona ---------------------------------------------------------------
# Tone/voice instructions for the assistant.
PERSONA_INSTRUCTION = """
You are a warm, encouraging mentor at Get Scene Studios. Your goal is to help actors navigate their careers with confidence.
- Sound natural and human, not scripted or robotic. Use conversational transitions like "I'd suggest starting with..." or "A great way to approach this is..."
- Be encouraging but practical. Acknowledge that the acting journey is a marathon, not a sprint.
- Help the user THINK: Instead of just giving an answer, add a brief "mentorship flourish" that explains the value of a recommendation (e.g., "This workshop is great because it gets you comfortable with the pressure of a real callback.")
"""
database.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ import json
3
+ from contextlib import contextmanager
4
+ from typing import List, Dict, Any, Tuple
5
+ from config import DB_PATH
6
+
7
@contextmanager
def get_db_connection():
    """Yield a SQLite connection to ``DB_PATH`` and always close it afterwards.

    The connection's ``row_factory`` is ``sqlite3.Row`` so rows can be read by
    column name. Nothing is committed automatically; callers that write must
    call ``commit()`` themselves before leaving the ``with`` block.
    """
    connection = sqlite3.connect(DB_PATH)
    connection.row_factory = sqlite3.Row
    try:
        yield connection
    finally:
        # Runs on normal exit and when the caller's block raises.
        connection.close()
16
+
17
def fetch_all_embeddings(table: str) -> List[Tuple[int, str, List[float]]]:
    """Return ``(id, full_text, embedding)`` for every usable row of ``table``.

    Args:
        table: Name of a table that has ``id``, ``full_text`` and ``embedding``
            columns, with ``embedding`` stored as JSON text.

    Returns:
        One tuple per row whose ``embedding`` column decodes as JSON. Rows with
        a missing or malformed embedding are skipped (best effort).
    """
    # A table name cannot be bound as a "?" parameter, so it has to be placed
    # in the SQL text. Quote it as an identifier (doubling embedded quotes) so
    # an unexpected value cannot change the shape of the statement.
    quoted_table = '"' + table.replace('"', '""') + '"'

    with get_db_connection() as conn:
        rows = conn.execute(
            f"SELECT id, full_text, embedding FROM {quoted_table}"
        ).fetchall()

    parsed = []
    for row in rows:
        try:
            embedding = json.loads(row['embedding'])
        except (json.JSONDecodeError, TypeError):
            # NULL or corrupt embedding: skip this row, keep the rest.
            continue
        parsed.append((row['id'], row['full_text'], embedding))
    return parsed
31
+
32
def fetch_row_by_id(table: str, row_id: int) -> Dict[str, Any]:
    """Return the row of ``table`` whose ``id`` equals ``row_id`` as a dict.

    Args:
        table: Name of the table to read from.
        row_id: Value matched against the ``id`` column.

    Returns:
        A ``{column: value}`` dict for the matching row, or ``{}`` when no row
        matches.
    """
    # The row id is bound as a parameter; the table name cannot be, so it is
    # quoted as an identifier (embedded quotes doubled) before interpolation.
    quoted_table = '"' + table.replace('"', '""') + '"'

    with get_db_connection() as conn:
        row = conn.execute(
            f"SELECT * FROM {quoted_table} WHERE id = ?", (row_id,)
        ).fetchone()
        return dict(row) if row else {}
39
+
40
def fetch_all_faq_embeddings() -> List[Tuple[int, str, str, List[float]]]:
    """Load every FAQ entry together with its decoded embedding.

    Returns:
        ``(id, question, answer, embedding)`` tuples read from ``faq_entries``.
        Entries whose ``embedding`` column is not decodable JSON are left out.
    """
    with get_db_connection() as connection:
        faq_rows = connection.execute(
            "SELECT id, question, answer, embedding FROM faq_entries"
        ).fetchall()

    entries = []
    for faq in faq_rows:
        try:
            entry = (
                faq['id'],
                faq['question'],
                faq['answer'],
                json.loads(faq['embedding']),
            )
        except (json.JSONDecodeError, TypeError):
            # Unusable embedding: drop this entry and carry on.
            continue
        entries.append(entry)
    return entries
54
+
55
def log_question(question: str, session_id: str = None, category: str = None, answer: str = None):
    """Record a user question, with its context, in ``question_logs``.

    Args:
        question: The user's question text.
        session_id: Session the question belongs to, if known.
        category: Category assigned to the question, if any.
        answer: The answer that was returned, if any.

    The full four-column insert is tried first. If SQLite reports an
    ``OperationalError`` (the code treats this as an older schema without the
    extra columns), only the question text is stored.
    """
    with get_db_connection() as connection:
        try:
            connection.execute("""
                INSERT INTO question_logs (session_id, question, category, answer)
                VALUES (?, ?, ?, ?)
            """, (session_id, question, category, answer))
        except sqlite3.OperationalError:
            # Fallback for older schema versions
            connection.execute(
                "INSERT INTO question_logs (question) VALUES (?)", (question,)
            )

        connection.commit()
74
+
75
def get_session_state(session_id: str) -> Dict[str, Any]:
    """Look up the stored state for ``session_id``.

    Returns:
        The matching ``user_sessions`` row as a dict, or a default state
        (no preference, zero counters, empty JSON knowledge context) when the
        session has no row yet.
    """
    with get_db_connection() as connection:
        stored = connection.execute(
            "SELECT * FROM user_sessions WHERE session_id = ?", (session_id,)
        ).fetchone()
        if not stored:
            return {
                "preference": None,
                "msg_count": 0,
                "clarification_count": 0,
                "knowledge_context": "{}",
            }
        return dict(stored)
84
+
85
def update_session_state(session_id: str, preference: str = None, increment_count: bool = True, increment_clarification: bool = False, reset_clarification: bool = False, knowledge_update: Dict = None):
    """Create or update the ``user_sessions`` row for ``session_id``.

    Args:
        session_id: Key of the session row.
        preference: New preference; a falsy value keeps the stored one.
        increment_count: Add one to the message counter when true.
        increment_clarification: Add one to the clarification counter when
            true (ignored if ``reset_clarification`` is also true).
        reset_clarification: Set the clarification counter back to zero.
        knowledge_update: Keys to merge into the stored knowledge dictionary.

    When the message counter would exceed 10, the session memory is reset:
    the counter restarts at 1 and the preference, knowledge and clarification
    counter are cleared before ``knowledge_update`` is merged.
    """
    with get_db_connection() as conn:
        cur = conn.cursor()

        cur.execute("SELECT preference, msg_count, clarification_count, knowledge_context FROM user_sessions WHERE session_id = ?", (session_id,))
        row = cur.fetchone()

        current_knowledge = {}
        if row:
            curr_pref, curr_count, curr_clarification, curr_knowledge_json = row

            # Only decoding problems are tolerated here (NULL column, corrupt
            # text); a bare ``except`` would also hide unrelated errors.
            try:
                loaded = json.loads(curr_knowledge_json)
            except (ValueError, TypeError):
                loaded = None
            # Valid JSON that is not an object (e.g. ``null`` or a list) cannot
            # be merged into, so treat it like an empty context.
            current_knowledge = loaded if isinstance(loaded, dict) else {}

            new_pref = preference if preference else curr_pref
            new_count = curr_count + 1 if increment_count else curr_count

            # 10-Message Memory Rule: Reset if we hit the limit
            if new_count > 10:
                print(f"🔄 Session {session_id} reached 10 messages. Resetting memory context.")
                new_count = 1
                # NOTE(review): a preference passed on the resetting call is
                # discarded here while knowledge_update below is still merged;
                # confirm that asymmetry is intended.
                new_pref = None
                current_knowledge = {}
                new_clarification = 0
            elif reset_clarification:
                new_clarification = 0
            elif increment_clarification:
                new_clarification = curr_clarification + 1
            else:
                new_clarification = curr_clarification

            # Merge knowledge updates
            if knowledge_update:
                current_knowledge.update(knowledge_update)

            new_knowledge_json = json.dumps(current_knowledge)

            cur.execute("""
                UPDATE user_sessions
                SET preference = ?, msg_count = ?, clarification_count = ?, knowledge_context = ?, last_updated = CURRENT_TIMESTAMP
                WHERE session_id = ?
            """, (new_pref, new_count, new_clarification, new_knowledge_json, session_id))
        else:
            # First message of a session: start the counters from this call.
            new_pref = preference
            new_count = 1 if increment_count else 0
            new_clarification = 1 if increment_clarification else 0

            if knowledge_update:
                current_knowledge.update(knowledge_update)
            new_knowledge_json = json.dumps(current_knowledge)

            cur.execute("""
                INSERT INTO user_sessions (session_id, preference, msg_count, clarification_count, knowledge_context)
                VALUES (?, ?, ?, ?, ?)
            """, (session_id, new_pref, new_count, new_clarification, new_knowledge_json))

        conn.commit()
getscene_ai.sqlite ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ee0236c123938dcf755d0808488ed9b2696759e5ce3e14c852e7ce4b513115f
3
+ size 10178560
scraper.py ADDED
@@ -0,0 +1,331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+ import re
4
+ from bs4 import BeautifulSoup
5
+ from typing import List, Dict, Any, Tuple
6
+ from utils import clean_time
7
+
8
def scrape_workshops_from_squarespace(url: str) -> List[Dict[str, str]]:
    """Fetch ``url`` and return the workshops found on it.

    The ``?format=json`` variant of the page is tried first; if it does not
    return HTTP 200, is not valid JSON, or yields no workshops, the plain HTML
    page is fetched and parsed instead.

    Returns:
        A list of workshop dicts, or ``[]`` when nothing is found or any
        error occurs (the error is printed, not raised).
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Attempt 1: Squarespace JSON API
        json_url = f"{url}?format=json"
        print(f"🔍 Trying Squarespace JSON API: {json_url}")

        api_response = requests.get(json_url, headers=request_headers, timeout=10)
        if api_response.status_code == 200:
            try:
                found = extract_workshops_from_json(api_response.json(), json_url)
            except json.JSONDecodeError:
                print("❌ Invalid JSON response, falling back to HTML")
            else:
                if found:
                    print(f"✅ Extracted {len(found)} workshops from JSON API")
                    return found
                print("❌ No workshops found in JSON, falling back to HTML")

        # Attempt 2: plain HTML page
        print(f"📄 Falling back to HTML scraping for {url}")
        page = requests.get(url, headers=request_headers, timeout=10)
        page.raise_for_status()

        found = parse_workshops_from_html(
            BeautifulSoup(page.content, 'html.parser'), url
        )
        if not found:
            print("❌ No workshops found in HTML")
            return []

        print(f"✅ Extracted {len(found)} workshops from HTML parsing")
        return found

    except Exception as e:
        # Best-effort scraper: report the problem and return nothing.
        print(f"❌ Error scraping workshops from {url}: {e}")
        return []
52
+
53
def extract_workshops_from_json(data: Any, source_url: str) -> List[Dict[str, str]]:
    """Pull workshops out of a decoded Squarespace JSON payload.

    Only the ``mainContent`` entry is used: when ``data`` is a dict whose
    ``mainContent`` value is a string, that string is parsed as HTML and
    handed to ``parse_workshops_from_html``. Any other payload yields ``[]``.
    """
    if not isinstance(data, dict) or 'mainContent' not in data:
        return []

    markup = data['mainContent']
    if not isinstance(markup, str):
        return []

    print(f"🎯 Found mainContent HTML! Length: {len(markup)} characters")
    return parse_workshops_from_html(BeautifulSoup(markup, 'html.parser'), source_url)
70
+
71
def parse_workshops_from_html(soup, source_url: str) -> List[Dict[str, str]]:
    """Extract workshop records from a parsed HTML document.

    Two passes are made and their results de-duplicated with
    ``is_duplicate_workshop``:

    1. Container pass: every ``div``/``section``/``article`` whose class
       matches item/card/product/workshop/class is reduced to its text and,
       if it mentions a workshop-related keyword, parsed with
       ``extract_single_workshop_from_text``.
    2. Pattern pass: three regular expressions are run over the whole page
       text and each match is parsed with ``parse_refined_workshop_match``.

    Args:
        soup: Parsed document exposing ``find_all`` and ``get_text``.
        source_url: URL recorded on every extracted workshop.

    Returns:
        The list of unique workshop dicts (possibly empty).
    """
    workshops = []
    workshop_texts = set()

    print(f"🔍 ENHANCED HTML PARSING:")

    # Method 1: Find individual workshop containers
    potential_containers = soup.find_all(
        ['div', 'section', 'article'],
        attrs={'class': re.compile(r'(item|card|product|workshop|class)', re.I)},
    )

    print(f" Found {len(potential_containers)} potential workshop containers")

    container_keywords = ['with', 'casting', 'director', 'agent', 'perfect submission', 'crush the callback', 'get scene']

    for container in potential_containers:
        workshop_text = container.get_text(strip=True)

        # Skip fragments too short to hold a listing, and text already used.
        if len(workshop_text) < 30 or workshop_text in workshop_texts:
            continue

        lowered = workshop_text.lower()
        if any(keyword in lowered for keyword in container_keywords):
            workshop = extract_single_workshop_from_text(workshop_text, source_url)
            if workshop and not is_duplicate_workshop(workshop, workshops):
                workshops.append(workshop)
                workshop_texts.add(workshop_text)

    # Method 2: Pattern-based extraction from full text
    all_text = soup.get_text()

    workshop_patterns = [
        # Pattern 1: "Workshop Title with Professional Title Name on Date @ Time"
        r'((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+with\s+((?:Casting\s+Director|DDO\s+Agent|Manager|Director|Producer|Agent|Acting\s+Coach|Talent\s+Agent|Executive\s+Casting\s+Producer)\s+[A-Za-z\s]+?)\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern 2: "Professional Title Name, Workshop Title on Date @ Time"
        r'((?:Atlanta\s+Models\s+&\s+Talent\s+President|Talent\s+Agent|Casting\s+Director|Manager|Director|Producer|Agent)\s+[A-Za-z\s]+?),\s+((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern 3: "Casting Director Name, Date @ Time"
        r'(Casting\s+Director)\s+([A-Za-z\s\-]+?),\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*(?:at\s+)?([0-9:]+\s*(?:AM|PM))?',
    ]

    for pattern_num, pattern in enumerate(workshop_patterns, start=1):
        # finditer yields Match objects. The downstream parser calls
        # match.group(n); the tuples produced by re.findall have no .group(),
        # so every match used to fail and this pass found nothing.
        for match in re.finditer(pattern, all_text, re.IGNORECASE):
            workshop = parse_refined_workshop_match(match, pattern_num, source_url)
            if workshop and not is_duplicate_workshop(workshop, workshops):
                workshops.append(workshop)

    print(f"🎯 TOTAL UNIQUE WORKSHOPS FOUND: {len(workshops)}")
    return workshops
119
+
120
def extract_single_workshop_from_text(text: str, source_url: str) -> Dict[str, str]:
    """Parse one block of listing text into a workshop dict.

    Prices ending in ``.00``, the words "Featured"/"Sold Out" and runs of
    whitespace are stripped first. The cleaned text is then tried against
    patterns A-G in order; the first one that matches is handed to
    ``parse_pattern_match`` together with its index.

    Returns:
        Whatever ``parse_pattern_match`` returns for the first matching
        pattern, or ``None`` when no pattern matches.
    """
    # Clean-up steps, applied in this order: (regex, replacement, flags).
    cleanup_steps = (
        (r'\$[0-9,]+\.00', '', 0),
        (r'Featured|Sold Out', '', re.IGNORECASE),
    )
    for regex, replacement, flags in cleanup_steps:
        text = re.sub(regex, replacement, text, flags=flags)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'\n+', ' ', text)

    listing_patterns = (
        # Pattern A: "Title with Professional Name on Date @ Time"
        r'((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+with\s+((?:Casting\s+Director|CD|DDO\s+Agent|Manager|Director|Producer|Agent|Acting\s+Coach|Talent\s+Agent|Executive\s+Casting\s+Producer|Atlanta\s+Models\s+&\s+Talent\s+President)\s+[A-Za-z\s\-]+?)\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern B: "Professional Name, Title on Date @ Time"
        r'((?:Atlanta\s+Models\s+&\s+Talent\s+President|Talent\s+Agent|Casting\s+Director|Casting\s+Associate|Manager|Director|Producer|Agent|Executive\s+Casting\s+Producer)\s+[A-Za-z\s\-]+?),\s+((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern C: "Casting Director Name, Date at Time"
        r'(Casting\s+Director|Casting\s+Associate)\s+([A-Za-z\s\-]+?),\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*(?:at\s+)?([0-9:]+\s*(?:AM|PM))?',

        # Pattern D: "Company Executive Producer Name on Date"
        r"([A-Za-z']+\s+(?:Executive\s+Casting\s+Producer|Studios\s+Casting\s+Associate))\s+([A-Za-z\s]+?)\s+(?:on\s+)?(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?",

        # Pattern E: "Company Agent Name Date" (the "on" is optional)
        r'([A-Za-z\s]+)\s+(Agent|Talent)\s+([A-Za-z\s]+?)\s+(?:on\s+)?(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern F: "Company, Person, Title on Date"
        r'([A-Za-z\s]+\s+Talent),\s+([A-Za-z\s\.]+?),\s+((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern G: Flexible fallback (must span the whole cleaned text)
        r'^([A-Za-z\s&\']{3,25}(?:Director|Agent|Manager|Producer|President|Coach))\s+([A-Za-z\s\-]{3,30}?)\s+(?:on\s+)?(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?$',
    )

    for index, regex in enumerate(listing_patterns):
        hit = re.search(regex, text, re.IGNORECASE)
        if hit is None:
            continue
        # First matching pattern wins; later patterns are not tried.
        return parse_pattern_match(hit, index, source_url)

    return None
158
+
159
def parse_pattern_match(match, pattern_index: int, source_url: str) -> Dict[str, str]:
    """Turn a regex match into a workshop dict according to its pattern index.

    Args:
        match: Match object produced by one of the listing patterns.
        pattern_index: 0-5 select the group layouts of patterns A-F; any other
            value is treated as the fallback pattern G.
        source_url: URL stored on the resulting record.

    Returns:
        A dict with ``title``, ``instructor_name``, ``instructor_title``,
        ``date``, ``time``, ``full_text`` and ``source_url``; or ``None`` when
        no instructor name or date could be extracted, when the fallback
        candidate looks implausible, or when any exception occurs (it is
        printed and swallowed).
    """
    # Defaults so every field exists regardless of which branch runs.
    workshop_title = instructor_title = instructor_name = ""
    date_str = time_str = ""

    def required(group_no):
        # Stripped text of a group that must have participated in the match.
        return match.group(group_no).strip()

    def optional(group_no):
        # Stripped text of a group, or "" when it is empty/absent.
        value = match.group(group_no)
        return value.strip() if value else ""

    try:
        if pattern_index == 0:  # Pattern A: title, professional, date, time
            workshop_title = required(1)
            professional_full = required(2)
            date_str = required(3)
            time_str = optional(4)

            # Expand the "CD" abbreviation before splitting title from name.
            if professional_full.startswith('CD '):
                professional_full = 'Casting Director ' + professional_full[3:]

            instructor_title, instructor_name = parse_professional_info(professional_full)

        elif pattern_index == 1:  # Pattern B: professional, title, date, time
            professional_full = required(1)
            workshop_title = required(2)
            date_str = required(3)
            time_str = optional(4)

            instructor_title, instructor_name = parse_professional_info(professional_full)

        elif pattern_index in (2, 3):  # Patterns C and D share one layout
            instructor_title = required(1)
            instructor_name = required(2)
            date_str = required(3)
            time_str = optional(4)
            workshop_title = "Casting Workshop" if pattern_index == 2 else "Industry Workshop"

        elif pattern_index == 4:  # Pattern E: company, agent type, name, date, time
            company_name = required(1)
            agent_type = required(2)
            instructor_name = required(3)
            date_str = required(4)
            # Group 5 is only read when the pattern actually defines it.
            time_str = optional(5) if len(match.groups()) > 4 else ""

            instructor_title = f"{company_name} {agent_type}"
            workshop_title = "Industry Workshop"

        elif pattern_index == 5:  # Pattern F: company, person, title, date, time
            company_name = required(1)
            instructor_name = required(2)
            workshop_title = required(3)
            date_str = required(4)
            time_str = optional(5) if len(match.groups()) > 4 else ""

            instructor_title = company_name

        else:  # Pattern G: loose fallback
            professional_full = required(1) + " " + required(2)
            date_str = required(3)
            time_str = optional(4)
            workshop_title = "Industry Workshop"

            # Reject candidates that are too long or span lines.
            if len(professional_full) > 50 or '\n' in professional_full:
                return None

            instructor_title, instructor_name = parse_professional_info(professional_full)

        if instructor_name and date_str:
            # full_text is the string the embedding system consumes.
            pieces = [
                f"{workshop_title} with {instructor_title} {instructor_name}",
                f" on {date_str}",
            ]
            if time_str:
                pieces.append(f" at {clean_time(time_str)}")

            return {
                'title': workshop_title,
                'instructor_name': instructor_name,
                'instructor_title': instructor_title,
                'date': date_str,
                'time': clean_time(time_str),
                'full_text': ''.join(pieces),
                'source_url': source_url
            }

    except Exception as e:
        print(f"Error parsing pattern match: {e}")

    return None
254
+
255
def parse_professional_info(professional_full: str) -> tuple:
    """Split a "title + name" string into ``(title, name)``.

    Resolution order:

    1. The first known multi-word title contained in the string. If the
       string starts with it, the name is what follows; otherwise the name is
       what precedes it (trailing comma removed).
    2. The first word that is a known single-word title, optionally combined
       with a preceding qualifier such as "Casting" or "Talent"; the name is
       the remaining words.
    3. With two or more words, the first word is the title and the rest the
       name. Otherwise the title is empty and the whole string is the name.
    """
    cleaned = re.sub(r'\s+', ' ', professional_full).strip()

    # Checked in this order; the first title found in the string wins.
    multi_word_titles = (
        'Atlanta Models & Talent President',
        'Executive Casting Producer',
        'Casting Director',
        'Casting Associate',
        'DDO Agent',
        'Talent Agent',
        'Acting Coach',
    )
    for title in multi_word_titles:
        before, found, after = cleaned.partition(title)
        if not found:
            continue
        if before:
            # Title appears after the name: "Jane Doe, Talent Agent".
            return title, before.strip().rstrip(',')
        return title, after.strip()

    simple_titles = ['Manager', 'Director', 'Producer', 'Agent', 'Coach', 'President']
    qualifiers = ['Casting', 'Talent', 'Executive', 'DDO', 'Acting']

    tokens = cleaned.split()
    for idx, token in enumerate(tokens):
        if token not in simple_titles:
            continue

        if idx > 0 and tokens[idx - 1] in qualifiers:
            # Fold the qualifier into the title and drop both from the name.
            title = f"{tokens[idx - 1]} {token}"
            remaining = tokens[:idx - 1] + tokens[idx + 1:]
        else:
            title = token
            remaining = tokens[:idx] + tokens[idx + 1:]

        return title, ' '.join(remaining).strip()

    # No recognised title at all.
    if len(tokens) >= 2:
        return tokens[0], ' '.join(tokens[1:])
    return '', cleaned
303
+
304
def parse_refined_workshop_match(match, pattern_num: int, source_url: str) -> Dict[str, str]:
    """Parse ``match`` for a 1-based pattern number.

    Thin adapter over ``parse_pattern_match``, which expects a 0-based index.
    """
    zero_based_index = pattern_num - 1
    return parse_pattern_match(match, zero_based_index, source_url)
307
+
308
def is_duplicate_workshop(new_workshop: Dict, existing_workshops: List[Dict]) -> bool:
    """Report whether ``new_workshop`` repeats an entry of ``existing_workshops``.

    Two workshops are duplicates when instructor name and date are equal
    (ignoring case and surrounding whitespace) and their titles are equal,
    both contain the word "workshop", or one title contains the other.
    """
    def field(record, key):
        # Normalised form used for every comparison below.
        return record.get(key, '').strip().lower()

    for candidate in existing_workshops:
        if field(candidate, 'instructor_name') != field(new_workshop, 'instructor_name'):
            continue
        if field(candidate, 'date') != field(new_workshop, 'date'):
            continue

        known_title = field(candidate, 'title')
        incoming_title = field(new_workshop, 'title')

        titles_overlap = (
            known_title == incoming_title
            or ('workshop' in known_title and 'workshop' in incoming_title)
            or known_title in incoming_title
            or incoming_title in known_title
        )
        if titles_overlap:
            return True

    return False
322
+
323
def calculate_workshop_confidence(w: Dict) -> float:
    """Score how complete a workshop record is, from 0.0 to 1.0.

    Each field that is present and truthy adds its weight: title 0.3,
    instructor name 0.3, date 0.2, time 0.1, source URL 0.1. The total is
    rounded to two decimals.
    """
    field_weights = (
        ('title', 0.3),
        ('instructor_name', 0.3),
        ('date', 0.2),
        ('time', 0.1),
        ('source_url', 0.1),
    )

    total = 0.0
    for field_name, weight in field_weights:
        if w.get(field_name):
            total += weight
    return round(total, 2)
utils.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ import numpy as np
3
+ import re
4
+ from typing import List, Tuple
5
+ from config import EMBED_MODEL
6
+
7
def get_embedding(text: str) -> List[float]:
    """Return the embedding vector for ``text`` from the OpenAI embeddings API.

    Newlines are replaced with spaces and the text is stripped before it is
    sent as a single-item batch using the ``EMBED_MODEL`` model.
    """
    single_line = text.replace("\n", " ").strip()
    result = openai.embeddings.create(input=[single_line], model=EMBED_MODEL)
    # One input was sent, so the first data item holds its embedding.
    first_item = result.data[0]
    return first_item.embedding
12
+
13
def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Args:
        a: First vector.
        b: Second vector, of the same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)`` as a built-in ``float``, or ``0.0`` when
        either vector has zero length (the ratio is undefined there).
    """
    vec_a = np.asarray(a, dtype=float)
    vec_b = np.asarray(b, dtype=float)

    # Each norm is computed once and reused for the guard and the ratio.
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    if norm_a == 0 or norm_b == 0:
        return 0.0

    # Convert the NumPy scalar so the result matches the annotated type.
    return float(np.dot(vec_a, vec_b) / (norm_a * norm_b))
20
+
21
def clean_time(time_str: str) -> str:
    """Normalise a time string to ``H:MM AM/PM`` where possible.

    The first ``<1-2 digits>[:]<0-2 digits> AM|PM`` occurrence (any case) is
    reformatted with an upper-case meridiem; missing minutes become ``00``.
    Empty input gives ``""``; text with no such occurrence is returned
    stripped of surrounding whitespace.
    """
    if not time_str:
        return ""

    found = re.search(r'(\d{1,2}):?(\d{0,2})\s*(AM|PM)', time_str, re.IGNORECASE)
    if found is None:
        # Nothing recognisable as a clock time: hand the text back trimmed.
        return time_str.strip()

    hour, minute, meridiem = found.groups()
    return f"{hour}:{minute or '00'} {meridiem.upper()}"
34
+
35
def find_top_k_matches(user_embedding, dataset, k=3):
    """Return the ``k`` dataset entries most similar to ``user_embedding``.

    Args:
        user_embedding: Query vector.
        dataset: Iterable of ``(entry_id, text, embedding)`` triples.
        k: Maximum number of results.

    Returns:
        Up to ``k`` ``(score, entry_id, text)`` tuples in descending order,
        where ``score`` is the cosine similarity to the query.
    """
    ranked = sorted(
        (
            (cosine_similarity(user_embedding, emb), entry_id, text)
            for entry_id, text, emb in dataset
        ),
        reverse=True,
    )
    return ranked[:k]