Spaces:

MLBench
/

getscenes

Running

App Files Files Community

saim1309 committed 27 days ago

Commit

e7f736a

verified ·

1 Parent(s): 407925d

Upload 2 files

Browse files

Files changed (2) hide show

app.py +802 -0
scraper.py +347 -0

app.py ADDED Viewed

	@@ -0,0 +1,802 @@

+import gradio as gr
+import openai
+import json
+from datetime import datetime, timedelta
+import uuid
+from typing import Dict
+from config import OPENAI_API_KEY, DB_PATH, EMBED_MODEL
+from utils import get_embedding, cosine_similarity, find_top_k_matches
+from scraper import scrape_workshops_from_squarespace
+from database import (
+    fetch_all_embeddings,
+    fetch_row_by_id,
+    fetch_all_faq_embeddings,
+    get_session_state,
+    update_session_state,
+    log_question
+)
+# ============================================================================
+# CONFIGURATION
+# ============================================================================
# Refuse to start without an API key; otherwise hand it to the OpenAI client.
if OPENAI_API_KEY:
    openai.api_key = OPENAI_API_KEY
else:
    raise ValueError("OPENAI_API_KEY not found in .env file")

# Module-level conversation identifier; reset_session() swaps in a new one.
session_id = str(uuid.uuid4())

# In-memory cache of scraped workshops and their embeddings.
# 'last_updated' stays None until the first successful refresh.
workshop_cache = dict(
    data=[],
    embeddings=[],
    last_updated=None,
    cache_duration=timedelta(hours=24),
)
+# ============================================================================
+# KEYWORD LISTS FOR ROUTING
+# ============================================================================
# Phrases signalling the user is discouraged or anxious; matched as
# case-insensitive substrings of the question.
EMOTIONAL_KEYWORDS = [
    "stuck", "frustrated", "discouraged", "overwhelmed",
    "scared", "nervous", "anxious", "worried",
    "fear", "doubt", "confidence", "insecure",
    "lost", "confused", "struggling", "hard time",
    "giving up", "burnout", "rejection", "failed",
    "can't", "feeling", "feel", "emotional",
    "depressed", "sad", "unmotivated", "hopeless",
    "stressed", "pressure", "imposter",
]

# Phrases signalling the user wants concrete career results.
ACTION_KEYWORDS = [
    "get an agent", "find agent", "need agent", "want agent",
    "sign with agent", "more auditions", "book", "booking",
    "callbacks", "improve", "better", "self-tape",
    "materials", "headshots", "reel", "network",
    "connections", "industry", "career", "strategy",
    "agent prep", "total agent prep", "workshop", "class",
    "training", "results", "success", "grow",
    "advance", "level up",
]

# Topics that are never answered by the bot and are redirected to email.
POLICY_KEYWORDS = [
    "refund", "refunds", "money back",
    "attend", "attendance", "miss", "missed", "missing", "absent",
    "late", "lateness", "tardy",
    "reschedule", "change date", "move class",
    "credit", "credits",
    "cancel", "cancellation", "canceling",
    "policy", "policies",
]

# Phrases that switch the answer from brief to detailed.
DETAIL_SYNONYMS = [
    "detail", "details", "explain", "elaborate", "tell me more",
    "more info", "describe", "thorough", "comprehensive",
]

# Persona preamble placed at the top of every system prompt.
PERSONA_INSTRUCTION = """
You are a warm, encouraging mentor at Get Scene Studios. Your goal is to help actors navigate their careers with confidence.
- Sound natural and human, not scripted or robotic. Use conversational transitions like "I'd suggest starting with..." or "A great way to approach this is..."
- Be encouraging but practical. Acknowledge that the acting journey is a marathon, not a sprint.
- Help the user THINK: Instead of just giving an answer, add a brief "mentorship flourish" that explains the value of a recommendation (e.g., "This workshop is great because it gets you comfortable with the pressure of a real callback.")
"""
+# ============================================================================
+# HELPER FUNCTIONS
+# ============================================================================
def calculate_workshop_confidence(w: Dict) -> float:
    """Score how complete a scraped workshop record is.

    Each populated (truthy) field contributes a fixed weight; the weights
    add up to 1.0 when every field is present.

    Args:
        w: Workshop record as a dict.

    Returns:
        The summed weight of the populated fields, rounded to 2 decimals.
    """
    field_weights = (
        ('title', 0.3),
        ('instructor_name', 0.3),
        ('date', 0.2),
        ('time', 0.1),
        ('source_url', 0.1),
    )
    total = 0.0
    for field_name, weight in field_weights:
        if w.get(field_name):
            total += weight
    return round(total, 2)
+# ============================================================================
+# WORKSHOP FUNCTIONS
+# ============================================================================
def get_current_workshops():
    """Return ``(workshops, embeddings)``, refreshing the cache when stale.

    Cached data is served while it is non-empty and younger than
    ``workshop_cache['cache_duration']``. Otherwise both Squarespace pages
    are scraped again, records scoring below 0.8 confidence are discarded,
    embeddings are generated, and the cache is overwritten. If the scrape
    yields no valid records, previously cached data (if any) is returned
    without touching the cache timestamp.

    Returns:
        A tuple of two parallel lists: workshop dicts and their embeddings.
    """
    cache = workshop_cache
    now = datetime.now()

    # Serve from cache while it is populated and still fresh.
    refreshed_at = cache['last_updated']
    if refreshed_at and now - refreshed_at < cache['cache_duration'] and cache['data']:
        print("Using cached workshop data")
        return cache['data'], cache['embeddings']

    print("Fetching fresh workshop data...")
    # Use robust Squarespace scraping system
    scraped = []
    for page_url in ("https://www.getscenestudios.com/online",
                     "https://www.getscenestudios.com/instudio"):
        scraped = scraped + scrape_workshops_from_squarespace(page_url)

    # Data Integrity: keep only records that look complete enough.
    accepted = []
    confidence_sum = 0
    for record in scraped:
        confidence = calculate_workshop_confidence(record)
        if confidence < 0.8:
            print(f"⚠️ Rejecting weak record (Confidence: {confidence}): {record.get('title', 'Unknown')}", flush=True)
            continue
        accepted.append(record)
        confidence_sum += confidence

    if accepted:
        avg_conf = confidence_sum / len(accepted)
    else:
        avg_conf = 0
    print(f"📊 DATA INTEGRITY: Found {len(scraped)} total, {len(accepted)} valid (Confidence >= 0.8)", flush=True)
    print(f"📈 Retrieval Confidence: {avg_conf:.2f} (Average)", flush=True)

    if not accepted:
        if not cache['data']:
            print("No workshop data available")
            return [], []
        print("Scraping failed, using cached data")
        return cache['data'], cache['embeddings']

    # One embedding per accepted workshop; a zero vector stands in on
    # failure so the two lists stay aligned.
    vectors = []
    for record in accepted:
        try:
            vectors.append(get_embedding(record['full_text']))
        except Exception as e:
            print(f"Error generating embedding for workshop: {e}")
            vectors.append([0] * 1536)

    # Update cache
    cache['data'] = accepted
    cache['embeddings'] = vectors
    cache['last_updated'] = now
    print(f"Cached {len(accepted)} workshops")
    return accepted, vectors
def find_top_workshops(user_embedding, k=3):
    """Rank the current workshops by similarity to the user's question.

    Args:
        user_embedding: Embedding vector of the user's question.
        k: Maximum number of matches to return.

    Returns:
        Up to ``k`` tuples ``(score, index, full_text, workshop)`` sorted
        from best to worst; an empty list when no workshops are available.
    """
    workshops, workshop_embeddings = get_current_workshops()
    if not workshops:
        return []

    ranked = []
    for position, pair in enumerate(zip(workshops, workshop_embeddings)):
        record, vector = pair
        try:
            similarity = cosine_similarity(user_embedding, vector)
            ranked.append((similarity, position, record['full_text'], record))
        except Exception as e:
            # A workshop that cannot be scored is skipped, not fatal.
            print(f"Error calculating similarity: {e}")
            continue

    return sorted(ranked, reverse=True)[:k]
+# ============================================================================
+# PROMPT BUILDING FUNCTIONS
+# ============================================================================
def generate_enriched_links(row):
    """Build a one-item list with a markdown link to a podcast episode.

    The link text names the guest and is followed by the first sentence of
    the episode's first highlight summary (truncated to 120 characters).
    Missing or malformed data degrades to generic wording instead of raising.

    Args:
        row: Dict-like podcast record with ``youtube_url``, ``guest_name``
            and ``highlight_json`` (a JSON-encoded list of highlight dicts).
            May be ``None`` or have NULL columns.

    Returns:
        A list containing exactly one markdown string.
    """
    row = row or {}
    base_url = row.get("youtube_url")
    guest_name = row.get("guest_name") or ""

    # highlight_json can be NULL or corrupt in the database; treat both
    # as "no highlights" rather than failing the whole chat request.
    try:
        highlights = json.loads(row.get("highlight_json") or "[]")
    except (TypeError, ValueError):
        highlights = []

    summary = ""
    if isinstance(highlights, list) and highlights and isinstance(highlights[0], dict):
        candidate = highlights[0].get("summary")
        if isinstance(candidate, str):
            summary = candidate

    # Truncate summary to first sentence only
    if summary:
        first_sentence = summary.split('.')[0] + '.'
        if len(first_sentence) > 120:
            short_summary = first_sentence[:117] + "..."
        else:
            short_summary = first_sentence
    else:
        short_summary = "Industry insights for actors"

    # Avoid rendering "Watch 's episode" when the guest name is unknown.
    episode_label = f"{guest_name}'s episode" if guest_name else "this episode"
    markdown = f"🎧 [Watch {episode_label} here]({base_url}) - {short_summary}"
    return [markdown]
def build_enhanced_prompt(user_question, context_results, top_workshops, user_preference=None, enriched_podcast_links=None, wants_details=False, current_topic=None):
    """Build the system prompt with strict formatting rules.

    Two prompt shapes are produced: a short SUPPORT MODE prompt when
    ``detect_response_type`` classifies the question as emotional, and the
    standard three-recommendation prompt otherwise.

    Args:
        user_question: The user's raw question.
        context_results: Unused; kept for call-site compatibility.
        top_workshops: Ranked ``(score, index, full_text, workshop)`` tuples.
            Only the first 10 are considered and at most 3 are displayed.
        user_preference: Preferred format, e.g. ``'online'`` or ``'instudio'``.
            Hyphen, space and case variants are treated as equal.
        enriched_podcast_links: Markdown podcast links; only the first is used.
        wants_details: When true, ask the model for a thorough answer.
        current_topic: Session topic used when the question itself names none.

    Returns:
        The complete system prompt string.
    """
    online_url = "https://www.getscenestudios.com/online"
    instudio_url = "https://www.getscenestudios.com/instudio"
    # Free classes are ONLY available online (never in-studio)
    free_class_url = online_url

    def format_key(value):
        """Reduce a format name to lowercase alphanumerics for comparison."""
        return ''.join(ch for ch in str(value).lower() if ch.isalnum())

    # Preferences are stored as 'online' / 'instudio' while display names are
    # "Online" / "In-Studio". Comparing the raw lowercase strings rejected
    # every in-studio workshop, so both sides are normalized first.
    preferred_format = format_key(user_preference) if user_preference else None

    # helper for clean links
    def format_workshop(w):
        """Return the markdown bullet for one workshop, or None to skip it."""
        if not w.get('title') or not w.get('instructor_name') or not w.get('date'):
            return None
        source_url = w.get('source_url') or ''
        link = instudio_url if "/instudio" in source_url else online_url
        # User Preference Filtering
        w_type = "Online" if "online" in source_url else "In-Studio"
        if preferred_format and preferred_format != format_key(w_type):
            return None
        # Skip records too incomplete to recommend.
        if calculate_workshop_confidence(w) < 0.70:
            return None
        # R2: Force format inclusion into the title link for robustness
        display_title = f"{w['title']} ({w_type})"
        return f"- [{display_title}]({link}) with {w['instructor_name']} on {w['date']} at {w.get('time', '')}"

    # Check the top 10 candidates, display at most the first 3 that survive.
    workshop_lines = []
    for _, _, _, w_data in (top_workshops or [])[:10]:
        formatted = format_workshop(w_data)
        if formatted:
            workshop_lines.append(formatted)

    if workshop_lines:
        workshop_text = "\n".join(workshop_lines[:3])
    else:
        # Fallback: point at the schedule page matching the preference.
        display_names = {'online': 'Online', 'instudio': 'In-Studio'}
        label = ""
        if user_preference:
            label = display_names.get(preferred_format, str(user_preference).capitalize()) + " "
        link = instudio_url if preferred_format == 'instudio' else online_url
        workshop_text = f"We are constantly updating our schedule! Check our current {label}availability and latest workshops at {link}"

    # Handle missing podcast data strictly
    if not enriched_podcast_links:
        single_podcast = "Our latest industry insights are available on YouTube: https://www.youtube.com/@GetSceneStudios"
    else:
        single_podcast = enriched_podcast_links[0]

    # --- EMOTIONAL / SUPPORT MODE CHECK ---
    if detect_response_type(user_question) == "support":
        prompt = f"""{PERSONA_INSTRUCTION}
You are acting in SUPPORT MODE.
CRITICAL INSTRUCTIONS:
1. ACKNOWLEDGE their feelings first (e.g., "I hear how frustrating it is to feel stuck...").
2. Provide SUPPORTIVE language (2-3 sentences max).
3. Offer EXACTLY ONE gentle follow-up resource: either the podcast OR the free class.
4. DO NOT suggest paid workshops or upsell in this response.
5. KEEP IT BRIEF (≤150 words).
USER'S QUESTION: {user_question}
REQUIRED RESPONSE FORMAT:
[Your empathetic, supportive acknowledgment]
Here's a free resource that might help you move forward:
[Pick ONE: {single_podcast} OR Free Class at {free_class_url}]
Questions? Contact info@getscenestudios.com"""
        return prompt

    # --- STANDARD LOGIC FOR CONTEXT SNIPPET ---
    question_lower = user_question.lower()

    # Priority 1: direct keywords in the current question (first match wins).
    keyword_topics = (
        ('agent', ['agent', 'representation', 'rep', 'manager']),
        ('beginner', ['beginner', 'new', 'start', 'beginning']),
        ('audition', ['callback', 'audition', 'tape', 'self-tape', 'booking']),
        ('mentorship', ['mentorship', 'coaching']),
        ('pricing', ['price', 'cost', 'how much']),
    )
    detected_topic = next(
        (topic for topic, words in keyword_topics
         if any(word in question_lower for word in words)),
        None,
    )

    # Priority 2: Fallback to session context if current question is ambiguous
    if not detected_topic and current_topic:
        topic_map = {
            'agent_seeking': 'agent',
            'beginner': 'beginner',
            'audition_help': 'audition',
            'mentorship': 'mentorship',
            'pricing': 'pricing'
        }
        detected_topic = topic_map.get(current_topic)

    # Assign snippet based on topic
    topic_snippets = {
        'agent': "Get Scene Studios has helped 1000+ actors land representation. Total Agent Prep offers live practice with working agents (age 16+, limited to 12 actors).",
        'beginner': "Get Scene Studios specializes in getting actors audition-ready fast with camera technique and professional self-tape skills.",
        'audition': "Get Scene offers Crush the Callback (Zoom simulation) and Perfect Submission (self-tape mastery) for actors refining their technique.",
        'mentorship': "Working Actor Mentorship is a 6-month program ($3,000) with structured feedback and industry access.",
        'pricing': "Get Scene Studios pricing varies by program. Most workshops cap at 12-14 actors for personalized feedback.",
    }
    context_snippet = topic_snippets.get(
        detected_topic,
        "Get Scene Studios (founded by Jesse Malinowski) offers training for TV/film actors at all levels.",
    )

    if not user_preference:
        preference_instruction = """
IMPORTANT: We need to know if the user prefers "Online" or "In-Studio" workshops.
If their question implies a location or they haven't specified, ask: "Are you looking for Online or In-Studio training?" as part of your response.
"""
    else:
        preference_instruction = f"""
USER PREFERENCE KNOWN: {user_preference.upper()}
1. DO NOT ask "Online or In-Studio" again.
2. Ensure your recommendations align with {user_preference.upper()} where possible.
"""

    # Brevity & Cognitive Load: Direct instructions based on user intent
    detail_instruction = "Answer the user's question briefly (2-3 sentences max, ≤150 words total)."
    if wants_details:
        detail_instruction = "Provide a detailed and thorough explanation for the user's request, but keep it structured and readable."

    prompt = f"""{PERSONA_INSTRUCTION}
{context_snippet}
CRITICAL INSTRUCTIONS:
- {detail_instruction}
- Use natural, human transitions between your answer and the recommendations.
- For each recommendation, add a tiny bit of "mentor advice" on why it helps.
- Then ALWAYS provide exactly these three numbered recommendations (1. 2. 3.):
- Use ONLY the provided links - do not invent recommendations
- Every workshop Title MUST be followed by its format in parentheses, e.g., "Workshop Name (Online)" or "Workshop Name (In-Studio)".
- Focus on clean, readable formatting.{preference_instruction}
USER'S QUESTION: {user_question}
REQUIRED RESPONSE FORMAT:
[Your brief answer to their question, ≤150 words total]
Here's your path forward:
1. Free class (start here, no credit card required): {free_class_url}
2. Recommended podcast episode:
{single_podcast}
3. Relevant paid workshop:
{workshop_text}
Questions? Contact info@getscenestudios.com"""
    return prompt
+# ============================================================================
+# DETECTION FUNCTIONS
+# ============================================================================
def detect_question_category(question):
    """Return every category whose keywords appear in the question.

    Matching is a case-insensitive substring test, so a keyword also matches
    inside longer words. Categories are returned in declaration order.

    Args:
        question: The user's raw question.

    Returns:
        A list of category names; empty when nothing matches.
    """
    category_keywords = (
        ('agent_seeking', ['agent', 'representation', 'rep', 'manager', 'get an agent']),
        ('beginner', ['beginner', 'new', 'start', 'beginning', 'first time', 'never acted']),
        ('audition_help', ['audition', 'callback', 'tape', 'self-tape', 'submission']),
        ('mentorship', ['mentorship', 'coaching', 'intensive', 'mentor', 'one-on-one']),
        ('pricing', ['price', 'cost', 'pricing', '$', 'money', 'payment', 'fee']),
        ('classes', ['class', 'workshop', 'training', 'course', 'learn']),
        ('membership', ['membership', 'join', 'member', 'gsp', 'plus']),
        ('technical', ['self-tape', 'equipment', 'lighting', 'editing', 'camera']),
    )
    text = question.lower()
    return [
        name
        for name, keywords in category_keywords
        if any(keyword in text for keyword in keywords)
    ]
def detect_response_type(question):
    """Classify a question as ``"support"`` (emotional) or ``"standard"``.

    Counts how many emotional and how many action keywords occur as
    substrings of the lowercased question. The result is ``"support"`` only
    when at least one emotional keyword is present and emotional keywords
    are at least as numerous as action keywords.
    """
    text = question.lower()
    emotional_hits = len([kw for kw in EMOTIONAL_KEYWORDS if kw in text])
    action_hits = len([kw for kw in ACTION_KEYWORDS if kw in text])
    is_support = emotional_hits > 0 and emotional_hits >= action_hits
    return "support" if is_support else "standard"
def detect_policy_issue(question):
    """Return True when the question mentions any policy keyword.

    Keywords are matched as substrings of the lowercased question.
    """
    text = question.lower()
    for keyword in POLICY_KEYWORDS:
        if keyword in text:
            return True
    return False
def detect_preference(question):
    """Infer the training format the user is asking for.

    Returns:
        ``'online'`` when only "online" is mentioned, ``'instudio'`` when
        "studio", "person" or "atlanta" is mentioned without "online", and
        ``None`` when neither or both formats are mentioned.
    """
    text = question.lower()
    mentions_studio = 'studio' in text

    if 'online' in text:
        # Mentioning both formats is ambiguous, so no preference is recorded.
        return None if mentions_studio else 'online'

    if mentions_studio or 'person' in text or 'atlanta' in text:
        return 'instudio'
    return None
def get_contextual_business_info(categories):
    """Look up program information for the detected question categories.

    Args:
        categories: Iterable of category names.

    Returns:
        A dict mapping each recognised category to its ``programs``,
        ``key_info`` and ``journey`` entry. Categories without an entry
        are ignored.
    """
    context_map = {
        'agent_seeking': {
            'programs': ['Total Agent Prep', 'Working Actor Mentorship'],
            'key_info': 'Live pitch practice with real agents, Actors Access optimization',
            'journey': 'Total Agent Prep → GSP → Mentorship for sustained progress',
        },
        'beginner': {
            'programs': ['Free Classes', 'Get Scene 360', 'Get Scene Plus'],
            'key_info': 'Start with holistic foundation, build consistency',
            'journey': 'Free class → Get Scene 360 → GSP membership',
        },
        'audition_help': {
            'programs': ['Perfect Submission', 'Crush the Callback', 'Audition Insight'],
            'key_info': 'Self-tape mastery, callback simulation, pro feedback',
            'journey': 'Perfect Submission → GSP for ongoing Audition Insight',
        },
        'mentorship': {
            'programs': ['Working Actor Mentorship'],
            'key_info': '6-month intensive with structured feedback and accountability',
            'journey': 'Ready for commitment → WAM → Advanced workshops',
        },
    }
    return {name: context_map[name] for name in categories if name in context_map}
+# ============================================================================
+# MAIN CHATBOT LOGIC
+# ============================================================================
def update_knowledge_from_question(session_id: str, question: str):
    """Extract format and topic from a question and persist them.

    Args:
        session_id: Session whose knowledge dictionary is updated.
        question: The user's raw question.

    Returns:
        The dict of extracted attributes (``format`` and/or ``topic``), or
        an empty dict when nothing was extracted. The session is only
        written to when something was extracted.
    """
    extracted = {}

    # Extract Format
    preferred_format = detect_preference(question)
    if preferred_format:
        extracted['format'] = preferred_format

    # Extract Topic: a specific topic wins over a generic one; otherwise
    # the first detected category is used.
    detected = detect_question_category(question)
    if detected:
        specific_topics = ('agent_seeking', 'beginner', 'audition_help', 'mentorship', 'pricing')
        extracted['topic'] = next(
            (topic for topic in specific_topics if topic in detected),
            detected[0],
        )

    if not extracted:
        return {}
    update_session_state(session_id, knowledge_update=extracted, increment_count=False)
    return extracted
def process_question(question: str, current_session_id: str):
    """Answer one user question and return the reply text.

    Routing, in order:
      0. Policy keywords -> fixed "email us" reply.
      1. FAQ match with score >= 0.85 -> FAQ answer with mentor framing.
      2. FAQ match with score >= 0.70 -> one clarification question, then
         the FAQ answer on the next ambiguous turn.
      3. No FAQ match and nothing acting-related -> fixed fallback reply.
      4. Otherwise -> chat-model answer built from workshops and podcasts.

    Args:
        question: The user's raw question.
        current_session_id: Identifier of the conversation session.

    Returns:
        The reply string shown to the user.
    """
    if not question:
        return "Question is required"

    # 0. HARD POLICY CHECK
    if detect_policy_issue(question):
        log_question(question, current_session_id)
        return "Please email info@getscenestudios.com."

    # 1. Handle Session & Knowledge State
    update_knowledge_from_question(current_session_id, question)
    session_state = get_session_state(current_session_id) or {}
    try:
        knowledge = json.loads(session_state.get('knowledge_context', '{}'))
    except (TypeError, ValueError):
        # Missing (None) or corrupt stored context: start from empty.
        knowledge = {}
    if not isinstance(knowledge, dict):
        knowledge = {}
    current_topic = knowledge.get('topic')
    user_preference = knowledge.get('format') or session_state.get('preference')
    update_session_state(current_session_id, increment_count=True)

    # Create embedding of user question
    user_embedding = get_embedding(question)

    # Check FAQ embeddings first; only the single best match is needed.
    # Selecting by score alone avoids comparing the other tuple fields.
    scored_faqs = [
        (cosine_similarity(user_embedding, emb), entry_id, question_text, answer_text)
        for entry_id, question_text, answer_text, emb in fetch_all_faq_embeddings()
    ]
    best_faq = max(scored_faqs, key=lambda item: item[0]) if scored_faqs else None
    faq_threshold = 0.85
    ambiguous_threshold = 0.70
    question_lower = question.lower()

    # If high-confidence FAQ match found
    if best_faq and best_faq[0] >= faq_threshold:
        update_session_state(current_session_id, reset_clarification=True, increment_count=False)
        answer_text = best_faq[3]
        mentor_framing_start = "That's a great question! Here's the information on that:"
        mentor_framing_end = "I hope that clears things up! Remember, every bit of knowledge helps you steer your career in the right direction."
        enhanced_answer = f"{mentor_framing_start}\n\n{answer_text}"
        # R5: Policy Guard for FAQ answers
        if any(word in enhanced_answer.lower() for word in POLICY_KEYWORDS):
            enhanced_answer = "Please email info@getscenestudios.com for assistance with this."
        else:
            contextual_info = get_contextual_business_info(detect_question_category(question))
            next_steps = [
                f"A great next step for you: {info['journey']}"
                for info in contextual_info.values()
            ]
            if next_steps:
                joined_steps = "\n".join(next_steps)
                enhanced_answer += f"\n\n{joined_steps}"
            enhanced_answer += f"\n\n{mentor_framing_end}\n\nQuestions? Contact info@getscenestudios.com"
        log_question(question, current_session_id, answer=enhanced_answer)
        return enhanced_answer

    if best_faq and best_faq[0] >= ambiguous_threshold:
        # AMBIGUOUS ZONE: ask once for clarification, then auto-resolve.
        is_generic_query = any(w in question_lower for w in ['price', 'cost', 'how much', 'schedule', 'when'])
        needs_clarification = (not user_preference) or (is_generic_query and not current_topic)
        if session_state.get('clarification_count', 0) > 0:
            needs_clarification = False
        if needs_clarification:
            update_session_state(current_session_id, increment_clarification=True, increment_count=False)
            return f"Did you mean: {best_faq[2]}?"

        # Auto-Resolve
        update_session_state(current_session_id, reset_clarification=True, increment_count=False)
        enhanced_answer = best_faq[3]
        contextual_info = get_contextual_business_info(detect_question_category(question))
        next_steps = [
            f"Next step: Consider {info['journey']}"
            for info in contextual_info.values()
        ]
        if next_steps:
            joined_steps = "\n".join(next_steps)
            enhanced_answer += f"\n\n{joined_steps}"
            enhanced_answer += "\n\nQuestions? Contact info@getscenestudios.com"
        log_question(question, current_session_id, answer=enhanced_answer)
        return enhanced_answer

    # 3. HALLUCINATION GUARD: do not send off-topic questions to the model.
    acting_terms = ['acting', 'actor', 'scene', 'audition', 'theatre', 'film', 'tv', 'commercial', 'agent', 'rep', 'manager']
    is_acting_related = (
        len(detect_question_category(question)) > 0 or
        detect_response_type(question) == "support" or
        any(k in question_lower for k in ACTION_KEYWORDS) or
        any(k in question_lower for k in acting_terms)
    )
    if not is_acting_related:
        return "I'm not exactly sure about that. Please email info@getscenestudios.com so a member of our team can get you the most accurate answer!"

    # 4. LLM PATH
    update_session_state(current_session_id, reset_clarification=True, increment_count=False)
    podcast_data = fetch_all_embeddings("podcast_episodes")
    top_workshops = find_top_workshops(user_embedding, k=10)
    top_podcasts = find_top_k_matches(user_embedding, podcast_data, k=3)
    enriched_podcast_links = []
    for _, podcast_id, _ in top_podcasts:
        row = fetch_row_by_id("podcast_episodes", podcast_id)
        enriched_podcast_links.extend(generate_enriched_links(row))
    # Fall back to the first stored episode, but only if one exists; with an
    # empty table build_enhanced_prompt substitutes the YouTube channel link.
    if not enriched_podcast_links and podcast_data:
        fallback = fetch_row_by_id("podcast_episodes", podcast_data[0][0])
        enriched_podcast_links = generate_enriched_links(fallback)

    # 5. Brevity & Detail Detection
    wants_details = any(syn in question_lower for syn in DETAIL_SYNONYMS)
    final_prompt = build_enhanced_prompt(
        question,
        None,
        top_workshops,
        user_preference=user_preference,
        enriched_podcast_links=enriched_podcast_links,
        wants_details=wants_details,
        current_topic=current_topic
    )
    response = openai.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": final_prompt},
            {"role": "user", "content": question}
        ]
    )
    # message.content may be None, so guard before stripping.
    answer = (response.choices[0].message.content or "").strip()
    # Log the answer too, as the FAQ paths do.
    log_question(question, current_session_id, answer=answer)
    return answer
+# ============================================================================
+# GRADIO INTERFACE
+# ============================================================================
def chat_with_bot(message, history):
    """Handle one chat turn for the Gradio UI.

    Args:
        message: User's current message; may be ``None`` or blank.
        history: Chat history as a list of ``{"role", "content"}`` dicts.
            ``None`` is treated as an empty history.

    Returns:
        The history list with the user message and the bot reply appended.
        A blank message returns the history unchanged.
    """
    # NOTE(review): session_id is a module-level value, so every caller of
    # this function uses the same session -- confirm that is intended.
    if history is None:
        history = []
    if not message or not message.strip():
        return history

    try:
        # Process question directly
        bot_reply = process_question(message, session_id)
    except Exception as e:
        # Surface the failure in the chat instead of breaking the UI event.
        bot_reply = f"❌ Error: {str(e)}"

    # Append as role/content message dicts.
    history.append({"role": "user", "content": message})
    history.append({"role": "assistant", "content": bot_reply})
    return history
def reset_session():
    """Start a new conversation by replacing the module-level session ID.

    Returns:
        An empty list, used by the UI to clear the chat history.
    """
    global session_id
    fresh_id = uuid.uuid4()
    session_id = str(fresh_id)
    return []
+# Create Gradio interface
with gr.Blocks(title="Get Scene Studios Chatbot") as demo:
    # Page header.
    gr.Markdown(
        """
        # 🎬 Get Scene Studios AI Chatbot
        Ask questions about acting classes, workshops and more!
        """
    )

    # Conversation panel.
    chatbot = gr.Chatbot(label="Conversation", height=500)

    # Message input next to its send button.
    with gr.Row():
        msg = gr.Textbox(label="Your Message", lines=2, scale=4)
        submit_btn = gr.Button("Send 📤", scale=1, variant="primary")

    # Conversation management buttons.
    with gr.Row():
        clear_btn = gr.Button("Clear Chat 🗑️", scale=1)
        reset_btn = gr.Button("New Session 🔄", scale=1)

    # Clicking "Send" and pressing Enter in the textbox behave the same:
    # run the bot, then blank the textbox.
    for send_trigger in (submit_btn.click, msg.submit):
        send_trigger(
            fn=chat_with_bot,
            inputs=[msg, chatbot],
            outputs=[chatbot],
        ).then(
            fn=lambda: "",
            inputs=None,
            outputs=[msg],
        )

    # "Clear Chat" empties the visible history only.
    clear_btn.click(fn=lambda: [], inputs=None, outputs=[chatbot])

    # "New Session" also replaces the session ID via reset_session().
    reset_btn.click(fn=reset_session, inputs=None, outputs=[chatbot])
+# Launch the app
if __name__ == "__main__":
    # Print a short startup banner, then start the Gradio server.
    banner_rule = "=" * 60
    print("\n" + banner_rule)
    print("🎬 Get Scene Studios Chatbot")
    print(banner_rule)
    print("\n✅ No Flask API needed - all processing is done directly!")
    print("🌐 Gradio interface will open in your browser")
    print(banner_rule + "\n")
    demo.launch()

scraper.py ADDED Viewed

	@@ -0,0 +1,347 @@

+import requests
+import json
+import re
+from bs4 import BeautifulSoup
+from typing import List, Dict, Any, Tuple
+from utils import clean_time
def scrape_workshops_from_squarespace(url: str) -> List[Dict[str, str]]:
    """Extract workshops from a Squarespace page.

    The page's ``format=json`` view is tried first. If that request fails,
    returns a non-200 status, is not valid JSON, or contains no workshops,
    the regular HTML page is fetched and parsed instead.

    Args:
        url: Public URL of the Squarespace page listing workshops.

    Returns:
        A list of workshop dicts; empty when nothing could be extracted or
        an error occurred (errors are printed, never raised).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # First try the Squarespace JSON API. Respect an existing query string.
        separator = "&" if "?" in url else "?"
        json_url = f"{url}{separator}format=json"
        print(f"🔍 Trying Squarespace JSON API: {json_url}")
        try:
            response = requests.get(json_url, headers=headers, timeout=10)
        except requests.RequestException as e:
            # A failed JSON probe must not prevent the HTML fallback.
            print(f"❌ JSON API request failed ({e}), falling back to HTML")
            response = None

        if response is not None and response.status_code == 200:
            try:
                json_data = response.json()
            except ValueError:
                # ValueError covers both the stdlib and the simplejson
                # decode errors that requests may raise.
                print("❌ Invalid JSON response, falling back to HTML")
            else:
                workshops = extract_workshops_from_json(json_data, json_url)
                if workshops:
                    print(f"✅ Extracted {len(workshops)} workshops from JSON API")
                    return workshops
                print("❌ No workshops found in JSON, falling back to HTML")

        # Fallback to HTML scraping if JSON fails
        print(f"📄 Falling back to HTML scraping for {url}")
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        workshops = parse_workshops_from_html(soup, url)
        if workshops:
            print(f"✅ Extracted {len(workshops)} workshops from HTML parsing")
            return workshops
        print("❌ No workshops found in HTML")
        return []
    except Exception as e:
        # Best-effort scraper: callers treat an empty list as "no data".
        print(f"❌ Error scraping workshops from {url}: {e}")
        return []
def extract_workshops_from_json(data: Any, source_url: str) -> List[Dict[str, str]]:
    """Pull workshops out of a Squarespace JSON payload.

    Only the ``mainContent`` entry is inspected: when it is a string it is
    parsed as HTML and handed to ``parse_workshops_from_html``. Any other
    payload shape produces an empty list.
    """
    # Guard clauses: anything without a string 'mainContent' has nothing to parse.
    if not isinstance(data, dict) or 'mainContent' not in data:
        return []
    html_fragment = data['mainContent']
    if not isinstance(html_fragment, str):
        return []

    print(f"🎯 Found mainContent HTML! Length: {len(html_fragment)} characters")
    fragment_soup = BeautifulSoup(html_fragment, 'html.parser')
    return parse_workshops_from_html(fragment_soup, source_url)
def parse_workshops_from_html(soup, source_url: str) -> List[Dict[str, str]]:
    """Collect workshops from parsed HTML using two passes.

    Pass 1 looks at div/section/article elements whose class attribute looks
    like an item/card/product/workshop/class container and tries to extract
    one workshop from each element's text. Pass 2 runs three regexes over the
    full page text. Results from both passes are de-duplicated with
    ``is_duplicate_workshop``.
    """
    found: List[Dict[str, str]] = []
    accepted_texts = set()
    print("🔍 ENHANCED HTML PARSING:")

    # Pass 1: individual workshop containers
    container_class = re.compile(r'(item|card|product|workshop|class)', re.I)
    potential_containers = soup.find_all(['div', 'section', 'article'],
                                         attrs={'class': container_class})
    print(f"   Found {len(potential_containers)} potential workshop containers")
    hint_words = ('with', 'casting', 'director', 'agent', 'perfect submission', 'crush the callback', 'get scene')
    for element in potential_containers:
        block_text = element.get_text(strip=True)
        # Skip very short blocks and blocks that already produced a workshop.
        if len(block_text) < 30 or block_text in accepted_texts:
            continue
        lowered = block_text.lower()
        if not any(hint in lowered for hint in hint_words):
            continue
        candidate = extract_single_workshop_from_text(block_text, source_url)
        if candidate and not is_duplicate_workshop(candidate, found):
            found.append(candidate)
            accepted_texts.add(block_text)

    # Pass 2: pattern-based extraction from the full page text
    page_text = soup.get_text()
    page_patterns = (
        # Pattern 1: "Workshop Title with Professional Title Name on Date @ Time"
        r'((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+with\s+((?:Casting\s+Director|DDO\s+Agent|Manager|Director|Producer|Agent|Acting\s+Coach|Talent\s+Agent|Executive\s+Casting\s+Producer)\s+[A-Za-z\s]+?)\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',
        # Pattern 2: "Professional Title Name, Workshop Title on Date @ Time"
        r'((?:Atlanta\s+Models\s+&\s+Talent\s+President|Talent\s+Agent|Casting\s+Director|Manager|Director|Producer|Agent)\s+[A-Za-z\s]+?),\s+((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',
        # Pattern 3: "Casting Director Name, Date @ Time"
        r'(Casting\s+Director)\s+([A-Za-z\s\-]+?),\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*(?:at\s+)?([0-9:]+\s*(?:AM|PM))?',
    )
    # Pattern numbers are 1-based here; the refined parser converts them back.
    for pattern_num, page_pattern in enumerate(page_patterns, start=1):
        for groups in re.findall(page_pattern, page_text, re.IGNORECASE):
            candidate = parse_refined_workshop_match(groups, pattern_num, source_url)
            if candidate and not is_duplicate_workshop(candidate, found):
                found.append(candidate)

    print(f"🎯 TOTAL UNIQUE WORKSHOPS FOUND: {len(found)}")
    return found
def extract_single_workshop_from_text(text: str, source_url: str) -> Dict[str, str]:
    """Try to recognise one workshop in a single block of text.

    The text is first stripped of ``$N.00`` prices and "Featured"/"Sold Out"
    labels and has its whitespace collapsed. The patterns below are then
    tried in order; the first one that matches is handed to
    ``parse_pattern_match`` together with its position in the list.
    Returns ``None`` when no pattern matches.
    """
    # Normalise the text before matching
    cleaned = re.sub(r'\$[0-9,]+\.00', '', text)
    cleaned = re.sub(r'Featured|Sold Out', '', cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    # No newline can remain after the whitespace collapse above; kept as in the original.
    cleaned = re.sub(r'\n+', ' ', cleaned)

    candidate_patterns = (
        # Pattern A: "Title with Professional Name on Date @ Time"
        r'((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+with\s+((?:Casting\s+Director|CD|DDO\s+Agent|Manager|Director|Producer|Agent|Acting\s+Coach|Talent\s+Agent|Executive\s+Casting\s+Producer|Atlanta\s+Models\s+&\s+Talent\s+President)\s+[A-Za-z\s\-]+?)\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',
        # Pattern B: "Professional Name, Title on Date @ Time"
        r'((?:Atlanta\s+Models\s+&\s+Talent\s+President|Talent\s+Agent|Casting\s+Director|Casting\s+Associate|Manager|Director|Producer|Agent|Executive\s+Casting\s+Producer)\s+[A-Za-z\s\-]+?),\s+((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',
        # Pattern C: "Casting Director Name, Date at Time"
        r'(Casting\s+Director|Casting\s+Associate)\s+([A-Za-z\s\-]+?),\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*(?:at\s+)?([0-9:]+\s*(?:AM|PM))?',
        # Pattern D: "Company Executive Producer Name on Date"
        r"([A-Za-z']+\s+(?:Executive\s+Casting\s+Producer|Studios\s+Casting\s+Associate))\s+([A-Za-z\s]+?)\s+(?:on\s+)?(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?",
        # Pattern E: "Company Agent Name Date" ("on" is optional)
        r'([A-Za-z\s]+)\s+(Agent|Talent)\s+([A-Za-z\s]+?)\s+(?:on\s+)?(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',
        # Pattern F: "Company, Person, Title on Date"
        r'([A-Za-z\s]+\s+Talent),\s+([A-Za-z\s\.]+?),\s+((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',
        # Pattern G: Flexible fallback (anchored to the whole text)
        r'^([A-Za-z\s&\']{3,25}(?:Director|Agent|Manager|Producer|President|Coach))\s+([A-Za-z\s\-]{3,30}?)\s+(?:on\s+)?(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?$',
    )
    for position, candidate in enumerate(candidate_patterns):
        hit = re.search(candidate, cleaned, re.IGNORECASE)
        if hit is None:
            continue
        # First matching pattern decides the outcome, even if parsing yields None.
        return parse_pattern_match(hit, position, source_url)
    return None
def parse_pattern_match(match, pattern_index: int, source_url: str) -> Dict[str, str]:
    """Turn a regex result into a workshop dictionary.

    ``match`` may be a match object (from ``re.search``) or a tuple/list of
    groups (from ``re.findall``). ``pattern_index`` selects how the groups
    are laid out: 0-5 are fixed layouts, any other value uses the fallback
    layout. Returns ``None`` when no instructor name or date was captured,
    when the fallback text is implausibly long, or when parsing raises.
    """
    def grp(position):
        """Return group ``position`` (1-based), stripped; '' when missing."""
        raw = ""
        if hasattr(match, 'group'):
            try:
                raw = match.group(position)
            except IndexError:
                raw = ""
        elif isinstance(match, (tuple, list)):
            # findall tuples are 0-based, regex group numbers are 1-based.
            if 1 <= position <= len(match):
                raw = match[position - 1]
        if raw is None:
            raw = ""
        return raw.strip()

    workshop_title = ""
    instructor_title = ""
    instructor_name = ""
    date_str = ""
    time_str = ""
    try:
        if pattern_index == 0:  # Pattern A/1: title, professional, date, time
            workshop_title, professional_full, date_str, time_str = (grp(n) for n in (1, 2, 3, 4))
            if professional_full.startswith('CD '):
                professional_full = 'Casting Director ' + professional_full[3:]
            instructor_title, instructor_name = parse_professional_info(professional_full)
        elif pattern_index == 1:  # Pattern B/2: professional, title, date, time
            professional_full, workshop_title, date_str, time_str = (grp(n) for n in (1, 2, 3, 4))
            instructor_title, instructor_name = parse_professional_info(professional_full)
        elif pattern_index in (2, 3):  # Pattern C/3 and Pattern D share a layout
            instructor_title, instructor_name, date_str, time_str = (grp(n) for n in (1, 2, 3, 4))
            workshop_title = "Casting Workshop" if pattern_index == 2 else "Industry Workshop"
        elif pattern_index == 4:  # Pattern E: company, agent type, name, date, time
            company_name, agent_type, instructor_name, date_str, time_str = (grp(n) for n in (1, 2, 3, 4, 5))
            instructor_title = f"{company_name} {agent_type}"
            workshop_title = "Industry Workshop"
        elif pattern_index == 5:  # Pattern F: company, name, title, date, time
            company_name, instructor_name, workshop_title, date_str, time_str = (grp(n) for n in (1, 2, 3, 4, 5))
            instructor_title = company_name
        else:  # Pattern G: title-ish text and name are split afterwards
            professional_full = grp(1) + " " + grp(2)
            date_str = grp(3)
            time_str = grp(4)
            workshop_title = "Industry Workshop"
            # Reject captures that are clearly not a "title name" pair.
            if len(professional_full) > 50 or '\n' in professional_full:
                return None
            instructor_title, instructor_name = parse_professional_info(professional_full)

        if not (instructor_name and date_str):
            return None

        # full_text feeds the existing embedding system
        full_text = f"{workshop_title} with {instructor_title} {instructor_name}"
        full_text += f" on {date_str}"
        if time_str:
            full_text += f" at {clean_time(time_str)}"
        return {
            'title': workshop_title,
            'instructor_name': instructor_name,
            'instructor_title': instructor_title,
            'date': date_str,
            'time': clean_time(time_str),
            'full_text': full_text,
            'source_url': source_url
        }
    except Exception as e:
        print(f"Error parsing pattern match: {e}")
    return None
def parse_professional_info(professional_full: str) -> tuple:
    """Split a combined "title + name" string into ``(title, name)``.

    Matching is case-insensitive because the regexes that feed this function
    run with ``re.IGNORECASE``; the returned title always uses the canonical
    spelling listed below.

    Args:
        professional_full: Text such as ``"Casting Director Jane Doe"`` or
            ``"Jane Doe, Talent Agent"``. Runs of whitespace are collapsed.

    Returns:
        A ``(title, name)`` tuple. When no known title is present the first
        word is used as the title if there are at least two words; otherwise
        the title is ``''`` and the whole text is returned as the name.
    """
    professional_full = re.sub(r'\s+', ' ', professional_full).strip()

    # Multi-word titles are checked first, in this order.
    specific_titles = (
        'Atlanta Models & Talent President',
        'Executive Casting Producer',
        'Casting Director',
        'Casting Associate',
        'DDO Agent',
        'Talent Agent',
        'Acting Coach',
    )
    for title in specific_titles:
        found = re.search(re.escape(title), professional_full, re.IGNORECASE)
        if found is None:
            continue
        if found.start() == 0:
            # Title leads: the name is whatever follows it.
            name_part = professional_full[found.end():].strip()
        else:
            # Title trails: the name is whatever precedes it (minus a comma).
            name_part = professional_full[:found.start()].strip().rstrip(',')
        return title, name_part

    # Fallback for single-word titles, optionally preceded by a qualifier.
    single_word_titles = {
        word.lower(): word
        for word in ('Manager', 'Director', 'Producer', 'Agent', 'Coach', 'President')
    }
    title_qualifiers = {
        word.lower(): word
        for word in ('Casting', 'Talent', 'Executive', 'DDO', 'Acting')
    }
    words = professional_full.split()
    for i, word in enumerate(words):
        canonical = single_word_titles.get(word.lower())
        if canonical is None:
            continue
        qualifier = title_qualifiers.get(words[i - 1].lower()) if i > 0 else None
        if qualifier is not None:
            title = f"{qualifier} {canonical}"
            name_parts = words[:i - 1] + words[i + 1:]
        else:
            title = canonical
            name_parts = words[:i] + words[i + 1:]
        return title, ' '.join(name_parts).strip()

    # Final fallback: treat the first word as the title.
    if len(words) >= 2:
        return words[0], ' '.join(words[1:])
    return '', professional_full
def parse_refined_workshop_match(match, pattern_num: int, source_url: str) -> Dict[str, str]:
    """Parse a match identified by a 1-based pattern number.

    Delegates to ``parse_pattern_match``, which expects a 0-based index.
    """
    zero_based_index = pattern_num - 1
    return parse_pattern_match(match, zero_based_index, source_url)
def is_duplicate_workshop(new_workshop: Dict, existing_workshops: List[Dict]) -> bool:
    """Report whether ``new_workshop`` repeats an already collected workshop.

    Two workshops are duplicates when instructor name and date agree
    (ignoring case and surrounding whitespace) and their titles are equal,
    both contain the word "workshop", or one title contains the other.
    """
    def _field(record: Dict, key: str) -> str:
        return record.get(key, '').strip().lower()

    def _same_workshop(known: Dict) -> bool:
        if _field(known, 'instructor_name') != _field(new_workshop, 'instructor_name'):
            return False
        if _field(known, 'date') != _field(new_workshop, 'date'):
            return False
        known_title = _field(known, 'title')
        incoming_title = _field(new_workshop, 'title')
        if known_title == incoming_title:
            return True
        if 'workshop' in known_title and 'workshop' in incoming_title:
            return True
        return known_title in incoming_title or incoming_title in known_title

    return any(_same_workshop(known) for known in existing_workshops)
def calculate_workshop_confidence(w: Dict) -> float:
    """Score how complete a workshop record is, from 0.0 to 1.0.

    Each non-empty field contributes a fixed weight; the total is rounded to
    two decimals.
    """
    field_weights = (
        ('title', 0.3),
        ('instructor_name', 0.3),
        ('date', 0.2),
        ('time', 0.1),
        ('source_url', 0.1),
    )
    total = 0.0
    # Accumulate in a fixed order so the float result is reproducible.
    for field_name, weight in field_weights:
        if w.get(field_name):
            total += weight
    return round(total, 2)