kreemyyyy committed on
Commit fd88516 · verified · 1 Parent(s): 0c3a5df

Upload 13 files

Files changed (13)
  1. .gitignore +34 -0
  2. README.md +62 -20
  3. app.py +844 -0
  4. auto_scorer.py +240 -0
  5. bandit_learner.py +330 -0
  6. compliance.py +26 -0
  7. db.py +248 -0
  8. deepseek_client.py +59 -0
  9. models.py +103 -0
  10. packages.txt +1 -0
  11. rag_integration.py +350 -0
  12. rag_retrieval.py +444 -0
  13. requirements.txt +16 -3
.gitignore ADDED
@@ -0,0 +1,34 @@
+ # Environment and secrets
+ .env
+ .streamlit/secrets.toml
+ secrets.toml
+
+ # Python cache
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+ .Python
+ *.so
+
+ # Database files
+ *.db
+ *.sqlite
+ *.sqlite3
+
+ # IDE files
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+
+ # OS files
+ .DS_Store
+ Thumbs.db
+
+ # Logs
+ *.log
+
+ # Temporary files
+ *.tmp
+ *.temp
README.md CHANGED
@@ -1,20 +1,62 @@
- ---
- title: Scriptwriter
- emoji: 🚀
- colorFrom: red
- colorTo: red
- sdk: docker
- app_port: 8501
- tags:
- - streamlit
- pinned: false
- short_description: Streamlit template space
- license: mit
- ---
-
- # Welcome to Streamlit!
-
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
-
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
- forums](https://discuss.streamlit.io).
+ ---
+ title: AI Script Studio
+ emoji: 🎬
+ colorFrom: blue
+ colorTo: purple
+ sdk: streamlit
+ sdk_version: 1.37.1
+ app_file: app.py
+ pinned: false
+ license: mit
+ short_description: Generate Instagram-ready scripts with AI-powered RAG system
+ ---
+
+ # 🎬 AI Script Studio
+
+ Generate Instagram-ready scripts with AI using an advanced RAG (Retrieval-Augmented Generation) system.
+
+ ## Features
+
+ - 🤖 **AI-Powered Generation**: Uses the DeepSeek API for high-quality script generation
+ - 🧠 **RAG System**: Retrieval-Augmented Generation with semantic search
+ - 📊 **Multi-Armed Bandit Learning**: Self-improving generation policies
+ - 🎯 **Auto-Scoring**: LLM-based quality assessment
+ - 📈 **Rating System**: Human feedback integration with learning
+ - 🎨 **Multiple Personas**: Support for different creator styles
+ - 📝 **Content Types**: Various Instagram content formats
+
+ ## How It Works
+
+ 1. **Reference Retrieval**: Uses semantic search to find relevant examples
+ 2. **Policy Learning**: A multi-armed bandit optimizes generation parameters
+ 3. **AI Generation**: Creates scripts using the retrieved references
+ 4. **Auto-Scoring**: An LLM judges quality across 5 dimensions
+ 5. **Learning Loop**: The system improves based on feedback
+
+ ## Usage
+
+ 1. Select your creator persona
+ 2. Choose a content type and tone
+ 3. Add reference examples (optional)
+ 4. Generate scripts with AI
+ 5. Rate them and provide feedback
+ 6. The system learns and improves
+
+ ## Technical Stack
+
+ - **Frontend**: Streamlit
+ - **AI**: DeepSeek API
+ - **RAG**: Sentence Transformers + FAISS
+ - **Database**: SQLite with SQLModel
+ - **Learning**: Multi-armed bandit algorithms
+ - **Scoring**: LLM-based evaluation
+
+ ## Setup
+
+ 1. Add your DeepSeek API key to the secrets
+ 2. The app will automatically initialize the database
+ 3. Start generating scripts!
+
+ ## API Key
+
+ Get your free API key at: https://platform.deepseek.com/api_keys
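The "Policy Learning" step above (a multi-armed bandit over generation parameters) can be sketched as an epsilon-greedy loop. This is an illustrative sketch only — the arm names, epsilon value, and reward scale are assumptions, not taken from the repository's `bandit_learner.py`:

```python
import random

class EpsilonGreedyBandit:
    """Minimal epsilon-greedy bandit over generation-parameter 'arms'."""

    def __init__(self, arms, epsilon=0.1):
        self.arms = list(arms)
        self.epsilon = epsilon
        self.counts = {a: 0 for a in self.arms}    # pulls per arm
        self.values = {a: 0.0 for a in self.arms}  # running mean reward

    def select(self):
        # Explore with probability epsilon, otherwise exploit the best arm
        if random.random() < self.epsilon:
            return random.choice(self.arms)
        return max(self.arms, key=lambda a: self.values[a])

    def update(self, arm, reward):
        # Incremental mean update: v += (r - v) / n
        self.counts[arm] += 1
        self.values[arm] += (reward - self.values[arm]) / self.counts[arm]

# Hypothetical usage: treat hook styles as arms, human ratings as reward
bandit = EpsilonGreedyBandit(["short-hook", "question-hook", "shock-hook"], epsilon=0.2)
arm = bandit.select()
bandit.update(arm, reward=4.5)
```

The "Learning Loop" then amounts to calling `update` with each human or auto-scored rating, so arms that consistently earn higher ratings are selected more often.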
app.py ADDED
@@ -0,0 +1,844 @@
+ import os, streamlit as st
+ from dotenv import load_dotenv
+ from sqlmodel import select
+ from db import init_db, get_session, add_rating
+ from models import Script, Revision
+ from deepseek_client import generate_scripts, revise_for, selective_rewrite
+ # Lazy import for RAG system to improve startup time
+ # from rag_integration import generate_scripts_rag
+ from compliance import blob_from, score_script
+ import time
+
+ # Configure page - MUST be first Streamlit command
+ st.set_page_config(
+     page_title="🎬 AI Script Studio",
+     layout="wide",
+     initial_sidebar_state="expanded"
+ )
+
+ def script_to_json_dict(script):
+     """Convert script to JSON-serializable dictionary"""
+     data = script.model_dump()
+     # Remove datetime fields that cause JSON serialization issues
+     data.pop('created_at', None)
+     data.pop('updated_at', None)
+     return data
+
+ # Load environment - works both locally and on Hugging Face Spaces
+ load_dotenv()
+
+ # Initialize database with error handling for cloud deployment
+ try:
+     init_db()
+     st.sidebar.write("✅ Database initialized successfully")
+ except Exception as e:
+     st.sidebar.write(f"⚠️ Database init warning: {str(e)}")
+     # Continue anyway - some features may be limited
+
+ # Check for API key in Streamlit secrets or environment
+ api_key = st.secrets.get("DEEPSEEK_API_KEY") if hasattr(st, 'secrets') and "DEEPSEEK_API_KEY" in st.secrets else os.getenv("DEEPSEEK_API_KEY")
+
+ # DEBUG INFO - remove after fixing
+ if hasattr(st, 'secrets'):
+     st.sidebar.write("🔍 DEBUG: Secrets available")
+     if "DEEPSEEK_API_KEY" in st.secrets:
+         st.sidebar.write("✅ DEEPSEEK_API_KEY found in secrets")
+         st.sidebar.write(f"🔑 Key length: {len(st.secrets['DEEPSEEK_API_KEY'])}")
+         st.sidebar.write(f"🔑 Key starts with: {st.secrets['DEEPSEEK_API_KEY'][:10]}...")
+     else:
+         st.sidebar.write("❌ DEEPSEEK_API_KEY NOT in secrets")
+         st.sidebar.write(f"Available secrets: {list(st.secrets.keys())}")
+ else:
+     st.sidebar.write("❌ No secrets available")
+
+ if not api_key:
+     st.error("🔑 **DeepSeek API Key Required**")
+     st.markdown("""
+     **For Local Development:**
+     - Create a `.env` file and add: `DEEPSEEK_API_KEY=your_key_here`
+
+     **For Streamlit Cloud:**
+     - Go to your app settings → Secrets
+     - Add: `DEEPSEEK_API_KEY = "your_key_here"`
+
+     Get your free API key at: https://platform.deepseek.com/api_keys
+     """)
+     st.stop()
+ else:
+     st.sidebar.write("✅ API key loaded successfully")
+
+
+ # Custom CSS for better styling
+ st.markdown("""
+ <style>
+ .main-header {
+     text-align: center;
+     padding: 1rem;
+     background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
+     color: white;
+     border-radius: 10px;
+     margin-bottom: 2rem;
+ }
+ .step-container {
+     border: 2px solid #e1e1e1;
+     border-radius: 10px;
+     padding: 1rem;
+     margin-bottom: 1rem;
+     background-color: #f8f9fa;
+ }
+ .draft-card {
+     border: 1px solid #ddd;
+     border-radius: 8px;
+     padding: 0.8rem;
+     margin-bottom: 0.5rem;
+     background: white;
+     transition: all 0.2s ease;
+ }
+ .draft-card:hover {
+     box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+     border-color: #667eea;
+ }
+ .success-box {
+     background-color: #d4edda;
+     border: 1px solid #c3e6cb;
+     border-radius: 5px;
+     padding: 1rem;
+     margin: 1rem 0;
+ }
+ </style>
+ """, unsafe_allow_html=True)
+
+ # Header
+ st.markdown("""
+ <div class="main-header">
+     <h1>🎬 AI Script Studio</h1>
+     <p>Generate Instagram-ready scripts with AI • Powered by DeepSeek</p>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # Initialize session state
+ if 'generation_step' not in st.session_state:
+     st.session_state.generation_step = 'setup'
+ if 'generated_count' not in st.session_state:
+     st.session_state.generated_count = 0
+
+ # Sidebar - Generation Controls
+ with st.sidebar:
+     st.header("🎯 Script Generation")
+
+     # Step 1: Basic Settings
+     with st.expander("📝 Step 1: Basic Settings", expanded=True):
+         # Dynamic creator dropdown (pull from database + defaults)
+         with get_session() as ses:
+             db_creators = list(ses.exec(select(Script.creator).distinct()))
+             db_creator_names = [c for c in db_creators if c]
+
+         default_creators = ["Creator A", "Emily", "Anya", "Ava Cherrry", "Ava Xreyess", "FitBryceAdams", "RealCarlyJane", "Sophie Rain", "Zoe AloneAtHome"]
+         all_creators = list(set(default_creators + db_creator_names))
+         creator_options = sorted(all_creators)
+         creator = st.selectbox(
+             "Creator Name",
+             creator_options,
+             help="Choose from existing creators or your imported scripts"
+         )
+
+         # Expanded content types
+         content_type = st.selectbox(
+             "Content Type",
+             ["thirst-trap", "skit", "reaction-prank", "talking-style", "lifestyle", "fake-podcast", "dance-trend", "voice-tease-asmr"],
+             help="Choose the type of content you want to create"
+         )
+
+         # Multi-select tones
+         tone_options = ["naughty", "playful", "suggestive", "funny", "flirty", "bratty", "teasing", "intimate", "witty", "comedic", "confident", "wholesome", "asmr-voice"]
+         selected_tones = st.multiselect(
+             "Tone/Vibe (select multiple)",
+             tone_options,
+             default=["playful"],
+             help="Choose one or more tones - scripts often blend 2-3 vibes"
+         )
+         tone = ", ".join(selected_tones) if selected_tones else "playful"
+
+         n = st.slider(
+             "Number of drafts",
+             min_value=1,
+             max_value=20,
+             value=6,
+             help="How many script variations to generate"
+         )
+
+     # Step 2: Persona & Style
+     with st.expander("👤 Step 2: Persona & Style", expanded=True):
+         # Persona presets
+         persona_presets = {
+             "Girl-next-door": "girl-next-door; playful; witty; approachable",
+             "Bratty tease": "bratty; teasing; demanding; playful attitude",
+             "Dominant/In control": "confident; in control; commanding; assertive",
+             "Innocent but suggestive": "innocent; sweet; accidentally suggestive; naive charm",
+             "Party girl": "outgoing; fun; social; party vibes; energetic",
+             "Gym fitspo": "fitness focused; motivational; athletic; body confident",
+             "ASMR/Voice fetish": "soft spoken; intimate; soothing; sensual voice",
+             "Girlfriend experience": "loving; intimate; caring; relationship vibes",
+             "Funny meme-style": "comedic; meme references; internet culture; quirky",
+             "Candid/Lifestyle": "authentic; relatable; everyday life; natural"
+         }
+
+         col1, col2 = st.columns([0.6, 0.4])
+         with col1:
+             persona_preset = st.selectbox(
+                 "Persona Preset",
+                 ["Custom"] + list(persona_presets.keys()),
+                 help="Choose a preset or use custom"
+             )
+
+         with col2:
+             if persona_preset != "Custom":
+                 if st.button("📋 Use Preset", use_container_width=True):
+                     st.session_state.persona_text = persona_presets[persona_preset]
+
+         persona = st.text_area(
+             "Persona Description",
+             value=st.session_state.get('persona_text', "girl-next-door; playful; witty"),
+             help="Describe the character/personality for the scripts"
+         )
+
+         # Compliance/Boundaries presets
+         boundary_presets = {
+             "Safe IG mode": "No explicit words; no sexual acts; suggestive only; no banned IG terms; keep it flirty but clean",
+             "Spicy mode": "Innuendos allowed; suggestive language OK; no explicit acts; can be naughty but not graphic",
+             "Brand-safe": "No swearing; no sex references; just flirty and fun; wholesome with hint of tease",
+             "Mild NSFW": "Moaning sounds OK; wet references allowed; squirt innuendo OK; suggestive but not explicit",
+             "Platform optimized": "Avoid flagged keywords; use creative euphemisms; suggestive storytelling style"
+         }
+
+         col1, col2 = st.columns([0.6, 0.4])
+         with col1:
+             boundary_preset = st.selectbox(
+                 "Compliance Preset",
+                 ["Custom"] + list(boundary_presets.keys()),
+                 help="Choose platform-appropriate safety rules"
+             )
+
+         with col2:
+             if boundary_preset != "Custom":
+                 if st.button("🛡️ Use Preset", use_container_width=True):
+                     st.session_state.boundaries_text = boundary_presets[boundary_preset]
+
+         boundaries = st.text_area(
+             "Content Boundaries",
+             value=st.session_state.get('boundaries_text', "No explicit words; no solicitation; no age refs"),
+             help="What should the AI avoid? Set your safety guidelines here"
+         )
+
+     # Step 3: Advanced Options
+     with st.expander("⚡ Step 3: Advanced Options", expanded=False):
+         col1, col2 = st.columns(2)
+
+         with col1:
+             # Hook style
+             hook_style = st.selectbox(
+                 "Hook Style",
+                 ["Auto", "Question", "Confession", "Contrarian", "PSA", "Tease", "Command", "Shock"],
+                 help="How should the hook start?"
+             )
+
+             # Length
+             length = st.selectbox(
+                 "Target Length",
+                 ["Auto", "Short (5-7s)", "Medium (8-12s)", "Longer (13-20s)"],
+                 help="How long should the script be?"
+             )
+
+             # Risk level
+             risk_level = st.slider(
+                 "Risk Level",
+                 min_value=1,
+                 max_value=5,
+                 value=3,
+                 help="1=Safe, 3=Suggestive, 5=Spicy"
+             )
+
+         with col2:
+             # Retention gimmick
+             retention = st.selectbox(
+                 "Retention Hook",
+                 ["Auto", "Twist ending", "Shock reveal", "Naughty payoff", "Innocent→dirty flip", "Cliffhanger"],
+                 help="How to keep viewers watching?"
+             )
+
+             # Shot type
+             shot_type = st.selectbox(
+                 "Shot Type",
+                 ["Auto", "POV", "Selfie cam", "Tripod", "Over-the-shoulder", "Mirror shot"],
+                 help="Camera angle/perspective"
+             )
+
+             # Wardrobe
+             wardrobe = st.selectbox(
+                 "Wardrobe/Setting",
+                 ["Auto", "Gym fit", "Bikini", "Bed outfit", "Towel", "Dress", "Casual", "Kitchen", "Car"],
+                 help="Setting or outfit context"
+             )
+
+     # Step 4: Optional References
+     with st.expander("📚 Step 4: Extra References (Optional)", expanded=False):
+         st.info("💡 The AI automatically uses your database references, but you can add more here")
+         refs_text = st.text_area(
+             "Additional Reference Lines",
+             value="",
+             height=100,
+             help="Add extra inspiration lines (one per line)"
+         )
+
+     # Generation Button
+     st.markdown("---")
+
+     # Show reference count
+     from db import get_hybrid_refs
+
+     # Map new content types to existing database types for compatibility
+     content_type_mapping = {
+         "thirst-trap": "talking_style / thirst_trap",
+         "skit": "comedy",
+         "reaction-prank": "prank",
+         "talking-style": "talking_style",
+         "lifestyle": "lifestyle",
+         "fake-podcast": "fake-podcast",
+         "dance-trend": "trend-adaptation",
+         "voice-tease-asmr": "talking_style"
+     }
+
+     mapped_content_type = content_type_mapping.get(content_type, content_type)
+     ref_count = len(get_hybrid_refs(creator, mapped_content_type, k=6))
+
+     st.info(f"🤖 AI will use {ref_count} database references + your extras")
+
+     generate_button = st.button(
+         "🚀 Generate Scripts",
+         type="primary",
+         use_container_width=True
+     )
+
+ # Generation Process
+ if generate_button:
+     with st.spinner("🧠 AI is creating your scripts..."):
+         try:
+             # Get manual refs from text area
+             manual_refs = [x.strip() for x in refs_text.split("\n") if x.strip()]
+
+             # Get automatic refs from selected creator scripts in database using content type mapping
+             auto_refs = get_hybrid_refs(creator, mapped_content_type, k=6)
+
+             # Combine both
+             all_refs = manual_refs + auto_refs
+
+             # Progress indicator
+             progress_bar = st.progress(0)
+             status_text = st.empty()
+
+             status_text.text("🔍 Analyzing references...")
+             progress_bar.progress(25)
+             time.sleep(0.5)
+
+             status_text.text("🧠 RAG system selecting optimal references...")
+             progress_bar.progress(40)
+             time.sleep(0.3)
+
+             status_text.text("✨ Generating enhanced content with AI learning...")
+             progress_bar.progress(60)
+
+             # Build enhanced prompt from advanced options
+             advanced_prompt = ""
+             if hook_style != "Auto":
+                 advanced_prompt += f"Hook style: {hook_style}. "
+             if length != "Auto":
+                 advanced_prompt += f"Target length: {length}. "
+             if retention != "Auto":
+                 advanced_prompt += f"Retention strategy: {retention}. "
+             if shot_type != "Auto":
+                 advanced_prompt += f"Shot type: {shot_type}. "
+             if wardrobe != "Auto":
+                 advanced_prompt += f"Setting/wardrobe: {wardrobe}. "
+             if risk_level != 3:
+                 risk_desc = {1: "very safe", 2: "mild", 3: "suggestive", 4: "spicy", 5: "very spicy"}
+                 advanced_prompt += f"Risk level: {risk_desc[risk_level]}. "
+
+             # Enhance boundaries with advanced prompt
+             enhanced_boundaries = boundaries
+             if advanced_prompt:
+                 enhanced_boundaries += f"\n\nADVANCED GUIDANCE: {advanced_prompt}"
+
+             # Generate scripts with enhanced RAG system (lazy import)
+             try:
+                 from rag_integration import generate_scripts_rag
+                 drafts = generate_scripts_rag(persona, enhanced_boundaries, content_type, tone, all_refs, n=n)
+             except ImportError as e:
+                 st.warning(f"RAG system not available: {e}. Using fallback generation.")
+                 # Fallback to simple generation
+                 drafts = generate_scripts(enhanced_boundaries, n)
+
+             progress_bar.progress(75)
+             status_text.text("💾 Saving to database...")
+
+             # Save to database
+             with get_session() as ses:
+                 for d in drafts:
+                     lvl, _ = score_script(" ".join([d.get("title",""), d.get("hook",""), *d.get("beats",[]), d.get("voiceover",""), d.get("caption",""), d.get("cta","")]))
+                     s = Script(
+                         creator=creator, content_type=content_type, tone=tone,
+                         title=d["title"], hook=d["hook"], beats=d["beats"],
+                         voiceover=d["voiceover"], caption=d["caption"],
+                         hashtags=d.get("hashtags",[]), cta=d.get("cta",""),
+                         compliance=lvl, source="ai"
+                     )
+                     ses.add(s)
+                 ses.commit()
+
+             progress_bar.progress(100)
+             status_text.text("")
+             progress_bar.empty()
+
+             st.session_state.generated_count += len(drafts)
+             st.success(f"🎉 Generated {len(drafts)} scripts successfully!")
+
+             # Show which refs were used and advanced options
+             col1, col2 = st.columns(2)
+             with col1:
+                 if auto_refs:
+                     st.markdown("**🤖 Hybrid refs used this run:**")
+                     for line in auto_refs[:3]:  # Show first 3
+                         st.write(f"• {line}")
+
+             with col2:
+                 if advanced_prompt:
+                     st.markdown("**⚡ Advanced options applied:**")
+                     st.write(f"• {advanced_prompt[:100]}...")
+                 st.write(f"**📊 Settings:** {tone} • {content_type}")
+
+             st.balloons()
+
+             # Auto-refresh to show new drafts
+             time.sleep(1)
+             st.rerun()
+
+         except Exception as e:
+             st.error(f"❌ Generation failed: {str(e)}")
+             st.write("💡 Try adjusting your parameters or check your API key")
+
+ # Quick Actions
+ st.markdown("---")
+ st.subheader("⚡ Quick Actions")
+
+ col1, col2 = st.columns(2)
+ with col1:
+     if st.button("🔄 Refresh", use_container_width=True):
+         st.rerun()
+ with col2:
+     if st.button("🗑️ Clear All", use_container_width=True, help="Delete all your generated scripts"):
+         if st.session_state.get('confirm_clear'):
+             with get_session() as ses:
+                 scripts_to_delete = list(ses.exec(select(Script).where(Script.creator == creator, Script.source == "ai")))
+                 for script in scripts_to_delete:
+                     ses.delete(script)
+                 ses.commit()
+             st.success("🗑️ All drafts cleared!")
+             st.session_state.confirm_clear = False
+             st.rerun()
+         else:
+             st.session_state.confirm_clear = True
+             st.warning("Click again to confirm deletion!")
+
+ # Main Area
+ tab1, tab2, tab3 = st.tabs(["📝 Draft Review", "🎯 Filters", "📊 Analytics"])
+
+ with tab1:
+     # Load drafts
+     with get_session() as ses:
+         q = select(Script).where(Script.creator == creator, Script.source == "ai")
+         all_drafts = list(ses.exec(q))
+
+     if not all_drafts:
+         st.markdown("""
+         <div style="text-align: center; padding: 3rem;">
+             <h3>🎬 Ready to Create Amazing Scripts?</h3>
+             <p style="font-size: 1.2rem; color: #666;">
+                 👈 Use the sidebar to generate your first batch of AI scripts<br>
+                 🤖 The AI will learn from successful examples in the database<br>
+                 ✨ Then review, edit, and perfect your scripts here
+             </p>
+         </div>
+         """, unsafe_allow_html=True)
+
+         if st.session_state.generated_count > 0:
+             st.info(f"🎉 You've generated {st.session_state.generated_count} scripts so far! Use filters to find them.")
+     else:
+         # Draft management
+         col1, col2 = st.columns([0.4, 0.6], gap="large")
+
+         with col1:
+             st.subheader(f"📋 Your Drafts ({len(all_drafts)})")
+
+             # Quick filters
+             filter_col1, filter_col2 = st.columns(2)
+             with filter_col1:
+                 compliance_filter = st.selectbox(
+                     "Compliance",
+                     ["All", "PASS", "WARN", "FAIL"],
+                     key="compliance_filter"
+                 )
+             with filter_col2:
+                 sort_by = st.selectbox(
+                     "Sort by",
+                     ["Newest", "Oldest", "Title"],
+                     key="sort_filter"
+                 )
+
+             # Apply filters
+             filtered_drafts = all_drafts
+             if compliance_filter != "All":
+                 filtered_drafts = [d for d in filtered_drafts if d.compliance.upper() == compliance_filter]
+
+             # Apply sorting
+             if sort_by == "Newest":
+                 filtered_drafts.sort(key=lambda x: x.created_at, reverse=True)
+             elif sort_by == "Oldest":
+                 filtered_drafts.sort(key=lambda x: x.created_at)
+             else:  # Title
+                 filtered_drafts.sort(key=lambda x: x.title)
+
+             # Draft cards
+             selected_id = st.session_state.get("selected_id")
+
+             for draft in filtered_drafts:
+                 # Compliance color coding
+                 compliance_color = {
+                     "pass": "🟢",
+                     "warn": "🟡",
+                     "fail": "🔴"
+                 }.get(draft.compliance, "⚪")
+
+                 # Create card
+                 with st.container(border=True):
+                     if st.button(
+                         f"{compliance_color} {draft.title}",
+                         key=f"select-{draft.id}",
+                         use_container_width=True
+                     ):
+                         st.session_state["selected_id"] = draft.id
+                         selected_id = draft.id
+
+                     st.caption(f"🎭 {draft.tone} • 📅 {draft.created_at.strftime('%m/%d %H:%M')}")
+
+                     # Preview hook
+                     if draft.hook:
+                         st.markdown(f"*{draft.hook[:80]}{'...' if len(draft.hook) > 80 else ''}*")
+
+         with col2:
+             st.subheader("✏️ Script Editor")
+
+             if not filtered_drafts:
+                 st.info("No drafts match your filters. Try adjusting the filter settings.")
+             else:
+                 # Auto-select first draft if none selected
+                 if not selected_id or selected_id not in [d.id for d in filtered_drafts]:
+                     selected_id = filtered_drafts[0].id
+                     st.session_state["selected_id"] = selected_id
+
+                 # Get current draft
+                 current = next((x for x in filtered_drafts if x.id == selected_id), filtered_drafts[0])
+
+                 # Editor tabs
+                 edit_tab1, edit_tab2, edit_tab3 = st.tabs(["📝 Edit", "🛠️ AI Tools", "📜 History"])
+
+                 with edit_tab1:
+                     # Main editing fields
+                     with st.form("edit_script"):
+                         title = st.text_input("Title", value=current.title)
+                         hook = st.text_area("Hook", value=current.hook or "", height=80)
+                         beats_text = st.text_area("Beats (one per line)", value="\n".join(current.beats or []), height=120)
+                         voiceover = st.text_area("Voiceover", value=current.voiceover or "", height=80)
+                         caption = st.text_area("Caption", value=current.caption or "", height=100)
+                         # Clean up hashtags display - remove commas, show as space-separated
+                         current_hashtags = current.hashtags or []
+                         hashtags_display = " ".join(current_hashtags) if current_hashtags else ""
+                         hashtags = st.text_input("Hashtags (space separated)", value=hashtags_display, help="Enter hashtags like: #gym #fitness #workout")
+                         cta = st.text_input("Call to Action", value=current.cta or "")
+
+                         # Submit button
+                         if st.form_submit_button("💾 Save Changes", type="primary", use_container_width=True):
+                             with get_session() as ses:
+                                 dbs = ses.get(Script, current.id)
+                                 dbs.title = title
+                                 dbs.hook = hook
+                                 dbs.beats = [x.strip() for x in beats_text.split("\n") if x.strip()]
+                                 dbs.voiceover = voiceover
+                                 dbs.caption = caption
+                                 # Parse hashtags from space-separated input
+                                 dbs.hashtags = [x.strip() for x in hashtags.split() if x.strip()]
+                                 dbs.cta = cta
+
+                                 # Update compliance
+                                 lvl, _ = score_script(blob_from(dbs.model_dump()))
+                                 dbs.compliance = lvl
+
+                                 ses.add(dbs)
+                                 ses.commit()
+
+                             st.success("✅ Script saved successfully!")
+                             time.sleep(1)
+                             st.rerun()
+
+                     # Rating widget
+                     st.markdown("### Rate this script (feeds future generations)")
+
+                     # Show current ratings if any
+                     if current.ratings_count > 0:
+                         st.info(f"📊 Current ratings ({current.ratings_count} ratings): Overall: {current.score_overall:.1f}/5.0, Hook: {current.score_hook:.1f}/5.0, Originality: {current.score_originality:.1f}/5.0")
+
+                     with st.form("rate_script"):
+                         colA, colB, colC, colD, colE = st.columns(5)
+                         overall = colA.slider("Overall", 1.0, 5.0, 4.0, 0.5)
+                         hook_s = colB.slider("Hook clarity", 1.0, 5.0, 4.0, 0.5)
+                         orig_s = colC.slider("Originality", 1.0, 5.0, 4.0, 0.5)
+                         fit_s = colD.slider("Style fit", 1.0, 5.0, 4.0, 0.5)
+                         safe_s = colE.slider("Safety", 1.0, 5.0, 4.0, 0.5)
+                         notes = st.text_input("Notes (optional)")
+
+                         if st.form_submit_button("💫 Save rating", type="secondary", use_container_width=True):
+                             add_rating(
+                                 script_id=current.id,
+                                 overall=overall, hook=hook_s, originality=orig_s,
+                                 style_fit=fit_s, safety=safe_s, notes=notes, rater="human"
+                             )
+                             st.success("Rating saved. Future generations will weigh this higher.")
+                             time.sleep(1)
+                             st.rerun()
+
+                 with edit_tab2:
+                     st.write("🤖 **AI-Powered Improvements**")
+
+                     # Quick AI actions
+                     col1, col2 = st.columns(2)
+
+                     with col1:
+                         if st.button("🛡️ Make Safer", use_container_width=True):
+                             with st.spinner("Making content safer..."):
+                                 revised = revise_for("be Instagram-compliant and safer", script_to_json_dict(current), "Remove risky phrases; keep intent and beat order.")
+                                 with get_session() as ses:
+                                     dbs = ses.get(Script, current.id)
+                                     before = dbs.caption
+                                     dbs.caption = revised.get("caption", dbs.caption)
+                                     lvl, _ = score_script(blob_from(revised))
+                                     dbs.compliance = lvl
+                                     ses.add(dbs)
+                                     ses.commit()
+                                     ses.add(Revision(script_id=dbs.id, label="Auto safer", field="caption", before=before, after=dbs.caption))
+                                     ses.commit()
+                             st.success("✅ Content made safer!")
+                             st.rerun()
+
+                         if st.button("✨ More Playful", use_container_width=True):
+                             with st.spinner("Adding playful vibes..."):
+                                 revised = revise_for("be more playful (keep safe)", script_to_json_dict(current), "Increase playful tone without adding risk.")
+                                 with get_session() as ses:
+                                     dbs = ses.get(Script, current.id)
+                                     before = dbs.hook
+                                     dbs.hook = revised.get("hook", dbs.hook)
+                                     ses.add(dbs)
+                                     ses.commit()
+                                     ses.add(Revision(script_id=dbs.id, label="More playful", field="hook", before=before, after=dbs.hook))
+                                     ses.commit()
+                             st.success("✨ Added playful energy!")
+                             st.rerun()
+
+                     with col2:
+                         if st.button("✂️ Shorter Hook", use_container_width=True):
+                             with st.spinner("Tightening hook..."):
+                                 revised = revise_for("shorten the hook to <= 8 words", script_to_json_dict(current), "Shorten only the hook, keep intent.")
+                                 with get_session() as ses:
+                                     dbs = ses.get(Script, current.id)
+                                     before = dbs.hook
+                                     dbs.hook = revised.get("hook", dbs.hook)
+                                     ses.add(dbs)
+                                     ses.commit()
+                                     ses.add(Revision(script_id=dbs.id, label="Shorter hook", field="hook", before=before, after=dbs.hook))
+                                     ses.commit()
+                             st.success("✂️ Hook tightened!")
+                             st.rerun()
+
+                         if st.button("🇬🇧 Localize (UK)", use_container_width=True):
+                             with st.spinner("Localizing content..."):
+                                 revised = revise_for("localize to UK English", script_to_json_dict(current), "Adjust spelling/phrasing to UK without changing content.")
+                                 with get_session() as ses:
+                                     dbs = ses.get(Script, current.id)
+                                     before = dbs.caption
+                                     dbs.caption = revised.get("caption", dbs.caption)
+                                     ses.add(dbs)
+                                     ses.commit()
+                                     ses.add(Revision(script_id=dbs.id, label="Localize UK", field="caption", before=before, after=dbs.caption))
+                                     ses.commit()
+                             st.success("🇬🇧 Localized to UK!")
+                             st.rerun()
+
+                     # Custom rewrite section
+                     st.markdown("---")
+                     st.write("🎯 **Custom Rewrite**")
+
+                     with st.form("custom_rewrite"):
+                         rewrite_col1, rewrite_col2 = st.columns([0.6, 0.4])
+
+                         with rewrite_col1:
+                             field = st.selectbox("Field to Edit", ["title","hook","voiceover","caption","cta","beats"])
+                             snippet = st.text_input("Exact text you want to change")
+
+                         with rewrite_col2:
+                             prompt = st.text_input("How to rewrite it")
+
+                         if st.form_submit_button("🪄 Rewrite", use_container_width=True):
+                             if snippet and prompt:
+                                 with st.spinner("AI is rewriting..."):
+                                     draft = script_to_json_dict(current)
+                                     revised = selective_rewrite(draft, field, snippet, prompt)
+                                     with get_session() as ses:
+                                         dbs = ses.get(Script, current.id)
+                                         before = getattr(dbs, field)
+                                         setattr(dbs, field, revised.get(field, before))
+                                         lvl, _ = score_script(blob_from(dbs.model_dump()))
+                                         dbs.compliance = lvl
+                                         ses.add(dbs)
+                                         ses.commit()
+                                         ses.add(Revision(script_id=dbs.id, label="Custom rewrite", field=field, before=str(before), after=str(getattr(dbs, field))))
+                                         ses.commit()
+                                 st.success("🪄 Rewrite complete!")
+                                 st.rerun()
+                             else:
+                                 st.error("Please fill in both the text and rewrite instructions")
+
+                 with edit_tab3:
+                     st.write("📜 **Revision History**")
+
+                     with get_session() as ses:
+                         revisions = list(ses.exec(
+                             select(Revision).where(Revision.script_id==current.id).order_by(Revision.created_at.desc())
+                         ))
+
+                     if not revisions:
+                         st.info("No revisions yet. Make some changes to see the history!")
+                     else:
+                         for rev in revisions:
+                             with st.expander(f"🔄 {rev.label} • {rev.field} • {rev.created_at.strftime('%m/%d %H:%M')}"):
+                                 col1, col2 = st.columns(2)
+                                 with col1:
+                                     st.write("**Before:**")
+                                     st.code(rev.before)
+                                 with col2:
+                                     st.write("**After:**")
+                                     st.code(rev.after)
+
738
+ with tab2:
739
+ st.subheader("🎯 Advanced Filters & Search")
740
+
741
+ # Advanced filtering interface
742
+ filter_col1, filter_col2, filter_col3 = st.columns(3)
743
+
744
+ with filter_col1:
745
+ creator_filter = st.selectbox("Creator", ["All"] + ["Creator A", "Emily"])
746
+ content_filter = st.selectbox("Content Type", ["All"] + ["thirst-trap", "lifestyle", "comedy", "prank", "fake-podcast", "trend-adaptation"])
747
+
748
+ with filter_col2:
749
+ compliance_filter_adv = st.selectbox("Compliance Status", ["All", "PASS", "WARN", "FAIL"])
750
+ source_filter = st.selectbox("Source", ["All", "AI Generated", "Imported", "Manual"])
751
+
752
+ with filter_col3:
753
+ date_filter = st.selectbox("Date Range", ["All Time", "Today", "This Week", "This Month"])
754
+ search_text = st.text_input("🔍 Search in titles/content")
755
+
756
+ # Apply advanced filters and show results
757
+ with get_session() as ses:
758
+ query = select(Script)
759
+
760
+ # Apply filters
761
+ if creator_filter != "All":
762
+ query = query.where(Script.creator == creator_filter)
763
+ if content_filter != "All":
764
+ query = query.where(Script.content_type == content_filter)
765
+ if compliance_filter_adv != "All":
766
+ query = query.where(Script.compliance == compliance_filter_adv.lower())
767
+
768
+ filtered_results = list(ses.exec(query))
769
+
770
+ # Search in text
771
+ if search_text:
772
+ filtered_results = [
773
+ r for r in filtered_results
774
+ if search_text.lower() in r.title.lower() or
775
+ search_text.lower() in (r.hook or "").lower() or
776
+ search_text.lower() in (r.caption or "").lower()
777
+ ]
778
+
779
+ st.write(f"**Found {len(filtered_results)} scripts**")
780
+
781
+ # Display filtered results
782
+ if filtered_results:
783
+ for script in filtered_results[:10]: # Show first 10
784
+ with st.expander(f"{script.compliance.upper()} • {script.title} • {script.creator}"):
785
+ st.write(f"**Hook:** {script.hook}")
786
+ st.write(f"**Type:** {script.content_type} • **Tone:** {script.tone}")
787
+ st.write(f"**Created:** {script.created_at.strftime('%Y-%m-%d %H:%M')}")
788
+
789
+ with tab3:
790
+ st.subheader("📊 Script Analytics")
791
+
792
+ # Get all scripts for analytics
793
+ with get_session() as ses:
794
+ all_scripts = list(ses.exec(select(Script)))
795
+
796
+ if all_scripts:
797
+ # Create metrics
798
+ col1, col2, col3, col4 = st.columns(4)
799
+
800
+ with col1:
801
+ st.metric("Total Scripts", len(all_scripts))
802
+
803
+ with col2:
804
+ ai_generated = len([s for s in all_scripts if s.source == "ai"])
805
+ st.metric("AI Generated", ai_generated)
806
+
807
+ with col3:
808
+ passed_compliance = len([s for s in all_scripts if s.compliance == "pass"])
809
+ st.metric("Compliance PASS", passed_compliance)
810
+
811
+ with col4:
812
+ unique_creators = len(set(s.creator for s in all_scripts))
813
+ st.metric("Creators", unique_creators)
814
+
815
+ # Charts and insights
816
+ st.markdown("### 📈 Content Insights")
817
+
818
+ # Compliance distribution
819
+ compliance_counts = {}
820
+ for script in all_scripts:
821
+ compliance_counts[script.compliance] = compliance_counts.get(script.compliance, 0) + 1
822
+
823
+ if compliance_counts:
824
+ st.bar_chart(compliance_counts)
825
+
826
+ # Content type distribution
827
+ type_counts = {}
828
+ for script in all_scripts:
829
+ type_counts[script.content_type] = type_counts.get(script.content_type, 0) + 1
830
+
831
+ if type_counts:
832
+ st.bar_chart(type_counts)
833
+
834
+ else:
835
+ st.info("📊 Generate some scripts to see analytics!")
836
+
837
+ # Footer
838
+ st.markdown("---")
839
+ st.markdown("""
840
+ <div style="text-align: center; color: #666; padding: 1rem;">
841
+ 🎬 AI Script Studio • Built with Streamlit & DeepSeek AI<br>
842
+ 💡 Tip: Generate scripts in batches, then refine with AI tools for best results
843
+ </div>
844
+ """, unsafe_allow_html=True)
auto_scorer.py ADDED
@@ -0,0 +1,240 @@
+ """
+ Auto-scoring system using LLM judges for script quality assessment
+ Integrates with existing DeepSeek client
+ """
+
+ import json
+ from typing import Dict, List, Tuple
+ from sqlmodel import Session, select
+ from datetime import datetime, timedelta
+
+ from models import Script, AutoScore, PolicyWeights
+ from db import get_session
+ from deepseek_client import chat
+
+ class AutoScorer:
+     def __init__(self, confidence_threshold: float = 0.7):
+         self.confidence_threshold = confidence_threshold
+
+     def score_script(self, script_data: Dict) -> Dict[str, float]:
+         """
+         Score a script using LLM judge across 5 dimensions
+         Returns scores and confidence level
+         """
+
+         system_prompt = """You are an expert Instagram content analyst. Score this script on 5 dimensions (1-5 scale):
+
+ 1. OVERALL: General quality and effectiveness (1=poor, 5=excellent)
+ 2. HOOK: How compelling is the opening (1=boring, 5=irresistible)
+ 3. ORIGINALITY: How unique/creative (1=generic, 5=highly original)
+ 4. STYLE_FIT: How well it matches the persona (1=off-brand, 5=perfect fit)
+ 5. SAFETY: Instagram compliance (1=risky, 5=completely safe)
+
+ Return ONLY a JSON object with: {"overall": X, "hook": X, "originality": X, "style_fit": X, "safety": X, "confidence": X, "reasoning": "brief explanation"}
+
+ Be consistent and objective. Confidence should be 0.1-1.0 based on how certain you are."""
+
+         user_prompt = f"""
+ Script to score:
+ Title: {script_data.get('title', '')}
+ Hook: {script_data.get('hook', '')}
+ Beats: {script_data.get('beats', [])}
+ Caption: {script_data.get('caption', '')}
+ Persona: {script_data.get('creator', '')}
+ Content Type: {script_data.get('content_type', '')}
+ Tone: {script_data.get('tone', '')}
+
+ Score this script now."""
+
+         try:
+             response = chat([
+                 {"role": "system", "content": system_prompt},
+                 {"role": "user", "content": user_prompt}
+             ], temperature=0.3)  # Low temperature for consistent scoring
+
+             # Extract JSON from response
+             start = response.find("{")
+             end = response.rfind("}") + 1
+
+             if start >= 0 and end > start:
+                 scores = json.loads(response[start:end])
+
+                 # Validate scores are in range
+                 required_keys = ['overall', 'hook', 'originality', 'style_fit', 'safety']
+                 for key in required_keys:
+                     if key not in scores or not (1 <= scores[key] <= 5):
+                         raise ValueError(f"Invalid score for {key}")
+
+                 # Ensure confidence is present and valid
+                 if 'confidence' not in scores or not (0.1 <= scores['confidence'] <= 1.0):
+                     scores['confidence'] = 0.7  # Default confidence
+
+                 return scores
+             else:
+                 raise ValueError("No valid JSON found in response")
+
+         except Exception as e:
+             print(f"Auto-scoring failed: {e}")
+             # Return neutral scores with low confidence
+             return {
+                 'overall': 3.0,
+                 'hook': 3.0,
+                 'originality': 3.0,
+                 'style_fit': 3.0,
+                 'safety': 3.0,
+                 'confidence': 0.3,
+                 'reasoning': f"Scoring failed: {str(e)}"
+             }
+
+     def score_and_store(self, script_id: int) -> AutoScore:
+         """Score a script and store in database"""
+         with get_session() as ses:
+             script = ses.get(Script, script_id)
+             if not script:
+                 raise ValueError(f"Script {script_id} not found")
+
+             # Prepare script data for scoring
+             script_data = {
+                 'title': script.title,
+                 'hook': script.hook,
+                 'beats': script.beats,
+                 'caption': script.caption,
+                 'creator': script.creator,
+                 'content_type': script.content_type,
+                 'tone': script.tone
+             }
+
+             # Get scores
+             scores = self.score_script(script_data)
+
+             # Store auto-score
+             auto_score = AutoScore(
+                 script_id=script_id,
+                 overall=scores['overall'],
+                 hook=scores['hook'],
+                 originality=scores['originality'],
+                 style_fit=scores['style_fit'],
+                 safety=scores['safety'],
+                 confidence=scores['confidence'],
+                 notes=scores.get('reasoning', '')
+             )
+
+             ses.add(auto_score)
+             ses.commit()
+             ses.refresh(auto_score)
+
+             return auto_score
+
+     def batch_score_recent(self, hours: int = 24) -> List[AutoScore]:
+         """Score all recently generated scripts that haven't been auto-scored"""
+         cutoff = datetime.utcnow() - timedelta(hours=hours)
+
+         with get_session() as ses:
+             # Find scripts without auto-scores
+             recent_scripts = ses.exec(
+                 select(Script).where(
+                     Script.created_at >= cutoff,
+                     Script.source == "ai"  # Only score AI-generated scripts
+                 )
+             ).all()
+
+             # Filter out already scored
+             unscored = []
+             for script in recent_scripts:
+                 existing_score = ses.exec(
+                     select(AutoScore).where(AutoScore.script_id == script.id)
+                 ).first()
+                 if not existing_score:
+                     unscored.append(script)
+
+         print(f"Auto-scoring {len(unscored)} recent scripts...")
+
+         results = []
+         for script in unscored:
+             try:
+                 auto_score = self.score_and_store(script.id)
+                 results.append(auto_score)
+                 print(f"Scored script {script.id}: {auto_score.overall:.1f}/5.0")
+             except Exception as e:
+                 print(f"Failed to score script {script.id}: {e}")
+
+         return results
+
+ class ScriptReranker:
+     """Rerank generated scripts using composite scoring"""
+
+     def __init__(self, weights: Dict[str, float] = None):
+         self.weights = weights or {
+             'overall': 0.35,
+             'hook': 0.20,
+             'originality': 0.15,
+             'style_fit': 0.15,
+             'safety': 0.15
+         }
+
+     def rerank_scripts(self, script_ids: List[int]) -> List[Tuple[int, float]]:
+         """
+         Rerank scripts by composite score
+         Returns list of (script_id, composite_score) sorted by score descending
+         """
+
+         results = []
+
+         with get_session() as ses:
+             for script_id in script_ids:
+                 # Try to get auto-score first
+                 auto_score = ses.exec(
+                     select(AutoScore).where(AutoScore.script_id == script_id)
+                 ).first()
+
+                 if auto_score and auto_score.confidence >= 0.5:
+                     # Use auto-scores
+                     composite = (
+                         self.weights['overall'] * auto_score.overall +
+                         self.weights['hook'] * auto_score.hook +
+                         self.weights['originality'] * auto_score.originality +
+                         self.weights['style_fit'] * auto_score.style_fit +
+                         self.weights['safety'] * auto_score.safety
+                     )
+                 else:
+                     # Fall back to human ratings if available
+                     script = ses.get(Script, script_id)
+                     if script and script.ratings_count > 0:
+                         composite = (
+                             self.weights['overall'] * (script.score_overall or 3.0) +
+                             self.weights['hook'] * (script.score_hook or 3.0) +
+                             self.weights['originality'] * (script.score_originality or 3.0) +
+                             self.weights['style_fit'] * (script.score_style_fit or 3.0) +
+                             self.weights['safety'] * (script.score_safety or 3.0)
+                         )
+                     else:
+                         # Default neutral score
+                         composite = 3.0
+
+                 results.append((script_id, composite))
+
+         # Sort by composite score descending
+         results.sort(key=lambda x: x[1], reverse=True)
+         return results
+
+     def get_best_script(self, script_ids: List[int]) -> int:
+         """Get the ID of the highest-scoring script"""
+         ranked = self.rerank_scripts(script_ids)
+         return ranked[0][0] if ranked else script_ids[0]
+
+ def auto_score_pipeline():
+     """Main pipeline to auto-score recent scripts"""
+     scorer = AutoScorer()
+
+     # Score recent scripts
+     new_scores = scorer.batch_score_recent(hours=24)
+
+     if new_scores:
+         print(f"\n📊 Auto-scoring Results ({len(new_scores)} scripts):")
+         for score in new_scores:
+             print(f"Script {score.script_id}: {score.overall:.1f}/5.0 (confidence: {score.confidence:.2f})")
+     else:
+         print("No new scripts to score.")
+
+ if __name__ == "__main__":
+     auto_score_pipeline()
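`ScriptReranker`'s composite is a plain weighted sum over the five scoring dimensions. A minimal standalone sketch of that arithmetic, using the same default weights (the sample scores are made up; since the weights sum to 1.0, the composite stays on the same 1-5 scale as its inputs):

```python
# Default dimension weights from ScriptReranker
WEIGHTS = {"overall": 0.35, "hook": 0.20, "originality": 0.15, "style_fit": 0.15, "safety": 0.15}

def composite(scores: dict) -> float:
    # Weighted sum over the five dimensions; weights sum to 1.0,
    # so the result is a convex combination of the inputs
    return sum(w * scores[k] for k, w in WEIGHTS.items())

sample = {"overall": 4.0, "hook": 5.0, "originality": 3.0, "style_fit": 4.0, "safety": 5.0}
# 0.35*4 + 0.20*5 + 0.15*3 + 0.15*4 + 0.15*5 = 4.2
```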
bandit_learner.py ADDED
@@ -0,0 +1,330 @@
+ """
+ Multi-armed bandit learning system for optimizing generation policies
+ Learns which retrieval weights and generation parameters work best for each persona/content_type
+ """
+
+ import numpy as np
+ import random
+ from typing import Dict, List, Tuple, Optional
+ from dataclasses import dataclass
+ from datetime import datetime, timedelta
+ from sqlmodel import Session, select
+
+ from models import Script, AutoScore, PolicyWeights, Rating
+ from db import get_session
+
+ @dataclass
+ class BanditArm:
+     """Represents one configuration of parameters to test"""
+     name: str
+     semantic_weight: float
+     bm25_weight: float
+     quality_weight: float
+     freshness_weight: float
+     temp_low: float
+     temp_mid: float
+     temp_high: float
+
+     def __post_init__(self):
+         # Ensure weights sum to 1.0
+         total = self.semantic_weight + self.bm25_weight + self.quality_weight + self.freshness_weight
+         if total != 1.0:
+             self.semantic_weight /= total
+             self.bm25_weight /= total
+             self.quality_weight /= total
+             self.freshness_weight /= total
+
+ class PolicyBandit:
+     """Multi-armed bandit for learning optimal generation policies"""
+
+     def __init__(self, epsilon: float = 0.15, decay_rate: float = 0.99):
+         self.epsilon = epsilon  # Exploration rate
+         self.decay_rate = decay_rate  # Epsilon decay over time
+         self.min_epsilon = 0.05
+
+         # Define arms (different parameter configurations)
+         self.arms = [
+             # Current default
+             BanditArm("balanced", 0.45, 0.25, 0.20, 0.10, 0.4, 0.7, 0.95),
+
+             # Semantic-heavy (focus on meaning)
+             BanditArm("semantic_heavy", 0.60, 0.15, 0.15, 0.10, 0.4, 0.7, 0.95),
+
+             # Quality-focused (use only best examples)
+             BanditArm("quality_focused", 0.35, 0.20, 0.35, 0.10, 0.3, 0.6, 0.85),
+
+             # Fresh-focused (prioritize recent trends)
+             BanditArm("fresh_focused", 0.40, 0.20, 0.15, 0.25, 0.5, 0.8, 1.0),
+
+             # Conservative (lower temperatures)
+             BanditArm("conservative", 0.45, 0.25, 0.20, 0.10, 0.3, 0.5, 0.7),
+
+             # Creative (higher temperatures)
+             BanditArm("creative", 0.45, 0.25, 0.20, 0.10, 0.6, 0.9, 1.2),
+
+             # Text-match heavy (traditional keyword matching)
+             BanditArm("text_heavy", 0.25, 0.45, 0.20, 0.10, 0.4, 0.7, 0.95)
+         ]
+
+         # Initialize arm statistics
+         self.arm_counts = {arm.name: 0 for arm in self.arms}
+         self.arm_rewards = {arm.name: 0.0 for arm in self.arms}
+
+     def select_arm(self, persona: str, content_type: str) -> BanditArm:
+         """Select arm using epsilon-greedy with UCB bias"""
+
+         # Load existing policy weights to initialize arm stats
+         self._load_arm_stats(persona, content_type)
+
+         # Decay epsilon over time
+         current_epsilon = max(self.min_epsilon, self.epsilon * (self.decay_rate ** sum(self.arm_counts.values())))
+
+         if random.random() < current_epsilon:
+             # Explore: random arm
+             selected_arm = random.choice(self.arms)
+             print(f"🔄 Exploring with {selected_arm.name} policy (ε={current_epsilon:.3f})")
+         else:
+             # Exploit: best arm with UCB confidence bounds
+             selected_arm = self._select_best_arm_ucb()
+             print(f"⭐ Exploiting with {selected_arm.name} policy")
+
+         return selected_arm
+
+     def _select_best_arm_ucb(self) -> BanditArm:
+         """Select arm using Upper Confidence Bound"""
+         total_counts = sum(self.arm_counts.values())
+         if total_counts == 0:
+             return self.arms[0]  # Default to first arm
+
+         best_arm = None
+         best_score = float('-inf')
+
+         for arm in self.arms:
+             count = self.arm_counts[arm.name]
+             if count == 0:
+                 return arm  # Always try unplayed arms first
+
+             # UCB score = average reward + confidence interval
+             avg_reward = self.arm_rewards[arm.name] / count
+             confidence = np.sqrt(2 * np.log(total_counts) / count)
+             ucb_score = avg_reward + confidence
+
+             if ucb_score > best_score:
+                 best_score = ucb_score
+                 best_arm = arm
+
+         return best_arm or self.arms[0]
+
+     def _load_arm_stats(self, persona: str, content_type: str):
+         """Load historical performance for this persona/content_type"""
+         with get_session() as ses:
+             policy = ses.exec(
+                 select(PolicyWeights).where(
+                     PolicyWeights.persona == persona,
+                     PolicyWeights.content_type == content_type
+                 )
+             ).first()
+
+             if policy:
+                 # Find matching arm and update stats
+                 for arm in self.arms:
+                     if self._arm_matches_policy(arm, policy):
+                         self.arm_counts[arm.name] = policy.total_generations
+                         self.arm_rewards[arm.name] = policy.success_rate * policy.total_generations
+                         break
+
+     def _arm_matches_policy(self, arm: BanditArm, policy: PolicyWeights, tolerance: float = 0.05) -> bool:
+         """Check if an arm matches the stored policy within tolerance"""
+         return (
+             abs(arm.semantic_weight - policy.semantic_weight) < tolerance and
+             abs(arm.bm25_weight - policy.bm25_weight) < tolerance and
+             abs(arm.quality_weight - policy.quality_weight) < tolerance and
+             abs(arm.freshness_weight - policy.freshness_weight) < tolerance
+         )
+
+     def update_reward(self,
+                       arm: BanditArm,
+                       reward: float,
+                       persona: str,
+                       content_type: str,
+                       script_id: int):
+         """Update arm performance with new reward signal"""
+
+         # Update in-memory stats
+         self.arm_counts[arm.name] += 1
+         self.arm_rewards[arm.name] += reward
+
+         # Update database policy
+         self._update_policy_weights(arm, reward, persona, content_type)
+
+         print(f"📈 Updated {arm.name}: reward={reward:.3f}, avg={self.arm_rewards[arm.name]/self.arm_counts[arm.name]:.3f}")
+
+     def _update_policy_weights(self,
+                                arm: BanditArm,
+                                reward: float,
+                                persona: str,
+                                content_type: str):
+         """Update policy weights in database"""
+         with get_session() as ses:
+             policy = ses.exec(
+                 select(PolicyWeights).where(
+                     PolicyWeights.persona == persona,
+                     PolicyWeights.content_type == content_type
+                 )
+             ).first()
+
+             if not policy:
+                 # Create new policy
+                 policy = PolicyWeights(
+                     persona=persona,
+                     content_type=content_type,
+                     semantic_weight=arm.semantic_weight,
+                     bm25_weight=arm.bm25_weight,
+                     quality_weight=arm.quality_weight,
+                     freshness_weight=arm.freshness_weight,
+                     temp_low=arm.temp_low,
+                     temp_mid=arm.temp_mid,
+                     temp_high=arm.temp_high,
+                     total_generations=1,
+                     success_rate=reward
+                 )
+             else:
+                 # Update existing policy with exponential moving average
+                 alpha = 0.1  # Learning rate
+                 policy.success_rate = (1 - alpha) * policy.success_rate + alpha * reward
+                 policy.total_generations += 1
+
+                 # If this arm is performing well, shift weights toward it
+                 if reward > policy.success_rate:
+                     shift = 0.05  # Small shift toward better performing arm
+                     policy.semantic_weight = (1 - shift) * policy.semantic_weight + shift * arm.semantic_weight
+                     policy.bm25_weight = (1 - shift) * policy.bm25_weight + shift * arm.bm25_weight
+                     policy.quality_weight = (1 - shift) * policy.quality_weight + shift * arm.quality_weight
+                     policy.freshness_weight = (1 - shift) * policy.freshness_weight + shift * arm.freshness_weight
+
+                     policy.temp_low = (1 - shift) * policy.temp_low + shift * arm.temp_low
+                     policy.temp_mid = (1 - shift) * policy.temp_mid + shift * arm.temp_mid
+                     policy.temp_high = (1 - shift) * policy.temp_high + shift * arm.temp_high
+
+             policy.updated_at = datetime.utcnow()
+             ses.add(policy)
+             ses.commit()
+
+     def calculate_reward(self, script_id: int) -> float:
+         """
+         Calculate reward signal from script performance
+         Combines auto-scores and human ratings when available
+         """
+         reward_components = []
+
+         with get_session() as ses:
+             # Get auto-score
+             auto_score = ses.exec(
+                 select(AutoScore).where(AutoScore.script_id == script_id)
+             ).first()
+
+             if auto_score and auto_score.confidence > 0.5:
+                 # Weighted composite of auto-scores
+                 auto_reward = (
+                     0.35 * auto_score.overall +
+                     0.20 * auto_score.hook +
+                     0.15 * auto_score.originality +
+                     0.15 * auto_score.style_fit +
+                     0.15 * auto_score.safety
+                 ) / 5.0  # Normalize to 0-1
+
+                 reward_components.append(('auto', auto_reward, auto_score.confidence))
+
+             # Get human ratings
+             script = ses.get(Script, script_id)
+             if script and script.ratings_count > 0:
+                 human_reward = script.score_overall / 5.0  # Normalize to 0-1
+                 confidence = min(1.0, script.ratings_count / 3.0)  # More ratings = higher confidence
+                 reward_components.append(('human', human_reward, confidence))
+
+         if not reward_components:
+             return 0.5  # Neutral reward if no scores available
+
+         # Weighted average of reward components by confidence
+         total_weight = sum(confidence for _, _, confidence in reward_components)
+         weighted_reward = sum(
+             reward * confidence for _, reward, confidence in reward_components
+         ) / total_weight
+
+         return weighted_reward
+
+ class PolicyLearner:
+     """Main interface for policy learning"""
+
+     def __init__(self):
+         self.bandit = PolicyBandit()
+
+     def learn_from_generation_batch(self,
+                                     persona: str,
+                                     content_type: str,
+                                     generated_script_ids: List[int],
+                                     selected_arm: BanditArm):
+         """Learn from a batch of generated scripts"""
+
+         if not generated_script_ids:
+             return
+
+         # Calculate average reward from the batch
+         rewards = [self.bandit.calculate_reward(sid) for sid in generated_script_ids]
+         avg_reward = sum(rewards) / len(rewards)
+
+         # Update bandit with average performance
+         self.bandit.update_reward(
+             selected_arm,
+             avg_reward,
+             persona,
+             content_type,
+             generated_script_ids[0]  # Representative script ID
+         )
+
+         print(f"🧠 Policy learning: {persona}/{content_type} → {avg_reward:.3f} reward")
+
+     def get_optimized_policy(self, persona: str, content_type: str) -> BanditArm:
+         """Get the current best policy for this persona/content_type"""
+         return self.bandit.select_arm(persona, content_type)
+
+     def run_learning_cycle(self):
+         """Run a learning cycle on recent generations"""
+         print("🔄 Starting policy learning cycle...")
+
+         # Find recent AI-generated scripts by persona/content_type
+         cutoff = datetime.utcnow() - timedelta(hours=24)
+
+         with get_session() as ses:
+             recent_scripts = list(ses.exec(
+                 select(Script).where(
+                     Script.created_at >= cutoff,
+                     Script.source == "ai"
+                 )
+             ))
+
+         # Group by persona/content_type
+         groups = {}
+         for script in recent_scripts:
+             key = (script.creator, script.content_type)
+             if key not in groups:
+                 groups[key] = []
+             groups[key].append(script.id)
+
+         # Learn from each group
+         for (persona, content_type), script_ids in groups.items():
+             if len(script_ids) >= 3:  # Need minimum batch size
+                 # For now, assume they used the balanced policy
+                 # In practice, you'd track which policy was used for each generation
+                 balanced_arm = next(arm for arm in self.bandit.arms if arm.name == "balanced")
+                 self.learn_from_generation_batch(persona, content_type, script_ids, balanced_arm)
+
+ def run_policy_learning():
+     """Main entry point for policy learning"""
+     learner = PolicyLearner()
+     learner.run_learning_cycle()
+
+ if __name__ == "__main__":
+     run_policy_learning()
compliance.py ADDED
@@ -0,0 +1,26 @@
+ import re
+
+ BANNED = {r"\b(naked|explicit|porn|onlyfans\.com)\b"}
+ CAUTION = {r"\b(hot|naughty|spicy|thirsty)\b"}
+
+ def compliance_level(text: str):
+     low = text.lower()
+     for pat in BANNED:
+         if re.search(pat, low):
+             return "fail", ["banned phrase"]
+     reasons = []
+     for pat in CAUTION:
+         if re.search(pat, low):
+             reasons.append("caution phrase")
+     return ("warn" if reasons else "pass"), reasons
+
+ def score_script(blob: str):
+     return compliance_level(blob)
+
+ def blob_from(script: dict) -> str:
+     parts = [
+         script.get("title",""), script.get("hook",""),
+         " ".join(script.get("beats",[])),
+         script.get("voiceover",""), script.get("caption",""), script.get("cta","")
+     ]
+     return " ".join(parts)
db.py ADDED
@@ -0,0 +1,248 @@
1
+ # db.py
2
+ import os, json, random
3
+ from contextlib import contextmanager
4
+ from typing import List, Iterable, Tuple, Optional
5
+ from sqlmodel import SQLModel, create_engine, Session, select
6
+ from datetime import datetime
7
+
8
+ # ---- Configure DB ----
9
+ DB_URL = os.environ.get("DB_URL", "sqlite:///studio.db")
10
+ engine = create_engine(DB_URL, echo=False)
11
+
12
+ # ---- Models ----
13
+ from models import Script, Rating # make sure Script has: is_reference: bool, plus the other fields
14
+
15
+ # ---- Init / Session ----
16
+ def init_db() -> None:
17
+ SQLModel.metadata.create_all(engine)
18
+
19
+ @contextmanager
20
+ def get_session():
21
+ with Session(engine) as ses:
22
+ yield ses
23
+
24
+ # ---- Helpers for import ----
25
+
26
+ def _payload_from_jsonl_row(row: dict) -> Tuple[dict, str, str]:
27
+ """
28
+ Map a JSONL row (the file I generated for you) into Script columns.
29
+ Returns (payload, dedupe_key_title, dedupe_key_creator).
30
+ You can also add 'external_id' to Script model and dedupe on that.
31
+ """
32
+ # Prefer using the JSON 'id' as an external identifier:
33
+ external_id = row.get("id", "")
34
+
35
+ # Tone could be an array; flatten for now
36
+ tone = ", ".join(row.get("tonality", [])) or "playful"
37
+
38
+ # Compact caption: use caption options line as a quick reference
39
+ caption = " | ".join(row.get("caption_options", []))[:180]
40
+
41
+ payload = dict(
42
+ # core identity
43
+ creator=row.get("model_name", "Unknown"),
44
+ content_type=(row.get("video_type", "") or "talking_style").lower(),
45
+ tone=tone,
46
+ title=external_id or row.get("theme", "") or "Imported Script",
47
+ hook=row.get("video_hook") or "",
48
+
49
+ # structured fields
50
+ beats=row.get("storyboard", []) or [],
51
+ voiceover="",
52
+ caption=caption,
53
+ hashtags=row.get("hashtags", []) or [],
54
+ cta="",
55
+
56
+ # flags
57
+ source="import",
58
+ is_reference=True, # mark imported examples as references
59
+ compliance="pass", # we'll score again after save if you want
60
+ )
61
+ return payload, payload["title"], payload["creator"]
62
+
63
+ def _score_and_update_compliance(s: Script) -> None:
64
+ """Optional: score compliance using your simple rule-checker."""
65
+ try:
66
+ from compliance import blob_from, score_script
67
+ lvl, _ = score_script(blob_from(s.dict()))
68
+ s.compliance = lvl
69
+ except Exception:
70
+ # If no compliance module or error, keep default
71
+ pass
72
+
73
+ def _iter_jsonl(path: str) -> Iterable[dict]:
74
+ with open(path, "r", encoding="utf-8") as f:
75
+ for line in f:
76
+ line = line.strip()
77
+ if not line:
78
+ continue
79
+ yield json.loads(line)
80
+
81
+ # ---- Public: Importer ----
82
+ def import_jsonl(path: str) -> int:
83
+ """
84
+ Import (upsert) scripts from a JSONL file produced earlier.
85
+ Dedupe by (creator, title). Returns count of upserted rows.
86
+ """
87
+ init_db()
88
+ count = 0
89
+ with get_session() as ses:
90
+ for row in _iter_jsonl(path):
91
+ payload, key_title, key_creator = _payload_from_jsonl_row(row)
92
+
93
+ existing = ses.exec(
94
+ select(Script).where(
95
+ Script.title == key_title,
96
+ Script.creator == key_creator
97
+ )
98
+ ).first()
99
+
100
+ if existing:
101
+ # Update all fields
102
+ for k, v in payload.items():
103
+ setattr(existing, k, v)
104
+ _score_and_update_compliance(existing)
105
+ existing.updated_at = datetime.utcnow()
106
+ ses.add(existing)
107
+ else:
108
+ obj = Script(**payload)
109
+ _score_and_update_compliance(obj)
110
+ ses.add(obj)
111
+
112
+ count += 1
113
+ ses.commit()
114
+ return count
+
+ # ---- Ratings API ----
+ def add_rating(script_id: int,
+                overall: float,
+                hook: Optional[float] = None,
+                originality: Optional[float] = None,
+                style_fit: Optional[float] = None,
+                safety: Optional[float] = None,
+                notes: Optional[str] = None,
+                rater: str = "human") -> None:
+     with get_session() as ses:
+         # store the rating event
+         ses.add(Rating(
+             script_id=script_id, overall=overall, hook=hook,
+             originality=originality, style_fit=style_fit, safety=safety,
+             notes=notes, rater=rater
+         ))
+         ses.commit()
+         # recompute cached aggregates on Script
+         _recompute_script_aggregates(ses, script_id)
+         ses.commit()
+
+ def _recompute_script_aggregates(ses: Session, script_id: int) -> None:
+     rows = list(ses.exec(select(Rating).where(Rating.script_id == script_id)))
+     if not rows:
+         return
+
+     def avg(field):
+         vals = [getattr(r, field) for r in rows if getattr(r, field) is not None]
+         return round(sum(vals) / len(vals), 3) if vals else None
+
+     s: Script = ses.get(Script, script_id)
+     s.score_overall = avg("overall")
+     s.score_hook = avg("hook")
+     s.score_originality = avg("originality")
+     s.score_style_fit = avg("style_fit")
+     s.score_safety = avg("safety")
+     s.ratings_count = len(rows)
+     s.updated_at = datetime.utcnow()
+     ses.add(s)
+
+ # ---- Public: Reference retrieval for generation ----
+ def extract_snippets_from_script(s: Script, max_lines: int = 3) -> List[str]:
+     items: List[str] = []
+     if s.hook:
+         items.append(s.hook.strip())
+     if s.beats:
+         items.extend([b.strip() for b in s.beats[:2]])  # first 1–2 beats
+     if s.caption:
+         items.append(s.caption.strip()[:120])
+     # dedupe while preserving order
+     seen, uniq = set(), []
+     for it in items:
+         if it and it not in seen:
+             uniq.append(it)
+             seen.add(it)
+     return uniq[:max_lines]
+
+ def get_library_refs(creator: str, content_type: str, k: int = 6) -> List[str]:
+     with get_session() as ses:
+         rows = list(ses.exec(
+             select(Script)
+             .where(
+                 Script.creator == creator,
+                 Script.content_type == content_type,
+                 Script.is_reference == True,
+                 Script.compliance != "fail"
+             )
+             .order_by(Script.created_at.desc())
+         ))[:k]
+
+         snippets: List[str] = []
+         for r in rows:
+             snippets.extend(extract_snippets_from_script(r))
+         # final dedupe
+         seen, uniq = set(), []
+         for s in snippets:
+             if s not in seen:
+                 uniq.append(s)
+                 seen.add(s)
+         return uniq[:8]
+
+ # ---- HYBRID reference retrieval ----
+ def get_hybrid_refs(creator: str, content_type: str, k: int = 6,
+                     top_n: int = 3, explore_n: int = 2, newest_n: int = 1) -> List[str]:
+     """
+     Mix of:
+       - top_n best-scored references (exploit)
+       - explore_n random references (explore)
+       - newest_n most recent references (freshness)
+     Returns a flattened snippet list (capped at ~8 to keep the prompt lean).
+     """
+     with get_session() as ses:
+         all_refs = list(ses.exec(
+             select(Script).where(
+                 Script.creator == creator,
+                 Script.content_type == content_type,
+                 Script.is_reference == True,
+                 Script.compliance != "fail"
+             )
+         ))
+
+         if not all_refs:
+             return []
+
+         # sort by score_overall (fallback to 0) and pick top_n
+         scored = sorted(all_refs, key=lambda s: (s.score_overall or 0.0), reverse=True)
+         best = scored[:top_n]
+
+         # newest by created_at
+         newest = sorted(all_refs, key=lambda s: s.created_at, reverse=True)[:newest_n]
+
+         # explore = random sample from the remainder
+         remainder = [r for r in all_refs if r not in best and r not in newest]
+         explore = random.sample(remainder, min(explore_n, len(remainder))) if remainder else []
+
+         # merge (preserve order, dedupe by id)
+         chosen_scripts = []
+         seen_ids = set()
+         for bucket in (best, explore, newest):
+             for s in bucket:
+                 if s.id not in seen_ids:
+                     chosen_scripts.append(s)
+                     seen_ids.add(s.id)
+
+         # cut to k scripts
+         chosen_scripts = chosen_scripts[:k]
+
+         # flatten snippets and cap to keep the prompt compact
+         snippets: List[str] = []
+         for s in chosen_scripts:
+             snippets.extend(extract_snippets_from_script(s))
+         # dedupe again and cap at ~8 lines
+         seen, out = set(), []
+         for sn in snippets:
+             if sn not in seen:
+                 out.append(sn)
+                 seen.add(sn)
+         return out[:8]
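The exploit/explore/freshness mix in `get_hybrid_refs` can be exercised without a database. A small sketch with stand-in objects (`pick_refs` and the simplified `score`/`created` fields are hypothetical stand-ins for the Script columns):

```python
import random
from types import SimpleNamespace

def pick_refs(all_refs, k=6, top_n=3, explore_n=2, newest_n=1, rng=random):
    # exploit: best by score (missing scores count as 0)
    best = sorted(all_refs, key=lambda s: s.score or 0.0, reverse=True)[:top_n]
    # freshness: most recently created
    newest = sorted(all_refs, key=lambda s: s.created, reverse=True)[:newest_n]
    # explore: random sample from whatever is left over
    remainder = [r for r in all_refs if r not in best and r not in newest]
    explore = rng.sample(remainder, min(explore_n, len(remainder))) if remainder else []
    # merge buckets in priority order, deduping by id
    chosen, seen = [], set()
    for bucket in (best, explore, newest):
        for s in bucket:
            if s.id not in seen:
                chosen.append(s)
                seen.add(s.id)
    return chosen[:k]

refs = [SimpleNamespace(id=i, score=sc, created=i)
        for i, sc in enumerate([4.5, None, 3.0, 4.9, 2.5])]
chosen = pick_refs(refs, top_n=2, explore_n=1, newest_n=1)
# ids 3 and 0 are the two best-scored; id 4 is the newest; one of 1/2 is random
print([s.id for s in chosen])
```

The bucket priority (best, then explore, then newest) means a script that is both top-scored and newest is only counted once, exactly as in the function above.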
deepseek_client.py ADDED
@@ -0,0 +1,59 @@
+ import os, requests, json
+ import streamlit as st
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ # Get API key from Streamlit secrets or environment
+ def get_api_key():
+     if hasattr(st, 'secrets') and "DEEPSEEK_API_KEY" in st.secrets:
+         return st.secrets["DEEPSEEK_API_KEY"]
+     return os.getenv("DEEPSEEK_API_KEY")
+
+ DEEPSEEK_API_KEY = get_api_key()
+ BASE = os.getenv("DEEPSEEK_BASE_URL", "https://api.deepseek.com")
+
+ def chat(messages, model="deepseek-chat", temperature=0.9):
+     headers = {"Authorization": f"Bearer {DEEPSEEK_API_KEY}", "Content-Type": "application/json"}
+     payload = {"model": model, "messages": messages, "temperature": temperature}
+     r = requests.post(f"{BASE}/chat/completions", headers=headers, data=json.dumps(payload), timeout=60)
+     r.raise_for_status()
+     return r.json()["choices"][0]["message"]["content"]
+
+ def generate_scripts(persona, boundaries, content_type, tone, refs, n=6):
+     system = (
+         "You write Instagram-compliant, suggestive-but-not-explicit Reels briefs. "
+         "Use tight hooks, concrete visual beats, clear CTAs. Avoid explicit sexual terms. "
+         "Return ONLY JSON: an array of length N, each with {title,hook,beats,voiceover,caption,hashtags,cta}."
+     )
+     user = f"""
+ Persona: {persona}
+ Boundaries: {boundaries}
+ Content type: {content_type} | Tone: {tone} | Duration: 15–25s
+ Reference snippets (inspire, don't copy):
+ {chr(10).join(f"- {r}" for r in refs)}
+
+ N = {n}
+ JSON array ONLY.
+ """
+     out = chat([{"role": "system", "content": system}, {"role": "user", "content": user}])
+     # Be lenient if the model wraps the JSON in extra text
+     start = out.find("[")
+     end = out.rfind("]")
+     return json.loads(out[start:end+1])
+
+ def revise_for(prompt_label, draft: dict, guidance: str):
+     system = f"You revise scripts to {prompt_label}. Keep intent; return ONLY JSON with the same schema."
+     user = json.dumps({"draft": draft, "guidance": guidance})
+     out = chat([{"role": "system", "content": system}, {"role": "user", "content": user}], temperature=0.6)
+     start = out.find("{")
+     end = out.rfind("}")
+     return json.loads(out[start:end+1])
+
+ def selective_rewrite(draft: dict, field: str, snippet: str, prompt: str):
+     system = "You rewrite only the targeted snippet inside the specified field. Keep style. Return ONLY JSON."
+     user = json.dumps({"field": field, "snippet": snippet, "prompt": prompt, "draft": draft})
+     out = chat([{"role": "system", "content": system}, {"role": "user", "content": user}], temperature=0.7)
+     start = out.find("{")
+     end = out.rfind("}")
+     return json.loads(out[start:end+1])
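All three functions rely on the same lenient find/rfind slice to dig the JSON out of the model reply; when no bracket is present, `find` returns -1 and the slice fails in a confusing way. A hardened version of that extraction, as a suggested standalone helper rather than existing code:

```python
import json

def extract_json_array(text: str):
    """Pull the first [...] span out of a model reply and parse it.

    Mirrors the lenient find/rfind pattern used above, but raises a
    clear error instead of slicing with -1 when no array is present.
    """
    start = text.find("[")
    end = text.rfind("]")
    if start == -1 or end <= start:
        raise ValueError("no JSON array found in model output")
    return json.loads(text[start:end + 1])

reply = 'Sure! Here are the briefs:\n[{"title": "Sunrise stretch"}]\nDone.'
print(extract_json_array(reply))  # → [{'title': 'Sunrise stretch'}]
```

The same shape works for the `{...}` object case in `revise_for` and `selective_rewrite` by swapping the bracket characters.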
models.py ADDED
@@ -0,0 +1,103 @@
+ from datetime import datetime
+ from typing import List, Optional
+ from sqlmodel import SQLModel, Field, Column
+ from sqlalchemy import JSON
+
+ class Script(SQLModel, table=True, extend_existing=True):
+     id: Optional[int] = Field(default=None, primary_key=True)
+     creator: str
+     content_type: str
+     tone: str
+     title: str
+     hook: str
+     beats: List[str] = Field(sa_column=Column(JSON))
+     voiceover: str
+     caption: str
+     hashtags: List[str] = Field(sa_column=Column(JSON))
+     cta: str
+     compliance: str = "pass"    # pass | warn | fail
+     source: str = "ai"          # ai | manual | import
+     is_reference: bool = False  # mark imported examples as references
+
+     # --- NEW: cached aggregates from ratings (all optional) ---
+     score_overall: Optional[float] = None      # 1..5 (avg)
+     score_hook: Optional[float] = None         # 1..5 (avg)
+     score_originality: Optional[float] = None  # 1..5 (avg)
+     score_style_fit: Optional[float] = None    # 1..5 (avg)
+     score_safety: Optional[float] = None       # 1..5 (avg)
+     ratings_count: int = 0
+
+     created_at: datetime = Field(default_factory=datetime.utcnow)
+     updated_at: datetime = Field(default_factory=datetime.utcnow)
+
+ class Revision(SQLModel, table=True, extend_existing=True):
+     id: Optional[int] = Field(default=None, primary_key=True)
+     script_id: int = Field(index=True)
+     label: str
+     field: str
+     before: str
+     after: str
+     created_at: datetime = Field(default_factory=datetime.utcnow)
+
+ # NEW: store every rating event so you keep history
+ class Rating(SQLModel, table=True, extend_existing=True):
+     id: Optional[int] = Field(default=None, primary_key=True)
+     script_id: int = Field(index=True)
+     rater: str = "human"  # optional: store user/email
+     overall: float        # 1..5
+     hook: Optional[float] = None
+     originality: Optional[float] = None
+     style_fit: Optional[float] = None
+     safety: Optional[float] = None
+     notes: Optional[str] = None
+     created_at: datetime = Field(default_factory=datetime.utcnow)
+
+ # RAG enhancement models
+ class Embedding(SQLModel, table=True, extend_existing=True):
+     id: Optional[int] = Field(default=None, primary_key=True)
+     script_id: int = Field(index=True)
+     part: str = Field(index=True)  # 'full', 'hook', 'beats', 'caption'
+     vector: List[float] = Field(sa_column=Column(JSON))
+     meta: dict = Field(sa_column=Column(JSON))
+     created_at: datetime = Field(default_factory=datetime.utcnow)
+
+ class AutoScore(SQLModel, table=True, extend_existing=True):
+     id: Optional[int] = Field(default=None, primary_key=True)
+     script_id: int = Field(index=True)
+     overall: float
+     hook: float
+     originality: float
+     style_fit: float
+     safety: float
+     confidence: float = 0.8  # LLM judge confidence
+     notes: Optional[str] = None
+     created_at: datetime = Field(default_factory=datetime.utcnow)
+
+ class PolicyWeights(SQLModel, table=True, extend_existing=True):
+     id: Optional[int] = Field(default=None, primary_key=True)
+     persona: str = Field(index=True)
+     content_type: str = Field(index=True)
+     # Retrieval weights
+     semantic_weight: float = 0.45
+     bm25_weight: float = 0.25
+     quality_weight: float = 0.20
+     freshness_weight: float = 0.10
+     # Generation params
+     temp_low: float = 0.4
+     temp_mid: float = 0.7
+     temp_high: float = 0.95
+     # Performance tracking
+     success_rate: float = 0.0
+     total_generations: int = 0
+     updated_at: datetime = Field(default_factory=datetime.utcnow)
+
+ class StyleCard(SQLModel, table=True, extend_existing=True):
+     id: Optional[int] = Field(default=None, primary_key=True)
+     persona: str = Field(index=True)
+     content_type: str = Field(index=True)
+     exemplar_hooks: List[str] = Field(sa_column=Column(JSON))
+     exemplar_beats: List[str] = Field(sa_column=Column(JSON))
+     exemplar_captions: List[str] = Field(sa_column=Column(JSON))
+     negative_patterns: List[str] = Field(sa_column=Column(JSON))
+     constraints: dict = Field(sa_column=Column(JSON))
+     updated_at: datetime = Field(default_factory=datetime.utcnow)
packages.txt ADDED
@@ -0,0 +1 @@
+ ffmpeg
rag_integration.py ADDED
@@ -0,0 +1,350 @@
+ """
+ Integration layer between the existing system and the new RAG capabilities.
+ Shows how to plug the enhanced system into the current workflow.
+ """
+
+ from typing import List, Dict, Any, Optional
+ import json
+ from sqlmodel import Session
+ from datetime import datetime
+
+ from models import Script, Embedding, AutoScore, PolicyWeights
+ from db import get_session, init_db
+ from deepseek_client import chat, get_api_key
+ from rag_retrieval import RAGRetriever
+ from auto_scorer import AutoScorer, ScriptReranker
+ from bandit_learner import PolicyLearner
+
+ class EnhancedScriptGenerator:
+     """
+     Enhanced version of script generation with RAG + policy learning.
+     Drop-in replacement for the existing generate_scripts function.
+     """
+
+     def __init__(self):
+         self.retriever = RAGRetriever()
+         self.scorer = AutoScorer()
+         self.reranker = ScriptReranker()
+         self.policy_learner = PolicyLearner()
+
+         # Verify we have an API key
+         if not get_api_key():
+             raise ValueError("DeepSeek API key not found!")
+
+     def generate_scripts_enhanced(self,
+                                   persona: str,
+                                   boundaries: str,
+                                   content_type: str,
+                                   tone: str,
+                                   manual_refs: Optional[List[str]] = None,
+                                   n: int = 6) -> List[Dict]:
+         """
+         Enhanced script generation with:
+         1. RAG-based reference selection
+         2. Policy-optimized parameters
+         3. Auto-scoring and reranking
+         4. Online learning feedback
+         """
+
+         print(f"🤖 Enhanced generation: {persona} × {content_type} × {n} scripts")
+
+         # Step 1: Get the optimized policy for this persona/content_type
+         policy_arm = self.policy_learner.get_optimized_policy(persona, content_type)
+
+         # Step 2: Build a dynamic few-shot pack using RAG
+         query_context = f"{persona} {content_type} {tone}"
+         few_shot_pack = self.retriever.build_dynamic_few_shot_pack(
+             persona=persona,
+             content_type=content_type,
+             query_context=query_context
+         )
+
+         # Step 3: Combine RAG refs with manual refs
+         rag_refs = (
+             few_shot_pack.get('best_hooks', []) +
+             few_shot_pack.get('best_beats', []) +
+             few_shot_pack.get('best_captions', [])
+         )
+         all_refs = (manual_refs or []) + rag_refs
+
+         print(f"📚 Using {len(rag_refs)} RAG refs + {len(manual_refs or [])} manual refs")
+
+         # Step 4: Enhanced generation with policy-optimized parameters
+         drafts = self._generate_with_policy(
+             persona=persona,
+             boundaries=boundaries,
+             content_type=content_type,
+             tone=tone,
+             refs=all_refs,
+             policy_arm=policy_arm,
+             n=n,
+             few_shot_pack=few_shot_pack
+         )
+
+         # Step 5: Anti-copying detection and cleanup
+         print("🛡️ Checking for similarity to reference content...")
+
+         # Extract reference texts for copying detection
+         reference_texts = rag_refs
+         cleaned_drafts = []
+
+         for draft in drafts:
+             # Check for copying
+             detection_results = self.retriever.detect_copying(
+                 generated_content=draft,
+                 reference_texts=reference_texts,
+                 similarity_threshold=0.92
+             )
+
+             if detection_results['is_copying']:
+                 print(f"⚠️ Anti-copy triggered for draft: {draft.get('title', 'Untitled')[:30]}")
+                 print(f"   Max similarity: {detection_results['max_similarity']:.3f}")
+
+                 # Auto-rewrite overly similar content
+                 cleaned_draft = self.retriever.auto_rewrite_similar_content(
+                     generated_content=draft,
+                     detection_results=detection_results
+                 )
+                 cleaned_drafts.append(cleaned_draft)
+             else:
+                 cleaned_drafts.append(draft)
+
+         # Step 6: Auto-score all generated drafts
+         script_ids = self._save_drafts_to_db(cleaned_drafts, persona, content_type, tone)
+         auto_scores = [self.scorer.score_and_store(sid) for sid in script_ids]
+
+         print(f"📊 Auto-scored {len(auto_scores)} drafts")
+
+         # Step 7: Rerank by composite score
+         ranked_script_ids = self.reranker.rerank_scripts(script_ids)
+
+         # Step 8: Policy-learning feedback
+         self.policy_learner.learn_from_generation_batch(
+             persona=persona,
+             content_type=content_type,
+             generated_script_ids=script_ids,
+             selected_arm=policy_arm
+         )
+
+         # Return drafts in ranked order with scores
+         return self._format_enhanced_results(ranked_script_ids, cleaned_drafts)
+
+     def _generate_with_policy(self,
+                               persona: str,
+                               boundaries: str,
+                               content_type: str,
+                               tone: str,
+                               refs: List[str],
+                               policy_arm: Any,  # BanditArm
+                               n: int,
+                               few_shot_pack: Dict) -> List[Dict]:
+         """Generate scripts using policy-optimized parameters"""
+
+         # Enhanced system prompt with few-shot pack context
+         system = f"""You write Instagram-compliant, suggestive-but-not-explicit Reels briefs.
+
+ STYLE CONTEXT: {few_shot_pack.get('style_card', '')}
+
+ BEST PATTERNS TO EMULATE:
+ Hooks: {json.dumps(few_shot_pack.get('best_hooks', []))}
+ Beats: {json.dumps(few_shot_pack.get('best_beats', []))}
+ Captions: {json.dumps(few_shot_pack.get('best_captions', []))}
+
+ AVOID THESE PATTERNS: {json.dumps(few_shot_pack.get('negative_patterns', []))}
+
+ Use tight hooks, concrete visual beats, clear CTAs. Avoid explicit sexual terms.
+ Return ONLY JSON: an array of length {n}, each with {{title,hook,beats,voiceover,caption,hashtags,cta}}.
+ """
+
+         # Limit to the top 8 refs to keep the prompt compact
+         user = f"""
+ Persona: {persona}
+ Boundaries: {boundaries}
+ Content type: {content_type} | Tone: {tone}
+ Constraints: {json.dumps(few_shot_pack.get('constraints', {}))}
+
+ Reference snippets (inspire, don't copy):
+ {chr(10).join(f"- {r}" for r in refs[:8])}
+
+ Generate {n} unique variations. JSON array ONLY.
+ """
+
+         # Generate with multiple temperatures (policy-optimized)
+         variants = []
+         temps = [policy_arm.temp_low, policy_arm.temp_mid, policy_arm.temp_high]
+         scripts_per_temp = max(1, n // len(temps))
+
+         for i, temp in enumerate(temps):
+             batch_size = scripts_per_temp
+             if i == len(temps) - 1:  # The last batch gets the remainder
+                 batch_size = n - len(variants)
+
+             if batch_size <= 0:
+                 break
+
+             try:
+                 out = chat([
+                     {"role": "system", "content": system},
+                     {"role": "user", "content": user.replace(f"Generate {n}", f"Generate {batch_size}")}
+                 ], temperature=temp)
+
+                 # Extract the JSON array
+                 start = out.find("[")
+                 end = out.rfind("]")
+                 if start >= 0 and end > start:
+                     batch_variants = json.loads(out[start:end+1])
+                     variants.extend(batch_variants[:batch_size])
+                     print(f"✨ Generated {len(batch_variants)} scripts at temp={temp}")
+
+             except Exception as e:
+                 print(f"❌ Generation failed at temp={temp}: {e}")
+
+         return variants[:n]  # Ensure we don't exceed the requested count
+
+     def _save_drafts_to_db(self,
+                            drafts: List[Dict],
+                            persona: str,
+                            content_type: str,
+                            tone: str) -> List[int]:
+         """Save generated drafts to the database and return their script IDs"""
+
+         script_ids = []
+
+         with get_session() as ses:
+             for draft in drafts:
+                 try:
+                     # Calculate basic compliance
+                     from compliance import score_script, blob_from
+                     content_blob = blob_from(draft)
+                     compliance_level, _ = score_script(content_blob)
+
+                     script = Script(
+                         creator=persona,
+                         content_type=content_type,
+                         tone=tone,
+                         title=draft.get("title", "Generated Script"),
+                         hook=draft.get("hook", ""),
+                         beats=draft.get("beats", []),
+                         voiceover=draft.get("voiceover", ""),
+                         caption=draft.get("caption", ""),
+                         hashtags=draft.get("hashtags", []),
+                         cta=draft.get("cta", ""),
+                         compliance=compliance_level,
+                         source="ai"
+                     )
+
+                     ses.add(script)
+                     ses.commit()
+                     ses.refresh(script)
+
+                     script_ids.append(script.id)
+
+                     # Generate embeddings for the new script
+                     embeddings = self.retriever.generate_embeddings(script)
+                     for embedding in embeddings:
+                         ses.add(embedding)
+
+                 except Exception as e:
+                     print(f"❌ Failed to save draft: {e}")
+                     continue
+
+             ses.commit()
+
+         return script_ids
+
+     def _format_enhanced_results(self,
+                                  ranked_script_ids: List[tuple],
+                                  original_drafts: List[Dict]) -> List[Dict]:
+         """Format results with ranking and score information"""
+
+         # Create a lookup for the original drafts by content
+         draft_lookup = {}
+         for i, draft in enumerate(original_drafts):
+             key = draft.get("title", "") + draft.get("hook", "")
+             draft_lookup[key] = draft
+
+         results = []
+
+         with get_session() as ses:
+             for script_id, composite_score in ranked_script_ids:
+                 script = ses.get(Script, script_id)
+                 if script:
+                     # Convert back to the expected format
+                     result = {
+                         "title": script.title,
+                         "hook": script.hook,
+                         "beats": script.beats,
+                         "voiceover": script.voiceover,
+                         "caption": script.caption,
+                         "hashtags": script.hashtags,
+                         "cta": script.cta,
+                         # Enhanced metadata
+                         "_enhanced_score": round(composite_score, 3),
+                         "_script_id": script_id,
+                         "_compliance": script.compliance
+                     }
+                     results.append(result)
+
+         return results
+
+ # Backward-compatibility wrapper
+ def generate_scripts_rag(persona: str,
+                          boundaries: str,
+                          content_type: str,
+                          tone: str,
+                          refs: List[str],
+                          n: int = 6) -> List[Dict]:
+     """
+     Drop-in replacement for the existing generate_scripts function.
+     Uses the enhanced RAG system while maintaining API compatibility.
+     """
+     generator = EnhancedScriptGenerator()
+     return generator.generate_scripts_enhanced(
+         persona=persona,
+         boundaries=boundaries,
+         content_type=content_type,
+         tone=tone,
+         manual_refs=refs,
+         n=n
+     )
+
+ def setup_rag_system():
+     """One-time setup to initialize the RAG system"""
+     print("🔧 Setting up RAG system...")
+
+     # Initialize the database with the new tables
+     init_db()
+     print("✅ Database initialized")
+
+     # Generate embeddings for existing scripts
+     from rag_retrieval import index_all_scripts
+     index_all_scripts()
+     print("✅ Existing scripts indexed")
+
+     # Auto-score recent scripts
+     scorer = AutoScorer()
+     recent_scores = scorer.batch_score_recent(hours=24 * 7)  # Last week
+     print(f"✅ Auto-scored {len(recent_scores)} recent scripts")
+
+     print("🎉 RAG system setup complete!")
+
+ if __name__ == "__main__":
+     # Demo the enhanced system
+     setup_rag_system()
+
+     # Test generation
+     generator = EnhancedScriptGenerator()
+     results = generator.generate_scripts_enhanced(
+         persona="Anya",
+         boundaries="Instagram-safe; suggestive but not explicit",
+         content_type="thirst-trap",
+         tone="playful, flirty",
+         manual_refs=["Just a quick workout session", "Getting ready for the day"],
+         n=3
+     )
+
+     print(f"\n🎬 Generated {len(results)} enhanced scripts:")
+     for i, script in enumerate(results, 1):
+         score = script.get('_enhanced_score', 0)
+         compliance = script.get('_compliance', 'unknown')
+         print(f"{i}. {script['title']} (score: {score}, compliance: {compliance})")
+         print(f"   Hook: {script['hook'][:60]}...")
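`_generate_with_policy` spreads the `n` requested scripts across the three policy temperatures, with the last tier absorbing the remainder (assuming every earlier call succeeds in producing its full batch). The split, isolated as a hypothetical helper for illustration:

```python
def batch_sizes(n, num_temps=3):
    """Split n scripts across temperature tiers the way _generate_with_policy
    does: floor(n / num_temps) per tier, with the final tier taking whatever
    is left; tiers that would get zero scripts are dropped."""
    per = max(1, n // num_temps)
    sizes, used = [], 0
    for i in range(num_temps):
        size = per if i < num_temps - 1 else n - used
        if size <= 0:
            break
        sizes.append(size)
        used += size
    return sizes

print(batch_sizes(6), batch_sizes(7), batch_sizes(2))
# → [2, 2, 2] [2, 2, 3] [1, 1]
```

Note that in the real method the remainder is computed from `len(variants)`, so a failed low-temperature batch is silently made up for by the final high-temperature batch.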
rag_retrieval.py ADDED
@@ -0,0 +1,444 @@
+ """
+ Enhanced RAG retrieval system for AI Script Studio.
+ Extends the existing hybrid reference system with semantic search and policy learning.
+ """
+
+ import numpy as np
+ import math
+ from typing import List, Dict, Tuple, Optional
+ from sentence_transformers import SentenceTransformer
+ from sqlmodel import Session, select
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+ import json
+ from datetime import datetime, timedelta
+
+ from models import Script, Embedding, AutoScore, PolicyWeights, StyleCard
+ from db import get_session
+
+ class RAGRetriever:
+     def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
+         """Initialize with a lightweight but effective embedding model"""
+         self.encoder = SentenceTransformer(model_name)
+         self.tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
+
+     def generate_embeddings(self, script: Script) -> List[Embedding]:
+         """Generate embeddings for the different parts of a script"""
+         parts = {
+             'full': self._get_full_text(script),
+             'hook': script.hook or '',
+             'beats': ' '.join(script.beats or []),
+             'caption': script.caption or ''
+         }
+
+         embeddings = []
+         for part, text in parts.items():
+             if text.strip():  # Only embed non-empty parts
+                 vector = self.encoder.encode(text).tolist()
+                 meta = {
+                     'creator': script.creator,
+                     'content_type': script.content_type,
+                     'tone': script.tone,
+                     'quality_score': script.score_overall or 0.0,
+                     'compliance': script.compliance
+                 }
+                 embeddings.append(Embedding(
+                     script_id=script.id,
+                     part=part,
+                     vector=vector,
+                     meta=meta
+                 ))
+         return embeddings
+
+     def _get_full_text(self, script: Script) -> str:
+         """Combine all script parts into one full text"""
+         parts = [
+             script.title,
+             script.hook or '',
+             ' '.join(script.beats or []),
+             script.voiceover or '',
+             script.caption or '',
+             script.cta or ''
+         ]
+         return ' '.join(p for p in parts if p.strip())
+
+     def hybrid_retrieve(self,
+                         query_text: str,
+                         persona: str,
+                         content_type: str,
+                         k: int = 6,
+                         global_quality_mean: float = 4.2,
+                         shrinkage_alpha: float = 10.0,
+                         freshness_tau_days: float = 28.0) -> List[Dict]:
+         """
+         Production-grade hybrid retrieval with proper score normalization:
+         - Semantic similarity (cosine normalized to [0,1])
+         - BM25/TF-IDF similarity (min-max normalized per query)
+         - Quality scores (Bayesian shrinkage)
+         - Freshness boost (exponential decay)
+         - Policy-learned weights
+         """
+
+         # Get policy weights for this persona/content_type
+         weights = self._get_policy_weights(persona, content_type)
+
+         with get_session() as ses:
+             # Get all relevant scripts
+             scripts = list(ses.exec(
+                 select(Script).where(
+                     Script.creator == persona,
+                     Script.content_type == content_type,
+                     Script.is_reference == True,
+                     Script.compliance != "fail"
+                 )
+             ))
+
+             if not scripts:
+                 return []
+
+             # Get embeddings for semantic similarity
+             embeddings = list(ses.exec(
+                 select(Embedding).join(Script, Embedding.script_id == Script.id).where(
+                     Embedding.part == 'full',
+                     Script.creator == persona,
+                     Script.content_type == content_type,
+                     Script.is_reference == True,
+                     Script.compliance != "fail"
+                 )
+             ))
+
+             # Pre-calculate all raw scores for normalization
+             raw_scores = []
+             query_embedding = self.encoder.encode(query_text)
+             now = datetime.utcnow()
+
+             for script in scripts:
+                 # Find the matching embedding
+                 script_embedding = next(
+                     (e for e in embeddings if e.script_id == script.id),
+                     None
+                 )
+
+                 # 1. Raw semantic similarity (cosine returns [-1,1])
+                 if script_embedding:
+                     raw_cosine = cosine_similarity(
+                         [query_embedding],
+                         [script_embedding.vector]
+                     )[0][0]
+                 else:
+                     raw_cosine = -1.0  # Worst case for missing embeddings
+
+                 # 2. Raw BM25/TF-IDF similarity
+                 script_text = self._get_full_text(script)
+                 raw_bm25 = self._calculate_tfidf_similarity(query_text, script_text)
+
+                 raw_scores.append({
+                     'script': script,
+                     'raw_cosine': raw_cosine,
+                     'raw_bm25': raw_bm25
+                 })
+
+             # Normalize BM25 scores (min-max normalization across this query's candidates)
+             bm25_scores = [s['raw_bm25'] for s in raw_scores]
+             min_bm25 = min(bm25_scores)
+             max_bm25 = max(bm25_scores)
+             bm25_range = max_bm25 - min_bm25 + 1e-9  # Avoid division by zero
+
+             # Calculate final normalized scores
+             results = []
+
+             for raw_score in raw_scores:
+                 script = raw_score['script']
+                 scores = {}
+
+                 # 1. Semantic similarity: normalize cosine [-1,1] → [0,1]
+                 scores['semantic'] = (raw_score['raw_cosine'] + 1.0) / 2.0
+
+                 # 2. BM25: min-max normalize within this query's candidate set
+                 scores['bm25'] = (raw_score['raw_bm25'] - min_bm25) / bm25_range
+
+                 # 3. Quality: Bayesian shrinkage toward the global mean
+                 n_ratings = script.ratings_count or 0
+                 local_quality = script.score_overall or global_quality_mean
+
+                 # Shrinkage: blend the local mean with the global mean based on sample size
+                 shrunk_quality = (
+                     (n_ratings / (n_ratings + shrinkage_alpha)) * local_quality +
+                     (shrinkage_alpha / (n_ratings + shrinkage_alpha)) * global_quality_mean
+                 )
+
+                 # Normalize to [0,1] (assuming a 1-5 rating scale)
+                 scores['quality'] = max(0.0, min(1.0, (shrunk_quality - 1) / 4))
+
+                 # 4. Freshness: exponential decay (smoother than linear)
+                 days_old = max(0, (now - script.created_at).days)
+                 scores['freshness'] = math.exp(-days_old / freshness_tau_days)
+
+                 # Combined score using policy weights
+                 combined_score = (
+                     weights.semantic_weight * scores['semantic'] +
+                     weights.bm25_weight * scores['bm25'] +
+                     weights.quality_weight * scores['quality'] +
+                     weights.freshness_weight * scores['freshness']
+                 )
+
+                 results.append({
+                     'script': script,
+                     'score': combined_score,
+                     'component_scores': scores,
+                     # Debug info
+                     '_debug': {
+                         'n_ratings': n_ratings,
+                         'raw_quality': local_quality,
+                         'shrunk_quality': shrunk_quality,
+                         'days_old': days_old
+                     }
+                 })
+
+             # Sort by combined score and return the top k
+             results.sort(key=lambda x: x['score'], reverse=True)
+             return results[:k]
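The normalization and Bayesian shrinkage above can be sanity-checked in isolation. A sketch reproducing the arithmetic with the default weights from PolicyWeights (`combined_score` is an illustrative standalone function, not a method of the class):

```python
import math

def combined_score(raw_cosine, raw_bm25, bm25_lo, bm25_hi,
                   local_quality, n_ratings, days_old,
                   global_mean=4.2, alpha=10.0, tau=28.0,
                   w=(0.45, 0.25, 0.20, 0.10)):
    """Score fusion mirroring hybrid_retrieve: cosine mapped [-1,1] → [0,1],
    BM25 min-max normalized per query, quality shrunk toward the global mean,
    freshness decayed exponentially, all fused with the (assumed) default weights."""
    semantic = (raw_cosine + 1.0) / 2.0
    bm25 = (raw_bm25 - bm25_lo) / (bm25_hi - bm25_lo + 1e-9)
    shrunk = ((n_ratings / (n_ratings + alpha)) * local_quality +
              (alpha / (n_ratings + alpha)) * global_mean)
    quality = max(0.0, min(1.0, (shrunk - 1) / 4))
    freshness = math.exp(-days_old / tau)
    return w[0] * semantic + w[1] * bm25 + w[2] * quality + w[3] * freshness

# A 5.0-rated script with only 2 ratings is pulled most of the way back toward
# the 4.2 global mean (only 2/12 of the weight is on the local mean), while the
# same script with 200 ratings keeps nearly its full local quality.
few = combined_score(0.6, 0.3, 0.0, 0.5, local_quality=5.0, n_ratings=2, days_old=7)
many = combined_score(0.6, 0.3, 0.0, 0.5, local_quality=5.0, n_ratings=200, days_old=7)
print(round(few, 3), round(many, 3))
```

This is why a single enthusiastic rating cannot catapult a script to the top of the reference pool: with `alpha=10`, it takes on the order of ten ratings before the local mean dominates.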
201
+
202
+ def _calculate_tfidf_similarity(self, query: str, doc: str) -> float:
203
+ """Calculate TF-IDF similarity between query and document"""
204
+ try:
205
+ tfidf_matrix = self.tfidf.fit_transform([query, doc])
206
+ similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
207
+ return float(similarity)
208
+ except:
209
+ return 0.0
210
+
211
+ def _get_policy_weights(self, persona: str, content_type: str) -> PolicyWeights:
212
+ """Get learned policy weights or create defaults"""
213
+ with get_session() as ses:
214
+ weights = ses.exec(
215
+ select(PolicyWeights).where(
216
+ PolicyWeights.persona == persona,
217
+ PolicyWeights.content_type == content_type
218
+ )
219
+ ).first()
220
+
221
+ if not weights:
222
+ # Create default weights
223
+ weights = PolicyWeights(
224
+ persona=persona,
225
+ content_type=content_type
226
+ )
227
+ ses.add(weights)
228
+ ses.commit()
229
+ ses.refresh(weights)
230
+
231
+ return weights
232
+
233
+ def build_dynamic_few_shot_pack(self,
234
+ persona: str,
235
+ content_type: str,
236
+ query_context: str = "") -> Dict:
237
+ """Build dynamic few-shot examples pack optimized for this request"""
238
+
239
+ # Get best references via hybrid retrieval
240
+ references = self.hybrid_retrieve(
241
+ query_text=query_context or f"{persona} {content_type}",
242
+ persona=persona,
243
+ content_type=content_type,
244
+ k=6
245
+ )
246
+
247
+ if not references:
248
+ return {"style_card": "", "examples": [], "constraints": {}}
249
+
250
+ # Extract best examples by type
251
+ best_hooks = []
252
+ best_beats = []
253
+ best_captions = []
254
+
255
+ for ref in references[:4]: # Use top 4 references
256
+ script = ref['script']
257
+ if script.hook and len(best_hooks) < 2:
258
+ best_hooks.append(script.hook)
259
+ if script.beats and len(best_beats) < 1:
260
+ best_beats.extend(script.beats[:2]) # First 2 beats
261
+ if script.caption and len(best_captions) < 1:
262
+ best_captions.append(script.caption)
263
+
264
+ # Get or create style card
265
+ style_card = self._get_style_card(persona, content_type)
266
+
267
+ return {
268
+ "style_card": f"Persona: {persona} | Content: {content_type}",
269
+ "best_hooks": best_hooks[:2],
270
+ "best_beats": best_beats[:3],
271
+ "best_captions": best_captions[:1],
272
+ "constraints": {
273
+ "max_length": "15-25 seconds",
274
+ "compliance": "Instagram-safe",
275
+ "tone": references[0]['script'].tone if references else "playful"
276
+ },
277
+ "negative_patterns": style_card.negative_patterns if style_card else []
278
+ }
279
+
280
+    def _get_style_card(self, persona: str, content_type: str) -> Optional[StyleCard]:
+        """Get the existing style card or return None"""
+        with get_session() as ses:
+            return ses.exec(
+                select(StyleCard).where(
+                    StyleCard.persona == persona,
+                    StyleCard.content_type == content_type
+                )
+            ).first()
+
+ def detect_copying(self,
291
+ generated_content: Dict,
292
+ reference_texts: List[str],
293
+ similarity_threshold: float = 0.92) -> Dict:
294
+ """
295
+ Detect if generated content is too similar to reference material.
296
+ Returns detection results with flagged content and similarity scores.
297
+
298
+ Args:
299
+ generated_content: Dict with keys like 'hook', 'caption', 'beats', etc.
300
+ reference_texts: List of reference text snippets to compare against
301
+ similarity_threshold: Cosine similarity threshold (0.92 recommended)
302
+
303
+ Returns:
304
+ Dict with detection results and recommendations
305
+ """
306
+
307
+ detection_results = {
308
+ 'is_copying': False,
309
+ 'flagged_fields': [],
310
+ 'max_similarity': 0.0,
311
+ 'rewrite_recommendations': []
312
+ }
313
+
314
+ if not reference_texts:
315
+ return detection_results
316
+
317
+ # Encode all reference texts
318
+ reference_embeddings = self.encoder.encode(reference_texts)
319
+
320
+ # Fields to check for copying
321
+ fields_to_check = ['hook', 'caption', 'cta']
322
+
323
+ for field in fields_to_check:
324
+ if field in generated_content and generated_content[field]:
325
+ generated_text = str(generated_content[field])
326
+
327
+ # Skip very short texts (less than 10 characters)
328
+ if len(generated_text.strip()) < 10:
329
+ continue
330
+
331
+ # Encode generated text
332
+ generated_embedding = self.encoder.encode([generated_text])
333
+
334
+ # Calculate similarity to all reference texts
335
+ similarities = cosine_similarity(generated_embedding, reference_embeddings)[0]
336
+ max_sim = float(np.max(similarities))
337
+
338
+ # Update overall max similarity
339
+ detection_results['max_similarity'] = max(detection_results['max_similarity'], max_sim)
340
+
341
+ # Check if similarity exceeds threshold
342
+ if max_sim >= similarity_threshold:
343
+ detection_results['is_copying'] = True
344
+ detection_results['flagged_fields'].append({
345
+ 'field': field,
346
+ 'text': generated_text,
347
+ 'similarity': max_sim,
348
+ 'similar_reference': reference_texts[int(np.argmax(similarities))]
349
+ })
350
+
351
+ # Generate rewrite recommendation
352
+ if max_sim >= 0.95:
353
+ urgency = "CRITICAL"
354
+ action = "Completely rewrite this content"
355
+ elif max_sim >= 0.92:
356
+ urgency = "HIGH"
357
+ action = "Significantly rephrase this content"
358
+ else:
359
+ urgency = "MEDIUM"
360
+ action = "Minor rewording may be needed"
361
+
362
+ detection_results['rewrite_recommendations'].append({
363
+ 'field': field,
364
+ 'urgency': urgency,
365
+ 'action': action,
366
+ 'original': generated_text
367
+ })
368
+
369
+ return detection_results
370
+
371
+ def auto_rewrite_similar_content(self,
372
+ generated_content: Dict,
373
+ detection_results: Dict,
374
+ rewrite_instruction: str = "Rewrite to be more original while keeping the same intent") -> Dict:
375
+ """
376
+ Automatically rewrite content that's too similar to references.
377
+
378
+ Args:
379
+ generated_content: The original generated content
380
+ detection_results: Results from detect_copying()
381
+ rewrite_instruction: Instructions for how to rewrite
382
+
383
+ Returns:
384
+ Rewritten content dict
385
+ """
386
+
387
+ if not detection_results['is_copying']:
388
+ return generated_content
389
+
390
+ rewritten_content = generated_content.copy()
391
+
392
+ for flag in detection_results['flagged_fields']:
393
+ field = flag['field']
394
+ original_text = flag['text']
395
+
396
+ # Simple rewrite strategy: add instruction to modify the text
397
+ # In a production system, you'd call the LLM to rewrite
398
+ rewrite_prompt = f"""
399
+ Original: {original_text}
400
+
401
+ This text is too similar to existing reference material.
402
+ Please rewrite it to be more original while keeping the same intent and tone.
403
+ Make it clearly different from the reference but equally engaging.
404
+
405
+ Rewritten version:
406
+ """
407
+
408
+ # For now, add a flag that this needs rewriting
409
+ # In production, you'd call your LLM API here
410
+ rewritten_content[field] = f"[NEEDS_REWRITE] {original_text}"
411
+
412
+ # Log the issue
413
+ print(f"🚨 Anti-copy detection: {field} flagged (similarity: {flag['similarity']:.3f})")
414
+ print(f" Original: {original_text[:60]}...")
415
+ print(f" Similar to: {flag['similar_reference'][:60]}...")
416
+
417
+ return rewritten_content
418
+
419
+def index_all_scripts():
+    """Utility function to generate embeddings for all existing scripts"""
+    retriever = RAGRetriever()
+
+    with get_session() as ses:
+        scripts = list(ses.exec(select(Script)))
+
+        for script in scripts:
+            # Skip scripts that already have embeddings
+            existing = ses.exec(
+                select(Embedding).where(Embedding.script_id == script.id)
+            ).first()
+
+            if not existing:
+                embeddings = retriever.generate_embeddings(script)
+                for embedding in embeddings:
+                    ses.add(embedding)
+
+                print(f"Generated embeddings for script {script.id}")
+
+        ses.commit()
+        print(f"Indexing complete! Processed {len(scripts)} scripts.")
+
+if __name__ == "__main__":
+    # Run this to index your existing scripts
+    index_all_scripts()
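The copy-detection flow above reduces to one idea: embed the generated text and every reference, take the maximum cosine similarity, and flag anything at or above the threshold. A minimal standalone sketch of that thresholding, using plain word-count vectors in place of sentence-transformer embeddings so it runs without a model download (`flag_near_copies` and `cosine_sim` are illustrative names, not part of the module):

```python
import math
from collections import Counter

def cosine_sim(a: str, b: str) -> float:
    """Cosine similarity over word-count vectors (a stand-in for embeddings)."""
    va, vb = Counter(a.lower().split()), Counter(b.lower().split())
    dot = sum(va[w] * vb[w] for w in va)
    na = math.sqrt(sum(c * c for c in va.values()))
    nb = math.sqrt(sum(c * c for c in vb.values()))
    return dot / (na * nb) if na and nb else 0.0

def flag_near_copies(generated: str, references: list[str],
                     threshold: float = 0.92) -> dict:
    """Mirror the detect_copying() thresholding: flag the max similarity."""
    max_sim = max((cosine_sim(generated, r) for r in references), default=0.0)
    return {"max_similarity": max_sim, "is_copying": max_sim >= threshold}

result = flag_near_copies(
    "stop scrolling this hack changes everything",
    ["stop scrolling this hack changes everything", "three lighting tips"],
)
print(result["is_copying"])  # → True (identical text has similarity 1.0)
```

The real module uses sentence-transformer embeddings, which catch paraphrases that word-count vectors miss; the thresholding logic is the same either way.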
requirements.txt CHANGED
@@ -1,3 +1,16 @@
- altair
- pandas
- streamlit
+ streamlit>=1.37.1
+ sqlmodel>=0.0.16
+ pydantic>=1.10.15
+ python-dotenv>=1.0.1
+ requests>=2.32.3
+ sqlalchemy>=2.0.0
+
+ # RAG Enhancement Dependencies
+ sentence-transformers>=2.2.2
+ scikit-learn>=1.3.0
+ numpy>=1.24.0
+ faiss-cpu>=1.7.4
+
+ # Additional dependencies for deployment
+ torch>=2.0.0
+ transformers>=4.30.0
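The `_debug` payload in `hybrid_retrieve` exposes `raw_quality`, `n_ratings`, and `shrunk_quality`, which points to rating shrinkage toward a prior: with few ratings, the quality score is pulled toward a global baseline so a single 5-star rating cannot dominate retrieval. The actual formula lives outside this chunk; a common Bayesian-average form is sketched below as an assumption (`shrink_quality`, `prior`, and `prior_weight` are illustrative names):

```python
def shrink_quality(raw_quality: float, n_ratings: int,
                   prior: float = 0.5, prior_weight: float = 5.0) -> float:
    """Bayesian-average shrinkage: with few ratings, trust the prior more."""
    return (prior_weight * prior + n_ratings * raw_quality) / (prior_weight + n_ratings)

print(shrink_quality(1.0, 0))    # → 0.5 (no ratings: exactly the prior)
print(shrink_quality(1.0, 100))  # many ratings: close to the raw score
```

`prior_weight` acts as a pseudo-count: it is the number of real ratings needed before the raw score and the prior carry equal weight.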