sbs-API

Build error

App Files Community

rairo committed on Jul 19, 2025

Commit

78debf6

verified ·

1 Parent(s): 034567d

Update sozo_gen.py

Browse files

Files changed (1) hide show

sozo_gen.py +111 -341

sozo_gen.py CHANGED Viewed

@@ -13,8 +13,8 @@ import matplotlib
 matplotlib.use("Agg")
 import matplotlib.pyplot as plt
 from matplotlib.animation import FuncAnimation, FFMpegWriter
-import seaborn as sns # Added for heatmaps
-from scipy import stats # Added for scatterplot regression
 from PIL import Image
 import cv2
 import inspect
@@ -29,13 +29,13 @@ import requests
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s')
 FPS, WIDTH, HEIGHT = 24, 1280, 720
 MAX_CHARTS, VIDEO_SCENES = 5, 5
 # --- API Initialization ---
 API_KEY = os.getenv("GOOGLE_API_KEY")
 if not API_KEY:
     raise ValueError("GOOGLE_API_KEY environment variable not set.")
-# NEW: Pexels API Key
 PEXELS_API_KEY = os.getenv("PEXELS_API_KEY")
 # --- Helper Functions ---
@@ -68,13 +68,11 @@ def audio_duration(path: str) -> float:
         return float(res.stdout.strip())
     except Exception: return 5.0
-# UPDATED: Regex for chart tags and NEW regex for stock video tags
 TAG_RE = re.compile( r'[<[]\s*generate_?chart\s*[:=]?\s*[\"\'“”]?(?P<d>[^>\"\'”\]]+?)[\"\'“”]?\s*[>\]]', re.I, )
 TAG_RE_PEXELS = re.compile( r'[<[]\s*generate_?stock_?video\s*[:=]?\s*[\"\'“”]?(?P<d>[^>\"\'”\]]+?)[\"\'“”]?\s*[>\]]', re.I, )
 extract_chart_tags = lambda t: list( dict.fromkeys(m.group("d").strip() for m in TAG_RE.finditer(t or "")) )
 extract_pexels_tags = lambda t: list( dict.fromkeys(m.group("d").strip() for m in TAG_RE_PEXELS.finditer(t or "")) )
 re_scene = re.compile(r"^\s*scene\s*\d+[:.\- ]*", re.I | re.M)
 def clean_narration(txt: str) -> str:
     txt = TAG_RE.sub("", txt); txt = TAG_RE_PEXELS.sub("", txt); txt = re_scene.sub("", txt)
@@ -98,7 +96,6 @@ def generate_image_from_prompt(prompt: str) -> Image.Image:
     except Exception:
         return placeholder_img()
-# NEW: Pexels video search and download function
 def search_and_download_pexels_video(query: str, duration: float, out_path: Path) -> str:
     if not PEXELS_API_KEY:
         logging.warning("PEXELS_API_KEY not set. Cannot fetch stock video.")
@@ -113,7 +110,6 @@ def search_and_download_pexels_video(query: str, duration: float, out_path: Path
             logging.warning(f"No Pexels videos found for query: '{query}'")
             return None
-        # Find a suitable video file (prefer HD)
         video_to_download = None
         for video in videos:
             for f in video.get('video_files', []):
@@ -127,7 +123,6 @@ def search_and_download_pexels_video(query: str, duration: float, out_path: Path
             logging.warning(f"No suitable HD video file found for query: '{query}'")
             return None
-        # Download to a temporary file
         with requests.get(video_to_download, stream=True, timeout=60) as r:
             r.raise_for_status()
             with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_dl_file:
@@ -135,7 +130,6 @@ def search_and_download_pexels_video(query: str, duration: float, out_path: Path
                     temp_dl_file.write(chunk)
                 temp_dl_path = Path(temp_dl_file.name)
-        # Use FFmpeg to resize, crop, and trim to exact duration
         cmd = [
             "ffmpeg", "-y", "-i", str(temp_dl_path),
             "-vf", f"scale={WIDTH}:{HEIGHT}:force_original_aspect_ratio=decrease,pad={WIDTH}:{HEIGHT}:(ow-iw)/2:(oh-ih)/2,setsar=1",
@@ -154,28 +148,19 @@ def search_and_download_pexels_video(query: str, duration: float, out_path: Path
         return None
 # --- Chart Generation System ---
-# UPDATED: ChartSpecification to include size_col for bubble charts
 class ChartSpecification:
     def __init__(self, chart_type: str, title: str, x_col: str, y_col: str = None, size_col: str = None, agg_method: str = None, filter_condition: str = None, top_n: int = None, color_scheme: str = "professional"):
         self.chart_type = chart_type; self.title = title; self.x_col = x_col; self.y_col = y_col; self.size_col = size_col
         self.agg_method = agg_method or "sum"; self.filter_condition = filter_condition; self.top_n = top_n; self.color_scheme = color_scheme
-def enhance_data_context(df: pd.DataFrame, ctx_dict: Dict) -> Dict:
-    enhanced_ctx = ctx_dict.copy(); numeric_cols = df.select_dtypes(include=['number']).columns.tolist(); categorical_cols = df.select_dtypes(exclude=['number']).columns.tolist()
-    enhanced_ctx.update({"numeric_columns": numeric_cols, "categorical_columns": categorical_cols})
-    return enhanced_ctx
 class ChartGenerator:
     def __init__(self, llm, df: pd.DataFrame):
         self.llm = llm; self.df = df
-        self.enhanced_ctx = enhance_data_context(df, {"columns": list(df.columns), "shape": df.shape, "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()}})
-    def generate_chart_spec(self, description: str) -> ChartSpecification:
-        safe_ctx = json_serializable(self.enhanced_ctx)
-        # UPDATED: Prompt to include new chart types
         spec_prompt = f"""
-        You are a data visualization expert. Based on the dataset and chart description, generate a precise chart specification.
-        **Dataset Info:** {json.dumps(safe_ctx, indent=2)}
         **Chart Request:** {description}
         **Return a JSON specification with these exact fields:**
         {{
@@ -187,7 +172,7 @@ class ChartGenerator:
             "agg_method": "sum|mean|count|max|min|null",
             "top_n": "number_for_top_n_filtering_or_null"
         }}
-        Return only the JSON specification, no additional text. For heatmaps, x_col and y_col can be null if it's a correlation matrix of all numeric columns.
         """
         try:
             response = self.llm.invoke(spec_prompt).content.strip()
@@ -199,18 +184,15 @@ class ChartGenerator:
             return ChartSpecification(**filtered_dict)
         except Exception as e:
             logging.error(f"Spec generation failed: {e}. Using fallback.")
-            return self._create_fallback_spec(description)
-    def _create_fallback_spec(self, description: str) -> ChartSpecification:
-        numeric_cols = self.enhanced_ctx['numeric_columns']; categorical_cols = self.enhanced_ctx['categorical_columns']
-        ctype = "bar"
-        for t in ["pie", "line", "scatter", "hist", "heatmap", "area", "bubble"]:
-            if t in description.lower(): ctype = t
-        x = categorical_cols[0] if categorical_cols else self.df.columns[0]
-        y = numeric_cols[0] if numeric_cols and len(self.df.columns) > 1 else (self.df.columns[1] if len(self.df.columns) > 1 else None)
-        return ChartSpecification(ctype, description, x, y)
-# UPDATED: execute_chart_spec to include new chart types
 def execute_chart_spec(spec: ChartSpecification, df: pd.DataFrame, output_path: Path) -> bool:
     try:
         plot_data = prepare_plot_data(spec, df)
@@ -231,7 +213,6 @@ def execute_chart_spec(spec: ChartSpecification, df: pd.DataFrame, output_path:
         return True
     except Exception as e: logging.error(f"Static chart generation failed for '{spec.title}': {e}"); return False
-# UPDATED: prepare_plot_data to handle new chart types
 def prepare_plot_data(spec: ChartSpecification, df: pd.DataFrame):
     if spec.chart_type not in ["heatmap"]:
         if spec.x_col not in df.columns or (spec.y_col and spec.y_col not in df.columns): raise ValueError(f"Invalid columns in chart spec: {spec.x_col}, {spec.y_col}")
@@ -253,7 +234,6 @@ def prepare_plot_data(spec: ChartSpecification, df: pd.DataFrame):
     return df[spec.x_col]
 # --- Animation & Video Generation ---
-# UPDATED: animate_chart with enhanced animations and new chart types
 def animate_chart(spec: ChartSpecification, df: pd.DataFrame, dur: float, out: Path, fps: int = FPS) -> str:
     plot_data = prepare_plot_data(spec, df)
     frames = max(10, int(dur * fps))
@@ -276,30 +256,25 @@ def animate_chart(spec: ChartSpecification, df: pd.DataFrame, dur: float, out: P
             return bars
     elif ctype == "scatter":
         x_full, y_full = plot_data.iloc[:, 0], plot_data.iloc[:, 1]
-        # Calculate regression line
         slope, intercept, _, _, _ = stats.linregress(x_full, y_full)
         reg_line_x = np.array([x_full.min(), x_full.max()])
         reg_line_y = slope * reg_line_x + intercept
         scat = ax.scatter([], [], alpha=0.7, color='#F18F01')
-        line, = ax.plot([], [], 'r--', lw=2) # Regression line
         ax.set_xlim(x_full.min(), x_full.max()); ax.set_ylim(y_full.min(), y_full.max())
         ax.set_title(spec.title); ax.grid(alpha=.3); ax.set_xlabel(spec.x_col); ax.set_ylabel(spec.y_col)
         def init():
-            scat.set_offsets(np.empty((0, 2)))
-            line.set_data([], [])
             return [scat, line]
         def update(i):
-            # Animate points for the first 70% of frames
             point_frames = int(frames * 0.7)
             if i <= point_frames:
                 k = max(1, int(len(x_full) * (i / point_frames)))
                 scat.set_offsets(plot_data.iloc[:k].values)
-            # Animate regression line for the last 30%
             else:
-                line_frame = i - point_frames
-                line_total_frames = frames - point_frames
                 current_x = reg_line_x[0] + (reg_line_x[1] - reg_line_x[0]) * (line_frame / line_total_frames)
                 line.set_data([reg_line_x[0], current_x], [reg_line_y[0], slope * current_x + intercept])
             return [scat, line]
@@ -320,32 +295,19 @@ def animate_chart(spec: ChartSpecification, df: pd.DataFrame, dur: float, out: P
             k = max(2, int(len(x_full) * (i / (frames - 1))))
             fill = ax.fill_between(x_full[:k], y_full[:k], color="#4E79A7", alpha=0.4)
             return [fill]
-    elif ctype == "heatmap":
-        sns.heatmap(plot_data, annot=True, cmap="viridis", ax=ax, alpha=0)
-        ax.set_title(spec.title)
-        def init(): ax.collections[0].set_alpha(0); return [ax.collections[0]]
-        def update(i): ax.collections[0].set_alpha(i / (frames - 1)); return [ax.collections[0]]
-    elif ctype == "bubble":
-        sizes = (plot_data[spec.size_col] - plot_data[spec.size_col].min() + 1) / (plot_data[spec.size_col].max() - plot_data[spec.size_col].min() + 1) * 2000 + 50
-        scat = ax.scatter(plot_data[spec.x_col], plot_data[spec.y_col], s=sizes, alpha=0, color='#59A14F')
-        ax.set_title(spec.title); ax.grid(alpha=.3); ax.set_xlabel(spec.x_col); ax.set_ylabel(spec.y_col)
-        def init(): scat.set_alpha(0); return [scat]
-        def update(i): scat.set_alpha(i / (frames - 1) * 0.7); return [scat]
     else: # line (Time Series)
         line, = ax.plot([], [], lw=2, color='#A23B72')
-        markers, = ax.plot([], [], 'o', color='#A23B72', markersize=5) # Animated markers
         plot_data = plot_data.sort_index() if not plot_data.index.is_monotonic_increasing else plot_data
         x_full, y_full = plot_data.index, plot_data.values
         ax.set_xlim(x_full.min(), x_full.max()); ax.set_ylim(y_full.min() * 0.9, y_full.max() * 1.1)
         ax.set_title(spec.title); ax.grid(alpha=.3); ax.set_xlabel(spec.x_col); ax.set_ylabel(spec.y_col)
         def init():
-            line.set_data([], [])
-            markers.set_data([], [])
             return [line, markers]
         def update(i):
             k = max(2, int(len(x_full) * (i / (frames - 1))))
-            line.set_data(x_full[:k], y_full[:k])
-            markers.set_data(x_full[:k], y_full[:k])
             return [line, markers]
     anim = FuncAnimation(fig, update, init_func=init, frames=frames, blit=True, interval=1000 / fps)
@@ -363,11 +325,11 @@ def animate_image_fade(img: np.ndarray, dur: float, out: Path, fps: int = 24) ->
     video_writer.release()
     return str(out)
-def safe_chart(desc: str, df: pd.DataFrame, dur: float, out: Path) -> str:
     try:
         llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=API_KEY, temperature=0.1)
         chart_generator = ChartGenerator(llm, df)
-        chart_spec = chart_generator.generate_chart_spec(desc)
         return animate_chart(chart_spec, df, dur, out)
     except Exception as e:
         logging.error(f"Chart animation failed for '{desc}': {e}. Falling back to static image.")
@@ -375,7 +337,7 @@ def safe_chart(desc: str, df: pd.DataFrame, dur: float, out: Path) -> str:
             temp_png = Path(temp_png_file.name)
         llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=API_KEY, temperature=0.1)
         chart_generator = ChartGenerator(llm, df)
-        chart_spec = chart_generator.generate_chart_spec(desc)
         if execute_chart_spec(chart_spec, df, temp_png):
             img = cv2.imread(str(temp_png)); os.unlink(temp_png)
             img_resized = cv2.resize(img, (WIDTH, HEIGHT))
@@ -398,310 +360,118 @@ def concat_media(file_paths: List[str], output_path: Path):
     finally:
         list_file.unlink(missing_ok=True)
-# --- Main Business Logic Functions ---
-# This section containing generate_report_draft and its helpers is left unchanged as requested.
-# ... (all functions from sanitize_for_firebase_key to generate_single_chart) ...
-# The following functions are preserved exactly as they were in the original code provided.
 def sanitize_for_firebase_key(text: str) -> str:
-    """Replaces Firebase-forbidden characters in a string with underscores."""
     forbidden_chars = ['.', '$', '#', '[', ']', '/']
     for char in forbidden_chars:
         text = text.replace(char, '_')
     return text
-def analyze_data_intelligence(df: pd.DataFrame, ctx_dict: Dict) -> Dict[str, Any]:
-    """
-    Autonomous data intelligence system that classifies domain,
-    detects patterns, and determines optimal analytical approach.
-    """
-    # Domain Classification Engine
-    domain_signals = {
-        'financial': ['amount', 'price', 'cost', 'revenue', 'profit', 'balance', 'transaction', 'payment'],
-        'survey': ['rating', 'satisfaction', 'score', 'response', 'feedback', 'opinion', 'agree', 'likert'],
-        'scientific': ['measurement', 'experiment', 'trial', 'test', 'control', 'variable', 'hypothesis'],
-        'marketing': ['campaign', 'conversion', 'click', 'impression', 'engagement', 'customer', 'segment'],
-        'operational': ['performance', 'efficiency', 'throughput', 'capacity', 'utilization', 'process'],
-        'temporal': ['date', 'time', 'timestamp', 'period', 'month', 'year', 'day', 'hour']
-    }
-    # Analyze column patterns
-    columns_lower = [col.lower() for col in df.columns]
-    domain_scores = {}
-    for domain, keywords in domain_signals.items():
-        score = sum(1 for col in columns_lower if any(keyword in col for keyword in keywords))
-        domain_scores[domain] = score
-    # Determine primary domain
-    primary_domain = max(domain_scores, key=domain_scores.get) if max(domain_scores.values()) > 0 else 'general'
-    # Data Structure Analysis
-    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
-    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
-    datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
-    # Detect time series
-    is_timeseries = len(datetime_cols) > 0 or any('date' in col.lower() or 'time' in col.lower() for col in columns_lower)
-    # Statistical Profile
-    statistical_summary = {}
-    if numeric_cols:
-        try:
-            correlations = df[numeric_cols].corr().abs().max()
-            correlations_dict = {k: float(v) if pd.notna(v) else 0.0 for k, v in correlations.to_dict().items()}
-            distributions = {}
-            for col in numeric_cols:
-                if len(df[col].dropna()) > 8:
-                    try:
-                        p_value = stats.normaltest(df[col].dropna())[1]
-                        distributions[col] = 'normal' if p_value > 0.05 else 'non_normal'
-                    except:
-                        distributions[col] = 'unknown'
-            outliers = {}
-            for col in numeric_cols:
-                if len(df[col].dropna()) > 0:
-                    try:
-                        z_scores = np.abs(stats.zscore(df[col].dropna()))
-                        outliers[col] = int(len(df[col][z_scores > 3]))
-                    except:
-                        outliers[col] = 0
-            statistical_summary = {
-                'correlations': correlations_dict,
-                'distributions': distributions,
-                'outliers': outliers
-            }
-        except Exception as e:
-            statistical_summary = {'error': 'Could not compute statistical summary'}
-    # Pattern Detection
-    patterns = {
-        'has_missing_data': df.isnull().sum().sum() > 0,
-        'has_duplicates': df.duplicated().sum() > 0,
-        'has_negative_values': any(df[col].min() < 0 for col in numeric_cols if len(df[col].dropna()) > 0),
-        'has_categorical_hierarchy': any(len(df[col].unique()) > 10 for col in categorical_cols),
-        'potential_segments': len(categorical_cols) > 0
-    }
-    # Insight Opportunities
-    insight_opportunities = []
-    if is_timeseries:
-        insight_opportunities.append("temporal_trends")
-    if len(numeric_cols) > 1:
-        insight_opportunities.append("correlations")
-    if len(categorical_cols) > 0 and len(numeric_cols) > 0:
-        insight_opportunities.append("segmentation")
-    if any(statistical_summary.get('outliers', {}).values()):
-        insight_opportunities.append("anomalies")
-    return {
-        'primary_domain': primary_domain,
-        'domain_confidence': domain_scores,
-        'data_structure': {
-            'is_timeseries': is_timeseries,
-            'numeric_cols': numeric_cols,
-            'categorical_cols': categorical_cols,
-            'datetime_cols': datetime_cols
         },
-        'statistical_profile': statistical_summary,
-        'patterns': patterns,
-        'insight_opportunities': insight_opportunities,
-        'narrative_suggestions': get_narrative_suggestions(primary_domain, insight_opportunities, patterns)
     }
-def get_narrative_suggestions(domain: str, opportunities: List[str], patterns: Dict) -> Dict[str, str]:
-    """Generate narrative direction based on domain and data characteristics"""
-    narrative_frameworks = {
-        'financial': {
-            'hook': "Follow the money trail that reveals your business's hidden opportunities",
-            'structure': "performance → trends → risks → opportunities",
-            'focus': "profitability, efficiency, growth patterns, risk indicators"
-        },
-        'survey': {
-            'hook': "Your customers are speaking - here's what they're really saying",
-            'structure': "sentiment → segments → drivers → actions",
-            'focus': "satisfaction drivers, demographic patterns, improvement areas"
-        },
-        'scientific': {
-            'hook': "The data reveals relationships that challenge conventional thinking",
-            'structure': "hypothesis → evidence → significance → implications",
-            'focus': "statistical significance, correlations, experimental validity"
-        },
-        'marketing': {
-            'hook': "Discover the customer journey patterns driving your growth",
-            'structure': "performance → segments → optimization → strategy",
-            'focus': "conversion funnels, customer segments, campaign effectiveness"
-        },
-        'operational': {
-            'hook': "Operational excellence lives in the details - here's where to look",
-            'structure': "efficiency → bottlenecks → optimization → impact",
-            'focus': "process efficiency, capacity utilization, improvement opportunities"
-        },
-        'general': {
-            'hook': "Every dataset tells a story - here's what yours is saying",
-            'structure': "overview → patterns → insights → implications",
-            'focus': "key patterns, significant relationships, actionable insights"
         }
-    }
-    return narrative_frameworks.get(domain, narrative_frameworks['general'])
-def json_serializable(obj):
-    """Convert objects to JSON-serializable format"""
-    if isinstance(obj, (np.integer, np.floating)):
-        return float(obj)
-    elif isinstance(obj, np.ndarray):
-        return obj.tolist()
-    elif isinstance(obj, (np.bool_, bool)):
-        return bool(obj)
-    elif isinstance(obj, dict):
-        return {k: json_serializable(v) for k, v in obj.items()}
-    elif isinstance(obj, (list, tuple)):
-        return [json_serializable(item) for item in obj]
-    elif pd.isna(obj):
-        return None
-    else:
-        return obj
-def create_autonomous_prompt(df: pd.DataFrame, enhanced_ctx: Dict, intelligence: Dict) -> str:
-    """
-    Generate a dynamic, intelligence-driven prompt that creates compelling narratives
-    rather than following templates.
-    """
-    domain = intelligence['primary_domain']
-    opportunities = intelligence['insight_opportunities']
-    narrative = intelligence['narrative_suggestions']
-    # Dynamic chart strategy based on data characteristics
-    chart_strategy = generate_chart_strategy(intelligence)
-    # Make context JSON serializable
-    serializable_ctx = json_serializable(enhanced_ctx)
-    prompt = f"""You are an elite data storyteller with deep expertise in {domain} analytics. Your mission is to uncover the compelling narrative hidden in this dataset and present it as a captivating story that drives action.
-**THE DATA'S STORY CONTEXT:**
-{json.dumps(serializable_ctx, indent=2)}
-**INTELLIGENCE ANALYSIS:**
-- Primary Domain: {domain}
-- Key Opportunities: {', '.join(opportunities)}
-- Data Characteristics: {json_serializable(intelligence['data_structure'])}
-- Narrative Framework: {narrative['structure']}
-**YOUR STORYTELLING MISSION:**
-{narrative['hook']}
-**NARRATIVE CONSTRUCTION GUIDELINES:**
-1. **LEAD WITH INTRIGUE**: Start with the most compelling finding that hooks the reader
-2. **BUILD TENSION**: Present contrasts, surprises, or unexpected patterns
-3. **REVEAL INSIGHTS**: Use data to resolve the tension with clear comprehensive explanations
-4. **DRIVE ACTION**: End with specific, actionable recommendations
-**VISUALIZATION STRATEGY:**
-{chart_strategy}
-**CRITICAL INSTRUCTIONS:**
-- Write as if you're revealing a detective story, not filling a template
-- Every insight must be explained and supported by data evidence
-- Use compelling headers that create curiosity (not "Executive Summary")
-- Weave charts naturally into the narrative flow
-- Focus on business impact and actionable outcomes
-- Let the data's personality shine through your writing style
-**CHART INTEGRATION:**
-Insert charts using: `<generate_chart: "chart_type | compelling description that advances the story">`
-Available types: bar, pie, line, scatter, hist, heatmap, area, bubble
-Transform this data into a story that decision-makers can't stop reading."""
-    return prompt
-def generate_chart_strategy(intelligence: Dict) -> str:
-    """Generate visualization strategy based on data intelligence"""
-    domain = intelligence['primary_domain']
-    opportunities = intelligence['insight_opportunities']
-    structure = intelligence['data_structure']
-    strategies = {
-        'financial': "Focus on trend lines showing performance over time, comparative bars for different categories, and scatter plots revealing correlations between financial metrics.",
-        'survey': "Emphasize distribution histograms for satisfaction scores, segmented bar charts for demographic breakdowns, and correlation matrices for response patterns.",
-        'scientific': "Prioritize scatter plots with regression lines, distribution comparisons, and statistical significance visualizations.",
-        'marketing': "Highlight conversion funnels, customer segment comparisons, and campaign performance trends.",
-        'operational': "Show efficiency trends, capacity utilization charts, and process performance comparisons."
-    }
-    base_strategy = strategies.get(domain, "Create visualizations that best tell your data's unique story.")
-    # Add specific guidance based on data characteristics
-    if structure['is_timeseries']:
-        base_strategy += " Leverage time-series visualizations like line or area charts to show trends and patterns over time."
-    if 'correlations' in opportunities:
-        base_strategy += " Include correlation visualizations like scatterplots or heatmaps to reveal hidden relationships."
-    if 'segmentation' in opportunities:
-        base_strategy += " Use segmented charts to highlight different groups or categories."
-    return base_strategy
-def generate_report_draft(buf, name: str, ctx: str, uid: str, project_id: str, bucket):
-    # This function remains unchanged as per the instructions.
-    logging.info(f"Generating autonomous report draft for project {project_id}")
-    df = load_dataframe_safely(buf, name)
-    llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", google_api_key=API_KEY, temperature=0.1)
-    ctx_dict = {"shape": df.shape, "columns": list(df.columns), "user_ctx": ctx}
-    enhanced_ctx = enhance_data_context(df, ctx_dict)
-    intelligence = analyze_data_intelligence(df, ctx_dict)
-    report_prompt = create_autonomous_prompt(df, enhanced_ctx, intelligence)
     md = llm.invoke(report_prompt).content
     chart_descs = extract_chart_tags(md)[:MAX_CHARTS]
     chart_urls = {}
     chart_generator = ChartGenerator(llm, df)
     for desc in chart_descs:
         safe_desc = sanitize_for_firebase_key(desc)
         md = md.replace(f'<generate_chart: "{desc}">', f'<generate_chart: "{safe_desc}">')
         md = md.replace(f'<generate_chart: {desc}>', f'<generate_chart: "{safe_desc}">')
         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
             img_path = Path(temp_file.name)
             try:
-                chart_spec = chart_generator.generate_chart_spec(desc)
                 if execute_chart_spec(chart_spec, df, img_path):
                     blob_name = f"sozo_projects/{uid}/{project_id}/charts/{uuid.uuid4().hex}.png"
                     blob = bucket.blob(blob_name)
                     blob.upload_from_filename(str(img_path))
                     chart_urls[safe_desc] = blob.public_url
-                    logging.info(f"Uploaded chart '{desc}' to {blob.public_url} with safe key '{safe_desc}'")
             finally:
                 if os.path.exists(img_path):
                     os.unlink(img_path)
-    return {"raw_md": md, "chartUrls": chart_urls}
 def generate_single_chart(df: pd.DataFrame, description: str, uid: str, project_id: str, bucket):
     logging.info(f"Generating single chart '{description}' for project {project_id}")
     llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=API_KEY, temperature=0.1)
     chart_generator = ChartGenerator(llm, df)
     with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
         img_path = Path(temp_file.name)
         try:
-            chart_spec = chart_generator.generate_chart_spec(description)
             if execute_chart_spec(chart_spec, df, img_path):
                 blob_name = f"sozo_projects/{uid}/{project_id}/charts/{uuid.uuid4().hex}.png"
                 blob = bucket.blob(blob_name)
@@ -713,26 +483,23 @@ def generate_single_chart(df: pd.DataFrame, description: str, uid: str, project_
                 os.unlink(img_path)
     return None
-# UPDATED: generate_video_from_project to handle Pexels integration
-def generate_video_from_project(df: pd.DataFrame, raw_md: str, uid: str, project_id: str, voice_model: str, bucket):
     logging.info(f"Generating video for project {project_id} with voice {voice_model}")
     llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", google_api_key=API_KEY, temperature=0.2)
-    # UPDATED: Prompt to create Intro/Conclusion scenes with stock video tags
     story_prompt = f"""
     Based on the following report, create a script for a {VIDEO_SCENES}-scene video.
     1. The first scene MUST be an "Introduction". It must contain narration and a stock video tag like: <generate_stock_video: "search query">.
     2. The last scene MUST be a "Conclusion". It must also contain narration and a stock video tag.
     3. The middle scenes should each contain narration and one chart tag from the report.
     4. Separate each scene with '[SCENE_BREAK]'.
     Report: {raw_md}
     Only output the script, no extra text.
     """
     script = llm.invoke(story_prompt).content
     scenes = [s.strip() for s in script.split("[SCENE_BREAK]") if s.strip()]
     video_parts, audio_parts, temps = [], [], []
     for i, sc in enumerate(scenes):
         chart_descs = extract_chart_tags(sc)
@@ -745,35 +512,36 @@ def generate_video_from_project(df: pd.DataFrame, raw_md: str, uid: str, project
         audio_bytes = deepgram_tts(narrative, voice_model)
         mp3 = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.mp3"
         if audio_bytes:
-            mp3.write_bytes(audio_bytes); dur = audio_duration(str(mp3))
-            if dur <= 0.1: dur = 5.0
         else:
-            dur = 5.0; generate_silence_mp3(dur, mp3)
         audio_parts.append(str(mp3)); temps.append(mp3)
         mp4 = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.mp4"
         video_generated = False
         if pexels_descs:
-            logging.info(f"Scene {i+1}: Found Pexels tag '{pexels_descs[0]}'. Searching for video.")
-            video_path = search_and_download_pexels_video(pexels_descs[0], dur, mp4)
             if video_path:
-                video_parts.append(video_path)
-                temps.append(Path(video_path))
                 video_generated = True
         if not video_generated and chart_descs:
-            logging.info(f"Scene {i+1}: Found chart tag '{chart_descs[0]}'. Generating chart animation.")
-            safe_chart(chart_descs[0], df, dur, mp4)
             video_parts.append(str(mp4)); temps.append(mp4)
             video_generated = True
         if not video_generated:
-            logging.warning(f"Scene {i+1}: No valid chart or stock video tag found. Using fallback image.")
             img = generate_image_from_prompt(narrative)
             img_cv = cv2.cvtColor(np.array(img.resize((WIDTH, HEIGHT))), cv2.COLOR_RGB2BGR)
-            animate_image_fade(img_cv, dur, mp4)
             video_parts.append(str(mp4)); temps.append(mp4)
     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_vid, \
@@ -787,12 +555,14 @@ def generate_video_from_project(df: pd.DataFrame, raw_md: str, uid: str, project
         concat_media(video_parts, silent_vid_path)
         concat_media(audio_parts, audio_mix_path)
-        subprocess.run(
-            ["ffmpeg", "-y", "-i", str(silent_vid_path), "-i", str(audio_mix_path),
             "-c:v", "libx264", "-pix_fmt", "yuv420p", "-c:a", "aac",
-            "-map", "0:v:0", "-map", "1:a:0", "-shortest", str(final_vid_path)],
-            check=True, capture_output=True,
-        )
         blob_name = f"sozo_projects/{uid}/{project_id}/video.mp4"
         blob = bucket.blob(blob_name)

 matplotlib.use("Agg")
 import matplotlib.pyplot as plt
 from matplotlib.animation import FuncAnimation, FFMpegWriter
+import seaborn as sns
+from scipy import stats
 from PIL import Image
 import cv2
 import inspect
 # Log INFO and above; every record carries a timestamp, the level and the emitting function's name.
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s')
 # Frame rate and frame size in pixels for generated video; animate_chart uses FPS as its default fps.
 FPS, WIDTH, HEIGHT = 24, 1280, 720
 # MAX_CHARTS caps how many chart tags of a report draft are rendered;
 # VIDEO_SCENES is the scene count requested in the video script prompt.
 MAX_CHARTS, VIDEO_SCENES = 5, 5
 # Compared against len(dataset_json) / 4 (a rough token estimate) in generate_report_draft.
+MAX_CONTEXT_TOKENS = 250000 # Set max token limit for full dataset context
 # --- API Initialization ---
 # The Google key is mandatory: importing this module fails when it is missing.
 API_KEY = os.getenv("GOOGLE_API_KEY")
 if not API_KEY:
     raise ValueError("GOOGLE_API_KEY environment variable not set.")
 # Not validated here; search_and_download_pexels_video logs a warning when it is unset.
 PEXELS_API_KEY = os.getenv("PEXELS_API_KEY")
 # --- Helper Functions ---
         return float(res.stdout.strip())
     except Exception: return 5.0
 # Chart placeholder, e.g. <generate_chart: "bar | sales by region"> or [generate_chart = ...].
 # The description (without surrounding quotes) is captured in the named group "d".
 TAG_RE = re.compile( r'[<[]\s*generate_?chart\s*[:=]?\s*[\"\'“”]?(?P<d>[^>\"\'”\]]+?)[\"\'“”]?\s*[>\]]', re.I, )
 # Same shape as TAG_RE, for <generate_stock_video: "search query"> placeholders.
 TAG_RE_PEXELS = re.compile( r'[<[]\s*generate_?stock_?video\s*[:=]?\s*[\"\'“”]?(?P<d>[^>\"\'”\]]+?)[\"\'“”]?\s*[>\]]', re.I, )
 def _unique_tag_descriptions(pattern, text):
     """Return the stripped "d" captures of pattern in text, de-duplicated in first-seen order.

     A falsy text (None or "") yields an empty list.
     """
     # dict.fromkeys keeps insertion order, so duplicates collapse onto their first occurrence.
     return list(dict.fromkeys(m.group("d").strip() for m in pattern.finditer(text or "")))
 def extract_chart_tags(t):
     """Return the distinct chart descriptions found in t, in order of first appearance."""
     return _unique_tag_descriptions(TAG_RE, t)
 def extract_pexels_tags(t):
     """Return the distinct stock-video search queries found in t, in order of first appearance."""
     return _unique_tag_descriptions(TAG_RE_PEXELS, t)
 # A "Scene <number>" label (plus trailing ':', '.', '-' or spaces) at the start of any line.
 re_scene = re.compile(r"^\s*scene\s*\d+[:.\- ]*", re.I | re.M)
 def clean_narration(txt: str) -> str:
     txt = TAG_RE.sub("", txt); txt = TAG_RE_PEXELS.sub("", txt); txt = re_scene.sub("", txt)
     except Exception:
         return placeholder_img()
 def search_and_download_pexels_video(query: str, duration: float, out_path: Path) -> str:
     if not PEXELS_API_KEY:
         logging.warning("PEXELS_API_KEY not set. Cannot fetch stock video.")
             logging.warning(f"No Pexels videos found for query: '{query}'")
             return None
         video_to_download = None
         for video in videos:
             for f in video.get('video_files', []):
             logging.warning(f"No suitable HD video file found for query: '{query}'")
             return None
         with requests.get(video_to_download, stream=True, timeout=60) as r:
             r.raise_for_status()
             with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_dl_file:
                     temp_dl_file.write(chunk)
                 temp_dl_path = Path(temp_dl_file.name)
         cmd = [
             "ffmpeg", "-y", "-i", str(temp_dl_path),
             "-vf", f"scale={WIDTH}:{HEIGHT}:force_original_aspect_ratio=decrease,pad={WIDTH}:{HEIGHT}:(ow-iw)/2:(oh-ih)/2,setsar=1",
         return None
 # --- Chart Generation System ---
 class ChartSpecification:
     """Value holder describing a single chart: its type, title, columns and display options.

     Every constructor argument is stored unchanged on the instance under the
     same name, except agg_method, which becomes "sum" when a falsy value
     (None or "") is given.
     """
     def __init__(
         self,
         chart_type: str,
         title: str,
         x_col: str,
         y_col: str = None,
         size_col: str = None,
         agg_method: str = None,
         filter_condition: str = None,
         top_n: int = None,
         color_scheme: str = "professional",
     ):
         # What to draw and which columns feed it.
         self.chart_type = chart_type
         self.title = title
         self.x_col = x_col
         self.y_col = y_col
         self.size_col = size_col
         # Aggregation defaults to "sum" whenever none is supplied.
         self.agg_method = agg_method if agg_method else "sum"
         # Optional row filter, top-N limit and colour palette name.
         self.filter_condition = filter_condition
         self.top_n = top_n
         self.color_scheme = color_scheme
 class ChartGenerator:
     def __init__(self, llm, df: pd.DataFrame):
         self.llm = llm; self.df = df
+    def generate_chart_spec(self, description: str, context: Dict) -> ChartSpecification:
         spec_prompt = f"""
+        You are a data visualization expert. Based on the dataset context and chart description, generate a precise chart specification.
+        **Dataset Context:** {json.dumps(context, indent=2)}
         **Chart Request:** {description}
         **Return a JSON specification with these exact fields:**
         {{
             "agg_method": "sum|mean|count|max|min|null",
             "top_n": "number_for_top_n_filtering_or_null"
         }}
+        Return only the JSON specification, no additional text.
         """
         try:
             response = self.llm.invoke(spec_prompt).content.strip()
             return ChartSpecification(**filtered_dict)
         except Exception as e:
             logging.error(f"Spec generation failed: {e}. Using fallback.")
+            numeric_cols = context.get('schema', {}).get('numeric_columns', list(self.df.select_dtypes(include=['number']).columns))
+            categorical_cols = context.get('schema', {}).get('categorical_columns', list(self.df.select_dtypes(exclude=['number']).columns))
+            ctype = "bar"
+            for t in ["pie", "line", "scatter", "hist", "heatmap", "area", "bubble"]:
+                if t in description.lower(): ctype = t
+            x = categorical_cols[0] if categorical_cols else self.df.columns[0]
+            y = numeric_cols[0] if numeric_cols and len(self.df.columns) > 1 else (self.df.columns[1] if len(self.df.columns) > 1 else None)
+            return ChartSpecification(ctype, description, x, y)
 def execute_chart_spec(spec: ChartSpecification, df: pd.DataFrame, output_path: Path) -> bool:
     try:
         plot_data = prepare_plot_data(spec, df)
         return True
     except Exception as e: logging.error(f"Static chart generation failed for '{spec.title}': {e}"); return False
 def prepare_plot_data(spec: ChartSpecification, df: pd.DataFrame):
     if spec.chart_type not in ["heatmap"]:
         if spec.x_col not in df.columns or (spec.y_col and spec.y_col not in df.columns): raise ValueError(f"Invalid columns in chart spec: {spec.x_col}, {spec.y_col}")
     return df[spec.x_col]
 # --- Animation & Video Generation ---
 def animate_chart(spec: ChartSpecification, df: pd.DataFrame, dur: float, out: Path, fps: int = FPS) -> str:
     plot_data = prepare_plot_data(spec, df)
     frames = max(10, int(dur * fps))
             return bars
     elif ctype == "scatter":
         x_full, y_full = plot_data.iloc[:, 0], plot_data.iloc[:, 1]
         slope, intercept, _, _, _ = stats.linregress(x_full, y_full)
         reg_line_x = np.array([x_full.min(), x_full.max()])
         reg_line_y = slope * reg_line_x + intercept
         scat = ax.scatter([], [], alpha=0.7, color='#F18F01')
+        line, = ax.plot([], [], 'r--', lw=2)
         ax.set_xlim(x_full.min(), x_full.max()); ax.set_ylim(y_full.min(), y_full.max())
         ax.set_title(spec.title); ax.grid(alpha=.3); ax.set_xlabel(spec.x_col); ax.set_ylabel(spec.y_col)
         def init():
+            scat.set_offsets(np.empty((0, 2))); line.set_data([], [])
             return [scat, line]
         def update(i):
             point_frames = int(frames * 0.7)
             if i <= point_frames:
                 k = max(1, int(len(x_full) * (i / point_frames)))
                 scat.set_offsets(plot_data.iloc[:k].values)
             else:
+                line_frame = i - point_frames; line_total_frames = frames - point_frames
                 current_x = reg_line_x[0] + (reg_line_x[1] - reg_line_x[0]) * (line_frame / line_total_frames)
                 line.set_data([reg_line_x[0], current_x], [reg_line_y[0], slope * current_x + intercept])
             return [scat, line]
             k = max(2, int(len(x_full) * (i / (frames - 1))))
             fill = ax.fill_between(x_full[:k], y_full[:k], color="#4E79A7", alpha=0.4)
             return [fill]
     else: # line (Time Series)
         line, = ax.plot([], [], lw=2, color='#A23B72')
+        markers, = ax.plot([], [], 'o', color='#A23B72', markersize=5)
         plot_data = plot_data.sort_index() if not plot_data.index.is_monotonic_increasing else plot_data
         x_full, y_full = plot_data.index, plot_data.values
         ax.set_xlim(x_full.min(), x_full.max()); ax.set_ylim(y_full.min() * 0.9, y_full.max() * 1.1)
         ax.set_title(spec.title); ax.grid(alpha=.3); ax.set_xlabel(spec.x_col); ax.set_ylabel(spec.y_col)
         def init():
+            line.set_data([], []); markers.set_data([], [])
             return [line, markers]
         def update(i):
             k = max(2, int(len(x_full) * (i / (frames - 1))))
+            line.set_data(x_full[:k], y_full[:k]); markers.set_data(x_full[:k], y_full[:k])
             return [line, markers]
     anim = FuncAnimation(fig, update, init_func=init, frames=frames, blit=True, interval=1000 / fps)
     video_writer.release()
     return str(out)
+def safe_chart(desc: str, df: pd.DataFrame, dur: float, out: Path, context: Dict) -> str:
     try:
         llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=API_KEY, temperature=0.1)
         chart_generator = ChartGenerator(llm, df)
+        chart_spec = chart_generator.generate_chart_spec(desc, context)
         return animate_chart(chart_spec, df, dur, out)
     except Exception as e:
         logging.error(f"Chart animation failed for '{desc}': {e}. Falling back to static image.")
             temp_png = Path(temp_png_file.name)
         llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=API_KEY, temperature=0.1)
         chart_generator = ChartGenerator(llm, df)
+        chart_spec = chart_generator.generate_chart_spec(desc, context)
         if execute_chart_spec(chart_spec, df, temp_png):
             img = cv2.imread(str(temp_png)); os.unlink(temp_png)
             img_resized = cv2.resize(img, (WIDTH, HEIGHT))
     finally:
         list_file.unlink(missing_ok=True)
+# --- Main Business Logic ---
 def sanitize_for_firebase_key(text: str) -> str:
     """Return text with each of the characters . $ # [ ] / replaced by an underscore."""
     # A single translation-table pass covers all six forbidden characters.
     underscore_map = str.maketrans({forbidden: "_" for forbidden in ".$#[]/"})
     return text.translate(underscore_map)
+def get_augmented_context(df: pd.DataFrame, user_ctx: str) -> Dict:
+    """Creates a detailed summary of the dataframe for the AI."""
     # Split the columns by dtype: pandas "number" dtypes vs. everything else
     # (the latter are reported as categorical in the returned schema).
+    numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
+    categorical_cols = df.select_dtypes(exclude=['number']).columns.tolist()
+    context = {
+        "user_context": user_ctx,
+        "dataset_shape": {"rows": df.shape[0], "columns": df.shape[1]},
+        "schema": {
+            "numeric_columns": numeric_cols,
+            "categorical_columns": categorical_cols
         },
+        "data_previews": {}
     }
     # Previews cover only the first five columns of each kind (presumably to keep the summary short).
     # Non-numeric columns: how many distinct values exist, plus up to five of them.
+    for col in categorical_cols[:5]:
+        unique_vals = df[col].unique()
+        context["data_previews"][col] = {
+            "count": len(unique_vals),
+            "values": unique_vals[:5].tolist()
+        }
     # Numeric columns: mean, minimum and maximum.
+    for col in numeric_cols[:5]:
+        context["data_previews"][col] = {
+            "mean": df[col].mean(),
+            "min": df[col].min(),
+            "max": df[col].max()
         }
     # Round-trip through JSON so the returned dict holds only JSON-native types;
     # default=str turns any value json cannot encode into its string form.
     # NOTE(review): this presumably stringifies numpy integer min/max values - confirm
     # whether consumers of "data_previews" expect numbers there.
+    return json.loads(json.dumps(context, default=str))
+def generate_report_draft(buf, name: str, ctx: str, uid: str, project_id: str, bucket):
     # Draft a Markdown report for one uploaded dataset and render its charts.
     # Steps: load the dataframe from (buf, name), ask the LLM for a report, render up to
     # MAX_CHARTS of the chart tags it contains to PNG, upload each PNG to `bucket`.
     # Returns a dict with "raw_md" (the Markdown), "chartUrls" (sanitized description -> URL)
     # and "data_context" (the dataset summary handed to the chart generator).
+    logging.info(f"Generating report draft for project {project_id}")
+    df = load_dataframe_safely(buf, name)
+    llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", google_api_key=API_KEY, temperature=0.1)
+    data_context_str = ""
+    context_for_charts = {}
     # Prefer giving the model the whole dataset as JSON; its size is estimated at
     # four characters per token and compared against MAX_CONTEXT_TOKENS.
+    try:
+        df_json = df.to_json(orient='records')
+        estimated_tokens = len(df_json) / 4
+        if estimated_tokens < MAX_CONTEXT_TOKENS:
+            logging.info(f"Dataset is small enough ({estimated_tokens:.0f} tokens). Using full JSON context.")
+            data_context_str = f"Here is the full dataset in JSON format:\n{df_json}"
             # Chart specs get the augmented summary even when the report prompt gets the full JSON.
+            context_for_charts = get_augmented_context(df, ctx)
+        else:
             # Raised on purpose so the handler below covers both "too large" and serialization errors.
+            raise ValueError("Dataset too large for full context.")
+    except Exception as e:
+        logging.warning(f"Could not use full JSON context ({e}). Falling back to augmented summary.")
+        augmented_context = get_augmented_context(df, ctx)
+        data_context_str = f"The full dataset is too large to display. Here is a detailed summary:\n{json.dumps(augmented_context, indent=2)}"
+        context_for_charts = augmented_context
     # The prompt below is sent to the model verbatim with the chosen data context embedded.
+    report_prompt = f"""
+    You are an expert data analyst and business intelligence storyteller. Your mission is to analyze the provided data context and write a comprehensive, executive-level report in Markdown format.
+    **Data Context:**
+    {data_context_str}
+    **Critical Instructions:**
+    1.  **Data Grounding:** Your entire analysis and narrative **must strictly** use the column names and data provided in the 'Data Context' section. Do not invent, modify, or assume any column names that are not on this list. This is the most important rule.
+    2.  **Report Goal:** Create a well-structured, professional report in Markdown that tells a compelling story from the data. The structure of the report is entirely up to you, but it should be logical and easy to follow.
+    3.  **Visual Support:** Wherever a key finding, trend, or significant point is made in your narrative, you **must** support it with a chart tag using the format: `<generate_chart: "chart_type | a specific, compelling description">`.
+    4.  **Chart Tag Grounding:** The column names used in your chart descriptions **must** also be an exact match from the provided data context.
+    5.  **Available Chart Types:** `bar, pie, line, scatter, hist, heatmap, area, bubble`.
+    Now, generate the complete Markdown report.
+    """
     md = llm.invoke(report_prompt).content
     # Only the first MAX_CHARTS distinct chart descriptions are rendered.
     chart_descs = extract_chart_tags(md)[:MAX_CHARTS]
     chart_urls = {}
     chart_generator = ChartGenerator(llm, df)
     for desc in chart_descs:
         safe_desc = sanitize_for_firebase_key(desc)
         # Rewrite the tag in the Markdown so its description equals the key used in chart_urls.
         # NOTE(review): only these two exact spellings are rewritten; other forms accepted by
         # TAG_RE (square brackets, '=' separator, curly quotes) are left as they are.
         md = md.replace(f'<generate_chart: "{desc}">', f'<generate_chart: "{safe_desc}">')
         md = md.replace(f'<generate_chart: {desc}>', f'<generate_chart: "{safe_desc}">')
         # delete=False: the PNG is removed explicitly in the finally block below.
         # NOTE(review): the handle is still open while the path is passed on for writing;
         # reopening an open temp file by name may fail on Windows - confirm target platforms.
         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
             img_path = Path(temp_file.name)
             try:
+                chart_spec = chart_generator.generate_chart_spec(desc, context_for_charts)
                 # A chart that fails to render is skipped; it simply gets no entry in chart_urls.
                 if execute_chart_spec(chart_spec, df, img_path):
                     blob_name = f"sozo_projects/{uid}/{project_id}/charts/{uuid.uuid4().hex}.png"
                     blob = bucket.blob(blob_name)
                     blob.upload_from_filename(str(img_path))
                     chart_urls[safe_desc] = blob.public_url
             finally:
                 if os.path.exists(img_path):
                     os.unlink(img_path)
+    return {"raw_md": md, "chartUrls": chart_urls, "data_context": context_for_charts}
 def generate_single_chart(df: pd.DataFrame, description: str, uid: str, project_id: str, bucket):
     logging.info(f"Generating single chart '{description}' for project {project_id}")
     llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=API_KEY, temperature=0.1)
     chart_generator = ChartGenerator(llm, df)
+    context = get_augmented_context(df, "")
     with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
         img_path = Path(temp_file.name)
         try:
+            chart_spec = chart_generator.generate_chart_spec(description, context)
             if execute_chart_spec(chart_spec, df, img_path):
                 blob_name = f"sozo_projects/{uid}/{project_id}/charts/{uuid.uuid4().hex}.png"
                 blob = bucket.blob(blob_name)
                 os.unlink(img_path)
     return None
+def generate_video_from_project(df: pd.DataFrame, raw_md: str, data_context: Dict, uid: str, project_id: str, voice_model: str, bucket):
     logging.info(f"Generating video for project {project_id} with voice {voice_model}")
     llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", google_api_key=API_KEY, temperature=0.2)
     story_prompt = f"""
     Based on the following report, create a script for a {VIDEO_SCENES}-scene video.
     1. The first scene MUST be an "Introduction". It must contain narration and a stock video tag like: <generate_stock_video: "search query">.
     2. The last scene MUST be a "Conclusion". It must also contain narration and a stock video tag.
     3. The middle scenes should each contain narration and one chart tag from the report.
     4. Separate each scene with '[SCENE_BREAK]'.
     Report: {raw_md}
     Only output the script, no extra text.
     """
     script = llm.invoke(story_prompt).content
     scenes = [s.strip() for s in script.split("[SCENE_BREAK]") if s.strip()]
     video_parts, audio_parts, temps = [], [], []
+    total_audio_duration = 0.0
     for i, sc in enumerate(scenes):
         chart_descs = extract_chart_tags(sc)
         audio_bytes = deepgram_tts(narrative, voice_model)
         mp3 = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.mp3"
+        audio_dur = 5.0
         if audio_bytes:
+            mp3.write_bytes(audio_bytes)
+            audio_dur = audio_duration(str(mp3))
+            if audio_dur <= 0.1: audio_dur = 5.0
         else:
+            generate_silence_mp3(audio_dur, mp3)
         audio_parts.append(str(mp3)); temps.append(mp3)
+        total_audio_duration += audio_dur
+        video_dur = audio_dur + 0.5
         mp4 = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.mp4"
         video_generated = False
         if pexels_descs:
+            video_path = search_and_download_pexels_video(pexels_descs[0], video_dur, mp4)
             if video_path:
+                video_parts.append(video_path); temps.append(Path(video_path))
                 video_generated = True
         if not video_generated and chart_descs:
+            safe_chart(chart_descs[0], df, video_dur, mp4, data_context)
             video_parts.append(str(mp4)); temps.append(mp4)
             video_generated = True
         if not video_generated:
             img = generate_image_from_prompt(narrative)
             img_cv = cv2.cvtColor(np.array(img.resize((WIDTH, HEIGHT))), cv2.COLOR_RGB2BGR)
+            animate_image_fade(img_cv, video_dur, mp4)
             video_parts.append(str(mp4)); temps.append(mp4)
     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_vid, \
         concat_media(video_parts, silent_vid_path)
         concat_media(audio_parts, audio_mix_path)
+        cmd = [
+            "ffmpeg", "-y", "-i", str(silent_vid_path), "-i", str(audio_mix_path),
             "-c:v", "libx264", "-pix_fmt", "yuv420p", "-c:a", "aac",
+            "-map", "0:v:0", "-map", "1:a:0",
+            "-t", f"{total_audio_duration:.3f}",
+            str(final_vid_path)
+        ]
+        subprocess.run(cmd, check=True, capture_output=True)
         blob_name = f"sozo_projects/{uid}/{project_id}/video.mp4"
         blob = bucket.blob(blob_name)