Spaces:

Kacemath
/

data-mining-tp

Sleeping

App Files Files Community

Kacemath committed 28 days ago

Commit

b490ee7

1 Parent(s): 7f14f0b

Deploy gradio movie revenue app with model and preprocessing

Browse files

Files changed (13) hide show

app.py +123 -0
components/__init__.py +7 -0
components/__pycache__/__init__.cpython-310.pyc +0 -0
components/__pycache__/examples.cpython-310.pyc +0 -0
components/__pycache__/inputs.cpython-310.pyc +0 -0
components/__pycache__/prediction.cpython-310.pyc +0 -0
components/examples.py +83 -0
components/inputs.py +158 -0
components/prediction.py +92 -0
model/final_model.pkl +3 -0
requirements.txt +4 -0
src/__pycache__/preprocess.cpython-310.pyc +0 -0
src/preprocess.py +157 -0

app.py ADDED Viewed

	@@ -0,0 +1,123 @@

+"""Movie Box Office Revenue Predictor - Gradio Web Application."""
+from __future__ import annotations
+from pathlib import Path
+from typing import Any
+import gradio as gr
+from components import create_input_form, get_example_data, predict_revenue_from_form
+from src.preprocess import load_model, parse_feature_options
+# Configuration: location of the pickled model, relative to the working directory.
+MODEL_PATH = Path("model/final_model.pkl")
+# Global state, populated once at import time and read by build_app().
+MODEL: Any | None = None  # loaded estimator, or None when loading failed
+MODEL_ERROR: str | None = None  # text of the exception raised during initialisation, if any
+FEATURE_OPTIONS: dict[str, list[str]] = {}  # dropdown choices derived from the model's feature names
+# Initialize model. Any failure is recorded instead of raised so the UI can
+# still be built. Note that if parse_feature_options() is the call that fails,
+# MODEL stays set while MODEL_ERROR is filled in as well.
+try:
+    MODEL = load_model(MODEL_PATH)
+    FEATURE_OPTIONS = parse_feature_options(list(MODEL.feature_names_in_))
+except Exception as exc:
+    MODEL_ERROR = str(exc)
+def build_app() -> gr.Blocks:
+    """Build and configure the Gradio interface.
+
+    Lays out the header, an optional initialisation-error banner, the input
+    form with its action buttons and examples, and the two result panels,
+    then wires the predict button to ``predict_revenue_from_form``.
+
+    Returns:
+        The assembled ``gr.Blocks`` application; launching is left to the caller.
+    """
+    with gr.Blocks(
+        title="🎬 Movie Revenue Predictor",
+    ) as app:
+        # Header
+        with gr.Row():
+            gr.Markdown(
+                """
+                # 🎬 Movie Box Office Revenue Predictor
+                Predict movie revenue using machine learning trained on historical box office data.
+                Enter movie details below and get instant revenue predictions with profitability analysis.
+                """,
+                elem_classes=["header"]
+            )
+        # Model status. gr.Warning() is a pop-up alert intended to be triggered
+        # from an event handler; called while the layout is being built it is
+        # never shown in the page, so the problem is rendered as a visible
+        # Markdown banner instead. MODEL_ERROR is also checked on its own
+        # because initialisation can fail after the model itself was loaded.
+        if MODEL is None or MODEL_ERROR is not None:
+            reason = MODEL_ERROR or "no model could be loaded"
+            gr.Markdown(f"⚠️ **Model loading error:** {reason}")
+        # Main content
+        with gr.Row():
+            with gr.Column(scale=3):
+                # Input form (only the ordered component list is needed here)
+                _input_dict, input_list = create_input_form(FEATURE_OPTIONS)
+                # Action buttons
+                with gr.Row():
+                    predict_btn = gr.Button(
+                        "🎯 Predict Revenue",
+                        variant="primary",
+                        scale=2,
+                        size="lg"
+                    )
+                    gr.ClearButton(
+                        components=input_list,
+                        value="🔄 Clear",
+                        scale=1,
+                        size="lg"
+                    )
+                # Examples
+                gr.Markdown("### 📝 Quick Examples")
+                gr.Examples(
+                    examples=get_example_data(FEATURE_OPTIONS),
+                    inputs=input_list,
+                    label="Click an example to auto-fill the form",
+                )
+            with gr.Column(scale=2):
+                gr.Markdown("### 📊 Prediction Results")
+                # Output displays
+                prediction_output = gr.Markdown(
+                    "💡 Fill in the form and click **Predict Revenue** to see results.",
+                    elem_classes=["output-box"]
+                )
+                profitability_output = gr.Markdown(
+                    "",
+                    elem_classes=["output-box"]
+                )
+        # Event handlers: the form values arrive positionally, in input_list order.
+        predict_btn.click(
+            fn=lambda *args: predict_revenue_from_form(MODEL, *args),
+            inputs=input_list,
+            outputs=[prediction_output, profitability_output],
+        )
+    return app
+def main():
+    """Build the app and serve it on 0.0.0.0:7860 with the zinc/slate theme."""
+    launch_options = {
+        "server_name": "0.0.0.0",
+        "server_port": 7860,
+        "share": False,
+        "theme": gr.themes.Default(
+            primary_hue="zinc",
+            secondary_hue="slate",
+            neutral_hue="slate",
+        ),
+    }
+    build_app().launch(**launch_options)
+if __name__ == "__main__":
+    main()

components/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+"""UI components for the movie revenue predictor."""
+from components.inputs import create_input_form
+from components.prediction import predict_revenue_from_form
+from components.examples import get_example_data
+__all__ = ["create_input_form", "predict_revenue_from_form", "get_example_data"]

components/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (404 Bytes). View file

components/__pycache__/examples.cpython-310.pyc ADDED Viewed

Binary file (1.78 kB). View file

components/__pycache__/inputs.cpython-310.pyc ADDED Viewed

Binary file (3.48 kB). View file

components/__pycache__/prediction.cpython-310.pyc ADDED Viewed

Binary file (2.17 kB). View file

components/examples.py ADDED Viewed

	@@ -0,0 +1,83 @@

+"""Example data for the movie predictor."""
+from __future__ import annotations
+from typing import Any
+def get_example_data(feature_options: dict[str, list[str]]) -> list[list[Any]]:
+    """Generate example movie rows for quick testing.
+
+    Args:
+        feature_options: Available choices keyed by ``"genres"``,
+            ``"production_companies"``, ``"Keywords"`` and ``"cast"``.
+            Missing keys are treated as empty lists.
+
+    Returns:
+        Three rows of 19 values each, in the column order noted on every
+        line below (budget ... cast). The four multi-select columns hold the
+        first few available options, capped per example; slicing already
+        copes with lists shorter than the cap.
+    """
+    # Keep the full option lists here: truncating them up front would silently
+    # shrink the larger per-example caps below (e.g. five keywords).
+    genres = feature_options.get("genres", [])
+    companies = feature_options.get("production_companies", [])
+    keywords = feature_options.get("Keywords", [])
+    cast = feature_options.get("cast", [])
+    return [
+        [
+            # Blockbuster Example (similar to Avengers-type movies)
+            200_000_000,  # budget
+            64.0,  # popularity
+            143,  # runtime
+            "2019-04-26",  # release_date
+            "en",  # original_language
+            True,  # belongs_to_collection
+            True,  # homepage
+            "The Final Showdown",  # title
+            "Whatever it takes",  # tagline
+            "After devastating events, heroes must assemble once more to undo chaos and restore order to the universe.",  # overview
+            25,  # num_of_cast
+            50,  # num_of_crew
+            8,  # gender_cast_1 (female)
+            15,  # gender_cast_2 (male)
+            2,  # count_cast_other
+            genres[:3],  # genres
+            companies[:2],  # production_companies
+            keywords[:5],  # keywords
+            cast[:5],  # cast
+        ],
+        [
+            # Mid-Budget Comedy
+            40_000_000,  # budget
+            8.2,  # popularity
+            113,  # runtime
+            "2015-08-06",  # release_date
+            "en",  # original_language
+            True,  # belongs_to_collection
+            True,  # homepage
+            "Royal Wedding",  # title
+            "Royalty has its responsibilities",  # tagline
+            "A young princess must navigate royal duties while finding true love before her coronation.",  # overview
+            20,  # num_of_cast
+            30,  # num_of_crew
+            12,  # gender_cast_1 (female)
+            8,  # gender_cast_2 (male)
+            0,  # count_cast_other
+            genres[:2],  # genres
+            companies[:1],  # production_companies
+            keywords[:3],  # keywords
+            cast[:3],  # cast
+        ],
+        [
+            # Low-Budget Thriller
+            3_300_000,  # budget
+            35.0,  # popularity
+            105,  # runtime
+            "2014-10-10",  # release_date
+            "en",  # original_language
+            False,  # belongs_to_collection
+            True,  # homepage
+            "Perfect Rhythm",  # title
+            "Greatness comes at a price",  # tagline
+            "A talented musician pushes beyond limits under the guidance of a demanding instructor.",  # overview
+            15,  # num_of_cast
+            25,  # num_of_crew
+            3,  # gender_cast_1 (female)
+            10,  # gender_cast_2 (male)
+            2,  # count_cast_other
+            genres[:1],  # genres
+            companies[:2],  # production_companies
+            keywords[:4],  # keywords
+            cast[:2],  # cast
+        ],
+    ]

components/inputs.py ADDED Viewed

	@@ -0,0 +1,158 @@

+"""Input form components for the movie predictor."""
+from __future__ import annotations
+import gradio as gr
+def create_input_form(feature_options: dict[str, list[str]]) -> tuple[dict[str, gr.components.Component], list[gr.components.Component]]:
+    """Lay out every movie-attribute widget and return the created components.
+
+    Args:
+        feature_options: Dropdown choices keyed by ``"genres"``,
+            ``"production_companies"``, ``"Keywords"`` and ``"cast"``;
+            a missing key yields an empty choice list.
+
+    Returns:
+        A pair ``(by_name, ordered)``: the components keyed by field name,
+        and the same components as a list in the fixed positional order
+        budget ... cast.
+    """
+    fields: dict[str, gr.components.Component] = {}
+    # Row 1: core metrics, release info and boolean flags side by side.
+    with gr.Row():
+        with gr.Column(scale=2):
+            gr.Markdown("### 📊 Core Metrics")
+            fields["budget"] = gr.Number(label="Budget ($)", value=50_000_000, info="Production budget in USD")
+            fields["popularity"] = gr.Number(label="Popularity Score", value=10.0, info="Trending score (0-100)")
+            fields["runtime"] = gr.Number(label="Runtime (minutes)", value=105, info="Movie duration")
+        with gr.Column(scale=2):
+            gr.Markdown("### 📅 Release Info")
+            fields["release_date"] = gr.Textbox(label="Release Date", value="2015-06-12", info="Format: YYYY-MM-DD")
+            fields["original_language"] = gr.Dropdown(label="Original Language", choices=["en", "zh", "ja", "other"], value="en")
+        with gr.Column(scale=1):
+            gr.Markdown("### ✓ Flags")
+            fields["belongs_to_collection"] = gr.Checkbox(label="Part of Collection", value=False)
+            fields["homepage"] = gr.Checkbox(label="Has Homepage", value=False)
+    # Row 2: free-text details.
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("### 🎬 Movie Details")
+            fields["title"] = gr.Textbox(label="Title", value="Sample Movie")
+            fields["tagline"] = gr.Textbox(label="Tagline", value="A new story begins")
+            fields["overview"] = gr.Textbox(label="Overview", value="A movie about discovery, conflict, and ambition.", lines=3)
+    # Row 3: cast and crew head counts.
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("### 👥 Cast & Crew Statistics")
+            gr.Markdown("*These are typically derived from cast/crew data. Estimate if unknown.*")
+            with gr.Row():
+                fields["num_of_cast"] = gr.Number(label="Total Cast Members", value=10, info="Number of actors")
+                fields["num_of_crew"] = gr.Number(label="Total Crew Members", value=10, info="Number of crew")
+            with gr.Row():
+                fields["gender_cast_1"] = gr.Number(label="Female Cast (Gender=1)", value=4, info="Number of female actors")
+                fields["gender_cast_2"] = gr.Number(label="Male Cast (Gender=2)", value=5, info="Number of male actors")
+                fields["count_cast_other"] = gr.Number(label="Other/Unknown Gender", value=1, info="Other gender identities")
+    # Collapsed section: multi-select options taken from feature_options.
+    with gr.Accordion("🎭 Optional: Genres, Companies & More", open=False):
+        with gr.Row():
+            fields["genres"] = gr.Dropdown(label="Genres", choices=feature_options.get("genres", []), multiselect=True, info="Select one or more genres")
+            fields["production_companies"] = gr.Dropdown(label="Production Companies", choices=feature_options.get("production_companies", []), multiselect=True, info="Select production companies")
+        with gr.Row():
+            fields["keywords"] = gr.Dropdown(label="Keywords", choices=feature_options.get("Keywords", []), multiselect=True, info="Content keywords")
+            fields["cast"] = gr.Dropdown(label="Notable Cast", choices=feature_options.get("cast", []), multiselect=True, info="Famous actors")
+    # Positional order in which the components are handed to event handlers.
+    positional_order = (
+        "budget",
+        "popularity",
+        "runtime",
+        "release_date",
+        "original_language",
+        "belongs_to_collection",
+        "homepage",
+        "title",
+        "tagline",
+        "overview",
+        "num_of_cast",
+        "num_of_crew",
+        "gender_cast_1",
+        "gender_cast_2",
+        "count_cast_other",
+        "genres",
+        "production_companies",
+        "keywords",
+        "cast",
+    )
+    return fields, [fields[name] for name in positional_order]

components/prediction.py ADDED Viewed

	@@ -0,0 +1,92 @@

+"""Prediction logic and output formatting."""
+from __future__ import annotations
+from typing import Any
+from src.preprocess import predict_revenue
+# Field names in the positional order the form values reach
+# predict_revenue_from_form(), which zips this list with its *values.
+# Must stay in sync with the ordered component list that
+# components/inputs.py:create_input_form() returns.
+INPUT_ORDER = [
+    "budget",
+    "popularity",
+    "runtime",
+    "release_date",
+    "original_language",
+    "belongs_to_collection",
+    "homepage",
+    "title",
+    "tagline",
+    "overview",
+    "num_of_cast",
+    "num_of_crew",
+    "gender_cast_1",
+    "gender_cast_2",
+    "count_cast_other",
+    "genres",
+    "production_companies",
+    "keywords",
+    "cast",
+]
+def format_currency(value: float) -> str:
+    """Format a dollar amount compactly.
+
+    Amounts of at least one billion are shown as ``$1.50B``, at least one
+    million as ``$2.00M``, and anything smaller with thousands separators
+    and no decimals (``$12,345``). Negative amounts (e.g. an expected loss)
+    are scaled by magnitude and carry a leading minus: ``-$50.00M``.
+
+    Args:
+        value: Amount in dollars; may be negative.
+
+    Returns:
+        The formatted string.
+    """
+    # Pick the unit from the magnitude so losses get the same M/B scaling as gains.
+    sign = "-" if value < 0 else ""
+    magnitude = abs(value)
+    if magnitude >= 1_000_000_000:
+        return f"{sign}${magnitude / 1_000_000_000:.2f}B"
+    if magnitude >= 1_000_000:
+        return f"{sign}${magnitude / 1_000_000:.2f}M"
+    return f"{sign}${magnitude:,.0f}"
+def predict_revenue_from_form(model: Any, *values: Any) -> tuple[str, str]:
+    """
+    Turn positional form values into a prediction and Markdown summaries.
+    Args:
+        model: Loaded model, or ``None`` when none is available.
+        *values: Form values in ``INPUT_ORDER`` order.
+    Returns:
+        Tuple of (prediction_text, profitability_text). On any failure the
+        first element carries the error message and the second is empty.
+    """
+    if model is None:
+        return "❌ Model not available", ""
+    # Pair each positional value with its field name, then normalise the flags.
+    payload = dict(zip(INPUT_ORDER, values))
+    for flag in ("belongs_to_collection", "homepage"):
+        payload[flag] = int(bool(payload.get(flag)))
+    tagline = str(payload.get("tagline") or "")
+    payload["has_tagline"] = int(bool(tagline.strip()))
+    try:
+        revenue = predict_revenue(model, payload)
+        budget = float(payload.get("budget") or 0.0)
+        headline = f"## 💰 Predicted Revenue\n### {format_currency(revenue)}"
+        # Without a positive budget there is nothing to compare the revenue to.
+        if not budget > 0:
+            return headline, "ℹ️ Enter a budget to see profitability analysis"
+        profit = revenue - budget
+        roi = profit / budget * 100
+        multiple = revenue / budget
+        if roi > 100:
+            verdict = "🟢 **Highly Profitable**"
+        elif roi > 0:
+            verdict = "🟡 **Profitable**"
+        else:
+            verdict = "🔴 **Loss Expected**"
+        # Leading and trailing empty items reproduce the surrounding newlines.
+        breakdown = "\n".join(
+            [
+                "",
+                verdict,
+                f"- **Budget:** {format_currency(budget)}",
+                f"- **Revenue Multiple:** {multiple:.2f}x",
+                f"- **ROI:** {roi:+.1f}%",
+                f"- **Estimated Profit:** {format_currency(profit)}",
+                "",
+            ]
+        )
+        return headline, breakdown
+    except Exception as exc:
+        return f"❌ Prediction Error\n```\n{str(exc)}\n```", ""

model/final_model.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:674d26ddae3d3f076f531fea4cf8f0e8676075f4e082fb3dd3d8467864e064c6
+size 21812058

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+gradio==6.5.1
+numpy==2.2.6
+pandas==2.3.3
+scikit-learn==1.6.1

src/__pycache__/preprocess.cpython-310.pyc ADDED Viewed

Binary file (5.21 kB). View file

src/preprocess.py ADDED Viewed

	@@ -0,0 +1,157 @@

+from __future__ import annotations
+import math
+import pickle
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+import numpy as np
+import pandas as pd
+# Integer codes for the original_language feature; build_feature_row() encodes
+# any language not listed here as 0.
+LANGUAGE_MAPPING = {"en": 1, "zh": 2, "ja": 3}
+# Maps each feature-name prefix used for the model's per-option columns
+# ("<prefix>_<option>") to the form key holding the user's selections.
+# Note the capitalised "Keywords" prefix versus the lowercase form key.
+PREFIX_TO_FORM_KEY = {
+    "genres": "genres",
+    "production_companies": "production_companies",
+    "Keywords": "keywords",
+    "cast": "cast",
+}
+def load_model(model_path: str | Path) -> Any:
+    """Unpickle and return the object stored at *model_path*."""
+    # NOTE: unpickling can execute arbitrary code; only load trusted files.
+    source = Path(model_path)
+    with source.open("rb") as handle:
+        model = pickle.load(handle)
+    return model
+def get_model_feature_names(model: Any) -> list[str]:
+    """Return ``model.feature_names_in_`` as a list.
+
+    Raises:
+        ValueError: If the model has no ``feature_names_in_`` attribute.
+    """
+    if hasattr(model, "feature_names_in_"):
+        return list(model.feature_names_in_)
+    raise ValueError("Model does not expose feature_names_in_.")
+def count_words(text: str | None) -> int:
+    """Count whitespace-separated words in *text*; ``None`` or blank text gives 0."""
+    # str.split() with no separator ignores leading/trailing whitespace and
+    # returns an empty list for blank input.
+    return 0 if text is None else len(str(text).split())
+def runtime_category_code(runtime: float) -> int:
+    """Bucket a runtime: 0 below 90 minutes, 1 from 90 up to (not including) 120, else 2."""
+    return 0 if runtime < 90 else 1 if runtime < 120 else 2
+def parse_release_date(value: str | None) -> datetime:
+    """Parse a ``YYYY-MM-DD`` release date.
+
+    Args:
+        value: Date text. Surrounding whitespace is ignored; ``None``, an
+            empty string or whitespace-only text selects the fallback date.
+
+    Returns:
+        The parsed date, or ``datetime(2010, 1, 1)`` when no date was given.
+
+    Raises:
+        ValueError: If the value is not a ``YYYY-MM-DD`` string.
+    """
+    # Free-text form input often carries stray spaces or a trailing newline.
+    text = value.strip() if isinstance(value, str) else value
+    if not text:
+        return datetime(2010, 1, 1)
+    try:
+        return datetime.strptime(text, "%Y-%m-%d")
+    except (TypeError, ValueError) as exc:
+        # TypeError covers non-string input, so callers see one error type.
+        raise ValueError("release_date must be in YYYY-MM-DD format.") from exc
+def parse_feature_options(feature_names: list[str]) -> dict[str, list[str]]:
+    """Collect the selectable options encoded in the model's feature names.
+
+    For every prefix in ``PREFIX_TO_FORM_KEY``, each feature named
+    ``"<prefix>_<option>"`` contributes ``<option>``; the ``"<prefix>_other"``
+    column is skipped.
+
+    Returns:
+        A dict keyed by prefix with the options sorted and de-duplicated.
+    """
+    parsed: dict[str, list[str]] = {}
+    for prefix in PREFIX_TO_FORM_KEY:
+        marker = f"{prefix}_"
+        catch_all = f"{prefix}_other"
+        labels = {
+            column[len(marker):]
+            for column in feature_names
+            if column.startswith(marker) and column != catch_all
+        }
+        parsed[prefix] = sorted(labels)
+    return parsed
+def _to_float(value: Any, default: float = 0.0) -> float:
+    """Convert *value* to ``float``; return *default* for ``None`` or unconvertible input."""
+    if value is None:
+        return default
+    try:
+        converted = float(value)
+    except (TypeError, ValueError):
+        return default
+    return converted
+def _to_int(value: Any, default: int = 0) -> int:
+    """Convert *value* to ``int``, falling back to *default*.
+
+    Args:
+        value: Anything ``int()`` accepts; floats are truncated toward zero.
+        default: Returned for ``None`` and for values that cannot be converted,
+            including non-numeric text, NaN and infinities.
+
+    Returns:
+        The converted integer or *default*.
+    """
+    if value is None:
+        return default
+    try:
+        return int(value)
+    except (TypeError, ValueError, OverflowError):
+        # int(float("inf")) raises OverflowError and int(float("nan")) raises
+        # ValueError; neither should escape a best-effort conversion.
+        return default
+def build_feature_row(form_data: dict[str, Any], feature_names: list[str]) -> pd.DataFrame:
+    """Build the single-row feature frame for one set of form values.
+
+    Every column in *feature_names* starts at 0.0. Scalar features are then
+    filled from *form_data* (only those the model actually has), and for each
+    prefix in ``PREFIX_TO_FORM_KEY`` the selected options set their
+    ``"<prefix>_<option>"`` column to 1.0, ``"num_of_<prefix>"`` to the number
+    of selections and ``"<prefix>_other"`` to 1.0 when a selection has no
+    column of its own.
+
+    Args:
+        form_data: Form values keyed by field name; missing or unparsable
+            numbers fall back to 0.
+        feature_names: Column names, in the order the model expects.
+
+    Returns:
+        A one-row DataFrame with exactly *feature_names* as columns; NaN and
+        infinite values are replaced by 0.
+
+    Raises:
+        ValueError: If ``release_date`` is present but not ``YYYY-MM-DD``.
+    """
+    row = {name: 0.0 for name in feature_names}
+    # Negative amounts are clamped to 0 so log1p() below stays defined.
+    budget = max(_to_float(form_data.get("budget"), 0.0), 0.0)
+    popularity = max(_to_float(form_data.get("popularity"), 0.0), 0.0)
+    runtime = max(_to_float(form_data.get("runtime"), 0.0), 0.0)
+    release_date = parse_release_date(form_data.get("release_date"))
+    # Months 12, 1, 2 -> 1; 3-5 -> 2; 6-8 -> 3; 9-11 -> 4.
+    release_season = ((release_date.month % 12) + 3) // 3
+    title_text = str(form_data.get("title") or "")
+    tagline_text = str(form_data.get("tagline") or "")
+    overview_text = str(form_data.get("overview") or "")
+    values = {
+        "belongs_to_collection": _to_int(form_data.get("belongs_to_collection"), 0),
+        "homepage": _to_int(form_data.get("homepage"), 0),
+        "has_tagline": _to_int(form_data.get("has_tagline"), 1 if tagline_text.strip() else 0),
+        "original_language": LANGUAGE_MAPPING.get(str(form_data.get("original_language") or "").lower(), 0),
+        "runtime": runtime,
+        "num_of_cast": _to_float(form_data.get("num_of_cast"), 0.0),
+        "num_of_crew": _to_float(form_data.get("num_of_crew"), 0.0),
+        "gender_cast_1": _to_float(form_data.get("gender_cast_1"), 0.0),
+        "gender_cast_2": _to_float(form_data.get("gender_cast_2"), 0.0),
+        "count_cast_other": _to_float(form_data.get("count_cast_other"), 0.0),
+        # Explicit word counts win; otherwise they are derived from the text.
+        "title_word_count": _to_float(form_data.get("title_word_count"), count_words(title_text)),
+        "tag_word_count": _to_float(form_data.get("tag_word_count"), count_words(tagline_text)),
+        "overview_word_count": _to_float(form_data.get("overview_word_count"), count_words(overview_text)),
+        "release_year": release_date.year,
+        "release_month": release_date.month,
+        "release_season": release_season,
+        "runtime_category": runtime_category_code(runtime),
+        "budget_log": math.log1p(budget),
+        "popularity_log": math.log1p(popularity),
+    }
+    # Only fill features the model knows about.
+    for key, value in values.items():
+        if key in row:
+            row[key] = value
+    for prefix, form_key in PREFIX_TO_FORM_KEY.items():
+        selected = form_data.get(form_key) or []
+        if isinstance(selected, (tuple, set, frozenset)):
+            # Treat other collections like a list rather than as one item.
+            selected = list(selected)
+        elif not isinstance(selected, list):
+            selected = [selected]
+        known = 0
+        for item in selected:
+            col = f"{prefix}_{item}"
+            if col in row:
+                row[col] = 1.0
+                known += 1
+        num_col = f"num_of_{prefix}"
+        # For the "cast" prefix this is "num_of_cast", which is also an explicit
+        # form field (total cast size). Keep a value the form supplied instead
+        # of overwriting it with the number of selected notable-cast entries.
+        if num_col in row and form_data.get(num_col) is None:
+            row[num_col] = float(len(selected))
+        other_col = f"{prefix}_other"
+        if other_col in row:
+            row[other_col] = 1.0 if len(selected) > known else 0.0
+    df = pd.DataFrame([[row[name] for name in feature_names]], columns=feature_names)
+    return df.replace([np.inf, -np.inf], 0).fillna(0)
+def predict_revenue(model: Any, form_data: dict[str, Any]) -> float:
+    """Run the model on one set of form values and return its first prediction as a float."""
+    columns = get_model_feature_names(model)
+    predictions = model.predict(build_feature_row(form_data, columns))
+    return float(predictions[0])