Spaces:

maulana-m
/

spotlight-segment

Sleeping

App Files Files Community

maulana-m committed on May 26, 2025

Commit

5891917

1 Parent(s): 15a9e23

initial commit

Browse files

Files changed (26) hide show

app.py +85 -0
requirements.txt +189 -0
spotlight/__init__.py +0 -0
spotlight/__pycache__/__init__.cpython-313.pyc +0 -0
spotlight/api/__init__.py +16 -0
spotlight/api/__pycache__/__init__.cpython-313.pyc +0 -0
spotlight/api/__pycache__/error_handlers.cpython-313.pyc +0 -0
spotlight/api/error_handlers.py +57 -0
spotlight/api/routers/__pycache__/segments.cpython-313.pyc +0 -0
spotlight/api/routers/segments.py +24 -0
spotlight/cli/__init__.py +0 -0
spotlight/cli/main.py +37 -0
spotlight/core/__pycache__/config.cpython-313.pyc +0 -0
spotlight/core/__pycache__/constant.cpython-313.pyc +0 -0
spotlight/core/__pycache__/downloader.cpython-313.pyc +0 -0
spotlight/core/__pycache__/dto.cpython-313.pyc +0 -0
spotlight/core/__pycache__/exceptions.cpython-313.pyc +0 -0
spotlight/core/__pycache__/llm.cpython-313.pyc +0 -0
spotlight/core/__pycache__/service.cpython-313.pyc +0 -0
spotlight/core/config.py +13 -0
spotlight/core/constant.py +35 -0
spotlight/core/downloader.py +60 -0
spotlight/core/dto.py +27 -0
spotlight/core/exceptions.py +8 -0
spotlight/core/llm.py +22 -0
spotlight/core/service.py +63 -0

app.py ADDED Viewed

	@@ -0,0 +1,85 @@

+from spotlight.core.downloader import Downloader
+from spotlight.core.service import SpotlightService
+from spotlight.core.llm import GeminiApi
+from spotlight.core.dto import SpotlightRequest
+from spotlight.core.exceptions import VideoUrlInvalidError
+import asyncio
+import gradio as gr
+import json
+# Stylesheet handed to gr.Blocks(css=...) further down. The spotlight-item,
+# spotlight-topic and spotlight-video classes are the ones referenced by the
+# HTML fragments that run_splotlight builds.
+CSS = """
+.spotlight-item {
+    display: flex;
+    flex-direction: column; /* Stack topic name above the video */
+    align-items: center; /* Center the content horizontally */
+    border: 1px solid #ddd;
+    padding: 15px; /* Increased padding */
+    margin-bottom: 25px; /* Increased margin for spacing */
+    border-radius: 8px;
+    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
+    width: 350px; /* Set a fixed width for consistency */
+    text-align: center; /* Center the text in the topic name */
+}
+.spotlight-topic {
+    font-size: 1.2em;
+    font-weight: bold;
+    margin-bottom: 10px; /* Increased margin */
+    color: #28598a; /* A nicer color */
+    word-wrap: break-word; /* Handle long topic names */
+}
+.spotlight-video {
+    /* Add some margin around the iframe, if needed */
+    margin-bottom: 5px;
+}
+/* Optional: If you want the spotlight items to display in a row */
+.gradio-container {
+    display: flex; /* Use flexbox to arrange items in a row */
+    flex-wrap: wrap; /* Allow items to wrap to the next line if they don't fit */
+    justify-content: center; /* Distribute items evenly */
+}
+"""
+# Single service instance created at import time and shared by every UI request;
+# it is wired with the yt-dlp based Downloader and the Gemini client.
+spotlight_service = SpotlightService(
+    _downloader=Downloader(),
+    llm=GeminiApi()
+)
+async def run_splotlight(video_url, lang):
+    """Build the HTML gallery of spotlight segments for a YouTube video.
+
+    Args:
+        video_url: URL typed into the "Enter youtube url" textbox.
+        lang: Language requested for the generated topic titles.
+
+    Returns:
+        The concatenated ``spotlight-item`` HTML fragments, one per segment
+        returned by ``spotlight_service.run`` (an empty string if there are none).
+
+    Raises:
+        gr.Error: If ``SpotlightRequest`` rejects ``video_url``.
+    """
+    # Function-scope import: html is stdlib and is not imported at file level.
+    from html import escape
+    try:
+        request = SpotlightRequest(video_url=video_url, lang=lang)
+    except VideoUrlInvalidError as e:
+        # Chain the cause so the original validation failure stays in the traceback.
+        raise gr.Error("Video Url is invalid") from e
+    spotlights = await spotlight_service.run(request)
+    fragments = []
+    for row in spotlights:
+        # topic_name is free text produced by the LLM, so it must not be
+        # interpolated into markup verbatim; the URL goes into an attribute and
+        # is escaped as well (its "&" separators become "&amp;").
+        topic = escape(str(row["topic_name"]))
+        embed_url = escape(str(row["embed_url"]), quote=True)
+        fragments.append(f"""
+        <div class="spotlight-item">
+            <div class="spotlight-topic">{topic}</div>
+            <div class="spotlight-video">
+                <iframe width="320" height="180" src="{embed_url}" frameborder="0" allowfullscreen></iframe>
+            </div>
+        </div>
+        """)
+    return "".join(fragments)
+# Gradio UI: two text inputs, a run button, and an HTML pane that receives the
+# markup returned by run_splotlight.
+with gr.Blocks(css=CSS) as demo:
+    video_url = gr.Textbox(label="Enter youtube url")
+    lang = gr.Textbox(label="Language")
+    run_button = gr.Button("Run", variant="primary")
+    output_html = gr.HTML(label="Output")
+    # Inputs are passed positionally to run_splotlight(video_url, lang).
+    run_button.click(run_splotlight, [video_url, lang], [output_html])
+    # NOTE(review): launch() is called inside the Blocks context and at import
+    # time, so importing this module starts the server.
+    demo.queue(max_size=10).launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,189 @@

+# This file was autogenerated by uv via the following command:
+#    uv pip compile pyproject.toml -o requirements.txt
+aiofiles==24.1.0
+    # via gradio
+annotated-types==0.7.0
+    # via pydantic
+anyio==4.9.0
+    # via
+    #   google-genai
+    #   gradio
+    #   httpx
+    #   starlette
+audioop-lts==0.2.1
+    # via gradio
+cachetools==5.5.2
+    # via google-auth
+certifi==2025.4.26
+    # via
+    #   httpcore
+    #   httpx
+    #   requests
+charset-normalizer==3.4.2
+    # via requests
+click==8.1.8
+    # via
+    #   typer
+    #   uvicorn
+fastapi==0.115.12
+    # via
+    #   spotlight-segment (pyproject.toml)
+    #   gradio
+ffmpy==0.5.0
+    # via gradio
+filelock==3.18.0
+    # via huggingface-hub
+fsspec==2025.5.1
+    # via
+    #   gradio-client
+    #   huggingface-hub
+google-auth==2.40.2
+    # via google-genai
+google-genai==1.16.1
+    # via spotlight-segment (pyproject.toml)
+gradio==5.31.0
+    # via spotlight-segment (pyproject.toml)
+gradio-client==1.10.1
+    # via gradio
+groovy==0.1.2
+    # via gradio
+h11==0.16.0
+    # via
+    #   httpcore
+    #   uvicorn
+hf-xet==1.1.2
+    # via huggingface-hub
+httpcore==1.0.9
+    # via httpx
+httpx==0.28.1
+    # via
+    #   google-genai
+    #   gradio
+    #   gradio-client
+    #   safehttpx
+huggingface-hub==0.32.0
+    # via
+    #   gradio
+    #   gradio-client
+idna==3.10
+    # via
+    #   anyio
+    #   httpx
+    #   requests
+jinja2==3.1.6
+    # via gradio
+markdown-it-py==3.0.0
+    # via rich
+markupsafe==3.0.2
+    # via
+    #   gradio
+    #   jinja2
+mdurl==0.1.2
+    # via markdown-it-py
+numpy==2.2.6
+    # via
+    #   gradio
+    #   pandas
+orjson==3.10.18
+    # via gradio
+packaging==25.0
+    # via
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+pandas==2.2.3
+    # via gradio
+pillow==11.2.1
+    # via gradio
+pyasn1==0.6.1
+    # via
+    #   pyasn1-modules
+    #   rsa
+pyasn1-modules==0.4.2
+    # via google-auth
+pydantic==2.11.5
+    # via
+    #   spotlight-segment (pyproject.toml)
+    #   fastapi
+    #   google-genai
+    #   gradio
+    #   pydantic-settings
+pydantic-core==2.33.2
+    # via pydantic
+pydantic-settings==2.9.1
+    # via spotlight-segment (pyproject.toml)
+pydub==0.25.1
+    # via gradio
+pygments==2.19.1
+    # via rich
+python-dateutil==2.9.0.post0
+    # via pandas
+python-dotenv==1.1.0
+    # via pydantic-settings
+python-multipart==0.0.20
+    # via gradio
+pytz==2025.2
+    # via pandas
+pyyaml==6.0.2
+    # via
+    #   gradio
+    #   huggingface-hub
+requests==2.32.3
+    # via
+    #   google-genai
+    #   huggingface-hub
+rich==14.0.0
+    # via typer
+rsa==4.9.1
+    # via google-auth
+ruff==0.11.11
+    # via gradio
+safehttpx==0.1.6
+    # via gradio
+semantic-version==2.10.0
+    # via gradio
+shellingham==1.5.4
+    # via typer
+six==1.17.0
+    # via python-dateutil
+sniffio==1.3.1
+    # via anyio
+starlette==0.46.2
+    # via
+    #   fastapi
+    #   gradio
+tomlkit==0.13.2
+    # via gradio
+tqdm==4.67.1
+    # via huggingface-hub
+typer==0.15.4
+    # via gradio
+typing-extensions==4.13.2
+    # via
+    #   fastapi
+    #   google-genai
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   pydantic
+    #   pydantic-core
+    #   typer
+    #   typing-inspection
+typing-inspection==0.4.1
+    # via
+    #   pydantic
+    #   pydantic-settings
+tzdata==2025.2
+    # via pandas
+urllib3==2.4.0
+    # via requests
+uvicorn==0.34.2
+    # via
+    #   spotlight-segment (pyproject.toml)
+    #   gradio
+websockets==15.0.1
+    # via
+    #   google-genai
+    #   gradio-client
+yt-dlp==2025.5.22
+    # via spotlight-segment (pyproject.toml)

spotlight/__init__.py ADDED Viewed

File without changes

spotlight/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (164 Bytes). View file

spotlight/api/__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+from spotlight.api.routers import segments
+from fastapi import FastAPI
+from fastapi.exceptions import RequestValidationError
+from spotlight.api.error_handlers import (
+    request_validation_handler,
+    bad_request_exception_handler,
+    subtitle_not_found_handler
+)
+from spotlight.core.exceptions import VideoUrlInvalidError, SubtitleNotFoundError
+# FastAPI application: mounts the segments router and maps validation and
+# domain exceptions to the JSON error handlers from error_handlers.
+app = FastAPI()
+app.include_router(segments.router)
+# exception_handler(...) returns a decorator; it is applied here by direct call.
+app.exception_handler(RequestValidationError)(request_validation_handler)
+app.exception_handler(VideoUrlInvalidError)(bad_request_exception_handler)
+app.exception_handler(SubtitleNotFoundError)(subtitle_not_found_handler)

spotlight/api/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (945 Bytes). View file

spotlight/api/__pycache__/error_handlers.cpython-313.pyc ADDED Viewed

Binary file (2.97 kB). View file

spotlight/api/error_handlers.py ADDED Viewed

	@@ -0,0 +1,57 @@

+from fastapi import Request
+from fastapi.exceptions import RequestValidationError
+from starlette.exceptions import HTTPException
+from fastapi.responses import JSONResponse
+from typing import Optional, List
+import re
+import http
+import logging
+logger = logging.getLogger(__name__)
+def _camel_to_snake(name):
+    """Convert a CamelCase identifier to UPPER_SNAKE_CASE.
+
+    Despite the function name, the result is upper-cased, e.g.
+    ``VideoUrlInvalidError`` becomes ``VIDEO_URL_INVALID_ERROR``.
+    """
+    # Pass 1: put "_" before every capitalised word (capital + lowercase run).
+    with_word_breaks = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
+    # Pass 2: split any remaining lowercase/digit -> capital boundary.
+    fully_split = re.sub("([a-z0-9])([A-Z])", r"\1_\2", with_word_breaks)
+    return fully_split.upper()
+def error_response(error: Exception, error_code: int = 500, descriptions: Optional[List] = None):
+    """Build the JSON error payload shared by all exception handlers.
+
+    Args:
+        error: The exception being reported. For a Starlette ``HTTPException``
+            the error name comes from ``error.detail`` and the status code from
+            ``error.status_code``; otherwise the class name and ``error_code``
+            are used.
+        error_code: HTTP status code to use when ``error`` is not an
+            ``HTTPException``.
+        descriptions: Optional sequence of ``(field, message)`` pairs rendered
+            into the ``descriptions`` list of the payload.
+
+    Returns:
+        A ``JSONResponse`` with keys ``status``, ``error``, ``code`` and
+        ``descriptions``, sent with the resolved status code.
+    """
+    if descriptions is None:
+        descriptions = []
+    if isinstance(error, HTTPException):
+        name = error.detail.replace(" ", "")
+        error_code = error.status_code
+    else:
+        name = error.__class__.__name__
+    # Resolve the status name only after error_code is final, so an
+    # HTTPException reports the name matching its own status code.
+    status = http.HTTPStatus(error_code).name
+    # Built for both branches: previously this was assigned only in the
+    # non-HTTPException branch, so the HTTPException path hit an unbound local.
+    error_message = [
+        {"field": desc[0], "message": desc[1]} for desc in descriptions
+    ]
+    return JSONResponse(
+        dict(
+            status=status,
+            error=_camel_to_snake(name),
+            code=error_code,
+            descriptions=error_message,
+        ),
+        status_code=error_code,
+    )
+async def bad_request_exception_handler(_, error: Exception):
+    """Report ``error`` as an HTTP 400 JSON payload; the request is unused."""
+    bad_request_code = 400
+    return error_response(error, bad_request_code)
+async def subtitle_not_found_handler(_, error: Exception):
+    """Report ``error`` as an HTTP 404 JSON payload; the request is unused."""
+    not_found_code = 404
+    return error_response(error, not_found_code)
+async def request_validation_handler(_: Request, error: RequestValidationError):
+    """Report request-validation failures as an HTTP 400 JSON payload.
+
+    Each entry of ``error.errors()`` becomes a ``(location, type)`` pair, where
+    the location is the ``loc`` path joined with dots.
+    """
+    field_errors = []
+    for issue in error.errors():
+        dotted_location = ".".join(str(part) for part in issue["loc"])
+        field_errors.append((dotted_location, issue["type"]))
+    return error_response(error, 400, field_errors)

spotlight/api/routers/__pycache__/segments.cpython-313.pyc ADDED Viewed

Binary file (1.15 kB). View file

spotlight/api/routers/segments.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from fastapi.routing import APIRouter
+from fastapi import Request
+from fastapi.responses import JSONResponse
+from spotlight.core.downloader import Downloader
+from spotlight.core.service import SpotlightService
+from spotlight.core.llm import GeminiApi
+from spotlight.core.dto import SpotlightRequest
+# Router included by the FastAPI app, plus one service instance created at
+# import time and reused by every request handled in this module.
+router = APIRouter()
+spotlight_service = SpotlightService(
+    _downloader=Downloader(),
+    llm=GeminiApi()
+)
+@router.post("/segments")
+async def segments(request: Request, spotlight_request: SpotlightRequest):
+    """Run the spotlight pipeline for the posted request body.
+
+    Returns a JSON object with the service result under ``data`` and
+    ``status`` set to ``"success"``. ``request`` is accepted but not used.
+    """
+    spotlights = await spotlight_service.run(spotlight_request)
+    payload = {"data": spotlights, "status": "success"}
+    return JSONResponse(payload)

spotlight/cli/__init__.py ADDED Viewed

File without changes

spotlight/cli/main.py ADDED Viewed

	@@ -0,0 +1,37 @@

+from spotlight.core.downloader import Downloader
+from spotlight.core.service import SpotlightService
+from spotlight.core.llm import GeminiApi
+from spotlight.core.dto import SpotlightRequest
+import click
+import asyncio
+# Service instance used by the CLI command below; built once at import time.
+spotlight_service = SpotlightService(
+    _downloader=Downloader(),
+    llm=GeminiApi()
+)
+@click.command()
+@click.option("--video_url", help="video youtube link")
+@click.option("--lang", help="language output")
+def run(video_url, lang):
+    """Spotlight segment: Cli program to extract important segments for any youtube video"""
+    # The docstring above doubles as the click help text, so it is kept as-is.
+    # Both options are needed to build a SpotlightRequest (both fields are
+    # required strings). If either is missing, print the usage text instead of
+    # falling through to a pydantic validation traceback.
+    if video_url is None or lang is None:
+        click.echo(click.get_current_context().get_help())
+        return
+    async def main(video_url, lang):
+        # Validate the input, run the pipeline, and print the raw result list.
+        request = SpotlightRequest(
+          video_url=video_url,
+          lang=lang
+        )
+        output = await spotlight_service.run(request)
+        print(output)
+    asyncio.run(main(video_url, lang))
+# Allow running this module directly as a script.
+if __name__ == '__main__':
+    run()

spotlight/core/__pycache__/config.cpython-313.pyc ADDED Viewed

Binary file (923 Bytes). View file

spotlight/core/__pycache__/constant.cpython-313.pyc ADDED Viewed

Binary file (2.09 kB). View file

spotlight/core/__pycache__/downloader.cpython-313.pyc ADDED Viewed

Binary file (3.41 kB). View file

spotlight/core/__pycache__/dto.cpython-313.pyc ADDED Viewed

Binary file (1.63 kB). View file

spotlight/core/__pycache__/exceptions.cpython-313.pyc ADDED Viewed

Binary file (874 Bytes). View file

spotlight/core/__pycache__/llm.cpython-313.pyc ADDED Viewed

Binary file (1.46 kB). View file

spotlight/core/__pycache__/service.cpython-313.pyc ADDED Viewed

Binary file (3.73 kB). View file

spotlight/core/config.py ADDED Viewed

	@@ -0,0 +1,13 @@

+from pydantic_settings import BaseSettings
+class GeneralConfig(BaseSettings):
+    # Application settings loaded by pydantic-settings. Each field can be
+    # overridden by an environment variable named GENERAL_CONFIG_<FIELD>,
+    # e.g. GENERAL_CONFIG_GEMINI_API_KEY.
+    GEMINI_API_KEY: str = ""
+    LLM_MODEL: str = "gemini-2.0-flash"
+    DEFAULT_LANGUAGE: str = "en"
+    # pydantic v2 configuration; replaces the deprecated inner ``class Config``
+    # (which emits a deprecation warning) while keeping the same env prefix.
+    model_config = {"env_prefix": "GENERAL_CONFIG_"}
+# Module-level settings instance imported by the rest of the package.
+GENERAL_CONFIG = GeneralConfig()

spotlight/core/constant.py ADDED Viewed

	@@ -0,0 +1,35 @@

+# Prompt sent to the LLM. The literal markers {{language}} and {{xml_subtitle}}
+# are substituted with str.replace (not str.format), so single braces in the
+# JSON example are safe. The example keys mirror the SpotlightSchema fields
+# (topic_name, start_time, end_time) that the response schema enforces.
+PROMPT_TEMPLATE = """
+You are an AI assistant tasked with analyzing video subtitle XML data and identifying engaging segments for viewers.
+Task:
+1. Analyze the XML subtitle data: Parse the provided XML to extract subtitle text and their corresponding start and end timestamps.
+2. Identify "interesting parts": Segment the subtitles into meaningful and engaging segments. Consider the following factors when determining what makes a segment "interesting":
+ - Emotional impact: Segments containing humor, drama, suspense, or other strong emotional content.
+ - Key information: Segments that convey crucial plot points, character development, or explanations.
+ - Intrigue and cliffhangers: Segments that leave the viewer wanting more.
+ - Adjust the segment based on duration video. dont take the short duration if the video is long enough. with long video it is usually has segment that have medium - long duration (e,g > 20 second - 60 second)
+ - Contextual Relevance: Take the language parameter into consideration and tailor the topic titles accordingly.
+3. Generate Descriptive Topic Titles:
+ - For each identified segment, create a concise and compelling topic title in the specified {{language}}. These titles should be designed to attract viewers and give them a clear idea of the segment's content.
+ - Focus on evocative language and avoid generic descriptions.
+ - DONT use language id in the output. only output topic title without additional langunge informationn
+Output JSON Format: Return the segmented data in a JSON format as follows:
+[
+  {
+    "topic_name": "Engaging Topic Title",
+    "start_time": "00:01:30",  // Example: HH:MM:SS
+    "end_time": "00:01:45"    // Example: HH:MM:SS
+  },
+  {
+    "topic_name": "Another Captivating Title",
+    "start_time": "00:02:00",
+    "end_time": "00:02:10"
+  }
+  // ... more segments
+]
+Input:
+Language: {{language}}
+Subtitle XML: {{xml_subtitle}}
+"""

spotlight/core/downloader.py ADDED Viewed

	@@ -0,0 +1,60 @@

+from yt_dlp import YoutubeDL
+from yt_dlp.utils import DownloadError
+from spotlight.core.exceptions import SubtitleNotFoundError
+from spotlight.core.config import GENERAL_CONFIG
+from spotlight.core.dto import VideoInfo
+from typing import Dict, Any
+import json
+import requests
+import logging
+logger = logging.getLogger(__name__)
+class Downloader:
+    """Fetch YouTube metadata and caption text through yt-dlp and requests."""
+    def __init__(self, params: Dict[str, Any] | None = None):
+        """Create the yt-dlp client.
+
+        Args:
+            params: Options passed to ``YoutubeDL``; defaults to
+                ``{"quiet": True}``.
+        """
+        # Build the default per instance rather than sharing one mutable dict
+        # between every Downloader constructed without arguments.
+        if params is None:
+            params = {"quiet": True}
+        self.client = YoutubeDL(params)
+    def get_video_info(self, video_url: str) -> dict:
+        """Return the yt-dlp info dict for ``video_url``, or ``{}`` on failure."""
+        try:
+            return self.client.extract_info(video_url, download=False, process=False)
+        except DownloadError as e:
+            # Best-effort by design (callers treat {} as "nothing found"), but
+            # leave a trace instead of swallowing the error silently.
+            logger.warning("Extracting video info failed for %s: %s", video_url, e)
+            return {}
+    def save_info(self, info: Dict[str, Any], filename: str) -> None:
+        """Dump ``info`` as indented JSON to ``filename`` (debugging helper)."""
+        with open(filename, "w", encoding="utf-8") as f:
+            f.write(json.dumps(info, indent=4))
+    def get_subtitle_content(self, subtitle_url: str, timeout: float = 30.0):
+        """Download the subtitle document and return its text.
+
+        Args:
+            subtitle_url: Direct URL of the caption track.
+            timeout: Seconds to wait for the HTTP request before giving up.
+
+        Returns:
+            The response body, or ``None`` if the request fails or the status
+            code is not 200.
+        """
+        try:
+            # Without a timeout a stalled connection would hang the caller forever.
+            response = requests.get(subtitle_url, timeout=timeout)
+        except requests.RequestException as e:
+            logger.warning("Download subtitle failed: %s", e)
+            return None
+        if response.status_code == 200:
+            return response.text
+        logger.warning("Download subtitle failed")
+        return None
+    def get_subtitle_video(self, video_url: str) -> VideoInfo:
+        """Return the video id and ttml caption text for ``video_url``.
+
+        Only automatic captions in ``GENERAL_CONFIG.DEFAULT_LANGUAGE`` are
+        considered.
+
+        Raises:
+            SubtitleNotFoundError: If there is no caption track for the default
+                language, no ttml variant of it, or its download fails.
+        """
+        video_info = self.get_video_info(video_url)
+        automatic_captions = video_info.get("automatic_captions", {})
+        # TODO: pick the caption language from the request instead of the default.
+        origin_captions = automatic_captions.get(GENERAL_CONFIG.DEFAULT_LANGUAGE)
+        if origin_captions is None:
+            raise SubtitleNotFoundError("Subtitle not found")
+        # Same selection as before: if several ttml variants exist, the last wins.
+        ttml_urls = [
+            subtitle_format.get("url")
+            for subtitle_format in origin_captions
+            if subtitle_format.get("ext") == "ttml"
+        ]
+        subtitle_url = ttml_urls[-1] if ttml_urls else None
+        subtitle_content = self.get_subtitle_content(subtitle_url) if subtitle_url else None
+        if subtitle_content is None:
+            # VideoInfo.subtitle is a required str; raise the domain error here
+            # instead of letting model validation fail on None.
+            raise SubtitleNotFoundError("Subtitle not found")
+        return VideoInfo(video_id=video_info.get("id"), subtitle=subtitle_content)

spotlight/core/dto.py ADDED Viewed

	@@ -0,0 +1,27 @@

+from pydantic import BaseModel, field_validator, ValidationError
+from spotlight.core.exceptions import VideoUrlInvalidError
+import re
+# One highlighted segment. Also used as the element type of the response schema
+# passed to the LLM, so comments are used here instead of a class docstring.
+class SpotlightSchema(BaseModel):
+    # Title shown above the embedded clip.
+    topic_name: str
+    # Segment bounds as time strings; the service splits them on ":" into
+    # hours, minutes and seconds.
+    start_time: str
+    end_time: str
+# Result of the downloader: the video id plus the raw subtitle text.
+class VideoInfo(BaseModel):
+    video_id: str
+    # Both fields are required strings, so passing None fails validation.
+    subtitle: str
+# Input payload shared by the API, CLI and Gradio front ends.
+class SpotlightRequest(BaseModel):
+    video_url: str
+    lang: str
+    @field_validator("video_url")
+    @classmethod
+    def is_valid_youtube_url(cls, value):
+        """Return ``value`` unchanged if it matches the YouTube URL pattern.
+
+        Raises:
+            VideoUrlInvalidError: If the value does not match. This is not a
+                ``ValueError``, so pydantic does not wrap it in a
+                ``ValidationError``.
+        """
+        # NOTE(review): "youtu\.?be" also accepts a bare "youtube/..." host;
+        # confirm whether that leniency is intended before tightening.
+        pattern = r"^(https?\:\/\/)?(www\.)?(youtube\.com|youtu\.?be)\/.*[\w-]+\/?$"
+        if re.match(pattern, value) is None:
+            raise VideoUrlInvalidError("URL must be a youtube URL")
+        return value

spotlight/core/exceptions.py ADDED Viewed

	@@ -0,0 +1,8 @@

+class VideoUrlInvalidError(Exception):
+    """Raised when a submitted video URL is not a recognised YouTube URL."""
+    def __init__(self, msg):
+        # Forward the message to Exception so str(exc), exc.args and logged
+        # tracebacks carry it; ``msg`` is kept for existing callers.
+        super().__init__(msg)
+        self.msg = msg
+class SubtitleNotFoundError(Exception):
+    """Raised when no usable subtitle track can be obtained for a video."""
+    def __init__(self, msg):
+        # Forward the message to Exception so str(exc), exc.args and logged
+        # tracebacks carry it; ``msg`` is kept for existing callers.
+        super().__init__(msg)
+        self.msg = msg

spotlight/core/llm.py ADDED Viewed

	@@ -0,0 +1,22 @@

+from spotlight.core.config import GENERAL_CONFIG
+from google import genai
+from google.genai import types
+class GeminiApi:
+    """Async wrapper around the google-genai client."""
+    def __init__(self):
+        # The key comes from the package settings object.
+        api_key = GENERAL_CONFIG.GEMINI_API_KEY
+        self.client = genai.Client(api_key=api_key)
+    async def generate_completion(self, prompt: str, model: str, response_mime_type: str, response_schema=None):
+        """Send ``prompt`` to ``model`` and return the raw client response.
+
+        ``response_mime_type`` and ``response_schema`` are forwarded through
+        ``GenerateContentConfig``.
+        """
+        generation_config = types.GenerateContentConfig(
+            response_mime_type=response_mime_type,
+            response_schema=response_schema,
+        )
+        return await self.client.aio.models.generate_content(
+            model=model,
+            contents=prompt,
+            config=generation_config,
+        )

spotlight/core/service.py ADDED Viewed

	@@ -0,0 +1,63 @@

+from spotlight.core.downloader import Downloader
+from spotlight.core.llm import GeminiApi
+from spotlight.core.dto import SpotlightRequest, SpotlightSchema
+from spotlight.core.constant import PROMPT_TEMPLATE
+from spotlight.core.config import GENERAL_CONFIG
+# str.format template for an embedded player URL; start_time and end_time are
+# filled with whole seconds computed by SpotlightService._embed_timeline.
+EMBED_URL = "https://youtube.com/embed/{id}?&start={start_time}&end={end_time}&autoplay=1"
+class SpotlightService:
+    """Pipeline: download subtitles, ask the LLM for segments, add embed URLs."""
+    def __init__(self, _downloader: Downloader, llm: GeminiApi):
+        self.downloader = _downloader
+        self.llm = llm
+    async def run(self, splotlight_request: SpotlightRequest):
+        """Return the spotlight segments for the requested video.
+
+        Returns:
+            A list of dicts with the ``SpotlightSchema`` fields plus an
+            ``embed_url`` key.
+        """
+        # NOTE(review): get_subtitle_video is a synchronous call made from this
+        # coroutine; confirm whether blocking the event loop here is acceptable.
+        video_info = self.downloader.get_subtitle_video(splotlight_request.video_url)
+        prompt = self.construct_prompt(splotlight_request.lang, video_info.subtitle)
+        response_llm = await self.llm.generate_completion(
+            prompt=prompt,
+            model=GENERAL_CONFIG.LLM_MODEL,
+            response_mime_type="application/json",
+            response_schema=list[SpotlightSchema],
+        )
+        return self._parse_response(response_llm, video_info.video_id)
+    def construct_prompt(self, language: str, xml_subtitle: str) -> str:
+        """Fill the ``{{language}}`` and ``{{xml_subtitle}}`` markers of the template."""
+        # Plain replace() is used because the template contains literal JSON braces.
+        return (PROMPT_TEMPLATE
+            .replace("{{language}}", language)
+            .replace("{{xml_subtitle}}", xml_subtitle)
+        )
+    def _parse_response(self, response_llm, video_id):
+        """Convert the parsed LLM items to dicts and attach ``embed_url`` to each."""
+        # ``parsed`` can be None when the model output could not be parsed into
+        # the schema; treat that as "no segments" rather than a TypeError.
+        parsed_items = response_llm.parsed or []
+        response = [x.model_dump() for x in parsed_items]
+        for data in response:
+            data["embed_url"] = self._embed_timeline(video_id, data["start_time"], data["end_time"])
+        return response
+    def _embed_timeline(self, video_id: str, start_time: str, end_time: str) -> str:
+        """Build the embed URL for ``video_id`` limited to the given time window.
+
+        ``start_time`` and ``end_time`` may be ``HH:MM:SS``, ``MM:SS`` or ``SS``,
+        optionally with a fractional part, which is dropped.
+
+        Raises:
+            ValueError: If a timestamp has more than three ``:``-separated
+                parts or a part is not numeric.
+        """
+        def time_to_second(time_str: str) -> int:
+            parts = time_str.strip().split(":")
+            if len(parts) > 3:
+                raise ValueError(f"Unsupported timestamp: {time_str!r}")
+            total = 0.0
+            for part in parts:
+                # Accept both "." and "," as the fractional separator.
+                total = total * 60 + float(part.replace(",", "."))
+            # Whole seconds only, as the embed URL is built with integers.
+            return int(total)
+        return EMBED_URL.format(
+            id=video_id,
+            start_time=time_to_second(start_time),
+            end_time=time_to_second(end_time),
+        )