Upload 2 files
main.py
ADDED
@@ -0,0 +1,51 @@

import gradio as gr
import os
import zipfile
import tempfile
from multiprocessing import Pool, cpu_count
from utils import getting_usage_info_from_results, process_multiple_videos_from_results, wrapper_with_delay, set_torch_threads

# Cap torch's thread count up front so the worker processes forked below don't oversubscribe the CPU.
set_torch_threads(safe_ratio=0.5)

def gradio_interface(zip_file):
    """Handles Gradio input: unzip the archive and process the videos inside."""
    # gr.File may pass a tempfile wrapper (with .name) or a plain path string,
    # depending on the Gradio version; accept both.
    zip_path = zip_file.name if hasattr(zip_file, "name") else zip_file

    with tempfile.TemporaryDirectory() as temp_dir:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)

        video_paths = [
            os.path.join(temp_dir, f) for f in os.listdir(temp_dir) if f.endswith(".mp4")
        ]

        if not video_paths:
            raise ValueError("No .mp4 video files found in the zip archive.")

        # First processing: one analyze_single_video result per video, in input order
        with Pool(min(cpu_count(), len(video_paths))) as pool:
            results = pool.map(wrapper_with_delay, video_paths)

        df_result = process_multiple_videos_from_results(results)
        df_info = getting_usage_info_from_results(video_paths, results)

        csv_result = "emotion_results.csv"
        csv_info = "usage_info.csv"

        df_result.to_csv(csv_result, index=False)
        df_info.to_csv(csv_info, index=False)

        return df_result, df_info, csv_result, csv_info

# Gradio interface
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(file_types=[".zip"], label="Upload a ZIP of videos"),
    outputs=[gr.DataFrame(label="Emotion Analysis"),
             gr.DataFrame(label="Token Usage Estimation"),
             gr.File(label="Download Result CSV"),
             gr.File(label="Download Usage Info CSV")],
    title="Batch Video Emotion Analyzer (ZIP Upload)",
    description="Upload a .zip file containing .mp4 videos. The app will extract and analyze the emotions in parallel."
)

# Guard the launch so that spawned worker processes re-importing this module don't start extra servers.
if __name__ == "__main__":
    iface.launch(share=True)
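
For a quick smoke test without the web UI, the handler can be invoked directly on an archive built from local clips. This is a minimal sketch, not part of the Space; the samples/ directory and clip names are hypothetical, and GOOGLE_API_KEY must be set for the Gemini calls in utils.py to succeed.

import zipfile
from main import gradio_interface

# Pack local sample videos into an archive shaped like the expected upload.
with zipfile.ZipFile("samples.zip", "w") as zf:
    for name in ("clip1.mp4", "clip2.mp4"):  # hypothetical sample files
        zf.write(f"samples/{name}", arcname=name)

# gradio_interface accepts a plain path as well as a Gradio file wrapper.
df_result, df_info, csv_result, csv_info = gradio_interface("samples.zip")
print(df_result.head())
print(df_info.head())
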
utils.py
ADDED
@@ -0,0 +1,190 @@

from moviepy.editor import VideoFileClip, concatenate_videoclips
from pydub import AudioSegment
import numpy as np
import torch
from silero_vad import load_silero_vad, get_speech_timestamps
import os
import json
from google import genai
import pandas as pd
import re
import time
from dotenv import load_dotenv

# Safe default for forked workers; main.py raises this via set_torch_threads().
torch.set_num_threads(1)


load_dotenv()
client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))

def set_torch_threads(safe_ratio=0.5):
    """Sets torch's thread count to a fraction of the available cores."""
    try:
        total_cores = os.cpu_count()
        optimal_threads = max(1, int(total_cores * safe_ratio))
        torch.set_num_threads(optimal_threads)
        print(f"Set torch threads to: {optimal_threads} (out of {total_cores} cores)")
    except Exception as e:
        print(f"Failed to set torch threads dynamically: {e}")
        torch.set_num_threads(1)

def analyze_single_video(video_path):
    """Analyzes a single video for emotions using the GenAI model."""
    prompt = """
    Detect emotion from this video and classify into 3 categories: happy, sad, normal. Return only JSON format without any extra text.

    Return this JSON schema:

    {
        "Vocal": {
            "sad_score": (%),
            "happy_score": (%),
            "normal_score": (%),
            "sad_reason": (list of timestamps),
            "happy_reason": (list of timestamps),
            "normal_reason": (list of timestamps)
        },
        "Verbal": {
            "sad_score": (%),
            "happy_score": (%),
            "normal_score": (%),
            "sad_reason": (list of timestamps),
            "happy_reason": (list of timestamps),
            "normal_reason": (list of timestamps)
        },
        "Vision": {
            "sad_score": (%),
            "happy_score": (%),
            "normal_score": (%),
            "sad_reason": (list of timestamps),
            "happy_reason": (list of timestamps),
            "normal_reason": (list of timestamps)
        }
    }

    Reasons (sad_reason, happy_reason, normal_reason) should be a list of beginning-ending timestamps. For example: ['0:11-0:14', '0:23-0:25', '0:27-0:29']
    """

    try:
        with open(video_path, 'rb') as video_file:
            video_bytes = video_file.read()

        print(f"Processing: {video_path}")

        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[{"text": prompt}, {"inline_data": {"data": video_bytes, "mime_type": "video/mp4"}}],
            config={"http_options": {"timeout": 60000}}  # timeout in milliseconds (60 s)
        )

        # Extract token usage information
        input_token = response.usage_metadata.prompt_token_count
        output_token = response.usage_metadata.candidates_token_count
        total_token = response.usage_metadata.total_token_count

        # The model may wrap its JSON in a ```json ... ``` fence; strip it if present.
        response_text = response.text.strip()
        json_match = re.search(r'```json\s*([\s\S]*?)\s*```', response_text)
        json_string = json_match.group(1).strip() if json_match else response_text
        result = json.loads(json_string)

        return (video_path, result, input_token, output_token, total_token)

    except Exception as e:
        print(f"Error processing {video_path}: {e}")
        return (video_path, None, 0, 0, 0)

def wrapper_with_delay(video_path):
    time.sleep(2)  # Add a delay between requests to avoid throttling
    return analyze_single_video(video_path)


def process_multiple_videos_from_results(results):
    """Flattens pre-fetched results into one row per (video, category, emotion)."""
    records = []

    for video_path, result, _, _, _ in results:
        if result is None:  # skip videos whose analysis failed
            continue

        video_title = os.path.basename(video_path)

        for category in ['Verbal', 'Vocal', 'Vision']:
            for emotion in ['normal', 'happy', 'sad']:
                score = result[category].get(f"{emotion}_score", 0)
                reasons = result[category].get(f"{emotion}_reason", [])
                records.append({
                    'title': video_title,
                    'category': category,
                    'emotion': emotion,
                    'score': score,
                    'reasons': json.dumps(reasons)
                })

    return pd.DataFrame(records)

def getting_video_length(vid):
    """Returns the video duration in seconds, rounded to 2 decimals."""
    clip = VideoFileClip(vid)
    duration = clip.duration
    clip.close()  # release the file handle
    return np.round(duration, decimals=2)

def get_speech_only_video_duration(video_path: str, sampling_rate: int = 16000, use_onnx: bool = False) -> float:
    """Returns the combined duration (s) of the speech segments detected in the video."""
    # Load VAD model
    model = load_silero_vad(onnx=use_onnx)

    # Extract mono audio from the video at the VAD's expected sample rate
    audio = AudioSegment.from_file(video_path).set_frame_rate(sampling_rate).set_channels(1)
    samples = np.array(audio.get_array_of_samples()).astype("float32") / (2**15)
    audio_tensor = torch.from_numpy(samples)

    # Get speech timestamps (in sample indices)
    speech_timestamps = get_speech_timestamps(audio_tensor, model, sampling_rate=sampling_rate)

    if not speech_timestamps:
        return 0.0  # No speech detected

    # Convert sample indices to seconds
    for ts in speech_timestamps:
        ts['start'] /= sampling_rate
        ts['end'] /= sampling_rate

    # Load video and extract the speech-only clips
    video = VideoFileClip(video_path)
    clips = [video.subclip(ts['start'], ts['end']) for ts in speech_timestamps]

    # Concatenate and return the combined duration
    final_video = concatenate_videoclips(clips)
    duration = final_video.duration
    video.close()
    return duration


def getting_usage_info_from_results(video_paths, results):
    """Builds the per-video usage/pricing table from pre-fetched results."""
    filenames = np.vectorize(os.path.basename)(video_paths).reshape(-1, 1)
    durations = np.vectorize(getting_video_length)(video_paths).reshape(-1, 1)
    speech_durations = np.vectorize(get_speech_only_video_duration)(video_paths).reshape(-1, 1)

    # Keep one row per video, in input order; failed videos already carry zero
    # token counts, so the rows stay aligned with video_paths.
    token_data = np.array([[r[2], r[3], r[4]] for r in results], dtype=float)
    if token_data.size == 0:
        token_data = np.zeros((len(video_paths), 3))

    # Gemini 2.0 Flash list prices: $0.10 per 1M input tokens, $0.40 per 1M output tokens.
    per_million = 1_000_000
    input_token_price = np.round(token_data[:, 0] * 0.10 / per_million, decimals=4).reshape(-1, 1)
    output_token_price = np.round(token_data[:, 1] * 0.40 / per_million, decimals=4).reshape(-1, 1)
    total_token_price = input_token_price + output_token_price

    final_arr = np.concatenate(
        (filenames, durations, speech_durations, token_data, input_token_price, output_token_price, total_token_price),
        axis=1
    )

    return pd.DataFrame(
        final_arr,
        columns=[
            'title', 'total_duration(s)', 'speech_duration(s)', 'input_token', 'output_token', 'total_token',
            'input_price($)', 'output_price($)', 'total_price($)'
        ]
    )
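
A quick sanity check of the pricing arithmetic in getting_usage_info_from_results, using made-up token counts shaped like analyze_single_video's return tuple (the rates, $0.10 and $0.40 per million tokens, come from the code above):

# (video_path, result, input_token, output_token, total_token); b.mp4 simulates a failure
results = [("a.mp4", {}, 50_000, 400, 50_400), ("b.mp4", None, 0, 0, 0)]

for path, _, input_token, output_token, _ in results:
    input_price = round(input_token * 0.10 / 1_000_000, 4)
    output_price = round(output_token * 0.40 / 1_000_000, 4)
    print(f"{path}: {input_price:.4f} + {output_price:.4f} = {input_price + output_price:.4f}")

# a.mp4: 0.0050 + 0.0002 = 0.0052
# b.mp4: 0.0000 + 0.0000 = 0.0000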