Create app.py
app.py
ADDED
@@ -0,0 +1,206 @@
import gradio as gr
import os
import subprocess
import whisper
import librosa
import matplotlib.pyplot as plt
import numpy as np
import uuid
import base64

model = whisper.load_model("base")

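# Convert a time in seconds to a WebVTT-style HH:MM:SS.mmm timestamp.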
def format_timestamp(seconds):
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    ms = int((seconds - int(seconds)) * 1000)
    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"

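# Write Whisper segments out as a WebVTT file, one numbered cue per segment.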
def write_vtt(segments, filepath):
    with open(filepath, "w", encoding="utf-8") as f:
        f.write("WEBVTT\n\n")
        for i, seg in enumerate(segments, start=1):
            start = format_timestamp(seg['start'])
            end = format_timestamp(seg['end'])
            text = seg['text'].strip()
            f.write(f"{i}\n{start} --> {end}\n{text}\n\n")

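# Read the VTT back into (time_range, text) pairs; lines without "-->"
# (the WEBVTT header and cue numbers) are skipped.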
def parse_vtt(filepath):
    entries = []
    with open(filepath, "r", encoding="utf-8") as f:
        lines = f.readlines()
    idx = 0
    while idx < len(lines):
        line = lines[idx].strip()
        if "-->" in line:
            time_range = line
            idx += 1
            text_lines = []
            while idx < len(lines) and lines[idx].strip() != '':
                text_lines.append(lines[idx].strip())
                idx += 1
            entries.append((time_range, ' '.join(text_lines)))
        else:
            idx += 1
    return entries

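# Parse an HH:MM:SS.mmm timestamp back into seconds.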
def parse_timestamp(ts_str):
    h, m, rest = ts_str.split(":")
    s, ms = rest.split(".")
    return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000

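# Grab a single video frame at the given time with ffmpeg.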
def capture_screenshot(video_path, time_sec, out_path):
    cmd = [
        "ffmpeg", "-ss", str(time_sec), "-i", video_path,
        "-frames:v", "1", "-q:v", "2", out_path, "-y"
    ]
    subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

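# Plot the voice-band dB curve and mark the segment start with a dashed
# red line and an interpolated point.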
def save_voice_plot(times, db, start_sec, out_path):
    plt.figure(figsize=(8, 3))
    plt.plot(times, db, color="purple")
    plt.axvline(x=start_sec, color="red", linestyle="--")
    interp_val = np.interp(start_sec, times, db)
    plt.scatter([start_sec], [interp_val], color="red")
    plt.xlabel("Time (s)")
    plt.ylabel("Voice band dB")
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()

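# Inline an image file as a base64 data URI so the HTML is self-contained.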
def file_to_base64(filepath):
    with open(filepath, "rb") as f:
        data = f.read()
    ext = os.path.splitext(filepath)[1].lower().replace('.', '')
    mime = f"image/{'jpeg' if ext == 'jpg' else ext}"
    b64 = base64.b64encode(data).decode('utf-8')
    return f"data:{mime};base64,{b64}"

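# Extract the audio track to MP3 for Whisper and librosa.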
def extract_audio(video_path, output_dir):
    audio_path = os.path.join(output_dir, "audio.mp3")
    subprocess.run([
        "ffmpeg", "-y", "-i", video_path, "-vn", "-acodec", "libmp3lame", audio_path
    ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return audio_path

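# Build the standalone HTML transcript: one flex row per segment, with the
# editable text on the left and the screenshot plus voice plot on the right.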
def generate_html(entries, video_id, video_path, screenshot_dir, plot_dir, output_html_path):
    html = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8"><title>{video_id}</title>
<style>
body {{ font-family: Arial; font-size: 18px; margin: 20px; }}
.media img {{
    width: 480px;
    height: auto;
    border: 1px solid #ccc;
    border-radius: 6px;
    box-shadow: 2px 2px 6px rgba(0,0,0,0.1);
}}
.segment {{
    display: flex;
    align-items: center;
    gap: 20px;
    margin-bottom: 40px;
}}
.text {{
    flex: 2;
}}
.media {{
    flex: 3;
    display: flex;
    flex-direction: column;
    gap: 10px;
}}
</style>
</head>
<body>
<h1>Annotated Transcript for {video_id}</h1>
<p>Uploaded video file: {os.path.basename(video_path)}</p>
"""

    for time_range, text in entries:
        start = time_range.split(" --> ")[0]
        start_sec = int(parse_timestamp(start))
        screenshot_path = os.path.join(screenshot_dir, f"{video_id}_{start_sec}.jpg")
        plot_path = os.path.join(plot_dir, f"{video_id}_{start_sec}_sound.png")

        screenshot_b64 = file_to_base64(screenshot_path) if os.path.exists(screenshot_path) else ""
        plot_b64 = file_to_base64(plot_path) if os.path.exists(plot_path) else ""

        html += f"""
<div class="segment">
    <div class="text">
        <h3>{time_range}</h3>
        <p contenteditable="true">{text}</p>
    </div>
    <div class="media">
        <img src="{screenshot_b64}" alt="Screenshot at {start_sec}s">
        <img src="{plot_b64}" alt="Voice energy plot at {start_sec}s">
    </div>
</div>
"""

    html += "</body></html>"

    with open(output_html_path, "w", encoding="utf-8") as f:
        f.write(html)
    return output_html_path

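# Full pipeline for one upload: extract audio, transcribe, compute the
# voice-band energy curve, render per-segment media, and emit the HTML.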
def process(video_file):
    session_id = str(uuid.uuid4())
    base_dir = os.path.join("session_data", session_id)
    os.makedirs(base_dir, exist_ok=True)

    screenshots_dir = os.path.join(base_dir, "screenshots")
    plots_dir = os.path.join(base_dir, "plots")
    os.makedirs(screenshots_dir, exist_ok=True)
    os.makedirs(plots_dir, exist_ok=True)

    # gr.File may hand us a tempfile-like object or a plain path string,
    # depending on the Gradio version; handle both.
    video_path = video_file.name if hasattr(video_file, "name") else str(video_file)
    video_id = os.path.splitext(os.path.basename(video_path))[0]

    # Extract audio
    audio_path = extract_audio(video_path, base_dir)

    # Transcription
    result = model.transcribe(audio_path)
    vtt_path = os.path.join(base_dir, f"{video_id}.vtt")
    write_vtt(result["segments"], vtt_path)
    entries = parse_vtt(vtt_path)

    # Voice intensity curve
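    # (The 300-3000 Hz band below roughly covers core speech energy; the
    # mean STFT magnitude in that band is converted to dB.)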
    y, sr = librosa.load(audio_path, sr=None)
    S = np.abs(librosa.stft(y, n_fft=2048, hop_length=512))
    freqs = librosa.fft_frequencies(sr=sr, n_fft=2048)
    voice_band = (freqs >= 300) & (freqs <= 3000)
    voice_energy = S[voice_band, :].mean(axis=0)
    voice_db = 20 * np.log10(voice_energy + 1e-6)
    times = librosa.frames_to_time(np.arange(len(voice_db)), sr=sr, hop_length=512)

    # Generate screenshots + plots
    for time_range, _ in entries:
        start = time_range.split(" --> ")[0]
        start_sec = parse_timestamp(start)
        screenshot_out = os.path.join(screenshots_dir, f"{video_id}_{int(start_sec)}.jpg")
        plot_out = os.path.join(plots_dir, f"{video_id}_{int(start_sec)}_sound.png")
        capture_screenshot(video_path, start_sec, screenshot_out)
        save_voice_plot(times, voice_db, start_sec, plot_out)

    # HTML output
    html_output_path = os.path.join(base_dir, f"{video_id}.html")
    final_html = generate_html(entries, video_id, video_path, screenshots_dir, plots_dir, html_output_path)

    return final_html

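# Gradio UI: a single file input and a downloadable HTML file output.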
demo = gr.Interface(
    fn=process,
    inputs=[gr.File(label="Upload Video", file_types=[".mp4", ".mov", ".mkv"])],
    outputs=gr.File(label="Download Annotated HTML"),
    title="Video Annotated Transcript",
    description="🎥 Upload a video file (mp4/mov/mkv). The tool will transcribe speech, capture screenshots, analyze sound intensity, and generate an editable HTML transcript."
)

if __name__ == "__main__":
    demo.launch()
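
# Note: this Space needs ffmpeg available on the system (e.g. via packages.txt
# on Hugging Face Spaces) and, presumably, pip dependencies such as
# openai-whisper, gradio, librosa, matplotlib, and numpy in requirements.txt.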