Spaces:

clementBE
/

Video2Novel2

Sleeping

App Files Files Community

clementBE committed on Nov 17, 2025

Commit

1e2831b

verified ·

1 Parent(s): eee08a7

Update app.py

Browse files

Files changed (1) hide show

app.py +181 -28

app.py CHANGED Viewed

@@ -1,3 +1,63 @@
 import gradio as gr
 import os
 import subprocess
@@ -9,20 +69,36 @@ import uuid
 import base64
 import torch
 import shutil
-from docx import Document  # for DOCX export
-# Auto-select device: GPU if available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model = whisper.load_model("base", device=device)
 def format_timestamp(seconds):
     """
     Convert a time offset in seconds to a WebVTT timestamp (HH:MM:SS.mmm).

     The value is rounded to whole milliseconds first and then split with
     integer arithmetic. Subtracting the integer part from a float and
     truncating can lose a millisecond to binary rounding error (1.001 would
     come out as ".000"); working on an integer millisecond count avoids that
     and lets a rounded-up fraction carry into seconds, minutes and hours.

     Args:
         seconds: Non-negative time offset in seconds (int or float).

     Returns:
         The timestamp as a string, e.g. "01:01:01.500". Hours are
         zero-padded to two digits and grow beyond that if needed.
     """
     total_ms = int(round(seconds * 1000))
     total_s, ms = divmod(total_ms, 1000)
     total_m, s = divmod(total_s, 60)
     h, m = divmod(total_m, 60)
     return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"
 def write_vtt(segments, filepath):
     with open(filepath, "w", encoding="utf-8") as f:
         f.write("WEBVTT\n\n")
         for i, seg in enumerate(segments, start=1):
@@ -31,18 +107,33 @@ def write_vtt(segments, filepath):
             text = seg['text'].strip()
             f.write(f"{i}\n{start} --> {end}\n{text}\n\n")
 def write_docx(entries, filepath):
     doc = Document()
     doc.add_heading("Transcript", level=1)
-    full_text = " ".join([text for _, text in entries])  # concatenate all segments
-    doc.add_paragraph(full_text)  # single paragraph with all text
     doc.save(filepath)
     return filepath
 def parse_vtt(filepath):
     entries = []
     with open(filepath, "r", encoding="utf-8") as f:
         lines = f.readlines()
     idx = 0
     while idx < len(lines):
         line = lines[idx].strip()
@@ -58,19 +149,40 @@ def parse_vtt(filepath):
             idx += 1
     return entries
 def parse_timestamp(ts_str):
     """
     Convert a WebVTT timestamp string to a number of seconds.

     Accepts the full "HH:MM:SS.mmm" form and the hour-less "MM:SS.mmm"
     form that WebVTT also permits. The fractional part is scaled by its own
     digit count, so ".5" means half a second rather than 5 milliseconds.

     Args:
         ts_str: Timestamp text such as "00:01:02.500" or "01:02.500".

     Returns:
         The offset in seconds (a float whenever a fractional part is present).

     Raises:
         ValueError: If the string does not have two or three colon-separated
             fields, or a field is not a valid integer.
     """
     fields = ts_str.split(":")
     if len(fields) == 2:
         # WebVTT allows the hours field to be omitted.
         fields.insert(0, "0")
     if len(fields) != 3:
         raise ValueError(f"Invalid WebVTT timestamp: {ts_str!r}")
     h, m, rest = fields
     s, _, frac = rest.partition(".")
     fraction = int(frac) / 10 ** len(frac) if frac else 0
     return int(h) * 3600 + int(m) * 60 + int(s) + fraction
 def capture_screenshot(video_path, time_sec, out_path):
     """
     Grab a single video frame as an image by invoking ffmpeg.

     ffmpeg is asked to seek to `time_sec` in `video_path` and write one
     frame to `out_path`. Its stdout and stderr are captured and dropped,
     and the exit status is not inspected, so a failed capture is silent.

     Args:
         video_path: Path of the input video file.
         time_sec: Seek position in seconds; passed to ffmpeg via str().
         out_path: Destination path of the image file.
     """
     seek_and_input = ["-ss", str(time_sec), "-i", video_path]
     single_frame_output = ["-frames:v", "1", "-q:v", "2", out_path, "-y"]
     subprocess.run(
         ["ffmpeg", *seek_and_input, *single_frame_output],
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
     )
 def save_voice_plot(times, db, start_sec, out_path):
     plt.figure(figsize=(8, 3))
     plt.plot(times, db, color="purple")
     plt.axvline(x=start_sec, color="red", linestyle="--")
@@ -82,7 +194,14 @@ def save_voice_plot(times, db, start_sec, out_path):
     plt.savefig(out_path)
     plt.close()
 def file_to_base64(filepath):
     with open(filepath, "rb") as f:
         data = f.read()
     ext = os.path.splitext(filepath)[1].lower().replace('.', '')
@@ -90,14 +209,33 @@ def file_to_base64(filepath):
     b64 = base64.b64encode(data).decode('utf-8')
     return f"data:{mime};base64,{b64}"
 def extract_audio(video_path, output_dir):
     audio_path = os.path.join(output_dir, "audio.mp3")
     subprocess.run([
-        "ffmpeg", "-y", "-i", video_path, "-vn", "-acodec", "libmp3lame", audio_path
     ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     return audio_path
 def generate_html(entries, video_id, video_path, screenshot_dir, plot_dir, output_html_path):
     html = f"""<!DOCTYPE html>
 <html lang="en">
 <head>
@@ -113,19 +251,11 @@ def generate_html(entries, video_id, video_path, screenshot_dir, plot_dir, outpu
     }}
     .segment {{
         display: flex;
-        align-items: center;
         gap: 20px;
         margin-bottom: 40px;
     }}
-    .text {{
-        flex: 2;
-    }}
-    .media {{
-        flex: 3;
-        display: flex;
-        flex-direction: column;
-        gap: 10px;
-    }}
 </style>
 </head>
 <body>
@@ -136,6 +266,7 @@ def generate_html(entries, video_id, video_path, screenshot_dir, plot_dir, outpu
     for time_range, text in entries:
         start = time_range.split(" --> ")[0]
         start_sec = int(parse_timestamp(start))
         screenshot_path = os.path.join(screenshot_dir, f"{video_id}_{start_sec}.jpg")
         plot_path = os.path.join(plot_dir, f"{video_id}_{start_sec}_sound.png")
@@ -159,9 +290,25 @@ def generate_html(entries, video_id, video_path, screenshot_dir, plot_dir, outpu
     with open(output_html_path, "w", encoding="utf-8") as f:
         f.write(html)
     return output_html_path
 def process(video_file):
     session_id = str(uuid.uuid4())
     base_dir = os.path.join("session_data", session_id)
     os.makedirs(base_dir, exist_ok=True)
@@ -174,20 +321,20 @@ def process(video_file):
     video_path = video_file.name
     video_id = os.path.splitext(os.path.basename(video_path))[0]
-    # Extract audio
     audio_path = extract_audio(video_path, base_dir)
-    # Transcription
     result = model.transcribe(audio_path)
     vtt_path = os.path.join(base_dir, f"{video_id}.vtt")
     write_vtt(result["segments"], vtt_path)
     entries = parse_vtt(vtt_path)
-    # Create DOCX transcript
     docx_path = os.path.join(base_dir, f"{video_id}.docx")
     write_docx(entries, docx_path)
-    # Voice intensity curve
     y, sr = librosa.load(audio_path, sr=None)
     S = np.abs(librosa.stft(y, n_fft=2048, hop_length=512))
     freqs = librosa.fft_frequencies(sr=sr, n_fft=2048)
@@ -196,31 +343,37 @@ def process(video_file):
     voice_db = 20 * np.log10(voice_energy + 1e-6)
     times = librosa.frames_to_time(np.arange(len(voice_db)), sr=sr, hop_length=512)
-    # Generate screenshots + plots
     for time_range, _ in entries:
         start = time_range.split(" --> ")[0]
         start_sec = parse_timestamp(start)
-        screenshot_out = os.path.join(screenshots_dir, f"{video_id}_{int(start_sec)}.jpg")
-        plot_out = os.path.join(plots_dir, f"{video_id}_{int(start_sec)}_sound.png")
-        capture_screenshot(video_path, start_sec, screenshot_out)
-        save_voice_plot(times, voice_db, start_sec, plot_out)
-    # HTML output
     html_output_path = os.path.join(base_dir, f"{video_id}.html")
-    final_html = generate_html(entries, video_id, video_path, screenshots_dir, plots_dir, html_output_path)
-    # Create ZIP of screenshots
     zip_path = os.path.join(base_dir, f"{video_id}_screenshots.zip")
     shutil.make_archive(zip_path.replace(".zip", ""), "zip", screenshots_dir)
-    # Return files + HTML preview
     with open(final_html, "r", encoding="utf-8") as f:
         html_content = f.read()
     return docx_path, final_html, zip_path, html_content
 # Gradio UI
 demo = gr.Interface(
     fn=process,
     inputs=[gr.File(label="Upload Video", file_types=[".mp4", ".mov", ".mkv"])],

+"""
+===========================================================
+ Video Annotated Transcript Generator
+===========================================================
+This Gradio application processes a video file and produces:
+1. A full transcript (DOCX format)
+2. A WEBVTT subtitle file
+3. Screenshots at each transcript timestamp (ZIP)
+4. Voice intensity plots synchronized with the transcript
+5. An interactive HTML file showing:
+      - Screenshot
+      - Sound intensity plot
+      - Editable text of each segment
+The pipeline:
+-------------
+UPLOAD VIDEO
+ → Extract audio (ffmpeg)
+ → Transcribe speech using Whisper
+ → Produce VTT + DOCX
+ → Analyze sound intensity using Librosa
+ → Capture screenshots at segment timestamps
+ → Generate annotated HTML page
+ → Return all outputs to the user
+-----------------------------------------------------------
+ HOW TO GET VIDEOS USING “VIDEO DOWNLOADHELPER”
+-----------------------------------------------------------
+Video DownloadHelper is a browser extension (Firefox / Chrome)
+that allows you to save video files locally.
+Steps:
+1. Install the extension:
+   https://www.downloadhelper.net/
+2. Go to the video you want to download
+   (YouTube, Vimeo, news websites, etc.)
+3. Click the DownloadHelper icon in your browser.
+4. Choose a file format such as:
+     • MP4
+     • WebM
+     • MKV
+5. Save the file to your computer.
+6. Upload the saved file into this Gradio app. The upload field accepts
+   .mp4, .mov and .mkv files, so prefer MP4 or MKV over WebM.
+Note:
+- The extension cannot download YouTube videos with DRM.
+- If a website blocks downloading, try the “Companion App”
+  recommended by Video DownloadHelper.
+===========================================================
+"""
 import gradio as gr
 import os
 import subprocess
 import base64
 import torch
 import shutil
+from docx import Document  # DOCX export
+# ----------------------------------------------------------
+# Auto-select GPU if available for Whisper
+# ----------------------------------------------------------
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model = whisper.load_model("base", device=device)
+# ----------------------------------------------------------
+# Utility: Convert seconds → WebVTT timestamp format
+# ----------------------------------------------------------
 def format_timestamp(seconds):
+    """
+    Convert a time in seconds to the WebVTT timestamp format HH:MM:SS.mmm
+    """
     h = int(seconds // 3600)
     m = int((seconds % 3600) // 60)
     s = int(seconds % 60)
     ms = int((seconds - int(seconds)) * 1000)
     return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"
+# ----------------------------------------------------------
+# Write segments to a .vtt subtitle file
+# ----------------------------------------------------------
 def write_vtt(segments, filepath):
+    """
+    Save Whisper segments to a .vtt (WebVTT subtitle) file.
+    """
     with open(filepath, "w", encoding="utf-8") as f:
         f.write("WEBVTT\n\n")
         for i, seg in enumerate(segments, start=1):
             text = seg['text'].strip()
             f.write(f"{i}\n{start} --> {end}\n{text}\n\n")
+# ----------------------------------------------------------
+# Export transcript to DOCX
+# ----------------------------------------------------------
 def write_docx(entries, filepath):
+    """
+    Export transcript text into a single DOCX document.
+    """
     doc = Document()
     doc.add_heading("Transcript", level=1)
+    full_text = " ".join([text for _, text in entries])
+    doc.add_paragraph(full_text)
     doc.save(filepath)
     return filepath
+# ----------------------------------------------------------
+# Read a .vtt file and return list of (timerange, text)
+# ----------------------------------------------------------
 def parse_vtt(filepath):
+    """
+    Basic VTT parser: returns a list of (time range, text) pairs, where the
+    time range is the cue's "start --> end" string.
+    """
     entries = []
     with open(filepath, "r", encoding="utf-8") as f:
         lines = f.readlines()
     idx = 0
     while idx < len(lines):
         line = lines[idx].strip()
             idx += 1
     return entries
+# ----------------------------------------------------------
+# Parse a VTT timestamp "HH:MM:SS.mmm"
+# ----------------------------------------------------------
 def parse_timestamp(ts_str):
+    """
+    Convert WebVTT timestamp to seconds.
+    """
     h, m, rest = ts_str.split(":")
     s, ms = rest.split(".")
     return int(h)*3600 + int(m)*60 + int(s) + int(ms)/1000
+# ----------------------------------------------------------
+# Capture screenshot using ffmpeg
+# ----------------------------------------------------------
 def capture_screenshot(video_path, time_sec, out_path):
+    """
+    Extract a frame at a specific time using ffmpeg.
+    """
     cmd = [
         "ffmpeg", "-ss", str(time_sec), "-i", video_path,
         "-frames:v", "1", "-q:v", "2", out_path, "-y"
     ]
     subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+# ----------------------------------------------------------
+# Save a voice intensity plot around the timestamp
+# ----------------------------------------------------------
 def save_voice_plot(times, db, start_sec, out_path):
+    """
+    Plot voice-band intensity (300–3000 Hz) and mark the timestamp.
+    """
     plt.figure(figsize=(8, 3))
     plt.plot(times, db, color="purple")
     plt.axvline(x=start_sec, color="red", linestyle="--")
     plt.savefig(out_path)
     plt.close()
+# ----------------------------------------------------------
+# Convert image → base64 to embed in HTML
+# ----------------------------------------------------------
 def file_to_base64(filepath):
+    """
+    Convert a file to a base64 string for HTML embedding.
+    """
     with open(filepath, "rb") as f:
         data = f.read()
     ext = os.path.splitext(filepath)[1].lower().replace('.', '')
     b64 = base64.b64encode(data).decode('utf-8')
     return f"data:{mime};base64,{b64}"
+# ----------------------------------------------------------
+# Extract audio track from video
+# ----------------------------------------------------------
 def extract_audio(video_path, output_dir):
+    """
+    Extract audio as MP3 using ffmpeg.
+    """
     audio_path = os.path.join(output_dir, "audio.mp3")
     subprocess.run([
+        "ffmpeg", "-y", "-i", video_path, "-vn",
+        "-acodec", "libmp3lame", audio_path
     ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     return audio_path
+# ----------------------------------------------------------
+# Generate the annotated HTML transcript
+# ----------------------------------------------------------
 def generate_html(entries, video_id, video_path, screenshot_dir, plot_dir, output_html_path):
+    """
+    Create a complete HTML page showing:
+      - text
+      - screenshot
+      - voice plot
+    for each segment.
+    """
     html = f"""<!DOCTYPE html>
 <html lang="en">
 <head>
     }}
     .segment {{
         display: flex;
         gap: 20px;
         margin-bottom: 40px;
     }}
+    .text {{ flex: 2; }}
+    .media {{ flex: 3; display: flex; flex-direction: column; gap: 10px; }}
 </style>
 </head>
 <body>
     for time_range, text in entries:
         start = time_range.split(" --> ")[0]
         start_sec = int(parse_timestamp(start))
         screenshot_path = os.path.join(screenshot_dir, f"{video_id}_{start_sec}.jpg")
         plot_path = os.path.join(plot_dir, f"{video_id}_{start_sec}_sound.png")
     with open(output_html_path, "w", encoding="utf-8") as f:
         f.write(html)
     return output_html_path
+# ----------------------------------------------------------
+# The main processing pipeline executed by Gradio
+# ----------------------------------------------------------
 def process(video_file):
+    """
+    Main function:
+    - Creates session folder
+    - Extracts audio
+    - Runs Whisper transcription
+    - Generates VTT + DOCX
+    - Computes sound intensity
+    - Captures screenshots
+    - Builds annotated HTML
+    """
+    # Create isolated session
     session_id = str(uuid.uuid4())
     base_dir = os.path.join("session_data", session_id)
     os.makedirs(base_dir, exist_ok=True)
     video_path = video_file.name
     video_id = os.path.splitext(os.path.basename(video_path))[0]
+    # 1. Extract audio
     audio_path = extract_audio(video_path, base_dir)
+    # 2. Transcription using Whisper
     result = model.transcribe(audio_path)
     vtt_path = os.path.join(base_dir, f"{video_id}.vtt")
     write_vtt(result["segments"], vtt_path)
     entries = parse_vtt(vtt_path)
+    # 3. DOCX transcript
     docx_path = os.path.join(base_dir, f"{video_id}.docx")
     write_docx(entries, docx_path)
+    # 4. Voice intensity curve
     y, sr = librosa.load(audio_path, sr=None)
     S = np.abs(librosa.stft(y, n_fft=2048, hop_length=512))
     freqs = librosa.fft_frequencies(sr=sr, n_fft=2048)
     voice_db = 20 * np.log10(voice_energy + 1e-6)
     times = librosa.frames_to_time(np.arange(len(voice_db)), sr=sr, hop_length=512)
+    # 5. Screenshots + plots for each segment
     for time_range, _ in entries:
         start = time_range.split(" --> ")[0]
         start_sec = parse_timestamp(start)
+        capture_screenshot(video_path, start_sec,
+                           os.path.join(screenshots_dir, f"{video_id}_{int(start_sec)}.jpg"))
+        save_voice_plot(times, voice_db, start_sec,
+                        os.path.join(plots_dir, f"{video_id}_{int(start_sec)}_sound.png"))
+    # 6. HTML output
     html_output_path = os.path.join(base_dir, f"{video_id}.html")
+    final_html = generate_html(
+        entries, video_id, video_path,
+        screenshots_dir, plots_dir,
+        html_output_path
+    )
+    # 7. ZIP screenshots
     zip_path = os.path.join(base_dir, f"{video_id}_screenshots.zip")
     shutil.make_archive(zip_path.replace(".zip", ""), "zip", screenshots_dir)
+    # 8. HTML preview as text
     with open(final_html, "r", encoding="utf-8") as f:
         html_content = f.read()
     return docx_path, final_html, zip_path, html_content
+# ----------------------------------------------------------
 # Gradio UI
+# ----------------------------------------------------------
 demo = gr.Interface(
     fn=process,
     inputs=[gr.File(label="Upload Video", file_types=[".mp4", ".mov", ".mkv"])],