# Spaces:
# Sleeping
# Sleeping
import base64
import html
import logging
import os
import shutil
import subprocess
import uuid

import gradio as gr
import librosa
import matplotlib.pyplot as plt
import numpy as np
import torch
import whisper
from docx import Document  # DOCX export
# ----------------------------------------------------------
# Whisper model setup: use the GPU when CUDA is available
# ----------------------------------------------------------
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Loaded once at import time; process() reuses this module-level model.
model = whisper.load_model("base", device=device)
# ----------------------------------------------------------
# Utility: Convert seconds → WebVTT timestamp format
# ----------------------------------------------------------
def format_timestamp(seconds):
    """
    Convert a time in seconds to a WebVTT timestamp ``HH:MM:SS.mmm``.

    The value is first rounded to a whole number of milliseconds and then
    split with integer arithmetic. Truncating the fractional part of a float
    instead would turn e.g. 1.001 into ``.000`` and 2.3 into ``.299``.

    Args:
        seconds: Time offset in seconds (int or float). Negative values are
            clamped to zero.

    Returns:
        The formatted timestamp string, e.g. ``"01:01:01.500"``.
    """
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, s = divmod(total_seconds, 60)
    h, m = divmod(total_minutes, 60)
    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"
# ----------------------------------------------------------
# Write segments to a .vtt subtitle file
# ----------------------------------------------------------
def write_vtt(segments, filepath):
    """
    Write transcription segments to ``filepath`` as a WebVTT subtitle file.

    Each segment is a mapping with ``'start'``, ``'end'`` (seconds) and
    ``'text'`` keys. Cues are numbered from 1 and separated by a blank line.
    """
    with open(filepath, "w", encoding="utf-8") as out:
        out.write("WEBVTT\n\n")
        out.writelines(
            f"{number}\n"
            f"{format_timestamp(segment['start'])} --> "
            f"{format_timestamp(segment['end'])}\n"
            f"{segment['text'].strip()}\n\n"
            for number, segment in enumerate(segments, start=1)
        )
# ----------------------------------------------------------
# Export transcript to DOCX
# ----------------------------------------------------------
def write_docx(entries, filepath):
    """
    Save the transcript as a DOCX file and return ``filepath``.

    ``entries`` is an iterable of ``(time_range, text)`` pairs. Only the text
    is used: all pieces are joined with single spaces into one paragraph
    placed under a "Transcript" heading.
    """
    transcript = " ".join(text for _, text in entries)

    document = Document()
    document.add_heading("Transcript", level=1)
    document.add_paragraph(transcript)
    document.save(filepath)
    return filepath
# ----------------------------------------------------------
# Read a .vtt file and return list of (timerange, text)
# ----------------------------------------------------------
def parse_vtt(filepath):
    """
    Minimal VTT reader: return a list of ``(time_range, text)`` tuples.

    A cue starts at any line containing ``-->`` (kept verbatim, stripped, as
    ``time_range``). Every following non-blank line belongs to that cue and
    the lines are joined with single spaces. A blank line or the end of the
    file closes the cue.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        stripped_lines = [raw.strip() for raw in handle]

    cues = []
    current_range = None  # time range of the cue being collected, if any
    current_text = []

    for line in stripped_lines:
        if current_range is None:
            # Outside a cue: only a timing line is of interest.
            if "-->" in line:
                current_range = line
                current_text = []
        elif line:
            # Inside a cue: every non-blank line is cue text.
            current_text.append(line)
        else:
            # A blank line terminates the cue.
            cues.append((current_range, " ".join(current_text)))
            current_range = None

    if current_range is not None:
        # The file ended without a trailing blank line.
        cues.append((current_range, " ".join(current_text)))
    return cues
# ----------------------------------------------------------
# Parse a VTT timestamp "HH:MM:SS.mmm" (hours optional)
# ----------------------------------------------------------
def parse_timestamp(ts_str):
    """
    Convert a WebVTT timestamp to seconds.

    Accepted forms are ``HH:MM:SS.mmm`` and the short ``MM:SS.mmm`` (WebVTT
    allows the hours field to be omitted). The fractional part may have any
    number of digits or be absent; it is scaled by its own length, so ``.5``
    means half a second rather than 5 ms. Surrounding whitespace is ignored.

    Args:
        ts_str: Timestamp string.

    Returns:
        The offset in seconds as a float.

    Raises:
        ValueError: If the string does not have two or three ``:``-separated
            fields, or a field is not numeric.
    """
    fields = ts_str.strip().split(":")
    if len(fields) == 3:
        hours, minutes, seconds = fields
    elif len(fields) == 2:
        hours = "0"
        minutes, seconds = fields
    else:
        raise ValueError(f"Invalid WebVTT timestamp: {ts_str!r}")

    whole, _, fraction = seconds.partition(".")
    # Dividing by 10**len keeps three-digit fractions identical to ms/1000.
    fractional = int(fraction) / 10 ** len(fraction) if fraction else 0.0
    return int(hours) * 3600 + int(minutes) * 60 + int(whole) + fractional
# ----------------------------------------------------------
# Capture screenshot using ffmpeg
# ----------------------------------------------------------
def capture_screenshot(video_path, time_sec, out_path):
    """
    Extract a single video frame at ``time_sec`` into ``out_path`` via ffmpeg.

    This is best-effort: a failing ffmpeg run is logged as a warning and does
    not raise, so a missing screenshot never aborts the caller.

    Args:
        video_path: Path of the source video.
        time_sec: Position in seconds to seek to before grabbing the frame.
        out_path: Destination image path (overwritten if it exists).
    """
    cmd = [
        # -y goes first, as in extract_audio: options placed after the
        # output file may be ignored by ffmpeg.
        "ffmpeg", "-y",
        "-ss", str(time_sec), "-i", video_path,
        "-frames:v", "1", "-q:v", "2", out_path,
    ]
    result = subprocess.run(
        cmd,
        stdin=subprocess.DEVNULL,  # never wait on an interactive prompt
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
    )
    if result.returncode != 0:
        detail = result.stderr.decode("utf-8", errors="replace")[-500:].strip()
        logging.getLogger(__name__).warning(
            "ffmpeg could not capture a frame at %ss from %s: %s",
            time_sec, video_path, detail,
        )
# ----------------------------------------------------------
# Save a voice intensity plot around the timestamp
# ----------------------------------------------------------
def save_voice_plot(times, db, start_sec, out_path):
    """
    Plot the voice-band intensity curve and mark ``start_sec`` on it.

    Args:
        times: Time axis in seconds, one value per sample of ``db``.
        db: Intensity values (dB) plotted against ``times``.
        start_sec: Timestamp to highlight with a dashed vertical line and a
            marker at the interpolated curve value.
        out_path: Destination image path passed to ``savefig``.
    """
    # Use an explicit figure rather than pyplot's implicit "current figure",
    # and always close it so a failed plot or save cannot leak figures.
    fig, ax = plt.subplots(figsize=(8, 3))
    try:
        ax.plot(times, db, color="purple")
        ax.axvline(x=start_sec, color="red", linestyle="--")
        interp_val = np.interp(start_sec, times, db)
        ax.scatter([start_sec], [interp_val], color="red")
        ax.set_xlabel("Time (s)")
        ax.set_ylabel("Voice band dB")
        fig.tight_layout()
        fig.savefig(out_path)
    finally:
        plt.close(fig)
# ----------------------------------------------------------
# Convert image → base64 to embed in HTML
# ----------------------------------------------------------
def file_to_base64(filepath):
    """
    Return the file's contents as a ``data:image/...;base64,`` URI.

    The image subtype is the lower-cased file extension without dots, with
    ``jpg`` mapped to ``jpeg``.
    """
    suffix = os.path.splitext(filepath)[1].lower().replace(".", "")
    subtype = {"jpg": "jpeg"}.get(suffix, suffix)

    with open(filepath, "rb") as stream:
        encoded = base64.b64encode(stream.read()).decode("utf-8")

    return f"data:image/{subtype};base64,{encoded}"
# ----------------------------------------------------------
# Extract audio track from video
# ----------------------------------------------------------
def extract_audio(video_path, output_dir):
    """
    Extract the audio track of ``video_path`` as ``audio.mp3`` using ffmpeg.

    Args:
        video_path: Path of the source video.
        output_dir: Existing directory that receives ``audio.mp3``.

    Returns:
        The path of the written MP3 file.

    Raises:
        RuntimeError: If ffmpeg fails and no non-empty audio file was
            produced. This surfaces the real cause instead of handing back a
            path to a file that does not exist.
    """
    audio_path = os.path.join(output_dir, "audio.mp3")
    result = subprocess.run(
        [
            "ffmpeg", "-y", "-i", video_path, "-vn",
            "-acodec", "libmp3lame", audio_path,
        ],
        stdin=subprocess.DEVNULL,  # never wait on an interactive prompt
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
    )
    if result.returncode != 0:
        detail = result.stderr.decode("utf-8", errors="replace")[-500:].strip()
        produced = os.path.isfile(audio_path) and os.path.getsize(audio_path) > 0
        if not produced:
            raise RuntimeError(
                f"ffmpeg could not extract audio from {video_path!r}: {detail}"
            )
        # ffmpeg reported an error but still wrote audio: keep going.
        logging.getLogger(__name__).warning(
            "ffmpeg exited with code %s while extracting audio from %s: %s",
            result.returncode, video_path, detail,
        )
    return audio_path
# ----------------------------------------------------------
# Generate the annotated HTML transcript
# ----------------------------------------------------------
def generate_html(entries, video_id, video_path, screenshot_dir, plot_dir, output_html_path):
    """
    Create a complete HTML page showing, for each segment:
    - text (editable in the browser)
    - screenshot
    - voice plot

    Images are looked up as ``{video_id}_{start}.jpg`` in ``screenshot_dir``
    and ``{video_id}_{start}_sound.png`` in ``plot_dir``, where ``start`` is
    the segment start truncated to whole seconds. They are embedded as base64
    data URIs. An image whose file is missing is omitted rather than emitted
    with an empty ``src``.

    The video id, file name, time ranges and transcript text are HTML-escaped
    because they come from the uploaded file and its transcription.

    Args:
        entries: Iterable of ``(time_range, text)`` pairs, with ``time_range``
            formatted as ``"<start> --> <end>"``.
        video_id: Identifier used in the title and in image file names.
        video_path: Path of the video; only its base name is displayed.
        screenshot_dir: Directory holding the screenshots.
        plot_dir: Directory holding the voice plots.
        output_html_path: Destination path of the HTML file.

    Returns:
        ``output_html_path``.
    """
    safe_id = html.escape(str(video_id))
    safe_name = html.escape(os.path.basename(video_path))

    # Collect fragments and join once instead of repeated string +=.
    parts = [f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8"><title>{safe_id}</title>
<style>
body {{ font-family: Arial; font-size: 18px; margin: 20px; }}
.media img {{
width: 480px;
height: auto;
border: 1px solid #ccc;
border-radius: 6px;
box-shadow: 2px 2px 6px rgba(0,0,0,0.1);
}}
.segment {{
display: flex;
gap: 20px;
margin-bottom: 40px;
}}
.text {{ flex: 2; }}
.media {{ flex: 3; display: flex; flex-direction: column; gap: 10px; }}
</style>
</head>
<body>
<h1>Annotated Transcript for {safe_id}</h1>
<p>Uploaded video file: {safe_name}</p>
"""]

    for time_range, text in entries:
        start = time_range.split(" --> ")[0]
        start_sec = int(parse_timestamp(start))
        screenshot_path = os.path.join(screenshot_dir, f"{video_id}_{start_sec}.jpg")
        plot_path = os.path.join(plot_dir, f"{video_id}_{start_sec}_sound.png")

        images = []
        if os.path.exists(screenshot_path):
            images.append(
                f'<img src="{file_to_base64(screenshot_path)}" '
                f'alt="Screenshot at {start_sec}s">'
            )
        if os.path.exists(plot_path):
            images.append(
                f'<img src="{file_to_base64(plot_path)}" '
                f'alt="Voice energy plot at {start_sec}s">'
            )
        media = "\n".join(images)

        parts.append(f"""
<div class="segment">
<div class="text">
<h3>{html.escape(time_range)}</h3>
<p contenteditable="true">{html.escape(text)}</p>
</div>
<div class="media">
{media}
</div>
</div>
""")

    parts.append("</body></html>")
    with open(output_html_path, "w", encoding="utf-8") as f:
        f.write("".join(parts))
    return output_html_path
# ----------------------------------------------------------
# Helper: voice-band intensity curve of an audio file
# ----------------------------------------------------------
def _voice_band_db(audio_path, low_hz=300, high_hz=3000, n_fft=2048, hop_length=512):
    """
    Return ``(times, db)``: the mean STFT magnitude between ``low_hz`` and
    ``high_hz`` per frame, expressed in dB, with the frame times in seconds.
    """
    y, sr = librosa.load(audio_path, sr=None)
    magnitude = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length))
    freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
    voice_band = (freqs >= low_hz) & (freqs <= high_hz)
    voice_energy = magnitude[voice_band, :].mean(axis=0)
    # The small offset keeps log10 finite for silent frames.
    voice_db = 20 * np.log10(voice_energy + 1e-6)
    times = librosa.frames_to_time(
        np.arange(len(voice_db)), sr=sr, hop_length=hop_length
    )
    return times, voice_db


# ----------------------------------------------------------
# The main processing pipeline executed by Gradio
# ----------------------------------------------------------
def process(video_file):
    """
    Main function:
    - Creates session folder
    - Extracts audio
    - Runs Whisper transcription
    - Generates VTT + DOCX
    - Computes sound intensity
    - Captures screenshots
    - Builds annotated HTML

    Args:
        video_file: The uploaded video, either a path string or an object
            exposing the path as ``.name``.

    Returns:
        A tuple ``(docx_path, html_path, zip_path, html_content)`` matching
        the four output components of the Gradio interface.

    Raises:
        gr.Error: If no file was provided.
    """
    if video_file is None:
        raise gr.Error("Please upload a video file.")

    # Create isolated session
    session_id = str(uuid.uuid4())
    base_dir = os.path.join("session_data", session_id)
    screenshots_dir = os.path.join(base_dir, "screenshots")
    plots_dir = os.path.join(base_dir, "plots")
    os.makedirs(screenshots_dir, exist_ok=True)  # also creates base_dir
    os.makedirs(plots_dir, exist_ok=True)

    # Depending on the Gradio version/configuration the File component hands
    # over a plain path string or a temp-file wrapper with a .name attribute.
    video_path = video_file if isinstance(video_file, str) else video_file.name
    video_id = os.path.splitext(os.path.basename(video_path))[0]

    # 1. Extract audio
    audio_path = extract_audio(video_path, base_dir)

    # 2. Transcription using Whisper
    result = model.transcribe(audio_path)
    vtt_path = os.path.join(base_dir, f"{video_id}.vtt")
    write_vtt(result["segments"], vtt_path)
    entries = parse_vtt(vtt_path)

    # 3. DOCX transcript
    docx_path = os.path.join(base_dir, f"{video_id}.docx")
    write_docx(entries, docx_path)

    # 4. Voice intensity curve
    times, voice_db = _voice_band_db(audio_path)

    # 5. Screenshots + plots for each segment
    # NOTE: file names use the start time truncated to whole seconds, so
    # segments starting within the same second share one screenshot/plot.
    for time_range, _ in entries:
        start = time_range.split(" --> ")[0]
        start_sec = parse_timestamp(start)
        capture_screenshot(
            video_path, start_sec,
            os.path.join(screenshots_dir, f"{video_id}_{int(start_sec)}.jpg"),
        )
        save_voice_plot(
            times, voice_db, start_sec,
            os.path.join(plots_dir, f"{video_id}_{int(start_sec)}_sound.png"),
        )

    # 6. HTML output
    html_output_path = os.path.join(base_dir, f"{video_id}.html")
    final_html = generate_html(
        entries, video_id, video_path,
        screenshots_dir, plots_dir,
        html_output_path,
    )

    # 7. ZIP screenshots
    # Build the archive base name directly: stripping ".zip" with
    # str.replace would also remove any ".zip" inside the video name and
    # make the returned path point at a file that was never written.
    archive_base = os.path.join(base_dir, f"{video_id}_screenshots")
    shutil.make_archive(archive_base, "zip", screenshots_dir)
    zip_path = archive_base + ".zip"

    # 8. HTML preview as text
    with open(final_html, "r", encoding="utf-8") as f:
        html_content = f.read()

    return docx_path, final_html, zip_path, html_content
# ----------------------------------------------------------
# Gradio UI
# ----------------------------------------------------------
# Text shown under the title of the Gradio interface.
full_description = """
===========================================================
Video Annotated Transcript Generator
===========================================================
This application processes a video file and produces:
1. A complete transcript (DOCX)
2. A WEBVTT subtitle file
3. A ZIP file with screenshots at each transcript timestamp
4. Sound intensity plots aligned with speech
5. An interactive HTML file showing:
• The transcript (editable)
• A screenshot for each segment
• A voice intensity graph
-----------------------------------------------------------
HOW THE PIPELINE WORKS
-----------------------------------------------------------
UPLOAD VIDEO
→ Extract audio (ffmpeg)
→ Transcribe speech (Whisper)
→ Generate VTT + DOCX
→ Compute sound intensity (Librosa)
→ Capture screenshots (ffmpeg)
→ Generate interactive HTML
→ Return all outputs
-----------------------------------------------------------
HOW TO DOWNLOAD VIDEOS USING VIDEO DOWNLOADHELPER
-----------------------------------------------------------
Video DownloadHelper is a browser extension (Firefox / Chrome)
that allows you to save video files directly.
Steps:
1. Install the extension:
https://www.downloadhelper.net/
2. Visit the video page (YouTube, Vimeo, news websites, etc.)
3. Click the DownloadHelper icon in your browser.
4. Choose a file format:
• MP4
• WebM
• MKV
5. Save the video to your computer.
6. Upload that file into this Gradio app.
===========================================================
"""

demo = gr.Interface(
    fn=process,
    inputs=[
        gr.File(
            label="Upload Video",
            # .webm is accepted because the description above tells users
            # they may save videos in that format before uploading.
            file_types=[".mp4", ".mov", ".mkv", ".webm"],
        )
    ],
    # Order matches the tuple returned by process().
    outputs=[
        gr.File(label="Download Transcript (DOCX)"),
        gr.File(label="Download Annotated HTML"),
        gr.File(label="Download Screenshots (ZIP)"),
        gr.HTML(label="Preview Annotated Transcript"),
    ],
    title="Video2Novel",
    description=full_description,
)

if __name__ == "__main__":
    demo.launch()