Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import os | |
| import subprocess | |
| import whisper | |
| import librosa | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| import uuid | |
| import base64 | |
| import torch | |
| import shutil | |
| from docx import Document # DOCX export | |
| # ---------------------------------------------------------- | |
| # Auto-select GPU if available for Whisper | |
| # ---------------------------------------------------------- | |
# Run Whisper on the GPU when torch can see one; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the "base" Whisper checkpoint once at import time; process() reuses
# this module-level model for every request.
model = whisper.load_model("base", device=device)
| # ---------------------------------------------------------- | |
| # Utility: Convert seconds → WebVTT timestamp format | |
| # ---------------------------------------------------------- | |
def format_timestamp(seconds):
    """Convert a time offset in seconds to a WebVTT timestamp string.

    Args:
        seconds: Offset from the start of the media, in seconds (int or
            float). Negative values are clamped to zero because WebVTT has
            no notation for negative offsets.

    Returns:
        The offset formatted as ``HH:MM:SS.mmm``.
    """
    # Work in whole milliseconds. Rounding once up front avoids the float
    # truncation that rendered e.g. 1.001 s as "00:00:01.000", and lets a
    # value such as 59.9996 s carry cleanly into the next minute.
    total_ms = max(0, int(round(seconds * 1000)))
    total_s, ms = divmod(total_ms, 1000)
    total_m, s = divmod(total_s, 60)
    h, m = divmod(total_m, 60)
    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"
| # ---------------------------------------------------------- | |
| # Write segments to a .vtt subtitle file | |
| # ---------------------------------------------------------- | |
def write_vtt(segments, filepath):
    """Write transcription segments to *filepath* as a WebVTT subtitle file.

    Args:
        segments: Iterable of mappings; each is read through its 'start',
            'end' and 'text' keys. 'start' and 'end' are passed to
            format_timestamp(); 'text' is stripped of surrounding whitespace.
        filepath: Destination path, written as UTF-8 (overwritten if present).
    """
    def cue_block(number, segment):
        # One cue: identifier line, timing line, payload, blank separator.
        begin = format_timestamp(segment['start'])
        finish = format_timestamp(segment['end'])
        caption = segment['text'].strip()
        return f"{number}\n{begin} --> {finish}\n{caption}\n\n"

    with open(filepath, "w", encoding="utf-8") as out:
        out.write("WEBVTT\n\n")
        # Cue identifiers are numbered from 1.
        for number, segment in enumerate(segments, start=1):
            out.write(cue_block(number, segment))
| # ---------------------------------------------------------- | |
| # Export transcript to DOCX | |
| # ---------------------------------------------------------- | |
def write_docx(entries, filepath):
    """Save the transcript as a DOCX file holding one heading and one paragraph.

    Args:
        entries: Iterable of 2-item pairs; only the second item (the text)
            of each pair is used. The texts are joined with single spaces.
        filepath: Destination path handed to ``Document.save``.

    Returns:
        *filepath*, unchanged.
    """
    transcript = " ".join(caption for _, caption in entries)

    document = Document()
    document.add_heading("Transcript", level=1)
    document.add_paragraph(transcript)
    document.save(filepath)
    return filepath
| # ---------------------------------------------------------- | |
| # Read a .vtt file and return list of (timerange, text) | |
| # ---------------------------------------------------------- | |
def parse_vtt(filepath):
    """Read a .vtt file and return its cues as (time_range, text) tuples.

    A cue starts at any line containing "-->" (kept, stripped, as the
    time range). The following lines up to the next blank line or end of
    file are stripped and joined with single spaces to form the text.
    Every other line (header, cue numbers, blank lines) is ignored.

    Args:
        filepath: Path of the UTF-8 encoded VTT file.

    Returns:
        List of ``(time_range, text)`` tuples in file order.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        stripped_lines = [raw.strip() for raw in handle]

    cues = []
    # A single shared iterator lets the inner loop consume a cue's payload
    # lines so the outer loop resumes right after the cue.
    cursor = iter(stripped_lines)
    for current in cursor:
        if "-->" not in current:
            continue
        payload = []
        for following in cursor:
            if following == '':
                break
            payload.append(following)
        cues.append((current, ' '.join(payload)))
    return cues
| # ---------------------------------------------------------- | |
| # Parse a VTT timestamp "HH:MM:SS.MS" | |
| # ---------------------------------------------------------- | |
def parse_timestamp(ts_str):
    """Convert a WebVTT timestamp string to seconds.

    Accepts ``HH:MM:SS.mmm`` as well as the shorter ``MM:SS.mmm`` form that
    WebVTT also allows. The fractional part may have any number of digits
    and is optional.

    Args:
        ts_str: Timestamp text, e.g. ``"00:01:02.500"``.

    Returns:
        The offset in seconds as a float.

    Raises:
        ValueError: If the string does not have two or three ':'-separated
            fields, or a field is not an integer.
    """
    fields = ts_str.strip().split(":")
    if len(fields) == 3:
        hours, minutes, rest = fields
    elif len(fields) == 2:
        hours = "0"
        minutes, rest = fields
    else:
        raise ValueError(f"Invalid VTT timestamp: {ts_str!r}")

    whole, _, frac = rest.partition(".")
    # Scale by the number of digits actually present: ".5" means 0.5 s,
    # not 5 ms as a fixed division by 1000 would yield.
    fraction = int(frac) / 10 ** len(frac) if frac else 0.0
    return int(hours) * 3600 + int(minutes) * 60 + int(whole) + fraction
| # ---------------------------------------------------------- | |
| # Capture screenshot using ffmpeg | |
| # ---------------------------------------------------------- | |
def capture_screenshot(video_path, time_sec, out_path):
    """Grab a single video frame with ffmpeg and save it as an image.

    This is best-effort: the ffmpeg exit status is not inspected and no
    exception is raised when ffmpeg fails, so *out_path* may not exist
    afterwards. (An exception is still raised if the ffmpeg executable
    itself cannot be started.)

    Args:
        video_path: Path of the source video.
        time_sec: Position of the frame, in seconds from the start.
        out_path: Destination image path; overwritten if it exists.
    """
    cmd = [
        # -y is a global option, so it goes first (as in extract_audio)
        # rather than trailing the output file.
        "ffmpeg", "-y",
        # -ss placed before -i seeks in the input instead of decoding up
        # to the requested position.
        "-ss", str(time_sec), "-i", video_path,
        "-frames:v", "1", "-q:v", "2", out_path,
    ]
    # ffmpeg's console output is never read here, so discard it instead of
    # buffering it in memory.
    subprocess.run(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        check=False,
    )
| # ---------------------------------------------------------- | |
| # Save a voice intensity plot with a marker at the timestamp | |
| # ---------------------------------------------------------- | |
def save_voice_plot(times, db, start_sec, out_path):
    """Plot the voice-band level curve and mark one point in time on it.

    The whole (times, db) series that is passed in is drawn; *start_sec* is
    highlighted with a dashed vertical line and a dot on the curve.

    Args:
        times: X values of the curve, in seconds. Presumably a 1-D
            increasing sequence, as ``np.interp`` expects (confirm with
            the caller).
        db: Y values of the curve (labelled "Voice band dB"), same length
            as *times*.
        start_sec: Time position to highlight, in seconds.
        out_path: Image path passed to ``plt.savefig``.
    """
    fig = plt.figure(figsize=(8, 3))
    try:
        plt.plot(times, db, color="purple")
        plt.axvline(x=start_sec, color="red", linestyle="--")
        # Interpolate so the dot sits on the curve even when start_sec
        # falls between two sample times.
        level_at_start = np.interp(start_sec, times, db)
        plt.scatter([start_sec], [level_at_start], color="red")
        plt.xlabel("Time (s)")
        plt.ylabel("Voice band dB")
        plt.tight_layout()
        plt.savefig(out_path)
    finally:
        # Always release the figure: pyplot keeps figures alive until they
        # are closed, which would accumulate in a long-running server if
        # plotting or saving raised.
        plt.close(fig)
| # ---------------------------------------------------------- | |
| # Convert image → base64 to embed in HTML | |
| # ---------------------------------------------------------- | |
def file_to_base64(filepath):
    """Return the file's content as a base64 ``data:image/...`` URI.

    The image subtype is taken from the lower-cased file extension, with
    "jpg" mapped to "jpeg"; no check is made that the file is an image.

    Args:
        filepath: Path of the file to embed.

    Returns:
        A string of the form ``data:image/<subtype>;base64,<payload>``.
    """
    suffix = os.path.splitext(filepath)[1]
    kind = suffix.lower().replace('.', '')
    subtype = "jpeg" if kind == "jpg" else kind

    with open(filepath, "rb") as stream:
        payload = base64.b64encode(stream.read())

    return "data:image/" + subtype + ";base64," + payload.decode('utf-8')
| # ---------------------------------------------------------- | |
| # Extract audio track from video | |
| # ---------------------------------------------------------- | |
def extract_audio(video_path, output_dir):
    """Extract the audio track of a video to ``<output_dir>/audio.mp3``.

    Args:
        video_path: Path of the source video.
        output_dir: Existing directory that receives ``audio.mp3``
            (overwritten if present).

    Returns:
        Path of the written MP3 file.

    Raises:
        RuntimeError: If ffmpeg exits with an error or produces no file,
            e.g. because the input is unreadable.
    """
    audio_path = os.path.join(output_dir, "audio.mp3")
    completed = subprocess.run(
        [
            "ffmpeg", "-y", "-i", video_path, "-vn",
            "-acodec", "libmp3lame", audio_path,
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # Fail here with ffmpeg's own diagnostics instead of handing a path to
    # a missing file to the next processing step.
    if completed.returncode != 0 or not os.path.exists(audio_path):
        detail = completed.stderr.decode("utf-8", errors="replace").strip()
        raise RuntimeError(
            f"ffmpeg could not extract audio from {video_path!r}: {detail[-500:]}"
        )
    return audio_path
| # ---------------------------------------------------------- | |
| # Generate the annotated HTML transcript with category selection & XLSX export | |
| # ---------------------------------------------------------- | |
def generate_html(entries, video_id, video_path, screenshot_dir, plot_dir, output_html_path):
    """Write a self-contained annotated-transcript HTML page.

    For every entry the page shows the time range, the editable transcript
    text, two category drop-downs with comment boxes, and two images
    embedded as base64 data URIs: the screenshot
    ``<screenshot_dir>/<video_id>_<start>.jpg`` and the plot
    ``<plot_dir>/<video_id>_<start>_sound.png``, where ``<start>`` is the
    entry's start time truncated to whole seconds. An image that does not
    exist on disk is emitted with an empty ``src``. The page loads SheetJS
    from a CDN to export the annotations to XLSX in the browser.

    Args:
        entries: Iterable of ``(time_range, text)`` pairs; the start time
            is the part of *time_range* before ``" --> "``.
        video_id: Identifier used in the title, heading and image names.
        video_path: Path of the uploaded video; only its base name is shown.
        screenshot_dir: Directory searched for screenshots.
        plot_dir: Directory searched for plots.
        output_html_path: Destination file, written as UTF-8.

    Returns:
        *output_html_path*, unchanged.
    """
    # Imported locally so this function carries its own dependency.
    from html import escape

    dispositif_options = [
        "Texte scripté incarné à l'écran",
        "Interview",
        "Conversation/Débats (texte non scripté, incarnation à l'écran)",
        "Montage sans voix-off (absence de texte, absence d'incarnation)",
        "Montage avec voix-off (texte scripté non incarné à l'écran)",
        "Autres (performance artistique, etc.)"
    ]
    thematic_options = [
        "1. Politiques", "1.1 Politique institutionnelle", "1.2 Politiques publiques",
        "1.3 Mouvements sociaux et association", "1.4 Justice, police, sécurité",
        "1.5 Relations internationales, enjeux géopolitiques", "1.6 Démocratie, vie politique au sens large",
        "2. Économie & travail", "2.1 Entreprises, marchés, entrepreneuriat", "2.2 Travail, emploi, chômage",
        "2.3 Précarité, inégalités économiques", "2.4 Consommation, pouvoir d’achat", "2.5 Innovation économique, start-up, plateformes",
        "2.6 Finances et impôts",
        "3. Questions sociales", "3.1 (In)égalités sociales (classe, genre, race, âge)", "3.2 Famille, parentalité, intimité",
        "3.3 Santé", "3.4 Éducation, jeunesse", "3.5 Vie quotidienne / modes de vie / alimentation",
        "3.6 Solidarités, aides sociales",
        "4. Culture & médias", "4.1 Médias et journalisme", "4.2 Culture (Patrimoine, arts, littérature, cinéma, séries, musique)",
        "4.3 Célébrités, influenceurs",
        "5. Numérique, technologies & plateformes", "5.1 Réseaux sociaux et plateformes", "5.2 Intelligence artificielle",
        "5.3 Usages numériques (pratiques, dépendances)", "5.4 Cyberviolences, harcèlement en ligne",
        "6. Environnement & sciences", "6.1 Environnement, biodiversité", "6.2 Sciences et recherche", "6.3 Risques, catastrophes naturelles",
        "7. Faits divers", "7.1 Criminalité, violences, agression", "7.2 Accidents, drame", "7.3 Disparitions, affaires judiciaires",
        "8. Sport, loisirs & divertissement", "8.1 Sport professionnel / amateur", "8.2 Loisirs, voyage", "8.3 Jeux, divertissements"
    ]

    def render_options(options):
        # Escape labels so quotes or '&' cannot break the value attribute;
        # the browser decodes them back to the original text.
        return ''.join(
            f'<option value="{escape(opt)}">{escape(opt)}</option>' for opt in options
        )

    # The option markup is identical for every segment: build it once.
    dispositif_html = render_options(dispositif_options)
    thematic_html = render_options(thematic_options)

    # Everything interpolated into markup is escaped: transcript text and
    # file names are external input and may contain '<', '>' or '&'.
    safe_id = escape(video_id)
    safe_name = escape(os.path.basename(video_path))

    # Collect fragments and join once; repeated '+=' would re-copy the
    # growing page (which embeds base64 images) on every segment.
    parts = [f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{safe_id}</title>
<script src="https://cdnjs.cloudflare.com/ajax/libs/xlsx/0.18.5/xlsx.full.min.js"></script>
<style>
body {{ font-family: Arial; font-size: 18px; margin: 20px; }}
.media img {{ width: 480px; height: auto; border: 1px solid #ccc; border-radius: 6px; box-shadow: 2px 2px 6px rgba(0,0,0,0.1); }}
.segment {{ display: flex; gap: 20px; margin-bottom: 40px; }}
.text {{ flex: 2; }}
.media {{ flex: 3; display: flex; flex-direction: column; gap: 10px; }}
select, textarea {{ width: 100%; margin-top: 5px; }}
button {{ padding: 10px 15px; font-size: 16px; margin-bottom: 20px; }}
</style>
</head>
<body>
<h1>Annotated Transcript for {safe_id}</h1>
<p>Uploaded video file: {safe_name}</p>
<button onclick="exportToExcel()">Export to XLSX</button>
"""]

    for idx, (time_range, text) in enumerate(entries):
        start = time_range.split(" --> ")[0]
        # Image files are keyed by the start time in whole seconds.
        start_sec = int(parse_timestamp(start))
        screenshot_path = os.path.join(screenshot_dir, f"{video_id}_{start_sec}.jpg")
        plot_path = os.path.join(plot_dir, f"{video_id}_{start_sec}_sound.png")
        screenshot_b64 = file_to_base64(screenshot_path) if os.path.exists(screenshot_path) else ""
        plot_b64 = file_to_base64(plot_path) if os.path.exists(plot_path) else ""
        parts.append(f"""
<div class="segment" id="segment_{idx}">
<div class="text">
<h3>{escape(time_range)}</h3>
<p contenteditable="true">{escape(text)}</p>
<label>Catégorie dispositif médiatique:</label>
<select class="dispositif" onchange="applyNext(this, 'dispositif')">
<option value="">--Select--</option>
{dispositif_html}
</select>
<textarea placeholder="Commentaire dispositif..." rows="2"></textarea>
<label>Catégorie thématique:</label>
<select class="thematic" onchange="applyNext(this, 'thematic')">
<option value="">--Select--</option>
{thematic_html}
</select>
<textarea placeholder="Commentaire thématique..." rows="2"></textarea>
</div>
<div class="media">
<img src="{screenshot_b64}" alt="Screenshot at {start_sec}s">
<img src="{plot_b64}" alt="Voice energy plot at {start_sec}s">
</div>
</div>
""")

    # JavaScript functions for apply-next and export (plain string, not an
    # f-string, so the braces need no doubling).
    parts.append("""
<script>
function applyNext(selectElem, cls) {
if(!confirm("Apply this selection to all next segments?")) return;
let segments = document.querySelectorAll('.segment');
let found = false;
segments.forEach(seg => {
if(found) {
let target = seg.querySelector('.' + cls);
if(target) target.value = selectElem.value;
}
if(seg.contains(selectElem)) found = true;
});
}
function exportToExcel() {
let segments = document.querySelectorAll('.segment');
let data = [];
segments.forEach((seg, idx) => {
let time = seg.querySelector('h3').innerText;
let text = seg.querySelector('p').innerText;
let dispositif = seg.querySelector('.dispositif').value;
let dispositif_comment = seg.querySelector('textarea:nth-of-type(1)').value;
let thematic = seg.querySelector('.thematic').value;
let thematic_comment = seg.querySelector('textarea:nth-of-type(2)').value;
data.push({
"Segment #": idx + 1,
"Time": time,
"Transcript": text,
"Dispositif médiatique": dispositif,
"Commentaire dispositif": dispositif_comment,
"Catégorie thématique": thematic,
"Commentaire thématique": thematic_comment
});
});
let ws = XLSX.utils.json_to_sheet(data);
let wb = XLSX.utils.book_new();
XLSX.utils.book_append_sheet(wb, ws, "Transcript");
XLSX.writeFile(wb, "annotated_transcript.xlsx");
}
</script>
""")
    parts.append("</body></html>")

    with open(output_html_path, "w", encoding="utf-8") as f:
        f.write(''.join(parts))
    return output_html_path
| # ---------------------------------------------------------- | |
| # Main processing pipeline executed by Gradio | |
| # ---------------------------------------------------------- | |
def process(video_file):
    """Run the full pipeline for one uploaded video (Gradio callback).

    Steps: extract the audio track, transcribe it with the module-level
    Whisper model, write a VTT and a DOCX transcript, compute a voice-band
    level curve, capture a screenshot and save a level plot for every
    subtitle cue, build the annotated HTML page, and zip the screenshots.
    All artefacts are written under ``session_data/<random uuid>/``.

    Args:
        video_file: The uploaded video, either an object exposing the file
            path as ``.name`` or the path itself as a string.
            NOTE(review): which of the two Gradio passes depends on its
            version/configuration; both are accepted here.

    Returns:
        Tuple ``(docx_path, html_path, zip_path, html_content)`` matching
        the four Gradio outputs: three file paths and the HTML page text.

    Raises:
        ValueError: If no file was provided.
    """
    if video_file is None:
        raise ValueError("No video file was provided.")
    # Accept both a file-like object with .name and a plain path string.
    video_path = getattr(video_file, "name", video_file)
    video_id = os.path.splitext(os.path.basename(video_path))[0]

    # One fresh directory per request keeps concurrent sessions apart.
    session_id = str(uuid.uuid4())
    base_dir = os.path.join("session_data", session_id)
    screenshots_dir = os.path.join(base_dir, "screenshots")
    plots_dir = os.path.join(base_dir, "plots")
    os.makedirs(screenshots_dir, exist_ok=True)
    os.makedirs(plots_dir, exist_ok=True)

    # --- Transcription ---------------------------------------------------
    audio_path = extract_audio(video_path, base_dir)
    result = model.transcribe(audio_path)
    vtt_path = os.path.join(base_dir, f"{video_id}.vtt")
    write_vtt(result["segments"], vtt_path)
    # Re-read the VTT so all later steps share the (time_range, text) form.
    entries = parse_vtt(vtt_path)
    docx_path = os.path.join(base_dir, f"{video_id}.docx")
    write_docx(entries, docx_path)

    # --- Voice-band level curve -------------------------------------------
    y, sr = librosa.load(audio_path, sr=None)
    S = np.abs(librosa.stft(y, n_fft=2048, hop_length=512))
    freqs = librosa.fft_frequencies(sr=sr, n_fft=2048)
    # Mean STFT magnitude over the 300-3000 Hz bins, per frame.
    voice_band = (freqs >= 300) & (freqs <= 3000)
    voice_energy = S[voice_band, :].mean(axis=0)
    # The small offset avoids log10(0) on silent frames.
    voice_db = 20 * np.log10(voice_energy + 1e-6)
    times = librosa.frames_to_time(np.arange(len(voice_db)), sr=sr, hop_length=512)

    # --- Per-cue screenshot and plot --------------------------------------
    # File names use the start time truncated to whole seconds, so cues
    # that start within the same second share one file (the later cue
    # overwrites the earlier one).
    for time_range, _ in entries:
        start = time_range.split(" --> ")[0]
        start_sec = parse_timestamp(start)
        capture_screenshot(video_path, start_sec,
                           os.path.join(screenshots_dir, f"{video_id}_{int(start_sec)}.jpg"))
        save_voice_plot(times, voice_db, start_sec,
                        os.path.join(plots_dir, f"{video_id}_{int(start_sec)}_sound.png"))

    # --- HTML page and screenshot archive ---------------------------------
    html_output_path = os.path.join(base_dir, f"{video_id}.html")
    final_html = generate_html(
        entries, video_id, video_path,
        screenshots_dir, plots_dir,
        html_output_path
    )
    # make_archive appends ".zip" to the base name itself. Building the
    # base name directly avoids stripping ".zip" from elsewhere in the
    # path (e.g. a video called "clip.zip.mp4").
    zip_base = os.path.join(base_dir, f"{video_id}_screenshots")
    shutil.make_archive(zip_base, "zip", screenshots_dir)
    zip_path = zip_base + ".zip"

    with open(final_html, "r", encoding="utf-8") as f:
        html_content = f.read()
    return docx_path, final_html, zip_path, html_content
| # ---------------------------------------------------------- | |
| # Gradio UI | |
| # ---------------------------------------------------------- | |
# Text passed to the Gradio interface as its description.
full_description = """
===========================================================
Video Annotated Transcript Generator
===========================================================
Upload a video and get:
1. Transcript (DOCX)
2. VTT subtitles
3. Screenshots (ZIP)
4. Voice intensity plots
5. Interactive HTML with editable text, screenshots, voice plots, category selection, and XLSX export.
"""

# Single-function app: one uploaded video in; three downloadable files and
# an inline HTML preview out, in the order of the tuple process() returns.
demo = gr.Interface(
    title="Video2Novel",
    description=full_description,
    fn=process,
    inputs=[
        gr.File(label="Upload Video", file_types=[".mp4", ".mov", ".mkv"]),
    ],
    outputs=[
        gr.File(label="Download Transcript (DOCX)"),
        gr.File(label="Download Annotated HTML"),
        gr.File(label="Download Screenshots (ZIP)"),
        gr.HTML(label="Preview Annotated Transcript"),
    ],
)

# Launch the app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()