# SmartScribe / app.py
import whisper
from moviepy.video.io.VideoFileClip import VideoFileClip
from transformers import pipeline
import nltk
import os
import re
import random
import subprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
# 📦 Download NLTK data (ideally run this in setup instead of here)
nltk.download('punkt_tab')
nltk.download('stopwords')
# 🔄 Global models (load once)
whisper_model = whisper.load_model("tiny.en") # Use 'base.en' for better accuracy if on GPU
summarizer = pipeline("summarization", model="t5-small", device=-1) # Use device=0 if GPU available
# 🔽 Download YouTube video
def download_youtube_video(youtube_url, filename="youtube_video.mp4"):
    print(f"⬇️ Downloading YouTube video via yt-dlp: {youtube_url}")
    # Prefer a single-file mp4; fall back to the best available format.
    command = ["yt-dlp", "-f", "best[ext=mp4]/best", "-o", filename, youtube_url]
    result = subprocess.run(command, capture_output=True, text=True)
    if result.returncode != 0:
        raise Exception("YouTube download failed: " + result.stderr)
    return filename
# 🎧 Extract audio from video
def extract_audio(video_path):
    clip = VideoFileClip(video_path)
    audio_path = "temp_audio.wav"
    clip.audio.write_audiofile(audio_path, codec='pcm_s16le')
    clip.close()  # release the file handle held by moviepy
    return audio_path
# πŸ“ Transcribe audio using Whisper
def transcribe_audio(audio_path):
result = whisper_model.transcribe(audio_path)
return result["text"]
# 📄 Generate summary in chunks
def generate_summary(text, default_max_len=130, default_min_len=30):
    sentences = sent_tokenize(text)
    # Summarize ~10 sentences at a time to stay within the model's input limit.
    chunks = [' '.join(sentences[i:i + 10]) for i in range(0, len(sentences), 10)]
    summary = ""
    for chunk in chunks:
        input_len = len(chunk.split())
        # Cap the output length below the input length so short chunks
        # do not produce padded, repetitive summaries.
        dynamic_max = max(20, min(default_max_len, input_len - 1))
        dynamic_min = max(10, min(default_min_len, dynamic_max - 10))
        result = summarizer(
            chunk,
            max_length=dynamic_max,
            min_length=dynamic_min,
            do_sample=False
        )[0]["summary_text"]
        summary += result + " "
    return summary.strip()
# ❓ Generate quiz
def generate_quiz(text, num_questions=5):
    sentences = sent_tokenize(text)
    num_questions = min(num_questions, len(sentences))  # avoid running out of sentences
    tfidf = TfidfVectorizer(stop_words='english', max_features=300)
    tfidf.fit_transform(sentences)  # fitted but not yet used for question selection
    quiz = []
    used = set()
    for _ in range(num_questions):
        i = random.choice([x for x in range(len(sentences)) if x not in used])
        used.add(i)
        question = sentences[i]
        options = [question]
        # Pad with distractor sentences; stop early if the transcript is very short.
        while len(options) < 4 and len(options) < len(sentences):
            j = random.randint(0, len(sentences) - 1)
            if j != i and sentences[j] not in options:
                options.append(sentences[j])
        random.shuffle(options)
        quiz.append({
            "question": question,
            "options": options,
            "answer": question
        })
    return "\n\n".join([
        f"Q{i + 1}: {q['question']}\nOptions:\n" +
        "\n".join([f"{chr(65 + j)}. {opt}" for j, opt in enumerate(q['options'])])
        for i, q in enumerate(quiz)
    ])
# 📺 Subtitle formatting
def generate_subtitles(text, max_words_per_line=10):
    sentences = sent_tokenize(text)
    subtitles = []
    count = 1
    for sentence in sentences:
        # Split on words (not characters) so each line holds at most
        # max_words_per_line words.
        words = sentence.split()
        chunks = [' '.join(words[i:i + max_words_per_line])
                  for i in range(0, len(words), max_words_per_line)]
        for chunk in chunks:
            subtitles.append(f"{count}. {chunk}")
            count += 1
    return "\n".join(subtitles)
# 🧪 Main processor
def process_video(video_path, selected_services):
    results = {}
    print("🔧 Extracting audio...")
    audio_path = extract_audio(video_path)
    # Every service below needs the transcript, so transcribe whenever
    # anything is selected (not only when "Transcription" itself is).
    transcription = transcribe_audio(audio_path) if selected_services else ""
    if "Transcription" in selected_services:
        results["transcription"] = transcription
    if "Summary" in selected_services:
        results["summary"] = generate_summary(transcription)
    if "Subtitles" in selected_services:
        results["subtitles"] = generate_subtitles(transcription)
    if "Quiz" in selected_services:
        results["quiz"] = generate_quiz(transcription)
    return results
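
# Example usage (illustrative sketch, not part of the original app wiring):
# downloads a clip with yt-dlp and runs the full pipeline. The URL and the
# service list below are placeholders.
if __name__ == "__main__":
    demo_url = "https://www.youtube.com/watch?v=EXAMPLE_ID"  # placeholder URL
    video_file = download_youtube_video(demo_url)
    outputs = process_video(video_file, ["Transcription", "Summary", "Subtitles", "Quiz"])
    for name, content in outputs.items():
        print(f"\n=== {name.upper()} ===\n{content}")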