thomasanto7001 committed on
Commit
08d3703
·
verified ·
1 Parent(s): f3aa119

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -63
app.py CHANGED
@@ -6,16 +6,19 @@ import os
6
  import re
7
  import random
8
  import subprocess
9
- import datetime
10
  from sklearn.feature_extraction.text import TfidfVectorizer
11
  from nltk.tokenize import sent_tokenize
12
  from nltk.corpus import stopwords
13
 
14
- # Download necessary NLTK data
15
  nltk.download('punkt')
16
  nltk.download('stopwords')
 
 
17
  stop_words = set(stopwords.words('english'))
 
18
 
 
19
  def download_youtube_video(youtube_url, filename="youtube_video.mp4"):
20
  print(f"⬇️ Downloading YouTube video via yt-dlp: {youtube_url}")
21
  command = ["yt-dlp", "-f", "best[ext=mp4]+bestaudio/best", "-o", filename, youtube_url]
@@ -24,79 +27,42 @@ def download_youtube_video(youtube_url, filename="youtube_video.mp4"):
24
  raise Exception("YouTube download failed: " + result.stderr)
25
  return filename
26
 
 
27
  def extract_audio(video_path):
28
  clip = VideoFileClip(video_path)
29
  audio_path = "temp_audio.wav"
30
  clip.audio.write_audiofile(audio_path, codec='pcm_s16le')
31
  return audio_path
32
 
 
33
  def transcribe_audio(audio_path):
34
- model = whisper.load_model("base")
35
- result = model.transcribe(audio_path, word_timestamps=True) # includes segment timestamps
36
- return result # full result with segments
37
-
38
- def format_timestamp(seconds):
39
- """Convert seconds to SRT timestamp format."""
40
- td = datetime.timedelta(seconds=float(seconds))
41
- return str(td)[:12].replace('.', ',')
42
-
43
- def generate_subtitles(transcription):
44
- """
45
- Generate subtitles in a format similar to SRT using Whisper's segment output.
46
- Assumes transcription is a dict containing a 'segments' key with timestamps.
47
- """
48
- if not transcription or 'segments' not in transcription:
49
- raise ValueError("Transcription must include 'segments'.")
50
-
51
- subtitles = []
52
- for idx, segment in enumerate(transcription['segments'], start=1):
53
- start = format_timestamp(segment['start'])
54
- end = format_timestamp(segment['end'])
55
- text = segment['text'].strip()
56
-
57
- subtitle = {
58
- "index": idx,
59
- "start": start,
60
- "end": end,
61
- "text": text
62
- }
63
- subtitles.append(subtitle)
64
-
65
- return subtitles
66
 
 
67
  def generate_summary(text, max_len=130, min_len=30):
68
- summarizer = pipeline("summarization")
69
  sentences = sent_tokenize(text)
70
  chunks = [' '.join(sentences[i:i + 10]) for i in range(0, len(sentences), 10)]
71
- summary = ""
72
-
73
- for chunk in chunks:
74
- input_len = len(chunk.split())
75
- dynamic_max_len = min(max_len, max(20, input_len // 2))
76
- dynamic_min_len = min(min_len, dynamic_max_len - 5)
77
-
78
- result = summarizer(
79
- chunk,
80
- max_length=dynamic_max_len,
81
- min_length=dynamic_min_len,
82
- do_sample=False
83
- )[0]["summary_text"]
84
-
85
- summary += result + " "
86
-
87
- return summary.strip()
88
 
 
89
  def generate_quiz(text, num_questions=5):
90
  sentences = sent_tokenize(text)
91
- tfidf = TfidfVectorizer(stop_words='english')
92
  X = tfidf.fit_transform(sentences)
 
93
  quiz = []
94
  used = set()
 
95
  for _ in range(num_questions):
96
  i = random.choice([x for x in range(len(sentences)) if x not in used])
97
  used.add(i)
98
  question = sentences[i]
99
  options = [question]
 
100
  while len(options) < 4:
101
  j = random.randint(0, len(sentences) - 1)
102
  if j != i and sentences[j] not in options:
@@ -107,28 +73,43 @@ def generate_quiz(text, num_questions=5):
107
  "options": options,
108
  "answer": question
109
  })
110
- return "\n\n".join(
111
- [f"Q{i+1}: {q['question']}\nOptions:\n" + "\n".join([f"{chr(65+j)}. {opt}" for j, opt in enumerate(q['options'])]) for i, q in enumerate(quiz)]
112
- )
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  def process_video(video_path, selected_services):
115
  results = {}
116
  print("🔧 Extracting audio...")
117
  audio_path = extract_audio(video_path)
118
 
119
- transcription_result = transcribe_audio(audio_path) if "Transcription" in selected_services else None
120
- transcription_text = transcription_result["text"] if transcription_result else None
121
 
122
- if transcription_result:
123
- results["transcription"] = transcription_text
124
 
125
  if "Summary" in selected_services:
126
- results["summary"] = generate_summary(transcription_text)
127
 
128
  if "Subtitles" in selected_services:
129
- results["subtitles"] = generate_subtitles(transcription_result)
130
 
131
  if "Quiz" in selected_services:
132
- results["quiz"] = generate_quiz(transcription_text)
133
 
134
  return results
 
6
  import re
7
  import random
8
  import subprocess
 
9
  from sklearn.feature_extraction.text import TfidfVectorizer
10
  from nltk.tokenize import sent_tokenize
11
  from nltk.corpus import stopwords
12
 
13
+ # Download NLTK data once
14
  nltk.download('punkt')
15
  nltk.download('stopwords')
16
+
17
+ # Global objects (faster)
18
  stop_words = set(stopwords.words('english'))
19
+ summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=-1) # Use device=0 if GPU
20
 
21
+ # 🧠 Function to download YouTube video
22
  def download_youtube_video(youtube_url, filename="youtube_video.mp4"):
23
  print(f"⬇️ Downloading YouTube video via yt-dlp: {youtube_url}")
24
  command = ["yt-dlp", "-f", "best[ext=mp4]+bestaudio/best", "-o", filename, youtube_url]
 
27
  raise Exception("YouTube download failed: " + result.stderr)
28
  return filename
29
 
30
# 🎧 Audio extraction
def extract_audio(video_path):
    """Extract the audio track of *video_path* into a temporary WAV file.

    Returns the path of the written file ("temp_audio.wav", 16-bit PCM).
    NOTE(review): the output name is fixed, so concurrent calls would
    clobber each other — confirm single-job usage.
    """
    video_clip = VideoFileClip(video_path)
    wav_path = "temp_audio.wav"
    video_clip.audio.write_audiofile(wav_path, codec='pcm_s16le')
    return wav_path
36
 
37
# 📝 Audio transcription
def transcribe_audio(audio_path):
    """Transcribe an audio file to plain text with Whisper's "tiny" model.

    Loads the model on every call (simple but slow for repeated use) and
    returns only the joined transcript text, not the segment metadata.
    """
    asr_model = whisper.load_model("tiny")  # "base.en" trades speed for slightly better quality
    return asr_model.transcribe(audio_path)["text"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
# 📄 Summary generator with batching
def generate_summary(text, max_len=130, min_len=30):
    """Summarize *text* by batching ~10-sentence chunks through the module-level summarizer.

    The chunks are summarized in one pipeline call (batched) and the partial
    summaries are joined with spaces into a single string.
    """
    sents = sent_tokenize(text)
    chunk_size = 10
    batches = []
    for start in range(0, len(sents), chunk_size):
        batches.append(' '.join(sents[start:start + chunk_size]))
    outputs = summarizer(batches, max_length=max_len, min_length=min_len, do_sample=False)
    return " ".join(out["summary_text"] for out in outputs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
+ # ❓ Quiz generator
52
  def generate_quiz(text, num_questions=5):
53
  sentences = sent_tokenize(text)
54
+ tfidf = TfidfVectorizer(stop_words='english', max_features=300)
55
  X = tfidf.fit_transform(sentences)
56
+
57
  quiz = []
58
  used = set()
59
+
60
  for _ in range(num_questions):
61
  i = random.choice([x for x in range(len(sentences)) if x not in used])
62
  used.add(i)
63
  question = sentences[i]
64
  options = [question]
65
+
66
  while len(options) < 4:
67
  j = random.randint(0, len(sentences) - 1)
68
  if j != i and sentences[j] not in options:
 
73
  "options": options,
74
  "answer": question
75
  })
 
 
 
76
 
77
+ return "\n\n".join([
78
+ f"Q{i + 1}: {q['question']}\nOptions:\n" +
79
+ "\n".join([f"{chr(65 + j)}. {opt}" for j, opt in enumerate(q['options'])])
80
+ for i, q in enumerate(quiz)
81
+ ])
82
+
83
# 📺 Subtitle formatter
def generate_subtitles(text, max_words_per_line=10):
    """Format transcript text into numbered subtitle lines.

    Args:
        text: plain transcript text; split into sentences via NLTK.
        max_words_per_line: maximum number of words on one subtitle line.

    Returns:
        A newline-joined string of lines like "1. some words here".

    BUG FIX: the original sliced each sentence STRING by character index
    (`sentence[i:i + max_words_per_line]`), so the parameter actually capped
    characters and split words in half. We now split on whitespace and cap
    the number of words per line, as the parameter name promises.
    """
    subtitles = []
    count = 1
    for sentence in sent_tokenize(text):
        words = sentence.split()
        for i in range(0, len(words), max_words_per_line):
            line = " ".join(words[i:i + max_words_per_line])
            subtitles.append(f"{count}. {line}")
            count += 1
    return "\n".join(subtitles)
94
+
95
# 🧪 Master function
def process_video(video_path, selected_services):
    """Run the selected processing services over a local video file.

    Args:
        video_path: path of a local video file.
        selected_services: iterable of service names; recognized values are
            "Transcription", "Summary", "Subtitles", "Quiz".

    Returns:
        dict keyed by lowercase service name with each service's output.

    BUG FIX: the original only transcribed when "Transcription" was selected,
    yet still passed the (None) transcription into generate_summary /
    generate_subtitles / generate_quiz when any of those was requested on its
    own, crashing inside sent_tokenize. We now transcribe whenever any
    text-dependent service is requested; the raw transcription is still only
    reported in the results when "Transcription" itself was selected.
    """
    results = {}
    print("🔧 Extracting audio...")
    audio_path = extract_audio(video_path)

    selected = set(selected_services)
    # Every service below consumes the transcript text.
    text_services = {"Transcription", "Summary", "Subtitles", "Quiz"}
    transcription = transcribe_audio(audio_path) if selected & text_services else None

    # Preserve original behavior: skip an empty/falsy transcription.
    if "Transcription" in selected and transcription:
        results["transcription"] = transcription

    if "Summary" in selected:
        results["summary"] = generate_summary(transcription)

    if "Subtitles" in selected:
        results["subtitles"] = generate_subtitles(transcription)

    if "Quiz" in selected:
        results["quiz"] = generate_quiz(transcription)

    return results