thomasanto7001 committed on
Commit
064b756
·
verified ·
1 Parent(s): 33cfb41

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -24
app.py CHANGED
@@ -10,15 +10,15 @@ from sklearn.feature_extraction.text import TfidfVectorizer
10
  from nltk.tokenize import sent_tokenize
11
  from nltk.corpus import stopwords
12
 
13
- # Download NLTK data once
14
  nltk.download('punkt_tab')
15
  nltk.download('stopwords')
16
 
17
- # Global objects (faster)
18
- stop_words = set(stopwords.words('english'))
19
- summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=-1) # Use device=0 if GPU
20
 
21
- # 🧠 Function to download YouTube video
22
  def download_youtube_video(youtube_url, filename="youtube_video.mp4"):
23
  print(f"⬇️ Downloading YouTube video via yt-dlp: {youtube_url}")
24
  command = ["yt-dlp", "-f", "best[ext=mp4]+bestaudio/best", "-o", filename, youtube_url]
@@ -27,36 +27,29 @@ def download_youtube_video(youtube_url, filename="youtube_video.mp4"):
27
  raise Exception("YouTube download failed: " + result.stderr)
28
  return filename
29
 
30
- # 🎧 Audio extraction
31
  def extract_audio(video_path):
32
  clip = VideoFileClip(video_path)
33
  audio_path = "temp_audio.wav"
34
- clip.audio.write_audiofile(audio_path, codec='pcm_s16le')
35
  return audio_path
36
 
37
- # 📝 Audio transcription
38
  def transcribe_audio(audio_path):
39
- model = whisper.load_model("tiny.en") # Use "base.en" for slightly better quality
40
- result = model.transcribe(audio_path)
41
  return result["text"]
42
 
43
- # 📄 Summary generator with batching
44
  def generate_summary(text, default_max_len=130, default_min_len=30):
45
- summarizer = pipeline("summarization")
46
  sentences = sent_tokenize(text)
47
-
48
- # Chunk sentences in groups of 10
49
  chunks = [' '.join(sentences[i:i + 10]) for i in range(0, len(sentences), 10)]
50
  summary = ""
51
 
52
  for chunk in chunks:
53
  input_len = len(chunk.split())
54
-
55
- # Dynamically scale max and min length
56
  dynamic_max = max(20, min(default_max_len, input_len - 1))
57
  dynamic_min = max(10, min(default_min_len, dynamic_max - 10))
58
 
59
- # Avoid max_length > input_length error
60
  result = summarizer(
61
  chunk,
62
  max_length=dynamic_max,
@@ -68,12 +61,11 @@ def generate_summary(text, default_max_len=130, default_min_len=30):
68
 
69
  return summary.strip()
70
 
71
- # ❓ Quiz generator
72
  def generate_quiz(text, num_questions=5):
73
  sentences = sent_tokenize(text)
74
  tfidf = TfidfVectorizer(stop_words='english', max_features=300)
75
  X = tfidf.fit_transform(sentences)
76
-
77
  quiz = []
78
  used = set()
79
 
@@ -100,7 +92,7 @@ def generate_quiz(text, num_questions=5):
100
  for i, q in enumerate(quiz)
101
  ])
102
 
103
- # 📺 Subtitle formatter
104
  def generate_subtitles(text, max_words_per_line=10):
105
  sentences = sent_tokenize(text)
106
  subtitles = []
@@ -112,15 +104,14 @@ def generate_subtitles(text, max_words_per_line=10):
112
  count += 1
113
  return "\n".join(subtitles)
114
 
115
- # 🧪 Master function
116
  def process_video(video_path, selected_services):
117
  results = {}
118
  print("🔧 Extracting audio...")
119
  audio_path = extract_audio(video_path)
120
 
121
- transcription = transcribe_audio(audio_path) if "Transcription" in selected_services else None
122
-
123
- if transcription:
124
  results["transcription"] = transcription
125
 
126
  if "Summary" in selected_services:
 
10
  from nltk.tokenize import sent_tokenize
11
  from nltk.corpus import stopwords
12
 
13
+ # 📦 Download NLTK data (ideally run this in setup instead of here)
14
  nltk.download('punkt_tab')
15
  nltk.download('stopwords')
16
 
17
+ # 🔄 Global models (load once)
18
+ whisper_model = whisper.load_model("tiny.en") # Use 'base.en' for better accuracy if on GPU
19
+ summarizer = pipeline("summarization", model="t5-small", device=-1) # Use device=0 if GPU available
20
 
21
+ # 🔽 Download YouTube video
22
  def download_youtube_video(youtube_url, filename="youtube_video.mp4"):
23
  print(f"⬇️ Downloading YouTube video via yt-dlp: {youtube_url}")
24
  command = ["yt-dlp", "-f", "best[ext=mp4]+bestaudio/best", "-o", filename, youtube_url]
 
27
  raise Exception("YouTube download failed: " + result.stderr)
28
  return filename
29
 
30
+ # 🎧 Extract audio from video
31
  def extract_audio(video_path):
32
  clip = VideoFileClip(video_path)
33
  audio_path = "temp_audio.wav"
34
+ clip.audio.write_audiofile(audio_path, codec='pcm_s16le', verbose=False, logger=None)
35
  return audio_path
36
 
37
+ # 📝 Transcribe audio using Whisper
38
  def transcribe_audio(audio_path):
39
+ result = whisper_model.transcribe(audio_path)
 
40
  return result["text"]
41
 
42
+ # 📄 Generate summary in chunks
43
  def generate_summary(text, default_max_len=130, default_min_len=30):
 
44
  sentences = sent_tokenize(text)
 
 
45
  chunks = [' '.join(sentences[i:i + 10]) for i in range(0, len(sentences), 10)]
46
  summary = ""
47
 
48
  for chunk in chunks:
49
  input_len = len(chunk.split())
 
 
50
  dynamic_max = max(20, min(default_max_len, input_len - 1))
51
  dynamic_min = max(10, min(default_min_len, dynamic_max - 10))
52
 
 
53
  result = summarizer(
54
  chunk,
55
  max_length=dynamic_max,
 
61
 
62
  return summary.strip()
63
 
64
+ # ❓ Generate quiz
65
  def generate_quiz(text, num_questions=5):
66
  sentences = sent_tokenize(text)
67
  tfidf = TfidfVectorizer(stop_words='english', max_features=300)
68
  X = tfidf.fit_transform(sentences)
 
69
  quiz = []
70
  used = set()
71
 
 
92
  for i, q in enumerate(quiz)
93
  ])
94
 
95
+ # 📺 Subtitle formatting
96
  def generate_subtitles(text, max_words_per_line=10):
97
  sentences = sent_tokenize(text)
98
  subtitles = []
 
104
  count += 1
105
  return "\n".join(subtitles)
106
 
107
+ # 🧪 Main processor
108
  def process_video(video_path, selected_services):
109
  results = {}
110
  print("🔧 Extracting audio...")
111
  audio_path = extract_audio(video_path)
112
 
113
+ if "Transcription" in selected_services:
114
+ transcription = transcribe_audio(audio_path)
 
115
  results["transcription"] = transcription
116
 
117
  if "Summary" in selected_services: