thomasanto7001 committed on
Commit
9fecb60
·
verified ·
1 Parent(s): 52e85e3
Files changed (1) hide show
  1. app-2.py +94 -0
app-2.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import whisper
3
+ from moviepy.editor import VideoFileClip
4
+ from transformers import pipeline
5
+ import nltk
6
+ import os
7
+ import re
8
+ import random
9
+ import subprocess
10
+ from sklearn.feature_extraction.text import TfidfVectorizer
11
+ from sklearn.metrics.pairwise import cosine_similarity
12
+ from nltk.tokenize import sent_tokenize
13
+ from nltk.corpus import stopwords
14
+
15
# One-time NLTK setup: fetch the sentence-tokenizer and stopword corpora,
# then build the English stopword set.
for _corpus in ('punkt', 'stopwords'):
    nltk.download(_corpus)
stop_words = set(stopwords.words('english'))
18
+
19
def download_youtube_video(youtube_url, filename="youtube_video.mp4"):
    """Download a YouTube video to *filename* using the yt-dlp CLI.

    Args:
        youtube_url: URL of the video to download.
        filename: Output path for the downloaded file.

    Returns:
        The output filename on success.

    Raises:
        RuntimeError: if yt-dlp exits with a non-zero status; its stderr is
            included in the message. (RuntimeError subclasses Exception, so
            callers catching the original broad Exception still work.)
    """
    print(f"⬇️ Downloading YouTube video via yt-dlp: {youtube_url}")
    # NOTE(review): "best[ext=mp4]+bestaudio/best" — the "+" merge path needs
    # ffmpeg and may produce a non-mp4 container despite the .mp4 filename;
    # confirm against yt-dlp's format-selection / --merge-output-format docs.
    command = ["yt-dlp", "-f", "best[ext=mp4]+bestaudio/best", "-o", filename, youtube_url]
    # List argv (shell=False) avoids shell injection via the URL.
    result = subprocess.run(command, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError("YouTube download failed: " + result.stderr)
    return filename
26
+
27
def extract_audio(video_path):
    """Extract the audio track of *video_path* to a 16-bit PCM WAV file.

    Args:
        video_path: Path to a video file readable by moviepy.

    Returns:
        The path of the written audio file ("temp_audio.wav").
    """
    audio_path = "temp_audio.wav"
    clip = VideoFileClip(video_path)
    try:
        clip.audio.write_audiofile(audio_path, codec='pcm_s16le')
    finally:
        # Fix: the original never closed the clip, leaking moviepy's ffmpeg
        # reader resources on every call.
        clip.close()
    return audio_path
32
+
33
def transcribe_audio(audio_path):
    """Transcribe an audio file to text with Whisper.

    Fix: the original re-loaded the model weights on every call; the "base"
    model is now loaded lazily once and cached on the function object.

    Args:
        audio_path: Path to an audio file Whisper can read.

    Returns:
        The transcribed text.
    """
    model = getattr(transcribe_audio, "_model", None)
    if model is None:
        model = whisper.load_model("base")
        transcribe_audio._model = model
    result = model.transcribe(audio_path)
    return result["text"]
37
+
38
def generate_summary(text, max_len=130):
    """Summarize *text* by splitting it into 10-sentence chunks and
    summarizing each chunk with a Hugging Face summarization pipeline.

    Fix: the original rebuilt the (expensive) pipeline on every call; it is
    now created lazily once and cached on the function object.

    Args:
        text: Input text to summarize.
        max_len: ``max_length`` passed to the summarization pipeline.

    Returns:
        The concatenated chunk summaries, whitespace-trimmed.
    """
    summarizer = getattr(generate_summary, "_summarizer", None)
    if summarizer is None:
        summarizer = pipeline("summarization")
        generate_summary._summarizer = summarizer
    sentences = sent_tokenize(text)
    # NOTE(review): 10 sentences per chunk may still exceed the model's input
    # token limit for long sentences — confirm against the model's max length.
    chunks = [' '.join(sentences[i:i + 10]) for i in range(0, len(sentences), 10)]
    parts = [
        summarizer(chunk, max_length=max_len, min_length=30, do_sample=False)[0]["summary_text"]
        for chunk in chunks
    ]
    return " ".join(parts).strip()
46
+
47
def _format_srt_time(total_seconds):
    """Format a second count as an SRT timestamp (HH:MM:SS,mmm)."""
    hours, rem = divmod(total_seconds, 3600)
    minutes, seconds = divmod(rem, 60)
    return f"{hours:02}:{minutes:02}:{seconds:02},000"

def generate_subtitles(text):
    """Build naive SRT subtitles: each sentence gets a fixed 5-second slot.

    Fix: the original wrote the raw second count into the seconds field
    (e.g. "00:00:75,000" after one minute), which is invalid SRT; overflow
    now carries into the minutes and hours fields.

    Args:
        text: Transcript text to split into subtitle cues.

    Returns:
        The subtitles as a single SRT-formatted string.
    """
    sentences = sent_tokenize(text)
    subtitles = []
    for i, sentence in enumerate(sentences):
        start = _format_srt_time(i * 5)
        end = _format_srt_time(i * 5 + 5)
        subtitles.append(f"{i + 1}\n{start} --> {end}\n{sentence}\n")
    return "\n".join(subtitles)
55
+
56
def generate_quiz(text, num_questions=5):
    """Build a simple multiple-choice quiz from the sentences of *text*.

    Each question is a sentence; its distractor options are other sentences
    picked at random. (The original also fit an unused TF-IDF matrix here;
    removed as dead code.)

    Fixes:
      * empty text no longer raises IndexError from ``random.choice([])``;
      * ``num_questions`` larger than the sentence count no longer raises —
        it is capped at the available sentences;
      * fewer than 4 distinct sentences no longer hangs the option-picking
        ``while`` loop — the option count is capped at what exists.

    Args:
        text: Source text.
        num_questions: Desired number of questions (default 5).

    Returns:
        A formatted multi-line quiz string ("" for empty text).
    """
    sentences = sent_tokenize(text)
    if not sentences:
        return ""
    num_questions = min(num_questions, len(sentences))
    option_cap = min(4, len(set(sentences)))
    quiz = []
    used = set()
    for _ in range(num_questions):
        i = random.choice([x for x in range(len(sentences)) if x not in used])
        used.add(i)
        question = sentences[i]
        options = [question]
        while len(options) < option_cap:
            j = random.randint(0, len(sentences) - 1)
            if j != i and sentences[j] not in options:
                options.append(sentences[j])
        random.shuffle(options)
        quiz.append({
            "question": question,
            "options": options,
            "answer": question,
        })
    return "\n\n".join(
        f"Q{i+1}: {q['question']}\nOptions:\n"
        + "\n".join(f"{chr(65+j)}. {opt}" for j, opt in enumerate(q['options']))
        for i, q in enumerate(quiz)
    )
80
+
81
def process_video(video_path, selected_services):
    """Run the selected pipeline stages over a local video file.

    Fixes:
      * Summary/Subtitles/Quiz all require the transcript, but the original
        only produced it when "Transcription" itself was selected — selecting
        e.g. only "Summary" silently returned {}. The transcript is now
        produced whenever any dependent service is requested; the
        "transcription" result key is still only emitted when requested.
      * The temporary WAV written by extract_audio is now removed.

    Args:
        video_path: Path to the input video file.
        selected_services: Collection that may contain "Transcription",
            "Summary", "Subtitles", "Quiz".

    Returns:
        Dict with a key per selected service that produced output.
    """
    results = {}
    print("🔧 Extracting audio...")
    audio_path = extract_audio(video_path)
    try:
        needed = {"Transcription", "Summary", "Subtitles", "Quiz"}
        transcription = (
            transcribe_audio(audio_path)
            if needed.intersection(selected_services)
            else None
        )
    finally:
        # Best-effort cleanup of the temp file left by extract_audio.
        if os.path.exists(audio_path):
            os.remove(audio_path)
    if transcription:
        if "Transcription" in selected_services:
            results["transcription"] = transcription
        if "Summary" in selected_services:
            results["summary"] = generate_summary(transcription)
        if "Subtitles" in selected_services:
            results["subtitles"] = generate_subtitles(transcription)
        if "Quiz" in selected_services:
            results["quiz"] = generate_quiz(transcription)
    return results