# Source: Hugging Face Space file view (Space status: Sleeping). File size: 4,887 bytes.
import os
import tempfile
import whisper
import subprocess
from transformers import pipeline
from concurrent.futures import ThreadPoolExecutor
import re
import json
from hashlib import md5
import browser_cookie3
class VideoProcessor:
    """Download a video's audio, transcribe it and summarize the transcript.

    The pipeline is: ``yt-dlp`` (audio download) -> Whisper (speech to text)
    -> a Hugging Face summarization pipeline (summary and key points).
    Results of :meth:`process` are cached as JSON files in the current
    working directory.
    """

    # Prefix of the temporary directories this class creates, so that cleanup
    # never touches a directory it does not own.
    _TEMP_PREFIX = "video_audio_"

    # Defaults of ``process``; results for these settings keep the legacy
    # cache file name so existing cache files stay valid.
    _DEFAULT_CHUNK_SIZE = 1000
    _DEFAULT_MODEL_SIZE = "base"

    # Compiled once instead of on every call.
    _FILLER_RE = re.compile(r'\b(um|uh|like|you know)\b', re.IGNORECASE)
    _WHITESPACE_RE = re.compile(r'\s+')
    _BULLET_RE = re.compile(r'(^|\n)(?=\w)')
    _UNSAFE_FILENAME_RE = re.compile(r'[^A-Za-z0-9_.-]')

    def __init__(self):
        """Create the summarization pipeline and an empty Whisper model cache."""
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        self.models = {}
        self.cookie_file = "cookies.txt"  # Path to your cookies file

    def load_model(self, model_size: str = "base"):
        """Return the Whisper model for *model_size*, loading it on first use."""
        if model_size not in self.models:
            self.models[model_size] = whisper.load_model(model_size)
        return self.models[model_size]

    @staticmethod
    def _discard_dir(path: str) -> None:
        """Best-effort recursive removal of a temporary directory."""
        # Imported here because the module header does not import shutil.
        import shutil

        shutil.rmtree(path, ignore_errors=True)

    def _run_yt_dlp(self, url: str, options, failure_label: str) -> str:
        """Run ``yt-dlp`` with *options* into a fresh temp dir; return the mp3 path.

        Raises:
            RuntimeError: if ``yt-dlp`` exits with a non-zero status.
            FileNotFoundError: if no mp3 was produced (or ``yt-dlp`` is missing).
        """
        work_dir = tempfile.mkdtemp(prefix=self._TEMP_PREFIX)
        cmd = [
            "yt-dlp",
            *options,
            "--quiet",
            "-o", os.path.join(work_dir, "audio.%(ext)s"),
            "--",  # end of options: a URL starting with "-" is not parsed as a flag
            url,
        ]
        try:
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode != 0:
                raise RuntimeError(f"{failure_label}: {result.stderr}")
            # Look only in the directory this download wrote to, so a stale
            # mp3 elsewhere in the system temp dir is never picked up.
            return self._find_downloaded_file(work_dir)
        except Exception:
            # Do not leak the temp directory when the download fails.
            self._discard_dir(work_dir)
            raise

    def _download_with_cookies(self, url: str) -> str:
        """Method 1: Download using the cookies file at ``self.cookie_file``."""
        options = [
            "--cookies", self.cookie_file,
            "--extract-audio",
            "--audio-format", "mp3",
            "--audio-quality", "0",
        ]
        return self._run_yt_dlp(url, options, "Cookie download failed")

    def _download_with_yt_dlp(self, url: str) -> str:
        """Method 2: Regular download without cookies."""
        options = [
            "--extract-audio",
            "--audio-format", "mp3",
        ]
        return self._run_yt_dlp(url, options, "Download failed")

    def _find_downloaded_file(self, search_dir=None) -> str:
        """Return the path of the first ``.mp3`` file found under *search_dir*.

        Args:
            search_dir: Directory to search recursively. When omitted, the
                system temp directory is searched (the historical behaviour).

        Raises:
            FileNotFoundError: if no mp3 file exists under *search_dir*.
        """
        base_dir = tempfile.gettempdir() if search_dir is None else search_dir
        for root, dirs, files in os.walk(base_dir):
            dirs.sort()  # deterministic traversal order
            for name in sorted(files):
                if name.endswith('.mp3'):
                    return os.path.join(root, name)
        raise FileNotFoundError("Downloaded audio file not found")

    def download_audio(self, url: str, use_cookies: bool = False) -> str:
        """Download the audio track of *url* and return the mp3 path.

        When *use_cookies* is true and the cookies file exists, the
        cookie-based download is tried first; if it fails, the regular
        download is used as a fallback.

        Raises:
            RuntimeError: if every download method fails. The message lists
                each failure and the last one is chained as the cause.
        """
        methods = []
        if use_cookies and os.path.exists(self.cookie_file):
            methods.append(self._download_with_cookies)
        methods.append(self._download_with_yt_dlp)

        failures = []
        last_error = None
        for method in methods:
            try:
                return method(url)
            except Exception as exc:  # boundary: any failure triggers the next method
                failures.append(str(exc))
                last_error = exc
        raise RuntimeError(
            f"All download methods failed: {'; '.join(failures)}"
        ) from last_error

    def transcribe_audio(self, audio_path: str, model_size: str = "base") -> str:
        """Transcribe *audio_path* with the Whisper model *model_size*."""
        model = self.load_model(model_size)
        result = model.transcribe(audio_path)
        return result["text"]

    def clean_transcript(self, text: str) -> str:
        """Remove filler words ("um", "uh", "like", "you know") and collapse whitespace."""
        without_fillers = self._FILLER_RE.sub('', text)
        return self._WHITESPACE_RE.sub(' ', without_fillers).strip()

    def summarize_chunk(self, chunk: str) -> str:
        """Summarize one chunk of text with the summarization pipeline."""
        # truncation=True keeps an oversized chunk within the model's input
        # limit instead of failing inside the model.
        output = self.summarizer(chunk, max_length=150, min_length=30, truncation=True)
        return output[0]['summary_text']

    @staticmethod
    def _split_into_chunks(text: str, chunk_size: int):
        """Split *text* into pieces of at most *chunk_size* characters.

        Pieces end at a space where possible so words are not cut in half;
        a single word longer than *chunk_size* is split hard.
        """
        chunks = []
        start = 0
        total = len(text)
        while start < total:
            end = min(start + chunk_size, total)
            if end < total:
                # Back up to the last space inside (or right after) the window.
                cut = text.rfind(" ", start, end + 1)
                if cut > start:
                    end = cut
            piece = text[start:end].strip()
            if piece:
                chunks.append(piece)
            start = end
        return chunks

    def summarize_text(self, text: str, chunk_size: int = 1000) -> str:
        """Clean *text*, summarize it chunk by chunk and join the summaries.

        Args:
            text: Raw transcript.
            chunk_size: Maximum number of characters per summarized chunk.

        Returns:
            The chunk summaries joined by newlines ("" for empty input).

        Raises:
            ValueError: if *chunk_size* is not positive.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        chunks = self._split_into_chunks(self.clean_transcript(text), chunk_size)
        if not chunks:
            return ""
        with ThreadPoolExecutor(max_workers=4) as executor:
            summaries = list(executor.map(self.summarize_chunk, chunks))
        return "\n".join(summaries)

    def extract_key_points(self, text: str) -> str:
        """Return a bulleted list of key points derived from *text*.

        Only the first 8000 characters of *text* are used. Every output line
        that starts with a word character is prefixed with "- ".
        """
        prompt = (
            "Extract 5-7 key points from this transcript. Each point should:\n"
            "- Start with a bullet (-)\n"
            "- Be concise but specific\n"
            "- Include numbers/dates when mentioned\n"
            "Transcript:\n"
            f"{text[:8000]}\n"
            "Key Points:"
        )
        # The prompt can be far longer than the model's input limit, so it
        # must be truncated rather than passed through unchecked.
        output = self.summarizer(prompt, max_length=300, min_length=100, truncation=True)
        summary = output[0]['summary_text']
        # r'\1- ' keeps existing line breaks without adding a blank first line.
        return self._BULLET_RE.sub(r'\1- ', summary)

    def get_video_id(self, url: str) -> str:
        """Return a stable identifier for *url* (hex MD5 digest, used as cache key)."""
        return md5(url.encode()).hexdigest()

    def _cache_path(self, video_id: str, chunk_size, model_size) -> str:
        """Return the cache file name for a video and its processing settings.

        Default settings map to the legacy name ``cache_<id>.json``; any other
        combination gets its own file so that results produced with different
        settings are never returned for each other.
        """
        if chunk_size == self._DEFAULT_CHUNK_SIZE and model_size == self._DEFAULT_MODEL_SIZE:
            return f"cache_{video_id}.json"
        safe_model = self._UNSAFE_FILENAME_RE.sub('_', str(model_size))
        return f"cache_{video_id}_{safe_model}_{chunk_size}.json"

    @staticmethod
    def _read_cache(cache_file: str):
        """Return the cached result, or None if it is missing or unreadable."""
        try:
            with open(cache_file, encoding="utf-8") as f:
                return json.load(f)
        except (OSError, ValueError):
            # Missing or corrupt cache: recompute instead of crashing.
            return None

    @staticmethod
    def _write_cache(cache_file: str, result) -> None:
        """Persist *result*; caching is best-effort and never fails the request."""
        try:
            with open(cache_file, 'w', encoding="utf-8") as f:
                json.dump(result, f)
        except OSError:
            # An unwritable cache must not discard an already computed result.
            pass

    def _cleanup_audio(self, audio_path: str) -> None:
        """Remove the temp directory of a downloaded file, if this class made it."""
        parent = os.path.dirname(audio_path)
        if os.path.basename(parent).startswith(self._TEMP_PREFIX):
            self._discard_dir(parent)

    def process(self, youtube_url, chunk_size=1000, model_size="base", use_cookies=False):
        """Run the full pipeline for *youtube_url* and return the result dict.

        Args:
            youtube_url: URL of the video to process.
            chunk_size: Maximum characters per summarized chunk.
            model_size: Whisper model name.
            use_cookies: Try the cookie-based download first.

        Returns:
            ``{'summary', 'key_points', 'transcript'}`` on success (the
            transcript is a preview capped at 2000 characters), or
            ``{'error': message}`` if any step fails. Successful results are
            cached on disk and returned from the cache on later calls.
        """
        video_id = self.get_video_id(youtube_url)
        cache_file = self._cache_path(video_id, chunk_size, model_size)
        cached = self._read_cache(cache_file)
        if cached is not None:
            return cached

        try:
            audio_path = self.download_audio(youtube_url, use_cookies)
            try:
                transcript = self.transcribe_audio(audio_path, model_size)
            finally:
                # The audio is only needed for transcription.
                self._cleanup_audio(audio_path)
            result = {
                'summary': self.summarize_text(transcript, chunk_size),
                'key_points': self.extract_key_points(transcript),
                'transcript': transcript[:2000] + ("..." if len(transcript) > 2000 else ""),
            }
        except Exception as e:  # top-level boundary: report the failure to the caller
            return {'error': str(e)}

        self._write_cache(cache_file, result)
        return result