"""Download a video's audio with yt-dlp, transcribe it with Whisper, and summarize it."""

import json
import os
import re
import shutil
import subprocess
import tempfile
from concurrent.futures import ThreadPoolExecutor
from hashlib import md5

import browser_cookie3
import whisper
from transformers import pipeline

class VideoProcessor:
    """Download a video's audio, transcribe it with Whisper and summarize it.

    The summarization pipeline is created once in ``__init__``; Whisper
    models are loaded lazily and kept in ``self.models`` keyed by size.
    Results of :meth:`process` are cached on disk as ``cache_<id>.json``
    in the current working directory.
    """

    # Prefix of the per-download temp directories. Cleanup only ever removes
    # directories carrying this prefix, so a path that did not come from
    # this class is never deleted.
    _TEMP_PREFIX = "video_audio_"

    # Compiled once; used by clean_transcript on every call.
    _FILLER_RE = re.compile(r'\b(um|uh|like|you know)\b', flags=re.IGNORECASE)
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self):
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        self.models = {}
        self.cookie_file = "cookies.txt"  # Path to your cookies file

    def load_model(self, model_size="base"):
        """Return the Whisper model for *model_size*, loading it on first use."""
        if model_size not in self.models:
            self.models[model_size] = whisper.load_model(model_size)
        return self.models[model_size]

    def _run_yt_dlp(self, url, options, failure_label):
        """Run yt-dlp into a fresh temp directory and return the mp3 path.

        Args:
            url: Video URL handed to yt-dlp.
            options: yt-dlp options placed before the output template.
            failure_label: Prefix of the error message on a non-zero exit.

        Raises:
            Exception: If yt-dlp exits non-zero or produced no ``.mp3`` file.
        """
        download_dir = tempfile.mkdtemp(prefix=self._TEMP_PREFIX)
        cmd = [
            "yt-dlp",
            *options,
            "-o", os.path.join(download_dir, "audio.%(ext)s"),
            # "--" ends option parsing so a URL starting with "-" cannot be
            # interpreted as a yt-dlp option.
            "--",
            url,
        ]
        try:
            completed = subprocess.run(cmd, capture_output=True, text=True)
            if completed.returncode != 0:
                raise Exception(f"{failure_label}: {completed.stderr}")
            # Search only this download's directory, never the shared temp root.
            return self._find_downloaded_file(download_dir)
        except BaseException:
            # Nothing usable was produced; do not leave the directory behind.
            shutil.rmtree(download_dir, ignore_errors=True)
            raise

    def _download_with_cookies(self, url):
        """Method 1: Download using the cookies file at ``self.cookie_file``."""
        options = [
            "--cookies", self.cookie_file,
            "--extract-audio",
            "--audio-format", "mp3",
            "--audio-quality", "0",
            "--quiet",
        ]
        return self._run_yt_dlp(url, options, "Cookie download failed")

    def _download_with_yt_dlp(self, url):
        """Method 2: Regular download without cookies."""
        options = [
            "--extract-audio",
            "--audio-format", "mp3",
            "--quiet",
        ]
        return self._run_yt_dlp(url, options, "Download failed")

    def _find_downloaded_file(self, search_dir=None):
        """Return the path of the first ``.mp3`` found under *search_dir*.

        Args:
            search_dir: Directory to walk. Defaults to the system temp
                directory for backward compatibility; pass the download
                directory to avoid picking up unrelated files.

        Raises:
            Exception: If no ``.mp3`` file exists under *search_dir*.
        """
        if search_dir is None:
            search_dir = tempfile.gettempdir()
        for root, _, files in os.walk(search_dir):
            for file in files:
                if file.endswith('.mp3'):
                    return os.path.join(root, file)
        raise Exception("Downloaded audio file not found")

    def _discard_download(self, audio_path):
        """Remove the temp directory holding *audio_path*, if this class made it."""
        parent = os.path.dirname(os.path.abspath(audio_path))
        if os.path.basename(parent).startswith(self._TEMP_PREFIX):
            shutil.rmtree(parent, ignore_errors=True)

    def download_audio(self, url, use_cookies=False):
        """Download the audio of *url* as mp3, falling back between methods.

        The cookie-based download is tried first when *use_cookies* is true
        and the cookie file exists; if it fails, the regular download is
        attempted.

        Returns:
            Path of the downloaded ``.mp3`` file.

        Raises:
            Exception: If every attempted method failed; the message lists
                each failure.
        """
        attempts = []
        if use_cookies and os.path.exists(self.cookie_file):
            attempts.append(self._download_with_cookies)
        attempts.append(self._download_with_yt_dlp)

        errors = []
        last_error = None
        for attempt in attempts:
            try:
                return attempt(url)
            except Exception as exc:
                errors.append(str(exc))
                last_error = exc
        raise Exception(
            f"All download methods failed: {'; '.join(errors)}"
        ) from last_error

    def transcribe_audio(self, audio_path, model_size="base"):
        """Transcribe *audio_path* with the Whisper model of *model_size*."""
        model = self.load_model(model_size)
        result = model.transcribe(audio_path)
        return result["text"]

    def clean_transcript(self, text):
        """Drop filler words (um, uh, like, you know) and collapse whitespace."""
        without_fillers = self._FILLER_RE.sub('', text)
        return self._WHITESPACE_RE.sub(' ', without_fillers).strip()

    def summarize_chunk(self, chunk):
        """Summarize one chunk of text with the summarization pipeline."""
        # truncation=True keeps over-long input within the model's limit
        # instead of failing inside the model.
        outputs = self.summarizer(
            chunk, max_length=150, min_length=30, truncation=True
        )
        return outputs[0]['summary_text']

    def summarize_text(self, text, chunk_size=1000):
        """Clean *text*, summarize it in chunks and join the summaries.

        Args:
            text: Raw transcript.
            chunk_size: Number of characters per chunk; must be positive.

        Returns:
            The chunk summaries joined by newlines, or ``""`` when the
            cleaned text is empty.

        Raises:
            ValueError: If *chunk_size* is less than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")

        text = self.clean_transcript(text)
        chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
        if not chunks:
            return ""

        with ThreadPoolExecutor(max_workers=4) as executor:
            summaries = list(executor.map(self.summarize_chunk, chunks))

        return "\n".join(summaries)

    def extract_key_points(self, text):
        """Return bullet-style key points derived from the first 8000 characters."""
        prompt = f"""Extract 5-7 key points from this transcript. Each point should:
        - Start with a bullet (-)
        - Be concise but specific
        - Include numbers/dates when mentioned
        
        Transcript:
        {text[:8000]}
        
        Key Points:"""

        # The prompt can be far longer than the model accepts; let the
        # pipeline truncate it rather than fail.
        result = self.summarizer(
            prompt, max_length=300, min_length=100, truncation=True
        )[0]['summary_text']
        # Prefix the start of the text and every line that begins with a
        # word character with "- ".
        return re.sub(r'(^|\n)(?=\w)', '\n- ', result)

    def get_video_id(self, url):
        """Return the MD5 hex digest of *url*, used as the cache key."""
        return md5(url.encode()).hexdigest()

    def _read_cache(self, cache_file):
        """Return the cached result in *cache_file*, or ``None`` if unusable."""
        try:
            with open(cache_file, encoding="utf-8") as f:
                return json.load(f)
        except (OSError, ValueError):
            # Missing, unreadable or corrupt cache is a cache miss.
            return None

    def process(self, youtube_url, chunk_size=1000, model_size="base", use_cookies=False):
        """Download, transcribe and summarize the video at *youtube_url*.

        Args:
            youtube_url: Video URL.
            chunk_size: Characters per summarization chunk.
            model_size: Whisper model size to transcribe with.
            use_cookies: Try the cookie-based download first.

        Returns:
            A dict with ``summary``, ``key_points`` and ``transcript`` (the
            transcript is cut to 2000 characters), or ``{'error': message}``
            if any step failed.
        """
        # NOTE: the cache key depends only on the URL, so a cached result is
        # returned regardless of chunk_size / model_size.
        video_id = self.get_video_id(youtube_url)
        cache_file = f"cache_{video_id}.json"

        cached = self._read_cache(cache_file)
        if cached is not None:
            return cached

        try:
            audio_path = self.download_audio(youtube_url, use_cookies)
            try:
                transcript = self.transcribe_audio(audio_path, model_size)
            finally:
                # The audio is only needed for transcription.
                self._discard_download(audio_path)

            result = {
                'summary': self.summarize_text(transcript, chunk_size),
                'key_points': self.extract_key_points(transcript),
                'transcript': transcript[:2000] + ("..." if len(transcript) > 2000 else "")
            }

            with open(cache_file, 'w', encoding="utf-8") as f:
                json.dump(result, f)

            return result

        except Exception as e:
            return {'error': str(e)}