Spaces:

TorchLLM
/

GeminiRAG

Build error

File size: 4,168 Bytes

d9e3edb

import os
import traceback
from urllib.parse import urlparse

import requests
import yt_dlp
from bs4 import BeautifulSoup
from download_video import downlaod_video_from_url
from pytube import YouTube


def download_youtube_video(url, download_path="../data/"):
    """Download a progressive MP4 stream of a YouTube video via pytube.

    Args:
        url: YouTube video URL.
        download_path: Directory passed to pytube as the output path.

    Returns:
        None. Progress and failures are reported with ``print``; any
        exception raised while querying or downloading is caught and
        printed rather than propagated.
    """
    try:
        # Only progressive (audio + video in one file) MP4 streams are considered.
        candidates = YouTube(url).streams.filter(
            progressive=True, file_extension="mp4"
        )
        # First entry after sorting by resolution, descending.
        chosen = candidates.order_by("resolution").desc().first()

        if not chosen:
            print("No suitable video stream found")
            return

        chosen.download(output_path=download_path)
        print(f"Video downloaded successfully to {download_path}")
    except Exception as exc:
        print(f"Error in downloading YouTube video: {exc}")


def download_audio(url, download_path="../data/"):
    """
    Download audio from YouTube and convert to MP3 format.

    Errors raised by yt-dlp are caught and printed; nothing is returned.

    Args:
        url: YouTube video URL
        download_path: Path where the MP3 file will be saved
    """
    ydl_opts = {
        # os.path.join keeps the template valid whether or not download_path
        # ends with a path separator.
        "outtmpl": os.path.join(download_path, "%(title)s.%(ext)s"),
        "format": "bestaudio/best",
        # YoutubeDL option keys use the Python API spelling, not the CLI flag
        # spelling: "geo-bypass" / "force-ipv4" / "headers" are silently
        # ignored by YoutubeDL.
        "geo_bypass": True,
        "noplaylist": True,
        # API equivalent of the --force-ipv4 CLI flag: bind to an IPv4 address.
        "source_address": "0.0.0.0",
        # Add postprocessors for MP3 conversion
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "mp3",
                "preferredquality": "192",
            }
        ],
        "http_headers": {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        },
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        print(f"Audio downloaded and converted to MP3 successfully at {download_path}")
    except Exception as e:
        print(f"An error occurred: {e}")


# Function to download PDF, DOC, or other files
def download_file(url, download_path="../data/"):
    """Stream the resource at ``url`` to a file inside ``download_path``.

    The local file name is the last segment of the URL *path*; the query
    string and fragment are ignored so that URLs such as
    ``.../report.pdf?token=abc`` are saved as ``report.pdf``. If the path has
    no usable last segment (e.g. ``https://host/``), ``downloaded_file`` is
    used instead.

    Args:
        url: URL of the file to fetch.
        download_path: Directory to write into; created if it does not exist.

    Returns:
        None. Success and failure are reported with ``print``; exceptions are
        caught and printed rather than propagated.
    """
    try:
        url_path = urlparse(url).path
        # rstrip("/") so a trailing slash does not yield an empty name, which
        # would make open() target the directory itself.
        name = os.path.basename(url_path.rstrip("/")) or "downloaded_file"
        filename = os.path.join(download_path, name)

        # Context manager releases the streamed connection; the timeout keeps
        # an unresponsive server from blocking forever.
        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()  # Check if the request was successful

            if download_path:
                os.makedirs(download_path, exist_ok=True)

            with open(filename, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        file.write(chunk)
        print(f"File downloaded successfully to {filename}")
    except Exception as e:
        print(f"An error occurred: {e}")


# Function to download text or webpage content
def download_text_or_webpage(url, download_path="../data/", is_text=False):
    """Fetch ``url`` and save its body as a ``.txt`` or prettified ``.html`` file.

    The file name is the last ``/``-separated segment of the URL (trailing
    slashes are ignored) with ``.txt`` or ``.html`` appended.

    Args:
        url: URL to fetch.
        download_path: Directory to write into; created if it does not exist.
        is_text: When true, write the response text verbatim as ``.txt``;
            otherwise parse it with BeautifulSoup and write the prettified
            markup as ``.html``.

    Returns:
        None. Success and failure are reported with ``print``; exceptions are
        caught and printed rather than propagated.
    """
    try:
        # Timeout keeps an unresponsive server from blocking forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # Strip trailing slashes so ".../docs/" is saved as "docs.<ext>"
        # instead of a bare ".<ext>" file.
        base_name = url.rstrip("/").split("/")[-1]

        if download_path:
            os.makedirs(download_path, exist_ok=True)

        if is_text:
            filename = os.path.join(download_path, base_name + ".txt")
            # Explicit UTF-8 (matching the HTML branch) so non-ASCII text does
            # not depend on the platform's default encoding.
            with open(filename, "w", encoding="utf-8") as file:
                file.write(response.text)
            print(f"Text file downloaded successfully to {filename}")
        else:
            soup = BeautifulSoup(response.text, "html.parser")
            filename = os.path.join(download_path, base_name + ".html")
            with open(filename, "w", encoding="utf-8") as file:
                file.write(soup.prettify())
            print(f"Webpage downloaded successfully to {filename}")

    except Exception as e:
        print(f"An error occurred: {e}")


def main():
    """Run the sample audio download; other usages are kept as commented examples."""
    # Active example: download the audio track of a YouTube video.
    audio_url = "https://www.youtube.com/watch?v=8OHYynw7Yh4"
    download_audio(audio_url)

    # Other examples (uncomment to try):
    #
    # Video:
    #   url_video = "https://www.youtube.com/watch?v=dIYmzf21d1g"
    #   downlaod_video_from_url(youtube_url=url_video, download_path="../data/")
    #
    # PDF, DOC, or any other file:
    #   url_pdf = "https://example.com/somefile.pdf"
    #   download_file(url_pdf)
    #
    # Plain text:
    #   url_text = "https://example.com/sometextfile"
    #   download_text_or_webpage(url_text, is_text=True)
    #
    # Webpage content:
    #   url_webpage = "https://en.wikipedia.org/wiki/Microsoft"
    #   download_text_or_webpage(url_webpage)


# Run the example in main() only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()