# GeminiRAG/src/utils/download.py
# Repository page header: TorchLLM - "Initial commit for deploying the project" (d9e3edb)
import os
import traceback
import requests
import yt_dlp
from bs4 import BeautifulSoup
from download_video import downlaod_video_from_url
from pytube import YouTube
def download_youtube_video(url, download_path="../data/"):
    """Download a YouTube video as a progressive MP4 using pytube.

    The progressive MP4 streams are ordered by resolution, descending, and
    the first one is downloaded. Any exception is caught and reported on
    stdout; the function always returns None.

    Args:
        url: URL of the YouTube video.
        download_path: Directory handed to pytube as ``output_path``.
    """
    try:
        progressive_mp4 = YouTube(url).streams.filter(
            progressive=True, file_extension="mp4"
        )
        best_stream = progressive_mp4.order_by("resolution").desc().first()

        # Guard clause: nothing to download when no stream matched the filter.
        if not best_stream:
            print("No suitable video stream found")
            return

        best_stream.download(output_path=download_path)
        print(f"Video downloaded successfully to {download_path}")
    except Exception as err:
        print(f"Error in downloading YouTube video: {err}")
def download_audio(url, download_path="../data/"):
    """Download the audio track of a YouTube video and convert it to MP3.

    The download is delegated to yt-dlp; the ``FFmpegExtractAudio``
    post-processor performs the MP3 conversion (it needs FFmpeg to be
    available to yt-dlp). Any exception is caught and reported on stdout;
    the function always returns None.

    Args:
        url: YouTube video URL.
        download_path: Directory where the MP3 file will be saved. A
            trailing path separator is optional.
    """
    ydl_opts = {
        # os.path.join keeps the template valid whether or not download_path
        # ends with a separator.
        "outtmpl": os.path.join(download_path, "%(title)s.%(ext)s"),
        "format": "bestaudio/best",
        # The YoutubeDL Python API uses underscore parameter names; the
        # hyphenated CLI flag spellings are not recognised as options.
        "geo_bypass": True,
        "noplaylist": True,
        # API equivalent of the --force-ipv4 CLI flag: bind to an IPv4 address.
        "source_address": "0.0.0.0",
        # Add postprocessors for MP3 conversion
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "mp3",
                "preferredquality": "192",
            }
        ],
        # Custom request headers are passed through "http_headers".
        "http_headers": {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        },
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        print(f"Audio downloaded and converted to MP3 successfully at {download_path}")
    except Exception as e:
        print(f"An error occurred: {e}")
# Function to download PDF, DOC, or other files
def download_file(url, download_path="../data/"):
    """Stream a remote file (PDF, DOC, ...) to disk.

    The local file name is the last segment of the URL path (query string
    and fragment excluded, percent-escapes decoded). When the path has no
    usable segment, the host name is used instead. Any exception is caught
    and reported on stdout; the function always returns None.

    Args:
        url: URL of the file to download.
        download_path: Directory to save into; created if it does not exist.
    """
    from urllib.parse import unquote, urlparse

    try:
        parsed = urlparse(url)
        # basename() after unquote() so an encoded "/" cannot smuggle extra
        # path components into the local file name.
        name = os.path.basename(unquote(parsed.path).rstrip("/"))
        if not name:
            name = parsed.netloc or "download"

        if download_path:
            os.makedirs(download_path, exist_ok=True)
        filename = os.path.join(download_path, name)

        # The context manager releases the streamed connection even when an
        # error occurs mid-transfer; the timeout prevents hanging forever.
        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()  # Check if the request was successful
            with open(filename, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        file.write(chunk)
        print(f"File downloaded successfully to {filename}")
    except Exception as e:
        print(f"An error occurred: {e}")
# Function to download text or webpage content
def download_text_or_webpage(url, download_path="../data/", is_text=False):
    """Fetch a URL and save its body as a ``.txt`` or prettified ``.html`` file.

    The local file name is the last segment of the URL path (query string
    and fragment excluded, percent-escapes decoded) plus the extension.
    When the path has no usable segment, the host name is used as the base
    name. Both output kinds are written as UTF-8. Any exception is caught
    and reported on stdout; the function always returns None.

    Args:
        url: URL of the text resource or web page.
        download_path: Directory to save into; created if it does not exist.
        is_text: When True, save the response text unchanged with a ``.txt``
            suffix; otherwise parse it with BeautifulSoup and save the
            prettified markup with a ``.html`` suffix.
    """
    from urllib.parse import unquote, urlparse

    try:
        # The timeout prevents an unresponsive server from blocking forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        parsed = urlparse(url)
        base_name = os.path.basename(unquote(parsed.path).rstrip("/"))
        if not base_name:
            base_name = parsed.netloc or "index"

        if download_path:
            os.makedirs(download_path, exist_ok=True)

        if is_text:
            filename = os.path.join(download_path, base_name + ".txt")
            # Explicit UTF-8 so non-ASCII text does not depend on the
            # platform's default encoding (matches the HTML branch).
            with open(filename, "w", encoding="utf-8") as file:
                file.write(response.text)
            print(f"Text file downloaded successfully to {filename}")
        else:
            soup = BeautifulSoup(response.text, "html.parser")
            filename = os.path.join(download_path, base_name + ".html")
            with open(filename, "w", encoding="utf-8") as file:
                file.write(soup.prettify())
            print(f"Webpage downloaded successfully to {filename}")
    except Exception as e:
        print(f"An error occurred: {e}")
def main():
    """Run the demo: download one YouTube audio track as MP3.

    The other helpers are shown below as disabled examples.
    """
    # Video (helper imported from download_video):
    #   downlaod_video_from_url(
    #       youtube_url="https://www.youtube.com/watch?v=dIYmzf21d1g",
    #       download_path="../data/",
    #   )
    # PDF, DOC, or any other file:
    #   download_file("https://example.com/somefile.pdf")
    # Plain text:
    #   download_text_or_webpage("https://example.com/sometextfile", is_text=True)
    # Web page content:
    #   download_text_or_webpage("https://en.wikipedia.org/wiki/Microsoft")
    download_audio("https://www.youtube.com/watch?v=8OHYynw7Yh4")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()