Spaces:

ariansyahdedy
/

yt_dataset

Sleeping

App Files Files Community

ariansyahdedy committed on Aug 22, 2024

Commit

b0b30af

1 Parent(s): a67ff5f

Download YT video

Browse files

Files changed (9) hide show

.gitignore +27 -0
Dockerfile +35 -0
app/api/endpoints/video.py +55 -0
app/main.py +13 -0
app/services/audio_service.py +0 -0
app/services/video_service.py +111 -0
autocorrect.py +18 -0
download.py +67 -0
requirements.txt +59 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,27 @@

+# Ignore Python cache files
+__pycache__/
+*.pyc
+# Ignore virtual environments
+.venv/
+# Ignore log files
+*.log
+# Ignore node_modules directory (for Node.js projects)
+node_modules/
+# Ignore environment variables file
+.env
+# Ignore operating system files.
+# gitignore only treats a line as a comment when it STARTS with '#', so the
+# platform notes sit on their own lines instead of trailing the patterns.
+# macOS
+.DS_Store
+# Windows
+Thumbs.db
+# Ignore Hugging Face model files (if large)
+/model/checkpoints/*
+# Ignore downloaded videos and extracted audio
+/downloaded_videos/*
+/extracted_audio/*
+/frame

Dockerfile ADDED Viewed

	@@ -0,0 +1,35 @@

+# Step 1: Use Python 3.10.6 as the base image
+FROM python:3.10.6-slim
+# Step 2: Set the working directory inside the container
+WORKDIR /app
+# Step 3: Copy only requirements.txt first so the dependency layer below is
+# rebuilt only when the requirements change, not on every code change
+COPY requirements.txt .
+# Step 4: Install the Python dependencies from the requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+# Step 5: Create the data directories (downloaded videos, extracted audio),
+# create the non-root user, and give that user ownership of /app so the
+# application can write into it at runtime.
+RUN mkdir -p /app/downloaded_videos /app/extracted_audio \
+    && chmod -R 755 /app/downloaded_videos /app/extracted_audio \
+    && useradd -m myuser \
+    && chown -R myuser:myuser /app
+# Step 6: Copy the rest of the application code, owned by the non-root user.
+# --chown sets ownership during the copy, so no second recursive chown layer
+# over the whole tree is needed afterwards.
+COPY --chown=myuser:myuser . .
+# Step 7: Switch to the non-root user
+USER myuser
+# Step 8: Expose the port that FastAPI runs on
+EXPOSE 7860
+# Step 9: Command to run FastAPI using Uvicorn when the container starts
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]

app/api/endpoints/video.py ADDED Viewed

	@@ -0,0 +1,55 @@

+# app/api/endpoints/video.py
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel
+from app.services.video_service import (
+    download_youtube_videos,
+    split_video_and_extract_audio,
+    downscale_video,
+)
+import os
+router = APIRouter()  # Router collecting the video endpoints below; app/main.py mounts it under the /video prefix
+class YouTubeRequest(BaseModel):  # Request body for the download and split-and-extract-audio endpoints
+    urls: list[str]  # One or more video URLs; each one is handed to yt-dlp by the service layer
+    chunk_duration: int = 30  # Length of each audio chunk in seconds (default 30)
+    quality: str = "worst"  # Passed to yt-dlp as its -f format selector; defaults to "worst"
+class DownscaleRequest(BaseModel):  # Request body for the /downscale endpoint
+    video_path: str  # Path of the source video; forwarded unchanged to downscale_video
+    resolution: str = "720p"  # Target height written as "<pixels>p"; the service strips the "p" and converts to int
+download_dir = "downloaded_videos"  # Relative directory for downloaded videos and the downscaled output file
+audio_dir = "extracted_audio"  # Relative directory that receives the extracted audio chunks
+@router.post("/download_youtube_videos")
+async def download_video(data: YouTubeRequest):
+    if not os.path.exists(download_dir):
+        os.makedirs(download_dir)
+    try:
+        video_paths = await download_youtube_videos(data.urls, download_dir, data.quality)
+        return {"video_paths": video_paths}
+    except FileNotFoundError as e:
+        raise HTTPException(status_code=404, detail=str(e))
+@router.post("/split-and-extract-audio")
+async def split_and_extract_audio(data: YouTubeRequest):
+    try:
+        video_path = await download_youtube_video(data.url, download_dir)
+        await split_video_and_extract_audio(video_path, data.chunk_duration, audio_dir)
+        return {"message": "Audio extraction complete", "audio_dir": audio_dir}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@router.post("/downscale")
+async def downscale_video_endpoint(data: DownscaleRequest):
+    downscaled_video_path = os.path.join(download_dir, "downscaled_video.mp4")
+    try:
+        await downscale_video(data.video_path, downscaled_video_path, data.resolution)
+        return {"downscaled_video_path": downscaled_video_path}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))

app/main.py ADDED Viewed

	@@ -0,0 +1,13 @@

+# app/main.py
+from fastapi import FastAPI
+from app.api.endpoints import video
+app = FastAPI()  # Application object; the Dockerfile CMD serves it as "app.main:app"
+# Register the video router so its endpoints are served under the /video prefix
+app.include_router(video.router, prefix="/video", tags=["Video download and Processing"])
+@app.get("/")
+async def root():
+    return {"message": "Welcome to the YouTube Video Processing API!"}

app/services/audio_service.py ADDED Viewed

File without changes

app/services/video_service.py ADDED Viewed

	@@ -0,0 +1,111 @@

+# app/services/video_service.py
+import os, asyncio, json
+import subprocess
+from moviepy.editor import VideoFileClip
+from fastapi import HTTPException
+from concurrent.futures import ThreadPoolExecutor
+from fastapi.concurrency import run_in_threadpool
+# Initialize thread pool
+executor = ThreadPoolExecutor()  # NOTE(review): no function shown in this module references it; the async helpers use run_in_threadpool
+# Metadata file to store processed videos
+metadata_file = "processed_videos.json"  # Relative path, so it resolves against the process working directory
+# Helper function to load metadata
+def load_metadata():
+    if os.path.exists(metadata_file):
+        with open(metadata_file, "r") as f:
+            return json.load(f)
+    else:
+        return {}
+# Helper function to save metadata
+def save_metadata(metadata):
+    with open(metadata_file, "w") as f:
+        json.dump(metadata, f, indent=4)
+# Check if video has been processed
+def is_video_processed(video_file):
+    metadata = load_metadata()
+    return metadata.get(video_file, False)
+# Mark video as processed
+def mark_video_as_processed(video_file):
+    metadata = load_metadata()
+    metadata[video_file] = True
+    save_metadata(metadata)
+# Download the YouTube videos using yt-dlp asynchronously with quality option
+async def download_youtube_videos(urls: list[str], output_path: str, quality: str = "best") -> list[str]:
+    downloaded_files = []
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+    for url in urls:
+        output_file = os.path.join(output_path, '%(title)s.%(ext)s')
+        command = ["yt-dlp", "-o", output_file, "-f", quality, url]
+        if is_video_processed(output_file):
+            print(f"Video {output_file} has already been processed.")
+            continue
+        # Run the yt-dlp command in a separate thread
+        await run_in_threadpool(subprocess.run, command)
+        # Find the downloaded file in the directory
+        video_files = [f for f in os.listdir(output_path) if f.endswith(".mp4")]
+        if video_files:
+            video_file = os.path.join(output_path, video_files[0])
+            downloaded_files.append(video_file)
+        else:
+            raise HTTPException(status_code=404, detail=f"Video not found for URL: {url}")
+    return downloaded_files
+# Split the video into chunks and extract audio asynchronously with codec support
+async def split_video_and_extract_audio(video_path: str, chunk_duration: int, audio_output_dir: str):
+    if not os.path.exists(audio_output_dir):
+        os.makedirs(audio_output_dir)
+    print(audio_output_dir)
+    video_clip = await run_in_threadpool(VideoFileClip, video_path)
+    print(video_clip.duration)
+    total_duration = video_clip.duration
+    chunk_count = int(total_duration // chunk_duration)
+    if total_duration % chunk_duration != 0:
+        chunk_count += 1
+    for i in range(chunk_count):
+        start_time = i * chunk_duration
+        end_time = min((i + 1) * chunk_duration, total_duration)
+        chunk_clip = await run_in_threadpool(video_clip.subclip, start_time, end_time)
+        audio_file = os.path.join(audio_output_dir, f"{i + 1}.wav")  # Changed to .wav to match the codec
+        print(audio_file)
+        # Call write_audiofile directly with codec='pcm_s16le'
+        await run_in_threadpool(chunk_clip.audio.write_audiofile, audio_file, codec='pcm_s16le')
+    video_clip.close()
+# Downscale the video resolution asynchronously
+async def downscale_video(video_path: str, output_path: str, resolution: str = "720p"):
+    video_clip = await run_in_threadpool(VideoFileClip, video_path)
+    video_clip_resized = await run_in_threadpool(video_clip.resize, height=int(resolution.replace("p", "")))
+    await run_in_threadpool(video_clip_resized.write_videofile, output_path)
+    video_clip.close()
+# Async entry point for running in __main__
+async def main():
+    try:
+        await split_video_and_extract_audio("./downloaded_videos/me.mp4", 30, "./extracted_audio")
+        print("Audio files extracted successfully.")
+    except Exception as e:
+        print(e)
+if __name__ == "__main__":
+    asyncio.run(main())  # This will properly run the async code

autocorrect.py ADDED Viewed

	@@ -0,0 +1,18 @@

+import json5
+import json
+def correct_malformed_json(json_string):
+    try:
+        # Use json5 to parse the more flexible JSON format
+        parsed_json = json5.loads(json_string)
+        # Reformat and pretty-print the JSON using json.dumps
+        formatted_json = json.dumps(parsed_json, indent=4)
+        return formatted_json
+    except json5.JSONDecodeError as e:
+        return f"Error: {e}"
+# Example usage: the doubled quote after "guest" leaves the string unparseable, so this call exercises the error path
+malformed_json = '{"greeting": ["Good afternoon Tony", "Let me introduce our guest""]}'
+print(correct_malformed_json(malformed_json))

download.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import os
+import subprocess
+from moviepy.editor import VideoFileClip
+# Function to download the YouTube video using yt-dlp
+def download_youtube_video(url, output_path):
+    output_file = os.path.join(output_path, '%(title)s.%(ext)s')
+    command = ["yt-dlp", "-o", output_file, url]
+    # Running the yt-dlp command to download the video
+    subprocess.run(command)
+    # Find the downloaded file in the directory
+    downloaded_files = [f for f in os.listdir(output_path) if f.endswith(".mp4")]
+    if downloaded_files:
+        video_file = os.path.join(output_path, downloaded_files[0])
+        print(f"Downloaded video: {video_file}")
+        return video_file
+    else:
+        raise FileNotFoundError("Video not found in the specified directory.")
+# Function to split the video into chunks and extract audio
+def split_video_and_extract_audio(video_path, chunk_duration, audio_output_dir):
+    if not os.path.exists(audio_output_dir):
+        os.makedirs(audio_output_dir)
+    video_clip = VideoFileClip(video_path)
+    total_duration = video_clip.duration
+    chunk_count = int(total_duration // chunk_duration)
+    if total_duration % chunk_duration != 0:
+        chunk_count += 1
+    for i in range(chunk_count):
+        start_time = i * chunk_duration
+        end_time = min((i + 1) * chunk_duration, total_duration)
+        chunk_clip = video_clip.subclip(start_time, end_time)
+        # Define audio filename
+        audio_file = os.path.join(audio_output_dir, f"audio_chunk_{i + 1}.mp3")
+        # Extract audio
+        chunk_clip.audio.write_audiofile(audio_file)
+        print(f"Extracted audio chunk {i + 1} to {audio_file}")
+    video_clip.close()
+    print(f"Processing complete! Audio chunks are saved in {audio_output_dir}")
+# Main function
+def main():
+    url = 'https://www.youtube.com/watch?v=lTxn2BuqyzU'
+    output_dir = "downloaded_videos"
+    audio_output_dir = "extracted_audio"
+    chunk_duration = 30  # Duration of each chunk in seconds
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    # Download the YouTube video
+    video_path = download_youtube_video(url, output_dir)
+    # Split video into chunks and extract audio
+    split_video_and_extract_audio(video_path, chunk_duration, audio_output_dir)
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,59 @@

+aiomysql==0.2.0
+annotated-types==0.7.0
+anyio==4.4.0
+bcrypt==3.2.0
+certifi==2024.7.4
+cffi==1.16.0
+click==8.1.7
+colorama==0.4.6
+dnspython==2.6.1
+ecdsa==0.19.0
+email_validator==2.2.0
+exceptiongroup==1.2.1
+fastapi==0.111.0
+fastapi-cli==0.0.4
+fastapi-security==0.5.0
+greenlet==3.0.3
+h11==0.14.0
+httpcore==1.0.5
+httptools==0.6.1
+httpx==0.27.0
+idna==3.7
+imutils==0.5.4
+Jinja2==3.1.4
+json5
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+mdurl==0.1.2
+motor==3.5.0
+moviepy==1.0.3
+numpy==2.0.0
+opencv-python==4.10.0.84
+orjson==3.10.6
+packaging==24.1
+passlib==1.7.4
+pillow==10.4.0
+pyasn1==0.6.0
+pycparser==2.22
+pydantic==1.10.11
+pydantic_core==2.20.1
+Pygments==2.18.0
+pymongo==4.8.0
+PyMySQL==1.1.1
+pytesseract==0.3.10
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-jose==3.3.0
+python-multipart==0.0.9
+PyYAML==6.0.1
+rich==13.7.1
+rsa==4.9
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+SQLAlchemy==2.0.31
+starlette==0.37.2
+typer==0.12.3
+typing_extensions==4.12.2
+ujson==5.10.0
+uvicorn==0.30.1
+watchfiles==0.22.0
+websockets==12.0
+yt-dlp