ariansyahdedy committed on
Commit
b0b30af
·
1 Parent(s): a67ff5f

Download YT video

Browse files
.gitignore ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ignore Python cache files
2
+ __pycache__/
3
+ *.pyc
4
+
5
+ # Ignore virtual environments
6
+ .venv/
7
+
8
+
9
+ # Ignore log files
10
+ *.log
11
+
12
+ # Ignore node_modules directory (for Node.js projects)
13
+ node_modules/
14
+
15
+ # Ignore environment variables file
16
+ .env
17
+
18
+ # Ignore operating system files
19
+ # macOS
+ .DS_Store
20
+ # Windows
+ Thumbs.db
21
+
22
+ # Ignore Hugging Face model files (if large)
23
+ /model/checkpoints/*
24
+
25
+ /downloaded_videos/*
26
+ /extracted_audio/*
27
+ /frame
Dockerfile ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Step 1: Use Python 3.10.6 (slim variant) as the base image
FROM python:3.10.6-slim

# Step 2: Set the working directory inside the container
WORKDIR /app

# Step 3: Copy only requirements.txt first so the dependency layer stays
# cached until the requirements themselves change
COPY requirements.txt .

# Step 4: Install the Python dependencies from requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Step 5: Create the non-root user and the writable data directories
# (downloaded videos, extracted audio) in a single layer, and hand /app to
# that user so the application can also write files next to its code.
RUN useradd -m myuser \
    && mkdir -p /app/downloaded_videos /app/extracted_audio \
    && chown -R myuser:myuser /app \
    && chmod -R 755 /app/downloaded_videos /app/extracted_audio

# Step 6: Copy the application code already owned by the non-root user.
# COPY --chown replaces a second recursive chown, which would otherwise
# duplicate every copied file in an extra image layer.
COPY --chown=myuser:myuser . .

# Step 7: Switch to the non-root user
USER myuser

# Step 8: Expose the port that FastAPI runs on
EXPOSE 7860

# Step 9: Run FastAPI using Uvicorn when the container starts
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
app/api/endpoints/video.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/api/endpoints/video.py
2
+
3
+ from fastapi import APIRouter, HTTPException
4
+ from pydantic import BaseModel
5
+ from app.services.video_service import (
6
+ download_youtube_videos,
7
+ split_video_and_extract_audio,
8
+ downscale_video,
9
+ )
10
+ import os
11
+
12
+ router = APIRouter()
13
+
14
class YouTubeRequest(BaseModel):
    # Request body shared by the download and split/extract endpoints.

    # One or more video URLs to fetch.
    urls: list[str]
    # Length of each chunk in seconds; 30 when omitted.
    chunk_duration: int = 30
    # Video quality selector forwarded to the downloader; "worst" when omitted.
    quality: str = "worst"
18
+
19
class DownscaleRequest(BaseModel):
    # Request body for the /downscale endpoint.

    # Path of the video to downscale.
    video_path: str
    # Target resolution label such as "720p"; "720p" when omitted.
    resolution: str = "720p"
22
+
23
+ download_dir = "downloaded_videos"
24
+ audio_dir = "extracted_audio"
25
+
26
@router.post("/download_youtube_videos")
async def download_video(data: YouTubeRequest):
    """Download every URL in the request and report where the files landed.

    Args:
        data: Request body; ``urls`` and ``quality`` are forwarded to
            ``download_youtube_videos``.

    Returns:
        A dict with a single ``video_paths`` key holding the list returned by
        ``download_youtube_videos``.

    Raises:
        HTTPException: 404 when the service raises ``FileNotFoundError``.
            Any ``HTTPException`` raised by the service propagates unchanged.
    """
    # exist_ok avoids the check-then-create race when two requests arrive
    # before the directory exists.
    os.makedirs(download_dir, exist_ok=True)

    try:
        video_paths = await download_youtube_videos(data.urls, download_dir, data.quality)
    except FileNotFoundError as e:
        raise HTTPException(status_code=404, detail=str(e)) from e
    return {"video_paths": video_paths}
36
+
37
+
38
@router.post("/split-and-extract-audio")
async def split_and_extract_audio(data: YouTubeRequest):
    """Download the requested videos and extract their audio in fixed-length chunks.

    The previous implementation called ``download_youtube_video`` (not
    imported here) with ``data.url`` (not a field of ``YouTubeRequest``), so
    every request failed with a 500. This version uses the imported
    ``download_youtube_videos`` with the request's ``urls`` list.

    Args:
        data: Request body; ``urls`` and ``quality`` select what to download,
            ``chunk_duration`` is the chunk length passed to the splitter.

    Returns:
        A dict with ``message``, the base ``audio_dir``, and ``audio_dirs``:
        one sub-directory of ``audio_dir`` per processed video.

    Raises:
        HTTPException: whatever the download service raises, unchanged; 500
            for any other failure.
    """
    try:
        video_paths = await download_youtube_videos(data.urls, download_dir, data.quality)

        audio_dirs = []
        for video_path in video_paths:
            # The splitter numbers its output files from 1 for every video,
            # so each video gets its own folder to avoid overwriting chunks.
            video_stem = os.path.splitext(os.path.basename(video_path))[0]
            target_dir = os.path.join(audio_dir, video_stem)
            await split_video_and_extract_audio(video_path, data.chunk_duration, target_dir)
            audio_dirs.append(target_dir)

        return {
            "message": "Audio extraction complete",
            "audio_dir": audio_dir,
            "audio_dirs": audio_dirs,
        }
    except HTTPException:
        # Keep the status code chosen by the service instead of masking it as 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
46
+
47
+
48
@router.post("/downscale")
async def downscale_video_endpoint(data: DownscaleRequest):
    """Downscale an existing video and return the path of the result.

    Args:
        data: Request body with the source ``video_path`` and the target
            ``resolution``.

    Returns:
        A dict with ``downscaled_video_path``; the output is always written to
        ``downscaled_video.mp4`` inside the download directory.

    Raises:
        HTTPException: 404 when ``video_path`` is not an existing file, 500
            when the downscaling itself fails.
    """
    # NOTE(review): video_path comes straight from the request body, so a
    # client can point it at any file readable by the server process —
    # consider restricting it to the download directory.
    if not os.path.isfile(data.video_path):
        raise HTTPException(status_code=404, detail=f"Video not found: {data.video_path}")

    # The output directory may not exist yet if nothing has been downloaded.
    os.makedirs(download_dir, exist_ok=True)
    downscaled_video_path = os.path.join(download_dir, "downscaled_video.mp4")

    try:
        await downscale_video(data.video_path, downscaled_video_path, data.resolution)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"downscaled_video_path": downscaled_video_path}
app/main.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/main.py
2
+
3
+ from fastapi import FastAPI
4
+ from app.api.endpoints import video
5
+
6
+ app = FastAPI()
7
+
8
+ # Register the video router
9
+ app.include_router(video.router, prefix="/video", tags=["Video download and Processing"])
10
+
11
@app.get("/")
async def root():
    """Serve the welcome payload at the API root."""
    greeting = "Welcome to the YouTube Video Processing API!"
    return {"message": greeting}
app/services/audio_service.py ADDED
File without changes
app/services/video_service.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/services/video_service.py
2
+
3
+ import os, asyncio, json
4
+ import subprocess
5
+ from moviepy.editor import VideoFileClip
6
+ from fastapi import HTTPException
7
+ from concurrent.futures import ThreadPoolExecutor
8
+ from fastapi.concurrency import run_in_threadpool
9
+
10
+ # Initialize thread pool
11
+ executor = ThreadPoolExecutor()
12
+
13
+ # Metadata file to store processed videos
14
metadata_file = "processed_videos.json"


def load_metadata():
    """Return the stored processed-video metadata.

    Returns:
        The parsed JSON content of ``metadata_file``, or an empty dict when
        the file does not exist yet.
    """
    # Open directly instead of checking exists() first: the file could
    # disappear between the check and the open.
    try:
        with open(metadata_file, "r") as f:
            return json.load(f)
    except FileNotFoundError:
        return {}


def save_metadata(metadata):
    """Write *metadata* to ``metadata_file`` as indented JSON.

    The data is written to a sibling temporary file that is then renamed over
    the target, so an interrupted write cannot leave a truncated file that
    would make every later ``load_metadata`` call fail.
    """
    tmp_path = f"{metadata_file}.tmp"
    with open(tmp_path, "w") as f:
        json.dump(metadata, f, indent=4)
    os.replace(tmp_path, metadata_file)


def is_video_processed(video_file):
    """Return the stored flag for *video_file*, or ``False`` when it has no entry."""
    metadata = load_metadata()
    return metadata.get(video_file, False)


def mark_video_as_processed(video_file):
    """Record *video_file* as processed and persist the updated metadata."""
    metadata = load_metadata()
    metadata[video_file] = True
    save_metadata(metadata)
39
+
40
+ # Download the YouTube videos using yt-dlp asynchronously with quality option
41
async def download_youtube_videos(urls: list[str], output_path: str, quality: str = "best") -> list[str]:
    """Download each URL with yt-dlp and return the paths of the saved files.

    Fixes over the previous version:
    * the saved path is taken from yt-dlp's own output instead of picking the
      first ``.mp4`` found in the directory, which could belong to another URL;
    * a failing yt-dlp run is detected through its exit status;
    * the URL is passed after ``--`` so it can never be parsed as an option;
    * the "already processed" check uses the real file path, not the literal
      ``%(title)s.%(ext)s`` template.

    Args:
        urls: Video URLs to download.
        output_path: Directory the files are saved into (created if missing).
        quality: Format selector passed to yt-dlp's ``-f`` option.

    Returns:
        Paths of the downloaded files in request order, as reported by yt-dlp.
        Files already marked as processed are left out.

    Raises:
        HTTPException: 404 when yt-dlp fails or reports no file for a URL.
    """
    os.makedirs(output_path, exist_ok=True)
    output_template = os.path.join(output_path, '%(title)s.%(ext)s')
    downloaded_files = []

    for url in urls:
        command = [
            "yt-dlp",
            # Print the final location of every saved file to stdout.
            "--print", "after_move:filepath",
            "-o", output_template,
            "-f", quality,
            "--", url,
        ]

        # yt-dlp blocks until the download finishes, so run it off the event loop.
        result = await run_in_threadpool(subprocess.run, command, capture_output=True, text=True)

        # One line per saved file (a playlist URL can produce several).
        saved_files = [line.strip() for line in result.stdout.splitlines() if line.strip()]
        if result.returncode != 0 or not saved_files:
            raise HTTPException(status_code=404, detail=f"Video not found for URL: {url}")

        for video_file in saved_files:
            if is_video_processed(video_file):
                print(f"Video {video_file} has already been processed.")
                continue
            downloaded_files.append(video_file)

    return downloaded_files
69
+
70
+ # Split the video into chunks and extract audio asynchronously with codec support
71
async def split_video_and_extract_audio(video_path: str, chunk_duration: int, audio_output_dir: str):
    """Split a video into fixed-length chunks and write each chunk's audio as WAV.

    Chunk files are named ``1.wav``, ``2.wav``, ... inside *audio_output_dir*;
    the last chunk holds whatever remains of the video.

    Args:
        video_path: Path of the video file to read.
        chunk_duration: Length of each chunk in seconds; must be positive.
        audio_output_dir: Directory for the WAV files (created if missing).

    Raises:
        ValueError: if *chunk_duration* is not positive, or the video has no
            audio track.
    """
    # Validate before touching the filesystem: zero would divide by zero
    # below and a negative value would silently produce no chunks.
    if chunk_duration <= 0:
        raise ValueError(f"chunk_duration must be a positive number of seconds, got {chunk_duration}")

    os.makedirs(audio_output_dir, exist_ok=True)
    print(audio_output_dir)
    video_clip = await run_in_threadpool(VideoFileClip, video_path)
    try:
        if video_clip.audio is None:
            raise ValueError(f"No audio track found in {video_path}")

        print(video_clip.duration)
        total_duration = video_clip.duration
        chunk_count = int(total_duration // chunk_duration)
        if total_duration % chunk_duration != 0:
            # A trailing partial chunk still needs its own file.
            chunk_count += 1

        for i in range(chunk_count):
            start_time = i * chunk_duration
            end_time = min((i + 1) * chunk_duration, total_duration)
            chunk_clip = await run_in_threadpool(video_clip.subclip, start_time, end_time)

            audio_file = os.path.join(audio_output_dir, f"{i + 1}.wav")  # .wav matches the PCM codec below
            print(audio_file)

            # 16-bit PCM audio, written off the event loop.
            await run_in_threadpool(chunk_clip.audio.write_audiofile, audio_file, codec='pcm_s16le')
    finally:
        # Close the clip even when extraction fails part-way through.
        video_clip.close()
94
+
95
+ # Downscale the video resolution asynchronously
96
async def downscale_video(video_path: str, output_path: str, resolution: str = "720p"):
    """Resize a video to the requested height and write it to *output_path*.

    Args:
        video_path: Path of the source video.
        output_path: Path the resized video is written to.
        resolution: Target height as a label such as ``"720p"``; surrounding
            whitespace, an upper-case ``P`` and a missing suffix are accepted.

    Raises:
        ValueError: if *resolution* does not describe a positive integer height.
    """
    # Parse first so a bad label fails fast, before the source file is opened.
    height_text = resolution.strip().lower().removesuffix("p")
    if not height_text.isdigit() or int(height_text) == 0:
        raise ValueError(f"Invalid resolution {resolution!r}; expected a value such as '720p'")
    target_height = int(height_text)

    video_clip = await run_in_threadpool(VideoFileClip, video_path)
    try:
        video_clip_resized = await run_in_threadpool(video_clip.resize, height=target_height)
        await run_in_threadpool(video_clip_resized.write_videofile, output_path)
    finally:
        # Close the source clip even when resizing or encoding fails.
        video_clip.close()
101
+
102
+ # Async entry point for running in __main__
103
async def main():
    """Extract audio from a fixed sample video; print the outcome or the error."""
    try:
        await split_video_and_extract_audio("./downloaded_videos/me.mp4", 30, "./extracted_audio")
    except Exception as error:
        print(error)
    else:
        print("Audio files extracted successfully.")


if __name__ == "__main__":
    # asyncio.run drives the coroutine to completion when run as a script.
    asyncio.run(main())
autocorrect.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json5
2
+ import json
3
+
4
def correct_malformed_json(json_string):
    """Parse a lenient (JSON5) string and return it as strict, pretty-printed JSON.

    Args:
        json_string: Text to parse with ``json5.loads``.

    Returns:
        The parsed value re-serialised by ``json.dumps`` with a 4-space
        indent, or the string ``"Error: <details>"`` when parsing fails.
    """
    try:
        # json5 accepts a more flexible syntax than the strict json module.
        parsed_json = json5.loads(json_string)
    except ValueError as e:
        # Catch ValueError rather than json5.JSONDecodeError: the json5
        # package is understood to report syntax errors as ValueError, and a
        # decode-error subclass (if one exists) would be caught here as well.
        # NOTE(review): confirm against the installed json5 version.
        return f"Error: {e}"

    return json.dumps(parsed_json, indent=4)
15
+
16
+ # Example usage with a malformed JSON string
17
+ malformed_json = '{"greeting": ["Good afternoon Tony", "Let me introduce our guest""]}'
18
+ print(correct_malformed_json(malformed_json))
download.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ from moviepy.editor import VideoFileClip
4
+
5
+ # Function to download the YouTube video using yt-dlp
6
def download_youtube_video(url, output_path):
    """Download one video with yt-dlp and return the path of the saved file.

    The path is read from yt-dlp's own output instead of taking the first
    ``.mp4`` found in *output_path*, which could be an older download or miss
    a video saved in another container format.

    Args:
        url: Video URL to download.
        output_path: Directory the file is saved into.

    Returns:
        Path of the downloaded file as reported by yt-dlp.

    Raises:
        FileNotFoundError: if yt-dlp fails or reports no saved file.
    """
    output_template = os.path.join(output_path, '%(title)s.%(ext)s')
    command = [
        "yt-dlp",
        # Print the final location of the saved file to stdout.
        "--print", "after_move:filepath",
        "-o", output_template,
        # "--" ends option parsing so the URL can never be read as a flag.
        "--", url,
    ]

    result = subprocess.run(command, capture_output=True, text=True)

    saved_files = [line.strip() for line in result.stdout.splitlines() if line.strip()]
    if result.returncode != 0 or not saved_files:
        raise FileNotFoundError("Video not found in the specified directory.")

    # A single video prints one path; take the last line if there are several.
    video_file = saved_files[-1]
    print(f"Downloaded video: {video_file}")
    return video_file
22
+
23
+ # Function to split the video into chunks and extract audio
24
def split_video_and_extract_audio(video_path, chunk_duration, audio_output_dir):
    """Split a video into fixed-length chunks and save each chunk's audio as MP3.

    Files are named ``audio_chunk_1.mp3``, ``audio_chunk_2.mp3``, ... inside
    *audio_output_dir*; the last chunk holds whatever remains of the video.

    Args:
        video_path: Path of the video file to read.
        chunk_duration: Length of each chunk in seconds; must be positive.
        audio_output_dir: Directory for the MP3 files (created if missing).

    Raises:
        ValueError: if *chunk_duration* is not positive, or the video has no
            audio track.
    """
    # Validate before touching the filesystem: zero would divide by zero
    # below and a negative value would silently produce no chunks.
    if chunk_duration <= 0:
        raise ValueError(f"chunk_duration must be a positive number of seconds, got {chunk_duration}")

    os.makedirs(audio_output_dir, exist_ok=True)

    video_clip = VideoFileClip(video_path)
    try:
        if video_clip.audio is None:
            raise ValueError(f"No audio track found in {video_path}")

        total_duration = video_clip.duration

        chunk_count = int(total_duration // chunk_duration)
        if total_duration % chunk_duration != 0:
            # A trailing partial chunk still needs its own file.
            chunk_count += 1

        for i in range(chunk_count):
            start_time = i * chunk_duration
            end_time = min((i + 1) * chunk_duration, total_duration)
            chunk_clip = video_clip.subclip(start_time, end_time)

            audio_file = os.path.join(audio_output_dir, f"audio_chunk_{i + 1}.mp3")

            chunk_clip.audio.write_audiofile(audio_file)
            print(f"Extracted audio chunk {i + 1} to {audio_file}")
    finally:
        # Close the clip even when extraction fails part-way through.
        video_clip.close()

    print(f"Processing complete! Audio chunks are saved in {audio_output_dir}")
49
+
50
+ # Main function
51
def main():
    """Download the sample video, then split it into 30-second audio chunks."""
    chunk_duration = 30  # seconds per chunk
    audio_output_dir = "extracted_audio"
    output_dir = "downloaded_videos"
    url = 'https://www.youtube.com/watch?v=lTxn2BuqyzU'

    # Make sure the download directory is there before fetching anything.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Fetch the video, then hand the saved file to the splitter.
    video_path = download_youtube_video(url, output_dir)
    split_video_and_extract_audio(video_path, chunk_duration, audio_output_dir)


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiomysql==0.2.0
2
+ annotated-types==0.7.0
3
+ anyio==4.4.0
4
+ bcrypt==3.2.0
5
+ certifi==2024.7.4
6
+ cffi==1.16.0
7
+ click==8.1.7
8
+ colorama==0.4.6
9
+ dnspython==2.6.1
10
+ ecdsa==0.19.0
11
+ email_validator==2.2.0
12
+ exceptiongroup==1.2.1
13
+ fastapi==0.111.0
14
+ fastapi-cli==0.0.4
15
+ fastapi-security==0.5.0
16
+ greenlet==3.0.3
17
+ h11==0.14.0
18
+ httpcore==1.0.5
19
+ httptools==0.6.1
20
+ httpx==0.27.0
21
+ idna==3.7
22
+ imutils==0.5.4
23
+ Jinja2==3.1.4
24
+ markdown-it-py==3.0.0
25
+ MarkupSafe==2.1.5
26
+ mdurl==0.1.2
27
+ motor==3.5.0
28
+ numpy==2.0.0
29
+ opencv-python==4.10.0.84
30
+ orjson==3.10.6
31
+ packaging==24.1
32
+ passlib==1.7.4
33
+ pillow==10.4.0
34
+ pyasn1==0.6.0
35
+ pycparser==2.22
36
+ pydantic==1.10.11
37
+ pydantic_core==2.20.1
38
+ Pygments==2.18.0
39
+ pymongo==4.8.0
40
+ PyMySQL==1.1.1
41
+ pytesseract==0.3.10
42
+ python-dateutil==2.9.0.post0
43
+ python-dotenv==1.0.1
44
+ python-jose==3.3.0
45
+ python-multipart==0.0.9
46
+ PyYAML==6.0.1
47
+ rich==13.7.1
48
+ rsa==4.9
49
+ shellingham==1.5.4
50
+ six==1.16.0
51
+ sniffio==1.3.1
52
+ SQLAlchemy==2.0.31
53
+ starlette==0.37.2
54
+ typer==0.12.3
55
+ typing_extensions==4.12.2
56
+ ujson==5.10.0
57
+ uvicorn==0.30.1
58
+ watchfiles==0.22.0
59
+ websockets==12.0