Spaces commit diff (Space status: Sleeping)
Changed files:
- app.py (+44 −21)
- requirements.txt (+2 −1)
app.py
CHANGED
|
@@ -8,6 +8,9 @@ from openai import OpenAI
|
|
| 8 |
from groq import Groq
|
| 9 |
import uuid
|
| 10 |
from gtts import gTTS
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
from youtube_transcript_api import YouTubeTranscriptApi
|
| 13 |
from youtube_transcript_api._errors import NoTranscriptFound
|
|
@@ -377,32 +380,52 @@ def generate_transcription(video_id):
|
|
| 377 |
}],
|
| 378 |
'outtmpl': outtmpl,
|
| 379 |
}
|
| 380 |
-
|
| 381 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
| 382 |
ydl.download([youtube_url])
|
| 383 |
|
| 384 |
audio_path = f"{OUTPUT_PATH}/{video_id}.{codec_name}"
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 394 |
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
}
|
| 403 |
-
for item in segments
|
| 404 |
-
]
|
| 405 |
-
return transcription
|
| 406 |
|
| 407 |
def process_transcript_and_screenshots(video_id):
|
| 408 |
print("====process_transcript_and_screenshots====")
|
|
|
|
| 8 |
from groq import Groq
|
| 9 |
import uuid
|
| 10 |
from gtts import gTTS
|
| 11 |
+
import math
|
| 12 |
+
from pydub import AudioSegment
|
| 13 |
+
|
| 14 |
|
| 15 |
from youtube_transcript_api import YouTubeTranscriptApi
|
| 16 |
from youtube_transcript_api._errors import NoTranscriptFound
|
|
|
|
| 380 |
}],
|
| 381 |
'outtmpl': outtmpl,
|
| 382 |
}
|
| 383 |
+
|
| 384 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
| 385 |
ydl.download([youtube_url])
|
| 386 |
|
| 387 |
audio_path = f"{OUTPUT_PATH}/{video_id}.{codec_name}"
|
| 388 |
+
full_audio = AudioSegment.from_mp3(audio_path)
|
| 389 |
+
|
| 390 |
+
max_part_duration = 10 * 60 * 1000 # 10 minutes
|
| 391 |
+
full_duration = len(full_audio) # in milliseconds
|
| 392 |
+
parts = math.ceil(full_duration / max_part_duration)
|
| 393 |
+
print(f"parts: {parts}")
|
| 394 |
+
transcription = []
|
| 395 |
+
|
| 396 |
+
for i in range(parts):
|
| 397 |
+
print(f"== i: {i}==")
|
| 398 |
+
start_time = i * max_part_duration
|
| 399 |
+
end_time = min((i + 1) * max_part_duration, full_duration)
|
| 400 |
+
print(f"time: {start_time/1000} - {end_time/1000}")
|
| 401 |
+
chunk = full_audio[start_time:end_time]
|
| 402 |
+
chunk_path = f"{OUTPUT_PATH}/{video_id}_part_{i}.{codec_name}"
|
| 403 |
+
chunk.export(chunk_path, format=codec_name)
|
| 404 |
+
|
| 405 |
+
with open(chunk_path, "rb") as chunk_file:
|
| 406 |
+
response = OPEN_AI_CLIENT.audio.transcriptions.create(
|
| 407 |
+
model="whisper-1",
|
| 408 |
+
file=chunk_file,
|
| 409 |
+
response_format="verbose_json",
|
| 410 |
+
timestamp_granularities=["segment"],
|
| 411 |
+
prompt="Transcribe the following audio file. if chinese, please using 'language: zh-TW' ",
|
| 412 |
+
)
|
| 413 |
+
|
| 414 |
+
# Adjusting the timestamps for the chunk based on its position in the full audio
|
| 415 |
+
adjusted_segments = [{
|
| 416 |
+
'text': segment['text'],
|
| 417 |
+
'start': math.ceil(segment['start'] + start_time / 1000.0), # Converting milliseconds to seconds
|
| 418 |
+
'end': math.ceil(segment['end'] + start_time / 1000.0),
|
| 419 |
+
'duration': math.ceil(segment['end'] - segment['start'])
|
| 420 |
+
} for segment in response.segments]
|
| 421 |
|
| 422 |
+
transcription.extend(adjusted_segments)
|
| 423 |
+
|
| 424 |
+
# Remove temporary chunk files after processing
|
| 425 |
+
os.remove(chunk_path)
|
| 426 |
+
|
| 427 |
+
return transcription
|
| 428 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 429 |
|
| 430 |
def process_transcript_and_screenshots(video_id):
|
| 431 |
print("====process_transcript_and_screenshots====")
|
requirements.txt
CHANGED
|
@@ -16,4 +16,5 @@ groq
|
|
| 16 |
yt_dlp
|
| 17 |
uuid
|
| 18 |
gtts
|
| 19 |
-
boto3
|
|
|
|
|
|
| 16 |
yt_dlp
|
| 17 |
uuid
|
| 18 |
gtts
|
| 19 |
+
boto3
|
| 20 |
+
pydub
|