Upload 2 files
Browse files
- app.py +3 -111
- requirements.txt +0 -2
app.py
CHANGED
|
@@ -14,12 +14,8 @@ import traceback
|
|
| 14 |
import asyncio
|
| 15 |
import httpx
|
| 16 |
import re
|
| 17 |
-
import pytube
|
| 18 |
-
from pytube import YouTube
|
| 19 |
-
from pytube.cli import on_progress
|
| 20 |
-
import speech_recognition as sr
|
| 21 |
from pydub import AudioSegment
|
| 22 |
-
|
| 23 |
|
| 24 |
load_dotenv()
|
| 25 |
|
|
@@ -147,9 +143,6 @@ async def global_exception_handler(request, exc):
|
|
| 147 |
class VideoURL(BaseModel):
|
| 148 |
video_url: str
|
| 149 |
|
| 150 |
-
class YouTubeURL(BaseModel):
|
| 151 |
-
url: str
|
| 152 |
-
|
| 153 |
class ProcessVideoOptions(BaseModel):
|
| 154 |
video_url: str
|
| 155 |
aspect_ratio: str = "16:9"
|
|
@@ -379,106 +372,5 @@ def extract_audio(data: VideoURL):
|
|
| 379 |
finally:
|
| 380 |
for p in [local_input, local_output]:
|
| 381 |
if os.path.exists(p): os.remove(p)
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
"""Extract transcript from a YouTube video by downloading audio and transcribing."""
|
| 385 |
-
local_audio = os.path.join(TEMP_DIR, f"yt_audio_{uuid.uuid4()}.wav")
|
| 386 |
-
local_mp3 = os.path.join(TEMP_DIR, f"yt_audio_{uuid.uuid4()}.mp3")
|
| 387 |
-
|
| 388 |
-
try:
|
| 389 |
-
# 1. Extract video ID
|
| 390 |
-
video_id = None
|
| 391 |
-
regexes = [
|
| 392 |
-
r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
|
| 393 |
-
r'youtu\.be\/([0-9A-Za-z_-]{11})',
|
| 394 |
-
r'embed\/([0-9A-Za-z_-]{11})'
|
| 395 |
-
]
|
| 396 |
-
|
| 397 |
-
for regex in regexes:
|
| 398 |
-
match = re.search(regex, data.url)
|
| 399 |
-
if match:
|
| 400 |
-
video_id = match.group(1)
|
| 401 |
-
break
|
| 402 |
-
|
| 403 |
-
if not video_id:
|
| 404 |
-
raise HTTPException(status_code=400, detail="Invalid YouTube URL")
|
| 405 |
-
|
| 406 |
-
print(f"Downloading audio for video ID: {video_id}")
|
| 407 |
-
|
| 408 |
-
# 2. Download audio using pytube
|
| 409 |
-
yt = YouTube(data.url, on_progress_callback=on_progress)
|
| 410 |
-
|
| 411 |
-
# Target the audio stream (highest quality audio-only usually)
|
| 412 |
-
audio_stream = yt.streams.get_audio_only()
|
| 413 |
-
if not audio_stream:
|
| 414 |
-
raise HTTPException(status_code=500, detail="No audio streams found for this video")
|
| 415 |
-
|
| 416 |
-
print(f"Downloading stream: {audio_stream.abr}")
|
| 417 |
-
downloaded_file = audio_stream.download(output_path=TEMP_DIR, filename=f"yt_audio_{uuid.uuid4()}.mp3")
|
| 418 |
-
actual_mp3_path = downloaded_file
|
| 419 |
-
|
| 420 |
-
if not os.path.exists(actual_mp3_path):
|
| 421 |
-
raise HTTPException(status_code=500, detail="Failed to download YouTube audio")
|
| 422 |
-
|
| 423 |
-
# 3. Convert to WAV (16k, mono) for SpeechRecognition
|
| 424 |
-
print(f"Converting {actual_mp3_path} to WAV...")
|
| 425 |
-
cmd = [
|
| 426 |
-
"ffmpeg", "-i", actual_mp3_path,
|
| 427 |
-
"-ar", "16000", "-ac", "1", "-f", "wav",
|
| 428 |
-
local_audio
|
| 429 |
-
]
|
| 430 |
-
run_ffmpeg(cmd)
|
| 431 |
-
|
| 432 |
-
# 4. Transcribe using SpeechRecognition
|
| 433 |
-
print("Transcribing audio...")
|
| 434 |
-
recognizer = sr.Recognizer()
|
| 435 |
-
|
| 436 |
-
# Load audio with pydub to handle potentially long files by chunking
|
| 437 |
-
audio = AudioSegment.from_wav(local_audio)
|
| 438 |
-
|
| 439 |
-
# Define chunk size (e.g., 30 seconds)
|
| 440 |
-
chunk_length_ms = 30000
|
| 441 |
-
chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
|
| 442 |
-
|
| 443 |
-
full_transcript = []
|
| 444 |
-
formatted_segments = []
|
| 445 |
-
|
| 446 |
-
for i, chunk in enumerate(chunks):
|
| 447 |
-
chunk_silent = AudioSegment.silent(duration=500) # add half second silence for padding
|
| 448 |
-
chunk_with_padding = chunk_silent + chunk + chunk_silent
|
| 449 |
-
|
| 450 |
-
chunk_path = os.path.join(TEMP_DIR, f"chunk_{i}_{uuid.uuid4()}.wav")
|
| 451 |
-
chunk_with_padding.export(chunk_path, format="wav")
|
| 452 |
-
|
| 453 |
-
with sr.AudioFile(chunk_path) as source:
|
| 454 |
-
audio_data = recognizer.record(source)
|
| 455 |
-
try:
|
| 456 |
-
# Using Google Web Speech API (free, no key needed for small use)
|
| 457 |
-
text = recognizer.recognize_google(audio_data)
|
| 458 |
-
full_transcript.append(text)
|
| 459 |
-
formatted_segments.append({
|
| 460 |
-
"text": text,
|
| 461 |
-
"offset": i * chunk_length_ms,
|
| 462 |
-
"duration": chunk_length_ms
|
| 463 |
-
})
|
| 464 |
-
except sr.UnknownValueError:
|
| 465 |
-
print(f"Chunk {i}: Speech was unintelligible")
|
| 466 |
-
except sr.RequestError as e:
|
| 467 |
-
print(f"Chunk {i}: Could not request results from Google Speech Recognition service; {e}")
|
| 468 |
-
finally:
|
| 469 |
-
if os.path.exists(chunk_path): os.remove(chunk_path)
|
| 470 |
-
|
| 471 |
-
return {
|
| 472 |
-
"success": True,
|
| 473 |
-
"videoId": video_id,
|
| 474 |
-
"transcript": " ".join(full_transcript),
|
| 475 |
-
"segments": formatted_segments
|
| 476 |
-
}
|
| 477 |
-
|
| 478 |
-
except Exception as e:
|
| 479 |
-
print(f"Extraction error: {str(e)}")
|
| 480 |
-
print(traceback.format_exc())
|
| 481 |
-
raise HTTPException(status_code=500, detail=str(e))
|
| 482 |
-
finally:
|
| 483 |
-
for p in [local_audio, local_mp3]:
|
| 484 |
-
if os.path.exists(p): os.remove(p)
|
|
|
|
| 14 |
import asyncio
|
| 15 |
import httpx
|
| 16 |
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
from pydub import AudioSegment
|
| 18 |
+
# Note: pydub was previously used to chunk YouTube audio; that chunking logic has been removed.
|
| 19 |
|
| 20 |
load_dotenv()
|
| 21 |
|
|
|
|
| 143 |
class VideoURL(BaseModel):
|
| 144 |
video_url: str
|
| 145 |
|
|
|
|
|
|
|
|
|
|
| 146 |
class ProcessVideoOptions(BaseModel):
|
| 147 |
video_url: str
|
| 148 |
aspect_ratio: str = "16:9"
|
|
|
|
| 372 |
finally:
|
| 373 |
for p in [local_input, local_output]:
|
| 374 |
if os.path.exists(p): os.remove(p)
|
| 375 |
+
if os.path.exists(local_input):
|
| 376 |
+
os.remove(local_input)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -6,6 +6,4 @@ python-dotenv==1.0.1
|
|
| 6 |
pydantic==2.6.1
|
| 7 |
requests==2.31.0
|
| 8 |
httpx==0.26.0
|
| 9 |
-
pytubefix==6.1.1
|
| 10 |
-
SpeechRecognition==3.10.1
|
| 11 |
pydub==0.25.1
|
|
|
|
| 6 |
pydantic==2.6.1
|
| 7 |
requests==2.31.0
|
| 8 |
httpx==0.26.0
|
|
|
|
|
|
|
| 9 |
pydub==0.25.1
|