samuelolubukun committed on
Commit
2fe27d2
·
verified ·
1 Parent(s): 0aae98e

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +3 -111
  2. requirements.txt +0 -2
app.py CHANGED
@@ -14,12 +14,8 @@ import traceback
14
  import asyncio
15
  import httpx
16
  import re
17
- import pytube
18
- from pytube import YouTube
19
- from pytube.cli import on_progress
20
- import speech_recognition as sr
21
  from pydub import AudioSegment
22
- from pydub.silence import split_on_silence
23
 
24
  load_dotenv()
25
 
@@ -147,9 +143,6 @@ async def global_exception_handler(request, exc):
147
  class VideoURL(BaseModel):
148
  video_url: str
149
 
150
- class YouTubeURL(BaseModel):
151
- url: str
152
-
153
  class ProcessVideoOptions(BaseModel):
154
  video_url: str
155
  aspect_ratio: str = "16:9"
@@ -379,106 +372,5 @@ def extract_audio(data: VideoURL):
379
  finally:
380
  for p in [local_input, local_output]:
381
  if os.path.exists(p): os.remove(p)
382
- @app.post("/extract-youtube-transcript")
383
- def extract_youtube_transcript(data: YouTubeURL):
384
- """Extract transcript from a YouTube video by downloading audio and transcribing."""
385
- local_audio = os.path.join(TEMP_DIR, f"yt_audio_{uuid.uuid4()}.wav")
386
- local_mp3 = os.path.join(TEMP_DIR, f"yt_audio_{uuid.uuid4()}.mp3")
387
-
388
- try:
389
- # 1. Extract video ID
390
- video_id = None
391
- regexes = [
392
- r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
393
- r'youtu\.be\/([0-9A-Za-z_-]{11})',
394
- r'embed\/([0-9A-Za-z_-]{11})'
395
- ]
396
-
397
- for regex in regexes:
398
- match = re.search(regex, data.url)
399
- if match:
400
- video_id = match.group(1)
401
- break
402
-
403
- if not video_id:
404
- raise HTTPException(status_code=400, detail="Invalid YouTube URL")
405
-
406
- print(f"Downloading audio for video ID: {video_id}")
407
-
408
- # 2. Download audio using pytube
409
- yt = YouTube(data.url, on_progress_callback=on_progress)
410
-
411
- # Target the audio stream (highest quality audio-only usually)
412
- audio_stream = yt.streams.get_audio_only()
413
- if not audio_stream:
414
- raise HTTPException(status_code=500, detail="No audio streams found for this video")
415
-
416
- print(f"Downloading stream: {audio_stream.abr}")
417
- downloaded_file = audio_stream.download(output_path=TEMP_DIR, filename=f"yt_audio_{uuid.uuid4()}.mp3")
418
- actual_mp3_path = downloaded_file
419
-
420
- if not os.path.exists(actual_mp3_path):
421
- raise HTTPException(status_code=500, detail="Failed to download YouTube audio")
422
-
423
- # 3. Convert to WAV (16k, mono) for SpeechRecognition
424
- print(f"Converting {actual_mp3_path} to WAV...")
425
- cmd = [
426
- "ffmpeg", "-i", actual_mp3_path,
427
- "-ar", "16000", "-ac", "1", "-f", "wav",
428
- local_audio
429
- ]
430
- run_ffmpeg(cmd)
431
-
432
- # 4. Transcribe using SpeechRecognition
433
- print("Transcribing audio...")
434
- recognizer = sr.Recognizer()
435
-
436
- # Load audio with pydub to handle potentially long files by chunking
437
- audio = AudioSegment.from_wav(local_audio)
438
-
439
- # Define chunk size (e.g., 30 seconds)
440
- chunk_length_ms = 30000
441
- chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
442
-
443
- full_transcript = []
444
- formatted_segments = []
445
-
446
- for i, chunk in enumerate(chunks):
447
- chunk_silent = AudioSegment.silent(duration=500) # add half second silence for padding
448
- chunk_with_padding = chunk_silent + chunk + chunk_silent
449
-
450
- chunk_path = os.path.join(TEMP_DIR, f"chunk_{i}_{uuid.uuid4()}.wav")
451
- chunk_with_padding.export(chunk_path, format="wav")
452
-
453
- with sr.AudioFile(chunk_path) as source:
454
- audio_data = recognizer.record(source)
455
- try:
456
- # Using Google Web Speech API (free, no key needed for small use)
457
- text = recognizer.recognize_google(audio_data)
458
- full_transcript.append(text)
459
- formatted_segments.append({
460
- "text": text,
461
- "offset": i * chunk_length_ms,
462
- "duration": chunk_length_ms
463
- })
464
- except sr.UnknownValueError:
465
- print(f"Chunk {i}: Speech was unintelligible")
466
- except sr.RequestError as e:
467
- print(f"Chunk {i}: Could not request results from Google Speech Recognition service; {e}")
468
- finally:
469
- if os.path.exists(chunk_path): os.remove(chunk_path)
470
-
471
- return {
472
- "success": True,
473
- "videoId": video_id,
474
- "transcript": " ".join(full_transcript),
475
- "segments": formatted_segments
476
- }
477
-
478
- except Exception as e:
479
- print(f"Extraction error: {str(e)}")
480
- print(traceback.format_exc())
481
- raise HTTPException(status_code=500, detail=str(e))
482
- finally:
483
- for p in [local_audio, local_mp3]:
484
- if os.path.exists(p): os.remove(p)
 
14
  import asyncio
15
  import httpx
16
  import re
 
 
 
 
17
  from pydub import AudioSegment
18
+ # Note: pydub was previously used to chunk YouTube audio; that logic has been removed.
19
 
20
  load_dotenv()
21
 
 
143
  class VideoURL(BaseModel):
144
  video_url: str
145
 
 
 
 
146
  class ProcessVideoOptions(BaseModel):
147
  video_url: str
148
  aspect_ratio: str = "16:9"
 
372
  finally:
373
  for p in [local_input, local_output]:
374
  if os.path.exists(p): os.remove(p)
375
+ if os.path.exists(local_input):
376
+ os.remove(local_input)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -6,6 +6,4 @@ python-dotenv==1.0.1
6
  pydantic==2.6.1
7
  requests==2.31.0
8
  httpx==0.26.0
9
- pytubefix==6.1.1
10
- SpeechRecognition==3.10.1
11
  pydub==0.25.1
 
6
  pydantic==2.6.1
7
  requests==2.31.0
8
  httpx==0.26.0
 
 
9
  pydub==0.25.1