Replicate api logic
Files changed:
- .env +2 -0
- Dockerfile +18 -0
- README.md +1 -0
- holosubs.py +119 -0
- main.py +14 -0
- requirements.txt +16 -0
- transcribe.py +78 -0
- youtubeaudio.py +51 -0
.env
ADDED
@@ -0,0 +1,2 @@
+peft_model_id="teoha/openai-whisper-medium-LORA-ja"
+install_location="/tmp/elite_understanding"
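
These values are consumed at runtime through python-dotenv, as transcribe.py does below; a minimal sketch:

    from dotenv import load_dotenv
    import os

    load_dotenv()                                # loads .env into the process environment
    model_id = os.getenv('peft_model_id')        # "teoha/openai-whisper-medium-LORA-ja"
    install_dir = os.getenv('install_location')  # "/tmp/elite_understanding"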
Dockerfile
ADDED
@@ -0,0 +1,18 @@
+FROM pytorch/pytorch
+
+WORKDIR /code
+RUN mkdir /.cache
+RUN chmod 1777 /.cache
+COPY ./requirements.txt /code/requirements.txt
+RUN echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib' >> ~/.bashrc
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+RUN /opt/conda/bin/pip install peft
+RUN /opt/conda/bin/pip install -qq https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
+# Expose the HUGGINGFACE_TOKEN secret at build time and use it to log in to the Hugging Face Hub
+RUN --mount=type=secret,id=HUGGINGFACE_TOKEN,mode=0444,required=true \
+    huggingface-cli login --token $(cat /run/secrets/HUGGINGFACE_TOKEN) && \
+    echo "HUGGINGFACE_TOKEN=$(cat /run/secrets/HUGGINGFACE_TOKEN)" >> .env
+
+COPY . .
+
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
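
On Hugging Face Spaces the HUGGINGFACE_TOKEN secret is injected automatically at build time. For a local build, a rough equivalent (the token file path is a placeholder) would be:

    DOCKER_BUILDKIT=1 docker build --secret id=HUGGINGFACE_TOKEN,src=./hf_token.txt -t holosubs .
    docker run -p 7860:7860 holosubs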
README.md
CHANGED
@@ -6,6 +6,7 @@ colorTo: green
 sdk: docker
 pinned: false
 license: mit
+app_file: main.py
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
holosubs.py
ADDED
@@ -0,0 +1,119 @@
+"""
+Entry point and main execution block of the video transcription job
+"""
+import re
+
+from dotenv import load_dotenv
+from youtubeaudio import YoutubeAudio
+from transcribe import Transcriber
+import torchaudio
+from pyannote.audio import Pipeline
+from webvtt import WebVTT, Caption
+import torch
+import logging
+from huggingface_hub._login import _login
+import os
+
+load_dotenv()
+WHISPER_SAMPLE_RATE = 16000
+TIMESTAMP_PATTERN = r'[0-9]+:[0-9]+:[0-9]+\.[0-9]+'
+MAX_CHUNK_DURATION = 30000  # ms
+
+format = "%(asctime)s: %(message)s"
+logging.basicConfig(format=format, level=logging.DEBUG,
+                    datefmt="%H:%M:%S")
+_login(token=os.getenv('HUGGINGFACE_TOKEN'), add_to_git_credential=False)
+
+def get_video_vtt(url) -> str:
+    # Download wav file
+    ytaudio = YoutubeAudio(url)
+    ytaudio.download_audio()
+    # Load audio
+    audio, sample_rate = torchaudio.load(ytaudio.filename)
+    audio_dict = {"waveform": audio, "sample_rate": sample_rate}
+    # Diarization
+    pipeline = Pipeline.from_pretrained('pyannote/speaker-diarization@2.1', use_auth_token=True)
+    dzs = pipeline(audio_dict)
+    groups = group_segments(str(dzs).splitlines())
+    # Preprocess audio segments for translation
+    audio = torchaudio.functional.resample(audio, orig_freq=sample_rate, new_freq=WHISPER_SAMPLE_RATE)
+    audio_segments, timestamps = get_segments(groups, audio)
+    # Decode audio segments into subtitles
+    transcriber = Transcriber(task="translate")
+    captions = decode_segments(audio_segments, timestamps, transcriber)
+    vtt = create_vtt(captions)
+    ytaudio.clean()
+    return vtt.content
+
+def decode_segments(audio_segments, timestamps, transcriber):
+    captions = []
+    for i, segment in enumerate(audio_segments):
+        result = transcriber.decode(segment)
+        captions.append(Caption(timestamps[i][0], timestamps[i][1], result))
+        logging.info(f"Chunk output no.{i+1}: {result}")
+    return captions
+
+def millisec(timeStr):
+    spl = timeStr.split(":")
+    s = int((int(spl[0]) * 60 * 60 + int(spl[1]) * 60 + float(spl[2])) * 1000)
+    return s
+
+def group_segments(dzs):
+    groups = []
+    g = []
+    lastend = 0
+
+    for d in dzs:
+        if g and (g[0].split()[-1] != d.split()[-1]):  # speaker changed
+            groups.append(g)
+            g = []
+
+        g.append(d)
+
+        end = re.findall(TIMESTAMP_PATTERN, string=d)[1]
+        end = millisec(end)
+        if lastend > end:  # segment engulfed by a previous segment
+            groups.append(g)
+            g = []
+        else:
+            lastend = end
+    if g:
+        groups.append(g)
+    logging.debug(groups)
+    return groups
+
+def create_vtt(captions):
+    vtt = WebVTT()
+    for caption in captions:
+        vtt.captions.append(caption)
+    return vtt
+    # vtt.save(path)
+
+def get_segments(groups, audio):
+    monoaudio = torch.mean(input=audio, dim=0).numpy()
+    audio_segments = []
+    timestamps = []
+    for g in groups:
+        cur_start_time, cur_end_time = re.findall(TIMESTAMP_PATTERN, string=g[0])
+        cur_start_millisec = millisec(cur_start_time)
+        cur_end_millisec = millisec(cur_end_time)
+        for window in g[1:]:
+            start_time, end_time = re.findall(TIMESTAMP_PATTERN, string=window)
+            start_millisec = millisec(start_time)
+            end_millisec = millisec(end_time)
+            # Check if adding this window would exceed the max chunk duration
+            seg_duration_with_window = end_millisec - cur_start_millisec
+            if seg_duration_with_window > MAX_CHUNK_DURATION:
+                # Flush the current chunk and start a new one at this window
+                start_frame, end_frame = cur_start_millisec*WHISPER_SAMPLE_RATE//1000, cur_end_millisec*WHISPER_SAMPLE_RATE//1000
+                audio_segments.append(monoaudio[start_frame:end_frame])
+                timestamps.append((cur_start_time, cur_end_time))
+                cur_start_time, cur_end_time = start_time, end_time
+                cur_start_millisec, cur_end_millisec = start_millisec, end_millisec
+            else:
+                # Extend the current chunk to cover this window
+                cur_end_time = end_time
+                cur_end_millisec = end_millisec
+        # Flush the final chunk of the group
+        start_frame, end_frame = cur_start_millisec*WHISPER_SAMPLE_RATE//1000, cur_end_millisec*WHISPER_SAMPLE_RATE//1000
+        audio_segments.append(monoaudio[start_frame:end_frame])
+        timestamps.append((cur_start_time, cur_end_time))
+    return audio_segments, timestamps
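
group_segments works on the line-per-turn textual rendering of the pyannote diarization output. A minimal sketch of the intended behaviour, with illustrative (not real) timestamps and speaker labels:

    lines = [
        "[ 00:00:00.497 -->  00:00:09.762] A SPEAKER_01",
        "[ 00:00:10.012 -->  00:00:15.330] B SPEAKER_01",
        "[ 00:00:15.800 -->  00:00:21.101] C SPEAKER_00",
    ]
    groups = group_segments(lines)
    # -> two groups: the first two turns share SPEAKER_01 and are merged,
    #    the third starts a new group when the speaker changes.
    millisec("00:00:15.330")  # -> 15330, the conversion get_segments chunks by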
main.py
ADDED
@@ -0,0 +1,14 @@
+from fastapi import FastAPI
+from holosubs import get_video_vtt
+from pydantic import BaseModel
+
+class Url(BaseModel):
+    url: str
+
+
+app = FastAPI()
+
+@app.post("/captions/")
+def get_captions(url: Url):
+    vtt_captions = get_video_vtt(url.url)
+    return {"captions": vtt_captions}
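
With the container running on port 7860 (the port in the Dockerfile CMD), the endpoint can be exercised like this; the video URL is a placeholder:

    import requests

    resp = requests.post(
        "http://localhost:7860/captions/",
        json={"url": "https://www.youtube.com/watch?v=VIDEO_ID"},
    )
    print(resp.json()["captions"])  # WebVTT subtitle text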
requirements.txt
ADDED
@@ -0,0 +1,16 @@
+fastapi==0.74.*
+requests==2.27.*
+sentencepiece==0.1.*
+uvicorn[standard]==0.17.*
+numpy==1.24.4
+pyannote.audio==1.1.2
+pyannote.core==5.0.0
+pyannote.database==5.0.1
+pyannote.metrics==3.2.1
+pyannote.pipeline==1.5.2
+python-dotenv==1.0.0
+torch==2.0.1
+torchaudio==2.0.2
+transformers==4.31.0
+webvtt_py==0.4.6
+yt_dlp==2023.7.6
transcribe.py
ADDED
@@ -0,0 +1,78 @@
+"""
+Represents a model that transcribes and translates audio.
+"""
+
+import logging
+import os
+from typing import Union
+
+import numpy as np
+import torch
+from dotenv import load_dotenv
+from peft import PeftConfig, PeftModel
+from transformers import (AutomaticSpeechRecognitionPipeline,
+                          WhisperForConditionalGeneration, WhisperProcessor,
+                          WhisperTokenizer)
+
+load_dotenv()
+format = "%(asctime)s: %(message)s"
+logging.basicConfig(format=format, level=logging.DEBUG,
+                    datefmt="%H:%M:%S")
+
+class Transcriber:
+    def __init__(self, model_id="teoha/openai-whisper-medium-LORA-ja", language="Japanese", task="translate"):
+        self.language = language
+        self.task = task
+        peft_model_id = model_id if model_id else os.getenv('peft_model_id')
+        # TODO: fix downloading and installing the model locally
+        # self.install_model(peft_model_id)
+        self.initialize_pipe(peft_model_id)  # initialize pipe
+
+    def install_model(self, peft_model_id: str) -> None:
+        save_location = os.path.join(os.getenv('install_location'), peft_model_id)
+        offload_location = os.path.join(os.getenv('install_location'), "offload")
+        # Save model
+        peft_config = PeftConfig.from_pretrained(peft_model_id)
+        model = WhisperForConditionalGeneration.from_pretrained(
+            peft_config.base_model_name_or_path,
+            load_in_8bit=False, device_map="auto"
+        )
+        model = PeftModel.from_pretrained(model, peft_model_id, offload_folder=offload_location)
+        model.save_pretrained(save_location)
+
+        # Save tokenizer/processor
+        tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=self.language, task=self.task)
+        processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=self.language, task=self.task)
+        tokenizer.save_pretrained(save_location)
+        processor.save_pretrained(save_location)
+        logging.info("Installation completed successfully")
+
+    def initialize_pipe(self, peft_model_id: str) -> None:
+        offload_location = os.path.join(os.getenv('install_location'), "offload")
+        # Initialize model configs
+        peft_config = PeftConfig.from_pretrained(peft_model_id)
+        model = WhisperForConditionalGeneration.from_pretrained(peft_config.base_model_name_or_path, load_in_8bit=False, device_map="auto")
+        model = PeftModel.from_pretrained(model, peft_model_id, offload_folder=offload_location)
+        tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=self.language, task=self.task)
+        processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=self.language, task=self.task)
+        feature_extractor = processor.feature_extractor
+        # Initialize class variables
+        self.forced_decoder_ids = processor.get_decoder_prompt_ids(language=self.language, task=self.task)
+        self.pipe = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)
+        logging.info("Pipe successfully initialized")
+
+
+    def decode(self, audio: Union[np.ndarray, bytes, str]) -> str:
+        '''
+        Transcribes a sequence of floats representing an audio snippet.
+
+        Args:
+            audio (:obj:`np.ndarray` or :obj:`bytes` or :obj:`str`):
+                Either a raw waveform (:obj:`np.ndarray` of shape (n,) of type :obj:`np.float32` or
+                :obj:`np.float64`) at the correct sampling rate (no further check will be done), or a
+                :obj:`str` filename of an audio file, which will be read at the correct sampling rate
+                using `ffmpeg` (this requires `ffmpeg` to be installed on the system). If `audio` is
+                :obj:`bytes`, it is taken to be the content of an audio file and is interpreted by
+                `ffmpeg` in the same way.
+        '''
+        with torch.cuda.amp.autocast():
+            text = self.pipe(audio, generate_kwargs={"forced_decoder_ids": self.forced_decoder_ids})["text"]
+        return text
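
A minimal usage sketch for Transcriber (the wav path is a placeholder; the first call downloads the base Whisper weights and the LoRA adapter from the Hub, and torch.cuda.amp.autocast assumes a CUDA device):

    from transcribe import Transcriber

    transcriber = Transcriber(task="translate")   # Japanese audio -> English text
    text = transcriber.decode("/tmp/sample.wav")  # or a float32 waveform at 16 kHz
    print(text)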
youtubeaudio.py
ADDED
@@ -0,0 +1,51 @@
+"""
+Represents a Youtube video
+"""
+
+from dotenv import load_dotenv
+import logging
+from yt_dlp import YoutubeDL
+import os
+from pathlib import Path
+
+load_dotenv()
+format = "%(asctime)s: %(message)s"
+logging.basicConfig(format=format, level=logging.DEBUG,
+                    datefmt="%H:%M:%S")
+
+class YoutubeAudio:
+    def __init__(self, url, dir="/tmp/holosubs/audio"):
+        self.url = url
+        self.dir = dir
+        self.filename = None  # set by progress_hook once the download finishes
+
+    def download_audio(self):
+        ydl_opts = {
+            'outtmpl': os.path.join(self.dir, "%(id)s_%(epoch)s.%(ext)s"),
+            'logger': logging,
+            'progress_hooks': [self.progress_hook],
+            'format': 'm4a/bestaudio/best',
+            'postprocessors': [{  # Extract audio using ffmpeg
+                'key': 'FFmpegExtractAudio',
+                'preferredcodec': 'wav',
+            }]
+        }
+        with YoutubeDL(ydl_opts) as ydl:
+            error_code = ydl.download([self.url])
+
+    def clean(self):
+        if not self.filename:
+            logging.error("Audio not downloaded")
+            return
+        if os.path.exists(self.filename):
+            os.remove(self.filename)
+            logging.info(f"File {self.filename} successfully removed")
+            self.filename = None
+        else:
+            logging.warning(f"File {self.filename} does not exist")
+
+    def progress_hook(self, d):
+        if d['status'] == 'finished':
+            # The postprocessor converts the download to wav, so record the .wav path
+            self.filename = os.path.join(self.dir, Path(d.get('info_dict').get('_filename')).stem + ".wav")
+            logging.info(f'Done downloading {self.filename}, now post-processing ...')
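
A minimal usage sketch for YoutubeAudio (the URL is a placeholder; ffmpeg must be on PATH for the wav conversion):

    from youtubeaudio import YoutubeAudio

    ytaudio = YoutubeAudio("https://www.youtube.com/watch?v=VIDEO_ID")
    ytaudio.download_audio()   # downloads the audio track and converts it to wav
    print(ytaudio.filename)    # full path of the downloaded wav file
    ytaudio.clean()            # removes the wav file afterwards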