teoha committed on
Commit
56aa734
·
1 Parent(s): 08a889b

use Whisper Cpp

Browse files
.env DELETED
@@ -1,2 +0,0 @@
1
- peft_model_id ="teoha/openai-whisper-medium-LORA-ja"
2
- install_location = "/tmp/elite_understanding"
 
 
 
Dockerfile CHANGED
@@ -1,15 +1,33 @@
1
- FROM pytorch/pytorch
2
-
3
- WORKDIR /code
4
- RUN mkdir /.cache
5
- RUN chmod 1777 /.cache
6
- COPY ./requirements.txt /code/requirements.txt
7
- RUN echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib/libcudart.so' >> ~/.bashrc
8
- RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
9
- RUN /opt/conda/bin/pip install peft
10
- RUN /opt/conda/bin/pip install -qq https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
11
- RUN --mount=type=secret,id=HUGGINGFACE_TOKEN,mode=0444,required=true \
12
- huggingface-cli login --token $(cat /run/secrets/HUGGINGFACE_TOKEN) && \
13
- echo "HUGGINGFACE_TOKEN=$( cat /run/secrets/HUGGINGFACE_TOKEN )" >> .env
14
- COPY . .
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
# --- Build stage: compile whisper.cpp and download the ggml model ---
FROM python:3.10 AS build
RUN apt-get update
# Fetch the latest whisper.cpp source tarball and unpack it into /whisper.
RUN mkdir /whisper && \
    wget -q https://github.com/ggerganov/whisper.cpp/tarball/master -O - | \
    tar -xz -C /whisper --strip-components 1

WORKDIR /whisper

# Model name (e.g. "base", "medium") supplied at build time: --build-arg model=...
ARG model
RUN bash ./models/download-ggml-model.sh "${model}"
RUN make main

# --- Runtime stage: Python app + the compiled whisper binary and model ---
FROM python:3.10 AS whisper

RUN apt-get update \
    && apt-get install -y libsdl2-dev alsa-utils ffmpeg \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt /
RUN pip install --no-cache-dir -r /requirements.txt
COPY main.py /root
COPY youtubeaudio.py /root

WORKDIR /root
# ARG must be re-declared per stage; expose it to the app as $model.
ARG model
ENV model=$model
RUN mkdir /root/models
# World-writable results dir so the server can write output when run as any user.
RUN mkdir -p -m 777 /tmp/holosubs/results
COPY --from=build "/whisper/models/ggml-${model}.bin" "/root/models/ggml-${model}.bin"
COPY --from=build /whisper/main /usr/local/bin/whisper

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
__pycache__/youtubeaudio.cpython-310.pyc ADDED
Binary file (2.42 kB). View file
 
docker-compose.yml DELETED
@@ -1,8 +0,0 @@
1
- services:
2
- holosubs:
3
- image: holosubs
4
- container_name: local_holosubs
5
- ports:
6
- - "7860:7860"
7
- volumes:
8
- - ~/.cache:/.cache
 
 
 
 
 
 
 
 
 
holosubs.py DELETED
@@ -1,119 +0,0 @@
1
- """"
2
- Entry point and main execution block of the video transcription job
3
- """
4
- import re
5
-
6
- from dotenv import load_dotenv
7
- from youtubeaudio import YoutubeAudio
8
- from transcribe import Transcriber
9
- import torchaudio
10
- from pyannote.audio import Pipeline
11
- from webvtt import WebVTT, Caption
12
- import torch
13
- import logging
14
- from huggingface_hub._login import _login
15
- import os
16
-
17
- load_dotenv()
18
- WHISPER_SAMPLE_RATE=16000
19
- TIMESTAMP_PATTERN='[0-9]+:[0-9]+:[0-9]+\.[0-9]+'
20
- MAX_CHUNK_DURATION=30000 # ms
21
-
22
- format = "%(asctime)s: %(message)s"
23
- logging.basicConfig(format=format, level=logging.DEBUG,
24
- datefmt="%H:%M:%S")
25
- _login(token=os.getenv('HUGGINGFACE_TOKEN'), add_to_git_credential=False)
26
-
27
def get_video_vtt(url) -> str:
    """End-to-end pipeline: download audio, diarize, chunk, translate, emit WebVTT."""
    # Fetch the video's audio track as a wav file
    yt_audio = YoutubeAudio(url)
    yt_audio.download_audio()
    # Load the waveform from disk
    waveform, sample_rate = torchaudio.load(yt_audio.filename)
    diarization_input = {"waveform": waveform, "sample_rate": sample_rate}
    # Speaker diarization via pyannote
    pipeline = Pipeline.from_pretrained('pyannote/speaker-diarization@2.1', use_auth_token=True)
    diarization = pipeline(diarization_input)
    speaker_groups = group_segments(str(diarization).splitlines())
    # Resample to Whisper's expected rate before slicing into segments
    waveform = torchaudio.functional.resample(waveform, orig_freq=sample_rate, new_freq=WHISPER_SAMPLE_RATE)
    segments, stamps = get_segments(speaker_groups, waveform)
    # Translate each segment and assemble the VTT document
    transcriber = Transcriber(task="translate")
    captions = decode_segments(segments, stamps, transcriber)
    vtt = create_vtt(captions)
    yt_audio.clean()
    return vtt.content
47
-
48
def decode_segments(audio_segments, timestamps, transcriber):
    """Decode each audio segment and pair it with its timestamps as a Caption."""
    captions = []
    for chunk_no, (segment, (start, end)) in enumerate(zip(audio_segments, timestamps), start=1):
        text = transcriber.decode(segment)
        captions.append(Caption(start, end, text))
        logging.info(f"Chunk output no.{chunk_no}: {text}")
    return captions
55
-
56
def millisec(timeStr):
    """Convert an 'H:MM:SS.fff' timestamp string to integer milliseconds."""
    parts = timeStr.split(":")
    total_seconds = int(parts[0]) * 3600 + int(parts[1]) * 60 + float(parts[2])
    return int(total_seconds * 1000)
60
-
61
def group_segments(dzs):
    """Group diarization lines into consecutive runs.

    A new group starts when the speaker label (the line's last
    whitespace-separated token) changes, or when a segment's end time is
    earlier than the running maximum (segment engulfed by a previous one).
    """
    groups = []
    current = []
    last_end = 0

    for line in dzs:
        # Speaker changed → close out the current run before appending.
        if current and current[0].split()[-1] != line.split()[-1]:
            groups.append(current)
            current = []

        current.append(line)

        # Second timestamp on the line is the segment's end time.
        end_ms = millisec(re.findall('[0-9]+:[0-9]+:[0-9]+\.[0-9]+', string=line)[1])
        if last_end > end_ms:
            # Engulfed segment: flush immediately, keep last_end unchanged.
            groups.append(current)
            current = []
        else:
            last_end = end_ms

    if current:
        groups.append(current)
    logging.debug(groups)
    return groups
84
-
85
def create_vtt(captions):
    """Assemble a WebVTT document from an iterable of Caption objects."""
    document = WebVTT()
    for cap in captions:
        document.captions.append(cap)
    return document
91
-
92
def get_segments(groups, audio):
    """Slice the audio into per-group chunks of at most MAX_CHUNK_DURATION ms.

    Args:
        groups: output of group_segments() — lists of diarization lines, each
            containing two 'H:MM:SS.fff' timestamps matching TIMESTAMP_PATTERN.
        audio: torch tensor; assumed (channels, frames) at WHISPER_SAMPLE_RATE
            (caller resamples before calling) — TODO confirm.

    Returns:
        (audio_segments, timestamps): numpy slices of the mono waveform and
        their (start, end) timestamp strings, index-aligned.
    """
    # Collapse channels to a mono numpy waveform.
    monoaudio=torch.mean(input=audio,dim=0).numpy()
    audio_segments = []
    timestamps = []
    for g in groups:
        # The current chunk starts as the group's first diarization window.
        cur_start_time, cur_end_time = re.findall(TIMESTAMP_PATTERN, string=g[0])
        cur_start_millisec = millisec(cur_start_time) #- spacermilli
        cur_end_millisec = millisec(cur_end_time) #- spacermilli
        for window in g[1:]:
            start_time, end_time = re.findall(TIMESTAMP_PATTERN, string=window)
            start_millisec = millisec(start_time) #- spacermilli
            end_millisec = millisec(end_time) #- spacermilli
            # Check if new window exceeds chunk size
            seg_duration_with_window=end_millisec-cur_start_millisec
            if seg_duration_with_window>MAX_CHUNK_DURATION: # Segment with window exceeds max chunk duration
                # Flush the current chunk and restart the chunk at this window.
                start_frame, end_frame = cur_start_millisec*WHISPER_SAMPLE_RATE//1000, cur_end_millisec*WHISPER_SAMPLE_RATE//1000
                audio_segments.append(monoaudio[start_frame:end_frame])
                timestamps.append((cur_start_time, cur_end_time))
                cur_start_time, cur_end_time = start_time, end_time
                cur_start_millisec, cur_end_millisec = start_millisec, end_millisec
            else:
                # Still within budget: extend the current chunk over this window.
                cur_end_time=end_time
                cur_end_millisec=end_millisec
        # Final update: flush whatever remains of this group's last chunk.
        start_frame, end_frame = cur_start_millisec*WHISPER_SAMPLE_RATE//1000, cur_end_millisec*WHISPER_SAMPLE_RATE//1000
        audio_segments.append(monoaudio[start_frame:end_frame])
        timestamps.append((cur_start_time, cur_end_time))
    return audio_segments, timestamps
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main.py CHANGED
@@ -1,14 +1,43 @@
1
- from fastapi import FastAPI
2
- from holosubs import get_video_vtt
3
  from pydantic import BaseModel
 
 
 
 
 
 
4
 
5
  class Url(BaseModel):
6
  url: str
 
7
 
 
 
 
8
 
9
  app = FastAPI()
10
 
11
  @app.post("/captions/")
12
  def read_root(url: Url):
13
- vtt_captions = get_video_vtt(url.url)
14
- return {"captions": vtt_captions}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from youtubeaudio import YoutubeAudio
import subprocess
import os
import uuid
import logging
import ffmpeg


class Url(BaseModel):
    url: str


format = "%(asctime)s: %(message)s"
logging.basicConfig(format=format, level=logging.DEBUG,
                    datefmt="%H:%M:%S")
# Model name baked into the image at build time (see Dockerfile ARG/ENV).
MODEL = os.environ['model']

app = FastAPI()

@app.post("/captions/")
def read_root(url: Url):
    """Download a YouTube video's audio, run whisper.cpp on it and return WebVTT captions.

    Raises HTTPException(500) when the whisper binary exits non-zero.
    """
    # Download wav file and get filename.
    # NOTE(review): YoutubeAudio is handed the whole Url model (it reads
    # .url.url internally) — confirm before changing to url.url.
    ytaudio = YoutubeAudio(url)
    ytaudio.download_audio()
    filename = ytaudio.filename
    # Resample to 16 kHz, the rate whisper.cpp expects.
    ytaudio.resample('16k')
    # Generate subtitles into a unique output path.
    output_file = os.path.join("/tmp/holosubs/results", str(uuid.uuid4()))
    logging.info(f'Writing to file {output_file}.vtt')
    cmd = ['/usr/local/bin/whisper', '-m', f'/root/models/ggml-{MODEL}.bin',
           '-f', filename, '-di', '-of', output_file, '-tr', '-ovtt', '-t', '8']
    # stderr is merged into stdout, so communicate() always returns err=None;
    # the old `if err:` check could never fire. Detect failure via the exit code.
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    output, _ = p.communicate()
    logging.info(output)
    if p.returncode != 0:
        logging.error("Whisper translation failed with exit code %s", p.returncode)
        raise HTTPException(status_code=500, detail="Whisper translation failed")
    vtt_path = output_file + ".vtt"
    with open(vtt_path, 'r') as f:
        raw_vtt = f.read()
    # Remove the result file once its contents are in memory.
    os.remove(vtt_path)
    return {"captions": raw_vtt}
requirements.txt CHANGED
@@ -1,16 +1,5 @@
1
  fastapi==0.74.*
2
  requests==2.27.*
3
- sentencepiece==0.1.*
4
  uvicorn[standard]==0.17.*
5
- numpy==1.24.4
6
- pyannote.audio==1.1.2
7
- pyannote.core==5.0.0
8
- pyannote.database==5.0.1
9
- pyannote.metrics==3.2.1
10
- pyannote.pipeline==1.5.2
11
- python-dotenv==1.0.0
12
- torch==2.0.1
13
- torchaudio==2.0.2
14
- transformers==4.31.0
15
- webvtt_py==0.4.6
16
- yt_dlp==2023.7.6
 
1
  fastapi==0.74.*
2
  requests==2.27.*
 
3
  uvicorn[standard]==0.17.*
4
+ yt_dlp==2023.7.6
5
+ ffmpeg-python==0.2.0
 
 
 
 
 
 
 
 
 
 
transcribe.py DELETED
@@ -1,78 +0,0 @@
1
- """
2
- Represents a model that transcribes and translates audio.
3
- """
4
-
5
- import logging
6
- import os
7
- from typing import Union
8
-
9
- import numpy as np
10
- import torch
11
- from dotenv import load_dotenv
12
- from peft import PeftConfig, PeftModel
13
- from transformers import (AutomaticSpeechRecognitionPipeline,
14
- WhisperForConditionalGeneration, WhisperProcessor,
15
- WhisperTokenizer)
16
-
17
- load_dotenv()
18
- format = "%(asctime)s: %(message)s"
19
- logging.basicConfig(format=format, level=logging.DEBUG,
20
- datefmt="%H:%M:%S")
21
-
22
class Transcriber:
    """Transcribes/translates audio with a LoRA (PEFT)-adapted Whisper model.

    Loads the base Whisper checkpoint named in the PEFT config, applies the
    adapter, and wraps everything in a transformers ASR pipeline configured
    for the given language/task.
    """

    def __init__(self, model_id="teoha/openai-whisper-medium-LORA-ja", language="Japanese", task="translate"):
        """Build the ASR pipeline for the given adapter id, language and task."""
        self.language=language
        self.task=task
        # The default model_id is truthy, so the env fallback only applies
        # when a caller explicitly passes a falsy model_id.
        peft_model_id = model_id if model_id else os.getenv('peft_model_id')
        # TODO: Fix Download and install model locally
        # self.install_model(peft_model_id)
        self.initialize_pipe(peft_model_id) #initialize pipe

    def install_model(self, peft_model_id:str) -> None:
        """Download base model + adapter and save model/tokenizer/processor
        under $install_location/<peft_model_id> (currently unused; see the
        TODO in __init__)."""
        save_location = os.path.join(os.getenv('install_location'), peft_model_id)
        offload_location = os.path.join(os.getenv('install_location'), "offload")
        #Save Model
        peft_config = PeftConfig.from_pretrained(peft_model_id)
        model = WhisperForConditionalGeneration.from_pretrained(
            peft_config.base_model_name_or_path,
            load_in_8bit=False, device_map="auto"
        )
        # NOTE(review): offload_folder is the literal string "offload_location",
        # not the offload_location variable computed above — looks like a bug.
        model = PeftModel.from_pretrained(model, peft_model_id, offload_folder="offload_location")
        model.save_pretrained(save_location)

        #Save tokenizer/processor
        tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=self.language, task=self.task)
        processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=self.language, task=self.task)
        tokenizer.save_pretrained(save_location)
        processor.save_pretrained(save_location)
        logging.info("Installation Completed successfully")

    def initialize_pipe(self, peft_model_id: str) -> None:
        """Load base Whisper + PEFT adapter and build the in-memory pipeline."""
        offload_location = os.path.join(os.getenv('install_location'), "offload")
        # Initialize model configs
        peft_config = PeftConfig.from_pretrained(peft_model_id)
        model = WhisperForConditionalGeneration.from_pretrained(peft_config.base_model_name_or_path, load_in_8bit=False, device_map="auto")
        model = PeftModel.from_pretrained(model, peft_model_id, offload_folder=offload_location)
        tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=self.language, task=self.task)
        processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=self.language, task=self.task)
        feature_extractor = processor.feature_extractor
        # Initialize class variables
        self.forced_decoder_ids = processor.get_decoder_prompt_ids(language=self.language, task=self.task)
        self.pipe = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)
        logging.info("Pipe successfully initialized")


    def decode(self, audio: Union[np.ndarray, bytes, str]) -> str:
        '''
        Transcribes a sequence of floats representing an audio snippet.
        Args:
            inputs (:obj:`np.ndarray` or :obj:`bytes` or :obj:`str`):
                The inputs is either a raw waveform (:obj:`np.ndarray` of shape (n, ) of type :obj:`np.float32` or
                :obj:`np.float64`) at the correct sampling rate (no further check will be done) or a :obj:`str` that is
                the filename of the audio file, the file will be read at the correct sampling rate to get the waveform
                using `ffmpeg`. This requires `ffmpeg` to be installed on the system. If `inputs` is :obj:`bytes` it is
                supposed to be the content of an audio file and is interpreted by `ffmpeg` in the same way.
        '''
        # Autocast mixed precision around generation; forced_decoder_ids pin
        # the language/task prompt chosen in initialize_pipe.
        with torch.cuda.amp.autocast():
            text = self.pipe(audio, generate_kwargs={"forced_decoder_ids": self.forced_decoder_ids})["text"]
        return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
youtubeaudio.py CHANGED
@@ -2,13 +2,14 @@
2
  Represents a Youtube video
3
  """
4
 
5
- from dotenv import load_dotenv
6
  import logging
 
 
7
  from yt_dlp import YoutubeDL
8
  import os
9
  from pathlib import Path
 
10
 
11
- load_dotenv()
12
  format = "%(asctime)s: %(message)s"
13
  logging.basicConfig(format=format, level=logging.DEBUG,
14
  datefmt="%H:%M:%S")
@@ -17,7 +18,7 @@ class YoutubeAudio:
17
  def __init__(self, url, dir="/tmp/holosubs/audio"):
18
  self.url=url
19
  self.dir=dir
20
-
21
  def download_audio(self):
22
  ydl_opts = {
23
  'outtmpl': os.path.join(self.dir, "%(id)s_%(epoch)s.%(ext)s"),
@@ -26,12 +27,18 @@ class YoutubeAudio:
26
  'format': 'm4a/bestaudio/best',
27
  'postprocessors': [{ # Extract audio using ffmpeg
28
  'key': 'FFmpegExtractAudio',
29
- 'preferredcodec': 'wav',
30
  }]
31
  }
32
  with YoutubeDL(ydl_opts) as ydl:
33
- error_code = ydl.download([self.url])
34
 
 
 
 
 
 
 
35
  def clean(self):
36
  if not self.filename:
37
  logging.error("Audio not downloaded")
 
2
  Represents a Youtube video
3
  """
4
 
 
5
  import logging
6
+ import shutil
7
+ import uuid
8
  from yt_dlp import YoutubeDL
9
  import os
10
  from pathlib import Path
11
+ import ffmpeg
12
 
 
13
  format = "%(asctime)s: %(message)s"
14
  logging.basicConfig(format=format, level=logging.DEBUG,
15
  datefmt="%H:%M:%S")
 
18
  def __init__(self, url, dir="/tmp/holosubs/audio"):
19
  self.url=url
20
  self.dir=dir
21
+
22
  def download_audio(self):
23
  ydl_opts = {
24
  'outtmpl': os.path.join(self.dir, "%(id)s_%(epoch)s.%(ext)s"),
 
27
  'format': 'm4a/bestaudio/best',
28
  'postprocessors': [{ # Extract audio using ffmpeg
29
  'key': 'FFmpegExtractAudio',
30
+ 'preferredcodec': 'wav'
31
  }]
32
  }
33
  with YoutubeDL(ydl_opts) as ydl:
34
+ error_code = ydl.download([self.url.url])
35
 
36
+ def resample(self,sr='16k'):
37
+ tmp_filename=os.path.join(self.dir,str(uuid.uuid4()))+".wav"
38
+ ffmpeg.input(self.filename).output(tmp_filename,ar=sr).run()
39
+ shutil.move(tmp_filename, self.filename)
40
+ logging.info(f"Succesfuly resampled {self.filename}")
41
+
42
  def clean(self):
43
  if not self.filename:
44
  logging.error("Audio not downloaded")