Commit
·
32a0eda
1
Parent(s):
2f06523
Added new features and improved code formatting:
Browse files- Integrated image captioning using LLAVA.
- Implemented text-to-audio and audio-to-text features using OpenAI Whisper and GTTS.
- Added text translation service with FastAPI endpoints.
- Enhanced code formatting and organization for better readability.
- .gitignore +145 -0
- Dockerfile +21 -0
- examples/text_to speech_example.py +19 -0
- main.py +20 -0
- params.yaml +52 -0
- requirements.txt +8 -0
- src/__init__.py +4 -0
- src/apis/__init__.py +4 -0
- src/apis/img_processing_api.py +61 -0
- src/apis/language_translation_api.py +25 -0
- src/apis/speech_processing_api.py +35 -0
- src/models/__init__.py +4 -0
- src/models/models.py +48 -0
- src/pipeline/__init__.py +4 -0
- src/pipeline/image_processing_pipeline.py +37 -0
- src/pipeline/language_translation_pipeline.py +15 -0
- src/pipeline/speech_processing_pipeline.py +22 -0
- src/services/__init__.py +4 -0
- src/services/image_caption/__init__.py +4 -0
- src/services/image_caption/caption.py +44 -0
- src/services/image_generation/__init__.py +4 -0
- src/services/image_generation/image_generate.py +70 -0
- src/services/language_translation/__init__.py +4 -0
- src/services/language_translation/translation.py +51 -0
- src/services/speech/__init__.py +4 -0
- src/services/speech/speech_to_text.py +41 -0
- src/services/speech/text_to_speech.py +32 -0
- src/utils/__init__.py +4 -0
- src/utils/imutils.py +14 -0
.gitignore
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
pip-wheel-metadata/
|
| 24 |
+
share/python-wheels/
|
| 25 |
+
*.egg-info/
|
| 26 |
+
.installed.cfg
|
| 27 |
+
*.egg
|
| 28 |
+
MANIFEST
|
| 29 |
+
|
| 30 |
+
# PyInstaller
|
| 31 |
+
# Usually these files are written by a python script from a template
|
| 32 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 33 |
+
*.manifest
|
| 34 |
+
*.spec
|
| 35 |
+
|
| 36 |
+
# Installer logs
|
| 37 |
+
pip-log.txt
|
| 38 |
+
pip-delete-this-directory.txt
|
| 39 |
+
|
| 40 |
+
# Unit test / coverage reports
|
| 41 |
+
htmlcov/
|
| 42 |
+
.tox/
|
| 43 |
+
.nox/
|
| 44 |
+
.coverage
|
| 45 |
+
.coverage.*
|
| 46 |
+
.cache
|
| 47 |
+
nosetests.xml
|
| 48 |
+
coverage.xml
|
| 49 |
+
*.cover
|
| 50 |
+
*.py,cover
|
| 51 |
+
.hypothesis/
|
| 52 |
+
.pytest_cache/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
target/
|
| 76 |
+
|
| 77 |
+
# Jupyter Notebook
|
| 78 |
+
.ipynb_checkpoints
|
| 79 |
+
|
| 80 |
+
# IPython
|
| 81 |
+
profile_default/
|
| 82 |
+
ipython_config.py
|
| 83 |
+
|
| 84 |
+
# pyenv
|
| 85 |
+
.python-version
|
| 86 |
+
|
| 87 |
+
# pipenv
|
| 88 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 89 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 90 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 91 |
+
# install all needed dependencies.
|
| 92 |
+
#Pipfile.lock
|
| 93 |
+
|
| 94 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
| 95 |
+
__pypackages__/
|
| 96 |
+
|
| 97 |
+
# Celery stuff
|
| 98 |
+
celerybeat-schedule
|
| 99 |
+
celerybeat.pid
|
| 100 |
+
|
| 101 |
+
# SageMath parsed files
|
| 102 |
+
*.sage.py
|
| 103 |
+
|
| 104 |
+
# Environments
|
| 105 |
+
.env
|
| 106 |
+
.venv
|
| 107 |
+
env/
|
| 108 |
+
venv/
|
| 109 |
+
ENV/
|
| 110 |
+
env.bak/
|
| 111 |
+
venv.bak/
|
| 112 |
+
|
| 113 |
+
# Spyder project settings
|
| 114 |
+
.spyderproject
|
| 115 |
+
.spyproject
|
| 116 |
+
|
| 117 |
+
# Rope project settings
|
| 118 |
+
.ropeproject
|
| 119 |
+
|
| 120 |
+
# mkdocs documentation
|
| 121 |
+
/site
|
| 122 |
+
|
| 123 |
+
# mypy
|
| 124 |
+
.mypy_cache/
|
| 125 |
+
.dmypy.json
|
| 126 |
+
dmypy.json
|
| 127 |
+
|
| 128 |
+
# Pyre type checker
|
| 129 |
+
.pyre/
|
| 130 |
+
|
| 131 |
+
# Machine Learning and Speech Libraries
|
| 132 |
+
# TensorFlow
|
| 133 |
+
*.ckpt*
|
| 134 |
+
*.pbtxt
|
| 135 |
+
*.tfevents*
|
| 136 |
+
# PyTorch
|
| 137 |
+
*.pt
|
| 138 |
+
# Keras
|
| 139 |
+
*.h5
|
| 140 |
+
# Scikit-learn
|
| 141 |
+
*.pkl
|
| 142 |
+
# Speech Recognition
|
| 143 |
+
*.wav
|
| 144 |
+
*.mp3
|
| 145 |
+
.idea/
|
Dockerfile
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim

WORKDIR /learnableai

# System packages first: this layer rarely changes, so it stays cached.
RUN apt-get update && apt-get install -y --no-install-recommends \
        libgl1-mesa-glx \
        build-essential \
        cmake \
        git \
        ffmpeg \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

RUN pip install --no-cache-dir --upgrade pip

# Copy only requirements.txt before installing so the (slow) pip layer is
# cached across source-code changes. The original copied the whole tree
# first, forcing a full reinstall on every code edit.
COPY requirements.txt /learnableai/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

COPY . /learnableai

EXPOSE 7860

CMD ["python", "main.py", "--host", "0.0.0.0", "--port", "7860"]
|
examples/text_to speech_example.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Created By: ishwor subedi
|
| 3 |
+
Date: 2024-08-13
|
| 4 |
+
"""
|
| 5 |
+
import base64
|
| 6 |
+
import random
|
| 7 |
+
|
| 8 |
+
from src.services.speech.text_to_speech import TextToSpeech
|
| 9 |
+
|
| 10 |
+
if __name__ == "__main__":
|
| 11 |
+
tts = TextToSpeech()
|
| 12 |
+
paragraph = "Nepal, nestled between India and China, is renowned for its breathtaking landscapes and rich cultural heritage. Home to the Himalayas, including the world’s highest peak, Mount Everest, Nepal’s terrain varies from lush Terai plains to towering alpine regions. The country boasts a rich history with ancient temples and royal palaces, reflecting a blend of Hindu and Buddhist influences. Kathmandu Valley, a UNESCO World Heritage Site, showcases the artistic splendor of the Malla kings. Nepali, the official language, is spoken alongside over 120 other languages. Hinduism is the predominant religion, followed by Buddhism, Islam, and Christianity. Festivals like Dashain and Tihar highlight the vibrant culture. Agriculture remains the backbone of Nepal’s economy, though tourism also plays a crucial role, with visitors drawn to trekking and mountaineering adventures. Despite challenges such as political instability and natural disasters, including the 2015 earthquake, Nepal is making strides in development and recovery. The country’s commitment to sustainable growth and eco-tourism aims to preserve its natural beauty while promoting economic progress. Key attractions include Everest, Kathmandu’s historic sites, Pokhara’s stunning lakes, and Chitwan National Park’s diverse wildlife."
|
| 13 |
+
lang = 'en'
|
| 14 |
+
tld = 'com'
|
| 15 |
+
|
| 16 |
+
# Use the sentence_audio_generator to get base64-encoded audio for each sentence
|
| 17 |
+
for base64_audio in tts.sentence_audio_generator(paragraph, lang, tld):
|
| 18 |
+
with open(f"audio{random.randint(0, 1000000)}.mp3", "wb") as f:
|
| 19 |
+
f.write(base64.b64decode(base64_audio))
|
main.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI application entry point for the LearnableAI service.

Exposes ``learnable_ai`` for ASGI servers (``uvicorn main:learnable_ai``)
and supports direct execution with ``--host``/``--port`` flags.
"""
import argparse

import uvicorn
from fastapi import FastAPI

from src.apis.language_translation_api import language_translation_router
from src.apis.img_processing_api import image_processing_router
from src.apis.speech_processing_api import speech_transcription_router

learnable_ai = FastAPI()
learnable_ai.include_router(language_translation_router, prefix='/translator', tags=["Language Translation"])
learnable_ai.include_router(image_processing_router, prefix='/image', tags=["Image Processing"])
learnable_ai.include_router(speech_transcription_router, prefix='/speech', tags=["Speech Processing"])


if __name__ == "__main__":
    # Bug fix: argparse previously ran at module import time, so importing
    # this module (e.g. via an ASGI server) consumed sys.argv and could
    # crash on unrecognized server flags. Parse only when run as a script.
    parser = argparse.ArgumentParser(description='LearnableAI API')
    parser.add_argument('--host', type=str, default='0.0.0.0', help='Host IP address')
    parser.add_argument('--port', type=int, default=7860, help='Port number')
    args = parser.parse_args()
    uvicorn.run(learnable_ai, host=args.host, port=args.port)
|
params.yaml
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
style_list: [
|
| 2 |
+
{
|
| 3 |
+
"name": "(No style)",
|
| 4 |
+
"prompt": "{prompt}",
|
| 5 |
+
"negative_prompt": "",
|
| 6 |
+
},
|
| 7 |
+
{
|
| 8 |
+
"name": "Cinematic",
|
| 9 |
+
"prompt": "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy",
|
| 10 |
+
"negative_prompt": "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured",
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"name": "Photographic",
|
| 14 |
+
"prompt": "cinematic photo {prompt} . 35mm photograph, film, bokeh, professional, 4k, highly detailed",
|
| 15 |
+
"negative_prompt": "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly",
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"name": "Anime",
|
| 19 |
+
"prompt": "anime artwork {prompt} . anime style, key visual, vibrant, studio anime, highly detailed",
|
| 20 |
+
"negative_prompt": "photo, deformed, black and white, realism, disfigured, low contrast",
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"name": "Manga",
|
| 24 |
+
"prompt": "manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style",
|
| 25 |
+
"negative_prompt": "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style",
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"name": "Digital Art",
|
| 29 |
+
"prompt": "concept art {prompt} . digital artwork, illustrative, painterly, matte painting, highly detailed",
|
| 30 |
+
"negative_prompt": "photo, photorealistic, realism, ugly",
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"name": "Pixel art",
|
| 34 |
+
"prompt": "pixel-art {prompt} . low-res, blocky, pixel art style, 8-bit graphics",
|
| 35 |
+
"negative_prompt": "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic",
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"name": "Fantasy art",
|
| 39 |
+
"prompt": "ethereal fantasy concept art of {prompt} . magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy",
|
| 40 |
+
"negative_prompt": "photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, sloppy, duplicate, mutated, black and white",
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"name": "Neonpunk",
|
| 44 |
+
"prompt": "neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional",
|
| 45 |
+
"negative_prompt": "painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured",
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"name": "3D Model",
|
| 49 |
+
"prompt": "professional 3d model {prompt} . octane render, highly detailed, volumetric, dramatic lighting",
|
| 50 |
+
"negative_prompt": "ugly, deformed, noisy, low poly, blurry, painting",
|
| 51 |
+
},
|
| 52 |
+
]
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Runtime dependencies for LearnableAI.
scipy
bitsandbytes==0.41.3
accelerate==0.25.0
git+https://github.com/huggingface/transformers.git
deep-translator
peft
diffusers
gtts
# Missing from the original list but imported by the code:
fastapi          # main.py, src/apis/*
uvicorn          # main.py server
python-multipart # required by FastAPI UploadFile endpoints
torch            # pipelines (BitsAndBytesConfig dtype, SDXL)
Pillow           # PIL.Image throughout
numpy            # image pipeline / generator
PyYAML           # params.yaml parsing via yaml_read
six              # src/apis/img_processing_api.py imports BytesIO from six
|
src/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Created By: ishwor subedi
|
| 3 |
+
Date: 2024-08-13
|
| 4 |
+
"""
|
src/apis/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Created By: ishwor subedi
|
| 3 |
+
Date: 2024-08-13
|
| 4 |
+
"""
|
src/apis/img_processing_api.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Created By: ishwor subedi
|
| 3 |
+
Date: 2024-08-13
|
| 4 |
+
"""
|
| 5 |
+
from PIL import Image
|
| 6 |
+
from fastapi import HTTPException, UploadFile
|
| 7 |
+
from fastapi.routing import APIRouter
|
| 8 |
+
from six import BytesIO
|
| 9 |
+
|
| 10 |
+
from src.models.models import ImageCaptionRequest, ImageGenerationRequest, LanguageTranslationRequest
|
| 11 |
+
from src.pipeline.image_processing_pipeline import ImageProcessingPipeline
|
| 12 |
+
|
| 13 |
+
image_processing_pipeline = ImageProcessingPipeline()
|
| 14 |
+
image_processing_router = APIRouter()
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@image_processing_router.post("/generate_image")
async def generate_image(request: ImageGenerationRequest):
    """Generate an image from a text prompt and return it base64-encoded.

    Returns a JSON body ``{'image': <base64 PNG>, 'status_code': 200}``;
    any pipeline failure is surfaced as HTTP 500.
    """
    import base64

    try:
        result = image_processing_pipeline.generate_image(
            request.prompt,
            request.negative_prompt,
            request.style,
            request.use_negative_prompt,
            request.num_inference_steps,
            request.num_images_per_prompt,
            request.seed,
            request.width,
            request.height,
            request.guidance_scale,
            request.randomize_seed
        )
        # Bug fix: the underlying generator returns an (images, seed) tuple
        # of a *list* of PIL images, but the original code called
        # `.save()` directly on the tuple (AttributeError at runtime).
        # Accept both shapes defensively.
        images = result[0] if isinstance(result, tuple) else result
        image = images[0] if isinstance(images, (list, tuple)) else images

        buffer = BytesIO()
        image.save(buffer, format='PNG')
        # Bug fix: raw PNG bytes are not JSON-serializable; encode to a
        # base64 ASCII string so the response body is valid JSON.
        encoded = base64.b64encode(buffer.getvalue()).decode('ascii')
        return {'image': encoded, 'status_code': 200}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@image_processing_router.post("/generate_caption")
async def generate_caption(request: ImageCaptionRequest, image: UploadFile = File(...)):
    """Caption an uploaded image via the LLAVA image-processing pipeline.

    Bug fixes vs. original:
    - The parameter was annotated ``image: Image`` (a PIL class) with default
      ``UploadFile(...)``, which is not a valid FastAPI file declaration;
      it must be ``UploadFile = File(...)``.
    - ``await image.read()`` ran *outside* the try block, so upload or
      decode failures bypassed the 500 handler entirely.
    """
    try:
        pil_image = Image.open(BytesIO(await image.read())).convert('RGB')
        caption = image_processing_pipeline.generate_caption(
            pil_image,
            request.prompt,
            request.temperature,
            request.length_penalty,
            request.repetition_penalty,
            request.max_length,
            request.min_length,
            request.top_p
        )
        return {'caption': caption, 'status_code': 200}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
src/apis/language_translation_api.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Created By: ishwor subedi
|
| 3 |
+
Date: 2024-08-13
|
| 4 |
+
"""
|
| 5 |
+
from fastapi import HTTPException
|
| 6 |
+
from fastapi.routing import APIRouter
|
| 7 |
+
from src.models.models import LanguageTranslationRequest
|
| 8 |
+
from src.pipeline.language_translation_pipeline import LanguageTranslationPipeline
|
| 9 |
+
|
| 10 |
+
language_translation_router = APIRouter()
|
| 11 |
+
|
| 12 |
+
language_translation_pipeline = LanguageTranslationPipeline()
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@language_translation_router.post("/translate")
|
| 16 |
+
async def translate_text(language_translation_info: LanguageTranslationRequest):
|
| 17 |
+
try:
|
| 18 |
+
translated_text = language_translation_pipeline.translate_text(
|
| 19 |
+
text=language_translation_info.text,
|
| 20 |
+
translator_backend_code=language_translation_info.translator_backend_code,
|
| 21 |
+
target=language_translation_info.target
|
| 22 |
+
)
|
| 23 |
+
return {"translated_text": translated_text, 'status': 200}
|
| 24 |
+
except Exception as e:
|
| 25 |
+
raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {str(e)}")
|
src/apis/speech_processing_api.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Created By: ishwor subedi
|
| 3 |
+
Date: 2024-08-13
|
| 4 |
+
"""
|
| 5 |
+
from fastapi import HTTPException
|
| 6 |
+
from fastapi.routing import APIRouter
|
| 7 |
+
from src.models.models import SpeechToTextRequest, TextToSpeechRequest
|
| 8 |
+
from src.pipeline.speech_processing_pipeline import SpeechTranscriptionPipeline
|
| 9 |
+
from fastapi.responses import JSONResponse
|
| 10 |
+
|
| 11 |
+
speech_transcription_router = APIRouter()
|
| 12 |
+
|
| 13 |
+
speech_transcription_pipeline = SpeechTranscriptionPipeline()
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@speech_transcription_router.post("/speech_to_text")
async def speech_to_text(request: SpeechToTextRequest):
    """Transcribe the request's audio into text via the Whisper pipeline."""
    try:
        transcript = speech_transcription_pipeline.speech_to_text(request.audio, request.lang)
        return {'transcript': transcript, 'status_code': 200}
    except Exception as e:
        # Bug fix: was `detail=str` — the `str` builtin itself, not the
        # message — so clients received "<class 'str'>" instead of the error.
        raise HTTPException(status_code=500, detail=str(e))
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@speech_transcription_router.post("/text_to_speech")
|
| 27 |
+
async def text_to_speech(request: TextToSpeechRequest):
|
| 28 |
+
try:
|
| 29 |
+
audio_bytes = speech_transcription_pipeline.text_to_speech(request.text, request.lang, request.tld)
|
| 30 |
+
if not audio_bytes:
|
| 31 |
+
raise ValueError("Audio generation failed.")
|
| 32 |
+
return JSONResponse(content={"audio": audio_bytes, "status_code": 200}, status_code=200)
|
| 33 |
+
|
| 34 |
+
except Exception as e:
|
| 35 |
+
raise HTTPException(status_code=500, detail="Internal Server Error")
|
src/models/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Created By: ishwor subedi
|
| 3 |
+
Date: 2024-08-13
|
| 4 |
+
"""
|
src/models/models.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Created By: ishwor subedi
|
| 3 |
+
Date: 2024-08-13
|
| 4 |
+
"""
|
| 5 |
+
from PIL import Image
|
| 6 |
+
from pydantic import BaseModel, Field
|
| 7 |
+
from typing import List, Dict
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class LanguageTranslationRequest(BaseModel):
|
| 11 |
+
text: str = Field(..., description="The text to translate")
|
| 12 |
+
translator_backend_code: str = Field(..., description="The code for the translation backend")
|
| 13 |
+
target: str = Field(..., description="The target language")
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class ImageGenerationRequest(BaseModel):
|
| 17 |
+
prompt: str = Field(..., description="The prompt for image generation")
|
| 18 |
+
negative_prompt: str = Field(..., description="The negative prompt for image generation")
|
| 19 |
+
style: str = Field(..., description="The style for image generation")
|
| 20 |
+
use_negative_prompt: bool = Field(..., description="Whether to use the negative prompt")
|
| 21 |
+
num_inference_steps: int = Field(..., description="The number of inference steps")
|
| 22 |
+
num_images_per_prompt: int = Field(..., description="The number of images per prompt")
|
| 23 |
+
seed: int = Field(..., description="The seed for image generation")
|
| 24 |
+
width: int = Field(..., description="The width of the image")
|
| 25 |
+
height: int = Field(..., description="The height of the image")
|
| 26 |
+
guidance_scale: float = Field(..., description="The guidance scale for image generation")
|
| 27 |
+
randomize_seed: bool = Field(..., description="Whether to randomize the seed")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class ImageCaptionRequest(BaseModel):
|
| 31 |
+
prompt: str = Field(..., description="The prompt for image captioning")
|
| 32 |
+
temperature: float = Field(..., description="The temperature for image captioning")
|
| 33 |
+
length_penalty: float = Field(..., description="The length penalty for image captioning")
|
| 34 |
+
repetition_penalty: float = Field(..., description="The repetition penalty for image captioning")
|
| 35 |
+
max_length: int = Field(..., description="The maximum length for image captioning")
|
| 36 |
+
min_length: int = Field(..., description="The minimum length for image captioning")
|
| 37 |
+
top_p: float = Field(..., description="The top-p for image captioning")
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class TextToSpeechRequest(BaseModel):
|
| 41 |
+
text: str = Field(..., description="The text to convert to speech")
|
| 42 |
+
lang: str = Field(..., description="The language of the text")
|
| 43 |
+
tld: str = Field(..., description="The TLD of the language")
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class SpeechToTextRequest(BaseModel):
|
| 47 |
+
audio: str = Field(..., description="The audio to convert to text")
|
| 48 |
+
lang: str = Field(..., description="The language of the audio")
|
src/pipeline/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Created By: ishwor subedi
|
| 3 |
+
Date: 2024-08-13
|
| 4 |
+
"""
|
src/pipeline/image_processing_pipeline.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Created By: ishwor subedi
|
| 3 |
+
Date: 2024-08-13
|
| 4 |
+
"""
|
| 5 |
+
import numpy as np
|
| 6 |
+
import torch
|
| 7 |
+
from PIL import Image
|
| 8 |
+
from transformers import BitsAndBytesConfig
|
| 9 |
+
|
| 10 |
+
from src.services.image_caption.caption import ImageCaption
|
| 11 |
+
from src.services.image_generation.image_generate import ImageGenerator
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ImageProcessingPipeline:
|
| 15 |
+
def __init__(self):
|
| 16 |
+
quantization_config = BitsAndBytesConfig(
|
| 17 |
+
load_in_4bit=True,
|
| 18 |
+
bnb_4bit_compute_dtype=torch.float16
|
| 19 |
+
)
|
| 20 |
+
self.image_caption = ImageCaption(model_id="llava-hf/llava-1.5-7b-hf", quantization_config=quantization_config)
|
| 21 |
+
self.image_generator = ImageGenerator()
|
| 22 |
+
|
| 23 |
+
def generate_image(self, prompt, negative_prompt, style, use_negative_prompt, num_inference_steps,
|
| 24 |
+
num_images_per_prompt, seed, width, height, guidance_scale, randomize_seed) -> Image:
|
| 25 |
+
image = self.image_generator.generate_image(prompt=prompt, negative_prompt=negative_prompt, style=style,
|
| 26 |
+
use_negative_prompt=use_negative_prompt,
|
| 27 |
+
num_inference_steps=num_inference_steps,
|
| 28 |
+
num_images_per_prompt=num_images_per_prompt, seed=seed, width=width,
|
| 29 |
+
height=height, guidance_scale=guidance_scale,
|
| 30 |
+
randomize_seed=randomize_seed)
|
| 31 |
+
return image
|
| 32 |
+
|
| 33 |
+
def generate_caption(self, image, prompt, temperature, length_penalty, repetition_penalty, max_length, min_length,
|
| 34 |
+
top_p):
|
| 35 |
+
caption = self.image_caption.generate([], prompt, image, temperature, length_penalty, repetition_penalty,
|
| 36 |
+
max_length, min_length, top_p)
|
| 37 |
+
return caption
|
src/pipeline/language_translation_pipeline.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Created By: ishwor subedi
|
| 3 |
+
Date: 2024-08-13
|
| 4 |
+
"""
|
| 5 |
+
from src.services.language_translation.translation import LanguageTranslation
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class LanguageTranslationPipeline:
|
| 9 |
+
def __init__(self):
|
| 10 |
+
self.language_translation = LanguageTranslation()
|
| 11 |
+
|
| 12 |
+
def translate_text(self, text: str, translator_backend_code: str, target: str) -> str:
|
| 13 |
+
translated_text = self.language_translation.translate_text(text=text, target=target,
|
| 14 |
+
translator=translator_backend_code)
|
| 15 |
+
return translated_text
|
src/pipeline/speech_processing_pipeline.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
Created By: ishwor subedi
Date: 2024-08-13
"""

from src.services.speech.speech_to_text import SpeechToText
from src.services.speech.text_to_speech import TextToSpeech


class SpeechTranscriptionPipeline:
    """Facade over the speech services: speech-to-text and text-to-speech."""

    def __init__(self):
        self.speech_to_text_ = SpeechToText()
        self.text_to_speech_ = TextToSpeech()

    def text_to_speech(self, text: str, lang: str, tld: str) -> list:
        """Convert *text* to speech; returns a list of base64-encoded audio chunks.

        Bug fix: the original body did ``yield base64_audio`` on the *inner*
        generator, turning this method into a generator-of-a-generator — the
        API's ``if not audio_bytes`` truthiness check was always True and the
        result was never JSON-serializable. Materialize the per-sentence
        chunks eagerly instead.
        """
        return list(self.text_to_speech_.sentence_audio_generator(text, lang, tld))

    def speech_to_text(self, audio, lang: str) -> str:
        """Transcribe *audio* in *lang*; the timestamped variant is discarded."""
        transcript_with_timestamp, transcript = self.speech_to_text_.transcribe_audio(audio=audio, language=lang)
        return transcript
|
src/services/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Created By: ishwor subedi
|
| 3 |
+
Date: 2024-08-13
|
| 4 |
+
"""
|
src/services/image_caption/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Created By: ishwor subedi
|
| 3 |
+
Date: 2024-08-13
|
| 4 |
+
"""
|
src/services/image_caption/caption.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import copy
|
| 3 |
+
from transformers import pipeline
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class ImageCaption:
|
| 7 |
+
def __init__(self, model_id, quantization_config):
|
| 8 |
+
self.pipe = pipeline("image-to-text", model=model_id, model_kwargs={"quantization_config": quantization_config})
|
| 9 |
+
|
| 10 |
+
def infer(self, image, prompt, temperature, length_penalty, repetition_penalty, max_length, min_length, top_p):
|
| 11 |
+
outputs = self.pipe(images=image, prompt=prompt,
|
| 12 |
+
generate_kwargs={
|
| 13 |
+
"temperature": temperature,
|
| 14 |
+
"length_penalty": length_penalty,
|
| 15 |
+
"repetition_penalty": repetition_penalty,
|
| 16 |
+
"max_length": max_length,
|
| 17 |
+
"min_length": min_length,
|
| 18 |
+
"top_p": top_p})
|
| 19 |
+
return outputs[0]["generated_text"]
|
| 20 |
+
|
| 21 |
+
def extract_response_pairs(self, text):
|
| 22 |
+
turns = re.split(r'(USER:|ASSISTANT:)', text)[1:]
|
| 23 |
+
turns = [turn.strip() for turn in turns if turn.strip()]
|
| 24 |
+
conv_list = []
|
| 25 |
+
for i in range(0, len(turns[1::2]), 2):
|
| 26 |
+
if i + 1 < len(turns[1::2]):
|
| 27 |
+
conv_list.append([turns[1::2][i].lstrip(":"), turns[1::2][i + 1].lstrip(":")])
|
| 28 |
+
return conv_list
|
| 29 |
+
|
| 30 |
+
def add_text(self, history, text):
|
| 31 |
+
history.append([text, None])
|
| 32 |
+
return history, text
|
| 33 |
+
|
| 34 |
+
def generate(self, history_chat, text_input, image, temperature, length_penalty, repetition_penalty, max_length,
|
| 35 |
+
min_length, top_p):
|
| 36 |
+
chat_history = " ".join(history_chat)
|
| 37 |
+
chat_history += f"USER: <image>\n{text_input}\nASSISTANT:"
|
| 38 |
+
|
| 39 |
+
inference_result = self.infer(image, chat_history, temperature, length_penalty, repetition_penalty, max_length,
|
| 40 |
+
min_length, top_p)
|
| 41 |
+
chat_val = self.extract_response_pairs(inference_result)
|
| 42 |
+
|
| 43 |
+
chat_state_list = copy.deepcopy(chat_val)
|
| 44 |
+
return chat_state_list
|
src/services/image_generation/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Created By: ishwor subedi
|
| 3 |
+
Date: 2024-08-13
|
| 4 |
+
"""
|
src/services/image_generation/image_generate.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Created By: ishwor subedi
|
| 3 |
+
Date: 2024-08-13
|
| 4 |
+
"""
|
| 5 |
+
import random
|
| 6 |
+
import numpy as np
|
| 7 |
+
import torch
|
| 8 |
+
from typing import Tuple, List
|
| 9 |
+
from diffusers import StableDiffusionXLPipeline
|
| 10 |
+
from PIL import Image
|
| 11 |
+
from src.utils.imutils import yaml_read
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ImageGenerator:
    """SDXL text-to-image generator with YAML-configured prompt style presets."""

    def __init__(self, model_name: str = "RunDiffusion/Juggernaut-X-v10", device: str = "cuda"):
        """Load the SDXL pipeline onto *device* and read the style presets."""
        pipeline = StableDiffusionXLPipeline.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
        )
        pipeline.to(device)
        self.pipe = pipeline
        self.MAX_SEED = np.iinfo(np.int32).max
        self.styles = self._initialize_styles()
        self.DEFAULT_STYLE_NAME = "(No style)"

    def _initialize_styles(self):
        """Map each style name from params.yaml to its (prompt, negative_prompt) pair."""
        presets = yaml_read("params.yaml")['style_list']
        styles = {}
        for preset in presets:
            styles[preset["name"]] = (preset["prompt"], preset["negative_prompt"])
        return styles

    def randomize_seed_fn(self, seed: int, randomize_seed: bool) -> int:
        """Return a fresh random seed when *randomize_seed* is set, else *seed* unchanged."""
        return random.randint(0, self.MAX_SEED) if randomize_seed else seed

    def apply_style(self, style_name: str, positive: str, negative: str = "") -> Tuple[str, str]:
        """Merge the user's prompts into the chosen style template.

        Unknown style names fall back to the default "(No style)" preset.
        The style's own negative prompt is always prepended to *negative*.
        """
        template, style_negative = self.styles.get(
            style_name, self.styles[self.DEFAULT_STYLE_NAME]
        )
        negative = negative or ""
        return template.replace("{prompt}", positive), style_negative + negative

    def generate_image(self, prompt: str,
                       negative_prompt: str = "",
                       style: str = None,
                       use_negative_prompt: bool = False,
                       num_inference_steps: int = 30,
                       num_images_per_prompt: int = 1,
                       seed: int = 0,
                       width: int = 1024,
                       height: int = 1024,
                       guidance_scale: float = 3,
                       randomize_seed: bool = False,
                       ) -> Tuple[List[Image.Image], int]:
        """Run the diffusion pipeline and return (generated PIL images, seed used)."""
        if style is None:
            style = self.DEFAULT_STYLE_NAME
        seed = self.randomize_seed_fn(seed, randomize_seed)
        if not use_negative_prompt:
            # Caller opted out of a custom negative prompt; the style's own
            # negative prompt is still applied by apply_style below.
            negative_prompt = ""
        prompt, negative_prompt = self.apply_style(style, prompt, negative_prompt)

        outputs = self.pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            width=width,
            height=height,
            guidance_scale=guidance_scale,
            num_inference_steps=num_inference_steps,
            num_images_per_prompt=num_images_per_prompt,
            cross_attention_kwargs={"scale": 0.65},
            output_type="pil",
        )
        return outputs.images, seed
src/services/language_translation/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Created By: ishwor subedi
|
| 3 |
+
Date: 2024-08-13
|
| 4 |
+
"""
|
src/services/language_translation/translation.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Created By: ishwor subedi
|
| 3 |
+
Date: 2024-08-13
|
| 4 |
+
"""
|
| 5 |
+
from deep_translator import GoogleTranslator, BaiduTranslator, MicrosoftTranslator, YandexTranslator, \
|
| 6 |
+
MyMemoryTranslator, PonsTranslator, LingueeTranslator
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class LanguageTranslation:
    """Thin wrapper around the deep_translator back-ends.

    Translator codes accepted by ``translate_text``:
    TT01 -> Google, TT02 -> MyMemory, TT03 -> Pons, TT04 -> Linguee.
    """

    def __init__(self):
        pass

    def translate_text_google(self, text, target):
        """Translate *text* into *target* with Google Translate (auto-detected source)."""
        return GoogleTranslator(source='auto', target=target).translate(text)

    def translate_text_baidu(self, text, target):
        """Translate *text* into *target* with Baidu (auto-detected source)."""
        return BaiduTranslator(source='auto', target=target).translate(text)

    def translate_text_microsoft(self, text, target):
        """Translate *text* into *target* with Microsoft Translator (auto-detected source)."""
        return MicrosoftTranslator(source='auto', target=target).translate(text)

    def translate_text_yandex(self, text, target):
        """Translate *text* into *target* with Yandex (auto-detected source)."""
        return YandexTranslator(source='auto', target=target).translate(text)

    def translate_text_my_memory(self, text, target):
        """Translate *text* into *target* with MyMemory (auto-detected source)."""
        return MyMemoryTranslator(source='auto', target=target).translate(text)

    def translate_text_pons(self, text, target):
        """Translate *text* into *target* with Pons (auto-detected source)."""
        return PonsTranslator(source='auto', target=target).translate(text)

    def translate_text_linguee(self, text, target):
        """Translate *text* into *target* with Linguee (auto-detected source)."""
        return LingueeTranslator(source='auto', target=target).translate(text)

    def translate_text(self, text, target, translator):
        """Dispatch to the back-end selected by the *translator* code.

        Returns the translated text, or the string "Invalid translator"
        for an unrecognized code.
        """
        backends = {
            "TT01": self.translate_text_google,
            "TT02": self.translate_text_my_memory,
            "TT03": self.translate_text_pons,
            "TT04": self.translate_text_linguee,
        }
        handler = backends.get(translator)
        if handler is None:
            return "Invalid translator"
        return handler(text, target)
|
src/services/speech/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Created By: ishwor subedi
|
| 3 |
+
Date: 2024-08-13
|
| 4 |
+
"""
|
src/services/speech/speech_to_text.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Created By: ishwor subedi
|
| 3 |
+
Date: 2024-07-31
|
| 4 |
+
"""
|
| 5 |
+
import torch
|
| 6 |
+
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class SpeechToText:
    """Transcription/translation service built on openai/whisper-large-v3."""

    def __init__(self):
        # Prefer GPU with fp16 when available; otherwise CPU with fp32.
        cuda_available = torch.cuda.is_available()
        self.device = "cuda:0" if cuda_available else "cpu"
        self.torch_dtype = torch.float16 if cuda_available else torch.float32

        model_id = "openai/whisper-large-v3"

        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
        ).to(self.device)
        self.processor = AutoProcessor.from_pretrained(model_id)
        self.speech_to_text_pipeline = self.pipeline()

    def pipeline(self, max_new_tokens=128, chunk_length_s=30, batch_size=16):
        """Build the HF automatic-speech-recognition pipeline around the loaded model.

        NOTE(review): this method name shadows ``transformers.pipeline``; the
        call below still resolves to the module-level import, but renaming the
        method would be safer if no external callers depend on it — confirm.
        """
        asr = pipeline(
            "automatic-speech-recognition",
            model=self.model,
            tokenizer=self.processor.tokenizer,
            feature_extractor=self.processor.feature_extractor,
            max_new_tokens=max_new_tokens,  # max number of tokens to generate at a time
            chunk_length_s=chunk_length_s,  # length of audio chunks to process at a time
            batch_size=batch_size,  # number of chunks to process at a time
            return_timestamps=True,
            torch_dtype=self.torch_dtype,
            device=self.device,
        )
        return asr

    def transcribe_audio(self, audio, language: str = "en"):
        """Run the ASR pipeline on *audio* and return (timestamped chunks, full text)."""
        result = self.speech_to_text_pipeline(audio, return_timestamps=True,
                                              generate_kwargs={"language": language, "task": "translate"})
        return result["chunks"], result["text"]
src/services/speech/text_to_speech.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import random
|
| 3 |
+
import re
|
| 4 |
+
from io import BytesIO
|
| 5 |
+
from typing import List, Generator
|
| 6 |
+
|
| 7 |
+
from gtts import gTTS
|
| 8 |
+
from gtts.tokenizer import pre_processors
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class TextToSpeech:
    """Convert text to base64-encoded MP3 audio, one sentence at a time, via gTTS."""

    def __init__(self):
        # gTTS text pre-processors applied to each sentence before synthesis.
        self.preprocessing = [
            pre_processors.tone_marks,
            pre_processors.end_of_line,
            pre_processors.word_sub,
            pre_processors.abbreviations,
        ]

    def _convert_sentence(self, text: str, lang: str, tld: str) -> bytes:
        """Synthesize *text* in language *lang* (accent via *tld*) and return MP3 bytes."""
        buffer = BytesIO()
        speech = gTTS(text=text, lang=lang, slow=False, tld=tld,
                      pre_processor_funcs=self.preprocessing)
        speech.write_to_fp(buffer)
        buffer.seek(0)
        return buffer.getvalue()

    def _split_corpus(self, corpus: str) -> List[str]:
        """Split *corpus* into sentences at '.', '!' or '?' followed by spaces."""
        return re.split(r'(?<=[.!?]) +', corpus)

    def sentence_audio_generator(self, paragraph: str, lang: str, tld: str) -> Generator[str, None, None]:
        """Yield each sentence of *paragraph* as a base64-encoded MP3 string."""
        for sentence in self._split_corpus(paragraph):
            audio_bytes = self._convert_sentence(sentence, lang, tld)
            yield base64.b64encode(audio_bytes).decode("utf-8")
src/utils/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Created By: ishwor subedi
|
| 3 |
+
Date: 2024-08-13
|
| 4 |
+
"""
|
src/utils/imutils.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Created By: ishwor subedi
|
| 3 |
+
Date: 2024-08-13
|
| 4 |
+
"""
|
| 5 |
+
import yaml
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def yaml_read(file_path):
    """Read a YAML file and return its parsed contents.

    Args:
        file_path: Path to the YAML file.

    Returns:
        The parsed document (typically a dict), or None when the file
        contains invalid YAML — the parse error is printed and swallowed,
        preserving the existing best-effort contract callers rely on.
    """
    # Explicit encoding so parsing does not depend on the platform default
    # (e.g. cp1252 on Windows would corrupt non-ASCII config values).
    with open(file_path, 'r', encoding='utf-8') as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
            return None
|