ishworrsubedii committed on
Commit 32a0eda · 1 Parent(s): 2f06523

Added new features and improved code formatting:

- Integrated image captioning using LLaVA.
- Implemented text-to-speech and speech-to-text using gTTS and OpenAI Whisper.
- Added a text translation service with FastAPI endpoints.
- Improved code formatting and organization for readability.

.gitignore ADDED
@@ -0,0 +1,145 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # Machine Learning and Speech Libraries
+ # TensorFlow
+ *.ckpt*
+ *.pbtxt
+ *.tfevents*
+ # PyTorch
+ *.pt
+ # Keras
+ *.h5
+ # Scikit-learn
+ *.pkl
+ # Speech Recognition
+ *.wav
+ *.mp3
+ .idea/
Dockerfile ADDED
@@ -0,0 +1,21 @@
+ FROM python:3.10-slim
+
+ WORKDIR /learnableai
+
+ COPY . /learnableai
+
+ RUN chmod -R 777 /learnableai
+
+ RUN apt-get update && apt-get install -y libgl1-mesa-glx \
+     build-essential \
+     cmake \
+     git \
+     ffmpeg && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+ RUN pip install --no-cache-dir --upgrade pip
+
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ EXPOSE 7860
+
+ CMD ["python", "main.py", "--host", "0.0.0.0", "--port", "7860"]
examples/text_to speech_example.py ADDED
@@ -0,0 +1,19 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-13
+ """
+ import base64
+ import random
+
+ from src.services.speech.text_to_speech import TextToSpeech
+
+ if __name__ == "__main__":
+     tts = TextToSpeech()
+     paragraph = "Nepal, nestled between India and China, is renowned for its breathtaking landscapes and rich cultural heritage. Home to the Himalayas, including the world’s highest peak, Mount Everest, Nepal’s terrain varies from lush Terai plains to towering alpine regions. The country boasts a rich history with ancient temples and royal palaces, reflecting a blend of Hindu and Buddhist influences. Kathmandu Valley, a UNESCO World Heritage Site, showcases the artistic splendor of the Malla kings. Nepali, the official language, is spoken alongside over 120 other languages. Hinduism is the predominant religion, followed by Buddhism, Islam, and Christianity. Festivals like Dashain and Tihar highlight the vibrant culture. Agriculture remains the backbone of Nepal’s economy, though tourism also plays a crucial role, with visitors drawn to trekking and mountaineering adventures. Despite challenges such as political instability and natural disasters, including the 2015 earthquake, Nepal is making strides in development and recovery. The country’s commitment to sustainable growth and eco-tourism aims to preserve its natural beauty while promoting economic progress. Key attractions include Everest, Kathmandu’s historic sites, Pokhara’s stunning lakes, and Chitwan National Park’s diverse wildlife."
+     lang = 'en'
+     tld = 'com'
+
+     # Use the sentence_audio_generator to get base64-encoded audio for each sentence
+     for base64_audio in tts.sentence_audio_generator(paragraph, lang, tld):
+         with open(f"audio{random.randint(0, 1000000)}.mp3", "wb") as f:
+             f.write(base64.b64decode(base64_audio))
main.py ADDED
@@ -0,0 +1,21 @@
+ import argparse
+
+ import uvicorn
+ from fastapi import FastAPI
+
+ from src.apis.language_translation_api import language_translation_router
+ from src.apis.img_processing_api import image_processing_router
+ from src.apis.speech_processing_api import speech_transcription_router
+
+ learnable_ai = FastAPI()
+ learnable_ai.include_router(language_translation_router, prefix='/translator', tags=["Language Translation"])
+ learnable_ai.include_router(image_processing_router, prefix='/image', tags=["Image Processing"])
+ learnable_ai.include_router(speech_transcription_router, prefix='/speech', tags=["Speech Processing"])
+
+ if __name__ == "__main__":
+     # Parse CLI flags only when run as a script, so importing the app has no side effects.
+     parser = argparse.ArgumentParser(description='LearnableAI API')
+     parser.add_argument('--host', type=str, default='0.0.0.0', help='Host IP address')
+     parser.add_argument('--port', type=int, default=7860, help='Port number')
+     args = parser.parse_args()
+     uvicorn.run(learnable_ai, host=args.host, port=args.port)
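
As a quick smoke test of the wiring above, a minimal client sketch. It assumes the server is running locally on port 7860 and that requests is installed (it is not in requirements.txt); "TT01" routes to the Google backend defined later in this commit, and the target code is illustrative:

    import requests

    resp = requests.post(
        "http://localhost:7860/translator/translate",
        json={"text": "Hello, world", "translator_backend_code": "TT01", "target": "ne"},
    )
    print(resp.json()["translated_text"])

Once the server is up, FastAPI also serves interactive docs for all three routers at http://localhost:7860/docs.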
params.yaml ADDED
@@ -0,0 +1,52 @@
+ style_list: [
+     {
+         "name": "(No style)",
+         "prompt": "{prompt}",
+         "negative_prompt": "",
+     },
+     {
+         "name": "Cinematic",
+         "prompt": "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy",
+         "negative_prompt": "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured",
+     },
+     {
+         "name": "Photographic",
+         "prompt": "cinematic photo {prompt} . 35mm photograph, film, bokeh, professional, 4k, highly detailed",
+         "negative_prompt": "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly",
+     },
+     {
+         "name": "Anime",
+         "prompt": "anime artwork {prompt} . anime style, key visual, vibrant, studio anime, highly detailed",
+         "negative_prompt": "photo, deformed, black and white, realism, disfigured, low contrast",
+     },
+     {
+         "name": "Manga",
+         "prompt": "manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style",
+         "negative_prompt": "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style",
+     },
+     {
+         "name": "Digital Art",
+         "prompt": "concept art {prompt} . digital artwork, illustrative, painterly, matte painting, highly detailed",
+         "negative_prompt": "photo, photorealistic, realism, ugly",
+     },
+     {
+         "name": "Pixel art",
+         "prompt": "pixel-art {prompt} . low-res, blocky, pixel art style, 8-bit graphics",
+         "negative_prompt": "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic",
+     },
+     {
+         "name": "Fantasy art",
+         "prompt": "ethereal fantasy concept art of {prompt} . magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy",
+         "negative_prompt": "photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, sloppy, duplicate, mutated, black and white",
+     },
+     {
+         "name": "Neonpunk",
+         "prompt": "neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional",
+         "negative_prompt": "painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured",
+     },
+     {
+         "name": "3D Model",
+         "prompt": "professional 3d model {prompt} . octane render, highly detailed, volumetric, dramatic lighting",
+         "negative_prompt": "ugly, deformed, noisy, low poly, blurry, painting",
+     },
+ ]
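
Each entry is a template: ImageGenerator.apply_style (added further down in this commit) substitutes the user's prompt for the literal {prompt} token. A one-line illustration in Python, with a made-up prompt:

    template = "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy"
    print(template.replace("{prompt}", "a mountain village at dusk"))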
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ scipy
+ bitsandbytes==0.41.3
+ accelerate==0.25.0
+ git+https://github.com/huggingface/transformers.git
+ deep-translator
+ peft
+ diffusers
+ gtts
+ # Runtime dependencies imported by the application code:
+ fastapi
+ uvicorn
+ torch
+ Pillow
+ PyYAML
+ python-multipart
src/__init__.py ADDED
@@ -0,0 +1,4 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-13
+ """
src/apis/__init__.py ADDED
@@ -0,0 +1,4 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-13
+ """
src/apis/img_processing_api.py ADDED
@@ -0,0 +1,61 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-13
+ """
+ import base64
+ from io import BytesIO
+
+ from PIL import Image
+ from fastapi import File, HTTPException, UploadFile
+ from fastapi.routing import APIRouter
+
+ from src.models.models import ImageCaptionRequest, ImageGenerationRequest
+ from src.pipeline.image_processing_pipeline import ImageProcessingPipeline
+
+ image_processing_pipeline = ImageProcessingPipeline()
+ image_processing_router = APIRouter()
+
+
+ @image_processing_router.post("/generate_image")
+ async def generate_image(request: ImageGenerationRequest):
+     try:
+         image = image_processing_pipeline.generate_image(
+             request.prompt,
+             request.negative_prompt,
+             request.style,
+             request.use_negative_prompt,
+             request.num_inference_steps,
+             request.num_images_per_prompt,
+             request.seed,
+             request.width,
+             request.height,
+             request.guidance_scale,
+             request.randomize_seed
+         )
+         buffer = BytesIO()
+         image.save(buffer, format='PNG')
+         # Base64-encode the PNG bytes so the payload is JSON-serializable.
+         base_64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
+         return {'image': base_64_image, 'status_code': 200}
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @image_processing_router.post("/generate_caption")
+ # NOTE: FastAPI cannot mix a JSON body model with a file upload in one request;
+ # the caption parameters may need to become Form fields.
+ async def generate_caption(request: ImageCaptionRequest, image: UploadFile = File(...)):
+     pil_image = Image.open(BytesIO(await image.read())).convert('RGB')
+
+     try:
+         caption = image_processing_pipeline.generate_caption(
+             pil_image,
+             request.prompt,
+             request.temperature,
+             request.length_penalty,
+             request.repetition_penalty,
+             request.max_length,
+             request.min_length,
+             request.top_p
+         )
+         return {'caption': caption, 'status_code': 200}
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
src/apis/language_translation_api.py ADDED
@@ -0,0 +1,25 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-13
+ """
+ from fastapi import HTTPException
+ from fastapi.routing import APIRouter
+ from src.models.models import LanguageTranslationRequest
+ from src.pipeline.language_translation_pipeline import LanguageTranslationPipeline
+
+ language_translation_router = APIRouter()
+
+ language_translation_pipeline = LanguageTranslationPipeline()
+
+
+ @language_translation_router.post("/translate")
+ async def translate_text(language_translation_info: LanguageTranslationRequest):
+     try:
+         translated_text = language_translation_pipeline.translate_text(
+             text=language_translation_info.text,
+             translator_backend_code=language_translation_info.translator_backend_code,
+             target=language_translation_info.target
+         )
+         return {"translated_text": translated_text, 'status': 200}
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {str(e)}")
src/apis/speech_processing_api.py ADDED
@@ -0,0 +1,35 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-13
+ """
+ from fastapi import HTTPException
+ from fastapi.responses import JSONResponse
+ from fastapi.routing import APIRouter
+ from src.models.models import SpeechToTextRequest, TextToSpeechRequest
+ from src.pipeline.speech_processing_pipeline import SpeechTranscriptionPipeline
+
+ speech_transcription_router = APIRouter()
+
+ speech_transcription_pipeline = SpeechTranscriptionPipeline()
+
+
+ @speech_transcription_router.post("/speech_to_text")
+ async def speech_to_text(request: SpeechToTextRequest):
+     try:
+         transcript = speech_transcription_pipeline.speech_to_text(request.audio, request.lang)
+         return {'transcript': transcript, 'status_code': 200}
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @speech_transcription_router.post("/text_to_speech")
+ async def text_to_speech(request: TextToSpeechRequest):
+     try:
+         # One base64-encoded MP3 segment per sentence of the input text.
+         audio_segments = speech_transcription_pipeline.text_to_speech(request.text, request.lang, request.tld)
+         if not audio_segments:
+             raise ValueError("Audio generation failed.")
+         return JSONResponse(content={"audio": audio_segments, "status_code": 200}, status_code=200)
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
src/models/__init__.py ADDED
@@ -0,0 +1,4 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-13
+ """
src/models/models.py ADDED
@@ -0,0 +1,46 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-13
+ """
+ from pydantic import BaseModel, Field
+
+
+ class LanguageTranslationRequest(BaseModel):
+     text: str = Field(..., description="The text to translate")
+     translator_backend_code: str = Field(..., description="The code for the translation backend")
+     target: str = Field(..., description="The target language")
+
+
+ class ImageGenerationRequest(BaseModel):
+     prompt: str = Field(..., description="The prompt for image generation")
+     negative_prompt: str = Field(..., description="The negative prompt for image generation")
+     style: str = Field(..., description="The style for image generation")
+     use_negative_prompt: bool = Field(..., description="Whether to use the negative prompt")
+     num_inference_steps: int = Field(..., description="The number of inference steps")
+     num_images_per_prompt: int = Field(..., description="The number of images per prompt")
+     seed: int = Field(..., description="The seed for image generation")
+     width: int = Field(..., description="The width of the image")
+     height: int = Field(..., description="The height of the image")
+     guidance_scale: float = Field(..., description="The guidance scale for image generation")
+     randomize_seed: bool = Field(..., description="Whether to randomize the seed")
+
+
+ class ImageCaptionRequest(BaseModel):
+     prompt: str = Field(..., description="The prompt for image captioning")
+     temperature: float = Field(..., description="The temperature for image captioning")
+     length_penalty: float = Field(..., description="The length penalty for image captioning")
+     repetition_penalty: float = Field(..., description="The repetition penalty for image captioning")
+     max_length: int = Field(..., description="The maximum length for image captioning")
+     min_length: int = Field(..., description="The minimum length for image captioning")
+     top_p: float = Field(..., description="The top-p for image captioning")
+
+
+ class TextToSpeechRequest(BaseModel):
+     text: str = Field(..., description="The text to convert to speech")
+     lang: str = Field(..., description="The language of the text")
+     tld: str = Field(..., description="The Google Translate top-level domain, which selects the accent")
+
+
+ class SpeechToTextRequest(BaseModel):
+     audio: str = Field(..., description="The audio to convert to text")
+     lang: str = Field(..., description="The language of the audio")
src/pipeline/__init__.py ADDED
@@ -0,0 +1,4 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-13
+ """
src/pipeline/image_processing_pipeline.py ADDED
@@ -0,0 +1,37 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-13
+ """
+ import torch
+ from PIL import Image
+ from transformers import BitsAndBytesConfig
+
+ from src.services.image_caption.caption import ImageCaption
+ from src.services.image_generation.image_generate import ImageGenerator
+
+
+ class ImageProcessingPipeline:
+     def __init__(self):
+         quantization_config = BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_compute_dtype=torch.float16
+         )
+         self.image_caption = ImageCaption(model_id="llava-hf/llava-1.5-7b-hf", quantization_config=quantization_config)
+         self.image_generator = ImageGenerator()
+
+     def generate_image(self, prompt, negative_prompt, style, use_negative_prompt, num_inference_steps,
+                        num_images_per_prompt, seed, width, height, guidance_scale, randomize_seed) -> Image.Image:
+         # The generator returns (images, seed); the HTTP layer serves one PNG, so return the first image.
+         images, _ = self.image_generator.generate_image(prompt=prompt, negative_prompt=negative_prompt, style=style,
+                                                         use_negative_prompt=use_negative_prompt,
+                                                         num_inference_steps=num_inference_steps,
+                                                         num_images_per_prompt=num_images_per_prompt, seed=seed,
+                                                         width=width, height=height, guidance_scale=guidance_scale,
+                                                         randomize_seed=randomize_seed)
+         return images[0]
+
+     def generate_caption(self, image, prompt, temperature, length_penalty, repetition_penalty, max_length, min_length,
+                          top_p):
+         caption = self.image_caption.generate([], prompt, image, temperature, length_penalty, repetition_penalty,
+                                               max_length, min_length, top_p)
+         return caption
src/pipeline/language_translation_pipeline.py ADDED
@@ -0,0 +1,15 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-13
+ """
+ from src.services.language_translation.translation import LanguageTranslation
+
+
+ class LanguageTranslationPipeline:
+     def __init__(self):
+         self.language_translation = LanguageTranslation()
+
+     def translate_text(self, text: str, translator_backend_code: str, target: str) -> str:
+         translated_text = self.language_translation.translate_text(text=text, target=target,
+                                                                    translator=translator_backend_code)
+         return translated_text
src/pipeline/speech_processing_pipeline.py ADDED
@@ -0,0 +1,22 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-13
+ """
+ from typing import List
+
+ from src.services.speech.speech_to_text import SpeechToText
+ from src.services.speech.text_to_speech import TextToSpeech
+
+
+ class SpeechTranscriptionPipeline:
+     def __init__(self):
+         self.speech_to_text_ = SpeechToText()
+         self.text_to_speech_ = TextToSpeech()
+
+     def text_to_speech(self, text: str, lang: str, tld: str) -> List[str]:
+         # Materialize the per-sentence base64 MP3 segments so the API can JSON-serialize them.
+         return list(self.text_to_speech_.sentence_audio_generator(text, lang, tld))
+
+     def speech_to_text(self, audio, lang: str) -> str:
+         transcript_with_timestamp, transcript = self.speech_to_text_.transcribe_audio(audio=audio, language=lang)
+         return transcript
src/services/__init__.py ADDED
@@ -0,0 +1,4 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-13
+ """
src/services/image_caption/__init__.py ADDED
@@ -0,0 +1,4 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-13
+ """
src/services/image_caption/caption.py ADDED
@@ -0,0 +1,46 @@
+ import re
+ import copy
+ from transformers import pipeline
+
+
+ class ImageCaption:
+     def __init__(self, model_id, quantization_config):
+         self.pipe = pipeline("image-to-text", model=model_id, model_kwargs={"quantization_config": quantization_config})
+
+     def infer(self, image, prompt, temperature, length_penalty, repetition_penalty, max_length, min_length, top_p):
+         outputs = self.pipe(images=image, prompt=prompt,
+                             generate_kwargs={
+                                 "temperature": temperature,
+                                 "length_penalty": length_penalty,
+                                 "repetition_penalty": repetition_penalty,
+                                 "max_length": max_length,
+                                 "min_length": min_length,
+                                 "top_p": top_p})
+         return outputs[0]["generated_text"]
+
+     def extract_response_pairs(self, text):
+         # Split on the role markers and pair consecutive USER/ASSISTANT turns.
+         turns = re.split(r'(USER:|ASSISTANT:)', text)[1:]
+         turns = [turn.strip() for turn in turns if turn.strip()]
+         conv_list = []
+         for i in range(0, len(turns[1::2]), 2):
+             if i + 1 < len(turns[1::2]):
+                 conv_list.append([turns[1::2][i].lstrip(":"), turns[1::2][i + 1].lstrip(":")])
+         return conv_list
+
+     def add_text(self, history, text):
+         history.append([text, None])
+         return history, text
+
+     def generate(self, history_chat, text_input, image, temperature, length_penalty, repetition_penalty, max_length,
+                  min_length, top_p):
+         # Build a LLaVA-style conversation prompt from the running history plus the new user turn.
+         chat_history = " ".join(history_chat)
+         chat_history += f"USER: <image>\n{text_input}\nASSISTANT:"
+
+         inference_result = self.infer(image, chat_history, temperature, length_penalty, repetition_penalty, max_length,
+                                       min_length, top_p)
+         chat_val = self.extract_response_pairs(inference_result)
+
+         chat_state_list = copy.deepcopy(chat_val)
+         return chat_state_list
src/services/image_generation/__init__.py ADDED
@@ -0,0 +1,4 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-13
+ """
src/services/image_generation/image_generate.py ADDED
@@ -0,0 +1,70 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-13
+ """
+ import random
+ import numpy as np
+ import torch
+ from typing import Tuple, List
+ from diffusers import StableDiffusionXLPipeline
+ from PIL import Image
+ from src.utils.imutils import yaml_read
+
+
+ class ImageGenerator:
+     def __init__(self, model_name: str = "RunDiffusion/Juggernaut-X-v10", device: str = "cuda"):
+         self.pipe = StableDiffusionXLPipeline.from_pretrained(
+             model_name,
+             torch_dtype=torch.float16,
+         )
+         self.pipe.to(device)
+         self.MAX_SEED = np.iinfo(np.int32).max
+         self.styles = self._initialize_styles()
+         self.DEFAULT_STYLE_NAME = "(No style)"
+
+     def _initialize_styles(self):
+         style_list = yaml_read("params.yaml")['style_list']
+
+         return {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list}
+
+     def randomize_seed_fn(self, seed: int, randomize_seed: bool) -> int:
+         if randomize_seed:
+             seed = random.randint(0, self.MAX_SEED)
+         return seed
+
+     def apply_style(self, style_name: str, positive: str, negative: str = "") -> Tuple[str, str]:
+         p, n = self.styles.get(style_name, self.styles[self.DEFAULT_STYLE_NAME])
+         if not negative:
+             negative = ""
+         return p.replace("{prompt}", positive), n + negative
+
+     def generate_image(self, prompt: str,
+                        negative_prompt: str = "",
+                        style: str = None,
+                        use_negative_prompt: bool = False,
+                        num_inference_steps: int = 30,
+                        num_images_per_prompt: int = 1,
+                        seed: int = 0,
+                        width: int = 1024,
+                        height: int = 1024,
+                        guidance_scale: float = 3,
+                        randomize_seed: bool = False,
+                        ) -> Tuple[List[Image.Image], int]:
+         if style is None:
+             style = self.DEFAULT_STYLE_NAME
+         seed = self.randomize_seed_fn(seed, randomize_seed)
+         if not use_negative_prompt:
+             negative_prompt = ""
+         prompt, negative_prompt = self.apply_style(style, prompt, negative_prompt)
+
+         images = self.pipe(prompt=prompt,
+                            negative_prompt=negative_prompt,
+                            width=width,
+                            height=height,
+                            guidance_scale=guidance_scale,
+                            num_inference_steps=num_inference_steps,
+                            num_images_per_prompt=num_images_per_prompt,
+                            cross_attention_kwargs={"scale": 0.65},
+                            output_type="pil").images
+
+         return images, seed
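
A minimal usage sketch for the generator (assumes a CUDA device with enough memory and params.yaml in the working directory; the prompt and output file name are illustrative):

    from src.services.image_generation.image_generate import ImageGenerator

    generator = ImageGenerator()  # downloads RunDiffusion/Juggernaut-X-v10 on first use
    images, seed = generator.generate_image(
        prompt="a terraced hillside at dawn",
        style="Cinematic",
        randomize_seed=True,
    )
    images[0].save("generated.png")
    print(f"Seed used: {seed}")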
src/services/language_translation/__init__.py ADDED
@@ -0,0 +1,4 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-13
+ """
src/services/language_translation/translation.py ADDED
@@ -0,0 +1,51 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-13
+ """
+ from deep_translator import GoogleTranslator, BaiduTranslator, MicrosoftTranslator, YandexTranslator, \
+     MyMemoryTranslator, PonsTranslator, LingueeTranslator
+
+
+ class LanguageTranslation:
+     def __init__(self):
+         pass
+
+     def translate_text_google(self, text, target):
+         translated = GoogleTranslator(source='auto', target=target).translate(text)
+         return translated
+
+     def translate_text_baidu(self, text, target):
+         translator = BaiduTranslator(source='auto', target=target).translate(text)
+         return translator
+
+     def translate_text_microsoft(self, text, target):
+         translator = MicrosoftTranslator(source='auto', target=target).translate(text)
+         return translator
+
+     def translate_text_yandex(self, text, target):
+         translator = YandexTranslator(source='auto', target=target).translate(text)
+         return translator
+
+     def translate_text_my_memory(self, text, target):
+         translator = MyMemoryTranslator(source='auto', target=target).translate(text)
+         return translator
+
+     def translate_text_pons(self, text, target):
+         translator = PonsTranslator(source='auto', target=target).translate(text)
+         return translator
+
+     def translate_text_linguee(self, text, target):
+         translator = LingueeTranslator(source='auto', target=target).translate(text)
+         return translator
+
+     def translate_text(self, text, target, translator):
+         if translator == "TT01":
+             return self.translate_text_google(text, target)
+         elif translator == "TT02":
+             return self.translate_text_my_memory(text, target)
+         elif translator == "TT03":
+             return self.translate_text_pons(text, target)
+         elif translator == "TT04":
+             return self.translate_text_linguee(text, target)
+         else:
+             return "Invalid translator"
src/services/speech/__init__.py ADDED
@@ -0,0 +1,4 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-13
+ """
src/services/speech/speech_to_text.py ADDED
@@ -0,0 +1,41 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-07-31
+ """
+ import torch
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+
+
+ class SpeechToText:
+     def __init__(self):
+         self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+         self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+         model_id = "openai/whisper-large-v3"
+
+         self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
+             model_id, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+         ).to(self.device)
+         self.processor = AutoProcessor.from_pretrained(model_id)
+         self.speech_to_text_pipeline = self.pipeline()
+
+     def pipeline(self, max_new_tokens=128, chunk_length_s=30, batch_size=16):
+         pipe = pipeline(
+             "automatic-speech-recognition",
+             model=self.model,
+             tokenizer=self.processor.tokenizer,
+             feature_extractor=self.processor.feature_extractor,
+             max_new_tokens=max_new_tokens,  # max number of tokens to generate at a time
+             chunk_length_s=chunk_length_s,  # length of audio chunks to process at a time
+             batch_size=batch_size,  # number of chunks to process at a time
+             return_timestamps=True,
+             torch_dtype=self.torch_dtype,
+             device=self.device,
+         )
+         return pipe
+
+     def transcribe_audio(self, audio, language: str = "en"):
+         # task="transcribe" keeps the output in the source language; "translate" would render it in English.
+         result = self.speech_to_text_pipeline(audio, return_timestamps=True,
+                                               generate_kwargs={"language": language, "task": "transcribe"})
+         return result["chunks"], result["text"]
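
A minimal transcription sketch (the file path is hypothetical; the Hugging Face ASR pipeline also accepts raw audio arrays and URLs):

    from src.services.speech.speech_to_text import SpeechToText

    stt = SpeechToText()  # downloads openai/whisper-large-v3 on first use
    chunks, text = stt.transcribe_audio("sample.wav", language="en")
    print(text)       # full transcript
    print(chunks[0])  # first timestamped segment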
src/services/speech/text_to_speech.py ADDED
@@ -0,0 +1,32 @@
+ import base64
+ import re
+ from io import BytesIO
+ from typing import List, Generator
+
+ from gtts import gTTS
+ from gtts.tokenizer import pre_processors
+
+
+ class TextToSpeech:
+     def __init__(self):
+         self.preprocessing = [pre_processors.tone_marks, pre_processors.end_of_line, pre_processors.word_sub,
+                               pre_processors.abbreviations]
+
+     def _convert_sentence(self, text: str, lang: str, tld: str) -> bytes:
+         tts = gTTS(text=text, lang=lang, slow=False, tld=tld, pre_processor_funcs=self.preprocessing)
+         mp3_fp = BytesIO()
+         tts.write_to_fp(mp3_fp)
+         mp3_fp.seek(0)
+         return mp3_fp.getvalue()
+
+     def _split_corpus(self, corpus: str) -> List[str]:
+         # Split on sentence-ending punctuation followed by whitespace.
+         sentences = re.split(r'(?<=[.!?]) +', corpus)
+         return sentences
+
+     def sentence_audio_generator(self, paragraph: str, lang: str, tld: str) -> Generator[str, None, None]:
+         sentences = self._split_corpus(paragraph)
+         for sentence in sentences:
+             mp3 = self._convert_sentence(sentence, lang, tld)
+             base64_audio = base64.b64encode(mp3).decode("utf-8")
+             yield base64_audio
src/utils/__init__.py ADDED
@@ -0,0 +1,4 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-13
+ """
src/utils/imutils.py ADDED
@@ -0,0 +1,14 @@
+ """
+ Created By: ishwor subedi
+ Date: 2024-08-13
+ """
+ import yaml
+
+
+ def yaml_read(file_path):
+     with open(file_path, 'r') as stream:
+         try:
+             return yaml.safe_load(stream)
+         except yaml.YAMLError as exc:
+             print(exc)
+             return None