Spaces:
Sleeping
Sleeping
Commit
·
db080f6
1
Parent(s):
3005b7a
init commit
Browse files- .gitignore +174 -0
- app.py +21 -0
- requirements.txt +3 -0
- utils/__init__.py +0 -0
- utils/caller/llm_client.py +81 -0
- utils/learner/__init__.py +0 -0
- utils/learner/dataclass.py +100 -0
- utils/learner/language.py +130 -0
- utils/learner/learner.py +45 -0
.gitignore
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py,cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# UV
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
#uv.lock
|
| 102 |
+
|
| 103 |
+
# poetry
|
| 104 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 105 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 106 |
+
# commonly ignored for libraries.
|
| 107 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 108 |
+
#poetry.lock
|
| 109 |
+
|
| 110 |
+
# pdm
|
| 111 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 112 |
+
#pdm.lock
|
| 113 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
| 114 |
+
# in version control.
|
| 115 |
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
| 116 |
+
.pdm.toml
|
| 117 |
+
.pdm-python
|
| 118 |
+
.pdm-build/
|
| 119 |
+
|
| 120 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 121 |
+
__pypackages__/
|
| 122 |
+
|
| 123 |
+
# Celery stuff
|
| 124 |
+
celerybeat-schedule
|
| 125 |
+
celerybeat.pid
|
| 126 |
+
|
| 127 |
+
# SageMath parsed files
|
| 128 |
+
*.sage.py
|
| 129 |
+
|
| 130 |
+
# Environments
|
| 131 |
+
.env
|
| 132 |
+
.venv
|
| 133 |
+
env/
|
| 134 |
+
venv/
|
| 135 |
+
ENV/
|
| 136 |
+
env.bak/
|
| 137 |
+
venv.bak/
|
| 138 |
+
|
| 139 |
+
# Spyder project settings
|
| 140 |
+
.spyderproject
|
| 141 |
+
.spyproject
|
| 142 |
+
|
| 143 |
+
# Rope project settings
|
| 144 |
+
.ropeproject
|
| 145 |
+
|
| 146 |
+
# mkdocs documentation
|
| 147 |
+
/site
|
| 148 |
+
|
| 149 |
+
# mypy
|
| 150 |
+
.mypy_cache/
|
| 151 |
+
.dmypy.json
|
| 152 |
+
dmypy.json
|
| 153 |
+
|
| 154 |
+
# Pyre type checker
|
| 155 |
+
.pyre/
|
| 156 |
+
|
| 157 |
+
# pytype static type analyzer
|
| 158 |
+
.pytype/
|
| 159 |
+
|
| 160 |
+
# Cython debug symbols
|
| 161 |
+
cython_debug/
|
| 162 |
+
|
| 163 |
+
# PyCharm
|
| 164 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 165 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 166 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 167 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 168 |
+
#.idea/
|
| 169 |
+
|
| 170 |
+
# Ruff stuff:
|
| 171 |
+
.ruff_cache/
|
| 172 |
+
|
| 173 |
+
# PyPI configuration file
|
| 174 |
+
.pypirc
|
app.py
CHANGED
|
@@ -1,7 +1,28 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
|
|
|
|
|
|
|
|
|
|
| 3 |
with gr.Blocks() as demo:
|
| 4 |
gr.Markdown("# Lang Thrower")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
if __name__ == "__main__":
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
|
| 3 |
+
async def chat_fn(message, history, state_history, state_audios):
    """Chat handler stub: echo the message and pass both states through.

    Args:
        message: The user's multimodal message for this turn.
        history: Chat history managed by gr.ChatInterface (unused for now).
        state_history: Session-level message state (OpenAI format).
        state_audios: Session-level list of generated audio records.

    Returns:
        A (reply, state_history, state_audios) tuple, all unchanged.
    """
    reply = message
    return reply, state_history, state_audios
|
| 5 |
+
|
| 6 |
with gr.Blocks() as demo:
    gr.Markdown("# Lang Thrower")
    # Session state threaded through chat_fn on every turn.
    state_history = gr.State([]) # The state for openai usage
    # Generated audio records for the session.
    state_audios = gr.State([
        # {"text":"...", "path":"..."}
    ])

    # Multimodal input: free text plus one or more image attachments.
    textbox = gr.MultimodalTextbox(
        file_types=["image"],
        file_count="multiple",
        placeholder="Please give text and image.",
    )

    chat_interface = gr.ChatInterface(
        fn=chat_fn,
        textbox=textbox,
        # The two State components are passed to chat_fn as extra inputs and
        # received back as extra outputs, so the handler can update them.
        additional_inputs=[state_history, state_audios],
        additional_outputs=[state_history, state_audios],
    )
    # NOTE(review): ChatInterface already places `textbox` in its own layout;
    # calling render() again here may duplicate the component — confirm this
    # placement is intentional.
    textbox.render()
|
| 26 |
|
| 27 |
|
| 28 |
if __name__ == "__main__":
|
requirements.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openai
|
| 2 |
+
python-magic
|
| 3 |
+
edge-tts
|
utils/__init__.py
ADDED
|
File without changes
|
utils/caller/llm_client.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
from openai import Client
|
| 3 |
+
from gradio_client.utils import is_http_url_like
|
| 4 |
+
import magic
|
| 5 |
+
from pydantic import BaseModel
|
| 6 |
+
from ..learner.learner import DefaultTool
|
| 7 |
+
|
| 8 |
+
def get_client(api_key: str | None = None, **kwargs):
    """Build an OpenAI ``Client``.

    Args:
        api_key: Explicit API key; ``None`` lets the SDK fall back to its
            own environment-based lookup.
        **kwargs: Forwarded verbatim to the ``Client`` constructor.

    Returns:
        A configured ``openai.Client`` instance.
    """
    return Client(api_key=api_key, **kwargs)
|
| 13 |
+
|
| 14 |
+
def encode_image(image_path: str):
    """Encode a local image file as a ``data:`` URI.

    The MIME type is sniffed from the file content via libmagic, so the
    result does not depend on the file extension.
    """
    with open(image_path, "rb") as fh:
        payload = fh.read()
    encoded = base64.b64encode(payload).decode("utf-8")
    mime_type = magic.Magic(mime=True).from_file(image_path)
    return f"data:{mime_type};base64,{encoded}"
|
| 20 |
+
|
| 21 |
+
def image_to_content(
    image_path: str,
    detail: str = "auto",
):
    """Wrap an image path or URL into an OpenAI ``image_url`` content part.

    Remote (http-like) URLs are passed through unchanged; local paths are
    inlined as base64 data URIs via ``encode_image``.

    Args:
        image_path: Local file path or remote image URL.
        detail: OpenAI image detail level ("auto", "low", or "high").
    """
    if is_http_url_like(image_path):
        url = image_path
    else:
        url = encode_image(image_path)

    return {
        "type": "image_url",
        "image_url": {
            "url": url,
            "detail": detail,
        },
    }
|
| 38 |
+
|
| 39 |
+
def audio_to_content(
    data: str,
    format: str,
):
    """Wrap base64 audio data into an OpenAI ``input_audio`` content part.

    Args:
        data: Base64-encoded audio payload.
        format: Audio container format, e.g. ``"wav"`` or ``"mp3"``.
    """
    payload = {"data": data, "format": format}
    return {"type": "input_audio", "input_audio": payload}
|
| 50 |
+
|
| 51 |
+
async def chat_completions(
    messages: list,
    model: str,
    *,
    client: Client | None = None,
    tool_models: list[type[BaseModel]] | None = None,
    **kwargs,
):
    """Call the OpenAI chat-completions endpoint with pydantic-derived tools.

    Args:
        messages: Conversation in the OpenAI message format.
        model: Model name to call.
        client: Optional pre-built ``Client``; a default one is created when
            omitted.
        tool_models: Pydantic model classes exposed as function tools.
            Defaults to ``[DefaultTool]``.
        **kwargs: Extra options forwarded to ``chat.completions.create``.
            An explicit ``tools`` entry takes precedence over ``tool_models``.

    Returns:
        The raw chat-completion response object.
    """
    # Fix: the original used a mutable default argument ([DefaultTool]);
    # None is the sentinel for "use the default toolset".
    if tool_models is None:
        tool_models = [DefaultTool]

    tools = kwargs.pop("tools", None)
    if tools is None:
        tools = [
            {
                "type": "function",
                "function": {
                    # Fix: the OpenAI API requires a function "name"; derive
                    # it (and a description) from the pydantic model itself.
                    "name": tool_model.__name__,
                    "description": (tool_model.__doc__ or "").strip(),
                    "parameters": tool_model.model_json_schema(),
                    "strict": True,
                },
            }
            for tool_model in tool_models
        ]

    client = client or get_client()
    # NOTE(review): ``Client`` is the synchronous OpenAI client, so this
    # call blocks the event loop despite the async signature — consider
    # ``AsyncClient`` and awaiting the call; confirm intent.
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        tools=tools,
        **kwargs,
    )

    return response
|
utils/learner/__init__.py
ADDED
|
File without changes
|
utils/learner/dataclass.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing_extensions import Self
|
| 2 |
+
from typing import Any, Literal, Optional
|
| 3 |
+
from pydantic import BaseModel, Field, model_validator, ValidationError
|
| 4 |
+
from .language import (
|
| 5 |
+
LANGUAGE_CODES,
|
| 6 |
+
T_LANGUAGE_CODES,
|
| 7 |
+
LANGUAGES,
|
| 8 |
+
T_LANGUAGES,
|
| 9 |
+
CODE_TO_LANGUAGE,
|
| 10 |
+
LANGUAGE_TO_CODE,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class _Record(BaseModel):
    """Base data model for a single language-learner record."""
    lang: T_LANGUAGE_CODES | T_LANGUAGES | str = Field(..., description="The language name.")
    data: str = Field(..., description="The data for the record, like `apple` is vocabulary, `How are you.` is a phrase. `I like your product! How much is this` is a sentence.")
    type: None = Field(None, description="The field needs to be defined in the sub data model.")
    meta: dict | Any = Field(None, description="The field to be implement or overwrite, please do not fill this yet.")
    IPA: Optional[str] = Field(None, description="International Phonetic Alphabet")

    @model_validator(mode='after')
    def _validator_lang(self) -> Self:
        """Normalize ``lang`` to a lowercase full language name.

        ISO-style codes (e.g. ``"en"``) are expanded to their language name;
        values that are neither a known code nor a known language are kept
        as-is but flagged in ``meta["warning.lang"]``.
        """
        lang = self.lang.lower()
        if lang in LANGUAGE_CODES:
            lang = CODE_TO_LANGUAGE[lang]
        elif lang not in LANGUAGES:
            # Fix: the original warned for every non-code value, so a valid
            # full language name such as "english" was incorrectly flagged.
            if self.meta is None:
                self.meta = {}
            self.meta["warning.lang"] = f"The language is not in the language list {LANGUAGES}."

        self.lang = lang
        return self
|
| 35 |
+
|
| 36 |
+
class Vocabulary(_Record):
    """
    This is for word level record.
    Please fill the `data` field with vocabulary level input respect to the language.
    e.g.
    Chinese: "貓", "車", "醫生", "學校", "咖啡", "書"
    English: "Cat", "Car", "Doctor", "School", "Coffee", "Book"
    Japanese: "猫(ねこ)", "車(くるま)", "医者(いしゃ)", "学校(がっこう)", "コーヒー", "本(ほん)"
    Korean: "고양이", "차", "의사", "학교", "커피", "책"
    Italian: "Gatto", "Auto", "Dottore", "Scuola", "Caffè", "Libro"
    """
    # Discriminator tag identifying a word-level record.
    type: Literal['WORD'] = "WORD"
|
| 48 |
+
|
| 49 |
+
class Phrase(_Record):
    """
    This is for phrase level record.
    Please fill the `data` field with phrase level input respect to the language.
    e.g.
    Chinese: "你好", "謝謝你", "我愛你", "怎麼了?", "好久不見", "多少錢?"
    English: "Hello", "Thank you", "I love you", "What's wrong?", "Long time no see", "How much is it?"
    Japanese: "こんにちは", "ありがとう", "愛してる", "どうしたの?", "久しぶり(ひさしぶり)", "いくらですか?"
    Korean: "안녕하세요", "감사합니다", "사랑해요", "왜 그래요?", "오랜만이에요", "얼마예요?"
    Italian: "Ciao", "Grazie", "Ti amo", "Che succede?", "È da tanto tempo!", "Quanto costa?"
    """
    # Fix: the original declared Literal['WORD'] = "WORD" (copy-paste from
    # Vocabulary), which made Phrase records indistinguishable from
    # Vocabulary records by their `type` tag; Sentence already follows the
    # per-class pattern with 'SENTENCE'.
    type: Literal['PHRASE'] = "PHRASE"
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class Sentence(_Record):
    """
    This is for sentence level record.
    Please fill the `data` field with sentence level input respect to the language.
    e.g.
    Chinese: "這是一隻可愛的貓。", "我想喝一杯咖啡。", "你住在哪裡?", "今天的天氣很好。", "你能幫助我嗎?", "我正在學習日語和韓語。"
    English: "This is a cute cat.", "I want to drink a cup of coffee.", "Where do you live?", "The weather is nice today.", "Can you help me?", "I am learning Japanese and Korean."
    Japanese: "これはかわいい猫です。", "コーヒーを一杯飲みたいです。", "どこに住んでいますか?", "今日は天気がいいです。", "手伝ってくれますか?", "日本語と韓国語を勉強しています。"
    Korean: "이건 귀여운 고양이예요.", "커피 한 잔 마시고 싶어요.", "어디에 살아요?", "오늘 날씨가 좋아요.", "저를 도와줄 수 있어요?", "일본어와 한국어를 공부하고 있어요."
    Italian: "Questo è un gatto carino.", "Voglio bere una tazza di caffè.", "Dove vivi?", "Oggi il tempo è bello.", "Puoi aiutarmi?", "Sto imparando il giapponese e il coreano."
    """
    # Discriminator tag identifying a sentence-level record.
    type: Literal['SENTENCE'] = "SENTENCE"
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
class ReadableReference(BaseModel):
    """
    This is a reference for the foreign language.
    Try to let the user understand the foreign language more easily.
    Please use the user's native language to do this.
    """
    # Display name of the referenced item, in the user's native language.
    name: str
    # One-line gloss of the foreign item.
    short_explain: str
    description: str = Field(
        ...,
        description="Try to describe the foreign language more comprehensively."
    )
|
| 89 |
+
|
| 90 |
+
class __R(BaseModel):
    # Shared base pairing a native-language reference with a foreign record.
    reference: ReadableReference

class R_Vocabulary(__R):
    # Word-level foreign record plus its native-language reference.
    foreign: Vocabulary

class R_Phrase(__R):
    # Phrase-level foreign record plus its native-language reference.
    foreign: Phrase

class R_Sentence(__R):
    # Sentence-level foreign record plus its native-language reference.
    foreign: Sentence
|
utils/learner/language.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Literal

# We follow the openai language codes: ISO-639-1-style code -> lowercase name.
DICT = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "he": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
    "yue": "cantonese",
}

# Fix: these lists were hand-maintained duplicates of DICT and could drift.
# Deriving them keeps a single source of truth; dict insertion order matches
# the order of the original literals, so the values are unchanged.
LANGUAGE_CODES = list(DICT)
LANGUAGES = list(DICT.values())

# The Literal aliases must stay spelled out for static type checkers.
T_LANGUAGE_CODES = Literal['en', 'zh', 'de', 'es', 'ru', 'ko', 'fr', 'ja', 'pt', 'tr', 'pl', 'ca', 'nl', 'ar', 'sv', 'it', 'id', 'hi', 'fi', 'vi', 'he', 'uk', 'el', 'ms', 'cs', 'ro', 'da', 'hu', 'ta', 'no', 'th', 'ur', 'hr', 'bg', 'lt', 'la', 'mi', 'ml', 'cy', 'sk', 'te', 'fa', 'lv', 'bn', 'sr', 'az', 'sl', 'kn', 'et', 'mk', 'br', 'eu', 'is', 'hy', 'ne', 'mn', 'bs', 'kk', 'sq', 'sw', 'gl', 'mr', 'pa', 'si', 'km', 'sn', 'yo', 'so', 'af', 'oc', 'ka', 'be', 'tg', 'sd', 'gu', 'am', 'yi', 'lo', 'uz', 'fo', 'ht', 'ps', 'tk', 'nn', 'mt', 'sa', 'lb', 'my', 'bo', 'tl', 'mg', 'as', 'tt', 'haw', 'ln', 'ha', 'ba', 'jw', 'su', 'yue']
T_LANGUAGES = Literal['english', 'chinese', 'german', 'spanish', 'russian', 'korean', 'french', 'japanese', 'portuguese', 'turkish', 'polish', 'catalan', 'dutch', 'arabic', 'swedish', 'italian', 'indonesian', 'hindi', 'finnish', 'vietnamese', 'hebrew', 'ukrainian', 'greek', 'malay', 'czech', 'romanian', 'danish', 'hungarian', 'tamil', 'norwegian', 'thai', 'urdu', 'croatian', 'bulgarian', 'lithuanian', 'latin', 'maori', 'malayalam', 'welsh', 'slovak', 'telugu', 'persian', 'latvian', 'bengali', 'serbian', 'azerbaijani', 'slovenian', 'kannada', 'estonian', 'macedonian', 'breton', 'basque', 'icelandic', 'armenian', 'nepali', 'mongolian', 'bosnian', 'kazakh', 'albanian', 'swahili', 'galician', 'marathi', 'punjabi', 'sinhala', 'khmer', 'shona', 'yoruba', 'somali', 'afrikaans', 'occitan', 'georgian', 'belarusian', 'tajik', 'sindhi', 'gujarati', 'amharic', 'yiddish', 'lao', 'uzbek', 'faroese', 'haitian creole', 'pashto', 'turkmen', 'nynorsk', 'maltese', 'sanskrit', 'luxembourgish', 'myanmar', 'tibetan', 'tagalog', 'malagasy', 'assamese', 'tatar', 'hawaiian', 'lingala', 'hausa', 'bashkir', 'javanese', 'sundanese', 'cantonese']

# Bidirectional lookup tables, also derived from DICT.
CODE_TO_LANGUAGE = dict(DICT)
LANGUAGE_TO_CODE = {language: code for code, language in DICT.items()}

__all__ = [
    "LANGUAGE_CODES",
    "T_LANGUAGE_CODES",
    "LANGUAGES",
    "T_LANGUAGES",
    "CODE_TO_LANGUAGE",
    "LANGUAGE_TO_CODE",
]
|
utils/learner/learner.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .dataclass import *
|
| 2 |
+
|
| 3 |
+
class DefaultTool(BaseModel):
    """
    This is the data model for the `ThrowLingo`.
    It is to help the user to learn the foreign language.

    The suggested max lengths are:
    `vocabulary` is less than 15
    `phrase` is less than 8
    `sentence` is less than 5
    """
    # Word-level items, each paired with a native-language reference.
    vocabulary: list[R_Vocabulary]
    # Phrase-level items, each paired with a native-language reference.
    phrase: list[R_Phrase]
    # Sentence-level items, each paired with a native-language reference.
    sentence: list[R_Sentence]
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def get_default_system_prompt():
    """Return the default system prompt: author credits plus the assistant's
    role description for the language-learning use case."""
    # The returned text is sent to the model verbatim; do not edit casually.
    return """
# Assistant Author:
* 湯沂達 / Tang Yi Dar
  - changethewhat@gmail.com
  - https://github.com/mistake0316
  - https://www.linkedin.com/in/yi-dar-tang-89866717a/
  - https://medium.com/@changethewhat


# System Prompt
You are an assistant for doing the language learning.
The reason for the author to create this assistant is that he is a guy love to visit different place in different country, but struggle with his poor language skill and bad memorization ability.

Some struggle scenarios are that:
* He is good at math, but do not know how to describe that in Japanese.
* He is injured, but do not know what how to talk to the doctor in different language, he needs to prepare some words to describe his status.
* He want to learn gymnastic but do not know what kind of object and the name of motion in both his native language and the foreign language.

To fill the gap, he decide to create a instant language learner, which is able to generate the target language text and audio together.

Most of the time, the input will be photos and texts.
"""
|
| 41 |
+
|
| 42 |
+
__all__ = [
|
| 43 |
+
"DefaultTool",
|
| 44 |
+
"get_default_system_prompt"
|
| 45 |
+
]
|