tthhanh committed on
Commit
8a42f02
·
1 Parent(s): c7998ab

feat: add scripts to translate into vietnamese

Browse files
Files changed (2) hide show
  1. scripts/translation.py +85 -0
  2. scripts/vi.py +125 -0
scripts/translation.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from huggingface_hub import InferenceClient
4
+ from dotenv import load_dotenv
5
+ load_dotenv()
6
+
7
+
8
# Fail fast when the Hugging Face token is absent or empty.
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN not found in environment variables. Please set it in a .env file.")


# Defaults used by auto_translate(): the input directory is resolved relative
# to the directory that contains this script.
script_dir = os.path.dirname(os.path.abspath(__file__))
default_inp_dir = os.path.join(script_dir, '..', 'units/en')
default_model = "deepseek-ai/DeepSeek-R1"
# No api_key is passed here: it is read from the environment.
default_client = InferenceClient(provider="together")
21
+
22
def _to_output_path(path: str, output_lang: str, source_lang: str = 'en') -> str:
    """Return ``path`` with each ``source_lang`` path component replaced by ``output_lang``.

    Whole components are compared, so names that merely begin with the source
    language code (``env``, ``end.mdx``) are left untouched.
    """
    # Treat the alternate separator (``/`` on Windows) like the native one.
    normalized = path.replace(os.altsep, os.sep) if os.altsep else path
    return os.sep.join(
        output_lang if part == source_lang else part
        for part in normalized.split(os.sep)
    )


def auto_translate(
    output_lang: str,
    prompt: callable,
    inp_dir: str = default_inp_dir,
    model: str = default_model,
    client: InferenceClient = default_client
):
    """Translate every ``.mdx`` file and ``_toctree.yml`` found under ``inp_dir``.

    Each source file is sent to the chat model through ``client`` and the
    streamed reply is written to the matching path in which the ``en``
    directory component is replaced by ``output_lang``. Files whose output
    already exists are skipped, so an interrupted run can be resumed.

    Args:
        output_lang: Directory name of the target language (e.g. ``"vi"``).
        prompt: Callable that receives the file content and returns the user
            message sent to the model.
        inp_dir: Root directory that is walked recursively for source files.
        model: Model identifier passed to the chat-completion call.
        client: Inference client used to request the streamed completion.
    """
    # Literal <think> tags in the source text would be confused with the
    # reasoning delimiters stripped from the reply below, so they are masked
    # before prompting and restored afterwards.
    def escape_special_tokens(text: str) -> str:
        return text.replace('<think>', '<%%think%%>').replace('</think>', '<%%/think%%>')

    def unescape_special_tokens(text: str) -> str:
        return text.replace('<%%think%%>', '<think>').replace('<%%/think%%>', '</think>')

    def write_out_file(fpath: str, content: str):
        base_path = os.path.dirname(fpath)
        if base_path:  # os.makedirs('') raises for a bare file name
            os.makedirs(base_path, exist_ok=True)
        with open(fpath, 'w', encoding='utf-8') as f:
            f.write(content)

    # Get the list of all files in the directory, recursively
    inp_files: list[str] = []
    print('Collecting files...')
    for root, _dirs, files in os.walk(inp_dir):
        for file in files:
            if file.endswith('.mdx') or file == "_toctree.yml":
                fname = os.path.join(root, file)
                print(' +', fname)
                inp_files.append(fname)

    # Read the content of each file and process it
    total = len(inp_files)
    for i, inp_file in enumerate(inp_files, start=1):
        out_file = _to_output_path(inp_file, output_lang)
        if os.path.exists(out_file):
            print(f'[{i}/{total}] Skipping file: {inp_file}')
            continue
        with open(inp_file, 'r', encoding='utf-8') as f:
            content: str = escape_special_tokens(f.read())
        if content.strip() == "":
            print(f'[{i}/{total}] Skipping empty file: {inp_file}')
            write_out_file(out_file, "")
            continue

        print(f'[{i}/{total}] Processing file: {inp_file}')
        stream = client.chat.completions.create(
            model=model,
            temperature=0.0,
            messages=[
                {"role": "user", "content": prompt(content)},
            ],
            stream=True,
        )
        pieces: list[str] = []
        for chunk in stream:
            # Streamed chunks may carry no choices or a delta without text
            # (content is optional); skip them instead of concatenating None.
            if not chunk.choices:
                continue
            content_chunk = chunk.choices[0].delta.content
            if not content_chunk:
                continue
            print(content_chunk, end="", flush=True)
            pieces.append(content_chunk)
        final_text = "".join(pieces)
        # Keep only the text after the last </think>, dropping the reasoning process
        final_text = final_text.split('</think>').pop().strip()
        # Write the output to the file
        final_text = unescape_special_tokens(final_text)
        write_out_file(out_file, final_text)
        print()
        print(f' -> Translated to: {out_file}')
        print("--" * 20)
scripts/vi.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from translation import auto_translate
2
+
3
output_lang = "vi"


def prompt(content: str) -> str:
    """Build the message asking the model to translate ``content`` into Vietnamese.

    The template is an f-string, so literal braces in its code examples are
    doubled; ``content`` is interpolated verbatim between the BEGIN/END
    markers. Surrounding whitespace of the final message is stripped.
    """
    return f'''
You are a translator for the Vietnamese translation team. You are tasked with translating the following texts into Vietnamese. You must follow these instructions:
- Translate the texts into Vietnamese, while keeping the original formatting (either Markdown, MDX or HTML)
- Inside code blocks, translate the comments but leave the code as-is; If the code block contains quite plain texts, you MUST provide the translation in <details> tag
- Do not translate inline code, the URLs and file paths
- If the term is abbreviated, keep the original term and provide the translation in parentheses for the first time it appears in the text
- If there are any slang or funny jokes in English, keep them (do not translate) and give an explanation so Vietnamese readers can understand
- Use "ta", "mình", "chúng ta", "chúng mình", "các bạn" as pronouns

KEEP THESE TERMS (DO NOT TRANSLATE, do NOT add translation in parentheses): MCP, API, SDK, CLI, HTML, GGUF, AI, Client, Server, Hugging Face, Space, CodeAgent, LangGraph, LangChain, Llama, Gemma, inference, notebook, python, transformers, token, pretrain, format, certificate.

For these terms, use the pre-defined translation:
- Quick Quiz: Kiểm tra nhanh
- Unit: Chương
- Bonus Unit: Chương bổ trợ
- Module: Mô-đun
- Lesson ...: Bài ...
- Model: Mô hình
- Dataset: Tập dữ liệu
- Course: Khóa học
- state-of-the-art: nổi tiếng
- Q&A: Hỏi và Đáp
- Dummy: ảo (or "giả", or "thử" depending on the context)
- onboarding: làm quen
- Hands-on: Thực hành
- Challenge: Bài tập lớn
- Training: Huấn luyện
- Model Context Protocol: Giao Thức Ngữ Cảnh Mô Hình

Here is an example:
- Original text: [Agents Course](https://huggingface.co/learn/agents-course/) will guide you through building AI agents with LLMs.
- Translation: [Agents Course](https://huggingface.co/learn/agents-course/) sẽ hướng dẫn các bạn cách xây dựng AI Agents với LLMs.

Here is another example:
- Original text: JSON-RPC defines the message format, but MCP also specifies how these messages are transported between Clients and Servers.
- Translation: JSON-RPC định nghĩa format tin nhắn, nhưng MCP cũng chỉ định cách thức các tin nhắn này được truyền tải giữa các Client và Server.

If the code block contains many plain texts, provide the translation in a collapsible <details> tag. Example:
- Original text:
```python
def get_weather(location: str) -> dict:
    """Get the current weather for a specified location."""
    # Connect to weather API and fetch data
    return {{
        "temperature": 72,
        "conditions": "Sunny",
        "humidity": 45
    }}
```
- Translation (add the <details> collapsible ABOVE the original code block):
<details>
<summary>Bấm để xem bản dịch tiếng Việt</summary>
```
def get_weather(location: str) -> dict:
    """Nhận thông tin thời tiết hiện tại ở một địa điểm cụ thể."""
    # Kết nối tới API thời tiết và lấy dữ liệu
    return {{
        "temperature": 72,
        "conditions": "Sunny",
        "humidity": 45
    }}
```
</details>
```
def get_weather(location: str) -> dict:
    """Get the current weather for a specified location."""
    # Connect to weather API and fetch data
    return {{
        "temperature": 72,
        "conditions": "Sunny",
        "humidity": 45
    }}
```

If the code block does not contain any plain texts or comments, leave it as it is. Example:
- Original text:
```json
{{
  "servers": [
    {{
      "name": "File Explorer",
      "transport": {{
        "type": "stdio",
        "command": "python",
        "args": ["/path/to/file_explorer_server.py"]
      }}
    }}
  ]
}}
```

- Translation:
```json
{{
  "servers": [
    {{
      "name": "File Explorer",
      "transport": {{
        "type": "stdio",
        "command": "python",
        "args": ["/path/to/file_explorer_server.py"]
      }}
    }}
  ]
}}
```

IMPORTANT: Only output the translated texts and nothing else, no explanation or instructions needed. The input text is between "=== BEGIN OF TEXT ===" and "=== END OF TEXT ===".

Please translate the following texts to Vietnamese:

=== BEGIN OF TEXT ===
{content}
=== END OF TEXT ===
'''.strip()
121
+
122
# Start the translation run; input directory, model and client are left at
# the defaults declared by auto_translate().
auto_translate(output_lang=output_lang, prompt=prompt)