Spaces:
Build error
Build error
Merge pull request #2 from Jwaminju/update-translator
Browse files- README.md +1 -1
- agent/handler.py +81 -17
- agent/workflow.py +68 -25
- app.py +54 -11
- pr_generator/agent.py +1 -1
- translation_result/docs/source/en/accelerator_selection.md +13 -13
- translator/content.py +95 -27
- translator/retriever.py +54 -0
README.md
CHANGED
|
@@ -54,7 +54,7 @@ This project was specifically created to solve [Hugging Face Transformers Issue
|
|
| 54 |
|
| 55 |
## 🎥 Demo Video
|
| 56 |
|
| 57 |
-
[
|
| 58 |
|
| 59 |
*Watch the complete walkthrough: from setup to PR creation in under 5 minutes*
|
| 60 |
|
|
|
|
| 54 |
|
| 55 |
## 🎥 Demo Video
|
| 56 |
|
| 57 |
+
[Hugging Face i18n Agent Demo](https://youtu.be/J2MBMNk7la8?si=7867ztaU2nPN0UEo)
|
| 58 |
|
| 59 |
*Watch the complete walkthrough: from setup to PR creation in under 5 minutes*
|
| 60 |
|
agent/handler.py
CHANGED
|
@@ -8,10 +8,12 @@ import gradio as gr
|
|
| 8 |
|
| 9 |
from agent.workflow import (
|
| 10 |
report_translation_target_files,
|
|
|
|
| 11 |
translate_docs_interactive,
|
| 12 |
generate_github_pr,
|
| 13 |
)
|
| 14 |
from pr_generator.searcher import find_reference_pr_simple_stream
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
# State management
|
|
@@ -21,6 +23,7 @@ class ChatState:
|
|
| 21 |
self.target_language = "ko"
|
| 22 |
self.k_files = 10
|
| 23 |
self.files_to_translate = []
|
|
|
|
| 24 |
self.current_file_content = {"translated": ""}
|
| 25 |
self.pr_result = None # Store PR creation result
|
| 26 |
# GitHub configuration
|
|
@@ -70,22 +73,29 @@ def process_file_search_handler(lang: str, k: int, history: list) -> tuple:
|
|
| 70 |
state.step = "find_files"
|
| 71 |
|
| 72 |
status_report, files_list = report_translation_target_files(lang, k)
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
response = f"""**✅ File search completed!**
|
| 76 |
|
| 77 |
**Status Report:**
|
| 78 |
{status_report}
|
| 79 |
-
|
| 80 |
**📁 Found first {len(state.files_to_translate)} files to translate:**
|
| 81 |
"""
|
| 82 |
|
| 83 |
if state.files_to_translate:
|
| 84 |
-
for i, file in enumerate(state.files_to_translate
|
| 85 |
response += f"\n{i}. `{file}`"
|
| 86 |
|
| 87 |
-
if len(state.files_to_translate) > 5:
|
| 88 |
-
|
| 89 |
|
| 90 |
response += "\n\n**🚀 Ready to start translation?**\nI can begin translating these files one by one. Would you like to proceed?"
|
| 91 |
else:
|
|
@@ -96,7 +106,18 @@ def process_file_search_handler(lang: str, k: int, history: list) -> tuple:
|
|
| 96 |
cleared_input = ""
|
| 97 |
selected_tab = 1 if state.files_to_translate else 0
|
| 98 |
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
|
| 102 |
def start_translation_process():
|
|
@@ -108,8 +129,8 @@ def start_translation_process():
|
|
| 108 |
|
| 109 |
# Call translation function (simplified for demo)
|
| 110 |
try:
|
| 111 |
-
|
| 112 |
-
state.target_language, [[current_file]]
|
| 113 |
)
|
| 114 |
|
| 115 |
state.current_file_content = {"translated": translated}
|
|
@@ -124,18 +145,24 @@ def start_translation_process():
|
|
| 124 |
original_file_link = (
|
| 125 |
"https://github.com/huggingface/transformers/blob/main/" + current_file
|
| 126 |
)
|
|
|
|
|
|
|
|
|
|
| 127 |
response = (
|
| 128 |
-
f"""🔄 Translation for: `{current_file}`
|
| 129 |
"**📄 Original Content Link:**\n"
|
| 130 |
""
|
| 131 |
f"{original_file_link}\n"
|
| 132 |
"**🌐 Translated Content:**\n"
|
| 133 |
-
f"\n```\n\n{_extract_content_for_display(translated)}```
|
| 134 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
)
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
print("extracted")
|
| 139 |
|
| 140 |
except Exception as e:
|
| 141 |
response = f"❌ Translation failed: {str(e)}"
|
|
@@ -191,12 +218,14 @@ def handle_user_message(message, history):
|
|
| 191 |
# User wants to start translation
|
| 192 |
if state.files_to_translate:
|
| 193 |
state.step = "translate"
|
| 194 |
-
response = start_translation_process()
|
|
|
|
|
|
|
|
|
|
| 195 |
else:
|
| 196 |
response = (
|
| 197 |
"❌ No files available for translation. Please search for files first."
|
| 198 |
)
|
| 199 |
-
|
| 200 |
# Handle GitHub PR creation - This part is removed as approve_handler is the main entry point
|
| 201 |
else:
|
| 202 |
# General response
|
|
@@ -288,14 +317,44 @@ def update_github_config(token, owner, repo, reference_pr_url):
|
|
| 288 |
return f"✅ GitHub configuration updated: {owner}/{repo}"
|
| 289 |
|
| 290 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
def send_message(message, history):
|
| 292 |
new_history, cleared_input = handle_user_message(message, history)
|
| 293 |
return new_history, cleared_input, update_status()
|
| 294 |
|
| 295 |
|
| 296 |
# Button handlers with tab switching
|
| 297 |
-
def start_translate_handler(history, anthropic_key):
|
| 298 |
os.environ["ANTHROPIC_API_KEY"] = anthropic_key
|
|
|
|
|
|
|
|
|
|
| 299 |
new_hist, cleared_input = handle_user_message("start translation", history)
|
| 300 |
selected_tabs = 2 if state.current_file_content["translated"] else 0
|
| 301 |
return new_hist, cleared_input, update_status(), gr.Tabs(selected=selected_tabs)
|
|
@@ -363,11 +422,16 @@ def approve_handler(history, owner, repo, reference_pr_url):
|
|
| 363 |
translated_content = state.current_file_content["translated"]
|
| 364 |
response += "\n\n🚀 **Generating GitHub PR...**"
|
| 365 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 366 |
pr_response = generate_github_pr(
|
| 367 |
target_language=state.target_language,
|
| 368 |
filepath=current_file,
|
| 369 |
translated_content=translated_content,
|
| 370 |
github_config=state.github_config,
|
|
|
|
| 371 |
)
|
| 372 |
response += f"\n{pr_response}"
|
| 373 |
else:
|
|
|
|
| 8 |
|
| 9 |
from agent.workflow import (
|
| 10 |
report_translation_target_files,
|
| 11 |
+
report_in_translation_status_files,
|
| 12 |
translate_docs_interactive,
|
| 13 |
generate_github_pr,
|
| 14 |
)
|
| 15 |
from pr_generator.searcher import find_reference_pr_simple_stream
|
| 16 |
+
from translator.content import get_full_prompt, get_content, preprocess_content
|
| 17 |
|
| 18 |
|
| 19 |
# State management
|
|
|
|
| 23 |
self.target_language = "ko"
|
| 24 |
self.k_files = 10
|
| 25 |
self.files_to_translate = []
|
| 26 |
+
self.additional_instruction = ""
|
| 27 |
self.current_file_content = {"translated": ""}
|
| 28 |
self.pr_result = None # Store PR creation result
|
| 29 |
# GitHub configuration
|
|
|
|
| 73 |
state.step = "find_files"
|
| 74 |
|
| 75 |
status_report, files_list = report_translation_target_files(lang, k)
|
| 76 |
+
in_progress_status_report, in_progress_docs = report_in_translation_status_files(
|
| 77 |
+
lang
|
| 78 |
+
)
|
| 79 |
+
state.files_to_translate = (
|
| 80 |
+
[file[0] for file in files_list if file[0] not in in_progress_docs]
|
| 81 |
+
if files_list
|
| 82 |
+
else []
|
| 83 |
+
)
|
| 84 |
|
| 85 |
response = f"""**✅ File search completed!**
|
| 86 |
|
| 87 |
**Status Report:**
|
| 88 |
{status_report}
|
| 89 |
+
{in_progress_status_report}
|
| 90 |
**📁 Found first {len(state.files_to_translate)} files to translate:**
|
| 91 |
"""
|
| 92 |
|
| 93 |
if state.files_to_translate:
|
| 94 |
+
for i, file in enumerate(state.files_to_translate, 1):
|
| 95 |
response += f"\n{i}. `{file}`"
|
| 96 |
|
| 97 |
+
# if len(state.files_to_translate) > 5:
|
| 98 |
+
# response += f"\n... and {len(state.files_to_translate) - 5} more files"
|
| 99 |
|
| 100 |
response += "\n\n**🚀 Ready to start translation?**\nI can begin translating these files one by one. Would you like to proceed?"
|
| 101 |
else:
|
|
|
|
| 106 |
cleared_input = ""
|
| 107 |
selected_tab = 1 if state.files_to_translate else 0
|
| 108 |
|
| 109 |
+
# 드롭다운 choices로 쓸 파일 리스트 반환 추가
|
| 110 |
+
return (
|
| 111 |
+
history,
|
| 112 |
+
cleared_input,
|
| 113 |
+
update_status(),
|
| 114 |
+
gr.Tabs(selected=selected_tab),
|
| 115 |
+
update_dropdown_choices(state.files_to_translate),
|
| 116 |
+
)
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def update_dropdown_choices(file_list):
|
| 120 |
+
return gr.update(choices=file_list, value=None)
|
| 121 |
|
| 122 |
|
| 123 |
def start_translation_process():
|
|
|
|
| 129 |
|
| 130 |
# Call translation function (simplified for demo)
|
| 131 |
try:
|
| 132 |
+
translated = translate_docs_interactive(
|
| 133 |
+
state.target_language, [[current_file]], state.additional_instruction
|
| 134 |
)
|
| 135 |
|
| 136 |
state.current_file_content = {"translated": translated}
|
|
|
|
| 145 |
original_file_link = (
|
| 146 |
"https://github.com/huggingface/transformers/blob/main/" + current_file
|
| 147 |
)
|
| 148 |
+
print("Compeleted translation:\n")
|
| 149 |
+
print(translated)
|
| 150 |
+
print("----------------------------")
|
| 151 |
response = (
|
| 152 |
+
f"""🔄 Translation for: `{current_file}`\n"""
|
| 153 |
"**📄 Original Content Link:**\n"
|
| 154 |
""
|
| 155 |
f"{original_file_link}\n"
|
| 156 |
"**🌐 Translated Content:**\n"
|
| 157 |
+
# f"\n```\n\n{_extract_content_for_display(translated)}\n```"
|
| 158 |
+
# "\n```\n\n"
|
| 159 |
+
# f"\n{translated}\n"
|
| 160 |
+
# f"```"
|
| 161 |
+
# f"{status}\n"
|
| 162 |
+
# "✅ Translation completed. The code block will be added when generating PR."
|
| 163 |
)
|
| 164 |
+
return response, translated
|
| 165 |
+
|
|
|
|
| 166 |
|
| 167 |
except Exception as e:
|
| 168 |
response = f"❌ Translation failed: {str(e)}"
|
|
|
|
| 218 |
# User wants to start translation
|
| 219 |
if state.files_to_translate:
|
| 220 |
state.step = "translate"
|
| 221 |
+
response, translated = start_translation_process()
|
| 222 |
+
history.append([message, response])
|
| 223 |
+
history.append(["", translated])
|
| 224 |
+
return history, ""
|
| 225 |
else:
|
| 226 |
response = (
|
| 227 |
"❌ No files available for translation. Please search for files first."
|
| 228 |
)
|
|
|
|
| 229 |
# Handle GitHub PR creation - This part is removed as approve_handler is the main entry point
|
| 230 |
else:
|
| 231 |
# General response
|
|
|
|
| 317 |
return f"✅ GitHub configuration updated: {owner}/{repo}"
|
| 318 |
|
| 319 |
|
| 320 |
+
def update_prompt_preview(language, file_path, additional_instruction):
|
| 321 |
+
"""Update prompt preview based on current settings"""
|
| 322 |
+
if not file_path.strip():
|
| 323 |
+
return "Select a file to see the prompt preview..."
|
| 324 |
+
|
| 325 |
+
try:
|
| 326 |
+
# Get language name
|
| 327 |
+
if language == "ko":
|
| 328 |
+
translation_lang = "Korean"
|
| 329 |
+
else:
|
| 330 |
+
translation_lang = language
|
| 331 |
+
|
| 332 |
+
# Get sample content (first 500 characters)
|
| 333 |
+
content = get_content(file_path)
|
| 334 |
+
to_translate = preprocess_content(content)
|
| 335 |
+
|
| 336 |
+
# Truncate for preview
|
| 337 |
+
sample_content = to_translate[:500] + ("..." if len(to_translate) > 500 else "")
|
| 338 |
+
|
| 339 |
+
# Generate prompt
|
| 340 |
+
prompt = get_full_prompt(translation_lang, sample_content, additional_instruction)
|
| 341 |
+
|
| 342 |
+
return prompt
|
| 343 |
+
except Exception as e:
|
| 344 |
+
return f"Error generating prompt preview: {str(e)}"
|
| 345 |
+
|
| 346 |
+
|
| 347 |
def send_message(message, history):
|
| 348 |
new_history, cleared_input = handle_user_message(message, history)
|
| 349 |
return new_history, cleared_input, update_status()
|
| 350 |
|
| 351 |
|
| 352 |
# Button handlers with tab switching
|
| 353 |
+
def start_translate_handler(history, anthropic_key, file_to_translate, additional_instruction=""):
|
| 354 |
os.environ["ANTHROPIC_API_KEY"] = anthropic_key
|
| 355 |
+
|
| 356 |
+
state.additional_instruction = additional_instruction
|
| 357 |
+
state.files_to_translate = [file_to_translate]
|
| 358 |
new_hist, cleared_input = handle_user_message("start translation", history)
|
| 359 |
selected_tabs = 2 if state.current_file_content["translated"] else 0
|
| 360 |
return new_hist, cleared_input, update_status(), gr.Tabs(selected=selected_tabs)
|
|
|
|
| 422 |
translated_content = state.current_file_content["translated"]
|
| 423 |
response += "\n\n🚀 **Generating GitHub PR...**"
|
| 424 |
|
| 425 |
+
# Extract title from file for toctree mapping
|
| 426 |
+
file_name = current_file.split("/")[-1].replace(".md", "").replace("_", " ").title()
|
| 427 |
+
print(file_name)
|
| 428 |
+
|
| 429 |
pr_response = generate_github_pr(
|
| 430 |
target_language=state.target_language,
|
| 431 |
filepath=current_file,
|
| 432 |
translated_content=translated_content,
|
| 433 |
github_config=state.github_config,
|
| 434 |
+
en_title=file_name,
|
| 435 |
)
|
| 436 |
response += f"\n{pr_response}"
|
| 437 |
else:
|
agent/workflow.py
CHANGED
|
@@ -11,7 +11,7 @@ from translator.content import (
|
|
| 11 |
llm_translate,
|
| 12 |
preprocess_content,
|
| 13 |
)
|
| 14 |
-
from translator.retriever import report
|
| 15 |
|
| 16 |
# GitHub PR Agent import
|
| 17 |
try:
|
|
@@ -38,8 +38,34 @@ def report_translation_target_files(
|
|
| 38 |
return status_report, [[file] for file in filepath_list]
|
| 39 |
|
| 40 |
|
| 41 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
"""Translate documentation."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
# step 1. Get content from file path
|
| 44 |
content = get_content(file_path)
|
| 45 |
to_translate = preprocess_content(content)
|
|
@@ -47,21 +73,25 @@ def translate_docs(lang: str, file_path: str) -> tuple[str, str]:
|
|
| 47 |
# step 2. Prepare prompt with docs content
|
| 48 |
if lang == "ko":
|
| 49 |
translation_lang = "Korean"
|
| 50 |
-
to_translate_with_prompt = get_full_prompt(translation_lang, to_translate)
|
|
|
|
|
|
|
| 51 |
|
| 52 |
# step 3. Translate with LLM
|
| 53 |
# TODO: MCP clilent 넘길 부분
|
| 54 |
callback_result, translated_content = llm_translate(to_translate_with_prompt)
|
| 55 |
-
|
|
|
|
| 56 |
# step 4. Add scaffold to translation result
|
| 57 |
translated_doc = fill_scaffold(content, to_translate, translated_content)
|
| 58 |
-
|
|
|
|
| 59 |
return callback_result, translated_doc
|
| 60 |
|
| 61 |
|
| 62 |
def translate_docs_interactive(
|
| 63 |
-
translate_lang: str, selected_files: list[list[str]]
|
| 64 |
-
) -> tuple[str, str
|
| 65 |
"""Interactive translation function that processes files one by one.
|
| 66 |
|
| 67 |
Args:
|
|
@@ -70,27 +100,17 @@ def translate_docs_interactive(
|
|
| 70 |
"""
|
| 71 |
# Extract file paths from the dataframe format
|
| 72 |
file_paths = [row[0] for row in selected_files if row and len(row) > 0]
|
| 73 |
-
if not file_paths:
|
| 74 |
-
return (
|
| 75 |
-
"No files selected for translation.",
|
| 76 |
-
gr.update(visible=False),
|
| 77 |
-
gr.update(visible=False),
|
| 78 |
-
gr.update(visible=False),
|
| 79 |
-
[],
|
| 80 |
-
0,
|
| 81 |
-
)
|
| 82 |
|
| 83 |
# Start with the first file
|
| 84 |
current_file = file_paths[0]
|
| 85 |
|
| 86 |
status = f"✅ Translation completed: `{current_file}` → `{translate_lang}`\n\n"
|
| 87 |
-
callback_result, translated_content = translate_docs(translate_lang, current_file)
|
| 88 |
status += f"💰 Used token and cost: \n```\n{callback_result}\n```"
|
| 89 |
|
| 90 |
-
|
| 91 |
-
status += f"\n### 📝 Note: Currently, only the first file has been translated.\n> The remaining {len(file_paths) - 1} files have not been processed yet, as the system is in its beta version"
|
| 92 |
|
| 93 |
-
return
|
| 94 |
|
| 95 |
|
| 96 |
def generate_github_pr(
|
|
@@ -98,6 +118,7 @@ def generate_github_pr(
|
|
| 98 |
filepath: str,
|
| 99 |
translated_content: str = None,
|
| 100 |
github_config: dict = None,
|
|
|
|
| 101 |
) -> str:
|
| 102 |
"""Generate a GitHub PR for translated documentation.
|
| 103 |
|
|
@@ -106,6 +127,7 @@ def generate_github_pr(
|
|
| 106 |
filepath: Original file path (e.g., "docs/source/en/accelerator_selection.md")
|
| 107 |
translated_content: Translated content (if None, read from file)
|
| 108 |
github_config: GitHub configuration dictionary
|
|
|
|
| 109 |
|
| 110 |
Returns:
|
| 111 |
PR creation result message
|
|
@@ -149,9 +171,7 @@ def generate_github_pr(
|
|
| 149 |
print(f" 📁 File: {filepath}")
|
| 150 |
print(f" 🌍 Language: {target_language}")
|
| 151 |
print(f" 📊 Reference PR: {github_config['reference_pr_url']}")
|
| 152 |
-
print(
|
| 153 |
-
f" 🏠 Repository: {github_config['owner']}/{github_config['repo_name']}"
|
| 154 |
-
)
|
| 155 |
|
| 156 |
agent = GitHubPRAgent()
|
| 157 |
result = agent.run_translation_pr_workflow(
|
|
@@ -163,14 +183,37 @@ def generate_github_pr(
|
|
| 163 |
repo_name=github_config["repo_name"],
|
| 164 |
base_branch=github_config.get("base_branch", "main"),
|
| 165 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
# Process result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
if result["status"] == "success":
|
| 169 |
return f"""✅ **GitHub PR Creation Successful!**
|
| 170 |
|
| 171 |
🔗 **PR URL:** {result["pr_url"]}
|
| 172 |
🌿 **Branch:** {result["branch"]}
|
| 173 |
-
📁 **File:** {result["file_path"]}
|
| 174 |
|
| 175 |
{result["message"]}"""
|
| 176 |
|
|
@@ -178,7 +221,7 @@ def generate_github_pr(
|
|
| 178 |
return f"""⚠️ **Partial Success**
|
| 179 |
|
| 180 |
🌿 **Branch:** {result["branch"]}
|
| 181 |
-
📁 **File:** {result["file_path"]}
|
| 182 |
|
| 183 |
{result["message"]}
|
| 184 |
|
|
|
|
| 11 |
llm_translate,
|
| 12 |
preprocess_content,
|
| 13 |
)
|
| 14 |
+
from translator.retriever import report, get_github_issue_open_pr
|
| 15 |
|
| 16 |
# GitHub PR Agent import
|
| 17 |
try:
|
|
|
|
| 38 |
return status_report, [[file] for file in filepath_list]
|
| 39 |
|
| 40 |
|
| 41 |
+
def report_in_translation_status_files(translate_lang: str) -> tuple[str, list[str]]:
|
| 42 |
+
docs, pr_info_list = get_github_issue_open_pr(translate_lang)
|
| 43 |
+
|
| 44 |
+
status_report = ""
|
| 45 |
+
if docs:
|
| 46 |
+
status_report = f"""\n🤖 Found {len(docs)} in progress for translation.
|
| 47 |
+
"""
|
| 48 |
+
for i, file in enumerate(docs):
|
| 49 |
+
status_report += f"\n{i+1}. `{file}`: {pr_info_list[i]}"
|
| 50 |
+
status_report += "\n"
|
| 51 |
+
return status_report, docs
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def translate_docs(lang: str, file_path: str, additional_instruction: str = "") -> tuple[str, str]:
|
| 55 |
"""Translate documentation."""
|
| 56 |
+
# Check if translation already exists
|
| 57 |
+
translation_file_path = (
|
| 58 |
+
Path(__file__).resolve().parent.parent
|
| 59 |
+
/ f"translation_result/{file_path}"
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
if translation_file_path.exists():
|
| 63 |
+
print(f"📄 Found existing translation: {translation_file_path}")
|
| 64 |
+
with open(translation_file_path, "r", encoding="utf-8") as f:
|
| 65 |
+
existing_content = f.read()
|
| 66 |
+
if existing_content.strip():
|
| 67 |
+
return "Existing translation loaded (no tokens used)", existing_content
|
| 68 |
+
|
| 69 |
# step 1. Get content from file path
|
| 70 |
content = get_content(file_path)
|
| 71 |
to_translate = preprocess_content(content)
|
|
|
|
| 73 |
# step 2. Prepare prompt with docs content
|
| 74 |
if lang == "ko":
|
| 75 |
translation_lang = "Korean"
|
| 76 |
+
to_translate_with_prompt = get_full_prompt(translation_lang, to_translate, additional_instruction)
|
| 77 |
+
|
| 78 |
+
print("to_translate_with_prompt:\n", to_translate_with_prompt)
|
| 79 |
|
| 80 |
# step 3. Translate with LLM
|
| 81 |
# TODO: MCP clilent 넘길 부분
|
| 82 |
callback_result, translated_content = llm_translate(to_translate_with_prompt)
|
| 83 |
+
print("translated_content:\n")
|
| 84 |
+
print(translated_content)
|
| 85 |
# step 4. Add scaffold to translation result
|
| 86 |
translated_doc = fill_scaffold(content, to_translate, translated_content)
|
| 87 |
+
print("translated_doc:\n")
|
| 88 |
+
print(translated_doc)
|
| 89 |
return callback_result, translated_doc
|
| 90 |
|
| 91 |
|
| 92 |
def translate_docs_interactive(
|
| 93 |
+
translate_lang: str, selected_files: list[list[str]], additional_instruction: str = ""
|
| 94 |
+
) -> tuple[str, str]:
|
| 95 |
"""Interactive translation function that processes files one by one.
|
| 96 |
|
| 97 |
Args:
|
|
|
|
| 100 |
"""
|
| 101 |
# Extract file paths from the dataframe format
|
| 102 |
file_paths = [row[0] for row in selected_files if row and len(row) > 0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
# Start with the first file
|
| 105 |
current_file = file_paths[0]
|
| 106 |
|
| 107 |
status = f"✅ Translation completed: `{current_file}` → `{translate_lang}`\n\n"
|
| 108 |
+
callback_result, translated_content = translate_docs(translate_lang, current_file, additional_instruction)
|
| 109 |
status += f"💰 Used token and cost: \n```\n{callback_result}\n```"
|
| 110 |
|
| 111 |
+
print(status)
|
|
|
|
| 112 |
|
| 113 |
+
return translated_content
|
| 114 |
|
| 115 |
|
| 116 |
def generate_github_pr(
|
|
|
|
| 118 |
filepath: str,
|
| 119 |
translated_content: str = None,
|
| 120 |
github_config: dict = None,
|
| 121 |
+
en_title: str = None,
|
| 122 |
) -> str:
|
| 123 |
"""Generate a GitHub PR for translated documentation.
|
| 124 |
|
|
|
|
| 127 |
filepath: Original file path (e.g., "docs/source/en/accelerator_selection.md")
|
| 128 |
translated_content: Translated content (if None, read from file)
|
| 129 |
github_config: GitHub configuration dictionary
|
| 130 |
+
en_title: English title for toctree mapping
|
| 131 |
|
| 132 |
Returns:
|
| 133 |
PR creation result message
|
|
|
|
| 171 |
print(f" 📁 File: {filepath}")
|
| 172 |
print(f" 🌍 Language: {target_language}")
|
| 173 |
print(f" 📊 Reference PR: {github_config['reference_pr_url']}")
|
| 174 |
+
print(f" 🏠 Repository: {github_config['owner']}/{github_config['repo_name']}")
|
|
|
|
|
|
|
| 175 |
|
| 176 |
agent = GitHubPRAgent()
|
| 177 |
result = agent.run_translation_pr_workflow(
|
|
|
|
| 183 |
repo_name=github_config["repo_name"],
|
| 184 |
base_branch=github_config.get("base_branch", "main"),
|
| 185 |
)
|
| 186 |
+
# result = {
|
| 187 |
+
# 'status': 'partial_success',
|
| 188 |
+
# 'branch': 'ko-attention_interface',
|
| 189 |
+
# 'file_path': 'docs/source/ko/attention_interface.md',
|
| 190 |
+
# 'message': 'File was saved and commit was successful.\nPR creation failed: ERROR: Existing PR found: https://github.com/Jwaminju/transformers/pull/1', 'error_details': 'ERROR: Existing PR found: https://github.com/Jwaminju/transformers/pull/1'
|
| 191 |
+
# }
|
| 192 |
+
# Process toctree update after successful translation PR
|
| 193 |
+
toctree_result = None
|
| 194 |
+
if en_title:
|
| 195 |
+
from agent.toctree_handler import TocTreeHandler
|
| 196 |
+
toctree_handler = TocTreeHandler()
|
| 197 |
+
toctree_result = toctree_handler.update_toctree_after_translation(
|
| 198 |
+
result, en_title, filepath, agent, github_config
|
| 199 |
+
)
|
| 200 |
+
print("toctree_result:", toctree_result)
|
| 201 |
|
| 202 |
# Process result
|
| 203 |
+
# Generate toctree status message (shared for both success and partial_success)
|
| 204 |
+
toctree_status = ""
|
| 205 |
+
if toctree_result:
|
| 206 |
+
if toctree_result["status"] == "success":
|
| 207 |
+
toctree_status = f"\n📋 **Toctree Updated:** ✅ {toctree_result['message']}"
|
| 208 |
+
else:
|
| 209 |
+
toctree_status = f"\n📋 **Toctree Update Failed:** ❌ {toctree_result['message']}"
|
| 210 |
+
|
| 211 |
if result["status"] == "success":
|
| 212 |
return f"""✅ **GitHub PR Creation Successful!**
|
| 213 |
|
| 214 |
🔗 **PR URL:** {result["pr_url"]}
|
| 215 |
🌿 **Branch:** {result["branch"]}
|
| 216 |
+
📁 **File:** {result["file_path"]}{toctree_status}
|
| 217 |
|
| 218 |
{result["message"]}"""
|
| 219 |
|
|
|
|
| 221 |
return f"""⚠️ **Partial Success**
|
| 222 |
|
| 223 |
🌿 **Branch:** {result["branch"]}
|
| 224 |
+
📁 **File:** {result["file_path"]}{toctree_status}
|
| 225 |
|
| 226 |
{result["message"]}
|
| 227 |
|
app.py
CHANGED
|
@@ -14,6 +14,7 @@ from agent.handler import (
|
|
| 14 |
send_message,
|
| 15 |
start_translate_handler,
|
| 16 |
sync_language_displays,
|
|
|
|
| 17 |
update_status,
|
| 18 |
update_github_config,
|
| 19 |
)
|
|
@@ -30,7 +31,7 @@ css = """
|
|
| 30 |
background: rgba(255, 255, 180, 0.25);
|
| 31 |
border-radius: 18px;
|
| 32 |
box-shadow: 0 4px 24px rgba(0,0,0,0.08);
|
| 33 |
-
padding: 1.
|
| 34 |
backdrop-filter: blur(8px);
|
| 35 |
border: 1px solid rgba(255,255,180,0.25);
|
| 36 |
width: 100%;
|
|
@@ -40,10 +41,12 @@ css = """
|
|
| 40 |
background: rgba(255, 255, 180, 0.25);
|
| 41 |
border-radius: 18px;
|
| 42 |
box-shadow: 0 4px 24px rgba(0,0,0,0.08);
|
| 43 |
-
padding: 1.
|
| 44 |
backdrop-filter: blur(8px);
|
| 45 |
border: 1px solid rgba(255,255,180,0.25);
|
| 46 |
width: 100%;
|
|
|
|
|
|
|
| 47 |
}
|
| 48 |
.status-card {
|
| 49 |
width: 100%
|
|
@@ -91,7 +94,6 @@ css = """
|
|
| 91 |
with gr.Blocks(
|
| 92 |
css=css, title=" 🌐 Hugging Face Transformers Docs i18n made easy"
|
| 93 |
) as demo:
|
| 94 |
-
|
| 95 |
# Title
|
| 96 |
with open("images/hfkr_logo.png", "rb") as img_file:
|
| 97 |
base64_img = base64.b64encode(img_file.read()).decode()
|
|
@@ -105,11 +107,12 @@ with gr.Blocks(
|
|
| 105 |
# Content
|
| 106 |
with gr.Row():
|
| 107 |
# Chat interface
|
| 108 |
-
with gr.Column(scale=
|
| 109 |
gr.Markdown("### 🌐 Hugging Face i18n Agent")
|
| 110 |
|
| 111 |
chatbot = gr.Chatbot(
|
| 112 |
-
value=[[None, get_welcome_message()]], scale=1, height=585
|
|
|
|
| 113 |
)
|
| 114 |
|
| 115 |
# Controller interface
|
|
@@ -122,16 +125,15 @@ with gr.Blocks(
|
|
| 122 |
with gr.Tabs(elem_classes="simple-tabs") as control_tabs:
|
| 123 |
with gr.TabItem("1. Find Files", id=0):
|
| 124 |
with gr.Group():
|
| 125 |
-
lang_dropdown = gr.
|
| 126 |
choices=[language.value for language in Languages],
|
| 127 |
label="🌍 Translate To",
|
| 128 |
value="ko",
|
| 129 |
)
|
| 130 |
k_input = gr.Number(
|
| 131 |
label="📊 First k missing translated docs",
|
| 132 |
-
value=
|
| 133 |
minimum=1,
|
| 134 |
-
maximum=100,
|
| 135 |
)
|
| 136 |
find_btn = gr.Button(
|
| 137 |
"🔍 Find Files to Translate",
|
|
@@ -140,6 +142,17 @@ with gr.Blocks(
|
|
| 140 |
|
| 141 |
with gr.TabItem("2. Translate", id=1):
|
| 142 |
with gr.Group():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
translate_lang_display = gr.Dropdown(
|
| 144 |
choices=[language.value for language in Languages],
|
| 145 |
label="🌍 Translation Language",
|
|
@@ -150,6 +163,21 @@ with gr.Blocks(
|
|
| 150 |
label="🔑 Anthropic API key for translation generation",
|
| 151 |
type="password",
|
| 152 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
start_translate_btn = gr.Button(
|
| 154 |
"🚀 Start Translation", elem_classes="action-button"
|
| 155 |
)
|
|
@@ -186,7 +214,7 @@ with gr.Blocks(
|
|
| 186 |
|
| 187 |
# Chat Controller
|
| 188 |
with gr.Column(elem_classes=["control-panel"]):
|
| 189 |
-
gr.Markdown("### 💬 Chat with agent")
|
| 190 |
msg_input = gr.Textbox(
|
| 191 |
placeholder="Type your message here... (e.g. 'what', 'how', or 'help')",
|
| 192 |
container=False,
|
|
@@ -199,7 +227,7 @@ with gr.Blocks(
|
|
| 199 |
find_btn.click(
|
| 200 |
fn=process_file_search_handler,
|
| 201 |
inputs=[lang_dropdown, k_input, chatbot],
|
| 202 |
-
outputs=[chatbot, msg_input, status_display, control_tabs],
|
| 203 |
)
|
| 204 |
|
| 205 |
# Sync language across tabs
|
|
@@ -209,10 +237,17 @@ with gr.Blocks(
|
|
| 209 |
outputs=[translate_lang_display],
|
| 210 |
)
|
| 211 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
# Button event handlers
|
| 213 |
start_translate_btn.click(
|
| 214 |
fn=start_translate_handler,
|
| 215 |
-
inputs=[chatbot, anthropic_key],
|
| 216 |
outputs=[chatbot, msg_input, status_display, control_tabs],
|
| 217 |
)
|
| 218 |
|
|
@@ -247,5 +282,13 @@ with gr.Blocks(
|
|
| 247 |
outputs=[chatbot, msg_input, status_display],
|
| 248 |
)
|
| 249 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
root_path = os.environ.get("GRADIO_ROOT_PATH")
|
| 251 |
demo.launch(root_path=root_path)
|
|
|
|
| 14 |
send_message,
|
| 15 |
start_translate_handler,
|
| 16 |
sync_language_displays,
|
| 17 |
+
update_prompt_preview,
|
| 18 |
update_status,
|
| 19 |
update_github_config,
|
| 20 |
)
|
|
|
|
| 31 |
background: rgba(255, 255, 180, 0.25);
|
| 32 |
border-radius: 18px;
|
| 33 |
box-shadow: 0 4px 24px rgba(0,0,0,0.08);
|
| 34 |
+
padding: 1.0em;
|
| 35 |
backdrop-filter: blur(8px);
|
| 36 |
border: 1px solid rgba(255,255,180,0.25);
|
| 37 |
width: 100%;
|
|
|
|
| 41 |
background: rgba(255, 255, 180, 0.25);
|
| 42 |
border-radius: 18px;
|
| 43 |
box-shadow: 0 4px 24px rgba(0,0,0,0.08);
|
| 44 |
+
padding: 1.0em;
|
| 45 |
backdrop-filter: blur(8px);
|
| 46 |
border: 1px solid rgba(255,255,180,0.25);
|
| 47 |
width: 100%;
|
| 48 |
+
overflow: visible !important;
|
| 49 |
+
|
| 50 |
}
|
| 51 |
.status-card {
|
| 52 |
width: 100%
|
|
|
|
| 94 |
with gr.Blocks(
|
| 95 |
css=css, title=" 🌐 Hugging Face Transformers Docs i18n made easy"
|
| 96 |
) as demo:
|
|
|
|
| 97 |
# Title
|
| 98 |
with open("images/hfkr_logo.png", "rb") as img_file:
|
| 99 |
base64_img = base64.b64encode(img_file.read()).decode()
|
|
|
|
| 107 |
# Content
|
| 108 |
with gr.Row():
|
| 109 |
# Chat interface
|
| 110 |
+
with gr.Column(scale=3, elem_classes=["chat-container"]):
|
| 111 |
gr.Markdown("### 🌐 Hugging Face i18n Agent")
|
| 112 |
|
| 113 |
chatbot = gr.Chatbot(
|
| 114 |
+
value=[[None, get_welcome_message()]], scale=1, height=585,
|
| 115 |
+
show_copy_button=True
|
| 116 |
)
|
| 117 |
|
| 118 |
# Controller interface
|
|
|
|
| 125 |
with gr.Tabs(elem_classes="simple-tabs") as control_tabs:
|
| 126 |
with gr.TabItem("1. Find Files", id=0):
|
| 127 |
with gr.Group():
|
| 128 |
+
lang_dropdown = gr.Radio(
|
| 129 |
choices=[language.value for language in Languages],
|
| 130 |
label="🌍 Translate To",
|
| 131 |
value="ko",
|
| 132 |
)
|
| 133 |
k_input = gr.Number(
|
| 134 |
label="📊 First k missing translated docs",
|
| 135 |
+
value=10,
|
| 136 |
minimum=1,
|
|
|
|
| 137 |
)
|
| 138 |
find_btn = gr.Button(
|
| 139 |
"🔍 Find Files to Translate",
|
|
|
|
| 142 |
|
| 143 |
with gr.TabItem("2. Translate", id=1):
|
| 144 |
with gr.Group():
|
| 145 |
+
files_to_translate = gr.Radio(
|
| 146 |
+
choices=[],
|
| 147 |
+
label="📄 Select a file to translate",
|
| 148 |
+
interactive=True,
|
| 149 |
+
value=None,
|
| 150 |
+
)
|
| 151 |
+
file_to_translate_input = gr.Textbox(
|
| 152 |
+
label="🌍 Select in the dropdown or write the file path to translate",
|
| 153 |
+
value="",
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
translate_lang_display = gr.Dropdown(
|
| 157 |
choices=[language.value for language in Languages],
|
| 158 |
label="🌍 Translation Language",
|
|
|
|
| 163 |
label="🔑 Anthropic API key for translation generation",
|
| 164 |
type="password",
|
| 165 |
)
|
| 166 |
+
additional_instruction = gr.Textbox(
|
| 167 |
+
label="📝 Additional instructions (Optional - e.g., custom glossary)",
|
| 168 |
+
placeholder="Example: Translate 'model' as '모델' consistently",
|
| 169 |
+
lines=2,
|
| 170 |
+
)
|
| 171 |
+
|
| 172 |
+
with gr.Accordion("🔍 Preview Prompt", open=False):
|
| 173 |
+
prompt_preview = gr.Textbox(
|
| 174 |
+
label="Current Translation Prompt",
|
| 175 |
+
lines=8,
|
| 176 |
+
interactive=False,
|
| 177 |
+
placeholder="Select a file and language to see the prompt preview...",
|
| 178 |
+
show_copy_button=True,
|
| 179 |
+
)
|
| 180 |
+
|
| 181 |
start_translate_btn = gr.Button(
|
| 182 |
"🚀 Start Translation", elem_classes="action-button"
|
| 183 |
)
|
|
|
|
| 214 |
|
| 215 |
# Chat Controller
|
| 216 |
with gr.Column(elem_classes=["control-panel"]):
|
| 217 |
+
gr.Markdown("### 💬 Chat with agent (Only simple chat is available)")
|
| 218 |
msg_input = gr.Textbox(
|
| 219 |
placeholder="Type your message here... (e.g. 'what', 'how', or 'help')",
|
| 220 |
container=False,
|
|
|
|
| 227 |
find_btn.click(
|
| 228 |
fn=process_file_search_handler,
|
| 229 |
inputs=[lang_dropdown, k_input, chatbot],
|
| 230 |
+
outputs=[chatbot, msg_input, status_display, control_tabs, files_to_translate],
|
| 231 |
)
|
| 232 |
|
| 233 |
# Sync language across tabs
|
|
|
|
| 237 |
outputs=[translate_lang_display],
|
| 238 |
)
|
| 239 |
|
| 240 |
+
#
|
| 241 |
+
files_to_translate.change(
|
| 242 |
+
fn=lambda x: x,
|
| 243 |
+
inputs=[files_to_translate],
|
| 244 |
+
outputs=[file_to_translate_input],
|
| 245 |
+
)
|
| 246 |
+
|
| 247 |
# Button event handlers
|
| 248 |
start_translate_btn.click(
|
| 249 |
fn=start_translate_handler,
|
| 250 |
+
inputs=[chatbot, anthropic_key, file_to_translate_input, additional_instruction],
|
| 251 |
outputs=[chatbot, msg_input, status_display, control_tabs],
|
| 252 |
)
|
| 253 |
|
|
|
|
| 282 |
outputs=[chatbot, msg_input, status_display],
|
| 283 |
)
|
| 284 |
|
| 285 |
+
# Update prompt preview when inputs change
|
| 286 |
+
for input_component in [translate_lang_display, file_to_translate_input, additional_instruction]:
|
| 287 |
+
input_component.change(
|
| 288 |
+
fn=update_prompt_preview,
|
| 289 |
+
inputs=[translate_lang_display, file_to_translate_input, additional_instruction],
|
| 290 |
+
outputs=[prompt_preview],
|
| 291 |
+
)
|
| 292 |
+
|
| 293 |
root_path = os.environ.get("GRADIO_ROOT_PATH")
|
| 294 |
demo.launch(root_path=root_path)
|
pr_generator/agent.py
CHANGED
|
@@ -518,7 +518,7 @@ Please return only the commit message. No other explanation is needed."""
|
|
| 518 |
"status": "partial_success",
|
| 519 |
"branch": branch_name,
|
| 520 |
"file_path": target_filepath,
|
| 521 |
-
"message": f"File was saved
|
| 522 |
"error_details": pr_result,
|
| 523 |
}
|
| 524 |
elif "successful" in pr_result and "http" in pr_result:
|
|
|
|
| 518 |
"status": "partial_success",
|
| 519 |
"branch": branch_name,
|
| 520 |
"file_path": target_filepath,
|
| 521 |
+
"message": f"File was saved and commit was successful.\nPR creation failed: {pr_result}",
|
| 522 |
"error_details": pr_result,
|
| 523 |
}
|
| 524 |
elif "successful" in pr_result and "http" in pr_result:
|
translation_result/docs/source/en/accelerator_selection.md
CHANGED
|
@@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
|
|
| 16 |
|
| 17 |
# 가속기 선택 [[accelerator-selection]]
|
| 18 |
|
| 19 |
-
분산
|
| 20 |
|
| 21 |
이 가이드는 사용할 가속기의 수와 사용 순서를 선택하는 방법을 보여줍니다.
|
| 22 |
|
|
@@ -27,7 +27,7 @@ rendered properly in your Markdown viewer.
|
|
| 27 |
<hfoptions id="select-accelerator">
|
| 28 |
<hfoption id="torchrun">
|
| 29 |
|
| 30 |
-
`--nproc_per_node`를 사용하여 사용할 가속기 수를 선택
|
| 31 |
|
| 32 |
```bash
|
| 33 |
torchrun --nproc_per_node=2 trainer-program.py ...
|
|
@@ -36,7 +36,7 @@ torchrun --nproc_per_node=2 trainer-program.py ...
|
|
| 36 |
</hfoption>
|
| 37 |
<hfoption id="Accelerate">
|
| 38 |
|
| 39 |
-
`--num_processes`를 사용하여 사용할 가속기 수를 선택
|
| 40 |
|
| 41 |
```bash
|
| 42 |
accelerate launch --num_processes 2 trainer-program.py ...
|
|
@@ -45,7 +45,7 @@ accelerate launch --num_processes 2 trainer-program.py ...
|
|
| 45 |
</hfoption>
|
| 46 |
<hfoption id="DeepSpeed">
|
| 47 |
|
| 48 |
-
`--num_gpus`를 사용하여 사용할 GPU 수를 선택
|
| 49 |
|
| 50 |
```bash
|
| 51 |
deepspeed --num_gpus 2 trainer-program.py ...
|
|
@@ -55,7 +55,7 @@ deepspeed --num_gpus 2 trainer-program.py ...
|
|
| 55 |
</hfoptions>
|
| 56 |
|
| 57 |
## 가속기 순서 [[order-of-accelerators]]
|
| 58 |
-
사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 각 실행
|
| 59 |
|
| 60 |
예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
|
| 61 |
|
|
@@ -66,7 +66,7 @@ deepspeed --num_gpus 2 trainer-program.py ...
|
|
| 66 |
CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
|
| 67 |
```
|
| 68 |
|
| 69 |
-
GPU 0과 2만 PyTorch에 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
|
| 70 |
순서를 바꾸려면 (GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
|
| 71 |
|
| 72 |
|
|
@@ -80,15 +80,15 @@ GPU 없이 실행하려면:
|
|
| 80 |
CUDA_VISIBLE_DEVICES= python trainer-program.py ...
|
| 81 |
```
|
| 82 |
|
| 83 |
-
`CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치 순서를 제어할 수도 있습니다:
|
| 84 |
|
| 85 |
-
- PCIe 버스 ID 순서
|
| 86 |
|
| 87 |
```bash
|
| 88 |
$hf_i18n_placeholder21export CUDA_DEVICE_ORDER=PCI_BUS_ID
|
| 89 |
```
|
| 90 |
|
| 91 |
-
-
|
| 92 |
|
| 93 |
```bash
|
| 94 |
export CUDA_DEVICE_ORDER=FASTEST_FIRST
|
|
@@ -101,7 +101,7 @@ $hf_i18n_placeholder21export CUDA_DEVICE_ORDER=PCI_BUS_ID
|
|
| 101 |
ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
|
| 102 |
```
|
| 103 |
|
| 104 |
-
XPU 0과 2만 PyTorch에 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
|
| 105 |
순서를 바꾸려면 (XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
|
| 106 |
|
| 107 |
```bash
|
|
@@ -109,13 +109,13 @@ ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
|
|
| 109 |
```
|
| 110 |
|
| 111 |
|
| 112 |
-
다음
|
| 113 |
|
| 114 |
```bash
|
| 115 |
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
|
| 116 |
```
|
| 117 |
|
| 118 |
-
Intel XPU의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
|
| 119 |
|
| 120 |
</hfoption>
|
| 121 |
</hfoptions>
|
|
@@ -123,5 +123,5 @@ Intel XPU의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero]
|
|
| 123 |
|
| 124 |
|
| 125 |
> [!WARNING]
|
| 126 |
-
> 환경 변수는 명령줄에 추가하는 대신
|
| 127 |
```
|
|
|
|
| 16 |
|
| 17 |
# 가속기 선택 [[accelerator-selection]]
|
| 18 |
|
| 19 |
+
분산 학습 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기가 있을 때 더 빠른 가속기를 먼저 사용하고 싶은 경우에 유용할 수 있습니다. 또는 사용 가능한 가속기의 일부만 사용할 수도 있습니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)은 필요하지 않습니다.
|
| 20 |
|
| 21 |
이 가이드는 사용할 가속기의 수와 사용 순서를 선택하는 방법을 보여줍니다.
|
| 22 |
|
|
|
|
| 27 |
<hfoptions id="select-accelerator">
|
| 28 |
<hfoption id="torchrun">
|
| 29 |
|
| 30 |
+
`--nproc_per_node`를 사용하여 사용할 가속기 수를 선택합니다.
|
| 31 |
|
| 32 |
```bash
|
| 33 |
torchrun --nproc_per_node=2 trainer-program.py ...
|
|
|
|
| 36 |
</hfoption>
|
| 37 |
<hfoption id="Accelerate">
|
| 38 |
|
| 39 |
+
`--num_processes`를 사용하여 사용할 가속기 수를 선택합니다.
|
| 40 |
|
| 41 |
```bash
|
| 42 |
accelerate launch --num_processes 2 trainer-program.py ...
|
|
|
|
| 45 |
</hfoption>
|
| 46 |
<hfoption id="DeepSpeed">
|
| 47 |
|
| 48 |
+
`--num_gpus`를 사용하여 사용할 GPU 수를 선택합니다.
|
| 49 |
|
| 50 |
```bash
|
| 51 |
deepspeed --num_gpus 2 trainer-program.py ...
|
|
|
|
| 55 |
</hfoptions>
|
| 56 |
|
| 57 |
## 가속기 순서 [[order-of-accelerators]]
|
| 58 |
+
사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 종종 각 실행에 대해 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 구성 파일에 추가할 수도 있습니다.
|
| 59 |
|
| 60 |
예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
|
| 61 |
|
|
|
|
| 66 |
CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
|
| 67 |
```
|
| 68 |
|
| 69 |
+
GPU 0과 2만 PyTorch에서 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
|
| 70 |
순서를 바꾸려면 (GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
|
| 71 |
|
| 72 |
|
|
|
|
| 80 |
CUDA_VISIBLE_DEVICES= python trainer-program.py ...
|
| 81 |
```
|
| 82 |
|
| 83 |
+
`CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:
|
| 84 |
|
| 85 |
+
- PCIe 버스 ID 순서 (`nvidia-smi`와 일치):
|
| 86 |
|
| 87 |
```bash
|
| 88 |
export CUDA_DEVICE_ORDER=PCI_BUS_ID
|
| 89 |
```
|
| 90 |
|
| 91 |
+
- 컴퓨팅 성능 순서 (가장 빠른 것부터):
|
| 92 |
|
| 93 |
```bash
|
| 94 |
export CUDA_DEVICE_ORDER=FASTEST_FIRST
|
|
|
|
| 101 |
ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
|
| 102 |
```
|
| 103 |
|
| 104 |
+
XPU 0과 2만 PyTorch에서 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
|
| 105 |
순서를 바꾸려면 (XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
|
| 106 |
|
| 107 |
```bash
|
|
|
|
| 109 |
```
|
| 110 |
|
| 111 |
|
| 112 |
+
다음을 사용하여 Intel XPU의 순서를 제어할 수도 있습니다:
|
| 113 |
|
| 114 |
```bash
|
| 115 |
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
|
| 116 |
```
|
| 117 |
|
| 118 |
+
Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
|
| 119 |
|
| 120 |
</hfoption>
|
| 121 |
</hfoptions>
|
|
|
|
| 123 |
|
| 124 |
|
| 125 |
> [!WARNING]
|
| 126 |
+
> 환경 변수는 명령줄에 추가하는 대신 내보낼 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란을 야기할 수 있으므로 권장하지 않습니다. 대신, 같은 명령줄에서 특정 훈련 실행을 위해 환경 변수를 설정하는 것이 일반적인 관례입니다.
|
| 127 |
```
|
translator/content.py
CHANGED
|
@@ -5,8 +5,13 @@ import requests
|
|
| 5 |
from langchain.callbacks import get_openai_callback
|
| 6 |
from langchain_anthropic import ChatAnthropic
|
| 7 |
|
|
|
|
|
|
|
| 8 |
|
| 9 |
def get_content(filepath: str) -> str:
|
|
|
|
|
|
|
|
|
|
| 10 |
url = string.Template(
|
| 11 |
"https://raw.githubusercontent.com/huggingface/" "transformers/main/$filepath"
|
| 12 |
).safe_substitute(filepath=filepath)
|
|
@@ -24,24 +29,31 @@ def preprocess_content(content: str) -> str:
|
|
| 24 |
## ignore top license comment
|
| 25 |
to_translate = content[content.find("#") :]
|
| 26 |
## remove code blocks from text
|
| 27 |
-
to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
|
| 28 |
## remove markdown tables from text
|
| 29 |
-
to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
|
| 30 |
## remove empty lines from text
|
| 31 |
to_translate = re.sub(r"\n\n+", "\n\n", to_translate)
|
| 32 |
-
|
| 33 |
return to_translate
|
| 34 |
|
| 35 |
|
| 36 |
-
def get_full_prompt(language: str, to_translate: str) -> str:
|
| 37 |
-
|
| 38 |
"What do these sentences about Hugging Face Transformers "
|
| 39 |
"(a machine learning library) mean in $language? "
|
| 40 |
"Please do not translate the word after a 🤗 emoji "
|
| 41 |
-
"as it is a product name. Output
|
| 42 |
-
"
|
| 43 |
).safe_substitute(language=language)
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
|
| 47 |
def split_markdown_sections(markdown: str) -> list:
|
|
@@ -64,33 +76,89 @@ def make_scaffold(content: str, to_translate: str) -> string.Template:
|
|
| 64 |
scaffold = content
|
| 65 |
for i, text in enumerate(to_translate.split("\n\n")):
|
| 66 |
scaffold = scaffold.replace(text, f"$hf_i18n_placeholder{i}", 1)
|
|
|
|
|
|
|
| 67 |
return string.Template(scaffold)
|
| 68 |
|
| 69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
def fill_scaffold(content: str, to_translate: str, translated: str) -> str:
|
| 71 |
scaffold = make_scaffold(content, to_translate)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
divided = split_markdown_sections(to_translate)
|
|
|
|
|
|
|
| 73 |
anchors = get_anchors(divided)
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
translated
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
translated_doc = scaffold.safe_substitute(
|
| 93 |
-
{f"hf_i18n_placeholder{i}": text for i, text in enumerate(
|
| 94 |
)
|
| 95 |
return translated_doc
|
| 96 |
|
|
|
|
| 5 |
from langchain.callbacks import get_openai_callback
|
| 6 |
from langchain_anthropic import ChatAnthropic
|
| 7 |
|
| 8 |
+
from translator.prompt_glossary import PROMPT_WITH_GLOSSARY
|
| 9 |
+
|
| 10 |
|
| 11 |
def get_content(filepath: str) -> str:
|
| 12 |
+
if filepath == "":
|
| 13 |
+
raise ValueError("No files selected for translation.")
|
| 14 |
+
|
| 15 |
url = string.Template(
|
| 16 |
"https://raw.githubusercontent.com/huggingface/" "transformers/main/$filepath"
|
| 17 |
).safe_substitute(filepath=filepath)
|
|
|
|
| 29 |
## ignore top license comment
|
| 30 |
to_translate = content[content.find("#") :]
|
| 31 |
## remove code blocks from text
|
| 32 |
+
# to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
|
| 33 |
## remove markdown tables from text
|
| 34 |
+
# to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
|
| 35 |
## remove empty lines from text
|
| 36 |
to_translate = re.sub(r"\n\n+", "\n\n", to_translate)
|
|
|
|
| 37 |
return to_translate
|
| 38 |
|
| 39 |
|
| 40 |
+
def get_full_prompt(language: str, to_translate: str, additional_instruction: str = "") -> str:
    """Build the complete translation prompt sent to the model.

    The prompt is assembled, in order, from: the fixed task instruction with
    ``$language`` substituted, the source text wrapped in a ```md fence, the
    glossary prompt, and (only when non-blank) the caller's extra instructions.

    Args:
        language: Target language name inserted into the instruction.
        to_translate: Markdown text to translate; surrounding whitespace is
            stripped before it is embedded in the fence.
        additional_instruction: Optional free-form instructions appended at
            the end of the prompt. Blank or whitespace-only values are ignored.

    Returns:
        The full prompt string.
    """
    # Adjacent literals are concatenated verbatim, so each fragment must carry
    # its own trailing space/punctuation; otherwise sentences run together.
    base_prompt = string.Template(
        "What do these sentences about Hugging Face Transformers "
        "(a machine learning library) mean in $language? "
        "Please do not translate the word after a 🤗 emoji "
        "as it is a product name. Output **the complete markdown file**, "
        "with prose translated and all other content intact. "
        "No explanations or extras—only the translated markdown. "
        "Also translate all comments within code blocks as well."
    ).safe_substitute(language=language)

    # Open the fence that wraps the document to translate.
    base_prompt += "\n\n```md"

    full_prompt = "\n".join([base_prompt, to_translate.strip(), "```", PROMPT_WITH_GLOSSARY])

    extra_instruction = additional_instruction.strip()
    if extra_instruction:
        full_prompt += f"\n\n🗒️ Additional instructions: {extra_instruction}"

    return full_prompt
|
| 57 |
|
| 58 |
|
| 59 |
def split_markdown_sections(markdown: str) -> list:
|
|
|
|
| 76 |
scaffold = content
|
| 77 |
for i, text in enumerate(to_translate.split("\n\n")):
|
| 78 |
scaffold = scaffold.replace(text, f"$hf_i18n_placeholder{i}", 1)
|
| 79 |
+
print("inner scaffold:")
|
| 80 |
+
print(scaffold)
|
| 81 |
return string.Template(scaffold)
|
| 82 |
|
| 83 |
|
| 84 |
+
def is_in_code_block(text: str, position: int) -> bool:
    """Return True when ``position`` lies inside a fenced code block of ``text``.

    The check counts the triple-backtick fences that occur before
    ``position``: an odd count means a fence has been opened but not closed.

    Args:
        text: The text to inspect.
        position: Zero-based character offset into ``text``.

    Returns:
        True if an odd number of ``` fences precede ``position``; False
        otherwise, including for non-positive positions.
    """
    # A negative offset (e.g. the -1 returned by a failed str.find) would be
    # interpreted as "from the end" by slicing and give a meaningless answer.
    if position <= 0:
        return False
    # Count within [0, position) directly instead of copying the prefix.
    return text.count("```", 0, position) % 2 == 1
|
| 89 |
+
|
| 90 |
+
|
| 91 |
def fill_scaffold(content: str, to_translate: str, translated: str) -> str:
    """Insert translated sections back into the original document layout.

    A scaffold with ``$hf_i18n_placeholderN`` markers is built from
    ``content``; anchors derived from the original headers are appended to
    the translated headers; the translated text is then split on blank lines
    and substituted into the placeholders.

    Args:
        content: The original document.
        to_translate: The preprocessed text that was sent for translation.
        translated: The translated text returned by the model.

    Returns:
        The scaffold with each placeholder replaced by the corresponding
        translated section. Missing sections are filled with empty strings
        and surplus sections are dropped.
    """
    import logging

    logger = logging.getLogger(__name__)

    scaffold = make_scaffold(content, to_translate)
    logger.debug("scaffold:\n%s", scaffold.template)

    # Split the original text by markdown sections to derive header anchors.
    divided = split_markdown_sections(to_translate)
    logger.debug("divided: %s", divided)
    anchors = get_anchors(divided)

    translated_divided = split_markdown_sections(translated)
    logger.debug("translated divided: %s", translated_divided)

    # NOTE(review): the [1::3] indexing implies every third element starting
    # at index 1 is a header title - confirm against split_markdown_sections.
    translated_headers = translated_divided[1::3]
    header_count = len(translated_headers)

    if header_count != len(anchors):
        logger.warning(
            "Header count mismatch. Original: %d, Translated: %d",
            len(anchors),
            header_count,
        )
        if header_count < len(anchors):
            anchors = anchors[:header_count]
        else:
            # Extra translated headers get no anchor.
            anchors = anchors + [""] * (header_count - len(anchors))

    # Append anchors to translated headers, skipping header-like lines that
    # are located inside a fenced code block.
    for i, title in enumerate(translated_headers):
        anchor = anchors[i]
        if not anchor:
            continue  # nothing to append; avoids a dangling trailing space
        header_pos = translated.find(title.strip())
        if header_pos != -1 and not is_in_code_block(translated, header_pos):
            translated_divided[1 + i * 3] = f"{title} {anchor}"

    # Re-join only complete groups of three elements, then split on blank
    # lines so sections line up with the scaffold placeholders.
    complete_len = len(translated_divided) - len(translated_divided) % 3
    translated_sections = "".join(translated_divided[:complete_len]).split("\n\n")

    placeholder_count = scaffold.template.count("$hf_i18n_placeholder")
    logger.debug(
        "placeholders: %d, translated sections: %d",
        placeholder_count,
        len(translated_sections),
    )

    # Pad or truncate so every placeholder gets exactly one section.
    shortfall = placeholder_count - len(translated_sections)
    if shortfall > 0:
        translated_sections.extend([""] * shortfall)
    else:
        del translated_sections[placeholder_count:]

    translated_doc = scaffold.safe_substitute(
        {f"hf_i18n_placeholder{i}": text for i, text in enumerate(translated_sections)}
    )
    return translated_doc
|
| 164 |
|
translator/retriever.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import os
|
| 2 |
from pathlib import Path
|
| 3 |
|
|
@@ -25,6 +26,59 @@ def get_github_repo_files():
|
|
| 25 |
return file_paths
|
| 26 |
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
def retrieve(summary: Summary, table_size: int = 10) -> tuple[str, list[str]]:
|
| 29 |
"""
|
| 30 |
Retrieve missing docs
|
|
|
|
| 1 |
+
import re
|
| 2 |
import os
|
| 3 |
from pathlib import Path
|
| 4 |
|
|
|
|
| 26 |
return file_paths
|
| 27 |
|
| 28 |
|
| 29 |
+
def get_github_issue_open_pr(lang: str = "ko"):
    """List docs that already have an open translation PR.

    Fetches every open pull request of ``huggingface/transformers`` (paged,
    100 per request), keeps those whose title starts with ``🌐 [i18n-KO]``
    and extracts the backtick-quoted ``*.md`` filename from the title.

    Args:
        lang: Target language code. Only ``"ko"`` is supported.

    Returns:
        A ``(filenames, pr_info_list)`` tuple of two index-aligned lists:
        ``filenames[i]`` is the ``docs/source/en/...`` path named in the title
        of the PR whose web URL is ``pr_info_list[i]``. PRs whose title does
        not name a ``.md`` file are omitted from both lists.

    Raises:
        ValueError: If ``lang`` is not ``"ko"``.
        Exception: If the GitHub API answers with a non-200 status code.
    """
    if lang != "ko":
        raise ValueError(
            "No Github issue has been registered to the server. (Only 'ko' is supported - please contact us to support this.)"
        )

    headers = {
        "Accept": "application/vnd.github+json",
    }
    api_url = "https://api.github.com/repos/huggingface/transformers/pulls"
    per_page = 100  # Maximum allowed by GitHub API

    all_open_prs = []
    page = 1
    while True:
        response = requests.get(
            api_url,
            headers=headers,
            params={"state": "open", "page": page, "per_page": per_page},
            timeout=30,  # never hang the UI on a stalled connection
        )

        if response.status_code != 200:
            raise Exception(f"GitHub API error: {response.status_code} {response.text}")

        page_prs = response.json()
        all_open_prs.extend(page_prs)

        # A short (or empty) page is the last one.
        if len(page_prs) < per_page:
            break
        page += 1

    pattern = re.compile(r"`([^`]+\.md)`")

    # Build both lists in one pass so they always stay index-aligned.
    filenames = []
    pr_info_list = []
    for pr in all_open_prs:
        title = pr["title"]
        if not title.startswith("🌐 [i18n-KO]"):
            continue
        match = pattern.search(title)
        if match is None:
            continue
        # The API URL ends with the PR number; rebuild the web URL from it.
        pr_number = pr["url"].rstrip("/").split("/")[-1]
        filenames.append("docs/source/en/" + match.group(1))
        pr_info_list.append(f"https://github.com/huggingface/transformers/pull/{pr_number}")

    return filenames, pr_info_list
|
| 80 |
+
|
| 81 |
+
|
| 82 |
def retrieve(summary: Summary, table_size: int = 10) -> tuple[str, list[str]]:
|
| 83 |
"""
|
| 84 |
Retrieve missing docs
|