Instructions to use openpecha/aligner with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use openpecha/aligner with Transformers:
```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("openpecha/aligner")
model = AutoModelForSeq2SeqLM.from_pretrained("openpecha/aligner")
```
- Notebooks
- Google Colab
- Kaggle
Update handler.py
Browse files
- handler.py +45 -37
handler.py
CHANGED
|
@@ -59,46 +59,54 @@ def make_dir_executable(dir_path: Path):
|
|
| 59 |
make_dir_executable(ALIGNER_SCRIPT_DIR)
|
| 60 |
|
| 61 |
|
| 62 |
-
def create_github_repo(repo_path: Path, repo_name: str):
|
| 63 |
logging.info("[INFO] Creating GitHub repo...")
|
| 64 |
|
| 65 |
-
#
|
| 66 |
subprocess.run(f"git config --global user.name {GITHUB_USERNAME}".split())
|
| 67 |
subprocess.run(f"git config --global user.email {GITHUB_EMAIL}".split())
|
| 68 |
|
| 69 |
# Initialize a Git repository
|
| 70 |
-
subprocess.run(f"git init
|
| 71 |
|
| 72 |
# Commit the changes
|
| 73 |
-
subprocess.run("git add .
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
"
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
-
|
| 90 |
|
| 91 |
-
# Add the GitHub remote to the local Git repository
|
| 92 |
-
remote_url = f"https://{
|
| 93 |
subprocess.run(
|
| 94 |
f"git remote add origin {remote_url}", cwd=str(repo_path), shell=True
|
| 95 |
)
|
| 96 |
-
# rename default branch to main
|
| 97 |
-
subprocess.run("git branch -M main".split(), cwd=str(repo_path))
|
| 98 |
-
subprocess.run(f"git push {quiet} -u origin main".split(), cwd=str(repo_path))
|
| 99 |
|
| 100 |
-
|
|
|
|
|
|
|
| 101 |
|
|
|
|
| 102 |
|
| 103 |
def convert_raw_align_to_tm(align_fn: Path, tm_path: Path):
|
| 104 |
if DEBUG:
|
|
@@ -116,10 +124,6 @@ def convert_raw_align_to_tm(align_fn: Path, tm_path: Path):
|
|
| 116 |
|
| 117 |
# Read and log the content of the file
|
| 118 |
content = fn.read_text()
|
| 119 |
-
if content:
|
| 120 |
-
logging.info(f"Content length of {fn.name}: {len(content)} characters") # Log first 100 characters
|
| 121 |
-
else:
|
| 122 |
-
logging.warning(f"File is empty: {fn}")
|
| 123 |
|
| 124 |
except Exception as e:
|
| 125 |
logging.error(f"Error while reading file {fn}: {e}")
|
|
@@ -201,9 +205,17 @@ def add_to_publish_todo_repo(org, repo_name, file_path, access_token):
|
|
| 201 |
print(f"[ERROR] Response: {response.text}")
|
| 202 |
|
| 203 |
|
| 204 |
-
def create_tm(align_fn: Path, text_pair: Dict[str, str]):
|
| 205 |
align_fn = Path(align_fn)
|
| 206 |
text_id = text_pair["text_id"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
with tempfile.TemporaryDirectory() as tmp_dir:
|
| 208 |
output_dir = Path(tmp_dir)
|
| 209 |
repo_name = f"TM{text_id}"
|
|
@@ -211,12 +223,12 @@ def create_tm(align_fn: Path, text_pair: Dict[str, str]):
|
|
| 211 |
tm_path.mkdir(exist_ok=True, parents=True)
|
| 212 |
repo_path = convert_raw_align_to_tm(align_fn, tm_path)
|
| 213 |
repo_path = add_input_in_readme(text_pair, tm_path)
|
| 214 |
-
repo_url = create_github_repo(repo_path, repo_name)
|
| 215 |
logging.info(f"TM repo created: {repo_url}")
|
| 216 |
add_to_publish_todo_repo(GITHUB_ORG, MAI_TM_PUBLISH_TODO_REPO, repo_name, GITHUB_ACCESS_TOKEN)
|
| 217 |
return repo_url
|
| 218 |
|
| 219 |
-
##----------------------- MAIN -----------------------##
|
| 220 |
|
| 221 |
|
| 222 |
@contextmanager
|
|
@@ -287,7 +299,6 @@ def _run_align_script(bo_fn, en_fn, output_dir):
|
|
| 287 |
content = output_fn.read_text()
|
| 288 |
if content:
|
| 289 |
logging.info(f"Length of content in bo.tx.clean{output_fn}: {len(content)}")
|
| 290 |
-
logging.info(f"Contents of {output}: {output.stdout}")
|
| 291 |
else:
|
| 292 |
logging.warning(f"The file {output_fn} is empty.")
|
| 293 |
else:
|
|
@@ -318,13 +329,10 @@ def align(text_pair):
|
|
| 318 |
return {"error": "Failed to download English file or file is empty"}
|
| 319 |
|
| 320 |
# Log content of files for verification
|
| 321 |
-
logging.info(f"Content of {bo_fn.name}: {bo_fn.read_text()[:100]}...")
|
| 322 |
-
logging.info(f"Content of {en_fn.name}: {en_fn.read_text()[:100]}...")
|
| 323 |
-
|
| 324 |
# Run alignment script
|
|
|
|
| 325 |
try:
|
| 326 |
aligned_fn = _run_align_script(bo_fn, en_fn, output_dir)
|
| 327 |
-
logging.info(f"Alignment script output file: {aligned_fn}")
|
| 328 |
except Exception as e:
|
| 329 |
logging.error(f"Alignment script error: {e}")
|
| 330 |
return {"error": f"Alignment script failed: {e}"}
|
|
|
|
| 59 |
make_dir_executable(ALIGNER_SCRIPT_DIR)
|
| 60 |
|
| 61 |
|
| 62 |
+
def create_github_repo(repo_path: Path, repo_name: str, version: str, realign: bool):
|
| 63 |
logging.info("[INFO] Creating GitHub repo...")
|
| 64 |
|
| 65 |
+
# Configure git users
|
| 66 |
subprocess.run(f"git config --global user.name {GITHUB_USERNAME}".split())
|
| 67 |
subprocess.run(f"git config --global user.email {GITHUB_EMAIL}".split())
|
| 68 |
|
| 69 |
# Initialize a Git repository
|
| 70 |
+
subprocess.run(f"git init".split(), cwd=str(repo_path))
|
| 71 |
|
| 72 |
# Commit the changes
|
| 73 |
+
subprocess.run("git add .".split(), cwd=str(repo_path))
|
| 74 |
+
commit_message = "Initial commit" if not realign else f"Commit for version {version}"
|
| 75 |
+
subprocess.run(f"git commit -m".split() + [commit_message], cwd=str(repo_path))
|
| 76 |
+
|
| 77 |
+
# Check if realigning
|
| 78 |
+
if realign:
|
| 79 |
+
if version:
|
| 80 |
+
# Checkout the new branch
|
| 81 |
+
subprocess.run(f"git checkout -b {version}".split(), cwd=str(repo_path))
|
| 82 |
+
else:
|
| 83 |
+
logging.error("Version not specified for realignment.")
|
| 84 |
+
return
|
| 85 |
+
else:
|
| 86 |
+
# Create a new repository on GitHub
|
| 87 |
+
response = requests.post(
|
| 88 |
+
GITHUB_API_ENDPOINT,
|
| 89 |
+
json={
|
| 90 |
+
"name": repo_name,
|
| 91 |
+
"private": True,
|
| 92 |
+
},
|
| 93 |
+
auth=(GITHUB_USERNAME, GITHUB_ACCESS_TOKEN),
|
| 94 |
+
)
|
| 95 |
+
response.raise_for_status()
|
| 96 |
|
| 97 |
+
time.sleep(3)
|
| 98 |
|
| 99 |
+
# Add the GitHub remote to the local Git repository
|
| 100 |
+
remote_url = f"https://{GITHUB_USERNAME}:{GITHUB_ACCESS_TOKEN}@github.com/{GITHUB_ORG}/{repo_name}.git"
|
| 101 |
subprocess.run(
|
| 102 |
f"git remote add origin {remote_url}", cwd=str(repo_path), shell=True
|
| 103 |
)
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
+
# Push the changes to the appropriate branch
|
| 106 |
+
branch_name = version if realign else "main"
|
| 107 |
+
subprocess.run(f"git push -u origin {branch_name}".split(), cwd=str(repo_path))
|
| 108 |
|
| 109 |
+
return f"Branch '{branch_name}' updated in {repo_name}" if realign else response.json()["html_url"]
|
| 110 |
|
| 111 |
def convert_raw_align_to_tm(align_fn: Path, tm_path: Path):
|
| 112 |
if DEBUG:
|
|
|
|
| 124 |
|
| 125 |
# Read and log the content of the file
|
| 126 |
content = fn.read_text()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
except Exception as e:
|
| 129 |
logging.error(f"Error while reading file {fn}: {e}")
|
|
|
|
| 205 |
print(f"[ERROR] Response: {response.text}")
|
| 206 |
|
| 207 |
|
| 208 |
+
def create_tm(align_fn: Path, text_pair):
|
| 209 |
align_fn = Path(align_fn)
|
| 210 |
text_id = text_pair["text_id"]
|
| 211 |
+
try:
|
| 212 |
+
|
| 213 |
+
version= text_pair["version"]
|
| 214 |
+
realign=text_pair["realign"]
|
| 215 |
+
except:
|
| 216 |
+
version=""
|
| 217 |
+
realign=False
|
| 218 |
+
|
| 219 |
with tempfile.TemporaryDirectory() as tmp_dir:
|
| 220 |
output_dir = Path(tmp_dir)
|
| 221 |
repo_name = f"TM{text_id}"
|
|
|
|
| 223 |
tm_path.mkdir(exist_ok=True, parents=True)
|
| 224 |
repo_path = convert_raw_align_to_tm(align_fn, tm_path)
|
| 225 |
repo_path = add_input_in_readme(text_pair, tm_path)
|
| 226 |
+
repo_url = create_github_repo(repo_path, repo_name,version,realign)
|
| 227 |
logging.info(f"TM repo created: {repo_url}")
|
| 228 |
add_to_publish_todo_repo(GITHUB_ORG, MAI_TM_PUBLISH_TODO_REPO, repo_name, GITHUB_ACCESS_TOKEN)
|
| 229 |
return repo_url
|
| 230 |
|
| 231 |
+
##--------------------------------------------- MAIN -----------------------##
|
| 232 |
|
| 233 |
|
| 234 |
@contextmanager
|
|
|
|
| 299 |
content = output_fn.read_text()
|
| 300 |
if content:
|
| 301 |
logging.info(f"Length of content in bo.tx.clean{output_fn}: {len(content)}")
|
|
|
|
| 302 |
else:
|
| 303 |
logging.warning(f"The file {output_fn} is empty.")
|
| 304 |
else:
|
|
|
|
| 329 |
return {"error": "Failed to download English file or file is empty"}
|
| 330 |
|
| 331 |
# Log content of files for verification
|
|
|
|
|
|
|
|
|
|
| 332 |
# Run alignment script
|
| 333 |
+
|
| 334 |
try:
|
| 335 |
aligned_fn = _run_align_script(bo_fn, en_fn, output_dir)
|
|
|
|
| 336 |
except Exception as e:
|
| 337 |
logging.error(f"Alignment script error: {e}")
|
| 338 |
return {"error": f"Alignment script failed: {e}"}
|