TenzinGayche commited on
Commit
6617e81
·
verified ·
1 Parent(s): 19a74d0

Create handler.py

Browse files
Files changed (1) hide show
  1. handler.py +270 -0
handler.py CHANGED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ from typing import Dict, List, Any
3
+ import os
4
+ import json
5
+ import logging
6
+ import sys
7
+ import tempfile
8
+ import time
9
+ from pathlib import Path
10
+ import re
11
+ import shutil
12
+ import stat
13
+ import subprocess
14
+
15
+ import uuid
16
+ from contextlib import contextmanager
17
+
18
+ import requests
19
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)

# The alignment scripts live in a separate repository that is fetched at
# import time into ./tibetan-aligner (relative to the working directory).
git_clone_command = "git clone https://github.com/OpenPecha/tibetan-aligner"

ALIGNER_SCRIPT_DIR = Path("./tibetan-aligner").resolve()
ALIGNER_SCRIPT_NAME = "align_tib_en.sh"
ALIGNER_SCRIPT_PATH = ALIGNER_SCRIPT_DIR / ALIGNER_SCRIPT_NAME

if ALIGNER_SCRIPT_DIR.exists():
    # git refuses to clone into an existing non-empty directory, so a second
    # import in the same working directory would only produce a spurious error.
    print("Git clone skipped: tibetan-aligner already present.")
else:
    try:
        # The command is a fixed string, so splitting it into argv is
        # equivalent to the shell invocation without needing a shell.
        subprocess.run(git_clone_command.split(), check=True)
        print("Git clone successful!")
    except subprocess.CalledProcessError as e:
        print(f"Error while running Git clone command: {e}")

# Explicit check instead of `assert`, which is removed when Python runs with -O.
if not ALIGNER_SCRIPT_PATH.is_file():
    raise FileNotFoundError(f"Aligner script not found: {ALIGNER_SCRIPT_PATH}")
35
+
36
+ import requests
37
+
38
# GitHub identity used for commits and API calls.
GITHUB_USERNAME = "pechawa"
# Credentials are read from the environment instead of being committed to the
# repository. Any token that was previously stored in this file must be
# treated as leaked and revoked.
GITHUB_ACCESS_TOKEN = os.environ.get("GITHUB_ACCESS_TOKEN", "")
# Used for file downloads; falls back to the access token when not set separately.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", GITHUB_ACCESS_TOKEN)
GITHUB_EMAIL = "openpecha-bot@openpecha.org"
GITHUB_ORG = "MonlamAI"
MAI_TM_PUBLISH_TODO_REPO = "MonlamAI_TMs_Publish_TODO"
# Endpoint for creating repositories inside the organisation.
GITHUB_API_ENDPOINT = f"https://api.github.com/orgs/{GITHUB_ORG}/repos"

DEBUG = False

# Extra flag interpolated into the git init/commit/push commands.
# NOTE(review): "-q" is passed only when DEBUG is on, which looks inverted —
# confirm the intent before changing it.
quiet = "-q" if DEBUG else ""
49
def make_dir_executable(dir_path: Path):
    """Add the execute bit for user, group and others to each entry of *dir_path*.

    Only the direct children of the directory are touched (no recursion), and
    every entry returned by ``iterdir()`` is updated, sub-directories included.
    Existing permission bits are preserved.

    Args:
        dir_path: Directory whose entries should become executable.
    """
    exec_bits = stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH
    for entry in dir_path.iterdir():
        # One stat/chmod round trip per entry instead of one per permission bit.
        os.chmod(entry, os.stat(entry).st_mode | exec_bits)
57
+
58
+
59
+ make_dir_executable(ALIGNER_SCRIPT_DIR)
60
+
61
+
62
def create_github_repo(repo_path: Path, repo_name: str):
    """Turn *repo_path* into a git repo and publish it as a private GitHub repo.

    The directory is initialised, everything in it is committed as
    "Initial commit", a private repository named *repo_name* is created under
    ``GITHUB_ORG`` through the GitHub API, and the ``main`` branch is pushed.

    Args:
        repo_path: Local directory holding the files to publish.
        repo_name: Name of the repository to create on GitHub.

    Returns:
        The ``html_url`` of the newly created repository.

    Raises:
        subprocess.CalledProcessError: If any git command exits non-zero.
        requests.HTTPError: If the GitHub API rejects the creation request.
    """
    logging.info("[INFO] Creating GitHub repo...")

    repo_dir = str(repo_path)
    # `quiet` is either "" or a single flag; an empty string must not become
    # an (empty) argv element.
    quiet_args = [quiet] if quiet else []

    def run_git(*args, cwd=None):
        # Argument lists avoid the shell entirely, so neither the repo name nor
        # the token embedded in the remote URL is ever parsed by a shell.
        # check=True makes a failed step fail loudly instead of returning the
        # URL of a repository that was never pushed.
        subprocess.run(["git", *args], cwd=cwd, check=True)

    # configure git users
    run_git("config", "--global", "user.name", GITHUB_USERNAME)
    run_git("config", "--global", "user.email", GITHUB_EMAIL)

    # Initialize a Git repository
    run_git("init", *quiet_args, cwd=repo_dir)

    # Commit the changes
    run_git("add", ".", cwd=repo_dir)
    run_git("commit", *quiet_args, "-m", "Initial commit", cwd=repo_dir)

    # Create a new repository on GitHub
    response = requests.post(
        GITHUB_API_ENDPOINT,
        json={
            "name": repo_name,
            "private": True,
        },
        auth=(GITHUB_USERNAME, GITHUB_ACCESS_TOKEN),
        # requests waits indefinitely unless a timeout is given.
        timeout=30,
    )
    response.raise_for_status()

    # Short pause before pushing to the repository that was just created.
    time.sleep(3)

    # Add the GitHub remote to the local Git repository and push the changes
    remote_url = f"https://{GITHUB_ORG}:{GITHUB_ACCESS_TOKEN}@github.com/{GITHUB_ORG}/{repo_name}.git"
    run_git("remote", "add", "origin", remote_url, cwd=repo_dir)
    # rename default branch to main
    run_git("branch", "-M", "main", cwd=repo_dir)
    run_git("push", *quiet_args, "-u", "origin", "main", cwd=repo_dir)

    return response.json()["html_url"]
101
+
102
+
103
def convert_raw_align_to_tm(align_fn: Path, tm_path: Path):
    """Split a raw alignment file into parallel BO and EN text files.

    Each non-empty line of *align_fn* is expected to be ``<bo>\\t<en>``. The BO
    part is written to ``<tm_path.name>-bo.txt`` and the EN part to
    ``<tm_path.name>-en.txt`` inside *tm_path*, one segment per line, so that
    line *i* of both files belongs to the same pair. A line without a tab is
    treated as a BO segment with an empty EN counterpart.

    Args:
        align_fn: Tab-separated alignment file (read as UTF-8).
        tm_path: Existing directory that receives the two output files.

    Returns:
        *tm_path*, unchanged.
    """
    # logging.debug is already filtered by the logger level, so no extra
    # DEBUG guard is needed here.
    logging.debug("[INFO] Conerting raw alignment to TM repo...")

    def load_alignment(fn: Path):
        """Yield ``(bo_seg, en_seg)`` for every non-empty line of *fn*."""
        # Read explicitly as UTF-8: the outputs are written as UTF-8 and the
        # locale default is not guaranteed to decode Tibetan text.
        for seg_pair in fn.read_text(encoding="utf-8").splitlines():
            if not seg_pair:
                continue
            # Split on the first tab only. Without a tab the EN side is empty,
            # which yields exactly one (blank) EN line; the previous "\n"
            # placeholder produced two lines and shifted every later pair.
            bo_seg, _, en_seg = seg_pair.partition("\t")
            yield bo_seg, en_seg

    text_bo_fn = tm_path / f"{tm_path.name}-bo.txt"
    text_en_fn = tm_path / f"{tm_path.name}-en.txt"

    with open(text_bo_fn, "w", encoding="utf-8") as bo_file, open(
        text_en_fn, "w", encoding="utf-8"
    ) as en_file:
        for bo_seg, en_seg in load_alignment(align_fn):
            bo_file.write(bo_seg + "\n")
            en_file.write(en_seg + "\n")

    return tm_path
139
+
140
+
141
def get_github_dev_url(raw_github_url: str) -> str:
    """Convert a raw GitHub file URL into the matching ``github.dev`` URL.

    Everything after the first ``.com`` is kept as the path and the first
    ``/main/`` path segment is rewritten to ``/blob/main/``.

    Args:
        raw_github_url: File URL on a ``.com`` host whose path contains the
            ``main`` branch segment.

    Returns:
        ``https://github.dev`` followed by the rewritten path.

    Raises:
        ValueError: If the URL contains no ``.com``.
    """
    base_url = "https://github.dev"
    # partition splits on the first ".com" only, so a later ".com" in the path
    # (e.g. a repo called "x.community") no longer breaks the unpacking.
    _, separator, file_path = raw_github_url.partition(".com")
    if not separator:
        raise ValueError(f"Expected a '.com' URL, got: {raw_github_url!r}")
    # Rewrite only the branch segment; a bare substring replace would also hit
    # names that merely contain "main" (e.g. "domain", "maintext.txt").
    blob_file_path = file_path.replace("/main/", "/blob/main/", 1)
    return base_url + blob_file_path
146
+
147
+
148
def add_input_in_readme(input_dict: Dict[str, str], path: Path) -> Path:
    """Write a README.md in *path* that links to the BO and EN input files.

    Args:
        input_dict: Mapping with the keys ``text_id``, ``bo_file_url`` and
            ``en_file_url``; both URLs are converted with ``get_github_dev_url``.
        path: Directory in which ``README.md`` is created or overwritten.

    Returns:
        *path*, unchanged.
    """
    readme_template = "## Input\n- [BO{}]({})\n- [EN{}]({})"

    text_id = input_dict["text_id"]
    bo_link = get_github_dev_url(input_dict["bo_file_url"])
    en_link = get_github_dev_url(input_dict["en_file_url"])

    readme_body = readme_template.format(text_id, bo_link, text_id, en_link)
    (path / "README.md").write_text(readme_body)

    return path
160
+
161
def add_to_publish_todo_repo(org, repo_name, file_path, access_token):
    """Register *file_path* as an empty file in the publish-todo repository.

    The GitHub contents API is queried first; when the file already exists
    nothing is changed. Otherwise an empty file is created with the commit
    message ``Add <file_path>``. The outcome is only printed, never raised.

    Args:
        org: GitHub organisation that owns the repository.
        repo_name: Name of the todo repository.
        file_path: Path of the marker file inside the repository.
        access_token: Token sent as a Bearer credential.
    """
    request_headers = {
        "Authorization": f"Bearer {access_token}",
        "Accept": "application/vnd.github.v3+json",
    }
    contents_url = f"https://api.github.com/repos/{org}/{repo_name}/contents/" + file_path

    # A 200 on the contents endpoint means the marker file is already there.
    lookup = requests.get(contents_url, headers=request_headers)
    if lookup.status_code == 200:
        print(f"[INFO] '{file_path}' already added.")
        return

    creation = requests.put(
        contents_url,
        headers=request_headers,
        json={"message": f"Add {file_path}", "content": ""},
    )

    if creation.status_code != 201:
        print(f"[ERROR] Failed to add '{file_path}'.")
        print(f"[ERROR] Response: {creation.text}")
        return

    print(f"[INFO] '{file_path}' added to publish todo")
186
+
187
+
188
def create_tm(align_fn: Path, text_pair: Dict[str, str]):
    """Build a TM repository from an alignment file and publish it on GitHub.

    Inside a throw-away directory the alignment is converted into the BO/EN
    text files, a README linking the inputs is added, the directory is pushed
    as a GitHub repo named ``TM<text_id>``, and that name is registered in the
    publish-todo repository.

    Args:
        align_fn: Path to the raw alignment file.
        text_pair: Mapping with ``text_id`` plus the input file URLs needed by
            ``add_input_in_readme``.

    Returns:
        URL of the created GitHub repository.
    """
    alignment_file = Path(align_fn)
    repo_name = f"TM{text_pair['text_id']}"

    with tempfile.TemporaryDirectory() as scratch:
        tm_path = Path(scratch) / repo_name
        tm_path.mkdir(exist_ok=True, parents=True)

        convert_raw_align_to_tm(alignment_file, tm_path)
        repo_path = add_input_in_readme(text_pair, tm_path)
        repo_url = create_github_repo(repo_path, repo_name)
        logging.info(f"TM repo created: {repo_url}")

        add_to_publish_todo_repo(
            GITHUB_ORG, MAI_TM_PUBLISH_TODO_REPO, repo_name, GITHUB_ACCESS_TOKEN
        )

    return repo_url
202
+
203
+ ##----------------------- MAIN -----------------------##
204
+
205
+
206
@contextmanager
def TemporaryDirectory():
    """Yield a fresh working directory under ``./output`` and remove it on exit.

    The directory name is the first eight hex characters of a random UUID. It
    is deleted together with its contents when the ``with`` block ends, also
    when the block raises.
    """
    output_root = Path("./output").resolve()
    scratch_dir = output_root / uuid.uuid4().hex[:8]
    scratch_dir.mkdir(parents=True, exist_ok=True)
    try:
        yield scratch_dir
    finally:
        shutil.rmtree(str(scratch_dir))
214
+
215
+
216
def download_file(github_file_url: str, output_fn) -> Path:
    """Download a file from GitHub and store it at *output_fn*.

    The request carries ``GITHUB_TOKEN`` both in the Authorization header and
    as a ``token`` query parameter. The body is streamed to disk in 8 KiB
    chunks.

    Args:
        github_file_url: URL of the file to fetch.
        output_fn: Destination path; opened in binary write mode.

    Returns:
        *output_fn*, as passed in.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    request_headers = {
        "Authorization": f"token {GITHUB_TOKEN}",
        "Accept": "application/vnd.github+json",
    }
    # NOTE(review): the token is also placed in the URL, where it can end up in
    # logs — confirm whether the header alone is sufficient.
    url_with_token = f"{github_file_url}?token={GITHUB_TOKEN}"

    with requests.get(url_with_token, headers=request_headers, stream=True) as response:
        response.raise_for_status()
        with open(output_fn, "wb") as sink:
            for block in response.iter_content(chunk_size=8192):
                sink.write(block)

    return output_fn
229
+
230
+
231
def _run_align_script(bo_fn, en_fn, output_dir):
    """Run the aligner shell script and return the path of the file it reports.

    The script is started from ``ALIGNER_SCRIPT_DIR`` with the BO file, the EN
    file and the output directory as arguments. Its stdout is searched for a
    line of the form ``[OUTPUT] <path>``.

    Args:
        bo_fn: Path of the BO input file.
        en_fn: Path of the EN input file.
        output_dir: Directory handed to the script for its output.

    Returns:
        The reported output path as a string.

    Raises:
        subprocess.CalledProcessError: If the script exits non-zero.
        RuntimeError: If stdout contains no ``[OUTPUT]`` line.
    """
    start = time.time()
    cmd = [str(ALIGNER_SCRIPT_PATH), str(bo_fn), str(en_fn), str(output_dir)]
    completed = subprocess.run(
        cmd,
        check=True,
        capture_output=True,
        text=True,
        cwd=str(ALIGNER_SCRIPT_DIR),
    )

    match = re.search(r"\[OUTPUT\] (.*)", completed.stdout)
    if match is None:
        # Without this check a missing marker surfaces as an opaque
        # "'NoneType' object has no attribute 'group'" error.
        raise RuntimeError(
            "Aligner script did not report an [OUTPUT] path. stdout was:\n"
            + completed.stdout
        )

    # Keep only what follows the last "//" in the reported path and re-root it
    # at "/". NOTE(review): presumably the script emits a path containing a
    # doubled slash — confirm against the script's output.
    output_fn = "/" + match.group(1).split("//")[-1]

    end = time.time()
    total_time = round((end - start) / 60, 2)
    logging.info(f"Total time taken for Aligning: {total_time} mins")
    return output_fn
247
def align(text_pair):
    """Download a BO/EN text pair, align it and publish the result as a TM repo.

    Args:
        text_pair: Mapping with the keys ``text_id``, ``bo_file_url`` and
            ``en_file_url``.

    Returns:
        ``{"tm_repo_url": <url of the created repository>}``.
    """
    logging.info(f"Running aligner for TM{text_pair['text_id']}...")

    # (key holding the source URL, local file name) for each side of the pair
    sources = (("bo_file_url", "bo.tx"), ("en_file_url", "en.tx"))

    with TemporaryDirectory() as tmpdir:
        work_dir = Path(tmpdir)
        local_files = [
            download_file(text_pair[url_key], output_fn=work_dir / file_name)
            for url_key, file_name in sources
        ]
        bo_fn, en_fn = local_files
        aligned_fn = _run_align_script(bo_fn, en_fn, work_dir)
        repo_url = create_tm(aligned_fn, text_pair=text_pair)

    return {"tm_repo_url": repo_url}
256
+
257
class EndpointHandler():
    """Callable entry point that runs the alignment pipeline for one request."""

    def __init__(self, path=""):
        """Store *path*; it is not used anywhere else in this class."""
        self.path = path

    def __call__(self, data: Any) -> Dict[str, str]:
        """Align one text pair and publish the result.

        Args:
            data (:obj:`dict`):
                passed unchanged to ``align``, which reads the keys
                ``text_id``, ``bo_file_url`` and ``en_file_url``.
        Return:
            A :obj:`dict` of the form ``{"tm_repo_url": <repository url>}``,
            exactly as returned by ``align``.
        """
        return align(data)
270
+