Spaces:
Build error
Build error
DWizard committed on
Commit ·
be41edb
1
Parent(s): 5e8cc27
new dict utils need prune
Browse files
Former-commit-id: c3d204c86e52f7ef0958fd59487a12505addea3c
- dict_util.py +26 -1
- domain_dict/SC2/EN.csv +36 -36
- domain_dict/SC2/ZH.csv +34 -34
- src/srt_util/srt.py +19 -18
dict_util.py
CHANGED
|
@@ -52,4 +52,29 @@ with open("../test.csv", "w", encoding='utf-8') as w:
|
|
| 52 |
export_csv_dict(term_dict_sc2,w)
|
| 53 |
|
| 54 |
## for load pickle, just:
|
| 55 |
-
# pickle.load(f)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
export_csv_dict(term_dict_sc2,w)
|
| 53 |
|
| 54 |
## for load pickle, just:
|
| 55 |
+
# pickle.load(f)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def form_dict(src_path, tgt_path) -> dict:
    """Build a term-lookup dict from two row-aligned CSV files.

    Row ``i`` of ``src_path`` lists the source-language spellings of one
    term; row ``i`` of ``tgt_path`` lists the candidate translations for
    that same term. Every source spelling becomes a key whose value is the
    full list of target candidates from the matching row.

    Args:
        src_path: Path to the source-language CSV file (UTF-8).
        tgt_path: Path to the target-language CSV file (UTF-8).

    Returns:
        A dict mapping each source spelling to the list of target
        candidates. Spellings that come from the same source row share the
        same list object.

    Raises:
        IndexError: If a non-empty source row has no target row at the
            same index.
    """
    # newline="" is what the csv module expects, so that line breaks inside
    # quoted fields are parsed by the reader rather than by the file layer.
    with open(src_path, "r", encoding="utf-8", newline="") as src_file:
        src_rows = list(csv.reader(src_file, delimiter=","))
    with open(tgt_path, "r", encoding="utf-8", newline="") as tgt_file:
        tgt_rows = list(csv.reader(tgt_file, delimiter=","))

    final_dict = {}
    for idx, src_terms in enumerate(src_rows):
        if not src_terms:
            # A blank source line contributes no keys.
            continue
        if idx >= len(tgt_rows):
            # Same exception type as a raw out-of-range lookup, but with
            # enough context to locate the misaligned files.
            raise IndexError(
                f"row {idx + 1} of {src_path} has no matching row in "
                f"{tgt_path} ({len(tgt_rows)} rows)"
            )
        translations = tgt_rows[idx]
        for term in src_terms:
            final_dict[term] = translations
    return final_dict
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class term_dict(dict):
    """Dict of source-language terms mapped to lists of candidate translations.

    The mapping is loaded from ``{path}/{src_lang}.csv`` and
    ``{path}/{tgt_lang}.csv`` through ``form_dict``.
    """

    # Private sentinel so ``get`` can tell "no default supplied" apart from
    # an explicit ``default=None``.
    _MISSING = object()

    def __init__(self, path, src_lang, tgt_lang) -> None:
        """Load the term mapping for one language pair.

        Args:
            path: Directory that holds the per-language CSV files.
            src_lang: Source language code; selects ``{src_lang}.csv``.
            tgt_lang: Target language code; selects ``{tgt_lang}.csv``.
        """
        src_dict = f"{path}/{src_lang}.csv"
        tgt_dict = f"{path}/{tgt_lang}.csv"
        super().__init__(form_dict(src_dict, tgt_dict))

    def get(self, key: str, default=_MISSING) -> str:
        """Return one randomly chosen translation candidate for ``key``.

        Args:
            key: Source-language term to look up.
            default: Value returned when ``key`` is absent or has no
                candidates. When omitted, a ``KeyError`` is raised instead,
                which is what a one-argument call did before this
                parameter existed.

        Returns:
            One element of the candidate list stored under ``key``, or
            ``default`` when supplied and no candidate is available.

        Raises:
            KeyError: If ``key`` is absent (or its candidate list is empty)
                and no ``default`` was supplied.
        """
        # Function-scope import keeps the class independent of whatever the
        # module imports at top level.
        import random

        candidates = super().get(key)
        if candidates:
            return random.choice(candidates)
        if default is term_dict._MISSING:
            raise KeyError(key)
        return default
|
| 80 |
+
|
domain_dict/SC2/EN.csv
CHANGED
|
@@ -1,43 +1,43 @@
|
|
| 1 |
-
|
| 2 |
-
zerg
|
| 3 |
-
protoss
|
| 4 |
-
terran
|
| 5 |
engineering bay,engin bay
|
| 6 |
-
forge
|
| 7 |
-
blink
|
| 8 |
-
evolution chamber
|
| 9 |
cybernetics core,cybercore
|
| 10 |
-
enhanced shockwaves
|
| 11 |
-
gravitic boosters
|
| 12 |
-
armory
|
| 13 |
robotics bay,robo bay
|
| 14 |
twilight council,twilight
|
| 15 |
-
fusion core
|
| 16 |
-
fleet beacon
|
| 17 |
-
factory
|
| 18 |
-
ghost academy
|
| 19 |
-
infestation pit
|
| 20 |
robotics facility,robo
|
| 21 |
-
stargate
|
| 22 |
-
starport
|
| 23 |
-
archon
|
| 24 |
-
smart servos
|
| 25 |
-
gateway
|
| 26 |
-
warpgate
|
| 27 |
-
immortal
|
| 28 |
-
zealot
|
| 29 |
-
nydus network
|
| 30 |
-
nydus worm
|
| 31 |
hydralisk,hydra
|
| 32 |
-
grooved spines
|
| 33 |
-
muscular augments
|
| 34 |
hydralisk den,hydra den
|
| 35 |
-
planetary fortress
|
| 36 |
-
battle cruiser
|
| 37 |
-
weapon refit
|
| 38 |
-
brood lord
|
| 39 |
-
broodling
|
| 40 |
-
greater spire
|
| 41 |
-
anabolic synthesis
|
| 42 |
-
cyclone
|
| 43 |
-
bunker
|
|
|
|
| 1 |
+
barracks
|
| 2 |
+
zerg
|
| 3 |
+
protoss
|
| 4 |
+
terran
|
| 5 |
engineering bay,engin bay
|
| 6 |
+
forge
|
| 7 |
+
blink
|
| 8 |
+
evolution chamber
|
| 9 |
cybernetics core,cybercore
|
| 10 |
+
enhanced shockwaves
|
| 11 |
+
gravitic boosters
|
| 12 |
+
armory
|
| 13 |
robotics bay,robo bay
|
| 14 |
twilight council,twilight
|
| 15 |
+
fusion core
|
| 16 |
+
fleet beacon
|
| 17 |
+
factory
|
| 18 |
+
ghost academy
|
| 19 |
+
infestation pit
|
| 20 |
robotics facility,robo
|
| 21 |
+
stargate
|
| 22 |
+
starport
|
| 23 |
+
archon
|
| 24 |
+
smart servos
|
| 25 |
+
gateway
|
| 26 |
+
warpgate
|
| 27 |
+
immortal
|
| 28 |
+
zealot
|
| 29 |
+
nydus network
|
| 30 |
+
nydus worm
|
| 31 |
hydralisk,hydra
|
| 32 |
+
grooved spines
|
| 33 |
+
muscular augments
|
| 34 |
hydralisk den,hydra den
|
| 35 |
+
planetary fortress
|
| 36 |
+
battle cruiser
|
| 37 |
+
weapon refit
|
| 38 |
+
brood lord
|
| 39 |
+
broodling
|
| 40 |
+
greater spire
|
| 41 |
+
anabolic synthesis
|
| 42 |
+
cyclone
|
| 43 |
+
bunker
|
domain_dict/SC2/ZH.csv
CHANGED
|
@@ -1,43 +1,43 @@
|
|
| 1 |
-
|
| 2 |
-
虫族
|
| 3 |
-
神族
|
| 4 |
-
人族
|
| 5 |
工程站,BE
|
| 6 |
BF,锻炉
|
| 7 |
-
闪现
|
| 8 |
-
进化腔
|
| 9 |
BY,赛博核心
|
| 10 |
-
EMP范围
|
| 11 |
-
ob速度
|
| 12 |
-
军械库
|
| 13 |
机械研究所,VB
|
| 14 |
光影议会,VC
|
| 15 |
-
聚变芯体
|
| 16 |
-
舰队航标
|
| 17 |
-
重工厂
|
| 18 |
-
幽灵军校
|
| 19 |
-
感染深渊
|
| 20 |
VR,机械台
|
| 21 |
神族VS,星门
|
| 22 |
星港,人族VS
|
| 23 |
-
白球
|
| 24 |
-
变形加速
|
| 25 |
-
传送门
|
| 26 |
-
折跃门
|
| 27 |
-
不朽
|
| 28 |
-
叉叉
|
| 29 |
-
虫道网络
|
| 30 |
-
坑道虫
|
| 31 |
-
刺蛇
|
| 32 |
-
刺蛇射程
|
| 33 |
-
刺蛇速度
|
| 34 |
-
刺蛇塔
|
| 35 |
大地堡,行星要塞
|
| 36 |
-
大和
|
| 37 |
-
大和炮
|
| 38 |
-
大龙
|
| 39 |
-
巢虫
|
| 40 |
-
大龙塔
|
| 41 |
-
大牛速度
|
| 42 |
-
导弹车
|
| 43 |
-
地堡
|
|
|
|
| 1 |
+
兵营
|
| 2 |
+
虫族
|
| 3 |
+
神族
|
| 4 |
+
人族
|
| 5 |
工程站,BE
|
| 6 |
BF,锻炉
|
| 7 |
+
闪现
|
| 8 |
+
进化腔
|
| 9 |
BY,赛博核心
|
| 10 |
+
EMP范围
|
| 11 |
+
ob速度
|
| 12 |
+
军械库
|
| 13 |
机械研究所,VB
|
| 14 |
光影议会,VC
|
| 15 |
+
聚变芯体
|
| 16 |
+
舰队航标
|
| 17 |
+
重工厂
|
| 18 |
+
幽灵军校
|
| 19 |
+
感染深渊
|
| 20 |
VR,机械台
|
| 21 |
神族VS,星门
|
| 22 |
星港,人族VS
|
| 23 |
+
白球
|
| 24 |
+
变形加速
|
| 25 |
+
传送门
|
| 26 |
+
折跃门
|
| 27 |
+
不朽
|
| 28 |
+
叉叉
|
| 29 |
+
虫道网络
|
| 30 |
+
坑道虫
|
| 31 |
+
刺蛇
|
| 32 |
+
刺蛇射程
|
| 33 |
+
刺蛇速度
|
| 34 |
+
刺蛇塔
|
| 35 |
大地堡,行星要塞
|
| 36 |
+
大和
|
| 37 |
+
大和炮
|
| 38 |
+
大龙
|
| 39 |
+
巢虫
|
| 40 |
+
大龙塔
|
| 41 |
+
大牛速度
|
| 42 |
+
导弹车
|
| 43 |
+
地堡
|
src/srt_util/srt.py
CHANGED
|
@@ -7,6 +7,7 @@ from datetime import timedelta
|
|
| 7 |
import logging
|
| 8 |
import openai
|
| 9 |
from tqdm import tqdm
|
|
|
|
| 10 |
|
| 11 |
# punctuation dictionary for supported languages
|
| 12 |
punctuation_dict = {
|
|
@@ -161,9 +162,13 @@ class SrtScript(object):
|
|
| 161 |
if self.domain != "General":
|
| 162 |
if os.path.exists(f"{dict_path}/{self.domain}"):
|
| 163 |
# TODO: load dictionary
|
|
|
|
|
|
|
| 164 |
...
|
| 165 |
else:
|
| 166 |
-
logging.error(f"domain {self.domain} doesn't exist")
|
|
|
|
|
|
|
| 167 |
|
| 168 |
@classmethod
|
| 169 |
def parse_from_srt_file(cls, src_lang, tgt_lang, path: str):
|
|
@@ -444,23 +449,19 @@ class SrtScript(object):
|
|
| 444 |
if self.domain == "General":
|
| 445 |
logging.info("General domain could not perform correct_with_force_term. skip this step.")
|
| 446 |
pass
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
logging.info(
|
| 461 |
-
"replace term: " + word + " --> " + term_enzh_dict.get(word) + " in time stamp {}".format(
|
| 462 |
-
i + 1))
|
| 463 |
-
logging.info("source text becomes: " + seg.source_text)
|
| 464 |
|
| 465 |
comp_dict = []
|
| 466 |
|
|
|
|
| 7 |
import logging
|
| 8 |
import openai
|
| 9 |
from tqdm import tqdm
|
| 10 |
+
import dict_util
|
| 11 |
|
| 12 |
# punctuation dictionary for supported languages
|
| 13 |
punctuation_dict = {
|
|
|
|
| 162 |
if self.domain != "General":
|
| 163 |
if os.path.exists(f"{dict_path}/{self.domain}"):
|
| 164 |
# TODO: load dictionary
|
| 165 |
+
self.dict = dict_util.term_dict(f"{dict_path}/{self.domain}", src_lang, tgt_lang)
|
| 166 |
+
print(self.dict["robo"])
|
| 167 |
...
|
| 168 |
else:
|
| 169 |
+
logging.error(f"domain {self.domain} doesn't exist, fallback to general domain, this will disable correct_with_force_term and spell_check_term")
|
| 170 |
+
self.domain = "General"
|
| 171 |
+
|
| 172 |
|
| 173 |
@classmethod
|
| 174 |
def parse_from_srt_file(cls, src_lang, tgt_lang, path: str):
|
|
|
|
| 449 |
if self.domain == "General":
|
| 450 |
logging.info("General domain could not perform correct_with_force_term. skip this step.")
|
| 451 |
pass
|
| 452 |
+
else:
|
| 453 |
+
keywords = list(self.dict.keys())
|
| 454 |
+
keywords.sort(key=lambda x: len(x), reverse=True)
|
| 455 |
+
|
| 456 |
+
for word in keywords:
|
| 457 |
+
for i, seg in enumerate(self.segments):
|
| 458 |
+
if word in seg.source_text.lower():
|
| 459 |
+
seg.source_text = re.sub(fr"({word}es|{word}s?)\b", "{}".format(self.dict.get(word)),
|
| 460 |
+
seg.source_text, flags=re.IGNORECASE)
|
| 461 |
+
logging.info(
|
| 462 |
+
"replace term: " + word + " --> " + self.dict.get(word) + " in time stamp {}".format(
|
| 463 |
+
i + 1))
|
| 464 |
+
logging.info("source text becomes: " + seg.source_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
|
| 466 |
comp_dict = []
|
| 467 |
|