Chen42 committed on
Commit
5bffd18
·
verified ·
1 Parent(s): 59ca806

Upload /my_pipeline_wo_ckpt/segment_sent_results/inference.py with huggingface_hub

Browse files
my_pipeline_wo_ckpt/segment_sent_results/inference.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Module-level setup: pin the visible GPU, then load trankit and expose its
# sentence splitter for the functions below.
import os
# Must be set before any CUDA-using library initializes; restricts this
# process to physical GPU 3.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

import json

from trankit import Pipeline
# Local cache directory for trankit's downloaded sentence-splitting models.
trankit_cache_dir = "./sen_split_models"

# utils
# Heavyweight import-time side effect: instantiating the Pipeline loads an
# XLM-RoBERTa-large model (downloading it on first run) and claims GPU memory.
trankit_pipe = Pipeline(lang="english", gpu=True, cache_dir=trankit_cache_dir, embedding='xlm-roberta-large')
# Sentence-split entry point used by sen_split_model:
# callable text -> {"sentences": [...]}.
sen_spliter = trankit_pipe.ssplit
13
def sen_split_model(word_list, spliter=None):
    """Split a flat list of words into per-sentence word lists.

    The words are joined with single spaces, run through the trankit sentence
    splitter, and each resulting sentence is split back into words.  Because
    every output word must map one-to-one onto an input word, we do not want a
    sentence like "Hunan Biological Medicine Factory, China.through Hunan
    Yahua Seed Corporation Ltd." cut into two sentences at "China." — a
    boundary is only kept when the character right after it is a space.
    Boundaries placed at a non-space position are undone by merging the two
    sentences (two-pointer sweep below).

    Args:
        word_list: list of word strings (assumed to contain no internal
            whitespace — TODO confirm against callers).
        spliter: optional callable ``text -> {"sentences": [...]}`` used in
            place of the module-level trankit ``sen_spliter`` (useful for
            testing).  Each sentence dict carries "text" and "dspan", the
            character span within the joined text, e.g.::

                [{'id': 1, 'text': 'Hello!', 'dspan': (0, 6)},
                 {'id': 2, 'text': 'This is Trankit.', 'dspan': (7, 23)}]

    Returns:
        A list of word lists, one per (post-merge) sentence.  Empty input
        yields an empty list.
    """
    if spliter is None:
        spliter = sen_spliter

    text = " ".join(word_list)
    text_split_list = spliter(text)["sentences"]

    # Fix: the original indexed text_split_list[0] unconditionally and raised
    # IndexError when the splitter returned no sentences (e.g. empty input).
    if not text_split_list:
        return []

    # Two-pointer sweep: head accumulates merges, tail scans forward.  Two
    # sentences were cut at a non-space position exactly when the head's span
    # ends where the tail's span begins (no separating character).
    post_processed_text_split_list = []
    head_ptr, tail_ptr = 0, 0
    while True:
        head_item = text_split_list[head_ptr]
        if tail_ptr + 1 < len(text_split_list):
            tail_ptr += 1
            tail_item = text_split_list[tail_ptr]
            if head_item["dspan"][1] == tail_item["dspan"][0]:
                # Cut at a non-space position: glue the sentences back
                # together (no space was lost, so plain concatenation).
                head_item["text"] = head_item["text"] + tail_item["text"]
                head_item["dspan"] = (head_item["dspan"][0], tail_item["dspan"][1])
            else:
                post_processed_text_split_list.append(head_item)
                head_ptr = tail_ptr
        else:
            # The final (possibly merged) sentence is easy to miss — flush it.
            post_processed_text_split_list.append(head_item)
            break

    # The joined text uses single spaces only, so split(" ") recovers exactly
    # the original words.  (Also drops the original's unused word_idx.)
    return [sen_dict["text"].split(" ") for sen_dict in post_processed_text_split_list]
60
+
61
+
62
def infer_one_block(reordered_block_textlines_list):
    """Segment one reordered text block into sentences.

    Joins the "text" field of every line dict into one block string, splits it
    into per-sentence word lists via sen_split_model, and returns the
    sentences re-joined as strings.
    """
    block_text = " ".join(line["text"] for line in reordered_block_textlines_list)
    word_groups = sen_split_model(block_text.split())
    return [" ".join(words) for words in word_groups]
69
+
70
+
71
+
72
def infer_one_img(img_line_dict):
    """Run sentence segmentation over every block of one image record.

    Reads img_line_dict["reordered_blocks_textlines"], segments each block
    with infer_one_block, stores the result under "reordered_block_sents",
    and returns the (mutated) record.
    """
    blocks = img_line_dict["reordered_blocks_textlines"]
    img_line_dict["reordered_block_sents"] = [infer_one_block(b) for b in blocks]
    return img_line_dict
85
+
86
+
87
if __name__ == "__main__":

    # Document domain to process; the literal values are the actual directory
    # names: ["标牌标语", "电脑屏拍", "商品包装", "手机屏拍", "纸质文档"].
    domain = "纸质文档"
    tgt_dir = f"./results/{domain}"
    os.makedirs(tgt_dir, exist_ok=True)

    src_filepath = f"../blocked_reordered_results/results/{domain}/reordered_blocks_textlines.json"
    tgt_filepath = f"{tgt_dir}/segment_sent.json"

    # Fix 1: the output file is now also opened with encoding="utf8" — the
    # JSON is written with ensure_ascii=False, so relying on the platform
    # default encoding could corrupt or crash on non-UTF-8 locales.
    # Fix 2: iterate the file instead of a readline loop that broke at the
    # first blank line; blank lines are skipped, not treated as EOF.
    with open(src_filepath, "r", encoding="utf8") as src_file, \
            open(tgt_filepath, "w", encoding="utf8") as tgt_file:
        for line in src_file:  # JSONL: one image record per line
            line_str = line.strip()
            if not line_str:
                continue
            line_dict = json.loads(line_str)
            new_line_dict = infer_one_img(line_dict)

            tgt_file.write(f"{json.dumps(new_line_dict, ensure_ascii=False)}\n")