File size: 4,780 Bytes
335644f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ca87ce
335644f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4c2eca
335644f
f4c2eca
335644f
 
 
 
 
 
 
 
 
 
 
 
f4c2eca
 
 
 
 
335644f
 
 
 
 
 
f4c2eca
335644f
 
 
 
 
 
 
 
4ca87ce
335644f
f4c2eca
335644f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ca87ce
 
 
 
 
 
 
 
 
 
335644f
 
4ca87ce
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import io
import jiwer
import gradio as gr

import sys
from contextlib import contextmanager
@contextmanager
def switch_to_stdout():
    """Context manager that yields the current ``sys.stdout`` stream.

    Nothing is redirected or restored; the ``with`` target is simply bound to
    whatever ``sys.stdout`` is at the moment the context is entered.
    """
    yield sys.stdout

def _write_alignment(visl, col, out):
    """Write the alignment report held in ``visl`` to the text stream ``out``.

    Lines from index 4 onward are written first, unchanged.  Lines 1-3 are
    then written in groups of three, ``col`` characters of content per group,
    each group followed by an empty line.  The first five characters of those
    lines are treated as a label column: they are kept on the first group and
    replaced by five spaces on every following group.
    """
    for line in visl[4:]:
        print(line, file=out)

    label_width = 5
    rows = visl[1:4]
    # Chunk over the content only (label excluded); counting the label would
    # produce an extra, empty trailing group for some lengths.
    content_len = max(len(row) for row in rows) - label_width
    for start in range(0, max(content_len, 1), col):
        lo = label_width + start
        for row in rows:
            if start == 0:
                print(row[:lo + col], file=out)
            else:
                print(" " * label_width + row[lo:lo + col], file=out)
        print('', file=out)


def ali(ref, hyp, col=80, remove_punc=False, file=None):
    """Align ``hyp`` against ``ref`` character by character and write a report.

    The alignment is computed with ``jiwer.process_characters`` and rendered
    with ``jiwer.visualize_alignment``; the rendered rows are then wrapped to
    ``col`` characters and written to ``file``.

    Args:
        ref: Reference text.
        hyp: Hypothesis text.
        col: Number of alignment characters per wrapped row; must be >= 1.
        remove_punc: When true, a fixed set of punctuation characters (and
            the space) is deleted from both texts before aligning.
        file: Destination of the report.  ``None`` writes to ``sys.stdout``;
            an object with a ``write`` method is written to directly;
            anything else is treated as a path and opened for writing
            as UTF-8.

    Returns:
        The object returned by ``jiwer.process_characters``.

    Raises:
        ValueError: If ``col`` is smaller than 1.
    """
    if col < 1:
        raise ValueError(f"col must be a positive integer, got {col!r}")

    if remove_punc:
        # Three-argument maketrans: every character of the third string is
        # deleted (deletion overrides the identity mapping of the first two).
        tr2 = str.maketrans(
            ', ?.`%()⋯',
            ', ?.`%()⋯', ', ?.`%()⋯。、',
        )
        ref = ref.translate(tr2)
        hyp = hyp.translate(tr2)
    out = jiwer.process_characters(ref, hyp)
    # By default jiwer omits sentence pairs whose hypothesis equals the
    # reference, which would shift every line index used below.
    vis = jiwer.visualize_alignment(out, skip_correct=False)

    # NOTE(review): as written, this table maps most characters to themselves;
    # only ’ -> ` and … -> ⋯ change.  Confirm whether full-width targets were
    # intended here.
    tr = str.maketrans(
        '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz*, ?.’%()…|',
        "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz*, ?.`%()⋯|",
    )

    # Assumes line 0 is the sentence header and lines 1-3 are the REF, HYP and
    # operation-marker rows, each with a 5-character label -- TODO confirm
    # against the installed jiwer version.
    visl = vis.splitlines()
    for idx in (1, 2, 3):
        visl[idx] = visl[idx][:5] + visl[idx][5:].translate(tr)

    if file is None:
        _write_alignment(visl, col, sys.stdout)
    elif hasattr(file, 'write'):
        _write_alignment(visl, col, file)
    else:
        with open(file, 'w', encoding='utf-8') as f:
            _write_alignment(visl, col, f)
    return out

def process_ours(path):
    """Read "our" transcript file and return its text as a single string.

    On every line only the part after the first ideographic space (U+3000) is
    kept (presumably dropping a speaker/label prefix -- confirm against the
    transcript format); lines without that character are kept whole.  The
    resulting pieces are joined with single ASCII spaces.

    Args:
        path: Path of the transcript file.  It is decoded as UTF-8 and a
            leading byte-order mark, if present, is discarded.

    Returns:
        The joined transcript text.
    """
    with open(path, encoding="utf-8-sig") as f:
        lines = f.read().splitlines()
    return ' '.join(line.split('\u3000', maxsplit=1)[-1] for line in lines)

def process_theirs(path):
    """Read "their" transcript file and return the text after the header.

    The file is scanned for the first line that starts with ``'會議記錄:'``.
    Everything up to and including that line is discarded and the remaining
    lines are joined with single ASCII spaces.  If no line starts with the
    marker, nothing remains and an empty string is returned.

    Args:
        path: Path of the transcript file.  It is decoded as UTF-8 and a
            leading byte-order mark, if present, is discarded so that a
            marker on the very first line is still recognised.

    Returns:
        The joined transcript text following the marker line.
    """
    with open(path, encoding="utf-8-sig") as f:
        lines = f.read().splitlines()
    marker = '會議記錄:'
    # Index of the marker line; len(lines) when absent so the slice is empty.
    marker_idx = next(
        (idx for idx, line in enumerate(lines) if line.startswith(marker)),
        len(lines),
    )
    return ' '.join(lines[marker_idx + 1:])

def compare_transcripts(ours_file, theirs_file, remove_punc, number_box):
    """Gradio callback: align the two uploaded transcripts and return the report.

    "Their" transcript is used as the reference and "our" transcript as the
    hypothesis.  ASCII spaces are removed from both texts before alignment.

    Args:
        ours_file: Value of the "Our Transcript" upload; handed to
            ``process_ours`` (presumably a filesystem path -- confirm against
            the Gradio version in use).  ``None`` when nothing was uploaded.
        theirs_file: Value of the "Their Transcript" upload; handed to
            ``process_theirs``.  ``None`` when nothing was uploaded.
        remove_punc: Whether punctuation is stripped before aligning.
        number_box: Column width as entered by the user (text).

    Returns:
        The alignment report as a string.

    Raises:
        gr.Error: If a file is missing or the column width is not a positive
            integer.
    """
    if ours_file is None or theirs_file is None:
        raise gr.Error("Please upload both transcript files before generating the alignment.")

    try:
        col = int(number_box)
    except (TypeError, ValueError):
        raise gr.Error(f"Column size must be a whole number, got {number_box!r}.") from None
    if col < 1:
        raise gr.Error(f"Column size must be at least 1, got {col}.")

    ours = process_ours(ours_file)
    theirs = process_theirs(theirs_file)
    output = io.StringIO()
    ali(
        theirs.replace(' ', ''),
        ours.replace(' ', ''),
        col=col,
        remove_punc=remove_punc,
        file=output,
    )
    return output.getvalue()

# Extra CSS handed to gr.Blocks below: renders textbox <textarea> elements in
# a monospace font (presumably so the wrapped REF/HYP/operation rows of the
# alignment stay column-aligned).
custom_css = """
textarea[data-testid="textbox"] {
  font-family: monospace !important;
}
"""

# Gradio UI: two transcript uploads, the alignment options, and a read-only
# output box that is filled by compare_transcripts when the button is clicked.
with gr.Blocks(title="Transcript Alignment Viewer",css=custom_css) as demo:
    gr.Markdown("## Transcript Alignment Viewer")
    # User-facing hint; in English: "When uploading, please make sure you are
    # using the `unsegmented meeting record`."
    gr.Markdown("上傳請確認你用的是 `不分段會議紀錄`")

    # Upload widgets, side by side; only .txt files are offered.
    with gr.Row(equal_height=True):
        ours_file = gr.File(label="Our Transcript", file_types=[".txt"], scale=1)
        theirs_file = gr.File(label="Their Transcript", file_types=[".txt"], scale=1)

    # Controls: trigger button, punctuation toggle, and wrap width.
    with gr.Row():
        compare_btn = gr.Button("Generate Alignment", scale=1)
        remove_punc = gr.Checkbox(label="Remove Punctuation", scale=1)
        gr.Markdown("**Column size:**")
        # Free-text box; its string value is converted with int() in the callback.
        number_box = gr.Textbox(show_label=False,value="80",max_lines=1,scale=1)

    # NOTE(review): custom_css selects textareas by data-testid, not by the
    # elem_id "mono" set here.
    output_text = gr.Textbox(
        label="Alignment Output",
        lines=30,
        max_lines=100,
        show_copy_button=True,
        interactive=False,
        elem_id="mono"
    )

    # Input order must match the parameter order of compare_transcripts.
    compare_btn.click(
        fn=compare_transcripts,
        inputs=[ours_file, theirs_file, remove_punc, number_box],
        outputs=output_text
    )

# demo.launch(
#     # css="""
#     # #output-box textarea {
#     #     font-family: monospace !important;
#     #     white-space: pre !important;
#     #     overflow-y: scroll;
#     #     height: 70vh !important;
#     # }
#     # """
# )


# Launch the Gradio app only when this file is executed directly, not when it
# is imported.
if __name__ == "__main__":
    demo.launch()