Ericwang committed on
Commit
d186f96
·
1 Parent(s): b905ddd

added transcript sorter

Browse files
Files changed (3) hide show
  1. Dockerfile +3 -0
  2. app.py +25 -4
  3. utlis.py +37 -0
Dockerfile CHANGED
@@ -19,6 +19,9 @@ RUN chmod -R 777 /app/results_vt
19
  RUN mkdir /app/results_wt
20
  RUN chmod -R 777 /app/results_wt
21
 
 
 
 
22
  COPY . .
23
 
24
  CMD ["python", "app.py"]
 
19
  RUN mkdir /app/results_wt
20
  RUN chmod -R 777 /app/results_wt
21
 
22
+ RUN mkdir /app/results_st
23
+ RUN chmod -R 777 /app/results_st
24
+
25
  COPY . .
26
 
27
  CMD ["python", "app.py"]
app.py CHANGED
@@ -6,7 +6,7 @@ from pathlib import Path
6
 
7
  import gradio as gr
8
 
9
- from utlis import HHMMSS_to_sec, convert_video_format, molly_xlsx_to_table, xlsx_to_table, table_to_ELAN_tsv, trim_media
10
 
11
 
12
  def delete_files(files):
@@ -36,11 +36,11 @@ def classify_input_format(input_string):
36
  # if the input format is neither seconds nor HH:MM:SS, return None
37
  gradio.Error("Input format not supported. Please enter a valid time in the format HH:MM:SS or seconds.")
38
 
39
- def set_output_file(input_file, output_format, folder):
40
  # Set output file name and extension
41
  if not os.path.exists(folder):
42
  os.makedirs(folder)
43
- file_name = f"{Path(input_file.name).stem.partition('.')[0]}_trimmed.{output_format}"
44
  output_file = os.path.join(folder, file_name)
45
  print(f"Output folder: {folder}")
46
  print(f"file_name: {file_name}")
@@ -105,6 +105,21 @@ def trim_video_wt(input_file, input_transcript, output_format, start_time, end_t
105
  except Exception as e:
106
  return f"Error: {str(e)}"
107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  def trim_video(input_file, output_format, start_time, end_time):
109
  try:
110
  # Set output file
@@ -180,6 +195,12 @@ interface_c = gr.Interface(fn=convert_video, inputs=[input_file_c, output_format
180
  description="Convert a video file to a different format. Please wait for the file to upload before clicking the 'Submit' button.")
181
 
182
 
 
 
 
 
 
 
183
 
184
- demo = gr.TabbedInterface([interface_wt, interface, interface_c], ["Video Trimmer with transcript converted","Video Trimmer", "Video Converter"])
185
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
6
 
7
  import gradio as gr
8
 
9
+ from utlis import HHMMSS_to_sec, convert_video_format, molly_xlsx_to_table, xlsx_to_table, table_to_ELAN_tsv, trim_media, sort_transcript
10
 
11
 
12
  def delete_files(files):
 
36
  # if the input format is neither seconds nor HH:MM:SS, return None
37
  gradio.Error("Input format not supported. Please enter a valid time in the format HH:MM:SS or seconds.")
38
 
39
+ def set_output_file(input_file, output_format, folder, insert_string = 'trimmed'):
40
  # Set output file name and extension
41
  if not os.path.exists(folder):
42
  os.makedirs(folder)
43
+ file_name = f"{Path(input_file.name).stem.partition('.')[0]}_{insert_string}.{output_format}"
44
  output_file = os.path.join(folder, file_name)
45
  print(f"Output folder: {folder}")
46
  print(f"file_name: {file_name}")
 
105
  except Exception as e:
106
  return f"Error: {str(e)}"
107
 
108
def sort_transcript_helper(input_transcript, output_transcript):
    """
    Run sort_transcript on a transcript file, logging progress to stdout.

    Parameters:
    input_transcript (str): Path of the transcript file to sort.
    output_transcript (str): Path the sorted transcript is written to.

    Returns:
    Whatever sort_transcript returns (the path of the sorted file).
    """
    print("start sorting transcript")
    # Echo both paths so a failed run can be traced from the container logs.
    for label, path in (
        ("input_transcript: ", input_transcript),
        ("output_transcript: ", output_transcript),
    ):
        print(label, path)
    sorted_path = sort_transcript(input_transcript, output_transcript)
    print("finish sorting transcript")
    return sorted_path
116
+
117
def sort_transcript_wrapper(input_file):
    """
    Gradio callback for the "Transcript Sorter" tab.

    Parameters:
    input_file: Uploaded file object; its ``.name`` attribute is used as the
        path of the transcript to sort.

    Returns:
    The value returned by sort_transcript_helper (path of the sorted file).
    """
    # Sorted transcripts go to <cwd>/results_st/ with a "_sorted.tsv" suffix.
    results_dir = f"{os.getcwd()}/results_st/"
    target_path = set_output_file(input_file, "tsv", results_dir, insert_string='sorted')
    return sort_transcript_helper(input_file.name, target_path)
122
+
123
  def trim_video(input_file, output_format, start_time, end_time):
124
  try:
125
  # Set output file
 
195
  description="Convert a video file to a different format. Please wait for the file to upload before clicking the 'Submit' button.")
196
 
197
 
198
+ # gr components for transcript sorter
199
+ input_file_s = gr.File(label="Select transcript file")
200
+ output_file_s = gr.File(label="Download sorted transcript")
201
+ interface_s = gr.Interface(fn=sort_transcript_wrapper, inputs=input_file_s, outputs=output_file_s, title="Transcript Sorter", allow_flagging="never",
202
+ description="Sort a transcript file by time. Please wait for the file to upload before clicking the 'Submit' button.")
203
+
204
 
205
+ demo = gr.TabbedInterface([interface_wt, interface, interface_c, interface_s], ["Video Trimmer with transcript converted","Video Trimmer", "Video Converter", "Transcript Sorter"])
206
  demo.launch(server_name="0.0.0.0", server_port=7860)
utlis.py CHANGED
@@ -8,6 +8,43 @@ import pandas as pd
8
 
9
  import gradio as gr
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  def convert_video_format(media_in, media_out):
13
  try:
 
8
 
9
  import gradio as gr
10
 
11
+ def sort_transcript(file_path, save_path):
12
+ """
13
+ Sorts the rows of a transcript file by start time.
14
+
15
+ Parameters:
16
+ file_path (str): The file path of the transcript file.
17
+ save_path (str): The file path to save the sorted transcript file.
18
+
19
+ Returns:
20
+ save_path (str): The file path of the sorted transcript file.
21
+ """
22
+ file_ext = os.path.splitext(file_path)[1]
23
+
24
+ if file_ext == '.txt' or file_ext == '.tsv':
25
+ # read the tab-separated plaintext file into a DataFrame
26
+ table = pd.read_csv(file_path, sep='\t', header=None, \
27
+ names=['Speaker', 'Empty', 'Start', 'Start (s)', 'End', 'End (s)', 'Duration', 'Duration (s)', 'Transcript'],
28
+ index_col=False)
29
+ else:
30
+ raise ValueError("Unsupported file format. Must be '.txt' or '.tsv'.")
31
+
32
+ # make the column names lowercase
33
+ table.columns = map(str.lower, table.columns)
34
+
35
+ # select and reorder the desired columns
36
+ table = table[['speaker', 'transcript', 'start', 'end']]
37
+
38
+ # extract the start time from the 'start' column
39
+ table['start_time'] = table['start'].str.split('.', expand=True)[0]
40
+
41
+ # sort by start_time
42
+ sorted_table = table.sort_values('start_time')
43
+ sorted_table.to_csv(save_path, sep='\t', index=False)
44
+ print("saved sorted transcript to", save_path)
45
+ return save_path
46
+
47
+
48
 
49
  def convert_video_format(media_in, media_out):
50
  try: