Spaces:
Sleeping
Sleeping
added transcript sorter
Browse files- Dockerfile +3 -0
- app.py +25 -4
- utlis.py +37 -0
Dockerfile
CHANGED
|
@@ -19,6 +19,9 @@ RUN chmod -R 777 /app/results_vt
|
|
| 19 |
RUN mkdir /app/results_wt
|
| 20 |
RUN chmod -R 777 /app/results_wt
|
| 21 |
|
|
|
|
|
|
|
|
|
|
| 22 |
COPY . .
|
| 23 |
|
| 24 |
CMD ["python", "app.py"]
|
|
|
|
| 19 |
RUN mkdir /app/results_wt
|
| 20 |
RUN chmod -R 777 /app/results_wt
|
| 21 |
|
| 22 |
+
RUN mkdir /app/results_st
|
| 23 |
+
RUN chmod -R 777 /app/results_st
|
| 24 |
+
|
| 25 |
COPY . .
|
| 26 |
|
| 27 |
CMD ["python", "app.py"]
|
app.py
CHANGED
|
@@ -6,7 +6,7 @@ from pathlib import Path
|
|
| 6 |
|
| 7 |
import gradio as gr
|
| 8 |
|
| 9 |
-
from utlis import HHMMSS_to_sec, convert_video_format, molly_xlsx_to_table, xlsx_to_table, table_to_ELAN_tsv, trim_media
|
| 10 |
|
| 11 |
|
| 12 |
def delete_files(files):
|
|
@@ -36,11 +36,11 @@ def classify_input_format(input_string):
|
|
| 36 |
# if the input format is neither seconds nor HH:MM:SS, return None
|
| 37 |
gradio.Error("Input format not supported. Please enter a valid time in the format HH:MM:SS or seconds.")
|
| 38 |
|
| 39 |
-
def set_output_file(input_file, output_format, folder):
|
| 40 |
# Set output file name and extension
|
| 41 |
if not os.path.exists(folder):
|
| 42 |
os.makedirs(folder)
|
| 43 |
-
file_name = f"{Path(input_file.name).stem.partition('.')[0]}
|
| 44 |
output_file = os.path.join(folder, file_name)
|
| 45 |
print(f"Output folder: {folder}")
|
| 46 |
print(f"file_name: {file_name}")
|
|
@@ -105,6 +105,21 @@ def trim_video_wt(input_file, input_transcript, output_format, start_time, end_t
|
|
| 105 |
except Exception as e:
|
| 106 |
return f"Error: {str(e)}"
|
| 107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
def trim_video(input_file, output_format, start_time, end_time):
|
| 109 |
try:
|
| 110 |
# Set output file
|
|
@@ -180,6 +195,12 @@ interface_c = gr.Interface(fn=convert_video, inputs=[input_file_c, output_format
|
|
| 180 |
description="Convert a video file to a different format. Please wait for the file to upload before clicking the 'Submit' button.")
|
| 181 |
|
| 182 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
-
demo = gr.TabbedInterface([interface_wt, interface, interface_c], ["Video Trimmer with transcript converted","Video Trimmer", "Video Converter"])
|
| 185 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
|
| 6 |
|
| 7 |
import gradio as gr
|
| 8 |
|
| 9 |
+
from utlis import HHMMSS_to_sec, convert_video_format, molly_xlsx_to_table, xlsx_to_table, table_to_ELAN_tsv, trim_media, sort_transcript
|
| 10 |
|
| 11 |
|
| 12 |
def delete_files(files):
|
|
|
|
| 36 |
# if the input format is neither seconds nor HH:MM:SS, return None
|
| 37 |
gradio.Error("Input format not supported. Please enter a valid time in the format HH:MM:SS or seconds.")
|
| 38 |
|
| 39 |
+
def set_output_file(input_file, output_format, folder, insert_string = 'trimmed'):
|
| 40 |
# Set output file name and extension
|
| 41 |
if not os.path.exists(folder):
|
| 42 |
os.makedirs(folder)
|
| 43 |
+
file_name = f"{Path(input_file.name).stem.partition('.')[0]}_{insert_string}.{output_format}"
|
| 44 |
output_file = os.path.join(folder, file_name)
|
| 45 |
print(f"Output folder: {folder}")
|
| 46 |
print(f"file_name: {file_name}")
|
|
|
|
| 105 |
except Exception as e:
|
| 106 |
return f"Error: {str(e)}"
|
| 107 |
|
| 108 |
+
def sort_transcript_helper(input_transcript, output_transcript):
    """Sort a transcript file, logging progress to stdout.

    Thin logging wrapper around ``sort_transcript``.

    Parameters:
    input_transcript (str): Path of the transcript file to sort.
    output_transcript (str): Path the sorted transcript is written to.

    Returns:
    Whatever ``sort_transcript`` returns (the path of the sorted file).
    """
    print("start sorting transcript")
    # Echo both paths so a failed run can be traced from the Space logs.
    for label, path in (
        ("input_transcript: ", input_transcript),
        ("output_transcript: ", output_transcript),
    ):
        print(label, path)

    sorted_path = sort_transcript(input_transcript, output_transcript)

    print("finish sorting transcript")
    return sorted_path
|
| 116 |
+
|
| 117 |
+
def sort_transcript_wrapper(input_file):
    """Gradio callback for the "Transcript Sorter" tab.

    Builds an output path of the form ``<cwd>/results_st/<stem>_sorted.tsv``
    via ``set_output_file`` and sorts the uploaded transcript into it.

    Parameters:
    input_file: The uploaded file object from ``gr.File``; its ``.name``
        attribute is used as the path of the transcript on disk. ``None``
        when the user submits without uploading anything.

    Returns:
    The path of the sorted transcript file, handed back to the output
    ``gr.File`` component for download.

    Raises:
    gr.Error: If no file was uploaded, or if the transcript cannot be
        sorted (e.g. unsupported file extension). Gradio shows the message
        to the user instead of a bare traceback.
    """
    # Submitting with an empty upload box would otherwise fail on `.name`.
    if input_file is None:
        raise gr.Error("Please upload a transcript file before clicking 'Submit'.")

    output_folder = f"{os.getcwd()}/results_st/"
    # NOTE(review): assumes set_output_file returns the joined output path;
    # its return statement is outside the visible hunk - confirm.
    output_file_path = set_output_file(input_file, "tsv", output_folder, insert_string='sorted')

    try:
        return sort_transcript_helper(input_file.name, output_file_path)
    except ValueError as err:
        # sort_transcript raises ValueError for unsupported extensions;
        # surface its message in the UI rather than a generic failure.
        raise gr.Error(str(err)) from err
|
| 122 |
+
|
| 123 |
def trim_video(input_file, output_format, start_time, end_time):
|
| 124 |
try:
|
| 125 |
# Set output file
|
|
|
|
| 195 |
description="Convert a video file to a different format. Please wait for the file to upload before clicking the 'Submit' button.")
|
| 196 |
|
| 197 |
|
| 198 |
+
# Transcript Sorter tab: upload a transcript file, download the sorted result.
input_file_s = gr.File(label="Select transcript file")
output_file_s = gr.File(label="Download sorted transcript")
interface_s = gr.Interface(
    fn=sort_transcript_wrapper,
    inputs=input_file_s,
    outputs=output_file_s,
    title="Transcript Sorter",
    allow_flagging="never",
    description="Sort a transcript file by time. Please wait for the file to upload before clicking the 'Submit' button.",
)

# One tab per tool; each tab title is paired with the interface at the same position.
demo = gr.TabbedInterface(
    [interface_wt, interface, interface_c, interface_s],
    ["Video Trimmer with transcript converted", "Video Trimmer", "Video Converter", "Transcript Sorter"],
)
# Listen on all interfaces on port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)
|
utlis.py
CHANGED
|
@@ -8,6 +8,43 @@ import pandas as pd
|
|
| 8 |
|
| 9 |
import gradio as gr
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
def convert_video_format(media_in, media_out):
|
| 13 |
try:
|
|
|
|
| 8 |
|
| 9 |
import gradio as gr
|
| 10 |
|
| 11 |
+
def sort_transcript(file_path, save_path):
    """
    Sort the rows of a tab-separated transcript file by start time.

    The input is read without a header row and its nine columns are labelled
    Speaker, Empty, Start, Start (s), End, End (s), Duration, Duration (s)
    and Transcript. The output keeps the speaker, transcript, start and end
    columns plus a ``start_time`` column holding the start value truncated
    at its first '.'.

    Rows are ordered by the full numeric value of the start time (including
    the fractional part), using a stable sort so rows with equal start times
    keep their input order. Rows whose start time cannot be parsed are
    placed last.

    Parameters:
    file_path (str): The file path of the transcript file. The extension
        must be '.txt' or '.tsv' (matched case-insensitively).
    save_path (str): The file path to save the sorted transcript file.

    Returns:
    save_path (str): The file path of the sorted transcript file.

    Raises:
    ValueError: If the file extension is neither '.txt' nor '.tsv'.
    """
    file_ext = os.path.splitext(file_path)[1].lower()
    if file_ext not in ('.txt', '.tsv'):
        raise ValueError("Unsupported file format. Must be '.txt' or '.tsv'.")

    # read the tab-separated plaintext file into a DataFrame
    table = pd.read_csv(
        file_path,
        sep='\t',
        header=None,
        names=['Speaker', 'Empty', 'Start', 'Start (s)', 'End', 'End (s)',
               'Duration', 'Duration (s)', 'Transcript'],
        index_col=False,
    )

    # make the column names lowercase
    table.columns = [name.lower() for name in table.columns]

    # select and reorder the desired columns; copy so that adding a column
    # below does not write into a view of the original frame
    table = table[['speaker', 'transcript', 'start', 'end']].copy()

    # Start time truncated at the first '.', kept because the original
    # output file contains this column.
    # NOTE(review): looks like a leftover helper column - confirm whether
    # consumers of the sorted file rely on it before dropping it.
    table['start_time'] = table['start'].map(
        lambda value: value if pd.isna(value) else str(value).split('.')[0]
    )

    # Sort on numeric seconds rather than on the truncated text: this keeps
    # sub-second ordering and does not depend on zero-padded hour fields.
    sort_key = table['start'].map(_start_to_seconds)
    row_order = sort_key.sort_values(kind='stable', na_position='last').index
    sorted_table = table.loc[row_order]

    sorted_table.to_csv(save_path, sep='\t', index=False)
    print("saved sorted transcript to", save_path)
    return save_path


def _start_to_seconds(value):
    """Convert 'HH:MM:SS.mmm', 'MM:SS.mmm' or plain seconds to float seconds.

    Returns NaN for missing or unparseable values so they sort last.
    """
    try:
        seconds = 0.0
        # Fold the colon-separated fields from most to least significant.
        for field in str(value).strip().split(':'):
            seconds = seconds * 60 + float(field)
        return seconds
    except ValueError:
        return float('nan')
|
| 46 |
+
|
| 47 |
+
|
| 48 |
|
| 49 |
def convert_video_format(media_in, media_out):
|
| 50 |
try:
|