| import pandas as pd |
| import re |
| import string |
| import nltk |
| from nltk.corpus import stopwords |
| from nltk.stem import WordNetLemmatizer |
|
|
| import json |
| from datetime import datetime |
| import gradio as gr |
| import io |
|
|
# Keep NLTK data in a project-local directory and make NLTK search it.
nltk_data_dir = "./nltk_data"
nltk.data.path.append(nltk_data_dir)

# nltk.download() takes the bare package id ("stopwords"), whereas
# nltk.data.find() needs the category-qualified path ("corpora/stopwords").
# Looking up the bare id never succeeds, which forced a download attempt on
# every start-up, so map each package id to its lookup path.
nltk_resources = ["stopwords", "punkt", "wordnet"]
_nltk_resource_paths = {
    "stopwords": "corpora/stopwords",
    "punkt": "tokenizers/punkt",
    "wordnet": "corpora/wordnet",
}
# NOTE(review): recent NLTK releases appear to need "punkt_tab" for
# word_tokenize -- confirm against the pinned NLTK version before adding it.
for resource in nltk_resources:
    try:
        # Fall back to the bare id for any resource missing from the mapping.
        nltk.data.find(_nltk_resource_paths.get(resource, resource))
    except LookupError:
        nltk.download(resource, download_dir=nltk_data_dir)
|
|
| |
| |
| |
| |
|
|
# Shared NLP objects, created once when the module is imported.
# `lemmatizer` is used by clean_text(); `stop_words` holds the English
# stop-word list (not referenced by the functions visible in this section).
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
|
|
|
|
|
|
def clean_text(text):
    """Return *text* lower-cased, stripped of ASCII punctuation and lemmatized.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, the remainder is tokenized with ``nltk.word_tokenize`` and each
    token is passed through the module-level ``lemmatizer``. The resulting
    tokens are joined with single spaces.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    normalized = text.lower().translate(punctuation_remover)
    lemmas = (lemmatizer.lemmatize(token) for token in nltk.word_tokenize(normalized))
    return ' '.join(lemmas)
|
|
def capitalize_sentences(text):
    """Rewrite *text* as capitalized sentences, each terminated by a period.

    The text is split on ".", every fragment is stripped and passed through
    ``str.capitalize`` (first character upper-cased, the rest lower-cased),
    empty fragments are dropped, and the survivors are joined with ". " and
    closed with a final ".".

    Returns an empty string when *text* contains no sentence at all (empty,
    whitespace-only, or dots only) instead of a lone ".".
    """
    fragments = (piece.strip().capitalize() for piece in text.split('.'))
    sentences = [fragment for fragment in fragments if fragment]
    if not sentences:
        # Nothing to terminate: do not emit a stray period.
        return ''
    return '. '.join(sentences) + '.'
|
|
def _upload_path(uploaded):
    """Return a filesystem path for an uploaded file or a plain path."""
    import os

    # Path-like values are already paths; anything else is expected to be a
    # file-like upload object exposing its location as ``.name``.
    if isinstance(uploaded, (str, os.PathLike)):
        return uploaded
    return uploaded.name


def _align_lyrics(word_counts, lyric_lines):
    """Assign consecutive lyric lines to rows until each row's word count is met.

    For every target in *word_counts*, lyric lines are consumed in order until
    the number of words taken is at least the target; the consumed lines are
    joined with ". ". Rows with a target of zero receive an empty string. If
    the lyrics run out before a target is reached, the row receives whatever
    lines remain, and later rows receive an empty string.
    """
    aligned = []
    cursor = 0
    total_lines = len(lyric_lines)
    for target in word_counts:
        if target == 0 or cursor >= total_lines:
            aligned.append('')
            continue
        stop = cursor
        words_taken = 0
        while stop < total_lines and words_taken < target:
            words_taken += len(lyric_lines[stop].split())
            stop += 1
        aligned.append('. '.join(lyric_lines[cursor:stop]))
        cursor = stop
    return aligned


def process_transcript(csv_file, txt_file):
    """Replace transcript text with the reference lyrics and write a TXT file.

    Args:
        csv_file: Uploaded transcript CSV (upload object with ``.name`` or a
            path). The code reads the columns ``Text``, ``Speaker Name``,
            ``Start Time`` and ``End Time``.
        txt_file: Uploaded reference lyric text file (upload object or path),
            one lyric line per row. Only the first tab-separated field of
            each row is used, and rows consisting solely of a bracketed tag
            such as ``[...]`` are ignored.

    Returns:
        The path of the written file, ``"formatted_transcript.txt"``.

    Each transcript row is given as many consecutive lyric lines as needed to
    reach the row's own (cleaned) word count. Rows whose text is missing or
    cleans to zero words get an empty text instead of raising
    ``ZeroDivisionError``. When the lyrics run out mid-row, that row keeps the
    remaining lines rather than being left blank.
    """
    transcript = pd.read_csv(_upload_path(csv_file))
    # NOTE(review): read_csv applies CSV quoting rules, so a lyric line with
    # an unbalanced double quote may not be read line-by-line -- confirm
    # against real lyric files.
    loi_chuan = pd.read_csv(_upload_path(txt_file), sep='\t', header=None, encoding='utf-8', engine='python')

    # Reference lyrics: strip, drop bracketed tag rows, clean, and skip lines
    # that are empty after cleaning.
    raw_lines = loi_chuan[0].astype(str).str.strip()
    raw_lines = raw_lines[~raw_lines.str.fullmatch(r'\[.*\]', na=False)]
    lyric_list = [
        cleaned
        for cleaned in (clean_text(line).strip() for line in raw_lines)
        if cleaned
    ]

    # Word count of each transcript row, cleaned sentence by sentence so the
    # count matches how the lyric lines are cleaned.
    word_counts = []
    for raw_text in transcript['Text']:
        row_text = '' if pd.isna(raw_text) else str(raw_text)
        word_counts.append(
            sum(len(clean_text(sentence).split()) for sentence in row_text.split('.'))
        )

    aligned = _align_lyrics(word_counts, lyric_list)

    # Work on an explicit copy so the column assignments never target a view
    # of the transcript frame.
    df_final = transcript[['Speaker Name', 'Start Time', 'End Time']].copy()
    df_final['Speaker Name'] = df_final['Speaker Name'].astype(str).str.strip()
    df_final['Text'] = [
        capitalize_sentences(lyric).strip() if lyric else ''
        for lyric in aligned
    ]

    # NOTE(review): the output name is fixed and relative to the working
    # directory, so concurrent requests would overwrite each other's file.
    output_path = "formatted_transcript.txt"
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(
            f"{row['Start Time']} - {row['End Time']}\n"
            f"{row['Speaker Name']}\n"
            f"{row['Text']}\n\n"
            for _, row in df_final.iterrows()
        )

    return output_path
|
|
# Web UI: two file uploads in (transcript CSV, reference lyric TXT), one
# downloadable file out, all wired to process_transcript().
demo = gr.Interface(
    fn=process_transcript,
    inputs=[
        gr.File(label="Upload Transcript CSV"),
        gr.File(label="Upload Loi Chuan TXT"),
    ],
    outputs=gr.File(label="Download formatted_transcript.txt"),
    title="Transcript Correction to TXT",
    description="Upload your raw transcript and clean lyric file to generate a formatted .txt output.",
)

# Start the app as soon as the module is executed.
demo.launch()
|
|