Spaces:

SPACERUNNER99
/

Transcribe

Paused

File size: 10,731 Bytes

23d4cfa
 
 
 
0b484f3
23d4cfa
 
 
 
 
 
b0c0845
23d4cfa
1c45af0
fb7175b
3e04bc5
67dfaaf
23d4cfa
 
 
 
 
8bec4e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23d4cfa
b0c0845
ac14dfb
 
b0c0845
 
23d4cfa
b0c0845
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23d4cfa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b0c0845
 
23d4cfa
 
b0c0845
23d4cfa
 
 
 
 
 
f779c4b
 
 
 
 
 
93b36e1
b0c0845
c685b52
067e41e
c685b52
 
 
 
b0c0845
c685b52
 
 
 
fb7175b
c980ffd
 
 
 
 
30a5fb1
06308da
2354500
 
 
 
 
 
 
 
 
c980ffd
 
 
 
 
 
 
 
f779c4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c685b52
b0c0845
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23d4cfa
 
 
b0c0845
23d4cfa
b0c0845
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220bcbd
b0c0845
220bcbd
b0c0845
 
220bcbd
b0c0845
 
 
 
 
 
55a8ed6
220bcbd
ed401f2
b0c0845
 
115e46c
 
b0c0845
6f7b62a
b0c0845
 
88ac7d9
 
 
b0c0845
115e46c
2a6df21
115e46c
f8fdfda
 
 
 
b0c0845
f8fdfda
115e46c

from pytubefix import YouTube
from pytubefix.cli import on_progress
import time
import math
import gradio as gr
import ffmpeg
from faster_whisper import WhisperModel
import requests
import json
import arabic_reshaper # pip install arabic-reshaper
from bidi.algorithm import get_display # pip install python-bidi
from moviepy import *
import pysrt
import time
import re
import concurrent.futures
import os
# one-api.ir token used by translate_text / enhance_text.
# SECURITY: a secret committed to source is readable by anyone with access to
# the repository. Prefer setting the ONE_API_TOKEN environment variable (e.g. a
# Space secret); the literal below is kept only as a backward-compatible
# fallback and should be rotated and removed.
api_key = os.environ.get("ONE_API_TOKEN", "268976:66f4f58a2a905")



def extract_audio(input_video_name):
    """Extract the audio track of a video into ``audio.mp3``.

    Args:
        input_video_name: Path of the video file to read.

    Returns:
        The name of the written MP3 file (always ``"audio.mp3"``, relative to
        the current working directory).

    Raises:
        ValueError: If the video has no audio track.
    """
    mp3_file = "audio.mp3"

    video_clip = VideoFileClip(input_video_name)
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            # Without this guard the failure surfaces as an AttributeError on
            # None, which gives the user no hint about the actual problem.
            raise ValueError(f"No audio track found in {input_video_name!r}")
        try:
            audio_clip.write_audiofile(mp3_file)
        finally:
            # Release the audio reader even if encoding fails.
            audio_clip.close()
    finally:
        # Release the video reader on every path, including errors.
        video_clip.close()

    print("Audio extraction successful!")
    return mp3_file

def transcribe(audio, max_segment_duration=2.0):  # Set your desired max duration here
    """Transcribe *audio* and return a flat list of word-level timings.

    Args:
        audio: Audio input handed straight to ``WhisperModel.transcribe``.
        max_segment_duration: Accepted for interface compatibility; this
            function does not currently read it.

    Returns:
        A list of dicts with the keys ``'word'``, ``'start'`` and ``'end'``,
        one per recognised word, in transcription order.
    """
    whisper = WhisperModel("tiny", device="cpu")
    vad_options = dict(min_silence_duration_ms=1000)
    segment_iter, _info = whisper.transcribe(
        audio,
        vad_filter=True,
        vad_parameters=vad_options,
        word_timestamps=True,
    )

    # Materialising the iterator is what actually runs the transcription.
    collected = []
    for seg in list(segment_iter):
        for w in seg.words:
            print("[%.2fs -> %.2fs] %s" % (w.start, w.end, w.word))
            collected.append(dict(word=w.word, start=w.start, end=w.end))
    return collected

def _make_subtitle(words):
    """Collapse a non-empty run of word dicts into one subtitle segment dict."""
    return {
        # Word tokens may carry leading/trailing whitespace; strip each one so
        # the joined line has single spaces and no leading blank.
        "word": " ".join(item["word"].strip() for item in words),
        "start": words[0]["start"],
        "end": words[-1]["end"],
        "textcontents": list(words),
    }


def create_subtitles(wordlevel_info, max_words=5):
    """Group word-level timings into subtitle segments.

    A segment is closed as soon as a word ends with a punctuation mark or the
    segment holds ``max_words`` words. Afterwards each segment's end time is
    set to the start time of the following segment so no gaps remain.

    Args:
        wordlevel_info: Iterable of dicts with ``'word'``, ``'start'`` and
            ``'end'`` keys, in chronological order.
        max_words: Maximum number of words per segment (default 5).

    Returns:
        A list of dicts with the keys ``'word'`` (joined text), ``'start'``,
        ``'end'`` and ``'textcontents'`` (the original word dicts).

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")

    punctuation_marks = {'.', '!', '?', ',', ';', ':', '—', '-', '。', '！', '？'}  # Add/remove punctuation as needed
    subtitles = []
    line = []

    for word_data in wordlevel_info:
        line.append(word_data)
        token = word_data['word'].strip()

        # Look at the last visible character so trailing whitespace on a
        # token cannot hide its punctuation.
        ends_with_punct = bool(token) and token[-1] in punctuation_marks

        if ends_with_punct or len(line) >= max_words:
            subtitles.append(_make_subtitle(line))
            line = []

    # Flush whatever is left after the final word.
    if line:
        subtitles.append(_make_subtitle(line))

    # Remove gaps between segments by moving each segment's end time to the
    # start of the segment that follows it.
    for previous, following in zip(subtitles, subtitles[1:]):
        previous["end"] = following["start"]

    return subtitles

def format_time(seconds):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to whole milliseconds first and then split into
    fields, so a rounding carry (e.g. 1.9996 s) rolls over into the seconds
    field instead of producing a four-digit millisecond field. Negative
    inputs are clamped to zero.

    Args:
        seconds: Duration in seconds (int or float).

    Returns:
        The timestamp string with two-digit hours, minutes and seconds and
        three-digit milliseconds.
    """
    total_ms = max(0, int(round(float(seconds) * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, milliseconds = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"

def generate_subtitle_file(language, segments, input_video_name):
    """Write subtitle segments to an SRT file and return its name.

    Args:
        language: Language tag embedded in the output file name.
        segments: Iterable of dicts with ``'start'``, ``'end'`` (seconds) and
            ``'word'`` (subtitle text) keys.
        input_video_name: Base name embedded in the output file name.

    Returns:
        The name of the written file, ``sub-<input_video_name>.<language>.srt``.
    """
    subtitle_file = f"sub-{input_video_name}.{language}.srt"

    # One string per cue: counter, time range, text, blank separator line.
    # The trailing space before each newline matches the existing output.
    entries = [
        f"{index} \n"
        f"{format_time(segment['start'])} --> {format_time(segment['end'])} \n"
        f"{segment['word']} \n"
        "\n"
        for index, segment in enumerate(segments, start=1)
    ]

    # The context manager closes the handle even if the write fails.
    with open(subtitle_file, "w", encoding='utf8') as f:
        f.write("".join(entries))
    return subtitle_file

def clean_text(text):
    """Strip Markdown code-fence residue from a model reply.

    Two passes are applied:
      1. Remove a triple-backtick fence at the very start of the string and
         one at the very end (no multiline matching).
      2. Remove the literal ``srt`` wherever it begins a line (multiline
         matching), which drops the language tag left behind by the fence.

    Args:
        text: Raw reply text.

    Returns:
        The text with those markers removed; other content is unchanged.
    """
    fence_pattern = re.compile(r"^```|```$")
    tag_pattern = re.compile(r'^srt', flags=re.MULTILINE)
    without_fences = fence_pattern.sub('', text)
    return tag_pattern.sub('', without_fences)

def translate_text(api_key, text, source_language = "en", target_language = "fa", timeout = 60):
    """Translate *text* through the one-api.ir Google Translate endpoint.

    Args:
        api_key: Value sent in the ``one-api-token`` header.
        text: Text to translate.
        source_language: Source language code (default ``"en"``).
        target_language: Target language code (default ``"fa"``).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``'result'`` field of the JSON reply, or ``None`` when the request
        fails, the server answers with a non-200 status, or the reply has no
        usable ``'result'`` field.
    """
    url = "https://api.one-api.ir/translate/v1/google/"
    request_body = {"source": source_language, "target": target_language, "text": text}
    headers = {"one-api-token": api_key, "Content-Type": "application/json"}

    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.post(url, headers=headers, json=request_body, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error: request failed: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}, {response.text}")
        return None

    try:
        return response.json()['result']
    except (ValueError, KeyError, TypeError) as exc:
        # Body was not JSON, or did not have the expected shape.
        print(f"Error: unexpected response payload: {exc!r}")
        return None

def enhance_text(api_key, text, max_attempts=3, retry_delay=30, timeout=300):
    """Translate an English SRT string to Persian via the one-api.ir GPT-4o endpoint.

    The request is retried up to ``max_attempts`` times. Network errors,
    non-200 replies and malformed payloads all count as failed attempts. If
    every attempt fails, the text is passed to ``translate_text`` as a
    fallback.

    Args:
        api_key: Value sent in the ``one-api-token`` header.
        text: SRT content to translate.
        max_attempts: Number of chatbot requests to try before falling back.
        retry_delay: Seconds to wait between attempts.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        The cleaned reply text on success, otherwise whatever the
        ``translate_text`` fallback returns (which may be ``None``).
    """
    url = "https://api.one-api.ir/chatbot/v1/gpt4o/"

    # Prepare the request body
    request_body = [{
        "role": "user",
        "content": "Instructions: You have received the following English SRT subtitle file for translation into Persian. **Your goal:** is to provide an accurate, fluent, and high-quality translation of this SRT file into Persian. **Context:** First, carefully analyze the English text to understand the content type (general, technical, colloquial, etc.) and its tone (formal, informal, serious, humorous, etc.). **Instructions & Steps:** 1. Review the English SRT file line by line. 2. Determine the content type and overall tone of the text. 3. Provide a Persian translation that accurately conveys the meaning of the English text and maintains an appropriate tone for a Persian-speaking audience. 4. **Response Format:** The output must be a Persian SRT file that precisely preserves the structure of the original English SRT file in terms of timestamps. **Constraints:** 1. The word count of each Persian subtitle should be as close as possible to the word count of the corresponding English subtitle to maintain a natural reading pace. 2. Do not alter the timestamp structure of the SRT file under any circumstances. 3. The translation must be grammatically and stylistically correct and fluent. **Behavior Setting:** The tone of the Persian translation should be consistent with the tone of the English text. If the English text is formal, the Persian translation should also be formal, and if it is informal, the Persian translation should also be informal and colloquial. **Negative Feedback:** Avoid literal and unnatural translations. The translation should be fluent and understandable to a native Persian speaker.**Iteration and Experimentation:** If necessary, review and revise the translation to achieve the best possible result."
    },
    {
    "role": "assistant",
    "content": "okay"
    },
    {
    "role": "user",
    "content": text
    }
    ]

    # Add the API key to the request
    headers = {
        "one-api-token": api_key,
        "Content-Type": "application/json"
    }

    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.post(url, headers=headers, json=request_body, timeout=timeout)
            if response.status_code == 200:
                result = response.json()
                if result.get("status") == 200:
                    print("status: ", result["status"])
                    te = clean_text(result["result"][0])
                    print("result: ", te)
                    return te
                failure = f"status {result.get('status')}"
            else:
                failure = f"{response.status_code}, {response.text}"
        except (requests.RequestException, ValueError, KeyError, IndexError,
                TypeError, AttributeError) as exc:
            # Connection problems and malformed payloads are retried like any
            # other failed attempt instead of aborting the whole pipeline.
            failure = repr(exc)

        if attempt < max_attempts:
            print(f"Error: {failure}, retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
        else:
            # No further attempt follows, so waiting would only delay the fallback.
            print(f"Error: {failure}")

    print("Error Max attempts reached. Could not retrieve a successful response.")
    te = translate_text(api_key, text)
    return te

def read_srt_file(file_path):
    """Return the UTF-8 text of *file_path*, or ``None`` if it cannot be read.

    A missing file and any other read error are reported with ``print`` and
    are not raised to the caller.

    Args:
        file_path: Path of the subtitle file to read.

    Returns:
        The file contents as a string, or ``None`` after a reported failure.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as srt_handle:
            contents = srt_handle.read()
    except FileNotFoundError:
        print(f"The file {file_path} was not found.")
        return None
    except Exception as exc:
        print(f"An error occurred: {exc}")
        return None
    return contents

def write_srt(subtitle_text, output_file="edited_srt.srt"):
    """Write *subtitle_text* to *output_file* as UTF-8, replacing any existing file.

    Args:
        subtitle_text: Text to store.
        output_file: Destination path (default ``"edited_srt.srt"``).
    """
    with open(output_file, mode='w', encoding="utf-8") as srt_out:
        srt_out.write(subtitle_text)

def write_google(google_translate):
    """Store translated subtitle text in ``google_translate.srt``.

    Args:
        google_translate: Text to write (UTF-8).

    Returns:
        The name of the written file, ``"google_translate.srt"``, relative to
        the current working directory.
    """
    target_name = "google_translate.srt"
    with open(target_name, mode='w', encoding="utf-8") as out_handle:
        out_handle.write(google_translate)
    return target_name

def generate_translated_subtitle(language, segments, input_video_name):
    """Re-emit translated SRT text with a right-to-left mark on each text line.

    Blank lines are dropped and the remaining lines are read in groups of
    three (counter, time range, text). The first two lines of each group are
    copied unchanged; the third is prefixed with U+200F and followed by a
    blank separator line. Each cue is therefore expected to carry exactly one
    text line.

    Args:
        language: Accepted for interface compatibility; not used here.
        segments: SRT content as a single string.
        input_video_name: Path whose final component names the output file.

    Returns:
        The name of the written file, ``<basename>.srt``, relative to the
        current working directory.
    """
    subtitle_file = f"{os.path.basename(input_video_name)}.srt"

    # Normalise CRLF and skip whitespace-only lines; a stray "\r" or " " line
    # would otherwise be counted as content and shift the 3-line grouping.
    content_lines = [
        line
        for line in segments.replace('\r\n', '\n').split('\n')
        if line.strip()
    ]

    parts = []
    for position, line in enumerate(content_lines):
        if position % 3 == 2:
            # Text line: the mark forces right-to-left rendering in players.
            parts.append(f"\u200F{line}\n\n")
        else:
            # Counter or time-range line: copied unchanged.
            parts.append(f"{line}\n")

    with open(subtitle_file, "w", encoding='utf8') as f:
        f.write("".join(parts))
    return subtitle_file

def process_video(video, progress=gr.Progress()):
    """Run the full pipeline for one uploaded video.

    Steps: extract the audio, transcribe it to word timings, group the words
    into subtitles, write them to an SRT file, translate that SRT, and write
    the translated SRT with right-to-left marks.

    Args:
        video: Path of the uploaded video as supplied by the Gradio input.
        progress: Gradio progress tracker. The ``gr.Progress()`` default is
            how Gradio injects the tracker, so it must stay in the signature.

    Returns:
        A ``(source_srt, translated_srt)`` tuple of strings.

    Raises:
        gr.Error: If no video was supplied or the translation step returned
            nothing.
    """
    if not video:
        raise gr.Error("Please upload a video file first.")

    # Gradio expects float progress values in the range 0..1.
    progress(0, desc="Starting")
    mp3_file = extract_audio(video)
    wordlevel_info = transcribe(mp3_file)
    progress(0.5, desc="transcribe")
    subtitles = create_subtitles(wordlevel_info)
    # NOTE(review): this file holds the untranslated transcript but is tagged
    # 'fa' in its name; kept as-is so the on-disk file name does not change.
    subtitle_file = generate_subtitle_file('fa', subtitles, 'video_subtitled')
    srt_string = read_srt_file(subtitle_file)
    google_translate = enhance_text(api_key, srt_string)
    if google_translate is None:
        # Surface a readable message instead of a TypeError from file.write(None).
        raise gr.Error("Translation failed: the translation service returned no result.")
    write_google(google_translate)
    sub = read_srt_file(generate_translated_subtitle("fa", google_translate, "video_subtitled"))
    progress(1.0, desc="Finish")
    return srt_string, sub


# Gradio UI: one column with the video upload, a clip-type selector, a trigger
# button and two text boxes receiving the values returned by process_video.
with gr.Blocks() as demo:
    gr.Markdown(value="Start typing below and then click **Run** to see the output.")
    with gr.Column():
        video_file_input = gr.Video(label="Upload Video File")
        # NOTE: clip_type is displayed but is not wired into any callback.
        clip_type = gr.Dropdown(choices=["auto edit", "default"], label="Clip Type")
        btn = gr.Button(value="create")
        srt_en_file_output = gr.Text(label="result: ")
        srt_fa_file_output = gr.Text(label="result: ")
        btn.click(
            fn=process_video,
            inputs=[video_file_input],
            outputs=[srt_en_file_output, srt_fa_file_output],
        )

# Disabled second stage, kept for reference. It refers to names (out,
# video_edit, video_path_output, audio_path_output) that are not defined in
# the code shown here.
#     with gr.Row():
#         vid_out = gr.Video()
#         srt_file = gr.File()
#         btn2 = gr.Button("transcribe")
#         gr.on(
#             triggers=[btn2.click],
#             fn=write_google,
#             inputs=out,
#         ).then(video_edit, [out, video_path_output, audio_path_output], outputs=[vid_out, srt_file])


demo.launch(debug=True)