File size: 6,757 Bytes
cfc35b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94f1087
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import json
import pysrt
import re
import gradio as gr
import fitz
from base64 import b64decode, b64encode
from numpy import array as np_array
from edge_tts import Communicate, SubMaker
from concurrent.futures import ThreadPoolExecutor
from PIL import Image, ImageOps
from io import BytesIO
from pydub import AudioSegment
from moviepy import ImageSequenceClip
from proglog import TqdmProgressBarLogger
from deep_translator import GoogleTranslator

# Punctuation/symbol characters stripped before comparing page words against TTS tokens.
SPECIAL_CHARS = re.compile(r"[!@#$%^&*()_+=\{\}\[\]|\\:;\"'<>,.?/~`]")


def srt_string_to_obj(srt_string):
    """Parse an SRT document into a list of plain dicts (index/start/end/text)."""
    entries = []
    for cue in pysrt.from_string(srt_string):
        entries.append({
            "index": cue.index,
            "start": str(cue.start),
            "end": str(cue.end),
            "text": cue.text.replace("\n", " "),
        })
    return entries


def remove_special_chars(word):
    """Return *word* with every SPECIAL_CHARS punctuation character removed."""
    return SPECIAL_CHARS.sub("", word)

def tts(text, voice):
    """Synthesize *text* with edge-tts.

    Returns:
        (mp3 bytes, subtitle dicts from srt_string_to_obj, raw SRT string).
    """
    sub_maker = SubMaker()
    audio_chunks = []

    for event in Communicate(text, voice).stream_sync():
        kind = event["type"]
        if kind == "audio":
            audio_chunks.append(event["data"])
        elif kind == "WordBoundary":
            sub_maker.feed(event)

    srt_text = sub_maker.get_srt()
    return b"".join(audio_chunks), srt_string_to_obj(srt_text), srt_text

def metadata2transcript(page_metadata, transcription):
    """Merge TTS word timings back into page metadata.

    Consumes both lists in place (pop from the front). A metadata entry of
    type "word" whose cleaned text matches the next transcription token gets
    "start"/"end" timestamps copied onto it; the token is consumed either way.
    """
    merged = []
    while page_metadata and transcription:
        entry = page_metadata.pop(0)
        cleaned = remove_special_chars(entry["content"]) if entry["type"] == "word" else ""
        if cleaned:
            token = transcription.pop(0)
            if cleaned == remove_special_chars(token["text"]):
                entry["start"], entry["end"] = token["start"], token["end"]
        merged.append(entry)
    return merged

def tts_process_page(page_metadata, selected_voice):
    """Run TTS for one page; return its timed metadata, base64-encoded MP3, and SRT."""
    words = [item["content"] for item in page_metadata if item["type"] == "word"]
    mp3_bytes, transcription, srt_text = tts(" ".join(words), selected_voice)
    return {
        "metadata": metadata2transcript(page_metadata, transcription),
        "mp3data": b64encode(mp3_bytes).decode("utf-8"),
        "srt": srt_text,
    }

def book2tts(book_conf, selected_voice, progress=gr.Progress(track_tqdm=True)):
    """Convert an uploaded book-metadata JSON into per-page TTS results.

    Pages are processed concurrently (executor.map preserves page order) and
    the combined results are written to book_tts.json, whose path is returned.
    """
    with open(book_conf.name, "r", encoding="utf-8") as f:
        pages = json.load(f)

    with ThreadPoolExecutor() as executor:
        page_jobs = executor.map(lambda p: tts_process_page(p, selected_voice), pages)
        output = list(progress.tqdm(
            page_jobs,
            desc="Processing TTS",
            total=len(pages),  # map yields lazily, so tqdm needs the count up front
            unit="page",
        ))

    output_file = "book_tts.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False)
    return output_file


def merge_srt_files(srt_strings, durations):
    """Concatenate per-page SRT documents into one SubRipFile.

    Each page's cues are shifted by the cumulative audio duration (in seconds)
    of the preceding pages, so timestamps line up with the merged audio track.

    Args:
        srt_strings: iterable of SRT documents, one per page.
        durations: matching iterable of per-page audio durations in seconds.

    Returns:
        pysrt.SubRipFile with globally consistent timing and cue indices.
    """
    merged_subs = pysrt.SubRipFile()
    current_offset = 0
    for srt_string, duration in zip(srt_strings, durations):
        for sub in pysrt.from_string(srt_string):
            sub.shift(seconds=current_offset)
            merged_subs.append(sub)
        current_offset += duration
    # Bug fix: every page's SRT restarts numbering at 1, so the merged file
    # contained duplicate indices; sort by time and renumber before returning.
    merged_subs.clean_indexes()
    return merged_subs

class CustomLogger(TqdmProgressBarLogger):
    """Bridges moviepy's proglog progress events into a Gradio progress bar."""

    def __init__(self, gradio_progress: gr.Progress):
        self.gradio_progress = gradio_progress
        super().__init__(print_messages=False)

    def bars_callback(self, bar, attr, value, old_value):
        # Only the frame-rendering bar is forwarded to the UI.
        if bar != 'frame_index':
            return
        total = self.bars[bar]['total']
        self.gradio_progress(value / total, "Rendering Video", unit="Frames")

def process_page(page_num, page, tts_conf, max_width, max_height):
    """Render one PDF page to an RGBA frame and decode its TTS audio.

    Args:
        page_num: index into tts_conf for this page.
        page: fitz page object to rasterize.
        tts_conf: list of per-page dicts with "mp3data" (base64) and "srt".
        max_width: canvas width the page image is padded to.
        max_height: canvas height the page image is padded to.

    Returns:
        (frame ndarray, audio duration in seconds, AudioSegment, srt string).
    """
    # 1.5x zoom matches the sizing pass in pdf_to_video that computed max_width/height.
    pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
    img_pil = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    img_pil = img_pil.convert("RGBA")
    # Pad with white so every frame shares one canvas size for the video.
    img_pil = ImageOps.pad(img_pil, (max_width, max_height), color=(255, 255, 255))
    frame = np_array(img_pil)
    page_tts_conf = tts_conf[page_num]
    try:
        mp3_data = b64decode(page_tts_conf["mp3data"], validate=True)
        audio = AudioSegment.from_file(BytesIO(mp3_data), format="mp3")
    except Exception:
        # Bug fix: was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit. Any decode/parse failure falls back to 3s of silence
        # so the video keeps its one-frame-per-page structure.
        audio = AudioSegment.silent(duration=3000)
    return frame, audio.duration_seconds, audio, page_tts_conf["srt"]

def pdf_to_video(pdf_file, tts_file, progress=gr.Progress()):
    """Render a PDF plus its TTS JSON into output.mp4 (one frame per page,
    shown for that page's audio duration) and a merged merged_subs.srt.

    Args:
        pdf_file: uploaded PDF (gradio file object; .name is the path).
        tts_file: uploaded TTS JSON produced by book2tts.
        progress: gradio progress reporter.

    Returns:
        ("output.mp4", "merged_subs.srt") paths for the gradio outputs.
    """
    pdf_path = pdf_file.name
    tts_path = tts_file.name
    pdf = fitz.open(pdf_path)
    with open(tts_path, "r", encoding="utf-8") as f:
        tts_conf = json.load(f)
    # First pass: find the largest rendered page so all frames share one canvas size.
    # Uses the same 1.5x zoom matrix as process_page.
    max_width, max_height = 0, 0
    for page in pdf:
        pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
        max_width = max(max_width, pix.width)
        max_height = max(max_height, pix.height)
    frames, durations, audio_clips, srt_files = [], [], [], []
    # Rasterize pages and decode audio concurrently; executor.map keeps page order.
    with ThreadPoolExecutor() as executor:
        results = list(progress.tqdm(executor.map(lambda x: process_page(x, pdf[x], tts_conf, max_width, max_height), range(len(pdf))), total=len(pdf), desc="Processing Pages & Audio", unit="page"))

    for frame, duration, audio, srt_file in results:
        frames.append(frame)
        durations.append(duration)
        audio_clips.append(audio)
        srt_files.append(srt_file)

    # NOTE(review): sum() starts from 0, so this relies on pydub AudioSegment
    # supporting `0 + segment` — confirm against the installed pydub version.
    combined_audio = sum(audio_clips)
    combined_audio.export("merged_audio.mp3", format="mp3")

    # Each frame is displayed for its page's audio duration; fps=1 since frames change per page.
    ImageSequenceClip(frames, durations=durations).write_videofile(
    "output.mp4",
    fps=1,
    codec="libx264",  
    audio="merged_audio.mp3",  # Adds audio directly
    audio_codec="aac",  
    audio_bitrate="128k",  
    preset="slow",  
    ffmpeg_params=["-crf", "20"],  
    logger=CustomLogger(progress)
)
    # Subtitles are offset by the same per-page durations used for the frames.
    merge_srt_files(srt_files, durations).save("merged_subs.srt", encoding="utf-8")
    return "output.mp4", "merged_subs.srt"

# UI language name -> language code mapping for deep_translator, loaded once at import.
# Bug fix: use a context manager so the file handle is closed deterministically
# (was `json.load(open(...))`, which leaked the handle), and read as UTF-8 explicitly.
with open("translate_language.json", "r", encoding="utf-8") as _trans_languages_file:
    TransLanguages = json.load(_trans_languages_file)

def _page_text(page):
    """Flatten one page's metadata into plain text: word contents joined by
    spaces, non-word items turned into line breaks, each line stripped."""
    raw = " ".join(item["content"] if item["type"] == "word" else "\n" for item in page)
    return "\n".join(line.strip() for line in raw.split("\n")).strip()


def translate_json(book_conf: str, source: str):
    """Translate a book-metadata JSON into English via Google Translate.

    Args:
        book_conf: uploaded book JSON (gradio file object; .name is the path).
        source: display name of the source language (key of TransLanguages).

    Returns:
        Path of the written "translate.json" ([{"original", "translate"}, ...]).
    """
    # Bug fix: the original one-liner wrapped the page texts in a redundant
    # identity comprehension and used `open(...)` without closing either file.
    with open(book_conf.name, "r", encoding="utf-8") as f:
        pages = json.load(f)
    texts = [_page_text(page) for page in pages]
    translated = GoogleTranslator(source=TransLanguages[source], target="en").translate_batch(texts)
    with open("translate.json", "w", encoding="utf-8") as f:
        json.dump(
            [{"original": original, "translate": translation}
             for original, translation in zip(texts, translated)],
            f,
            ensure_ascii=False,
        )
    return "translate.json"

# Application entry point: three tabs wired to the functions above.
# Bug fix: voices.json was read via `json.load(open(...))`, leaking the handle;
# the one-line UI definition is also unfolded into named tabs for readability.
with open("voices.json", "r", encoding="utf-8") as _voices_file:
    _voice_names = [voice["Name"] for voice in json.load(_voices_file)]

_tts_tab = gr.Interface(
    book2tts,
    [gr.File(label="Upload Book JSON"),
     gr.Dropdown(choices=_voice_names, label="Select Voice")],
    gr.File(label="Download TTS JSON"),
)
_video_tab = gr.Interface(
    pdf_to_video,
    [gr.File(label="Upload PDF"), gr.File(label="Upload TTS JSON")],
    [gr.Video(label="Output Video"), gr.File(label="Subtitle File")],
)
_translate_tab = gr.Interface(
    translate_json,
    [gr.File(label="Upload Book Conf JSON"), gr.Dropdown(list(TransLanguages.keys()))],
    gr.File(label="Translated JSON"),
)
gr.TabbedInterface(
    [_tts_tab, _video_tab, _translate_tab],
    ["TTSMaker", "VideoMaker", "TranslationMaker"],
).launch()