Spaces:

shethjenil
/

OCRBook2TTSConf

Paused

App Files Community

shethjenil committed on Apr 5, 2025

Commit

cfc35b3

verified ·

1 Parent(s): 94f1087

Update app.py

Browse files

Files changed (1) hide show

app.py +157 -157

app.py CHANGED Viewed

@@ -1,158 +1,158 @@
-import json
-import pysrt
-import re
-import gradio as gr
-import fitz
-from base64 import b64decode, b64encode
-from numpy import array as np_array
-from edge_tts import Communicate, SubMaker
-from concurrent.futures import ThreadPoolExecutor
-from PIL import Image, ImageOps
-from io import BytesIO
-from pydub import AudioSegment
-from moviepy import ImageSequenceClip
-from proglog import TqdmProgressBarLogger
-from deep_translator import GoogleTranslator
-SPECIAL_CHARS = re.compile(r"[!@#$%^&*()_+=\{\}\[\]|\\:;\"'<>,.?/~`]")
-def srt_string_to_obj(srt_string):
-    return [
-        {"index": sub.index, "start": str(sub.start), "end": str(sub.end), "text": sub.text.replace("\n", " ")}
-        for sub in pysrt.from_string(srt_string)
-    ]
-def remove_special_chars(word):
-    return re.sub(SPECIAL_CHARS, "", word)
-def tts(text, voice):
-    submaker = SubMaker()
-    mp3_data = b""
-    communicator = Communicate(text, voice)
-    for chunk in communicator.stream_sync():
-        if chunk["type"] == "audio":
-            mp3_data += chunk["data"]
-        elif chunk["type"] == "WordBoundary":
-            submaker.feed(chunk)
-    srt = submaker.get_srt()
-    return mp3_data, srt_string_to_obj(srt),srt
-def metadata2transcript(page_metadata, transcription):
-    conf = []
-    while page_metadata and transcription:
-        w = page_metadata.pop(0)
-        if w["type"] == "word" and remove_special_chars(w["content"]):
-            t = transcription.pop(0)
-            if remove_special_chars(w["content"]) == remove_special_chars(t["text"]):
-                w["start"], w["end"] = t["start"], t["end"]
-        conf.append(w)
-    return conf
-def tts_process_page(page_metadata, selected_voice):
-    text_content = " ".join(i["content"] for i in page_metadata if i["type"] == "word")
-    mp3_data, transcription,srt_file = tts(text_content, selected_voice)
-    return {
-        "metadata": metadata2transcript(page_metadata, transcription),
-        "mp3data": b64encode(mp3_data).decode("utf-8"),
-        "srt":srt_file
-    }
-def book2tts(book_conf, selected_voice, progress=gr.Progress(track_tqdm=True)):
-    with open(book_conf.name, "r", encoding="utf-8") as f:
-        pages = json.load(f)
-    output = []
-    with ThreadPoolExecutor() as executor:
-        results = list(progress.tqdm(
-            executor.map(lambda p: tts_process_page(p, selected_voice), pages),
-            desc="Processing TTS",
-            total=len(pages),  # Ensure progress tracking works correctly
-            unit="page"
-        ))
-    output.extend(results)
-    output_file = "book_tts.json"
-    with open(output_file, "w", encoding="utf-8") as f:
-        json.dump(output, f, ensure_ascii=False)
-    return output_file
-def merge_srt_files(srt_strings, durations):
-    merged_subs = pysrt.SubRipFile()
-    current_offset = 0
-    for srt_string, duration in zip(srt_strings, durations):
-        subs = pysrt.from_string(srt_string)
-        for sub in subs:
-            sub.shift(seconds=current_offset)
-            merged_subs.append(sub)
-        current_offset += duration
-    return merged_subs
-class CustomLogger(TqdmProgressBarLogger):
-    def __init__(self, gradio_progress: gr.Progress):
-        self.gradio_progress = gradio_progress
-        super().__init__(print_messages=False)
-    def bars_callback(self, bar, attr, value, old_value):
-        if bar=='frame_index':
-            self.gradio_progress(value / self.bars[bar]['total'],"Rendering Video",unit="Frames")
-def process_page(page_num, page, tts_conf, max_width, max_height):
-    pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
-    img_pil = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-    img_pil = img_pil.convert("RGBA")
-    img_pil = ImageOps.pad(img_pil, (max_width, max_height), color=(255, 255, 255))
-    frame = np_array(img_pil)
-    page_tts_conf = tts_conf[page_num]
-    try:
-        mp3_data = b64decode(page_tts_conf["mp3data"], validate=True)
-        audio = AudioSegment.from_file(BytesIO(mp3_data), format="mp3")
-    except:
-        audio = AudioSegment.silent(duration=3000)
-    return frame, audio.duration_seconds, audio,page_tts_conf["srt"]
-def pdf_to_video(pdf_file, tts_file, progress=gr.Progress()):
-    pdf_path = pdf_file.name
-    tts_path = tts_file.name
-    pdf = fitz.open(pdf_path)
-    with open(tts_path, "r", encoding="utf-8") as f:
-        tts_conf = json.load(f)
-    max_width, max_height = 0, 0
-    for page in pdf:
-        pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
-        max_width = max(max_width, pix.width)
-        max_height = max(max_height, pix.height)
-    frames, durations, audio_clips, srt_files = [], [], [], []
-    with ThreadPoolExecutor() as executor:
-        results = list(progress.tqdm(executor.map(lambda x: process_page(x, pdf[x], tts_conf, max_width, max_height), range(len(pdf))), total=len(pdf), desc="Processing Pages & Audio", unit="page"))
-    for frame, duration, audio, srt_file in results:
-        frames.append(frame)
-        durations.append(duration)
-        audio_clips.append(audio)
-        srt_files.append(srt_file)
-    combined_audio = sum(audio_clips)
-    combined_audio.export("merged_audio.mp3", format="mp3")
-    ImageSequenceClip(frames, durations=durations).write_videofile(
-    "output.mp4",
-    fps=1,
-    codec="libx264",
-    audio="merged_audio.mp3",  # Adds audio directly
-    audio_codec="aac",
-    audio_bitrate="128k",
-    preset="slow",
-    ffmpeg_params=["-crf", "20"],
-    logger=CustomLogger(progress)
-)
-    merge_srt_files(srt_files, durations).save("merged_subs.srt", encoding="utf-8")
-    return "output.mp4", "merged_subs.srt"
-TransLanguages = json.load(open("translate_language.json"))
-def translate_json(book_conf:str,source:str):
-    texts = [i for i in ["\n".join(line.strip() for line in " ".join(i['content'] if i['type'] == 'word' else "\n" for i in con).split("\n")).strip() for con in json.load(open(book_conf.name, "r", encoding="utf-8"))]]
-    json.dump([{"original":ot,"translate":tr} for ot,tr in zip(texts,GoogleTranslator(source=TransLanguages[source], target='en').translate_batch(texts))],open("translate.json","w"))
-    return "translate.json"
 gr.TabbedInterface([gr.Interface(book2tts,[gr.File(label="Upload Book JSON"),gr.Dropdown(choices=[voice["Name"] for voice in json.load(open("voices.json"))], label="Select Voice")],gr.File(label="Download TTS JSON"),),gr.Interface(pdf_to_video,[gr.File(label="Upload PDF"),gr.File(label="Upload TTS JSON")],[gr.Video(label="Output Video"),gr.File(label="Subtitle File")],),gr.Interface(translate_json,[gr.File(label="Upload Book Conf JSON"),gr.Dropdown(list(TransLanguages.keys()))],gr.File(label="Translated JSON"))],["TTSMaker","VideoMaker","TranslationMaker"]).launch()

+import json
+import pysrt
+import re
+import gradio as gr
+import fitz
+from base64 import b64decode, b64encode
+from numpy import array as np_array
+from edge_tts import Communicate, SubMaker
+from concurrent.futures import ThreadPoolExecutor
+from PIL import Image, ImageOps
+from io import BytesIO
+from pydub import AudioSegment
+from moviepy import ImageSequenceClip
+from proglog import TqdmProgressBarLogger
+from deep_translator import GoogleTranslator
+SPECIAL_CHARS = re.compile(r"[!@#$%^&*()_+=\{\}\[\]|\\:;\"'<>,.?/~`]")
+def srt_string_to_obj(srt_string):
+    return [
+        {"index": sub.index, "start": str(sub.start), "end": str(sub.end), "text": sub.text.replace("\n", " ")}
+        for sub in pysrt.from_string(srt_string)
+    ]
+def remove_special_chars(word):
+    return re.sub(SPECIAL_CHARS, "", word)
+def tts(text, voice):
+    submaker = SubMaker()
+    mp3_data = b""
+    communicator = Communicate(text, voice)
+    for chunk in communicator.stream_sync():
+        if chunk["type"] == "audio":
+            mp3_data += chunk["data"]
+        elif chunk["type"] == "WordBoundary":
+            submaker.feed(chunk)
+    srt = submaker.get_srt()
+    return mp3_data, srt_string_to_obj(srt),srt
+def metadata2transcript(page_metadata, transcription):
+    conf = []
+    while page_metadata and transcription:
+        w = page_metadata.pop(0)
+        if w["type"] == "word" and remove_special_chars(w["content"]):
+            t = transcription.pop(0)
+            if remove_special_chars(w["content"]) == remove_special_chars(t["text"]):
+                w["start"], w["end"] = t["start"], t["end"]
+        conf.append(w)
+    return conf
+def tts_process_page(page_metadata, selected_voice):
+    text_content = " ".join(i["content"] for i in page_metadata if i["type"] == "word")
+    mp3_data, transcription,srt_file = tts(text_content, selected_voice)
+    return {
+        "metadata": metadata2transcript(page_metadata, transcription),
+        "mp3data": b64encode(mp3_data).decode("utf-8"),
+        "srt":srt_file
+    }
+def book2tts(book_conf, selected_voice, progress=gr.Progress(track_tqdm=True)):
+    with open(book_conf.name, "r", encoding="utf-8") as f:
+        pages = json.load(f)
+    output = []
+    with ThreadPoolExecutor() as executor:
+        results = list(progress.tqdm(
+            executor.map(lambda p: tts_process_page(p, selected_voice), pages),
+            desc="Processing TTS",
+            total=len(pages),  # Ensure progress tracking works correctly
+            unit="page"
+        ))
+    output.extend(results)
+    output_file = "book_tts.json"
+    with open(output_file, "w", encoding="utf-8") as f:
+        json.dump(output, f, ensure_ascii=False)
+    return output_file
+def merge_srt_files(srt_strings, durations):
+    merged_subs = pysrt.SubRipFile()
+    current_offset = 0
+    for srt_string, duration in zip(srt_strings, durations):
+        subs = pysrt.from_string(srt_string)
+        for sub in subs:
+            sub.shift(seconds=current_offset)
+            merged_subs.append(sub)
+        current_offset += duration
+    return merged_subs
+class CustomLogger(TqdmProgressBarLogger):
+    def __init__(self, gradio_progress: gr.Progress):
+        self.gradio_progress = gradio_progress
+        super().__init__(print_messages=False)
+    def bars_callback(self, bar, attr, value, old_value):
+        if bar=='frame_index':
+            self.gradio_progress(value / self.bars[bar]['total'],"Rendering Video",unit="Frames")
+def process_page(page_num, page, tts_conf, max_width, max_height):
+    pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
+    img_pil = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+    img_pil = img_pil.convert("RGBA")
+    img_pil = ImageOps.pad(img_pil, (max_width, max_height), color=(255, 255, 255))
+    frame = np_array(img_pil)
+    page_tts_conf = tts_conf[page_num]
+    try:
+        mp3_data = b64decode(page_tts_conf["mp3data"], validate=True)
+        audio = AudioSegment.from_file(BytesIO(mp3_data), format="mp3")
+    except:
+        audio = AudioSegment.silent(duration=3000)
+    return frame, audio.duration_seconds, audio,page_tts_conf["srt"]
+def pdf_to_video(pdf_file, tts_file, progress=gr.Progress()):
+    pdf_path = pdf_file.name
+    tts_path = tts_file.name
+    pdf = fitz.open(pdf_path)
+    with open(tts_path, "r", encoding="utf-8") as f:
+        tts_conf = json.load(f)
+    max_width, max_height = 0, 0
+    for page in pdf:
+        pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
+        max_width = max(max_width, pix.width)
+        max_height = max(max_height, pix.height)
+    frames, durations, audio_clips, srt_files = [], [], [], []
+    with ThreadPoolExecutor() as executor:
+        results = list(progress.tqdm(executor.map(lambda x: process_page(x, pdf[x], tts_conf, max_width, max_height), range(len(pdf))), total=len(pdf), desc="Processing Pages & Audio", unit="page"))
+    for frame, duration, audio, srt_file in results:
+        frames.append(frame)
+        durations.append(duration)
+        audio_clips.append(audio)
+        srt_files.append(srt_file)
+    combined_audio = sum(audio_clips)
+    combined_audio.export("merged_audio.mp3", format="mp3")
+    ImageSequenceClip(frames, durations=durations).write_videofile(
+    "output.mp4",
+    fps=1,
+    codec="libx264",
+    audio="merged_audio.mp3",  # Adds audio directly
+    audio_codec="aac",
+    audio_bitrate="128k",
+    preset="slow",
+    ffmpeg_params=["-crf", "20"],
+    logger=CustomLogger(progress)
+)
+    merge_srt_files(srt_files, durations).save("merged_subs.srt", encoding="utf-8")
+    return "output.mp4", "merged_subs.srt"
+TransLanguages = json.load(open("translate_language.json"))
+def translate_json(book_conf:str,source:str):
+    texts = [i for i in ["\n".join(line.strip() for line in " ".join(i['content'] if i['type'] == 'word' else "\n" for i in con).split("\n")).strip() for con in json.load(open(book_conf.name, "r", encoding="utf-8"))]]
+    json.dump([{"original":ot,"translate":tr} for ot,tr in zip(texts,GoogleTranslator(source=TransLanguages[source], target='en').translate_batch(texts))],open("translate.json","w"),ensure_ascii=False)
+    return "translate.json"
 gr.TabbedInterface([gr.Interface(book2tts,[gr.File(label="Upload Book JSON"),gr.Dropdown(choices=[voice["Name"] for voice in json.load(open("voices.json"))], label="Select Voice")],gr.File(label="Download TTS JSON"),),gr.Interface(pdf_to_video,[gr.File(label="Upload PDF"),gr.File(label="Upload TTS JSON")],[gr.Video(label="Output Video"),gr.File(label="Subtitle File")],),gr.Interface(translate_json,[gr.File(label="Upload Book Conf JSON"),gr.Dropdown(list(TransLanguages.keys()))],gr.File(label="Translated JSON"))],["TTSMaker","VideoMaker","TranslationMaker"]).launch()