shethjenil committed on
Commit
cfc35b3
·
verified ·
1 Parent(s): 94f1087

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +157 -157
app.py CHANGED
@@ -1,158 +1,158 @@
1
- import json
2
- import pysrt
3
- import re
4
- import gradio as gr
5
- import fitz
6
- from base64 import b64decode, b64encode
7
- from numpy import array as np_array
8
- from edge_tts import Communicate, SubMaker
9
- from concurrent.futures import ThreadPoolExecutor
10
- from PIL import Image, ImageOps
11
- from io import BytesIO
12
- from pydub import AudioSegment
13
- from moviepy import ImageSequenceClip
14
- from proglog import TqdmProgressBarLogger
15
- from deep_translator import GoogleTranslator
16
-
17
- SPECIAL_CHARS = re.compile(r"[!@#$%^&*()_+=\{\}\[\]|\\:;\"'<>,.?/~`]")
18
- def srt_string_to_obj(srt_string):
19
- return [
20
- {"index": sub.index, "start": str(sub.start), "end": str(sub.end), "text": sub.text.replace("\n", " ")}
21
- for sub in pysrt.from_string(srt_string)
22
- ]
23
-
24
- def remove_special_chars(word):
25
- return re.sub(SPECIAL_CHARS, "", word)
26
-
27
- def tts(text, voice):
28
- submaker = SubMaker()
29
- mp3_data = b""
30
-
31
- communicator = Communicate(text, voice)
32
- for chunk in communicator.stream_sync():
33
- if chunk["type"] == "audio":
34
- mp3_data += chunk["data"]
35
- elif chunk["type"] == "WordBoundary":
36
- submaker.feed(chunk)
37
- srt = submaker.get_srt()
38
- return mp3_data, srt_string_to_obj(srt),srt
39
-
40
- def metadata2transcript(page_metadata, transcription):
41
- conf = []
42
- while page_metadata and transcription:
43
- w = page_metadata.pop(0)
44
- if w["type"] == "word" and remove_special_chars(w["content"]):
45
- t = transcription.pop(0)
46
- if remove_special_chars(w["content"]) == remove_special_chars(t["text"]):
47
- w["start"], w["end"] = t["start"], t["end"]
48
- conf.append(w)
49
- return conf
50
-
51
- def tts_process_page(page_metadata, selected_voice):
52
- text_content = " ".join(i["content"] for i in page_metadata if i["type"] == "word")
53
- mp3_data, transcription,srt_file = tts(text_content, selected_voice)
54
- return {
55
- "metadata": metadata2transcript(page_metadata, transcription),
56
- "mp3data": b64encode(mp3_data).decode("utf-8"),
57
- "srt":srt_file
58
- }
59
-
60
- def book2tts(book_conf, selected_voice, progress=gr.Progress(track_tqdm=True)):
61
- with open(book_conf.name, "r", encoding="utf-8") as f:
62
- pages = json.load(f)
63
-
64
- output = []
65
- with ThreadPoolExecutor() as executor:
66
- results = list(progress.tqdm(
67
- executor.map(lambda p: tts_process_page(p, selected_voice), pages),
68
- desc="Processing TTS",
69
- total=len(pages), # Ensure progress tracking works correctly
70
- unit="page"
71
- ))
72
-
73
- output.extend(results)
74
- output_file = "book_tts.json"
75
- with open(output_file, "w", encoding="utf-8") as f:
76
- json.dump(output, f, ensure_ascii=False)
77
- return output_file
78
-
79
-
80
- def merge_srt_files(srt_strings, durations):
81
- merged_subs = pysrt.SubRipFile()
82
- current_offset = 0
83
- for srt_string, duration in zip(srt_strings, durations):
84
- subs = pysrt.from_string(srt_string)
85
- for sub in subs:
86
- sub.shift(seconds=current_offset)
87
- merged_subs.append(sub)
88
- current_offset += duration
89
- return merged_subs
90
-
91
- class CustomLogger(TqdmProgressBarLogger):
92
- def __init__(self, gradio_progress: gr.Progress):
93
- self.gradio_progress = gradio_progress
94
- super().__init__(print_messages=False)
95
- def bars_callback(self, bar, attr, value, old_value):
96
- if bar=='frame_index':
97
- self.gradio_progress(value / self.bars[bar]['total'],"Rendering Video",unit="Frames")
98
-
99
- def process_page(page_num, page, tts_conf, max_width, max_height):
100
- pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
101
- img_pil = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
102
- img_pil = img_pil.convert("RGBA")
103
- img_pil = ImageOps.pad(img_pil, (max_width, max_height), color=(255, 255, 255))
104
- frame = np_array(img_pil)
105
- page_tts_conf = tts_conf[page_num]
106
- try:
107
- mp3_data = b64decode(page_tts_conf["mp3data"], validate=True)
108
- audio = AudioSegment.from_file(BytesIO(mp3_data), format="mp3")
109
- except:
110
- audio = AudioSegment.silent(duration=3000)
111
- return frame, audio.duration_seconds, audio,page_tts_conf["srt"]
112
-
113
- def pdf_to_video(pdf_file, tts_file, progress=gr.Progress()):
114
- pdf_path = pdf_file.name
115
- tts_path = tts_file.name
116
- pdf = fitz.open(pdf_path)
117
- with open(tts_path, "r", encoding="utf-8") as f:
118
- tts_conf = json.load(f)
119
- max_width, max_height = 0, 0
120
- for page in pdf:
121
- pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
122
- max_width = max(max_width, pix.width)
123
- max_height = max(max_height, pix.height)
124
- frames, durations, audio_clips, srt_files = [], [], [], []
125
- with ThreadPoolExecutor() as executor:
126
- results = list(progress.tqdm(executor.map(lambda x: process_page(x, pdf[x], tts_conf, max_width, max_height), range(len(pdf))), total=len(pdf), desc="Processing Pages & Audio", unit="page"))
127
-
128
- for frame, duration, audio, srt_file in results:
129
- frames.append(frame)
130
- durations.append(duration)
131
- audio_clips.append(audio)
132
- srt_files.append(srt_file)
133
-
134
- combined_audio = sum(audio_clips)
135
- combined_audio.export("merged_audio.mp3", format="mp3")
136
-
137
- ImageSequenceClip(frames, durations=durations).write_videofile(
138
- "output.mp4",
139
- fps=1,
140
- codec="libx264",
141
- audio="merged_audio.mp3", # Adds audio directly
142
- audio_codec="aac",
143
- audio_bitrate="128k",
144
- preset="slow",
145
- ffmpeg_params=["-crf", "20"],
146
- logger=CustomLogger(progress)
147
- )
148
- merge_srt_files(srt_files, durations).save("merged_subs.srt", encoding="utf-8")
149
- return "output.mp4", "merged_subs.srt"
150
-
151
- TransLanguages = json.load(open("translate_language.json"))
152
-
153
- def translate_json(book_conf:str,source:str):
154
- texts = [i for i in ["\n".join(line.strip() for line in " ".join(i['content'] if i['type'] == 'word' else "\n" for i in con).split("\n")).strip() for con in json.load(open(book_conf.name, "r", encoding="utf-8"))]]
155
- json.dump([{"original":ot,"translate":tr} for ot,tr in zip(texts,GoogleTranslator(source=TransLanguages[source], target='en').translate_batch(texts))],open("translate.json","w"))
156
- return "translate.json"
157
-
158
  gr.TabbedInterface([gr.Interface(book2tts,[gr.File(label="Upload Book JSON"),gr.Dropdown(choices=[voice["Name"] for voice in json.load(open("voices.json"))], label="Select Voice")],gr.File(label="Download TTS JSON"),),gr.Interface(pdf_to_video,[gr.File(label="Upload PDF"),gr.File(label="Upload TTS JSON")],[gr.Video(label="Output Video"),gr.File(label="Subtitle File")],),gr.Interface(translate_json,[gr.File(label="Upload Book Conf JSON"),gr.Dropdown(list(TransLanguages.keys()))],gr.File(label="Translated JSON"))],["TTSMaker","VideoMaker","TranslationMaker"]).launch()
 
1
+ import json
2
+ import pysrt
3
+ import re
4
+ import gradio as gr
5
+ import fitz
6
+ from base64 import b64decode, b64encode
7
+ from numpy import array as np_array
8
+ from edge_tts import Communicate, SubMaker
9
+ from concurrent.futures import ThreadPoolExecutor
10
+ from PIL import Image, ImageOps
11
+ from io import BytesIO
12
+ from pydub import AudioSegment
13
+ from moviepy import ImageSequenceClip
14
+ from proglog import TqdmProgressBarLogger
15
+ from deep_translator import GoogleTranslator
16
+
17
+ SPECIAL_CHARS = re.compile(r"[!@#$%^&*()_+=\{\}\[\]|\\:;\"'<>,.?/~`]")
18
def srt_string_to_obj(srt_string):
    """Parse an SRT string into a list of plain dicts (one per subtitle cue)."""
    entries = []
    for cue in pysrt.from_string(srt_string):
        entries.append({
            "index": cue.index,
            "start": str(cue.start),
            "end": str(cue.end),
            # Collapse multi-line cue text onto one line.
            "text": cue.text.replace("\n", " "),
        })
    return entries
23
+
24
def remove_special_chars(word):
    """Return *word* with all SPECIAL_CHARS punctuation stripped out."""
    return SPECIAL_CHARS.sub("", word)
26
+
27
def tts(text, voice):
    """Synthesize *text* with edge-tts.

    Returns a tuple of (mp3 bytes, parsed subtitle dicts, raw SRT string).
    """
    submaker = SubMaker()
    audio_chunks = []

    for chunk in Communicate(text, voice).stream_sync():
        kind = chunk["type"]
        if kind == "audio":
            audio_chunks.append(chunk["data"])
        elif kind == "WordBoundary":
            # Word timing events feed the subtitle builder.
            submaker.feed(chunk)

    srt = submaker.get_srt()
    return b"".join(audio_chunks), srt_string_to_obj(srt), srt
39
+
40
def metadata2transcript(page_metadata, transcription):
    """Attach word timings from *transcription* onto *page_metadata* items.

    Walks both lists in lockstep: every metadata item is consumed; a
    transcription entry is consumed only for items of type "word" whose
    content is non-empty after punctuation stripping. When the stripped
    texts match, "start"/"end" are copied onto the metadata dict.

    Both input lists are consumed in place (same side effect as the
    original pop(0)-based loop), but this version runs in O(n) instead of
    O(n^2) by using index cursors and deleting the processed prefixes once.

    Returns the list of processed metadata dicts.
    """
    conf = []
    i = j = 0
    n_meta, n_trans = len(page_metadata), len(transcription)
    while i < n_meta and j < n_trans:
        w = page_metadata[i]
        i += 1
        if w["type"] == "word" and remove_special_chars(w["content"]):
            t = transcription[j]
            j += 1
            if remove_special_chars(w["content"]) == remove_special_chars(t["text"]):
                w["start"], w["end"] = t["start"], t["end"]
        conf.append(w)
    # Preserve the original consuming behavior for callers.
    del page_metadata[:i]
    del transcription[:j]
    return conf
50
+
51
def tts_process_page(page_metadata, selected_voice):
    """Run TTS for one page and bundle aligned metadata, audio and SRT."""
    words = [item["content"] for item in page_metadata if item["type"] == "word"]
    mp3_data, transcription, srt_file = tts(" ".join(words), selected_voice)
    aligned = metadata2transcript(page_metadata, transcription)
    return {
        "metadata": aligned,
        # Audio is embedded as base64 so the whole page fits in one JSON object.
        "mp3data": b64encode(mp3_data).decode("utf-8"),
        "srt": srt_file,
    }
59
+
60
def book2tts(book_conf, selected_voice, progress=gr.Progress(track_tqdm=True)):
    """Generate per-page TTS data for a whole book JSON and write it to disk.

    book_conf: uploaded gradio file whose .name points at the book JSON
    (a list of per-page metadata lists). Returns the output file path.
    """
    with open(book_conf.name, "r", encoding="utf-8") as fh:
        pages = json.load(fh)

    # Fan the pages out across worker threads; progress.tqdm wraps the
    # lazy map so the UI bar advances as results arrive.
    with ThreadPoolExecutor() as pool:
        page_results = pool.map(lambda page: tts_process_page(page, selected_voice), pages)
        output = list(progress.tqdm(
            page_results,
            desc="Processing TTS",
            total=len(pages),  # Ensure progress tracking works correctly
            unit="page",
        ))

    output_file = "book_tts.json"
    with open(output_file, "w", encoding="utf-8") as fh:
        json.dump(output, fh, ensure_ascii=False)
    return output_file
78
+
79
+
80
def merge_srt_files(srt_strings, durations):
    """Concatenate SRT strings, shifting each by the preceding clips' durations.

    durations are the audio lengths (seconds) of the clips the SRTs belong
    to; each file's cues are offset by the running total before merging.
    """
    merged = pysrt.SubRipFile()
    offset = 0
    for srt_text, clip_seconds in zip(srt_strings, durations):
        for cue in pysrt.from_string(srt_text):
            cue.shift(seconds=offset)
            merged.append(cue)
        offset += clip_seconds
    return merged
90
+
91
class CustomLogger(TqdmProgressBarLogger):
    """Bridges moviepy's proglog progress events into a gradio Progress bar."""

    def __init__(self, gradio_progress: gr.Progress):
        self.gradio_progress = gradio_progress
        super().__init__(print_messages=False)

    def bars_callback(self, bar, attr, value, old_value):
        # Only the frame-rendering bar is surfaced to the UI.
        if bar != 'frame_index':
            return
        total = self.bars[bar]['total']
        self.gradio_progress(value / total, "Rendering Video", unit="Frames")
98
+
99
def process_page(page_num, page, tts_conf, max_width, max_height):
    """Render one PDF page to a video frame and decode its TTS audio.

    Returns (frame array, duration in seconds, AudioSegment, srt string).
    Falls back to 3 seconds of silence when the stored audio is missing
    or undecodable.
    """
    pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
    img_pil = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    img_pil = img_pil.convert("RGBA")
    # Pad every page onto the same canvas so the video has a constant frame size.
    img_pil = ImageOps.pad(img_pil, (max_width, max_height), color=(255, 255, 255))
    frame = np_array(img_pil)
    page_tts_conf = tts_conf[page_num]
    try:
        mp3_data = b64decode(page_tts_conf["mp3data"], validate=True)
        audio = AudioSegment.from_file(BytesIO(mp3_data), format="mp3")
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit.
        # Bad base64 or corrupt mp3: show the page silently for 3 seconds.
        audio = AudioSegment.silent(duration=3000)
    return frame, audio.duration_seconds, audio, page_tts_conf["srt"]
112
+
113
def pdf_to_video(pdf_file, tts_file, progress=gr.Progress()):
    """Turn a PDF plus its book_tts.json into a narrated, subtitled video.

    pdf_file / tts_file are gradio uploads (.name gives the local path).
    Returns ("output.mp4", "merged_subs.srt") for the gradio outputs.
    """
    pdf_path = pdf_file.name
    tts_path = tts_file.name
    pdf = fitz.open(pdf_path)
    with open(tts_path, "r", encoding="utf-8") as f:
        tts_conf = json.load(f)
    # First pass: find the largest rendered page so every frame can be
    # padded to one common size (ImageSequenceClip needs uniform frames).
    max_width, max_height = 0, 0
    for page in pdf:
        pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
        max_width = max(max_width, pix.width)
        max_height = max(max_height, pix.height)
    frames, durations, audio_clips, srt_files = [], [], [], []
    # Second pass: render pages and decode audio in parallel; executor.map
    # preserves page order, so results line up with page numbers.
    with ThreadPoolExecutor() as executor:
        results = list(progress.tqdm(executor.map(lambda x: process_page(x, pdf[x], tts_conf, max_width, max_height), range(len(pdf))), total=len(pdf), desc="Processing Pages & Audio", unit="page"))

    for frame, duration, audio, srt_file in results:
        frames.append(frame)
        durations.append(duration)
        audio_clips.append(audio)
        srt_files.append(srt_file)

    # NOTE(review): sum() starts from 0 + AudioSegment — relies on pydub's
    # operator support for that; confirm against the pinned pydub version.
    combined_audio = sum(audio_clips)
    combined_audio.export("merged_audio.mp3", format="mp3")

    # Each frame is shown for its page's audio duration; audio is muxed in
    # directly from the exported mp3.
    ImageSequenceClip(frames, durations=durations).write_videofile(
        "output.mp4",
        fps=1,
        codec="libx264",
        audio="merged_audio.mp3",  # Adds audio directly
        audio_codec="aac",
        audio_bitrate="128k",
        preset="slow",
        ffmpeg_params=["-crf", "20"],
        logger=CustomLogger(progress)  # forwards render progress to the UI
    )
    merge_srt_files(srt_files, durations).save("merged_subs.srt", encoding="utf-8")
    return "output.mp4", "merged_subs.srt"
150
+
151
# Display-name -> language-code mapping used by the translation tab.
with open("translate_language.json", "r", encoding="utf-8") as _f:
    TransLanguages = json.load(_f)

def translate_json(book_conf, source: str):
    """Translate each page of a book JSON to English and save translate.json.

    book_conf: uploaded gradio file whose .name points at the book JSON
    (a list of per-page metadata lists). source: display name of the
    source language (a key of TransLanguages). Returns the output path.
    """
    with open(book_conf.name, "r", encoding="utf-8") as fh:
        pages = json.load(fh)

    texts = []
    for con in pages:
        # Words join with spaces; any non-word item becomes a line break,
        # then each line is stripped and the whole page trimmed.
        raw = " ".join(i['content'] if i['type'] == 'word' else "\n" for i in con)
        texts.append("\n".join(line.strip() for line in raw.split("\n")).strip())

    translated = GoogleTranslator(source=TransLanguages[source], target='en').translate_batch(texts)
    # Explicit utf-8 + context manager: the original left the handle open and
    # used the platform default encoding with ensure_ascii=False output.
    with open("translate.json", "w", encoding="utf-8") as fh:
        json.dump([{"original": ot, "translate": tr} for ot, tr in zip(texts, translated)], fh, ensure_ascii=False)
    return "translate.json"
157
+
158
# Three-tab gradio app:
#   TTSMaker        — book JSON + voice -> per-page TTS JSON
#   VideoMaker      — PDF + TTS JSON   -> narrated video + SRT
#   TranslationMaker — book JSON + source language -> translated JSON
gr.TabbedInterface(
    [
        gr.Interface(
            book2tts,
            [
                gr.File(label="Upload Book JSON"),
                # Voice list comes from the bundled edge-tts voices dump.
                gr.Dropdown(choices=[voice["Name"] for voice in json.load(open("voices.json"))], label="Select Voice"),
            ],
            gr.File(label="Download TTS JSON"),
        ),
        gr.Interface(
            pdf_to_video,
            [gr.File(label="Upload PDF"), gr.File(label="Upload TTS JSON")],
            [gr.Video(label="Output Video"), gr.File(label="Subtitle File")],
        ),
        gr.Interface(
            translate_json,
            [gr.File(label="Upload Book Conf JSON"), gr.Dropdown(list(TransLanguages.keys()))],
            gr.File(label="Translated JSON"),
        ),
    ],
    ["TTSMaker", "VideoMaker", "TranslationMaker"],
).launch()