Delete src
Browse files- src/interface.py +0 -215
- src/processor.py +0 -120
src/interface.py
DELETED
|
@@ -1,215 +0,0 @@
|
|
| 1 |
-
import gradio as gr
|
| 2 |
-
from .processor import process_document
|
| 3 |
-
|
| 4 |
-
# Registry of output modes offered by the UI.
# Each entry carries:
#   description    - one-line text shown under the mode selector
#   styles         - style choices offered for that mode (first one is the default)
#   default_temp   - initial value of the temperature slider
#   default_chunks - initial value of the chunk-size slider
#   system_prompt  - system message sent with the first LLM pass
SYNTHESIS_MODES = {
    # Plain read-aloud of the document.
    "narration": {
        "description": "Simple document narration with clear voice and natural pacing",
        "styles": ["Technical", "Narrative", "Instructional", "Descriptive"],
        "default_temp": 0.7,
        "default_chunks": 300,
        "system_prompt": "Convert this content into clear narration.",
    },
    # Conversational rendition.
    "podcast": {
        "description": "Conversational style with engaging tone and dynamic pacing",
        "styles": ["Casual", "Interview", "Educational", "Commentary"],
        "default_temp": 0.8,
        "default_chunks": 400,
        "system_prompt": "Transform this content into engaging podcast-style speech.",
    },
    # Structured, slide-like delivery.
    "presentation": {
        "description": "Professional presentation style with clear structure",
        "styles": ["Business", "Academic", "Sales", "Training"],
        "default_temp": 0.6,
        "default_chunks": 250,
        "system_prompt": "Convert this content into a presentation format.",
    },
    # Story-driven rendition.
    "storytelling": {
        "description": "Narrative style with emotional engagement",
        "styles": ["Dynamic", "Dramatic", "Calm", "Energetic"],
        "default_temp": 0.9,
        "default_chunks": 500,
        "system_prompt": "Transform this content into an engaging story.",
    },
}
|
| 34 |
-
|
| 35 |
-
def create_interface():
    """Build the Gradio Blocks app for document-to-audio synthesis.

    Lays out the header, overview, settings, action row, output tabs and
    footer, then wires two events:

    * changing the mode radio refreshes the style choices, the chunk-size
      and temperature sliders, and the mode description;
    * clicking the generate button forwards the current settings to
      ``process_document`` and shows its (table, audio, status) result.

    Returns:
        The constructed ``gr.Blocks`` instance (not launched).
    """
    default_mode = SYNTHESIS_MODES["narration"]

    with gr.Blocks(theme=gr.themes.Base()) as demo:
        gr.HTML(
            """
            <div style="margin-bottom: 1rem;">
                <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png"
                     alt="Pixeltable" style="max-width: 150px;" />
                <h1>📄 Document to Audio Synthesis 🎧</h1>
            </div>
            """
        )

        # Overview Row
        with gr.Row():
            with gr.Column():
                with gr.Accordion("🎯 What does it do?", open=True):
                    gr.Markdown("""
                    - 📄 Document processing - 🧠 Content transformation
                    - 🎧 Audio synthesis - ⚙️ Multiple output styles
                    """)
            with gr.Column():
                with gr.Accordion("⚡ How does it work?", open=True):
                    gr.Markdown("""
                    1. 📑 **Processing:** Token-based segmentation
                    2. 🔍 **Analysis:** LLM optimization & scripts
                    3. 🎵 **Synthesis:** Multiple voice options
                    """)

        # Main Settings Row
        with gr.Row():
            # Core Settings Column
            with gr.Column():
                with gr.Accordion("🔑 Core Settings", open=True):
                    with gr.Row():
                        api_key = gr.Textbox(
                            label="OpenAI API Key",
                            placeholder="sk-...",
                            type="password",
                            scale=2
                        )
                        file_input = gr.File(
                            label="PDF Document",
                            file_types=[".pdf"],
                            scale=1
                        )

            # Mode Selection Column
            with gr.Column():
                with gr.Accordion("🎭 Output Mode", open=True):
                    mode_select = gr.Radio(
                        choices=list(SYNTHESIS_MODES.keys()),
                        value="narration",
                        label="Select Mode",
                        info="Choose output style"
                    )
                    mode_description = gr.Markdown(
                        default_mode["description"]
                    )

        # Voice and Processing Settings Row
        with gr.Row():
            # Voice Settings Column
            with gr.Column():
                with gr.Accordion("🎛️ Voice & Style", open=True):
                    voice_select = gr.Radio(
                        choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
                        value="onyx",
                        label="🎙️ Voice",
                        interactive=True
                    )
                    style_select = gr.Radio(
                        choices=default_mode["styles"],
                        value=default_mode["styles"][0],
                        label="💫 Style",
                        interactive=True
                    )

            # Processing Settings Column
            with gr.Column():
                with gr.Accordion("⚙️ Processing Parameters", open=True):
                    with gr.Row():
                        chunk_size = gr.Slider(
                            minimum=100, maximum=1000,
                            value=default_mode["default_chunks"],
                            step=50,
                            label="📏 Chunk Size"
                        )
                        temperature = gr.Slider(
                            minimum=0, maximum=1,
                            value=default_mode["default_temp"],
                            step=0.1,
                            label="🌡️ Temperature"
                        )
                        max_tokens = gr.Slider(
                            minimum=100, maximum=1000,
                            value=300,
                            step=50,
                            label="📊 Tokens"
                        )

        # Process Button Row
        with gr.Row():
            process_btn = gr.Button("🚀 Generate Audio", variant="primary", scale=2)
            status_output = gr.Textbox(label="📋 Status", scale=1)

        # Output Section
        with gr.Tabs():
            with gr.TabItem("📝 Content"):
                output_table = gr.Dataframe(
                    headers=["🔍 Segment", "📄 Content", "🎭 Script"],
                    wrap=True
                )
            with gr.TabItem("🎧 Audio"):
                with gr.Row():
                    with gr.Column(scale=2):
                        audio_output = gr.Audio(
                            label="🔊 Synthesized Audio",
                            type="filepath",
                            show_download_button=True
                        )
                    with gr.Column(scale=1):
                        with gr.Accordion("📚 Quick Tips", open=True):
                            gr.Markdown("""
                            - 🎯 Lower temperature = more consistent
                            - 📏 Smaller chunks = more precise
                            - 🎙️ Try different voices for best fit
                            - 💫 Match style to content type
                            """)

        gr.HTML(
            """
            <div style="text-align: center; margin-top: 1rem; padding-top: 1rem; border-top: 1px solid #ccc;">
                <p style="margin: 0; color: #666; font-size: 0.8em;">
                    🚀 Powered by <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none;">Pixeltable</a>
                    | 📚 <a href="https://docs.pixeltable.io" target="_blank" style="color: #666;">Docs</a>
                    | 🤗 <a href="https://huggingface.co/spaces/Pixeltable/document-to-audio-synthesis" target="_blank" style="color: #666;">HF Space</a>
                </p>
            </div>
            """
        )

        def update_mode(mode_name):
            """Return UI updates (styles, chunk size, temperature, description) for a mode."""
            mode = SYNTHESIS_MODES[mode_name]
            return (
                gr.update(choices=mode["styles"], value=mode["styles"][0]),
                gr.update(value=mode["default_chunks"]),
                gr.update(value=mode["default_temp"]),
                mode["description"]
            )

        mode_select.change(
            update_mode,
            inputs=[mode_select],
            outputs=[style_select, chunk_size, temperature, mode_description]
        )

        def update_interface(pdf_file, api_key, mode_name, voice, style,
                             chunk_size, temperature, max_tokens,
                             progress=gr.Progress()):
            """Run the pipeline with the selected mode's system prompt.

            The ``gr.Progress()`` default lives on this handler because it
            is the function registered with ``click``; the tracker is then
            handed to ``process_document`` so its ``progress(...)`` calls
            report against this event.
            """
            mode = SYNTHESIS_MODES[mode_name]
            return process_document(
                pdf_file=pdf_file,
                api_key=api_key,
                voice_choice=voice,
                style_choice=style,
                chunk_size=chunk_size,
                temperature=temperature,
                max_tokens=max_tokens,
                system_prompt=mode["system_prompt"],
                progress=progress
            )

        process_btn.click(
            update_interface,
            inputs=[
                file_input, api_key, mode_select, voice_select, style_select,
                chunk_size, temperature, max_tokens
            ],
            outputs=[output_table, audio_output, status_output]
        )

    return demo
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/processor.py
DELETED
|
@@ -1,120 +0,0 @@
|
|
| 1 |
-
import pixeltable as pxt
|
| 2 |
-
from pixeltable.iterators import DocumentSplitter
|
| 3 |
-
from pixeltable.functions import openai
|
| 4 |
-
import os
|
| 5 |
-
import requests
|
| 6 |
-
import tempfile
|
| 7 |
-
import gradio as gr
|
| 8 |
-
|
| 9 |
-
def process_document(pdf_file, api_key, voice_choice, style_choice, chunk_size, temperature, max_tokens, system_prompt, progress=gr.Progress()):
    """Convert a PDF into rewritten text, an audio script and synthesized speech.

    Recreates the ``document_audio`` Pixeltable directory on every call,
    stores the document, splits it into token-limited chunks, runs two
    chat-completion passes per chunk (content rewrite, then audio script)
    and posts each script to the OpenAI speech endpoint.

    Args:
        pdf_file: Uploaded file; either an object exposing the path as
            ``.name`` or a plain path string.
        api_key: OpenAI API key used for both chat and speech requests.
        voice_choice: Voice name sent to the speech endpoint.
        style_choice: Style label inserted into the script-generation prompt.
        chunk_size: Token limit per document chunk.
        temperature: Sampling temperature for both chat passes.
        max_tokens: Token cap for both chat passes.
        system_prompt: System message for the first (content) pass.
        progress: Callable ``progress(fraction, desc=...)`` used for status.

    Returns:
        ``(rows, audio_path, status)`` where ``rows`` is a list of
        ``[segment label, content, script]`` lists. On any failure the
        first two items are ``None`` and ``status`` starts with ``"Error: "``.
    """
    # Fail fast with a readable status instead of a cryptic exception text.
    if pdf_file is None:
        return None, None, "Error: Please upload a PDF document"
    if not api_key:
        return None, None, "Error: Please provide an OpenAI API key"

    try:
        # NOTE(review): this sets the key for the whole process, so
        # concurrent users would share whichever key was written last.
        os.environ['OPENAI_API_KEY'] = api_key

        # Defensive: callers such as UI sliders may hand over 300.0
        # rather than 300; both limits are integer token counts.
        chunk_size = int(chunk_size)
        max_tokens = int(max_tokens)

        # Accept either a file object with ``.name`` or a bare path.
        document_path = getattr(pdf_file, 'name', pdf_file)

        progress(0.1, desc="Initializing...")
        # Start from a clean directory so earlier runs do not leak in.
        pxt.drop_dir('document_audio', force=True)
        pxt.create_dir('document_audio')

        docs = pxt.create_table(
            'document_audio.documents',
            {
                'document': pxt.Document,
                'voice': pxt.String,
                'style': pxt.String,
                'mode_prompt': pxt.String
            }
        )

        progress(0.2, desc="Processing document...")
        docs.insert([{
            'document': document_path,
            'voice': voice_choice,
            'style': style_choice,
            'mode_prompt': system_prompt
        }])

        chunks = pxt.create_view(
            'document_audio.chunks',
            docs,
            iterator=DocumentSplitter.create(
                document=docs.document,
                separators='token_limit',
                limit=chunk_size
            )
        )

        progress(0.4, desc="Text processing...")
        chunks['content_response'] = openai.chat_completions(
            messages=[
                {
                    'role': 'system',
                    'content': docs.mode_prompt  # Use the mode-specific prompt
                },
                {'role': 'user', 'content': chunks.text}
            ],
            model='gpt-4o-mini-2024-07-18',
            max_tokens=max_tokens,
            temperature=temperature
        )

        chunks['content'] = chunks.content_response['choices'][0]['message']['content']

        progress(0.6, desc="Script generation...")
        # The f-string is evaluated right here, once, so it must use the
        # plain Python value. Formatting ``docs.style`` would embed the
        # column reference object's text instead of the chosen style.
        script_prompt = (
            "Convert content to audio script.\n"
            f"Style: {style_choice}\n"
            "Format:\n"
            "- Clear sentence structures\n"
            "- Natural pauses (...)\n"
            "- Term definitions when needed\n"
            "- Proper transitions"
        )
        chunks['script_response'] = openai.chat_completions(
            messages=[
                {'role': 'system', 'content': script_prompt},
                {'role': 'user', 'content': chunks.content}
            ],
            model='gpt-4o-mini-2024-07-18',
            max_tokens=max_tokens,
            temperature=temperature
        )
        chunks['script'] = chunks.script_response['choices'][0]['message']['content']

        progress(0.8, desc="Audio synthesis...")

        @pxt.udf(return_type=pxt.Audio)
        def generate_audio(script: str, voice: str):
            """Synthesize one script to an .mp3 temp file; None on any failure."""
            if not script or not voice:
                return None
            try:
                response = requests.post(
                    "https://api.openai.com/v1/audio/speech",
                    headers={"Authorization": f"Bearer {api_key}"},
                    json={"model": "tts-1", "input": script, "voice": voice},
                    timeout=120  # never hang forever on a stalled connection
                )
                if response.status_code != 200:
                    # Surface the failure instead of silently yielding None.
                    print(f"Error in audio synthesis: HTTP {response.status_code}")
                    return None
                # delete=False: the path must outlive this function.
                with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_file:
                    temp_file.write(response.content)
                return temp_file.name
            except Exception as e:
                print(f"Error in audio synthesis: {e}")
                return None

        chunks['audio'] = generate_audio(chunks.script, docs.voice)

        # NOTE(review): only one row's audio (the one selected by tail(1))
        # is returned, so a multi-chunk document yields a single segment
        # of speech - confirm whether that is intended.
        audio_path = chunks.select(chunks.audio).tail(1)['audio'][0]

        results = chunks.select(
            chunks.content,
            chunks.script
        ).collect()

        display_data = [
            [f"Segment {number}", row['content'], row['script']]
            for number, row in enumerate(results, start=1)
        ]

        progress(1.0, desc="Complete")
        return display_data, audio_path, "Processing complete"

    except Exception as e:
        # UI boundary: report the failure in the status slot rather than raising.
        return None, None, f"Error: {e}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|