adelevett committed on
Commit
6f37f73
·
verified ·
1 Parent(s): dd5dddc

Upload 4 files

Browse files
Files changed (3) hide show
  1. README.md +2 -2
  2. app.py +23 -7
  3. requirements.txt +2 -1
README.md CHANGED
@@ -4,10 +4,10 @@ emoji: 🎴
4
  colorFrom: blue
5
  colorTo: purple
6
  sdk: gradio
7
- sdk_version: 6.5.1
8
  app_file: app.py
9
  pinned: false
10
- python_version: '3.10'
11
  ---
12
 
13
  # Flashcard2Audio
 
4
  colorFrom: blue
5
  colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 4.0.0
8
  app_file: app.py
9
  pinned: false
10
+ python_version: "3.10"
11
  ---
12
 
13
  # Flashcard2Audio
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import gradio as gr
2
  import pandas as pd
3
  import genanki
4
- import pocket_tts
5
  import tempfile
6
  import os
7
  import shutil
@@ -11,6 +11,8 @@ import sqlite3
11
  import re
12
  import time
13
  import json
 
 
14
  from pathlib import Path
15
  from concurrent.futures import ThreadPoolExecutor, as_completed
16
  from pydub import AudioSegment
@@ -41,11 +43,19 @@ def has_existing_audio(text):
41
 
42
  print("Loading TTS Model...")
43
  try:
44
- TTS_MODEL = pocket_tts.load_model()
45
  print("Model Loaded Successfully.")
46
  except Exception as e:
47
  print(f"CRITICAL ERROR loading model: {e}")
48
  TTS_MODEL = None
 
 
 
 
 
 
 
 
49
 
50
  def wav_to_mp3(src_wav, dst_mp3):
51
  AudioSegment.from_wav(src_wav).export(dst_mp3, format="mp3", bitrate="64k")
@@ -70,8 +80,11 @@ def generate_audio_for_row(q_text, a_text, idx, tmpdir, mode):
70
  q_wav = os.path.join(tmpdir, f"q_{idx}.wav")
71
  try:
72
  clean = clean_text_for_tts(q_text)
73
- if clean and TTS_MODEL:
74
- pocket_tts.generate_to_file(TTS_MODEL, clean, q_wav)
 
 
 
75
  q_out = q_wav
76
  else:
77
  AudioSegment.silent(duration=500).export(q_wav, format="wav")
@@ -89,8 +102,11 @@ def generate_audio_for_row(q_text, a_text, idx, tmpdir, mode):
89
  a_wav = os.path.join(tmpdir, f"a_{idx}.wav")
90
  try:
91
  clean = clean_text_for_tts(a_text)
92
- if clean and TTS_MODEL:
93
- pocket_tts.generate_to_file(TTS_MODEL, clean, a_wav)
 
 
 
94
  a_out = a_wav
95
  else:
96
  AudioSegment.silent(duration=500).export(a_wav, format="wav")
@@ -344,7 +360,7 @@ with gr.Blocks(title="Pocket TTS Anki") as app:
344
  label="Generation Mode"
345
  )
346
 
347
- preview_table = gr.Dataframe(label="Preview (First 100)", interactive=False, height=300)
348
 
349
  with gr.Row():
350
  btn = gr.Button("🚀 Generate Deck", variant="primary")
 
1
  import gradio as gr
2
  import pandas as pd
3
  import genanki
4
+ from pocket_tts import TTSModel
5
  import tempfile
6
  import os
7
  import shutil
 
11
  import re
12
  import time
13
  import json
14
+ import torch
15
+ import scipy.io.wavfile
16
  from pathlib import Path
17
  from concurrent.futures import ThreadPoolExecutor, as_completed
18
  from pydub import AudioSegment
 
43
 
44
  print("Loading TTS Model...")
45
  try:
46
+ TTS_MODEL = TTSModel.load_model()
47
  print("Model Loaded Successfully.")
48
  except Exception as e:
49
  print(f"CRITICAL ERROR loading model: {e}")
50
  TTS_MODEL = None
51
+
52
+ # Get default voice state
53
+ VOICE_STATE = None
54
+ if TTS_MODEL:
55
+ try:
56
+ VOICE_STATE = TTS_MODEL.get_state_for_audio_prompt("alba") # Default voice
57
+ except Exception as e:
58
+ print(f"Warning: Could not load default voice: {e}")
59
 
60
  def wav_to_mp3(src_wav, dst_mp3):
61
  AudioSegment.from_wav(src_wav).export(dst_mp3, format="mp3", bitrate="64k")
 
80
  q_wav = os.path.join(tmpdir, f"q_{idx}.wav")
81
  try:
82
  clean = clean_text_for_tts(q_text)
83
+ if clean and TTS_MODEL and VOICE_STATE:
84
+ # Generate audio using new API
85
+ audio_tensor = TTS_MODEL.generate_audio(VOICE_STATE, clean)
86
+ # Convert tensor to numpy and save as wav
87
+ scipy.io.wavfile.write(q_wav, TTS_MODEL.sample_rate, audio_tensor.numpy())
88
  q_out = q_wav
89
  else:
90
  AudioSegment.silent(duration=500).export(q_wav, format="wav")
 
102
  a_wav = os.path.join(tmpdir, f"a_{idx}.wav")
103
  try:
104
  clean = clean_text_for_tts(a_text)
105
+ if clean and TTS_MODEL and VOICE_STATE:
106
+ # Generate audio using new API
107
+ audio_tensor = TTS_MODEL.generate_audio(VOICE_STATE, clean)
108
+ # Convert tensor to numpy and save as wav
109
+ scipy.io.wavfile.write(a_wav, TTS_MODEL.sample_rate, audio_tensor.numpy())
110
  a_out = a_wav
111
  else:
112
  AudioSegment.silent(duration=500).export(a_wav, format="wav")
 
360
  label="Generation Mode"
361
  )
362
 
363
+ preview_table = gr.Dataframe(label="Preview (First 100)", interactive=False)
364
 
365
  with gr.Row():
366
  btn = gr.Button("🚀 Generate Deck", variant="primary")
requirements.txt CHANGED
@@ -4,9 +4,10 @@
4
 
5
  # Linux (HF Spaces) - use CPU builds from extra index
6
  torch>=2.5.0
7
- gradio>=4.0.0
8
  pandas
9
  genanki
10
  pydub
 
11
  # Pocket TTS is not on PyPI - must install from GitHub
12
  git+https://github.com/kyutai-labs/pocket-tts.git
 
4
 
5
  # Linux (HF Spaces) - use CPU builds from extra index
6
  torch>=2.5.0
7
+ gradio
8
  pandas
9
  genanki
10
  pydub
11
+ scipy
12
  # Pocket TTS is not on PyPI - must install from GitHub
13
  git+https://github.com/kyutai-labs/pocket-tts.git