Sayiqa7 committed on
Commit
d66e6ff
·
verified ·
1 Parent(s): 6683fa5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -4
app.py CHANGED
@@ -1,10 +1,12 @@
1
  import subprocess
 
2
  # Install required libraries
3
  subprocess.check_call(["pip", "install", "torch>=1.11.0"])
4
  subprocess.check_call(["pip", "install", "transformers>=4.31.0"])
5
  subprocess.check_call(["pip", "install", "diffusers>=0.14.0"])
6
  subprocess.check_call(["pip", "install", "librosa"])
7
  subprocess.check_call(["pip", "install", "accelerate>=0.20.1"])
 
8
 
9
  import os
10
  import threading
@@ -44,7 +46,11 @@ else:
44
  raise ValueError("HF_TOKEN environment variable not set.")
45
 
46
  # Load speech-to-text model (Whisper)
47
- speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
 
 
 
 
48
 
49
  # Load Stable Diffusion model for text-to-image
50
  text_to_image = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
@@ -62,7 +68,7 @@ def preprocess_audio(audio_path):
62
  except Exception as e:
63
  return f"Error in preprocessing audio: {str(e)}"
64
 
65
- # Speech-to-text function
66
  @lru_cache(maxsize=10)
67
  def transcribe_audio(audio_path):
68
  try:
@@ -70,7 +76,9 @@ def transcribe_audio(audio_path):
70
  if isinstance(audio_array, str): # Error message from preprocessing
71
  return audio_array
72
  result = speech_to_text(audio_array)
73
- return result["text"]
 
 
74
  except Exception as e:
75
  return f"Error in transcription: {str(e)}"
76
 
@@ -142,4 +150,4 @@ iface = gr.TabbedInterface(
142
  )
143
 
144
  # Launch Gradio interface
145
- iface.launch(debug=True, share=True)
 
1
  import subprocess
2
+
3
  # Install required libraries
4
  subprocess.check_call(["pip", "install", "torch>=1.11.0"])
5
  subprocess.check_call(["pip", "install", "transformers>=4.31.0"])
6
  subprocess.check_call(["pip", "install", "diffusers>=0.14.0"])
7
  subprocess.check_call(["pip", "install", "librosa"])
8
  subprocess.check_call(["pip", "install", "accelerate>=0.20.1"])
9
+ subprocess.check_call(["pip", "install", "gradio>=3.35.2"])
10
 
11
  import os
12
  import threading
 
46
  raise ValueError("HF_TOKEN environment variable not set.")
47
 
48
  # Load speech-to-text model (Whisper)
49
+ speech_to_text = pipeline(
50
+ "automatic-speech-recognition",
51
+ model="openai/whisper-tiny",
52
+ return_timestamps=True
53
+ )
54
 
55
  # Load Stable Diffusion model for text-to-image
56
  text_to_image = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
 
68
  except Exception as e:
69
  return f"Error in preprocessing audio: {str(e)}"
70
 
71
+ # Speech-to-text function with long-form transcription support
72
  @lru_cache(maxsize=10)
73
  def transcribe_audio(audio_path):
74
  try:
 
76
  if isinstance(audio_array, str): # Error message from preprocessing
77
  return audio_array
78
  result = speech_to_text(audio_array)
79
+ # Combine text from multiple segments for long-form transcription
80
+ transcription = " ".join(segment["text"] for segment in result["chunks"])
81
+ return transcription
82
  except Exception as e:
83
  return f"Error in transcription: {str(e)}"
84
 
 
150
  )
151
 
152
  # Launch Gradio interface
153
+ iface.launch(debug=True, share=True)