humanvprojectceo committed on
Commit
656e44c
·
verified ·
1 Parent(s): 2a67362

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -118
app.py CHANGED
@@ -1,122 +1,29 @@
1
- import os
2
- import io
3
- import asyncio
4
- import numpy as np
5
- import soundfile as sf
6
  import gradio as gr
7
- from google import genai
8
- from google.genai import types
9
-
10
- MODEL = "models/gemini-2.5-flash-native-audio-preview-12-2025"
11
-
12
- client = genai.Client(
13
- http_options={"api_version": "v1beta"},
14
- api_key=os.getenv("GEMINI_API_KEY"),
15
- )
16
-
17
- CONFIG = types.LiveConnectConfig(
18
- response_modalities=["AUDIO"]
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  )
20
 
21
- # ------------------------
22
- # Audio preprocessing
23
- # ------------------------
24
- def load_audio_as_pcm16(path):
25
- y, sr = sf.read(path)
26
-
27
- # تبدیل به mono
28
- if len(y.shape) > 1:
29
- y = y.mean(axis=1)
30
-
31
- # resample به 16k
32
- if sr != 16000:
33
- import resampy
34
- y = resampy.resample(y, sr, 16000)
35
-
36
- # float → int16
37
- pcm16 = (y * 32767).astype(np.int16)
38
- return pcm16.tobytes()
39
-
40
-
41
- # ------------------------
42
- # Gemini interaction
43
- # ------------------------
44
- async def send_audio_file(file_path):
45
- audio_bytes = load_audio_as_pcm16(file_path)
46
-
47
- async with client.aio.live.connect(model=MODEL, config=CONFIG) as session:
48
-
49
- await session.send(
50
- input={
51
- "data": audio_bytes,
52
- "mime_type": "audio/pcm"
53
- },
54
- end_of_turn=True
55
- )
56
-
57
- audio_chunks = []
58
-
59
- turn = session.receive()
60
- async for response in turn:
61
- if response.data:
62
- audio_chunks.append(response.data)
63
-
64
- full_audio = b"".join(audio_chunks)
65
-
66
- # تبدیل خروجی مدل به numpy
67
- buf = io.BytesIO(full_audio)
68
- y, sr = sf.read(
69
- buf,
70
- channels=1,
71
- samplerate=24000,
72
- format="RAW",
73
- subtype="PCM_16",
74
- dtype="float32"
75
- )
76
-
77
- return sr, y
78
-
79
-
80
- # ------------------------
81
- # Gradio function
82
- # ------------------------
83
- def process_audio(file):
84
- if file is None:
85
- return None, "Please upload an audio file."
86
-
87
- try:
88
- sr, audio_data = asyncio.run(send_audio_file(file))
89
- return (sr, audio_data), "Response generated successfully!"
90
- except Exception as e:
91
- return None, f"Error: {str(e)}"
92
-
93
-
94
- # ------------------------
95
- # Gradio UI
96
- # ------------------------
97
- with gr.Blocks() as demo:
98
- gr.Markdown("# Gemini Audio → Audio")
99
- gr.Markdown("Upload audio → Gemini responds with audio")
100
-
101
- input_audio = gr.Audio(
102
- label="Upload audio",
103
- type="filepath"
104
- )
105
-
106
- output_audio = gr.Audio(
107
- label="Gemini spoken response",
108
- type="numpy",
109
- autoplay=True
110
- )
111
-
112
- status = gr.Textbox(label="Status")
113
-
114
- btn = gr.Button("Send Audio")
115
-
116
- btn.click(
117
- fn=process_audio,
118
- inputs=input_audio,
119
- outputs=[output_audio, status]
120
- )
121
-
122
  demo.launch()
 
 
 
 
 
 
1
  import gradio as gr
2
+ from faster_whisper import WhisperModel
3
+
4
# Model choice: 'base' or 'small' work well on CPU.
# 'int8' makes the model run much faster and lighter on CPU.
model_size = "small"
model = WhisperModel(
    model_size,
    device="cpu",
    compute_type="int8",
)
8
+
9
def transcribe_audio(audio):
    """Transcribe an audio file to text using the module-level Whisper model.

    Args:
        audio: Path to the audio file to transcribe, as delivered by the
            Gradio audio input (``type="filepath"``), or ``None`` when no
            audio was provided.

    Returns:
        The text of every transcribed segment, each followed by a single
        space, or an empty string when ``audio`` is ``None``.
    """
    # Nothing was uploaded: skip the model call instead of letting it fail
    # on a missing path.
    if audio is None:
        return ""

    # Run speech-to-text; beam_size=5 improves accuracy.
    segments, _info = model.transcribe(audio, beam_size=5)

    # Join in one pass rather than growing a string with +=; the trailing
    # space after each segment is kept so the output format is unchanged.
    return "".join(segment.text + " " for segment in segments)
19
+
20
# Build the user interface: one audio upload in, plain text out.
demo = gr.Interface(
    transcribe_audio,  # fn
    gr.Audio(type="filepath"),  # inputs
    "text",  # outputs
    title="تبدیل رایگان صدا به متن (Whisper CPU)",
    description="فایل صوتی خود را آپلود کنید تا با دقت بالا به متن تبدیل شود. پشتیبانی از تمامی زبان‌ها از جمله فارسی.",
)
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  demo.launch()