# Page metadata (not part of the program): Spaces: Running | File size: 7,289 Bytes | 92ddce4
import os
import base64
import tempfile
import requests
from datetime import datetime
import gradio as gr
from dotenv import load_dotenv
from openai import AzureOpenAI # official OpenAI SDK, works with Azure endpoints
import json
import subprocess # to execute youtube-dl version
import Youtubetranscription_summarizer
# --- LLM call (Azure OpenAI with API key) -----------------------------------
def summarize_audio_b64(
    audio_b64: str,
    sys_prompt: str,
    user_prompt: str,
    audio_format: str = "wav",
) -> str:
    """Send base64-encoded audio to an Azure OpenAI chat deployment.

    Connection settings are read from the environment after ``load_dotenv()``:
    ``AC_OPENAI_ENDPOINT``, ``AC_OPENAI_API_KEY``, ``AC_MODEL_DEPLOYMENT`` and
    ``AC_OPENAI_API_VERSION``.

    Args:
        audio_b64: Base64 text of the audio payload.
        sys_prompt: System message; a built-in default is used when empty.
        user_prompt: Text part of the user message; a built-in default is
            used when empty.
        audio_format: Value sent as the ``input_audio`` format. Defaults to
            ``"wav"``, the value that was previously hard-coded here.

    Returns:
        The content of the first completion choice, or a human-readable
        error string when configuration is missing or the API call fails.
        This function does not raise for API errors; they are returned as
        text so the caller can display them.
    """
    load_dotenv()
    endpoint = os.getenv("AC_OPENAI_ENDPOINT")
    api_key = os.getenv("AC_OPENAI_API_KEY")
    deployment = os.getenv("AC_MODEL_DEPLOYMENT")
    api_version = os.getenv("AC_OPENAI_API_VERSION")
    if not endpoint or not api_key or not deployment:
        return "Server misconfiguration: required env vars missing."
    try:
        client = AzureOpenAI(
            api_key=api_key,
            api_version=api_version,
            azure_endpoint=endpoint,
        )
        system_message = sys_prompt.strip() if sys_prompt else (
            "You are an AI assistant with a charter to clearly analyze the customer enquiry."
        )
        user_text = user_prompt.strip() if user_prompt else "Summarize the audio content."
        response = client.chat.completions.create(
            model=deployment,
            messages=[
                {"role": "system", "content": system_message},
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": user_text},
                        {
                            "type": "input_audio",
                            "input_audio": {"data": audio_b64, "format": audio_format},
                        },
                    ],
                },
            ],
        )
        # ``user_prompt`` may be None/empty (a default is substituted above), so
        # guard the length computation: a logging failure must not discard a
        # successful API response.
        print(f"Azure API call at {datetime.now()}: prompt_length={len(user_prompt or '')}, audio_size={len(audio_b64)}")
        return response.choices[0].message.content
    except Exception as ex:
        # Boundary handler: log the failure AND return it. ``print`` returns
        # None, so returning its result would hand the caller no message.
        error_message = f"Error from Azure OpenAI: {ex}"
        print(error_message)
        return error_message
#----Retrieve meta data from metadata.json file------------------------------
def retrieve_file_path(file_name):
    """Resolve *file_name* relative to the directory containing this module.

    Returns the joined path when it names an existing regular file, otherwise
    ``None``. A message is printed only when nothing exists at that path.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    candidate = os.path.join(module_dir, file_name)

    # Guard clause: anything that is not a regular file yields None.
    if not os.path.isfile(candidate):
        if not os.path.exists(candidate):
            print(f"'{candidate}' does not exist.")
        return None

    return candidate
def retrieve_json_record(file_path, record_id):
    """Return the record in a JSON file whose ``metadata.id`` equals *record_id*.

    The file may contain either a single record (a JSON object) or a list of
    records. Entries that are not objects, or whose ``metadata`` value is not
    an object, are skipped instead of raising.

    Args:
        file_path: Path of the JSON file. A falsy value (for example ``None``)
            yields ``None`` without touching the filesystem.
        record_id: Value compared against each record's ``metadata['id']``.

    Returns:
        The first matching record (a dict), or ``None`` when nothing matches.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    if not file_path:
        return None

    # Decode explicitly as UTF-8 rather than the platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Treat a single top-level value like a one-element list of candidates.
    candidates = data if isinstance(data, list) else [data]
    for record in candidates:
        if not isinstance(record, dict):
            continue
        metadata = record.get('metadata', {})
        if isinstance(metadata, dict) and metadata.get('id') == record_id:
            return record
    return None
# --- I/O helpers ------------------------------------------------------------
def encode_audio_from_path(path: str) -> str:
    """Read the file at *path* in binary mode and return its base64 text."""
    with open(path, "rb") as audio_file:
        raw_bytes = audio_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
def download_to_temp_mp3(url: str) -> str:
    """Download *url* into a new temporary ``.mp3`` file and return its path.

    The temporary file is created with ``delete=False``, so the caller is
    responsible for removing it once it is no longer needed.

    Args:
        url: Address fetched with an HTTP GET (30 second timeout).

    Returns:
        Path of the temporary file holding the downloaded bytes.

    Raises:
        requests.RequestException: If the request fails or the server answers
            with an error status (via ``raise_for_status``).
    """
    # Using the response as a context manager releases the streamed
    # connection even when the download is interrupted.
    with requests.get(url, stream=True, timeout=30) as response:
        response.raise_for_status()
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        try:
            with tmp:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        tmp.write(chunk)
        except BaseException:
            # Do not leave a partially written file behind; the original
            # error is re-raised unchanged.
            try:
                os.remove(tmp.name)
            except OSError:
                pass
            raise
        return tmp.name
def process_audio(upload_path, record_path, url, sys_prompt, user_prompt):
    """Pick one audio source, base64-encode it and request a summary.

    Sources are tried in priority order: uploaded file, recorded file, then
    URL. For a URL, the path returned by
    ``Youtubetranscription_summarizer.main`` is used and that file is deleted
    afterwards (presumably a locally downloaded audio file; confirm against
    that module).

    Args:
        upload_path: File path of an uploaded audio file, or a falsy value.
        record_path: File path of a recorded audio file, or a falsy value.
        url: URL text; ignored when empty or whitespace only.
        sys_prompt: System prompt forwarded to ``summarize_audio_b64``.
        user_prompt: User prompt forwarded to ``summarize_audio_b64``.

    Returns:
        The summary text, a prompt asking for input when no source was given,
        or an error message string when processing fails. Errors are returned
        as text (not raised) so the UI can display them.
    """
    tmp_to_cleanup = []
    audio_path = None
    try:
        if upload_path:
            audio_path = upload_path
        elif record_path:
            audio_path = record_path
        elif url and url.strip():
            audio_path = Youtubetranscription_summarizer.main(url.strip())
            if audio_path:
                # Only queue real results for deletion.
                tmp_to_cleanup.append(audio_path)
        if not audio_path:
            return "Please provide an audio file via upload, recording, or URL."
        audio_b64 = encode_audio_from_path(audio_path)
        return summarize_audio_b64(audio_b64, sys_prompt, user_prompt)
    except Exception as e:
        # Log the details, then return a message: ``print`` returns None, so
        # returning its result would leave the output box empty.
        print(f"Error processing audio at {datetime.now()}: prompt_length={len(user_prompt or '')}, audio_path={audio_path}: {str(e)}")
        return f"Error processing audio: {e}"
    finally:
        for p in tmp_to_cleanup:
            try:
                if os.path.exists(p):
                    os.remove(p)
            except Exception as cleanup_error:
                # Cleanup stays best-effort, but the failure is now visible.
                print(f"Could not remove temporary file {p}: {cleanup_error}")
# --- UI ---------------------------------------------------------------------
# NOTE(review): the original indentation of this section was lost; the nesting
# below is a reconstruction and should be confirmed against the intended layout.
with gr.Blocks(title="Audio Summarizer") as demo:
    gr.Markdown("# Audio File Summarizer (Azure OpenAI)")
    gr.Markdown("Upload a mp3, record audio, or paste a URL. The app sends base64 audio to Azure OpenAI.")

    # Three alternative audio sources, side by side.
    with gr.Row():
        with gr.Column():
            upload_audio = gr.Audio(sources=["upload"], type="filepath", label="Upload mp3")
        with gr.Column():
            record_audio = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
        with gr.Column():
            url_input = gr.Textbox(label="mp3 URL", placeholder="https://example.com/audio.mp3")

    # Get the default system and user prompts from metadata.json. Built-in
    # defaults are used when the file, the record, or the prompt fields are
    # missing, so the app still starts instead of failing at import time.
    file_name = 'metadata.json'
    record_id = '1'
    sysprompt_default = "You are an AI assistant with a charter to clearly analyze the customer enquiry."
    userprompt_default = "Summarize the audio content"
    file_path = retrieve_file_path(file_name)
    jsonrecord = None
    if file_path:
        try:
            jsonrecord = retrieve_json_record(file_path, record_id)
        except (OSError, ValueError) as metadata_error:
            # json.JSONDecodeError is a ValueError subclass.
            print(f"Could not read '{file_path}': {metadata_error}")
    if jsonrecord:
        print(json.dumps(jsonrecord, indent=2))
        try:
            prompt_content = jsonrecord['metadata']['content']
            sysprompt_default = prompt_content['system_prompt']['content']
            userprompt_default = prompt_content['user_prompt']['content']
        except (KeyError, TypeError):
            print("Record is missing prompt fields; using built-in default prompts.")
    else:
        print("Record not found.")

    with gr.Row():
        userprompt_input = gr.Textbox(
            label="User Prompt",
            value=userprompt_default,
            placeholder="e.g., Extract key points and action items",
        )
        sysprompt_input = gr.Textbox(
            label="System Prompt",
            value=sysprompt_default,
        )

    submit_btn = gr.Button("Summarize")
    output = gr.Textbox(label="Summary", lines=12)

    # Log input changes. The components are always created above, so the
    # listeners are registered unconditionally.
    upload_audio.change(
        fn=lambda x: print(f"Upload audio selected: {x}"),
        inputs=[upload_audio],
        outputs=[],
    )
    record_audio.change(
        fn=lambda x: print(f"Record audio selected: {x}"),
        inputs=[record_audio],
        outputs=[],
    )
    url_input.change(
        fn=lambda x: print(f"URL input changed: {x}"),
        inputs=[url_input],
        outputs=[],
    )

    # Input order matches process_audio(upload_path, record_path, url,
    # sys_prompt, user_prompt).
    submit_btn.click(
        fn=process_audio,
        inputs=[upload_audio, record_audio, url_input, sysprompt_input, userprompt_input],
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()
|