# Source: Hugging Face Space samir72/AudioChatTranscriber (status at capture: Running)
# File size at capture: 7,289 bytes; commit 92ddce4

import os
import base64
import tempfile
import requests
from datetime import datetime
import gradio as gr
from dotenv import load_dotenv
from openai import AzureOpenAI  # official OpenAI SDK, works with Azure endpoints
import json
import subprocess # to execute youtube-dl version
import Youtubetranscription_summarizer

# --- LLM call (Azure OpenAI with API key) -----------------------------------

def summarize_audio_b64(
    audio_b64: str,
    sys_prompt: str,
    user_prompt: str,
    audio_format: str = "wav",
) -> str:
    """
    Call Azure OpenAI Chat Completions with base64-encoded audio input.

    Connection settings are read from the environment after loading any
    ``.env`` file: ``AC_OPENAI_ENDPOINT``, ``AC_OPENAI_API_KEY``,
    ``AC_MODEL_DEPLOYMENT`` and ``AC_OPENAI_API_VERSION``.

    Args:
        audio_b64: Audio file content, base64-encoded as text.
        sys_prompt: System message; a built-in default is used when empty/None.
        user_prompt: User instruction; a built-in default is used when
            empty/None.
        audio_format: Value sent as the ``input_audio`` format. Defaults to
            ``"wav"``, the value this function previously hard-coded.

    Returns:
        The model's reply text, or a human-readable error message when the
        configuration is incomplete or the API call fails.
    """
    load_dotenv()

    endpoint = os.getenv("AC_OPENAI_ENDPOINT")
    api_key = os.getenv("AC_OPENAI_API_KEY")
    deployment = os.getenv("AC_MODEL_DEPLOYMENT")
    api_version = os.getenv("AC_OPENAI_API_VERSION")

    if not endpoint or not api_key or not deployment:
        return "Server misconfiguration: required env vars missing."

    try:
        client = AzureOpenAI(
            api_key=api_key,
            api_version=api_version,
            azure_endpoint=endpoint,
        )

        system_message = sys_prompt.strip() if sys_prompt else (
            "You are an AI assistant with a charter to clearly analyze the customer enquiry."
        )
        user_text = user_prompt.strip() if user_prompt else "Summarize the audio content."

        response = client.chat.completions.create(
            model=deployment,
            messages=[
                {"role": "system", "content": system_message},
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": user_text},
                        {
                            "type": "input_audio",
                            "input_audio": {"data": audio_b64, "format": audio_format},
                        },
                    ],
                },
            ],
        )
        # `user_prompt` may be None/empty (the default text is used then), so
        # guard the length computation: a logging line must not turn a
        # successful API call into an error.
        print(f"Azure API call at {datetime.now()}: prompt_length={len(user_prompt or '')}, audio_size={len(audio_b64)}")
        return response.choices[0].message.content

    except Exception as ex:
        # Log the failure AND return it: `print()` returns None, so returning
        # its result would hand the UI an empty value instead of the error.
        error_message = f"Error from Azure OpenAI: {ex}"
        print(error_message)
        return error_message

#----Retrieve meta data from metadata.json file------------------------------
def retrieve_file_path(file_name):
    """
    Locate *file_name* in the directory that contains this module.

    Returns the absolute path when it points at a regular file. Otherwise
    returns None; a message is printed only when nothing exists at that path
    (an existing non-file entry, such as a directory, yields a silent None).
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(module_dir, file_name)

    # Guard clause: the happy path exits first.
    if os.path.isfile(file_path):
        return file_path

    if not os.path.exists(file_path):
        print(f"'{file_path}' does not exist.")
    return None

def retrieve_json_record(file_path, record_id):
    """
    Return the JSON record whose ``metadata.id`` equals *record_id*.

    The file may contain either a single record (a JSON object) or a list of
    records. Entries that are not objects, or whose ``metadata`` value is not
    an object, are skipped instead of raising ``AttributeError``.

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).
        record_id: Value compared against each record's ``metadata.id``.

    Returns:
        The first matching record (a dict), or None when nothing matches.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8 by specification; do not rely on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Treat a single top-level object as a one-element list of records.
    records = data if isinstance(data, list) else [data]
    for record in records:
        if not isinstance(record, dict):
            continue
        metadata = record.get('metadata', {})
        if isinstance(metadata, dict) and metadata.get('id') == record_id:
            return record
    return None
# --- I/O helpers ------------------------------------------------------------

def encode_audio_from_path(path: str) -> str:
    """Read the file at *path* in binary mode and return its content as base64 text."""
    with open(path, mode="rb") as audio_file:
        raw_bytes = audio_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return str(encoded_bytes, "utf-8")


def download_to_temp_mp3(url: str) -> str:
    """
    Stream the resource at *url* into a new temporary ``.mp3`` file.

    Args:
        url: HTTP(S) address to download.

    Returns:
        Path of the temporary file. The file is created with ``delete=False``,
        so the caller is responsible for removing it.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    # With stream=True the connection stays open until the body is consumed
    # or the response is closed; the context manager guarantees the latter.
    with requests.get(url, stream=True, timeout=30) as r:
        r.raise_for_status()
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        try:
            with tmp:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        tmp.write(chunk)
        except Exception:
            # Do not leave a partially written file behind when the download
            # fails midway; the caller never learns its name.
            if os.path.exists(tmp.name):
                os.remove(tmp.name)
            raise
        return tmp.name


def process_audio(upload_path, record_path, url, sys_prompt, user_prompt):
    """
    Pick one audio source, base64-encode it, and return the model's summary.

    Sources are tried in priority order: uploaded file, microphone recording,
    then URL. The URL is handed to ``Youtubetranscription_summarizer.main``,
    whose result is used as a local audio path and deleted afterwards
    (presumably it downloads the audio to a temporary file; verify against
    that module).

    Args:
        upload_path: Path of an uploaded audio file, or None/empty.
        record_path: Path of a recorded audio file, or None/empty.
        url: Audio URL, or None/blank.
        sys_prompt: System prompt forwarded to ``summarize_audio_b64``.
        user_prompt: User prompt forwarded to ``summarize_audio_b64``.

    Returns:
        The summary text, a hint asking for input when no source was given,
        or an error message string when processing fails.
    """
    tmp_to_cleanup = []
    # Bound before the `try` so the error handler can always reference it.
    audio_path = None
    try:
        if upload_path:
            audio_path = upload_path
        elif record_path:
            audio_path = record_path
        elif url and url.strip():
            audio_path = Youtubetranscription_summarizer.main(url.strip())
            # Only queue a real path for deletion; a falsy result means
            # nothing was produced.
            if audio_path:
                tmp_to_cleanup.append(audio_path)

        if not audio_path:
            return "Please provide an audio file via upload, recording, or URL."

        audio_b64 = encode_audio_from_path(audio_path)
        return summarize_audio_b64(audio_b64, sys_prompt, user_prompt)

    except Exception as e:
        # Log the failure AND return it: `print()` returns None, which would
        # leave the output box empty. `user_prompt` may be None, so guard
        # `len()` to keep the handler itself from raising.
        error_message = (
            f"Error processing audio at {datetime.now()}: "
            f"prompt_length={len(user_prompt or '')}, audio_path={audio_path}: {str(e)}"
        )
        print(error_message)
        return error_message

    finally:
        for p in tmp_to_cleanup:
            try:
                if os.path.exists(p):
                    os.remove(p)
            except Exception as cleanup_error:
                # Cleanup stays best-effort (it must not mask the result),
                # but the failure is reported instead of silently dropped.
                print(f"Could not remove temporary file {p}: {cleanup_error}")


# --- UI ---------------------------------------------------------------------

# Build the Gradio UI: three alternative audio inputs, two prompt boxes whose
# defaults come from metadata.json, and a button that runs `process_audio`.
with gr.Blocks(title="Audio Summarizer") as demo:
    gr.Markdown("# Audio File Summarizer (Azure OpenAI)")
    gr.Markdown("Upload a mp3, record audio, or paste a URL. The app sends base64 audio to Azure OpenAI.")

    with gr.Row():
        with gr.Column():
            upload_audio = gr.Audio(sources=["upload"], type="filepath", label="Upload mp3")
        with gr.Column():
            record_audio = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
        with gr.Column():
            url_input = gr.Textbox(label="mp3 URL", placeholder="https://example.com/audio.mp3")

    ### Get system and user prompts from metadata.json file
    file_name = 'metadata.json'
    record_id = '1'
    file_path = retrieve_file_path(file_name)

    # `retrieve_file_path` returns None when the file is missing; passing that
    # to `retrieve_json_record` would raise inside `open()` at import time.
    jsonrecord = retrieve_json_record(file_path, record_id) if file_path else None
    if jsonrecord:
        print(json.dumps(jsonrecord, indent=2))
    else:
        print("Record not found.")

    # Built-in prompts used when the metadata record is missing or does not
    # have the expected shape, so the app still starts.
    sysprompt_default = "You are an AI assistant with a charter to clearly analyze the customer enquiry."
    userprompt_default = "Summarize the audio content"
    try:
        prompt_content = jsonrecord['metadata']['content']
        # Tuple assignment: both values are read before either name is
        # rebound, so a partial record cannot leave a half-updated pair.
        sysprompt_default, userprompt_default = (
            prompt_content['system_prompt']['content'],
            prompt_content['user_prompt']['content'],
        )
    except (KeyError, TypeError):
        print("Prompt defaults not found in metadata; using built-in defaults.")

    with gr.Row():
        userprompt_input = gr.Textbox(
            label="User Prompt",
            value=userprompt_default,
            placeholder="e.g., Extract key points and action items",
        )
        sysprompt_input = gr.Textbox(
            label="System Prompt",
            value=sysprompt_default,
        )

    submit_btn = gr.Button("Summarize")
    output = gr.Textbox(label="Summary", lines=12)

    # Capture inputs for logging. The components always exist at this point,
    # so the listeners are registered unconditionally.
    # TODO: reset the other inputs on change to avoid confusion (not implemented).
    upload_audio.change(
        fn=lambda x: print(f"Upload audio selected: {x}"),
        inputs=[upload_audio],
        outputs=[],
    )
    record_audio.change(
        fn=lambda x: print(f"Record audio selected: {x}"),
        inputs=[record_audio],
        outputs=[],
    )
    url_input.change(
        fn=lambda x: print(f"URL input changed: {x}"),
        inputs=[url_input],
        outputs=[],
    )

    # Input order must match the parameter order of `process_audio`.
    submit_btn.click(
        fn=process_audio,
        inputs=[upload_audio, record_audio, url_input, sysprompt_input, userprompt_input],
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()