File size: 5,184 Bytes
86a9d2d a566ca0 86a9d2d 4197dc6 86a9d2d 76ac4df 86a9d2d a566ca0 86a9d2d 4197dc6 86a9d2d 76ac4df 86a9d2d 76ac4df 86a9d2d 4197dc6 86a9d2d 76ac4df 86a9d2d 4197dc6 a566ca0 4197dc6 a566ca0 4197dc6 a566ca0 86a9d2d 76ac4df 86a9d2d 76ac4df 86a9d2d 76ac4df 86a9d2d 76ac4df 86a9d2d 76ac4df 86a9d2d 76ac4df 86a9d2d 76ac4df 86a9d2d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
import os
import tempfile
import speech_recognition as sr
import gradio as gr
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
import uvicorn
from pathlib import Path
from pydub import AudioSegment
# Create the FastAPI application; it serves both the JSON API endpoint
# below and (optionally) the static frontend mounted at the end of the file.
app = FastAPI(title="Speech to Text Model")
# Configure CORS to allow requests from frontend
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # This can be more restrictive in production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Initialize speech recognition: one module-level recognizer instance,
# reused by every transcription request (see transcribe_audio below).
recognizer = sr.Recognizer()
# FastAPI endpoint for direct API access
@app.post("/generate-story")
async def generate_story_api(file: UploadFile = File(...)):
    """Accept an uploaded audio file and return its transcript as JSON.

    The upload is written to a temporary file (keeping the original
    extension so downstream format detection works), transcribed with
    ``transcribe_audio``, and the temp file is always removed — even
    when transcription raises.

    Returns:
        JSONResponse: ``{"transcript": ...}`` on success, or a 500
        response with ``{"error": ...}`` on failure.
    """
    try:
        # Save uploaded audio to a temp file with its original extension;
        # default to .wav when no filename was supplied.
        file_extension = os.path.splitext(file.filename)[1] if file.filename else ".wav"
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as tmp:
            tmp.write(await file.read())
            tmp_path = tmp.name
        try:
            # Process the audio using the shared transcription function.
            transcript = transcribe_audio(tmp_path)
        finally:
            # Bug fix: previously the temp file leaked if transcribe_audio
            # raised; clean it up on every path.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        # Return JSON response
        return JSONResponse({
            "transcript": transcript
        })
    except Exception as e:
        # Top-level API boundary: surface the failure to the client as JSON.
        return JSONResponse(
            status_code=500,
            content={"error": str(e)}
        )
# Convert any audio format to WAV
def convert_to_wav(audio_path):
    """Return the path of a WAV version of ``audio_path``.

    A file whose extension is already ``.wav`` (case-insensitive) is
    returned unchanged. Anything else is decoded with pydub — which
    sniffs the actual container format itself — and exported to a new
    ``<stem>_converted.wav`` file next to the original.

    Raises:
        Exception: wrapping any decode/export failure, with the original
            error message embedded.
    """
    try:
        stem, file_extension = os.path.splitext(audio_path)
        # If already WAV, don't convert.
        if file_extension.lower() == ".wav":
            return audio_path
        # Create a new temporary WAV file path next to the original.
        wav_path = stem + "_converted.wav"
        # Bug fix (dead code): the original branched on a list of known
        # extensions, but both branches were byte-identical — pydub's
        # from_file infers the format regardless, so one generic
        # conversion covers every case.
        audio = AudioSegment.from_file(audio_path)
        audio.export(wav_path, format="wav")
        return wav_path
    except Exception as e:
        raise Exception(f"Error converting audio format: {str(e)}")
# Function for processing audio (used by both FastAPI and Gradio)
def transcribe_audio(audio_path):
    """Transcribe an audio file to Arabic text via Google speech recognition.

    The input is first normalized to WAV with ``convert_to_wav``; the
    temporary converted file (when one was created) is always deleted,
    even if recognition fails.

    Returns:
        str: the recognized text, or an Arabic error message when speech
        was not understood, the service failed, or any other error occurred
        (this function never raises to its callers).
    """
    wav_path = None
    try:
        # Convert audio to WAV format first.
        wav_path = convert_to_wav(audio_path)
        # Use speech_recognition to transcribe.
        with sr.AudioFile(wav_path) as source:
            audio_data = recognizer.record(source)
        # NOTE(review): "ar-AR" is not a standard BCP-47 tag ("ar-SA",
        # "ar-EG", ... are); Google's API appears to tolerate it, so it is
        # left unchanged — confirm before switching to a regional tag.
        text = recognizer.recognize_google(audio_data, language="ar-AR")
        return text
    except sr.UnknownValueError:
        return "لم يتم التعرف على الكلام"
    except sr.RequestError as e:
        return f"حدث خطأ في خدمة التعرف على الصوت: {e}"
    except Exception as e:
        return f"حدث خطأ: {str(e)}"
    finally:
        # Bug fix: previously the converted file leaked when recognition
        # raised; always remove it when it differs from the original.
        if wav_path and wav_path != audio_path and os.path.exists(wav_path):
            os.remove(wav_path)
# Gradio interface wrapper for the model
def gradio_process(audio_file):
    """Bridge Gradio's audio input to transcribe_audio().

    Gradio can hand over either a filesystem path (str) or a file-like
    object exposing a ``.name`` attribute; normalize to a path string,
    then delegate. Any failure is reported as an Arabic error string
    rather than raised.
    """
    try:
        if isinstance(audio_file, str):
            path = audio_file
        else:
            path = audio_file.name
        return transcribe_audio(path)
    except Exception as e:
        return f"حدث خطأ: {str(e)}"
# Define Gradio interface: record/upload Arabic audio, click to transcribe.
# NOTE(review): `demo` is built here but never launched or mounted in the
# visible code — presumably handled elsewhere (e.g. by a hosting platform);
# confirm against the deployment setup.
with gr.Blocks(title="Speech to Text Model") as demo:
    gr.Markdown("# Speech to Text")
    gr.Markdown("قم بتسجيل أو تحميل ملف صوتي باللغة العربية وسيقوم النظام بتحويله إلى نص.")
    with gr.Row():
        # type="filepath" makes Gradio pass the recording as a path string,
        # which is what gradio_process/transcribe_audio expect.
        audio_input = gr.Audio(label="تسجيل أو تحميل صوت", type="filepath")
    with gr.Row():
        submit_btn = gr.Button("تحويل إلى نص")
    with gr.Row():
        transcript_output = gr.Textbox(label="النص المستخرج من التسجيل الصوتي")
    # Wire the button: audio file in -> transcript text out.
    submit_btn.click(
        fn=gradio_process,
        inputs=audio_input,
        outputs=transcript_output,
    )
# Mount static files for frontend if they exist.
# NOTE(review): mounting at "/" catches every path not matched by a route
# registered earlier; /generate-story is registered above, so it should
# still take precedence — confirm against Starlette routing order.
frontend_path = Path("../front")
if frontend_path.exists():
    app.mount("/", StaticFiles(directory=str(frontend_path), html=True), name="frontend")
# Launch with uvicorn when run directly.
if __name__ == "__main__":
    # reload=True is a development convenience; binding 0.0.0.0 exposes
    # the server on all interfaces.
    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)
|