Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import torch | |
| from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer | |
| import openai | |
| import os | |
# --------------- STEP 1: speech-to-text model (free, from Hugging Face) ---------------
# Hub repository id of the Persian Wav2Vec2 checkpoint used for transcription.
stt_model_name = "m3hrdadfi/wav2vec2-large-xlsr-persian-v2"

# Both objects are created once, at import time, from the same checkpoint:
#   stt_model - the CTC acoustic model that produces per-frame logits
#   tokenizer - prepares the model input and decodes predicted ids to text
stt_model = Wav2Vec2ForCTC.from_pretrained(stt_model_name)
tokenizer = Wav2Vec2Tokenizer.from_pretrained(stt_model_name)
# --------------- STEP 2: OpenAI settings for the text-correction step ---------------
# Name of the chat model that the correction request is sent to.
OPENAI_MODEL_NAME = "gpt-4o-mini"

# The API key comes from the OPENAI_API_KEY environment variable so that it
# does not have to be written into this file. The placeholder string is only
# the fallback used when the variable is unset; never commit a real key.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY_HERE")
# --------------- STT Function ---------------
def _prepare_waveform(sample_rate, data, target_rate=16000):
    """Return `data` as a mono float32 waveform sampled at `target_rate` Hz."""
    # Local import: numpy is not part of this file's top-level import block.
    import numpy as np

    samples = np.asarray(data)
    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM (e.g. int16): scale by the dtype's full-scale value.
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        # Multi-channel audio is assumed to be laid out as (samples, channels),
        # which is how Gradio documents its numpy audio; average to mono.
        samples = samples.mean(axis=1)

    if samples.size and sample_rate != target_rate:
        # Basic linear-interpolation resampling (no anti-aliasing filter);
        # enough to bring microphone rates down to the model's rate.
        n_in = samples.shape[0]
        n_out = max(1, int(round(n_in * float(target_rate) / float(sample_rate))))
        src_times = np.arange(n_in) / float(sample_rate)
        dst_times = np.arange(n_out) / float(target_rate)
        samples = np.interp(dst_times, src_times, samples).astype(np.float32)

    return samples


def speech_to_text(audio):
    """Transcribe Persian speech with the locally loaded Wav2Vec2 model.

    Args:
        audio: ``None`` or a ``(sample_rate, numpy_data)`` tuple as produced
            by a Gradio audio component with ``type="numpy"``.

    Returns:
        The decoded transcription, lower-cased. An empty string is returned
        when ``audio`` is ``None`` or contains no samples.
    """
    if audio is None:
        return ""

    sample_rate, data = audio

    # The checkpoint is meant to be fed 16 kHz mono audio; browser recordings
    # usually arrive at another rate, as integer PCM and possibly in stereo.
    target_rate = 16000
    waveform = _prepare_waveform(sample_rate, data, target_rate)
    if waveform.size == 0:
        # Nothing was recorded; skip the model call entirely.
        return ""

    input_values = tokenizer(
        waveform, return_tensors="pt", sampling_rate=target_rate
    ).input_values

    with torch.no_grad():
        logits = stt_model(input_values).logits

    # Greedy CTC decoding: most likely token per frame, collapsed by the tokenizer.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer.decode(predicted_ids[0])

    # The text carries no punctuation fixes; it is returned as decoded, lower-cased.
    return transcription.lower()
# --------------- Correction Function with GPT-4o mini ---------------
def correct_text_with_gpt(text):
    """Ask the OpenAI chat model to rewrite Persian text in a formal register.

    The request uses a system prompt describing a formal medical/legal tone
    and a Persian user prompt that embeds ``text``.

    Args:
        text: Raw Persian text (for example an STT transcription). ``None``,
            empty and whitespace-only values are accepted.

    Returns:
        The corrected text with surrounding whitespace removed, or an empty
        string when there is nothing to correct or the model returns no
        content.
    """
    # No API call for missing or blank input.
    if not text or not text.strip():
        return ""

    system_message = (
        "You are a specialized model that receives Persian text and corrects it with a formal tone, "
        "particularly for medical/legal context. Make the text coherent, unify spacing, and punctuation, "
        "and ensure a formal writing style. Do not add commentary. Just provide the corrected text."
    )
    user_message = f"متن خام: {text}\n\nلطفاً متن فوق را در یک پاراگراف رسمی و اداری اصلاح کن."

    request = {
        "model": OPENAI_MODEL_NAME,
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ],
        "temperature": 0.2,
        "max_tokens": 1024,
    }

    if hasattr(openai, "chat"):
        # openai>=1.0: ChatCompletion was removed; the module-level client
        # still honours openai.api_key.
        response = openai.chat.completions.create(**request)
    else:
        # Legacy openai<1.0 interface.
        response = openai.ChatCompletion.create(**request)

    corrected = response.choices[0].message.content
    # The message content may be None; treat that as "no text".
    return (corrected or "").strip()
# --------------- Gradio Interface ---------------
# The page consists of:
# - an audio widget for the user's speech
# - a text box showing the raw transcription
# - a button that runs transcription followed by GPT-4o mini correction
# - a text box showing the corrected text
def process_audio_and_correct(audio):
    """Run the full pipeline and return ``(raw_text, corrected_text)``.

    The audio is first transcribed with ``speech_to_text``; the resulting
    transcription is then passed to ``correct_text_with_gpt``.
    """
    transcript = speech_to_text(audio)
    return transcript, correct_text_with_gpt(transcript)
# Gradio 4 renamed the Audio `source` argument to `sources` (a list) and
# rejects the old spelling, so choose the keyword from the installed version.
if int(gr.__version__.split(".")[0]) >= 4:
    # Microphone first (the original default), plus upload as the label promises.
    _audio_source_kwargs = {"sources": ["microphone", "upload"]}
else:
    _audio_source_kwargs = {"source": "microphone"}

with gr.Blocks() as demo:
    gr.Markdown("# وبسایت تبدیل گفتار به متن (فارسی) + اصلاح متن با GPT-4o mini")
    gr.Markdown("در اینجا میتوانید یک وویس فارسی ضبط یا فایل صوتی آپلود کنید. سپس متن استخراجشده و اصلاحشده را دریافت کنید.")

    with gr.Row():
        # type="numpy" hands the callback a (sample_rate, data) tuple.
        audio_in = gr.Audio(
            type="numpy",
            label="ضبط صدا یا آپلود فایل (رایگان)",
            **_audio_source_kwargs,
        )

    transcribed_text = gr.Textbox(label="متن خام (خروجی مرحله اول - رایگان)", lines=3)
    corrected_text = gr.Textbox(label="متن اصلاحشده (با GPT-4o mini - پولی)", lines=3)
    btn_process = gr.Button("تبدیل و اصلاح")

    def audio_workflow(audio):
        """Alternative handler with the same result as the button callback.

        Not wired to any event in this file. The output boxes are filled
        from the returned tuple, so no component is updated directly here.
        """
        return process_audio_and_correct(audio)

    # One click runs STT and correction; the returned pair fills both boxes.
    btn_process.click(
        fn=process_audio_and_correct,
        inputs=audio_in,
        outputs=[transcribed_text, corrected_text],
    )

# Start the Gradio server.
demo.launch()