# Source: Hugging Face Space by ikram98ai — commit 1b741b9 ("adding progress")
"""
Multi-Language Detection Demo with Gradio and OpenAI
Supports text, audio, and video input for language detection
"""
import gradio as gr
import openai
from openai import OpenAI
import json
import os
from pydantic import BaseModel
import tempfile
import base64
from moviepy import VideoFileClip
# Module-level client placeholder; actual clients are created per request
# via get_client() using the API key entered in the UI.
client = None

# Base URL of Google's OpenAI-compatible endpoint, used for "gemini*" models.
GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
def get_client(api_key, model_name):
    """Build an OpenAI-compatible client for the selected model.

    Model names starting with "gemini" (case-insensitive) are routed through
    Google's OpenAI-compatible endpoint; every other name uses the default
    OpenAI endpoint.
    """
    is_gemini = model_name.lower().startswith("gemini")
    if is_gemini:
        return OpenAI(api_key=api_key, base_url=GEMINI_BASE_URL)
    return OpenAI(api_key=api_key)
def extract_audio_from_video(video_path):
    """Extract the audio track of a video file to a temporary MP3.

    Args:
        video_path: Path to the source video file.

    Returns:
        Path of the newly written temporary .mp3 file. The caller is
        responsible for deleting it when done.

    Raises:
        Exception: If the video cannot be opened, has no audio track, or
            the audio cannot be written.
    """
    video = None
    try:
        # NamedTemporaryFile(delete=False) replaces the insecure/deprecated
        # tempfile.mktemp(): the file is created atomically, closing the
        # race window between name generation and file creation.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
            audio_path = tmp.name
        video = VideoFileClip(video_path)
        if video.audio is None:
            # Clearer than the AttributeError the next line would raise.
            raise ValueError("video has no audio track")
        video.audio.write_audiofile(audio_path, codec='libmp3lame')
        return audio_path
    except Exception as e:
        raise Exception(f"Failed to extract audio from video: {str(e)}")
    finally:
        # Close the clip to release file handles / reader subprocesses,
        # which the original code leaked.
        if video is not None:
            video.close()
def transcribe_audio(api_key, audio_path, model_name="gemini-2.5-flash"):
    """Transcribe an audio file to text.

    Gemini models receive the audio inline as base64 through the chat API;
    all other (OpenAI) models delegate to the Whisper transcription endpoint.

    Args:
        api_key: API key for the selected provider.
        audio_path: Path to the audio file to transcribe.
        model_name: Model identifier; a "gemini" prefix selects the Gemini path.

    Returns:
        The transcription text.

    Raises:
        Exception: Wrapping any provider failure.
    """
    # Use a local client instead of mutating the module-level `client` global;
    # every handler builds its own client from the per-request API key anyway.
    transcriber = get_client(api_key, model_name)
    if model_name.lower().startswith("gemini"):
        # Gemini audio understanding: inline base64 payload in a chat message.
        try:
            with open(audio_path, "rb") as audio_file:
                base64_audio = base64.b64encode(audio_file.read()).decode('utf-8')
            # Derive the container format from the file extension; anything
            # outside the supported set falls back to mp3. (Replaces the
            # original no-op if/elif chain that reassigned equal values.)
            audio_format = audio_path.split('.')[-1].lower()
            if audio_format not in ("mp3", "wav", "webm", "ogg"):
                audio_format = "mp3"
            response = transcriber.chat.completions.create(
                model=model_name,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": "Transcribe this audio file exactly as spoken. Return only the transcription text, nothing else."
                            },
                            {
                                "type": "input_audio",
                                "input_audio": {
                                    "data": base64_audio,
                                    "format": audio_format
                                }
                            }
                        ]
                    }
                ]
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            raise Exception(f"Gemini transcription failed: {str(e)}")
    else:
        # OpenAI models: use the dedicated Whisper transcription endpoint.
        try:
            with open(audio_path, "rb") as audio_file:
                transcript = transcriber.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file,
                    response_format="text"
                )
            return transcript
        except Exception as e:
            raise Exception(f"Whisper transcription failed: {str(e)}")
class Language(BaseModel):
    """One detected language and its share of the analyzed text."""
    # Language code (e.g. "en"); exact scheme is whatever the model returns
    code: str
    # Human-readable language name (e.g. "English")
    name: str
    # Estimated share of the text in this language, on a 0-100 scale
    percentage: float
    # Short excerpt of the analyzed text written in this language
    sample: str
class LanguageDetection(BaseModel):
    """Structured result produced by detect_language()."""
    # All languages found in the text
    languages: list[Language]
    # Code of the dominant language
    primary_language: str
    # True when more than one language was detected
    is_multilingual: bool
    # Model's self-reported confidence label (e.g. "high")
    confidence: str

# Finalize the model schema (pydantic v2 API) so the nested Language
# reference is fully resolved before it is used as a response_format.
LanguageDetection.model_rebuild()
def detect_language(api_key, text, model_name="gemini-2.5-flash") -> LanguageDetection:
    """Detect the language(s) present in *text* via a structured chat call.

    Args:
        api_key: Provider API key.
        text: Text to analyze.
        model_name: Chat model; "gemini*" names route to Google's endpoint.

    Returns:
        A validated LanguageDetection instance.

    Raises:
        Exception: Wrapping any API or response-parsing failure.
    """
    # Local client instead of mutating the module-level `client` global;
    # each call builds its own client from the per-request API key.
    detector = get_client(api_key, model_name)
    # The explicit JSON skeleton keeps weaker models on-schema even though
    # response_format already enforces the structure server-side.
    prompt = f"""Analyze the following text and identify all languages present.
If multiple languages are detected, provide the percentage breakdown.
Respond ONLY with valid JSON in this exact format (no markdown, no code blocks):
{{
  "languages": [
    {{
      "code": "en",
      "name": "English",
      "percentage": 100,
      "sample": "sample text from the language"
    }}
  ],
  "primary_language": "en",
  "is_multilingual": false,
  "confidence": "high"
}}
Text to analyze:
{text}"""
    try:
        response = detector.beta.chat.completions.parse(
            model=model_name,
            messages=[
                {"role": "system", "content": "You are a language detection expert. Always respond with valid JSON only."},
                {"role": "user", "content": prompt}
            ],
            response_format=LanguageDetection,
            temperature=0.1,
            max_tokens=1000
        )
        # .parse() returns an already-validated model instance, so no manual
        # json.loads happens here — the original json.JSONDecodeError handler
        # was dead code and has been removed.
        return response.choices[0].message.parsed
    except Exception as e:
        raise Exception(f"Language detection failed: {str(e)}")
def format_results(detection_result: "LanguageDetection", transcribed_text=None) -> str:
    """Render language-detection results as a Markdown report.

    Args:
        detection_result: Parsed detection result; must expose
            ``is_multilingual``, ``primary_language``, ``confidence`` and a
            ``languages`` iterable whose items have ``name``, ``code``,
            ``percentage`` and ``sample`` attributes.
        transcribed_text: Optional transcript to append below the results.

    Returns:
        A Markdown-formatted report string.
    """
    # Accumulate fragments and join once — avoids quadratic `+=` string
    # concatenation as the language list grows.
    parts = ["# 🌍 Language Detection Results\n\n"]
    if detection_result.is_multilingual:
        parts.append("**📊 Status:** Multiple languages detected\n\n")
    else:
        parts.append("**📊 Status:** Single language detected\n\n")
    parts.append(f"**🎯 Primary Language:** {detection_result.primary_language}\n\n")
    parts.append(f"**✅ Confidence:** {detection_result.confidence}\n\n")
    parts.append("---\n\n## Detected Languages:\n\n")
    for lang in detection_result.languages:
        parts.append(f"### {lang.name} ({lang.code})\n")
        parts.append(f"- **Percentage:** {lang.percentage}%\n")
        if lang.sample:
            parts.append(f"- **Sample:** *\"{lang.sample}\"*\n")
        parts.append("\n")
    if transcribed_text:
        parts.append("---\n\n## 📝 Transcribed Text:\n\n")
        parts.append(f"```\n{transcribed_text}\n```\n")
    return "".join(parts)
def process_text_input(api_key, text, model_name):
    """Run language detection on raw text, yielding (status, markdown) pairs.

    Progress messages stream through the first slot; the final yield carries
    the formatted results in the second. Failures are reported via the
    status channel with an error prefix.
    """
    error = None
    if not api_key:
        error = "Please enter your API key"
    elif not text or not text.strip():
        error = "Please enter some text to analyze"
    if error is not None:
        yield f"❌ Error: {error}", ""
        return
    try:
        yield "🔍 Starting language detection... (10%)", ""
        detection = detect_language(api_key, text, model_name)
        yield "🧠 Analyzing results... (70%)", ""
        yield "✅ Done (100%)", format_results(detection)
    except Exception as exc:
        yield f"❌ Error: {str(exc)}", ""
def process_audio_input(api_key, audio_file, model_name):
    """Transcribe an uploaded audio file and detect its language(s).

    Generator yielding (status, markdown) tuples for the UI: progress
    updates first, then the formatted report including the transcript.
    """
    if not api_key:
        yield "❌ Error: Please enter your API key", ""
        return
    if audio_file is None:
        yield "❌ Error: Please upload an audio file", ""
        return
    try:
        yield "🎧 Upload received. Starting transcription... (10%)", ""
        transcript = transcribe_audio(api_key, audio_file, model_name)
        yield "📝 Transcription complete. Detecting language... (60%)", ""
        detection = detect_language(api_key, transcript, model_name)
        yield "🧾 Analysis complete. Formatting results... (90%)", ""
        yield "✅ Done (100%)", format_results(detection, transcript)
    except Exception as exc:
        yield f"❌ Error: {str(exc)}", ""
def process_video_input(api_key, video_file, model_name):
    """Extract audio from a video, transcribe it, and detect languages.

    Generator yielding (status, markdown) tuples for the UI. The temporary
    audio file produced during extraction is always cleaned up in `finally`.
    """
    if not api_key:
        yield "❌ Error: Please enter your API key", ""
        return
    if video_file is None:
        yield "❌ Error: Please upload a video file", ""
        return
    audio_path = None
    try:
        yield "🎬 Received video. Extracting audio... (10%)", ""
        audio_path = extract_audio_from_video(video_file)
        yield "🎧 Audio extracted. Starting transcription... (40%)", ""
        transcribed_text = transcribe_audio(api_key, audio_path, model_name)
        yield "📝 Transcription complete. Detecting language... (70%)", ""
        result = detect_language(api_key, transcribed_text, model_name)
        yield "🧾 Analysis complete. Formatting results... (90%)", ""
        formatted = format_results(result, transcribed_text)
        yield "✅ Done (100%)", formatted
    except Exception as e:
        yield f"❌ Error: {str(e)}", ""
    finally:
        # Best-effort cleanup of the temporary audio file. Catch OSError
        # specifically — the original bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit.
        if audio_path and os.path.exists(audio_path):
            try:
                os.remove(audio_path)
            except OSError:
                pass
# ---------------------------------------------------------------------------
# Gradio UI: shared API-key and model controls on top, then one tab per input
# modality (text / audio / video). Each button streams (progress, results)
# pairs from the corresponding process_* generator into two Markdown panes.
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft(), title="Multi-Language Detector") as demo:
    gr.Markdown("""
# 🌍 Multi-Language Detector
Detect and distinguish multiple languages from text, audio, or video input using OpenAI or Gemini APIs.
""")

    # API key input (password-masked in the browser)
    with gr.Row():
        api_key_input = gr.Textbox(
            label="API Key",
            placeholder="sk-... (OpenAI) or GEMINI_API_KEY",
            type="password",
            info="Enter your OpenAI API key or Gemini API key"
        )

    # Model selector (supports GPT and Gemini families)
    model_selector = gr.Dropdown(
        label="Model",
        choices=["gpt-4", "gpt-4o", "gemini-2.5-flash", "gemini-2.5-pro"],
        value="gemini-2.5-flash",
        info="Choose model. Gemini models use Google's API endpoint."
    )

    gr.Markdown("---")

    # Create tabs for different input types
    with gr.Tabs():
        # Text Input Tab
        with gr.Tab("📝 Text Input"):
            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(
                        label="Enter Text",
                        placeholder="Type or paste text in any language...",
                        lines=8
                    )
                    text_button = gr.Button("🔍 Detect Language", variant="primary")
                with gr.Column():
                    # Separate progress strip rendered above the results pane
                    text_progress = gr.Markdown(value="", label="Progress")
                    text_output = gr.Markdown(label="Results")

            # Clickable examples, including multilingual mixes
            gr.Examples(
                examples=[
                    ["Hello, how are you today?"],
                    ["Bonjour! Comment allez-vous?"],
                    ["こんにちは、お元気ですか?"],
                    ["Hola, ¿cómo estás? Hello, how are you?"],
                    ["Привет! مرحبا! 你好!"]
                ],
                inputs=text_input,
                label="Example Texts"
            )

        # Audio Input Tab
        with gr.Tab("🎤 Audio Input"):
            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(
                        label="Upload Audio File",
                        # filepath: the handler receives a path on disk
                        type="filepath"
                    )
                    audio_button = gr.Button("🔍 Transcribe & Detect Language", variant="primary")
                with gr.Column():
                    audio_progress = gr.Markdown(value="", label="Progress")
                    audio_output = gr.Markdown(label="Results")

        # Video Input Tab
        with gr.Tab("🎥 Video Input"):
            with gr.Row():
                with gr.Column():
                    video_input = gr.Video(
                        label="Upload Video File"
                    )
                    video_button = gr.Button("🔍 Extract Audio, Transcribe & Detect", variant="primary")
                with gr.Column():
                    video_progress = gr.Markdown(value="", label="Progress")
                    video_output = gr.Markdown(label="Results")

    # Wire the buttons: generator handlers stream into (progress, output)
    text_button.click(
        fn=process_text_input,
        inputs=[api_key_input, text_input, model_selector],
        outputs=[text_progress, text_output]
    )

    audio_button.click(
        fn=process_audio_input,
        inputs=[api_key_input, audio_input, model_selector],
        outputs=[audio_progress, audio_output]
    )

    video_button.click(
        fn=process_video_input,
        inputs=[api_key_input, video_input, model_selector],
        outputs=[video_progress, video_output]
    )

# Launch the demo locally (share=False: no public share link)
if __name__ == "__main__":
    demo.launch(share=False)