""" Multi-Language Detection Demo with Gradio and OpenAI Supports text, audio, and video input for language detection """ import gradio as gr import openai from openai import OpenAI import json import os from pydantic import BaseModel import tempfile import base64 from moviepy import VideoFileClip # Initialize OpenAI client (will be set with API key from UI) client = None # Model configuration GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/" def get_client(api_key, model_name): """Get appropriate OpenAI client based on model type""" if model_name.lower().startswith("gemini"): return OpenAI( api_key=api_key, base_url=GEMINI_BASE_URL ) else: return OpenAI(api_key=api_key) def extract_audio_from_video(video_path): """Extract audio from video file using moviepy""" try: video = VideoFileClip(video_path) audio_path = tempfile.mktemp(suffix=".mp3") video.audio.write_audiofile(audio_path, codec='libmp3lame') return audio_path except Exception as e: raise Exception(f"Failed to extract audio from video: {str(e)}") def transcribe_audio(api_key, audio_path, model_name="gemini-2.5-flash"): """Transcribe audio using OpenAI Whisper or Gemini""" global client if model_name.lower().startswith("gemini"): # Use Gemini audio understanding client = get_client(api_key, model_name) try: with open(audio_path, "rb") as audio_file: base64_audio = base64.b64encode(audio_file.read()).decode('utf-8') # Determine audio format from file extension audio_format = audio_path.split('.')[-1].lower() if audio_format == "mp3": audio_format = "mp3" elif audio_format in ["wav", "webm", "ogg"]: audio_format = audio_format else: audio_format = "mp3" # Default to mp3 response = client.chat.completions.create( model=model_name, messages=[ { "role": "user", "content": [ { "type": "text", "text": "Transcribe this audio file exactly as spoken. Return only the transcription text, nothing else." }, { "type": "input_audio", "input_audio": { "data": base64_audio, "format": audio_format } } ] } ] ) return response.choices[0].message.content.strip() except Exception as e: raise Exception(f"Gemini transcription failed: {str(e)}") else: # Use Whisper for OpenAI models client = get_client(api_key, model_name) try: with open(audio_path, "rb") as audio_file: transcript = client.audio.transcriptions.create( model="whisper-1", file=audio_file, response_format="text" ) return transcript except Exception as e: raise Exception(f"Whisper transcription failed: {str(e)}") class Language(BaseModel): code: str name: str percentage: float sample: str class LanguageDetection(BaseModel): languages: list[Language] primary_language: str is_multilingual: bool confidence: str LanguageDetection.model_rebuild() def detect_language(api_key, text, model_name="gemini-2.5-flash")->LanguageDetection: """Detect language(s) in text using OpenAI GPT or Gemini""" global client client = get_client(api_key, model_name) prompt = f"""Analyze the following text and identify all languages present. If multiple languages are detected, provide the percentage breakdown. Respond ONLY with valid JSON in this exact format (no markdown, no code blocks): {{ "languages": [ {{ "code": "en", "name": "English", "percentage": 100, "sample": "sample text from the language" }} ], "primary_language": "en", "is_multilingual": false, "confidence": "high" }} Text to analyze: {text}""" try: response = client.beta.chat.completions.parse( model=model_name, messages=[ {"role": "system", "content": "You are a language detection expert. Always respond with valid JSON only."}, {"role": "user", "content": prompt} ], response_format=LanguageDetection, temperature=0.1, max_tokens=1000 ) result = response.choices[0].message.parsed return result except json.JSONDecodeError as e: raise Exception(f"Failed to parse language detection response: {str(e)}") except Exception as e: raise Exception(f"Language detection failed: {str(e)}") def format_results(detection_result:LanguageDetection, transcribed_text=None): """Format detection results for display""" output = "# 🌍 Language Detection Results\n\n" if detection_result.is_multilingual: output += "**πŸ“Š Status:** Multiple languages detected\n\n" else: output += "**πŸ“Š Status:** Single language detected\n\n" output += f"**🎯 Primary Language:** {detection_result.primary_language}\n\n" output += f"**βœ… Confidence:** {detection_result.confidence}\n\n" output += "---\n\n## Detected Languages:\n\n" for lang in detection_result.languages: output += f"### {lang.name} ({lang.code})\n" output += f"- **Percentage:** {lang.percentage}%\n" if lang.sample: output += f"- **Sample:** *\"{lang.sample}\"*\n" output += "\n" if transcribed_text: output += "---\n\n## πŸ“ Transcribed Text:\n\n" output += f"```\n{transcribed_text}\n```\n" return output def process_text_input(api_key, text, model_name): """Process text input for language detection (provides progress updates)""" if not api_key: yield "❌ Error: Please enter your API key", "" return if not text or not text.strip(): yield "❌ Error: Please enter some text to analyze", "" return try: yield "πŸ” Starting language detection... (10%)", "" result = detect_language(api_key, text, model_name) yield "🧠 Analyzing results... (70%)", "" formatted = format_results(result) yield "βœ… Done (100%)", formatted except Exception as e: yield f"❌ Error: {str(e)}", "" def process_audio_input(api_key, audio_file, model_name): """Process audio input for transcription and language detection (provides progress updates)""" if not api_key: yield "❌ Error: Please enter your API key", "" return if audio_file is None: yield "❌ Error: Please upload an audio file", "" return try: yield "🎧 Upload received. Starting transcription... (10%)", "" transcribed_text = transcribe_audio(api_key, audio_file, model_name) yield "πŸ“ Transcription complete. Detecting language... (60%)", "" result = detect_language(api_key, transcribed_text, model_name) yield "🧾 Analysis complete. Formatting results... (90%)", "" formatted = format_results(result, transcribed_text) yield "βœ… Done (100%)", formatted except Exception as e: yield f"❌ Error: {str(e)}", "" def process_video_input(api_key, video_file, model_name): """Process video input by extracting audio, transcribing, and detecting language (provides progress updates)""" if not api_key: yield "❌ Error: Please enter your API key", "" return if video_file is None: yield "❌ Error: Please upload a video file", "" return audio_path = None try: yield "🎬 Received video. Extracting audio... (10%)", "" audio_path = extract_audio_from_video(video_file) yield "🎧 Audio extracted. Starting transcription... (40%)", "" transcribed_text = transcribe_audio(api_key, audio_path, model_name) yield "πŸ“ Transcription complete. Detecting language... (70%)", "" result = detect_language(api_key, transcribed_text, model_name) yield "🧾 Analysis complete. Formatting results... (90%)", "" formatted = format_results(result, transcribed_text) yield "βœ… Done (100%)", formatted except Exception as e: yield f"❌ Error: {str(e)}", "" finally: # Clean up temporary audio file if audio_path and os.path.exists(audio_path): try: os.remove(audio_path) except: pass # Create Gradio interface with gr.Blocks(theme=gr.themes.Soft(), title="Multi-Language Detector") as demo: gr.Markdown(""" # 🌍 Multi-Language Detector Detect and distinguish multiple languages from text, audio, or video input using OpenAI or Gemini APIs. """) # API Key input with gr.Row(): api_key_input = gr.Textbox( label="API Key", placeholder="sk-... (OpenAI) or GEMINI_API_KEY", type="password", info="Enter your OpenAI API key or Gemini API key" ) # Model selector (supports GPT and Gemini families) model_selector = gr.Dropdown( label="Model", choices=["gpt-4", "gpt-4o", "gemini-2.5-flash", "gemini-2.5-pro"], value="gemini-2.5-flash", info="Choose model. Gemini models use Google's API endpoint." ) gr.Markdown("---") # Create tabs for different input types with gr.Tabs(): # Text Input Tab with gr.Tab("πŸ“ Text Input"): with gr.Row(): with gr.Column(): text_input = gr.Textbox( label="Enter Text", placeholder="Type or paste text in any language...", lines=8 ) text_button = gr.Button("πŸ” Detect Language", variant="primary") with gr.Column(): text_progress = gr.Markdown(value="", label="Progress") text_output = gr.Markdown(label="Results") # Examples gr.Examples( examples=[ ["Hello, how are you today?"], ["Bonjour! Comment allez-vous?"], ["γ“γ‚“γ«γ‘γ―γ€γŠε…ƒζ°—γ§γ™γ‹οΌŸ"], ["Hola, ΒΏcΓ³mo estΓ‘s? Hello, how are you?"], ["ΠŸΡ€ΠΈΠ²Π΅Ρ‚! Ω…Ψ±Ψ­Ψ¨Ψ§! δ½ ε₯½οΌ"] ], inputs=text_input, label="Example Texts" ) # Audio Input Tab with gr.Tab("🎀 Audio Input"): with gr.Row(): with gr.Column(): audio_input = gr.Audio( label="Upload Audio File", type="filepath" ) audio_button = gr.Button("πŸ” Transcribe & Detect Language", variant="primary") with gr.Column(): audio_progress = gr.Markdown(value="", label="Progress") audio_output = gr.Markdown(label="Results") # Video Input Tab with gr.Tab("πŸŽ₯ Video Input"): with gr.Row(): with gr.Column(): video_input = gr.Video( label="Upload Video File" ) video_button = gr.Button("πŸ” Extract Audio, Transcribe & Detect", variant="primary") with gr.Column(): video_progress = gr.Markdown(value="", label="Progress") video_output = gr.Markdown(label="Results") # Set up event handlers text_button.click( fn=process_text_input, inputs=[api_key_input, text_input, model_selector], outputs=[text_progress, text_output] ) audio_button.click( fn=process_audio_input, inputs=[api_key_input, audio_input, model_selector], outputs=[audio_progress, audio_output] ) video_button.click( fn=process_video_input, inputs=[api_key_input, video_input, model_selector], outputs=[video_progress, video_output] ) # Launch the demo if __name__ == "__main__": demo.launch(share=False)