Spaces:
Sleeping
Sleeping
| """ | |
| Multi-Language Detection Demo with Gradio and OpenAI | |
| Supports text, audio, and video input for language detection | |
| """ | |
| import gradio as gr | |
| import openai | |
| from openai import OpenAI | |
| import json | |
| import os | |
| from pydantic import BaseModel | |
| import tempfile | |
| import base64 | |
| from moviepy import VideoFileClip | |
# Initialize OpenAI client (will be set with API key from UI)
# NOTE(review): this module-level handle is reassigned on every request by the
# functions below via `global client`; no client exists until the first call.
client = None
# Model configuration
# Google's OpenAI-compatible endpoint used for all Gemini-family models.
GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
def get_client(api_key, model_name):
    """Build an OpenAI-compatible client for the selected model family.

    Gemini-family models (name starts with "gemini") are routed through
    Google's OpenAI-compatible endpoint; everything else uses the default
    OpenAI API endpoint.
    """
    is_gemini = model_name.lower().startswith("gemini")
    if is_gemini:
        return OpenAI(api_key=api_key, base_url=GEMINI_BASE_URL)
    return OpenAI(api_key=api_key)
def extract_audio_from_video(video_path):
    """Extract the audio track of *video_path* into a temporary MP3 file.

    Args:
        video_path: Path to a video file readable by moviepy.

    Returns:
        Path of the temporary MP3 file. The caller is responsible for
        deleting it (process_video_input does so in its ``finally`` block).

    Raises:
        Exception: wrapping the underlying failure, including the case where
            the video has no audio track at all.
    """
    try:
        video = VideoFileClip(video_path)
        try:
            # A silent video has video.audio == None; the original code would
            # crash here with an opaque AttributeError.
            if video.audio is None:
                raise ValueError("Video file contains no audio track")
            # mkstemp instead of the deprecated/race-prone mktemp; we only
            # need the path, so close the OS-level descriptor right away.
            fd, audio_path = tempfile.mkstemp(suffix=".mp3")
            os.close(fd)
            video.audio.write_audiofile(audio_path, codec='libmp3lame')
            return audio_path
        finally:
            video.close()  # release the ffmpeg reader (fixes resource leak)
    except Exception as e:
        raise Exception(f"Failed to extract audio from video: {str(e)}") from e
def transcribe_audio(api_key, audio_path, model_name="gemini-2.5-flash"):
    """Transcribe an audio file to plain text.

    Gemini models receive the audio inline (base64) through the
    OpenAI-compatible chat endpoint; all other models fall back to OpenAI's
    Whisper API ("whisper-1").

    Args:
        api_key: OpenAI or Gemini API key.
        audio_path: Path of the audio file on disk.
        model_name: Chat model used for Gemini transcription; ignored for the
            Whisper path.

    Returns:
        The transcription as a string.

    Raises:
        Exception: wrapping any underlying transcription failure.
    """
    global client  # kept: module exposes the last-used client at module level
    client = get_client(api_key, model_name)
    if model_name.lower().startswith("gemini"):
        # Use Gemini audio understanding via inline base64 audio.
        try:
            with open(audio_path, "rb") as audio_file:
                base64_audio = base64.b64encode(audio_file.read()).decode('utf-8')
            # Pass the real container format when recognized; otherwise default
            # to mp3. (Replaces a redundant if/elif chain that assigned each
            # variable to itself.)
            ext = audio_path.rsplit('.', 1)[-1].lower()
            audio_format = ext if ext in ("mp3", "wav", "webm", "ogg") else "mp3"
            response = client.chat.completions.create(
                model=model_name,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": "Transcribe this audio file exactly as spoken. Return only the transcription text, nothing else."
                            },
                            {
                                "type": "input_audio",
                                "input_audio": {
                                    "data": base64_audio,
                                    "format": audio_format
                                }
                            }
                        ]
                    }
                ]
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            raise Exception(f"Gemini transcription failed: {str(e)}") from e
    else:
        # Use Whisper for OpenAI models; response_format="text" returns str.
        try:
            with open(audio_path, "rb") as audio_file:
                transcript = client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file,
                    response_format="text"
                )
            return transcript
        except Exception as e:
            raise Exception(f"Whisper transcription failed: {str(e)}") from e
class Language(BaseModel):
    """One detected language and its share of the analyzed text."""
    # Language code, e.g. "en" (ISO 639-1 style per the prompt example)
    code: str
    # Human-readable name, e.g. "English"
    name: str
    # Estimated share of the text in this language (prompt example uses 0-100)
    percentage: float
    # Short excerpt of the analyzed text written in this language
    sample: str
class LanguageDetection(BaseModel):
    """Structured result of one language-detection request."""
    # Every language found, with percentage breakdown and samples
    languages: list[Language]
    # Code of the dominant language, e.g. "en"
    primary_language: str
    # True when more than one language is present in the text
    is_multilingual: bool
    # Model's self-reported confidence (prompt example uses "high";
    # presumably "high"/"medium"/"low" — TODO confirm)
    confidence: str

# Resolve model references so the schema is usable as an OpenAI
# structured-output response_format.
LanguageDetection.model_rebuild()
def detect_language(api_key, text, model_name="gemini-2.5-flash") -> LanguageDetection:
    """Detect the language(s) present in *text*.

    Uses the structured-output parse endpoint so the response is validated
    against the ``LanguageDetection`` schema.

    Args:
        api_key: OpenAI or Gemini API key.
        text: Text to analyze.
        model_name: Chat model to query.

    Returns:
        A parsed ``LanguageDetection`` instance.

    Raises:
        Exception: wrapping any parsing or API failure.
    """
    global client  # kept: module exposes the last-used client at module level
    client = get_client(api_key, model_name)
    # The JSON template in the prompt is belt-and-braces: response_format
    # already enforces the schema, but the example keeps weaker models on rails.
    prompt = f"""Analyze the following text and identify all languages present.
If multiple languages are detected, provide the percentage breakdown.
Respond ONLY with valid JSON in this exact format (no markdown, no code blocks):
{{
    "languages": [
        {{
            "code": "en",
            "name": "English",
            "percentage": 100,
            "sample": "sample text from the language"
        }}
    ],
    "primary_language": "en",
    "is_multilingual": false,
    "confidence": "high"
}}
Text to analyze:
{text}"""
    try:
        response = client.beta.chat.completions.parse(
            model=model_name,
            messages=[
                {"role": "system", "content": "You are a language detection expert. Always respond with valid JSON only."},
                {"role": "user", "content": prompt}
            ],
            response_format=LanguageDetection,
            temperature=0.1,
            max_tokens=1000
        )
        result = response.choices[0].message.parsed
        return result
    except json.JSONDecodeError as e:
        # Chain the cause so the original traceback is preserved.
        raise Exception(f"Failed to parse language detection response: {str(e)}") from e
    except Exception as e:
        raise Exception(f"Language detection failed: {str(e)}") from e
def format_results(detection_result: "LanguageDetection", transcribed_text=None):
    """Render a detection result (and optional transcript) as Markdown.

    Args:
        detection_result: Object exposing ``is_multilingual``,
            ``primary_language``, ``confidence`` and ``languages`` (each with
            ``name``, ``code``, ``percentage``, ``sample``).
        transcribed_text: Optional transcript appended in a fenced code block.

    Returns:
        A Markdown string for display in the UI.
    """
    status = (
        "Multiple languages detected"
        if detection_result.is_multilingual
        else "Single language detected"
    )
    parts = [
        "# 🌍 Language Detection Results\n\n",
        f"**📊 Status:** {status}\n\n",
        f"**🎯 Primary Language:** {detection_result.primary_language}\n\n",
        f"**✅ Confidence:** {detection_result.confidence}\n\n",
        "---\n\n## Detected Languages:\n\n",
    ]
    for lang in detection_result.languages:
        parts.append(f"### {lang.name} ({lang.code})\n")
        parts.append(f"- **Percentage:** {lang.percentage}%\n")
        if lang.sample:
            parts.append(f"- **Sample:** *\"{lang.sample}\"*\n")
        parts.append("\n")
    if transcribed_text:
        parts.append("---\n\n## 📝 Transcribed Text:\n\n")
        parts.append(f"```\n{transcribed_text}\n```\n")
    return "".join(parts)
def process_text_input(api_key, text, model_name):
    """Generator handler for the text tab.

    Validates inputs, then streams (progress_message, results_markdown)
    tuples to the Gradio UI while running language detection.
    """
    if not api_key:
        yield "❌ Error: Please enter your API key", ""
    elif not text or not text.strip():
        yield "❌ Error: Please enter some text to analyze", ""
    else:
        try:
            yield "🔍 Starting language detection... (10%)", ""
            detection = detect_language(api_key, text, model_name)
            yield "🧠 Analyzing results... (70%)", ""
            report = format_results(detection)
            yield "✅ Done (100%)", report
        except Exception as e:
            yield f"❌ Error: {str(e)}", ""
def process_audio_input(api_key, audio_file, model_name):
    """Generator handler for the audio tab.

    Validates inputs, then streams (progress_message, results_markdown)
    tuples while transcribing the audio and detecting its language(s).
    """
    if not api_key:
        yield "❌ Error: Please enter your API key", ""
    elif audio_file is None:
        yield "❌ Error: Please upload an audio file", ""
    else:
        try:
            yield "🎧 Upload received. Starting transcription... (10%)", ""
            transcript = transcribe_audio(api_key, audio_file, model_name)
            yield "📝 Transcription complete. Detecting language... (60%)", ""
            detection = detect_language(api_key, transcript, model_name)
            yield "🧾 Analysis complete. Formatting results... (90%)", ""
            report = format_results(detection, transcript)
            yield "✅ Done (100%)", report
        except Exception as e:
            yield f"❌ Error: {str(e)}", ""
def process_video_input(api_key, video_file, model_name):
    """Generator handler for the video tab.

    Extracts the audio track, transcribes it, detects its language(s), and
    streams (progress_message, results_markdown) tuples to the Gradio UI.
    The temporary audio file is always cleaned up, even on failure.
    """
    if not api_key:
        yield "❌ Error: Please enter your API key", ""
        return
    if video_file is None:
        yield "❌ Error: Please upload a video file", ""
        return
    audio_path = None
    try:
        yield "🎬 Received video. Extracting audio... (10%)", ""
        audio_path = extract_audio_from_video(video_file)
        yield "🎧 Audio extracted. Starting transcription... (40%)", ""
        transcribed_text = transcribe_audio(api_key, audio_path, model_name)
        yield "📝 Transcription complete. Detecting language... (70%)", ""
        result = detect_language(api_key, transcribed_text, model_name)
        yield "🧾 Analysis complete. Formatting results... (90%)", ""
        formatted = format_results(result, transcribed_text)
        yield "✅ Done (100%)", formatted
    except Exception as e:
        yield f"❌ Error: {str(e)}", ""
    finally:
        # Best-effort cleanup of the temporary audio file. Swallow only
        # filesystem errors instead of the original bare ``except:``, which
        # would also have hidden KeyboardInterrupt/SystemExit.
        if audio_path and os.path.exists(audio_path):
            try:
                os.remove(audio_path)
            except OSError:
                pass
# Create Gradio interface
# NOTE(review): indentation below is reconstructed from the context-manager
# structure; the pasted source had lost all whitespace. One tab per input
# modality, each wired to a generator handler so progress streams to the UI.
with gr.Blocks(theme=gr.themes.Soft(), title="Multi-Language Detector") as demo:
    gr.Markdown("""
# 🌍 Multi-Language Detector
Detect and distinguish multiple languages from text, audio, or video input using OpenAI or Gemini APIs.
""")
    # API Key input (password-masked; never persisted)
    with gr.Row():
        api_key_input = gr.Textbox(
            label="API Key",
            placeholder="sk-... (OpenAI) or GEMINI_API_KEY",
            type="password",
            info="Enter your OpenAI API key or Gemini API key"
        )
    # Model selector (supports GPT and Gemini families)
    model_selector = gr.Dropdown(
        label="Model",
        choices=["gpt-4", "gpt-4o", "gemini-2.5-flash", "gemini-2.5-pro"],
        value="gemini-2.5-flash",
        info="Choose model. Gemini models use Google's API endpoint."
    )
    gr.Markdown("---")
    # Create tabs for different input types
    with gr.Tabs():
        # Text Input Tab: direct language detection on typed/pasted text
        with gr.Tab("📝 Text Input"):
            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(
                        label="Enter Text",
                        placeholder="Type or paste text in any language...",
                        lines=8
                    )
                    text_button = gr.Button("🔍 Detect Language", variant="primary")
                with gr.Column():
                    text_progress = gr.Markdown(value="", label="Progress")
                    text_output = gr.Markdown(label="Results")
            # Examples (monolingual and mixed-language samples)
            gr.Examples(
                examples=[
                    ["Hello, how are you today?"],
                    ["Bonjour! Comment allez-vous?"],
                    ["こんにちは、お元気ですか?"],
                    ["Hola, ¿cómo estás? Hello, how are you?"],
                    ["Привет! مرحبا! 你好!"]
                ],
                inputs=text_input,
                label="Example Texts"
            )
        # Audio Input Tab: transcribe first, then detect
        with gr.Tab("🎤 Audio Input"):
            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(
                        label="Upload Audio File",
                        type="filepath"
                    )
                    audio_button = gr.Button("🔍 Transcribe & Detect Language", variant="primary")
                with gr.Column():
                    audio_progress = gr.Markdown(value="", label="Progress")
                    audio_output = gr.Markdown(label="Results")
        # Video Input Tab: extract audio, transcribe, then detect
        with gr.Tab("🎥 Video Input"):
            with gr.Row():
                with gr.Column():
                    video_input = gr.Video(
                        label="Upload Video File"
                    )
                    video_button = gr.Button("🔍 Extract Audio, Transcribe & Detect", variant="primary")
                with gr.Column():
                    video_progress = gr.Markdown(value="", label="Progress")
                    video_output = gr.Markdown(label="Results")
    # Set up event handlers — each handler is a generator, so Gradio streams
    # its yielded (progress, results) pairs into the two output components.
    text_button.click(
        fn=process_text_input,
        inputs=[api_key_input, text_input, model_selector],
        outputs=[text_progress, text_output]
    )
    audio_button.click(
        fn=process_audio_input,
        inputs=[api_key_input, audio_input, model_selector],
        outputs=[audio_progress, audio_output]
    )
    video_button.click(
        fn=process_video_input,
        inputs=[api_key_input, video_input, model_selector],
        outputs=[video_progress, video_output]
    )
# Launch the demo (share=False: local only)
if __name__ == "__main__":
    demo.launch(share=False)