Spaces:

marshal-yash
/

Indic-Sentiment-Audio-App

Sleeping

App Files Community

marshal-yash committed on Jan 29

Commit

929c0eb

verified ·

1 Parent(s): 669722f

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -77

app.py CHANGED Viewed

@@ -1,152 +1,139 @@
 import gradio as gr
 from transformers import AutoProcessor, SeamlessM4Tv2Model, pipeline, XLMRobertaTokenizer, AutoModelForSequenceClassification
-from speechbrain.inference.classifiers import EncoderClassifier
 import torch
 import librosa
 import numpy as np
 # --- 1. CONFIGURATION ---
 SENTIMENT_MODEL_ID = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
 AUDIO_MODEL_ID = "facebook/seamless-m4t-v2-large"
-LANG_ID_MODEL = "speechbrain/lang-id-voxlingua107-ecapa"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"🚀 Cloud Brain Running on: {device.upper()}")
 # --- 2. LOAD MODELS ---
 # A. Load Sentiment Model
-print(f"⏳ Loading Sentiment Model...")
 tokenizer = XLMRobertaTokenizer.from_pretrained(SENTIMENT_MODEL_ID)
 sent_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_ID)
-sentiment_pipeline = pipeline("text-classification", model=sent_model, tokenizer=tokenizer, device=0 if device == "cuda" else -1)
 # B. Load Audio Model (SeamlessM4T)
-print(f"⏳ Loading Audio Model...")
 processor = AutoProcessor.from_pretrained(AUDIO_MODEL_ID)
 audio_model = SeamlessM4Tv2Model.from_pretrained(AUDIO_MODEL_ID).to(device)
-# C. Load Language Detector (SpeechBrain)
-# This small model detects the language automatically
-print(f"⏳ Loading Language Detector...")
-language_id_model = EncoderClassifier.from_hparams(
-    source=LANG_ID_MODEL,
-    savedir="tmp_lang_id",
-    run_opts={"device": device}
-)
-print("✅ All Models Loaded!")
-# --- 3. HELPER FUNCTIONS ---
-def detect_language_code(audio_path):
     """
-    Detects language (Hindi, Gujarati, English) and maps it to SeamlessM4T codes.
     """
-    try:
-        # SpeechBrain expects a waveform
-        signal = language_id_model.load_audio(audio_path)
-        prediction = language_id_model.classify_batch(signal)
-        # The model returns a label like 'hi: Hindi'
-        predicted_label = prediction[3][0]
-        confidence = prediction[1].exp().item()
-        # Extract the short code (e.g., 'hi', 'gu', 'en')
-        short_code = predicted_label.split(":")[0].strip()
-        print(f"🕵️ Auto-Detected: {predicted_label} ({short_code})")
-        # Map SpeechBrain (ISO-2) to SeamlessM4T (ISO-3)
-        mapping = {
-            "hi": "hin",  # Hindi
-            "gu": "guj",  # Gujarati
-            "en": "eng",  # English
-            "ur": "urd",  # Urdu (often detected for Hindi)
-            "bn": "ben"   # Bengali
-        }
-        # Default to English if detection is weird
-        return mapping.get(short_code, "eng"), predicted_label
-    except Exception as e:
-        print(f"Language Detection Error: {e}")
-        return "eng", "Error"
-def analyze_sentiment(text):
     if not text or text.strip() == "":
         return "Neutral", 0.0
     try:
         results = sentiment_pipeline(text)
         raw_label = results[0]['label']
         confidence = results[0]['score']
         label_map = {
             "LABEL_0": "Negative 🔴",
             "LABEL_1": "Neutral 🟡",
-            "LABEL_2": "Positive 🟢"
         }
-        return label_map.get(raw_label, "Neutral 🟡"), confidence
-    except:
         return "Error", 0.0
-# --- 4. MAIN PIPELINE ---
-def process_pipeline(audio_path, text_input):
     transcribed_text = ""
-    detected_info = "None"
-    # --- Step 1: Handle Audio (Auto-Detect + Transcribe) ---
     if audio_path is not None:
-        print(f"🎤 Processing Audio: {audio_path}")
         try:
-            # A. Auto-Detect Language
-            target_lang_code, detected_info = detect_language_code(audio_path)
-            # B. Load Audio for Seamless
             y, orig_sr = librosa.load(audio_path, sr=16000)
             inputs = processor(audio=y, return_tensors="pt", sampling_rate=16000).to(device)
-            # C. Transcribe (Using detected language)
             output_tokens = audio_model.generate(
                 **inputs,
-                tgt_lang=target_lang_code,
                 generate_speech=False
             )[0].cpu().numpy().squeeze()
             transcribed_text = processor.decode(output_tokens, skip_special_tokens=True)
-            print(f"📝 Transcribed ({target_lang_code}): {transcribed_text}")
         except Exception as e:
-            return f"Error: {str(e)}", "Error ⚠️", 0.0, f"Error: {str(e)}"
-    # --- Step 2: Handle Text Fallback ---
     if not transcribed_text and text_input:
         transcribed_text = text_input
-        detected_info = "Text Input"
     if not transcribed_text:
-        return "", "Neutral 🟡", 0.0, "No Input"
-    # --- Step 3: Analyze Sentiment ---
     sentiment_label, confidence = analyze_sentiment(transcribed_text)
-    return transcribed_text, sentiment_label, round(confidence, 3), detected_info
-# --- 5. UI ---
 with gr.Interface(
     fn=process_pipeline,
     inputs=[
-        gr.Audio(type="filepath", label="🎤 Speak (Hindi / Gujarati / English)"),
-        gr.Textbox(label="⌨️ Or Type Text")
     ],
     outputs=[
         gr.Textbox(label="📝 Transcription"),
         gr.Label(label="Sentiment Analysis"),
-        gr.Number(label="Confidence Score"),
-        gr.Textbox(label="🕵️ Detected Language") # Shows the user what model heard
     ],
-    title="SGP-IV: Auto-Detect Voice Brain",
-    description="Speak naturally in Hindi, Gujarati, or English. The model will auto-detect your language and analyze sentiment."
 ) as demo:
     pass

 import gradio as gr
 from transformers import AutoProcessor, SeamlessM4Tv2Model, pipeline, XLMRobertaTokenizer, AutoModelForSequenceClassification
 import torch
 import librosa
 import numpy as np
 # --- 1. CONFIGURATION ---
+# Sentiment Model (Multilingual: Hindi, English, etc.)
 SENTIMENT_MODEL_ID = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
+# Audio Model (SeamlessM4T v2 Large)
 AUDIO_MODEL_ID = "facebook/seamless-m4t-v2-large"
+# Auto-select GPU if available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"🚀 Cloud Brain Running on: {device.upper()}")
 # --- 2. LOAD MODELS ---
 # A. Load Sentiment Model
+print(f"⏳ Loading Sentiment Model ({SENTIMENT_MODEL_ID})...")
 tokenizer = XLMRobertaTokenizer.from_pretrained(SENTIMENT_MODEL_ID)
 sent_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_ID)
+sentiment_pipeline = pipeline(
+    "text-classification",
+    model=sent_model,
+    tokenizer=tokenizer,
+    device=0 if device == "cuda" else -1
+)
 # B. Load Audio Model (SeamlessM4T)
+print(f"⏳ Loading Audio Model ({AUDIO_MODEL_ID})...")
 processor = AutoProcessor.from_pretrained(AUDIO_MODEL_ID)
 audio_model = SeamlessM4Tv2Model.from_pretrained(AUDIO_MODEL_ID).to(device)
+print("✅ All Models Loaded Successfully!")
+# --- 3. INTELLIGENCE FUNCTIONS ---
+def analyze_sentiment(text):
     """
+    Analyzes text sentiment using XLM-Roberta.
     """
     if not text or text.strip() == "":
         return "Neutral", 0.0
     try:
+        # Run inference
         results = sentiment_pipeline(text)
+        # Get raw result
         raw_label = results[0]['label']
         confidence = results[0]['score']
+        # --- Label Map ---
         label_map = {
             "LABEL_0": "Negative 🔴",
             "LABEL_1": "Neutral 🟡",
+            "LABEL_2": "Positive 🟢",
+            "negative": "Negative 🔴",
+            "neutral": "Neutral 🟡",
+            "positive": "Positive 🟢"
         }
+        nice_label = label_map.get(raw_label, raw_label)
+        return nice_label, confidence
+    except Exception as e:
+        print(f"Sentiment Error: {e}")
         return "Error", 0.0
+def process_pipeline(audio_path, language_code, text_input):
+    """
+    Master function:
+    1. If Audio is provided -> Transcribe it (using selected language).
+    2. If Text is provided -> Use it directly.
+    3. Analyze Sentiment of the resulting text.
+    """
     transcribed_text = ""
+    # --- Step 1: Transcription (if Audio) ---
     if audio_path is not None:
+        print(f"🎤 Processing Audio: {audio_path} | Language: {language_code}")
         try:
+            # Load audio using librosa to ensure correct sample rate (16kHz required)
+            # This handles resampling automatically
             y, orig_sr = librosa.load(audio_path, sr=16000)
+            # Prepare inputs
             inputs = processor(audio=y, return_tensors="pt", sampling_rate=16000).to(device)
+            # Generate Transcription
+            # We explicitly tell the model which language to transcribe (tgt_lang)
             output_tokens = audio_model.generate(
                 **inputs,
+                tgt_lang=language_code,
                 generate_speech=False
             )[0].cpu().numpy().squeeze()
             transcribed_text = processor.decode(output_tokens, skip_special_tokens=True)
+            print(f"📝 Transcribed: {transcribed_text}")
         except Exception as e:
+            return f"Error in transcription: {str(e)}", "Error ⚠️", 0.0
+    # --- Step 2: Fallback to Text Input ---
     if not transcribed_text and text_input:
         transcribed_text = text_input
     if not transcribed_text:
+        return "", "Neutral 🟡", 0.0
+    # --- Step 3: Sentiment Analysis ---
     sentiment_label, confidence = analyze_sentiment(transcribed_text)
+    # Return: Transcription, Sentiment Label, Confidence Score
+    return transcribed_text, sentiment_label, round(confidence, 3)
+# --- 4. UI CONSTRUCTION ---
 with gr.Interface(
     fn=process_pipeline,
     inputs=[
+        gr.Audio(type="filepath", label="🎤 Upload Audio or Speak"),
+        # Dropdown prevents the crash by letting user define language
+        gr.Dropdown(
+            choices=["hin", "guj", "eng"],
+            value="hin",
+            label="🗣️ Select Language Spoken (hin=Hindi, guj=Gujarati)"
+        ),
+        gr.Textbox(label="⌨️ Or Type Text Here")
     ],
     outputs=[
         gr.Textbox(label="📝 Transcription"),
         gr.Label(label="Sentiment Analysis"),
+        gr.Number(label="Confidence Score")
     ],
+    title="SGP-IV: Voice Sentiment Brain",
+    description="Select your language, speak, and get real-time sentiment analysis."
 ) as demo:
     pass