marshal-yash committed on
Commit
929c0eb
·
verified ·
1 Parent(s): 669722f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -77
app.py CHANGED
@@ -1,152 +1,139 @@
1
  import gradio as gr
2
  from transformers import AutoProcessor, SeamlessM4Tv2Model, pipeline, XLMRobertaTokenizer, AutoModelForSequenceClassification
3
- from speechbrain.inference.classifiers import EncoderClassifier
4
  import torch
5
  import librosa
6
  import numpy as np
7
 
8
  # --- 1. CONFIGURATION ---
 
9
  SENTIMENT_MODEL_ID = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
 
10
  AUDIO_MODEL_ID = "facebook/seamless-m4t-v2-large"
11
- LANG_ID_MODEL = "speechbrain/lang-id-voxlingua107-ecapa"
12
 
 
13
  device = "cuda" if torch.cuda.is_available() else "cpu"
14
  print(f"🚀 Cloud Brain Running on: {device.upper()}")
15
 
16
  # --- 2. LOAD MODELS ---
17
 
18
  # A. Load Sentiment Model
19
- print(f"⏳ Loading Sentiment Model...")
20
  tokenizer = XLMRobertaTokenizer.from_pretrained(SENTIMENT_MODEL_ID)
21
  sent_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_ID)
22
- sentiment_pipeline = pipeline("text-classification", model=sent_model, tokenizer=tokenizer, device=0 if device == "cuda" else -1)
 
 
 
 
 
23
 
24
  # B. Load Audio Model (SeamlessM4T)
25
- print(f"⏳ Loading Audio Model...")
26
  processor = AutoProcessor.from_pretrained(AUDIO_MODEL_ID)
27
  audio_model = SeamlessM4Tv2Model.from_pretrained(AUDIO_MODEL_ID).to(device)
28
 
29
- # C. Load Language Detector (SpeechBrain)
30
- # This small model detects the language automatically
31
- print(f"⏳ Loading Language Detector...")
32
- language_id_model = EncoderClassifier.from_hparams(
33
- source=LANG_ID_MODEL,
34
- savedir="tmp_lang_id",
35
- run_opts={"device": device}
36
- )
37
-
38
- print("✅ All Models Loaded!")
39
 
40
- # --- 3. HELPER FUNCTIONS ---
41
 
42
- def detect_language_code(audio_path):
43
  """
44
- Detects language (Hindi, Gujarati, English) and maps it to SeamlessM4T codes.
45
  """
46
- try:
47
- # SpeechBrain expects a waveform
48
- signal = language_id_model.load_audio(audio_path)
49
- prediction = language_id_model.classify_batch(signal)
50
-
51
- # The model returns a label like 'hi: Hindi'
52
- predicted_label = prediction[3][0]
53
- confidence = prediction[1].exp().item()
54
-
55
- # Extract the short code (e.g., 'hi', 'gu', 'en')
56
- short_code = predicted_label.split(":")[0].strip()
57
- print(f"🕵️ Auto-Detected: {predicted_label} ({short_code})")
58
-
59
- # Map SpeechBrain (ISO-2) to SeamlessM4T (ISO-3)
60
- mapping = {
61
- "hi": "hin", # Hindi
62
- "gu": "guj", # Gujarati
63
- "en": "eng", # English
64
- "ur": "urd", # Urdu (often detected for Hindi)
65
- "bn": "ben" # Bengali
66
- }
67
-
68
- # Default to English if detection is weird
69
- return mapping.get(short_code, "eng"), predicted_label
70
-
71
- except Exception as e:
72
- print(f"Language Detection Error: {e}")
73
- return "eng", "Error"
74
-
75
- def analyze_sentiment(text):
76
  if not text or text.strip() == "":
77
  return "Neutral", 0.0
 
78
  try:
 
79
  results = sentiment_pipeline(text)
 
 
80
  raw_label = results[0]['label']
81
  confidence = results[0]['score']
82
 
 
83
  label_map = {
84
  "LABEL_0": "Negative 🔴",
85
  "LABEL_1": "Neutral 🟡",
86
- "LABEL_2": "Positive 🟢"
 
 
 
87
  }
88
- return label_map.get(raw_label, "Neutral 🟡"), confidence
89
- except:
 
 
 
90
  return "Error", 0.0
91
 
92
- # --- 4. MAIN PIPELINE ---
93
-
94
- def process_pipeline(audio_path, text_input):
 
 
 
 
95
  transcribed_text = ""
96
- detected_info = "None"
97
 
98
- # --- Step 1: Handle Audio (Auto-Detect + Transcribe) ---
99
  if audio_path is not None:
100
- print(f"🎤 Processing Audio: {audio_path}")
101
  try:
102
- # A. Auto-Detect Language
103
- target_lang_code, detected_info = detect_language_code(audio_path)
104
-
105
- # B. Load Audio for Seamless
106
  y, orig_sr = librosa.load(audio_path, sr=16000)
 
 
107
  inputs = processor(audio=y, return_tensors="pt", sampling_rate=16000).to(device)
108
 
109
- # C. Transcribe (Using detected language)
 
110
  output_tokens = audio_model.generate(
111
  **inputs,
112
- tgt_lang=target_lang_code,
113
  generate_speech=False
114
  )[0].cpu().numpy().squeeze()
115
 
116
  transcribed_text = processor.decode(output_tokens, skip_special_tokens=True)
117
- print(f"📝 Transcribed ({target_lang_code}): {transcribed_text}")
118
 
119
  except Exception as e:
120
- return f"Error: {str(e)}", "Error ⚠️", 0.0, f"Error: {str(e)}"
121
 
122
- # --- Step 2: Handle Text Fallback ---
123
  if not transcribed_text and text_input:
124
  transcribed_text = text_input
125
- detected_info = "Text Input"
126
 
127
  if not transcribed_text:
128
- return "", "Neutral 🟡", 0.0, "No Input"
129
 
130
- # --- Step 3: Analyze Sentiment ---
131
  sentiment_label, confidence = analyze_sentiment(transcribed_text)
132
 
133
- return transcribed_text, sentiment_label, round(confidence, 3), detected_info
 
134
 
135
- # --- 5. UI ---
136
  with gr.Interface(
137
  fn=process_pipeline,
138
  inputs=[
139
- gr.Audio(type="filepath", label="🎤 Speak (Hindi / Gujarati / English)"),
140
- gr.Textbox(label="⌨️ Or Type Text")
 
 
 
 
 
 
141
  ],
142
  outputs=[
143
  gr.Textbox(label="📝 Transcription"),
144
  gr.Label(label="Sentiment Analysis"),
145
- gr.Number(label="Confidence Score"),
146
- gr.Textbox(label="🕵️ Detected Language") # Shows the user what model heard
147
  ],
148
- title="SGP-IV: Auto-Detect Voice Brain",
149
- description="Speak naturally in Hindi, Gujarati, or English. The model will auto-detect your language and analyze sentiment."
150
  ) as demo:
151
  pass
152
 
 
1
  import gradio as gr
2
  from transformers import AutoProcessor, SeamlessM4Tv2Model, pipeline, XLMRobertaTokenizer, AutoModelForSequenceClassification
 
3
  import torch
4
  import librosa
5
  import numpy as np
6
 
7
  # --- 1. CONFIGURATION ---
8
+ # Sentiment Model (Multilingual: Hindi, English, etc.)
9
  SENTIMENT_MODEL_ID = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
10
+ # Audio Model (SeamlessM4T v2 Large)
11
  AUDIO_MODEL_ID = "facebook/seamless-m4t-v2-large"
 
12
 
13
+ # Auto-select GPU if available
14
  device = "cuda" if torch.cuda.is_available() else "cpu"
15
  print(f"🚀 Cloud Brain Running on: {device.upper()}")
16
 
17
  # --- 2. LOAD MODELS ---
18
 
19
  # A. Load Sentiment Model
20
+ print(f"⏳ Loading Sentiment Model ({SENTIMENT_MODEL_ID})...")
21
  tokenizer = XLMRobertaTokenizer.from_pretrained(SENTIMENT_MODEL_ID)
22
  sent_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_ID)
23
+ sentiment_pipeline = pipeline(
24
+ "text-classification",
25
+ model=sent_model,
26
+ tokenizer=tokenizer,
27
+ device=0 if device == "cuda" else -1
28
+ )
29
 
30
  # B. Load Audio Model (SeamlessM4T)
31
+ print(f"⏳ Loading Audio Model ({AUDIO_MODEL_ID})...")
32
  processor = AutoProcessor.from_pretrained(AUDIO_MODEL_ID)
33
  audio_model = SeamlessM4Tv2Model.from_pretrained(AUDIO_MODEL_ID).to(device)
34
 
35
+ print("✅ All Models Loaded Successfully!")
 
 
 
 
 
 
 
 
 
36
 
37
+ # --- 3. INTELLIGENCE FUNCTIONS ---
38
 
39
+ def analyze_sentiment(text):
40
  """
41
+ Analyzes text sentiment using XLM-Roberta.
42
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  if not text or text.strip() == "":
44
  return "Neutral", 0.0
45
+
46
  try:
47
+ # Run inference
48
  results = sentiment_pipeline(text)
49
+
50
+ # Get raw result
51
  raw_label = results[0]['label']
52
  confidence = results[0]['score']
53
 
54
+ # --- Label Map ---
55
  label_map = {
56
  "LABEL_0": "Negative 🔴",
57
  "LABEL_1": "Neutral 🟡",
58
+ "LABEL_2": "Positive 🟢",
59
+ "negative": "Negative 🔴",
60
+ "neutral": "Neutral 🟡",
61
+ "positive": "Positive 🟢"
62
  }
63
+
64
+ nice_label = label_map.get(raw_label, raw_label)
65
+ return nice_label, confidence
66
+ except Exception as e:
67
+ print(f"Sentiment Error: {e}")
68
  return "Error", 0.0
69
 
70
+ def process_pipeline(audio_path, language_code, text_input):
71
+ """
72
+ Master function:
73
+ 1. If Audio is provided -> Transcribe it (using selected language).
74
+ 2. If Text is provided -> Use it directly.
75
+ 3. Analyze Sentiment of the resulting text.
76
+ """
77
  transcribed_text = ""
 
78
 
79
+ # --- Step 1: Transcription (if Audio) ---
80
  if audio_path is not None:
81
+ print(f"🎤 Processing Audio: {audio_path} | Language: {language_code}")
82
  try:
83
+ # Load audio using librosa to ensure correct sample rate (16kHz required)
84
+ # This handles resampling automatically
 
 
85
  y, orig_sr = librosa.load(audio_path, sr=16000)
86
+
87
+ # Prepare inputs
88
  inputs = processor(audio=y, return_tensors="pt", sampling_rate=16000).to(device)
89
 
90
+ # Generate Transcription
91
+ # We explicitly tell the model which language to transcribe (tgt_lang)
92
  output_tokens = audio_model.generate(
93
  **inputs,
94
+ tgt_lang=language_code,
95
  generate_speech=False
96
  )[0].cpu().numpy().squeeze()
97
 
98
  transcribed_text = processor.decode(output_tokens, skip_special_tokens=True)
99
+ print(f"📝 Transcribed: {transcribed_text}")
100
 
101
  except Exception as e:
102
+ return f"Error in transcription: {str(e)}", "Error ⚠️", 0.0
103
 
104
+ # --- Step 2: Fallback to Text Input ---
105
  if not transcribed_text and text_input:
106
  transcribed_text = text_input
 
107
 
108
  if not transcribed_text:
109
+ return "", "Neutral 🟡", 0.0
110
 
111
+ # --- Step 3: Sentiment Analysis ---
112
  sentiment_label, confidence = analyze_sentiment(transcribed_text)
113
 
114
+ # Return: Transcription, Sentiment Label, Confidence Score
115
+ return transcribed_text, sentiment_label, round(confidence, 3)
116
 
117
+ # --- 4. UI CONSTRUCTION ---
118
  with gr.Interface(
119
  fn=process_pipeline,
120
  inputs=[
121
+ gr.Audio(type="filepath", label="🎤 Upload Audio or Speak"),
122
+ # Dropdown prevents the crash by letting user define language
123
+ gr.Dropdown(
124
+ choices=["hin", "guj", "eng"],
125
+ value="hin",
126
+ label="🗣️ Select Language Spoken (hin=Hindi, guj=Gujarati)"
127
+ ),
128
+ gr.Textbox(label="⌨️ Or Type Text Here")
129
  ],
130
  outputs=[
131
  gr.Textbox(label="📝 Transcription"),
132
  gr.Label(label="Sentiment Analysis"),
133
+ gr.Number(label="Confidence Score")
 
134
  ],
135
+ title="SGP-IV: Voice Sentiment Brain",
136
+ description="Select your language, speak, and get real-time sentiment analysis."
137
  ) as demo:
138
  pass
139