JustNikunj committed on
Commit
4607f5c
·
verified ·
1 Parent(s): 51bbc18

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +136 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import torchaudio
4
+ from transformers import AutoModelForCTC, AutoProcessor, pipeline
5
+ from pydub import AudioSegment
6
+ import numpy as np
7
+ import librosa
8
+ import io
9
+ import tempfile
10
+
11
# Choose the compute device first so that BOTH models can be placed on it.
# Falls back to CPU when CUDA is not available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load ASR model and processor for Hindi speech recognition
print("Loading ASR model...")
asr_processor = AutoProcessor.from_pretrained("ai4bharat/indicwav2vec-hindi")
asr_model = AutoModelForCTC.from_pretrained("ai4bharat/indicwav2vec-hindi")
asr_model.to(device)

# Load sentiment analysis pipeline for Hindi text.
# `top_k=None` asks for the score of every label; it replaces the deprecated
# `return_all_scores=True` flag. `device` keeps the classifier on the same
# device as the ASR model instead of always leaving it on the default.
print("Loading sentiment analysis model...")
sentiment_pipeline = pipeline(
    "text-classification",
    model="LondonStory/txlm-roberta-hindi-sentiment",
    top_k=None,
    device=device,
)

print(f"Models loaded on device: {device}")
28
+
29
def predict(audio_filepath):
    """
    Transcribe a Hindi audio file and score the sentiment of the transcript.

    Args:
        audio_filepath: Path to the uploaded or recorded audio file. May be
            None/empty when the user submits without providing audio.

    Returns:
        Dictionary mapping each sentiment label to its confidence score
        (a float). Only numeric values are returned because the result is
        rendered by a ``gr.Label`` component, which displays label ->
        confidence pairs.

    Raises:
        gr.Error: If no audio was provided, the audio could not be
            transcribed, or any processing step failed. Gradio shows the
            message to the user instead of a result.
    """
    if not audio_filepath:
        raise gr.Error("No audio provided. Please upload or record Hindi speech.")

    try:
        print(f"Processing audio file: {audio_filepath}")

        # Load audio and resample to 16 kHz, the rate passed to the ASR
        # processor below.
        audio_array, _ = librosa.load(audio_filepath, sr=16000)

        # Defensive downmix: librosa lays multi-channel audio out as
        # (channels, samples), so the channel axis is 0.
        if audio_array.ndim > 1:
            audio_array = np.mean(audio_array, axis=0)

        if audio_array.size == 0:
            raise gr.Error("The audio file is empty. Please provide a recording with speech.")

        # Turn the waveform into model inputs.
        inputs = asr_processor(
            audio_array,
            sampling_rate=16000,
            return_tensors="pt",
            padding=True,
        )

        # Move inputs to the same device as the ASR model.
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Greedy CTC decoding: most likely token per frame, then decode.
        with torch.no_grad():
            logits = asr_model(**inputs).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = asr_processor.batch_decode(predicted_ids)[0]

        print(f"Transcribed text: {transcription}")

        if not transcription.strip():
            raise gr.Error("Could not transcribe audio. Please ensure you're speaking in Hindi.")

        # Surface the transcript as a notification rather than as an entry
        # in the returned dict: a string value mixed in with the float
        # scores cannot be ranked by the Label output.
        gr.Info(f"Transcription: {transcription}")

        sentiment_results = sentiment_pipeline(transcription)

        # The pipeline may return either [[{label, score}, ...]] or
        # [{label, score}, ...] for a single input; accept both shapes.
        if sentiment_results and isinstance(sentiment_results[0], list):
            scores = sentiment_results[0]
        else:
            scores = sentiment_results

        return {item["label"]: float(item["score"]) for item in scores}

    except gr.Error:
        # Already a user-facing error; let Gradio display it unchanged.
        raise
    except Exception as e:
        print(f"Error processing audio: {str(e)}")
        raise gr.Error(f"Error processing audio: {str(e)}") from e
95
+
96
# Assemble the Gradio UI: one audio input wired through `predict` into a
# label output.

# Markdown shown under the page title.
_description = """
### Upload or record Hindi audio to analyze sentiment

This app performs the following steps:
1. **Speech Recognition**: Converts your Hindi speech to text using AI4Bharat's IndicWav2Vec model
2. **Sentiment Analysis**: Analyzes the emotional tone using a specialized Hindi sentiment model

**Instructions**:
- Upload an audio file or record directly using the microphone
- Speak clearly in Hindi for best results
- The app will show both the transcribed text and sentiment scores

**Supported sentiments**: Positive, Negative, and Neutral with confidence scores
"""

# Audio is handed to `predict` as a path on disk; both file upload and
# microphone recording are offered.
_audio_input = gr.Audio(
    type="filepath",
    label="Upload Hindi Speech",
    sources=["upload", "microphone"],
)

# Show at most the three highest-scoring labels.
_sentiment_output = gr.Label(
    label="Sentiment Analysis Result",
    num_top_classes=3,
)

demo = gr.Interface(
    fn=predict,
    inputs=_audio_input,
    outputs=_sentiment_output,
    title="🎤 Hindi Speech Sentiment Analysis",
    description=_description,
    examples=None,
    theme=gr.themes.Soft(),
    allow_flagging="never",
)
127
+
128
# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    launch_options = {
        # No public share link; set to True if you want one for testing.
        "share": False,
        # Listen on all interfaces (noted in the original as required for
        # Hugging Face Spaces).
        "server_name": "0.0.0.0",
        # Port noted in the original as the Hugging Face Spaces default.
        "server_port": 7860,
        # Passed through to launch() so errors are shown to the user.
        "show_error": True,
    }
    demo.launch(**launch_options)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
transformers
torch
torchaudio
datasets
gradio
pydub
# Imported directly by app.py; without these the app fails at import time.
librosa
numpy