Spaces:
Sleeping
Sleeping
import os
import tempfile
from datetime import datetime

import nltk
import pandas as pd
import torch
from deep_translator import GoogleTranslator
from faster_whisper import WhisperModel
from pydub import AudioSegment
from transformers import pipeline
# Store NLTK data (e.g. the punkt tokenizer used by sent_tokenize) in a
# project-local directory and register it on NLTK's search path.
# NOTE(review): nothing here downloads the data; punkt is presumably
# fetched out-of-band -- confirm, since sent_tokenize will fail otherwise.
nltk_data_dir = os.path.join(os.getcwd(), "nltk_data")
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.data.path.append(nltk_data_dir)
class MalayalamTranscriptionPipeline:
    """Transcribe audio with Faster-Whisper and translate the English text
    to Malayalam with Google Translate.

    Temporary WAV files produced while converting inputs are tracked in
    ``self.temp_files`` and removed by :meth:`cleanup`.
    """

    # Google's translation endpoint rejects very long payloads (~5000 chars);
    # long transcripts are translated in chunks below this limit.
    _MAX_TRANSLATE_CHARS = 4500

    def __init__(self, model_size="large-v1"):
        """Load the Faster-Whisper model on CUDA when available, else CPU."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Loading Faster-Whisper {model_size} model on {self.device}...")
        # float16 requires a GPU; int8 keeps CPU inference memory-friendly.
        compute_type = "float16" if self.device == "cuda" else "int8"
        self.model = WhisperModel(model_size, device=self.device, compute_type=compute_type)
        self.temp_files = []

    def convert_to_whisper_format(self, input_path):
        """Convert ``input_path`` to a 16 kHz mono WAV in a temp directory.

        Returns the path of the converted file (also recorded in
        ``self.temp_files`` for later cleanup).

        Raises:
            FileNotFoundError: if ``input_path`` does not exist.
            ValueError: if the extension is not a supported audio format.
        """
        supported_formats = ('.mp3', '.wav', '.aac', '.m4a', '.flac', '.ogg', '.wma')
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input file not found: {input_path}")
        file_ext = os.path.splitext(input_path)[1].lower()
        if file_ext not in supported_formats:
            raise ValueError(f"Unsupported audio format: {file_ext}")
        temp_dir = os.path.join(tempfile.gettempdir(), "whisper_temp")
        os.makedirs(temp_dir, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
        wav_path = os.path.join(temp_dir, f"temp_{timestamp}.wav")
        # Whisper models expect 16 kHz mono audio.
        audio = AudioSegment.from_file(input_path)
        audio = audio.set_frame_rate(16000).set_channels(1)
        audio.export(wav_path, format="wav")
        self.temp_files.append(wav_path)
        print(f"Converted to temporary WAV: {wav_path}")
        return wav_path

    def transcribe_audio(self, audio_path):
        """Transcribe ``audio_path``, converting it to WAV first if needed.

        Returns a dict with ``raw_transcription`` (full text), ``segments``
        (per-segment timing/text/confidence/overlap flags) and
        ``audio_metadata``.
        """
        if not audio_path.lower().endswith('.wav'):
            # convert_to_whisper_format() returns a valid path or raises,
            # so no extra None check is needed here.
            audio_path = self.convert_to_whisper_format(audio_path)
        print("Transcribing audio with Faster-Whisper...")
        segments, info = self.model.transcribe(
            audio_path,
            beam_size=5,
            language="en"
        )
        texts = []
        segment_list = []
        prev_end = None
        for seg in segments:
            text = seg.text.strip()
            # NOTE: avg_logprob is a log-probability (<= 0), not a true 0-1
            # confidence; kept under the "confidence" key for compatibility.
            confidence = getattr(seg, 'avg_logprob', 1.0)
            segment_list.append({
                "start": seg.start,
                "end": seg.end,
                "text": text,
                "confidence": round(confidence, 3),
                # Flag segments whose start precedes the previous segment's end.
                "overlap": prev_end is not None and seg.start < prev_end
            })
            prev_end = seg.end
            texts.append(text)
        return {
            "raw_transcription": " ".join(texts).strip(),
            "segments": segment_list,
            "audio_metadata": {
                "original_path": audio_path,
                "sample_rate": 16000,
                # Duration (seconds) reported by faster-whisper; avoids
                # re-reading the whole WAV just to measure it.
                "duration": info.duration
            }
        }

    def translate_to_malayalam(self, text_or_dict):
        """Translate English text to Malayalam.

        Accepts either a plain string or the dict returned by
        :meth:`transcribe_audio`; for a dict, the translation is stored under
        ``translated_malayalam`` and the dict is returned. On any failure the
        input is returned unchanged (best-effort, matching prior behavior).
        """
        try:
            if isinstance(text_or_dict, dict):
                text = text_or_dict.get('raw_transcription', '')
            else:
                text = text_or_dict
            if not text.strip():
                raise ValueError("No text found for translation")
            print("Translating to Malayalam...")
            translator = GoogleTranslator(source='en', target='ml')
            # Translate in chunks so long transcripts do not exceed the
            # service's payload limit; short text is a single chunk, which
            # matches the previous single-call behavior.
            chunk_size = self._MAX_TRANSLATE_CHARS
            pieces = [
                translator.translate(text[i:i + chunk_size])
                for i in range(0, len(text), chunk_size)
            ]
            ml_text = " ".join(pieces)
            if isinstance(text_or_dict, dict):
                text_or_dict['translated_malayalam'] = ml_text
                return text_or_dict
            return ml_text
        except Exception as e:
            print(f"Translation error: {str(e)}")
            return text_or_dict

    def cleanup(self):
        """Delete all tracked temporary files and reset the tracking list."""
        for file_path in self.temp_files:
            try:
                if os.path.exists(file_path):
                    os.remove(file_path)
            except Exception as e:
                print(f"Error deleting temp file {file_path}: {str(e)}")
        self.temp_files = []
# Shared multilingual sentiment model (predicts 1-5 star ratings), used by
# analyze_sentiment_batch(); placed on GPU 0 when CUDA is available.
_SENTIMENT_DEVICE = 0 if torch.cuda.is_available() else -1
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
    device=_SENTIMENT_DEVICE,
)
def split_into_sentences(text):
    """Split ``text`` into sentences with NLTK, with a regex fallback.

    Returns a list of non-empty, stripped sentences; an empty list for
    blank input.
    """
    try:
        sentences = nltk.sent_tokenize(text)
    except Exception as e:
        # punkt data may be missing or nltk unusable; fall back to splitting
        # on sentence-final punctuation instead of returning the whole text
        # as one giant "sentence".
        print(f"Sentence splitting failed: {e}")
        import re
        sentences = re.split(r'(?<=[.!?])\s+', text)
    return [s.strip() for s in sentences if s.strip()]
# Model label substring -> fixed sentiment label and score.
_STAR_SENTIMENT_MAP = (
    ("1 star", {"label": "very negative", "score": 0.1}),
    ("2 stars", {"label": "negative", "score": 0.3}),
    ("3 stars", {"label": "neutral", "score": 0.5}),
    ("4 stars", {"label": "positive", "score": 0.7}),
    ("5 stars", {"label": "very positive", "score": 0.9}),
)

def analyze_sentiment_batch(texts):
    """Run the shared sentiment pipeline over ``texts``.

    Returns one ``{"label", "score"}`` dict per input sentence, mapping the
    model's 1-5 star labels to coarse sentiment buckets; unknown labels
    default to neutral. An empty input yields an empty list without calling
    the pipeline (which may reject an empty batch).
    """
    if not texts:
        return []
    outputs = []
    for result in sentiment_pipeline(texts):
        label = result['label']
        for needle, sentiment in _STAR_SENTIMENT_MAP:
            if needle in label:
                # Copy so callers cannot mutate the shared mapping.
                outputs.append(dict(sentiment))
                break
        else:
            outputs.append({"label": "neutral", "score": 0.5})
    return outputs
# Interest-level intents, in tie-breaking priority order.
_INTEREST_LEVELS = ("Strong_interest", "Moderate_interest", "No_interest")

# Keyword table hoisted to module level so it is built once, not rebuilt on
# every call to detect_intent().
_INTENT_KEYWORDS = {
    "en": {
        # Interest Levels
        "Strong_interest": [
            "yes", "definitely", "ready", "want to join", "interested",
            "share details", "send brochure", "i'll join", "let's proceed",
            "where do i sign", "how to apply", "when can i start", "accept",
            "looking forward", "excited", "happy to", "glad to", "eager",
            "share it", "i will come", "i'm in"
        ],
        "Moderate_interest": [
            "maybe", "consider", "think about", "let me think", "tell me more",
            "more details", "explain", "clarify", "not sure", "possibly",
            "might", "could be", "depends", "need to check", "will decide",
            "get back", "discuss", "consult", "review", "evaluate"
        ],
        "No_interest": [
            "no", "not interested", "can't", "won't", "don't like",
            "not now", "later", "not suitable", "inconvenient", "decline",
            "pass", "refuse", "reject", "not for me", "not my field"
        ],
        # Conversation Categories
        "Qualification_query": [
            "qualification", "education", "degree", "studying", "course",
            "background", "academics", "university", "college", "bsc",
            "graduate", "year of study", "curriculum", "syllabus"
        ],
        "Internship_details": [
            "internship", "program", "duration", "months", "period",
            "schedule", "timing", "timeframe", "1 to 3", "three months",
            "structure", "plan", "framework"
        ],
        "Location_query": [
            "online", "offline", "location", "place", "where",
            "address", "relocate", "relocating", "from", "coming",
            "kozhikode", "kochi", "palarivattam", "hybrid", "remote"
        ],
        "Certificate_query": [
            "certificate", "certification", "document", "proof",
            "experience certificate", "training certificate", "letter",
            "completion", "award", "recognition"
        ],
        "Fee_query": [
            "fee", "payment", "cost", "amount", "charge",
            "6000", "six thousand", "money", "stipend", "salary",
            "compensation", "paid", "free"
        ],
        "Project_details": [
            "live project", "work", "assignment", "task", "project",
            "trainee", "superiors", "team", "collaborate", "develop",
            "build", "create", "implement", "hands-on", "practical"
        ],
        "Confirmation": [
            "ok", "looking for", "interested", "send whatsapp", "got it",
            "acknowledge", "noted", "please send", "sent details", "agreed"
        ]
    },
    "ml": {
        # Interest Levels
        "Strong_interest": [
            "เดคเดฏเตเดฏเดพเดฑเดพเดฃเต", "เดเดตเดถเตเดฏเดฎเตเดฃเตเดเต", "เดเตเดฏเตเดฏเดพเด", "เดเดเตเดฐเดนเดฎเตเดฃเตเดเต",
            "เดเดทเตเดเดฎเดพเดฃเต", "เด เดฑเดฟเดฏเดฟเดเตเดเตเดณเต", "เดฌเตเดฐเตเดทเตผ เดตเตเดฃเด", "เดตเดฟเดถเดฆเดพเดเดถเดเตเดเตพ เดตเตเดฃเด",
            "เดถเตเดฏเตผ เดเตเดฏเตเดฏเตเด", "เดเดพเตป เดตเดฐเดพเด", "เดเดคเตเดธเดพเดนเด", "เดคเดพเดคเตเดชเดฐเตเดฏเด",
            "เดธเดฎเตเดฎเดคเด", "เด เดเดเตเดเดฐเดฟเดเตเดเตเดจเตเดจเต", "เดนเดพเดชเตเดชเดฟเดฏเดพเดฃเต", "เดเดพเตป เดเตเดฏเตเดฏเดพเด",
            "เดจเดฟเดถเตเดเดฟเดคเดฎเดพเดฏเดฟ", "เดเดตเดถเตเดฏเดฎเดพเดฃเต"
        ],
        "Moderate_interest": [
            "เดเดฒเตเดเดฟเดเตเดเดพเด", "เดจเตเดเตเดเดพเด", "เดคเดพเดฒเตเดชเดฐเตเดฏเดฎเตเดฃเตเดเต", "เดเดจเตเดฑเตเดฑเดธเตเดฑเตเดฑเดกเต",
            "เดชเดฑเดฏเดพเด", "เดเตเดทเดฃเดฟเดเตเดเตเด", "เดเดฟเดจเตเดคเดฟเดเตเดเดพเด", "เดเดพเดฃเดพเด", "เดเดคเตเดคเดฐเดฎเดฟเดฒเตเดฒ",
            "เดเตเดเตเดคเตฝ เดตเดฟเดตเดฐเดเตเดเตพ", "เดตเตเดฏเดพเดเตเดฏเดพเดจเดฟเดเตเดเตเด", "เด เดตเดฒเดเดฌเดฟเดเตเดเตเด"
        ],
        "No_interest": [
            "เดเดฒเตเดฒ", "เดตเตเดฃเตเด", "เดธเดพเดงเตเดฏเดฎเดฒเตเดฒ", "เดเดทเตเดเดฎเดฒเตเดฒ", "เดเดเตเดเดจเตเดฏเดฒเตเดฒ",
            "เดจเดฟเดฐเดธเดฟเดเตเดเตเด", "เด เดจเดพเดตเดถเตเดฏเดฎเดพเดฃเต", "เดชเดฟเดจเตเดคเดฟเดฐเดฟเดฏเตเด", "เดเดคเดฒเตเดฒ", "เดจเดฟเดทเตเดงเด"
        ],
        # Conversation Categories
        "Qualification_query": [
            "เดตเดฟเดฆเตเดฏเดพเดญเตเดฏเดพเดธเด", "เดกเดฟเดเตเดฐเดฟ", "เดฌเดฟเดธเดฟ", "เดชเด เดฟเดเตเดเตเดจเตเดจเต",
            "เดชเด เดจเด", "เด เดงเตเดฏเดฏเดจเด", "เดเตเดฒเดพเดธเต", "เดตเตผเดทเด",
            "เดเตเดดเตโเดธเต", "เดธเดฟเดฒเดฌเดธเต", "เดตเดฟเดฆเตเดฏเดพเตผเดฅเดฟ", "เดเดฃเดฟเดคเด", "เดธเดฏเตปเดธเต"
        ],
        "Internship_details": [
            "เดเดจเตเดฑเตเดฃเตเดทเดฟเดชเต", "เดชเดฐเดฟเดถเตเดฒเดจเด", "เดชเตเดฐเตเดเตเดฐเดพเด",
            "เดฎเดพเดธเด", "เดธเดฎเดฏเดเตเดฐเดฎเด", "เดเตเดฎเดฟเดเดเต", "1 เดฎเตเดคเตฝ 3 เดตเดฐเต",
            "เด เดตเดธเดพเดจ เดตเตผเดทเด", "เดฒเตเดตเต", "เดซเตเดฐเตเดฏเดฟเดเดตเตผเดเตเดเต", "เดธเตเดฅเดฟเดฐเดฎเดพเดฏเดฟ"
        ],
        "Location_query": [
            "เดเตบเดฒเตเตป", "เดเดซเตเดฒเตเตป", "เดธเตเดฅเดฒเด", "เดตเดฟเดฒเดพเดธเด", "เดเดดเดฟเดเตเดเต",
            "เดเดตเดฟเดเต", "เดเตเดดเดฟเดเตเดเตเดเต", "เดชเดพเดฒเดพเดฐเดฟเดตเดเตเดเด", "เดฎเดพเดฑเตเดฑเด",
            "เดฑเดฟเดฒเตเดเตเดเตเดฑเตเดฑเต", "เดตเดฐเตเดจเตเดจเต", "เดเดตเดฟเดเต เดจเดฟเดจเตเดจเดพเดฃเต", "เดนเตเดฌเตเดฐเดฟเดกเต"
        ],
        "Certificate_query": [
            "เดธเตผเดเตเดเดฟเดซเดฟเดเตเดเดฑเตเดฑเต", "เดกเตเดเตเดฏเตเดฎเตเดจเตเดฑเต", "เด เดจเตเดญเดต เดธเตผเดเตเดเดฟเดซเดฟเดเตเดเดฑเตเดฑเต",
            "เดชเดฐเดฟเดถเตเดฒเดจ เดธเตผเดเตเดเดฟเดซเดฟเดเตเดเดฑเตเดฑเต", "เด เดตเดพเตผเดกเต", "เดฐเดเดฟเดธเตเดเตเดฐเตเดทเตป",
            "เดชเตเดฐเดฎเดพเดฃเด", "เดธเดพเดเตเดทเตเดฏเดชเดคเตเดฐเด", "เดเดฎเตเดชเตเดฒเตเดทเตป"
        ],
        "Fee_query": [
            "เดซเตเดธเต", "เดชเดฃเด", "6000", "เดเดฑเต เดเดฏเดฟเดฐเด", "เดเดพเดฃเดฟเดเตเดเต",
            "เดฎเดพเดธเดคเตเดเตเดเดฟ", "เดเดพเตผเดเต", "เดฑเตเดฎเดฃเดฑเตเดทเตป", "เดซเตเดฐเต",
            "เดถเดฎเตเดชเดณเด", "เดธเตเดฑเตเดฑเตเดชเตเตปเดกเต"
        ],
        "Project_details": [
            "เดชเตเดฐเตเดเดเตเดเต", "เดฒเตเดตเต เดชเตเดฐเตเดเดเตเดเต", "เดชเตเดฐเดตเตเดคเตเดคเดฟ", "เดเดพเดธเตโเดเต",
            "เดเตเด", "เดฎเตเดงเดพเดตเดฟ", "เดเตเดฐเตเดฏเดฟเดจเดฟ", "เดธเดนเดชเตเดฐเดตเตผเดคเตเดคเดจเด",
            "เดกเดตเดฒเดชเตเดชเตเดเตเดฏเตเดฏเตเด", "เดธเตเดทเตเดเดฟเดเตเดเตเด", "เดเดฎเตเดชเตเดฒเดฟเดฎเตเดจเตเดฑเตเดเตเดฏเตเดฏเตเด",
            "เดชเตเดฐเดพเดฏเตเดเดฟเดเด", "เด เดญเตเดฏเดพเดธเด"
        ],
        "Confirmation": [
            "เดถเดฐเดฟ", "เดคเดพเดฒเตเดชเดฐเตเดฏเดฎเตเดฃเตเดเต", "เดเดทเตเดเดฎเตเดฃเตเดเต", "เดตเดพเดเตเดธเดพเดชเตเดชเดฟเตฝ เด เดฏเดเตเดเต",
            "เดตเดพเดเตเดธเดพเดชเตเดชเต", "เดตเดพเดเตเดเตเดธเดพเดชเตเดชเต", "เดเดฟเดเตเดเดฟ", "เด เดฑเดฟเดฏเดฟเดเตเดเต",
            "เดจเตเดเตเดเต เดเตเดฏเตเดคเต", "เดธเดฎเตเดฎเดคเด", "เดฌเตเดงเดฟเดเตเดเดฟเดเตเดเตเดฃเตเดเต",
            "เด เดเดเตเดเดฐเดฟเดเตเดเต", "เด เดเตเดเตเดจเดฒเดกเตเดเต", "เดเตเดฒเดฟเดฏเตผ",
            "เดคเดฏเดพเดฑเดพเดฃเต", "เด เดฑเดฟเดฏเดฟเดชเตเดชเต เดฒเดญเดฟเดเตเดเต"
        ]
    }
}

def detect_intent(text, language="en"):
    """Enhanced intent detection for internship interest analysis in English and Malayalam.

    Args:
        text: sentence to classify (matched case-insensitively).
        language: "en" or "ml"; unknown codes fall back to the English
            keyword table instead of raising KeyError.

    Returns:
        An interest level ("Strong_interest" / "Moderate_interest" /
        "No_interest"), a conversation category (e.g. "Fee_query"), or
        "Neutral_response" when nothing matches.
    """
    text_lower = text.lower().strip()
    keywords_by_intent = _INTENT_KEYWORDS.get(language, _INTENT_KEYWORDS["en"])
    # Step 1: Detect interest level. The longest matching keyword wins, so a
    # specific phrase like "not interested" (No_interest) is not shadowed by
    # its substring "interested" (Strong_interest); ties keep the original
    # Strong > Moderate > No priority.
    best_level = None
    best_key = (0, 0)
    for rank, level in enumerate(_INTEREST_LEVELS):
        for keyword in keywords_by_intent[level]:
            if keyword in text_lower:
                candidate = (len(keyword), -rank)
                if candidate > best_key:
                    best_key = candidate
                    best_level = level
    if best_level is not None:
        return best_level
    # Step 2: Detect conversation category (first declared match wins,
    # preserving the original dict-order semantics).
    for intent, keywords in keywords_by_intent.items():
        if intent in _INTEREST_LEVELS:
            continue
        if any(keyword in text_lower for keyword in keywords):
            return intent
    return "Neutral_response"
def analyze_text(text, language="en"):
    """Produce per-sentence analysis records for ``text``.

    Each record carries an id, the sentence, its detected intent, mapped
    sentiment label/score, and simple length statistics. Returns an empty
    list when the text yields no sentences.
    """
    sentences = split_into_sentences(text)
    if not sentences:
        return []
    sentiments = analyze_sentiment_batch(sentences)
    records = []
    for idx, (sentence, sentiment) in enumerate(zip(sentences, sentiments), start=1):
        records.append({
            "sentence_id": f"{language}_{idx}",
            "text": sentence,
            "language": language,
            "intent": detect_intent(sentence, language),
            "sentiment": sentiment["label"],
            "sentiment_score": sentiment["score"],
            "word_count": len(sentence.split()),
            "char_count": len(sentence),
        })
    return records
def save_analysis_to_csv(analysis, filename_prefix):
    """Write analysis records to a timestamped CSV under ``analysis_results/``.

    Returns the file path on success, or None when there is nothing to save.
    """
    if not analysis:
        print("No analysis data to save")
        return None
    out_dir = "analysis_results"
    os.makedirs(out_dir, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    full_path = os.path.join(out_dir, f"{filename_prefix}_analysis_{stamp}.csv")
    # utf-8-sig adds a BOM so spreadsheet apps render non-ASCII text correctly.
    pd.DataFrame(analysis).to_csv(full_path, index=False, encoding='utf-8-sig')
    print(f"โ Analysis saved to {full_path}")
    return full_path
def compare_analyses(en_analysis, ml_analysis):
    """Pair up English and Malayalam sentence records positionally.

    Returns one comparison record per pair with intent agreement and the
    absolute sentiment-score difference; extra records on either side are
    ignored (zip semantics).
    """
    return [
        {
            "sentence_id": en["sentence_id"],
            "english_text": en["text"],
            "malayalam_text": ml["text"],
            "intent_match": en["intent"] == ml["intent"],
            "english_intent": en["intent"],
            "malayalam_intent": ml["intent"],
            "sentiment_diff": abs(en["sentiment_score"] - ml["sentiment_score"]),
            "english_sentiment": en["sentiment"],
            "malayalam_sentiment": ml["sentiment"],
        }
        for en, ml in zip(en_analysis, ml_analysis)
    ]
def print_analysis_summary(analysis, title):
    """Print sentence count, sentiment/intent distributions and mean score."""
    print(f"\n=== {title} Analysis Summary ===")
    print(f"Total Sentences: {len(analysis)}")
    if not analysis:
        return
    for heading, field in (("Sentiment", "sentiment"), ("Intent", "intent")):
        counts = pd.Series([row[field] for row in analysis]).value_counts()
        print(f"\n{heading} Distribution:")
        print(counts.to_string())
    avg_score = sum(row["sentiment_score"] for row in analysis) / len(analysis)
    print(f"\nAverage Sentiment Score: {avg_score:.2f}")
if __name__ == "__main__":
    transcriber = MalayalamTranscriptionPipeline()
    try:
        audio_path = input("Enter path to Malayalam audio file: ").strip()
        if not os.path.exists(audio_path):
            print("Error: File not found")
            # raise SystemExit instead of the site-provided exit(); it is not
            # swallowed by the broad except below and finally still runs.
            raise SystemExit(1)
        print("\n๐ Transcribing audio...")
        results = transcriber.transcribe_audio(audio_path)
        if not results or not results.get("raw_transcription"):
            print("Transcription failed.")
            raise SystemExit(1)
        raw_transcription = results["raw_transcription"]
        print("\n=== Raw English Transcription ===")
        print(raw_transcription)
        print("\n๐ Translating to Malayalam...")
        results = transcriber.translate_to_malayalam(results)
        ml_translation = results.get("translated_malayalam", "")
        print("\n=== Malayalam Translation ===")
        print(ml_translation)
        print("\n๐ Analyzing texts...")
        en_analysis = analyze_text(raw_transcription, "en")
        ml_analysis = analyze_text(ml_translation, "ml")
        en_csv = save_analysis_to_csv(en_analysis, "english")
        ml_csv = save_analysis_to_csv(ml_analysis, "malayalam")
        comparison = compare_analyses(en_analysis, ml_analysis)
        comparison_csv = save_analysis_to_csv(comparison, "comparison")
        print_analysis_summary(en_analysis, "English")
        print_analysis_summary(ml_analysis, "Malayalam")
        print("\n=== Translation Accuracy Insights ===")
        if comparison:
            intent_matches = sum(1 for item in comparison if item["intent_match"])
            print(f"Intent Match Rate: {intent_matches / len(comparison):.1%}")
            avg_sentiment_diff = sum(item["sentiment_diff"] for item in comparison) / len(comparison)
            print(f"Average Sentiment Difference: {avg_sentiment_diff:.2f}")
        else:
            # Guard: comparison is empty when either analysis produced no
            # sentences; the divisions above would raise ZeroDivisionError.
            print("No comparable sentences available.")
        # Lead Score: mean sentiment across both languages, scaled to 0-100.
        en_avg_score = sum(item["sentiment_score"] for item in en_analysis) / len(en_analysis) if en_analysis else 0
        ml_avg_score = sum(item["sentiment_score"] for item in ml_analysis) / len(ml_analysis) if ml_analysis else 0
        combined_avg = (en_avg_score + ml_avg_score) / 2
        lead_score = int(combined_avg * 100)
        print(f"\n=== Lead Score ===")
        print(f"Calculated Lead Score: {lead_score}/100")
        if lead_score >= 70:
            print("Interpretation: High interest lead")
        elif lead_score >= 40:
            print("Interpretation: Moderate interest lead")
        else:
            print("Interpretation: Low interest lead")
    except Exception as e:
        print(f"\nโ An error occurred: {str(e)}")
    finally:
        transcriber.cleanup()