# Source: Hugging Face Space "sreenathsree1578/sree" (status: Sleeping)
# File size: 19,487 bytes; revision 8486e4e

import os
import tempfile
from datetime import datetime
import torch
from pydub import AudioSegment
from deep_translator import GoogleTranslator
from transformers import pipeline
import pandas as pd
import nltk
from faster_whisper import WhisperModel
import tempfile

# Register a project-local "nltk_data" directory (created under the current
# working directory if it does not exist) as an additional NLTK search path.
# NOTE(review): nothing in this file downloads tokenizer data into this
# directory -- confirm the resources used by nltk.sent_tokenize are
# provisioned elsewhere.
nltk_data_dir = os.path.join(os.getcwd(), "nltk_data")
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.data.path.append(nltk_data_dir)



class MalayalamTranscriptionPipeline:
    """Transcribe an audio file with Faster-Whisper and translate the text to Malayalam.

    Temporary WAV files created during format conversion are tracked in
    ``self.temp_files`` and removed by :meth:`cleanup`.
    """

    # GoogleTranslator rejects inputs longer than 5000 characters, so long
    # transcripts are translated in chunks kept safely below that limit.
    _MAX_TRANSLATE_CHARS = 4500

    def __init__(self, model_size="large-v1"):
        """Load the Faster-Whisper model.

        Args:
            model_size: Faster-Whisper model name passed to ``WhisperModel``.

        The model runs on CUDA with float16 when a GPU is available,
        otherwise on CPU with int8 quantisation.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Loading Faster-Whisper {model_size} model on {self.device}...")
        compute_type = "float16" if self.device == "cuda" else "int8"
        self.model = WhisperModel(model_size, device=self.device, compute_type=compute_type)
        self.temp_files = []

    def convert_to_whisper_format(self, input_path):
        """Convert an audio file to a temporary 16 kHz mono WAV file.

        Args:
            input_path: Path to an existing audio file with a supported extension.

        Returns:
            Path of the temporary WAV file (also recorded in ``self.temp_files``).

        Raises:
            FileNotFoundError: If ``input_path`` does not exist.
            ValueError: If the file extension is not a supported audio format.
        """
        supported_formats = ['.mp3', '.wav', '.aac', '.m4a', '.flac', '.ogg', '.wma']
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input file not found: {input_path}")
        file_ext = os.path.splitext(input_path)[1].lower()
        if file_ext not in supported_formats:
            raise ValueError(f"Unsupported audio format: {file_ext}")

        temp_dir = os.path.join(tempfile.gettempdir(), "whisper_temp")
        os.makedirs(temp_dir, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
        # A timestamp alone only has one-second resolution, so two conversions
        # within the same second would overwrite each other; mkstemp guarantees
        # a unique file name.
        fd, wav_path = tempfile.mkstemp(
            prefix=f"temp_{timestamp}_", suffix=".wav", dir=temp_dir
        )
        os.close(fd)
        # Register before converting so a failed export is still cleaned up.
        self.temp_files.append(wav_path)

        audio = AudioSegment.from_file(input_path)
        audio = audio.set_frame_rate(16000).set_channels(1)
        audio.export(wav_path, format="wav")

        print(f"Converted to temporary WAV: {wav_path}")
        return wav_path

    def transcribe_audio(self, audio_path):
        """Transcribe an audio file and return the text with per-segment details.

        Non-WAV inputs are first converted with :meth:`convert_to_whisper_format`.

        Args:
            audio_path: Path to the audio file to transcribe.

        Returns:
            A dict with ``raw_transcription`` (joined segment text),
            ``segments`` (list of dicts with ``start``, ``end``, ``text``,
            ``confidence`` and ``overlap``) and ``audio_metadata``; or ``None``
            if conversion yields no path.

        Note:
            ``confidence`` is the segment's ``avg_logprob`` (an average log
            probability) when the segment exposes it, otherwise 1.0.
        """
        source_path = audio_path
        wav_path = audio_path
        if not audio_path.lower().endswith('.wav'):
            wav_path = self.convert_to_whisper_format(audio_path)
            if not wav_path:
                return None

        print("Transcribing audio with Faster-Whisper...")
        # NOTE(review): decoding is forced to English although the calling
        # script asks for Malayalam audio -- presumably intentional, so that
        # the output is English text; confirm with the author.
        segments, info = self.model.transcribe(
            wav_path,
            beam_size=5,
            language="en"
        )

        texts = []
        segment_list = []
        for seg in segments:
            text = seg.text.strip()
            confidence = getattr(seg, 'avg_logprob', 1.0)
            segment_list.append({
                "start": seg.start,
                "end": seg.end,
                "text": text,
                "confidence": round(confidence, 3),
                # True when this segment starts before the previous one ended.
                "overlap": bool(segment_list) and seg.start < segment_list[-1]["end"]
            })
            texts.append(text)

        return {
            "raw_transcription": " ".join(texts).strip(),
            "segments": segment_list,
            "audio_metadata": {
                # Report the caller's file, not the temporary WAV that
                # cleanup() will delete.
                "original_path": source_path,
                "sample_rate": 16000,
                "duration": len(AudioSegment.from_wav(wav_path)) / 1000
            }
        }

    @staticmethod
    def _split_for_translation(text, max_chars=4500):
        """Split *text* into chunks of at most *max_chars*, breaking at spaces when possible."""
        if max_chars < 1:
            raise ValueError("max_chars must be at least 1")
        chunks = []
        remaining = text.strip()
        while len(remaining) > max_chars:
            cut = remaining.rfind(" ", 0, max_chars + 1)
            if cut <= 0:
                # No space inside the window: fall back to a hard cut.
                cut = max_chars
            chunks.append(remaining[:cut].strip())
            remaining = remaining[cut:].strip()
        if remaining:
            chunks.append(remaining)
        return chunks

    def translate_to_malayalam(self, text_or_dict):
        """Translate English text to Malayalam using GoogleTranslator.

        Args:
            text_or_dict: Either the text itself, or a result dict whose
                ``raw_transcription`` entry holds the text.

        Returns:
            For a dict input, the same dict with ``translated_malayalam``
            added; for a string input, the translated string. On any failure
            the error is printed and the input is returned unchanged
            (best-effort behaviour).
        """
        try:
            if isinstance(text_or_dict, dict):
                text = text_or_dict.get('raw_transcription', '')
            else:
                text = text_or_dict

            if not text or not text.strip():
                raise ValueError("No text found for translation")

            print("Translating to Malayalam...")
            translator = GoogleTranslator(source='en', target='ml')
            # Long transcripts exceed the translator's per-request limit, so
            # translate chunk by chunk and stitch the results together.
            chunks = self._split_for_translation(text, self._MAX_TRANSLATE_CHARS)
            ml_text = " ".join(
                translator.translate(chunk) or "" for chunk in chunks
            ).strip()

            if isinstance(text_or_dict, dict):
                text_or_dict['translated_malayalam'] = ml_text
                return text_or_dict
            else:
                return ml_text
        except Exception as e:
            print(f"Translation error: {str(e)}")
            return text_or_dict

    def cleanup(self):
        """Delete all tracked temporary files and reset the tracking list.

        Deletion errors are printed and do not stop the remaining files from
        being processed.
        """
        for file_path in self.temp_files:
            try:
                if os.path.exists(file_path):
                    os.remove(file_path)
            except Exception as e:
                print(f"Error deleting temp file {file_path}: {str(e)}")
        self.temp_files = []

# Sentiment analysis pipeline
# Built once at import time and shared by analyze_sentiment_batch().
# device=0 selects the first CUDA GPU when one is available; -1 runs on CPU.
# The star-rating labels this model emits ("1 star" ... "5 stars") are mapped
# to sentiment names in analyze_sentiment_batch().
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
    device=0 if torch.cuda.is_available() else -1
)

def split_into_sentences(text):
    """Split *text* into a list of stripped, non-empty sentences.

    NLTK's sentence tokenizer is used when it works. If it fails for any
    reason (for example because its tokenizer data is not installed), the
    error is printed and the text is split after '.', '!' or '?' followed by
    whitespace, so that per-sentence analysis still sees separate sentences
    instead of one large blob.

    Args:
        text: The text to split; ``None`` or blank text yields ``[]``.

    Returns:
        A list of sentence strings, possibly empty.
    """
    if not text or not text.strip():
        return []
    try:
        sentences = nltk.sent_tokenize(text)
    except Exception as e:
        import re  # local import: only needed on the fallback path

        print(f"Sentence splitting failed: {e}")
        sentences = re.split(r"(?<=[.!?])\s+", text.strip())
    return [s.strip() for s in sentences if s.strip()]

def analyze_sentiment_batch(texts):
    """Classify the sentiment of each text and map star labels to named sentiments.

    Args:
        texts: The texts to classify; an empty input returns ``[]`` without
            calling the model.

    Returns:
        One ``{"label": ..., "score": ...}`` dict per model result, where the
        label is one of "very negative", "negative", "neutral", "positive" or
        "very positive" and the score is the fixed value assigned to that
        label (0.1 to 0.9). Unrecognised model labels map to neutral / 0.5.
    """
    if not texts:
        return []

    # Checked in order; the first marker contained in the model label wins.
    star_to_sentiment = (
        ("1 star", "very negative", 0.1),
        ("2 stars", "negative", 0.3),
        ("3 stars", "neutral", 0.5),
        ("4 stars", "positive", 0.7),
        ("5 stars", "very positive", 0.9),
    )

    # truncation=True keeps over-long sentences within the model's maximum
    # input length instead of raising inside the model.
    results = sentiment_pipeline(texts, truncation=True)

    outputs = []
    for result in results:
        label = result['label']
        for marker, name, score in star_to_sentiment:
            if marker in label:
                outputs.append({"label": name, "score": score})
                break
        else:
            outputs.append({"label": "neutral", "score": 0.5})
    return outputs

# Keyword table used by detect_intent(): language code -> intent label ->
# keywords/phrases. Built once at import time instead of on every call.
# The first three labels per language are interest levels; the remaining
# labels are conversation categories, checked in the order listed.
_INTENT_KEYWORDS = {
    "en": {
        # Interest Levels
        "Strong_interest": [
            "yes", "definitely", "ready", "want to join", "interested", 
            "share details", "send brochure", "i'll join", "let's proceed",
            "where do i sign", "how to apply", "when can i start", "accept",
            "looking forward", "excited", "happy to", "glad to", "eager",
            "share it", "i will come", "i'm in"
        ],
        "Moderate_interest": [
            "maybe", "consider", "think about", "let me think", "tell me more",
            "more details", "explain", "clarify", "not sure", "possibly",
            "might", "could be", "depends", "need to check", "will decide",
            "get back", "discuss", "consult", "review", "evaluate"
        ],
        "No_interest": [
            "no", "not interested", "can't", "won't", "don't like",
            "not now", "later", "not suitable", "inconvenient", "decline",
            "pass", "refuse", "reject", "not for me", "not my field"
        ],

        # Conversation Categories
        "Qualification_query": [
            "qualification", "education", "degree", "studying", "course",
            "background", "academics", "university", "college", "bsc",
            "graduate", "year of study", "curriculum", "syllabus"
        ],
        "Internship_details": [
            "internship", "program", "duration", "months", "period",
            "schedule", "timing", "timeframe", "1 to 3", "three months",
            "structure", "plan", "framework"
        ],
        "Location_query": [
            "online", "offline", "location", "place", "where",
            "address", "relocate", "relocating", "from", "coming",
            "kozhikode", "kochi", "palarivattam", "hybrid", "remote"
        ],
        "Certificate_query": [
            "certificate", "certification", "document", "proof",
            "experience certificate", "training certificate", "letter",
            "completion", "award", "recognition"
        ],
        "Fee_query": [
            "fee", "payment", "cost", "amount", "charge",
            "6000", "six thousand", "money", "stipend", "salary",
            "compensation", "paid", "free"
        ],
        "Project_details": [
            "live project", "work", "assignment", "task", "project",
            "trainee", "superiors", "team", "collaborate", "develop",
            "build", "create", "implement", "hands-on", "practical"
        ],
        "Confirmation": [
            "ok", "looking for", "interested", "send whatsapp", "got it",
            "acknowledge", "noted", "please send", "sent details", "agreed"
        ]
    },

    "ml": {
        # Interest Levels
        "Strong_interest": [
            "തയ്യാറാണ്", "ആവശ്യമുണ്ട്", "ചെയ്യാം", "ആഗ്രഹമുണ്ട്", 
            "ഇഷ്ടമാണ്", "അറിയിച്ചോളൂ", "ബ്രോഷർ വേണം", "വിശദാംശങ്ങൾ വേണം",
            "ശെയർ ചെയ്യുക", "ഞാൻ വരാം", "ഉത്സാഹം", "താത്പര്യം",
            "സമ്മതം", "അംഗീകരിക്കുന്നു", "ഹാപ്പിയാണ്", "ഞാൻ ചെയ്യാം",
            "നിശ്ചിതമായി", "ആവശ്യമാണ്"
        ],
        "Moderate_interest": [
            "ആലോചിക്കാം", "നോക്കാം", "താല്പര്യമുണ്ട്", "ഇന്റെറസ്റ്റഡ്",
            "പറയാം", "ക്ഷണിക്കുക", "ചിന്തിക്കാം", "കാണാം", "ഉത്തരമില്ല",
            "കൂടുതൽ വിവരങ്ങൾ", "വ്യാഖ്യാനിക്കുക", "അവലംബിക്കുക"
        ],
        "No_interest": [
            "ഇല്ല", "വേണ്ട", "സാധ്യമല്ല", "ഇഷ്ടമല്ല", "ഇങ്ങനെയല്ല",
            "നിരസിക്കുക", "അനാവശ്യമാണ്", "പിന്തിരിയുക", "ഇതല്ല", "നിഷേധം"
        ],

        # Conversation Categories
        "Qualification_query": [
            "വിദ്യാഭ്യാസം", "ഡിഗ്രി", "ബിസി", "പഠിക്കുന്നു", 
            "പഠനം", "അധ്യയനം", "ക്ലാസ്", "വർഷം", 
            "കോഴ്‌സ്", "സിലബസ്", "വിദ്യാർഥി", "ഗണിതം", "സയൻസ്"
        ],
        "Internship_details": [
            "ഇന്റെണ്ഷിപ്", "പരിശീലനം", "പ്രോഗ്രാം", 
            "മാസം", "സമയക്രമം", "ടൈമിംഗ്", "1 മുതൽ 3 വരെ", 
            "അവസാന വർഷം", "ലൈവ്", "ഫ്രെയിംവർക്ക്", "സ്ഥിരമായി"
        ],
        "Location_query": [
            "ഓൺലൈൻ", "ഓഫ്ലൈൻ", "സ്ഥലം", "വിലാസം", "കഴിഞ്ഞ്", 
            "എവിടെ", "കൊഴിക്കോട്", "പാലാരിവട്ടം", "മാറ്റം", 
            "റിലൊക്കേറ്റ്", "വരുന്നു", "എവിടെ നിന്നാണ്", "ഹൈബ്രിഡ്"
        ],
        "Certificate_query": [
            "സർട്ടിഫിക്കറ്റ്", "ഡോക്യുമെന്റ്", "അനുഭവ സർട്ടിഫിക്കറ്റ്", 
            "പരിശീലന സർട്ടിഫിക്കറ്റ്", "അവാർഡ്", "രജിസ്ട്രേഷൻ", 
            "പ്രമാണം", "സാക്ഷ്യപത്രം", "കമ്പ്ലീഷൻ"
        ],
        "Fee_query": [
            "ഫീസ്", "പണം", "6000", "ആറ് ആയിരം", "കാണിക്ക്", 
            "മാസതൊട്ടി", "ചാർജ്", "റുമണറേഷൻ", "ഫ്രീ", 
            "ശമ്പളം", "സ്റ്റൈപെൻഡ്"
        ],
        "Project_details": [
            "പ്രോജക്ട്", "ലൈവ് പ്രോജക്ട്", "പ്രവൃത്തി", "ടാസ്‌ക്", 
            "ടീം", "മേധാവി", "ട്രെയിനി", "സഹപ്രവർത്തനം", 
            "ഡവലപ്പുചെയ്യുക", "സൃഷ്ടിക്കുക", "ഇമ്പ്ലിമെന്റുചെയ്യുക", 
            "പ്രായോഗികം", "അഭ്യാസം"
        ],
        "Confirmation": [
             "ശരി", "താല്പര്യമുണ്ട്", "ഇഷ്ടമുണ്ട്", "വാട്സാപ്പിൽ അയക്കൂ", 
             "വാട്സാപ്പ്", "വാട്ട്സാപ്പ്", "കിട്ടി", "അറിയിച്ചു", 
             "നോട്ടു ചെയ്തു", "സമ്മതം", "ബോധിച്ചിട്ടുണ്ട്", 
             "അംഗീകരിച്ചു", "അക്ക്നലഡ്ജ്", "ക്ലിയർ", 
             "തയാറാണ്", "അറിയിപ്പ് ലഭിച്ചു"
        ]

    }
}

# Interest-level labels, in the priority order used to break ties.
_INTEREST_LEVELS = ("Strong_interest", "Moderate_interest", "No_interest")


def _keyword_in_text(keyword, text, language):
    """Return True if *keyword* occurs in *text* under the language's matching rule."""
    if language == "en":
        import re  # local import: the module does not import re at file level

        # Whole-word match (allowing a simple s/es/ed/ing ending) so that
        # "no" does not fire on "know" and "fee" does not fire on "feel".
        pattern = r"(?<!\w)" + re.escape(keyword) + r"(?:s|es|ed|ing)?(?!\w)"
        return re.search(pattern, text) is not None
    # Other languages keep plain substring matching: Python's \w does not
    # cover the combining vowel signs used in Malayalam script, so regex word
    # boundaries would fall inside words.
    return keyword in text


def detect_intent(text, language="en"):
    """Enhanced intent detection for internship interest analysis in English and Malayalam.

    The text is first checked against the three interest levels. When
    keywords from several levels match, the longest matching keyword decides
    (so "not interested" beats "interested"); equal lengths fall back to the
    priority Strong > Moderate > No. If no interest keyword matches, the
    first conversation category with a matching keyword is returned.

    Args:
        text: The sentence to classify.
        language: Key into the keyword table, "en" or "ml".

    Returns:
        An intent label, or "Neutral_response" when nothing matches.

    Raises:
        KeyError: If ``language`` has no keyword table.
    """
    keywords_by_intent = _INTENT_KEYWORDS[language]
    # Normalise the typographic apostrophe so keywords such as "can't" match.
    text_lower = text.lower().strip().replace("\u2019", "'")

    # Step 1: Detect interest level (longest matching keyword wins)
    best_level = None
    best_length = 0
    for level in _INTEREST_LEVELS:
        for keyword in keywords_by_intent[level]:
            # Strictly longer only, so ties keep the higher-priority level.
            if len(keyword) > best_length and _keyword_in_text(keyword, text_lower, language):
                best_level = level
                best_length = len(keyword)
    if best_level is not None:
        return best_level

    # Step 2: Detect conversation category
    for intent, keywords in keywords_by_intent.items():
        if intent in _INTEREST_LEVELS:
            continue
        if any(_keyword_in_text(keyword, text_lower, language) for keyword in keywords):
            return intent

    return "Neutral_response"



def analyze_text(text, language="en"):
    """Build a per-sentence analysis of *text*.

    The text is split into sentences, the sentences are scored for sentiment
    in one batch, and each sentence is classified for intent.

    Args:
        text: The text to analyse.
        language: Language code used for intent detection and for the
            ``sentence_id`` prefix.

    Returns:
        A list with one dict per sentence (id, text, language, intent,
        sentiment label and score, word and character counts); an empty list
        when the text contains no sentences.
    """
    sentences = split_into_sentences(text)
    if not sentences:
        return []

    sentiments = analyze_sentiment_batch(sentences)

    rows = []
    for number, sentence in enumerate(sentences, start=1):
        sentiment = sentiments[number - 1]
        detected_intent = detect_intent(sentence, language)
        row = {
            "sentence_id": f"{language}_{number}",
            "text": sentence,
            "language": language,
            "intent": detected_intent,
            "sentiment": sentiment["label"],
            "sentiment_score": sentiment["score"],
            "word_count": len(sentence.split()),
            "char_count": len(sentence)
        }
        rows.append(row)
    return rows

def save_analysis_to_csv(analysis, filename_prefix):
    """Write analysis rows to a timestamped CSV file under ``analysis_results``.

    Args:
        analysis: List of row dicts; nothing is written when it is empty.
        filename_prefix: Leading part of the generated file name.

    Returns:
        The path of the written file, or ``None`` when there was no data.
    """
    if analysis:
        frame = pd.DataFrame(analysis)
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_dir = "analysis_results"
        os.makedirs(output_dir, exist_ok=True)
        full_path = os.path.join(output_dir, f"{filename_prefix}_analysis_{stamp}.csv")
        # utf-8-sig writes a byte-order mark at the start of the file.
        frame.to_csv(full_path, index=False, encoding='utf-8-sig')
        print(f"✅ Analysis saved to {full_path}")
        return full_path

    print("No analysis data to save")
    return None

def compare_analyses(en_analysis, ml_analysis):
    """Pair English and Malayalam sentence analyses by position and compare them.

    Pairing stops at the shorter of the two lists, so unmatched trailing
    sentences are not reported.

    Args:
        en_analysis: Per-sentence analysis rows for the English text.
        ml_analysis: Per-sentence analysis rows for the Malayalam text.

    Returns:
        One dict per pair with both texts, both intents and whether they
        match, both sentiment labels, and the absolute difference of the
        sentiment scores. The ``sentence_id`` is taken from the English row.
    """
    def _compare_pair(en_row, ml_row):
        score_gap = abs(en_row["sentiment_score"] - ml_row["sentiment_score"])
        return {
            "sentence_id": en_row["sentence_id"],
            "english_text": en_row["text"],
            "malayalam_text": ml_row["text"],
            "intent_match": en_row["intent"] == ml_row["intent"],
            "english_intent": en_row["intent"],
            "malayalam_intent": ml_row["intent"],
            "sentiment_diff": score_gap,
            "english_sentiment": en_row["sentiment"],
            "malayalam_sentiment": ml_row["sentiment"]
        }

    return [_compare_pair(en_row, ml_row) for en_row, ml_row in zip(en_analysis, ml_analysis)]

def print_analysis_summary(analysis, title):
    """Print sentence count, sentiment and intent distributions, and mean score.

    Args:
        analysis: Per-sentence analysis rows; when empty only the header and
            the sentence count are printed.
        title: Label used in the summary header.
    """
    total = len(analysis)
    print(f"\n=== {title} Analysis Summary ===")
    print(f"Total Sentences: {total}")
    if total == 0:
        return

    # Each section counts one field's values, then prints heading and table.
    sections = (
        ("sentiment", "\nSentiment Distribution:"),
        ("intent", "\nIntent Distribution:"),
    )
    for field, heading in sections:
        counts = pd.Series([row[field] for row in analysis]).value_counts()
        print(heading)
        print(counts.to_string())

    scores = (row["sentiment_score"] for row in analysis)
    mean_score = sum(scores) / total
    print(f"\nAverage Sentiment Score: {mean_score:.2f}")

# Command-line entry point: transcribe one audio file, translate the text to
# Malayalam, analyse both texts sentence by sentence, save the results as CSV
# files and print summaries plus a lead score.
if __name__ == "__main__":
    transcriber = MalayalamTranscriptionPipeline()

    try:
        audio_path = input("Enter path to Malayalam audio file: ").strip()
        if not os.path.exists(audio_path):
            print("Error: File not found")
            # SystemExit is not caught by "except Exception" below, and the
            # finally clause still runs the cleanup.
            raise SystemExit(1)

        print("\n🔊 Transcribing audio...")
        results = transcriber.transcribe_audio(audio_path)
        if not results or not results.get("raw_transcription"):
            print("Transcription failed.")
            raise SystemExit(1)

        raw_transcription = results["raw_transcription"]
        print("\n=== Raw English Transcription ===")
        print(raw_transcription)

        print("\n🌐 Translating to Malayalam...")
        results = transcriber.translate_to_malayalam(results)
        # Empty when translation failed (the key is then absent).
        ml_translation = results.get("translated_malayalam", "")
        print("\n=== Malayalam Translation ===")
        print(ml_translation)

        print("\n🔍 Analyzing texts...")
        en_analysis = analyze_text(raw_transcription, "en")
        ml_analysis = analyze_text(ml_translation, "ml")

        save_analysis_to_csv(en_analysis, "english")
        save_analysis_to_csv(ml_analysis, "malayalam")

        comparison = compare_analyses(en_analysis, ml_analysis)
        save_analysis_to_csv(comparison, "comparison")

        print_analysis_summary(en_analysis, "English")
        print_analysis_summary(ml_analysis, "Malayalam")

        print("\n=== Translation Accuracy Insights ===")
        if comparison:
            intent_matches = sum(1 for item in comparison if item["intent_match"])
            print(f"Intent Match Rate: {intent_matches / len(comparison):.1%}")
            avg_sentiment_diff = sum(item["sentiment_diff"] for item in comparison) / len(comparison)
            print(f"Average Sentiment Difference: {avg_sentiment_diff:.2f}")
        else:
            # Happens when either analysis is empty, e.g. after a failed
            # translation; avoids dividing by zero.
            print("No sentence pairs available for comparison.")

        # Calculate Lead Score from average sentiment scores.
        # Only languages that actually produced an analysis contribute, so a
        # failed translation does not halve the score.
        language_averages = [
            sum(item["sentiment_score"] for item in analysis) / len(analysis)
            for analysis in (en_analysis, ml_analysis)
            if analysis
        ]
        if language_averages:
            combined_avg = sum(language_averages) / len(language_averages)
        else:
            combined_avg = 0

        # Convert to lead score (0-100 scale); round rather than truncate so
        # floating-point noise (e.g. 57.999...) does not drop a point.
        lead_score = int(round(combined_avg * 100))
        print("\n=== Lead Score ===")
        print(f"Calculated Lead Score: {lead_score}/100")
        if lead_score >= 70:
            print("Interpretation: High interest lead")
        elif lead_score >= 40:
            print("Interpretation: Moderate interest lead")
        else:
            print("Interpretation: Low interest lead")

    except Exception as e:
        print(f"\n❌ An error occurred: {str(e)}")
    finally:
        transcriber.cleanup()