# src/streamlit_app.py — English-audio transcription, Malayalam translation,
# per-sentence sentiment/intent analysis and lead scoring.
# (Renamed from src/Lead_score_conversion.py; Hugging Face commit ec74c1f.)
import os
import re
import tempfile
from datetime import datetime

import nltk
import pandas as pd
import torch
from deep_translator import GoogleTranslator
from faster_whisper import WhisperModel
from pydub import AudioSegment
from transformers import pipeline
nltk_data_dir = os.path.join(os.getcwd(), "nltk_data")
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.data.path.append(nltk_data_dir)
class MalayalamTranscriptionPipeline:
    """Transcribe English speech with Faster-Whisper and translate it to Malayalam.

    Temporary WAV files created during format conversion are tracked in
    ``self.temp_files`` and removed by :meth:`cleanup`.
    """

    def __init__(self, model_size="large-v1"):
        """Load the Faster-Whisper model on GPU when available, else CPU."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Loading Faster-Whisper {model_size} model on {self.device}...")
        # float16 requires CUDA; int8 keeps CPU inference memory-friendly.
        compute_type = "float16" if self.device == "cuda" else "int8"
        self.model = WhisperModel(model_size, device=self.device, compute_type=compute_type)
        self.temp_files = []

    def convert_to_whisper_format(self, input_path):
        """Convert ``input_path`` to a 16 kHz mono WAV and return the new path.

        The WAV is written to a per-run temp directory and registered for
        deletion by :meth:`cleanup`.

        Raises:
            FileNotFoundError: if ``input_path`` does not exist.
            ValueError: if the file extension is not a supported audio format.
        """
        supported_formats = ['.mp3', '.wav', '.aac', '.m4a', '.flac', '.ogg', '.wma']
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input file not found: {input_path}")
        file_ext = os.path.splitext(input_path)[1].lower()
        if file_ext not in supported_formats:
            raise ValueError(f"Unsupported audio format: {file_ext}")
        temp_dir = os.path.join(tempfile.gettempdir(), "whisper_temp")
        os.makedirs(temp_dir, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
        wav_path = os.path.join(temp_dir, f"temp_{timestamp}.wav")
        # Whisper models expect 16 kHz mono input.
        audio = AudioSegment.from_file(input_path)
        audio = audio.set_frame_rate(16000).set_channels(1)
        audio.export(wav_path, format="wav")
        self.temp_files.append(wav_path)
        print(f"Converted to temporary WAV: {wav_path}")
        return wav_path

    def transcribe_audio(self, audio_path):
        """Transcribe ``audio_path`` (any supported format) and return a result dict.

        Returns a dict with ``raw_transcription`` (str), ``segments`` (list of
        per-segment dicts with timing, confidence, and an ``overlap`` flag set
        when a segment starts before the previous one ended) and
        ``audio_metadata``.

        NOTE(review): decoding is forced to ``language="en"`` — the pipeline
        transcribes English speech and translates to Malayalam afterwards;
        confirm this matches the intended input audio.
        """
        # convert_to_whisper_format raises on failure, so no falsy-path check
        # is needed after this call.
        if not audio_path.lower().endswith('.wav'):
            audio_path = self.convert_to_whisper_format(audio_path)
        print("Transcribing audio with Faster-Whisper...")
        segments, info = self.model.transcribe(
            audio_path,
            beam_size=5,
            language="en"
        )
        text_parts = []
        segment_list = []
        prev_end = None
        for seg in segments:
            text = seg.text.strip()
            # avg_logprob is a log-probability (<= 0); 1.0 is a sentinel used
            # only if the attribute is ever missing.
            confidence = getattr(seg, 'avg_logprob', 1.0)
            segment_list.append({
                "start": seg.start,
                "end": seg.end,
                "text": text,
                "confidence": round(confidence, 3),
                "overlap": prev_end is not None and seg.start < prev_end
            })
            prev_end = seg.end
            text_parts.append(text)
        return {
            "raw_transcription": " ".join(text_parts),
            "segments": segment_list,
            "audio_metadata": {
                "original_path": audio_path,
                "sample_rate": 16000,
                # pydub duration is in milliseconds.
                "duration": len(AudioSegment.from_wav(audio_path)) / 1000
            }
        }

    @staticmethod
    def _split_for_translation(text, limit=4500):
        """Split ``text`` at whitespace into chunks of at most ``limit`` chars.

        Google Translate rejects payloads over ~5000 characters, so long
        transcripts are translated piecewise; 4500 leaves a safety margin.
        """
        if len(text) <= limit:
            return [text]
        chunks, current, length = [], [], 0
        for word in text.split():
            if current and length + len(word) + 1 > limit:
                chunks.append(" ".join(current))
                current, length = [], 0
            current.append(word)
            length += len(word) + 1
        if current:
            chunks.append(" ".join(current))
        return chunks

    def translate_to_malayalam(self, text_or_dict):
        """Translate English text to Malayalam via Google Translate.

        Accepts either a plain string or the dict from :meth:`transcribe_audio`
        (the translation is stored under ``translated_malayalam``). On any
        failure the input is returned unchanged (deliberate best-effort).
        """
        try:
            if isinstance(text_or_dict, dict):
                text = text_or_dict.get('raw_transcription', '')
            else:
                text = text_or_dict
            if not text.strip():
                raise ValueError("No text found for translation")
            print("Translating to Malayalam...")
            translator = GoogleTranslator(source='en', target='ml')
            # Chunked to stay under the service's request-size limit.
            ml_text = " ".join(
                translator.translate(chunk)
                for chunk in self._split_for_translation(text)
            )
            if isinstance(text_or_dict, dict):
                text_or_dict['translated_malayalam'] = ml_text
                return text_or_dict
            return ml_text
        except Exception as e:
            print(f"Translation error: {str(e)}")
            return text_or_dict

    def cleanup(self):
        """Delete all temporary WAV files created by this pipeline instance."""
        for file_path in self.temp_files:
            try:
                if os.path.exists(file_path):
                    os.remove(file_path)
            except Exception as e:
                # Best-effort: report and keep deleting the rest.
                print(f"Error deleting temp file {file_path}: {str(e)}")
        self.temp_files = []
# Sentiment analysis pipeline
sentiment_pipeline = pipeline(
"sentiment-analysis",
model="nlptown/bert-base-multilingual-uncased-sentiment",
device=0 if torch.cuda.is_available() else -1
)
def split_into_sentences(text):
    """Tokenize ``text`` into stripped, non-empty sentences.

    Falls back to treating the whole text as one sentence (or returning an
    empty list for blank input) when NLTK's tokenizer is unavailable.
    """
    try:
        raw_sentences = nltk.sent_tokenize(text)
    except Exception as e:
        print(f"Sentence splitting failed: {e}")
        return [text] if text.strip() else []
    return [sentence.strip() for sentence in raw_sentences if sentence.strip()]
def analyze_sentiment_batch(texts):
    """Run the shared sentiment pipeline over ``texts``.

    Maps the model's star-rating labels onto coarse sentiment labels with
    fixed scores; anything unrecognised is treated as neutral (0.5).
    """
    star_to_sentiment = (
        ("1 star", "very negative", 0.1),
        ("2 stars", "negative", 0.3),
        ("3 stars", "neutral", 0.5),
        ("4 stars", "positive", 0.7),
        ("5 stars", "very positive", 0.9),
    )
    outputs = []
    for result in sentiment_pipeline(texts):
        raw_label = result['label']
        mapped = {"label": "neutral", "score": 0.5}
        for marker, label, score in star_to_sentiment:
            if marker in raw_label:
                mapped = {"label": label, "score": score}
                break
        outputs.append(mapped)
    return outputs
def _matches_any(text, keywords, whole_word):
    """Return True if any keyword occurs in ``text``.

    With ``whole_word`` set, each keyword/phrase must appear with non-word
    characters (or string edges) on both sides, so short keywords such as
    "no" cannot fire inside unrelated words like "know" or "now".
    """
    if whole_word:
        return any(
            re.search(r"(?<!\w)" + re.escape(keyword) + r"(?!\w)", text)
            for keyword in keywords
        )
    return any(keyword in text for keyword in keywords)


def detect_intent(text, language="en"):
    """Classify a sentence's intent for internship lead analysis.

    Returns one of the interest levels ("Strong_interest",
    "Moderate_interest", "No_interest"), a conversation category
    (e.g. "Fee_query"), or "Neutral_response" when nothing matches.

    English matching is whole-word to avoid substring misfires; Malayalam
    keeps substring matching because inflected (agglutinative) forms often
    embed the keyword. Unknown language codes fall back to the English set
    instead of raising KeyError.
    """
    text_lower = text.lower().strip()
    intent_keywords = {
        "en": {
            # Interest Levels
            "Strong_interest": [
                "yes", "definitely", "ready", "want to join", "interested",
                "share details", "send brochure", "i'll join", "let's proceed",
                "where do i sign", "how to apply", "when can i start", "accept",
                "looking forward", "excited", "happy to", "glad to", "eager",
                "share it", "i will come", "i'm in"
            ],
            "Moderate_interest": [
                "maybe", "consider", "think about", "let me think", "tell me more",
                "more details", "explain", "clarify", "not sure", "possibly",
                "might", "could be", "depends", "need to check", "will decide",
                "get back", "discuss", "consult", "review", "evaluate"
            ],
            "No_interest": [
                "no", "not interested", "can't", "won't", "don't like",
                "not now", "later", "not suitable", "inconvenient", "decline",
                "pass", "refuse", "reject", "not for me", "not my field"
            ],
            # Conversation Categories
            "Qualification_query": [
                "qualification", "education", "degree", "studying", "course",
                "background", "academics", "university", "college", "bsc",
                "graduate", "year of study", "curriculum", "syllabus"
            ],
            "Internship_details": [
                "internship", "program", "duration", "months", "period",
                "schedule", "timing", "timeframe", "1 to 3", "three months",
                "structure", "plan", "framework"
            ],
            "Location_query": [
                "online", "offline", "location", "place", "where",
                "address", "relocate", "relocating", "from", "coming",
                "kozhikode", "kochi", "palarivattam", "hybrid", "remote"
            ],
            "Certificate_query": [
                "certificate", "certification", "document", "proof",
                "experience certificate", "training certificate", "letter",
                "completion", "award", "recognition"
            ],
            "Fee_query": [
                "fee", "payment", "cost", "amount", "charge",
                "6000", "six thousand", "money", "stipend", "salary",
                "compensation", "paid", "free"
            ],
            "Project_details": [
                "live project", "work", "assignment", "task", "project",
                "trainee", "superiors", "team", "collaborate", "develop",
                "build", "create", "implement", "hands-on", "practical"
            ],
            "Confirmation": [
                "ok", "looking for", "interested", "send whatsapp", "got it",
                "acknowledge", "noted", "please send", "sent details", "agreed"
            ]
        },
        "ml": {
            # Interest Levels
            "Strong_interest": [
                "തയ്യാറാണ്", "ആവശ്യമുണ്ട്", "ചെയ്യാം", "ആഗ്രഹമുണ്ട്",
                "ഇഷ്ടമാണ്", "അറിയിച്ചോളൂ", "ബ്രോഷർ വേണം", "വിശദാംശങ്ങൾ വേണം",
                "ശെയർ ചെയ്യുക", "ഞാൻ വരാം", "ഉത്സാഹം", "താത്പര്യം",
                "സമ്മതം", "അംഗീകരിക്കുന്നു", "ഹാപ്പിയാണ്", "ഞാൻ ചെയ്യാം",
                "നിശ്ചിതമായി", "ആവശ്യമാണ്"
            ],
            "Moderate_interest": [
                "ആലോചിക്കാം", "നോക്കാം", "താല്പര്യമുണ്ട്", "ഇന്റെറസ്റ്റഡ്",
                "പറയാം", "ക്ഷണിക്കുക", "ചിന്തിക്കാം", "കാണാം", "ഉത്തരമില്ല",
                "കൂടുതൽ വിവരങ്ങൾ", "വ്യാഖ്യാനിക്കുക", "അവലംബിക്കുക"
            ],
            "No_interest": [
                "ഇല്ല", "വേണ്ട", "സാധ്യമല്ല", "ഇഷ്ടമല്ല", "ഇങ്ങനെയല്ല",
                "നിരസിക്കുക", "അനാവശ്യമാണ്", "പിന്തിരിയുക", "ഇതല്ല", "നിഷേധം"
            ],
            # Conversation Categories
            "Qualification_query": [
                "വിദ്യാഭ്യാസം", "ഡിഗ്രി", "ബിസി", "പഠിക്കുന്നു",
                "പഠനം", "അധ്യയനം", "ക്ലാസ്", "വർഷം",
                "കോഴ്‌സ്", "സിലബസ്", "വിദ്യാർഥി", "ഗണിതം", "സയൻസ്"
            ],
            "Internship_details": [
                "ഇന്റെണ്ഷിപ്", "പരിശീലനം", "പ്രോഗ്രാം",
                "മാസം", "സമയക്രമം", "ടൈമിംഗ്", "1 മുതൽ 3 വരെ",
                "അവസാന വർഷം", "ലൈവ്", "ഫ്രെയിംവർക്ക്", "സ്ഥിരമായി"
            ],
            "Location_query": [
                "ഓൺലൈൻ", "ഓഫ്ലൈൻ", "സ്ഥലം", "വിലാസം", "കഴിഞ്ഞ്",
                "എവിടെ", "കൊഴിക്കോട്", "പാലാരിവട്ടം", "മാറ്റം",
                "റിലൊക്കേറ്റ്", "വരുന്നു", "എവിടെ നിന്നാണ്", "ഹൈബ്രിഡ്"
            ],
            "Certificate_query": [
                "സർട്ടിഫിക്കറ്റ്", "ഡോക്യുമെന്റ്", "അനുഭവ സർട്ടിഫിക്കറ്റ്",
                "പരിശീലന സർട്ടിഫിക്കറ്റ്", "അവാർഡ്", "രജിസ്ട്രേഷൻ",
                "പ്രമാണം", "സാക്ഷ്യപത്രം", "കമ്പ്ലീഷൻ"
            ],
            "Fee_query": [
                "ഫീസ്", "പണം", "6000", "ആറ് ആയിരം", "കാണിക്ക്",
                "മാസതൊട്ടി", "ചാർജ്", "റ്മണറേഷൻ", "ഫ്രീ",
                "ശമ്പളം", "സ്റ്റൈപെൻഡ്"
            ],
            "Project_details": [
                "പ്രോജക്ട്", "ലൈവ് പ്രോജക്ട്", "പ്രവൃത്തി", "ടാസ്‌ക്",
                "ടീം", "മേധാവി", "ട്രെയിനി", "സഹപ്രവർത്തനം",
                "ഡവലപ്പുചെയ്യുക", "സൃഷ്ടിക്കുക", "ഇമ്പ്ലിമെന്റുചെയ്യുക",
                "പ്രായോഗികം", "അഭ്യാസം"
            ],
            "Confirmation": [
                "ശരി", "താല്പര്യമുണ്ട്", "ഇഷ്ടമുണ്ട്", "വാട്സാപ്പിൽ അയക്കൂ",
                "വാട്സാപ്പ്", "വാട്ട്സാപ്പ്", "കിട്ടി", "അറിയിച്ചു",
                "നോട്ട് ചെയ്തു", "സമ്മതം", "ബോധിച്ചിട്ടുണ്ട്",
                "അംഗീകരിച്ചു", "അക്ക്നലഡ്ജ്", "ക്ലിയർ",
                "തയാറാണ്", "അറിയിപ്പ് ലഭിച്ചു"
            ]
        }
    }
    keywords = intent_keywords.get(language, intent_keywords["en"])
    whole_word = language == "en"
    # Step 1: interest level. Negative intent is checked FIRST so phrases
    # like "not interested" are not claimed by the "interested" keyword in
    # the strong-interest list.
    for level in ("No_interest", "Strong_interest", "Moderate_interest"):
        if _matches_any(text_lower, keywords[level], whole_word):
            return level
    # Step 2: conversation category (first matching category wins, in
    # declaration order).
    interest_levels = {"Strong_interest", "Moderate_interest", "No_interest"}
    for intent, words in keywords.items():
        if intent not in interest_levels and _matches_any(text_lower, words, whole_word):
            return intent
    return "Neutral_response"
def analyze_text(text, language="en"):
    """Split ``text`` into sentences and analyse each one.

    Returns a list of per-sentence records combining sentiment (from the
    shared pipeline), detected intent, and simple length statistics.
    """
    sentences = split_into_sentences(text)
    if not sentences:
        return []
    sentiments = analyze_sentiment_batch(sentences)
    records = []
    for idx, (sentence, mood) in enumerate(zip(sentences, sentiments), start=1):
        records.append({
            "sentence_id": f"{language}_{idx}",
            "text": sentence,
            "language": language,
            "intent": detect_intent(sentence, language),
            "sentiment": mood["label"],
            "sentiment_score": mood["score"],
            "word_count": len(sentence.split()),
            "char_count": len(sentence)
        })
    return records
def save_analysis_to_csv(analysis, filename_prefix):
    """Write ``analysis`` (a list of row dicts) to a timestamped CSV.

    Files land under ./analysis_results. Returns the output path, or None
    when there is nothing to save.
    """
    if not analysis:
        print("No analysis data to save")
        return None
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_dir = "analysis_results"
    os.makedirs(out_dir, exist_ok=True)
    full_path = os.path.join(out_dir, f"{filename_prefix}_analysis_{stamp}.csv")
    # utf-8-sig adds a BOM so Malayalam text opens correctly in Excel.
    pd.DataFrame(analysis).to_csv(full_path, index=False, encoding='utf-8-sig')
    print(f"✅ Analysis saved to {full_path}")
    return full_path
def compare_analyses(en_analysis, ml_analysis):
    """Pair up English/Malayalam sentence analyses.

    For each aligned sentence pair, reports whether the detected intents
    agree and how far apart the sentiment scores are. Extra rows in the
    longer list are ignored (zip semantics).
    """
    return [
        {
            "sentence_id": en_row["sentence_id"],
            "english_text": en_row["text"],
            "malayalam_text": ml_row["text"],
            "intent_match": en_row["intent"] == ml_row["intent"],
            "english_intent": en_row["intent"],
            "malayalam_intent": ml_row["intent"],
            "sentiment_diff": abs(en_row["sentiment_score"] - ml_row["sentiment_score"]),
            "english_sentiment": en_row["sentiment"],
            "malayalam_sentiment": ml_row["sentiment"]
        }
        for en_row, ml_row in zip(en_analysis, ml_analysis)
    ]
def print_analysis_summary(analysis, title):
    """Print sentence count, sentiment/intent distributions, and the mean
    sentiment score for one language's per-sentence analysis."""
    print(f"\n=== {title} Analysis Summary ===")
    print(f"Total Sentences: {len(analysis)}")
    if not analysis:
        return
    for heading, key in (("Sentiment", "sentiment"), ("Intent", "intent")):
        counts = pd.Series([row[key] for row in analysis]).value_counts()
        print(f"\n{heading} Distribution:")
        print(counts.to_string())
    mean_score = sum(row["sentiment_score"] for row in analysis) / len(analysis)
    print(f"\nAverage Sentiment Score: {mean_score:.2f}")
if __name__ == "__main__":
    # Interactive CLI: transcribe -> translate -> analyse -> lead score.
    transcriber = MalayalamTranscriptionPipeline()
    try:
        audio_path = input("Enter path to Malayalam audio file: ").strip()
        if not os.path.exists(audio_path):
            print("Error: File not found")
            exit(1)

        # 1. Speech -> English text.
        print("\n🔊 Transcribing audio...")
        results = transcriber.transcribe_audio(audio_path)
        if not results or not results.get("raw_transcription"):
            print("Transcription failed.")
            exit(1)
        raw_transcription = results["raw_transcription"]
        print("\n=== Raw English Transcription ===")
        print(raw_transcription)

        # 2. English -> Malayalam (best-effort; empty string on failure).
        print("\n🌐 Translating to Malayalam...")
        results = transcriber.translate_to_malayalam(results)
        ml_translation = results.get("translated_malayalam", "")
        print("\n=== Malayalam Translation ===")
        print(ml_translation)

        # 3. Per-sentence sentiment + intent analysis of both texts.
        print("\n🔍 Analyzing texts...")
        en_analysis = analyze_text(raw_transcription, "en")
        ml_analysis = analyze_text(ml_translation, "ml")
        en_csv = save_analysis_to_csv(en_analysis, "english")
        ml_csv = save_analysis_to_csv(ml_analysis, "malayalam")
        comparison = compare_analyses(en_analysis, ml_analysis)
        comparison_csv = save_analysis_to_csv(comparison, "comparison")
        print_analysis_summary(en_analysis, "English")
        print_analysis_summary(ml_analysis, "Malayalam")

        # Guard: comparison is empty when either analysis produced no
        # sentences; dividing by len(comparison) would raise
        # ZeroDivisionError and be misreported as a generic error below.
        if comparison:
            print("\n=== Translation Accuracy Insights ===")
            intent_matches = sum(1 for item in comparison if item["intent_match"])
            print(f"Intent Match Rate: {intent_matches / len(comparison):.1%}")
            avg_sentiment_diff = sum(item["sentiment_diff"] for item in comparison) / len(comparison)
            print(f"Average Sentiment Difference: {avg_sentiment_diff:.2f}")

        # 4. Lead score: mean of the two languages' average sentiment
        # scores, rescaled to 0-100.
        en_avg_score = sum(item["sentiment_score"] for item in en_analysis) / len(en_analysis) if en_analysis else 0
        ml_avg_score = sum(item["sentiment_score"] for item in ml_analysis) / len(ml_analysis) if ml_analysis else 0
        combined_avg = (en_avg_score + ml_avg_score) / 2
        lead_score = int(combined_avg * 100)
        print(f"\n=== Lead Score ===")
        print(f"Calculated Lead Score: {lead_score}/100")
        if lead_score >= 70:
            print("Interpretation: High interest lead")
        elif lead_score >= 40:
            print("Interpretation: Moderate interest lead")
        else:
            print("Interpretation: Low interest lead")
    except Exception as e:
        print(f"\n❌ An error occurred: {str(e)}")
    finally:
        # Always remove temporary WAV files, even on failure/exit.
        transcriber.cleanup()