Spaces:
Sleeping
Sleeping
import os
import tempfile
from datetime import datetime

import nltk
import pandas as pd
import torch
from deep_translator import GoogleTranslator
from faster_whisper import WhisperModel
from pydub import AudioSegment
from transformers import pipeline
# Store NLTK data (e.g. the punkt tokenizer used by sent_tokenize) in a
# project-local directory and register it on NLTK's search path.
# NOTE(review): nothing here downloads the data; punkt is presumably
# fetched out-of-band -- confirm, since sent_tokenize will fail otherwise.
nltk_data_dir = os.path.join(os.getcwd(), "nltk_data")
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.data.path.append(nltk_data_dir)
class MalayalamTranscriptionPipeline:
    """Transcribe audio with Faster-Whisper and translate the English text
    to Malayalam with Google Translate.

    Temporary WAV files produced while converting inputs are tracked in
    ``self.temp_files`` and removed by :meth:`cleanup`.
    """

    # Google's translation endpoint rejects very long payloads (~5000 chars);
    # long transcripts are translated in chunks below this limit.
    _MAX_TRANSLATE_CHARS = 4500

    def __init__(self, model_size="large-v1"):
        """Load the Faster-Whisper model on CUDA when available, else CPU."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Loading Faster-Whisper {model_size} model on {self.device}...")
        # float16 requires a GPU; int8 keeps CPU inference memory-friendly.
        compute_type = "float16" if self.device == "cuda" else "int8"
        self.model = WhisperModel(model_size, device=self.device, compute_type=compute_type)
        self.temp_files = []

    def convert_to_whisper_format(self, input_path):
        """Convert ``input_path`` to a 16 kHz mono WAV in a temp directory.

        Returns the path of the converted file (also recorded in
        ``self.temp_files`` for later cleanup).

        Raises:
            FileNotFoundError: if ``input_path`` does not exist.
            ValueError: if the extension is not a supported audio format.
        """
        supported_formats = ('.mp3', '.wav', '.aac', '.m4a', '.flac', '.ogg', '.wma')
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input file not found: {input_path}")
        file_ext = os.path.splitext(input_path)[1].lower()
        if file_ext not in supported_formats:
            raise ValueError(f"Unsupported audio format: {file_ext}")
        temp_dir = os.path.join(tempfile.gettempdir(), "whisper_temp")
        os.makedirs(temp_dir, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
        wav_path = os.path.join(temp_dir, f"temp_{timestamp}.wav")
        # Whisper models expect 16 kHz mono audio.
        audio = AudioSegment.from_file(input_path)
        audio = audio.set_frame_rate(16000).set_channels(1)
        audio.export(wav_path, format="wav")
        self.temp_files.append(wav_path)
        print(f"Converted to temporary WAV: {wav_path}")
        return wav_path

    def transcribe_audio(self, audio_path):
        """Transcribe ``audio_path``, converting it to WAV first if needed.

        Returns a dict with ``raw_transcription`` (full text), ``segments``
        (per-segment timing/text/confidence/overlap flags) and
        ``audio_metadata``.
        """
        if not audio_path.lower().endswith('.wav'):
            # convert_to_whisper_format() returns a valid path or raises,
            # so no extra None check is needed here.
            audio_path = self.convert_to_whisper_format(audio_path)
        print("Transcribing audio with Faster-Whisper...")
        segments, info = self.model.transcribe(
            audio_path,
            beam_size=5,
            language="en"
        )
        texts = []
        segment_list = []
        prev_end = None
        for seg in segments:
            text = seg.text.strip()
            # NOTE: avg_logprob is a log-probability (<= 0), not a true 0-1
            # confidence; kept under the "confidence" key for compatibility.
            confidence = getattr(seg, 'avg_logprob', 1.0)
            segment_list.append({
                "start": seg.start,
                "end": seg.end,
                "text": text,
                "confidence": round(confidence, 3),
                # Flag segments whose start precedes the previous segment's end.
                "overlap": prev_end is not None and seg.start < prev_end
            })
            prev_end = seg.end
            texts.append(text)
        return {
            "raw_transcription": " ".join(texts).strip(),
            "segments": segment_list,
            "audio_metadata": {
                "original_path": audio_path,
                "sample_rate": 16000,
                # Duration (seconds) reported by faster-whisper; avoids
                # re-reading the whole WAV just to measure it.
                "duration": info.duration
            }
        }

    def translate_to_malayalam(self, text_or_dict):
        """Translate English text to Malayalam.

        Accepts either a plain string or the dict returned by
        :meth:`transcribe_audio`; for a dict, the translation is stored under
        ``translated_malayalam`` and the dict is returned. On any failure the
        input is returned unchanged (best-effort, matching prior behavior).
        """
        try:
            if isinstance(text_or_dict, dict):
                text = text_or_dict.get('raw_transcription', '')
            else:
                text = text_or_dict
            if not text.strip():
                raise ValueError("No text found for translation")
            print("Translating to Malayalam...")
            translator = GoogleTranslator(source='en', target='ml')
            # Translate in chunks so long transcripts do not exceed the
            # service's payload limit; short text is a single chunk, which
            # matches the previous single-call behavior.
            chunk_size = self._MAX_TRANSLATE_CHARS
            pieces = [
                translator.translate(text[i:i + chunk_size])
                for i in range(0, len(text), chunk_size)
            ]
            ml_text = " ".join(pieces)
            if isinstance(text_or_dict, dict):
                text_or_dict['translated_malayalam'] = ml_text
                return text_or_dict
            return ml_text
        except Exception as e:
            print(f"Translation error: {str(e)}")
            return text_or_dict

    def cleanup(self):
        """Delete all tracked temporary files and reset the tracking list."""
        for file_path in self.temp_files:
            try:
                if os.path.exists(file_path):
                    os.remove(file_path)
            except Exception as e:
                print(f"Error deleting temp file {file_path}: {str(e)}")
        self.temp_files = []
# Shared multilingual sentiment model (predicts 1-5 star ratings), used by
# analyze_sentiment_batch(); placed on GPU 0 when CUDA is available.
_SENTIMENT_DEVICE = 0 if torch.cuda.is_available() else -1
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
    device=_SENTIMENT_DEVICE,
)
def split_into_sentences(text):
    """Split ``text`` into sentences with NLTK, with a regex fallback.

    Returns a list of non-empty, stripped sentences; an empty list for
    blank input.
    """
    try:
        sentences = nltk.sent_tokenize(text)
    except Exception as e:
        # punkt data may be missing or nltk unusable; fall back to splitting
        # on sentence-final punctuation instead of returning the whole text
        # as one giant "sentence".
        print(f"Sentence splitting failed: {e}")
        import re
        sentences = re.split(r'(?<=[.!?])\s+', text)
    return [s.strip() for s in sentences if s.strip()]
# Model label substring -> fixed sentiment label and score.
_STAR_SENTIMENT_MAP = (
    ("1 star", {"label": "very negative", "score": 0.1}),
    ("2 stars", {"label": "negative", "score": 0.3}),
    ("3 stars", {"label": "neutral", "score": 0.5}),
    ("4 stars", {"label": "positive", "score": 0.7}),
    ("5 stars", {"label": "very positive", "score": 0.9}),
)

def analyze_sentiment_batch(texts):
    """Run the shared sentiment pipeline over ``texts``.

    Returns one ``{"label", "score"}`` dict per input sentence, mapping the
    model's 1-5 star labels to coarse sentiment buckets; unknown labels
    default to neutral. An empty input yields an empty list without calling
    the pipeline (which may reject an empty batch).
    """
    if not texts:
        return []
    outputs = []
    for result in sentiment_pipeline(texts):
        label = result['label']
        for needle, sentiment in _STAR_SENTIMENT_MAP:
            if needle in label:
                # Copy so callers cannot mutate the shared mapping.
                outputs.append(dict(sentiment))
                break
        else:
            outputs.append({"label": "neutral", "score": 0.5})
    return outputs
# Interest-level intents, in tie-breaking priority order.
_INTEREST_LEVELS = ("Strong_interest", "Moderate_interest", "No_interest")

# Keyword table hoisted to module level so it is built once, not rebuilt on
# every call to detect_intent().
_INTENT_KEYWORDS = {
    "en": {
        # Interest Levels
        "Strong_interest": [
            "yes", "definitely", "ready", "want to join", "interested",
            "share details", "send brochure", "i'll join", "let's proceed",
            "where do i sign", "how to apply", "when can i start", "accept",
            "looking forward", "excited", "happy to", "glad to", "eager",
            "share it", "i will come", "i'm in"
        ],
        "Moderate_interest": [
            "maybe", "consider", "think about", "let me think", "tell me more",
            "more details", "explain", "clarify", "not sure", "possibly",
            "might", "could be", "depends", "need to check", "will decide",
            "get back", "discuss", "consult", "review", "evaluate"
        ],
        "No_interest": [
            "no", "not interested", "can't", "won't", "don't like",
            "not now", "later", "not suitable", "inconvenient", "decline",
            "pass", "refuse", "reject", "not for me", "not my field"
        ],
        # Conversation Categories
        "Qualification_query": [
            "qualification", "education", "degree", "studying", "course",
            "background", "academics", "university", "college", "bsc",
            "graduate", "year of study", "curriculum", "syllabus"
        ],
        "Internship_details": [
            "internship", "program", "duration", "months", "period",
            "schedule", "timing", "timeframe", "1 to 3", "three months",
            "structure", "plan", "framework"
        ],
        "Location_query": [
            "online", "offline", "location", "place", "where",
            "address", "relocate", "relocating", "from", "coming",
            "kozhikode", "kochi", "palarivattam", "hybrid", "remote"
        ],
        "Certificate_query": [
            "certificate", "certification", "document", "proof",
            "experience certificate", "training certificate", "letter",
            "completion", "award", "recognition"
        ],
        "Fee_query": [
            "fee", "payment", "cost", "amount", "charge",
            "6000", "six thousand", "money", "stipend", "salary",
            "compensation", "paid", "free"
        ],
        "Project_details": [
            "live project", "work", "assignment", "task", "project",
            "trainee", "superiors", "team", "collaborate", "develop",
            "build", "create", "implement", "hands-on", "practical"
        ],
        "Confirmation": [
            "ok", "looking for", "interested", "send whatsapp", "got it",
            "acknowledge", "noted", "please send", "sent details", "agreed"
        ]
    },
    "ml": {
        # Interest Levels
        "Strong_interest": [
            "เดคเดฏเตเดฏเดพเดฑเดพเดฃเต", "เดเดตเดถเตเดฏเดฎเตเดฃเตเดเต", "เดเตเดฏเตเดฏเดพเด", "เดเดเตเดฐเดนเดฎเตเดฃเตเดเต",
            "เดเดทเตเดเดฎเดพเดฃเต", "เด เดฑเดฟเดฏเดฟเดเตเดเตเดณเต", "เดฌเตเดฐเตเดทเตผ เดตเตเดฃเด", "เดตเดฟเดถเดฆเดพเดเดถเดเตเดเตพ เดตเตเดฃเด",
            "เดถเตเดฏเตผ เดเตเดฏเตเดฏเตเด", "เดเดพเตป เดตเดฐเดพเด", "เดเดคเตเดธเดพเดนเด", "เดคเดพเดคเตเดชเดฐเตเดฏเด",
            "เดธเดฎเตเดฎเดคเด", "เด เดเดเตเดเดฐเดฟเดเตเดเตเดจเตเดจเต", "เดนเดพเดชเตเดชเดฟเดฏเดพเดฃเต", "เดเดพเตป เดเตเดฏเตเดฏเดพเด",
            "เดจเดฟเดถเตเดเดฟเดคเดฎเดพเดฏเดฟ", "เดเดตเดถเตเดฏเดฎเดพเดฃเต"
        ],
        "Moderate_interest": [
            "เดเดฒเตเดเดฟเดเตเดเดพเด", "เดจเตเดเตเดเดพเด", "เดคเดพเดฒเตเดชเดฐเตเดฏเดฎเตเดฃเตเดเต", "เดเดจเตเดฑเตเดฑเดธเตเดฑเตเดฑเดกเต",
            "เดชเดฑเดฏเดพเด", "เดเตเดทเดฃเดฟเดเตเดเตเด", "เดเดฟเดจเตเดคเดฟเดเตเดเดพเด", "เดเดพเดฃเดพเด", "เดเดคเตเดคเดฐเดฎเดฟเดฒเตเดฒ",
            "เดเตเดเตเดคเตฝ เดตเดฟเดตเดฐเดเตเดเตพ", "เดตเตเดฏเดพเดเตเดฏเดพเดจเดฟเดเตเดเตเด", "เด เดตเดฒเดเดฌเดฟเดเตเดเตเด"
        ],
        "No_interest": [
            "เดเดฒเตเดฒ", "เดตเตเดฃเตเด", "เดธเดพเดงเตเดฏเดฎเดฒเตเดฒ", "เดเดทเตเดเดฎเดฒเตเดฒ", "เดเดเตเดเดจเตเดฏเดฒเตเดฒ",
            "เดจเดฟเดฐเดธเดฟเดเตเดเตเด", "เด เดจเดพเดตเดถเตเดฏเดฎเดพเดฃเต", "เดชเดฟเดจเตเดคเดฟเดฐเดฟเดฏเตเด", "เดเดคเดฒเตเดฒ", "เดจเดฟเดทเตเดงเด"
        ],
        # Conversation Categories
        "Qualification_query": [
            "เดตเดฟเดฆเตเดฏเดพเดญเตเดฏเดพเดธเด", "เดกเดฟเดเตเดฐเดฟ", "เดฌเดฟเดธเดฟ", "เดชเด เดฟเดเตเดเตเดจเตเดจเต",
            "เดชเด เดจเด", "เด เดงเตเดฏเดฏเดจเด", "เดเตเดฒเดพเดธเต", "เดตเตผเดทเด",
            "เดเตเดดเตโเดธเต", "เดธเดฟเดฒเดฌเดธเต", "เดตเดฟเดฆเตเดฏเดพเตผเดฅเดฟ", "เดเดฃเดฟเดคเด", "เดธเดฏเตปเดธเต"
        ],
        "Internship_details": [
            "เดเดจเตเดฑเตเดฃเตเดทเดฟเดชเต", "เดชเดฐเดฟเดถเตเดฒเดจเด", "เดชเตเดฐเตเดเตเดฐเดพเด",
            "เดฎเดพเดธเด", "เดธเดฎเดฏเดเตเดฐเดฎเด", "เดเตเดฎเดฟเดเดเต", "1 เดฎเตเดคเตฝ 3 เดตเดฐเต",
            "เด เดตเดธเดพเดจ เดตเตผเดทเด", "เดฒเตเดตเต", "เดซเตเดฐเตเดฏเดฟเดเดตเตผเดเตเดเต", "เดธเตเดฅเดฟเดฐเดฎเดพเดฏเดฟ"
        ],
        "Location_query": [
            "เดเตบเดฒเตเตป", "เดเดซเตเดฒเตเตป", "เดธเตเดฅเดฒเด", "เดตเดฟเดฒเดพเดธเด", "เดเดดเดฟเดเตเดเต",
            "เดเดตเดฟเดเต", "เดเตเดดเดฟเดเตเดเตเดเต", "เดชเดพเดฒเดพเดฐเดฟเดตเดเตเดเด", "เดฎเดพเดฑเตเดฑเด",
            "เดฑเดฟเดฒเตเดเตเดเตเดฑเตเดฑเต", "เดตเดฐเตเดจเตเดจเต", "เดเดตเดฟเดเต เดจเดฟเดจเตเดจเดพเดฃเต", "เดนเตเดฌเตเดฐเดฟเดกเต"
        ],
        "Certificate_query": [
            "เดธเตผเดเตเดเดฟเดซเดฟเดเตเดเดฑเตเดฑเต", "เดกเตเดเตเดฏเตเดฎเตเดจเตเดฑเต", "เด เดจเตเดญเดต เดธเตผเดเตเดเดฟเดซเดฟเดเตเดเดฑเตเดฑเต",
            "เดชเดฐเดฟเดถเตเดฒเดจ เดธเตผเดเตเดเดฟเดซเดฟเดเตเดเดฑเตเดฑเต", "เด เดตเดพเตผเดกเต", "เดฐเดเดฟเดธเตเดเตเดฐเตเดทเตป",
            "เดชเตเดฐเดฎเดพเดฃเด", "เดธเดพเดเตเดทเตเดฏเดชเดคเตเดฐเด", "เดเดฎเตเดชเตเดฒเตเดทเตป"
        ],
        "Fee_query": [
            "เดซเตเดธเต", "เดชเดฃเด", "6000", "เดเดฑเต เดเดฏเดฟเดฐเด", "เดเดพเดฃเดฟเดเตเดเต",
            "เดฎเดพเดธเดคเตเดเตเดเดฟ", "เดเดพเตผเดเต", "เดฑเตเดฎเดฃเดฑเตเดทเตป", "เดซเตเดฐเต",
            "เดถเดฎเตเดชเดณเด", "เดธเตเดฑเตเดฑเตเดชเตเตปเดกเต"
        ],
        "Project_details": [
            "เดชเตเดฐเตเดเดเตเดเต", "เดฒเตเดตเต เดชเตเดฐเตเดเดเตเดเต", "เดชเตเดฐเดตเตเดคเตเดคเดฟ", "เดเดพเดธเตโเดเต",
            "เดเตเด", "เดฎเตเดงเดพเดตเดฟ", "เดเตเดฐเตเดฏเดฟเดจเดฟ", "เดธเดนเดชเตเดฐเดตเตผเดคเตเดคเดจเด",
            "เดกเดตเดฒเดชเตเดชเตเดเตเดฏเตเดฏเตเด", "เดธเตเดทเตเดเดฟเดเตเดเตเด", "เดเดฎเตเดชเตเดฒเดฟเดฎเตเดจเตเดฑเตเดเตเดฏเตเดฏเตเด",
            "เดชเตเดฐเดพเดฏเตเดเดฟเดเด", "เด เดญเตเดฏเดพเดธเด"
        ],
        "Confirmation": [
            "เดถเดฐเดฟ", "เดคเดพเดฒเตเดชเดฐเตเดฏเดฎเตเดฃเตเดเต", "เดเดทเตเดเดฎเตเดฃเตเดเต", "เดตเดพเดเตเดธเดพเดชเตเดชเดฟเตฝ เด เดฏเดเตเดเต",
            "เดตเดพเดเตเดธเดพเดชเตเดชเต", "เดตเดพเดเตเดเตเดธเดพเดชเตเดชเต", "เดเดฟเดเตเดเดฟ", "เด เดฑเดฟเดฏเดฟเดเตเดเต",
            "เดจเตเดเตเดเต เดเตเดฏเตเดคเต", "เดธเดฎเตเดฎเดคเด", "เดฌเตเดงเดฟเดเตเดเดฟเดเตเดเตเดฃเตเดเต",
            "เด เดเดเตเดเดฐเดฟเดเตเดเต", "เด เดเตเดเตเดจเดฒเดกเตเดเต", "เดเตเดฒเดฟเดฏเตผ",
            "เดคเดฏเดพเดฑเดพเดฃเต", "เด เดฑเดฟเดฏเดฟเดชเตเดชเต เดฒเดญเดฟเดเตเดเต"
        ]
    }
}

def detect_intent(text, language="en"):
    """Enhanced intent detection for internship interest analysis in English and Malayalam.

    Args:
        text: sentence to classify (matched case-insensitively).
        language: "en" or "ml"; unknown codes fall back to the English
            keyword table instead of raising KeyError.

    Returns:
        An interest level ("Strong_interest" / "Moderate_interest" /
        "No_interest"), a conversation category (e.g. "Fee_query"), or
        "Neutral_response" when nothing matches.
    """
    text_lower = text.lower().strip()
    keywords_by_intent = _INTENT_KEYWORDS.get(language, _INTENT_KEYWORDS["en"])
    # Step 1: Detect interest level. The longest matching keyword wins, so a
    # specific phrase like "not interested" (No_interest) is not shadowed by
    # its substring "interested" (Strong_interest); ties keep the original
    # Strong > Moderate > No priority.
    best_level = None
    best_key = (0, 0)
    for rank, level in enumerate(_INTEREST_LEVELS):
        for keyword in keywords_by_intent[level]:
            if keyword in text_lower:
                candidate = (len(keyword), -rank)
                if candidate > best_key:
                    best_key = candidate
                    best_level = level
    if best_level is not None:
        return best_level
    # Step 2: Detect conversation category (first declared match wins,
    # preserving the original dict-order semantics).
    for intent, keywords in keywords_by_intent.items():
        if intent in _INTEREST_LEVELS:
            continue
        if any(keyword in text_lower for keyword in keywords):
            return intent
    return "Neutral_response"
def analyze_text(text, language="en"):
    """Produce per-sentence analysis records for ``text``.

    Each record carries an id, the sentence, its detected intent, mapped
    sentiment label/score, and simple length statistics. Returns an empty
    list when the text yields no sentences.
    """
    sentences = split_into_sentences(text)
    if not sentences:
        return []
    sentiments = analyze_sentiment_batch(sentences)
    records = []
    for idx, (sentence, sentiment) in enumerate(zip(sentences, sentiments), start=1):
        records.append({
            "sentence_id": f"{language}_{idx}",
            "text": sentence,
            "language": language,
            "intent": detect_intent(sentence, language),
            "sentiment": sentiment["label"],
            "sentiment_score": sentiment["score"],
            "word_count": len(sentence.split()),
            "char_count": len(sentence),
        })
    return records
def save_analysis_to_csv(analysis, filename_prefix):
    """Write analysis records to a timestamped CSV under ``analysis_results/``.

    Returns the file path on success, or None when there is nothing to save.
    """
    if not analysis:
        print("No analysis data to save")
        return None
    out_dir = "analysis_results"
    os.makedirs(out_dir, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    full_path = os.path.join(out_dir, f"{filename_prefix}_analysis_{stamp}.csv")
    # utf-8-sig adds a BOM so spreadsheet apps render non-ASCII text correctly.
    pd.DataFrame(analysis).to_csv(full_path, index=False, encoding='utf-8-sig')
    print(f"โ Analysis saved to {full_path}")
    return full_path
def compare_analyses(en_analysis, ml_analysis):
    """Pair up English and Malayalam sentence records positionally.

    Returns one comparison record per pair with intent agreement and the
    absolute sentiment-score difference; extra records on either side are
    ignored (zip semantics).
    """
    return [
        {
            "sentence_id": en["sentence_id"],
            "english_text": en["text"],
            "malayalam_text": ml["text"],
            "intent_match": en["intent"] == ml["intent"],
            "english_intent": en["intent"],
            "malayalam_intent": ml["intent"],
            "sentiment_diff": abs(en["sentiment_score"] - ml["sentiment_score"]),
            "english_sentiment": en["sentiment"],
            "malayalam_sentiment": ml["sentiment"],
        }
        for en, ml in zip(en_analysis, ml_analysis)
    ]
def print_analysis_summary(analysis, title):
    """Print sentence count, sentiment/intent distributions and mean score."""
    print(f"\n=== {title} Analysis Summary ===")
    print(f"Total Sentences: {len(analysis)}")
    if not analysis:
        return
    for heading, field in (("Sentiment", "sentiment"), ("Intent", "intent")):
        counts = pd.Series([row[field] for row in analysis]).value_counts()
        print(f"\n{heading} Distribution:")
        print(counts.to_string())
    avg_score = sum(row["sentiment_score"] for row in analysis) / len(analysis)
    print(f"\nAverage Sentiment Score: {avg_score:.2f}")
if __name__ == "__main__":
    transcriber = MalayalamTranscriptionPipeline()
    try:
        audio_path = input("Enter path to Malayalam audio file: ").strip()
        if not os.path.exists(audio_path):
            print("Error: File not found")
            # raise SystemExit instead of the site-provided exit(); it is not
            # swallowed by the broad except below and finally still runs.
            raise SystemExit(1)
        print("\n๐ Transcribing audio...")
        results = transcriber.transcribe_audio(audio_path)
        if not results or not results.get("raw_transcription"):
            print("Transcription failed.")
            raise SystemExit(1)
        raw_transcription = results["raw_transcription"]
        print("\n=== Raw English Transcription ===")
        print(raw_transcription)
        print("\n๐ Translating to Malayalam...")
        results = transcriber.translate_to_malayalam(results)
        ml_translation = results.get("translated_malayalam", "")
        print("\n=== Malayalam Translation ===")
        print(ml_translation)
        print("\n๐ Analyzing texts...")
        en_analysis = analyze_text(raw_transcription, "en")
        ml_analysis = analyze_text(ml_translation, "ml")
        en_csv = save_analysis_to_csv(en_analysis, "english")
        ml_csv = save_analysis_to_csv(ml_analysis, "malayalam")
        comparison = compare_analyses(en_analysis, ml_analysis)
        comparison_csv = save_analysis_to_csv(comparison, "comparison")
        print_analysis_summary(en_analysis, "English")
        print_analysis_summary(ml_analysis, "Malayalam")
        print("\n=== Translation Accuracy Insights ===")
        if comparison:
            intent_matches = sum(1 for item in comparison if item["intent_match"])
            print(f"Intent Match Rate: {intent_matches / len(comparison):.1%}")
            avg_sentiment_diff = sum(item["sentiment_diff"] for item in comparison) / len(comparison)
            print(f"Average Sentiment Difference: {avg_sentiment_diff:.2f}")
        else:
            # Guard: comparison is empty when either analysis produced no
            # sentences; the divisions above would raise ZeroDivisionError.
            print("No comparable sentences available.")
        # Lead Score: mean sentiment across both languages, scaled to 0-100.
        en_avg_score = sum(item["sentiment_score"] for item in en_analysis) / len(en_analysis) if en_analysis else 0
        ml_avg_score = sum(item["sentiment_score"] for item in ml_analysis) / len(ml_analysis) if ml_analysis else 0
        combined_avg = (en_avg_score + ml_avg_score) / 2
        lead_score = int(combined_avg * 100)
        print(f"\n=== Lead Score ===")
        print(f"Calculated Lead Score: {lead_score}/100")
        if lead_score >= 70:
            print("Interpretation: High interest lead")
        elif lead_score >= 40:
            print("Interpretation: Moderate interest lead")
        else:
            print("Interpretation: Low interest lead")
    except Exception as e:
        print(f"\nโ An error occurred: {str(e)}")
    finally:
        transcriber.cleanup()