# app.py — Streamlit Q&A analysis app
# (Hugging Face Space "restart", commit 2b3b522, author: ombhojane)
import streamlit as st
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModel
import torch
from scipy.spatial.distance import cosine
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
nltk.download('stopwords')
@st.cache_data
def load_models():
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
sentiment_analysis = pipeline("sentiment-analysis")
zero_shot_classifier = pipeline("zero-shot-classification")
return tokenizer, model, sentiment_analysis, zero_shot_classifier
tokenizer, model, sentiment_analysis, zero_shot_classifier = load_models()
stop_words = set(stopwords.words('english'))
# Define necessary functions for encoding, similarity calculation, and parameter analysis here...
def encode(text):
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
with torch.no_grad():
outputs = model(**inputs)
# Ensure the output is a 1-D tensor by using .squeeze()
return outputs.last_hidden_state.mean(dim=1).squeeze()
def calculate_similarity(embedding1, embedding2):
# Ensure embeddings are numpy arrays in 1-D format
embedding1_np = embedding1.numpy()
embedding2_np = embedding2.numpy()
# Check if embeddings are already 1-D, if not, flatten them
if embedding1_np.ndim > 1:
embedding1_np = embedding1_np.flatten()
if embedding2_np.ndim > 1:
embedding2_np = embedding2_np.flatten()
return 1 - cosine(embedding1_np, embedding2_np)
# Analysis Functions for Each Parameter
def analyze_positivity(text):
result = sentiment_analysis(text)
score = result[0]['score']
return 50 + (score * 50) if result[0]['label'] == 'POSITIVE' else 50 - (score * 50)
def analyze_decisiveness(text):
candidate_labels = ['confident', 'uncertain']
result = zero_shot_classifier(text, candidate_labels=candidate_labels)
score = max(result['scores'])
return score * 100
def analyze_specificity(text):
words = [word for word in re.findall(r'\b\w+\b', text.lower()) if word not in stop_words and word.isalpha()]
unique_words = set(words)
technical_terms = {'algorithm', 'data', 'analysis', 'design', 'programming'} # Example set
technical_term_count = sum(1 for word in unique_words if word in technical_terms)
specificity_score = (len(unique_words) / len(words) * 50) + (technical_term_count * 50 / len(technical_terms))
return min(100, specificity_score)
def analyze_self_awareness(text):
reflection_keywords = ['strengths', 'weaknesses', 'learned', 'improved', 'challenge', 'goal', 'feedback']
reflection_count = sum(text.lower().count(keyword) for keyword in reflection_keywords)
total_words = len(re.findall(r'\b\w+\b', text.lower()))
reflection_density = (reflection_count / total_words) * 100
positive_score = analyze_sentiment_for_self_awareness(text)
self_awareness_score = (reflection_density * 0.7) + (positive_score * 0.3)
return min(100, self_awareness_score)
def analyze_sentiment_for_self_awareness(text):
result = sentiment_analysis(text)
positive_score = result[0]['score'] if result[0]['label'] == 'POSITIVE' else 0
return positive_score * 100
def analyze_career_knowledge(text):
career_topics = ["data science", "software engineering", "product management"]
result = zero_shot_classifier(text, candidate_labels=career_topics)
highest_score = max(result['scores'])
return highest_score * 100
def analyze_decision_anxiety(text):
anxiety_keywords = ['uncertain', 'doubt', 'unsure', 'anxious', 'overwhelmed']
anxiety_keyword_count = sum(text.lower().count(keyword) for keyword in anxiety_keywords)
sentiment_result = sentiment_analysis(text)
negative_score = sentiment_result[0]['score'] if sentiment_result[0]['label'] == 'NEGATIVE' else 0
anxiety_score = negative_score + (anxiety_keyword_count * 0.1)
return min(100, anxiety_score * 100)
def analyze_values_alignment(text):
# Placeholder: Replace with your values and method for calculating semantic similarity
values = "innovation, teamwork, impact"
embedding_text = encode(text)
embedding_values = encode(values)
similarity_score = calculate_similarity(embedding_text, embedding_values)
return similarity_score * 100
def analyze_interests_alignment(text):
# Placeholder: Replace with your interests and method for calculating semantic similarity
interests = "technology, programming, design"
embedding_text = encode(text)
embedding_interests = encode(interests)
similarity_score = calculate_similarity(embedding_text, embedding_interests)
return similarity_score * 100
def analyze_qa_pair(question, answer):
combined_text = f"{question} {answer}"
# Calculate scores for each parameter using the combined text
positivity_score = analyze_positivity(combined_text)
decisiveness_score = analyze_decisiveness(combined_text)
specificity_score = analyze_specificity(combined_text)
self_awareness_score = analyze_self_awareness(combined_text)
career_knowledge_score = analyze_career_knowledge(combined_text)
decision_anxiety_score = analyze_decision_anxiety(combined_text)
values_alignment_score = analyze_values_alignment(combined_text)
interests_alignment_score = analyze_interests_alignment(combined_text)
return {
"Positivity": positivity_score,
"Decisiveness": decisiveness_score,
"Specificity": specificity_score,
"Self-awareness": self_awareness_score,
"Career Knowledge": career_knowledge_score,
"Decision Anxiety": decision_anxiety_score,
"Values Alignment": values_alignment_score,
"Interests Alignment": interests_alignment_score,
}
# Plotting helper (the Excel Q&A data is loaded in main() below)
def plot_average_scores(avg_scores):
"""
Plots the average scores across all Q&A pairs with parameters on the x-axis
and values on the y-axis ranging from 0-100.
"""
parameters = list(avg_scores.keys())
scores = list(avg_scores.values())
fig, ax = plt.subplots(figsize=(10, 8))
ax.bar(parameters, scores, color='skyblue')
# Set limits for y-axis from 0 to 100
ax.set_ylim(0, 100)
# Rotate parameter labels to fit and make readable
plt.xticks(rotation=45, ha="right")
# Setting labels and title
ax.set_ylabel('Average Score')
ax.set_title('Average Scores Across All Q&A Pairs')
plt.tight_layout()
return fig
# Streamlit app main function
def main():
st.title("Q&A Analysis App")
uploaded_file = st.file_uploader("Choose an Excel file with Q&A", type=["xlsx"])
if uploaded_file is not None:
df = pd.read_excel(uploaded_file)
avg_scores = process_and_analyze_qa(df)
# Display parameter values in a table
st.subheader("Average Scores Across All Q&A Pairs:")
st.table(pd.DataFrame(avg_scores.items(), columns=['Parameter', 'Value']))
# Generate and display the custom graph
fig = plot_average_scores(avg_scores)
st.pyplot(fig)
def process_and_analyze_qa(df):
scores_dict = {key: [] for key in analyze_qa_pair("Q", "A").keys()}
for question in df.columns:
for answer in df[question].dropna():
scores = analyze_qa_pair(question, answer)
for key, value in scores.items():
scores_dict[key].append(value)
avg_scores = {key: sum(values) / len(values) for key, values in scores_dict.items() if values}
return avg_scores
if __name__ == "__main__":
main()