Spaces: Build error
Emily Witko committed · Commit ec15d48 · 1 Parent(s): 335a565
Initial commit
Browse files:
- app.py +167 -0
- requirements.txt +6 -0
app.py
ADDED
@@ -0,0 +1,167 @@
import gradio as gr
import pandas as pd
from transformers import pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter
import re

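# Quantitative analysis: read the uploaded survey export and compute NPS-style
# metrics overall and broken down by demographic group.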
def analyze_demographics(file):
    df = pd.read_excel(file.name)

    results = {
        "Overall Metrics": {},
        "Underrepresented Group Metrics": {},
        "Tenure Metrics": {},
        "Team Metrics": {},
        "Nationality Metrics": {},
        "Legal Entity Metrics": {},
        "Work Location Metrics": {}
    }

    tenure_order = ["< 1 year", "1 year - 2 years", "2 years - 3 years", "3 years - 4 years", "> 4 years"]

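    # NPS convention: promoters score 9-10, detractors 0-6;
    # NPS = (promoters - detractors) / respondents * 100.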
    recommend_col = "On a scale of 0 to 10, how likely are you to recommend working at Hugging Face to a friend or colleague?"
    if recommend_col in df.columns:
        promoters = df[recommend_col].apply(lambda x: x >= 9).sum()
        detractors = df[recommend_col].apply(lambda x: x <= 6).sum()
        total_respondents = df[recommend_col].notna().sum()
        recommend_nps = ((promoters - detractors) / total_respondents) * 100 if total_respondents > 0 else None
        recommend_avg = df[recommend_col].mean()
        results["Overall Metrics"]['HF NPS'] = round(recommend_nps, 2) if recommend_nps is not None else None
        results["Overall Metrics"]['HF NPS (Average)'] = round(recommend_avg, 2)

    support_col = "On a scale of 0 to 10, how likely are you to recommend the support functions at HF (diversity, finance, hr, legal, security, talent) to a friend or colleague?"
    if support_col in df.columns:
        promoters = df[support_col].apply(lambda x: x >= 9).sum()
        detractors = df[support_col].apply(lambda x: x <= 6).sum()
        total_respondents = df[support_col].notna().sum()
        support_nps = ((promoters - detractors) / total_respondents) * 100 if total_respondents > 0 else None
        support_avg = df[support_col].mean()
        results["Overall Metrics"]['Support NPS'] = round(support_nps, 2) if support_nps is not None else None
        results["Overall Metrics"]['Support NPS (Average)'] = round(support_avg, 2)

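    # Optional demographic questions used to break the scores down by group.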
    demographic_columns = [
        ("I identify as a member of an underrepresented group in tech. (e.g. including but not limited to gender, age, disability, sexuality, etc.)", "Underrepresented Group Metrics"),
        ("How long have you been at Hugging Face? (optional)", "Tenure Metrics"),
        ("Which team are you on here at Hugging Face? (optional)", "Team Metrics"),
        ("What is your primary nationality? (optional -- we only listed the largest groups to ensure anonymity.)", "Nationality Metrics"),
        ("Which legal entity are you employed by at HF? (optional)", "Legal Entity Metrics"),
        ("Are you fully remote or work mostly from a Hugging Face office? (optional)", "Work Location Metrics")
    ]

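    # For each demographic column, compute per-group NPS and average scores;
    # tenure groups are reported in chronological order.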
    for demo_col, demo_category in demographic_columns:
        if demo_col in df.columns:
            for col, prefix in [(recommend_col, "HF NPS"), (support_col, "Support NPS")]:
                if col in df.columns:
                    grouped_demo = df.groupby(demo_col)[col]
                    nps_by_demo = {}
                    for group, scores in grouped_demo:
                        promoters = scores.apply(lambda x: x >= 9).sum()
                        detractors = scores.apply(lambda x: x <= 6).sum()
                        total = scores.notna().sum()
                        nps_by_demo[group] = ((promoters - detractors) / total) * 100 if total > 0 else None
                    if demo_category == "Tenure Metrics" and demo_col == "How long have you been at Hugging Face? (optional)":
                        sorted_nps_by_demo = {k: nps_by_demo.get(k, None) for k in tenure_order if k in nps_by_demo}
                        results[demo_category][f'{prefix}'] = {k: round(v, 2) if v is not None else None for k, v in sorted_nps_by_demo.items()}
                    else:
                        results[demo_category][f'{prefix}'] = {k: round(v, 2) if v is not None else None for k, v in nps_by_demo.items()}
                    averages_demo = grouped_demo.mean()
                    if demo_category == "Tenure Metrics" and demo_col == "How long have you been at Hugging Face? (optional)":
                        sorted_averages_demo = {k: averages_demo.get(k, None) for k in tenure_order if k in averages_demo}
                        results[demo_category][f'{prefix} (Average)'] = {k: round(v, 2) if v is not None else None for k, v in sorted_averages_demo.items()}
                    else:
                        results[demo_category][f'{prefix} (Average)'] = averages_demo.round(2).to_dict()

    return results

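# Qualitative analysis: sentiment, topics, key phrases, and a summary for each
# free-text "Why ..." question in the survey.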
def analyze_why_columns(file):
    df = pd.read_excel(file.name)
    why_columns = [col for col in df.columns if col.startswith("Why")]

    results = {}

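    # Hugging Face pipelines: DistilBERT fine-tuned on SST-2 for sentiment,
    # BART-large-CNN for summarization.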
    sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

    for col in why_columns:
        column_data = df[col].dropna().tolist()

        # Sentiment Analysis with Confidence Scores
        sentiments = sentiment_analyzer(column_data)
        sentiment_summary = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
        detailed_sentiments = {"POSITIVE": [], "NEGATIVE": [], "NEUTRAL": []}

        for response, sentiment in zip(column_data, sentiments):
            label = sentiment["label"]
            score = sentiment["score"]
            sentiment_summary[label] += 1
            detailed_sentiments[label].append({"response": response, "score": round(score, 2)})

        # Topic Modeling
        vectorizer = CountVectorizer(stop_words='english')
        X = vectorizer.fit_transform(column_data)
        lda = LatentDirichletAllocation(n_components=3, random_state=0)
        lda.fit(X)
        topics = []
        for idx, topic in enumerate(lda.components_):
            top_words = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-5:]]
            topics.append(f"Topic {idx + 1}: " + ", ".join(top_words))

        # Keyword Extraction
        combined_text = " ".join(column_data)
        word_list = re.findall(r"\b\w+\b", combined_text.lower())
        bigram_vectorizer = CountVectorizer(ngram_range=(2, 3), stop_words='english')
        bigram_counts = bigram_vectorizer.fit_transform([combined_text])
        bigram_features = bigram_vectorizer.get_feature_names_out()
        bigram_counts_sum = bigram_counts.toarray().sum(axis=0)
        bigram_frequency = Counter(dict(zip(bigram_features, bigram_counts_sum))).most_common(10)
        keywords = [f"{phrase} ({count} mentions)" for phrase, count in bigram_frequency]

        # Summarization
        def split_text(text, max_length=1000):
            words = text.split()
            for i in range(0, len(words), max_length):
                yield " ".join(words[i:i + max_length])

        summaries = []
        for chunk in split_text(combined_text, max_length=500):
            summary = summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]['summary_text']
            summaries.append(summary)

        final_summary = " ".join(summaries)

        # Store results
        results[col] = {
            "Sentiment Analysis Summary": sentiment_summary,
            "Detailed Sentiments": detailed_sentiments,
            "Topics": topics,
            "Keywords": keywords,
            "Summary": final_summary
        }

    return results

def process_file(file):
    quantitative_results = analyze_demographics(file)
    qualitative_results = analyze_why_columns(file)

    return quantitative_results, qualitative_results

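# Gradio UI: one Excel file input, two JSON outputs (quantitative and qualitative results).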
def app():
    file_input = gr.File(label="Upload Survey Data (Excel format)")
    text_output = gr.JSON(label="Quantitative Analysis Results")
    qualitative_output = gr.JSON(label="Qualitative Analysis Results")

    iface = gr.Interface(
        fn=process_file,
        inputs=file_input,
        outputs=[text_output, qualitative_output],
        title="Survey Data Analyzer",
        description="Analyze both quantitative and qualitative survey data. Upload an Excel file to generate insights."
    )
    return iface

if __name__ == "__main__":
    app().launch()
requirements.txt
ADDED
@@ -0,0 +1,6 @@
gradio==3.40.0
pandas==1.5.3
openpyxl==3.1.2
scikit-learn==1.2.2
transformers==4.34.0
torch==2.0.1