Spaces: Build error
Emily Witko committed · Commit ec15d48 · 1 Parent(s): 335a565
Initial commit
Browse files:
- app.py +167 -0
- requirements.txt +6 -0
app.py
ADDED
@@ -0,0 +1,167 @@
import gradio as gr
import pandas as pd
from transformers import pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter
import re

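# Quantitative analysis: read the uploaded survey export and compute NPS-style
# metrics overall and broken down by demographic group.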
def analyze_demographics(file):
    df = pd.read_excel(file.name)

    results = {
        "Overall Metrics": {},
        "Underrepresented Group Metrics": {},
        "Tenure Metrics": {},
        "Team Metrics": {},
        "Nationality Metrics": {},
        "Legal Entity Metrics": {},
        "Work Location Metrics": {}
    }

    tenure_order = ["< 1 year", "1 year - 2 years", "2 years - 3 years", "3 years - 4 years", "> 4 years"]

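    # NPS convention: promoters score 9-10, detractors 0-6;
    # NPS = (promoters - detractors) / respondents * 100.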
    recommend_col = "On a scale of 0 to 10, how likely are you to recommend working at Hugging Face to a friend or colleague?"
    if recommend_col in df.columns:
        promoters = df[recommend_col].apply(lambda x: x >= 9).sum()
        detractors = df[recommend_col].apply(lambda x: x <= 6).sum()
        total_respondents = df[recommend_col].notna().sum()
        recommend_nps = ((promoters - detractors) / total_respondents) * 100 if total_respondents > 0 else None
        recommend_avg = df[recommend_col].mean()
        results["Overall Metrics"]['HF NPS'] = round(recommend_nps, 2) if recommend_nps is not None else None
        results["Overall Metrics"]['HF NPS (Average)'] = round(recommend_avg, 2)

    support_col = "On a scale of 0 to 10, how likely are you to recommend the support functions at HF (diversity, finance, hr, legal, security, talent) to a friend or colleague?"
    if support_col in df.columns:
        promoters = df[support_col].apply(lambda x: x >= 9).sum()
        detractors = df[support_col].apply(lambda x: x <= 6).sum()
        total_respondents = df[support_col].notna().sum()
        support_nps = ((promoters - detractors) / total_respondents) * 100 if total_respondents > 0 else None
        support_avg = df[support_col].mean()
        results["Overall Metrics"]['Support NPS'] = round(support_nps, 2) if support_nps is not None else None
        results["Overall Metrics"]['Support NPS (Average)'] = round(support_avg, 2)

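    # Optional demographic questions used to break the scores down by group.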
    demographic_columns = [
        ("I identify as a member of an underrepresented group in tech. (e.g. including but not limited to gender, age, disability, sexuality, etc.)", "Underrepresented Group Metrics"),
        ("How long have you been at Hugging Face? (optional)", "Tenure Metrics"),
        ("Which team are you on here at Hugging Face? (optional)", "Team Metrics"),
        ("What is your primary nationality? (optional -- we only listed the largest groups to ensure anonymity.)", "Nationality Metrics"),
        ("Which legal entity are you employed by at HF? (optional)", "Legal Entity Metrics"),
        ("Are you fully remote or work mostly from a Hugging Face office? (optional)", "Work Location Metrics")
    ]

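    # For each demographic column, compute per-group NPS and average scores;
    # tenure groups are reported in chronological order.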
    for demo_col, demo_category in demographic_columns:
        if demo_col in df.columns:
            for col, prefix in [(recommend_col, "HF NPS"), (support_col, "Support NPS")]:
                if col in df.columns:
                    grouped_demo = df.groupby(demo_col)[col]
                    nps_by_demo = {}
                    for group, scores in grouped_demo:
                        promoters = scores.apply(lambda x: x >= 9).sum()
                        detractors = scores.apply(lambda x: x <= 6).sum()
                        total = scores.notna().sum()
                        nps_by_demo[group] = ((promoters - detractors) / total) * 100 if total > 0 else None
                    if demo_category == "Tenure Metrics" and demo_col == "How long have you been at Hugging Face? (optional)":
                        sorted_nps_by_demo = {k: nps_by_demo.get(k, None) for k in tenure_order if k in nps_by_demo}
                        results[demo_category][f'{prefix}'] = {k: round(v, 2) if v is not None else None for k, v in sorted_nps_by_demo.items()}
                    else:
                        results[demo_category][f'{prefix}'] = {k: round(v, 2) if v is not None else None for k, v in nps_by_demo.items()}
                    averages_demo = grouped_demo.mean()
                    if demo_category == "Tenure Metrics" and demo_col == "How long have you been at Hugging Face? (optional)":
                        sorted_averages_demo = {k: averages_demo.get(k, None) for k in tenure_order if k in averages_demo}
                        results[demo_category][f'{prefix} (Average)'] = {k: round(v, 2) if v is not None else None for k, v in sorted_averages_demo.items()}
                    else:
                        results[demo_category][f'{prefix} (Average)'] = averages_demo.round(2).to_dict()

    return results

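# Qualitative analysis: sentiment, topics, key phrases, and a summary for each
# free-text "Why ..." question in the survey.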
def analyze_why_columns(file):
    df = pd.read_excel(file.name)
    why_columns = [col for col in df.columns if col.startswith("Why")]

    results = {}

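    # Hugging Face pipelines: DistilBERT fine-tuned on SST-2 for sentiment,
    # BART-large-CNN for summarization.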
    sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

    for col in why_columns:
        column_data = df[col].dropna().tolist()

        # Sentiment Analysis with Confidence Scores
        sentiments = sentiment_analyzer(column_data)
        sentiment_summary = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
        detailed_sentiments = {"POSITIVE": [], "NEGATIVE": [], "NEUTRAL": []}

        for response, sentiment in zip(column_data, sentiments):
            label = sentiment["label"]
            score = sentiment["score"]
            sentiment_summary[label] += 1
            detailed_sentiments[label].append({"response": response, "score": round(score, 2)})

        # Topic Modeling
        vectorizer = CountVectorizer(stop_words='english')
        X = vectorizer.fit_transform(column_data)
        lda = LatentDirichletAllocation(n_components=3, random_state=0)
        lda.fit(X)
        topics = []
        for idx, topic in enumerate(lda.components_):
            top_words = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-5:]]
            topics.append(f"Topic {idx + 1}: " + ", ".join(top_words))

        # Keyword Extraction
        combined_text = " ".join(column_data)
        word_list = re.findall(r"\b\w+\b", combined_text.lower())
        bigram_vectorizer = CountVectorizer(ngram_range=(2, 3), stop_words='english')
        bigram_counts = bigram_vectorizer.fit_transform([combined_text])
        bigram_features = bigram_vectorizer.get_feature_names_out()
        bigram_counts_sum = bigram_counts.toarray().sum(axis=0)
        bigram_frequency = Counter(dict(zip(bigram_features, bigram_counts_sum))).most_common(10)
        keywords = [f"{phrase} ({count} mentions)" for phrase, count in bigram_frequency]

        # Summarization
        def split_text(text, max_length=1000):
            words = text.split()
            for i in range(0, len(words), max_length):
                yield " ".join(words[i:i + max_length])

        summaries = []
        for chunk in split_text(combined_text, max_length=500):
            summary = summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]['summary_text']
            summaries.append(summary)

        final_summary = " ".join(summaries)

        # Store results
        results[col] = {
            "Sentiment Analysis Summary": sentiment_summary,
            "Detailed Sentiments": detailed_sentiments,
            "Topics": topics,
            "Keywords": keywords,
            "Summary": final_summary
        }

    return results

def process_file(file):
    quantitative_results = analyze_demographics(file)
    qualitative_results = analyze_why_columns(file)

    return quantitative_results, qualitative_results

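# Gradio UI: one Excel file input, two JSON outputs (quantitative and qualitative results).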
def app():
    file_input = gr.File(label="Upload Survey Data (Excel format)")
    text_output = gr.JSON(label="Quantitative Analysis Results")
    qualitative_output = gr.JSON(label="Qualitative Analysis Results")

    iface = gr.Interface(
        fn=process_file,
        inputs=file_input,
        outputs=[text_output, qualitative_output],
        title="Survey Data Analyzer",
        description="Analyze both quantitative and qualitative survey data. Upload an Excel file to generate insights."
    )
    return iface

if __name__ == "__main__":
    app().launch()
requirements.txt
ADDED
@@ -0,0 +1,6 @@
gradio==3.40.0
pandas==1.5.3
openpyxl==3.1.2
scikit-learn==1.2.2
transformers==4.34.0
torch==2.0.1