Spaces:
Build error
Build error
# Standard library
import re
from collections import Counter

# Third-party
import gradio as gr
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from transformers import pipeline
def _nps_from_scores(scores):
    """Return the Net Promoter Score for a Series of 0-10 ratings.

    Uses the standard NPS buckets: promoters are scores >= 9, detractors
    are scores <= 6, and the result is (promoters - detractors) / total
    respondents * 100. NaN entries are excluded from the respondent count.

    Returns:
        float | None: The NPS, or None when there are no responses
        (the original code returned None here and then crashed calling
        ``round(None, 2)``; callers must use :func:`_round_opt`).
    """
    total = scores.notna().sum()
    if total == 0:
        return None
    # NaN comparisons are False, so NaN rows count in neither bucket —
    # same behavior as the original per-element lambda.
    promoters = (scores >= 9).sum()
    detractors = (scores <= 6).sum()
    return ((promoters - detractors) / total) * 100


def _round_opt(value):
    """Round to 2 decimal places, passing None through unchanged."""
    return None if value is None else round(value, 2)


def analyze_demographics(file):
    """Compute NPS-style quantitative metrics from an uploaded survey workbook.

    Args:
        file: An uploaded-file object exposing a ``.name`` path to an Excel
            file (the shape Gradio's ``File`` component provides).

    Returns:
        dict: Nested mapping of metric category -> metric name -> value.
        "Overall Metrics" holds scalar NPS/average values; each demographic
        category holds per-group breakdown dicts.
    """
    df = pd.read_excel(file.name)
    results = {
        "Overall Metrics": {},
        "Underrepresented Group Metrics": {},
        "Tenure Metrics": {},
        "Team Metrics": {},
        "Nationality Metrics": {},
        "Legal Entity Metrics": {},
        "Work Location Metrics": {}
    }
    # Chronological display order for tenure buckets (pandas groups sort
    # lexicographically, which would interleave them).
    tenure_order = ["< 1 year", "1 year - 2 years", "2 years - 3 years", "3 years - 4 years", "> 4 years"]
    tenure_col = "How long have you been at Hugging Face? (optional)"
    recommend_col = "On a scale of 0 to 10, how likely are you to recommend working at Hugging Face to a friend or colleague?"
    support_col = "On a scale of 0 to 10, how likely are you to recommend the support functions at HF (diversity, finance, hr, legal, security, talent) to a friend or colleague?"
    score_columns = [(recommend_col, "HF NPS"), (support_col, "Support NPS")]

    # Overall metrics: one NPS figure and one plain mean per rating column.
    for col, prefix in score_columns:
        if col in df.columns:
            # _round_opt guards the no-respondents case (NPS is None);
            # round() on the original None crashed with TypeError.
            results["Overall Metrics"][prefix] = _round_opt(_nps_from_scores(df[col]))
            results["Overall Metrics"][f"{prefix} (Average)"] = round(df[col].mean(), 2)

    demographic_columns = [
        ("I identify as a member of an underrepresented group in tech. (e.g. including but not limited to gender, age, disability, sexuality, etc.)", "Underrepresented Group Metrics"),
        (tenure_col, "Tenure Metrics"),
        ("Which team are you on here at Hugging Face? (optional)", "Team Metrics"),
        ("What is your primary nationality? (optional -- we only listed the largest groups to ensure anonymity.)", "Nationality Metrics"),
        ("Which legal entity are you employed by at HF? (optional)", "Legal Entity Metrics"),
        ("Are you fully remote or work mostly from a Hugging Face office? (optional)", "Work Location Metrics")
    ]
    # Per-demographic breakdowns: NPS and average per group, for each rating column.
    for demo_col, demo_category in demographic_columns:
        if demo_col not in df.columns:
            continue
        for col, prefix in score_columns:
            if col not in df.columns:
                continue
            grouped = df.groupby(demo_col)[col]
            nps_by_group = {group: _nps_from_scores(scores) for group, scores in grouped}
            averages = grouped.mean()
            if demo_col == tenure_col:
                # Re-key both dicts into chronological tenure order.
                nps_by_group = {k: nps_by_group[k] for k in tenure_order if k in nps_by_group}
                results[demo_category][f"{prefix} (Average)"] = {
                    k: _round_opt(averages[k]) for k in tenure_order if k in averages
                }
            else:
                results[demo_category][f"{prefix} (Average)"] = averages.round(2).to_dict()
            results[demo_category][prefix] = {k: _round_opt(v) for k, v in nps_by_group.items()}
    return results
def _split_text(text, max_length=1000):
    """Yield successive chunks of at most *max_length* whitespace-separated words."""
    words = text.split()
    for i in range(0, len(words), max_length):
        yield " ".join(words[i:i + max_length])


def analyze_why_columns(file):
    """Run qualitative NLP analysis over the free-text "Why?" survey columns.

    For each recognized "Why?" column this produces: sentiment counts with
    per-response confidence scores, three LDA topics, the ten most frequent
    bigram/trigram keywords, and a chunked abstractive summary.

    Args:
        file: An uploaded-file object exposing a ``.name`` path to an Excel file.

    Returns:
        dict: friendly column label -> analysis-results dict. Columns with
        no responses are skipped entirely (they would otherwise crash the
        vectorizer/LDA fit).
    """
    df = pd.read_excel(file.name)
    # pandas de-duplicates repeated headers as "Why? (optional.1)" etc.;
    # map them to human-readable labels.
    column_label_map = {
        "Why? (optional)": "HF NPS Why?",
        "Why? (optional.1)": "Support Team NPS Why?",
        "Why? (optional.2)": "Productivity Why?"
    }
    df = df.rename(columns=column_label_map)
    why_columns = [col for col in df.columns if col in column_label_map.values()]
    results = {}
    sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    for col in why_columns:
        column_data = df[col].dropna().tolist()
        if not column_data:
            # An all-empty column would raise in CountVectorizer/LDA below.
            continue

        # --- Sentiment analysis with confidence scores ---
        sentiments = sentiment_analyzer(column_data)
        sentiment_summary = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
        detailed_sentiments = {"POSITIVE": [], "NEGATIVE": [], "NEUTRAL": []}
        for response, sentiment in zip(column_data, sentiments):
            label = sentiment["label"]
            # .get/.setdefault tolerate any label the model emits, instead of
            # KeyError-ing on labels outside the three pre-seeded buckets.
            sentiment_summary[label] = sentiment_summary.get(label, 0) + 1
            detailed_sentiments.setdefault(label, []).append(
                {"response": response, "score": round(sentiment["score"], 2)}
            )

        # --- Topic modeling: LDA over a bag-of-words matrix ---
        vectorizer = CountVectorizer(stop_words='english')
        X = vectorizer.fit_transform(column_data)
        lda = LatentDirichletAllocation(n_components=3, random_state=0)
        lda.fit(X)
        feature_names = vectorizer.get_feature_names_out()  # hoisted out of the topic loop
        topics = []
        for idx, topic in enumerate(lda.components_):
            top_words = [feature_names[i] for i in topic.argsort()[-5:]]
            topics.append(f"Topic {idx + 1}: " + ", ".join(top_words))

        # --- Keyword extraction: most common bigrams/trigrams ---
        # (Removed dead code: an unused `word_list` built with a doubly
        # escaped regex that matched literal backslashes.)
        combined_text = " ".join(column_data)
        bigram_vectorizer = CountVectorizer(ngram_range=(2, 3), stop_words='english')
        bigram_counts = bigram_vectorizer.fit_transform([combined_text])
        counts = bigram_counts.toarray().sum(axis=0)
        frequency = Counter(dict(zip(bigram_vectorizer.get_feature_names_out(), counts)))
        keywords = [f"{phrase} ({count} mentions)" for phrase, count in frequency.most_common(10)]

        # --- Abstractive summarization, chunked to respect the model's input limit ---
        summaries = [
            summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]['summary_text']
            for chunk in _split_text(combined_text, max_length=500)
        ]

        results[col] = {
            "Sentiment Analysis Summary": sentiment_summary,
            "Detailed Sentiments": detailed_sentiments,
            "Topics": topics,
            "Keywords": keywords,
            "Summary": " ".join(summaries)
        }
    return results
def process_file(file):
    """Run both analysis pipelines on one uploaded survey workbook.

    Args:
        file: An uploaded-file object exposing a ``.name`` path to an Excel file.

    Returns:
        tuple: ``(quantitative_results, qualitative_results)`` dicts from
        :func:`analyze_demographics` and :func:`analyze_why_columns`.
    """
    return analyze_demographics(file), analyze_why_columns(file)
def app():
    """Build and return the Gradio interface for the survey analyzer.

    The interface takes one Excel upload and renders two JSON panels:
    quantitative (NPS metrics) and qualitative (free-text analysis).
    """
    return gr.Interface(
        fn=process_file,
        inputs=gr.File(label="Upload Survey Data (Excel format)"),
        outputs=[
            gr.JSON(label="Quantitative Analysis Results"),
            gr.JSON(label="Qualitative Analysis Results"),
        ],
        title="Survey Data Analyzer",
        description="Analyze both quantitative and qualitative survey data. Upload an Excel file to generate insights.",
    )
if __name__ == "__main__":
    # Launch the Gradio app when run as a script; share=True requests a
    # public gradio.live URL in addition to the local server.
    app().launch(share=True)