Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import fitz # PyMuPDF | |
| import re | |
| import json | |
| # LICENSE.numpy.BSD-3 - Copyright (c) 2005-2024, NumPy Developers (https://github.com/numpy/numpy/blob/main/LICENSE.txt) | |
| # LICENSE.streamlit.Apachev2 - Copyright (c) Streamlit Inc. (2018-2022) Snowflake Inc. (2022-2024) (https://github.com/streamlit/streamlit/blob/develop/LICENSE) | |
| # LICENSE.pandas.BSD-3 - Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team (https://github.com/pandas-dev/pandas/blob/main/LICENSE) | |
| # LICENSE.re.CNRI - Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. (https://www.handle.net/python_licenses/python1.6_9-5-00.html) | |
| # LICENSE.json.LGPL - Copyright: (c) 2017-2019 by Brad Jasper (c) 2012-2017 by Trevor Lohrbeer (https://github.com/bradjasper/ImportJSON/blob/master/LICENSE) | |
| # LICENSE.pymupdf.AGPL - Copyright (C) 2023 Artifex Software, Inc. (https://github.com/pymupdf/PyMuPDF/blob/main/COPYING) | |
def extract_pdf_text(pdf_path):
    """Return the plain text of the PDF at *pdf_path*.

    Pages are concatenated in document order, with a newline appended
    after each page's text (same output as the original page-by-page
    concatenation).
    """
    with fitz.open(pdf_path) as pdf_document:
        # Iterate pages directly and join once instead of building the
        # string with quadratic `+=` concatenation.
        return "".join(page.get_text() + "\n" for page in pdf_document)
# Streamlit application entry point: upload a PDF, persist it, read its text.
st.title("PDF Data Extractor")
uploaded_file = st.file_uploader("Upload a PDF File", type="pdf")
if uploaded_file is None:
    # Guard clause: halt the script run until a file is uploaded,
    # keeping the rest of the pipeline flat instead of nested.
    st.stop()
# Write the upload to disk so PyMuPDF can open it by path.
with open("temp.pdf", "wb") as f:
    f.write(uploaded_file.getbuffer())
pdf_text = extract_pdf_text("temp.pdf")
# Step 2: pull each rater's rating block out of the raw text. The scale
# header ("2 3 4 5" / "-1,5 0") marks the start of a block and the
# "Trainer & Berater-Feedback" heading marks its end.
pattern = r"2\s*3\s*4\s*5\s*\n-1,5\s*0([\s\S]*?)\n\nTrainer & Berater-Feedback"
matches = re.findall(pattern, pdf_text)
json_chunks = []
for match in matches:
    match = match.replace(",", ".")  # German decimal commas -> dots
    values = [value.strip() for value in match.split("\n") if value.strip()]
    if len(values) == 22:
        json_chunks.append({"current": values})
    else:
        # Interleaved label/value lines: keep every second entry.
        json_chunks.append({"current": values[1::2]})
# The original serialized to JSON and immediately parsed it back, which
# is a no-op for this structure -- use the list of dicts directly.
json_data = json_chunks
# Step 3: static competency catalogue (cluster title, code, dimension name).
original_data = {
    'Title': [
        "Personal Competence", "Personal Competence", "Personal Competence", "Personal Competence", "Personal Competence", "Personal Competence",
        "Personal Competence", "Personal Competence", "Personal Competence", "Personal Competence", "Personal Competence",
        "Personal Competence", "Personal Competence", "Business Competence", "Business Competence", "Business Competence", "Business Competence",
        "Business Competence", "Management Competence", "Management Competence", "Management Competence", "Management Competence",
    ],
    'Code': ["P1", "P2", "P3", "P4", "P5", "P6", "P7", "P8", "P9", "P10", "P11", "P12",
             "P13", "B1", "B2", "B3", "B4", "B5", "M1", "M2", "M3", "M4"],
    'Dimensions': [
        "Integrity/ Reliability", "Appearance", "Enthusiasm/Passion", "Learning Motivation/ Self-Development", "Ability to Adapt/Flexibility",
        "Communication/Information", "Cooperation/ Team spirit", "Handling of Complex Situations", "Coolness/Handling of Unclear Situations",
        "Self-reliance/Initiative", "Conflict Management", "Ability to Assert Oneself/ Negotiation Skills", "Tact and Sensitivity",
        "Quality Orientation", "Client Orientation", "Specialized Knowledge", "Methodology/ Didactics/ Language", "Creativity/ Conceptional Skills",
        "Project Management", "Result Orientation", "Leadership Skills", "Coach and Advisor"
    ]
}
df = pd.DataFrame(original_data)
# Step 4: attach one extracted score column per rater group. Short
# columns are padded with None; a missing rater block no longer raises
# IndexError (the original indexed json_data unconditionally).
score_columns = ['Boss_score', 'Colleague_score', 'Colleague_other_score', 'Report_score', 'Customer_score']
for idx, col in enumerate(score_columns):
    if idx < len(json_data):
        values = json_data[idx]['current']
        df[col] = values + [None] * (len(df) - len(values))
    else:
        df[col] = None  # rater block absent from this PDF
# Step 5: overall ("all raters") scores and their dimension codes
# (codes look like "P.1" in the PDF; the dot is stripped for merging).
score_pattern = r"\d{1,2},\d{2}"
code_pattern = r"[A-Z]\.[0-9]{1,2}"
all_scores = re.findall(score_pattern, pdf_text)
all_codes = re.findall(code_pattern, pdf_text)
scores = [float(raw.replace(",", ".")) for raw in all_scores]
codes = [raw.strip() for raw in all_codes]
# The document repeats the 22 codes; the second run (22..43) is the one
# paired with the first 22 scores.
if len(codes) >= 44:
    codes = codes[22:44]
if len(scores) >= 22:
    scores = scores[:22]
df1 = pd.DataFrame({'Code': [c.replace('.', '') for c in codes], 'All_raters_Score': scores})
df_combined = pd.merge(df, df1, on="Code", how="inner")
feature_cols = ['Boss_score', 'Colleague_score', 'Report_score', 'Customer_score', 'Colleague_other_score']
df_combined[feature_cols] = df_combined[feature_cols].astype(float)
def calculate_self_score(row):
    """Derive the self score for one competency row.

    Per the on-screen note: (all-raters score x number of available
    rater scores) minus the sum of the other raters' scores, where that
    sum is the available rater total minus the all-raters score.
    Returns NaN when fewer than two rater scores are present.
    """
    available = [v for v in row[feature_cols] if pd.notna(v)]
    n = len(available)
    if n <= 1:
        return np.nan
    others_total = sum(available) - row['All_raters_Score']
    return row['All_raters_Score'] * n - others_total

df_combined['Self_score'] = df_combined.apply(calculate_self_score, axis=1)
# Placeholder benchmark: random values in [4.8, 5.9), one decimal place.
df_combined['Benchmark_score'] = np.random.uniform(4.8, 5.9, size=len(df_combined)).round(1)
# Step 7: pick strengths and weaknesses mentioned in the free-text part.
# Dimension names to look for inside each extracted passage.
keywords = [
    'Integrity/ Reliability', 'Appearance', 'Enthusiasm/Passion',
    'Learning Motivation/ Self-Development', 'Ability to Adapt/Flexibility',
    'Communication/Information', 'Cooperation/ Team spirit',
    'Handling of Complex Situations', 'Coolness/Handling of Unclear Situations', 'Self-reliance/Initiative',
    'Conflict Management', 'Ability to Assert Oneself/ Negotiation Skills',
    'Tact and Sensitivity', 'Quality Orientation', 'Client Orientation',
    'Specialized Knowledge', 'Methodology/ Didactics/ Language',
    'Creativity/ Conceptional Skills', 'Project Management',
    'Result Orientation', 'Leadership Skills', 'Coach and Advisor'
]
# Passages between the development-planning sentence and the follow-up
# meeting heading; one passage per rater section.
phrases_pattern = r"Please use the form at the end of the section to finalize your development planning\.\s*(.*?)\s*Schedule for the follow-up meeting"
phrases_matches = re.findall(phrases_pattern, pdf_text, re.DOTALL)
# The word right after the "perceived by:" heading names the rater.
label_pattern = r"The biggest strengths and room for improvements perceived by:\s*(\w+)"
labels = re.findall(label_pattern, pdf_text)
# Keep only the known dimension names found in each passage.
json_output = []
for i, phrases_text in enumerate(phrases_matches):
    extracted_phrases = [kw for kw in keywords if kw in phrases_text]
    if not extracted_phrases:
        continue
    # Fall back to a positional label when no rater name was captured.
    label = labels[i] if i < len(labels) else f"Phrases_{i+1}"
    json_output.append({label: extracted_phrases})
# Step 8: flatten {rater: [dimensions]} records into one row per
# (rater, dimension) pair. The original also built a JSON string
# (json_output_str) and re-serialized df -- both were never read, so
# those dead assignments are removed.
data = []
for item in json_output:
    for label, phrases in item.items():
        for phrase in phrases:
            data.append({'Rater': label, 'Dimensions': phrase})
df4 = pd.DataFrame(data)
# Step 9: one sub-frame per rater, each with that rater's scores mapped in.
# NOTE(review): 'Colleague (o' looks truncated, and the label regex only
# captures a single word -- this bucket is probably always empty; verify
# against a real report.
boss, direct, colleague, other_colleague = [
    df4[df4['Rater'] == rater_label].copy()
    for rater_label in ['Boss', 'Direct', 'Colleagues', 'Colleague (o']
]
# Dimension -> score lookup tables taken from the combined frame.
mappings = {
    'Boss_score': df_combined.set_index('Dimensions')['Boss_score'].to_dict(),
    'Report_score': df_combined.set_index('Dimensions')['Report_score'].to_dict(),
    'Colleague_score': df_combined.set_index('Dimensions')['Colleague_score'].to_dict(),
    'Other_colleague_score': df_combined.set_index('Dimensions')['Colleague_other_score'].to_dict()
}
boss['Score'] = boss['Dimensions'].map(mappings['Boss_score'])
direct['Score'] = direct['Dimensions'].map(mappings['Report_score'])
colleague['Score'] = colleague['Dimensions'].map(mappings['Colleague_score'])
other_colleague['Score'] = other_colleague['Dimensions'].map(mappings['Other_colleague_score'])
# Highest score first, index reset so positions 0-5 mark the top/bottom three.
boss = boss.sort_values('Score', ascending=False).reset_index(drop=True)
direct = direct.sort_values('Score', ascending=False).reset_index(drop=True)
colleague = colleague.sort_values('Score', ascending=False).reset_index(drop=True)
other_colleague = other_colleague.sort_values('Score', ascending=False).reset_index(drop=True)
def assign_strength_weakness(df):
    """Label the three highest-scored rows 'S' (strength) and the next
    three 'W' (weakness) in a new 'Strength/Weakness' column.

    Rows outside positions 0-5, and rows whose Score is missing, keep
    NaN. Assumes the frame is already sorted by Score descending with a
    reset index; mutates and returns *df*.
    """
    df['Strength/Weakness'] = np.nan
    has_score = df['Score'].notna()
    df.loc[has_score & df.index.isin([0, 1, 2]), 'Strength/Weakness'] = 'S'
    df.loc[has_score & df.index.isin([3, 4, 5]), 'Strength/Weakness'] = 'W'
    return df
# Tag strengths/weaknesses in each rater frame, then stack them into one
# frame, keeping only rows that received both a score and an S/W tag.
boss = assign_strength_weakness(boss)
direct = assign_strength_weakness(direct)
colleague = assign_strength_weakness(colleague)
other_colleague = assign_strength_weakness(other_colleague)
df5 = pd.concat([boss, direct, colleague, other_colleague], axis=0).dropna()
# Open-comment section headings as they appear in the questionnaire PDF.
sections = [
    "Continue doing the following",
    "Start doing the following",
    "Reasons why I think that your behavior has worsened concerning the dimensions marked in the \"Perception & Change Section\" of the questionnaire",
    "Further tips for your work in our organisation",
]
# Per-rater comment patterns: text between a rater marker ("VG"/"Ke"/"KU")
# and the parenthesised rater name that closes each comment.
patterns = {
    "Boss": r"VG\n(.*?)(?=\(Boss\))",
    "Colleagues": r"Ke\n(.*?)(?=\(Colleagues\))",
    "Customers": r"KU\n(.*?)(?=\(Internal/external customers\))",
}
def extract_comments(data, section):
    """Collect the per-rater comments found under one section heading.

    Searches *data* for the section's "Kom <heading>:" block (up to the
    "IX. Open Comments" heading or end of text), then applies each
    rater pattern from the module-level ``patterns`` dict to that block.
    Returns a list of {Section, Rater, Comment} dicts; empty if the
    section is not present.
    """
    section_pattern = rf"Kom\s+{re.escape(section)}:\n(.*?)(?=(?:IX\. Open Comments|$))"
    found = re.search(section_pattern, data, re.DOTALL)
    if found is None:
        return []
    section_text = found.group(1)
    comments = []
    for rater, rater_pattern in patterns.items():
        for raw_comment in re.findall(rater_pattern, section_text, re.DOTALL):
            comments.append({
                "Section": section,
                "Rater": rater,
                "Comment": raw_comment.strip(),
            })
    return comments
# Step 10: gather open comments for every section and render all results.
all_comments = []
for section in sections:
    all_comments.extend(extract_comments(pdf_text, section))
df6 = pd.DataFrame(all_comments)
st.write("## Output:")
# Typo fixed in the heading: "Compentency" -> "Competency".
st.write("### 1. Dataset: Competency Cluster, Code, Dimensions, Raters and Score")
st.dataframe(df_combined)
st.write("#### Note: The Self Score is calculated as: (All Raters × Number of Raters) − (Sum of Rater Scores)")
st.write("### 2. Extracted list of Strengths and Weaknesses rated by each Rater")
st.write(df5)
st.write("### 3. Extracted list of Open Comments by each Rater")
# Grammar fixed: "This is will be improved" -> "This will be improved".
st.write("#### Note: This extraction is not 100% able to extract each Rater comments / feedback. This will be improved")