Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import fitz # PyMuPDF
|
| 5 |
+
import re
|
| 6 |
+
import json
|
| 7 |
+
|
| 8 |
+
def extract_pdf_text(pdf_path):
    """Extract the plain text of every page of a PDF file.

    Args:
        pdf_path: Location of the PDF; handed unchanged to ``fitz.open``.

    Returns:
        The text of all pages concatenated in page order, with a newline
        appended after each page's text. An empty string for a document
        without pages.
    """
    with fitz.open(pdf_path) as pdf_document:
        # Iterate the pages directly and join once instead of growing a
        # string with ``+=`` inside an index-based loop.
        return "".join(page.get_text() + "\n" for page in pdf_document)
|
| 16 |
+
|
| 17 |
+
# Streamlit Application
|
| 18 |
+
st.title("PDF Data Extractor")

uploaded_file = st.file_uploader("Upload a PDF File", type="pdf")

# Number of rows in the competency table built below; the parsing code uses
# it both to recognise complete score blocks and to slice the code list.
EXPECTED_DIMENSIONS = 22

# Per-rater score columns that feed the self-score computation.
feature_cols = ['Boss_score', 'Colleague_score', 'Report_score', 'Customer_score', 'Colleague_other_score']


def _read_uploaded_pdf_text(uploaded):
    """Return the text of an uploaded PDF via a private temporary file.

    A unique temporary file is used instead of a fixed file name so that
    simultaneous uploads cannot overwrite each other, and the file is
    removed again once the text has been extracted.
    """
    import os
    import tempfile

    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(uploaded.getbuffer())
        tmp_path = tmp.name
    try:
        return extract_pdf_text(tmp_path)
    finally:
        os.unlink(tmp_path)


def _parse_rater_chunks(pdf_text):
    """Return one list of value strings per score block found in *pdf_text*.

    Decimal commas are replaced by dots and blank lines are dropped. A block
    with exactly ``EXPECTED_DIMENSIONS`` values is kept as is; for any other
    length every second value (starting with the second) is kept.
    """
    pattern = r"2\s*3\s*4\s*5\s*\n-1,5\s*0([\s\S]*?)\n\nTrainer & Berater-Feedback"
    chunks = []
    for block in re.findall(pattern, pdf_text):
        values = [line.strip() for line in block.replace(",", ".").split("\n") if line.strip()]
        chunks.append(values if len(values) == EXPECTED_DIMENSIONS else values[1::2])
    return chunks


def calculate_self_score(row):
    """Derive the self score for one row of the combined table.

    Uses the non-missing values of ``feature_cols`` and the row's
    ``All_raters_Score``. With ``n`` valid values the result is
    ``All_raters_Score * n - (sum(values) - All_raters_Score)``, which is
    algebraically ``All_raters_Score * (n + 1) - sum(values)``.
    NOTE(review): this presumably treats All_raters_Score as the mean over
    the listed raters plus the self rating - confirm against the report.

    Returns:
        The derived score, or NaN when at most one rater value is present.
    """
    valid_features = [val for val in row[feature_cols] if pd.notna(val)]
    num_features = len(valid_features)
    if num_features <= 1:
        return np.nan
    sum_features = sum(valid_features) - row['All_raters_Score']
    return (row['All_raters_Score'] * num_features) - sum_features


if uploaded_file is not None:
    pdf_text = _read_uploaded_pdf_text(uploaded_file)

    # Step 2: Extract relevant information from the text using regex
    rater_chunks = _parse_rater_chunks(pdf_text)

    # Define the original data structure
    original_data = {
        'Title': (
            ["Personal Competence"] * 13
            + ["Business Competence"] * 5
            + ["Management Competence"] * 4
        ),
        'Code': ["P1", "P2", "P3", "P4", "P5", "P6", "P7", "P8", "P9", "P10", "P11", "P12",
                 "P13", "B1", "B2", "B3", "B4", "B5", "M1", "M2", "M3", "M4"],
        'Dimensions': [
            "Integrity/ Reliability", "Appearance", "Enthusiasm/Passion", "Learning Motivation/ Self-Development", "Ability to Adapt/Flexibility",
            "Communication/Information", "Cooperation/ Team spirit", "Handling of Complex Situations", "Coolness/Handling of Unclear Situations",
            "Self-reliance/Initiative", "Conflict Management", "Ability to Assert Oneself/ Negotiation Skills", "Tact and Sensitivity",
            "Quality Orientation", "Client Orientation", "Specialized Knowledge", "Methodology/ Didactics/ Language", "Creativity/ Conceptional Skills",
            "Project Management", "Result Orientation", "Leadership Skills", "Coach and Advisor",
        ],
    }

    df = pd.DataFrame(original_data)

    # Add extracted scores to the DataFrame
    score_columns = ['Boss_score', 'Colleague_score', 'Colleague_other_score', 'Report_score', 'Customer_score']
    if len(rater_chunks) < len(score_columns):
        # Previously this surfaced as an IndexError on the first missing block.
        st.error(
            f"Expected {len(score_columns)} rater score blocks in the PDF "
            f"but found {len(rater_chunks)}."
        )
        st.stop()

    for col, values in zip(score_columns, rater_chunks):
        if len(values) > len(df):
            # A longer block cannot be assigned to the table column.
            st.error(
                f"Score block for '{col}' has {len(values)} values; "
                f"at most {len(df)} were expected."
            )
            st.stop()
        # Pad short blocks so the column always has one entry per dimension.
        df[col] = values + [None] * (len(df) - len(values))

    score_pattern = r"\d{1,2},\d{2}"
    code_pattern = r"[A-Z]\.[0-9]{1,2}"

    scores = [float(score.replace(",", ".")) for score in re.findall(score_pattern, pdf_text)]
    codes = [code.strip() for code in re.findall(code_pattern, pdf_text)]

    # Keep the second run of codes and the first run of scores when enough
    # were found. NOTE(review): which document section each run belongs to
    # is not visible here - verify against a sample report.
    if len(codes) >= 2 * EXPECTED_DIMENSIONS:
        codes = codes[EXPECTED_DIMENSIONS:2 * EXPECTED_DIMENSIONS]
    if len(scores) >= EXPECTED_DIMENSIONS:
        scores = scores[0:EXPECTED_DIMENSIONS]

    if len(codes) != len(scores):
        # pandas refuses to build a frame from columns of unequal length.
        st.error(
            f"Found {len(codes)} competency codes but {len(scores)} overall scores; "
            "the PDF does not match the expected layout."
        )
        st.stop()

    df1 = pd.DataFrame({'Code': [code.replace('.', '') for code in codes], 'All_raters_Score': scores})
    df_combined = pd.merge(df, df1, on="Code", how="inner")

    if df_combined.empty:
        st.warning("None of the competency codes found in the PDF match the expected codes.")
        st.stop()

    try:
        df_combined[feature_cols] = df_combined[feature_cols].astype(float)
    except ValueError:
        st.error("The extracted rater scores contain non-numeric values.")
        st.stop()

    df_combined['Self_score'] = df_combined.apply(calculate_self_score, axis=1)

    # Display the resultant DataFrame
    st.write("### Extracted Dataset")
    st.dataframe(df_combined)
|