# Hugging Face Spaces page banner captured with the source (Space status: Sleeping)
| import gradio as gr | |
| import pandas as pd | |
| import tabula | |
| from PyPDF2 import PdfReader | |
| import re | |
| import numpy as np | |
| import os | |
| import tempfile | |
| import shutil | |
# Labels given to the six columns of every table taken from the PDF.
_TABLE_COLUMNS = ['LVA', 'Typ', 'SSt.', 'ECTS', 'Datum', 'Beurteilung']

# Columns derived from the multi-line 'LVA' cell, in result order.
_DERIVED_COLUMNS = ['Prüfer*in', 'Studium', 'LVA-Nr.']

# Numeric value of each graded assessment.  'mit Erfolg teilgenommen' carries
# no grade, so it is intentionally absent: such rows count towards the ECTS
# total but not towards the weighted mean.
_GRADE_VALUES = {
    'sehr gut': 1,
    'gut': 2,
    'befriedigend': 3,
    'genügend': 4,
    'nicht genügend': 5,
}

# Course number: three digits, a dot, then three word characters.
_LVA_NR_PATTERN = re.compile(r'(\d{3})\.[\w]{3}')

# Everything up to (and including) the first carriage return of a cell.
_FIRST_LINE_PATTERN = re.compile(r'^(.*?)\r', re.DOTALL)


def extract_tables_from_pdf(pdf_file):
    """Return one list of tables per page of the PDF.

    Args:
        pdf_file: Object whose ``name`` attribute is the path of the PDF.

    Returns:
        A list with one entry per page; each entry is whatever
        ``tabula.read_pdf(..., multiple_tables=True)`` returns for that page.
    """
    # PyPDF2 is only used to learn how many pages there are.
    with open(pdf_file.name, 'rb') as file:
        page_count = len(PdfReader(file).pages)

    # tabula page numbers are 1-based.
    return [
        tabula.read_pdf(pdf_file.name, pages=page_number, multiple_tables=True)
        for page_number in range(1, page_count + 1)
    ]


def extract_substring(input_string):
    """Return the first course number found in *input_string*, or ''."""
    match = _LVA_NR_PATTERN.search(input_string)
    return match.group() if match else ''


def extract_information(input_string):
    """Split one raw 'LVA' cell into its four parts.

    The cell is read as ``<LVA>\\r<Studium><LVA number><Prüfer'in>``: the
    text before the first carriage return is the LVA, the course number is
    located with the course-number pattern, the text between the two is the
    Studium and the text after the number is the examiner.  Every part is
    returned stripped of surrounding whitespace (including ``\\r``).

    Args:
        input_string: Raw cell content.  Anything that is not a ``str``
            (for example a NaN from an empty cell) is treated as ''.

    Returns:
        A dict with the keys 'LVA', 'Studium', 'LVA Nr.' and "Prüfer'in".
        If no course number is present, 'LVA Nr.' and "Prüfer'in" are ''
        and 'Studium' holds all text after the LVA line.
    """
    if not isinstance(input_string, str):
        input_string = ''

    first_line = _FIRST_LINE_PATTERN.search(input_string)
    if first_line:
        lva = first_line.group(1).strip()
        # Slice from the real end of the first line, not from the length of
        # the stripped title, so whitespace around the title cannot shift
        # the Studium boundaries.
        rest_start = first_line.end()
    else:
        lva = ''
        rest_start = 0

    lva_nr = extract_substring(input_string)
    if lva_nr:
        nr_start = input_string.index(lva_nr)
        studium = input_string[rest_start:nr_start].strip()
        pruefer = input_string[nr_start + len(lva_nr):].strip()
    else:
        # Without a course number there is no boundary between Studium and
        # examiner; str.index('') would be 0 and misfile the whole cell.
        studium = input_string[rest_start:].strip()
        pruefer = ''

    return {
        'LVA': lva,
        'Studium': studium,
        'LVA Nr.': lva_nr,
        'Prüfer\'in': pruefer,
    }


def _page_table_to_frame(table):
    """Relabel one extracted table and split its 'LVA' cell into columns.

    Raises:
        ValueError: If the table does not have exactly six columns.
    """
    table = pd.DataFrame(table)
    # The labels the table arrived with are kept as the first data row
    # (presumably tabula used a real record as the header -- confirm against
    # a sample PDF), then the fixed six labels are applied.
    records = [list(table.columns)] + table.values.tolist()
    frame = pd.DataFrame(records, columns=_TABLE_COLUMNS)

    # Whole-column assignment instead of frame[col][i] = ...: chained
    # assignment may write to a temporary copy and silently do nothing.
    details = [extract_information(cell) for cell in frame['LVA']]
    frame['LVA'] = [item['LVA'] for item in details]
    frame['Prüfer*in'] = [item['Prüfer\'in'] for item in details]
    frame['Studium'] = [item['Studium'] for item in details]
    frame['LVA-Nr.'] = [item['LVA Nr.'] for item in details]
    return frame


def _weighted_grade_mean(grades, ects):
    """Return the ECTS-weighted mean over rows that carry a numeric grade.

    Rows whose assessment is not one of the five graded values, or whose
    ECTS value is missing, are excluded from both numerator and
    denominator.  Returns NaN when no graded ECTS remain.
    """
    values = grades.str.strip().map(_GRADE_VALUES)
    graded = values.notna() & ects.notna()
    weight_total = ects[graded].sum()
    if not weight_total:
        return float('nan')
    return (ects[graded] * values[graded]).sum() / weight_total


def pdf_processing(pdf_file):
    """Extract the course table from a PDF and summarise it.

    The second table found on each page is taken, relabelled to the six
    fixed columns, and its 'LVA' cell is split into LVA, Studium, LVA-Nr.
    and Prüfer*in.  Pages without a second table are skipped.  The combined
    table is written to ``sample.csv`` in the current working directory.

    Args:
        pdf_file: Object whose ``name`` attribute is the path of the PDF.

    Returns:
        A tuple ``(total_ects, weighted_mean, table, csv_path)``:
        the sum of the 'ECTS' column, the ECTS-weighted mean grade rounded
        to two decimals (NaN if nothing is graded), the combined DataFrame,
        and the path of the written CSV file.

    Raises:
        ValueError: If a selected table does not have exactly six columns
            or an 'ECTS' value cannot be converted to float.
    """
    page_frames = []
    for page_tables in extract_tables_from_pdf(pdf_file):
        try:
            table = page_tables[1]
        except IndexError:
            # No second table on this page: skip it rather than re-using the
            # previous page's data (or failing on the very first page).
            continue
        page_frames.append(_page_table_to_frame(table))

    if page_frames:
        final_df = pd.concat(page_frames, ignore_index=True)
    else:
        final_df = pd.DataFrame(columns=_TABLE_COLUMNS + _DERIVED_COLUMNS)

    # Multi-line assessments come with embedded carriage returns.
    final_df['Beurteilung'] = final_df['Beurteilung'].str.replace('\r', ' ', regex=False)
    final_df['ECTS'] = final_df['ECTS'].astype(float)
    final_df = final_df.drop_duplicates()

    column_sum = final_df['ECTS'].sum()
    wm = _weighted_grade_mean(final_df['Beurteilung'], final_df['ECTS'])

    # Save the CSV file to the working directory.
    csv_save_path = os.path.join(os.getcwd(), "sample.csv")
    final_df.to_csv(csv_save_path, index=False)
    return column_sum, np.round(wm, 2), final_df, csv_save_path
# --- Gradio front end --------------------------------------------------------
# Single input: an uploaded PDF, passed to pdf_processing as a file object.
inputs = gr.inputs.File(type="file", label="Select PDF file")

# Four outputs, matching the order of the tuple pdf_processing returns:
# total ECTS, weighted mean, the processed table, and the CSV file.
outputs = [
    gr.outputs.Textbox(label="Total ECTS"),
    gr.outputs.Textbox(label="Weighted Mean"),
    gr.outputs.Dataframe(label="Processed DataFrame", type="pandas").style(full_width=True),
    gr.outputs.File(),
]

# Wire the processing function to the components above.
iface = gr.Interface(
    title="KUSSS Buddy",
    fn=pdf_processing,
    inputs=inputs,
    outputs=outputs,
)

# Start the app; share=False does not request a public share link.
iface.launch(share=False)