Spaces:

marcelhuber
/

kusssbuddy

Sleeping

App Files Files Community

kusssbuddy / app.py

marcelhuber

Update app.py

f0e873d over 2 years ago

raw

history blame contribute delete

5.32 kB

	import gradio as gr
	import pandas as pd
	import tabula
	from PyPDF2 import PdfReader
	import re
	import numpy as np
	import os
	import tempfile
	import shutil

	# Column layout of the grade table and of the frame returned to the UI.
	_BASE_COLUMNS = ['LVA', 'Typ', 'SSt.', 'ECTS', 'Datum', 'Beurteilung']
	_DETAIL_COLUMNS = ['Prüfer*in', 'Studium', 'LVA-Nr.']
	_OUTPUT_COLUMNS = _BASE_COLUMNS + _DETAIL_COLUMNS

	# Numeric value of every grade that counts towards the weighted mean.
	# 'mit Erfolg teilgenommen' is deliberately absent: it carries no grade.
	_GRADE_VALUES = {
	    'sehr gut': 1,
	    'gut': 2,
	    'befriedigend': 3,
	    'genügend': 4,
	    'nicht genügend': 5,
	}

	_LVA_NR_RE = re.compile(r'(\d{3})\.[\w]{3}')
	_LVA_TITLE_RE = re.compile(r'^(.*?)\r', re.DOTALL)


	def _extract_tables_from_pdf(pdf_path):
	    """Return, for every page of the PDF, the list of tables tabula finds on it."""
	    # PyPDF2 is only used to learn the page count.
	    with open(pdf_path, 'rb') as handle:
	        page_count = len(PdfReader(handle).pages)

	    # tabula page numbers are 1-based.
	    return [
	        tabula.read_pdf(pdf_path, pages=page, multiple_tables=True)
	        for page in range(1, page_count + 1)
	    ]


	def _extract_information(input_string):
	    """Split one raw 'LVA' cell into title, study programme, course number and examiner.

	    The cell is expected to look like ``<title>\\r<studium>\\r<nnn.xxx>\\r<examiner>``.
	    Missing or non-string cells (e.g. NaN) yield empty fields instead of raising.

	    Returns:
	        dict with the keys 'LVA', 'Studium', 'LVA Nr.' and "Prüfer'in".
	        If no course number is found, 'Studium' is empty and the whole cell
	        text is returned as "Prüfer'in" (kept from the original behaviour).
	    """
	    if not isinstance(input_string, str):
	        input_string = '' if pd.isna(input_string) else str(input_string)

	    title_match = _LVA_TITLE_RE.search(input_string)
	    lva = title_match.group(1).strip() if title_match else ''
	    # Continue right after the '\r' that terminated the title. Using the match
	    # position (not len(stripped title) + 1) stays correct when the title had
	    # surrounding whitespace and when there is no '\r' at all.
	    title_end = title_match.end() if title_match else 0

	    nr_match = _LVA_NR_RE.search(input_string)
	    if nr_match:
	        lva_nr = nr_match.group()
	        nr_start, nr_end = nr_match.span()
	    else:
	        lva_nr = ''
	        nr_start = nr_end = 0

	    return {
	        'LVA': lva,
	        # strip() removes the '\r' separators left around the fields.
	        'Studium': input_string[title_end:nr_start].strip(),
	        'LVA Nr.': lva_nr,
	        'Prüfer\'in': input_string[nr_end:].strip(),
	    }


	def _table_to_frame(raw_table):
	    """Turn one raw tabula table into a frame with the output column layout.

	    Raises:
	        ValueError: if the table does not have exactly six columns.
	    """
	    # The column labels of the raw table are treated as the first data row
	    # (the original code did the same; presumably tabula promotes the first
	    # table row to the header).
	    rows = [list(raw_table.columns), *raw_table.values.tolist()]
	    frame = pd.DataFrame(rows, columns=_BASE_COLUMNS)

	    # Build whole columns at once instead of chained per-cell assignment,
	    # which is unreliable (and a no-op under pandas copy-on-write).
	    details = [_extract_information(cell) for cell in frame['LVA']]
	    frame['LVA'] = [info['LVA'] for info in details]
	    frame['Prüfer*in'] = [info['Prüfer\'in'] for info in details]
	    frame['Studium'] = [info['Studium'] for info in details]
	    frame['LVA-Nr.'] = [info['LVA Nr.'] for info in details]
	    return frame


	def _weighted_grade_mean(frame):
	    """Return the ECTS-weighted mean of the numeric grades in *frame*.

	    Rows without a numeric grade (e.g. 'mit Erfolg teilgenommen', missing or
	    unknown values) are left out of both numerator and denominator so they
	    cannot pull the average towards zero. Returns NaN when nothing is graded.
	    """
	    grades = frame['Beurteilung'].map(_GRADE_VALUES)
	    graded = grades.notna()
	    weights = frame.loc[graded, 'ECTS']
	    weight_total = weights.sum()
	    if not weight_total:
	        return float('nan')
	    return float((weights * grades[graded]).sum() / weight_total)


	def pdf_processing(pdf_file):
	    """Parse a transcript PDF and summarise its grade table.

	    Args:
	        pdf_file: object whose ``name`` attribute is the path of the PDF.

	    Returns:
	        Tuple ``(total_ects, weighted_mean, table, csv_path)``: the sum of the
	        'ECTS' column, the ECTS-weighted grade mean rounded to two decimals,
	        the cleaned DataFrame, and the path of the CSV export of that frame.

	    Raises:
	        ValueError: if a selected table does not have six columns or an
	            'ECTS' value cannot be converted to float.
	    """
	    frames = []
	    for page_tables in _extract_tables_from_pdf(pdf_file.name):
	        # The table at index 1 of each page is the one that is processed.
	        # Pages with fewer tables are skipped; previously the bare except
	        # fell through and reused the previous page's data.
	        if len(page_tables) < 2:
	            continue
	        frames.append(_table_to_frame(page_tables[1]))

	    if frames:
	        final_df = pd.concat(frames, ignore_index=True).reindex(columns=_OUTPUT_COLUMNS)
	    else:
	        final_df = pd.DataFrame(columns=_OUTPUT_COLUMNS)

	    # Grades that wrapped over two lines contain '\r'; normalise to a space
	    # so they match the keys of the grade table.
	    final_df['Beurteilung'] = final_df['Beurteilung'].str.replace('\r', ' ', regex=False)
	    final_df['ECTS'] = final_df['ECTS'].astype(float)
	    final_df = final_df.drop_duplicates()

	    column_sum = final_df['ECTS'].sum()
	    wm = _weighted_grade_mean(final_df)

	    # NOTE(review): every request writes the same file in the working
	    # directory, so concurrent users overwrite each other's export.
	    csv_save_path = os.path.join(os.getcwd(), "sample.csv")
	    final_df.to_csv(csv_save_path, index=False)

	    return column_sum, np.round(wm, 2), final_df, csv_save_path

	# Gradio UI: a single PDF upload goes in, four results come out.
	inputs = gr.inputs.File(type="file", label="Select PDF file")

	# Output widgets, in the order pdf_processing returns its values:
	# total ECTS, weighted mean, processed table, CSV file.
	outputs = [
	    gr.outputs.Textbox(label=caption)
	    for caption in ("Total ECTS", "Weighted Mean")
	]
	outputs.append(
	    gr.outputs.Dataframe(label="Processed DataFrame", type="pandas").style(full_width=True)
	)
	outputs.append(gr.outputs.File())

	iface = gr.Interface(
	    title="KUSSS Buddy",
	    fn=pdf_processing,
	    inputs=inputs,
	    outputs=outputs,
	)

	# Start the app; share=False as in the original configuration.
	iface.launch(share=False)