Spaces:

ShebMichel
/

GeoScience_Exam_Marker

Build error

App Files Files Community

GeoScience_Exam_Marker / exam_data_scrapper.py

ShebMichel

Update exam_data_scrapper.py

aeba1c4 verified about 1 year ago

raw

history blame contribute delete

3.44 kB

	#!pip install python-docx
	#!pip install PyPDF2 --upgrade


	import os
	import json
	from PyPDF2 import PdfReader
	from docx import Document

	def extract_from_pdf(pdf_path):
	    """Extract the text of every page of a PDF file.

	    Args:
	        pdf_path: Path of the PDF file to read.

	    Returns:
	        A single string holding the text of all pages, concatenated in
	        page order with no separator added between pages.
	    """
	    with open(pdf_path, "rb") as pdf_file:
	        reader = PdfReader(pdf_file)
	        # Iterate the pages directly and join once instead of growing a
	        # string with "+=".  The "or ''" guard keeps a page that yields no
	        # text (e.g. an image-only page returning None/empty) from breaking
	        # the concatenation.
	        return "".join(page.extract_text() or "" for page in reader.pages)

	def extract_from_json(json_path):
	    """Load and return the content of a JSON file.

	    Args:
	        json_path: Path of the JSON file to read.

	    Returns:
	        The decoded JSON value (whatever ``json.load`` produces for the
	        file: dict, list, str, number, ...).

	    Raises:
	        json.JSONDecodeError: If the file does not contain valid JSON
	            (this is a subclass of ``ValueError``).
	    """
	    # Read as UTF-8 explicitly: without it the platform default encoding is
	    # used, which mangles or rejects non-ASCII text on some systems.
	    with open(json_path, "r", encoding="utf-8") as json_file:
	        return json.load(json_file)

	def extract_from_word(word_path):
	    """Return the text of a Word (.docx) file, one paragraph per line.

	    Every paragraph's text is followed by a newline, so the result ends
	    with a newline whenever the document has at least one paragraph.
	    """
	    document = Document(word_path)
	    paragraph_lines = [paragraph.text + "\n" for paragraph in document.paragraphs]
	    return "".join(paragraph_lines)

	def extract_data(file_path):
	    """Extract data from a file, choosing the reader from its extension.

	    Supported extensions are ``.pdf``, ``.json`` and ``.docx``; they are
	    matched case-insensitively.

	    Args:
	        file_path: Path of the file to read.

	    Returns:
	        The text of the file for PDF and Word documents, or the decoded
	        JSON value for JSON files.

	    Raises:
	        ValueError: If the extension is not one of the supported ones.
	    """
	    _, file_extension = os.path.splitext(file_path)
	    # Dispatch on the lower-cased extension so "EXAM.PDF" or "Exam.Docx"
	    # are not rejected; the error message keeps the original spelling.
	    normalized = file_extension.lower()

	    if normalized == ".pdf":
	        return extract_from_pdf(file_path)
	    if normalized == ".json":
	        return extract_from_json(file_path)
	    if normalized == ".docx":
	        return extract_from_word(file_path)
	    raise ValueError("Unsupported file extension: " + file_extension)

	def create_data_dictionary(files):
	    """Build a dictionary mapping each file path to its extracted data.

	    Args:
	        files: An iterable of file paths.  A single path given as a string
	            or ``os.PathLike`` is accepted too and treated as a one-element
	            list.

	    Returns:
	        A dict whose keys are the given paths and whose values are what
	        ``extract_data`` returned for them.  Files for which
	        ``extract_data`` raised ``ValueError`` are left out.
	    """
	    # A lone path would otherwise be iterated character by character.
	    if isinstance(files, (str, os.PathLike)):
	        files = [files]

	    data_dict = {}
	    for file_path in files:
	        try:
	            data_dict[file_path] = extract_data(file_path)
	        except ValueError as e:
	            # Best effort: report the file that could not be handled
	            # (unsupported extension, or another ValueError raised while
	            # reading it) and carry on with the remaining files.
	            print(e)
	    return data_dict

	# Field names read from each section of a parsed exam JSON document.
	school_data = ['university','department','course_code','course_title','date','duration','instructor']
	qcm_data = ['question','options', 'answer']
	short_data = ['question','answer']


	def _print_questions(questions, fields):
	    """Print the given fields of every question, prefixed by its index."""
	    for idx, item in enumerate(questions):
	        for field in fields:
	            # Labels are the capitalized field names: 'Question', 'Options', 'Answer'.
	            print(f" Index is: {idx} and '{field.capitalize()}': {item[field]}")


	def print_exam_summary(exam):
	    """Print the header and all question sections of one parsed exam.

	    Args:
	        exam: Decoded exam JSON.  It is indexed with the keys 'header',
	            'multiple_choice_questions', 'short_answer_questions' and
	            'long_answer_questions'; a missing key raises ``KeyError``.
	    """
	    for s_data in school_data:
	        print(f" {s_data}: {exam['header'][str(s_data)]}")
	    print("*************'school data'**********************")

	    _print_questions(exam['multiple_choice_questions'], qcm_data)
	    print("*************'multiple_choice_questions'**********************")
	    _print_questions(exam['short_answer_questions'], short_data)
	    print("*************' END short_answer_questions'**********************")
	    print("*************' START long_answer_questions'**********************")
	    _print_questions(exam['long_answer_questions'], short_data)
	    print("*************' END long_answer_questions'**********************")


	def main(exam_dir='data'):
	    """Usage example: load the JSON exams found in *exam_dir* and print one.

	    NOTE(review): the original passed the string 'data' where a list of
	    paths is expected and then read an undefined ``exam_data`` variable.
	    Treating 'data' as a directory of exam files is an assumption -- confirm.
	    """
	    if not os.path.isdir(exam_dir):
	        print(f"Exam directory not found: {exam_dir}")
	        return

	    # Only JSON files provide the keyed structure that is printed below;
	    # PDF and Word files are extracted as plain text.
	    exam_files = sorted(
	        os.path.join(exam_dir, name)
	        for name in os.listdir(exam_dir)
	        if name.lower().endswith('.json')
	    )
	    print(exam_files)
	    data_dict = create_data_dictionary(exam_files)

	    # Keep the first exam that was actually loaded (files that failed to
	    # parse are absent from data_dict).
	    exam_data = [file_path for file_path in exam_files if file_path in data_dict][:1]
	    if not exam_data:
	        print(f"No readable JSON exam file in: {exam_dir}")
	        return
	    print_exam_summary(data_dict[str(exam_data[0])])


	if __name__ == "__main__":
	    main()