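# Spaces: Build error
# (The line above appears to be status text from the hosting page captured
# together with the file; it is not part of the program.)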
| #!pip install python-docx | |
| #!pip install PyPDF2 --upgrade | |
| import os | |
| import json | |
| from PyPDF2 import PdfReader | |
| from docx import Document | |
def extract_from_pdf(pdf_path):
    """Extract text from a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text of every page, concatenated in page order with
        no separator between pages.
    """
    with open(pdf_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        # `or ""` keeps the join safe if a page yields no text object at all
        # (defensive; such a page simply contributes nothing).
        return "".join(page.extract_text() or "" for page in reader.pages)
def extract_from_json(json_path):
    """Extract data from a JSON file.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document (dict, list, str, number, bool or None).

    Raises:
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding,
    # which would corrupt or reject non-ASCII content on some systems.
    with open(json_path, "r", encoding="utf-8") as json_file:
        return json.load(json_file)
def extract_from_word(word_path):
    """Return the text of a Word (.docx) file.

    The text of each entry in the document's ``paragraphs`` is emitted in
    order, and every paragraph is followed by a newline character.
    """
    paragraphs = Document(word_path).paragraphs
    return "".join(f"{paragraph.text}\n" for paragraph in paragraphs)
def extract_data(file_path):
    """Extract data from a file based on its extension.

    The extension is matched case-insensitively, so ``.PDF`` and ``.pdf``
    are handled the same way.

    Args:
        file_path: Path of the file to read.

    Returns:
        Whatever the matching extractor returns: text for ``.pdf`` and
        ``.docx`` files, the decoded document for ``.json`` files.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.json`` or ``.docx``.
    """
    _, file_extension = os.path.splitext(file_path)
    normalized = file_extension.lower()
    if normalized == ".pdf":
        return extract_from_pdf(file_path)
    if normalized == ".json":
        return extract_from_json(file_path)
    if normalized == ".docx":
        return extract_from_word(file_path)
    # Report the extension exactly as it appeared in the path.
    raise ValueError("Unsupported file extension: " + file_extension)
def create_data_dictionary(files):
    """Create a dictionary containing data from files based on their extension.

    Args:
        files: An iterable of file paths. A single path given as a ``str``
            or ``os.PathLike`` is treated as a one-element list.

    Returns:
        A dict mapping each successfully read path to its extracted data.
        Paths whose extraction raises ``ValueError`` are reported with
        ``print`` and left out of the result. Note that
        ``json.JSONDecodeError`` is a ``ValueError`` subclass, so malformed
        JSON files are skipped the same way as unsupported extensions.
    """
    # A lone path would otherwise be iterated one character at a time.
    if isinstance(files, (str, os.PathLike)):
        files = [files]

    data_dict = {}
    for file_path in files:
        try:
            data_dict[file_path] = extract_data(file_path)
        except ValueError as e:
            print(e)
    return data_dict
# Usage example
path = ''  # not used below; kept so the module-level name still exists
exam_files = 'data'
print(exam_files)

# create_data_dictionary() iterates over its argument, so hand it a list of
# paths rather than the bare string.
# NOTE(review): 'data' is assumed to name a directory of exam files; if it is
# not a directory it is passed through as a single path -- confirm the
# intended layout.
if os.path.isdir(exam_files):
    exam_paths = sorted(
        os.path.join(exam_files, entry)
        for entry in os.listdir(exam_files)
        if os.path.isfile(os.path.join(exam_files, entry))
    )
else:
    exam_paths = [exam_files]
data_dict = create_data_dictionary(exam_paths)

# Keys expected in an exam document (only school_data is used below).
school_data = ['university', 'department', 'course_code', 'course_title', 'date', 'duration', 'instructor']
qcm_data = ['question', 'options', 'answer']
short_data = ['question', 'answer']

# The report below indexes the loaded data by section name, which only works
# for entries that were decoded into a dict (i.e. JSON files).
exam_data = [
    file_path
    for file_path, content in data_dict.items()
    if isinstance(content, dict)
]

if not exam_data:
    # Previously this point crashed with a NameError because exam_data was
    # never defined; report the situation instead.
    print("No JSON exam data was loaded; nothing to display.")
else:
    exam = data_dict[exam_data[0]]
    multiple_choice_questions = exam['multiple_choice_questions']
    short_answer_questions = exam['short_answer_questions']
    long_answer_questions = exam['long_answer_questions']

    for s_data in school_data:
        print(f" {s_data}: {exam['header'][str(s_data)]}")
    print("***************'school data'************************")

    for idx, qcm in enumerate(multiple_choice_questions):
        print(f" Index is: {idx} and 'Question': {qcm['question']}")
        print(f" Index is: {idx} and 'Options': {qcm['options']}")
        print(f" Index is: {idx} and 'Answer': {qcm['answer']}")
    print("***************'multiple_choice_questions'************************")

    for idx, qcm in enumerate(short_answer_questions):
        print(f" Index is: {idx} and 'Question': {qcm['question']}")
        print(f" Index is: {idx} and 'Answer': {qcm['answer']}")
    print("***************' END short_answer_questions'************************")

    print("***************' START long_answer_questions'************************")
    for idx, qcm in enumerate(long_answer_questions):
        print(f" Index is: {idx} and 'Question': {qcm['question']}")
        print(f" Index is: {idx} and 'Answer': {qcm['answer']}")
    print("***************' END long_answer_questions'************************")