Spaces:

ShebMichel
/

GeoScience_Exam_Marker

Build error

App Files Files Community

ShebMichel committed on Nov 1, 2024

Commit

aeba1c4

verified ·

1 Parent(s): e251c7d

Update exam_data_scrapper.py

Browse files

Files changed (1) hide show

exam_data_scrapper.py +91 -89

exam_data_scrapper.py CHANGED Viewed

@@ -1,90 +1,92 @@
-#!pip install python-docx
-#!pip install PyPDF2 --upgrade
-import os
-import json
-from PyPDF2 import PdfReader
-from docx import Document
-def extract_from_pdf(pdf_path):
-    """Extract text from a PDF file.
-
-    Args:
-        pdf_path: Path of the PDF file to read.
-
-    Returns:
-        The extracted text of every page, concatenated in page order with no
-        separator between pages.
-    """
-    with open(pdf_path, "rb") as pdf_file:
-        reader = PdfReader(pdf_file)
-        # A page without extractable text must not break the concatenation,
-        # so a falsy result is replaced by "".
-        # NOTE(review): whether extract_text() can return None depends on the
-        # installed PyPDF2 version - confirm.
-        return "".join(page.extract_text() or "" for page in reader.pages)
-def extract_from_json(json_path):
-    """Extract data from a JSON file.
-
-    Args:
-        json_path: Path of the JSON file to read.
-
-    Returns:
-        The decoded JSON document, as returned by ``json.load``.
-    """
-    # Decode explicitly as UTF-8: without an encoding, open() falls back to the
-    # platform locale, which corrupts or rejects non-ASCII text on some systems.
-    with open(json_path, "r", encoding="utf-8") as json_file:
-        return json.load(json_file)
-def extract_from_word(word_path):
-    """Return the text of a Word (.docx) file.
-
-    The text of each paragraph in ``Document(word_path).paragraphs`` is
-    emitted in order, each one followed by a newline character.
-    """
-    paragraphs = Document(word_path).paragraphs
-    return "".join(paragraph.text + "\n" for paragraph in paragraphs)
-def extract_data(file_path):
-    """Extract data from a file based on its extension.
-
-    Args:
-        file_path: Path of a ``.pdf``, ``.json`` or ``.docx`` file.
-
-    Returns:
-        The result of the matching extractor: text for PDF and Word files,
-        the decoded document for JSON files.
-
-    Raises:
-        ValueError: If the extension is not one of the supported ones.
-    """
-    _, file_extension = os.path.splitext(file_path)
-    # Compare case-insensitively so names such as "EXAM.PDF" are accepted.
-    normalized_extension = file_extension.lower()
-    if normalized_extension == ".pdf":
-        return extract_from_pdf(file_path)
-    if normalized_extension == ".json":
-        return extract_from_json(file_path)
-    if normalized_extension == ".docx":
-        return extract_from_word(file_path)
-    # Report the extension exactly as it was written in the path.
-    raise ValueError("Unsupported file extension: " + file_extension)
-def create_data_dictionary(files):
-    """Create a dictionary mapping each file path to its extracted data.
-
-    Args:
-        files: Iterable of file paths. A single path given as a string or
-            ``os.PathLike`` is treated as a one-element list, because
-            iterating a bare string would visit it character by character.
-
-    Returns:
-        A dict keyed by file path whose values are whatever ``extract_data``
-        returns for that path. A file whose extraction raises ``ValueError``
-        (for example an unsupported extension) is reported with ``print`` and
-        left out of the result.
-    """
-    if isinstance(files, (str, os.PathLike)):
-        files = [files]
-    data_dict = {}
-    for file_path in files:
-        try:
-            data_dict[file_path] = extract_data(file_path)
-        except ValueError as e:
-            # Best effort: report the rejected file and keep processing.
-            print(e)
-    return data_dict
-# Usage example: load the JSON exam and print its header and questions.
-path      = r'C:\Users\00110138\OneDrive - The University of Western Australia\Project\KaggleX FellowshipProgram\code\Exam_Data'
-files     = [f"{path}/Geology_Geophysics_Exam.pdf", f"{path}/Geology_Geophysics_Exam.json", f"{path}/Geology_Geophysics_Exam.docx"]
-# Only the second entry (the JSON exam) is extracted.
-exam_data = [files[1]]
-data_dict = create_data_dictionary(exam_data)
-# Key names used by the exam document.
-school_data   = ['university','department','course_code','course_title','date','duration','instructor']
-qcm_data      = ['question','options', 'answer']
-short_data    = ['question','answer']
-# Look the extracted exam up once instead of on every access.
-exam = data_dict[exam_data[0]]
-multiple_choice_questions = exam['multiple_choice_questions']
-short_answer_questions    = exam['short_answer_questions']
-long_answer_questions     = exam['long_answer_questions']
-header = exam['header']
-for s_data in school_data:
-    print(f" {s_data}: {header[s_data]}")
-print("***************'school data'************************")
-for idx, qcm in enumerate(multiple_choice_questions):
-    for label, key in (('Question', 'question'), ('Options', 'options'), ('Answer', 'answer')):
-        print(f" Index is: {idx} and '{label}': {qcm[key]}")
-print("***************'multiple_choice_questions'************************")
-for idx, qcm in enumerate(short_answer_questions):
-    for label, key in (('Question', 'question'), ('Answer', 'answer')):
-        print(f" Index is: {idx} and '{label}': {qcm[key]}")
-print("***************' END short_answer_questions'************************")
-print("***************' START long_answer_questions'************************")
-for idx, qcm in enumerate(long_answer_questions):
-    for label, key in (('Question', 'question'), ('Answer', 'answer')):
-        print(f" Index is: {idx} and '{label}': {qcm[key]}")
 print(f"***************' END long_answer_questions'************************")

+#!pip install python-docx
+#!pip install PyPDF2 --upgrade
+import os
+import json
+from PyPDF2 import PdfReader
+from docx import Document
+def extract_from_pdf(pdf_path):
+    """Extract text from a PDF file.
+
+    Args:
+        pdf_path: Path of the PDF file to read.
+
+    Returns:
+        The extracted text of every page, concatenated in page order with no
+        separator between pages.
+    """
+    with open(pdf_path, "rb") as pdf_file:
+        reader = PdfReader(pdf_file)
+        # A page without extractable text must not break the concatenation,
+        # so a falsy result is replaced by "".
+        # NOTE(review): whether extract_text() can return None depends on the
+        # installed PyPDF2 version - confirm.
+        return "".join(page.extract_text() or "" for page in reader.pages)
+def extract_from_json(json_path):
+    """Extract data from a JSON file.
+
+    Args:
+        json_path: Path of the JSON file to read.
+
+    Returns:
+        The decoded JSON document, as returned by ``json.load``.
+    """
+    # Decode explicitly as UTF-8: without an encoding, open() falls back to the
+    # platform locale, which corrupts or rejects non-ASCII text on some systems.
+    with open(json_path, "r", encoding="utf-8") as json_file:
+        return json.load(json_file)
+def extract_from_word(word_path):
+    """Return the text of a Word (.docx) file.
+
+    The text of each paragraph in ``Document(word_path).paragraphs`` is
+    emitted in order, each one followed by a newline character.
+    """
+    paragraphs = Document(word_path).paragraphs
+    return "".join(paragraph.text + "\n" for paragraph in paragraphs)
+def extract_data(file_path):
+    """Extract data from a file based on its extension.
+
+    Args:
+        file_path: Path of a ``.pdf``, ``.json`` or ``.docx`` file.
+
+    Returns:
+        The result of the matching extractor: text for PDF and Word files,
+        the decoded document for JSON files.
+
+    Raises:
+        ValueError: If the extension is not one of the supported ones.
+    """
+    _, file_extension = os.path.splitext(file_path)
+    # Compare case-insensitively so names such as "EXAM.PDF" are accepted.
+    normalized_extension = file_extension.lower()
+    if normalized_extension == ".pdf":
+        return extract_from_pdf(file_path)
+    if normalized_extension == ".json":
+        return extract_from_json(file_path)
+    if normalized_extension == ".docx":
+        return extract_from_word(file_path)
+    # Report the extension exactly as it was written in the path.
+    raise ValueError("Unsupported file extension: " + file_extension)
+def create_data_dictionary(files):
+    """Create a dictionary mapping each file path to its extracted data.
+
+    Args:
+        files: Iterable of file paths. A single path given as a string or
+            ``os.PathLike`` is treated as a one-element list, because
+            iterating a bare string would visit it character by character.
+
+    Returns:
+        A dict keyed by file path whose values are whatever ``extract_data``
+        returns for that path. A file whose extraction raises ``ValueError``
+        (for example an unsupported extension) is reported with ``print`` and
+        left out of the result.
+    """
+    if isinstance(files, (str, os.PathLike)):
+        files = [files]
+    data_dict = {}
+    for file_path in files:
+        try:
+            data_dict[file_path] = extract_data(file_path)
+        except ValueError as e:
+            # Best effort: report the rejected file and keep processing.
+            print(e)
+    return data_dict
+# Usage example: load an exam and print its header and questions.
+path      = ''
+# Location of the exam data: either one exam file or a directory of exams.
+# NOTE(review): 'data' is assumed to be a directory next to this script - confirm.
+exam_files     = 'data'
+print(exam_files)
+# The lookups below index `exam_data`, and create_data_dictionary() needs a
+# list of paths, so build that list here rather than passing the bare string.
+if os.path.isdir(exam_files):
+    # Only JSON exams provide the 'header' and question keys read below.
+    exam_data = sorted(
+        os.path.join(exam_files, name)
+        for name in os.listdir(exam_files)
+        if name.lower().endswith(".json")
+    )
+else:
+    exam_data = [exam_files]
+data_dict = create_data_dictionary(exam_data)
+# Fail with a clear message instead of an IndexError/KeyError further down.
+if not exam_data or exam_data[0] not in data_dict:
+    raise FileNotFoundError(f"No readable exam file found at: {exam_files!r}")
+##
+school_data   = ['university','department','course_code','course_title','date','duration','instructor']
+qcm_data      = ['question','options', 'answer']
+short_data    = ['question','answer']
+# Only the first exam in the list is printed.
+exam = data_dict[exam_data[0]]
+multiple_choice_questions = exam['multiple_choice_questions']
+short_answer_questions    = exam['short_answer_questions']
+long_answer_questions     = exam['long_answer_questions']
+header = exam['header']
+for s_data in school_data:
+    print(f" {s_data}: {header[s_data]}")
+print("***************'school data'************************")
+for idx, qcm in enumerate(multiple_choice_questions):
+    print(f" Index is: {idx} and 'Question': {qcm['question']}")
+    print(f" Index is: {idx} and 'Options': {qcm['options']}")
+    print(f" Index is: {idx} and 'Answer': {qcm['answer']}")
+print("***************'multiple_choice_questions'************************")
+for idx, qcm in enumerate(short_answer_questions):
+    print(f" Index is: {idx} and 'Question': {qcm['question']}")
+    print(f" Index is: {idx} and 'Answer': {qcm['answer']}")
+print("***************' END short_answer_questions'************************")
+print("***************' START long_answer_questions'************************")
+for idx, qcm in enumerate(long_answer_questions):
+    print(f" Index is: {idx} and 'Question': {qcm['question']}")
+    print(f" Index is: {idx} and 'Answer': {qcm['answer']}")
 print(f"***************' END long_answer_questions'************************")