Spaces:

Krish30
/

pdf_to_csv_converter

Runtime error

File size: 5,838 Bytes

4f5bb45

# -*- coding: utf-8 -*-
"""app2.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1EcIl8KoJxnisZgC7-76wSYIyis_eRKAL
"""

# Commented out IPython magic to ensure Python compatibility.
# %%writefile app.py
# import streamlit as st
# import PyPDF2
# import re
# import csv
# import base64
# import os  # Import the os module for file operations
# 
# def extract_data_from_pdf(pdf_path):
#     """Extract one record per processed page from the PDF at ``pdf_path``.
# 
#     The file is opened in binary mode and read with ``PyPDF2.PdfReader``;
#     the extracted text of each processed page is handed to
#     ``singlePageData``.
# 
#     Args:
#         pdf_path: Filesystem path of the PDF to read.
# 
#     Returns:
#         A list holding one ``singlePageData`` result per processed page,
#         in page order.
#     """
#     data_list = []
#     with open(pdf_path, "rb") as file:
#         reader = PyPDF2.PdfReader(file)
#         # Iteration starts at index 2, so the first two pages are never
#         # parsed. NOTE(review): presumably these are cover/summary pages;
#         # confirm against the expected PDF layout.
#         for page_num in range(2, len(reader.pages)):
#             single_page = reader.pages[page_num].extract_text()
#             data = singlePageData(single_page)
#             data_list.append(data)
#     return data_list
# 
# def singlePageData(singlePage):
#     """Parse the text of one page into a flat record.
# 
#     Each scalar field is located with its own regular expression and falls
#     back to an empty string when the pattern does not match.
# 
#     Args:
#         singlePage: Text of a single PDF page.
# 
#     Returns:
#         A dict with the keys ``Exam_Seat_No``, ``PRN_No``, ``Name``,
#         ``Sem3``, ``Sem4``, ``Status`` and ``Percentage``. ``Sem3`` and
#         ``Sem4`` hold the dicts returned by ``semData``; the other values
#         are strings.
#     """
#     # Seat number: first run of non-whitespace characters after "Seat No:".
#     seat_no_pattern = re.compile(r"Seat No:\s*([^\s]+)")
#     seat_match = seat_no_pattern.search(singlePage)
#     seat_no = seat_match.group(1) if seat_match else ""
# 
#     # PRN: digits only.
#     prn_no_pattern = re.compile(r"PRN:\s*(\d+)")
#     prn_no_match = prn_no_pattern.search(singlePage)
#     prn_no = prn_no_match.group(1) if prn_no_match else ""
# 
#     # Name: everything up to the end of the line, surrounding spaces removed.
#     name_pattern = re.compile(r"Name:\s*([^\n]+)")
#     name_match = name_pattern.search(singlePage)
#     name = name_match.group(1).strip() if name_match else ""
# 
#     sem3_data = semData(singlePage, 3)
#     sem4_data = semData(singlePage, 4)
# 
#     # The status word must be followed by "|C" for the pattern to match.
#     overall_status_pattern = re.compile(r"\|Status:\s*(\w+)\s*\|C")
#     overall_status_match = overall_status_pattern.search(singlePage)
#     overall_status = overall_status_match.group(1) if overall_status_match else ""
# 
#     # The pattern requires a decimal point, so a whole-number percentage
#     # (e.g. "75 %") does not match and is reported as "".
#     percentage_match = re.compile(r"\|Percentage:\s*(\d+\.\d+)\s*\%").search(singlePage)
#     percentage = percentage_match.group(1) if percentage_match else ""
# 
#     return {
#         "Exam_Seat_No": seat_no,
#         "PRN_No": prn_no,
#         "Name": name,
#         "Sem3": sem3_data,
#         "Sem4": sem4_data,
#         "Status": overall_status,
#         "Percentage": percentage,
#     }
# 
# def semData(singlePage, sem):
#     """Collect the per-subject marks of one semester from a page's text.
# 
#     A row is recognised when it starts with ``BTN06<sem>`` followed by
#     digits and then at least six ``|``-separated columns, the last two of
#     which consist of digits only.
# 
#     Args:
#         singlePage: Text of a single PDF page.
#         sem: Semester number interpolated into the ``BTN06<sem>`` prefix.
# 
#     Returns:
#         A dict mapping the first captured column to a dict with the string
#         values ``ESE``, ``ISE`` and ``Total``. Empty when no row matches.
#     """
#     data = {}
#     # Capture groups: (1) the column right after the BTN06... code,
#     # (2) and (3) the two digit-only columns read as ESE and ISE marks.
#     # Rows with non-numeric entries in those two columns do not match.
#     subject_pattern = re.compile(fr"BTN06{sem}\d+\s*\|\s*(\S+)\s*\|\s*\S+\s*\|\s*\S+\s*\|\s*\S+\s*\|\s*(\d+)\s*\|\s*(\d+)")
#     matches = subject_pattern.findall(singlePage)
#     for match in matches:
#         # NOTE(review): this key is the column *after* the BTN06... code,
#         # not the code itself; confirm that this is the intended identifier.
#         # A repeated key overwrites the earlier row.
#         subject_code = match[0]
#         ese_marks = match[1]
#         ise_marks = match[2]
#         total_marks = str(int(ese_marks) + int(ise_marks))
#         data[subject_code] = {
#             "ESE": ese_marks,
#             "ISE": ise_marks,
#             "Total": total_marks
#         }
#     return data
# 
# def write_data_to_csv(data_list, output_path):
#     """Write the extracted student records to a CSV file.
# 
#     A header row is written first, followed by one row per Sem3 entry of
#     each student. The student-level fields are repeated on every row.
# 
#     Args:
#         data_list: Records shaped like the dicts returned by
#             ``singlePageData``.
#         output_path: Path of the CSV file to create or overwrite.
#     """
#     fieldnames = [
#         "Exam_Seat_No", "PRN_No", "Name",
#         "Sem3_Subject", "Sem3_ESE", "Sem3_ISE", "Sem3_Total",
#         "Sem4_Subject", "Sem4_ESE", "Sem4_ISE", "Sem4_Total",
#         "Status", "Percentage"
#     ]
# 
#     # newline="" leaves line-ending handling to the csv module.
#     with open(output_path, "w", newline="") as csvfile:
#         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#         writer.writeheader()
# 
#         for student in data_list:
#             sem3_data = student["Sem3"]
#             sem4_data = student["Sem4"]
# 
#             # Rows are driven by the Sem3 entries only: a student without
#             # Sem3 matches produces no row, and Sem4 entries whose key is
#             # absent from Sem3 are never written.
#             # NOTE(review): confirm that Sem3 and Sem4 really share keys.
#             for subject_code, sem3_marks in sem3_data.items():
#                 # Sem4 marks are looked up under the Sem3 key; blanks are
#                 # used when Sem4 has no entry with that key.
#                 sem4_marks = sem4_data.get(subject_code, {"ESE": "", "ISE": "", "Total": ""})
# 
#                 writer.writerow({
#                     "Exam_Seat_No": student["Exam_Seat_No"],
#                     "PRN_No": student["PRN_No"],
#                     "Name": student["Name"],
#                     "Sem3_Subject": subject_code,
#                     "Sem3_ESE": sem3_marks["ESE"],
#                     "Sem3_ISE": sem3_marks["ISE"],
#                     "Sem3_Total": sem3_marks["Total"],
#                     "Sem4_Subject": subject_code,
#                     "Sem4_ESE": sem4_marks["ESE"],
#                     "Sem4_ISE": sem4_marks["ISE"],
#                     "Sem4_Total": sem4_marks["Total"],
#                     "Status": student["Status"],
#                     "Percentage": student["Percentage"]
#                 })
# 
# def main():
#     """Render the Streamlit page: upload a PDF, convert it, offer the CSV.
# 
#     Once a file has been uploaded it is saved to disk, parsed with
#     ``extract_data_from_pdf`` and written to ``/tmp/output.csv``; a
#     download link for that file is then rendered. Exceptions raised during
#     extraction, CSV writing or link generation are shown with ``st.error``.
#     """
#     st.title("PDF to CSV Converter")
# 
#     # File upload section
#     uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
# 
#     if uploaded_file is not None:
#         # Save the uploaded PDF file to a temporary location
#         # (this call sits outside the try block, so a failure here is not
#         # reported through st.error below)
#         input_pdf_path = save_uploaded_file(uploaded_file)
# 
#         try:
#             # Extract data from the PDF
#             data_list = extract_data_from_pdf(input_pdf_path)
# 
#             # Save extracted data to CSV
#             output_csv_path = "/tmp/output.csv"
#             write_data_to_csv(data_list, output_csv_path)
# 
#             # Provide download link for the CSV file
#             st.success("PDF successfully processed!")
#             # unsafe_allow_html is needed because the link is a raw <a> tag.
#             st.markdown(get_binary_file_downloader_html(output_csv_path, "CSV"), unsafe_allow_html=True)
# 
#         except Exception as e:
#             st.error(f"Error encountered during PDF extraction: {str(e)}")
# 
# def save_uploaded_file(uploaded_file):
#     """Write the uploaded file's bytes to ``/tmp/pdf_converter/input.pdf``.
# 
#     Args:
#         uploaded_file: Object exposing ``read()`` that returns the file's
#             bytes (here the value returned by ``st.file_uploader``).
# 
#     Returns:
#         The path of the saved PDF.
#     """
#     # Save the uploaded PDF file to a temporary location
#     temp_dir = "/tmp/pdf_converter"
#     os.makedirs(temp_dir, exist_ok=True)
#     # The file name is fixed, so each upload overwrites the previous one.
#     input_pdf_path = os.path.join(temp_dir, "input.pdf")
#     with open(input_pdf_path, "wb") as f:
#         f.write(uploaded_file.read())
#     return input_pdf_path
# 
# def get_binary_file_downloader_html(bin_file, file_label='File'):
#     """Build an HTML download link that embeds a file as a base64 data URI.
# 
#     Args:
#         bin_file: Path of the file to embed.
#         file_label: Text shown after "Download" in the link.
# 
#     Returns:
#         An ``<a>`` element as a string. Its ``download`` attribute is the
#         base name of ``bin_file`` and its ``href`` carries the whole file
#         content, base64-encoded.
#     """
#     # Generate download link for the CSV file
#     with open(bin_file, 'rb') as f:
#         data = f.read()
#     b64 = base64.b64encode(data).decode()
#     href = f'<a href="data:application/octet-stream;base64,{b64}" download="{os.path.basename(bin_file)}">Download {file_label}</a>'
#     return href
# 
# # Script entry point: build the page only when this file is run as the
# # main module, not when it is imported.
# if __name__ == "__main__":
#     main()
#